├── __init__.py ├── lib ├── __init__.py ├── envs │ ├── __init__.py │ ├── discrete.py │ ├── windy_gridworld.py │ ├── cliff_walking.py │ ├── gridworld.py │ └── blackjack.py ├── atari │ ├── __init__.py │ ├── helpers.py │ └── state_processor.py └── plotting.py ├── DQN ├── .gitignore ├── README.md └── dqn.py ├── PolicyGradient ├── a3c │ ├── README.md │ ├── policy_monitor_test.py │ ├── worker_test.py │ ├── policy_monitor.py │ ├── estimator_test.py │ ├── train.py │ ├── estimators.py │ └── worker.py ├── README.md └── Continuous MountainCar Actor Critic Solution.ipynb ├── LICENSE ├── Introduction └── README.md ├── .gitignore ├── FA └── README.md ├── DP ├── README.md ├── Gamblers Problem.ipynb ├── Policy Evaluation Solution.ipynb ├── Value Iteration Solution.ipynb ├── Policy Evaluation.ipynb ├── Policy Iteration Solution.ipynb ├── Value Iteration.ipynb └── Policy Iteration.ipynb ├── TD ├── README.md ├── Cliff Environment Playground.ipynb └── Windy Gridworld Playground.ipynb ├── MDP └── README.md ├── MC ├── README.md ├── MC Prediction.ipynb ├── MC Control with Epsilon-Greedy Policies.ipynb ├── Off-Policy MC Control with Weighted Importance Sampling.ipynb └── Blackjack Playground.ipynb └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DQN/.gitignore: -------------------------------------------------------------------------------- 1 | experiments/ -------------------------------------------------------------------------------- /lib/atari/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/README.md: -------------------------------------------------------------------------------- 1 | ## Implementation of A3C (Asynchronous Advantage Actor-Critic) 2 | 3 | #### Running 4 | 5 | ``` 6 | ./train.py --model_dir /tmp/a3c --env Breakout-v0 --t_max 5 --eval_every 300 --parallelism 8 7 | ``` 8 | 9 | See `./train.py --help` for a full list of options. Then, monitor training progress in Tensorboard: 10 | 11 | ``` 12 | tensorboard --logdir=/tmp/a3c 13 | ``` 14 | 15 | #### Components 16 | 17 | - [`train.py`](train.py) contains the main method to start training. 18 | - [`estimators.py`](estimators.py) contains the Tensorflow graph definitions for the Policy and Value networks. 19 | - [`worker.py`](worker.py) contains code that runs in each worker threads. 20 | - [`policy_monitor.py`](policy_monitor.py) contains code that evaluates the policy network by running an episode and saving rewards to Tensorboard. 21 | -------------------------------------------------------------------------------- /lib/atari/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class AtariEnvWrapper(object): 4 | """ 5 | Wraps an Atari environment to end an episode when a life is lost. 
6 | """ 7 | def __init__(self, env): 8 | self.env = env 9 | 10 | def __getattr__(self, name): 11 | return getattr(self.env, name) 12 | 13 | def step(self, *args, **kwargs): 14 | lives_before = self.env.ale.lives() 15 | next_state, reward, done, info = self.env.step(*args, **kwargs) 16 | lives_after = self.env.ale.lives() 17 | 18 | # End the episode when a life is lost 19 | if lives_before > lives_after: 20 | done = True 21 | 22 | # Clip rewards to [-1,1] 23 | reward = max(min(reward, 1), -1) 24 | 25 | return next_state, reward, done, info 26 | 27 | def atari_make_initial_state(state): 28 | return np.stack([state] * 4, axis=2) 29 | 30 | def atari_make_next_state(state, next_state): 31 | return np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Denny Britz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /lib/atari/state_processor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | class StateProcessor(): 5 | """ 6 | Processes a raw Atari iamges. Resizes it and converts it to grayscale. 7 | """ 8 | def __init__(self): 9 | # Build the Tensorflow graph 10 | with tf.variable_scope("state_processor"): 11 | self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8) 12 | self.output = tf.image.rgb_to_grayscale(self.input_state) 13 | self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) 14 | self.output = tf.image.resize_images( 15 | self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) 16 | self.output = tf.squeeze(self.output) 17 | 18 | def process(self, state, sess=None): 19 | """ 20 | Args: 21 | sess: A Tensorflow session object 22 | state: A [210, 160, 3] Atari RGB State 23 | 24 | Returns: 25 | A processed [84, 84, 1] state representing grayscale values. 
26 | """ 27 | sess = sess or tf.get_default_session() 28 | return sess.run(self.output, { self.input_state: state }) -------------------------------------------------------------------------------- /Introduction/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | ### Learning Goals 4 | 5 | - Understand the Reinforcement Learning problem and how it differs from Supervised Learning 6 | 7 | 8 | ### Summary 9 | 10 | - Reinforcement Learning (RL) is concerned with goal-directed learning and decision-making. 11 | - In RL an agent learns from experiences it gains by interacting with the environment. In Supervised Learning we cannot affect the environment. 12 | - In RL rewards are often delayed in time and the agent tries to maximize a long-term goal. For example, one may need to make seemingly suboptimal moves to reach a winning position in a game. 13 | - An agent interacts with the environment via states, actions and rewards. 14 | 15 | 16 | ### Lectures & Readings 17 | 18 | **Required:** 19 | 20 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 1: The Reinforcement Learning Problem 21 | - David Silver's RL Course Lecture 1 - Introduction to Reinforcement Learning ([video](https://www.youtube.com/watch?v=2pWv7GOvuf0), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/intro_RL.pdf)) 22 | - [OpenAI Gym Tutorial](https://gym.openai.com/docs) 23 | 24 | **Optional:** 25 | 26 | N/A 27 | 28 | 29 | ### Exercises 30 | 31 | - [Work through the OpenAI Gym Tutorial](https://gym.openai.com/docs) 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | experiments/ 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | 93 | ### IPythonNotebook ### 94 | # Temporary data 95 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /lib/envs/discrete.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | from gym.envs.toy_text.utils import categorical_sample 6 | 7 | class DiscreteEnv(Env): 8 | 9 | """ 10 | Has the following members 11 | - nS: number of states 12 | - nA: number of actions 13 | - P: transitions (*) 14 | - isd: initial state distribution (**) 15 | 16 | (*) dictionary of lists, where 17 | P[s][a] == [(probability, nextstate, reward, done), ...] 18 | (**) list or array of length nS 19 | 20 | 21 | """ 22 | 23 | def __init__(self, nS, nA, P, isd): 24 | self.P = P 25 | self.isd = isd 26 | self.lastaction = None # for rendering 27 | self.nS = nS 28 | self.nA = nA 29 | 30 | self.action_space = spaces.Discrete(self.nA) 31 | self.observation_space = spaces.Discrete(self.nS) 32 | 33 | self.seed() 34 | self.s = categorical_sample(self.isd, self.np_random) 35 | 36 | def seed(self, seed=None): 37 | self.np_random, seed = seeding.np_random(seed) 38 | return [seed] 39 | 40 | def reset(self): 41 | self.s = categorical_sample(self.isd, self.np_random) 42 | self.lastaction = None 43 | return int(self.s) 44 | 45 | def step(self, a): 46 | transitions = self.P[self.s][a] 47 | i = categorical_sample([t[0] for t in transitions], self.np_random) 48 | p, s, r, d = transitions[i] 49 | self.s = s 50 | self.lastaction = a 51 | return (int(s), r, d, {"prob": p}) 52 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/policy_monitor_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import sys 3 | import os 4 | import itertools 5 | import collections 6 | import unittest 7 | import numpy as np 8 | import tensorflow as tf 9 | import tempfile 10 | 11 | from inspect import getsourcefile 12 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 13 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 14 | 15 | if import_path not in sys.path: 16 | sys.path.append(import_path) 17 | 18 | # from lib import plotting 19 | from lib.atari.state_processor import StateProcessor 20 | from lib.atari import helpers as atari_helpers 21 | from policy_monitor import PolicyMonitor 22 | from estimators import ValueEstimator, PolicyEstimator 23 | 24 | def make_env(): 25 | return gym.envs.make("Breakout-v0") 26 | 27 | VALID_ACTIONS 
= [0, 1, 2, 3] 28 | 29 | class PolicyMonitorTest(tf.test.TestCase): 30 | def setUp(self): 31 | super(PolicyMonitorTest, self).setUp() 32 | 33 | self.env = make_env() 34 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 35 | self.summary_writer = tf.train.SummaryWriter(tempfile.mkdtemp()) 36 | 37 | with tf.variable_scope("global") as vs: 38 | self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS)) 39 | self.global_value_net = ValueEstimator(reuse=True) 40 | 41 | def testEvalOnce(self): 42 | pe = PolicyMonitor( 43 | env=self.env, 44 | policy_net=self.global_policy_net, 45 | summary_writer=self.summary_writer) 46 | 47 | with self.test_session() as sess: 48 | sess.run(tf.initialize_all_variables()) 49 | total_reward, episode_length = pe.eval_once(sess) 50 | self.assertTrue(episode_length > 0) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() -------------------------------------------------------------------------------- /FA/README.md: -------------------------------------------------------------------------------- 1 | ## Function Approximation 2 | 3 | ### Learning Goals 4 | 5 | - Understand the motivation for Function Approximation over Table Lookup 6 | - Understand how to incorporate function approximation into existing algorithms 7 | - Understand convergence properties of function approximators and RL algorithms 8 | - Understand batching using experience replay 9 | 10 | 11 | ### Summary 12 | 13 | - Building a big table, one value for each state or state-action pair, is memory- and data-inefficient. Function Approximation can generalize to unseen states by using a featurized state representation. 14 | - Treat RL as supervised learning problem with the MC- or TD-target as the label and the current state/action as the input. Often the target also depends on the function estimator but we simply ignore its gradient. That's why these methods are called semi-gradient methods. 15 | - Challenge: We have non-stationary (policy changes, bootstrapping) and non-iid (correlated in time) data. 16 | - Many methods assume that our action space is discrete because they rely on calculating the argmax over all actions. Large and continuous action spaces are ongoing research. 17 | - For Control very few convergence guarantees exist. For non-linear approximators there are basically no guarantees at all. But they tend to work in practice. 18 | - Experience Replay: Store experience as dataset, randomize it, and repeatedly apply minibatch SGD. 19 | - Tricks to stabilize non-linear function approximators: Fixed Targets. The target is calculated based on frozen parameter values from a previous time step. 20 | - For the non-episodic (continuing) case function approximation is more complex and we need to give up discounting and use an "average reward" formulation. 
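The semi-gradient update described in the summary above can be made concrete in a few lines of NumPy. The sketch below is illustrative only (the feature representation, step size, and discount factor are made-up choices for this example, not the exercise solution), but it shows why these methods are called "semi-gradient": the TD target is treated as a constant when the gradient is taken.

```python
import numpy as np

# Minimal sketch of semi-gradient Q-learning with a linear function
# approximator: Q(s, a) = phi(s, a) . w. The feature vectors below are
# random placeholders; a real agent would compute them from the state.

def semi_gradient_q_update(w, phi_sa, reward, phi_next, done,
                           alpha=0.01, discount_factor=0.99):
    """One update of the weight vector w.

    phi_sa:   feature vector of the (state, action) pair taken, shape (d,)
    phi_next: feature matrix of the next state, one row per action, shape (nA, d)
    """
    q_sa = phi_sa @ w
    target = reward if done else reward + discount_factor * np.max(phi_next @ w)
    # The target also depends on w, but we ignore that dependency when
    # differentiating the squared TD error; hence "semi-gradient".
    w += alpha * (target - q_sa) * phi_sa
    return w

# Tiny usage example with made-up features (4 features, 2 actions)
rng = np.random.default_rng(0)
w = np.zeros(4)
phi_sa = rng.normal(size=4)
phi_next = rng.normal(size=(2, 4))
w = semi_gradient_q_update(w, phi_sa, reward=-1.0, phi_next=phi_next, done=False)
```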
21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) 28 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 9: On-policy Prediction with Approximation 29 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 10: On-policy Control with Approximation 30 | 31 | **Optional:** 32 | 33 | - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) 34 | 35 | 36 | ### Exercises 37 | 38 | - Get familiar with the [Mountain Car Playground](MountainCar%20Playground.ipynb) 39 | 40 | - Solve Mountain Car Problem using Q-Learning with Linear Function Approximation 41 | - [Exercise](Q-Learning%20with%20Value%20Function%20Approximation.ipynb) 42 | - [Solution](Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) 43 | -------------------------------------------------------------------------------- /DP/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Based RL: Policy and Value Iteration using Dynamic Programming 2 | 3 | ### Learning Goals 4 | 5 | - Understand the difference between Policy Evaluation and Policy Improvement and how these processes interact 6 | - Understand the Policy Iteration Algorithm 7 | - Understand the Value Iteration Algorithm 8 | - Understand the Limitations of Dynamic Programming Approaches 9 | 10 | 11 | ### Summary 12 | 13 | - Dynamic Programming (DP) methods assume that we have a perfect model of the environment's Markov Decision Process (MDP). That's usually not the case in practice, but it's important to study DP anyway. 14 | - Policy Evaluation: Calculates the state-value function `V(s)` for a given policy. In DP this is done using a "full backup". At each state, we look ahead one step at each possible action and next state. We can only do this because we have a perfect model of the environment. 15 | - Full backups are basically the Bellman equations turned into updates. 16 | - Policy Improvement: Given the correct state-value function for a policy we can act greedily with respect to it (i.e. pick the best action at each state). Then we are guaranteed to improve the policy or keep it fixed if it's already optimal. 17 | - Policy Iteration: Iteratively perform Policy Evaluation and Policy Improvement until we reach the optimal policy. 18 | - Value Iteration: Instead of doing multiple steps of Policy Evaluation to find the "correct" V(s) we only do a single step and improve the policy immediately. In practice, this converges faster. 19 | - Generalized Policy Iteration: The process of iteratively doing policy evaluation and improvement. We can pick different algorithms for each of these steps but the basic idea stays the same. 20 | - DP methods bootstrap: They update estimates based on other estimates (one step ahead). 
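As an illustration of the "full backup" described in the summary above, here is a minimal sketch of iterative Policy Evaluation. It is not the repo's solution notebook; the in-place update and the convergence threshold `theta` are choices made for this example. It expects transitions in the same `P[s][a] == [(prob, next_state, reward, done), ...]` format used by the environments in `lib/envs`.

```python
import numpy as np

def policy_eval(policy, P, nS, discount_factor=1.0, theta=1e-8):
    """Evaluate a policy given the full model of the environment.

    policy: array of shape (nS, nA) with action probabilities per state
    P:      transitions, P[s][a] == [(prob, next_state, reward, done), ...]
    """
    V = np.zeros(nS)
    while True:
        delta = 0.0
        for s in range(nS):
            v = 0.0
            # Full backup: look ahead one step over all actions and successors
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, abs(v - V[s]))
            V[s] = v  # in-place update; a two-array version also works
        if delta < theta:
            break
    return V
```

With the `GridworldEnv` from `lib/envs/gridworld.py`, a uniform random policy would be evaluated as `policy_eval(np.ones([env.nS, env.nA]) / env.nA, env.P, env.nS)`.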
21 | 22 | 23 | ### Lectures & Readings 24 | 25 | **Required:** 26 | 27 | - David Silver's RL Course Lecture 3 - Planning by Dynamic Programming ([video](https://www.youtube.com/watch?v=Nd1-UUMVfz4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/DP.pdf)) 28 | 29 | **Optional:** 30 | 31 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 4: Dynamic Programming 32 | 33 | 34 | ### Exercises 35 | 36 | - Implement Policy Evaluation in Python (Gridworld) 37 | - [Exercise](Policy%20Evaluation.ipynb) 38 | - [Solution](Policy%20Evaluation%20Solution.ipynb) 39 | 40 | - Implement Policy Iteration in Python (Gridworld) 41 | - [Exercise](Policy%20Iteration.ipynb) 42 | - [Solution](Policy%20Iteration%20Solution.ipynb) 43 | 44 | - Implement Value Iteration in Python (Gridworld) 45 | - [Exercise](Value%20Iteration.ipynb) 46 | - [Solution](Value%20Iteration%20Solution.ipynb) 47 | 48 | - Implement Gambler's Problem 49 | - [Exercise](Gamblers%20Problem.ipynb) 50 | - [Solution](Gamblers%20Problem%20Solution.ipynb) -------------------------------------------------------------------------------- /TD/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Free Prediction & Control with Temporal Difference (TD) and Q-Learning 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand TD(0) for prediction 7 | - Understand SARSA for on-policy control 8 | - Understand Q-Learning for off-policy control 9 | - Understand the benefits of TD algorithms over MC and DP approaches 10 | - Understand how n-step methods unify MC and TD approaches 11 | - Understand the backward and forward view of TD-Lambda 12 | 13 | 14 | ### Summary 15 | 16 | - TD-Learning is a combination of Monte Carlo and Dynamic Programming ideas. Like Monte Carlo, TD works based on samples and doesn't require a model of the environment. Like Dynamic Programming, TD uses bootstrapping to make updates. 17 | - Whether MC or TD is better depends on the problem and there are no theoretical results that prove a clear winner. 18 | - General Update Rule: `Q[s,a] += learning_rate * (td_target - Q[s,a])`. `td_target - Q[s,a]` is also called the TD Error. 19 | - SARSA: On-Policy TD Control 20 | - TD Target for SARSA: `R[t+1] + discount_factor * Q[next_state][next_action]` 21 | - Q-Learning: Off-policy TD Control 22 | - TD Target for Q-Learning: `R[t+1] + discount_factor * max(Q[next_state])` 23 | - Q-Learning has a positive bias because it uses the maximum of estimated Q values to estimate the maximum action value, all from the same experience. Double Q-Learning gets around this by splitting the experience and using different Q functions for maximization and estimation. 24 | - N-Step methods unify MC and TD approaches. They making updates based on n-steps instead of a single step (TD-0) or a full episode (MC). 
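To make the general update rule and the two TD targets above concrete, here is a minimal tabular sketch. The step size, discount factor, and toy state indices are made-up values for illustration; this is not the SARSA or Q-Learning exercise solution.

```python
import numpy as np
from collections import defaultdict

def sarsa_update(Q, s, a, reward, s_next, a_next, alpha=0.5, discount_factor=1.0):
    # On-policy: the target uses the action that will actually be taken next
    td_target = reward + discount_factor * Q[s_next][a_next]
    Q[s][a] += alpha * (td_target - Q[s][a])

def q_learning_update(Q, s, a, reward, s_next, alpha=0.5, discount_factor=1.0):
    # Off-policy: the target uses the greedy action in the next state
    td_target = reward + discount_factor * np.max(Q[s_next])
    Q[s][a] += alpha * (td_target - Q[s][a])

# Toy usage: Q maps a state index to an array of action values
n_actions = 4
Q = defaultdict(lambda: np.zeros(n_actions))
q_learning_update(Q, s=36, a=0, reward=-1.0, s_next=24)
```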
25 | 26 | 27 | ### Lectures & Readings 28 | 29 | **Required:** 30 | 31 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 6: Temporal-Difference Learning 32 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) 33 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) 34 | 35 | **Optional:** 36 | 37 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 7: Multi-Step Bootstrapping 38 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 12: Eligibility Traces 39 | 40 | 41 | ### Exercises 42 | 43 | - Get familiar with the [Windy Gridworld Playground](Windy%20Gridworld%20Playground.ipynb) 44 | - Implement SARSA 45 | - [Exercise](SARSA.ipynb) 46 | - [Solution](SARSA%20Solution.ipynb) 47 | - Get familiar with the [Cliff Environment Playground](Cliff%20Environment%20Playground.ipynb) 48 | - Implement Q-Learning in Python 49 | - [Exercise](Q-Learning.ipynb) 50 | - [Solution](Q-Learning%20Solution.ipynb) 51 | -------------------------------------------------------------------------------- /lib/envs/windy_gridworld.py: -------------------------------------------------------------------------------- 1 | import io 2 | import gym 3 | import numpy as np 4 | import sys 5 | 6 | from . import discrete 7 | 8 | UP = 0 9 | RIGHT = 1 10 | DOWN = 2 11 | LEFT = 3 12 | 13 | class WindyGridworldEnv(discrete.DiscreteEnv): 14 | 15 | metadata = {'render.modes': ['human', 'ansi']} 16 | 17 | def _limit_coordinates(self, coord): 18 | coord[0] = min(coord[0], self.shape[0] - 1) 19 | coord[0] = max(coord[0], 0) 20 | coord[1] = min(coord[1], self.shape[1] - 1) 21 | coord[1] = max(coord[1], 0) 22 | return coord 23 | 24 | def _calculate_transition_prob(self, current, delta, winds): 25 | new_position = np.array(current) + np.array(delta) + np.array([-1, 0]) * winds[tuple(current)] 26 | new_position = self._limit_coordinates(new_position).astype(int) 27 | new_state = np.ravel_multi_index(tuple(new_position), self.shape) 28 | is_done = tuple(new_position) == (3, 7) 29 | return [(1.0, new_state, -1.0, is_done)] 30 | 31 | def __init__(self): 32 | self.shape = (7, 10) 33 | 34 | nS = np.prod(self.shape) 35 | nA = 4 36 | 37 | # Wind strength 38 | winds = np.zeros(self.shape) 39 | winds[:,[3,4,5,8]] = 1 40 | winds[:,[6,7]] = 2 41 | 42 | # Calculate transition probabilities 43 | P = {} 44 | for s in range(nS): 45 | position = np.unravel_index(s, self.shape) 46 | P[s] = { a : [] for a in range(nA) } 47 | P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds) 48 | P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds) 49 | P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds) 50 | P[s][LEFT] = self._calculate_transition_prob(position, [0, -1], winds) 51 | 52 | # We always start in state (3, 0) 53 | isd = np.zeros(nS) 54 | isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 55 | 56 | super(WindyGridworldEnv, self).__init__(nS, nA, P, isd) 57 | 58 | def render(self, mode='human', close=False): 59 | self._render(mode, close) 60 | 61 | def _render(self, mode='human', close=False): 62 | if close: 63 | return 64 | 65 | outfile = io.StringIO() if mode == 'ansi' else 
sys.stdout 66 | 67 | for s in range(self.nS): 68 | position = np.unravel_index(s, self.shape) 69 | # print(self.s) 70 | if self.s == s: 71 | output = " x " 72 | elif position == (3,7): 73 | output = " T " 74 | else: 75 | output = " o " 76 | 77 | if position[1] == 0: 78 | output = output.lstrip() 79 | if position[1] == self.shape[1] - 1: 80 | output = output.rstrip() 81 | output += "\n" 82 | 83 | outfile.write(output) 84 | outfile.write("\n") 85 | -------------------------------------------------------------------------------- /lib/envs/cliff_walking.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import sys 4 | 5 | from . import discrete 6 | 7 | UP = 0 8 | RIGHT = 1 9 | DOWN = 2 10 | LEFT = 3 11 | 12 | class CliffWalkingEnv(discrete.DiscreteEnv): 13 | 14 | metadata = {'render.modes': ['human', 'ansi']} 15 | 16 | def _limit_coordinates(self, coord): 17 | coord[0] = min(coord[0], self.shape[0] - 1) 18 | coord[0] = max(coord[0], 0) 19 | coord[1] = min(coord[1], self.shape[1] - 1) 20 | coord[1] = max(coord[1], 0) 21 | return coord 22 | 23 | def _calculate_transition_prob(self, current, delta): 24 | new_position = np.array(current) + np.array(delta) 25 | new_position = self._limit_coordinates(new_position).astype(int) 26 | new_state = np.ravel_multi_index(tuple(new_position), self.shape) 27 | reward = -100.0 if self._cliff[tuple(new_position)] else -1.0 28 | is_done = self._cliff[tuple(new_position)] or (tuple(new_position) == (3,11)) 29 | return [(1.0, new_state, reward, is_done)] 30 | 31 | def __init__(self): 32 | self.shape = (4, 12) 33 | 34 | nS = np.prod(self.shape) 35 | nA = 4 36 | 37 | # Cliff Location 38 | self._cliff = np.zeros(self.shape, dtype=np.bool) 39 | self._cliff[3, 1:-1] = True 40 | 41 | # Calculate transition probabilities 42 | P = {} 43 | for s in range(nS): 44 | position = np.unravel_index(s, self.shape) 45 | P[s] = { a : [] for a in range(nA) } 46 | P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) 47 | P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) 48 | P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) 49 | P[s][LEFT] = self._calculate_transition_prob(position, [0, -1]) 50 | 51 | # We always start in state (3, 0) 52 | isd = np.zeros(nS) 53 | isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 54 | 55 | super(CliffWalkingEnv, self).__init__(nS, nA, P, isd) 56 | 57 | def render(self, mode='human', close=False): 58 | self._render(mode, close) 59 | 60 | def _render(self, mode='human', close=False): 61 | if close: 62 | return 63 | 64 | outfile = io.StringIO() if mode == 'ansi' else sys.stdout 65 | 66 | for s in range(self.nS): 67 | position = np.unravel_index(s, self.shape) 68 | # print(self.s) 69 | if self.s == s: 70 | output = " x " 71 | elif position == (3,11): 72 | output = " T " 73 | elif self._cliff[position]: 74 | output = " C " 75 | else: 76 | output = " o " 77 | 78 | if position[1] == 0: 79 | output = output.lstrip() 80 | if position[1] == self.shape[1] - 1: 81 | output = output.rstrip() 82 | output += "\n" 83 | 84 | outfile.write(output) 85 | outfile.write("\n") 86 | -------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Q-Learning 2 | 3 | ### Learning Goals 4 | 5 | - Understand the Deep Q-Learning (DQN) algorithm 6 | - Understand why Experience Replay and a Target Network are necessary to make Deep Q-Learning work in 
practice 7 | - (Optional) Understand Double Deep Q-Learning 8 | - (Optional) Understand Prioritized Experience Replay 9 | 10 | 11 | ### Summary 12 | 13 | - DQN: Q-Learning but with a Deep Neural Network as a function approximator. 14 | - Using a non-linear Deep Neural Network is powerful, but training is unstable if we apply it naively. 15 | - Trick 1 - Experience Replay: Store experience `(S, A, R, S_next)` in a replay buffer and sample minibatches from it to train the network. This decorrelates the data and leads to better data efficiency. In the beginning, the replay buffer is filled with random experience. 16 | - Trick 2 - Target Network: Use a separate network to estimate the TD target. This target network has the same architecture as the function approximator but with frozen parameters. Every T steps (a hyperparameter) the parameters from the Q network are copied to the target network. This leads to more stable training because it keeps the target function fixed (for a while). 17 | - By using a Convolutional Neural Network as the function approximator on raw pixels of Atari games where the score is the reward we can learn to play many of those games at human-like performance. 18 | - Double DQN: Just like regular Q-Learning, DQN tends to overestimate values due to its max operation applied to both selecting and estimating actions. We get around this by using the Q network for selection and the target network for estimation when making updates. 19 | 20 | 21 | ### Lectures & Readings 22 | 23 | **Required:** 24 | 25 | - [Human-Level Control through Deep Reinforcement Learning](http://www.readcube.com/articles/10.1038/nature14236) 26 | - [Demystifying Deep Reinforcement Learning](https://ai.intel.com/demystifying-deep-reinforcement-learning/) 27 | - David Silver's RL Course Lecture 6 - Value Function Approximation ([video](https://www.youtube.com/watch?v=UoPei5o4fps), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf)) 28 | 29 | **Optional:** 30 | 31 | - [Using Keras and Deep Q-Network to Play FlappyBird](https://yanpanlau.github.io/2016/07/10/FlappyBird-Keras.html) 32 | - [Deep Reinforcement Learning with Double Q-learning](http://arxiv.org/abs/1509.06461) 33 | - [Prioritized Experience Replay](http://arxiv.org/abs/1511.05952) 34 | 35 | **Deep Learning:** 36 | 37 | - [Tensorflow](http://www.tensorflow.org) 38 | - [Deep Learning Books](http://www.deeplearningbook.org/) 39 | 40 | ### Exercises 41 | 42 | - Get familiar with the [OpenAI Gym Atari Environment Playground](Breakout%20Playground.ipynb) 43 | - Deep-Q Learning for Atari Games 44 | - [Exercise](Deep%20Q%20Learning.ipynb) 45 | - [Solution](Deep%20Q%20Learning%20Solution.ipynb) 46 | - Double-Q Learning 47 | - This is a minimal change to Q-Learning so use the same exercise as above 48 | - [Solution](Double%20DQN%20Solution.ipynb) 49 | - Prioritized Experience Replay (WIP) 50 | -------------------------------------------------------------------------------- /MDP/README.md: -------------------------------------------------------------------------------- 1 | ## MDPs and Bellman Equations 2 | 3 | ### Learning Goals 4 | 5 | - Understand the Agent-Environment interface 6 | - Understand what MDPs (Markov Decision Processes) are and how to interpret transition diagrams 7 | - Understand Value Functions, Action-Value Functions, and Policy Functions 8 | - Understand the Bellman Equations and Bellman Optimality Equations for value functions and action-value functions 9 | 10 | 11 | ### Summary 12 | 13 | - Agent & 
Environment Interface: At each step `t` the agent receives a state `S_t`, performs an action `A_t` and receives a reward `R_{t+1}`. The action is chosen according to a policy function `pi`. 14 | - The total return `G_t` is the sum of all rewards starting from time `t`. A reward received `k` steps in the future is discounted by `gamma^k`. 15 | - Markov property: The environment's response at time `t+1` depends only on the state and action representations at time `t`. The future is independent of the past given the present. Even if an environment doesn't fully satisfy the Markov property we still treat it as if it were Markov and try to construct the state representation to be approximately Markov. 16 | - Markov Decision Process (MDP): Defined by a state set `S`, an action set `A` and one-step dynamics `p(s',r | s,a)`. If we have complete knowledge of the environment we know the transition dynamics. In practice, we often don't know the full MDP (but we know that it's some MDP). 17 | - The Value Function `v(s)` estimates how "good" it is for an agent to be in a particular state. More formally, it's the expected return `G_t` given that the agent is in state `s`. `v(s) = Ex[G_t | S_t = s]`. Note that the value function is specific to a given policy `pi`. 18 | - The Action-Value Function `q(s, a)` estimates how "good" it is for an agent to be in state `s` and take action `a`. Similar to the value function, but it also considers the action. 19 | - The Bellman equation expresses the relationship between the value of a state and the values of its successor states. It can be expressed using a "backup" diagram. Bellman equations exist for both the value function and the action-value function. 20 | - Value functions define an ordering over policies. A policy `p1` is better than `p2` if `v_p1(s) >= v_p2(s)` for all states `s`. For MDPs, there exist one or more optimal policies that are better than or equal to all other policies. 21 | - The optimal state-value function `v*(s)` is the value function for the optimal policy. The same holds for `q*(s, a)`. The Bellman Optimality Equation defines how the optimal value of a state is related to the optimal value of successor states. It has a "max" instead of an average. 22 | 23 | 24 | ### Lectures & Readings 25 | 26 | **Required:** 27 | 28 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 3: Finite Markov Decision Processes 29 | - David Silver's RL Course Lecture 2 - Markov Decision Processes ([video](https://www.youtube.com/watch?v=lfHX2hHRMVQ), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MDP.pdf)) 30 | 31 | 32 | ### Exercises 33 | 34 | This chapter is mostly theory so there are no exercises.
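Although there are no exercises, the Bellman expectation equation can be checked numerically in a few lines. The two-state MDP below (its transition probabilities and rewards) is entirely made up for illustration.

```python
import numpy as np

discount_factor = 0.9
P_pi = np.array([[0.8, 0.2],   # state-to-state transition probabilities under pi
                 [0.1, 0.9]])
r_pi = np.array([1.0, -1.0])   # expected immediate reward in each state under pi

# Solve the Bellman expectation equation v = r_pi + gamma * P_pi @ v directly
v = np.linalg.solve(np.eye(2) - discount_factor * P_pi, r_pi)

# A one-step Bellman backup leaves the true value function unchanged
assert np.allclose(v, r_pi + discount_factor * P_pi @ v)
```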
35 | -------------------------------------------------------------------------------- /TD/Cliff Environment Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import gym\n", 10 | "import numpy as np\n", 11 | "import sys\n", 12 | "\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "\n", 16 | "from lib.envs.cliff_walking import CliffWalkingEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "36\n", 29 | "o o o o o o o o o o o o\n", 30 | "o o o o o o o o o o o o\n", 31 | "o o o o o o o o o o o o\n", 32 | "x C C C C C C C C C C T\n", 33 | "\n", 34 | "(24, -1.0, False, {'prob': 1.0})\n", 35 | "o o o o o o o o o o o o\n", 36 | "o o o o o o o o o o o o\n", 37 | "x o o o o o o o o o o o\n", 38 | "o C C C C C C C C C C T\n", 39 | "\n", 40 | "(25, -1.0, False, {'prob': 1.0})\n", 41 | "o o o o o o o o o o o o\n", 42 | "o o o o o o o o o o o o\n", 43 | "o x o o o o o o o o o o\n", 44 | "o C C C C C C C C C C T\n", 45 | "\n", 46 | "(26, -1.0, False, {'prob': 1.0})\n", 47 | "o o o o o o o o o o o o\n", 48 | "o o o o o o o o o o o o\n", 49 | "o o x o o o o o o o o o\n", 50 | "o C C C C C C C C C C T\n", 51 | "\n", 52 | "(38, -100.0, True, {'prob': 1.0})\n", 53 | "o o o o o o o o o o o o\n", 54 | "o o o o o o o o o o o o\n", 55 | "o o o o o o o o o o o o\n", 56 | "o C x C C C C C C C C T\n", 57 | "\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "env = CliffWalkingEnv()\n", 63 | "\n", 64 | "print(env.reset())\n", 65 | "env.render()\n", 66 | "\n", 67 | "print(env.step(0))\n", 68 | "env.render()\n", 69 | "\n", 70 | "print(env.step(1))\n", 71 | "env.render()\n", 72 | "\n", 73 | "print(env.step(1))\n", 74 | "env.render()\n", 75 | "\n", 76 | "print(env.step(2))\n", 77 | "env.render()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.6.4" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 1 109 | } 110 | -------------------------------------------------------------------------------- /MC/README.md: -------------------------------------------------------------------------------- 1 | ## Model-Free Prediction & Control with Monte Carlo (MC) 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand the difference between Prediction and Control 7 | - Know how to use the MC method for predicting state values and state-action values 8 | - Understand the on-policy first-visit MC control algorithm 9 | - Understand off-policy MC control algorithms 10 | - Understand Weighted Importance Sampling 11 | - Understand the benefits of MC algorithms over the Dynamic Programming approach 12 | 13 | 14 | ### Summary 15 | 16 | - Dynamic Programming approaches assume complete knowledge of the environment (the MDP). 
In practice, we often don't have full knowledge of how the world works. 17 | - Monte Carlo (MC) methods can learn directly from experience collected by interacting with the environment. An episode of experience is a series of `(State, Action, Reward, Next State)` tuples. 18 | - MC methods work based on episodes. We sample episodes of experience and make updates to our estimates at the end of each episode. MC methods have high variance (due to lots of random decisions within an episode) but are unbiased. 19 | - MC Policy Evaluation: Given a policy, we want to estimate the state-value function V(s). Sample episodes of experience and estimate V(s) to be the reward received from that state onwards averaged across all of your experience. The same technique works for the action-value function Q(s, a). Given enough samples, this is proven to converge. 20 | - MC Control: Idea is the same as for Dynamic Programming. Use MC Policy Evaluation to evaluate the current policy then improve the policy greedily. The Problem: How do we ensure that we explore all states if we don't know the full environment? 21 | - Solution to exploration problem: Use epsilon-greedy policies instead of full greedy policies. When making a decision act randomly with probability epsilon. This will learn the optimal epsilon-greedy policy. 22 | - Off-Policy Learning: How can we learn about the actual optimal (greedy) policy while following an exploratory (epsilon-greedy) policy? We can use importance sampling, which weighs returns by their probability of occurring under the policy we want to learn about. 23 | 24 | 25 | ### Lectures & Readings 26 | 27 | **Required:** 28 | 29 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 5: Monte Carlo Methods 30 | 31 | 32 | **Optional:** 33 | 34 | - David Silver's RL Course Lecture 4 - Model-Free Prediction ([video](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/MC-TD.pdf)) 35 | - David Silver's RL Course Lecture 5 - Model-Free Control ([video](https://www.youtube.com/watch?v=0g4j2k_Ggc4), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/control.pdf)) 36 | 37 | 38 | ### Exercises 39 | 40 | - Get familiar with the [Blackjack environment (Blackjack-v0)](Blackjack%20Playground.ipynb) 41 | - Implement the Monte Carlo Prediction to estimate state-action values 42 | - [Exercise](MC%20Prediction.ipynb) 43 | - [Solution](MC%20Prediction%20Solution.ipynb) 44 | - Implement the on-policy first-visit Monte Carlo Control algorithm 45 | - [Exercise](MC%20Control%20with%20Epsilon-Greedy%20Policies.ipynb) 46 | - [Solution](MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) 47 | - Implement the off-policy every-visit Monte Carlo Control using Weighted Important Sampling algorithm 48 | - [Exercise](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling.ipynb) 49 | - [Solution](Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) 50 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/worker_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import sys 3 | import os 4 | import itertools 5 | import collections 6 | import unittest 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from inspect import getsourcefile 11 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 12 | 
import_path = os.path.abspath(os.path.join(current_path, "../..")) 13 | 14 | if import_path not in sys.path: 15 | sys.path.append(import_path) 16 | 17 | # from lib import plotting 18 | from lib.atari.state_processor import StateProcessor 19 | from lib.atari import helpers as atari_helpers 20 | from worker import Worker 21 | from estimators import ValueEstimator, PolicyEstimator 22 | 23 | def make_env(): 24 | return gym.envs.make("Breakout-v0") 25 | 26 | VALID_ACTIONS = [0, 1, 2, 3] 27 | 28 | class WorkerTest(tf.test.TestCase): 29 | def setUp(self): 30 | super(WorkerTest, self).setUp() 31 | 32 | self.env = make_env() 33 | self.discount_factor = 0.99 34 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 35 | self.global_counter = itertools.count() 36 | self.sp = StateProcessor() 37 | 38 | with tf.variable_scope("global") as vs: 39 | self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS)) 40 | self.global_value_net = ValueEstimator(reuse=True) 41 | 42 | def testPolicyNetPredict(self): 43 | w = Worker( 44 | name="test", 45 | env=make_env(), 46 | policy_net=self.global_policy_net, 47 | value_net=self.global_value_net, 48 | global_counter=self.global_counter, 49 | discount_factor=self.discount_factor) 50 | 51 | with self.test_session() as sess: 52 | sess.run(tf.initialize_all_variables()) 53 | state = self.sp.process(self.env.reset()) 54 | processed_state = atari_helpers.atari_make_initial_state(state) 55 | action_values = w._policy_net_predict(processed_state, sess) 56 | self.assertEqual(action_values.shape, (4,)) 57 | 58 | 59 | def testValueNetPredict(self): 60 | w = Worker( 61 | name="test", 62 | env=make_env(), 63 | policy_net=self.global_policy_net, 64 | value_net=self.global_value_net, 65 | global_counter=self.global_counter, 66 | discount_factor=self.discount_factor) 67 | 68 | with self.test_session() as sess: 69 | sess.run(tf.initialize_all_variables()) 70 | state = self.sp.process(self.env.reset()) 71 | processed_state = atari_helpers.atari_make_initial_state(state) 72 | state_value = w._value_net_predict(processed_state, sess) 73 | self.assertEqual(state_value.shape, ()) 74 | 75 | def testRunNStepsAndUpdate(self): 76 | w = Worker( 77 | name="test", 78 | env=make_env(), 79 | policy_net=self.global_policy_net, 80 | value_net=self.global_value_net, 81 | global_counter=self.global_counter, 82 | discount_factor=self.discount_factor) 83 | 84 | with self.test_session() as sess: 85 | sess.run(tf.initialize_all_variables()) 86 | state = self.sp.process(self.env.reset()) 87 | processed_state = atari_helpers.atari_make_initial_state(state) 88 | w.state = processed_state 89 | transitions, local_t, global_t = w.run_n_steps(10, sess) 90 | policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(transitions, sess) 91 | 92 | self.assertEqual(len(transitions), 10) 93 | self.assertIsNotNone(policy_net_loss) 94 | self.assertIsNotNone(value_net_loss) 95 | self.assertIsNotNone(policy_net_summaries) 96 | self.assertIsNotNone(value_net_summaries) 97 | 98 | 99 | if __name__ == '__main__': 100 | unittest.main() -------------------------------------------------------------------------------- /lib/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import pandas as pd 4 | from collections import namedtuple 5 | from matplotlib import pyplot as plt 6 | from mpl_toolkits.mplot3d import Axes3D 7 | 8 | EpisodeStats = namedtuple("Stats",["episode_lengths", 
"episode_rewards"]) 9 | 10 | def plot_cost_to_go_mountain_car(env, estimator, num_tiles=20): 11 | x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=num_tiles) 12 | y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=num_tiles) 13 | X, Y = np.meshgrid(x, y) 14 | Z = np.apply_along_axis(lambda _: -np.max(estimator.predict(_)), 2, np.dstack([X, Y])) 15 | 16 | fig = plt.figure(figsize=(10, 5)) 17 | ax = fig.add_subplot(111, projection='3d') 18 | surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, 19 | cmap=matplotlib.cm.coolwarm, vmin=-1.0, vmax=1.0) 20 | ax.set_xlabel('Position') 21 | ax.set_ylabel('Velocity') 22 | ax.set_zlabel('Value') 23 | ax.set_title("Mountain \"Cost To Go\" Function") 24 | fig.colorbar(surf) 25 | plt.show() 26 | 27 | 28 | def plot_value_function(V, title="Value Function"): 29 | """ 30 | Plots the value function as a surface plot. 31 | """ 32 | min_x = min(k[0] for k in V.keys()) 33 | max_x = max(k[0] for k in V.keys()) 34 | min_y = min(k[1] for k in V.keys()) 35 | max_y = max(k[1] for k in V.keys()) 36 | 37 | x_range = np.arange(min_x, max_x + 1) 38 | y_range = np.arange(min_y, max_y + 1) 39 | X, Y = np.meshgrid(x_range, y_range) 40 | 41 | # Find value for all (x, y) coordinates 42 | Z_noace = np.apply_along_axis(lambda _: V[(_[0], _[1], False)], 2, np.dstack([X, Y])) 43 | Z_ace = np.apply_along_axis(lambda _: V[(_[0], _[1], True)], 2, np.dstack([X, Y])) 44 | 45 | def plot_surface(X, Y, Z, title): 46 | fig = plt.figure(figsize=(20, 10)) 47 | ax = fig.add_subplot(111, projection='3d') 48 | surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, 49 | cmap=matplotlib.cm.coolwarm, vmin=-1.0, vmax=1.0) 50 | ax.set_xlabel('Player Sum') 51 | ax.set_ylabel('Dealer Showing') 52 | ax.set_zlabel('Value') 53 | ax.set_title(title) 54 | ax.view_init(ax.elev, -120) 55 | fig.colorbar(surf) 56 | plt.show() 57 | 58 | plot_surface(X, Y, Z_noace, "{} (No Usable Ace)".format(title)) 59 | plot_surface(X, Y, Z_ace, "{} (Usable Ace)".format(title)) 60 | 61 | 62 | 63 | def plot_episode_stats(stats, smoothing_window=10, noshow=False): 64 | # Plot the episode length over time 65 | fig1 = plt.figure(figsize=(10,5)) 66 | plt.plot(stats.episode_lengths) 67 | plt.xlabel("Episode") 68 | plt.ylabel("Episode Length") 69 | plt.title("Episode Length over Time") 70 | if noshow: 71 | plt.close(fig1) 72 | else: 73 | plt.show(fig1) 74 | 75 | # Plot the episode reward over time 76 | fig2 = plt.figure(figsize=(10,5)) 77 | rewards_smoothed = pd.Series(stats.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean() 78 | plt.plot(rewards_smoothed) 79 | plt.xlabel("Episode") 80 | plt.ylabel("Episode Reward (Smoothed)") 81 | plt.title("Episode Reward over Time (Smoothed over window size {})".format(smoothing_window)) 82 | if noshow: 83 | plt.close(fig2) 84 | else: 85 | plt.show(fig2) 86 | 87 | # Plot time steps and episode number 88 | fig3 = plt.figure(figsize=(10,5)) 89 | plt.plot(np.cumsum(stats.episode_lengths), np.arange(len(stats.episode_lengths))) 90 | plt.xlabel("Time Steps") 91 | plt.ylabel("Episode") 92 | plt.title("Episode per time step") 93 | if noshow: 94 | plt.close(fig3) 95 | else: 96 | plt.show(fig3) 97 | 98 | return fig1, fig2, fig3 99 | -------------------------------------------------------------------------------- /MC/MC Prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 
| "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import matplotlib\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "\n", 20 | "if \"../\" not in sys.path:\n", 21 | " sys.path.append(\"../\") \n", 22 | "from lib.envs.blackjack import BlackjackEnv\n", 23 | "from lib import plotting\n", 24 | "\n", 25 | "matplotlib.style.use('ggplot')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "env = BlackjackEnv()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "def mc_prediction(policy, env, num_episodes, discount_factor=1.0):\n", 48 | " \"\"\"\n", 49 | " Monte Carlo prediction algorithm. Calculates the value function\n", 50 | " for a given policy using sampling.\n", 51 | " \n", 52 | " Args:\n", 53 | " policy: A function that maps an observation to action probabilities.\n", 54 | " env: OpenAI gym environment.\n", 55 | " num_episodes: Number of episodes to sample.\n", 56 | " discount_factor: Gamma discount factor.\n", 57 | " \n", 58 | " Returns:\n", 59 | " A dictionary that maps from state -> value.\n", 60 | " The state is a tuple and the value is a float.\n", 61 | " \"\"\"\n", 62 | "\n", 63 | " # Keeps track of sum and count of returns for each state\n", 64 | " # to calculate an average. We could use an array to save all\n", 65 | " # returns (like in the book) but that's memory inefficient.\n", 66 | " returns_sum = defaultdict(float)\n", 67 | " returns_count = defaultdict(float)\n", 68 | " \n", 69 | " # The final value function\n", 70 | " V = defaultdict(float)\n", 71 | " \n", 72 | " # Implement this!\n", 73 | "\n", 74 | " return V " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "def sample_policy(observation):\n", 86 | " \"\"\"\n", 87 | " A policy that sticks if the player score is > 20 and hits otherwise.\n", 88 | " \"\"\"\n", 89 | " score, dealer_score, usable_ace = observation\n", 90 | " return 0 if score >= 20 else 1" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true, 98 | "scrolled": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "V_10k = mc_prediction(sample_policy, env, num_episodes=10000)\n", 103 | "plotting.plot_value_function(V_10k, title=\"10,000 Steps\")\n", 104 | "\n", 105 | "V_500k = mc_prediction(sample_policy, env, num_episodes=500000)\n", 106 | "plotting.plot_value_function(V_500k, title=\"500,000 Steps\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.5.2" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 1 140 | } 141 | 
-------------------------------------------------------------------------------- /PolicyGradient/a3c/policy_monitor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | import collections 5 | import numpy as np 6 | import tensorflow as tf 7 | import time 8 | 9 | from inspect import getsourcefile 10 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 11 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 12 | 13 | if import_path not in sys.path: 14 | sys.path.append(import_path) 15 | 16 | from gym.wrappers import Monitor 17 | import gym 18 | 19 | from lib.atari.state_processor import StateProcessor 20 | from lib.atari import helpers as atari_helpers 21 | from estimators import ValueEstimator, PolicyEstimator 22 | from worker import make_copy_params_op 23 | 24 | 25 | class PolicyMonitor(object): 26 | """ 27 | Helps evaluating a policy by running an episode in an environment, 28 | saving a video, and plotting summaries to Tensorboard. 29 | 30 | Args: 31 | env: environment to run in 32 | policy_net: A policy estimator 33 | summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries 34 | """ 35 | def __init__(self, env, policy_net, summary_writer, saver=None): 36 | 37 | self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos") 38 | self.video_dir = os.path.abspath(self.video_dir) 39 | 40 | self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True) 41 | self.global_policy_net = policy_net 42 | self.summary_writer = summary_writer 43 | self.saver = saver 44 | self.sp = StateProcessor() 45 | 46 | self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model")) 47 | 48 | try: 49 | os.makedirs(self.video_dir) 50 | except FileExistsError: 51 | pass 52 | 53 | # Local policy net 54 | with tf.variable_scope("policy_eval"): 55 | self.policy_net = PolicyEstimator(policy_net.num_outputs) 56 | 57 | # Op to copy params from global policy/value net parameters 58 | self.copy_params_op = make_copy_params_op( 59 | tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES), 60 | tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES)) 61 | 62 | def _policy_net_predict(self, state, sess): 63 | feed_dict = { self.policy_net.states: [state] } 64 | preds = sess.run(self.policy_net.predictions, feed_dict) 65 | return preds["probs"][0] 66 | 67 | def eval_once(self, sess): 68 | with sess.as_default(), sess.graph.as_default(): 69 | # Copy params to local model 70 | global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op]) 71 | 72 | # Run an episode 73 | done = False 74 | state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset())) 75 | total_reward = 0.0 76 | episode_length = 0 77 | while not done: 78 | action_probs = self._policy_net_predict(state, sess) 79 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 80 | next_state, reward, done, _ = self.env.step(action) 81 | next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state)) 82 | total_reward += reward 83 | episode_length += 1 84 | state = next_state 85 | 86 | # Add summaries 87 | episode_summary = tf.Summary() 88 | episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward") 89 | episode_summary.value.add(simple_value=episode_length, 
tag="eval/episode_length") 90 | self.summary_writer.add_summary(episode_summary, global_step) 91 | self.summary_writer.flush() 92 | 93 | if self.saver is not None: 94 | self.saver.save(sess, self.checkpoint_path) 95 | 96 | tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length)) 97 | 98 | return total_reward, episode_length 99 | 100 | def continuous_eval(self, eval_every, sess, coord): 101 | """ 102 | Continuously evaluates the policy every [eval_every] seconds. 103 | """ 104 | try: 105 | while not coord.should_stop(): 106 | self.eval_once(sess) 107 | # Sleep until next evaluation cycle 108 | time.sleep(eval_every) 109 | except tf.errors.CancelledError: 110 | return 111 | -------------------------------------------------------------------------------- /lib/envs/gridworld.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import sys 4 | 5 | from . import discrete 6 | 7 | UP = 0 8 | RIGHT = 1 9 | DOWN = 2 10 | LEFT = 3 11 | 12 | class GridworldEnv(discrete.DiscreteEnv): 13 | """ 14 | Grid World environment from Sutton's Reinforcement Learning book chapter 4. 15 | You are an agent on an MxN grid and your goal is to reach the terminal 16 | state at the top left or the bottom right corner. 17 | 18 | For example, a 4x4 grid looks as follows: 19 | 20 | T o o o 21 | o x o o 22 | o o o o 23 | o o o T 24 | 25 | x is your position and T are the two terminal states. 26 | 27 | You can take actions in each direction (UP=0, RIGHT=1, DOWN=2, LEFT=3). 28 | Actions going off the edge leave you in your current state. 29 | You receive a reward of -1 at each step until you reach a terminal state. 30 | """ 31 | 32 | metadata = {'render.modes': ['human', 'ansi']} 33 | 34 | def __init__(self, shape=[4,4]): 35 | if not isinstance(shape, (list, tuple)) or not len(shape) == 2: 36 | raise ValueError('shape argument must be a list/tuple of length 2') 37 | 38 | self.shape = shape 39 | 40 | nS = np.prod(shape) 41 | nA = 4 42 | 43 | MAX_Y = shape[0] 44 | MAX_X = shape[1] 45 | 46 | P = {} 47 | grid = np.arange(nS).reshape(shape) 48 | it = np.nditer(grid, flags=['multi_index']) 49 | 50 | while not it.finished: 51 | s = it.iterindex 52 | y, x = it.multi_index 53 | 54 | # P[s][a] = (prob, next_state, reward, is_done) 55 | P[s] = {a : [] for a in range(nA)} 56 | 57 | is_done = lambda s: s == 0 or s == (nS - 1) 58 | reward = 0.0 if is_done(s) else -1.0 59 | 60 | # We're stuck in a terminal state 61 | if is_done(s): 62 | P[s][UP] = [(1.0, s, reward, True)] 63 | P[s][RIGHT] = [(1.0, s, reward, True)] 64 | P[s][DOWN] = [(1.0, s, reward, True)] 65 | P[s][LEFT] = [(1.0, s, reward, True)] 66 | # Not a terminal state 67 | else: 68 | ns_up = s if y == 0 else s - MAX_X 69 | ns_right = s if x == (MAX_X - 1) else s + 1 70 | ns_down = s if y == (MAX_Y - 1) else s + MAX_X 71 | ns_left = s if x == 0 else s - 1 72 | P[s][UP] = [(1.0, ns_up, reward, is_done(ns_up))] 73 | P[s][RIGHT] = [(1.0, ns_right, reward, is_done(ns_right))] 74 | P[s][DOWN] = [(1.0, ns_down, reward, is_done(ns_down))] 75 | P[s][LEFT] = [(1.0, ns_left, reward, is_done(ns_left))] 76 | 77 | it.iternext() 78 | 79 | # Initial state distribution is uniform 80 | isd = np.ones(nS) / nS 81 | 82 | # We expose the model of the environment for educational purposes 83 | # This should not be used in any model-free learning algorithm 84 | self.P = P 85 | 86 | super(GridworldEnv, self).__init__(nS, nA, P, isd) 87 | 88 | def _render(self, 
mode='human', close=False): 89 | """ Renders the current gridworld layout 90 | 91 | For example, a 4x4 grid with the mode="human" looks like: 92 | T o o o 93 | o x o o 94 | o o o o 95 | o o o T 96 | where x is your position and T are the two terminal states. 97 | """ 98 | if close: 99 | return 100 | 101 | outfile = io.StringIO() if mode == 'ansi' else sys.stdout 102 | 103 | grid = np.arange(self.nS).reshape(self.shape) 104 | it = np.nditer(grid, flags=['multi_index']) 105 | while not it.finished: 106 | s = it.iterindex 107 | y, x = it.multi_index 108 | 109 | if self.s == s: 110 | output = " x " 111 | elif s == 0 or s == self.nS - 1: 112 | output = " T " 113 | else: 114 | output = " o " 115 | 116 | if x == 0: 117 | output = output.lstrip() 118 | if x == self.shape[1] - 1: 119 | output = output.rstrip() 120 | 121 | outfile.write(output) 122 | 123 | if x == self.shape[1] - 1: 124 | outfile.write("\n") 125 | 126 | it.iternext() 127 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/estimator_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import gym 3 | import sys 4 | import os 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from inspect import getsourcefile 9 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 10 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 11 | 12 | if import_path not in sys.path: 13 | sys.path.append(import_path) 14 | 15 | # from lib import plotting 16 | from lib.atari.state_processor import StateProcessor 17 | from lib.atari import helpers as atari_helpers 18 | from estimators import ValueEstimator, PolicyEstimator 19 | 20 | 21 | def make_env(): 22 | return gym.envs.make("Breakout-v0") 23 | 24 | VALID_ACTIONS = [0, 1, 2, 3] 25 | 26 | class PolicyEstimatorTest(tf.test.TestCase): 27 | def testPredict(self): 28 | env = make_env() 29 | sp = StateProcessor() 30 | estimator = PolicyEstimator(len(VALID_ACTIONS)) 31 | 32 | with self.test_session() as sess: 33 | sess.run(tf.initialize_all_variables()) 34 | 35 | # Generate a state 36 | state = sp.process(env.reset()) 37 | processed_state = atari_helpers.atari_make_initial_state(state) 38 | processed_states = np.array([processed_state]) 39 | 40 | # Run feeds 41 | feed_dict = { 42 | estimator.states: processed_states, 43 | estimator.targets: [1.0], 44 | estimator.actions: [1] 45 | } 46 | loss = sess.run(estimator.loss, feed_dict) 47 | pred = sess.run(estimator.predictions, feed_dict) 48 | 49 | # Assertions 50 | self.assertTrue(loss != 0.0) 51 | self.assertEqual(pred["probs"].shape, (1, len(VALID_ACTIONS))) 52 | self.assertEqual(pred["logits"].shape, (1, len(VALID_ACTIONS))) 53 | 54 | def testGradient(self): 55 | env = make_env() 56 | sp = StateProcessor() 57 | estimator = PolicyEstimator(len(VALID_ACTIONS)) 58 | grads = [g for g, _ in estimator.grads_and_vars] 59 | 60 | with self.test_session() as sess: 61 | sess.run(tf.initialize_all_variables()) 62 | 63 | # Generate a state 64 | state = sp.process(env.reset()) 65 | processed_state = atari_helpers.atari_make_initial_state(state) 66 | processed_states = np.array([processed_state]) 67 | 68 | # Run feeds to get gradients 69 | feed_dict = { 70 | estimator.states: processed_states, 71 | estimator.targets: [1.0], 72 | estimator.actions: [1] 73 | } 74 | grads_ = sess.run(grads, feed_dict) 75 | 76 | # Apply calculated gradients 77 | grad_feed_dict = { k: v for k, v in zip(grads, grads_) } 78 | _ = 
sess.run(estimator.train_op, grad_feed_dict) 79 | 80 | 81 | class ValueEstimatorTest(tf.test.TestCase): 82 | def testPredict(self): 83 | env = make_env() 84 | sp = StateProcessor() 85 | estimator = ValueEstimator() 86 | 87 | with self.test_session() as sess: 88 | sess.run(tf.initialize_all_variables()) 89 | 90 | # Generate a state 91 | state = sp.process(env.reset()) 92 | processed_state = atari_helpers.atari_make_initial_state(state) 93 | processed_states = np.array([processed_state]) 94 | 95 | # Run feeds 96 | feed_dict = { 97 | estimator.states: processed_states, 98 | estimator.targets: [1.0], 99 | } 100 | loss = sess.run(estimator.loss, feed_dict) 101 | pred = sess.run(estimator.predictions, feed_dict) 102 | 103 | # Assertions 104 | self.assertTrue(loss != 0.0) 105 | self.assertEqual(pred["logits"].shape, (1,)) 106 | 107 | def testGradient(self): 108 | env = make_env() 109 | sp = StateProcessor() 110 | estimator = ValueEstimator() 111 | grads = [g for g, _ in estimator.grads_and_vars] 112 | 113 | with self.test_session() as sess: 114 | sess.run(tf.initialize_all_variables()) 115 | 116 | # Generate a state 117 | state = sp.process(env.reset()) 118 | processed_state = atari_helpers.atari_make_initial_state(state) 119 | processed_states = np.array([processed_state]) 120 | 121 | # Run feeds 122 | feed_dict = { 123 | estimator.states: processed_states, 124 | estimator.targets: [1.0], 125 | } 126 | grads_ = sess.run(grads, feed_dict) 127 | 128 | # Apply calculated gradients 129 | grad_feed_dict = { k: v for k, v in zip(grads, grads_) } 130 | _ = sess.run(estimator.train_op, grad_feed_dict) 131 | 132 | if __name__ == '__main__': 133 | unittest.main() -------------------------------------------------------------------------------- /TD/Windy Gridworld Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import gym\n", 10 | "import numpy as np\n", 11 | "import sys\n", 12 | "\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "\n", 16 | "from lib.envs.windy_gridworld import WindyGridworldEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "30\n", 29 | "o o o o o o o o o o\n", 30 | "o o o o o o o o o o\n", 31 | "o o o o o o o o o o\n", 32 | "x o o o o o o T o o\n", 33 | "o o o o o o o o o o\n", 34 | "o o o o o o o o o o\n", 35 | "o o o o o o o o o o\n", 36 | "\n", 37 | "(31, -1.0, False, {'prob': 1.0})\n", 38 | "o o o o o o o o o o\n", 39 | "o o o o o o o o o o\n", 40 | "o o o o o o o o o o\n", 41 | "o x o o o o o T o o\n", 42 | "o o o o o o o o o o\n", 43 | "o o o o o o o o o o\n", 44 | "o o o o o o o o o o\n", 45 | "\n", 46 | "(32, -1.0, False, {'prob': 1.0})\n", 47 | "o o o o o o o o o o\n", 48 | "o o o o o o o o o o\n", 49 | "o o o o o o o o o o\n", 50 | "o o x o o o o T o o\n", 51 | "o o o o o o o o o o\n", 52 | "o o o o o o o o o o\n", 53 | "o o o o o o o o o o\n", 54 | "\n", 55 | "(33, -1.0, False, {'prob': 1.0})\n", 56 | "o o o o o o o o o o\n", 57 | "o o o o o o o o o o\n", 58 | "o o o o o o o o o o\n", 59 | "o o o x o o o T o o\n", 60 | "o o o o o o o o o o\n", 61 | "o o o o o o o o o o\n", 62 | "o o o o o o o o o o\n", 63 | "\n", 64 | "(33, -1.0, False, {'prob': 1.0})\n", 65 | "o o o o o o o o o o\n", 66 | "o o o o o o o o o 
o\n", 67 | "o o o o o o o o o o\n", 68 | "o o o x o o o T o o\n", 69 | "o o o o o o o o o o\n", 70 | "o o o o o o o o o o\n", 71 | "o o o o o o o o o o\n", 72 | "\n", 73 | "(24, -1.0, False, {'prob': 1.0})\n", 74 | "o o o o o o o o o o\n", 75 | "o o o o o o o o o o\n", 76 | "o o o o x o o o o o\n", 77 | "o o o o o o o T o o\n", 78 | "o o o o o o o o o o\n", 79 | "o o o o o o o o o o\n", 80 | "o o o o o o o o o o\n", 81 | "\n", 82 | "(15, -1.0, False, {'prob': 1.0})\n", 83 | "o o o o o o o o o o\n", 84 | "o o o o o x o o o o\n", 85 | "o o o o o o o o o o\n", 86 | "o o o o o o o T o o\n", 87 | "o o o o o o o o o o\n", 88 | "o o o o o o o o o o\n", 89 | "o o o o o o o o o o\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "env = WindyGridworldEnv()\n", 96 | "\n", 97 | "print(env.reset())\n", 98 | "env.render()\n", 99 | "\n", 100 | "print(env.step(1))\n", 101 | "env.render()\n", 102 | "\n", 103 | "print(env.step(1))\n", 104 | "env.render()\n", 105 | "\n", 106 | "print(env.step(1))\n", 107 | "env.render()\n", 108 | "\n", 109 | "print(env.step(2))\n", 110 | "env.render()\n", 111 | "\n", 112 | "print(env.step(1))\n", 113 | "env.render()\n", 114 | "\n", 115 | "print(env.step(1))\n", 116 | "env.render()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.6.4" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 1 148 | } 149 | -------------------------------------------------------------------------------- /lib/envs/blackjack.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.utils import seeding 4 | 5 | def cmp(a, b): 6 | return int((a > b)) - int((a < b)) 7 | 8 | # 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10 9 | deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] 10 | 11 | 12 | def draw_card(np_random): 13 | return np_random.choice(deck) 14 | 15 | 16 | def draw_hand(np_random): 17 | return [draw_card(np_random), draw_card(np_random)] 18 | 19 | 20 | def usable_ace(hand): # Does this hand have a usable ace? 21 | return 1 in hand and sum(hand) + 10 <= 21 22 | 23 | 24 | def sum_hand(hand): # Return current hand total 25 | if usable_ace(hand): 26 | return sum(hand) + 10 27 | return sum(hand) 28 | 29 | 30 | def is_bust(hand): # Is this hand a bust? 31 | return sum_hand(hand) > 21 32 | 33 | 34 | def score(hand): # What is the score of this hand (0 if bust) 35 | return 0 if is_bust(hand) else sum_hand(hand) 36 | 37 | 38 | def is_natural(hand): # Is this hand a natural blackjack? 39 | return sorted(hand) == [1, 10] 40 | 41 | 42 | class BlackjackEnv(gym.Env): 43 | """Simple blackjack environment 44 | Blackjack is a card game where the goal is to obtain cards that sum to as 45 | near as possible to 21 without going over. They're playing against a fixed 46 | dealer. 47 | Face cards (Jack, Queen, King) have point value 10. 48 | Aces can either count as 11 or 1, and it's called 'usable' at 11. 49 | This game is placed with an infinite deck (or with replacement). 
50 | The game starts with each (player and dealer) having one face up and one 51 | face down card. 52 | The player can request additional cards (hit=1) until they decide to stop 53 | (stick=0) or exceed 21 (bust). 54 | After the player sticks, the dealer reveals their facedown card, and draws 55 | until their sum is 17 or greater. If the dealer goes bust the player wins. 56 | If neither player nor dealer busts, the outcome (win, lose, draw) is 57 | decided by whose sum is closer to 21. The reward for winning is +1, 58 | drawing is 0, and losing is -1. 59 | The observation of a 3-tuple of: the players current sum, 60 | the dealer's one showing card (1-10 where 1 is ace), 61 | and whether or not the player holds a usable ace (0 or 1). 62 | This environment corresponds to the version of the blackjack problem 63 | described in Example 5.1 in Reinforcement Learning: An Introduction 64 | by Sutton and Barto (1998). 65 | https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html 66 | """ 67 | def __init__(self, natural=False): 68 | self.action_space = spaces.Discrete(2) 69 | self.observation_space = spaces.Tuple(( 70 | spaces.Discrete(32), 71 | spaces.Discrete(11), 72 | spaces.Discrete(2))) 73 | self._seed() 74 | 75 | # Flag to payout 1.5 on a "natural" blackjack win, like casino rules 76 | # Ref: http://www.bicyclecards.com/how-to-play/blackjack/ 77 | self.natural = natural 78 | # Start the first game 79 | self._reset() # Number of 80 | self.nA = 2 81 | 82 | def reset(self): 83 | return self._reset() 84 | 85 | def step(self, action): 86 | return self._step(action) 87 | 88 | def _seed(self, seed=None): 89 | self.np_random, seed = seeding.np_random(seed) 90 | return [seed] 91 | 92 | def _step(self, action): 93 | assert self.action_space.contains(action) 94 | if action: # hit: add a card to players hand and return 95 | self.player.append(draw_card(self.np_random)) 96 | if is_bust(self.player): 97 | done = True 98 | reward = -1 99 | else: 100 | done = False 101 | reward = 0 102 | else: # stick: play out the dealers hand, and score 103 | done = True 104 | while sum_hand(self.dealer) < 17: 105 | self.dealer.append(draw_card(self.np_random)) 106 | reward = cmp(score(self.player), score(self.dealer)) 107 | if self.natural and is_natural(self.player) and reward == 1: 108 | reward = 1.5 109 | return self._get_obs(), reward, done, {} 110 | 111 | def _get_obs(self): 112 | return (sum_hand(self.player), self.dealer[0], usable_ace(self.player)) 113 | 114 | def _reset(self): 115 | self.dealer = draw_hand(self.np_random) 116 | self.player = draw_hand(self.np_random) 117 | 118 | # Auto-draw another card if the score is less than 12 119 | while sum_hand(self.player) < 12: 120 | self.player.append(draw_card(self.np_random)) 121 | 122 | return self._get_obs() 123 | -------------------------------------------------------------------------------- /PolicyGradient/README.md: -------------------------------------------------------------------------------- 1 | ## Policy Gradient Methods 2 | 3 | 4 | ### Learning Goals 5 | 6 | - Understand the difference between value-based and policy-based Reinforcement Learning 7 | - Understand the REINFORCE Algorithm (Monte Carlo Policy Gradient) 8 | - Understand Actor-Critic (AC) algorithms 9 | - Understand Advantage Functions 10 | - Understand Deterministic Policy Gradients (Optional) 11 | - Understand how to scale up Policy Gradient methods using asynchronous actor-critic and Neural Networks (Optional) 12 | 13 | 14 | ### Summary 15 | 16 | - Idea: Instead of parameterizing the value 
function and doing greedy policy improvement we parameterize the policy and do gradient descent into a direction that improves it. 17 | - Sometimes the policy is easier to approximate than the value function. Also, we need a parameterized policy to deal with continuous action spaces and environments where we need to act stochastically. 18 | - Policy Score Function `J(theta)`: Intuitively, it measures how good our policy is. For example, we can use the average value or average reward under a policy as our objective. 19 | - Common choices for the policy function: Softmax for discrete actions, Gaussian parameters for continuous actions. 20 | - Policy Gradient Theorem: `grad(J(theta)) = Ex[grad(log(pi(s, a))) * Q(s, a)]`. Basically, we move our policy into a direction of more reward. 21 | - REINFORCE (Monte Carlo Policy Gradient): We substitute a samples return `g_t` form an episode for Q(s, a) to make an update. Unbiased but high variance. 22 | - Baseline: Instead of measuring the absolute goodness of an action we want to know how much better than "average" it is to take an action given a state. E.g. some states are naturally bad and always give negative reward. This is called the advantage and is defined as `Q(s, a) - V(s)`. We use that for our policy update, e.g. `g_t - V(s)` for REINFORCE. 23 | - Actor-Critic: Instead of waiting until the end of an episode as in REINFORCE we use bootstrapping and make an update at each step. To do that we also train a Critic Q(theta) that approximates the value function. Now we have two function approximators: One of the policy, one for the critic. This is basically TD, but for Policy Gradients. 24 | - A good estimate of the advantage function in the Actor-Critic algorithm is the td error. Our update then becomes `grad(J(theta)) = Ex[grad(log(pi(s, a))) * td_error]`. 25 | - Can use policy gradients with td-lambda, eligibility traces, and so on. 26 | - Deterministic Policy Gradients: Useful for high-dimensional continuous action spaces where stochastic policy gradients are expensive to compute. The idea is to update the policy in the direction of the gradient of the action-value function. To ensure exploration we can use an off-policy actor-critic algorithm with added noise in action selection. 27 | - Deep Deterministic Policy Gradients: Apply tricks from DQN to Deterministic Policy Gradients ;) 28 | - Asynchronous Advantage Actor-Critic (A3C): Instead of using an experience replay buffer as in DQN use multiple agents on different threads to explore the state spaces and make decorrelated updates to the actor and the critic. 
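To make the REINFORCE-with-baseline update described above concrete, here is a minimal NumPy sketch of one episode's update for a tabular softmax policy with a learned state-value baseline. It is an illustrative sketch only: the tabular setting and the helper names (`softmax`, `reinforce_update`) are assumptions made here for clarity, not code from this repository, which implements these ideas with TensorFlow function approximators in the exercises below.

```python
import numpy as np

def softmax(preferences):
    z = preferences - np.max(preferences)
    e = np.exp(z)
    return e / e.sum()

def reinforce_update(theta, V, episode, alpha_theta=0.01, alpha_v=0.05, gamma=1.0):
    """One REINFORCE-with-baseline update from a single sampled episode.

    theta:   [nS, nA] softmax policy parameters
    V:       [nS] state-value baseline
    episode: list of (state, action, reward) tuples
    """
    G = 0.0
    # Iterate backwards so the return G_t can be accumulated incrementally.
    for state, action, reward in reversed(episode):
        G = reward + gamma * G
        advantage = G - V[state]           # baseline-corrected return
        V[state] += alpha_v * advantage    # move the baseline toward G_t
        probs = softmax(theta[state])
        # For a softmax policy, grad log pi(a|s) w.r.t. theta[s] is one_hot(a) - probs.
        grad_log_pi = -probs
        grad_log_pi[action] += 1.0
        theta[state] += alpha_theta * advantage * grad_log_pi
    return theta, V
```

The Actor-Critic variant in the notebooks replaces `G - V[state]` with the one-step TD error `r + gamma * V[s'] - V[s]` and applies the update at every step instead of waiting for the end of the episode.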
29 | 30 | 31 | ### Lectures & Readings 32 | 33 | **Required:** 34 | 35 | - David Silver's RL Course Lecture 7 - Policy Gradient Methods ([video](https://www.youtube.com/watch?v=KHZVXao4qXs), [slides](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/pg.pdf)) 36 | 37 | **Optional:** 38 | 39 | - [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/RLbook2018.pdf) - Chapter 13: Policy Gradient Methods 40 | - [Deterministic Policy Gradient Algorithms](http://jmlr.org/proceedings/papers/v32/silver14.pdf) 41 | - [Deterministic Policy Gradient Algorithms (Talk)](http://techtalks.tv/talks/deterministic-policy-gradient-algorithms/61098/) 42 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) 43 | - [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog_posts/2016/08/21/ddpg-rl.html) 44 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 45 | - [Deep Reinforcement Learning: A Tutorial (Policy Gradient Section)](http://web.archive.org/web/20161029135055/https://gym.openai.com/docs/rl#id16) 46 | 47 | 48 | 49 | ### Exercises 50 | 51 | - REINFORCE with Baseline 52 | - Exercise 53 | - [Solution](CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) 54 | - Actor-Critic with Baseline 55 | - Exercise 56 | - [Solution](CliffWalk%20Actor%20Critic%20Solution.ipynb) 57 | - Actor-Critic with Baseline for Continuous Action Spaces 58 | - Exercise 59 | - [Solution](Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) 60 | - Deterministic Policy Gradients for Continuous Action Spaces (WIP) 61 | - Deep Deterministic Policy Gradients (WIP) 62 | - Asynchronous Advantage Actor-Critic (A3C) 63 | - Exercise 64 | - [Solution](a3c/) 65 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/train.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import unittest 4 | import gym 5 | import sys 6 | import os 7 | import numpy as np 8 | import tensorflow as tf 9 | import itertools 10 | import shutil 11 | import threading 12 | import multiprocessing 13 | 14 | from inspect import getsourcefile 15 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 16 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 17 | 18 | if import_path not in sys.path: 19 | sys.path.append(import_path) 20 | 21 | from lib.atari import helpers as atari_helpers 22 | from estimators import ValueEstimator, PolicyEstimator 23 | from policy_monitor import PolicyMonitor 24 | from worker import Worker 25 | 26 | 27 | tf.flags.DEFINE_string("model_dir", "/tmp/a3c", "Directory to write Tensorboard summaries and videos to.") 28 | tf.flags.DEFINE_string("env", "Breakout-v0", "Name of gym Atari environment, e.g. Breakout-v0") 29 | tf.flags.DEFINE_integer("t_max", 5, "Number of steps before performing an update") 30 | tf.flags.DEFINE_integer("max_global_steps", None, "Stop training after this many steps in the environment. Defaults to running indefinitely.") 31 | tf.flags.DEFINE_integer("eval_every", 300, "Evaluate the policy every N seconds") 32 | tf.flags.DEFINE_boolean("reset", False, "If set, delete the existing model directory and start training from scratch.") 33 | tf.flags.DEFINE_integer("parallelism", None, "Number of threads to run. 
If not set we run [num_cpu_cores] threads.") 34 | 35 | FLAGS = tf.flags.FLAGS 36 | 37 | def make_env(wrap=True): 38 | env = gym.envs.make(FLAGS.env) 39 | # remove the timelimitwrapper 40 | env = env.env 41 | if wrap: 42 | env = atari_helpers.AtariEnvWrapper(env) 43 | return env 44 | 45 | # Depending on the game we may have a limited action space 46 | env_ = make_env() 47 | if FLAGS.env == "Pong-v0" or FLAGS.env == "Breakout-v0": 48 | VALID_ACTIONS = list(range(4)) 49 | else: 50 | VALID_ACTIONS = list(range(env_.action_space.n)) 51 | env_.close() 52 | 53 | 54 | # Set the number of workers 55 | NUM_WORKERS = multiprocessing.cpu_count() 56 | if FLAGS.parallelism: 57 | NUM_WORKERS = FLAGS.parallelism 58 | 59 | MODEL_DIR = FLAGS.model_dir 60 | CHECKPOINT_DIR = os.path.join(MODEL_DIR, "checkpoints") 61 | 62 | # Optionally empty model directory 63 | if FLAGS.reset: 64 | shutil.rmtree(MODEL_DIR, ignore_errors=True) 65 | 66 | if not os.path.exists(CHECKPOINT_DIR): 67 | os.makedirs(CHECKPOINT_DIR) 68 | 69 | summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train")) 70 | 71 | with tf.device("/cpu:0"): 72 | 73 | # Keeps track of the number of updates we've performed 74 | global_step = tf.Variable(0, name="global_step", trainable=False) 75 | 76 | # Global policy and value nets 77 | with tf.variable_scope("global") as vs: 78 | policy_net = PolicyEstimator(num_outputs=len(VALID_ACTIONS)) 79 | value_net = ValueEstimator(reuse=True) 80 | 81 | # Global step iterator 82 | global_counter = itertools.count() 83 | 84 | # Create worker graphs 85 | workers = [] 86 | for worker_id in range(NUM_WORKERS): 87 | # We only write summaries in one of the workers because they're 88 | # pretty much identical and writing them on all workers 89 | # would be a waste of space 90 | worker_summary_writer = None 91 | if worker_id == 0: 92 | worker_summary_writer = summary_writer 93 | 94 | worker = Worker( 95 | name="worker_{}".format(worker_id), 96 | env=make_env(), 97 | policy_net=policy_net, 98 | value_net=value_net, 99 | global_counter=global_counter, 100 | discount_factor = 0.99, 101 | summary_writer=worker_summary_writer, 102 | max_global_steps=FLAGS.max_global_steps) 103 | workers.append(worker) 104 | 105 | saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.0, max_to_keep=10) 106 | 107 | # Used to occasionally save videos for our policy net 108 | # and write episode rewards to Tensorboard 109 | pe = PolicyMonitor( 110 | env=make_env(wrap=False), 111 | policy_net=policy_net, 112 | summary_writer=summary_writer, 113 | saver=saver) 114 | 115 | with tf.Session() as sess: 116 | sess.run(tf.global_variables_initializer()) 117 | coord = tf.train.Coordinator() 118 | 119 | # Load a previous checkpoint if it exists 120 | latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR) 121 | if latest_checkpoint: 122 | print("Loading model checkpoint: {}".format(latest_checkpoint)) 123 | saver.restore(sess, latest_checkpoint) 124 | 125 | # Start worker threads 126 | worker_threads = [] 127 | for worker in workers: 128 | worker_fn = lambda worker=worker: worker.run(sess, coord, FLAGS.t_max) 129 | t = threading.Thread(target=worker_fn) 130 | t.start() 131 | worker_threads.append(t) 132 | 133 | # Start a thread for policy eval task 134 | monitor_thread = threading.Thread(target=lambda: pe.continuous_eval(FLAGS.eval_every, sess, coord)) 135 | monitor_thread.start() 136 | 137 | # Wait for all workers to finish 138 | coord.join(worker_threads) 139 | 
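`train.py` above starts one `Worker` per thread plus a policy-evaluation thread and then joins them through a `tf.train.Coordinator`. Below is a minimal, framework-free sketch of that launch/join pattern using only the standard library; the `ToyWorker` class, the step counts, and the use of `threading.Event` in place of the coordinator are assumptions made for illustration, not code from this repository.

```python
import itertools
import threading
import time

class ToyWorker:
    """Stand-in for worker.Worker: runs until the shared stop signal is set."""
    def __init__(self, name, global_counter, max_global_steps):
        self.name = name
        self.global_counter = global_counter
        self.max_global_steps = max_global_steps

    def run(self, stop_event):
        while not stop_event.is_set():
            global_step = next(self.global_counter)    # counter shared by all workers
            if global_step >= self.max_global_steps:
                stop_event.set()                       # ask every other thread to stop
                break
            time.sleep(0.001)                          # placeholder for one env/update step

stop_event = threading.Event()       # plays the role of tf.train.Coordinator
global_counter = itertools.count()   # same idea as the global step iterator above

workers = [ToyWorker("worker_{}".format(i), global_counter, max_global_steps=1000)
           for i in range(4)]

threads = []
for worker in workers:
    t = threading.Thread(target=worker.run, args=(stop_event,))
    t.start()
    threads.append(t)

for t in threads:                    # equivalent to coord.join(worker_threads)
    t.join()
print("all workers stopped")
```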
-------------------------------------------------------------------------------- /DP/Gamblers Problem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "### This is Example 4.3. Gambler’s Problem from Sutton's book.\n", 10 | "\n", 11 | "A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips. \n", 12 | "If the coin comes up heads, he wins as many dollars as he has staked on that flip; \n", 13 | "if it is tails, he loses his stake. The game ends when the gambler wins by reaching his goal of $100, \n", 14 | "or loses by running out of money. \n", 15 | "\n", 16 | "On each flip, the gambler must decide what portion of his capital to stake, in integer numbers of dollars. \n", 17 | "This problem can be formulated as an undiscounted, episodic, finite MDP. \n", 18 | "\n", 19 | "The state is the gambler’s capital, s ∈ {1, 2, . . . , 99}.\n", 20 | "The actions are stakes, a ∈ {0, 1, . . . , min(s, 100 − s)}. \n", 21 | "The reward is zero on all transitions except those on which the gambler reaches his goal, when it is +1.\n", 22 | "\n", 23 | "The state-value function then gives the probability of winning from each state. A policy is a mapping from levels of capital to stakes. The optimal policy maximizes the probability of reaching the goal. Let p_h denote the probability of the coin coming up heads. If p_h is known, then the entire problem is known and it can be solved, for instance, by value iteration.\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import numpy as np\n", 35 | "import sys\n", 36 | "import matplotlib.pyplot as plt\n", 37 | "if \"../\" not in sys.path:\n", 38 | " sys.path.append(\"../\") " 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "source": [ 47 | "\n", 48 | "### Exercise 4.9 (programming)\n", 49 | "\n", 50 | "Implement value iteration for the gambler’s problem and solve it for p_h = 0.25 and p_h = 0.55.\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 1, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def value_iteration_for_gamblers(p_h, theta=0.0001, discount_factor=1.0):\n", 63 | " \"\"\"\n", 64 | " Args:\n", 65 | " p_h: Probability of the coin coming up heads\n", 66 | " \"\"\"\n", 67 | " \n", 68 | " def one_step_lookahead(s, V, rewards):\n", 69 | " \"\"\"\n", 70 | " Helper function to calculate the value for all action in a given state.\n", 71 | " \n", 72 | " Args:\n", 73 | " s: The gambler’s capital. Integer.\n", 74 | " V: The vector that contains values at each state. \n", 75 | " rewards: The reward vector.\n", 76 | " \n", 77 | " Returns:\n", 78 | " A vector containing the expected value of each action. 
\n", 79 | " Its length equals to the number of actions.\n", 80 | " \"\"\"\n", 81 | " \n", 82 | " # Implement!\n", 83 | " \n", 84 | " return A\n", 85 | " \n", 86 | " # Implement!\n", 87 | " \n", 88 | " return policy, V" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "policy, v = value_iteration_for_gamblers(0.25)\n", 100 | "\n", 101 | "print(\"Optimized Policy:\")\n", 102 | "print(policy)\n", 103 | "print(\"\")\n", 104 | "\n", 105 | "print(\"Optimized Value Function:\")\n", 106 | "print(v)\n", 107 | "print(\"\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "# Plotting Final Policy (action stake) vs State (Capital)\n", 119 | "\n", 120 | "# Implement!" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Plotting Capital vs Final Policy\n", 132 | "\n", 133 | "# Implement!\n" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.6.3" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 1 158 | } 159 | -------------------------------------------------------------------------------- /MC/MC Control with Epsilon-Greedy Policies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import matplotlib\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "if \"../\" not in sys.path:\n", 20 | " sys.path.append(\"../\") \n", 21 | "from lib.envs.blackjack import BlackjackEnv\n", 22 | "from lib import plotting\n", 23 | "\n", 24 | "matplotlib.style.use('ggplot')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "env = BlackjackEnv()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 47 | " \"\"\"\n", 48 | " Creates an epsilon-greedy policy based on a given Q-function and epsilon.\n", 49 | " \n", 50 | " Args:\n", 51 | " Q: A dictionary that maps from state -> action-values.\n", 52 | " Each value is a numpy array of length nA (see below)\n", 53 | " epsilon: The probability to select a random action . 
float between 0 and 1.\n", 54 | " nA: Number of actions in the environment.\n", 55 | " \n", 56 | " Returns:\n", 57 | " A function that takes the observation as an argument and returns\n", 58 | " the probabilities for each action in the form of a numpy array of length nA.\n", 59 | " \n", 60 | " \"\"\"\n", 61 | " def policy_fn(observation):\n", 62 | " pass\n", 63 | " # Implement this!\n", 64 | " return policy_fn" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):\n", 76 | " \"\"\"\n", 77 | " Monte Carlo Control using Epsilon-Greedy policies.\n", 78 | " Finds an optimal epsilon-greedy policy.\n", 79 | " \n", 80 | " Args:\n", 81 | " env: OpenAI gym environment.\n", 82 | " num_episodes: Number of episodes to sample.\n", 83 | " discount_factor: Gamma discount factor.\n", 84 | " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n", 85 | " \n", 86 | " Returns:\n", 87 | " A tuple (Q, policy).\n", 88 | " Q is a dictionary mapping state -> action values.\n", 89 | " policy is a function that takes an observation as an argument and returns\n", 90 | " action probabilities\n", 91 | " \"\"\"\n", 92 | " \n", 93 | " # Keeps track of sum and count of returns for each state\n", 94 | " # to calculate an average. We could use an array to save all\n", 95 | " # returns (like in the book) but that's memory inefficient.\n", 96 | " returns_sum = defaultdict(float)\n", 97 | " returns_count = defaultdict(float)\n", 98 | " \n", 99 | " # The final action-value function.\n", 100 | " # A nested dictionary that maps state -> (action -> action-value).\n", 101 | " Q = defaultdict(lambda: np.zeros(env.action_space.n))\n", 102 | " \n", 103 | " # The policy we're following\n", 104 | " policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)\n", 105 | " \n", 106 | " # Implement this!\n", 107 | " \n", 108 | " return Q, policy" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# For plotting: Create value function from action-value function\n", 131 | "# by picking the best action at each state\n", 132 | "V = defaultdict(float)\n", 133 | "for state, actions in Q.items():\n", 134 | " action_value = np.max(actions)\n", 135 | " V[state] = action_value\n", 136 | "plotting.plot_value_function(V, title=\"Optimal Value Function\")" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.5.2" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 1 170 | } 171 | 
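The notebook above leaves `policy_fn` and the control loop as exercises ("Implement this!"). For reference, one common way to write the epsilon-greedy helper is sketched below; this follows the standard construction and is not guaranteed to match the solution notebook in this repository line for line.

```python
import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function mapping an observation to action probabilities of length nA."""
    def policy_fn(observation):
        # Spread the exploration probability epsilon evenly over all actions...
        A = np.ones(nA, dtype=float) * epsilon / nA
        # ...and put the remaining (1 - epsilon) mass on the current greedy action.
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn
```

Inside the control loop, an action is then sampled with `np.random.choice(np.arange(nA), p=policy_fn(state))`, and after each episode `Q` is updated with an incremental average of the observed returns, so the policy becomes greedier as the estimates improve.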
-------------------------------------------------------------------------------- /DP/Policy Evaluation Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from IPython.core.debugger import set_trace\n", 10 | "import numpy as np\n", 11 | "import pprint\n", 12 | "import sys\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "from lib.envs.gridworld import GridworldEnv" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "pp = pprint.PrettyPrinter(indent=2)\n", 25 | "env = GridworldEnv()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 35 | " \"\"\"\n", 36 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 37 | " \n", 38 | " Args:\n", 39 | " policy: [S, A] shaped matrix representing the policy.\n", 40 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 41 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 42 | " env.nS is a number of states in the environment. \n", 43 | " env.nA is a number of actions in the environment.\n", 44 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 45 | " discount_factor: Gamma discount factor.\n", 46 | " \n", 47 | " Returns:\n", 48 | " Vector of length env.nS representing the value function.\n", 49 | " \"\"\"\n", 50 | " # Start with a random (all 0) value function\n", 51 | " V = np.zeros(env.nS)\n", 52 | " while True:\n", 53 | " delta = 0\n", 54 | " # For each state, perform a \"full backup\"\n", 55 | " for s in range(env.nS):\n", 56 | " v = 0\n", 57 | " # Look at the possible next actions\n", 58 | " for a, action_prob in enumerate(policy[s]):\n", 59 | " # For each action, look at the possible next states...\n", 60 | " for prob, next_state, reward, done in env.P[s][a]:\n", 61 | " # Calculate the expected value. Ref: Sutton book eq. 4.6.\n", 62 | " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", 63 | " # How much our value function changed (across any states)\n", 64 | " delta = max(delta, np.abs(v - V[s]))\n", 65 | " V[s] = v\n", 66 | " # Stop evaluating once our value function change is below a threshold\n", 67 | " if delta < theta:\n", 68 | " break\n", 69 | " return np.array(V)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", 79 | "v = policy_eval(random_policy, env)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Value Function:\n", 92 | "[ 0. -13.99993529 -19.99990698 -21.99989761 -13.99993529\n", 93 | " -17.9999206 -19.99991379 -19.99991477 -19.99990698 -19.99991379\n", 94 | " -17.99992725 -13.99994569 -21.99989761 -19.99991477 -13.99994569\n", 95 | " 0. ]\n", 96 | "\n", 97 | "Reshaped Grid Value Function:\n", 98 | "[[ 0. 
-13.99993529 -19.99990698 -21.99989761]\n", 99 | " [-13.99993529 -17.9999206 -19.99991379 -19.99991477]\n", 100 | " [-19.99990698 -19.99991379 -17.99992725 -13.99994569]\n", 101 | " [-21.99989761 -19.99991477 -13.99994569 0. ]]\n", 102 | "\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "print(\"Value Function:\")\n", 108 | "print(v)\n", 109 | "print(\"\")\n", 110 | "\n", 111 | "print(\"Reshaped Grid Value Function:\")\n", 112 | "print(v.reshape(env.shape))\n", 113 | "print(\"\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Test: Make sure the evaluated policy is what we expected\n", 123 | "expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])\n", 124 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "language": "python", 139 | "name": "python3" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 3 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython3", 151 | "version": "3.6.4" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 1 156 | } 157 | -------------------------------------------------------------------------------- /MC/Off-Policy MC Control with Weighted Importance Sampling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import matplotlib\n", 15 | "import numpy as np\n", 16 | "import sys\n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "if \"../\" not in sys.path:\n", 20 | " sys.path.append(\"../\") \n", 21 | "from lib.envs.blackjack import BlackjackEnv\n", 22 | "from lib import plotting\n", 23 | "\n", 24 | "matplotlib.style.use('ggplot')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "env = BlackjackEnv()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def create_random_policy(nA):\n", 47 | " \"\"\"\n", 48 | " Creates a random policy function.\n", 49 | " \n", 50 | " Args:\n", 51 | " nA: Number of actions in the environment.\n", 52 | " \n", 53 | " Returns:\n", 54 | " A function that takes an observation as input and returns a vector\n", 55 | " of action probabilities\n", 56 | " \"\"\"\n", 57 | " A = np.ones(nA, dtype=float) / nA\n", 58 | " def policy_fn(observation):\n", 59 | " return A\n", 60 | " return policy_fn" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "def create_greedy_policy(Q):\n", 72 | " \"\"\"\n", 73 | " Creates a greedy policy based on Q values.\n", 74 | " \n", 75 | " Args:\n", 76 | " Q: A dictionary that maps from state -> action 
values\n", 77 | " \n", 78 | " Returns:\n", 79 | " A function that takes an observation as input and returns a vector\n", 80 | " of action probabilities.\n", 81 | " \"\"\"\n", 82 | " \n", 83 | " def policy_fn(observation):\n", 84 | " pass\n", 85 | " # Implement this!\n", 86 | " return policy_fn" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n", 98 | " \"\"\"\n", 99 | " Monte Carlo Control Off-Policy Control using Weighted Importance Sampling.\n", 100 | " Finds an optimal greedy policy.\n", 101 | " \n", 102 | " Args:\n", 103 | " env: OpenAI gym environment.\n", 104 | " num_episodes: Number of episodes to sample.\n", 105 | " behavior_policy: The behavior to follow while generating episodes.\n", 106 | " A function that given an observation returns a vector of probabilities for each action.\n", 107 | " discount_factor: Gamma discount factor.\n", 108 | " \n", 109 | " Returns:\n", 110 | " A tuple (Q, policy).\n", 111 | " Q is a dictionary mapping state -> action values.\n", 112 | " policy is a function that takes an observation as an argument and returns\n", 113 | " action probabilities. This is the optimal greedy policy.\n", 114 | " \"\"\"\n", 115 | " \n", 116 | " # The final action-value function.\n", 117 | " # A dictionary that maps state -> action values\n", 118 | " Q = defaultdict(lambda: np.zeros(env.action_space.n))\n", 119 | " \n", 120 | " # Our greedily policy we want to learn\n", 121 | " target_policy = create_greedy_policy(Q)\n", 122 | " \n", 123 | " # Implement this!\n", 124 | " \n", 125 | " return Q, target_policy" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "random_policy = create_random_policy(env.action_space.n)\n", 137 | "Q, policy = mc_control_importance_sampling(env, num_episodes=500000, behavior_policy=random_policy)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "# For plotting: Create value function from action-value function\n", 149 | "# by picking the best action at each state\n", 150 | "V = defaultdict(float)\n", 151 | "for state, action_values in Q.items():\n", 152 | " action_value = np.max(action_values)\n", 153 | " V[state] = action_value\n", 154 | "plotting.plot_value_function(V, title=\"Optimal Value Function\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.5.2" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 1 188 | } 189 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### 
Overview 2 | 3 | This repository provides code, exercises and solutions for popular Reinforcement Learning algorithms. These are meant to serve as a learning tool to complement the theoretical materials from 4 | 5 | - [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) 6 | - [David Silver's Reinforcement Learning Course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 7 | 8 | Each folder in corresponds to one or more chapters of the above textbook and/or course. In addition to exercises and solution, each folder also contains a list of learning goals, a brief concept summary, and links to the relevant readings. 9 | 10 | All code is written in Python 3 and uses RL environments from [OpenAI Gym](https://gym.openai.com/). Advanced techniques use [Tensorflow](https://www.tensorflow.org/) for neural network implementations. 11 | 12 | 13 | ### Table of Contents 14 | 15 | - [Introduction to RL problems & OpenAI Gym](Introduction/) 16 | - [MDPs and Bellman Equations](MDP/) 17 | - [Dynamic Programming: Model-Based RL, Policy Iteration and Value Iteration](DP/) 18 | - [Monte Carlo Model-Free Prediction & Control](MC/) 19 | - [Temporal Difference Model-Free Prediction & Control](TD/) 20 | - [Function Approximation](FA/) 21 | - [Deep Q Learning](DQN/) (WIP) 22 | - [Policy Gradient Methods](PolicyGradient/) (WIP) 23 | - Learning and Planning (WIP) 24 | - Exploration and Exploitation (WIP) 25 | 26 | 27 | ### List of Implemented Algorithms 28 | 29 | - [Dynamic Programming Policy Evaluation](DP/Policy%20Evaluation%20Solution.ipynb) 30 | - [Dynamic Programming Policy Iteration](DP/Policy%20Iteration%20Solution.ipynb) 31 | - [Dynamic Programming Value Iteration](DP/Value%20Iteration%20Solution.ipynb) 32 | - [Monte Carlo Prediction](MC/MC%20Prediction%20Solution.ipynb) 33 | - [Monte Carlo Control with Epsilon-Greedy Policies](MC/MC%20Control%20with%20Epsilon-Greedy%20Policies%20Solution.ipynb) 34 | - [Monte Carlo Off-Policy Control with Importance Sampling](MC/Off-Policy%20MC%20Control%20with%20Weighted%20Importance%20Sampling%20Solution.ipynb) 35 | - [SARSA (On Policy TD Learning)](TD/SARSA%20Solution.ipynb) 36 | - [Q-Learning (Off Policy TD Learning)](TD/Q-Learning%20Solution.ipynb) 37 | - [Q-Learning with Linear Function Approximation](FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb) 38 | - [Deep Q-Learning for Atari Games](DQN/Deep%20Q%20Learning%20Solution.ipynb) 39 | - [Double Deep-Q Learning for Atari Games](DQN/Double%20DQN%20Solution.ipynb) 40 | - Deep Q-Learning with Prioritized Experience Replay (WIP) 41 | - [Policy Gradient: REINFORCE with Baseline](PolicyGradient/CliffWalk%20REINFORCE%20with%20Baseline%20Solution.ipynb) 42 | - [Policy Gradient: Actor Critic with Baseline](PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb) 43 | - [Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces](PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb) 44 | - Deterministic Policy Gradients for Continuous Action Spaces (WIP) 45 | - Deep Deterministic Policy Gradients (DDPG) (WIP) 46 | - [Asynchronous Advantage Actor Critic (A3C)](PolicyGradient/a3c) 47 | 48 | 49 | ### Resources 50 | 51 | Textbooks: 52 | 53 | - [Reinforcement Learning: An Introduction (2nd Edition)](http://incompleteideas.net/book/RLbook2018.pdf) 54 | 55 | Classes: 56 | 57 | - [David Silver's Reinforcement Learning Course (UCL, 2015)](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 58 | - [CS294 - 
Deep Reinforcement Learning (Berkeley, Fall 2015)](http://rll.berkeley.edu/deeprlcourse/) 59 | - [CS 8803 - Reinforcement Learning (Georgia Tech)](https://www.udacity.com/course/reinforcement-learning--ud600) 60 | - [CS885 - Reinforcement Learning (UWaterloo), Spring 2018](https://cs.uwaterloo.ca/~ppoupart/teaching/cs885-spring18/) 61 | - [CS294-112 - Deep Reinforcement Learning (UC Berkeley)](http://rail.eecs.berkeley.edu/deeprlcourse/) 62 | 63 | Talks/Tutorials: 64 | 65 | - [Introduction to Reinforcement Learning (Joelle Pineau @ Deep Learning Summer School 2016)](http://videolectures.net/deeplearning2016_pineau_reinforcement_learning/) 66 | - [Deep Reinforcement Learning (Pieter Abbeel @ Deep Learning Summer School 2016)](http://videolectures.net/deeplearning2016_abbeel_deep_reinforcement/) 67 | - [Deep Reinforcement Learning ICML 2016 Tutorial (David Silver)](http://techtalks.tv/talks/deep-reinforcement-learning/62360/) 68 | - [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.youtube.com/watch?v=ggqnxyjaKe4) 69 | - [John Schulman - Deep Reinforcement Learning (4 Lectures)](https://www.youtube.com/playlist?list=PLjKEIQlKCTZYN3CYBlj8r58SbNorobqcp) 70 | - [Deep Reinforcement Learning Slides @ NIPS 2016](http://people.eecs.berkeley.edu/~pabbeel/nips-tutorial-policy-optimization-Schulman-Abbeel.pdf) 71 | - [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/user/introduction.html) 72 | - [Advanced Deep Learning & Reinforcement Learning (UCL 2018, DeepMind)](https://www.youtube.com/playlist?list=PLqYmG7hTraZDNJre23vqCGIVpfZ_K2RZs) 73 | -[Deep RL Bootcamp](https://sites.google.com/view/deep-rl-bootcamp/lectures) 74 | 75 | Other Projects: 76 | 77 | - [carpedm20/deep-rl-tensorflow](https://github.com/carpedm20/deep-rl-tensorflow) 78 | - [matthiasplappert/keras-rl](https://github.com/matthiasplappert/keras-rl) 79 | 80 | Selected Papers: 81 | 82 | - [Human-Level Control through Deep Reinforcement Learning (2015-02)](http://www.readcube.com/articles/10.1038/nature14236) 83 | - [Deep Reinforcement Learning with Double Q-learning (2015-09)](http://arxiv.org/abs/1509.06461) 84 | - [Continuous control with deep reinforcement learning (2015-09)](https://arxiv.org/abs/1509.02971) 85 | - [Prioritized Experience Replay (2015-11)](http://arxiv.org/abs/1511.05952) 86 | - [Dueling Network Architectures for Deep Reinforcement Learning (2015-11)](http://arxiv.org/abs/1511.06581) 87 | - [Asynchronous Methods for Deep Reinforcement Learning (2016-02)](http://arxiv.org/abs/1602.01783) 88 | - [Deep Reinforcement Learning from Self-Play in Imperfect-Information Games (2016-03)](http://arxiv.org/abs/1603.01121) 89 | - [Mastering the game of Go with deep neural networks and tree search](https://gogameguru.com/i/2016/03/deepmind-mastering-go.pdf) 90 | -------------------------------------------------------------------------------- /DP/Value Iteration Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pprint\n", 11 | "import sys\n", 12 | "if \"../\" not in sys.path:\n", 13 | " sys.path.append(\"../\") \n", 14 | "from lib.envs.gridworld import GridworldEnv" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "pp = pprint.PrettyPrinter(indent=2)\n", 24 | "env = 
GridworldEnv()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", 34 | " \"\"\"\n", 35 | " Value Iteration Algorithm.\n", 36 | " \n", 37 | " Args:\n", 38 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 39 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 40 | " env.nS is a number of states in the environment. \n", 41 | " env.nA is a number of actions in the environment.\n", 42 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 43 | " discount_factor: Gamma discount factor.\n", 44 | " \n", 45 | " Returns:\n", 46 | " A tuple (policy, V) of the optimal policy and the optimal value function.\n", 47 | " \"\"\"\n", 48 | " \n", 49 | " def one_step_lookahead(state, V):\n", 50 | " \"\"\"\n", 51 | " Helper function to calculate the value for all action in a given state.\n", 52 | " \n", 53 | " Args:\n", 54 | " state: The state to consider (int)\n", 55 | " V: The value to use as an estimator, Vector of length env.nS\n", 56 | " \n", 57 | " Returns:\n", 58 | " A vector of length env.nA containing the expected value of each action.\n", 59 | " \"\"\"\n", 60 | " A = np.zeros(env.nA)\n", 61 | " for a in range(env.nA):\n", 62 | " for prob, next_state, reward, done in env.P[state][a]:\n", 63 | " A[a] += prob * (reward + discount_factor * V[next_state])\n", 64 | " return A\n", 65 | " \n", 66 | " V = np.zeros(env.nS)\n", 67 | " while True:\n", 68 | " # Stopping condition\n", 69 | " delta = 0\n", 70 | " # Update each state...\n", 71 | " for s in range(env.nS):\n", 72 | " # Do a one-step lookahead to find the best action\n", 73 | " A = one_step_lookahead(s, V)\n", 74 | " best_action_value = np.max(A)\n", 75 | " # Calculate delta across all states seen so far\n", 76 | " delta = max(delta, np.abs(best_action_value - V[s]))\n", 77 | " # Update the value function. Ref: Sutton book eq. 4.10. \n", 78 | " V[s] = best_action_value \n", 79 | " # Check if we can stop \n", 80 | " if delta < theta:\n", 81 | " break\n", 82 | " \n", 83 | " # Create a deterministic policy using the optimal value function\n", 84 | " policy = np.zeros([env.nS, env.nA])\n", 85 | " for s in range(env.nS):\n", 86 | " # One step lookahead to find the best action for this state\n", 87 | " A = one_step_lookahead(s, V)\n", 88 | " best_action = np.argmax(A)\n", 89 | " # Always take the best action\n", 90 | " policy[s, best_action] = 1.0\n", 91 | " \n", 92 | " return policy, V" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Policy Probability Distribution:\n", 105 | "[[1. 0. 0. 0.]\n", 106 | " [0. 0. 0. 1.]\n", 107 | " [0. 0. 0. 1.]\n", 108 | " [0. 0. 1. 0.]\n", 109 | " [1. 0. 0. 0.]\n", 110 | " [1. 0. 0. 0.]\n", 111 | " [1. 0. 0. 0.]\n", 112 | " [0. 0. 1. 0.]\n", 113 | " [1. 0. 0. 0.]\n", 114 | " [1. 0. 0. 0.]\n", 115 | " [0. 1. 0. 0.]\n", 116 | " [0. 0. 1. 0.]\n", 117 | " [1. 0. 0. 0.]\n", 118 | " [0. 1. 0. 0.]\n", 119 | " [0. 1. 0. 0.]\n", 120 | " [1. 0. 0. 0.]]\n", 121 | "\n", 122 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 123 | "[[0 3 3 2]\n", 124 | " [0 0 0 2]\n", 125 | " [0 0 1 2]\n", 126 | " [0 1 1 0]]\n", 127 | "\n", 128 | "Value Function:\n", 129 | "[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. 
-2. -1. 0.]\n", 130 | "\n", 131 | "Reshaped Grid Value Function:\n", 132 | "[[ 0. -1. -2. -3.]\n", 133 | " [-1. -2. -3. -2.]\n", 134 | " [-2. -3. -2. -1.]\n", 135 | " [-3. -2. -1. 0.]]\n", 136 | "\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "policy, v = value_iteration(env)\n", 142 | "\n", 143 | "print(\"Policy Probability Distribution:\")\n", 144 | "print(policy)\n", 145 | "print(\"\")\n", 146 | "\n", 147 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 148 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 149 | "print(\"\")\n", 150 | "\n", 151 | "print(\"Value Function:\")\n", 152 | "print(v)\n", 153 | "print(\"\")\n", 154 | "\n", 155 | "print(\"Reshaped Grid Value Function:\")\n", 156 | "print(v.reshape(env.shape))\n", 157 | "print(\"\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Test the value function\n", 167 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 168 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "anaconda-cloud": {}, 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.6.4" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 1 201 | } 202 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/estimators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | def build_shared_network(X, add_summaries=False): 5 | """ 6 | Builds a 3-layer network conv -> conv -> fc as described 7 | in the A3C paper. This network is shared by both the policy and value net. 8 | 9 | Args: 10 | X: Inputs 11 | add_summaries: If true, add layer summaries to Tensorboard. 12 | 13 | Returns: 14 | Final layer activations. 15 | """ 16 | 17 | # Three convolutional layers 18 | conv1 = tf.contrib.layers.conv2d( 19 | X, 16, 8, 4, activation_fn=tf.nn.relu, scope="conv1") 20 | conv2 = tf.contrib.layers.conv2d( 21 | conv1, 32, 4, 2, activation_fn=tf.nn.relu, scope="conv2") 22 | 23 | # Fully connected layer 24 | fc1 = tf.contrib.layers.fully_connected( 25 | inputs=tf.contrib.layers.flatten(conv2), 26 | num_outputs=256, 27 | scope="fc1") 28 | 29 | if add_summaries: 30 | tf.contrib.layers.summarize_activation(conv1) 31 | tf.contrib.layers.summarize_activation(conv2) 32 | tf.contrib.layers.summarize_activation(fc1) 33 | 34 | return fc1 35 | 36 | class PolicyEstimator(): 37 | """ 38 | Policy Function approximator. Given a observation, returns probabilities 39 | over all possible actions. 40 | 41 | Args: 42 | num_outputs: Size of the action space. 43 | reuse: If true, an existing shared network will be re-used. 44 | trainable: If true we add train ops to the network. 45 | Actor threads that don't update their local models and don't need 46 | train ops would set this to false. 
47 | """ 48 | 49 | def __init__(self, num_outputs, reuse=False, trainable=True): 50 | self.num_outputs = num_outputs 51 | 52 | # Placeholders for our input 53 | # Our input are 4 RGB frames of shape 160, 160 each 54 | self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X") 55 | # The TD target value 56 | self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y") 57 | # Integer id of which action was selected 58 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 59 | 60 | # Normalize 61 | X = tf.to_float(self.states) / 255.0 62 | batch_size = tf.shape(self.states)[0] 63 | 64 | # Graph shared with Value Net 65 | with tf.variable_scope("shared", reuse=reuse): 66 | fc1 = build_shared_network(X, add_summaries=(not reuse)) 67 | 68 | 69 | with tf.variable_scope("policy_net"): 70 | self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None) 71 | self.probs = tf.nn.softmax(self.logits) + 1e-8 72 | 73 | self.predictions = { 74 | "logits": self.logits, 75 | "probs": self.probs 76 | } 77 | 78 | # We add entropy to the loss to encourage exploration 79 | self.entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1, name="entropy") 80 | self.entropy_mean = tf.reduce_mean(self.entropy, name="entropy_mean") 81 | 82 | # Get the predictions for the chosen actions only 83 | gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions 84 | self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices) 85 | 86 | self.losses = - (tf.log(self.picked_action_probs) * self.targets + 0.01 * self.entropy) 87 | self.loss = tf.reduce_sum(self.losses, name="loss") 88 | 89 | tf.summary.scalar(self.loss.op.name, self.loss) 90 | tf.summary.scalar(self.entropy_mean.op.name, self.entropy_mean) 91 | tf.summary.histogram(self.entropy.op.name, self.entropy) 92 | 93 | if trainable: 94 | # self.optimizer = tf.train.AdamOptimizer(1e-4) 95 | self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 96 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 97 | self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None] 98 | self.train_op = self.optimizer.apply_gradients(self.grads_and_vars, 99 | global_step=tf.contrib.framework.get_global_step()) 100 | 101 | # Merge summaries from this network and the shared network (but not the value net) 102 | var_scope_name = tf.get_variable_scope().name 103 | summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES) 104 | sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name] 105 | sumaries = [s for s in summary_ops if var_scope_name in s.name] 106 | self.summaries = tf.summary.merge(sumaries) 107 | 108 | 109 | class ValueEstimator(): 110 | """ 111 | Value Function approximator. Returns a value estimator for a batch of observations. 112 | 113 | Args: 114 | reuse: If true, an existing shared network will be re-used. 115 | trainable: If true we add train ops to the network. 116 | Actor threads that don't update their local models and don't need 117 | train ops would set this to false. 
118 | """ 119 | 120 | def __init__(self, reuse=False, trainable=True): 121 | # Placeholders for our input 122 | # Our input are 4 RGB frames of shape 160, 160 each 123 | self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X") 124 | # The TD target value 125 | self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y") 126 | 127 | X = tf.to_float(self.states) / 255.0 128 | 129 | # Graph shared with Value Net 130 | with tf.variable_scope("shared", reuse=reuse): 131 | fc1 = build_shared_network(X, add_summaries=(not reuse)) 132 | 133 | with tf.variable_scope("value_net"): 134 | self.logits = tf.contrib.layers.fully_connected( 135 | inputs=fc1, 136 | num_outputs=1, 137 | activation_fn=None) 138 | self.logits = tf.squeeze(self.logits, squeeze_dims=[1], name="logits") 139 | 140 | self.losses = tf.squared_difference(self.logits, self.targets) 141 | self.loss = tf.reduce_sum(self.losses, name="loss") 142 | 143 | self.predictions = { 144 | "logits": self.logits 145 | } 146 | 147 | # Summaries 148 | prefix = tf.get_variable_scope().name 149 | tf.summary.scalar(self.loss.name, self.loss) 150 | tf.summary.scalar("{}/max_value".format(prefix), tf.reduce_max(self.logits)) 151 | tf.summary.scalar("{}/min_value".format(prefix), tf.reduce_min(self.logits)) 152 | tf.summary.scalar("{}/mean_value".format(prefix), tf.reduce_mean(self.logits)) 153 | tf.summary.scalar("{}/reward_max".format(prefix), tf.reduce_max(self.targets)) 154 | tf.summary.scalar("{}/reward_min".format(prefix), tf.reduce_min(self.targets)) 155 | tf.summary.scalar("{}/reward_mean".format(prefix), tf.reduce_mean(self.targets)) 156 | tf.summary.histogram("{}/reward_targets".format(prefix), self.targets) 157 | tf.summary.histogram("{}/values".format(prefix), self.logits) 158 | 159 | if trainable: 160 | # self.optimizer = tf.train.AdamOptimizer(1e-4) 161 | self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 162 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 163 | self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None] 164 | self.train_op = self.optimizer.apply_gradients(self.grads_and_vars, 165 | global_step=tf.contrib.framework.get_global_step()) 166 | 167 | var_scope_name = tf.get_variable_scope().name 168 | summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES) 169 | sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name] 170 | sumaries = [s for s in summary_ops if var_scope_name in s.name] 171 | self.summaries = tf.summary.merge(sumaries) 172 | -------------------------------------------------------------------------------- /DP/Policy Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 23, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import sys\n", 13 | "if \"../\" not in sys.path:\n", 14 | " sys.path.append(\"../\") \n", 15 | "from lib.envs.gridworld import GridworldEnv" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 24, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "env = GridworldEnv()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 25, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 38 | " \"\"\"\n", 
39 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 40 | " \n", 41 | " Args:\n", 42 | " policy: [S, A] shaped matrix representing the policy.\n", 43 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 44 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 45 | " env.nS is a number of states in the environment. \n", 46 | " env.nA is a number of actions in the environment.\n", 47 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 48 | " discount_factor: Gamma discount factor.\n", 49 | " \n", 50 | " Returns:\n", 51 | " Vector of length env.nS representing the value function.\n", 52 | " \"\"\"\n", 53 | " # Start with a random (all 0) value function\n", 54 | " V = np.zeros(env.nS)\n", 55 | " while True:\n", 56 | " # TODO: Implement!\n", 57 | " break\n", 58 | " return np.array(V)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 26, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "random_policy = np.ones([env.nS, env.nA]) / env.nA\n", 70 | "v = policy_eval(random_policy, env)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 22, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "ename": "AssertionError", 80 | "evalue": "\nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22,\n -20, -14, 0])", 81 | "output_type": "error", 82 | "traceback": [ 83 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 84 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 85 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Test: Make sure the evaluated policy is what we expected\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mexpected_v\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m22\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m18\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m18\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m22\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtesting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_array_almost_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected_v\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 86 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_almost_equal\u001b[0;34m(x, y, decimal, err_msg, verbose)\u001b[0m\n\u001b[1;32m 914\u001b[0m assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,\n\u001b[1;32m 915\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Arrays are not almost equal to %d decimals'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m precision=decimal)\n\u001b[0m\u001b[1;32m 917\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_compare\u001b[0;34m(comparison, x, y, err_msg, verbose, header, precision)\u001b[0m\n\u001b[1;32m 735\u001b[0m names=('x', 'y'), precision=precision)\n\u001b[1;32m 736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcond\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 737\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 738\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 88 | "\u001b[0;31mAssertionError\u001b[0m: \nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22,\n -20, -14, 0])" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "# Test: Make sure the evaluated policy is what we expected\n", 94 | "expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])\n", 95 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [] 106 | } 107 | ], 108 | "metadata": { 109 | "kernelspec": { 110 | "display_name": "Python 3", 111 | "language": "python", 112 | "name": "python3" 113 | }, 114 | "language_info": { 115 | "codemirror_mode": { 116 | "name": "ipython", 117 | "version": 3 118 | }, 119 | "file_extension": ".py", 120 | "mimetype": "text/x-python", 121 | "name": "python", 122 | "nbconvert_exporter": "python", 123 | "pygments_lexer": "ipython3", 124 | "version": "3.5.2" 125 | } 126 | }, 127 | "nbformat": 4, 128 | "nbformat_minor": 1 129 | } 130 | -------------------------------------------------------------------------------- /PolicyGradient/a3c/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import sys 3 | import os 4 | import itertools 5 
| import collections 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from inspect import getsourcefile 10 | current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0))) 11 | import_path = os.path.abspath(os.path.join(current_path, "../..")) 12 | 13 | if import_path not in sys.path: 14 | sys.path.append(import_path) 15 | 16 | # from lib import plotting 17 | from lib.atari.state_processor import StateProcessor 18 | from lib.atari import helpers as atari_helpers 19 | from estimators import ValueEstimator, PolicyEstimator 20 | 21 | Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) 22 | 23 | 24 | def make_copy_params_op(v1_list, v2_list): 25 | """ 26 | Creates an operation that copies parameters from variable in v1_list to variables in v2_list. 27 | The ordering of the variables in the lists must be identical. 28 | """ 29 | v1_list = list(sorted(v1_list, key=lambda v: v.name)) 30 | v2_list = list(sorted(v2_list, key=lambda v: v.name)) 31 | 32 | update_ops = [] 33 | for v1, v2 in zip(v1_list, v2_list): 34 | op = v2.assign(v1) 35 | update_ops.append(op) 36 | 37 | return update_ops 38 | 39 | def make_train_op(local_estimator, global_estimator): 40 | """ 41 | Creates an op that applies local estimator gradients 42 | to the global estimator. 43 | """ 44 | local_grads, _ = zip(*local_estimator.grads_and_vars) 45 | # Clip gradients 46 | local_grads, _ = tf.clip_by_global_norm(local_grads, 5.0) 47 | _, global_vars = zip(*global_estimator.grads_and_vars) 48 | local_global_grads_and_vars = list(zip(local_grads, global_vars)) 49 | return global_estimator.optimizer.apply_gradients(local_global_grads_and_vars, 50 | global_step=tf.contrib.framework.get_global_step()) 51 | 52 | 53 | class Worker(object): 54 | """ 55 | An A3C worker thread. Runs episodes locally and updates global shared value and policy nets. 
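    Note (editor's addition, hedged): the run loop below alternates three steps,
    glued together by the two helpers defined above:

        sess.run(self.copy_params_op)                       # 1. sync: global -> local weights
        transitions, _, _ = self.run_n_steps(t_max, sess)   # 2. collect experience locally
        self.update(transitions, sess)                      # 3. apply local gradients to the
                                                            #    global nets via make_train_op
                                                            #    (clipped at global norm 5.0)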
56 | 57 | Args: 58 | name: A unique name for this worker 59 | env: The Gym environment used by this worker 60 | policy_net: Instance of the globally shared policy net 61 | value_net: Instance of the globally shared value net 62 | global_counter: Iterator that holds the global step 63 | discount_factor: Reward discount factor 64 | summary_writer: A tf.train.SummaryWriter for Tensorboard summaries 65 | max_global_steps: If set, stop coordinator when global_counter > max_global_steps 66 | """ 67 | def __init__(self, name, env, policy_net, value_net, global_counter, discount_factor=0.99, summary_writer=None, max_global_steps=None): 68 | self.name = name 69 | self.discount_factor = discount_factor 70 | self.max_global_steps = max_global_steps 71 | self.global_step = tf.contrib.framework.get_global_step() 72 | self.global_policy_net = policy_net 73 | self.global_value_net = value_net 74 | self.global_counter = global_counter 75 | self.local_counter = itertools.count() 76 | self.sp = StateProcessor() 77 | self.summary_writer = summary_writer 78 | self.env = env 79 | 80 | # Create local policy/value nets that are not updated asynchronously 81 | with tf.variable_scope(name): 82 | self.policy_net = PolicyEstimator(policy_net.num_outputs) 83 | self.value_net = ValueEstimator(reuse=True) 84 | 85 | # Op to copy params from global policy/valuenets 86 | self.copy_params_op = make_copy_params_op( 87 | tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES), 88 | tf.contrib.slim.get_variables(scope=self.name+'/', collection=tf.GraphKeys.TRAINABLE_VARIABLES)) 89 | 90 | self.vnet_train_op = make_train_op(self.value_net, self.global_value_net) 91 | self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net) 92 | 93 | self.state = None 94 | 95 | def run(self, sess, coord, t_max): 96 | with sess.as_default(), sess.graph.as_default(): 97 | # Initial state 98 | self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset())) 99 | try: 100 | while not coord.should_stop(): 101 | # Copy Parameters from the global networks 102 | sess.run(self.copy_params_op) 103 | 104 | # Collect some experience 105 | transitions, local_t, global_t = self.run_n_steps(t_max, sess) 106 | 107 | if self.max_global_steps is not None and global_t >= self.max_global_steps: 108 | tf.logging.info("Reached global step {}. 
Stopping.".format(global_t)) 109 | coord.request_stop() 110 | return 111 | 112 | # Update the global networks 113 | self.update(transitions, sess) 114 | 115 | except tf.errors.CancelledError: 116 | return 117 | 118 | def _policy_net_predict(self, state, sess): 119 | feed_dict = { self.policy_net.states: [state] } 120 | preds = sess.run(self.policy_net.predictions, feed_dict) 121 | return preds["probs"][0] 122 | 123 | def _value_net_predict(self, state, sess): 124 | feed_dict = { self.value_net.states: [state] } 125 | preds = sess.run(self.value_net.predictions, feed_dict) 126 | return preds["logits"][0] 127 | 128 | def run_n_steps(self, n, sess): 129 | transitions = [] 130 | for _ in range(n): 131 | # Take a step 132 | action_probs = self._policy_net_predict(self.state, sess) 133 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 134 | next_state, reward, done, _ = self.env.step(action) 135 | next_state = atari_helpers.atari_make_next_state(self.state, self.sp.process(next_state)) 136 | 137 | # Store transition 138 | transitions.append(Transition( 139 | state=self.state, action=action, reward=reward, next_state=next_state, done=done)) 140 | 141 | # Increase local and global counters 142 | local_t = next(self.local_counter) 143 | global_t = next(self.global_counter) 144 | 145 | if local_t % 100 == 0: 146 | tf.logging.info("{}: local Step {}, global step {}".format(self.name, local_t, global_t)) 147 | 148 | if done: 149 | self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset())) 150 | break 151 | else: 152 | self.state = next_state 153 | return transitions, local_t, global_t 154 | 155 | def update(self, transitions, sess): 156 | """ 157 | Updates global policy and value networks based on collected experience 158 | 159 | Args: 160 | transitions: A list of experience transitions 161 | sess: A Tensorflow session 162 | """ 163 | 164 | # If we episode was not done we bootstrap the value from the last state 165 | reward = 0.0 166 | if not transitions[-1].done: 167 | reward = self._value_net_predict(transitions[-1].next_state, sess) 168 | 169 | # Accumulate minibatch exmaples 170 | states = [] 171 | policy_targets = [] 172 | value_targets = [] 173 | actions = [] 174 | 175 | for transition in transitions[::-1]: 176 | reward = transition.reward + self.discount_factor * reward 177 | policy_target = (reward - self._value_net_predict(transition.state, sess)) 178 | # Accumulate updates 179 | states.append(transition.state) 180 | actions.append(transition.action) 181 | policy_targets.append(policy_target) 182 | value_targets.append(reward) 183 | 184 | feed_dict = { 185 | self.policy_net.states: np.array(states), 186 | self.policy_net.targets: policy_targets, 187 | self.policy_net.actions: actions, 188 | self.value_net.states: np.array(states), 189 | self.value_net.targets: value_targets, 190 | } 191 | 192 | # Train the global estimators using local gradients 193 | global_step, pnet_loss, vnet_loss, _, _, pnet_summaries, vnet_summaries = sess.run([ 194 | self.global_step, 195 | self.policy_net.loss, 196 | self.value_net.loss, 197 | self.pnet_train_op, 198 | self.vnet_train_op, 199 | self.policy_net.summaries, 200 | self.value_net.summaries 201 | ], feed_dict) 202 | 203 | # Write summaries 204 | if self.summary_writer is not None: 205 | self.summary_writer.add_summary(pnet_summaries, global_step) 206 | self.summary_writer.add_summary(vnet_summaries, global_step) 207 | self.summary_writer.flush() 208 | 209 | return pnet_loss, vnet_loss, pnet_summaries, 
vnet_summaries 210 | -------------------------------------------------------------------------------- /MC/Blackjack Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import sys\n", 11 | "if \"../\" not in sys.path:\n", 12 | " sys.path.append(\"../\") \n", 13 | "from lib.envs.blackjack import BlackjackEnv" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "env = BlackjackEnv()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Player Score: 19 (Usable Ace: False), Dealer Score: 5\n", 35 | "Taking action: Hit\n", 36 | "Player Score: 27 (Usable Ace: False), Dealer Score: 5\n", 37 | "Game end. Reward: -1.0\n", 38 | "\n", 39 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 40 | "Taking action: Stick\n", 41 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 42 | "Game end. Reward: 0.0\n", 43 | "\n", 44 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 45 | "Taking action: Stick\n", 46 | "Player Score: 21 (Usable Ace: True), Dealer Score: 10\n", 47 | "Game end. Reward: 1.0\n", 48 | "\n", 49 | "Player Score: 14 (Usable Ace: True), Dealer Score: 10\n", 50 | "Taking action: Hit\n", 51 | "Player Score: 19 (Usable Ace: True), Dealer Score: 10\n", 52 | "Taking action: Hit\n", 53 | "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", 54 | "Taking action: Hit\n", 55 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 56 | "Taking action: Stick\n", 57 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 58 | "Game end. Reward: 1.0\n", 59 | "\n", 60 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 61 | "Taking action: Stick\n", 62 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 63 | "Game end. Reward: 1.0\n", 64 | "\n", 65 | "Player Score: 18 (Usable Ace: False), Dealer Score: 6\n", 66 | "Taking action: Hit\n", 67 | "Player Score: 27 (Usable Ace: False), Dealer Score: 6\n", 68 | "Game end. Reward: -1.0\n", 69 | "\n", 70 | "Player Score: 16 (Usable Ace: False), Dealer Score: 3\n", 71 | "Taking action: Hit\n", 72 | "Player Score: 18 (Usable Ace: False), Dealer Score: 3\n", 73 | "Taking action: Hit\n", 74 | "Player Score: 23 (Usable Ace: False), Dealer Score: 3\n", 75 | "Game end. Reward: -1.0\n", 76 | "\n", 77 | "Player Score: 19 (Usable Ace: False), Dealer Score: 10\n", 78 | "Taking action: Hit\n", 79 | "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", 80 | "Game end. Reward: -1.0\n", 81 | "\n", 82 | "Player Score: 19 (Usable Ace: False), Dealer Score: 4\n", 83 | "Taking action: Hit\n", 84 | "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", 85 | "Taking action: Stick\n", 86 | "Player Score: 21 (Usable Ace: False), Dealer Score: 4\n", 87 | "Game end. Reward: 1.0\n", 88 | "\n", 89 | "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", 90 | "Taking action: Stick\n", 91 | "Player Score: 21 (Usable Ace: True), Dealer Score: 4\n", 92 | "Game end. 
Reward: 1.0\n", 93 | "\n", 94 | "Player Score: 16 (Usable Ace: True), Dealer Score: 10\n", 95 | "Taking action: Hit\n", 96 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 97 | "Taking action: Hit\n", 98 | "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", 99 | "Game end. Reward: -1.0\n", 100 | "\n", 101 | "Player Score: 14 (Usable Ace: False), Dealer Score: 10\n", 102 | "Taking action: Hit\n", 103 | "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", 104 | "Game end. Reward: -1.0\n", 105 | "\n", 106 | "Player Score: 12 (Usable Ace: False), Dealer Score: 10\n", 107 | "Taking action: Hit\n", 108 | "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", 109 | "Taking action: Hit\n", 110 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 111 | "Taking action: Hit\n", 112 | "Player Score: 26 (Usable Ace: False), Dealer Score: 10\n", 113 | "Game end. Reward: -1.0\n", 114 | "\n", 115 | "Player Score: 16 (Usable Ace: True), Dealer Score: 8\n", 116 | "Taking action: Hit\n", 117 | "Player Score: 18 (Usable Ace: True), Dealer Score: 8\n", 118 | "Taking action: Hit\n", 119 | "Player Score: 18 (Usable Ace: False), Dealer Score: 8\n", 120 | "Taking action: Hit\n", 121 | "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", 122 | "Taking action: Stick\n", 123 | "Player Score: 20 (Usable Ace: False), Dealer Score: 8\n", 124 | "Game end. Reward: 1.0\n", 125 | "\n", 126 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 127 | "Taking action: Stick\n", 128 | "Player Score: 20 (Usable Ace: False), Dealer Score: 10\n", 129 | "Game end. Reward: -1.0\n", 130 | "\n", 131 | "Player Score: 15 (Usable Ace: False), Dealer Score: 10\n", 132 | "Taking action: Hit\n", 133 | "Player Score: 16 (Usable Ace: False), Dealer Score: 10\n", 134 | "Taking action: Hit\n", 135 | "Player Score: 23 (Usable Ace: False), Dealer Score: 10\n", 136 | "Game end. Reward: -1.0\n", 137 | "\n", 138 | "Player Score: 12 (Usable Ace: False), Dealer Score: 4\n", 139 | "Taking action: Hit\n", 140 | "Player Score: 16 (Usable Ace: False), Dealer Score: 4\n", 141 | "Taking action: Hit\n", 142 | "Player Score: 24 (Usable Ace: False), Dealer Score: 4\n", 143 | "Game end. Reward: -1.0\n", 144 | "\n", 145 | "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", 146 | "Taking action: Stick\n", 147 | "Player Score: 20 (Usable Ace: False), Dealer Score: 7\n", 148 | "Game end. Reward: 1.0\n", 149 | "\n", 150 | "Player Score: 15 (Usable Ace: False), Dealer Score: 7\n", 151 | "Taking action: Hit\n", 152 | "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", 153 | "Taking action: Stick\n", 154 | "Player Score: 21 (Usable Ace: False), Dealer Score: 7\n", 155 | "Game end. Reward: 1.0\n", 156 | "\n", 157 | "Player Score: 15 (Usable Ace: False), Dealer Score: 8\n", 158 | "Taking action: Hit\n", 159 | "Player Score: 23 (Usable Ace: False), Dealer Score: 8\n", 160 | "Game end. 
Reward: -1.0\n", 161 | "\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "def print_observation(observation):\n", 167 | " score, dealer_score, usable_ace = observation\n", 168 | " print(\"Player Score: {} (Usable Ace: {}), Dealer Score: {}\".format(\n", 169 | " score, usable_ace, dealer_score))\n", 170 | "\n", 171 | "def strategy(observation):\n", 172 | " score, dealer_score, usable_ace = observation\n", 173 | " # Stick (action 0) if the score is > 20, hit (action 1) otherwise\n", 174 | " return 0 if score >= 20 else 1\n", 175 | "\n", 176 | "for i_episode in range(20):\n", 177 | " observation = env.reset()\n", 178 | " for t in range(100):\n", 179 | " print_observation(observation)\n", 180 | " action = strategy(observation)\n", 181 | " print(\"Taking action: {}\".format( [\"Stick\", \"Hit\"][action]))\n", 182 | " observation, reward, done, _ = env.step(action)\n", 183 | " if done:\n", 184 | " print_observation(observation)\n", 185 | " print(\"Game end. Reward: {}\\n\".format(float(reward)))\n", 186 | " break" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.4" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 1 218 | } 219 | -------------------------------------------------------------------------------- /DP/Policy Iteration Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pprint\n", 11 | "import sys\n", 12 | "if \"../\" not in sys.path:\n", 13 | " sys.path.append(\"../\") \n", 14 | "from lib.envs.gridworld import GridworldEnv" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "pp = pprint.PrettyPrinter(indent=2)\n", 24 | "env = GridworldEnv()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Taken from Policy Evaluation Exercise!\n", 34 | "\n", 35 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 36 | " \"\"\"\n", 37 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 38 | " \n", 39 | " Args:\n", 40 | " policy: [S, A] shaped matrix representing the policy.\n", 41 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 42 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 43 | " env.nS is a number of states in the environment. 
\n", 44 | " env.nA is a number of actions in the environment.\n", 45 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 46 | " discount_factor: Gamma discount factor.\n", 47 | " \n", 48 | " Returns:\n", 49 | " Vector of length env.nS representing the value function.\n", 50 | " \"\"\"\n", 51 | " # Start with a random (all 0) value function\n", 52 | " V = np.zeros(env.nS)\n", 53 | " while True:\n", 54 | " delta = 0\n", 55 | " # For each state, perform a \"full backup\"\n", 56 | " for s in range(env.nS):\n", 57 | " v = 0\n", 58 | " # Look at the possible next actions\n", 59 | " for a, action_prob in enumerate(policy[s]):\n", 60 | " # For each action, look at the possible next states...\n", 61 | " for prob, next_state, reward, done in env.P[s][a]:\n", 62 | " # Calculate the expected value\n", 63 | " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", 64 | " # How much our value function changed (across any states)\n", 65 | " delta = max(delta, np.abs(v - V[s]))\n", 66 | " V[s] = v\n", 67 | " # Stop evaluating once our value function change is below a threshold\n", 68 | " if delta < theta:\n", 69 | " break\n", 70 | " return np.array(V)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", 80 | " \"\"\"\n", 81 | " Policy Improvement Algorithm. Iteratively evaluates and improves a policy\n", 82 | " until an optimal policy is found.\n", 83 | " \n", 84 | " Args:\n", 85 | " env: The OpenAI environment.\n", 86 | " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", 87 | " policy, env, discount_factor.\n", 88 | " discount_factor: gamma discount factor.\n", 89 | " \n", 90 | " Returns:\n", 91 | " A tuple (policy, V). 
\n", 92 | " policy is the optimal policy, a matrix of shape [S, A] where each state s\n", 93 | " contains a valid probability distribution over actions.\n", 94 | " V is the value function for the optimal policy.\n", 95 | " \n", 96 | " \"\"\"\n", 97 | "\n", 98 | " def one_step_lookahead(state, V):\n", 99 | " \"\"\"\n", 100 | " Helper function to calculate the value for all action in a given state.\n", 101 | " \n", 102 | " Args:\n", 103 | " state: The state to consider (int)\n", 104 | " V: The value to use as an estimator, Vector of length env.nS\n", 105 | " \n", 106 | " Returns:\n", 107 | " A vector of length env.nA containing the expected value of each action.\n", 108 | " \"\"\"\n", 109 | " A = np.zeros(env.nA)\n", 110 | " for a in range(env.nA):\n", 111 | " for prob, next_state, reward, done in env.P[state][a]:\n", 112 | " A[a] += prob * (reward + discount_factor * V[next_state])\n", 113 | " return A\n", 114 | " \n", 115 | " # Start with a random policy\n", 116 | " policy = np.ones([env.nS, env.nA]) / env.nA\n", 117 | " \n", 118 | " while True:\n", 119 | " # Evaluate the current policy\n", 120 | " V = policy_eval_fn(policy, env, discount_factor)\n", 121 | " \n", 122 | " # Will be set to false if we make any changes to the policy\n", 123 | " policy_stable = True\n", 124 | " \n", 125 | " # For each state...\n", 126 | " for s in range(env.nS):\n", 127 | " # The best action we would take under the current policy\n", 128 | " chosen_a = np.argmax(policy[s])\n", 129 | " \n", 130 | " # Find the best action by one-step lookahead\n", 131 | " # Ties are resolved arbitarily\n", 132 | " action_values = one_step_lookahead(s, V)\n", 133 | " best_a = np.argmax(action_values)\n", 134 | " \n", 135 | " # Greedily update the policy\n", 136 | " if chosen_a != best_a:\n", 137 | " policy_stable = False\n", 138 | " policy[s] = np.eye(env.nA)[best_a]\n", 139 | " \n", 140 | " # If the policy is stable we've found an optimal policy. Return it\n", 141 | " if policy_stable:\n", 142 | " return policy, V" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "Policy Probability Distribution:\n", 155 | "[[1. 0. 0. 0.]\n", 156 | " [0. 0. 0. 1.]\n", 157 | " [0. 0. 0. 1.]\n", 158 | " [0. 0. 1. 0.]\n", 159 | " [1. 0. 0. 0.]\n", 160 | " [1. 0. 0. 0.]\n", 161 | " [1. 0. 0. 0.]\n", 162 | " [0. 0. 1. 0.]\n", 163 | " [1. 0. 0. 0.]\n", 164 | " [1. 0. 0. 0.]\n", 165 | " [0. 1. 0. 0.]\n", 166 | " [0. 0. 1. 0.]\n", 167 | " [1. 0. 0. 0.]\n", 168 | " [0. 1. 0. 0.]\n", 169 | " [0. 1. 0. 0.]\n", 170 | " [1. 0. 0. 0.]]\n", 171 | "\n", 172 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 173 | "[[0 3 3 2]\n", 174 | " [0 0 0 2]\n", 175 | " [0 0 1 2]\n", 176 | " [0 1 1 0]]\n", 177 | "\n", 178 | "Value Function:\n", 179 | "[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1. 0.]\n", 180 | "\n", 181 | "Reshaped Grid Value Function:\n", 182 | "[[ 0. -1. -2. -3.]\n", 183 | " [-1. -2. -3. -2.]\n", 184 | " [-2. -3. -2. -1.]\n", 185 | " [-3. -2. -1. 
0.]]\n", 186 | "\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "policy, v = policy_improvement(env)\n", 192 | "print(\"Policy Probability Distribution:\")\n", 193 | "print(policy)\n", 194 | "print(\"\")\n", 195 | "\n", 196 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 197 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 198 | "print(\"\")\n", 199 | "\n", 200 | "print(\"Value Function:\")\n", 201 | "print(v)\n", 202 | "print(\"\")\n", 203 | "\n", 204 | "print(\"Reshaped Grid Value Function:\")\n", 205 | "print(v.reshape(env.shape))\n", 206 | "print(\"\")\n", 207 | "\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Test the value function\n", 217 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 218 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.6.4" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 1 250 | } 251 | -------------------------------------------------------------------------------- /DP/Value Iteration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pprint\n", 13 | "import sys\n", 14 | "if \"../\" not in sys.path:\n", 15 | " sys.path.append(\"../\") \n", 16 | "from lib.envs.gridworld import GridworldEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 4, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "pp = pprint.PrettyPrinter(indent=2)\n", 28 | "env = GridworldEnv()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 5, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n", 40 | " \"\"\"\n", 41 | " Value Iteration Algorithm.\n", 42 | " \n", 43 | " Args:\n", 44 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 45 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 46 | " env.nS is a number of states in the environment. \n", 47 | " env.nA is a number of actions in the environment.\n", 48 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 49 | " discount_factor: Gamma discount factor.\n", 50 | " \n", 51 | " Returns:\n", 52 | " A tuple (policy, V) of the optimal policy and the optimal value function. 
\n", 53 | " \"\"\"\n", 54 | " \n", 55 | "\n", 56 | " V = np.zeros(env.nS)\n", 57 | " policy = np.zeros([env.nS, env.nA])\n", 58 | " \n", 59 | " # Implement!\n", 60 | " return policy, V" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Policy Probability Distribution:\n", 73 | "[[ 0. 0. 0. 0.]\n", 74 | " [ 0. 0. 0. 0.]\n", 75 | " [ 0. 0. 0. 0.]\n", 76 | " [ 0. 0. 0. 0.]\n", 77 | " [ 0. 0. 0. 0.]\n", 78 | " [ 0. 0. 0. 0.]\n", 79 | " [ 0. 0. 0. 0.]\n", 80 | " [ 0. 0. 0. 0.]\n", 81 | " [ 0. 0. 0. 0.]\n", 82 | " [ 0. 0. 0. 0.]\n", 83 | " [ 0. 0. 0. 0.]\n", 84 | " [ 0. 0. 0. 0.]\n", 85 | " [ 0. 0. 0. 0.]\n", 86 | " [ 0. 0. 0. 0.]\n", 87 | " [ 0. 0. 0. 0.]\n", 88 | " [ 0. 0. 0. 0.]]\n", 89 | "\n", 90 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 91 | "[[0 0 0 0]\n", 92 | " [0 0 0 0]\n", 93 | " [0 0 0 0]\n", 94 | " [0 0 0 0]]\n", 95 | "\n", 96 | "Value Function:\n", 97 | "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 98 | "\n", 99 | "Reshaped Grid Value Function:\n", 100 | "[[ 0. 0. 0. 0.]\n", 101 | " [ 0. 0. 0. 0.]\n", 102 | " [ 0. 0. 0. 0.]\n", 103 | " [ 0. 0. 0. 0.]]\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "policy, v = value_iteration(env)\n", 110 | "\n", 111 | "print(\"Policy Probability Distribution:\")\n", 112 | "print(policy)\n", 113 | "print(\"\")\n", 114 | "\n", 115 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 116 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 117 | "print(\"\")\n", 118 | "\n", 119 | "print(\"Value Function:\")\n", 120 | "print(v)\n", 121 | "print(\"\")\n", 122 | "\n", 123 | "print(\"Reshaped Grid Value Function:\")\n", 124 | "print(v.reshape(env.shape))\n", 125 | "print(\"\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "ename": "AssertionError", 135 | "evalue": "\nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])", 136 | "output_type": "error", 137 | "traceback": [ 138 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 139 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 140 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Test the value function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mexpected_v\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtesting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_array_almost_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected_v\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 141 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_almost_equal\u001b[0;34m(x, y, decimal, err_msg, verbose)\u001b[0m\n\u001b[1;32m 914\u001b[0m assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,\n\u001b[1;32m 915\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Arrays are not almost equal to %d decimals'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m precision=decimal)\n\u001b[0m\u001b[1;32m 917\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 142 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_compare\u001b[0;34m(comparison, x, y, err_msg, verbose, header, precision)\u001b[0m\n\u001b[1;32m 735\u001b[0m names=('x', 'y'), precision=precision)\n\u001b[1;32m 736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcond\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 737\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 738\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 143 | "\u001b[0;31mAssertionError\u001b[0m: \nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Test the value function\n", 149 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 150 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.5.2" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 1 175 | } 176 | 
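A short editor's sketch (not part of the original notebooks): once the value-iteration loop above has converged, the deterministic optimal policy is read off with one more one-step lookahead per state, exactly as the solution notebook does. The snippet assumes the `env`, `V`, `np`, and `discount_factor` names from the exercise cells above.

```python
# Recover a greedy policy from a converged value function V.
policy = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    # Expected value of each action under V
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + discount_factor * V[next_state])
    # Put all probability mass on the best action
    policy[s, np.argmax(q)] = 1.0
```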
-------------------------------------------------------------------------------- /DP/Policy Iteration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pprint\n", 13 | "import sys\n", 14 | "if \"../\" not in sys.path:\n", 15 | " sys.path.append(\"../\") \n", 16 | "from lib.envs.gridworld import GridworldEnv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 6, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "pp = pprint.PrettyPrinter(indent=2)\n", 28 | "env = GridworldEnv()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 7, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "# Taken from Policy Evaluation Exercise!\n", 40 | "\n", 41 | "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n", 42 | " \"\"\"\n", 43 | " Evaluate a policy given an environment and a full description of the environment's dynamics.\n", 44 | " \n", 45 | " Args:\n", 46 | " policy: [S, A] shaped matrix representing the policy.\n", 47 | " env: OpenAI env. env.P represents the transition probabilities of the environment.\n", 48 | " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n", 49 | " env.nS is a number of states in the environment. \n", 50 | " env.nA is a number of actions in the environment.\n", 51 | " theta: We stop evaluation once our value function change is less than theta for all states.\n", 52 | " discount_factor: Gamma discount factor.\n", 53 | " \n", 54 | " Returns:\n", 55 | " Vector of length env.nS representing the value function.\n", 56 | " \"\"\"\n", 57 | " # Start with a random (all 0) value function\n", 58 | " V = np.zeros(env.nS)\n", 59 | " while True:\n", 60 | " delta = 0\n", 61 | " # For each state, perform a \"full backup\"\n", 62 | " for s in range(env.nS):\n", 63 | " v = 0\n", 64 | " # Look at the possible next actions\n", 65 | " for a, action_prob in enumerate(policy[s]):\n", 66 | " # For each action, look at the possible next states...\n", 67 | " for prob, next_state, reward, done in env.P[s][a]:\n", 68 | " # Calculate the expected value\n", 69 | " v += action_prob * prob * (reward + discount_factor * V[next_state])\n", 70 | " # How much our value function changed (across any states)\n", 71 | " delta = max(delta, np.abs(v - V[s]))\n", 72 | " V[s] = v\n", 73 | " # Stop evaluating once our value function change is below a threshold\n", 74 | " if delta < theta:\n", 75 | " break\n", 76 | " return np.array(V)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 13, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n", 88 | " \"\"\"\n", 89 | " Policy Improvement Algorithm. Iteratively evaluates and improves a policy\n", 90 | " until an optimal policy is found.\n", 91 | " \n", 92 | " Args:\n", 93 | " env: The OpenAI envrionment.\n", 94 | " policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n", 95 | " policy, env, discount_factor.\n", 96 | " discount_factor: gamma discount factor.\n", 97 | " \n", 98 | " Returns:\n", 99 | " A tuple (policy, V). 
\n", 100 | " policy is the optimal policy, a matrix of shape [S, A] where each state s\n", 101 | " contains a valid probability distribution over actions.\n", 102 | " V is the value function for the optimal policy.\n", 103 | " \n", 104 | " \"\"\"\n", 105 | " # Start with a random policy\n", 106 | " policy = np.ones([env.nS, env.nA]) / env.nA\n", 107 | " \n", 108 | " while True:\n", 109 | " # Implement this!\n", 110 | " break\n", 111 | " \n", 112 | " return policy, np.zeros(env.nS)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 14, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Policy Probability Distribution:\n", 125 | "[[ 0.25 0.25 0.25 0.25]\n", 126 | " [ 0.25 0.25 0.25 0.25]\n", 127 | " [ 0.25 0.25 0.25 0.25]\n", 128 | " [ 0.25 0.25 0.25 0.25]\n", 129 | " [ 0.25 0.25 0.25 0.25]\n", 130 | " [ 0.25 0.25 0.25 0.25]\n", 131 | " [ 0.25 0.25 0.25 0.25]\n", 132 | " [ 0.25 0.25 0.25 0.25]\n", 133 | " [ 0.25 0.25 0.25 0.25]\n", 134 | " [ 0.25 0.25 0.25 0.25]\n", 135 | " [ 0.25 0.25 0.25 0.25]\n", 136 | " [ 0.25 0.25 0.25 0.25]\n", 137 | " [ 0.25 0.25 0.25 0.25]\n", 138 | " [ 0.25 0.25 0.25 0.25]\n", 139 | " [ 0.25 0.25 0.25 0.25]\n", 140 | " [ 0.25 0.25 0.25 0.25]]\n", 141 | "\n", 142 | "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\n", 143 | "[[0 0 0 0]\n", 144 | " [0 0 0 0]\n", 145 | " [0 0 0 0]\n", 146 | " [0 0 0 0]]\n", 147 | "\n", 148 | "Value Function:\n", 149 | "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 150 | "\n", 151 | "Reshaped Grid Value Function:\n", 152 | "[[ 0. 0. 0. 0.]\n", 153 | " [ 0. 0. 0. 0.]\n", 154 | " [ 0. 0. 0. 0.]\n", 155 | " [ 0. 0. 0. 0.]]\n", 156 | "\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "policy, v = policy_improvement(env)\n", 162 | "print(\"Policy Probability Distribution:\")\n", 163 | "print(policy)\n", 164 | "print(\"\")\n", 165 | "\n", 166 | "print(\"Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):\")\n", 167 | "print(np.reshape(np.argmax(policy, axis=1), env.shape))\n", 168 | "print(\"\")\n", 169 | "\n", 170 | "print(\"Value Function:\")\n", 171 | "print(v)\n", 172 | "print(\"\")\n", 173 | "\n", 174 | "print(\"Reshaped Grid Value Function:\")\n", 175 | "print(v.reshape(env.shape))\n", 176 | "print(\"\")\n", 177 | "\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 15, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "ename": "AssertionError", 187 | "evalue": "\nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])", 188 | "output_type": "error", 189 | "traceback": [ 190 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 191 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 192 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Test the value function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mexpected_v\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtesting\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_array_almost_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected_v\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 193 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_almost_equal\u001b[0;34m(x, y, decimal, err_msg, verbose)\u001b[0m\n\u001b[1;32m 914\u001b[0m assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,\n\u001b[1;32m 915\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Arrays are not almost equal to %d decimals'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m precision=decimal)\n\u001b[0m\u001b[1;32m 917\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 194 | "\u001b[0;32m/Users/dennybritz/venvs/tf/lib/python3.5/site-packages/numpy/testing/utils.py\u001b[0m in \u001b[0;36massert_array_compare\u001b[0;34m(comparison, x, y, err_msg, verbose, header, precision)\u001b[0m\n\u001b[1;32m 735\u001b[0m names=('x', 'y'), precision=precision)\n\u001b[1;32m 736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcond\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 737\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 738\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 739\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 195 | "\u001b[0;31mAssertionError\u001b[0m: \nArrays are not almost equal to 2 decimals\n\n(mismatch 87.5%)\n x: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n 0., 0., 0.])\n y: array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# Test the value function\n", 201 | "expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n", 202 | "np.testing.assert_array_almost_equal(v, expected_v, decimal=2)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 
null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.5.2" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 1 236 | } 237 | -------------------------------------------------------------------------------- /PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import gym\n", 14 | "import itertools\n", 15 | "import matplotlib\n", 16 | "import numpy as np\n", 17 | "import sys\n", 18 | "import tensorflow as tf\n", 19 | "import collections\n", 20 | "\n", 21 | "import sklearn.pipeline\n", 22 | "import sklearn.preprocessing\n", 23 | "\n", 24 | "if \"../\" not in sys.path:\n", 25 | " sys.path.append(\"../\") \n", 26 | "from lib.envs.cliff_walking import CliffWalkingEnv\n", 27 | "from lib import plotting\n", 28 | "\n", 29 | "from sklearn.kernel_approximation import RBFSampler\n", 30 | "\n", 31 | "matplotlib.style.use('ggplot')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "[2017-06-16 13:11:05,265] Making new env: MountainCarContinuous-v0\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "array([-0.21213569, 0.03012651])" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "env = gym.envs.make(\"MountainCarContinuous-v0\")\n", 59 | "env.observation_space.sample()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "FeatureUnion(n_jobs=1,\n", 71 | " transformer_list=[('rbf1', RBFSampler(gamma=5.0, n_components=100, random_state=None)), ('rbf2', RBFSampler(gamma=2.0, n_components=100, random_state=None)), ('rbf3', RBFSampler(gamma=1.0, n_components=100, random_state=None)), ('rbf4', RBFSampler(gamma=0.5, n_components=100, random_state=None))],\n", 72 | " transformer_weights=None)" 73 | ] 74 | }, 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "# Feature Preprocessing: Normalize to zero mean and unit variance\n", 82 | "# We use a few samples from the observation space to do this\n", 83 | "observation_examples = np.array([env.observation_space.sample() for x in range(10000)])\n", 84 | "scaler = sklearn.preprocessing.StandardScaler()\n", 85 | "scaler.fit(observation_examples)\n", 86 | "\n", 87 | "# Used to converte a state to a featurizes represenation.\n", 88 | "# We use RBF kernels with different variances to cover different parts of the space\n", 89 | "featurizer = sklearn.pipeline.FeatureUnion([\n", 90 | " (\"rbf1\", RBFSampler(gamma=5.0, n_components=100)),\n", 91 | " (\"rbf2\", 
RBFSampler(gamma=2.0, n_components=100)),\n", 92 | " (\"rbf3\", RBFSampler(gamma=1.0, n_components=100)),\n", 93 | " (\"rbf4\", RBFSampler(gamma=0.5, n_components=100))\n", 94 | " ])\n", 95 | "featurizer.fit(scaler.transform(observation_examples))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "def featurize_state(state):\n", 107 | " \"\"\"\n", 108 | " Returns the featurized representation for a state.\n", 109 | " \"\"\"\n", 110 | " scaled = scaler.transform([state])\n", 111 | " featurized = featurizer.transform(scaled)\n", 112 | " return featurized[0]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "class PolicyEstimator():\n", 124 | " \"\"\"\n", 125 | " Policy Function approximator. \n", 126 | " \"\"\"\n", 127 | " \n", 128 | " def __init__(self, learning_rate=0.01, scope=\"policy_estimator\"):\n", 129 | " with tf.variable_scope(scope):\n", 130 | " self.state = tf.placeholder(tf.float32, [400], \"state\")\n", 131 | " self.target = tf.placeholder(dtype=tf.float32, name=\"target\")\n", 132 | "\n", 133 | " # This is just a linear function approximator\n", 134 | " self.mu = tf.contrib.layers.fully_connected(\n", 135 | " inputs=tf.expand_dims(self.state, 0),\n", 136 | " num_outputs=1,\n", 137 | " activation_fn=None,\n", 138 | " weights_initializer=tf.zeros_initializer)\n", 139 | " self.mu = tf.squeeze(self.mu)\n", 140 | " \n", 141 | " self.sigma = tf.contrib.layers.fully_connected(\n", 142 | " inputs=tf.expand_dims(self.state, 0),\n", 143 | " num_outputs=1,\n", 144 | " activation_fn=None,\n", 145 | " weights_initializer=tf.zeros_initializer)\n", 146 | " \n", 147 | " self.sigma = tf.squeeze(self.sigma)\n", 148 | " self.sigma = tf.nn.softplus(self.sigma) + 1e-5\n", 149 | " self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)\n", 150 | " self.action = self.normal_dist._sample_n(1)\n", 151 | " self.action = tf.clip_by_value(self.action, env.action_space.low[0], env.action_space.high[0])\n", 152 | "\n", 153 | " # Loss and train op\n", 154 | " self.loss = -self.normal_dist.log_prob(self.action) * self.target\n", 155 | " # Subtract an entropy bonus from the loss to encourage exploration\n", 156 | " self.loss -= 1e-1 * self.normal_dist.entropy()\n", 157 | " \n", 158 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n", 159 | " self.train_op = self.optimizer.minimize(\n", 160 | " self.loss, global_step=tf.contrib.framework.get_global_step())\n", 161 | " \n", 162 | " def predict(self, state, sess=None):\n", 163 | " sess = sess or tf.get_default_session()\n", 164 | " state = featurize_state(state)\n", 165 | " return sess.run(self.action, { self.state: state })\n", 166 | "\n", 167 | " def update(self, state, target, action, sess=None):\n", 168 | " sess = sess or tf.get_default_session()\n", 169 | " state = featurize_state(state)\n", 170 | " feed_dict = { self.state: state, self.target: target, self.action: action }\n", 171 | " _, loss = sess.run([self.train_op, self.loss], feed_dict)\n", 172 | " return loss" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 7, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "class ValueEstimator():\n", 184 | " \"\"\"\n", 185 | " Value Function approximator. 
\n", 186 | " \"\"\"\n", 187 | " \n", 188 | " def __init__(self, learning_rate=0.1, scope=\"value_estimator\"):\n", 189 | " with tf.variable_scope(scope):\n", 190 | " self.state = tf.placeholder(tf.float32, [400], \"state\")\n", 191 | " self.target = tf.placeholder(dtype=tf.float32, name=\"target\")\n", 192 | "\n", 193 | " # This is just linear classifier\n", 194 | " self.output_layer = tf.contrib.layers.fully_connected(\n", 195 | " inputs=tf.expand_dims(self.state, 0),\n", 196 | " num_outputs=1,\n", 197 | " activation_fn=None,\n", 198 | " weights_initializer=tf.zeros_initializer)\n", 199 | "\n", 200 | " self.value_estimate = tf.squeeze(self.output_layer)\n", 201 | " self.loss = tf.squared_difference(self.value_estimate, self.target)\n", 202 | "\n", 203 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n", 204 | " self.train_op = self.optimizer.minimize(\n", 205 | " self.loss, global_step=tf.contrib.framework.get_global_step()) \n", 206 | " \n", 207 | " def predict(self, state, sess=None):\n", 208 | " sess = sess or tf.get_default_session()\n", 209 | " state = featurize_state(state)\n", 210 | " return sess.run(self.value_estimate, { self.state: state })\n", 211 | "\n", 212 | " def update(self, state, target, sess=None):\n", 213 | " sess = sess or tf.get_default_session()\n", 214 | " state = featurize_state(state)\n", 215 | " feed_dict = { self.state: state, self.target: target }\n", 216 | " _, loss = sess.run([self.train_op, self.loss], feed_dict)\n", 217 | " return loss" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 15, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):\n", 229 | " \"\"\"\n", 230 | " Actor Critic Algorithm. 
Optimizes the policy \n", 231 | " function approximator using policy gradient.\n", 232 | " \n", 233 | " Args:\n", 234 | " env: OpenAI environment.\n", 235 | " estimator_policy: Policy Function to be optimized \n", 236 | " estimator_value: Value function approximator, used as a critic\n", 237 | " num_episodes: Number of episodes to run for\n", 238 | " discount_factor: Time-discount factor\n", 239 | " \n", 240 | " Returns:\n", 241 | " An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.\n", 242 | " \"\"\"\n", 243 | "\n", 244 | " # Keeps track of useful statistics\n", 245 | " stats = plotting.EpisodeStats(\n", 246 | " episode_lengths=np.zeros(num_episodes),\n", 247 | " episode_rewards=np.zeros(num_episodes)) \n", 248 | " \n", 249 | " Transition = collections.namedtuple(\"Transition\", [\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n", 250 | " \n", 251 | " for i_episode in range(num_episodes):\n", 252 | " # Reset the environment and pick the first action\n", 253 | " state = env.reset()\n", 254 | " \n", 255 | " episode = []\n", 256 | " \n", 257 | " # One step in the environment\n", 258 | " for t in itertools.count():\n", 259 | " \n", 260 | " # env.render()\n", 261 | " \n", 262 | " # Take a step\n", 263 | " action = estimator_policy.predict(state)\n", 264 | " next_state, reward, done, _ = env.step(action)\n", 265 | " \n", 266 | " # Keep track of the transition\n", 267 | " episode.append(Transition(\n", 268 | " state=state, action=action, reward=reward, next_state=next_state, done=done))\n", 269 | " \n", 270 | " # Update statistics\n", 271 | " stats.episode_rewards[i_episode] += reward\n", 272 | " stats.episode_lengths[i_episode] = t\n", 273 | " \n", 274 | " # Calculate TD Target\n", 275 | " value_next = estimator_value.predict(next_state)\n", 276 | " td_target = reward + discount_factor * value_next\n", 277 | " td_error = td_target - estimator_value.predict(state)\n", 278 | " \n", 279 | " # Update the value estimator\n", 280 | " estimator_value.update(state, td_target)\n", 281 | " \n", 282 | " # Update the policy estimator\n", 283 | " # using the td error as our advantage estimate\n", 284 | " estimator_policy.update(state, td_error, action)\n", 285 | " \n", 286 | " # Print out which step we're on, useful for debugging.\n", 287 | " print(\"\\rStep {} @ Episode {}/{} ({})\".format(\n", 288 | " t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end=\"\")\n", 289 | "\n", 290 | " if done:\n", 291 | " break\n", 292 | " \n", 293 | " state = next_state\n", 294 | " \n", 295 | " return stats" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 19, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "WARNING:tensorflow:From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n", 308 | "Instructions for updating:\n", 309 | "Use `tf.global_variables_initializer` instead.\n" 310 | ] 311 | }, 312 | { 313 | "name": "stderr", 314 | "output_type": "stream", 315 | "text": [ 316 | "[2017-06-16 13:31:05,772] From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n", 317 | "Instructions for updating:\n", 318 | "Use 
`tf.global_variables_initializer` instead.\n" 319 | ] 320 | }, 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "Step 662 @ Episode 50/50 (65.13252566564918))" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "tf.reset_default_graph()\n", 331 | "\n", 332 | "global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n", 333 | "policy_estimator = PolicyEstimator(learning_rate=0.001)\n", 334 | "value_estimator = ValueEstimator(learning_rate=0.1)\n", 335 | "\n", 336 | "with tf.Session() as sess:\n", 337 | " sess.run(tf.initialize_all_variables())\n", 338 | " # Note, due to randomness in the policy the number of episodes you need varies\n", 339 | " # TODO: Sometimes the algorithm gets stuck, I'm not sure what exactly is happening there.\n", 340 | " stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "plotting.plot_episode_stats(stats, smoothing_window=10)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.5.2" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 1 394 | } 395 | -------------------------------------------------------------------------------- /DQN/dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.wrappers import Monitor 3 | import itertools 4 | import numpy as np 5 | import os 6 | import random 7 | import sys 8 | import tensorflow as tf 9 | 10 | if "../" not in sys.path: 11 | sys.path.append("../") 12 | 13 | from lib import plotting 14 | from collections import deque, namedtuple 15 | 16 | env = gym.envs.make("Breakout-v0") 17 | 18 | # Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions 19 | VALID_ACTIONS = [0, 1, 2, 3] 20 | 21 | class StateProcessor(): 22 | """ 23 | Processes a raw Atari image. Resizes it and converts it to grayscale. 24 | """ 25 | def __init__(self): 26 | # Build the Tensorflow graph 27 | with tf.variable_scope("state_processor"): 28 | self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8) 29 | self.output = tf.image.rgb_to_grayscale(self.input_state) 30 | self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160) 31 | self.output = tf.image.resize_images( 32 | self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) 33 | self.output = tf.squeeze(self.output) 34 | 35 | def process(self, sess, state): 36 | """ 37 | Args: 38 | sess: A Tensorflow session object 39 | state: A [210, 160, 3] Atari RGB State 40 | 41 | Returns: 42 | A processed [84, 84] state representing grayscale values. 
43 | """ 44 | return sess.run(self.output, { self.input_state: state }) 45 | 46 | class Estimator(): 47 | """Q-Value Estimator neural network. 48 | 49 | This network is used for both the Q-Network and the Target Network. 50 | """ 51 | 52 | def __init__(self, scope="estimator", summaries_dir=None): 53 | self.scope = scope 54 | # Writes Tensorboard summaries to disk 55 | self.summary_writer = None 56 | with tf.variable_scope(scope): 57 | # Build the graph 58 | self._build_model() 59 | if summaries_dir: 60 | summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope)) 61 | if not os.path.exists(summary_dir): 62 | os.makedirs(summary_dir) 63 | self.summary_writer = tf.summary.FileWriter(summary_dir) 64 | 65 | def _build_model(self): 66 | """ 67 | Builds the Tensorflow graph. 68 | """ 69 | 70 | # Placeholders for our input 71 | # Our input is a stack of 4 grayscale frames of shape 84, 84 each 72 | self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X") 73 | # The TD target value 74 | self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y") 75 | # Integer id of which action was selected 76 | self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions") 77 | 78 | X = tf.to_float(self.X_pl) / 255.0 79 | batch_size = tf.shape(self.X_pl)[0] 80 | 81 | # Three convolutional layers 82 | conv1 = tf.contrib.layers.conv2d( 83 | X, 32, 8, 4, activation_fn=tf.nn.relu) 84 | conv2 = tf.contrib.layers.conv2d( 85 | conv1, 64, 4, 2, activation_fn=tf.nn.relu) 86 | conv3 = tf.contrib.layers.conv2d( 87 | conv2, 64, 3, 1, activation_fn=tf.nn.relu) 88 | 89 | # Fully connected layers 90 | flattened = tf.contrib.layers.flatten(conv3) 91 | fc1 = tf.contrib.layers.fully_connected(flattened, 512) 92 | self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS)) 93 | 94 | # Get the predictions for the chosen actions only 95 | gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl 96 | self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices) 97 | 98 | # Calculate the loss 99 | self.losses = tf.squared_difference(self.y_pl, self.action_predictions) 100 | self.loss = tf.reduce_mean(self.losses) 101 | 102 | # Optimizer Parameters from original paper 103 | self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 104 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 105 | 106 | # Summaries for Tensorboard 107 | self.summaries = tf.summary.merge([ 108 | tf.summary.scalar("loss", self.loss), 109 | tf.summary.histogram("loss_hist", self.losses), 110 | tf.summary.histogram("q_values_hist", self.predictions), 111 | tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)) 112 | ]) 113 | 114 | 115 | def predict(self, sess, s): 116 | """ 117 | Predicts action values. 118 | 119 | Args: 120 | sess: Tensorflow session 121 | s: State input of shape [batch_size, 84, 84, 4] 122 | 123 | Returns: 124 | Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 125 | action values. 126 | """ 127 | return sess.run(self.predictions, { self.X_pl: s }) 128 | 129 | def update(self, sess, s, a, y): 130 | """ 131 | Updates the estimator towards the given targets. 132 | 133 | Args: 134 | sess: Tensorflow session object 135 | s: State input of shape [batch_size, 84, 84, 4] 136 | a: Chosen actions of shape [batch_size] 137 | y: Targets of shape [batch_size] 138 | 139 | Returns: 140 | The calculated loss on the batch. 
141 | """ 142 | feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a } 143 | summaries, global_step, _, loss = sess.run( 144 | [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss], 145 | feed_dict) 146 | if self.summary_writer: 147 | self.summary_writer.add_summary(summaries, global_step) 148 | return loss 149 | 150 | def copy_model_parameters(sess, estimator1, estimator2): 151 | """ 152 | Copies the model parameters of one estimator to another. 153 | 154 | Args: 155 | sess: Tensorflow session instance 156 | estimator1: Estimator to copy the parameters from 157 | estimator2: Estimator to copy the parameters to 158 | """ 159 | e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)] 160 | e1_params = sorted(e1_params, key=lambda v: v.name) 161 | e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)] 162 | e2_params = sorted(e2_params, key=lambda v: v.name) 163 | 164 | update_ops = [] 165 | for e1_v, e2_v in zip(e1_params, e2_params): 166 | op = e2_v.assign(e1_v) 167 | update_ops.append(op) 168 | 169 | sess.run(update_ops) 170 | 171 | 172 | def make_epsilon_greedy_policy(estimator, nA): 173 | """ 174 | Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon. 175 | 176 | Args: 177 | estimator: An estimator that returns q values for a given state 178 | nA: Number of actions in the environment. 179 | 180 | Returns: 181 | A function that takes the (sess, observation, epsilon) as an argument and returns 182 | the probabilities for each action in the form of a numpy array of length nA. 183 | 184 | """ 185 | def policy_fn(sess, observation, epsilon): 186 | A = np.ones(nA, dtype=float) * epsilon / nA 187 | q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0] 188 | best_action = np.argmax(q_values) 189 | A[best_action] += (1.0 - epsilon) 190 | return A 191 | return policy_fn 192 | 193 | 194 | def deep_q_learning(sess, 195 | env, 196 | q_estimator, 197 | target_estimator, 198 | state_processor, 199 | num_episodes, 200 | experiment_dir, 201 | replay_memory_size=500000, 202 | replay_memory_init_size=50000, 203 | update_target_estimator_every=10000, 204 | discount_factor=0.99, 205 | epsilon_start=1.0, 206 | epsilon_end=0.1, 207 | epsilon_decay_steps=500000, 208 | batch_size=32, 209 | record_video_every=50): 210 | """ 211 | Q-Learning algorithm for off-policy TD control using Function Approximation. 212 | Finds the optimal greedy policy while following an epsilon-greedy policy. 213 | 214 | Args: 215 | sess: Tensorflow Session object 216 | env: OpenAI environment 217 | q_estimator: Estimator object used for the q values 218 | target_estimator: Estimator object used for the targets 219 | state_processor: A StateProcessor object 220 | num_episodes: Number of episodes to run for 221 | experiment_dir: Directory to save Tensorflow summaries in 222 | replay_memory_size: Size of the replay memory 223 | replay_memory_init_size: Number of random experiences to sample when initializing 224 | the replay memory. 225 | update_target_estimator_every: Copy parameters from the Q estimator to the 226 | target estimator every N steps 227 | discount_factor: Gamma discount factor 228 | epsilon_start: Chance to sample a random action when taking an action. 
229 | Epsilon is decayed over time and this is the start value 230 | epsilon_end: The final minimum value of epsilon after decaying is done 231 | epsilon_decay_steps: Number of steps to decay epsilon over 232 | batch_size: Size of batches to sample from the replay memory 233 | record_video_every: Record a video every N episodes 234 | 235 | Returns: 236 | An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. 237 | """ 238 | 239 | Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) 240 | 241 | # The replay memory 242 | replay_memory = [] 243 | 244 | # Keeps track of useful statistics 245 | stats = plotting.EpisodeStats( 246 | episode_lengths=np.zeros(num_episodes), 247 | episode_rewards=np.zeros(num_episodes)) 248 | 249 | # Create directories for checkpoints and summaries 250 | checkpoint_dir = os.path.join(experiment_dir, "checkpoints") 251 | checkpoint_path = os.path.join(checkpoint_dir, "model") 252 | monitor_path = os.path.join(experiment_dir, "monitor") 253 | 254 | if not os.path.exists(checkpoint_dir): 255 | os.makedirs(checkpoint_dir) 256 | if not os.path.exists(monitor_path): 257 | os.makedirs(monitor_path) 258 | 259 | saver = tf.train.Saver() 260 | # Load a previous checkpoint if we find one 261 | latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 262 | if latest_checkpoint: 263 | print("Loading model checkpoint {}...\n".format(latest_checkpoint)) 264 | saver.restore(sess, latest_checkpoint) 265 | 266 | total_t = sess.run(tf.contrib.framework.get_global_step()) 267 | 268 | # The epsilon decay schedule 269 | epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) 270 | 271 | # The policy we're following 272 | policy = make_epsilon_greedy_policy( 273 | q_estimator, 274 | len(VALID_ACTIONS)) 275 | 276 | # Populate the replay memory with initial experience 277 | print("Populating replay memory...") 278 | state = env.reset() 279 | state = state_processor.process(sess, state) 280 | state = np.stack([state] * 4, axis=2) 281 | for i in range(replay_memory_init_size): 282 | action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)]) 283 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 284 | next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) 285 | next_state = state_processor.process(sess, next_state) 286 | next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 287 | replay_memory.append(Transition(state, action, reward, next_state, done)) 288 | if done: 289 | state = env.reset() 290 | state = state_processor.process(sess, state) 291 | state = np.stack([state] * 4, axis=2) 292 | else: 293 | state = next_state 294 | 295 | # Record videos 296 | # Use the gym env Monitor wrapper 297 | env = Monitor(env, 298 | directory=monitor_path, 299 | resume=True, 300 | video_callable=lambda count: count % record_video_every ==0) 301 | 302 | for i_episode in range(num_episodes): 303 | 304 | # Save the current checkpoint 305 | saver.save(tf.get_default_session(), checkpoint_path) 306 | 307 | # Reset the environment 308 | state = env.reset() 309 | state = state_processor.process(sess, state) 310 | state = np.stack([state] * 4, axis=2) 311 | loss = None 312 | 313 | # One step in the environment 314 | for t in itertools.count(): 315 | 316 | # Epsilon for this time step 317 | epsilon = epsilons[min(total_t, epsilon_decay_steps-1)] 318 | 319 | # Add epsilon to Tensorboard 320 | episode_summary = tf.Summary() 321 | 
episode_summary.value.add(simple_value=epsilon, tag="epsilon") 322 | q_estimator.summary_writer.add_summary(episode_summary, total_t) 323 | 324 | # Maybe update the target estimator 325 | if total_t % update_target_estimator_every == 0: 326 | copy_model_parameters(sess, q_estimator, target_estimator) 327 | print("\nCopied model parameters to target network.") 328 | 329 | # Print out which step we're on, useful for debugging. 330 | print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( 331 | t, total_t, i_episode + 1, num_episodes, loss), end="") 332 | sys.stdout.flush() 333 | 334 | # Take a step 335 | action_probs = policy(sess, state, epsilon) 336 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 337 | next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) 338 | next_state = state_processor.process(sess, next_state) 339 | next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 340 | 341 | # If our replay memory is full, pop the first element 342 | if len(replay_memory) == replay_memory_size: 343 | replay_memory.pop(0) 344 | 345 | # Save transition to replay memory 346 | replay_memory.append(Transition(state, action, reward, next_state, done)) 347 | 348 | # Update statistics 349 | stats.episode_rewards[i_episode] += reward 350 | stats.episode_lengths[i_episode] = t 351 | 352 | # Sample a minibatch from the replay memory 353 | samples = random.sample(replay_memory, batch_size) 354 | states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples)) 355 | 356 | # Calculate q values and targets (Double DQN) 357 | q_values_next = q_estimator.predict(sess, next_states_batch) 358 | best_actions = np.argmax(q_values_next, axis=1) 359 | q_values_next_target = target_estimator.predict(sess, next_states_batch) 360 | targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \ 361 | discount_factor * q_values_next_target[np.arange(batch_size), best_actions] 362 | 363 | # Perform gradient descent update 364 | states_batch = np.array(states_batch) 365 | loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) 366 | 367 | if done: 368 | break 369 | 370 | state = next_state 371 | total_t += 1 372 | 373 | # Add summaries to tensorboard 374 | episode_summary = tf.Summary() 375 | episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") 376 | episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") 377 | q_estimator.summary_writer.add_summary(episode_summary, total_t) 378 | q_estimator.summary_writer.flush() 379 | 380 | yield total_t, plotting.EpisodeStats( 381 | episode_lengths=stats.episode_lengths[:i_episode+1], 382 | episode_rewards=stats.episode_rewards[:i_episode+1]) 383 | 384 | env.monitor.close() 385 | return stats 386 | 387 | 388 | tf.reset_default_graph() 389 | 390 | # Where we save our checkpoints and graphs 391 | experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id)) 392 | 393 | # Create a global step variable 394 | global_step = tf.Variable(0, name='global_step', trainable=False) 395 | 396 | # Create estimators 397 | q_estimator = Estimator(scope="q", summaries_dir=experiment_dir) 398 | target_estimator = Estimator(scope="target_q") 399 | 400 | # State processor 401 | state_processor = StateProcessor() 402 | 403 | with tf.Session() as sess: 404 | sess.run(tf.global_variables_initializer()) 405 | for t, stats in 
deep_q_learning(sess, 406 | env, 407 | q_estimator=q_estimator, 408 | target_estimator=target_estimator, 409 | state_processor=state_processor, 410 | experiment_dir=experiment_dir, 411 | num_episodes=10000, 412 | replay_memory_size=500000, 413 | replay_memory_init_size=50000, 414 | update_target_estimator_every=10000, 415 | epsilon_start=1.0, 416 | epsilon_end=0.1, 417 | epsilon_decay_steps=500000, 418 | discount_factor=0.99, 419 | batch_size=32): 420 | 421 | print("\nEpisode Reward: {}".format(stats.episode_rewards[-1])) 422 | 423 | --------------------------------------------------------------------------------
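Editor's note: the target computation in `dqn.py` above follows the Double DQN rule: the online network (`q_estimator`) selects the greedy action for each next state, the lagged target network (`target_estimator`) evaluates that action, and terminal transitions drop the bootstrap term. The short NumPy sketch below reproduces just that calculation in isolation; the batch values are made-up illustrative numbers, not output of the repository code.

```
import numpy as np

# Hypothetical mini-batch of 3 transitions with 4 actions (illustrative values only).
reward_batch = np.array([1.0, 0.0, -1.0], dtype=np.float32)
done_batch = np.array([False, False, True])
discount_factor = 0.99

# Q-values for the next states: one matrix from the online network,
# one from the (lagged) target network.
q_values_next = np.array([[0.1, 0.5, 0.2, 0.0],
                          [0.3, 0.1, 0.4, 0.2],
                          [0.0, 0.0, 0.0, 0.0]], dtype=np.float32)
q_values_next_target = np.array([[0.2, 0.6, 0.1, 0.0],
                                 [0.2, 0.2, 0.5, 0.1],
                                 [0.1, 0.1, 0.1, 0.1]], dtype=np.float32)

# Double DQN: the online network picks the action, the target network evaluates it.
best_actions = np.argmax(q_values_next, axis=1)
# Zero out the bootstrap term for terminal transitions.
not_done = np.invert(done_batch).astype(np.float32)
targets_batch = reward_batch + not_done * discount_factor * \
    q_values_next_target[np.arange(len(reward_batch)), best_actions]

print(targets_batch)  # approximately [1.594, 0.495, -1.0]
```

Plain DQN would instead bootstrap from `np.max(q_values_next_target, axis=1)`; decoupling action selection (online network) from action evaluation (target network) is what reduces the over-estimation bias.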