├── .gitignore
├── LICENSE
├── hw1
│   ├── HW1 torch.ipynb
│   ├── HW1.ipynb
│   ├── README.md
│   ├── demo.bash
│   ├── experts
│   │   ├── Ant-v1.pkl
│   │   ├── HalfCheetah-v1.pkl
│   │   ├── Hopper-v1.pkl
│   │   ├── Humanoid-v1.pkl
│   │   ├── Reacher-v1.pkl
│   │   └── Walker2d-v1.pkl
│   ├── hw1fall2017.pdf
│   ├── load_policy.py
│   ├── run_expert.py
│   └── tf_util.py
├── hw2
│   ├── README.MD
│   ├── deliver.sh
│   ├── fig
│   │   ├── 1_cartpole_sb.png
│   │   ├── 2_cartpole_lb.png
│   │   ├── 2_cartpole_sb_lb.png
│   │   ├── 3_pendulum_2x16.png
│   │   ├── 4_nn_baseline.png
│   │   └── 5_hc.png
│   ├── hw2_final.pdf
│   ├── logz.py
│   ├── plot.py
│   └── train_pg.py
├── hw3
│   ├── README
│   ├── atari_wrappers.py
│   ├── dqn.py
│   ├── dqn_utils.py
│   ├── hw3.pdf
│   ├── run_dqn_atari.py
│   └── run_dqn_ram.py
├── hw4
│   ├── README.md
│   ├── cheetah_env.py
│   ├── controllers.py
│   ├── cost_functions.py
│   ├── dynamics.py
│   ├── hw4.pdf
│   ├── logz.py
│   ├── main.py
│   └── plot.py
└── sp17_hw
    ├── hw1
    │   ├── README.md
    │   ├── demo.bash
    │   ├── experts
    │   │   ├── Ant-v1.pkl
    │   │   ├── HalfCheetah-v1.pkl
    │   │   ├── Hopper-v1.pkl
    │   │   ├── Humanoid-v1.pkl
    │   │   ├── Reacher-v1.pkl
    │   │   └── Walker2d-v1.pkl
    │   ├── load_policy.py
    │   ├── run_expert.py
    │   └── tf_util.py
    ├── hw2
    │   ├── HW2.ipynb
    │   ├── discrete_env.py
    │   └── frozen_lake.py
    ├── hw3
    │   ├── README
    │   ├── atari_wrappers.py
    │   ├── dqn.py
    │   ├── dqn_utils.py
    │   ├── run_dqn_atari.py
    │   └── run_dqn_ram.py
    └── hw4
        ├── homework.md
        ├── logz.py
        ├── main.py
        └── plot_learning_curves.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # HW2 data 2 | hw2/data/* 3 | 4 | # General 5 | .DS_Store 6 | .AppleDouble 7 | .LSOverride 8 | 9 | # Icon must end with two \r 10 | Icon 11 | 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | # Byte-compiled / optimized / DLL files 33 | __pycache__/ 34 | *.py[cod] 35 | *$py.class 36 | 37 | # C extensions 38 | *.so 39 | 40 | # Distribution / packaging 41 | .Python 42 | env/ 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | *.egg-info/ 55 | .installed.cfg 56 | *.egg 57 | 58 | # PyInstaller 59 | # Usually these files are written by a python script from a template 60 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
61 | *.manifest 62 | *.spec 63 | 64 | # Installer logs 65 | pip-log.txt 66 | pip-delete-this-directory.txt 67 | 68 | # Unit test / coverage reports 69 | htmlcov/ 70 | .tox/ 71 | .coverage 72 | .coverage.* 73 | .cache 74 | nosetests.xml 75 | coverage.xml 76 | *,cover 77 | .hypothesis/ 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | 87 | # Flask stuff: 88 | instance/ 89 | .webassets-cache 90 | 91 | # Scrapy stuff: 92 | .scrapy 93 | 94 | # Sphinx documentation 95 | docs/_build/ 96 | 97 | # PyBuilder 98 | target/ 99 | 100 | # IPython Notebook 101 | .ipynb_checkpoints 102 | 103 | # pyenv 104 | .python-version 105 | 106 | # celery beat schedule file 107 | celerybeat-schedule 108 | 109 | # dotenv 110 | .env 111 | 112 | # virtualenv 113 | venv/ 114 | ENV/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 berkeleydeeprlcourse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 
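For reference, the behavioral-cloning step that the homework builds on top of this roll-out data can be sketched in a few lines of TensorFlow 1.x. This is purely illustrative and not part of the starter code: the pickle file name and the network size are made up, and it assumes the `expert_data` dictionary produced by `run_expert.py` (keys `observations` and `actions`) has been saved to disk.

```python
import pickle
import numpy as np
import tensorflow as tf

# Load roll-out data assumed to have been dumped by a modified run_expert.py.
with open('expert_data_Hopper-v1.pkl', 'rb') as f:   # hypothetical file name
    data = pickle.load(f)
obs = data['observations'].astype(np.float32)                    # (N, obs_dim)
acts = data['actions'].reshape(len(obs), -1).astype(np.float32)  # (N, act_dim)

# Small feed-forward policy trained with mean-squared error on the expert actions.
obs_ph = tf.placeholder(tf.float32, [None, obs.shape[1]])
act_ph = tf.placeholder(tf.float32, [None, acts.shape[1]])
hidden = tf.layers.dense(obs_ph, 64, activation=tf.tanh)
pred_act = tf.layers.dense(hidden, acts.shape[1])
loss = tf.reduce_mean(tf.square(pred_act - act_ph))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        idx = np.random.randint(0, len(obs), size=64)   # mini-batch of expert pairs
        sess.run(train_op, {obs_ph: obs[idx], act_ph: acts[idx]})
```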
19 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /hw1/hw1fall2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/hw1fall2017.pdf -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /hw1/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | #import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | # ================================================================ 10 | # Import all names into common namespace 11 | # ================================================================ 12 | 13 | clip = tf.clip_by_value 14 | 15 | # Make consistent with numpy 16 | # ---------------------------------------- 17 | 18 | def sum(x, axis=None, keepdims=False): 19 | return tf.reduce_sum(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 20 | def mean(x, axis=None, keepdims=False): 21 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 22 | def var(x, axis=None, keepdims=False): 23 | meanx = mean(x, axis=axis, keepdims=keepdims) 24 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 25 | def std(x, axis=None, keepdims=False): 26 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 27 | def max(x, axis=None, keepdims=False): 28 | return tf.reduce_max(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 29 | def min(x, axis=None, keepdims=False): 30 | return tf.reduce_min(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 31 | def concatenate(arrs, axis=0): 32 | return tf.concat(axis, arrs) 33 | def argmax(x, axis=None): 34 | return tf.argmax(x, dimension=axis) 35 | 36 | def switch(condition, then_expression, else_expression): 37 | '''Switches between two operations depending on a scalar value (int or bool). 38 | Note that both `then_expression` and `else_expression` 39 | should be symbolic tensors of the *same shape*. 40 | 41 | # Arguments 42 | condition: scalar tensor. 43 | then_expression: TensorFlow operation. 44 | else_expression: TensorFlow operation. 
45 | ''' 46 | x_shape = copy.copy(then_expression.get_shape()) 47 | x = tf.cond(tf.cast(condition, 'bool'), 48 | lambda: then_expression, 49 | lambda: else_expression) 50 | x.set_shape(x_shape) 51 | return x 52 | 53 | # Extras 54 | # ---------------------------------------- 55 | def l2loss(params): 56 | if len(params) == 0: 57 | return tf.constant(0.0) 58 | else: 59 | return tf.add_n([sum(tf.square(p)) for p in params]) 60 | def lrelu(x, leak=0.2): 61 | f1 = 0.5 * (1 + leak) 62 | f2 = 0.5 * (1 - leak) 63 | return f1 * x + f2 * abs(x) 64 | def categorical_sample_logits(X): 65 | # https://github.com/tensorflow/tensorflow/issues/456 66 | U = tf.random_uniform(tf.shape(X)) 67 | return argmax(X - tf.log(-tf.log(U)), axis=1) 68 | 69 | # ================================================================ 70 | # Global session 71 | # ================================================================ 72 | 73 | def get_session(): 74 | return tf.get_default_session() 75 | 76 | def single_threaded_session(): 77 | tf_config = tf.ConfigProto( 78 | inter_op_parallelism_threads=1, 79 | intra_op_parallelism_threads=1) 80 | return tf.Session(config=tf_config) 81 | 82 | def make_session(num_cpu): 83 | tf_config = tf.ConfigProto( 84 | inter_op_parallelism_threads=num_cpu, 85 | intra_op_parallelism_threads=num_cpu) 86 | return tf.Session(config=tf_config) 87 | 88 | 89 | ALREADY_INITIALIZED = set() 90 | def initialize(): 91 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 92 | get_session().run(tf.variables_initializer(new_variables)) 93 | ALREADY_INITIALIZED.update(new_variables) 94 | 95 | 96 | def eval(expr, feed_dict=None): 97 | if feed_dict is None: feed_dict = {} 98 | return get_session().run(expr, feed_dict=feed_dict) 99 | 100 | def set_value(v, val): 101 | get_session().run(v.assign(val)) 102 | 103 | def load_state(fname): 104 | saver = tf.train.Saver() 105 | saver.restore(get_session(), fname) 106 | 107 | def save_state(fname): 108 | os.makedirs(os.path.dirname(fname), exist_ok=True) 109 | saver = tf.train.Saver() 110 | saver.save(get_session(), fname) 111 | 112 | # ================================================================ 113 | # Model components 114 | # ================================================================ 115 | 116 | 117 | def normc_initializer(std=1.0): 118 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 119 | out = np.random.randn(*shape).astype(np.float32) 120 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 121 | return tf.constant(out) 122 | return _initializer 123 | 124 | 125 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 126 | summary_tag=None): 127 | with tf.variable_scope(name): 128 | stride_shape = [1, stride[0], stride[1], 1] 129 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 130 | 131 | # there are "num input feature maps * filter height * filter width" 132 | # inputs to each hidden unit 133 | fan_in = intprod(filter_shape[:3]) 134 | # each unit in the lower layer receives a gradient from: 135 | # "num output feature maps * filter height * filter width" / 136 | # pooling size 137 | fan_out = intprod(filter_shape[:2]) * num_filters 138 | # initialize weights with random weights 139 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 140 | 141 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 142 | collections=collections) 143 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer, 144 | collections=collections) 145 | 146 | if summary_tag is not None: 147 | tf.image_summary(summary_tag, 148 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 149 | [2, 0, 1, 3]), 150 | max_images=10) 151 | 152 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 153 | 154 | 155 | def dense(x, size, name, weight_init=None, bias=True): 156 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 157 | ret = tf.matmul(x, w) 158 | if bias: 159 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer) 160 | return ret + b 161 | else: 162 | return ret 163 | 164 | def wndense(x, size, name, init_scale=1.0): 165 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 166 | initializer=tf.random_normal_initializer(0, 0.05)) 167 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 168 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 169 | 170 | # use weight normalization (Salimans & Kingma, 2016) 171 | x = tf.matmul(x, v) 172 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 173 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 174 | 175 | def densenobias(x, size, name, weight_init=None): 176 | return dense(x, size, name, weight_init=weight_init, bias=False) 177 | 178 | def dropout(x, pkeep, phase=None, mask=None): 179 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 180 | if phase is None: 181 | return mask * x 182 | else: 183 | return switch(phase, mask*x, pkeep*x) 184 | 185 | def batchnorm(x, name, phase, updates, gamma=0.96): 186 | k = x.get_shape()[1] 187 | runningmean = tf.get_variable(name+"/mean", shape=[1, k], initializer=tf.constant_initializer(0.0), trainable=False) 188 | runningvar = tf.get_variable(name+"/var", shape=[1, k], initializer=tf.constant_initializer(1e-4), trainable=False) 189 | testy = (x - runningmean) / tf.sqrt(runningvar) 190 | 191 | mean_ = mean(x, axis=0, keepdims=True) 192 | var_ = mean(tf.square(x), axis=0, keepdims=True) 193 | std = tf.sqrt(var_) 194 | trainy = (x - mean_) / std 195 | 196 | updates.extend([ 197 | tf.assign(runningmean, runningmean * gamma + mean_ * (1 - gamma)), 198 | tf.assign(runningvar, runningvar * gamma + var_ * (1 - gamma)) 199 | ]) 200 | 201 | y = switch(phase, trainy, testy) 202 | 203 | out = y * tf.get_variable(name+"/scaling", shape=[1, k], initializer=tf.constant_initializer(1.0), trainable=True)\ 204 | + tf.get_variable(name+"/translation", shape=[1,k], initializer=tf.constant_initializer(0.0), trainable=True) 205 | return out 206 | 207 | 208 | 209 | # ================================================================ 210 | # Basic Stuff 211 | # ================================================================ 212 | 213 | def function(inputs, outputs, updates=None, givens=None): 214 | if isinstance(outputs, list): 215 | return _Function(inputs, outputs, updates, givens=givens) 216 | elif isinstance(outputs, (dict, collections.OrderedDict)): 217 | f = _Function(inputs, outputs.values(), updates, givens=givens) 218 | return lambda *inputs : type(outputs)(zip(outputs.keys(), f(*inputs))) 219 | else: 220 | f = _Function(inputs, [outputs], updates, givens=givens) 221 | return 
lambda *inputs : f(*inputs)[0] 222 | 223 | class _Function(object): 224 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 225 | assert all(len(i.op.inputs)==0 for i in inputs), "inputs should all be placeholders" 226 | self.inputs = inputs 227 | updates = updates or [] 228 | self.update_group = tf.group(*updates) 229 | self.outputs_update = list(outputs) + [self.update_group] 230 | self.givens = {} if givens is None else givens 231 | self.check_nan = check_nan 232 | def __call__(self, *inputvals): 233 | assert len(inputvals) == len(self.inputs) 234 | feed_dict = dict(zip(self.inputs, inputvals)) 235 | feed_dict.update(self.givens) 236 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 237 | if self.check_nan: 238 | if any(np.isnan(r).any() for r in results): 239 | raise RuntimeError("Nan detected") 240 | return results 241 | 242 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 243 | if isinstance(outputs, list): 244 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 245 | else: 246 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 247 | return lambda *inputs : f(*inputs)[0] 248 | 249 | class _MemFriendlyFunction(object): 250 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 251 | self.nondata_inputs = nondata_inputs 252 | self.data_inputs = data_inputs 253 | self.outputs = list(outputs) 254 | self.batch_size = batch_size 255 | def __call__(self, *inputvals): 256 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 257 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 258 | data_vals = inputvals[len(self.nondata_inputs):] 259 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 260 | n = data_vals[0].shape[0] 261 | for v in data_vals[1:]: 262 | assert v.shape[0] == n 263 | for i_start in range(0, n, self.batch_size): 264 | slice_vals = [v[i_start:min(i_start+self.batch_size, n)] for v in data_vals] 265 | for (var,val) in zip(self.data_inputs, slice_vals): 266 | feed_dict[var]=val 267 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 268 | if i_start==0: 269 | sum_results = results 270 | else: 271 | for i in range(len(results)): 272 | sum_results[i] = sum_results[i] + results[i] 273 | for i in range(len(results)): 274 | sum_results[i] = sum_results[i] / n 275 | return sum_results 276 | 277 | # ================================================================ 278 | # Modules 279 | # ================================================================ 280 | 281 | class Module(object): 282 | def __init__(self, name): 283 | self.name = name 284 | self.first_time = True 285 | self.scope = None 286 | self.cache = {} 287 | def __call__(self, *args): 288 | if args in self.cache: 289 | print("(%s) retrieving value from cache"%self.name) 290 | return self.cache[args] 291 | with tf.variable_scope(self.name, reuse=not self.first_time): 292 | scope = tf.get_variable_scope().name 293 | if self.first_time: 294 | self.scope = scope 295 | print("(%s) running function for the first time"%self.name) 296 | else: 297 | assert self.scope == scope, "Tried calling function with a different scope" 298 | print("(%s) running function on new inputs"%self.name) 299 | self.first_time = False 300 | out = self._call(*args) 301 | self.cache[args] = out 302 | return out 303 | def _call(self, *args): 304 | raise NotImplementedError 305 | 306 | @property 307 | def trainable_variables(self): 308 | assert self.scope is 
not None, "need to call module once before getting variables" 309 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 310 | 311 | @property 312 | def variables(self): 313 | assert self.scope is not None, "need to call module once before getting variables" 314 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 315 | 316 | 317 | def module(name): 318 | @functools.wraps 319 | def wrapper(f): 320 | class WrapperModule(Module): 321 | def _call(self, *args): 322 | return f(*args) 323 | return WrapperModule(name) 324 | return wrapper 325 | 326 | # ================================================================ 327 | # Graph traversal 328 | # ================================================================ 329 | 330 | VARIABLES = {} 331 | 332 | 333 | def get_parents(node): 334 | return node.op.inputs 335 | 336 | def topsorted(outputs): 337 | """ 338 | Topological sort via non-recursive depth-first search 339 | """ 340 | assert isinstance(outputs, (list,tuple)) 341 | marks = {} 342 | out = [] 343 | stack = [] #pylint: disable=W0621 344 | # i: node 345 | # jidx = number of children visited so far from that node 346 | # marks: state of each node, which is one of 347 | # 0: haven't visited 348 | # 1: have visited, but not done visiting children 349 | # 2: done visiting children 350 | for x in outputs: 351 | stack.append((x,0)) 352 | while stack: 353 | (i,jidx) = stack.pop() 354 | if jidx == 0: 355 | m = marks.get(i,0) 356 | if m == 0: 357 | marks[i] = 1 358 | elif m == 1: 359 | raise ValueError("not a dag") 360 | else: 361 | continue 362 | ps = get_parents(i) 363 | if jidx == len(ps): 364 | marks[i] = 2 365 | out.append(i) 366 | else: 367 | stack.append((i,jidx+1)) 368 | j = ps[jidx] 369 | stack.append((j,0)) 370 | return out 371 | 372 | 373 | # ================================================================ 374 | # Flat vectors 375 | # ================================================================ 376 | 377 | def var_shape(x): 378 | out = [k.value for k in x.get_shape()] 379 | assert all(isinstance(a, int) for a in out), \ 380 | "shape function assumes that shape is fully known" 381 | return out 382 | 383 | def numel(x): 384 | return intprod(var_shape(x)) 385 | 386 | def intprod(x): 387 | return int(np.prod(x)) 388 | 389 | def flatgrad(loss, var_list): 390 | grads = tf.gradients(loss, var_list) 391 | return tf.concat(0, [tf.reshape(grad, [numel(v)]) 392 | for (v, grad) in zip(var_list, grads)]) 393 | 394 | class SetFromFlat(object): 395 | def __init__(self, var_list, dtype=tf.float32): 396 | assigns = [] 397 | shapes = list(map(var_shape, var_list)) 398 | total_size = np.sum([intprod(shape) for shape in shapes]) 399 | 400 | self.theta = theta = tf.placeholder(dtype,[total_size]) 401 | start=0 402 | assigns = [] 403 | for (shape,v) in zip(shapes,var_list): 404 | size = intprod(shape) 405 | assigns.append(tf.assign(v, tf.reshape(theta[start:start+size],shape))) 406 | start+=size 407 | self.op = tf.group(*assigns) 408 | def __call__(self, theta): 409 | get_session().run(self.op, feed_dict={self.theta:theta}) 410 | 411 | class GetFlat(object): 412 | def __init__(self, var_list): 413 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 414 | def __call__(self): 415 | return get_session().run(self.op) 416 | 417 | # ================================================================ 418 | # Misc 419 | # ================================================================ 420 | 421 | 422 | def fancy_slice_2d(X, inds0, inds1): 423 | """ 424 | like numpy 
X[inds0, inds1] 425 | XXX this implementation is bad 426 | """ 427 | inds0 = tf.cast(inds0, tf.int64) 428 | inds1 = tf.cast(inds1, tf.int64) 429 | shape = tf.cast(tf.shape(X), tf.int64) 430 | ncols = shape[1] 431 | Xflat = tf.reshape(X, [-1]) 432 | return tf.gather(Xflat, inds0 * ncols + inds1) 433 | 434 | 435 | def scope_vars(scope, trainable_only): 436 | """ 437 | Get variables inside a scope 438 | The scope can be specified as a string 439 | """ 440 | return tf.get_collection( 441 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 442 | scope=scope if isinstance(scope, str) else scope.name 443 | ) 444 | 445 | def lengths_to_mask(lengths_b, max_length): 446 | """ 447 | Turns a vector of lengths into a boolean mask 448 | 449 | Args: 450 | lengths_b: an integer vector of lengths 451 | max_length: maximum length to fill the mask 452 | 453 | Returns: 454 | a boolean array of shape (batch_size, max_length) 455 | row[i] consists of True repeated lengths_b[i] times, followed by False 456 | """ 457 | lengths_b = tf.convert_to_tensor(lengths_b) 458 | assert lengths_b.get_shape().ndims == 1 459 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 460 | return mask_bt 461 | 462 | 463 | def in_session(f): 464 | @functools.wraps(f) 465 | def newfunc(*args, **kwargs): 466 | with tf.Session(): 467 | f(*args, **kwargs) 468 | return newfunc 469 | 470 | 471 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 472 | def get_placeholder(name, dtype, shape): 473 | print("calling get_placeholder", name) 474 | if name in _PLACEHOLDER_CACHE: 475 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 476 | assert dtype1==dtype and shape1==shape 477 | return out 478 | else: 479 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 480 | _PLACEHOLDER_CACHE[name] = (out,dtype,shape) 481 | return out 482 | def get_placeholder_cached(name): 483 | return _PLACEHOLDER_CACHE[name][0] 484 | 485 | def flattenallbut0(x): 486 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 487 | 488 | def reset(): 489 | global _PLACEHOLDER_CACHE 490 | global VARIABLES 491 | _PLACEHOLDER_CACHE = {} 492 | VARIABLES = {} 493 | tf.reset_default_graph() 494 | -------------------------------------------------------------------------------- /hw2/README.MD: -------------------------------------------------------------------------------- 1 | # Homework 2 Report, Policy Gradient Deep RL 2 | 3 | _Author_ Kay Ke 4 | 5 | _Email_ kayke@uw.edu 6 | 7 | _Update_ Sep 26, 2017. 8 | 9 | ## Summary 10 | 11 | For this homework, I have completed implementing policy gradient for both discrete and continuous action spaces, implementing reward to go (discount reward by current timestep instead of the begining of the trajectory), implementing advantage normalization and implementing neural network baseline (set baseline to predictions of NN rather than mean of sampled trajectory). I have shown that the algorithm could converge to the optimal score of 200 in CartPole game. I compared the performances for CartPole game using different parameters including network sizes, batches size, turning on/off reward to go and advantage normalization. I have shown that the algorithm could converge to the optimal score of 1000 in the InvertedPendulum game (1D, continuous action space) in 100 iterations, and could achieve an average score > 150 in 100 iterations for HalfCheetah game. 12 | 13 | At the bottom of this report I have attached personal "take-away" summaries for this project. 
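To make the reward-to-go modification described above concrete, the sketch below contrasts the two ways of assigning returns to time steps for a single trajectory of rewards. It is an illustrative stand-alone snippet, not the actual `train_pg.py` implementation.

```python
import numpy as np

def full_trajectory_return(rewards, gamma):
    # every time step is credited with the discounted sum from t = 0 (the "vanilla" flavor)
    total = sum((gamma ** t) * r for t, r in enumerate(rewards))
    return np.full(len(rewards), total)

def reward_to_go(rewards, gamma):
    # time step t is credited only with rewards from t onward
    rtg = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg
```

Dropping the rewards that precede time step t removes terms the action at t cannot influence, which lowers the variance of the gradient estimate without changing its expectation.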
14 | 15 | ## Answers 16 | 17 | 1. Compare the learning curves for the CartPole small batches (batch size 1000). There are three different settings: vanilla flavor (sb_no_rtg_dna), with reward to go (sb_rtg_dna), with reward to go and advantage normalization (sb_rtg_na). 18 | 19 | ![](fig/1_cartpole_sb.png) 20 | 21 | 2. Compare the learning curves for the CartPole large batches (batch size 5000). There are three different settings: vanilla flavor (lb_no_rtg_dna), with reward to go (lb_rtg_dna), with reward to go and advantage normalization (lb_rtg_na). 22 | 23 | ![](fig/2_cartpole_lb.png) 24 | 25 | 3. Compare the learning curves for small and large batches. 26 | 27 | ![](fig/2_cartpole_sb_lb.png) 28 | 29 | 4. Command lines that generated the images above 30 | 31 | ```Bash 32 | # Produce experiments results for CartPole 33 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -dna --exp_name sb_no_rtg_dna 34 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg -dna --exp_name sb_rtg_dna 35 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg --exp_name sb_rtg_na 36 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -dna --exp_name lb_no_rtg_dna 37 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg -dna --exp_name lb_rtg_dna 38 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg --exp_name lb_rtg_na 39 | ``` 40 | 41 | ​ 42 | 43 | 5. Answer questions 44 | 45 | - Which gradient estimator has better performance without advantage-centering: the trajectory-centric one, or the one using reward-to-go? 46 | 47 | **From the figures for small / large batches, reward-to-go seems to learn faster from the begining and performs better**. 48 | 49 | - Did advantage centering help? 50 | 51 | **Didn't help in my set up, with reward to go turned on.** 52 | 53 | - Describe what you expected from the math—do the empirical results match the theory? 54 | 55 | **TODO** 56 | 57 | - Did the batch size make an impact? 58 | 59 | **Yes, the larger batch size results in a smoother learning curve (less variations) and seems to learn faster (in terms of # of iterations)** 60 | 61 | 6. Display a learning curve for InvertedPendulum-v1 62 | 63 | ```bash 64 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1000 -e 5 -rtg --exp_name ip_sb_rtg_na --learning_rate 1e-2 --n_layers 2 --size 16 65 | ``` 66 | 67 | ![](fig/3_pendulum_2x16.png) 68 | 69 | 7. Implement NN Baseline 70 | 71 | Contrary to expectation, this does not seem to make a big impact on the variations. It's possible that this resulted from the small batch size I used? 72 | 73 | ```bash 74 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg --exp_name ip_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 75 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg -bl --exp_name ip_bl_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 76 | ``` 77 | 78 | ![4_NN_Baseline](fig/4_nn_baseline.png) 79 | 80 | 8. HalfCheetah achieved average score of >150 in 100 iterations. 81 | 82 | ```bash 83 | python train_pg.py HalfCheetah-v1 -ep 150 --discount 0.9 --exp_name hc2x32x15000x2e2 -n 100 -b 50000 -e 1 --learning_rate 4e-2 -rtg --n_layers 2 --size 32 --seed 17 84 | # The performance varies across seed a lot. This seed is selected because it performs well. Seeds tried include 27, 37, 47, 57. None could achieve >150 in 100 iterations. 85 | ``` 86 | 87 | ![](fig/5_hc.png) 88 | 89 | ​ 90 | 91 | ## Takeaway 92 | 93 | 1. 
Large batch sizes stabilize the learning process: the training loss decreases more smoothly, with less variation, and training potentially speeds up (in terms of the number of iterations). 94 | 2. Reward-to-go speeds up training, especially in games whose episodes can run indefinitely. 95 | 3. Advantage normalization should stabilize training on paper, but in practice it does not necessarily help (a short illustrative sketch follows the file listings below). 96 | 4. The network does not need to be very deep or large. A small network (for HalfCheetah-v1, 2 layers of 32 units in this case) can be enough. Prioritize tuning the other training parameters. 97 | 5. Miscellaneous TensorFlow notes: 98 | 1. `tf.nn.softmax_cross_entropy_with_logits` expects unscaled logits, not an output layer that has already been softmax-activated. See [Official Doc](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits). 99 | 2. Use `tf.nn.sparse_softmax_cross_entropy_with_logits` to skip one-hot encoding. 100 | 3. `tf.train.AdamOptimizer` is generally preferred over `GradientDescentOptimizer`; see [this post](https://stats.stackexchange.com/questions/184448/difference-between-gradientdescentoptimizer-and-adamoptimizer-tensorflow). At a high level, it adapts the step size (learning rate) using moving averages of the gradient and its square. 101 | 4. Fetch the desired tensors with `sess.run([tensor_a,b,c,d], feed_dict={})`. See [Official Doc](https://www.tensorflow.org/versions/r0.12/api_docs/python/client/session_management#Session.run). Run a network update by fetching the update op: `sess.run(update_op, feed_dict={...})`. -------------------------------------------------------------------------------- /hw2/deliver.sh: -------------------------------------------------------------------------------- 1 | # Produce experiments results for CartPole 2 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -dna --exp_name sb_no_rtg_dna 3 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg -dna --exp_name sb_rtg_dna 4 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg --exp_name sb_rtg_na 5 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -dna --exp_name lb_no_rtg_dna 6 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg -dna --exp_name lb_rtg_dna 7 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg --exp_name lb_rtg_na 8 | 9 | # Produce plot from results 10 | python plot.py data/sb_rtg_na data/sb_rtg_dna data/sb_no_rtg_dna 11 | python plot.py data/lb_rtg_na data/lb_rtg_dna data/lb_no_rtg_dna 12 | python plot.py data/sb_rtg_dna data/lb_rtg_dna 13 | 14 | # Produce experiments results for InvertedPendulum-v1 15 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg --exp_name ip_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 16 | python plot.py data/ip_ 17 | 18 | # NN Baseline 19 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg -bl --exp_name ip_bl_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 20 | 21 | # Cheetah 22 | # basic 23 | python train_pg.py HalfCheetah-v1 -ep 150 --discount 0.9 --exp_name hc2x64 -n 100 -b 5000 -e 1 --learning_rate 5e-2 -rtg --n_layers 2 --size 64 --seed 17 24 | # tune 25 | python train_pg.py HalfCheetah-v1 -ep 150 --discount 0.9 --exp_name hc2x32x15000x2e2 -n 100 -b 50000 -e 5 --learning_rate 4e-2 -rtg --n_layers 2 --size 32 --seed 17 -------------------------------------------------------------------------------- /hw2/fig/1_cartpole_sb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/1_cartpole_sb.png
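The advantage-normalization sketch referenced in takeaway 3 above: before the advantages multiply the log-probabilities in the policy-gradient loss, they are standardized across the sampled batch. Illustrative only, not the report's `train_pg.py` code.

```python
import numpy as np

def normalize_advantages(adv, eps=1e-8):
    # rescale to zero mean and unit standard deviation across the batch;
    # this keeps the gradient scale roughly constant from iteration to iteration
    return (adv - adv.mean()) / (adv.std() + eps)
```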
-------------------------------------------------------------------------------- /hw2/fig/2_cartpole_lb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/2_cartpole_lb.png -------------------------------------------------------------------------------- /hw2/fig/2_cartpole_sb_lb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/2_cartpole_sb_lb.png -------------------------------------------------------------------------------- /hw2/fig/3_pendulum_2x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/3_pendulum_2x16.png -------------------------------------------------------------------------------- /hw2/fig/4_nn_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/4_nn_baseline.png -------------------------------------------------------------------------------- /hw2/fig/5_hc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/5_hc.png -------------------------------------------------------------------------------- /hw2/hw2_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/hw2_final.pdf -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. 
Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/f17docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 
12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | import logging 11 | 12 | def d(s): 13 | logging.getLogger('dqn').debug(s) 14 | 15 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 16 | 17 | def learn(env, 18 | q_func, 19 | optimizer_spec, 20 | session, 21 | exploration=LinearSchedule(1000000, 0.1), 22 | stopping_criterion=None, 23 | replay_buffer_size=1000000, 24 | batch_size=32, 25 | gamma=0.99, 26 | learning_starts=50000, 27 | learning_freq=4, 28 | frame_history_len=4, 29 | target_update_freq=10000, 30 | grad_norm_clipping=10): 31 | 32 | """Run Deep Q-learning algorithm. 33 | 34 | You can specify your own convnet using q_func. 35 | 36 | All schedules are w.r.t. total number of steps taken in the environment. 37 | 38 | Parameters 39 | ---------- 40 | env: gym.Env 41 | gym environment to train on. 42 | q_func: function 43 | Model to use for computing the q function. It should accept the 44 | following named arguments: 45 | img_in: tf.Tensor 46 | tensorflow tensor representing the input image 47 | num_actions: int 48 | number of actions 49 | scope: str 50 | scope in which all the model related variables 51 | should be created 52 | reuse: bool 53 | whether previously created variables should be reused. 
54 | optimizer_spec: OptimizerSpec 55 | Specifying the constructor and kwargs, as well as learning rate schedule 56 | for the optimizer 57 | session: tf.Session 58 | tensorflow session to use. 59 | exploration: rl_algs.deepq.utils.schedules.Schedule 60 | schedule for probability of chosing random action. 61 | stopping_criterion: (env, t) -> bool 62 | should return true when it's ok for the RL algorithm to stop. 63 | takes in env and the number of steps executed so far. 64 | replay_buffer_size: int 65 | How many memories to store in the replay buffer. 66 | batch_size: int 67 | How many transitions to sample each time experience is replayed. 68 | gamma: float 69 | Discount Factor 70 | learning_starts: int 71 | After how many environment steps to start replaying experiences 72 | learning_freq: int 73 | How many steps of environment to take between every experience replay 74 | frame_history_len: int 75 | How many past frames to include as input to the model. 76 | target_update_freq: int 77 | How many experience replay rounds (not steps!) to perform between 78 | each update to the target Q network 79 | grad_norm_clipping: float or None 80 | If not None gradients' norms are clipped to this value. 81 | """ 82 | assert type(env.observation_space) == gym.spaces.Box 83 | assert type(env.action_space) == gym.spaces.Discrete 84 | 85 | ############### 86 | # BUILD MODEL # 87 | ############### 88 | 89 | if len(env.observation_space.shape) == 1: 90 | # This means we are running on low-dimensional observations (e.g. RAM) 91 | input_shape = env.observation_space.shape 92 | else: 93 | img_h, img_w, img_c = env.observation_space.shape 94 | input_shape = (img_h, img_w, frame_history_len * img_c) 95 | num_actions = env.action_space.n 96 | 97 | d('input_shape = {}'.format(input_shape)) 98 | d('num_actions = {}'.format(num_actions)) 99 | 100 | # set up placeholders 101 | # placeholder for current observation (or state) 102 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 103 | # placeholder for current action 104 | act_t_ph = tf.placeholder(tf.int32, [None]) 105 | # placeholder for current reward 106 | rew_t_ph = tf.placeholder(tf.float32, [None]) 107 | # placeholder for next observation (or state) 108 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 109 | # placeholder for end of episode mask 110 | # this value is 1 if the next state corresponds to the end of an episode, 111 | # in which case there is no Q-value at the next state; at the end of an 112 | # episode, only the current state reward contributes to the target, not the 113 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 114 | done_mask_ph = tf.placeholder(tf.float32, [None]) 115 | 116 | # casting to float on GPU ensures lower data transfer times. 117 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 118 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 119 | 120 | # Here, you should fill in your own code to compute the Bellman error. This requires 121 | # evaluating the current and next Q-values and constructing the corresponding error. 122 | # TensorFlow will differentiate this error for you, you just need to pass it to the 123 | # optimizer. See assignment text for details. 124 | # Your code should produce one scalar-valued tensor: total_error 125 | # This will be passed to the optimizer in the provided code below. 
126 | # Your code should also produce two collections of variables: 127 | # q_func_vars 128 | # target_q_func_vars 129 | # These should hold all of the variables of the Q-function network and target network, 130 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 131 | # For example, you can create your Q-function network with the scope "q_func" like this: 132 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 133 | # And then you can obtain the variables like this: 134 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 135 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 136 | ###### 137 | 138 | # YOUR CODE HERE 139 | 140 | # Q values 141 | pred_q = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 142 | pred_ac = tf.argmax(pred_q, axis=1) 143 | pred_q_a = tf.reduce_sum(pred_q * tf.one_hot(act_t_ph, depth=num_actions), axis=1) 144 | 145 | # Target 146 | target_q = q_func(obs_tp1_float, num_actions, scope="q_func_target", reuse=False) 147 | target_q_a = rew_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max(target_q, axis=1) 148 | 149 | # Loss 150 | #total_error = huber_loss(pred_q_a, target_q_a) 151 | #total_error = tf.nn.l2_loss(pred_q_a - target_q_a) 152 | total_error = 0.5 * tf.reduce_sum(tf.square(pred_q_a - tf.stop_gradient(target_q_a))) 153 | 154 | # Get variables 155 | q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 156 | target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func_target') 157 | 158 | d("pred_q = {}".format(pred_q)) 159 | d("target_q = {}".format(target_q)) 160 | 161 | d("pred_ac = {}".format(pred_ac)) 162 | d("pred_q_a = {}".format(pred_q_a)) 163 | d("target_q_a = {}".format(target_q_a)) 164 | d("total_error = {}".format(total_error)) 165 | 166 | ###### 167 | 168 | # construct optimization op (with gradient clipping) 169 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 170 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 171 | train_fn = minimize_and_clip(optimizer, total_error, 172 | var_list=q_func_vars, clip_val=grad_norm_clipping) 173 | 174 | # update_target_fn will be called periodically to copy Q network to target Q network 175 | update_target_fn = [] 176 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 177 | sorted(target_q_func_vars, key=lambda v: v.name)): 178 | update_target_fn.append(var_target.assign(var)) 179 | update_target_fn = tf.group(*update_target_fn) 180 | 181 | # construct the replay buffer 182 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 183 | 184 | ############### 185 | # RUN ENV # 186 | ############### 187 | model_initialized = False 188 | num_param_updates = 0 189 | mean_episode_reward = -float('nan') 190 | best_mean_episode_reward = -float('inf') 191 | last_obs = env.reset() 192 | LOG_EVERY_N_STEPS = 10000 193 | 194 | for t in itertools.count(): 195 | ### 1. Check stopping criterion 196 | if stopping_criterion is not None and stopping_criterion(env, t): 197 | break 198 | 199 | ### 2. Step the env and store the transition 200 | # At this point, "last_obs" contains the latest observation that was 201 | # recorded from the simulator. Here, your code needs to store this 202 | # observation and its outcome (reward, next observation, etc.) into 203 | # the replay buffer while stepping the simulator forward one step. 
204 | # At the end of this block of code, the simulator should have been 205 | # advanced one step, and the replay buffer should contain one more 206 | # transition. 207 | # Specifically, last_obs must point to the new latest observation. 208 | # Useful functions you'll need to call: 209 | # obs, reward, done, info = env.step(action) 210 | # this steps the environment forward one step 211 | # obs = env.reset() 212 | # this resets the environment if you reached an episode boundary. 213 | # Don't forget to call env.reset() to get a new observation if done 214 | # is true!! 215 | # Note that you cannot use "last_obs" directly as input 216 | # into your network, since it needs to be processed to include context 217 | # from previous frames. You should check out the replay buffer 218 | # implementation in dqn_utils.py to see what functionality the replay 219 | # buffer exposes. The replay buffer has a function called 220 | # encode_recent_observation that will take the latest observation 221 | # that you pushed into the buffer and compute the corresponding 222 | # input that should be given to a Q network by appending some 223 | # previous frames. 224 | # Don't forget to include epsilon greedy exploration! 225 | # And remember that the first time you enter this loop, the model 226 | # may not yet have been initialized (but of course, the first step 227 | # might as well be random, since you haven't trained your net...) 228 | 229 | ##### 230 | 231 | # YOUR CODE HERE 232 | idx = replay_buffer.store_frame(last_obs) 233 | 234 | if not model_initialized or random.random() < exploration.value(t): 235 | action = random.randint(0, num_actions-1) 236 | else: 237 | obs = replay_buffer.encode_recent_observation() 238 | action = session.run(pred_ac, {obs_t_ph: [obs]})[0] 239 | 240 | next_obs, reward, done, info = env.step(action) 241 | replay_buffer.store_effect(idx, action, reward, done) 242 | last_obs = env.reset() if done else next_obs 243 | 244 | ##### 245 | 246 | # at this point, the environment should have been advanced one step (and 247 | # reset if done was true), and last_obs should point to the new latest 248 | # observation 249 | 250 | ### 3. Perform experience replay and train the network. 251 | # note that this is only done if the replay buffer contains enough samples 252 | # for us to learn something useful -- until then, the model will not be 253 | # initialized and random actions should be taken 254 | if (t > learning_starts and 255 | t % learning_freq == 0 and 256 | replay_buffer.can_sample(batch_size)): 257 | # Here, you should perform training. Training consists of four steps: 258 | # 3.a: use the replay buffer to sample a batch of transitions (see the 259 | # replay buffer code for function definition, each batch that you sample 260 | # should consist of current observations, current actions, rewards, 261 | # next observations, and done indicator). 262 | # 3.b: initialize the model if it has not been initialized yet; to do 263 | # that, call 264 | # initialize_interdependent_variables(session, tf.global_variables(), { 265 | # obs_t_ph: obs_t_batch, 266 | # obs_tp1_ph: obs_tp1_batch, 267 | # }) 268 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 269 | # the current and next time step. The boolean variable model_initialized 270 | # indicates whether or not the model has been initialized. 271 | # Remember that you have to update the target network too (see 3.d)! 272 | # 3.c: train the model. 
To do this, you'll need to use the train_fn and 273 | # total_error ops that were created earlier: total_error is what you 274 | # created to compute the total Bellman error in a batch, and train_fn 275 | # will actually perform a gradient step and update the network parameters 276 | # to reduce total_error. When calling session.run on these you'll need to 277 | # populate the following placeholders: 278 | # obs_t_ph 279 | # act_t_ph 280 | # rew_t_ph 281 | # obs_tp1_ph 282 | # done_mask_ph 283 | # (this is needed for computing total_error) 284 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 285 | # (this is needed by the optimizer to choose the learning rate) 286 | # 3.d: periodically update the target network by calling 287 | # session.run(update_target_fn) 288 | # you should update every target_update_freq steps, and you may find the 289 | # variable num_param_updates useful for this (it was initialized to 0) 290 | ##### 291 | 292 | # 3.a sample a batch of transitions 293 | obs_batch, act_batch, rew_batch, next_obs_batch, done_batch = replay_buffer.sample(batch_size) 294 | 295 | # 3.b initialize the model if haven't 296 | if not model_initialized: 297 | initialize_interdependent_variables(session, tf.global_variables(), { 298 | obs_t_ph: obs_batch, 299 | obs_tp1_ph: next_obs_batch, 300 | }) 301 | session.run(update_target_fn) 302 | model_initialized = True 303 | 304 | # 3.c train the model 305 | _, error = session.run([train_fn, total_error], { 306 | obs_t_ph: obs_batch, 307 | act_t_ph: act_batch, 308 | rew_t_ph: rew_batch, 309 | obs_tp1_ph: next_obs_batch, 310 | done_mask_ph: done_batch, 311 | learning_rate: optimizer_spec.lr_schedule.value(t) 312 | }) 313 | 314 | # 3.d periodically update the target network 315 | if t % target_update_freq == 0: 316 | # Use t here instead of num_param_updates 317 | # Under the default hyperparameter 318 | # this will speed up learning performance 319 | # Or you can set target_update_freq to less 320 | session.run(update_target_fn) 321 | num_param_updates += 1 322 | 323 | ##### 324 | 325 | ### 4. Log progress 326 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 327 | if len(episode_rewards) > 0: 328 | mean_episode_reward = np.mean(episode_rewards[-100:]) 329 | if len(episode_rewards) > 100: 330 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 331 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 332 | print("Timestep %d" % (t,)) 333 | print("mean reward (100 episodes) %f" % mean_episode_reward) 334 | print("best mean reward %f" % best_mean_episode_reward) 335 | print("episodes %d" % len(episode_rewards)) 336 | print("exploration %f" % exploration.value(t)) 337 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 338 | print("total error %f" % error) 339 | sys.stdout.flush() 340 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.where( # Tensorflow >= 1.0 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. 
Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 
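        For example (illustrative): LinearSchedule(100, 0.1).value(50) is halfway
        through the anneal from initial_p=1.0 (the default) to final_p=0.1, i.e.
        0.55 up to floating point, and value(t) stays at 0.1 for all t >= 100.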
90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 
177 | 178 | The sepecific memory optimizations use here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-performance 182 | to cast them back to float32 on GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the tipical use case in Atari Deep RL buffer with 1M frames the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning frame of zeros at the beginning 190 | of the episode, when there is less frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retried for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | def sample(self, batch_size): 226 | """Sample `batch_size` different transitions. 227 | 228 | i-th sample transition is the following: 229 | 230 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 231 | after which reward `rew_batch[i]` was received and subsequent 232 | observation next_obs_batch[i] was observed, unless the epsiode 233 | was done which is represented by `done_mask[i]` which is equal 234 | to 1 if episode has ended as a result of that action. 235 | 236 | Parameters 237 | ---------- 238 | batch_size: int 239 | How many transitions to sample. 240 | 241 | Returns 242 | ------- 243 | obs_batch: np.array 244 | Array of shape 245 | (batch_size, img_h, img_w, img_c * frame_history_len) 246 | and dtype np.uint8 247 | act_batch: np.array 248 | Array of shape (batch_size,) and dtype np.int32 249 | rew_batch: np.array 250 | Array of shape (batch_size,) and dtype np.float32 251 | next_obs_batch: np.array 252 | Array of shape 253 | (batch_size, img_h, img_w, img_c * frame_history_len) 254 | and dtype np.uint8 255 | done_mask: np.array 256 | Array of shape (batch_size,) and dtype np.float32 257 | """ 258 | assert self.can_sample(batch_size) 259 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 260 | return self._encode_sample(idxes) 261 | 262 | def encode_recent_observation(self): 263 | """Return the most recent `frame_history_len` frames. 
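        For example, with the Atari image wrappers used in run_dqn_atari.py
        (84x84 single-channel frames, frame_history_len=4) this is an
        (84, 84, 4) uint8 array; frames missing at the start of an episode
        are zero-padded (see _encode_observation below).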
264 | 265 | Returns 266 | ------- 267 | observation: np.array 268 | Array of shape (img_h, img_w, img_c * frame_history_len) 269 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 270 | encodes frame at time `t - frame_history_len + i` 271 | """ 272 | assert self.num_in_buffer > 0 273 | return self._encode_observation((self.next_idx - 1) % self.size) 274 | 275 | def _encode_observation(self, idx): 276 | end_idx = idx + 1 # make noninclusive 277 | start_idx = end_idx - self.frame_history_len 278 | # this checks if we are using low-dimensional observations, such as RAM 279 | # state, in which case we just directly return the latest RAM. 280 | if len(self.obs.shape) == 2: 281 | return self.obs[end_idx-1] 282 | # if there weren't enough frames ever in the buffer for context 283 | if start_idx < 0 and self.num_in_buffer != self.size: 284 | start_idx = 0 285 | for idx in range(start_idx, end_idx - 1): 286 | if self.done[idx % self.size]: 287 | start_idx = idx + 1 288 | missing_context = self.frame_history_len - (end_idx - start_idx) 289 | # if zero padding is needed for missing context 290 | # or we are on the boundry of the buffer 291 | if start_idx < 0 or missing_context > 0: 292 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 293 | for idx in range(start_idx, end_idx): 294 | frames.append(self.obs[idx % self.size]) 295 | return np.concatenate(frames, 2) 296 | else: 297 | # this optimization has potential to saves about 30% compute time \o/ 298 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 299 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 300 | 301 | def store_frame(self, frame): 302 | """Store a single frame in the buffer at the next available index, overwriting 303 | old frames if necessary. 304 | 305 | Parameters 306 | ---------- 307 | frame: np.array 308 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 309 | the frame to be stored 310 | 311 | Returns 312 | ------- 313 | idx: int 314 | Index at which the frame is stored. To be used for `store_effect` later. 315 | """ 316 | if self.obs is None: 317 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 318 | self.action = np.empty([self.size], dtype=np.int32) 319 | self.reward = np.empty([self.size], dtype=np.float32) 320 | self.done = np.empty([self.size], dtype=np.bool) 321 | self.obs[self.next_idx] = frame 322 | 323 | ret = self.next_idx 324 | self.next_idx = (self.next_idx + 1) % self.size 325 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 326 | 327 | return ret 328 | 329 | def store_effect(self, idx, action, reward, done): 330 | """Store effects of action taken after obeserving frame stored 331 | at index idx. The reason `store_frame` and `store_effect` is broken 332 | up into two functions is so that once can call `encode_recent_observation` 333 | in between. 334 | 335 | Paramters 336 | --------- 337 | idx: int 338 | Index in buffer of recently observed frame (returned by `store_frame`). 339 | action: int 340 | Action that was performed upon observing this frame. 341 | reward: float 342 | Reward that was received when the actions was performed. 343 | done: bool 344 | True if episode was finished after performing that action. 
345 | """ 346 | self.action[idx] = action 347 | self.reward[idx] = reward 348 | self.done[idx] = done 349 | 350 | -------------------------------------------------------------------------------- /hw3/hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw3/hw3.pdf -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | import logging 15 | 16 | def atari_model(img_in, num_actions, scope, reuse=False): 17 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 18 | with tf.variable_scope(scope, reuse=reuse): 19 | out = img_in 20 | with tf.variable_scope("convnet"): 21 | # original architecture 22 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 24 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 25 | out = layers.flatten(out) 26 | with tf.variable_scope("action_value"): 27 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 28 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 29 | 30 | return out 31 | 32 | def atari_learn(env, 33 | session, 34 | num_timesteps): 35 | # This is just a rough estimate 36 | num_iterations = float(num_timesteps) / 4.0 37 | 38 | lr_multiplier = 1.0 39 | lr_schedule = PiecewiseSchedule([ 40 | (0, 1e-4 * lr_multiplier), 41 | (num_iterations / 10, 1e-4 * lr_multiplier), 42 | (num_iterations / 2, 5e-5 * lr_multiplier), 43 | ], 44 | outside_value=5e-5 * lr_multiplier) 45 | optimizer = dqn.OptimizerSpec( 46 | constructor=tf.train.AdamOptimizer, 47 | kwargs=dict(epsilon=1e-4), 48 | lr_schedule=lr_schedule 49 | ) 50 | 51 | def stopping_criterion(env, t): 52 | # notice that here t is the number of steps of the wrapped env, 53 | # which is different from the number of steps in the underlying env 54 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 55 | 56 | exploration_schedule = PiecewiseSchedule( 57 | [ 58 | (0, 1.0), 59 | (1e6, 0.1), 60 | (num_iterations / 2, 0.01), 61 | ], outside_value=0.01 62 | ) 63 | 64 | dqn.learn( 65 | env, 66 | q_func=atari_model, 67 | optimizer_spec=optimizer, 68 | session=session, 69 | exploration=exploration_schedule, 70 | stopping_criterion=stopping_criterion, 71 | replay_buffer_size=1000000, 72 | batch_size=32, 73 | gamma=0.99, 74 | learning_starts=50000, 75 | learning_freq=4, 76 | frame_history_len=4, 77 | target_update_freq=10000, 78 | grad_norm_clipping=10 79 | ) 80 | env.close() 81 | 82 | def get_available_gpus(): 83 | from tensorflow.python.client import device_lib 84 | local_device_protos = device_lib.list_local_devices() 85 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 86 | 87 | def set_global_seeds(i): 88 | try: 89 | import tensorflow as tf 90 | except ImportError: 91 | pass 92 | else: 93 | 
tf.set_random_seed(i) 94 | np.random.seed(i) 95 | random.seed(i) 96 | 97 | def get_session(): 98 | tf.reset_default_graph() 99 | tf_config = tf.ConfigProto( 100 | inter_op_parallelism_threads=1, 101 | intra_op_parallelism_threads=1) 102 | session = tf.Session(config=tf_config) 103 | print("AVAILABLE GPUS: ", get_available_gpus()) 104 | return session 105 | 106 | def get_env(task, seed): 107 | env_id = task.env_id 108 | 109 | env = gym.make(env_id) 110 | 111 | set_global_seeds(seed) 112 | env.seed(seed) 113 | 114 | expt_dir = '/tmp/hw3_vid_dir2/' 115 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 116 | env = wrap_deepmind(env) 117 | 118 | return env 119 | 120 | def main(): 121 | 122 | # Logger 123 | # https://github.com/mwhittaker/homework/commit/cb043dbc980d898547f552e07f475696ce57f1d3 124 | format = "[%(asctime)-15s %(pathname)s:%(lineno)-3s] %(message)s" 125 | handler = logging.StreamHandler() 126 | handler.setFormatter(logging.Formatter(format)) 127 | logger = logging.getLogger("dqn") 128 | logger.propagate = False 129 | logger.addHandler(handler) 130 | logger.setLevel(logging.DEBUG) 131 | 132 | # Get Atari games. 133 | benchmark = gym.benchmark_spec('Atari40M') 134 | 135 | # Change the index to select a different game. 136 | task = benchmark.tasks[3] 137 | 138 | # Run training 139 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 140 | env = get_env(task, seed) 141 | session = get_session() 142 | atari_learn(env, session, num_timesteps=task.max_timesteps) 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | import logging 15 | 16 | def atari_model(ram_in, num_actions, scope, reuse=False): 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = ram_in 19 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 20 | with tf.variable_scope("action_value"): 21 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 24 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 25 | 26 | return out 27 | 28 | def atari_learn(env, 29 | session, 30 | num_timesteps): 31 | # This is just a rough estimate 32 | num_iterations = float(num_timesteps) / 4.0 33 | 34 | lr_multiplier = 1.0 35 | lr_schedule = PiecewiseSchedule([ 36 | (0, 1e-4 * lr_multiplier), 37 | (num_iterations / 10, 1e-4 * lr_multiplier), 38 | (num_iterations / 2, 5e-5 * lr_multiplier), 39 | ], 40 | outside_value=5e-5 * lr_multiplier) 41 | 42 | #lr_schedule = LinearSchedule(num_iterations, 0.0001, 0.01) 43 | optimizer = dqn.OptimizerSpec( 44 | constructor=tf.train.AdamOptimizer, 45 | kwargs=dict(epsilon=1e-4), 46 | lr_schedule=lr_schedule 47 | ) 48 | 49 | def stopping_criterion(env, t): 50 | # notice that here t is the number of steps of the wrapped env, 51 | # which is different from the number of steps in the underlying env 52 | 
return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 53 | 54 | exploration_schedule = PiecewiseSchedule( 55 | [ 56 | (0, 0.2), 57 | (1e6, 0.1), 58 | (num_iterations / 2, 0.01), 59 | ], outside_value=0.01 60 | ) 61 | 62 | dqn.learn( 63 | env, 64 | q_func=atari_model, 65 | optimizer_spec=optimizer, 66 | session=session, 67 | exploration=exploration_schedule, 68 | stopping_criterion=stopping_criterion, 69 | replay_buffer_size=1000000, 70 | batch_size=32, 71 | gamma=0.99, 72 | learning_starts=50000, 73 | learning_freq=4, 74 | frame_history_len=1, 75 | target_update_freq=10000, 76 | grad_norm_clipping=10 77 | ) 78 | env.close() 79 | 80 | def get_available_gpus(): 81 | from tensorflow.python.client import device_lib 82 | local_device_protos = device_lib.list_local_devices() 83 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 84 | 85 | def set_global_seeds(i): 86 | try: 87 | import tensorflow as tf 88 | except ImportError: 89 | pass 90 | else: 91 | tf.set_random_seed(i) 92 | np.random.seed(i) 93 | random.seed(i) 94 | 95 | def get_session(): 96 | tf.reset_default_graph() 97 | tf_config = tf.ConfigProto( 98 | inter_op_parallelism_threads=1, 99 | intra_op_parallelism_threads=1) 100 | session = tf.Session(config=tf_config) 101 | print("AVAILABLE GPUS: ", get_available_gpus()) 102 | return session 103 | 104 | def get_env(seed): 105 | env = gym.make('Pong-ram-v0') 106 | 107 | set_global_seeds(seed) 108 | env.seed(seed) 109 | 110 | expt_dir = '/tmp/hw3_vid_dir/' 111 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 112 | env = wrap_deepmind_ram(env) 113 | 114 | return env 115 | 116 | def main(): 117 | # Logger 118 | # https://github.com/mwhittaker/homework/commit/cb043dbc980d898547f552e07f475696ce57f1d3 119 | format = "[%(asctime)-15s %(pathname)s:%(lineno)-3s] %(message)s" 120 | handler = logging.StreamHandler() 121 | handler.setFormatter(logging.Formatter(format)) 122 | logger = logging.getLogger("dqn") 123 | logger.propagate = False 124 | logger.addHandler(handler) 125 | logger.setLevel(logging.DEBUG) 126 | 127 | # Run training 128 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 129 | env = get_env(seed) 130 | session = get_session() 131 | atari_learn(env, session, num_timesteps=int(4e7)) 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /hw4/README.md: -------------------------------------------------------------------------------- 1 | # HW4, Model Based Deep RL 2 | 3 | _Author_ Liyiming Ke 4 | 5 | _Updated_ Mar 28, 2018 6 | 7 | 8 | 1. To run the program, launch with `python main.py -n 15 -ep 300 -m 10 -sp 500 -r 100 -d 10` (fast) or `python main.py -n 15 -ep 1000` (slow but yields higher return) 9 | 10 | 2. 
The performance of fast version 11 | 12 | |Iteration|AverageCost|StdCost MinimumCost|MaximumCost|AverageReturn|StdReturn|MinimumReturn|MaximumReturn| 13 | |----|---|---|---|---|---|---|---| 14 | |0|-272.8721655363496|21.239603908942726|-320.43697618040017|-246.4736838787137|240.2439329582071|20.68344457836508|207.28770781466437|274.5577823545346| 15 | |1|-287.92028135886255|25.92310204677628|-325.54410164427884|-251.01360667647728|250.15164096384146|22.73045020236331|220.01663620467846|292.0170185841397| 16 | |2|-323.80080715005346|30.228382522490797|-389.0978409378912|-271.7796878141356|289.2029514835485|25.470647774294818|259.58129657072544|333.16807455607943| 17 | |3|-333.15430020245526|32.740974228661756|-385.3221695429378|-274.0394588141746|292.35852402398297|37.10565811001103|222.4680755620722|350.4272628072747| 18 | |4|-376.3197552271316|40.07809243241263|-444.0926281622156|-317.0400260108479|329.7078283201919|32.06590568480435|278.45885061461195|383.16172036212316| 19 | |5|-309.486577652699|20.222609987437725|-356.1101951949902|-278.77372207902937|277.1294051792327|21.8631623809154|243.78552026776572|316.4410201418656| 20 | |6|-333.27779679849357|31.09109231822848|-374.822923779812|-279.54658651433584|288.86654997633525|29.904001076855607|242.04216272993844|339.6146511277123| 21 | |7|-347.3543927604132|31.87449528755396|-398.9008770736232|-298.08452920988356|308.13962163935344|26.064436309153134|274.0523898129893|362.50855646019704| 22 | |8|-358.21187096791294|27.357604204029187|-427.34414545387995|-333.4367350634044|312.75195931842785|24.026301216233957|282.1552938485566|362.11430547952864| 23 | |9|-350.0931825856868|39.91494896369595|-419.03018683390167|-282.5787159965811|316.8553292620047|32.94504667833338|260.4670701650203|360.37675813391917| 24 | |10|-360.55493946390663|27.67517089380202|-404.1814038418491|-309.4632013566413|319.21174580449343|28.015782572396002|278.0781502604923|362.4407932260329| 25 | |11|-335.3625259508714|33.089863552783775|-394.8137162207922|-285.8209978255535|293.8460252502136|26.69139392022445|262.12653425412725|351.27653751254394| 26 | |12|-356.40964697880133|35.67182035029585|-404.97137852094636|-277.33927884191587|312.1967845782086|30.998412487074223|265.0126242278393|365.8393458066606| 27 | |13|-354.4348233587252|35.7811577088428|-435.0528204835982|-301.168475983296|313.8152077418059|28.98742963518402|270.61725419488414|367.821287239409| 28 | |14|-371.42544639124094|33.6907209955848|-442.3058340489825|-329.31225275852825|335.09668997592695|29.85285812344075|282.52093602247203|375.99342302061467| -------------------------------------------------------------------------------- /hw4/cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnvNew(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 8 | utils.EzPickle.__init__(self) 9 | 10 | def _step(self, action): 11 | xposbefore = self.model.data.qpos[0, 0] 12 | self.do_simulation(action, self.frame_skip) 13 | xposafter = self.model.data.qpos[0, 0] 14 | ob = self._get_obs() 15 | reward_ctrl = - 0.1 * np.square(action).sum() 16 | reward_run = (xposafter - xposbefore)/self.dt 17 | reward = reward_ctrl + reward_run 18 | done = False 19 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 20 | 21 | def _get_obs(self): 22 | return np.concatenate([ 23 | 
self.model.data.qpos.flat[1:], 24 | self.model.data.qvel.flat, 25 | self.get_body_com("torso").flat, 26 | # self.get_body_comvel("torso").flat, 27 | ]) 28 | 29 | def reset_model(self): 30 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 31 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 32 | self.set_state(qpos, qvel) 33 | return self._get_obs() 34 | 35 | def viewer_setup(self): 36 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /hw4/controllers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cost_functions import trajectory_cost_fn 3 | import time 4 | 5 | class Controller(): 6 | def __init__(self): 7 | pass 8 | 9 | # Get the appropriate action(s) for this state(s) 10 | def get_action(self, state): 11 | pass 12 | 13 | 14 | class RandomController(Controller): 15 | def __init__(self, env): 16 | self.ac = env.action_space 17 | 18 | def get_action(self, state): 19 | """ YOUR CODE HERE """ 20 | """ Your code should randomly sample an action uniformly from the action space """ 21 | return self.ac.sample() 22 | 23 | class MPCcontroller(Controller): 24 | """ Controller built using the MPC method outlined in https://arxiv.org/abs/1708.02596 """ 25 | def __init__(self, 26 | env, 27 | dyn_model, 28 | horizon=5, 29 | cost_fn=None, 30 | num_simulated_paths=10, 31 | ): 32 | self.env = env 33 | self.dyn_model = dyn_model 34 | self.horizon = horizon 35 | self.cost_fn = cost_fn 36 | self.num_simulated_paths = num_simulated_paths 37 | 38 | def get_action(self, state): 39 | """ YOUR CODE HERE """ 40 | """ Note: be careful to batch your simulations through the model for speed """ 41 | 42 | sampled_acts = np.array([[self.env.action_space.sample() for j in range(self.num_simulated_paths)] for i in range(self.horizon)]) 43 | states = [np.array([state] * self.num_simulated_paths)] 44 | nstates = [] 45 | 46 | for i in range(self.horizon): 47 | nstates.append(self.dyn_model.predict(states[-1], sampled_acts[i, :])) 48 | if i < self.horizon: states.append(nstates[-1]) 49 | 50 | costs = trajectory_cost_fn(self.cost_fn, states, sampled_acts, nstates) 51 | return sampled_acts[0][np.argmin(costs)] 52 | -------------------------------------------------------------------------------- /hw4/cost_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | #======================================================== 5 | # 6 | # Environment-specific cost functions: 7 | # 8 | 9 | def cheetah_cost_fn(state, action, next_state): 10 | if len(state.shape) > 1: 11 | 12 | heading_penalty_factor=10 13 | scores=np.zeros((state.shape[0],)) 14 | 15 | #dont move front shin back so far that you tilt forward 16 | front_leg = state[:,5] 17 | my_range = 0.2 18 | scores[front_leg>=my_range] += heading_penalty_factor 19 | 20 | front_shin = state[:,6] 21 | my_range = 0 22 | scores[front_shin>=my_range] += heading_penalty_factor 23 | 24 | front_foot = state[:,7] 25 | my_range = 0 26 | scores[front_foot>=my_range] += heading_penalty_factor 27 | 28 | scores-= (next_state[:,17] - state[:,17]) / 0.01 #+ 0.1 * (np.sum(action**2, axis=1)) 29 | return scores 30 | 31 | heading_penalty_factor=10 32 | score = 0 33 | 34 | #dont move front shin back so far that you tilt forward 35 | front_leg = state[5] 36 | my_range = 0.2 37 | if front_leg>=my_range: 38 | score += 
heading_penalty_factor 39 | 40 | front_shin = state[6] 41 | my_range = 0 42 | if front_shin>=my_range: 43 | score += heading_penalty_factor 44 | 45 | front_foot = state[7] 46 | my_range = 0 47 | if front_foot>=my_range: 48 | score += heading_penalty_factor 49 | 50 | score -= (next_state[17] - state[17]) / 0.01 #+ 0.1 * (np.sum(action**2)) 51 | return score 52 | 53 | #======================================================== 54 | # 55 | # Cost function for a whole trajectory: 56 | # 57 | 58 | def trajectory_cost_fn(cost_fn, states, actions, next_states): 59 | trajectory_cost = 0 60 | for i in range(len(actions)): 61 | trajectory_cost += cost_fn(states[i], actions[i], next_states[i]) 62 | return trajectory_cost -------------------------------------------------------------------------------- /hw4/dynamics.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | # Predefined function to build a feedforward neural network 6 | def build_mlp(input_placeholder, 7 | output_size, 8 | scope, 9 | n_layers=2, 10 | size=500, 11 | activation=tf.tanh, 12 | output_activation=None 13 | ): 14 | out = input_placeholder 15 | with tf.variable_scope(scope): 16 | for _ in range(n_layers): 17 | out = tf.layers.dense(out, size, activation=activation) 18 | out = tf.layers.dense(out, output_size, activation=output_activation) 19 | return out 20 | 21 | def normalize(data, normalization): 22 | return (data - normalization[0]) / (normalization[1] + 1e-10) 23 | 24 | def denormalize(data, normalization): 25 | return data * (normalization[1] + 1e-10) + normalization[0] 26 | 27 | def batch_index(batch_size, chunk_size): 28 | ind = [(acc, acc + batch_size) for acc in range(0, chunk_size, batch_size)] 29 | if ind[-1][1] < chunk_size: 30 | ind.append((ind[-1][1], chunk_size)) 31 | return ind 32 | 33 | class NNDynamicsModel(): 34 | def __init__(self, 35 | env, 36 | n_layers, 37 | size, 38 | activation, 39 | output_activation, 40 | normalization, 41 | batch_size, 42 | iterations, 43 | learning_rate, 44 | sess 45 | ): 46 | """ YOUR CODE HERE """ 47 | """ Note: Be careful about normalization """ 48 | ob_dim = env.observation_space.shape[0] 49 | ac_dim = env.action_space.shape[0] 50 | 51 | self.input_state = tf.placeholder(shape=(None, ob_dim), dtype=tf.float32) 52 | self.input_act = tf.placeholder(shape=(None, ac_dim), dtype=tf.float32) 53 | self.target_delta = tf.placeholder(shape=(None, ob_dim), dtype=tf.float32) 54 | 55 | self.dyn = build_mlp(tf.concat([self.input_state, self.input_act], axis=1), 56 | output_size=ob_dim, 57 | scope="NNDynamicsModel", 58 | n_layers=n_layers, 59 | size=size, 60 | activation=activation, 61 | output_activation=output_activation) 62 | 63 | self.normalization = normalization 64 | 65 | self.loss = tf.losses.mean_squared_error(labels=self.target_delta, predictions=self.dyn) 66 | self.update = tf.train.AdamOptimizer(learning_rate).minimize(loss=self.loss) 67 | self.iterations = iterations 68 | self.batch_size = batch_size 69 | self.sess = sess 70 | 71 | def fit(self, data): 72 | """ 73 | Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, normalized actions to normalized state differences (s_t+1 - s_t) 74 | """ 75 | """YOUR CODE HERE """ 76 | obs = normalize(data["observations"], self.normalization["observations"]) 77 | acts = normalize(data["actions"], self.normalization["actions"]) 78 | deltas = 
normalize(data["next_observations"] - data["observations"], self.normalization["deltas"]) 79 | chunk_size = len(data["observations"]) 80 | 81 | batch_indexes = batch_index(self.batch_size, chunk_size) 82 | loss = None 83 | for epoch in range(self.iterations): 84 | if epoch % 20 == 0: print("Epoch {}/{}: Loss {}".format(epoch, self.iterations, loss)) 85 | for _, (a,b) in enumerate(batch_indexes): 86 | _, loss = self.sess.run( 87 | [self.update, self.loss], 88 | feed_dict={ 89 | self.input_state: obs[a:b], 90 | self.input_act: acts[a:b], 91 | self.target_delta: deltas[a:b] 92 | }) 93 | 94 | 95 | def predict(self, states, actions): 96 | """ Write a function to take in a batch of (unnormalized) states and (unnormalized) actions and return the (unnormalized) next states as predicted by using the model """ 97 | """ YOUR CODE HERE """ 98 | n_states = normalize(states, self.normalization["observations"]) 99 | n_acts = normalize(actions, self.normalization["actions"]) 100 | deltas = self.sess.run(self.dyn, feed_dict={self.input_state: n_states, 101 | self.input_act: n_acts}) 102 | deltas = denormalize(deltas, self.normalization["deltas"]) 103 | return deltas + states 104 | 105 | -------------------------------------------------------------------------------- /hw4/hw4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw4/hw4.pdf -------------------------------------------------------------------------------- /hw4/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | if osp.exists(G.output_dir): 55 | print("Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir) 56 | else: 57 | os.makedirs(G.output_dir) 58 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 59 | atexit.register(G.output_file.close) 60 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 61 | 62 | def log_tabular(key, val): 63 | """ 64 | Log a value of some diagnostic 65 | Call this once for each diagnostic quantity, each iteration 66 | """ 67 | if G.first_row: 68 | G.log_headers.append(key) 69 | else: 70 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 71 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 72 | G.log_current_row[key] = val 73 | 74 | def save_params(params): 75 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 76 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 77 | 78 | def pickle_tf_vars(): 79 | """ 80 | Saves tensorflow variables 81 | Requires them to be initialized first, also a default session must exist 82 | """ 83 | _dict = {v.name : v.eval() for v in tf.global_variables()} 84 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 85 | pickle.dump(_dict, f) 86 | 87 | 88 | def dump_tabular(): 89 | """ 90 | Write all of the diagnostics from the current iteration 91 | """ 92 | vals = [] 93 | key_lens = [len(key) for key in G.log_headers] 94 | max_key_len = max(15,max(key_lens)) 95 | keystr = '%'+'%d'%max_key_len 96 | fmt = "| " + keystr + "s | %15s |" 97 | n_slashes = 22 + max_key_len 98 | print("-"*n_slashes) 99 | for key in G.log_headers: 100 | val = G.log_current_row.get(key, "") 101 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 102 | else: valstr = val 103 | print(fmt%(key, valstr)) 104 | vals.append(val) 105 | print("-"*n_slashes) 106 | if G.output_file is not None: 107 | if G.first_row: 108 | G.output_file.write("\t".join(G.log_headers)) 109 | G.output_file.write("\n") 110 | G.output_file.write("\t".join(map(str,vals))) 111 | G.output_file.write("\n") 112 | G.output_file.flush() 113 | G.log_current_row.clear() 114 | G.first_row=False 115 | -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from dynamics import NNDynamicsModel 5 | from controllers import MPCcontroller, RandomController 6 | from cost_functions import cheetah_cost_fn, trajectory_cost_fn 7 | import time 8 | import logz 9 | import os 10 | import copy 11 | import matplotlib.pyplot as plt 12 | from cheetah_env import HalfCheetahEnvNew 13 | 14 | def sample(env, 15 | controller, 16 | num_paths=10, 17 | horizon=1000, 18 | render=False, 19 | verbose=False): 20 | """ 21 | Write a sampler function which takes in an environment, a controller (either random or the MPC controller), 22 | and returns rollouts by running on the env. 23 | Each path can have elements for observations, next_observations, rewards, returns, actions, etc. 
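    In the implementation below, rather than a list of per-path dicts, the rollouts
    are returned as a single dict of numpy arrays flattened across paths, with keys
    'observations', 'actions', 'next_observations', 'rewards', plus the per-path
    bookkeeping 'ep_lens' (episode lengths, used to delimit episodes) and
    'acc_rewards' (undiscounted return of each episode).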
24 | """ 25 | """YOUR CODE HERE """ 26 | paths = { 27 | "observations":[], 28 | "next_observations":[], 29 | "rewards":[], 30 | "actions":[], 31 | "ep_lens":[], 32 | "acc_rewards":[] 33 | } 34 | 35 | for i in range(num_paths): 36 | animate_this_rollout = render and (i%10 == 0) 37 | print("Sample Path {} / {}".format(i, num_paths)) 38 | ob = env.reset() 39 | ep_len = 0 40 | while ep_len < horizon: 41 | if animate_this_rollout: 42 | env.render() 43 | time.sleep(0.05) 44 | 45 | paths["observations"].append(ob) 46 | act = controller.get_action(ob) 47 | ob, rew, done, _ = env.step(act) 48 | 49 | paths["actions"].append(act) 50 | paths["next_observations"].append(ob) 51 | paths["rewards"].append(rew) 52 | 53 | ep_len += 1 54 | if done: break 55 | 56 | paths["ep_lens"].append(ep_len) 57 | paths["acc_rewards"].append(sum(paths["rewards"][-ep_len:])) 58 | 59 | if verbose: 60 | print("************* New Sample *************") 61 | returns = paths["acc_rewards"] 62 | ep_lengths = paths["ep_lens"] 63 | print("AverageReturn", np.mean(returns)) 64 | print("StdReturn", np.std(returns)) 65 | print("MaxReturn", np.max(returns)) 66 | print("MinReturn", np.min(returns)) 67 | print("EpLenMean", np.mean(ep_lengths)) 68 | print("EpLenStd", np.std(ep_lengths)) 69 | 70 | for key in paths.keys(): 71 | paths[key] = np.array(paths[key]) 72 | 73 | return paths 74 | 75 | # Utility to compute cost a path for a given cost function 76 | def path_cost(cost_fn, path): 77 | costs = [] 78 | acc = 0 79 | for i in path["ep_lens"]: 80 | acc_n = acc + i 81 | costs.append(trajectory_cost_fn(cost_fn, path['observations'][acc:acc_n], path['actions'][acc:acc_n], path['next_observations'][acc:acc_n])) 82 | acc = acc_n 83 | return costs 84 | 85 | def compute_normalization(data): 86 | """ 87 | Write a function to take in a dataset and compute the means, and stds. 88 | Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions 89 | """ 90 | """ YOUR CODE HERE """ 91 | return (np.mean(data, axis=0), np.std(data, axis=0)) 92 | 93 | 94 | def plot_comparison(env, dyn_model): 95 | """ 96 | Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 97 | """ 98 | """ YOUR CODE HERE """ 99 | pass 100 | 101 | 102 | def train(env, 103 | cost_fn, 104 | logdir=None, 105 | render=False, 106 | learning_rate=1e-3, 107 | onpol_iters=10, 108 | dynamics_iters=60, 109 | batch_size=512, 110 | num_paths_random=10, 111 | num_paths_onpol=10, 112 | num_simulated_paths=10000, 113 | env_horizon=1000, 114 | mpc_horizon=15, 115 | n_layers=2, 116 | size=500, 117 | activation=tf.nn.relu, 118 | output_activation=None 119 | ): 120 | 121 | """ 122 | 123 | Arguments: 124 | 125 | onpol_iters Number of iterations of onpolicy aggregation for the loop to run. 126 | 127 | dynamics_iters Number of iterations of training for the dynamics model 128 | |_ which happen per iteration of the aggregation loop. 129 | 130 | batch_size Batch size for dynamics training. 131 | 132 | num_paths_random Number of paths/trajectories/rollouts generated 133 | | by a random agent. We use these to train our 134 | |_ initial dynamics model. 135 | 136 | num_paths_onpol Number of paths to collect at each iteration of 137 | |_ aggregation, using the Model Predictive Control policy. 138 | 139 | num_simulated_paths How many fictitious rollouts the MPC policy 140 | | should generate each time it is asked for an 141 | |_ action. 
142 | 143 | env_horizon Number of timesteps in each path. 144 | 145 | mpc_horizon The MPC policy generates actions by imagining 146 | | fictitious rollouts, and picking the first action 147 | | of the best fictitious rollout. This argument is 148 | | how many timesteps should be in each fictitious 149 | |_ rollout. 150 | 151 | n_layers/size/activations Neural network architecture arguments. 152 | 153 | """ 154 | 155 | logz.configure_output_dir(logdir) 156 | 157 | #======================================================== 158 | # 159 | # First, we need a lot of data generated by a random 160 | # agent, with which we'll begin to train our dynamics 161 | # model. 162 | 163 | random_controller = RandomController(env) 164 | """ YOUR CODE HERE """ 165 | paths = sample(env=env, controller=random_controller, 166 | num_paths=num_paths_random, horizon=env_horizon, verbose=False) 167 | 168 | #======================================================== 169 | # 170 | # The random data will be used to get statistics (mean 171 | # and std) for the observations, actions, and deltas 172 | # (where deltas are o_{t+1} - o_t). These will be used 173 | # for normalizing inputs and denormalizing outputs 174 | # from the dynamics network. 175 | # 176 | """ YOUR CODE HERE """ 177 | normalization = { 178 | "observations": compute_normalization(paths["observations"]), 179 | "actions": compute_normalization(paths["actions"]), 180 | "deltas": compute_normalization(paths["next_observations"] - paths["observations"]) 181 | } 182 | 183 | #======================================================== 184 | # 185 | # Build dynamics model and MPC controllers. 186 | # 187 | sess = tf.Session() 188 | 189 | dyn_model = NNDynamicsModel(env=env, 190 | n_layers=n_layers, 191 | size=size, 192 | activation=activation, 193 | output_activation=output_activation, 194 | normalization=normalization, 195 | batch_size=batch_size, 196 | iterations=dynamics_iters, 197 | learning_rate=learning_rate, 198 | sess=sess) 199 | 200 | mpc_controller = MPCcontroller(env=env, 201 | dyn_model=dyn_model, 202 | horizon=mpc_horizon, 203 | cost_fn=cost_fn, 204 | num_simulated_paths=num_simulated_paths) 205 | 206 | 207 | #======================================================== 208 | # 209 | # Tensorflow session building. 210 | # 211 | sess.__enter__() 212 | tf.global_variables_initializer().run() 213 | 214 | #======================================================== 215 | # 216 | # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
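    # Concretely, each iteration below (1) shuffles the aggregated dataset and
    # refits the dynamics model on it, (2) collects num_paths_onpol on-policy
    # rollouts with the MPC controller, (3) logs cost and return statistics for
    # those rollouts, and (4) appends them to the dataset for the next iteration.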
217 | # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 218 | # 219 | for itr in range(onpol_iters): 220 | """ YOUR CODE HERE """ 221 | shuffle_indexes = np.random.permutation(paths["observations"].shape[0]) 222 | for key in ['observations', 'actions', 'next_observations', 'rewards']: 223 | paths[key] = paths[key][shuffle_indexes] 224 | 225 | dyn_model.fit(paths) 226 | 227 | newpaths = sample(env=env, controller=mpc_controller, 228 | num_paths=num_paths_onpol, horizon=env_horizon, verbose=False) 229 | 230 | # LOGGING 231 | # Statistics for performance of MPC policy using 232 | # our learned dynamics model 233 | costs = path_cost(cost_fn, newpaths) 234 | returns = newpaths["acc_rewards"] 235 | 236 | logz.log_tabular('Iteration', itr) 237 | # In terms of cost function which your MPC controller uses to plan 238 | logz.log_tabular('AverageCost', np.mean(costs)) 239 | logz.log_tabular('StdCost', np.std(costs)) 240 | logz.log_tabular('MinimumCost', np.min(costs)) 241 | logz.log_tabular('MaximumCost', np.max(costs)) 242 | # In terms of true environment reward of your rolled out trajectory using the MPC controller 243 | logz.log_tabular('AverageReturn', np.mean(returns)) 244 | logz.log_tabular('StdReturn', np.std(returns)) 245 | logz.log_tabular('MinimumReturn', np.min(returns)) 246 | logz.log_tabular('MaximumReturn', np.max(returns)) 247 | logz.dump_tabular() 248 | 249 | for key in ['observations', 'actions', 'next_observations', 'rewards']: 250 | paths[key] = np.concatenate([paths[key], newpaths[key]]) 251 | 252 | def main(): 253 | import argparse 254 | parser = argparse.ArgumentParser() 255 | parser.add_argument('--env_name', type=str, default='HalfCheetah-v1') 256 | # Experiment meta-params 257 | parser.add_argument('--exp_name', type=str, default='mb_mpc') 258 | parser.add_argument('--seed', type=int, default=3) 259 | parser.add_argument('--render', action='store_true') 260 | # Training args 261 | parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3) 262 | parser.add_argument('--onpol_iters', '-n', type=int, default=1) 263 | parser.add_argument('--dyn_iters', '-nd', type=int, default=60) 264 | parser.add_argument('--batch_size', '-b', type=int, default=512) 265 | # Data collection 266 | parser.add_argument('--random_paths', '-r', type=int, default=10) 267 | parser.add_argument('--onpol_paths', '-d', type=int, default=10) 268 | parser.add_argument('--simulated_paths', '-sp', type=int, default=1000) 269 | parser.add_argument('--ep_len', '-ep', type=int, default=1000) 270 | # Neural network architecture args 271 | parser.add_argument('--n_layers', '-l', type=int, default=2) 272 | parser.add_argument('--size', '-s', type=int, default=500) 273 | # MPC Controller 274 | parser.add_argument('--mpc_horizon', '-m', type=int, default=15) 275 | args = parser.parse_args() 276 | 277 | # Set seed 278 | np.random.seed(args.seed) 279 | tf.set_random_seed(args.seed) 280 | 281 | # Make data directory if it does not already exist 282 | if not(os.path.exists('data')): 283 | os.makedirs('data') 284 | logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 285 | logdir = os.path.join('data', logdir) 286 | if not(os.path.exists(logdir)): 287 | os.makedirs(logdir) 288 | 289 | # Make env 290 | if args.env_name is "HalfCheetah-v1": 291 | env = HalfCheetahEnvNew() 292 | cost_fn = cheetah_cost_fn 293 | train(env=env, 294 | cost_fn=cost_fn, 295 | logdir=logdir, 296 | render=args.render, 297 
| learning_rate=args.learning_rate, 298 | onpol_iters=args.onpol_iters, 299 | dynamics_iters=args.dyn_iters, 300 | batch_size=args.batch_size, 301 | num_paths_random=args.random_paths, 302 | num_paths_onpol=args.onpol_paths, 303 | num_simulated_paths=args.simulated_paths, 304 | env_horizon=args.ep_len, 305 | mpc_horizon=args.mpc_horizon, 306 | n_layers=args.n_layers, 307 | size=args.size, 308 | activation=tf.nn.relu, 309 | output_activation=None, 310 | ) 311 | 312 | if __name__ == "__main__": 313 | main() 314 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistic, change what you pass in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | plot all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir.
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /sp17_hw/hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 
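As a companion to the README above: the roll-outs that `run_expert.py` collects are meant to serve as supervised training data for imitation learning. Below is a minimal behavioral-cloning sketch under the assumption that the `expert_data` dict built in `run_expert.py` has been pickled to disk (the starter script shown later does not actually save it); the file name `expert_data.pkl`, the network architecture, and the hyperparameters are illustrative choices, not part of the assignment.

```python
import pickle
import numpy as np
import tensorflow as tf

# Assumed: run_expert.py's expert_data dict was saved somewhere with pickle.dump(...)
with open('expert_data.pkl', 'rb') as f:
    data = pickle.load(f)
obs = data['observations'].astype(np.float32)                    # (N, obs_dim)
acts = data['actions'].reshape(len(obs), -1).astype(np.float32)  # (N, act_dim)

obs_ph = tf.placeholder(tf.float32, [None, obs.shape[1]])
act_ph = tf.placeholder(tf.float32, [None, acts.shape[1]])
h = tf.layers.dense(obs_ph, 64, activation=tf.tanh)
h = tf.layers.dense(h, 64, activation=tf.tanh)
pred_ac = tf.layers.dense(h, acts.shape[1])
loss = tf.reduce_mean(tf.square(pred_ac - act_ph))               # regress onto expert actions
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(20):
        idx = np.random.permutation(len(obs))
        for start in range(0, len(obs), 128):
            batch = idx[start:start + 128]
            sess.run(train_op, {obs_ph: obs[batch], act_ph: acts[batch]})
```

The cloned policy is then evaluated by feeding the current observation through `pred_ac` at every step, in the same way the expert policy is queried inside `run_expert.py`.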
19 | -------------------------------------------------------------------------------- /sp17_hw/hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /sp17_hw/hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /sp17_hw/hw2/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /sp17_hw/hw2/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | 5 | from gym import utils 6 | import discrete_env 7 | 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | MAPS = { 14 | "4x4": [ 15 | "SFFF", 16 | "FHFH", 17 | "FFFH", 18 | "HFFG" 19 | ], 20 | "8x8": [ 21 | "SFFFFFFF", 22 | "FFFFFFFF", 23 | "FFFHFFFF", 24 | "FFFFFHFF", 25 | "FFFHFFFF", 26 | "FHHFFFHF", 27 | "FHFFHFHF", 28 | "FFFHFFFG" 29 | ], 30 | } 31 | 32 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 33 | """ 34 | Winter is here. You and your friends were tossing around a frisbee at the park 35 | when you made a wild throw that left the frisbee out in the middle of the lake. 36 | The water is mostly frozen, but there are a few holes where the ice has melted. 37 | If you step into one of those holes, you'll fall into the freezing water. 
38 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 39 | you navigate across the lake and retrieve the disc. 40 | However, the ice is slippery, so you won't always move in the direction you intend. 41 | The surface is described using a grid like the following 42 | 43 | SFFF 44 | FHFH 45 | FFFH 46 | HFFG 47 | 48 | S : starting point, safe 49 | F : frozen surface, safe 50 | H : hole, fall to your doom 51 | G : goal, where the frisbee is located 52 | 53 | The episode ends when you reach the goal or fall in a hole. 54 | You receive a reward of 1 if you reach the goal, and zero otherwise. 55 | 56 | """ 57 | 58 | metadata = {'render.modes': ['human', 'ansi']} 59 | 60 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 61 | if desc is None and map_name is None: 62 | raise ValueError('Must provide either desc or map_name') 63 | elif desc is None: 64 | desc = MAPS[map_name] 65 | self.desc = desc = np.asarray(desc,dtype='c') 66 | self.nrow, self.ncol = nrow, ncol = desc.shape 67 | 68 | nA = 4 69 | nS = nrow * ncol 70 | 71 | isd = np.array(desc == b'S').astype('float64').ravel() 72 | isd /= isd.sum() 73 | 74 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 75 | 76 | def to_s(row, col): 77 | return row*ncol + col 78 | def inc(row, col, a): 79 | if a==0: # left 80 | col = max(col-1,0) 81 | elif a==1: # down 82 | row = min(row+1,nrow-1) 83 | elif a==2: # right 84 | col = min(col+1,ncol-1) 85 | elif a==3: # up 86 | row = max(row-1,0) 87 | return (row, col) 88 | 89 | for row in range(nrow): 90 | for col in range(ncol): 91 | s = to_s(row, col) 92 | for a in range(4): 93 | li = P[s][a] 94 | letter = desc[row, col] 95 | if letter in b'GH': 96 | li.append((1.0, s, 0, True)) 97 | else: 98 | if is_slippery: 99 | for b in [(a-1)%4, a, (a+1)%4]: 100 | newrow, newcol = inc(row, col, b) 101 | newstate = to_s(newrow, newcol) 102 | newletter = desc[newrow, newcol] 103 | done = bytes(newletter) in b'GH' 104 | rew = float(newletter == b'G') 105 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 106 | else: 107 | newrow, newcol = inc(row, col, a) 108 | newstate = to_s(newrow, newcol) 109 | newletter = desc[newrow, newcol] 110 | done = bytes(newletter) in b'GH' 111 | rew = float(newletter == b'G') 112 | li.append((1.0, newstate, rew, done)) 113 | 114 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 115 | 116 | def _render(self, mode='human', close=False): 117 | if close: 118 | return 119 | outfile = StringIO() if mode == 'ansi' else sys.stdout 120 | 121 | row, col = self.s // self.ncol, self.s % self.ncol 122 | desc = self.desc.tolist() 123 | desc = [[c.decode('utf-8') for c in line] for line in desc] 124 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 125 | if self.lastaction is not None: 126 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 127 | else: 128 | outfile.write("\n") 129 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 130 | 131 | return outfile 132 | -------------------------------------------------------------------------------- /sp17_hw/hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /sp17_hw/hw3/atari_wrappers.py: 
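(Before the hw3 files that follow, a brief aside on the `DiscreteEnv`/`FrozenLakeEnv` code above.) Each entry of `P[s][a]` is a list of `(probability, nextstate, reward, done)` tuples, which is exactly the interface a tabular dynamic-programming method needs. The sketch below runs value iteration against that structure; the discount factor and tolerance are arbitrary illustrative choices, and it assumes it is run from the `sp17_hw/hw2` directory so that `frozen_lake` is importable.

```python
import numpy as np
from frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv(map_name="4x4", is_slippery=True)
gamma, tol = 0.99, 1e-8

def backup(V, s, a):
    # Expected return of taking action a in state s under the current value estimate
    return sum(p * (r + gamma * V[s2] * (not done)) for p, s2, r, done in env.P[s][a])

V = np.zeros(env.nS)
while True:
    V_new = np.array([max(backup(V, s, a) for a in range(env.nA)) for s in range(env.nS)])
    if np.max(np.abs(V_new - V)) < tol:
        break
    V = V_new

# Greedy policy with respect to the converged value function
policy = np.array([np.argmax([backup(V, s, a) for a in range(env.nA)]) for s in range(env.nS)])
```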
-------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /sp17_hw/hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | 11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 12 | 13 | def learn(env, 14 | q_func, 15 | optimizer_spec, 16 | session, 17 | exploration=LinearSchedule(1000000, 0.1), 18 | stopping_criterion=None, 19 | replay_buffer_size=1000000, 20 | batch_size=32, 21 | gamma=0.99, 22 | learning_starts=50000, 23 | learning_freq=4, 24 | frame_history_len=4, 25 | target_update_freq=10000, 26 | grad_norm_clipping=10): 27 | """Run Deep Q-learning algorithm. 28 | 29 | You can specify your own convnet using q_func. 30 | 31 | All schedules are w.r.t. total number of steps taken in the environment. 32 | 33 | Parameters 34 | ---------- 35 | env: gym.Env 36 | gym environment to train on. 37 | q_func: function 38 | Model to use for computing the q function. It should accept the 39 | following named arguments: 40 | img_in: tf.Tensor 41 | tensorflow tensor representing the input image 42 | num_actions: int 43 | number of actions 44 | scope: str 45 | scope in which all the model related variables 46 | should be created 47 | reuse: bool 48 | whether previously created variables should be reused. 49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 
54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | # YOUR CODE HERE 131 | 132 | ###### 133 | 134 | # construct optimization op (with gradient clipping) 135 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 136 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 137 | train_fn = minimize_and_clip(optimizer, total_error, 138 | var_list=q_func_vars, clip_val=grad_norm_clipping) 139 | 140 | # update_target_fn will be called periodically to copy Q network to target Q network 141 | update_target_fn = [] 142 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 143 | sorted(target_q_func_vars, key=lambda v: v.name)): 144 | update_target_fn.append(var_target.assign(var)) 145 | update_target_fn = tf.group(*update_target_fn) 146 | 147 | # construct the replay buffer 148 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 149 | 150 | ############### 151 | # RUN ENV # 152 | ############### 153 | model_initialized = False 154 | num_param_updates = 0 155 | mean_episode_reward = -float('nan') 156 | best_mean_episode_reward = -float('inf') 157 | last_obs = env.reset() 158 | LOG_EVERY_N_STEPS = 10000 159 | 160 | for t in itertools.count(): 161 | ### 1. Check stopping criterion 162 | if stopping_criterion is not None and stopping_criterion(env, t): 163 | break 164 | 165 | ### 2. Step the env and store the transition 166 | # At this point, "last_obs" contains the latest observation that was 167 | # recorded from the simulator. Here, your code needs to store this 168 | # observation and its outcome (reward, next observation, etc.) into 169 | # the replay buffer while stepping the simulator forward one step. 170 | # At the end of this block of code, the simulator should have been 171 | # advanced one step, and the replay buffer should contain one more 172 | # transition. 173 | # Specifically, last_obs must point to the new latest observation. 174 | # Useful functions you'll need to call: 175 | # obs, reward, done, info = env.step(action) 176 | # this steps the environment forward one step 177 | # obs = env.reset() 178 | # this resets the environment if you reached an episode boundary. 179 | # Don't forget to call env.reset() to get a new observation if done 180 | # is true!! 181 | # Note that you cannot use "last_obs" directly as input 182 | # into your network, since it needs to be processed to include context 183 | # from previous frames. You should check out the replay buffer 184 | # implementation in dqn_utils.py to see what functionality the replay 185 | # buffer exposes. The replay buffer has a function called 186 | # encode_recent_observation that will take the latest observation 187 | # that you pushed into the buffer and compute the corresponding 188 | # input that should be given to a Q network by appending some 189 | # previous frames. 190 | # Don't forget to include epsilon greedy exploration! 191 | # And remember that the first time you enter this loop, the model 192 | # may not yet have been initialized (but of course, the first step 193 | # might as well be random, since you haven't trained your net...) 
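For reference, here is a hedged sketch of one standard way to fill in the two "YOUR CODE HERE" blanks described by the comment blocks above; it is not the course's official solution, and names such as `q_t`, `q_tp1`, and `target` are this sketch's own choices.

```python
# Illustrative only -- a sketch of vanilla DQN, not the official solution.

# (a) Q-networks and Bellman error (belongs in the BUILD MODEL block above):
q_t = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
q_tp1 = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False)
q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="q_func")
target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_q_func")

q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), axis=1)
target = rew_t_ph + gamma * (1.0 - done_mask_ph) * tf.reduce_max(q_tp1, axis=1)
total_error = tf.reduce_mean(huber_loss(q_t_selected - tf.stop_gradient(target)))

# (b) Stepping the env with epsilon-greedy exploration and storing the transition (step 2):
idx = replay_buffer.store_frame(last_obs)
if not model_initialized or random.random() < exploration.value(t):
    action = env.action_space.sample()
else:
    recent_obs = replay_buffer.encode_recent_observation()
    action = np.argmax(session.run(q_t, {obs_t_ph: recent_obs[None]}))
obs, reward, done, info = env.step(action)
replay_buffer.store_effect(idx, action, reward, done)
last_obs = env.reset() if done else obs
```

In this sketch `tf.stop_gradient` keeps gradients from flowing into the target, mirroring the role of the separate target network: only `q_func_vars` are updated through `total_error`.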
194 | 195 | ##### 196 | 197 | # YOUR CODE HERE 198 | 199 | ##### 200 | 201 | # at this point, the environment should have been advanced one step (and 202 | # reset if done was true), and last_obs should point to the new latest 203 | # observation 204 | 205 | ### 3. Perform experience replay and train the network. 206 | # note that this is only done if the replay buffer contains enough samples 207 | # for us to learn something useful -- until then, the model will not be 208 | # initialized and random actions should be taken 209 | if (t > learning_starts and 210 | t % learning_freq == 0 and 211 | replay_buffer.can_sample(batch_size)): 212 | # Here, you should perform training. Training consists of four steps: 213 | # 3.a: use the replay buffer to sample a batch of transitions (see the 214 | # replay buffer code for function definition, each batch that you sample 215 | # should consist of current observations, current actions, rewards, 216 | # next observations, and done indicator). 217 | # 3.b: initialize the model if it has not been initialized yet; to do 218 | # that, call 219 | # initialize_interdependent_variables(session, tf.global_variables(), { 220 | # obs_t_ph: obs_t_batch, 221 | # obs_tp1_ph: obs_tp1_batch, 222 | # }) 223 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 224 | # the current and next time step. The boolean variable model_initialized 225 | # indicates whether or not the model has been initialized. 226 | # Remember that you have to update the target network too (see 3.d)! 227 | # 3.c: train the model. To do this, you'll need to use the train_fn and 228 | # total_error ops that were created earlier: total_error is what you 229 | # created to compute the total Bellman error in a batch, and train_fn 230 | # will actually perform a gradient step and update the network parameters 231 | # to reduce total_error. When calling session.run on these you'll need to 232 | # populate the following placeholders: 233 | # obs_t_ph 234 | # act_t_ph 235 | # rew_t_ph 236 | # obs_tp1_ph 237 | # done_mask_ph 238 | # (this is needed for computing total_error) 239 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 240 | # (this is needed by the optimizer to choose the learning rate) 241 | # 3.d: periodically update the target network by calling 242 | # session.run(update_target_fn) 243 | # you should update every target_update_freq steps, and you may find the 244 | # variable num_param_updates useful for this (it was initialized to 0) 245 | ##### 246 | 247 | # YOUR CODE HERE 248 | 249 | ##### 250 | 251 | ### 4. 
Log progress 252 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 253 | if len(episode_rewards) > 0: 254 | mean_episode_reward = np.mean(episode_rewards[-100:]) 255 | if len(episode_rewards) > 100: 256 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 257 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 258 | print("Timestep %d" % (t,)) 259 | print("mean reward (100 episodes) %f" % mean_episode_reward) 260 | print("best mean reward %f" % best_mean_episode_reward) 261 | print("episodes %d" % len(episode_rewards)) 262 | print("exploration %f" % exploration.value(t)) 263 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 264 | sys.stdout.flush() 265 | -------------------------------------------------------------------------------- /sp17_hw/hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 
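To make the interpolation rule described above concrete, here is a small worked example (the endpoint values are arbitrary):

```python
# Illustrative only: evaluating a PiecewiseSchedule at a few points.
sched = PiecewiseSchedule([(0, 1.0), (100, 0.1)], outside_value=0.1)
sched.value(0)     # 1.0  -> left endpoint
sched.value(50)    # 0.55 -> halfway between 1.0 and 0.1 (alpha = 0.5)
sched.value(1000)  # 0.1  -> outside every interval, so outside_value is returned
```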
67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. 
This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The sepecific memory optimizations use here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-performance 182 | to cast them back to float32 on GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the tipical use case in Atari Deep RL buffer with 1M frames the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning frame of zeros at the beginning 190 | of the episode, when there is less frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retried for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 228 | 229 | i-th sample transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the epsiode 234 | was done which is represented by `done_mask[i]` which is equal 235 | to 1 if episode has ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 
241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 
58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | 
env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /sp17_hw/hw4/homework.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | 3 | In `main.py` you will find an implementation of a "vanilla" policy gradient method, applied to an MDP with a discrete action space: an episodic version of the classic "cartpole" task. First, make sure the provided code works on your computer by running `python main.py`. We recommend reading through all of the code and comments in the function `main_cartpole`, starting at the top of the function. 4 | 5 | The code computes some useful diagnostics, which you may find helpful to look at while tuning hyperparameters: 6 | 7 | - **KL[policy before update || policy after update]**. Large spikes in KL divergence mean that the optimization took a large step, and sometimes these spikes cause a collapse in performance. 8 | - **Entropy of the policy**. If entropy goes down too fast, then you may not explore enough, but if it goes down too slowly, you'll probably not reach optimal performance. 9 | - **Explained variance of the value function**. If the value function perfectly explains the returns, then it will be 1; if you get a negative result, then it's worse than predicting a constant. 10 | 11 | Software dependencies: 12 | 13 | - tensorflow 14 | - numpy + scipy (Anaconda recommended) 15 | - gym (I'm using 0.8.0, `pip install gym==0.8.0`, but old versions should work just as well) 16 | 17 | ## Problem 1 18 | 19 | Here you will modify the `main_cartpole` policy gradient implementation to work on a continuous action space, specifically, the gym environment `Pendulum-v0`. In `main_cartpole`, note that the neural network outputs "logits" (i.e., log-probabilities up to an additive constant) that specify a categorical distribution. For the pendulum task, on the other hand, your neural network should output the mean of a Gaussian distribution, with a separate parameter vector parameterizing the log standard deviation. For example, you could use the following code: 20 | 21 | ```python 22 | 23 | mean_na = dense(h2, ac_dim, "mean", weight_init=normc_initializer(0.1)) # Mean control output ("mean" is just the variable-scope name required by dense()) 24 | logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer()) # Log standard deviation 25 | 26 | sy_sampled_ac = YOUR_CODE_HERE 27 | sy_logprob_n = YOUR_CODE_HERE 28 | 29 | ``` 30 | 31 | You should also compute differential entropy (replacing `sy_ent`) and KL-divergence (`sy_kl`) for the Gaussian distribution; one possible way to fill in these pieces is sketched in the notes appended at the end of this document. 32 | 33 | The pendulum problem is slightly harder, and using a fixed stepsize does not work reliably---thus, we instead recommend using an adaptive stepsize, where you adjust it based on the KL divergence between the new and old policy. Code for this stepsize adaptation is provided. 34 | 35 | You can plot your results using the script `plot_learning_curves.py` or your own plotting code. 36 | 37 | **Deliverables** 38 | 39 | - Show a plot with the pendulum converging to EpRewMean of at least `-300`.
Include EpRewMean, KL, Entropy in your plots. 40 | - Describe the hyperparameters used and how many timesteps your algorithm took to learn. 41 | 42 | ## Problem 2 43 | 44 | 1. Implement a neural network value function with the same interface as `LinearVF`. Add it to the provided cartpole solver, and compare the performance of the linear and neural network value function (i.e., baseline). 45 | 2. Perform the same comparison--linear vs neural network--for your pendulum solver from Problem 1. You should be able to obtain faster learning using the neural network. 46 | 47 | 48 | **Deliverables** 49 | 50 | - A comparison of linear vs neural network value function on the cartpole. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 51 | - A comparison of linear vs neural network value function on the pendulum. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 52 | 53 | In both cases, list the hyperparameters used for neural network training. 54 | 55 | ## Problem 3 (bonus) 56 | 57 | Implement a more advanced policy gradient method from lecture (such as TRPO, or the advantage function estimator used in A3C or generalized advantage estimation), and apply it to the gym environment `Hopper-v1`. See if you can learn a good gait in less than 500,000 timesteps. 58 | Hint: it may help to standardize your inputs using a running estimate of mean and standard deviation. 59 | 60 | ob_rescaled = (ob_raw - mean) / (stdev + epsilon) 61 | 62 | **Deliverables** 63 | 64 | A description of what you implemented, and learning curves on the Hopper-v1 environment. -------------------------------------------------------------------------------- /sp17_hw/hw4/logz.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | Assumes that each diagnostic gets logged each iteration 5 | 6 | Call logz.configure_output_dir() to start logging to a 7 | tab-separated-values file (some_folder_name/log.txt) 8 | 9 | To load the learning curves, you can do, for example 10 | 11 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 12 | A['EpRewMean'] 13 | 14 | """ 15 | 16 | import os.path as osp, shutil, time, atexit, os, subprocess 17 | 18 | color2num = dict( 19 | gray=30, 20 | red=31, 21 | green=32, 22 | yellow=33, 23 | blue=34, 24 | magenta=35, 25 | cyan=36, 26 | white=37, 27 | crimson=38 28 | ) 29 | 30 | def colorize(string, color, bold=False, highlight=False): 31 | attr = [] 32 | num = color2num[color] 33 | if highlight: num += 10 34 | attr.append(str(num)) 35 | if bold: attr.append('1') 36 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 37 | 38 | class G: 39 | output_dir = None 40 | output_file = None 41 | first_row = True 42 | log_headers = [] 43 | log_current_row = {} 44 | 45 | def configure_output_dir(d=None): 46 | """ 47 | Set output directory to d, or to /tmp/somerandomnumber if d is None 48 | """ 49 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 50 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 51 | os.makedirs(G.output_dir) 52 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 53 | atexit.register(G.output_file.close) 54 | try: 55 | cmd = "cd %s && git diff > %s 2>/dev/null"%(osp.dirname(__file__), osp.join(G.output_dir, "a.diff")) 56 | subprocess.check_call(cmd, shell=True) # Save git diff to experiment directory 57 | except subprocess.CalledProcessError: 58 | print("configure_output_dir: not storing the git diff, probably because you're not in a git repo") 59 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 60 | 61 | def log_tabular(key, val): 62 | """ 63 | Log a value of some diagnostic 64 | Call this once for each diagnostic quantity, each iteration 65 | """ 66 | if G.first_row: 67 | G.log_headers.append(key) 68 | else: 69 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 70 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 71 | G.log_current_row[key] = val 72 | 73 | def dump_tabular(): 74 | """ 75 | Write all of the diagnostics from the current iteration 76 | """ 77 | vals = [] 78 | print("-"*37) 79 | for key in G.log_headers: 80 | val = G.log_current_row.get(key, "") 81 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 82 | else: valstr = val 83 | print("| %15s | %15s |"%(key, valstr)) 84 | vals.append(val) 85 | print("-"*37) 86 | if G.output_file is not None: 87 | if G.first_row: 88 | G.output_file.write("\t".join(G.log_headers)) 89 | G.output_file.write("\n") 90 | G.output_file.write("\t".join(map(str,vals))) 91 | G.output_file.write("\n") 92 | G.output_file.flush() 93 | G.log_current_row.clear() 94 | G.first_row=False -------------------------------------------------------------------------------- /sp17_hw/hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import logz 5 | import scipy.signal 6 | 7 | def normc_initializer(std=1.0): 8 | """ 9 | Initialize array with normalized columns 10 | """ 11 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 12 | out = np.random.randn(*shape).astype(np.float32) 13 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 14 | return tf.constant(out) 15 | return _initializer 16 | 17 | 18 | def dense(x, size, name, weight_init=None): 19 | """ 20 | Dense (fully connected) layer 21 | """ 22 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 23 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 24 | return tf.matmul(x, w) + b 25 | 26 | def fancy_slice_2d(X, inds0, inds1): 27 | """ 28 | Like numpy's X[inds0, inds1] 29 | """ 30 | inds0 = tf.cast(inds0, tf.int64) 31 | inds1 = tf.cast(inds1, tf.int64) 32 | shape = tf.cast(tf.shape(X), tf.int64) 33 | ncols = shape[1] 34 | Xflat = tf.reshape(X, [-1]) 35 | return tf.gather(Xflat, inds0 * ncols + inds1) 36 | 37 | def discount(x, gamma): 38 | """ 39 | Compute discounted sum of future values 40 | out[i] = in[i] + gamma * in[i+1] + gamma^2 * in[i+2] + ... 41 | """ 42 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 43 | 44 | def explained_variance_1d(ypred,y): 45 | """ 46 | Computes 1 - Var[y - ypred] / Var[y], the proportion of the variance in y explained by ypred.
47 | https://www.quora.com/What-is-the-meaning-proportion-of-variance-explained-in-linear-regression 48 | """ 49 | assert y.ndim == 1 and ypred.ndim == 1 50 | vary = np.var(y) 51 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 52 | 53 | def categorical_sample_logits(logits): 54 | """ 55 | Samples (symbolically) from categorical distribution, where logits is an NxK 56 | matrix specifying N categorical distributions with K categories 57 | 58 | specifically, exp(logits) / sum( exp(logits), axis=1 ) is the 59 | probabilities of the different classes 60 | 61 | Cleverly uses the Gumbel trick, based on 62 | https://github.com/tensorflow/tensorflow/issues/456 63 | """ 64 | U = tf.random_uniform(tf.shape(logits)) 65 | return tf.argmax(logits - tf.log(-tf.log(U)), dimension=1) 66 | 67 | def pathlength(path): 68 | return len(path["reward"]) 69 | 70 | class LinearValueFunction(object): 71 | coef = None 72 | def fit(self, X, y): 73 | Xp = self.preproc(X) 74 | A = Xp.T.dot(Xp) 75 | nfeats = Xp.shape[1] 76 | A[np.arange(nfeats), np.arange(nfeats)] += 1e-3 # a little ridge regression 77 | b = Xp.T.dot(y) 78 | self.coef = np.linalg.solve(A, b) 79 | def predict(self, X): 80 | if self.coef is None: 81 | return np.zeros(X.shape[0]) 82 | else: 83 | return self.preproc(X).dot(self.coef) 84 | def preproc(self, X): 85 | return np.concatenate([np.ones([X.shape[0], 1]), X, np.square(X)/2.0], axis=1) 86 | 87 | class NnValueFunction(object): 88 | pass # YOUR CODE HERE 89 | 90 | def lrelu(x, leak=0.2): 91 | f1 = 0.5 * (1 + leak) 92 | f2 = 0.5 * (1 - leak) 93 | return f1 * x + f2 * abs(x) 94 | 95 | 96 | 97 | def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): 98 | env = gym.make("CartPole-v0") 99 | ob_dim = env.observation_space.shape[0] 100 | num_actions = env.action_space.n 101 | logz.configure_output_dir(logdir) 102 | vf = LinearValueFunction() 103 | 104 | # Symbolic variables have the prefix sy_, to distinguish them from the numerical values 105 | # that are computed later in this function 106 | sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations 107 | sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation 108 | sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate 109 | sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer 110 | sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer 111 | # we use a small initialization for the last layer, so the initial policy has maximal entropy 112 | sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) 113 | sy_logp_na = tf.nn.log_softmax(sy_logits_na) # log-probability of actions 114 | sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) 115 | sy_n = tf.shape(sy_ob_no)[0] 116 | sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation 117 | 118 | # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> 119 | sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) 120 | sy_oldp_na = 
tf.exp(sy_oldlogp_na) 121 | sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) 122 | sy_p_na = tf.exp(sy_logp_na) 123 | sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) 124 | # <<<<<<<<<<<<< 125 | 126 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 127 | 128 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 129 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 130 | 131 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 132 | # use single thread. on such a small problem, multithreading gives you a slowdown 133 | # this way, we can better use multiple cores for different experiments 134 | sess = tf.Session(config=tf_config) 135 | sess.__enter__() # equivalent to `with sess:` 136 | tf.global_variables_initializer().run() #pylint: disable=E1101 137 | 138 | total_timesteps = 0 139 | 140 | for i in range(n_iter): 141 | print("********** Iteration %i ************"%i) 142 | 143 | # Collect paths until we have enough timesteps 144 | timesteps_this_batch = 0 145 | paths = [] 146 | while True: 147 | ob = env.reset() 148 | terminated = False 149 | obs, acs, rewards = [], [], [] 150 | animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) 151 | while True: 152 | if animate_this_episode: 153 | env.render() 154 | obs.append(ob) 155 | ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) 156 | acs.append(ac) 157 | ob, rew, done, _ = env.step(ac) 158 | rewards.append(rew) 159 | if done: 160 | break 161 | path = {"observation" : np.array(obs), "terminated" : terminated, 162 | "reward" : np.array(rewards), "action" : np.array(acs)} 163 | paths.append(path) 164 | timesteps_this_batch += pathlength(path) 165 | if timesteps_this_batch > min_timesteps_per_batch: 166 | break 167 | total_timesteps += timesteps_this_batch 168 | # Estimate advantage function 169 | vtargs, vpreds, advs = [], [], [] 170 | for path in paths: 171 | rew_t = path["reward"] 172 | return_t = discount(rew_t, gamma) 173 | vpred_t = vf.predict(path["observation"]) 174 | adv_t = return_t - vpred_t 175 | advs.append(adv_t) 176 | vtargs.append(return_t) 177 | vpreds.append(vpred_t) 178 | 179 | # Build arrays for policy update 180 | ob_no = np.concatenate([path["observation"] for path in paths]) 181 | ac_n = np.concatenate([path["action"] for path in paths]) 182 | adv_n = np.concatenate(advs) 183 | standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) 184 | vtarg_n = np.concatenate(vtargs) 185 | vpred_n = np.concatenate(vpreds) 186 | vf.fit(ob_no, vtarg_n) 187 | 188 | # Policy update 189 | _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) 190 | kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) 191 | 192 | # Log diagnostics 193 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 194 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 195 | logz.log_tabular("KLOldNew", kl) 196 | logz.log_tabular("Entropy", ent) 197 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 198 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 199 | 
logz.log_tabular("TimestepsSoFar", total_timesteps) 200 | # If you're overfitting, EVAfter will be way larger than EVBefore. 201 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 202 | logz.dump_tabular() 203 | 204 | def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): 205 | tf.set_random_seed(seed) 206 | np.random.seed(seed) 207 | env = gym.make("Pendulum-v0") 208 | ob_dim = env.observation_space.shape[0] 209 | ac_dim = env.action_space.shape[0] 210 | logz.configure_output_dir(logdir) 211 | if vf_type == 'linear': 212 | vf = LinearValueFunction(**vf_params) 213 | elif vf_type == 'nn': 214 | vf = NnValueFunction(ob_dim=ob_dim, **vf_params) 215 | 216 | 217 | YOUR_CODE_HERE 218 | 219 | 220 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 221 | 222 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 223 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 224 | 225 | sess = tf.Session() 226 | sess.__enter__() # equivalent to `with sess:` 227 | tf.global_variables_initializer().run() #pylint: disable=E1101 228 | 229 | total_timesteps = 0 230 | stepsize = initial_stepsize 231 | 232 | for i in range(n_iter): 233 | print("********** Iteration %i ************"%i) 234 | 235 | YOUR_CODE_HERE 236 | 237 | if kl > desired_kl * 2: 238 | stepsize /= 1.5 239 | print('stepsize -> %s'%stepsize) 240 | elif kl < desired_kl / 2: 241 | stepsize *= 1.5 242 | print('stepsize -> %s'%stepsize) 243 | else: 244 | print('stepsize OK') 245 | 246 | 247 | # Log diagnostics 248 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 249 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 250 | logz.log_tabular("KLOldNew", kl) 251 | logz.log_tabular("Entropy", ent) 252 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 253 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 254 | logz.log_tabular("TimestepsSoFar", total_timesteps) 255 | # If you're overfitting, EVAfter will be way larger than EVBefore. 
256 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 257 | logz.dump_tabular() 258 | 259 | 260 | def main_pendulum1(d): 261 | return main_pendulum(**d) 262 | 263 | if __name__ == "__main__": 264 | if 1: 265 | main_cartpole(logdir=None) # when you want to start collecting results, set the logdir 266 | if 0: 267 | general_params = dict(gamma=0.97, animate=False, min_timesteps_per_batch=2500, n_iter=300, initial_stepsize=1e-3) 268 | params = [ 269 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 270 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 271 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 272 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 273 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 274 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 275 | ] 276 | import multiprocessing 277 | p = multiprocessing.Pool() 278 | p.map(main_pendulum1, params) 279 | -------------------------------------------------------------------------------- /sp17_hw/hw4/plot_learning_curves.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser() 3 | parser.add_argument("expdir", help="experiment dir, e.g., /tmp/experiments") 4 | args = parser.parse_args() 5 | 6 | from pylab import * 7 | import os 8 | from os.path import join 9 | 10 | dirnames = os.listdir(args.expdir) 11 | 12 | fig, axes = subplots(4) 13 | for dirname in dirnames: 14 | print(dirname) 15 | A = np.genfromtxt(join(args.expdir, dirname, 'log.txt'),delimiter='\t',dtype=None, names=True) 16 | # axes[0].plot(scipy.signal.savgol_filter(A['EpRewMean'] , 21, 3), '-x') 17 | x = A['TimestepsSoFar'] 18 | axes[0].plot(x, A['EpRewMean'], '-x') 19 | axes[1].plot(x, A['KLOldNew'], '-x') 20 | axes[2].plot(x, A['Entropy'], '-x') 21 | axes[3].plot(x, A['EVBefore'], '-x') 22 | legend(dirnames,loc='best').draggable() 23 | axes[0].set_ylabel("EpRewMean") 24 | axes[1].set_ylabel("KLOldNew") 25 | axes[2].set_ylabel("Entropy") 26 | axes[3].set_ylabel("EVBefore") 27 | axes[3].set_ylim(-1,1) 28 | axes[-1].set_xlabel("TimestepsSoFar") 29 | show() 30 | --------------------------------------------------------------------------------
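
Problem 1 in `sp17_hw/hw4/homework.md` leaves `sy_sampled_ac` and `sy_logprob_n` as placeholders and asks for Gaussian replacements for `sy_ent` and `sy_kl`. The block below is a minimal sketch of one way a diagonal-Gaussian policy head could be wired up, assuming the TF 1.x API used throughout this repo; the `sy_*` names, the hidden-layer size, and the use of `tf.contrib.layers` are illustrative assumptions, not the course's reference solution.

```python
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

ac_dim = 1   # Pendulum-v0 actions are 1-D
ob_dim = 3   # Pendulum-v0 observations are 3-D

sy_ob_no = tf.placeholder(tf.float32, [None, ob_dim], name="ob")
sy_ac_na = tf.placeholder(tf.float32, [None, ac_dim], name="ac")            # actions actually taken
sy_oldmean_na = tf.placeholder(tf.float32, [None, ac_dim], name="oldmean")  # pre-update mean (KL diagnostic)
sy_oldlogstd_a = tf.placeholder(tf.float32, [ac_dim], name="oldlogstd")     # pre-update log-std (KL diagnostic)

sy_h1 = layers.fully_connected(sy_ob_no, 32, activation_fn=tf.nn.tanh)
sy_mean_na = layers.fully_connected(sy_h1, ac_dim, activation_fn=None)      # state-dependent mean
sy_logstd_a = tf.get_variable("logstd", [ac_dim], initializer=tf.zeros_initializer())  # state-independent log-std
sy_std_a = tf.exp(sy_logstd_a)

# Sample an action as mean + std * z with z ~ N(0, I); [0] picks out a single action for env.step
sy_sampled_ac = (sy_mean_na + sy_std_a * tf.random_normal(tf.shape(sy_mean_na)))[0]

# Log-density of the taken actions under the current diagonal Gaussian
sy_z_na = (sy_ac_na - sy_mean_na) / sy_std_a
sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square(sy_z_na), axis=1)
                - tf.reduce_sum(sy_logstd_a)
                - 0.5 * ac_dim * np.log(2.0 * np.pi))

# Differential entropy of the policy (state-independent, since the std is)
sy_ent = tf.reduce_sum(sy_logstd_a + 0.5 * np.log(2.0 * np.pi * np.e))

# KL(old || new) between diagonal Gaussians, averaged over the batch
sy_oldstd_a = tf.exp(sy_oldlogstd_a)
sy_kl = tf.reduce_mean(tf.reduce_sum(
    sy_logstd_a - sy_oldlogstd_a
    + (tf.square(sy_oldstd_a) + tf.square(sy_oldmean_na - sy_mean_na)) / (2.0 * tf.square(sy_std_a))
    - 0.5, axis=1))
```

With pieces like these in place, the surrogate loss and the adaptive-stepsize loop already provided in `main_pendulum` can be reused as-is; `sy_oldmean_na` and `sy_oldlogstd_a` would be fed with the pre-update values, mirroring how `sy_oldlogits_na` is used in `main_cartpole`.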
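
Problem 2 asks for a neural-network value function with the same `fit`/`predict` interface as `LinearValueFunction`. Below is one possible shape for it, again only a hedged sketch: the hidden sizes, activation, and use of `tf.contrib.layers` are assumptions, while the constructor keywords (`ob_dim`, `n_epochs`, `stepsize`) mirror the `vf_params` dict in `main.py`'s `__main__` block. It assumes the object is constructed before `tf.global_variables_initializer()` runs and is used inside the default session that `main.py` enters.

```python
import tensorflow as tf
import tensorflow.contrib.layers as layers

class NnValueFunction(object):
    """Sketch of a neural-network baseline with LinearValueFunction's fit/predict interface."""
    def __init__(self, ob_dim, n_epochs=10, stepsize=1e-3):
        self.n_epochs = n_epochs
        with tf.variable_scope("nnvf"):
            self.sy_ob = tf.placeholder(tf.float32, shape=[None, ob_dim])
            self.sy_targ = tf.placeholder(tf.float32, shape=[None])
            h1 = layers.fully_connected(self.sy_ob, 64, activation_fn=tf.nn.relu)
            h2 = layers.fully_connected(h1, 64, activation_fn=tf.nn.relu)
            self.sy_vpred = layers.fully_connected(h2, 1, activation_fn=None)[:, 0]
            loss = tf.reduce_mean(tf.square(self.sy_vpred - self.sy_targ))
            self.train_op = tf.train.AdamOptimizer(stepsize).minimize(loss)

    def fit(self, X, y):
        # a few epochs of full-batch regression onto the empirical returns
        sess = tf.get_default_session()
        for _ in range(self.n_epochs):
            sess.run(self.train_op, feed_dict={self.sy_ob: X, self.sy_targ: y})

    def predict(self, X):
        sess = tf.get_default_session()
        return sess.run(self.sy_vpred, feed_dict={self.sy_ob: X})
```

A class along these lines plugs into the existing `vf_type='nn'` branch of `main_pendulum` without further changes, which is what the `EVBefore`/`EpRewMean` comparison in the deliverables relies on.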
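
The hint for Problem 3 suggests standardizing observations with running estimates of their mean and standard deviation. One way to keep such estimates is a Welford-style streaming update; the class below is an illustrative sketch only, and the name `RunningStandardizer` and its exact update rule are assumptions rather than provided course code.

```python
import numpy as np

class RunningStandardizer(object):
    """Keeps running per-dimension mean/std and rescales observations with them."""
    def __init__(self, shape, epsilon=1e-8):
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # running sum of squared deviations from the mean
        self.count = 0
        self.epsilon = epsilon

    def update(self, x):
        # Welford's online update for a single observation
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)

    def __call__(self, ob_raw):
        stdev = np.sqrt(self.m2 / max(self.count - 1, 1))
        return (ob_raw - self.mean) / (stdev + self.epsilon)

# Possible usage inside a rollout loop:
#   standardizer = RunningStandardizer(env.observation_space.shape)
#   standardizer.update(ob)
#   ob_rescaled = standardizer(ob)
```

Passing each raw observation through `update()` and then rescaling it before feeding the policy matches the `ob_rescaled = (ob_raw - mean) / (stdev + epsilon)` line in the homework hint.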