├── LICENSE
├── README.md
├── hw1
│   ├── README.md
│   ├── demo.bash
│   ├── experts
│   │   ├── Ant-v1.pkl
│   │   ├── HalfCheetah-v1.pkl
│   │   ├── Hopper-v1.pkl
│   │   ├── Humanoid-v1.pkl
│   │   ├── Reacher-v1.pkl
│   │   └── Walker2d-v1.pkl
│   ├── load_policy.py
│   ├── run_expert.py
│   └── tf_util.py
├── hw2
│   ├── .ipynb_checkpoints
│   │   └── HW2-checkpoint.ipynb
│   ├── HW2.ipynb
│   ├── discrete_env.py
│   ├── discrete_env.pyc
│   ├── frozen_lake.py
│   └── frozen_lake.pyc
├── hw3
│   ├── README
│   ├── atari_wrappers.py
│   ├── dqn.py
│   ├── dqn_utils.py
│   ├── run_dqn_atari.py
│   └── run_dqn_ram.py
├── hw4
│   ├── homework.md
│   ├── logz.py
│   ├── main.py
│   └── plot_learning_curves.py
└── plot_test.ipynb

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 berkeleydeeprlcourse
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cs294
2 | Berkeley Deep Reinforcement Learning cs294 solutions
3 | Email: lightaime@gmail.com
4 | 
--------------------------------------------------------------------------------
/hw1/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 1: Imitation Learning
2 | 
3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym
4 | 
5 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data.
6 | 
7 | In `experts/`, the provided expert policies are:
8 | * Ant-v1.pkl
9 | * HalfCheetah-v1.pkl
10 | * Hopper-v1.pkl
11 | * Humanoid-v1.pkl
12 | * Reacher-v1.pkl
13 | * Walker2d-v1.pkl
14 | 
15 | The name of the pickle file corresponds to the name of the gym environment.
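Below is a minimal behavioral-cloning sketch for this assignment. It is an illustration, not part of the starter code: the function name `train_bc_policy`, the hidden sizes, learning rate, and epoch count are placeholders, and it assumes the `expert_data` dict that `run_expert.py` builds at the end of `main()` has been handed to it (the starter script constructs that dict but never writes it to disk). Like `run_expert.py`, it is meant to run inside a `with tf.Session():` block.

```python
# Illustrative sketch only; hyperparameters and names are placeholders.
import numpy as np
import tensorflow as tf
import tf_util

def train_bc_policy(expert_data, epochs=20, batch_size=64, lr=1e-3):
    """Fit a small MLP to the expert's (observation, action) pairs by regression."""
    obs = expert_data['observations'].astype(np.float32)
    # run_expert.py appends actions of shape (1, act_dim); drop that extra axis.
    acts = expert_data['actions'].reshape(len(obs), -1).astype(np.float32)
    obs_dim, act_dim = obs.shape[1], acts.shape[1]

    obs_ph = tf.placeholder(tf.float32, [None, obs_dim])
    act_ph = tf.placeholder(tf.float32, [None, act_dim])

    # Two-hidden-layer policy network built from the helpers in tf_util.py.
    h1 = tf.tanh(tf_util.dense(obs_ph, 64, "bc_h1", weight_init=tf_util.normc_initializer(1.0)))
    h2 = tf.tanh(tf_util.dense(h1, 64, "bc_h2", weight_init=tf_util.normc_initializer(1.0)))
    pred = tf_util.dense(h2, act_dim, "bc_out", weight_init=tf_util.normc_initializer(0.1))

    # Mean squared error between predicted and expert actions.
    loss = tf_util.mean(tf_util.sum(tf.square(pred - act_ph), axis=1))
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    tf_util.initialize()  # expects an active default session, as in run_expert.py
    for _ in range(epochs):
        perm = np.random.permutation(len(obs))
        for start in range(0, len(obs), batch_size):
            batch = perm[start:start + batch_size]
            tf_util.get_session().run(train_op,
                                      {obs_ph: obs[batch], act_ph: acts[batch]})
    # Returns a callable mapping a batch of observations to predicted actions.
    return tf_util.function([obs_ph], pred)
```

The returned callable can be evaluated the same way the `policy_fn` from `load_policy.py` is used in `run_expert.py`, i.e. inside the same `with tf.Session():` block used for training.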
16 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
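            # roll out one episode under the expert policy, recording (observation, action) pairs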
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /hw1/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | #import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | # ================================================================ 10 | # Import all names into common namespace 11 | # ================================================================ 12 | 13 | clip = tf.clip_by_value 14 | 15 | # Make consistent with numpy 16 | # ---------------------------------------- 17 | 18 | def sum(x, axis=None, keepdims=False): 19 | return tf.reduce_sum(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 20 | def mean(x, axis=None, keepdims=False): 21 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 22 | def var(x, axis=None, keepdims=False): 23 | meanx = mean(x, axis=axis, keepdims=keepdims) 24 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 25 | def std(x, axis=None, keepdims=False): 26 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 27 | def max(x, axis=None, keepdims=False): 28 | return tf.reduce_max(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 29 | def min(x, axis=None, keepdims=False): 30 | return tf.reduce_min(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 31 | def concatenate(arrs, axis=0): 32 | return tf.concat(axis, arrs) 33 | def argmax(x, axis=None): 34 | return tf.argmax(x, dimension=axis) 35 | 36 | def switch(condition, then_expression, else_expression): 37 | '''Switches between two operations depending on a scalar value (int or bool). 38 | Note that both `then_expression` and `else_expression` 39 | should be symbolic tensors of the *same shape*. 40 | 41 | # Arguments 42 | condition: scalar tensor. 43 | then_expression: TensorFlow operation. 44 | else_expression: TensorFlow operation. 
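    # Example (mirrors the use in `dropout` and `batchnorm` below)
        y = switch(phase, mask * x, pkeep * x)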
45 | ''' 46 | x_shape = copy.copy(then_expression.get_shape()) 47 | x = tf.cond(tf.cast(condition, 'bool'), 48 | lambda: then_expression, 49 | lambda: else_expression) 50 | x.set_shape(x_shape) 51 | return x 52 | 53 | # Extras 54 | # ---------------------------------------- 55 | def l2loss(params): 56 | if len(params) == 0: 57 | return tf.constant(0.0) 58 | else: 59 | return tf.add_n([sum(tf.square(p)) for p in params]) 60 | def lrelu(x, leak=0.2): 61 | f1 = 0.5 * (1 + leak) 62 | f2 = 0.5 * (1 - leak) 63 | return f1 * x + f2 * abs(x) 64 | def categorical_sample_logits(X): 65 | # https://github.com/tensorflow/tensorflow/issues/456 66 | U = tf.random_uniform(tf.shape(X)) 67 | return argmax(X - tf.log(-tf.log(U)), axis=1) 68 | 69 | # ================================================================ 70 | # Global session 71 | # ================================================================ 72 | 73 | def get_session(): 74 | return tf.get_default_session() 75 | 76 | def single_threaded_session(): 77 | tf_config = tf.ConfigProto( 78 | inter_op_parallelism_threads=1, 79 | intra_op_parallelism_threads=1) 80 | return tf.Session(config=tf_config) 81 | 82 | def make_session(num_cpu): 83 | tf_config = tf.ConfigProto( 84 | inter_op_parallelism_threads=num_cpu, 85 | intra_op_parallelism_threads=num_cpu) 86 | return tf.Session(config=tf_config) 87 | 88 | 89 | ALREADY_INITIALIZED = set() 90 | def initialize(): 91 | new_variables = set(tf.all_variables()) - ALREADY_INITIALIZED 92 | get_session().run(tf.initialize_variables(new_variables)) 93 | ALREADY_INITIALIZED.update(new_variables) 94 | 95 | 96 | def eval(expr, feed_dict=None): 97 | if feed_dict is None: feed_dict = {} 98 | return get_session().run(expr, feed_dict=feed_dict) 99 | 100 | def set_value(v, val): 101 | get_session().run(v.assign(val)) 102 | 103 | def load_state(fname): 104 | saver = tf.train.Saver() 105 | saver.restore(get_session(), fname) 106 | 107 | def save_state(fname): 108 | os.makedirs(os.path.dirname(fname), exist_ok=True) 109 | saver = tf.train.Saver() 110 | saver.save(get_session(), fname) 111 | 112 | # ================================================================ 113 | # Model components 114 | # ================================================================ 115 | 116 | 117 | def normc_initializer(std=1.0): 118 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 119 | out = np.random.randn(*shape).astype(np.float32) 120 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 121 | return tf.constant(out) 122 | return _initializer 123 | 124 | 125 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 126 | summary_tag=None): 127 | with tf.variable_scope(name): 128 | stride_shape = [1, stride[0], stride[1], 1] 129 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 130 | 131 | # there are "num input feature maps * filter height * filter width" 132 | # inputs to each hidden unit 133 | fan_in = intprod(filter_shape[:3]) 134 | # each unit in the lower layer receives a gradient from: 135 | # "num output feature maps * filter height * filter width" / 136 | # pooling size 137 | fan_out = intprod(filter_shape[:2]) * num_filters 138 | # initialize weights with random weights 139 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 140 | 141 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 142 | collections=collections) 143 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer, 144 | collections=collections) 145 | 146 | if summary_tag is not None: 147 | tf.image_summary(summary_tag, 148 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 149 | [2, 0, 1, 3]), 150 | max_images=10) 151 | 152 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 153 | 154 | 155 | def dense(x, size, name, weight_init=None, bias=True): 156 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 157 | ret = tf.matmul(x, w) 158 | if bias: 159 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer) 160 | return ret + b 161 | else: 162 | return ret 163 | 164 | def wndense(x, size, name, init_scale=1.0): 165 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 166 | initializer=tf.random_normal_initializer(0, 0.05)) 167 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 168 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 169 | 170 | # use weight normalization (Salimans & Kingma, 2016) 171 | x = tf.matmul(x, v) 172 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 173 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 174 | 175 | def densenobias(x, size, name, weight_init=None): 176 | return dense(x, size, name, weight_init=weight_init, bias=False) 177 | 178 | def dropout(x, pkeep, phase=None, mask=None): 179 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 180 | if phase is None: 181 | return mask * x 182 | else: 183 | return switch(phase, mask*x, pkeep*x) 184 | 185 | def batchnorm(x, name, phase, updates, gamma=0.96): 186 | k = x.get_shape()[1] 187 | runningmean = tf.get_variable(name+"/mean", shape=[1, k], initializer=tf.constant_initializer(0.0), trainable=False) 188 | runningvar = tf.get_variable(name+"/var", shape=[1, k], initializer=tf.constant_initializer(1e-4), trainable=False) 189 | testy = (x - runningmean) / tf.sqrt(runningvar) 190 | 191 | mean_ = mean(x, axis=0, keepdims=True) 192 | var_ = mean(tf.square(x), axis=0, keepdims=True) 193 | std = tf.sqrt(var_) 194 | trainy = (x - mean_) / std 195 | 196 | updates.extend([ 197 | tf.assign(runningmean, runningmean * gamma + mean_ * (1 - gamma)), 198 | tf.assign(runningvar, runningvar * gamma + var_ * (1 - gamma)) 199 | ]) 200 | 201 | y = switch(phase, trainy, testy) 202 | 203 | out = y * tf.get_variable(name+"/scaling", shape=[1, k], initializer=tf.constant_initializer(1.0), trainable=True)\ 204 | + tf.get_variable(name+"/translation", shape=[1,k], initializer=tf.constant_initializer(0.0), trainable=True) 205 | return out 206 | 207 | 208 | 209 | # ================================================================ 210 | # Basic Stuff 211 | # ================================================================ 212 | 213 | def function(inputs, outputs, updates=None, givens=None): 214 | if isinstance(outputs, list): 215 | return _Function(inputs, outputs, updates, givens=givens) 216 | elif isinstance(outputs, (dict, collections.OrderedDict)): 217 | f = _Function(inputs, outputs.values(), updates, givens=givens) 218 | return lambda *inputs : type(outputs)(zip(outputs.keys(), f(*inputs))) 219 | else: 220 | f = _Function(inputs, [outputs], updates, givens=givens) 221 | return 
lambda *inputs : f(*inputs)[0] 222 | 223 | class _Function(object): 224 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 225 | assert all(len(i.op.inputs)==0 for i in inputs), "inputs should all be placeholders" 226 | self.inputs = inputs 227 | updates = updates or [] 228 | self.update_group = tf.group(*updates) 229 | self.outputs_update = list(outputs) + [self.update_group] 230 | self.givens = {} if givens is None else givens 231 | self.check_nan = check_nan 232 | def __call__(self, *inputvals): 233 | assert len(inputvals) == len(self.inputs) 234 | feed_dict = dict(zip(self.inputs, inputvals)) 235 | feed_dict.update(self.givens) 236 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 237 | if self.check_nan: 238 | if any(np.isnan(r).any() for r in results): 239 | raise RuntimeError("Nan detected") 240 | return results 241 | 242 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 243 | if isinstance(outputs, list): 244 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 245 | else: 246 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 247 | return lambda *inputs : f(*inputs)[0] 248 | 249 | class _MemFriendlyFunction(object): 250 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 251 | self.nondata_inputs = nondata_inputs 252 | self.data_inputs = data_inputs 253 | self.outputs = list(outputs) 254 | self.batch_size = batch_size 255 | def __call__(self, *inputvals): 256 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 257 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 258 | data_vals = inputvals[len(self.nondata_inputs):] 259 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 260 | n = data_vals[0].shape[0] 261 | for v in data_vals[1:]: 262 | assert v.shape[0] == n 263 | for i_start in range(0, n, self.batch_size): 264 | slice_vals = [v[i_start:min(i_start+self.batch_size, n)] for v in data_vals] 265 | for (var,val) in zip(self.data_inputs, slice_vals): 266 | feed_dict[var]=val 267 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 268 | if i_start==0: 269 | sum_results = results 270 | else: 271 | for i in range(len(results)): 272 | sum_results[i] = sum_results[i] + results[i] 273 | for i in range(len(results)): 274 | sum_results[i] = sum_results[i] / n 275 | return sum_results 276 | 277 | # ================================================================ 278 | # Modules 279 | # ================================================================ 280 | 281 | class Module(object): 282 | def __init__(self, name): 283 | self.name = name 284 | self.first_time = True 285 | self.scope = None 286 | self.cache = {} 287 | def __call__(self, *args): 288 | if args in self.cache: 289 | print("(%s) retrieving value from cache"%self.name) 290 | return self.cache[args] 291 | with tf.variable_scope(self.name, reuse=not self.first_time): 292 | scope = tf.get_variable_scope().name 293 | if self.first_time: 294 | self.scope = scope 295 | print("(%s) running function for the first time"%self.name) 296 | else: 297 | assert self.scope == scope, "Tried calling function with a different scope" 298 | print("(%s) running function on new inputs"%self.name) 299 | self.first_time = False 300 | out = self._call(*args) 301 | self.cache[args] = out 302 | return out 303 | def _call(self, *args): 304 | raise NotImplementedError 305 | 306 | @property 307 | def trainable_variables(self): 308 | assert self.scope is 
not None, "need to call module once before getting variables" 309 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 310 | 311 | @property 312 | def variables(self): 313 | assert self.scope is not None, "need to call module once before getting variables" 314 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 315 | 316 | 317 | def module(name): 318 | @functools.wraps 319 | def wrapper(f): 320 | class WrapperModule(Module): 321 | def _call(self, *args): 322 | return f(*args) 323 | return WrapperModule(name) 324 | return wrapper 325 | 326 | # ================================================================ 327 | # Graph traversal 328 | # ================================================================ 329 | 330 | VARIABLES = {} 331 | 332 | 333 | def get_parents(node): 334 | return node.op.inputs 335 | 336 | def topsorted(outputs): 337 | """ 338 | Topological sort via non-recursive depth-first search 339 | """ 340 | assert isinstance(outputs, (list,tuple)) 341 | marks = {} 342 | out = [] 343 | stack = [] #pylint: disable=W0621 344 | # i: node 345 | # jidx = number of children visited so far from that node 346 | # marks: state of each node, which is one of 347 | # 0: haven't visited 348 | # 1: have visited, but not done visiting children 349 | # 2: done visiting children 350 | for x in outputs: 351 | stack.append((x,0)) 352 | while stack: 353 | (i,jidx) = stack.pop() 354 | if jidx == 0: 355 | m = marks.get(i,0) 356 | if m == 0: 357 | marks[i] = 1 358 | elif m == 1: 359 | raise ValueError("not a dag") 360 | else: 361 | continue 362 | ps = get_parents(i) 363 | if jidx == len(ps): 364 | marks[i] = 2 365 | out.append(i) 366 | else: 367 | stack.append((i,jidx+1)) 368 | j = ps[jidx] 369 | stack.append((j,0)) 370 | return out 371 | 372 | 373 | # ================================================================ 374 | # Flat vectors 375 | # ================================================================ 376 | 377 | def var_shape(x): 378 | out = [k.value for k in x.get_shape()] 379 | assert all(isinstance(a, int) for a in out), \ 380 | "shape function assumes that shape is fully known" 381 | return out 382 | 383 | def numel(x): 384 | return intprod(var_shape(x)) 385 | 386 | def intprod(x): 387 | return int(np.prod(x)) 388 | 389 | def flatgrad(loss, var_list): 390 | grads = tf.gradients(loss, var_list) 391 | return tf.concat(0, [tf.reshape(grad, [numel(v)]) 392 | for (v, grad) in zip(var_list, grads)]) 393 | 394 | class SetFromFlat(object): 395 | def __init__(self, var_list, dtype=tf.float32): 396 | assigns = [] 397 | shapes = list(map(var_shape, var_list)) 398 | total_size = np.sum([intprod(shape) for shape in shapes]) 399 | 400 | self.theta = theta = tf.placeholder(dtype,[total_size]) 401 | start=0 402 | assigns = [] 403 | for (shape,v) in zip(shapes,var_list): 404 | size = intprod(shape) 405 | assigns.append(tf.assign(v, tf.reshape(theta[start:start+size],shape))) 406 | start+=size 407 | self.op = tf.group(*assigns) 408 | def __call__(self, theta): 409 | get_session().run(self.op, feed_dict={self.theta:theta}) 410 | 411 | class GetFlat(object): 412 | def __init__(self, var_list): 413 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 414 | def __call__(self): 415 | return get_session().run(self.op) 416 | 417 | # ================================================================ 418 | # Misc 419 | # ================================================================ 420 | 421 | 422 | def fancy_slice_2d(X, inds0, inds1): 423 | """ 424 | like numpy 
X[inds0, inds1] 425 | XXX this implementation is bad 426 | """ 427 | inds0 = tf.cast(inds0, tf.int64) 428 | inds1 = tf.cast(inds1, tf.int64) 429 | shape = tf.cast(tf.shape(X), tf.int64) 430 | ncols = shape[1] 431 | Xflat = tf.reshape(X, [-1]) 432 | return tf.gather(Xflat, inds0 * ncols + inds1) 433 | 434 | 435 | def scope_vars(scope, trainable_only): 436 | """ 437 | Get variables inside a scope 438 | The scope can be specified as a string 439 | """ 440 | return tf.get_collection( 441 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 442 | scope=scope if isinstance(scope, str) else scope.name 443 | ) 444 | 445 | def lengths_to_mask(lengths_b, max_length): 446 | """ 447 | Turns a vector of lengths into a boolean mask 448 | 449 | Args: 450 | lengths_b: an integer vector of lengths 451 | max_length: maximum length to fill the mask 452 | 453 | Returns: 454 | a boolean array of shape (batch_size, max_length) 455 | row[i] consists of True repeated lengths_b[i] times, followed by False 456 | """ 457 | lengths_b = tf.convert_to_tensor(lengths_b) 458 | assert lengths_b.get_shape().ndims == 1 459 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 460 | return mask_bt 461 | 462 | 463 | def in_session(f): 464 | @functools.wraps(f) 465 | def newfunc(*args, **kwargs): 466 | with tf.Session(): 467 | f(*args, **kwargs) 468 | return newfunc 469 | 470 | 471 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 472 | def get_placeholder(name, dtype, shape): 473 | print("calling get_placeholder", name) 474 | if name in _PLACEHOLDER_CACHE: 475 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 476 | assert dtype1==dtype and shape1==shape 477 | return out 478 | else: 479 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 480 | _PLACEHOLDER_CACHE[name] = (out,dtype,shape) 481 | return out 482 | def get_placeholder_cached(name): 483 | return _PLACEHOLDER_CACHE[name][0] 484 | 485 | def flattenallbut0(x): 486 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 487 | 488 | def reset(): 489 | global _PLACEHOLDER_CACHE 490 | global VARIABLES 491 | _PLACEHOLDER_CACHE = {} 492 | VARIABLES = {} 493 | tf.reset_default_graph() 494 | -------------------------------------------------------------------------------- /hw2/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 
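      For example, in the 4x4 FrozenLake with is_slippery=True (frozen_lake.py below),
      P[0][1] == [(0.1, 0, 0.0, False), (0.8, 4, 0.0, False), (0.1, 1, 0.0, False)]:
      the intended move succeeds with probability 0.8 and each perpendicular slip has probability 0.1.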
27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /hw2/discrete_env.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw2/discrete_env.pyc -------------------------------------------------------------------------------- /hw2/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | 5 | from gym import utils 6 | import discrete_env 7 | 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | MAPS = { 14 | "4x4": [ 15 | "SFFF", 16 | "FHFH", 17 | "FFFH", 18 | "HFFG" 19 | ], 20 | "8x8": [ 21 | "SFFFFFFF", 22 | "FFFFFFFF", 23 | "FFFHFFFF", 24 | "FFFFFHFF", 25 | "FFFHFFFF", 26 | "FHHFFFHF", 27 | "FHFFHFHF", 28 | "FFFHFFFG" 29 | ], 30 | } 31 | 32 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 33 | """ 34 | Winter is here. You and your friends were tossing around a frisbee at the park 35 | when you made a wild throw that left the frisbee out in the middle of the lake. 36 | The water is mostly frozen, but there are a few holes where the ice has melted. 37 | If you step into one of those holes, you'll fall into the freezing water. 38 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 39 | you navigate across the lake and retrieve the disc. 40 | However, the ice is slippery, so you won't always move in the direction you intend. 41 | The surface is described using a grid like the following 42 | 43 | SFFF 44 | FHFH 45 | FFFH 46 | HFFG 47 | 48 | S : starting point, safe 49 | F : frozen surface, safe 50 | H : hole, fall to your doom 51 | G : goal, where the frisbee is located 52 | 53 | The episode ends when you reach the goal or fall in a hole. 54 | You receive a reward of 1 if you reach the goal, and zero otherwise. 
55 | 56 | """ 57 | 58 | metadata = {'render.modes': ['human', 'ansi']} 59 | 60 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 61 | if desc is None and map_name is None: 62 | raise ValueError('Must provide either desc or map_name') 63 | elif desc is None: 64 | desc = MAPS[map_name] 65 | self.desc = desc = np.asarray(desc,dtype='c') 66 | self.nrow, self.ncol = nrow, ncol = desc.shape 67 | 68 | nA = 4 69 | nS = nrow * ncol 70 | 71 | isd = np.array(desc == b'S').astype('float64').ravel() 72 | isd /= isd.sum() 73 | 74 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 75 | 76 | def to_s(row, col): 77 | return row*ncol + col 78 | def inc(row, col, a): 79 | if a==0: # left 80 | col = max(col-1,0) 81 | elif a==1: # down 82 | row = min(row+1,nrow-1) 83 | elif a==2: # right 84 | col = min(col+1,ncol-1) 85 | elif a==3: # up 86 | row = max(row-1,0) 87 | return (row, col) 88 | 89 | for row in range(nrow): 90 | for col in range(ncol): 91 | s = to_s(row, col) 92 | for a in range(4): 93 | li = P[s][a] 94 | letter = desc[row, col] 95 | if letter in b'GH': 96 | li.append((1.0, s, 0, True)) 97 | else: 98 | if is_slippery: 99 | for b in [(a-1)%4, a, (a+1)%4]: 100 | newrow, newcol = inc(row, col, b) 101 | newstate = to_s(newrow, newcol) 102 | newletter = desc[newrow, newcol] 103 | done = bytes(newletter) in b'GH' 104 | rew = float(newletter == b'G') 105 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 106 | else: 107 | newrow, newcol = inc(row, col, a) 108 | newstate = to_s(newrow, newcol) 109 | newletter = desc[newrow, newcol] 110 | done = bytes(newletter) in b'GH' 111 | rew = float(newletter == b'G') 112 | li.append((1.0, newstate, rew, done)) 113 | 114 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 115 | 116 | def _render(self, mode='human', close=False): 117 | if close: 118 | return 119 | outfile = StringIO() if mode == 'ansi' else sys.stdout 120 | 121 | row, col = self.s // self.ncol, self.s % self.ncol 122 | desc = self.desc.tolist() 123 | desc = [[c.decode('utf-8') for c in line] for line in desc] 124 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 125 | if self.lastaction is not None: 126 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 127 | else: 128 | outfile.write("\n") 129 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 130 | 131 | return outfile 132 | -------------------------------------------------------------------------------- /hw2/frozen_lake.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw2/frozen_lake.pyc -------------------------------------------------------------------------------- /hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 
11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | 11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 12 | 13 | def learn(env, 14 | q_func, 15 | optimizer_spec, 16 | session, 17 | exploration=LinearSchedule(1000000, 0.1), 18 | stopping_criterion=None, 19 | replay_buffer_size=1000000, 20 | batch_size=32, 21 | gamma=0.99, 22 | learning_starts=50000, 23 | learning_freq=4, 24 | frame_history_len=4, 25 | target_update_freq=10000, 26 | grad_norm_clipping=10): 27 | """Run Deep Q-learning algorithm. 28 | 29 | You can specify your own convnet using q_func. 30 | 31 | All schedules are w.r.t. total number of steps taken in the environment. 32 | 33 | Parameters 34 | ---------- 35 | env: gym.Env 36 | gym environment to train on. 37 | q_func: function 38 | Model to use for computing the q function. It should accept the 39 | following named arguments: 40 | img_in: tf.Tensor 41 | tensorflow tensor representing the input image 42 | num_actions: int 43 | number of actions 44 | scope: str 45 | scope in which all the model related variables 46 | should be created 47 | reuse: bool 48 | whether previously created variables should be reused. 49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 
54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | # YOUR CODE HERE 131 | 132 | ###### 133 | 134 | # construct optimization op (with gradient clipping) 135 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 136 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 137 | train_fn = minimize_and_clip(optimizer, total_error, 138 | var_list=q_func_vars, clip_val=grad_norm_clipping) 139 | 140 | # update_target_fn will be called periodically to copy Q network to target Q network 141 | update_target_fn = [] 142 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 143 | sorted(target_q_func_vars, key=lambda v: v.name)): 144 | update_target_fn.append(var_target.assign(var)) 145 | update_target_fn = tf.group(*update_target_fn) 146 | 147 | # construct the replay buffer 148 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 149 | 150 | ############### 151 | # RUN ENV # 152 | ############### 153 | model_initialized = False 154 | num_param_updates = 0 155 | mean_episode_reward = -float('nan') 156 | best_mean_episode_reward = -float('inf') 157 | last_obs = env.reset() 158 | LOG_EVERY_N_STEPS = 10000 159 | 160 | for t in itertools.count(): 161 | ### 1. Check stopping criterion 162 | if stopping_criterion is not None and stopping_criterion(env, t): 163 | break 164 | 165 | ### 2. Step the env and store the transition 166 | # At this point, "last_obs" contains the latest observation that was 167 | # recorded from the simulator. Here, your code needs to store this 168 | # observation and its outcome (reward, next observation, etc.) into 169 | # the replay buffer while stepping the simulator forward one step. 170 | # At the end of this block of code, the simulator should have been 171 | # advanced one step, and the replay buffer should contain one more 172 | # transition. 173 | # Specifically, last_obs must point to the new latest observation. 174 | # Useful functions you'll need to call: 175 | # obs, reward, done, info = env.step(action) 176 | # this steps the environment forward one step 177 | # obs = env.reset() 178 | # this resets the environment if you reached an episode boundary. 179 | # Don't forget to call env.reset() to get a new observation if done 180 | # is true!! 181 | # Note that you cannot use "last_obs" directly as input 182 | # into your network, since it needs to be processed to include context 183 | # from previous frames. You should check out the replay buffer 184 | # implementation in dqn_utils.py to see what functionality the replay 185 | # buffer exposes. The replay buffer has a function called 186 | # encode_recent_observation that will take the latest observation 187 | # that you pushed into the buffer and compute the corresponding 188 | # input that should be given to a Q network by appending some 189 | # previous frames. 190 | # Don't forget to include epsilon greedy exploration! 191 | # And remember that the first time you enter this loop, the model 192 | # may not yet have been initialized (but of course, the first step 193 | # might as well be random, since you haven't trained your net...) 
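        # A possible sketch of this step (illustration only, not part of the starter code).
        # It assumes a Q-value tensor `q_t` of shape [batch, num_actions] was built from
        # q_func(obs_t_float, num_actions, scope="q_func", reuse=False) in the model section
        # above, and that ReplayBuffer.store_effect(idx, action, reward, done) records the
        # outcome for the frame stored at `idx` (its definition is not shown in this excerpt).
        #
        #   idx = replay_buffer.store_frame(last_obs)
        #   if not model_initialized or random.random() < exploration.value(t):
        #       action = env.action_space.sample()        # epsilon-greedy / untrained net
        #   else:
        #       recent = replay_buffer.encode_recent_observation()
        #       q_values = session.run(q_t, {obs_t_ph: recent[None]})
        #       action = np.argmax(q_values)
        #   last_obs, reward, done, info = env.step(action)
        #   replay_buffer.store_effect(idx, action, reward, done)
        #   if done:
        #       last_obs = env.reset()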
194 | 195 | ##### 196 | 197 | # YOUR CODE HERE 198 | 199 | ##### 200 | 201 | # at this point, the environment should have been advanced one step (and 202 | # reset if done was true), and last_obs should point to the new latest 203 | # observation 204 | 205 | ### 3. Perform experience replay and train the network. 206 | # note that this is only done if the replay buffer contains enough samples 207 | # for us to learn something useful -- until then, the model will not be 208 | # initialized and random actions should be taken 209 | if (t > learning_starts and 210 | t % learning_freq == 0 and 211 | replay_buffer.can_sample(batch_size)): 212 | # Here, you should perform training. Training consists of four steps: 213 | # 3.a: use the replay buffer to sample a batch of transitions (see the 214 | # replay buffer code for function definition, each batch that you sample 215 | # should consist of current observations, current actions, rewards, 216 | # next observations, and done indicator). 217 | # 3.b: initialize the model if it has not been initialized yet; to do 218 | # that, call 219 | # initialize_interdependent_variables(session, tf.global_variables(), { 220 | # obs_t_ph: obs_t_batch, 221 | # obs_tp1_ph: obs_tp1_batch, 222 | # }) 223 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 224 | # the current and next time step. The boolean variable model_initialized 225 | # indicates whether or not the model has been initialized. 226 | # Remember that you have to update the target network too (see 3.d)! 227 | # 3.c: train the model. To do this, you'll need to use the train_fn and 228 | # total_error ops that were created earlier: total_error is what you 229 | # created to compute the total Bellman error in a batch, and train_fn 230 | # will actually perform a gradient step and update the network parameters 231 | # to reduce total_error. When calling session.run on these you'll need to 232 | # populate the following placeholders: 233 | # obs_t_ph 234 | # act_t_ph 235 | # rew_t_ph 236 | # obs_tp1_ph 237 | # done_mask_ph 238 | # (this is needed for computing total_error) 239 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 240 | # (this is needed by the optimizer to choose the learning rate) 241 | # 3.d: periodically update the target network by calling 242 | # session.run(update_target_fn) 243 | # you should update every target_update_freq steps, and you may find the 244 | # variable num_param_updates useful for this (it was initialized to 0) 245 | ##### 246 | 247 | # YOUR CODE HERE 248 | 249 | ##### 250 | 251 | ### 4. 
Log progress 252 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 253 | if len(episode_rewards) > 0: 254 | mean_episode_reward = np.mean(episode_rewards[-100:]) 255 | if len(episode_rewards) > 100: 256 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 257 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 258 | print("Timestep %d" % (t,)) 259 | print("mean reward (100 episodes) %f" % mean_episode_reward) 260 | print("best mean reward %f" % best_mean_episode_reward) 261 | print("episodes %d" % len(episode_rewards)) 262 | print("exploration %f" % exploration.value(t)) 263 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 264 | sys.stdout.flush() 265 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 
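        For example, PiecewiseSchedule([(0, 1.0), (1000000, 0.1)], outside_value=0.1)
        gives value(500000) == 0.55 (halfway between 1.0 and 0.1) and value(2000000) == 0.1.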
67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. 
This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(currentenv, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The specific memory optimizations used here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (it is most time-efficient 182 | to cast them back to float32 on the GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the typical Atari Deep RL use case of a buffer with 1M frames, the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning a frame of zeros at the beginning 190 | of the episode, when there are fewer frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of frames to be stacked for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 228 | 229 | The i-th sampled transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the episode 234 | was done, as indicated by `done_mask[i]`, which is equal 235 | to 1 if the episode ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample.
241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | 
(1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
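For reference, the `dqn.learn` call above hands the exploration schedule and replay-buffer settings to the student-implemented loop in `dqn.py`, which is mostly not shown in this repository. A rough sketch of how such a loop typically drives the `ReplayBuffer` from `dqn_utils.py`; names like `q_values`, `obs_t_ph`, and `model_initialized` are assumptions standing in for pieces that are not shown:

```python
# Hypothetical interaction loop; the keyword names mirror the dqn.learn arguments above.
replay_buffer = ReplayBuffer(size=replay_buffer_size, frame_history_len=frame_history_len)
last_obs = env.reset()
for t in range(int(num_timesteps)):
    idx = replay_buffer.store_frame(last_obs)
    recent_obs = replay_buffer.encode_recent_observation()  # frame-stacked input for the Q-network
    if not model_initialized or random.random() < exploration.value(t):
        action = env.action_space.sample()  # explore
    else:
        action = np.argmax(session.run(q_values, {obs_t_ph: recent_obs[None]}))  # exploit
    last_obs, reward, done, _ = env.step(action)
    replay_buffer.store_effect(idx, action, reward, done)
    if done:
        last_obs = env.reset()
    if (t > learning_starts and t % learning_freq == 0
            and replay_buffer.can_sample(batch_size)):
        obs_b, act_b, rew_b, next_obs_b, done_b = replay_buffer.sample(batch_size)
        # ...feed the batch to the Bellman-error / Huber-loss training op...
```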
128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | 
env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/homework.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | 3 | In `main.py` you will find an implementation of a "vanilla" policy gradient method, applied to an MDP with a discrete action space: an episodic version of the classic "cartpole" task. First, make sure the provided code works on your computer by running `python main.py`. We recommend reading through all of the code and comments in the function `main_cartpole`, starting at the top of the function. 4 | 5 | The code computes some useful diagnostics, which you may find helpful to look at while tuning hyperparameters: 6 | 7 | - **KL[policy before update || policy after update]**. Large spikes in KL divergence mean that the optimization took a large step, and sometimes these spikes cause a collapse in performance. 8 | - **Entropy of the policy**. If entropy goes down too fast, then you may not explore enough, but if it goes down too slowly, you'll probably not reach optimal performance. 9 | - **Explained variance of the value function**. If the value function perfectly explains the returns, then it will be 1; if you get a negative result, then it's worse than predicting a constant. 10 | 11 | Software dependencies: 12 | 13 | - tensorflow 14 | - numpy + scipy (Anaconda recommended) 15 | - gym (I'm using 0.8.0, `pip install gym==0.8.0`, but older versions should work just as well) 16 | 17 | ## Problem 1 18 | 19 | Here you will modify the `main_cartpole` policy gradient implementation to work on a continuous action space, specifically, the gym environment `Pendulum-v0`. Note that in `main_cartpole`, the neural network outputs "logits" (i.e., log-probabilities up to an additive constant) that specify a categorical distribution. For the pendulum task, on the other hand, your neural network should output the mean of a Gaussian distribution, with a separate parameter vector used to parameterize the log standard deviation. For example, you could use the following code (one possible completion of the missing lines is sketched below): 20 | 21 | ```python 22 | 23 | mean_na = dense(h2, ac_dim, weight_init=normc_initializer(0.1)) # Mean control output 24 | logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer()) # Log standard deviation 25 | 26 | sy_sampled_ac = YOUR_CODE_HERE 27 | sy_logprob_n = YOUR_CODE_HERE 28 | 29 | ``` 30 | 31 | You should also compute the differential entropy (replacing `sy_ent`) and the KL divergence (replacing `sy_kl`) for the Gaussian distribution. 32 | 33 | The pendulum problem is slightly harder, and using a fixed stepsize does not work reliably---thus, we instead recommend using an adaptive stepsize, where you adjust it based on the KL divergence between the new and old policy. Code for this stepsize adaptation is provided. 34 | 35 | You can plot your results using the script `plot_learning_curves.py` or your own plotting code. 36 | 37 | **Deliverables** 38 | 39 | - Show a plot with the pendulum converging to an EpRewMean of at least `-300`. Include EpRewMean, KL, and Entropy in your plots.
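As referenced in Problem 1 above, here is a minimal sketch of one way the missing pieces could be filled in. It reuses the `dense`, `lrelu`, and `normc_initializer` helpers from `main.py`; the placeholder names `sy_ac_na`, `sy_oldmean_na`, and `sy_oldlogstd_a` are assumptions introduced for this sketch (the latter two only feed the KL diagnostic). It is an illustration under those assumptions, not the reference solution:

```python
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)
sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], name="oldmean", dtype=tf.float32)
sy_oldlogstd_a = tf.placeholder(shape=[ac_dim], name="oldlogstd", dtype=tf.float32)

sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0)))
sy_h2 = lrelu(dense(sy_h1, 32, "h2", weight_init=normc_initializer(1.0)))
mean_na = dense(sy_h2, ac_dim, "mean", weight_init=normc_initializer(0.1))
logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer())
std_a = tf.exp(logstd_a)

# Sample via the reparameterization a = mean + std * z with z ~ N(0, I);
# indexing [0] yields a single action vector when one observation is fed in.
sy_sampled_ac = (mean_na + std_a * tf.random_normal(tf.shape(mean_na)))[0]

# Log-density of a diagonal Gaussian, summed over action dimensions.
sy_z_na = (sy_ac_na - mean_na) / std_a
sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square(sy_z_na), axis=1)
                - tf.reduce_sum(logstd_a)
                - 0.5 * ac_dim * np.log(2.0 * np.pi))

# Differential entropy of the diagonal Gaussian (state-independent here,
# since logstd does not depend on the observation).
sy_ent = tf.reduce_sum(logstd_a + 0.5 * np.log(2.0 * np.pi * np.e))

# KL(old || new) between diagonal Gaussians, averaged over the batch.
sy_oldstd_a = tf.exp(sy_oldlogstd_a)
sy_kl = tf.reduce_mean(tf.reduce_sum(
    logstd_a - sy_oldlogstd_a
    + (tf.square(sy_oldstd_a) + tf.square(sy_oldmean_na - mean_na)) / (2.0 * tf.square(std_a))
    - 0.5, axis=1))
```

The surrogate loss and Adam update from `main_cartpole` can then be reused unchanged, with the pre-update mean and log-std saved each iteration and fed into the KL diagnostic.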
40 | - Describe the hyperparameters used and how many timesteps your algorithm took to learn. 41 | 42 | ## Problem 2 43 | 44 | 1. Implement a neural network value function with the same interface as `LinearVF`. Add it to the provided cartpole solver, and compare the performance of the linear and neural network value function (i.e., baseline). 45 | 2. Perform the same comparison--linear vs neural network--for your pendulum solver from Problem 1. You should be able to obtain faster learning using the neural network. 46 | 47 | 48 | **Deliverables** 49 | 50 | - A comparison of linear vs neural network value function on the cartpole. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 51 | - A comparison of linear vs neural network value function on the pendulum. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 52 | 53 | In both cases, list the hyperparameters used for neural network training. 54 | 55 | ## Problem 3 (bonus) 56 | 57 | Implement a more advanced policy gradient method from lecture (such as TRPO, or the advantage function estimator used in A3C or generalized advantage estimation), and apply it to the gym environment `Hopper-v1`. See if you can learn a good gait in less than 500,000 timesteps. 58 | Hint: it may help to standardize your inputs using a running estimate of mean and standard deviation. 59 | 60 | ob_rescaled = (ob_raw - mean) / (stdev + epsilon) 61 | 62 | **Deliverables** 63 | 64 | A description of what you implemented, and learning curves on the Hopper-v1 environment. -------------------------------------------------------------------------------- /hw4/logz.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | Assumes that each diagnostic gets logged each iteration 5 | 6 | Call logz.configure_output_dir() to start logging to a 7 | tab-separated-values file (some_folder_name/log.txt) 8 | 9 | To load the learning curves, you can do, for example 10 | 11 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 12 | A['EpRewMean'] 13 | 14 | """ 15 | 16 | import os.path as osp, shutil, time, atexit, os, subprocess 17 | 18 | color2num = dict( 19 | gray=30, 20 | red=31, 21 | green=32, 22 | yellow=33, 23 | blue=34, 24 | magenta=35, 25 | cyan=36, 26 | white=37, 27 | crimson=38 28 | ) 29 | 30 | def colorize(string, color, bold=False, highlight=False): 31 | attr = [] 32 | num = color2num[color] 33 | if highlight: num += 10 34 | attr.append(str(num)) 35 | if bold: attr.append('1') 36 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 37 | 38 | class G: 39 | output_dir = None 40 | output_file = None 41 | first_row = True 42 | log_headers = [] 43 | log_current_row = {} 44 | 45 | def configure_output_dir(d=None): 46 | """ 47 | Set output directory to d, or to /tmp/somerandomnumber if d is None 48 | """ 49 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 50 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 51 | os.makedirs(G.output_dir) 52 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 53 | atexit.register(G.output_file.close) 54 | try: 55 | cmd = "cd %s && git diff > %s 2>/dev/null"%(osp.dirname(__file__), osp.join(G.output_dir, "a.diff")) 56 | subprocess.check_call(cmd, shell=True) # Save git diff to experiment directory 57 | except subprocess.CalledProcessError: 58 | print("configure_output_dir: not storing the git diff, probably because you're not in a git repo") 59 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 60 | 61 | def log_tabular(key, val): 62 | """ 63 | Log a value of some diagnostic 64 | Call this once for each diagnostic quantity, each iteration 65 | """ 66 | if G.first_row: 67 | G.log_headers.append(key) 68 | else: 69 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 70 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 71 | G.log_current_row[key] = val 72 | 73 | def dump_tabular(): 74 | """ 75 | Write all of the diagnostics from the current iteration 76 | """ 77 | vals = [] 78 | print("-"*37) 79 | for key in G.log_headers: 80 | val = G.log_current_row.get(key, "") 81 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 82 | else: valstr = val 83 | print("| %15s | %15s |"%(key, valstr)) 84 | vals.append(val) 85 | print("-"*37) 86 | if G.output_file is not None: 87 | if G.first_row: 88 | G.output_file.write("\t".join(G.log_headers)) 89 | G.output_file.write("\n") 90 | G.output_file.write("\t".join(map(str,vals))) 91 | G.output_file.write("\n") 92 | G.output_file.flush() 93 | G.log_current_row.clear() 94 | G.first_row=False -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import logz 5 | import scipy.signal 6 | 7 | def normc_initializer(std=1.0): 8 | """ 9 | Initialize array with normalized columns 10 | """ 11 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 12 | out = np.random.randn(*shape).astype(np.float32) 13 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 14 | return tf.constant(out) 15 | return _initializer 16 | 17 | 18 | def dense(x, size, name, weight_init=None): 19 | """ 20 | Dense (fully connected) layer 21 | """ 22 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 23 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 24 | return tf.matmul(x, w) + b 25 | 26 | def fancy_slice_2d(X, inds0, inds1): 27 | """ 28 | Like numpy's X[inds0, inds1] 29 | """ 30 | inds0 = tf.cast(inds0, tf.int64) 31 | inds1 = tf.cast(inds1, tf.int64) 32 | shape = tf.cast(tf.shape(X), tf.int64) 33 | ncols = shape[1] 34 | Xflat = tf.reshape(X, [-1]) 35 | return tf.gather(Xflat, inds0 * ncols + inds1) 36 | 37 | def discount(x, gamma): 38 | """ 39 | Compute discounted sum of future values 40 | out[i] = in[i] + gamma * in[i+1] + gamma^2 * in[i+2] + ... 41 | """ 42 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 43 | 44 | def explained_variance_1d(ypred,y): 45 | """ 46 | Var[ypred - y] / var[y]. 
47 | https://www.quora.com/What-is-the-meaning-proportion-of-variance-explained-in-linear-regression 48 | """ 49 | assert y.ndim == 1 and ypred.ndim == 1 50 | vary = np.var(y) 51 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 52 | 53 | def categorical_sample_logits(logits): 54 | """ 55 | Samples (symbolically) from categorical distribution, where logits is a NxK 56 | matrix specifying N categorical distributions with K categories 57 | 58 | specifically, exp(logits) / sum( exp(logits), axis=1 ) is the 59 | probabilities of the different classes 60 | 61 | Cleverly uses gumbell trick, based on 62 | https://github.com/tensorflow/tensorflow/issues/456 63 | """ 64 | U = tf.random_uniform(tf.shape(logits)) 65 | return tf.argmax(logits - tf.log(-tf.log(U)), dimension=1) 66 | 67 | def pathlength(path): 68 | return len(path["reward"]) 69 | 70 | class LinearValueFunction(object): 71 | coef = None 72 | def fit(self, X, y): 73 | Xp = self.preproc(X) 74 | A = Xp.T.dot(Xp) 75 | nfeats = Xp.shape[1] 76 | A[np.arange(nfeats), np.arange(nfeats)] += 1e-3 # a little ridge regression 77 | b = Xp.T.dot(y) 78 | self.coef = np.linalg.solve(A, b) 79 | def predict(self, X): 80 | if self.coef is None: 81 | return np.zeros(X.shape[0]) 82 | else: 83 | return self.preproc(X).dot(self.coef) 84 | def preproc(self, X): 85 | return np.concatenate([np.ones([X.shape[0], 1]), X, np.square(X)/2.0], axis=1) 86 | 87 | class NnValueFunction(object): 88 | pass # YOUR CODE HERE 89 | 90 | def lrelu(x, leak=0.2): 91 | f1 = 0.5 * (1 + leak) 92 | f2 = 0.5 * (1 - leak) 93 | return f1 * x + f2 * abs(x) 94 | 95 | 96 | 97 | def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): 98 | env = gym.make("CartPole-v0") 99 | ob_dim = env.observation_space.shape[0] 100 | num_actions = env.action_space.n 101 | logz.configure_output_dir(logdir) 102 | vf = LinearValueFunction() 103 | 104 | # Symbolic variables have the prefix sy_, to distinguish them from the numerical values 105 | # that are computed later in these function 106 | sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations 107 | sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation 108 | sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate 109 | sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer 110 | sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer 111 | # we use a small initialization for the last layer, so the initial policy has maximal entropy 112 | sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) 113 | sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions 114 | sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) 115 | sy_n = tf.shape(sy_ob_no)[0] 116 | sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation 117 | 118 | # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> 119 | sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) 120 | sy_oldp_na = 
tf.exp(sy_oldlogp_na) 121 | sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) 122 | sy_p_na = tf.exp(sy_logp_na) 123 | sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) 124 | # <<<<<<<<<<<<< 125 | 126 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 127 | 128 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 129 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 130 | 131 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 132 | # use single thread. on such a small problem, multithreading gives you a slowdown 133 | # this way, we can better use multiple cores for different experiments 134 | sess = tf.Session(config=tf_config) 135 | sess.__enter__() # equivalent to `with sess:` 136 | tf.global_variables_initializer().run() #pylint: disable=E1101 137 | 138 | total_timesteps = 0 139 | 140 | for i in range(n_iter): 141 | print("********** Iteration %i ************"%i) 142 | 143 | # Collect paths until we have enough timesteps 144 | timesteps_this_batch = 0 145 | paths = [] 146 | while True: 147 | ob = env.reset() 148 | terminated = False 149 | obs, acs, rewards = [], [], [] 150 | animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) 151 | while True: 152 | if animate_this_episode: 153 | env.render() 154 | obs.append(ob) 155 | ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) 156 | acs.append(ac) 157 | ob, rew, done, _ = env.step(ac) 158 | rewards.append(rew) 159 | if done: 160 | break 161 | path = {"observation" : np.array(obs), "terminated" : terminated, 162 | "reward" : np.array(rewards), "action" : np.array(acs)} 163 | paths.append(path) 164 | timesteps_this_batch += pathlength(path) 165 | if timesteps_this_batch > min_timesteps_per_batch: 166 | break 167 | total_timesteps += timesteps_this_batch 168 | # Estimate advantage function 169 | vtargs, vpreds, advs = [], [], [] 170 | for path in paths: 171 | rew_t = path["reward"] 172 | return_t = discount(rew_t, gamma) 173 | vpred_t = vf.predict(path["observation"]) 174 | adv_t = return_t - vpred_t 175 | advs.append(adv_t) 176 | vtargs.append(return_t) 177 | vpreds.append(vpred_t) 178 | 179 | # Build arrays for policy update 180 | ob_no = np.concatenate([path["observation"] for path in paths]) 181 | ac_n = np.concatenate([path["action"] for path in paths]) 182 | adv_n = np.concatenate(advs) 183 | standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) 184 | vtarg_n = np.concatenate(vtargs) 185 | vpred_n = np.concatenate(vpreds) 186 | vf.fit(ob_no, vtarg_n) 187 | 188 | # Policy update 189 | _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) 190 | kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) 191 | 192 | # Log diagnostics 193 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 194 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 195 | logz.log_tabular("KLOldNew", kl) 196 | logz.log_tabular("Entropy", ent) 197 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 198 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 199 | 
logz.log_tabular("TimestepsSoFar", total_timesteps) 200 | # If you're overfitting, EVAfter will be way larger than EVBefore. 201 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 202 | logz.dump_tabular() 203 | 204 | def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): 205 | tf.set_random_seed(seed) 206 | np.random.seed(seed) 207 | env = gym.make("Pendulum-v0") 208 | ob_dim = env.observation_space.shape[0] 209 | ac_dim = env.action_space.shape[0] 210 | logz.configure_output_dir(logdir) 211 | if vf_type == 'linear': 212 | vf = LinearValueFunction(**vf_params) 213 | elif vf_type == 'nn': 214 | vf = NnValueFunction(ob_dim=ob_dim, **vf_params) 215 | 216 | 217 | YOUR_CODE_HERE 218 | 219 | 220 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 221 | 222 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 223 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 224 | 225 | sess = tf.Session() 226 | sess.__enter__() # equivalent to `with sess:` 227 | tf.global_variables_initializer().run() #pylint: disable=E1101 228 | 229 | total_timesteps = 0 230 | stepsize = initial_stepsize 231 | 232 | for i in range(n_iter): 233 | print("********** Iteration %i ************"%i) 234 | 235 | YOUR_CODE_HERE 236 | 237 | if kl > desired_kl * 2: 238 | stepsize /= 1.5 239 | print('stepsize -> %s'%stepsize) 240 | elif kl < desired_kl / 2: 241 | stepsize *= 1.5 242 | print('stepsize -> %s'%stepsize) 243 | else: 244 | print('stepsize OK') 245 | 246 | 247 | # Log diagnostics 248 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 249 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 250 | logz.log_tabular("KLOldNew", kl) 251 | logz.log_tabular("Entropy", ent) 252 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 253 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 254 | logz.log_tabular("TimestepsSoFar", total_timesteps) 255 | # If you're overfitting, EVAfter will be way larger than EVBefore. 
256 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 257 | logz.dump_tabular() 258 | 259 | 260 | def main_pendulum1(d): 261 | return main_pendulum(**d) 262 | 263 | if __name__ == "__main__": 264 | if 1: 265 | main_cartpole(logdir=None) # when you want to start collecting results, set the logdir 266 | if 0: 267 | general_params = dict(gamma=0.97, animate=False, min_timesteps_per_batch=2500, n_iter=300, initial_stepsize=1e-3) 268 | params = [ 269 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 270 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 271 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 272 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 273 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 274 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 275 | ] 276 | import multiprocessing 277 | p = multiprocessing.Pool() 278 | p.map(main_pendulum1, params) 279 | -------------------------------------------------------------------------------- /hw4/plot_learning_curves.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser() 3 | parser.add_argument("expdir", help="experiment dir, e.g., /tmp/experiments") 4 | args = parser.parse_args() 5 | 6 | from pylab import * 7 | import os 8 | from os.path import join 9 | 10 | dirnames = os.listdir(args.expdir) 11 | 12 | fig, axes = subplots(4) 13 | for dirname in dirnames: 14 | print(dirname) 15 | A = np.genfromtxt(join(args.expdir, dirname, 'log.txt'),delimiter='\t',dtype=None, names=True) 16 | # axes[0].plot(scipy.signal.savgol_filter(A['EpRewMean'] , 21, 3), '-x') 17 | x = A['TimestepsSoFar'] 18 | axes[0].plot(x, A['EpRewMean'], '-x') 19 | axes[1].plot(x, A['KLOldNew'], '-x') 20 | axes[2].plot(x, A['Entropy'], '-x') 21 | axes[3].plot(x, A['EVBefore'], '-x') 22 | legend(dirnames,loc='best').draggable() 23 | axes[0].set_ylabel("EpRewMean") 24 | axes[1].set_ylabel("KLOldNew") 25 | axes[2].set_ylabel("Entropy") 26 | axes[3].set_ylabel("EVBefore") 27 | axes[3].set_ylim(-1,1) 28 | axes[-1].set_xlabel("Iterations") 29 | show() 30 | --------------------------------------------------------------------------------
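Finally, `main.py` leaves `NnValueFunction` as a stub. Below is a minimal sketch of one way to implement it, matching the `fit`/`predict` interface of `LinearValueFunction` and the `vf_params=dict(n_epochs=10, stepsize=1e-3)` passed at the bottom of `main.py`, and reusing the `dense`, `lrelu`, and `normc_initializer` helpers defined there. It is an illustration, not the reference solution:

```python
class NnValueFunction(object):
    """Small MLP state-value baseline with the same fit/predict interface as LinearValueFunction."""
    def __init__(self, ob_dim, n_epochs=10, stepsize=1e-3):
        self.n_epochs = n_epochs
        with tf.variable_scope("nnvf"):
            self.sy_ob = tf.placeholder(shape=[None, ob_dim], dtype=tf.float32)
            self.sy_vtarg = tf.placeholder(shape=[None], dtype=tf.float32)
            h1 = lrelu(dense(self.sy_ob, 64, "h1", weight_init=normc_initializer(1.0)))
            h2 = lrelu(dense(h1, 64, "h2", weight_init=normc_initializer(1.0)))
            self.sy_vpred = dense(h2, 1, "out", weight_init=normc_initializer(0.1))[:, 0]
            loss = tf.reduce_mean(tf.square(self.sy_vpred - self.sy_vtarg))
            self.train_op = tf.train.AdamOptimizer(stepsize).minimize(loss)

    def fit(self, X, y):
        # Runs a few epochs of full-batch regression toward the discounted returns.
        sess = tf.get_default_session()
        for _ in range(self.n_epochs):
            sess.run(self.train_op, feed_dict={self.sy_ob: X, self.sy_vtarg: y})

    def predict(self, X):
        # Before the first fit this returns the randomly initialized network's output,
        # unlike LinearValueFunction, which returns zeros; in practice that is harmless.
        return tf.get_default_session().run(self.sy_vpred, feed_dict={self.sy_ob: X})
```

Because the object is constructed before `tf.global_variables_initializer().run()` is called in `main_pendulum`, both the value-network weights and the Adam state are initialized along with the rest of the graph, and `tf.get_default_session()` resolves to the session entered via `sess.__enter__()`.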