├── LICENSE
├── README.md
├── hw1
│   ├── README.md
│   ├── demo.bash
│   ├── experts
│   │   ├── Ant-v1.pkl
│   │   ├── HalfCheetah-v1.pkl
│   │   ├── Hopper-v1.pkl
│   │   ├── Humanoid-v1.pkl
│   │   ├── Reacher-v1.pkl
│   │   └── Walker2d-v1.pkl
│   ├── load_policy.py
│   ├── run_expert.py
│   └── tf_util.py
├── hw2
│   ├── .ipynb_checkpoints
│   │   └── HW2-checkpoint.ipynb
│   ├── HW2.ipynb
│   ├── discrete_env.py
│   ├── discrete_env.pyc
│   ├── frozen_lake.py
│   └── frozen_lake.pyc
├── hw3
│   ├── README
│   ├── atari_wrappers.py
│   ├── dqn.py
│   ├── dqn_utils.py
│   ├── run_dqn_atari.py
│   └── run_dqn_ram.py
├── hw4
│   ├── homework.md
│   ├── logz.py
│   ├── main.py
│   └── plot_learning_curves.py
└── plot_test.ipynb

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 berkeleydeeprlcourse
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cs294
2 | Berkeley Deep Reinforcement Learning cs294 solutions
3 | Email: lightaime@gmail.com
4 | 
--------------------------------------------------------------------------------
/hw1/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 1: Imitation Learning
2 | 
3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym
4 | 
5 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data.
6 | 
7 | In `experts/`, the provided expert policies are:
8 | * Ant-v1.pkl
9 | * HalfCheetah-v1.pkl
10 | * Hopper-v1.pkl
11 | * Humanoid-v1.pkl
12 | * Reacher-v1.pkl
13 | * Walker2d-v1.pkl
14 | 
15 | The name of the pickle file corresponds to the name of the gym environment.
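Below is a minimal behavioral-cloning sketch for this assignment. It is an illustration, not part of the starter code: the function name `train_bc_policy`, the hidden sizes, learning rate, and epoch count are placeholders, and it assumes the `expert_data` dict that `run_expert.py` builds at the end of `main()` has been handed to it (the starter script constructs that dict but never writes it to disk). Like `run_expert.py`, it is meant to run inside a `with tf.Session():` block.

```python
# Illustrative sketch only; hyperparameters and names are placeholders.
import numpy as np
import tensorflow as tf
import tf_util

def train_bc_policy(expert_data, epochs=20, batch_size=64, lr=1e-3):
    """Fit a small MLP to the expert's (observation, action) pairs by regression."""
    obs = expert_data['observations'].astype(np.float32)
    # run_expert.py appends actions of shape (1, act_dim); drop that extra axis.
    acts = expert_data['actions'].reshape(len(obs), -1).astype(np.float32)
    obs_dim, act_dim = obs.shape[1], acts.shape[1]

    obs_ph = tf.placeholder(tf.float32, [None, obs_dim])
    act_ph = tf.placeholder(tf.float32, [None, act_dim])

    # Two-hidden-layer policy network built from the helpers in tf_util.py.
    h1 = tf.tanh(tf_util.dense(obs_ph, 64, "bc_h1", weight_init=tf_util.normc_initializer(1.0)))
    h2 = tf.tanh(tf_util.dense(h1, 64, "bc_h2", weight_init=tf_util.normc_initializer(1.0)))
    pred = tf_util.dense(h2, act_dim, "bc_out", weight_init=tf_util.normc_initializer(0.1))

    # Mean squared error between predicted and expert actions.
    loss = tf_util.mean(tf_util.sum(tf.square(pred - act_ph), axis=1))
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    tf_util.initialize()  # expects an active default session, as in run_expert.py
    for _ in range(epochs):
        perm = np.random.permutation(len(obs))
        for start in range(0, len(obs), batch_size):
            batch = perm[start:start + batch_size]
            tf_util.get_session().run(train_op,
                                      {obs_ph: obs[batch], act_ph: acts[batch]})
    # Returns a callable mapping a batch of observations to predicted actions.
    return tf_util.function([obs_ph], pred)
```

The returned callable can be evaluated the same way the `policy_fn` from `load_policy.py` is used in `run_expert.py`, i.e. inside the same `with tf.Session():` block used for training.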
16 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
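            # roll out one episode under the expert policy, recording (observation, action) pairs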
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /hw1/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | #import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | # ================================================================ 10 | # Import all names into common namespace 11 | # ================================================================ 12 | 13 | clip = tf.clip_by_value 14 | 15 | # Make consistent with numpy 16 | # ---------------------------------------- 17 | 18 | def sum(x, axis=None, keepdims=False): 19 | return tf.reduce_sum(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 20 | def mean(x, axis=None, keepdims=False): 21 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 22 | def var(x, axis=None, keepdims=False): 23 | meanx = mean(x, axis=axis, keepdims=keepdims) 24 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 25 | def std(x, axis=None, keepdims=False): 26 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 27 | def max(x, axis=None, keepdims=False): 28 | return tf.reduce_max(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 29 | def min(x, axis=None, keepdims=False): 30 | return tf.reduce_min(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 31 | def concatenate(arrs, axis=0): 32 | return tf.concat(axis, arrs) 33 | def argmax(x, axis=None): 34 | return tf.argmax(x, dimension=axis) 35 | 36 | def switch(condition, then_expression, else_expression): 37 | '''Switches between two operations depending on a scalar value (int or bool). 38 | Note that both `then_expression` and `else_expression` 39 | should be symbolic tensors of the *same shape*. 40 | 41 | # Arguments 42 | condition: scalar tensor. 43 | then_expression: TensorFlow operation. 44 | else_expression: TensorFlow operation. 
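    # Example (mirrors the use in `dropout` and `batchnorm` below)
        y = switch(phase, mask * x, pkeep * x)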
45 | ''' 46 | x_shape = copy.copy(then_expression.get_shape()) 47 | x = tf.cond(tf.cast(condition, 'bool'), 48 | lambda: then_expression, 49 | lambda: else_expression) 50 | x.set_shape(x_shape) 51 | return x 52 | 53 | # Extras 54 | # ---------------------------------------- 55 | def l2loss(params): 56 | if len(params) == 0: 57 | return tf.constant(0.0) 58 | else: 59 | return tf.add_n([sum(tf.square(p)) for p in params]) 60 | def lrelu(x, leak=0.2): 61 | f1 = 0.5 * (1 + leak) 62 | f2 = 0.5 * (1 - leak) 63 | return f1 * x + f2 * abs(x) 64 | def categorical_sample_logits(X): 65 | # https://github.com/tensorflow/tensorflow/issues/456 66 | U = tf.random_uniform(tf.shape(X)) 67 | return argmax(X - tf.log(-tf.log(U)), axis=1) 68 | 69 | # ================================================================ 70 | # Global session 71 | # ================================================================ 72 | 73 | def get_session(): 74 | return tf.get_default_session() 75 | 76 | def single_threaded_session(): 77 | tf_config = tf.ConfigProto( 78 | inter_op_parallelism_threads=1, 79 | intra_op_parallelism_threads=1) 80 | return tf.Session(config=tf_config) 81 | 82 | def make_session(num_cpu): 83 | tf_config = tf.ConfigProto( 84 | inter_op_parallelism_threads=num_cpu, 85 | intra_op_parallelism_threads=num_cpu) 86 | return tf.Session(config=tf_config) 87 | 88 | 89 | ALREADY_INITIALIZED = set() 90 | def initialize(): 91 | new_variables = set(tf.all_variables()) - ALREADY_INITIALIZED 92 | get_session().run(tf.initialize_variables(new_variables)) 93 | ALREADY_INITIALIZED.update(new_variables) 94 | 95 | 96 | def eval(expr, feed_dict=None): 97 | if feed_dict is None: feed_dict = {} 98 | return get_session().run(expr, feed_dict=feed_dict) 99 | 100 | def set_value(v, val): 101 | get_session().run(v.assign(val)) 102 | 103 | def load_state(fname): 104 | saver = tf.train.Saver() 105 | saver.restore(get_session(), fname) 106 | 107 | def save_state(fname): 108 | os.makedirs(os.path.dirname(fname), exist_ok=True) 109 | saver = tf.train.Saver() 110 | saver.save(get_session(), fname) 111 | 112 | # ================================================================ 113 | # Model components 114 | # ================================================================ 115 | 116 | 117 | def normc_initializer(std=1.0): 118 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 119 | out = np.random.randn(*shape).astype(np.float32) 120 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 121 | return tf.constant(out) 122 | return _initializer 123 | 124 | 125 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 126 | summary_tag=None): 127 | with tf.variable_scope(name): 128 | stride_shape = [1, stride[0], stride[1], 1] 129 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 130 | 131 | # there are "num input feature maps * filter height * filter width" 132 | # inputs to each hidden unit 133 | fan_in = intprod(filter_shape[:3]) 134 | # each unit in the lower layer receives a gradient from: 135 | # "num output feature maps * filter height * filter width" / 136 | # pooling size 137 | fan_out = intprod(filter_shape[:2]) * num_filters 138 | # initialize weights with random weights 139 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 140 | 141 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 142 | collections=collections) 143 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer, 144 | collections=collections) 145 | 146 | if summary_tag is not None: 147 | tf.image_summary(summary_tag, 148 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 149 | [2, 0, 1, 3]), 150 | max_images=10) 151 | 152 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 153 | 154 | 155 | def dense(x, size, name, weight_init=None, bias=True): 156 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 157 | ret = tf.matmul(x, w) 158 | if bias: 159 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer) 160 | return ret + b 161 | else: 162 | return ret 163 | 164 | def wndense(x, size, name, init_scale=1.0): 165 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 166 | initializer=tf.random_normal_initializer(0, 0.05)) 167 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 168 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 169 | 170 | # use weight normalization (Salimans & Kingma, 2016) 171 | x = tf.matmul(x, v) 172 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 173 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 174 | 175 | def densenobias(x, size, name, weight_init=None): 176 | return dense(x, size, name, weight_init=weight_init, bias=False) 177 | 178 | def dropout(x, pkeep, phase=None, mask=None): 179 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 180 | if phase is None: 181 | return mask * x 182 | else: 183 | return switch(phase, mask*x, pkeep*x) 184 | 185 | def batchnorm(x, name, phase, updates, gamma=0.96): 186 | k = x.get_shape()[1] 187 | runningmean = tf.get_variable(name+"/mean", shape=[1, k], initializer=tf.constant_initializer(0.0), trainable=False) 188 | runningvar = tf.get_variable(name+"/var", shape=[1, k], initializer=tf.constant_initializer(1e-4), trainable=False) 189 | testy = (x - runningmean) / tf.sqrt(runningvar) 190 | 191 | mean_ = mean(x, axis=0, keepdims=True) 192 | var_ = mean(tf.square(x), axis=0, keepdims=True) 193 | std = tf.sqrt(var_) 194 | trainy = (x - mean_) / std 195 | 196 | updates.extend([ 197 | tf.assign(runningmean, runningmean * gamma + mean_ * (1 - gamma)), 198 | tf.assign(runningvar, runningvar * gamma + var_ * (1 - gamma)) 199 | ]) 200 | 201 | y = switch(phase, trainy, testy) 202 | 203 | out = y * tf.get_variable(name+"/scaling", shape=[1, k], initializer=tf.constant_initializer(1.0), trainable=True)\ 204 | + tf.get_variable(name+"/translation", shape=[1,k], initializer=tf.constant_initializer(0.0), trainable=True) 205 | return out 206 | 207 | 208 | 209 | # ================================================================ 210 | # Basic Stuff 211 | # ================================================================ 212 | 213 | def function(inputs, outputs, updates=None, givens=None): 214 | if isinstance(outputs, list): 215 | return _Function(inputs, outputs, updates, givens=givens) 216 | elif isinstance(outputs, (dict, collections.OrderedDict)): 217 | f = _Function(inputs, outputs.values(), updates, givens=givens) 218 | return lambda *inputs : type(outputs)(zip(outputs.keys(), f(*inputs))) 219 | else: 220 | f = _Function(inputs, [outputs], updates, givens=givens) 221 | return 
lambda *inputs : f(*inputs)[0] 222 | 223 | class _Function(object): 224 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 225 | assert all(len(i.op.inputs)==0 for i in inputs), "inputs should all be placeholders" 226 | self.inputs = inputs 227 | updates = updates or [] 228 | self.update_group = tf.group(*updates) 229 | self.outputs_update = list(outputs) + [self.update_group] 230 | self.givens = {} if givens is None else givens 231 | self.check_nan = check_nan 232 | def __call__(self, *inputvals): 233 | assert len(inputvals) == len(self.inputs) 234 | feed_dict = dict(zip(self.inputs, inputvals)) 235 | feed_dict.update(self.givens) 236 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 237 | if self.check_nan: 238 | if any(np.isnan(r).any() for r in results): 239 | raise RuntimeError("Nan detected") 240 | return results 241 | 242 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 243 | if isinstance(outputs, list): 244 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 245 | else: 246 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 247 | return lambda *inputs : f(*inputs)[0] 248 | 249 | class _MemFriendlyFunction(object): 250 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 251 | self.nondata_inputs = nondata_inputs 252 | self.data_inputs = data_inputs 253 | self.outputs = list(outputs) 254 | self.batch_size = batch_size 255 | def __call__(self, *inputvals): 256 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 257 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 258 | data_vals = inputvals[len(self.nondata_inputs):] 259 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 260 | n = data_vals[0].shape[0] 261 | for v in data_vals[1:]: 262 | assert v.shape[0] == n 263 | for i_start in range(0, n, self.batch_size): 264 | slice_vals = [v[i_start:min(i_start+self.batch_size, n)] for v in data_vals] 265 | for (var,val) in zip(self.data_inputs, slice_vals): 266 | feed_dict[var]=val 267 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 268 | if i_start==0: 269 | sum_results = results 270 | else: 271 | for i in range(len(results)): 272 | sum_results[i] = sum_results[i] + results[i] 273 | for i in range(len(results)): 274 | sum_results[i] = sum_results[i] / n 275 | return sum_results 276 | 277 | # ================================================================ 278 | # Modules 279 | # ================================================================ 280 | 281 | class Module(object): 282 | def __init__(self, name): 283 | self.name = name 284 | self.first_time = True 285 | self.scope = None 286 | self.cache = {} 287 | def __call__(self, *args): 288 | if args in self.cache: 289 | print("(%s) retrieving value from cache"%self.name) 290 | return self.cache[args] 291 | with tf.variable_scope(self.name, reuse=not self.first_time): 292 | scope = tf.get_variable_scope().name 293 | if self.first_time: 294 | self.scope = scope 295 | print("(%s) running function for the first time"%self.name) 296 | else: 297 | assert self.scope == scope, "Tried calling function with a different scope" 298 | print("(%s) running function on new inputs"%self.name) 299 | self.first_time = False 300 | out = self._call(*args) 301 | self.cache[args] = out 302 | return out 303 | def _call(self, *args): 304 | raise NotImplementedError 305 | 306 | @property 307 | def trainable_variables(self): 308 | assert self.scope is 
not None, "need to call module once before getting variables" 309 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 310 | 311 | @property 312 | def variables(self): 313 | assert self.scope is not None, "need to call module once before getting variables" 314 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 315 | 316 | 317 | def module(name): 318 | @functools.wraps 319 | def wrapper(f): 320 | class WrapperModule(Module): 321 | def _call(self, *args): 322 | return f(*args) 323 | return WrapperModule(name) 324 | return wrapper 325 | 326 | # ================================================================ 327 | # Graph traversal 328 | # ================================================================ 329 | 330 | VARIABLES = {} 331 | 332 | 333 | def get_parents(node): 334 | return node.op.inputs 335 | 336 | def topsorted(outputs): 337 | """ 338 | Topological sort via non-recursive depth-first search 339 | """ 340 | assert isinstance(outputs, (list,tuple)) 341 | marks = {} 342 | out = [] 343 | stack = [] #pylint: disable=W0621 344 | # i: node 345 | # jidx = number of children visited so far from that node 346 | # marks: state of each node, which is one of 347 | # 0: haven't visited 348 | # 1: have visited, but not done visiting children 349 | # 2: done visiting children 350 | for x in outputs: 351 | stack.append((x,0)) 352 | while stack: 353 | (i,jidx) = stack.pop() 354 | if jidx == 0: 355 | m = marks.get(i,0) 356 | if m == 0: 357 | marks[i] = 1 358 | elif m == 1: 359 | raise ValueError("not a dag") 360 | else: 361 | continue 362 | ps = get_parents(i) 363 | if jidx == len(ps): 364 | marks[i] = 2 365 | out.append(i) 366 | else: 367 | stack.append((i,jidx+1)) 368 | j = ps[jidx] 369 | stack.append((j,0)) 370 | return out 371 | 372 | 373 | # ================================================================ 374 | # Flat vectors 375 | # ================================================================ 376 | 377 | def var_shape(x): 378 | out = [k.value for k in x.get_shape()] 379 | assert all(isinstance(a, int) for a in out), \ 380 | "shape function assumes that shape is fully known" 381 | return out 382 | 383 | def numel(x): 384 | return intprod(var_shape(x)) 385 | 386 | def intprod(x): 387 | return int(np.prod(x)) 388 | 389 | def flatgrad(loss, var_list): 390 | grads = tf.gradients(loss, var_list) 391 | return tf.concat(0, [tf.reshape(grad, [numel(v)]) 392 | for (v, grad) in zip(var_list, grads)]) 393 | 394 | class SetFromFlat(object): 395 | def __init__(self, var_list, dtype=tf.float32): 396 | assigns = [] 397 | shapes = list(map(var_shape, var_list)) 398 | total_size = np.sum([intprod(shape) for shape in shapes]) 399 | 400 | self.theta = theta = tf.placeholder(dtype,[total_size]) 401 | start=0 402 | assigns = [] 403 | for (shape,v) in zip(shapes,var_list): 404 | size = intprod(shape) 405 | assigns.append(tf.assign(v, tf.reshape(theta[start:start+size],shape))) 406 | start+=size 407 | self.op = tf.group(*assigns) 408 | def __call__(self, theta): 409 | get_session().run(self.op, feed_dict={self.theta:theta}) 410 | 411 | class GetFlat(object): 412 | def __init__(self, var_list): 413 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 414 | def __call__(self): 415 | return get_session().run(self.op) 416 | 417 | # ================================================================ 418 | # Misc 419 | # ================================================================ 420 | 421 | 422 | def fancy_slice_2d(X, inds0, inds1): 423 | """ 424 | like numpy 
X[inds0, inds1] 425 | XXX this implementation is bad 426 | """ 427 | inds0 = tf.cast(inds0, tf.int64) 428 | inds1 = tf.cast(inds1, tf.int64) 429 | shape = tf.cast(tf.shape(X), tf.int64) 430 | ncols = shape[1] 431 | Xflat = tf.reshape(X, [-1]) 432 | return tf.gather(Xflat, inds0 * ncols + inds1) 433 | 434 | 435 | def scope_vars(scope, trainable_only): 436 | """ 437 | Get variables inside a scope 438 | The scope can be specified as a string 439 | """ 440 | return tf.get_collection( 441 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 442 | scope=scope if isinstance(scope, str) else scope.name 443 | ) 444 | 445 | def lengths_to_mask(lengths_b, max_length): 446 | """ 447 | Turns a vector of lengths into a boolean mask 448 | 449 | Args: 450 | lengths_b: an integer vector of lengths 451 | max_length: maximum length to fill the mask 452 | 453 | Returns: 454 | a boolean array of shape (batch_size, max_length) 455 | row[i] consists of True repeated lengths_b[i] times, followed by False 456 | """ 457 | lengths_b = tf.convert_to_tensor(lengths_b) 458 | assert lengths_b.get_shape().ndims == 1 459 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 460 | return mask_bt 461 | 462 | 463 | def in_session(f): 464 | @functools.wraps(f) 465 | def newfunc(*args, **kwargs): 466 | with tf.Session(): 467 | f(*args, **kwargs) 468 | return newfunc 469 | 470 | 471 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 472 | def get_placeholder(name, dtype, shape): 473 | print("calling get_placeholder", name) 474 | if name in _PLACEHOLDER_CACHE: 475 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 476 | assert dtype1==dtype and shape1==shape 477 | return out 478 | else: 479 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 480 | _PLACEHOLDER_CACHE[name] = (out,dtype,shape) 481 | return out 482 | def get_placeholder_cached(name): 483 | return _PLACEHOLDER_CACHE[name][0] 484 | 485 | def flattenallbut0(x): 486 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 487 | 488 | def reset(): 489 | global _PLACEHOLDER_CACHE 490 | global VARIABLES 491 | _PLACEHOLDER_CACHE = {} 492 | VARIABLES = {} 493 | tf.reset_default_graph() 494 | -------------------------------------------------------------------------------- /hw2/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 
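      For example, in the 4x4 FrozenLake with is_slippery=True (frozen_lake.py below),
      P[0][1] == [(0.1, 0, 0.0, False), (0.8, 4, 0.0, False), (0.1, 1, 0.0, False)]:
      the intended move succeeds with probability 0.8 and each perpendicular slip has probability 0.1.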
27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /hw2/discrete_env.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw2/discrete_env.pyc -------------------------------------------------------------------------------- /hw2/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | 5 | from gym import utils 6 | import discrete_env 7 | 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | MAPS = { 14 | "4x4": [ 15 | "SFFF", 16 | "FHFH", 17 | "FFFH", 18 | "HFFG" 19 | ], 20 | "8x8": [ 21 | "SFFFFFFF", 22 | "FFFFFFFF", 23 | "FFFHFFFF", 24 | "FFFFFHFF", 25 | "FFFHFFFF", 26 | "FHHFFFHF", 27 | "FHFFHFHF", 28 | "FFFHFFFG" 29 | ], 30 | } 31 | 32 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 33 | """ 34 | Winter is here. You and your friends were tossing around a frisbee at the park 35 | when you made a wild throw that left the frisbee out in the middle of the lake. 36 | The water is mostly frozen, but there are a few holes where the ice has melted. 37 | If you step into one of those holes, you'll fall into the freezing water. 38 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 39 | you navigate across the lake and retrieve the disc. 40 | However, the ice is slippery, so you won't always move in the direction you intend. 41 | The surface is described using a grid like the following 42 | 43 | SFFF 44 | FHFH 45 | FFFH 46 | HFFG 47 | 48 | S : starting point, safe 49 | F : frozen surface, safe 50 | H : hole, fall to your doom 51 | G : goal, where the frisbee is located 52 | 53 | The episode ends when you reach the goal or fall in a hole. 54 | You receive a reward of 1 if you reach the goal, and zero otherwise. 
55 | 56 | """ 57 | 58 | metadata = {'render.modes': ['human', 'ansi']} 59 | 60 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 61 | if desc is None and map_name is None: 62 | raise ValueError('Must provide either desc or map_name') 63 | elif desc is None: 64 | desc = MAPS[map_name] 65 | self.desc = desc = np.asarray(desc,dtype='c') 66 | self.nrow, self.ncol = nrow, ncol = desc.shape 67 | 68 | nA = 4 69 | nS = nrow * ncol 70 | 71 | isd = np.array(desc == b'S').astype('float64').ravel() 72 | isd /= isd.sum() 73 | 74 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 75 | 76 | def to_s(row, col): 77 | return row*ncol + col 78 | def inc(row, col, a): 79 | if a==0: # left 80 | col = max(col-1,0) 81 | elif a==1: # down 82 | row = min(row+1,nrow-1) 83 | elif a==2: # right 84 | col = min(col+1,ncol-1) 85 | elif a==3: # up 86 | row = max(row-1,0) 87 | return (row, col) 88 | 89 | for row in range(nrow): 90 | for col in range(ncol): 91 | s = to_s(row, col) 92 | for a in range(4): 93 | li = P[s][a] 94 | letter = desc[row, col] 95 | if letter in b'GH': 96 | li.append((1.0, s, 0, True)) 97 | else: 98 | if is_slippery: 99 | for b in [(a-1)%4, a, (a+1)%4]: 100 | newrow, newcol = inc(row, col, b) 101 | newstate = to_s(newrow, newcol) 102 | newletter = desc[newrow, newcol] 103 | done = bytes(newletter) in b'GH' 104 | rew = float(newletter == b'G') 105 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 106 | else: 107 | newrow, newcol = inc(row, col, a) 108 | newstate = to_s(newrow, newcol) 109 | newletter = desc[newrow, newcol] 110 | done = bytes(newletter) in b'GH' 111 | rew = float(newletter == b'G') 112 | li.append((1.0, newstate, rew, done)) 113 | 114 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 115 | 116 | def _render(self, mode='human', close=False): 117 | if close: 118 | return 119 | outfile = StringIO() if mode == 'ansi' else sys.stdout 120 | 121 | row, col = self.s // self.ncol, self.s % self.ncol 122 | desc = self.desc.tolist() 123 | desc = [[c.decode('utf-8') for c in line] for line in desc] 124 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 125 | if self.lastaction is not None: 126 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 127 | else: 128 | outfile.write("\n") 129 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 130 | 131 | return outfile 132 | -------------------------------------------------------------------------------- /hw2/frozen_lake.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/cs294/25641a22d931326fef0454bddbb0e74502009207/hw2/frozen_lake.pyc -------------------------------------------------------------------------------- /hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 
11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | 11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 12 | 13 | def learn(env, 14 | q_func, 15 | optimizer_spec, 16 | session, 17 | exploration=LinearSchedule(1000000, 0.1), 18 | stopping_criterion=None, 19 | replay_buffer_size=1000000, 20 | batch_size=32, 21 | gamma=0.99, 22 | learning_starts=50000, 23 | learning_freq=4, 24 | frame_history_len=4, 25 | target_update_freq=10000, 26 | grad_norm_clipping=10): 27 | """Run Deep Q-learning algorithm. 28 | 29 | You can specify your own convnet using q_func. 30 | 31 | All schedules are w.r.t. total number of steps taken in the environment. 32 | 33 | Parameters 34 | ---------- 35 | env: gym.Env 36 | gym environment to train on. 37 | q_func: function 38 | Model to use for computing the q function. It should accept the 39 | following named arguments: 40 | img_in: tf.Tensor 41 | tensorflow tensor representing the input image 42 | num_actions: int 43 | number of actions 44 | scope: str 45 | scope in which all the model related variables 46 | should be created 47 | reuse: bool 48 | whether previously created variables should be reused. 49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 
54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | # YOUR CODE HERE 131 | 132 | ###### 133 | 134 | # construct optimization op (with gradient clipping) 135 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 136 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 137 | train_fn = minimize_and_clip(optimizer, total_error, 138 | var_list=q_func_vars, clip_val=grad_norm_clipping) 139 | 140 | # update_target_fn will be called periodically to copy Q network to target Q network 141 | update_target_fn = [] 142 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 143 | sorted(target_q_func_vars, key=lambda v: v.name)): 144 | update_target_fn.append(var_target.assign(var)) 145 | update_target_fn = tf.group(*update_target_fn) 146 | 147 | # construct the replay buffer 148 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 149 | 150 | ############### 151 | # RUN ENV # 152 | ############### 153 | model_initialized = False 154 | num_param_updates = 0 155 | mean_episode_reward = -float('nan') 156 | best_mean_episode_reward = -float('inf') 157 | last_obs = env.reset() 158 | LOG_EVERY_N_STEPS = 10000 159 | 160 | for t in itertools.count(): 161 | ### 1. Check stopping criterion 162 | if stopping_criterion is not None and stopping_criterion(env, t): 163 | break 164 | 165 | ### 2. Step the env and store the transition 166 | # At this point, "last_obs" contains the latest observation that was 167 | # recorded from the simulator. Here, your code needs to store this 168 | # observation and its outcome (reward, next observation, etc.) into 169 | # the replay buffer while stepping the simulator forward one step. 170 | # At the end of this block of code, the simulator should have been 171 | # advanced one step, and the replay buffer should contain one more 172 | # transition. 173 | # Specifically, last_obs must point to the new latest observation. 174 | # Useful functions you'll need to call: 175 | # obs, reward, done, info = env.step(action) 176 | # this steps the environment forward one step 177 | # obs = env.reset() 178 | # this resets the environment if you reached an episode boundary. 179 | # Don't forget to call env.reset() to get a new observation if done 180 | # is true!! 181 | # Note that you cannot use "last_obs" directly as input 182 | # into your network, since it needs to be processed to include context 183 | # from previous frames. You should check out the replay buffer 184 | # implementation in dqn_utils.py to see what functionality the replay 185 | # buffer exposes. The replay buffer has a function called 186 | # encode_recent_observation that will take the latest observation 187 | # that you pushed into the buffer and compute the corresponding 188 | # input that should be given to a Q network by appending some 189 | # previous frames. 190 | # Don't forget to include epsilon greedy exploration! 191 | # And remember that the first time you enter this loop, the model 192 | # may not yet have been initialized (but of course, the first step 193 | # might as well be random, since you haven't trained your net...) 
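        # A possible sketch of this step (illustration only, not part of the starter code).
        # It assumes a Q-value tensor `q_t` of shape [batch, num_actions] was built from
        # q_func(obs_t_float, num_actions, scope="q_func", reuse=False) in the model section
        # above, and that ReplayBuffer.store_effect(idx, action, reward, done) records the
        # outcome for the frame stored at `idx` (its definition is not shown in this excerpt).
        #
        #   idx = replay_buffer.store_frame(last_obs)
        #   if not model_initialized or random.random() < exploration.value(t):
        #       action = env.action_space.sample()        # epsilon-greedy / untrained net
        #   else:
        #       recent = replay_buffer.encode_recent_observation()
        #       q_values = session.run(q_t, {obs_t_ph: recent[None]})
        #       action = np.argmax(q_values)
        #   last_obs, reward, done, info = env.step(action)
        #   replay_buffer.store_effect(idx, action, reward, done)
        #   if done:
        #       last_obs = env.reset()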
194 | 195 | ##### 196 | 197 | # YOUR CODE HERE 198 | 199 | ##### 200 | 201 | # at this point, the environment should have been advanced one step (and 202 | # reset if done was true), and last_obs should point to the new latest 203 | # observation 204 | 205 | ### 3. Perform experience replay and train the network. 206 | # note that this is only done if the replay buffer contains enough samples 207 | # for us to learn something useful -- until then, the model will not be 208 | # initialized and random actions should be taken 209 | if (t > learning_starts and 210 | t % learning_freq == 0 and 211 | replay_buffer.can_sample(batch_size)): 212 | # Here, you should perform training. Training consists of four steps: 213 | # 3.a: use the replay buffer to sample a batch of transitions (see the 214 | # replay buffer code for function definition, each batch that you sample 215 | # should consist of current observations, current actions, rewards, 216 | # next observations, and done indicator). 217 | # 3.b: initialize the model if it has not been initialized yet; to do 218 | # that, call 219 | # initialize_interdependent_variables(session, tf.global_variables(), { 220 | # obs_t_ph: obs_t_batch, 221 | # obs_tp1_ph: obs_tp1_batch, 222 | # }) 223 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 224 | # the current and next time step. The boolean variable model_initialized 225 | # indicates whether or not the model has been initialized. 226 | # Remember that you have to update the target network too (see 3.d)! 227 | # 3.c: train the model. To do this, you'll need to use the train_fn and 228 | # total_error ops that were created earlier: total_error is what you 229 | # created to compute the total Bellman error in a batch, and train_fn 230 | # will actually perform a gradient step and update the network parameters 231 | # to reduce total_error. When calling session.run on these you'll need to 232 | # populate the following placeholders: 233 | # obs_t_ph 234 | # act_t_ph 235 | # rew_t_ph 236 | # obs_tp1_ph 237 | # done_mask_ph 238 | # (this is needed for computing total_error) 239 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 240 | # (this is needed by the optimizer to choose the learning rate) 241 | # 3.d: periodically update the target network by calling 242 | # session.run(update_target_fn) 243 | # you should update every target_update_freq steps, and you may find the 244 | # variable num_param_updates useful for this (it was initialized to 0) 245 | ##### 246 | 247 | # YOUR CODE HERE 248 | 249 | ##### 250 | 251 | ### 4. 
Log progress 252 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 253 | if len(episode_rewards) > 0: 254 | mean_episode_reward = np.mean(episode_rewards[-100:]) 255 | if len(episode_rewards) > 100: 256 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 257 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 258 | print("Timestep %d" % (t,)) 259 | print("mean reward (100 episodes) %f" % mean_episode_reward) 260 | print("best mean reward %f" % best_mean_episode_reward) 261 | print("episodes %d" % len(episode_rewards)) 262 | print("exploration %f" % exploration.value(t)) 263 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 264 | sys.stdout.flush() 265 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 
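        For example, PiecewiseSchedule([(0, 1.0), (1000000, 0.1)], outside_value=0.1)
        gives value(500000) == 0.55 (halfway between 1.0 and 0.1) and value(2000000) == 0.1.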
67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. 
This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(currentenv, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The specific memory optimizations used here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (it is most time-efficient 182 | to cast them back to float32 on the GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the typical Atari Deep RL use case of a buffer with 1M frames, the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning a frame of zeros at the beginning 190 | of the episode, when there are fewer frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of frames to be stacked for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 228 | 229 | The i-th sampled transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the episode 234 | was done, as indicated by `done_mask[i]`, which is equal 235 | to 1 if the episode ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample.
241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | 
(1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
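For reference, the `dqn.learn` call above hands the exploration schedule and replay-buffer settings to the student-implemented loop in `dqn.py`, which is mostly not shown in this repository. A rough sketch of how such a loop typically drives the `ReplayBuffer` from `dqn_utils.py`; names like `q_values`, `obs_t_ph`, and `model_initialized` are assumptions standing in for pieces that are not shown:

```python
# Hypothetical interaction loop; the keyword names mirror the dqn.learn arguments above.
replay_buffer = ReplayBuffer(size=replay_buffer_size, frame_history_len=frame_history_len)
last_obs = env.reset()
for t in range(int(num_timesteps)):
    idx = replay_buffer.store_frame(last_obs)
    recent_obs = replay_buffer.encode_recent_observation()  # frame-stacked input for the Q-network
    if not model_initialized or random.random() < exploration.value(t):
        action = env.action_space.sample()  # explore
    else:
        action = np.argmax(session.run(q_values, {obs_t_ph: recent_obs[None]}))  # exploit
    last_obs, reward, done, _ = env.step(action)
    replay_buffer.store_effect(idx, action, reward, done)
    if done:
        last_obs = env.reset()
    if (t > learning_starts and t % learning_freq == 0
            and replay_buffer.can_sample(batch_size)):
        obs_b, act_b, rew_b, next_obs_b, done_b = replay_buffer.sample(batch_size)
        # ...feed the batch to the Bellman-error / Huber-loss training op...
```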
128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | 
env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/homework.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | 3 | In `main.py` you will find an implementation of a "vanilla" policy gradient method, applied to an MDP with a discrete action space: an episodic version of the classic "cartpole" task. First, make sure the provided code works on your computer by running `python main.py`. We recommend reading through all of the code and comments in the function `main_cartpole`, starting at the top of the function. 4 | 5 | The code computes some useful diagnostics, which you may find helpful to look at while tuning hyperparameters: 6 | 7 | - **KL[policy before update || policy after update]**. Large spikes in KL divergence mean that the optimization took a large step, and sometimes these spikes cause a collapse in performance. 8 | - **Entropy of the policy**. If entropy goes down too fast, then you may not explore enough, but if it goes down too slowly, you'll probably not reach optimal performance. 9 | - **Explained variance of the value function**. If the value function perfectly explains the returns, then it will be 1; if you get a negative result, then it's worse than predicting a constant. 10 | 11 | Software dependencies: 12 | 13 | - tensorflow 14 | - numpy + scipy (Anaconda recommended) 15 | - gym (I'm using 0.8.0, `pip install gym==0.8.0`, but older versions should work just as well) 16 | 17 | ## Problem 1 18 | 19 | Here you will modify the `main_cartpole` policy gradient implementation to work on a continuous action space, specifically, the gym environment `Pendulum-v0`. Note that in `main_cartpole`, the neural network outputs "logits" (i.e., log-probabilities up to an additive constant) that specify a categorical distribution. For the pendulum task, on the other hand, your neural network should output the mean of a Gaussian distribution, with a separate parameter vector used to parameterize the log standard deviation. For example, you could use the following code (one possible completion of the missing lines is sketched below): 20 | 21 | ```python 22 | 23 | mean_na = dense(h2, ac_dim, weight_init=normc_initializer(0.1)) # Mean control output 24 | logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer()) # Log standard deviation 25 | 26 | sy_sampled_ac = YOUR_CODE_HERE 27 | sy_logprob_n = YOUR_CODE_HERE 28 | 29 | ``` 30 | 31 | You should also compute the differential entropy (replacing `sy_ent`) and the KL divergence (replacing `sy_kl`) for the Gaussian distribution. 32 | 33 | The pendulum problem is slightly harder, and using a fixed stepsize does not work reliably---thus, we instead recommend using an adaptive stepsize, where you adjust it based on the KL divergence between the new and old policy. Code for this stepsize adaptation is provided. 34 | 35 | You can plot your results using the script `plot_learning_curves.py` or your own plotting code. 36 | 37 | **Deliverables** 38 | 39 | - Show a plot with the pendulum converging to an EpRewMean of at least `-300`. Include EpRewMean, KL, and Entropy in your plots.
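As referenced in Problem 1 above, here is a minimal sketch of one way the missing pieces could be filled in. It reuses the `dense`, `lrelu`, and `normc_initializer` helpers from `main.py`; the placeholder names `sy_ac_na`, `sy_oldmean_na`, and `sy_oldlogstd_a` are assumptions introduced for this sketch (the latter two only feed the KL diagnostic). It is an illustration under those assumptions, not the reference solution:

```python
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)
sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], name="oldmean", dtype=tf.float32)
sy_oldlogstd_a = tf.placeholder(shape=[ac_dim], name="oldlogstd", dtype=tf.float32)

sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0)))
sy_h2 = lrelu(dense(sy_h1, 32, "h2", weight_init=normc_initializer(1.0)))
mean_na = dense(sy_h2, ac_dim, "mean", weight_init=normc_initializer(0.1))
logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer())
std_a = tf.exp(logstd_a)

# Sample via the reparameterization a = mean + std * z with z ~ N(0, I);
# indexing [0] yields a single action vector when one observation is fed in.
sy_sampled_ac = (mean_na + std_a * tf.random_normal(tf.shape(mean_na)))[0]

# Log-density of a diagonal Gaussian, summed over action dimensions.
sy_z_na = (sy_ac_na - mean_na) / std_a
sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square(sy_z_na), axis=1)
                - tf.reduce_sum(logstd_a)
                - 0.5 * ac_dim * np.log(2.0 * np.pi))

# Differential entropy of the diagonal Gaussian (state-independent here,
# since logstd does not depend on the observation).
sy_ent = tf.reduce_sum(logstd_a + 0.5 * np.log(2.0 * np.pi * np.e))

# KL(old || new) between diagonal Gaussians, averaged over the batch.
sy_oldstd_a = tf.exp(sy_oldlogstd_a)
sy_kl = tf.reduce_mean(tf.reduce_sum(
    logstd_a - sy_oldlogstd_a
    + (tf.square(sy_oldstd_a) + tf.square(sy_oldmean_na - mean_na)) / (2.0 * tf.square(std_a))
    - 0.5, axis=1))
```

The surrogate loss and Adam update from `main_cartpole` can then be reused unchanged, with the pre-update mean and log-std saved each iteration and fed into the KL diagnostic.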
40 | - Describe the hyperparameters used and how many timesteps your algorithm took to learn. 41 | 42 | ## Problem 2 43 | 44 | 1. Implement a neural network value function with the same interface as `LinearVF`. Add it to the provided cartpole solver, and compare the performance of the linear and neural network value function (i.e., baseline). 45 | 2. Perform the same comparison--linear vs neural network--for your pendulum solver from Problem 1. You should be able to obtain faster learning using the neural network. 46 | 47 | 48 | **Deliverables** 49 | 50 | - A comparison of linear vs neural network value function on the cartpole. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 51 | - A comparison of linear vs neural network value function on the pendulum. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 52 | 53 | In both cases, list the hyperparameters used for neural network training. 54 | 55 | ## Problem 3 (bonus) 56 | 57 | Implement a more advanced policy gradient method from lecture (such as TRPO, or the advantage function estimator used in A3C or generalized advantage estimation), and apply it to the gym environment `Hopper-v1`. See if you can learn a good gait in less than 500,000 timesteps. 58 | Hint: it may help to standardize your inputs using a running estimate of mean and standard deviation. 59 | 60 | ob_rescaled = (ob_raw - mean) / (stdev + epsilon) 61 | 62 | **Deliverables** 63 | 64 | A description of what you implemented, and learning curves on the Hopper-v1 environment. -------------------------------------------------------------------------------- /hw4/logz.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | Assumes that each diagnostic gets logged each iteration 5 | 6 | Call logz.configure_output_dir() to start logging to a 7 | tab-separated-values file (some_folder_name/log.txt) 8 | 9 | To load the learning curves, you can do, for example 10 | 11 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 12 | A['EpRewMean'] 13 | 14 | """ 15 | 16 | import os.path as osp, shutil, time, atexit, os, subprocess 17 | 18 | color2num = dict( 19 | gray=30, 20 | red=31, 21 | green=32, 22 | yellow=33, 23 | blue=34, 24 | magenta=35, 25 | cyan=36, 26 | white=37, 27 | crimson=38 28 | ) 29 | 30 | def colorize(string, color, bold=False, highlight=False): 31 | attr = [] 32 | num = color2num[color] 33 | if highlight: num += 10 34 | attr.append(str(num)) 35 | if bold: attr.append('1') 36 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 37 | 38 | class G: 39 | output_dir = None 40 | output_file = None 41 | first_row = True 42 | log_headers = [] 43 | log_current_row = {} 44 | 45 | def configure_output_dir(d=None): 46 | """ 47 | Set output directory to d, or to /tmp/somerandomnumber if d is None 48 | """ 49 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 50 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 51 | os.makedirs(G.output_dir) 52 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 53 | atexit.register(G.output_file.close) 54 | try: 55 | cmd = "cd %s && git diff > %s 2>/dev/null"%(osp.dirname(__file__), osp.join(G.output_dir, "a.diff")) 56 | subprocess.check_call(cmd, shell=True) # Save git diff to experiment directory 57 | except subprocess.CalledProcessError: 58 | print("configure_output_dir: not storing the git diff, probably because you're not in a git repo") 59 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 60 | 61 | def log_tabular(key, val): 62 | """ 63 | Log a value of some diagnostic 64 | Call this once for each diagnostic quantity, each iteration 65 | """ 66 | if G.first_row: 67 | G.log_headers.append(key) 68 | else: 69 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 70 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 71 | G.log_current_row[key] = val 72 | 73 | def dump_tabular(): 74 | """ 75 | Write all of the diagnostics from the current iteration 76 | """ 77 | vals = [] 78 | print("-"*37) 79 | for key in G.log_headers: 80 | val = G.log_current_row.get(key, "") 81 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 82 | else: valstr = val 83 | print("| %15s | %15s |"%(key, valstr)) 84 | vals.append(val) 85 | print("-"*37) 86 | if G.output_file is not None: 87 | if G.first_row: 88 | G.output_file.write("\t".join(G.log_headers)) 89 | G.output_file.write("\n") 90 | G.output_file.write("\t".join(map(str,vals))) 91 | G.output_file.write("\n") 92 | G.output_file.flush() 93 | G.log_current_row.clear() 94 | G.first_row=False -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import logz 5 | import scipy.signal 6 | 7 | def normc_initializer(std=1.0): 8 | """ 9 | Initialize array with normalized columns 10 | """ 11 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 12 | out = np.random.randn(*shape).astype(np.float32) 13 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 14 | return tf.constant(out) 15 | return _initializer 16 | 17 | 18 | def dense(x, size, name, weight_init=None): 19 | """ 20 | Dense (fully connected) layer 21 | """ 22 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 23 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 24 | return tf.matmul(x, w) + b 25 | 26 | def fancy_slice_2d(X, inds0, inds1): 27 | """ 28 | Like numpy's X[inds0, inds1] 29 | """ 30 | inds0 = tf.cast(inds0, tf.int64) 31 | inds1 = tf.cast(inds1, tf.int64) 32 | shape = tf.cast(tf.shape(X), tf.int64) 33 | ncols = shape[1] 34 | Xflat = tf.reshape(X, [-1]) 35 | return tf.gather(Xflat, inds0 * ncols + inds1) 36 | 37 | def discount(x, gamma): 38 | """ 39 | Compute discounted sum of future values 40 | out[i] = in[i] + gamma * in[i+1] + gamma^2 * in[i+2] + ... 41 | """ 42 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 43 | 44 | def explained_variance_1d(ypred,y): 45 | """ 46 | Var[ypred - y] / var[y]. 
47 | https://www.quora.com/What-is-the-meaning-proportion-of-variance-explained-in-linear-regression 48 | """ 49 | assert y.ndim == 1 and ypred.ndim == 1 50 | vary = np.var(y) 51 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 52 | 53 | def categorical_sample_logits(logits): 54 | """ 55 | Samples (symbolically) from categorical distribution, where logits is a NxK 56 | matrix specifying N categorical distributions with K categories 57 | 58 | specifically, exp(logits) / sum( exp(logits), axis=1 ) is the 59 | probabilities of the different classes 60 | 61 | Cleverly uses gumbell trick, based on 62 | https://github.com/tensorflow/tensorflow/issues/456 63 | """ 64 | U = tf.random_uniform(tf.shape(logits)) 65 | return tf.argmax(logits - tf.log(-tf.log(U)), dimension=1) 66 | 67 | def pathlength(path): 68 | return len(path["reward"]) 69 | 70 | class LinearValueFunction(object): 71 | coef = None 72 | def fit(self, X, y): 73 | Xp = self.preproc(X) 74 | A = Xp.T.dot(Xp) 75 | nfeats = Xp.shape[1] 76 | A[np.arange(nfeats), np.arange(nfeats)] += 1e-3 # a little ridge regression 77 | b = Xp.T.dot(y) 78 | self.coef = np.linalg.solve(A, b) 79 | def predict(self, X): 80 | if self.coef is None: 81 | return np.zeros(X.shape[0]) 82 | else: 83 | return self.preproc(X).dot(self.coef) 84 | def preproc(self, X): 85 | return np.concatenate([np.ones([X.shape[0], 1]), X, np.square(X)/2.0], axis=1) 86 | 87 | class NnValueFunction(object): 88 | pass # YOUR CODE HERE 89 | 90 | def lrelu(x, leak=0.2): 91 | f1 = 0.5 * (1 + leak) 92 | f2 = 0.5 * (1 - leak) 93 | return f1 * x + f2 * abs(x) 94 | 95 | 96 | 97 | def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): 98 | env = gym.make("CartPole-v0") 99 | ob_dim = env.observation_space.shape[0] 100 | num_actions = env.action_space.n 101 | logz.configure_output_dir(logdir) 102 | vf = LinearValueFunction() 103 | 104 | # Symbolic variables have the prefix sy_, to distinguish them from the numerical values 105 | # that are computed later in these function 106 | sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations 107 | sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation 108 | sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate 109 | sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer 110 | sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer 111 | # we use a small initialization for the last layer, so the initial policy has maximal entropy 112 | sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) 113 | sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions 114 | sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) 115 | sy_n = tf.shape(sy_ob_no)[0] 116 | sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation 117 | 118 | # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> 119 | sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) 120 | sy_oldp_na = 
tf.exp(sy_oldlogp_na) 121 | sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) 122 | sy_p_na = tf.exp(sy_logp_na) 123 | sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) 124 | # <<<<<<<<<<<<< 125 | 126 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 127 | 128 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 129 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 130 | 131 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 132 | # use single thread. on such a small problem, multithreading gives you a slowdown 133 | # this way, we can better use multiple cores for different experiments 134 | sess = tf.Session(config=tf_config) 135 | sess.__enter__() # equivalent to `with sess:` 136 | tf.global_variables_initializer().run() #pylint: disable=E1101 137 | 138 | total_timesteps = 0 139 | 140 | for i in range(n_iter): 141 | print("********** Iteration %i ************"%i) 142 | 143 | # Collect paths until we have enough timesteps 144 | timesteps_this_batch = 0 145 | paths = [] 146 | while True: 147 | ob = env.reset() 148 | terminated = False 149 | obs, acs, rewards = [], [], [] 150 | animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) 151 | while True: 152 | if animate_this_episode: 153 | env.render() 154 | obs.append(ob) 155 | ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) 156 | acs.append(ac) 157 | ob, rew, done, _ = env.step(ac) 158 | rewards.append(rew) 159 | if done: 160 | break 161 | path = {"observation" : np.array(obs), "terminated" : terminated, 162 | "reward" : np.array(rewards), "action" : np.array(acs)} 163 | paths.append(path) 164 | timesteps_this_batch += pathlength(path) 165 | if timesteps_this_batch > min_timesteps_per_batch: 166 | break 167 | total_timesteps += timesteps_this_batch 168 | # Estimate advantage function 169 | vtargs, vpreds, advs = [], [], [] 170 | for path in paths: 171 | rew_t = path["reward"] 172 | return_t = discount(rew_t, gamma) 173 | vpred_t = vf.predict(path["observation"]) 174 | adv_t = return_t - vpred_t 175 | advs.append(adv_t) 176 | vtargs.append(return_t) 177 | vpreds.append(vpred_t) 178 | 179 | # Build arrays for policy update 180 | ob_no = np.concatenate([path["observation"] for path in paths]) 181 | ac_n = np.concatenate([path["action"] for path in paths]) 182 | adv_n = np.concatenate(advs) 183 | standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) 184 | vtarg_n = np.concatenate(vtargs) 185 | vpred_n = np.concatenate(vpreds) 186 | vf.fit(ob_no, vtarg_n) 187 | 188 | # Policy update 189 | _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) 190 | kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) 191 | 192 | # Log diagnostics 193 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 194 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 195 | logz.log_tabular("KLOldNew", kl) 196 | logz.log_tabular("Entropy", ent) 197 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 198 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 199 | 
logz.log_tabular("TimestepsSoFar", total_timesteps) 200 | # If you're overfitting, EVAfter will be way larger than EVBefore. 201 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 202 | logz.dump_tabular() 203 | 204 | def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): 205 | tf.set_random_seed(seed) 206 | np.random.seed(seed) 207 | env = gym.make("Pendulum-v0") 208 | ob_dim = env.observation_space.shape[0] 209 | ac_dim = env.action_space.shape[0] 210 | logz.configure_output_dir(logdir) 211 | if vf_type == 'linear': 212 | vf = LinearValueFunction(**vf_params) 213 | elif vf_type == 'nn': 214 | vf = NnValueFunction(ob_dim=ob_dim, **vf_params) 215 | 216 | 217 | YOUR_CODE_HERE 218 | 219 | 220 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 221 | 222 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 223 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 224 | 225 | sess = tf.Session() 226 | sess.__enter__() # equivalent to `with sess:` 227 | tf.global_variables_initializer().run() #pylint: disable=E1101 228 | 229 | total_timesteps = 0 230 | stepsize = initial_stepsize 231 | 232 | for i in range(n_iter): 233 | print("********** Iteration %i ************"%i) 234 | 235 | YOUR_CODE_HERE 236 | 237 | if kl > desired_kl * 2: 238 | stepsize /= 1.5 239 | print('stepsize -> %s'%stepsize) 240 | elif kl < desired_kl / 2: 241 | stepsize *= 1.5 242 | print('stepsize -> %s'%stepsize) 243 | else: 244 | print('stepsize OK') 245 | 246 | 247 | # Log diagnostics 248 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 249 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 250 | logz.log_tabular("KLOldNew", kl) 251 | logz.log_tabular("Entropy", ent) 252 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 253 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 254 | logz.log_tabular("TimestepsSoFar", total_timesteps) 255 | # If you're overfitting, EVAfter will be way larger than EVBefore. 
256 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 257 | logz.dump_tabular() 258 | 259 | 260 | def main_pendulum1(d): 261 | return main_pendulum(**d) 262 | 263 | if __name__ == "__main__": 264 | if 1: 265 | main_cartpole(logdir=None) # when you want to start collecting results, set the logdir 266 | if 0: 267 | general_params = dict(gamma=0.97, animate=False, min_timesteps_per_batch=2500, n_iter=300, initial_stepsize=1e-3) 268 | params = [ 269 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 270 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 271 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 272 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 273 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 274 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 275 | ] 276 | import multiprocessing 277 | p = multiprocessing.Pool() 278 | p.map(main_pendulum1, params) 279 | -------------------------------------------------------------------------------- /hw4/plot_learning_curves.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser() 3 | parser.add_argument("expdir", help="experiment dir, e.g., /tmp/experiments") 4 | args = parser.parse_args() 5 | 6 | from pylab import * 7 | import os 8 | from os.path import join 9 | 10 | dirnames = os.listdir(args.expdir) 11 | 12 | fig, axes = subplots(4) 13 | for dirname in dirnames: 14 | print(dirname) 15 | A = np.genfromtxt(join(args.expdir, dirname, 'log.txt'),delimiter='\t',dtype=None, names=True) 16 | # axes[0].plot(scipy.signal.savgol_filter(A['EpRewMean'] , 21, 3), '-x') 17 | x = A['TimestepsSoFar'] 18 | axes[0].plot(x, A['EpRewMean'], '-x') 19 | axes[1].plot(x, A['KLOldNew'], '-x') 20 | axes[2].plot(x, A['Entropy'], '-x') 21 | axes[3].plot(x, A['EVBefore'], '-x') 22 | legend(dirnames,loc='best').draggable() 23 | axes[0].set_ylabel("EpRewMean") 24 | axes[1].set_ylabel("KLOldNew") 25 | axes[2].set_ylabel("Entropy") 26 | axes[3].set_ylabel("EVBefore") 27 | axes[3].set_ylim(-1,1) 28 | axes[-1].set_xlabel("Iterations") 29 | show() 30 | --------------------------------------------------------------------------------
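Finally, `main.py` leaves `NnValueFunction` as a stub. Below is a minimal sketch of one way to implement it, matching the `fit`/`predict` interface of `LinearValueFunction` and the `vf_params=dict(n_epochs=10, stepsize=1e-3)` passed at the bottom of `main.py`, and reusing the `dense`, `lrelu`, and `normc_initializer` helpers defined there. It is an illustration, not the reference solution:

```python
class NnValueFunction(object):
    """Small MLP state-value baseline with the same fit/predict interface as LinearValueFunction."""
    def __init__(self, ob_dim, n_epochs=10, stepsize=1e-3):
        self.n_epochs = n_epochs
        with tf.variable_scope("nnvf"):
            self.sy_ob = tf.placeholder(shape=[None, ob_dim], dtype=tf.float32)
            self.sy_vtarg = tf.placeholder(shape=[None], dtype=tf.float32)
            h1 = lrelu(dense(self.sy_ob, 64, "h1", weight_init=normc_initializer(1.0)))
            h2 = lrelu(dense(h1, 64, "h2", weight_init=normc_initializer(1.0)))
            self.sy_vpred = dense(h2, 1, "out", weight_init=normc_initializer(0.1))[:, 0]
            loss = tf.reduce_mean(tf.square(self.sy_vpred - self.sy_vtarg))
            self.train_op = tf.train.AdamOptimizer(stepsize).minimize(loss)

    def fit(self, X, y):
        # Runs a few epochs of full-batch regression toward the discounted returns.
        sess = tf.get_default_session()
        for _ in range(self.n_epochs):
            sess.run(self.train_op, feed_dict={self.sy_ob: X, self.sy_vtarg: y})

    def predict(self, X):
        # Before the first fit this returns the randomly initialized network's output,
        # unlike LinearValueFunction, which returns zeros; in practice that is harmless.
        return tf.get_default_session().run(self.sy_vpred, feed_dict={self.sy_ob: X})
```

Because the object is constructed before `tf.global_variables_initializer().run()` is called in `main_pendulum`, both the value-network weights and the Adam state are initialized along with the rest of the graph, and `tf.get_default_session()` resolves to the session entered via `sess.__enter__()`.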