├── .gitignore
├── LICENSE
├── hw1
│   ├── HW1 torch.ipynb
│   ├── HW1.ipynb
│   ├── README.md
│   ├── demo.bash
│   ├── experts
│   │   ├── Ant-v1.pkl
│   │   ├── HalfCheetah-v1.pkl
│   │   ├── Hopper-v1.pkl
│   │   ├── Humanoid-v1.pkl
│   │   ├── Reacher-v1.pkl
│   │   └── Walker2d-v1.pkl
│   ├── hw1fall2017.pdf
│   ├── load_policy.py
│   ├── run_expert.py
│   └── tf_util.py
├── hw2
│   ├── README.MD
│   ├── deliver.sh
│   ├── fig
│   │   ├── 1_cartpole_sb.png
│   │   ├── 2_cartpole_lb.png
│   │   ├── 2_cartpole_sb_lb.png
│   │   ├── 3_pendulum_2x16.png
│   │   ├── 4_nn_baseline.png
│   │   └── 5_hc.png
│   ├── hw2_final.pdf
│   ├── logz.py
│   ├── plot.py
│   └── train_pg.py
├── hw3
│   ├── README
│   ├── atari_wrappers.py
│   ├── dqn.py
│   ├── dqn_utils.py
│   ├── hw3.pdf
│   ├── run_dqn_atari.py
│   └── run_dqn_ram.py
├── hw4
│   ├── README.md
│   ├── cheetah_env.py
│   ├── controllers.py
│   ├── cost_functions.py
│   ├── dynamics.py
│   ├── hw4.pdf
│   ├── logz.py
│   ├── main.py
│   └── plot.py
└── sp17_hw
    ├── hw1
    │   ├── README.md
    │   ├── demo.bash
    │   ├── experts
    │   │   ├── Ant-v1.pkl
    │   │   ├── HalfCheetah-v1.pkl
    │   │   ├── Hopper-v1.pkl
    │   │   ├── Humanoid-v1.pkl
    │   │   ├── Reacher-v1.pkl
    │   │   └── Walker2d-v1.pkl
    │   ├── load_policy.py
    │   ├── run_expert.py
    │   └── tf_util.py
    ├── hw2
    │   ├── HW2.ipynb
    │   ├── discrete_env.py
    │   └── frozen_lake.py
    ├── hw3
    │   ├── README
    │   ├── atari_wrappers.py
    │   ├── dqn.py
    │   ├── dqn_utils.py
    │   ├── run_dqn_atari.py
    │   └── run_dqn_ram.py
    └── hw4
        ├── homework.md
        ├── logz.py
        ├── main.py
        └── plot_learning_curves.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # HW2 data 2 | hw2/data/* 3 | 4 | # General 5 | .DS_Store 6 | .AppleDouble 7 | .LSOverride 8 | 9 | # Icon must end with two \r 10 | Icon 11 | 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | # Byte-compiled / optimized / DLL files 33 | __pycache__/ 34 | *.py[cod] 35 | *$py.class 36 | 37 | # C extensions 38 | *.so 39 | 40 | # Distribution / packaging 41 | .Python 42 | env/ 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | *.egg-info/ 55 | .installed.cfg 56 | *.egg 57 | 58 | # PyInstaller 59 | # Usually these files are written by a python script from a template 60 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
61 | *.manifest 62 | *.spec 63 | 64 | # Installer logs 65 | pip-log.txt 66 | pip-delete-this-directory.txt 67 | 68 | # Unit test / coverage reports 69 | htmlcov/ 70 | .tox/ 71 | .coverage 72 | .coverage.* 73 | .cache 74 | nosetests.xml 75 | coverage.xml 76 | *,cover 77 | .hypothesis/ 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | 87 | # Flask stuff: 88 | instance/ 89 | .webassets-cache 90 | 91 | # Scrapy stuff: 92 | .scrapy 93 | 94 | # Sphinx documentation 95 | docs/_build/ 96 | 97 | # PyBuilder 98 | target/ 99 | 100 | # IPython Notebook 101 | .ipynb_checkpoints 102 | 103 | # pyenv 104 | .python-version 105 | 106 | # celery beat schedule file 107 | celerybeat-schedule 108 | 109 | # dotenv 110 | .env 111 | 112 | # virtualenv 113 | venv/ 114 | ENV/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 berkeleydeeprlcourse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 
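For reference, the behavioral-cloning step that the homework builds on top of this roll-out data can be sketched in a few lines of TensorFlow 1.x. This is purely illustrative and not part of the starter code: the pickle file name and the network size are made up, and it assumes the `expert_data` dictionary produced by `run_expert.py` (keys `observations` and `actions`) has been saved to disk.

```python
import pickle
import numpy as np
import tensorflow as tf

# Load roll-out data assumed to have been dumped by a modified run_expert.py.
with open('expert_data_Hopper-v1.pkl', 'rb') as f:   # hypothetical file name
    data = pickle.load(f)
obs = data['observations'].astype(np.float32)                    # (N, obs_dim)
acts = data['actions'].reshape(len(obs), -1).astype(np.float32)  # (N, act_dim)

# Small feed-forward policy trained with mean-squared error on the expert actions.
obs_ph = tf.placeholder(tf.float32, [None, obs.shape[1]])
act_ph = tf.placeholder(tf.float32, [None, acts.shape[1]])
hidden = tf.layers.dense(obs_ph, 64, activation=tf.tanh)
pred_act = tf.layers.dense(hidden, acts.shape[1])
loss = tf.reduce_mean(tf.square(pred_act - act_ph))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        idx = np.random.randint(0, len(obs), size=64)   # mini-batch of expert pairs
        sess.run(train_op, {obs_ph: obs[idx], act_ph: acts[idx]})
```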
19 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /hw1/hw1fall2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw1/hw1fall2017.pdf -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /hw1/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | #import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | # ================================================================ 10 | # Import all names into common namespace 11 | # ================================================================ 12 | 13 | clip = tf.clip_by_value 14 | 15 | # Make consistent with numpy 16 | # ---------------------------------------- 17 | 18 | def sum(x, axis=None, keepdims=False): 19 | return tf.reduce_sum(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 20 | def mean(x, axis=None, keepdims=False): 21 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 22 | def var(x, axis=None, keepdims=False): 23 | meanx = mean(x, axis=axis, keepdims=keepdims) 24 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 25 | def std(x, axis=None, keepdims=False): 26 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 27 | def max(x, axis=None, keepdims=False): 28 | return tf.reduce_max(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 29 | def min(x, axis=None, keepdims=False): 30 | return tf.reduce_min(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 31 | def concatenate(arrs, axis=0): 32 | return tf.concat(axis, arrs) 33 | def argmax(x, axis=None): 34 | return tf.argmax(x, dimension=axis) 35 | 36 | def switch(condition, then_expression, else_expression): 37 | '''Switches between two operations depending on a scalar value (int or bool). 38 | Note that both `then_expression` and `else_expression` 39 | should be symbolic tensors of the *same shape*. 40 | 41 | # Arguments 42 | condition: scalar tensor. 43 | then_expression: TensorFlow operation. 44 | else_expression: TensorFlow operation. 
45 | ''' 46 | x_shape = copy.copy(then_expression.get_shape()) 47 | x = tf.cond(tf.cast(condition, 'bool'), 48 | lambda: then_expression, 49 | lambda: else_expression) 50 | x.set_shape(x_shape) 51 | return x 52 | 53 | # Extras 54 | # ---------------------------------------- 55 | def l2loss(params): 56 | if len(params) == 0: 57 | return tf.constant(0.0) 58 | else: 59 | return tf.add_n([sum(tf.square(p)) for p in params]) 60 | def lrelu(x, leak=0.2): 61 | f1 = 0.5 * (1 + leak) 62 | f2 = 0.5 * (1 - leak) 63 | return f1 * x + f2 * abs(x) 64 | def categorical_sample_logits(X): 65 | # https://github.com/tensorflow/tensorflow/issues/456 66 | U = tf.random_uniform(tf.shape(X)) 67 | return argmax(X - tf.log(-tf.log(U)), axis=1) 68 | 69 | # ================================================================ 70 | # Global session 71 | # ================================================================ 72 | 73 | def get_session(): 74 | return tf.get_default_session() 75 | 76 | def single_threaded_session(): 77 | tf_config = tf.ConfigProto( 78 | inter_op_parallelism_threads=1, 79 | intra_op_parallelism_threads=1) 80 | return tf.Session(config=tf_config) 81 | 82 | def make_session(num_cpu): 83 | tf_config = tf.ConfigProto( 84 | inter_op_parallelism_threads=num_cpu, 85 | intra_op_parallelism_threads=num_cpu) 86 | return tf.Session(config=tf_config) 87 | 88 | 89 | ALREADY_INITIALIZED = set() 90 | def initialize(): 91 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 92 | get_session().run(tf.variables_initializer(new_variables)) 93 | ALREADY_INITIALIZED.update(new_variables) 94 | 95 | 96 | def eval(expr, feed_dict=None): 97 | if feed_dict is None: feed_dict = {} 98 | return get_session().run(expr, feed_dict=feed_dict) 99 | 100 | def set_value(v, val): 101 | get_session().run(v.assign(val)) 102 | 103 | def load_state(fname): 104 | saver = tf.train.Saver() 105 | saver.restore(get_session(), fname) 106 | 107 | def save_state(fname): 108 | os.makedirs(os.path.dirname(fname), exist_ok=True) 109 | saver = tf.train.Saver() 110 | saver.save(get_session(), fname) 111 | 112 | # ================================================================ 113 | # Model components 114 | # ================================================================ 115 | 116 | 117 | def normc_initializer(std=1.0): 118 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 119 | out = np.random.randn(*shape).astype(np.float32) 120 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 121 | return tf.constant(out) 122 | return _initializer 123 | 124 | 125 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 126 | summary_tag=None): 127 | with tf.variable_scope(name): 128 | stride_shape = [1, stride[0], stride[1], 1] 129 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 130 | 131 | # there are "num input feature maps * filter height * filter width" 132 | # inputs to each hidden unit 133 | fan_in = intprod(filter_shape[:3]) 134 | # each unit in the lower layer receives a gradient from: 135 | # "num output feature maps * filter height * filter width" / 136 | # pooling size 137 | fan_out = intprod(filter_shape[:2]) * num_filters 138 | # initialize weights with random weights 139 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 140 | 141 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 142 | collections=collections) 143 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer, 144 | collections=collections) 145 | 146 | if summary_tag is not None: 147 | tf.image_summary(summary_tag, 148 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 149 | [2, 0, 1, 3]), 150 | max_images=10) 151 | 152 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 153 | 154 | 155 | def dense(x, size, name, weight_init=None, bias=True): 156 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 157 | ret = tf.matmul(x, w) 158 | if bias: 159 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer) 160 | return ret + b 161 | else: 162 | return ret 163 | 164 | def wndense(x, size, name, init_scale=1.0): 165 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 166 | initializer=tf.random_normal_initializer(0, 0.05)) 167 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 168 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 169 | 170 | # use weight normalization (Salimans & Kingma, 2016) 171 | x = tf.matmul(x, v) 172 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 173 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 174 | 175 | def densenobias(x, size, name, weight_init=None): 176 | return dense(x, size, name, weight_init=weight_init, bias=False) 177 | 178 | def dropout(x, pkeep, phase=None, mask=None): 179 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 180 | if phase is None: 181 | return mask * x 182 | else: 183 | return switch(phase, mask*x, pkeep*x) 184 | 185 | def batchnorm(x, name, phase, updates, gamma=0.96): 186 | k = x.get_shape()[1] 187 | runningmean = tf.get_variable(name+"/mean", shape=[1, k], initializer=tf.constant_initializer(0.0), trainable=False) 188 | runningvar = tf.get_variable(name+"/var", shape=[1, k], initializer=tf.constant_initializer(1e-4), trainable=False) 189 | testy = (x - runningmean) / tf.sqrt(runningvar) 190 | 191 | mean_ = mean(x, axis=0, keepdims=True) 192 | var_ = mean(tf.square(x), axis=0, keepdims=True) 193 | std = tf.sqrt(var_) 194 | trainy = (x - mean_) / std 195 | 196 | updates.extend([ 197 | tf.assign(runningmean, runningmean * gamma + mean_ * (1 - gamma)), 198 | tf.assign(runningvar, runningvar * gamma + var_ * (1 - gamma)) 199 | ]) 200 | 201 | y = switch(phase, trainy, testy) 202 | 203 | out = y * tf.get_variable(name+"/scaling", shape=[1, k], initializer=tf.constant_initializer(1.0), trainable=True)\ 204 | + tf.get_variable(name+"/translation", shape=[1,k], initializer=tf.constant_initializer(0.0), trainable=True) 205 | return out 206 | 207 | 208 | 209 | # ================================================================ 210 | # Basic Stuff 211 | # ================================================================ 212 | 213 | def function(inputs, outputs, updates=None, givens=None): 214 | if isinstance(outputs, list): 215 | return _Function(inputs, outputs, updates, givens=givens) 216 | elif isinstance(outputs, (dict, collections.OrderedDict)): 217 | f = _Function(inputs, outputs.values(), updates, givens=givens) 218 | return lambda *inputs : type(outputs)(zip(outputs.keys(), f(*inputs))) 219 | else: 220 | f = _Function(inputs, [outputs], updates, givens=givens) 221 | return 
lambda *inputs : f(*inputs)[0] 222 | 223 | class _Function(object): 224 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 225 | assert all(len(i.op.inputs)==0 for i in inputs), "inputs should all be placeholders" 226 | self.inputs = inputs 227 | updates = updates or [] 228 | self.update_group = tf.group(*updates) 229 | self.outputs_update = list(outputs) + [self.update_group] 230 | self.givens = {} if givens is None else givens 231 | self.check_nan = check_nan 232 | def __call__(self, *inputvals): 233 | assert len(inputvals) == len(self.inputs) 234 | feed_dict = dict(zip(self.inputs, inputvals)) 235 | feed_dict.update(self.givens) 236 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 237 | if self.check_nan: 238 | if any(np.isnan(r).any() for r in results): 239 | raise RuntimeError("Nan detected") 240 | return results 241 | 242 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 243 | if isinstance(outputs, list): 244 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 245 | else: 246 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 247 | return lambda *inputs : f(*inputs)[0] 248 | 249 | class _MemFriendlyFunction(object): 250 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 251 | self.nondata_inputs = nondata_inputs 252 | self.data_inputs = data_inputs 253 | self.outputs = list(outputs) 254 | self.batch_size = batch_size 255 | def __call__(self, *inputvals): 256 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 257 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 258 | data_vals = inputvals[len(self.nondata_inputs):] 259 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 260 | n = data_vals[0].shape[0] 261 | for v in data_vals[1:]: 262 | assert v.shape[0] == n 263 | for i_start in range(0, n, self.batch_size): 264 | slice_vals = [v[i_start:min(i_start+self.batch_size, n)] for v in data_vals] 265 | for (var,val) in zip(self.data_inputs, slice_vals): 266 | feed_dict[var]=val 267 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 268 | if i_start==0: 269 | sum_results = results 270 | else: 271 | for i in range(len(results)): 272 | sum_results[i] = sum_results[i] + results[i] 273 | for i in range(len(results)): 274 | sum_results[i] = sum_results[i] / n 275 | return sum_results 276 | 277 | # ================================================================ 278 | # Modules 279 | # ================================================================ 280 | 281 | class Module(object): 282 | def __init__(self, name): 283 | self.name = name 284 | self.first_time = True 285 | self.scope = None 286 | self.cache = {} 287 | def __call__(self, *args): 288 | if args in self.cache: 289 | print("(%s) retrieving value from cache"%self.name) 290 | return self.cache[args] 291 | with tf.variable_scope(self.name, reuse=not self.first_time): 292 | scope = tf.get_variable_scope().name 293 | if self.first_time: 294 | self.scope = scope 295 | print("(%s) running function for the first time"%self.name) 296 | else: 297 | assert self.scope == scope, "Tried calling function with a different scope" 298 | print("(%s) running function on new inputs"%self.name) 299 | self.first_time = False 300 | out = self._call(*args) 301 | self.cache[args] = out 302 | return out 303 | def _call(self, *args): 304 | raise NotImplementedError 305 | 306 | @property 307 | def trainable_variables(self): 308 | assert self.scope is 
not None, "need to call module once before getting variables" 309 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 310 | 311 | @property 312 | def variables(self): 313 | assert self.scope is not None, "need to call module once before getting variables" 314 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 315 | 316 | 317 | def module(name): 318 | @functools.wraps 319 | def wrapper(f): 320 | class WrapperModule(Module): 321 | def _call(self, *args): 322 | return f(*args) 323 | return WrapperModule(name) 324 | return wrapper 325 | 326 | # ================================================================ 327 | # Graph traversal 328 | # ================================================================ 329 | 330 | VARIABLES = {} 331 | 332 | 333 | def get_parents(node): 334 | return node.op.inputs 335 | 336 | def topsorted(outputs): 337 | """ 338 | Topological sort via non-recursive depth-first search 339 | """ 340 | assert isinstance(outputs, (list,tuple)) 341 | marks = {} 342 | out = [] 343 | stack = [] #pylint: disable=W0621 344 | # i: node 345 | # jidx = number of children visited so far from that node 346 | # marks: state of each node, which is one of 347 | # 0: haven't visited 348 | # 1: have visited, but not done visiting children 349 | # 2: done visiting children 350 | for x in outputs: 351 | stack.append((x,0)) 352 | while stack: 353 | (i,jidx) = stack.pop() 354 | if jidx == 0: 355 | m = marks.get(i,0) 356 | if m == 0: 357 | marks[i] = 1 358 | elif m == 1: 359 | raise ValueError("not a dag") 360 | else: 361 | continue 362 | ps = get_parents(i) 363 | if jidx == len(ps): 364 | marks[i] = 2 365 | out.append(i) 366 | else: 367 | stack.append((i,jidx+1)) 368 | j = ps[jidx] 369 | stack.append((j,0)) 370 | return out 371 | 372 | 373 | # ================================================================ 374 | # Flat vectors 375 | # ================================================================ 376 | 377 | def var_shape(x): 378 | out = [k.value for k in x.get_shape()] 379 | assert all(isinstance(a, int) for a in out), \ 380 | "shape function assumes that shape is fully known" 381 | return out 382 | 383 | def numel(x): 384 | return intprod(var_shape(x)) 385 | 386 | def intprod(x): 387 | return int(np.prod(x)) 388 | 389 | def flatgrad(loss, var_list): 390 | grads = tf.gradients(loss, var_list) 391 | return tf.concat(0, [tf.reshape(grad, [numel(v)]) 392 | for (v, grad) in zip(var_list, grads)]) 393 | 394 | class SetFromFlat(object): 395 | def __init__(self, var_list, dtype=tf.float32): 396 | assigns = [] 397 | shapes = list(map(var_shape, var_list)) 398 | total_size = np.sum([intprod(shape) for shape in shapes]) 399 | 400 | self.theta = theta = tf.placeholder(dtype,[total_size]) 401 | start=0 402 | assigns = [] 403 | for (shape,v) in zip(shapes,var_list): 404 | size = intprod(shape) 405 | assigns.append(tf.assign(v, tf.reshape(theta[start:start+size],shape))) 406 | start+=size 407 | self.op = tf.group(*assigns) 408 | def __call__(self, theta): 409 | get_session().run(self.op, feed_dict={self.theta:theta}) 410 | 411 | class GetFlat(object): 412 | def __init__(self, var_list): 413 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 414 | def __call__(self): 415 | return get_session().run(self.op) 416 | 417 | # ================================================================ 418 | # Misc 419 | # ================================================================ 420 | 421 | 422 | def fancy_slice_2d(X, inds0, inds1): 423 | """ 424 | like numpy 
X[inds0, inds1] 425 | XXX this implementation is bad 426 | """ 427 | inds0 = tf.cast(inds0, tf.int64) 428 | inds1 = tf.cast(inds1, tf.int64) 429 | shape = tf.cast(tf.shape(X), tf.int64) 430 | ncols = shape[1] 431 | Xflat = tf.reshape(X, [-1]) 432 | return tf.gather(Xflat, inds0 * ncols + inds1) 433 | 434 | 435 | def scope_vars(scope, trainable_only): 436 | """ 437 | Get variables inside a scope 438 | The scope can be specified as a string 439 | """ 440 | return tf.get_collection( 441 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 442 | scope=scope if isinstance(scope, str) else scope.name 443 | ) 444 | 445 | def lengths_to_mask(lengths_b, max_length): 446 | """ 447 | Turns a vector of lengths into a boolean mask 448 | 449 | Args: 450 | lengths_b: an integer vector of lengths 451 | max_length: maximum length to fill the mask 452 | 453 | Returns: 454 | a boolean array of shape (batch_size, max_length) 455 | row[i] consists of True repeated lengths_b[i] times, followed by False 456 | """ 457 | lengths_b = tf.convert_to_tensor(lengths_b) 458 | assert lengths_b.get_shape().ndims == 1 459 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 460 | return mask_bt 461 | 462 | 463 | def in_session(f): 464 | @functools.wraps(f) 465 | def newfunc(*args, **kwargs): 466 | with tf.Session(): 467 | f(*args, **kwargs) 468 | return newfunc 469 | 470 | 471 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 472 | def get_placeholder(name, dtype, shape): 473 | print("calling get_placeholder", name) 474 | if name in _PLACEHOLDER_CACHE: 475 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 476 | assert dtype1==dtype and shape1==shape 477 | return out 478 | else: 479 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 480 | _PLACEHOLDER_CACHE[name] = (out,dtype,shape) 481 | return out 482 | def get_placeholder_cached(name): 483 | return _PLACEHOLDER_CACHE[name][0] 484 | 485 | def flattenallbut0(x): 486 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 487 | 488 | def reset(): 489 | global _PLACEHOLDER_CACHE 490 | global VARIABLES 491 | _PLACEHOLDER_CACHE = {} 492 | VARIABLES = {} 493 | tf.reset_default_graph() 494 | -------------------------------------------------------------------------------- /hw2/README.MD: -------------------------------------------------------------------------------- 1 | # Homework 2 Report, Policy Gradient Deep RL 2 | 3 | _Author_ Kay Ke 4 | 5 | _Email_ kayke@uw.edu 6 | 7 | _Update_ Sep 26, 2017. 8 | 9 | ## Summary 10 | 11 | For this homework, I have completed implementing policy gradient for both discrete and continuous action spaces, implementing reward to go (discount reward by current timestep instead of the begining of the trajectory), implementing advantage normalization and implementing neural network baseline (set baseline to predictions of NN rather than mean of sampled trajectory). I have shown that the algorithm could converge to the optimal score of 200 in CartPole game. I compared the performances for CartPole game using different parameters including network sizes, batches size, turning on/off reward to go and advantage normalization. I have shown that the algorithm could converge to the optimal score of 1000 in the InvertedPendulum game (1D, continuous action space) in 100 iterations, and could achieve an average score > 150 in 100 iterations for HalfCheetah game. 12 | 13 | At the bottom of this report I have attached personal "take-away" summaries for this project. 
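To make the reward-to-go modification described above concrete, the sketch below contrasts the two ways of assigning returns to time steps for a single trajectory of rewards. It is an illustrative stand-alone snippet, not the actual `train_pg.py` implementation.

```python
import numpy as np

def full_trajectory_return(rewards, gamma):
    # every time step is credited with the discounted sum from t = 0 (the "vanilla" flavor)
    total = sum((gamma ** t) * r for t, r in enumerate(rewards))
    return np.full(len(rewards), total)

def reward_to_go(rewards, gamma):
    # time step t is credited only with rewards from t onward
    rtg = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg
```

Dropping the rewards that precede time step t removes terms the action at t cannot influence, which lowers the variance of the gradient estimate without changing its expectation.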
14 | 15 | ## Answers 16 | 17 | 1. Compare the learning curves for the CartPole small batches (batch size 1000). There are three different settings: vanilla flavor (sb_no_rtg_dna), with reward to go (sb_rtg_dna), with reward to go and advantage normalization (sb_rtg_na). 18 | 19 | ![](fig/1_cartpole_sb.png) 20 | 21 | 2. Compare the learning curves for the CartPole large batches (batch size 5000). There are three different settings: vanilla flavor (lb_no_rtg_dna), with reward to go (lb_rtg_dna), with reward to go and advantage normalization (lb_rtg_na). 22 | 23 | ![](fig/2_cartpole_lb.png) 24 | 25 | 3. Compare the learning curves for small and large batches. 26 | 27 | ![](fig/2_cartpole_sb_lb.png) 28 | 29 | 4. Command lines that generated the images above 30 | 31 | ```Bash 32 | # Produce experiments results for CartPole 33 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -dna --exp_name sb_no_rtg_dna 34 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg -dna --exp_name sb_rtg_dna 35 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg --exp_name sb_rtg_na 36 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -dna --exp_name lb_no_rtg_dna 37 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg -dna --exp_name lb_rtg_dna 38 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg --exp_name lb_rtg_na 39 | ``` 40 | 41 | ​ 42 | 43 | 5. Answer questions 44 | 45 | - Which gradient estimator has better performance without advantage-centering: the trajectory-centric one, or the one using reward-to-go? 46 | 47 | **From the figures for small / large batches, reward-to-go seems to learn faster from the begining and performs better**. 48 | 49 | - Did advantage centering help? 50 | 51 | **Didn't help in my set up, with reward to go turned on.** 52 | 53 | - Describe what you expected from the math—do the empirical results match the theory? 54 | 55 | **TODO** 56 | 57 | - Did the batch size make an impact? 58 | 59 | **Yes, the larger batch size results in a smoother learning curve (less variations) and seems to learn faster (in terms of # of iterations)** 60 | 61 | 6. Display a learning curve for InvertedPendulum-v1 62 | 63 | ```bash 64 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1000 -e 5 -rtg --exp_name ip_sb_rtg_na --learning_rate 1e-2 --n_layers 2 --size 16 65 | ``` 66 | 67 | ![](fig/3_pendulum_2x16.png) 68 | 69 | 7. Implement NN Baseline 70 | 71 | Contrary to expectation, this does not seem to make a big impact on the variations. It's possible that this resulted from the small batch size I used? 72 | 73 | ```bash 74 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg --exp_name ip_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 75 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg -bl --exp_name ip_bl_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 76 | ``` 77 | 78 | ![4_NN_Baseline](fig/4_nn_baseline.png) 79 | 80 | 8. HalfCheetah achieved average score of >150 in 100 iterations. 81 | 82 | ```bash 83 | python train_pg.py HalfCheetah-v1 -ep 150 --discount 0.9 --exp_name hc2x32x15000x2e2 -n 100 -b 50000 -e 1 --learning_rate 4e-2 -rtg --n_layers 2 --size 32 --seed 17 84 | # The performance varies across seed a lot. This seed is selected because it performs well. Seeds tried include 27, 37, 47, 57. None could achieve >150 in 100 iterations. 85 | ``` 86 | 87 | ![](fig/5_hc.png) 88 | 89 | ​ 90 | 91 | ## Takeaway 92 | 93 | 1. 
Large batch sizes stabilize the learning process: the training loss decreases more smoothly, with less variation, and training potentially speeds up (in terms of the number of iterations). 94 | 2. Reward-to-go speeds up training, especially in games whose episodes can run indefinitely. 95 | 3. Advantage normalization should stabilize training on paper, but in practice it does not necessarily help (a short illustrative sketch follows the file listings below). 96 | 4. The network does not need to be very deep or large. A small network (for HalfCheetah-v1, 2 layers of 32 units in this case) can be enough. Prioritize tuning the other training parameters. 97 | 5. Miscellaneous TensorFlow notes: 98 | 1. `tf.nn.softmax_cross_entropy_with_logits` expects unscaled logits, not an output layer that has already been softmax-activated. See [Official Doc](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits). 99 | 2. Use `tf.nn.sparse_softmax_cross_entropy_with_logits` to skip one-hot encoding. 100 | 3. `tf.train.AdamOptimizer` is generally preferred over `GradientDescentOptimizer`; see [this post](https://stats.stackexchange.com/questions/184448/difference-between-gradientdescentoptimizer-and-adamoptimizer-tensorflow). At a high level, it adapts the step size (learning rate) using moving averages of the gradient and its square. 101 | 4. Fetch the desired tensors with `sess.run([tensor_a,b,c,d], feed_dict={})`. See [Official Doc](https://www.tensorflow.org/versions/r0.12/api_docs/python/client/session_management#Session.run). Run a network update by fetching the update op: `sess.run(update_op, feed_dict={...})`. -------------------------------------------------------------------------------- /hw2/deliver.sh: -------------------------------------------------------------------------------- 1 | # Produce experiments results for CartPole 2 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -dna --exp_name sb_no_rtg_dna 3 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg -dna --exp_name sb_rtg_dna 4 | python train_pg.py CartPole-v0 -n 100 -b 1000 -e 5 -rtg --exp_name sb_rtg_na 5 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -dna --exp_name lb_no_rtg_dna 6 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg -dna --exp_name lb_rtg_dna 7 | python train_pg.py CartPole-v0 -n 100 -b 5000 -e 5 -rtg --exp_name lb_rtg_na 8 | 9 | # Produce plot from results 10 | python plot.py data/sb_rtg_na data/sb_rtg_dna data/sb_no_rtg_dna 11 | python plot.py data/lb_rtg_na data/lb_rtg_dna data/lb_no_rtg_dna 12 | python plot.py data/sb_rtg_dna data/lb_rtg_dna 13 | 14 | # Produce experiments results for InvertedPendulum-v1 15 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg --exp_name ip_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 16 | python plot.py data/ip_ 17 | 18 | # NN Baseline 19 | python train_pg.py InvertedPendulum-v1 -n 100 -b 1500 -e 3 -rtg -bl --exp_name ip_bl_rtg_na --learning_rate 3e-2 --n_layers 2 --size 16 --seed 13 20 | 21 | # Cheetah 22 | # basic 23 | python train_pg.py HalfCheetah-v1 -ep 150 --discount 0.9 --exp_name hc2x64 -n 100 -b 5000 -e 1 --learning_rate 5e-2 -rtg --n_layers 2 --size 64 --seed 17 24 | # tune 25 | python train_pg.py HalfCheetah-v1 -ep 150 --discount 0.9 --exp_name hc2x32x15000x2e2 -n 100 -b 50000 -e 5 --learning_rate 4e-2 -rtg --n_layers 2 --size 32 --seed 17 -------------------------------------------------------------------------------- /hw2/fig/1_cartpole_sb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/1_cartpole_sb.png
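The advantage-normalization sketch referenced in takeaway 3 above: before the advantages multiply the log-probabilities in the policy-gradient loss, they are standardized across the sampled batch. Illustrative only, not the report's `train_pg.py` code.

```python
import numpy as np

def normalize_advantages(adv, eps=1e-8):
    # rescale to zero mean and unit standard deviation across the batch;
    # this keeps the gradient scale roughly constant from iteration to iteration
    return (adv - adv.mean()) / (adv.std() + eps)
```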
-------------------------------------------------------------------------------- /hw2/fig/2_cartpole_lb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/2_cartpole_lb.png -------------------------------------------------------------------------------- /hw2/fig/2_cartpole_sb_lb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/2_cartpole_sb_lb.png -------------------------------------------------------------------------------- /hw2/fig/3_pendulum_2x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/3_pendulum_2x16.png -------------------------------------------------------------------------------- /hw2/fig/4_nn_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/4_nn_baseline.png -------------------------------------------------------------------------------- /hw2/fig/5_hc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/fig/5_hc.png -------------------------------------------------------------------------------- /hw2/hw2_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw2/hw2_final.pdf -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. 
Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/f17docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 
12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | import logging 11 | 12 | def d(s): 13 | logging.getLogger('dqn').debug(s) 14 | 15 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 16 | 17 | def learn(env, 18 | q_func, 19 | optimizer_spec, 20 | session, 21 | exploration=LinearSchedule(1000000, 0.1), 22 | stopping_criterion=None, 23 | replay_buffer_size=1000000, 24 | batch_size=32, 25 | gamma=0.99, 26 | learning_starts=50000, 27 | learning_freq=4, 28 | frame_history_len=4, 29 | target_update_freq=10000, 30 | grad_norm_clipping=10): 31 | 32 | """Run Deep Q-learning algorithm. 33 | 34 | You can specify your own convnet using q_func. 35 | 36 | All schedules are w.r.t. total number of steps taken in the environment. 37 | 38 | Parameters 39 | ---------- 40 | env: gym.Env 41 | gym environment to train on. 42 | q_func: function 43 | Model to use for computing the q function. It should accept the 44 | following named arguments: 45 | img_in: tf.Tensor 46 | tensorflow tensor representing the input image 47 | num_actions: int 48 | number of actions 49 | scope: str 50 | scope in which all the model related variables 51 | should be created 52 | reuse: bool 53 | whether previously created variables should be reused. 
54 | optimizer_spec: OptimizerSpec 55 | Specifying the constructor and kwargs, as well as learning rate schedule 56 | for the optimizer 57 | session: tf.Session 58 | tensorflow session to use. 59 | exploration: rl_algs.deepq.utils.schedules.Schedule 60 | schedule for probability of chosing random action. 61 | stopping_criterion: (env, t) -> bool 62 | should return true when it's ok for the RL algorithm to stop. 63 | takes in env and the number of steps executed so far. 64 | replay_buffer_size: int 65 | How many memories to store in the replay buffer. 66 | batch_size: int 67 | How many transitions to sample each time experience is replayed. 68 | gamma: float 69 | Discount Factor 70 | learning_starts: int 71 | After how many environment steps to start replaying experiences 72 | learning_freq: int 73 | How many steps of environment to take between every experience replay 74 | frame_history_len: int 75 | How many past frames to include as input to the model. 76 | target_update_freq: int 77 | How many experience replay rounds (not steps!) to perform between 78 | each update to the target Q network 79 | grad_norm_clipping: float or None 80 | If not None gradients' norms are clipped to this value. 81 | """ 82 | assert type(env.observation_space) == gym.spaces.Box 83 | assert type(env.action_space) == gym.spaces.Discrete 84 | 85 | ############### 86 | # BUILD MODEL # 87 | ############### 88 | 89 | if len(env.observation_space.shape) == 1: 90 | # This means we are running on low-dimensional observations (e.g. RAM) 91 | input_shape = env.observation_space.shape 92 | else: 93 | img_h, img_w, img_c = env.observation_space.shape 94 | input_shape = (img_h, img_w, frame_history_len * img_c) 95 | num_actions = env.action_space.n 96 | 97 | d('input_shape = {}'.format(input_shape)) 98 | d('num_actions = {}'.format(num_actions)) 99 | 100 | # set up placeholders 101 | # placeholder for current observation (or state) 102 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 103 | # placeholder for current action 104 | act_t_ph = tf.placeholder(tf.int32, [None]) 105 | # placeholder for current reward 106 | rew_t_ph = tf.placeholder(tf.float32, [None]) 107 | # placeholder for next observation (or state) 108 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 109 | # placeholder for end of episode mask 110 | # this value is 1 if the next state corresponds to the end of an episode, 111 | # in which case there is no Q-value at the next state; at the end of an 112 | # episode, only the current state reward contributes to the target, not the 113 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 114 | done_mask_ph = tf.placeholder(tf.float32, [None]) 115 | 116 | # casting to float on GPU ensures lower data transfer times. 117 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 118 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 119 | 120 | # Here, you should fill in your own code to compute the Bellman error. This requires 121 | # evaluating the current and next Q-values and constructing the corresponding error. 122 | # TensorFlow will differentiate this error for you, you just need to pass it to the 123 | # optimizer. See assignment text for details. 124 | # Your code should produce one scalar-valued tensor: total_error 125 | # This will be passed to the optimizer in the provided code below. 
126 | # Your code should also produce two collections of variables: 127 | # q_func_vars 128 | # target_q_func_vars 129 | # These should hold all of the variables of the Q-function network and target network, 130 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 131 | # For example, you can create your Q-function network with the scope "q_func" like this: 132 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 133 | # And then you can obtain the variables like this: 134 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 135 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 136 | ###### 137 | 138 | # YOUR CODE HERE 139 | 140 | # Q values 141 | pred_q = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 142 | pred_ac = tf.argmax(pred_q, axis=1) 143 | pred_q_a = tf.reduce_sum(pred_q * tf.one_hot(act_t_ph, depth=num_actions), axis=1) 144 | 145 | # Target 146 | target_q = q_func(obs_tp1_float, num_actions, scope="q_func_target", reuse=False) 147 | target_q_a = rew_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max(target_q, axis=1) 148 | 149 | # Loss 150 | #total_error = huber_loss(pred_q_a, target_q_a) 151 | #total_error = tf.nn.l2_loss(pred_q_a - target_q_a) 152 | total_error = 0.5 * tf.reduce_sum(tf.square(pred_q_a - tf.stop_gradient(target_q_a))) 153 | 154 | # Get variables 155 | q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 156 | target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func_target') 157 | 158 | d("pred_q = {}".format(pred_q)) 159 | d("target_q = {}".format(target_q)) 160 | 161 | d("pred_ac = {}".format(pred_ac)) 162 | d("pred_q_a = {}".format(pred_q_a)) 163 | d("target_q_a = {}".format(target_q_a)) 164 | d("total_error = {}".format(total_error)) 165 | 166 | ###### 167 | 168 | # construct optimization op (with gradient clipping) 169 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 170 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 171 | train_fn = minimize_and_clip(optimizer, total_error, 172 | var_list=q_func_vars, clip_val=grad_norm_clipping) 173 | 174 | # update_target_fn will be called periodically to copy Q network to target Q network 175 | update_target_fn = [] 176 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 177 | sorted(target_q_func_vars, key=lambda v: v.name)): 178 | update_target_fn.append(var_target.assign(var)) 179 | update_target_fn = tf.group(*update_target_fn) 180 | 181 | # construct the replay buffer 182 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 183 | 184 | ############### 185 | # RUN ENV # 186 | ############### 187 | model_initialized = False 188 | num_param_updates = 0 189 | mean_episode_reward = -float('nan') 190 | best_mean_episode_reward = -float('inf') 191 | last_obs = env.reset() 192 | LOG_EVERY_N_STEPS = 10000 193 | 194 | for t in itertools.count(): 195 | ### 1. Check stopping criterion 196 | if stopping_criterion is not None and stopping_criterion(env, t): 197 | break 198 | 199 | ### 2. Step the env and store the transition 200 | # At this point, "last_obs" contains the latest observation that was 201 | # recorded from the simulator. Here, your code needs to store this 202 | # observation and its outcome (reward, next observation, etc.) into 203 | # the replay buffer while stepping the simulator forward one step. 
204 | # At the end of this block of code, the simulator should have been 205 | # advanced one step, and the replay buffer should contain one more 206 | # transition. 207 | # Specifically, last_obs must point to the new latest observation. 208 | # Useful functions you'll need to call: 209 | # obs, reward, done, info = env.step(action) 210 | # this steps the environment forward one step 211 | # obs = env.reset() 212 | # this resets the environment if you reached an episode boundary. 213 | # Don't forget to call env.reset() to get a new observation if done 214 | # is true!! 215 | # Note that you cannot use "last_obs" directly as input 216 | # into your network, since it needs to be processed to include context 217 | # from previous frames. You should check out the replay buffer 218 | # implementation in dqn_utils.py to see what functionality the replay 219 | # buffer exposes. The replay buffer has a function called 220 | # encode_recent_observation that will take the latest observation 221 | # that you pushed into the buffer and compute the corresponding 222 | # input that should be given to a Q network by appending some 223 | # previous frames. 224 | # Don't forget to include epsilon greedy exploration! 225 | # And remember that the first time you enter this loop, the model 226 | # may not yet have been initialized (but of course, the first step 227 | # might as well be random, since you haven't trained your net...) 228 | 229 | ##### 230 | 231 | # YOUR CODE HERE 232 | idx = replay_buffer.store_frame(last_obs) 233 | 234 | if not model_initialized or random.random() < exploration.value(t): 235 | action = random.randint(0, num_actions-1) 236 | else: 237 | obs = replay_buffer.encode_recent_observation() 238 | action = session.run(pred_ac, {obs_t_ph: [obs]})[0] 239 | 240 | next_obs, reward, done, info = env.step(action) 241 | replay_buffer.store_effect(idx, action, reward, done) 242 | last_obs = env.reset() if done else next_obs 243 | 244 | ##### 245 | 246 | # at this point, the environment should have been advanced one step (and 247 | # reset if done was true), and last_obs should point to the new latest 248 | # observation 249 | 250 | ### 3. Perform experience replay and train the network. 251 | # note that this is only done if the replay buffer contains enough samples 252 | # for us to learn something useful -- until then, the model will not be 253 | # initialized and random actions should be taken 254 | if (t > learning_starts and 255 | t % learning_freq == 0 and 256 | replay_buffer.can_sample(batch_size)): 257 | # Here, you should perform training. Training consists of four steps: 258 | # 3.a: use the replay buffer to sample a batch of transitions (see the 259 | # replay buffer code for function definition, each batch that you sample 260 | # should consist of current observations, current actions, rewards, 261 | # next observations, and done indicator). 262 | # 3.b: initialize the model if it has not been initialized yet; to do 263 | # that, call 264 | # initialize_interdependent_variables(session, tf.global_variables(), { 265 | # obs_t_ph: obs_t_batch, 266 | # obs_tp1_ph: obs_tp1_batch, 267 | # }) 268 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 269 | # the current and next time step. The boolean variable model_initialized 270 | # indicates whether or not the model has been initialized. 271 | # Remember that you have to update the target network too (see 3.d)! 272 | # 3.c: train the model. 
To do this, you'll need to use the train_fn and 273 | # total_error ops that were created earlier: total_error is what you 274 | # created to compute the total Bellman error in a batch, and train_fn 275 | # will actually perform a gradient step and update the network parameters 276 | # to reduce total_error. When calling session.run on these you'll need to 277 | # populate the following placeholders: 278 | # obs_t_ph 279 | # act_t_ph 280 | # rew_t_ph 281 | # obs_tp1_ph 282 | # done_mask_ph 283 | # (this is needed for computing total_error) 284 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 285 | # (this is needed by the optimizer to choose the learning rate) 286 | # 3.d: periodically update the target network by calling 287 | # session.run(update_target_fn) 288 | # you should update every target_update_freq steps, and you may find the 289 | # variable num_param_updates useful for this (it was initialized to 0) 290 | ##### 291 | 292 | # 3.a sample a batch of transitions 293 | obs_batch, act_batch, rew_batch, next_obs_batch, done_batch = replay_buffer.sample(batch_size) 294 | 295 | # 3.b initialize the model if haven't 296 | if not model_initialized: 297 | initialize_interdependent_variables(session, tf.global_variables(), { 298 | obs_t_ph: obs_batch, 299 | obs_tp1_ph: next_obs_batch, 300 | }) 301 | session.run(update_target_fn) 302 | model_initialized = True 303 | 304 | # 3.c train the model 305 | _, error = session.run([train_fn, total_error], { 306 | obs_t_ph: obs_batch, 307 | act_t_ph: act_batch, 308 | rew_t_ph: rew_batch, 309 | obs_tp1_ph: next_obs_batch, 310 | done_mask_ph: done_batch, 311 | learning_rate: optimizer_spec.lr_schedule.value(t) 312 | }) 313 | 314 | # 3.d periodically update the target network 315 | if t % target_update_freq == 0: 316 | # Use t here instead of num_param_updates 317 | # Under the default hyperparameter 318 | # this will speed up learning performance 319 | # Or you can set target_update_freq to less 320 | session.run(update_target_fn) 321 | num_param_updates += 1 322 | 323 | ##### 324 | 325 | ### 4. Log progress 326 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 327 | if len(episode_rewards) > 0: 328 | mean_episode_reward = np.mean(episode_rewards[-100:]) 329 | if len(episode_rewards) > 100: 330 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 331 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 332 | print("Timestep %d" % (t,)) 333 | print("mean reward (100 episodes) %f" % mean_episode_reward) 334 | print("best mean reward %f" % best_mean_episode_reward) 335 | print("episodes %d" % len(episode_rewards)) 336 | print("exploration %f" % exploration.value(t)) 337 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 338 | print("total error %f" % error) 339 | sys.stdout.flush() 340 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.where( # Tensorflow >= 1.0 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. 
Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 
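        For example (illustrative): LinearSchedule(100, 0.1).value(50) is halfway
        through the anneal from initial_p=1.0 (the default) to final_p=0.1, i.e.
        0.55 up to floating point, and value(t) stays at 0.1 for all t >= 100.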
90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 
177 | 178 | The sepecific memory optimizations use here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-performance 182 | to cast them back to float32 on GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the tipical use case in Atari Deep RL buffer with 1M frames the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning frame of zeros at the beginning 190 | of the episode, when there is less frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retried for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | def sample(self, batch_size): 226 | """Sample `batch_size` different transitions. 227 | 228 | i-th sample transition is the following: 229 | 230 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 231 | after which reward `rew_batch[i]` was received and subsequent 232 | observation next_obs_batch[i] was observed, unless the epsiode 233 | was done which is represented by `done_mask[i]` which is equal 234 | to 1 if episode has ended as a result of that action. 235 | 236 | Parameters 237 | ---------- 238 | batch_size: int 239 | How many transitions to sample. 240 | 241 | Returns 242 | ------- 243 | obs_batch: np.array 244 | Array of shape 245 | (batch_size, img_h, img_w, img_c * frame_history_len) 246 | and dtype np.uint8 247 | act_batch: np.array 248 | Array of shape (batch_size,) and dtype np.int32 249 | rew_batch: np.array 250 | Array of shape (batch_size,) and dtype np.float32 251 | next_obs_batch: np.array 252 | Array of shape 253 | (batch_size, img_h, img_w, img_c * frame_history_len) 254 | and dtype np.uint8 255 | done_mask: np.array 256 | Array of shape (batch_size,) and dtype np.float32 257 | """ 258 | assert self.can_sample(batch_size) 259 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 260 | return self._encode_sample(idxes) 261 | 262 | def encode_recent_observation(self): 263 | """Return the most recent `frame_history_len` frames. 
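        For example, with the Atari image wrappers used in run_dqn_atari.py
        (84x84 single-channel frames, frame_history_len=4) this is an
        (84, 84, 4) uint8 array; frames missing at the start of an episode
        are zero-padded (see _encode_observation below).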
264 | 265 | Returns 266 | ------- 267 | observation: np.array 268 | Array of shape (img_h, img_w, img_c * frame_history_len) 269 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 270 | encodes frame at time `t - frame_history_len + i` 271 | """ 272 | assert self.num_in_buffer > 0 273 | return self._encode_observation((self.next_idx - 1) % self.size) 274 | 275 | def _encode_observation(self, idx): 276 | end_idx = idx + 1 # make noninclusive 277 | start_idx = end_idx - self.frame_history_len 278 | # this checks if we are using low-dimensional observations, such as RAM 279 | # state, in which case we just directly return the latest RAM. 280 | if len(self.obs.shape) == 2: 281 | return self.obs[end_idx-1] 282 | # if there weren't enough frames ever in the buffer for context 283 | if start_idx < 0 and self.num_in_buffer != self.size: 284 | start_idx = 0 285 | for idx in range(start_idx, end_idx - 1): 286 | if self.done[idx % self.size]: 287 | start_idx = idx + 1 288 | missing_context = self.frame_history_len - (end_idx - start_idx) 289 | # if zero padding is needed for missing context 290 | # or we are on the boundry of the buffer 291 | if start_idx < 0 or missing_context > 0: 292 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 293 | for idx in range(start_idx, end_idx): 294 | frames.append(self.obs[idx % self.size]) 295 | return np.concatenate(frames, 2) 296 | else: 297 | # this optimization has potential to saves about 30% compute time \o/ 298 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 299 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 300 | 301 | def store_frame(self, frame): 302 | """Store a single frame in the buffer at the next available index, overwriting 303 | old frames if necessary. 304 | 305 | Parameters 306 | ---------- 307 | frame: np.array 308 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 309 | the frame to be stored 310 | 311 | Returns 312 | ------- 313 | idx: int 314 | Index at which the frame is stored. To be used for `store_effect` later. 315 | """ 316 | if self.obs is None: 317 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 318 | self.action = np.empty([self.size], dtype=np.int32) 319 | self.reward = np.empty([self.size], dtype=np.float32) 320 | self.done = np.empty([self.size], dtype=np.bool) 321 | self.obs[self.next_idx] = frame 322 | 323 | ret = self.next_idx 324 | self.next_idx = (self.next_idx + 1) % self.size 325 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 326 | 327 | return ret 328 | 329 | def store_effect(self, idx, action, reward, done): 330 | """Store effects of action taken after obeserving frame stored 331 | at index idx. The reason `store_frame` and `store_effect` is broken 332 | up into two functions is so that once can call `encode_recent_observation` 333 | in between. 334 | 335 | Paramters 336 | --------- 337 | idx: int 338 | Index in buffer of recently observed frame (returned by `store_frame`). 339 | action: int 340 | Action that was performed upon observing this frame. 341 | reward: float 342 | Reward that was received when the actions was performed. 343 | done: bool 344 | True if episode was finished after performing that action. 
345 | """ 346 | self.action[idx] = action 347 | self.reward[idx] = reward 348 | self.done[idx] = done 349 | 350 | -------------------------------------------------------------------------------- /hw3/hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw3/hw3.pdf -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | import logging 15 | 16 | def atari_model(img_in, num_actions, scope, reuse=False): 17 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 18 | with tf.variable_scope(scope, reuse=reuse): 19 | out = img_in 20 | with tf.variable_scope("convnet"): 21 | # original architecture 22 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 24 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 25 | out = layers.flatten(out) 26 | with tf.variable_scope("action_value"): 27 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 28 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 29 | 30 | return out 31 | 32 | def atari_learn(env, 33 | session, 34 | num_timesteps): 35 | # This is just a rough estimate 36 | num_iterations = float(num_timesteps) / 4.0 37 | 38 | lr_multiplier = 1.0 39 | lr_schedule = PiecewiseSchedule([ 40 | (0, 1e-4 * lr_multiplier), 41 | (num_iterations / 10, 1e-4 * lr_multiplier), 42 | (num_iterations / 2, 5e-5 * lr_multiplier), 43 | ], 44 | outside_value=5e-5 * lr_multiplier) 45 | optimizer = dqn.OptimizerSpec( 46 | constructor=tf.train.AdamOptimizer, 47 | kwargs=dict(epsilon=1e-4), 48 | lr_schedule=lr_schedule 49 | ) 50 | 51 | def stopping_criterion(env, t): 52 | # notice that here t is the number of steps of the wrapped env, 53 | # which is different from the number of steps in the underlying env 54 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 55 | 56 | exploration_schedule = PiecewiseSchedule( 57 | [ 58 | (0, 1.0), 59 | (1e6, 0.1), 60 | (num_iterations / 2, 0.01), 61 | ], outside_value=0.01 62 | ) 63 | 64 | dqn.learn( 65 | env, 66 | q_func=atari_model, 67 | optimizer_spec=optimizer, 68 | session=session, 69 | exploration=exploration_schedule, 70 | stopping_criterion=stopping_criterion, 71 | replay_buffer_size=1000000, 72 | batch_size=32, 73 | gamma=0.99, 74 | learning_starts=50000, 75 | learning_freq=4, 76 | frame_history_len=4, 77 | target_update_freq=10000, 78 | grad_norm_clipping=10 79 | ) 80 | env.close() 81 | 82 | def get_available_gpus(): 83 | from tensorflow.python.client import device_lib 84 | local_device_protos = device_lib.list_local_devices() 85 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 86 | 87 | def set_global_seeds(i): 88 | try: 89 | import tensorflow as tf 90 | except ImportError: 91 | pass 92 | else: 93 | 
tf.set_random_seed(i) 94 | np.random.seed(i) 95 | random.seed(i) 96 | 97 | def get_session(): 98 | tf.reset_default_graph() 99 | tf_config = tf.ConfigProto( 100 | inter_op_parallelism_threads=1, 101 | intra_op_parallelism_threads=1) 102 | session = tf.Session(config=tf_config) 103 | print("AVAILABLE GPUS: ", get_available_gpus()) 104 | return session 105 | 106 | def get_env(task, seed): 107 | env_id = task.env_id 108 | 109 | env = gym.make(env_id) 110 | 111 | set_global_seeds(seed) 112 | env.seed(seed) 113 | 114 | expt_dir = '/tmp/hw3_vid_dir2/' 115 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 116 | env = wrap_deepmind(env) 117 | 118 | return env 119 | 120 | def main(): 121 | 122 | # Logger 123 | # https://github.com/mwhittaker/homework/commit/cb043dbc980d898547f552e07f475696ce57f1d3 124 | format = "[%(asctime)-15s %(pathname)s:%(lineno)-3s] %(message)s" 125 | handler = logging.StreamHandler() 126 | handler.setFormatter(logging.Formatter(format)) 127 | logger = logging.getLogger("dqn") 128 | logger.propagate = False 129 | logger.addHandler(handler) 130 | logger.setLevel(logging.DEBUG) 131 | 132 | # Get Atari games. 133 | benchmark = gym.benchmark_spec('Atari40M') 134 | 135 | # Change the index to select a different game. 136 | task = benchmark.tasks[3] 137 | 138 | # Run training 139 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 140 | env = get_env(task, seed) 141 | session = get_session() 142 | atari_learn(env, session, num_timesteps=task.max_timesteps) 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | import logging 15 | 16 | def atari_model(ram_in, num_actions, scope, reuse=False): 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = ram_in 19 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 20 | with tf.variable_scope("action_value"): 21 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 24 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 25 | 26 | return out 27 | 28 | def atari_learn(env, 29 | session, 30 | num_timesteps): 31 | # This is just a rough estimate 32 | num_iterations = float(num_timesteps) / 4.0 33 | 34 | lr_multiplier = 1.0 35 | lr_schedule = PiecewiseSchedule([ 36 | (0, 1e-4 * lr_multiplier), 37 | (num_iterations / 10, 1e-4 * lr_multiplier), 38 | (num_iterations / 2, 5e-5 * lr_multiplier), 39 | ], 40 | outside_value=5e-5 * lr_multiplier) 41 | 42 | #lr_schedule = LinearSchedule(num_iterations, 0.0001, 0.01) 43 | optimizer = dqn.OptimizerSpec( 44 | constructor=tf.train.AdamOptimizer, 45 | kwargs=dict(epsilon=1e-4), 46 | lr_schedule=lr_schedule 47 | ) 48 | 49 | def stopping_criterion(env, t): 50 | # notice that here t is the number of steps of the wrapped env, 51 | # which is different from the number of steps in the underlying env 52 | 
return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 53 | 54 | exploration_schedule = PiecewiseSchedule( 55 | [ 56 | (0, 0.2), 57 | (1e6, 0.1), 58 | (num_iterations / 2, 0.01), 59 | ], outside_value=0.01 60 | ) 61 | 62 | dqn.learn( 63 | env, 64 | q_func=atari_model, 65 | optimizer_spec=optimizer, 66 | session=session, 67 | exploration=exploration_schedule, 68 | stopping_criterion=stopping_criterion, 69 | replay_buffer_size=1000000, 70 | batch_size=32, 71 | gamma=0.99, 72 | learning_starts=50000, 73 | learning_freq=4, 74 | frame_history_len=1, 75 | target_update_freq=10000, 76 | grad_norm_clipping=10 77 | ) 78 | env.close() 79 | 80 | def get_available_gpus(): 81 | from tensorflow.python.client import device_lib 82 | local_device_protos = device_lib.list_local_devices() 83 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 84 | 85 | def set_global_seeds(i): 86 | try: 87 | import tensorflow as tf 88 | except ImportError: 89 | pass 90 | else: 91 | tf.set_random_seed(i) 92 | np.random.seed(i) 93 | random.seed(i) 94 | 95 | def get_session(): 96 | tf.reset_default_graph() 97 | tf_config = tf.ConfigProto( 98 | inter_op_parallelism_threads=1, 99 | intra_op_parallelism_threads=1) 100 | session = tf.Session(config=tf_config) 101 | print("AVAILABLE GPUS: ", get_available_gpus()) 102 | return session 103 | 104 | def get_env(seed): 105 | env = gym.make('Pong-ram-v0') 106 | 107 | set_global_seeds(seed) 108 | env.seed(seed) 109 | 110 | expt_dir = '/tmp/hw3_vid_dir/' 111 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 112 | env = wrap_deepmind_ram(env) 113 | 114 | return env 115 | 116 | def main(): 117 | # Logger 118 | # https://github.com/mwhittaker/homework/commit/cb043dbc980d898547f552e07f475696ce57f1d3 119 | format = "[%(asctime)-15s %(pathname)s:%(lineno)-3s] %(message)s" 120 | handler = logging.StreamHandler() 121 | handler.setFormatter(logging.Formatter(format)) 122 | logger = logging.getLogger("dqn") 123 | logger.propagate = False 124 | logger.addHandler(handler) 125 | logger.setLevel(logging.DEBUG) 126 | 127 | # Run training 128 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 129 | env = get_env(seed) 130 | session = get_session() 131 | atari_learn(env, session, num_timesteps=int(4e7)) 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /hw4/README.md: -------------------------------------------------------------------------------- 1 | # HW4, Model Based Deep RL 2 | 3 | _Author_ Liyiming Ke 4 | 5 | _Updated_ Mar 28, 2018 6 | 7 | 8 | 1. To run the program, launch with `python main.py -n 15 -ep 300 -m 10 -sp 500 -r 100 -d 10` (fast) or `python main.py -n 15 -ep 1000` (slow but yields higher return) 9 | 10 | 2. 
The performance of fast version 11 | 12 | |Iteration|AverageCost|StdCost MinimumCost|MaximumCost|AverageReturn|StdReturn|MinimumReturn|MaximumReturn| 13 | |----|---|---|---|---|---|---|---| 14 | |0|-272.8721655363496|21.239603908942726|-320.43697618040017|-246.4736838787137|240.2439329582071|20.68344457836508|207.28770781466437|274.5577823545346| 15 | |1|-287.92028135886255|25.92310204677628|-325.54410164427884|-251.01360667647728|250.15164096384146|22.73045020236331|220.01663620467846|292.0170185841397| 16 | |2|-323.80080715005346|30.228382522490797|-389.0978409378912|-271.7796878141356|289.2029514835485|25.470647774294818|259.58129657072544|333.16807455607943| 17 | |3|-333.15430020245526|32.740974228661756|-385.3221695429378|-274.0394588141746|292.35852402398297|37.10565811001103|222.4680755620722|350.4272628072747| 18 | |4|-376.3197552271316|40.07809243241263|-444.0926281622156|-317.0400260108479|329.7078283201919|32.06590568480435|278.45885061461195|383.16172036212316| 19 | |5|-309.486577652699|20.222609987437725|-356.1101951949902|-278.77372207902937|277.1294051792327|21.8631623809154|243.78552026776572|316.4410201418656| 20 | |6|-333.27779679849357|31.09109231822848|-374.822923779812|-279.54658651433584|288.86654997633525|29.904001076855607|242.04216272993844|339.6146511277123| 21 | |7|-347.3543927604132|31.87449528755396|-398.9008770736232|-298.08452920988356|308.13962163935344|26.064436309153134|274.0523898129893|362.50855646019704| 22 | |8|-358.21187096791294|27.357604204029187|-427.34414545387995|-333.4367350634044|312.75195931842785|24.026301216233957|282.1552938485566|362.11430547952864| 23 | |9|-350.0931825856868|39.91494896369595|-419.03018683390167|-282.5787159965811|316.8553292620047|32.94504667833338|260.4670701650203|360.37675813391917| 24 | |10|-360.55493946390663|27.67517089380202|-404.1814038418491|-309.4632013566413|319.21174580449343|28.015782572396002|278.0781502604923|362.4407932260329| 25 | |11|-335.3625259508714|33.089863552783775|-394.8137162207922|-285.8209978255535|293.8460252502136|26.69139392022445|262.12653425412725|351.27653751254394| 26 | |12|-356.40964697880133|35.67182035029585|-404.97137852094636|-277.33927884191587|312.1967845782086|30.998412487074223|265.0126242278393|365.8393458066606| 27 | |13|-354.4348233587252|35.7811577088428|-435.0528204835982|-301.168475983296|313.8152077418059|28.98742963518402|270.61725419488414|367.821287239409| 28 | |14|-371.42544639124094|33.6907209955848|-442.3058340489825|-329.31225275852825|335.09668997592695|29.85285812344075|282.52093602247203|375.99342302061467| -------------------------------------------------------------------------------- /hw4/cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnvNew(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 8 | utils.EzPickle.__init__(self) 9 | 10 | def _step(self, action): 11 | xposbefore = self.model.data.qpos[0, 0] 12 | self.do_simulation(action, self.frame_skip) 13 | xposafter = self.model.data.qpos[0, 0] 14 | ob = self._get_obs() 15 | reward_ctrl = - 0.1 * np.square(action).sum() 16 | reward_run = (xposafter - xposbefore)/self.dt 17 | reward = reward_ctrl + reward_run 18 | done = False 19 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 20 | 21 | def _get_obs(self): 22 | return np.concatenate([ 23 | 
self.model.data.qpos.flat[1:], 24 | self.model.data.qvel.flat, 25 | self.get_body_com("torso").flat, 26 | # self.get_body_comvel("torso").flat, 27 | ]) 28 | 29 | def reset_model(self): 30 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 31 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 32 | self.set_state(qpos, qvel) 33 | return self._get_obs() 34 | 35 | def viewer_setup(self): 36 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /hw4/controllers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cost_functions import trajectory_cost_fn 3 | import time 4 | 5 | class Controller(): 6 | def __init__(self): 7 | pass 8 | 9 | # Get the appropriate action(s) for this state(s) 10 | def get_action(self, state): 11 | pass 12 | 13 | 14 | class RandomController(Controller): 15 | def __init__(self, env): 16 | self.ac = env.action_space 17 | 18 | def get_action(self, state): 19 | """ YOUR CODE HERE """ 20 | """ Your code should randomly sample an action uniformly from the action space """ 21 | return self.ac.sample() 22 | 23 | class MPCcontroller(Controller): 24 | """ Controller built using the MPC method outlined in https://arxiv.org/abs/1708.02596 """ 25 | def __init__(self, 26 | env, 27 | dyn_model, 28 | horizon=5, 29 | cost_fn=None, 30 | num_simulated_paths=10, 31 | ): 32 | self.env = env 33 | self.dyn_model = dyn_model 34 | self.horizon = horizon 35 | self.cost_fn = cost_fn 36 | self.num_simulated_paths = num_simulated_paths 37 | 38 | def get_action(self, state): 39 | """ YOUR CODE HERE """ 40 | """ Note: be careful to batch your simulations through the model for speed """ 41 | 42 | sampled_acts = np.array([[self.env.action_space.sample() for j in range(self.num_simulated_paths)] for i in range(self.horizon)]) 43 | states = [np.array([state] * self.num_simulated_paths)] 44 | nstates = [] 45 | 46 | for i in range(self.horizon): 47 | nstates.append(self.dyn_model.predict(states[-1], sampled_acts[i, :])) 48 | if i < self.horizon: states.append(nstates[-1]) 49 | 50 | costs = trajectory_cost_fn(self.cost_fn, states, sampled_acts, nstates) 51 | return sampled_acts[0][np.argmin(costs)] 52 | -------------------------------------------------------------------------------- /hw4/cost_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | #======================================================== 5 | # 6 | # Environment-specific cost functions: 7 | # 8 | 9 | def cheetah_cost_fn(state, action, next_state): 10 | if len(state.shape) > 1: 11 | 12 | heading_penalty_factor=10 13 | scores=np.zeros((state.shape[0],)) 14 | 15 | #dont move front shin back so far that you tilt forward 16 | front_leg = state[:,5] 17 | my_range = 0.2 18 | scores[front_leg>=my_range] += heading_penalty_factor 19 | 20 | front_shin = state[:,6] 21 | my_range = 0 22 | scores[front_shin>=my_range] += heading_penalty_factor 23 | 24 | front_foot = state[:,7] 25 | my_range = 0 26 | scores[front_foot>=my_range] += heading_penalty_factor 27 | 28 | scores-= (next_state[:,17] - state[:,17]) / 0.01 #+ 0.1 * (np.sum(action**2, axis=1)) 29 | return scores 30 | 31 | heading_penalty_factor=10 32 | score = 0 33 | 34 | #dont move front shin back so far that you tilt forward 35 | front_leg = state[5] 36 | my_range = 0.2 37 | if front_leg>=my_range: 38 | score += 
heading_penalty_factor 39 | 40 | front_shin = state[6] 41 | my_range = 0 42 | if front_shin>=my_range: 43 | score += heading_penalty_factor 44 | 45 | front_foot = state[7] 46 | my_range = 0 47 | if front_foot>=my_range: 48 | score += heading_penalty_factor 49 | 50 | score -= (next_state[17] - state[17]) / 0.01 #+ 0.1 * (np.sum(action**2)) 51 | return score 52 | 53 | #======================================================== 54 | # 55 | # Cost function for a whole trajectory: 56 | # 57 | 58 | def trajectory_cost_fn(cost_fn, states, actions, next_states): 59 | trajectory_cost = 0 60 | for i in range(len(actions)): 61 | trajectory_cost += cost_fn(states[i], actions[i], next_states[i]) 62 | return trajectory_cost -------------------------------------------------------------------------------- /hw4/dynamics.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | # Predefined function to build a feedforward neural network 6 | def build_mlp(input_placeholder, 7 | output_size, 8 | scope, 9 | n_layers=2, 10 | size=500, 11 | activation=tf.tanh, 12 | output_activation=None 13 | ): 14 | out = input_placeholder 15 | with tf.variable_scope(scope): 16 | for _ in range(n_layers): 17 | out = tf.layers.dense(out, size, activation=activation) 18 | out = tf.layers.dense(out, output_size, activation=output_activation) 19 | return out 20 | 21 | def normalize(data, normalization): 22 | return (data - normalization[0]) / (normalization[1] + 1e-10) 23 | 24 | def denormalize(data, normalization): 25 | return data * (normalization[1] + 1e-10) + normalization[0] 26 | 27 | def batch_index(batch_size, chunk_size): 28 | ind = [(acc, acc + batch_size) for acc in range(0, chunk_size, batch_size)] 29 | if ind[-1][1] < chunk_size: 30 | ind.append((ind[-1][1], chunk_size)) 31 | return ind 32 | 33 | class NNDynamicsModel(): 34 | def __init__(self, 35 | env, 36 | n_layers, 37 | size, 38 | activation, 39 | output_activation, 40 | normalization, 41 | batch_size, 42 | iterations, 43 | learning_rate, 44 | sess 45 | ): 46 | """ YOUR CODE HERE """ 47 | """ Note: Be careful about normalization """ 48 | ob_dim = env.observation_space.shape[0] 49 | ac_dim = env.action_space.shape[0] 50 | 51 | self.input_state = tf.placeholder(shape=(None, ob_dim), dtype=tf.float32) 52 | self.input_act = tf.placeholder(shape=(None, ac_dim), dtype=tf.float32) 53 | self.target_delta = tf.placeholder(shape=(None, ob_dim), dtype=tf.float32) 54 | 55 | self.dyn = build_mlp(tf.concat([self.input_state, self.input_act], axis=1), 56 | output_size=ob_dim, 57 | scope="NNDynamicsModel", 58 | n_layers=n_layers, 59 | size=size, 60 | activation=activation, 61 | output_activation=output_activation) 62 | 63 | self.normalization = normalization 64 | 65 | self.loss = tf.losses.mean_squared_error(labels=self.target_delta, predictions=self.dyn) 66 | self.update = tf.train.AdamOptimizer(learning_rate).minimize(loss=self.loss) 67 | self.iterations = iterations 68 | self.batch_size = batch_size 69 | self.sess = sess 70 | 71 | def fit(self, data): 72 | """ 73 | Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, normalized actions to normalized state differences (s_t+1 - s_t) 74 | """ 75 | """YOUR CODE HERE """ 76 | obs = normalize(data["observations"], self.normalization["observations"]) 77 | acts = normalize(data["actions"], self.normalization["actions"]) 78 | deltas = 
normalize(data["next_observations"] - data["observations"], self.normalization["deltas"]) 79 | chunk_size = len(data["observations"]) 80 | 81 | batch_indexes = batch_index(self.batch_size, chunk_size) 82 | loss = None 83 | for epoch in range(self.iterations): 84 | if epoch % 20 == 0: print("Epoch {}/{}: Loss {}".format(epoch, self.iterations, loss)) 85 | for _, (a,b) in enumerate(batch_indexes): 86 | _, loss = self.sess.run( 87 | [self.update, self.loss], 88 | feed_dict={ 89 | self.input_state: obs[a:b], 90 | self.input_act: acts[a:b], 91 | self.target_delta: deltas[a:b] 92 | }) 93 | 94 | 95 | def predict(self, states, actions): 96 | """ Write a function to take in a batch of (unnormalized) states and (unnormalized) actions and return the (unnormalized) next states as predicted by using the model """ 97 | """ YOUR CODE HERE """ 98 | n_states = normalize(states, self.normalization["observations"]) 99 | n_acts = normalize(actions, self.normalization["actions"]) 100 | deltas = self.sess.run(self.dyn, feed_dict={self.input_state: n_states, 101 | self.input_act: n_acts}) 102 | deltas = denormalize(deltas, self.normalization["deltas"]) 103 | return deltas + states 104 | 105 | -------------------------------------------------------------------------------- /hw4/hw4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/hw4/hw4.pdf -------------------------------------------------------------------------------- /hw4/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | if osp.exists(G.output_dir): 55 | print("Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir) 56 | else: 57 | os.makedirs(G.output_dir) 58 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 59 | atexit.register(G.output_file.close) 60 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 61 | 62 | def log_tabular(key, val): 63 | """ 64 | Log a value of some diagnostic 65 | Call this once for each diagnostic quantity, each iteration 66 | """ 67 | if G.first_row: 68 | G.log_headers.append(key) 69 | else: 70 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 71 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 72 | G.log_current_row[key] = val 73 | 74 | def save_params(params): 75 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 76 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 77 | 78 | def pickle_tf_vars(): 79 | """ 80 | Saves tensorflow variables 81 | Requires them to be initialized first, also a default session must exist 82 | """ 83 | _dict = {v.name : v.eval() for v in tf.global_variables()} 84 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 85 | pickle.dump(_dict, f) 86 | 87 | 88 | def dump_tabular(): 89 | """ 90 | Write all of the diagnostics from the current iteration 91 | """ 92 | vals = [] 93 | key_lens = [len(key) for key in G.log_headers] 94 | max_key_len = max(15,max(key_lens)) 95 | keystr = '%'+'%d'%max_key_len 96 | fmt = "| " + keystr + "s | %15s |" 97 | n_slashes = 22 + max_key_len 98 | print("-"*n_slashes) 99 | for key in G.log_headers: 100 | val = G.log_current_row.get(key, "") 101 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 102 | else: valstr = val 103 | print(fmt%(key, valstr)) 104 | vals.append(val) 105 | print("-"*n_slashes) 106 | if G.output_file is not None: 107 | if G.first_row: 108 | G.output_file.write("\t".join(G.log_headers)) 109 | G.output_file.write("\n") 110 | G.output_file.write("\t".join(map(str,vals))) 111 | G.output_file.write("\n") 112 | G.output_file.flush() 113 | G.log_current_row.clear() 114 | G.first_row=False 115 | -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from dynamics import NNDynamicsModel 5 | from controllers import MPCcontroller, RandomController 6 | from cost_functions import cheetah_cost_fn, trajectory_cost_fn 7 | import time 8 | import logz 9 | import os 10 | import copy 11 | import matplotlib.pyplot as plt 12 | from cheetah_env import HalfCheetahEnvNew 13 | 14 | def sample(env, 15 | controller, 16 | num_paths=10, 17 | horizon=1000, 18 | render=False, 19 | verbose=False): 20 | """ 21 | Write a sampler function which takes in an environment, a controller (either random or the MPC controller), 22 | and returns rollouts by running on the env. 23 | Each path can have elements for observations, next_observations, rewards, returns, actions, etc. 
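    In the implementation below, rather than a list of per-path dicts, the rollouts
    are returned as a single dict of numpy arrays flattened across paths, with keys
    'observations', 'actions', 'next_observations', 'rewards', plus the per-path
    bookkeeping 'ep_lens' (episode lengths, used to delimit episodes) and
    'acc_rewards' (undiscounted return of each episode).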
24 | """ 25 | """YOUR CODE HERE """ 26 | paths = { 27 | "observations":[], 28 | "next_observations":[], 29 | "rewards":[], 30 | "actions":[], 31 | "ep_lens":[], 32 | "acc_rewards":[] 33 | } 34 | 35 | for i in range(num_paths): 36 | animate_this_rollout = render and (i%10 == 0) 37 | print("Sample Path {} / {}".format(i, num_paths)) 38 | ob = env.reset() 39 | ep_len = 0 40 | while ep_len < horizon: 41 | if animate_this_rollout: 42 | env.render() 43 | time.sleep(0.05) 44 | 45 | paths["observations"].append(ob) 46 | act = controller.get_action(ob) 47 | ob, rew, done, _ = env.step(act) 48 | 49 | paths["actions"].append(act) 50 | paths["next_observations"].append(ob) 51 | paths["rewards"].append(rew) 52 | 53 | ep_len += 1 54 | if done: break 55 | 56 | paths["ep_lens"].append(ep_len) 57 | paths["acc_rewards"].append(sum(paths["rewards"][-ep_len:])) 58 | 59 | if verbose: 60 | print("************* New Sample *************") 61 | returns = paths["acc_rewards"] 62 | ep_lengths = paths["ep_lens"] 63 | print("AverageReturn", np.mean(returns)) 64 | print("StdReturn", np.std(returns)) 65 | print("MaxReturn", np.max(returns)) 66 | print("MinReturn", np.min(returns)) 67 | print("EpLenMean", np.mean(ep_lengths)) 68 | print("EpLenStd", np.std(ep_lengths)) 69 | 70 | for key in paths.keys(): 71 | paths[key] = np.array(paths[key]) 72 | 73 | return paths 74 | 75 | # Utility to compute cost a path for a given cost function 76 | def path_cost(cost_fn, path): 77 | costs = [] 78 | acc = 0 79 | for i in path["ep_lens"]: 80 | acc_n = acc + i 81 | costs.append(trajectory_cost_fn(cost_fn, path['observations'][acc:acc_n], path['actions'][acc:acc_n], path['next_observations'][acc:acc_n])) 82 | acc = acc_n 83 | return costs 84 | 85 | def compute_normalization(data): 86 | """ 87 | Write a function to take in a dataset and compute the means, and stds. 88 | Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions 89 | """ 90 | """ YOUR CODE HERE """ 91 | return (np.mean(data, axis=0), np.std(data, axis=0)) 92 | 93 | 94 | def plot_comparison(env, dyn_model): 95 | """ 96 | Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 97 | """ 98 | """ YOUR CODE HERE """ 99 | pass 100 | 101 | 102 | def train(env, 103 | cost_fn, 104 | logdir=None, 105 | render=False, 106 | learning_rate=1e-3, 107 | onpol_iters=10, 108 | dynamics_iters=60, 109 | batch_size=512, 110 | num_paths_random=10, 111 | num_paths_onpol=10, 112 | num_simulated_paths=10000, 113 | env_horizon=1000, 114 | mpc_horizon=15, 115 | n_layers=2, 116 | size=500, 117 | activation=tf.nn.relu, 118 | output_activation=None 119 | ): 120 | 121 | """ 122 | 123 | Arguments: 124 | 125 | onpol_iters Number of iterations of onpolicy aggregation for the loop to run. 126 | 127 | dynamics_iters Number of iterations of training for the dynamics model 128 | |_ which happen per iteration of the aggregation loop. 129 | 130 | batch_size Batch size for dynamics training. 131 | 132 | num_paths_random Number of paths/trajectories/rollouts generated 133 | | by a random agent. We use these to train our 134 | |_ initial dynamics model. 135 | 136 | num_paths_onpol Number of paths to collect at each iteration of 137 | |_ aggregation, using the Model Predictive Control policy. 138 | 139 | num_simulated_paths How many fictitious rollouts the MPC policy 140 | | should generate each time it is asked for an 141 | |_ action. 
142 | 143 | env_horizon Number of timesteps in each path. 144 | 145 | mpc_horizon The MPC policy generates actions by imagining 146 | | fictitious rollouts, and picking the first action 147 | | of the best fictitious rollout. This argument is 148 | | how many timesteps should be in each fictitious 149 | |_ rollout. 150 | 151 | n_layers/size/activations Neural network architecture arguments. 152 | 153 | """ 154 | 155 | logz.configure_output_dir(logdir) 156 | 157 | #======================================================== 158 | # 159 | # First, we need a lot of data generated by a random 160 | # agent, with which we'll begin to train our dynamics 161 | # model. 162 | 163 | random_controller = RandomController(env) 164 | """ YOUR CODE HERE """ 165 | paths = sample(env=env, controller=random_controller, 166 | num_paths=num_paths_random, horizon=env_horizon, verbose=False) 167 | 168 | #======================================================== 169 | # 170 | # The random data will be used to get statistics (mean 171 | # and std) for the observations, actions, and deltas 172 | # (where deltas are o_{t+1} - o_t). These will be used 173 | # for normalizing inputs and denormalizing outputs 174 | # from the dynamics network. 175 | # 176 | """ YOUR CODE HERE """ 177 | normalization = { 178 | "observations": compute_normalization(paths["observations"]), 179 | "actions": compute_normalization(paths["actions"]), 180 | "deltas": compute_normalization(paths["next_observations"] - paths["observations"]) 181 | } 182 | 183 | #======================================================== 184 | # 185 | # Build dynamics model and MPC controllers. 186 | # 187 | sess = tf.Session() 188 | 189 | dyn_model = NNDynamicsModel(env=env, 190 | n_layers=n_layers, 191 | size=size, 192 | activation=activation, 193 | output_activation=output_activation, 194 | normalization=normalization, 195 | batch_size=batch_size, 196 | iterations=dynamics_iters, 197 | learning_rate=learning_rate, 198 | sess=sess) 199 | 200 | mpc_controller = MPCcontroller(env=env, 201 | dyn_model=dyn_model, 202 | horizon=mpc_horizon, 203 | cost_fn=cost_fn, 204 | num_simulated_paths=num_simulated_paths) 205 | 206 | 207 | #======================================================== 208 | # 209 | # Tensorflow session building. 210 | # 211 | sess.__enter__() 212 | tf.global_variables_initializer().run() 213 | 214 | #======================================================== 215 | # 216 | # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
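    # Concretely, each iteration below (1) shuffles the aggregated dataset and
    # refits the dynamics model on it, (2) collects num_paths_onpol on-policy
    # rollouts with the MPC controller, (3) logs cost and return statistics for
    # those rollouts, and (4) appends them to the dataset for the next iteration.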
217 | # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 218 | # 219 | for itr in range(onpol_iters): 220 | """ YOUR CODE HERE """ 221 | shuffle_indexes = np.random.permutation(paths["observations"].shape[0]) 222 | for key in ['observations', 'actions', 'next_observations', 'rewards']: 223 | paths[key] = paths[key][shuffle_indexes] 224 | 225 | dyn_model.fit(paths) 226 | 227 | newpaths = sample(env=env, controller=mpc_controller, 228 | num_paths=num_paths_onpol, horizon=env_horizon, verbose=False) 229 | 230 | # LOGGING 231 | # Statistics for performance of MPC policy using 232 | # our learned dynamics model 233 | costs = path_cost(cost_fn, newpaths) 234 | returns = newpaths["acc_rewards"] 235 | 236 | logz.log_tabular('Iteration', itr) 237 | # In terms of cost function which your MPC controller uses to plan 238 | logz.log_tabular('AverageCost', np.mean(costs)) 239 | logz.log_tabular('StdCost', np.std(costs)) 240 | logz.log_tabular('MinimumCost', np.min(costs)) 241 | logz.log_tabular('MaximumCost', np.max(costs)) 242 | # In terms of true environment reward of your rolled out trajectory using the MPC controller 243 | logz.log_tabular('AverageReturn', np.mean(returns)) 244 | logz.log_tabular('StdReturn', np.std(returns)) 245 | logz.log_tabular('MinimumReturn', np.min(returns)) 246 | logz.log_tabular('MaximumReturn', np.max(returns)) 247 | logz.dump_tabular() 248 | 249 | for key in ['observations', 'actions', 'next_observations', 'rewards']: 250 | paths[key] = np.concatenate([paths[key], newpaths[key]]) 251 | 252 | def main(): 253 | import argparse 254 | parser = argparse.ArgumentParser() 255 | parser.add_argument('--env_name', type=str, default='HalfCheetah-v1') 256 | # Experiment meta-params 257 | parser.add_argument('--exp_name', type=str, default='mb_mpc') 258 | parser.add_argument('--seed', type=int, default=3) 259 | parser.add_argument('--render', action='store_true') 260 | # Training args 261 | parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3) 262 | parser.add_argument('--onpol_iters', '-n', type=int, default=1) 263 | parser.add_argument('--dyn_iters', '-nd', type=int, default=60) 264 | parser.add_argument('--batch_size', '-b', type=int, default=512) 265 | # Data collection 266 | parser.add_argument('--random_paths', '-r', type=int, default=10) 267 | parser.add_argument('--onpol_paths', '-d', type=int, default=10) 268 | parser.add_argument('--simulated_paths', '-sp', type=int, default=1000) 269 | parser.add_argument('--ep_len', '-ep', type=int, default=1000) 270 | # Neural network architecture args 271 | parser.add_argument('--n_layers', '-l', type=int, default=2) 272 | parser.add_argument('--size', '-s', type=int, default=500) 273 | # MPC Controller 274 | parser.add_argument('--mpc_horizon', '-m', type=int, default=15) 275 | args = parser.parse_args() 276 | 277 | # Set seed 278 | np.random.seed(args.seed) 279 | tf.set_random_seed(args.seed) 280 | 281 | # Make data directory if it does not already exist 282 | if not(os.path.exists('data')): 283 | os.makedirs('data') 284 | logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 285 | logdir = os.path.join('data', logdir) 286 | if not(os.path.exists(logdir)): 287 | os.makedirs(logdir) 288 | 289 | # Make env 290 | if args.env_name is "HalfCheetah-v1": 291 | env = HalfCheetahEnvNew() 292 | cost_fn = cheetah_cost_fn 293 | train(env=env, 294 | cost_fn=cost_fn, 295 | logdir=logdir, 296 | render=args.render, 297 
| learning_rate=args.learning_rate, 298 | onpol_iters=args.onpol_iters, 299 | dynamics_iters=args.dyn_iters, 300 | batch_size=args.batch_size, 301 | num_paths_random=args.random_paths, 302 | num_paths_onpol=args.onpol_paths, 303 | num_simulated_paths=args.simulated_paths, 304 | env_horizon=args.ep_len, 305 | mpc_horizon=args.mpc_horizon, 306 | n_layers=args.n_layers, 307 | size=args.size, 308 | activation=tf.nn.relu, 309 | output_activation=None, 310 | ) 311 | 312 | if __name__ == "__main__": 313 | main() 314 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistic, change what you pass in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | plot all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir.
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /sp17_hw/hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 
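As a companion to the README above: the roll-outs that `run_expert.py` collects are meant to serve as supervised training data for imitation learning. Below is a minimal behavioral-cloning sketch under the assumption that the `expert_data` dict built in `run_expert.py` has been pickled to disk (the starter script shown later does not actually save it); the file name `expert_data.pkl`, the network architecture, and the hyperparameters are illustrative choices, not part of the assignment.

```python
import pickle
import numpy as np
import tensorflow as tf

# Assumed: run_expert.py's expert_data dict was saved somewhere with pickle.dump(...)
with open('expert_data.pkl', 'rb') as f:
    data = pickle.load(f)
obs = data['observations'].astype(np.float32)                    # (N, obs_dim)
acts = data['actions'].reshape(len(obs), -1).astype(np.float32)  # (N, act_dim)

obs_ph = tf.placeholder(tf.float32, [None, obs.shape[1]])
act_ph = tf.placeholder(tf.float32, [None, acts.shape[1]])
h = tf.layers.dense(obs_ph, 64, activation=tf.tanh)
h = tf.layers.dense(h, 64, activation=tf.tanh)
pred_ac = tf.layers.dense(h, acts.shape[1])
loss = tf.reduce_mean(tf.square(pred_ac - act_ph))               # regress onto expert actions
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(20):
        idx = np.random.permutation(len(obs))
        for start in range(0, len(obs), 128):
            batch = idx[start:start + 128]
            sess.run(train_op, {obs_ph: obs[batch], act_ph: acts[batch]})
```

The cloned policy is then evaluated by feeding the current observation through `pred_ac` at every step, in the same way the expert policy is queried inside `run_expert.py`.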
19 | -------------------------------------------------------------------------------- /sp17_hw/hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kelym/DeepRL-UCB2017-Homework/d2355b92c1214d4bf20bb1d142b03bffacdfdefd/sp17_hw/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /sp17_hw/hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /sp17_hw/hw2/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /sp17_hw/hw2/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | 5 | from gym import utils 6 | import discrete_env 7 | 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | MAPS = { 14 | "4x4": [ 15 | "SFFF", 16 | "FHFH", 17 | "FFFH", 18 | "HFFG" 19 | ], 20 | "8x8": [ 21 | "SFFFFFFF", 22 | "FFFFFFFF", 23 | "FFFHFFFF", 24 | "FFFFFHFF", 25 | "FFFHFFFF", 26 | "FHHFFFHF", 27 | "FHFFHFHF", 28 | "FFFHFFFG" 29 | ], 30 | } 31 | 32 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 33 | """ 34 | Winter is here. You and your friends were tossing around a frisbee at the park 35 | when you made a wild throw that left the frisbee out in the middle of the lake. 36 | The water is mostly frozen, but there are a few holes where the ice has melted. 37 | If you step into one of those holes, you'll fall into the freezing water. 
38 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 39 | you navigate across the lake and retrieve the disc. 40 | However, the ice is slippery, so you won't always move in the direction you intend. 41 | The surface is described using a grid like the following 42 | 43 | SFFF 44 | FHFH 45 | FFFH 46 | HFFG 47 | 48 | S : starting point, safe 49 | F : frozen surface, safe 50 | H : hole, fall to your doom 51 | G : goal, where the frisbee is located 52 | 53 | The episode ends when you reach the goal or fall in a hole. 54 | You receive a reward of 1 if you reach the goal, and zero otherwise. 55 | 56 | """ 57 | 58 | metadata = {'render.modes': ['human', 'ansi']} 59 | 60 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 61 | if desc is None and map_name is None: 62 | raise ValueError('Must provide either desc or map_name') 63 | elif desc is None: 64 | desc = MAPS[map_name] 65 | self.desc = desc = np.asarray(desc,dtype='c') 66 | self.nrow, self.ncol = nrow, ncol = desc.shape 67 | 68 | nA = 4 69 | nS = nrow * ncol 70 | 71 | isd = np.array(desc == b'S').astype('float64').ravel() 72 | isd /= isd.sum() 73 | 74 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 75 | 76 | def to_s(row, col): 77 | return row*ncol + col 78 | def inc(row, col, a): 79 | if a==0: # left 80 | col = max(col-1,0) 81 | elif a==1: # down 82 | row = min(row+1,nrow-1) 83 | elif a==2: # right 84 | col = min(col+1,ncol-1) 85 | elif a==3: # up 86 | row = max(row-1,0) 87 | return (row, col) 88 | 89 | for row in range(nrow): 90 | for col in range(ncol): 91 | s = to_s(row, col) 92 | for a in range(4): 93 | li = P[s][a] 94 | letter = desc[row, col] 95 | if letter in b'GH': 96 | li.append((1.0, s, 0, True)) 97 | else: 98 | if is_slippery: 99 | for b in [(a-1)%4, a, (a+1)%4]: 100 | newrow, newcol = inc(row, col, b) 101 | newstate = to_s(newrow, newcol) 102 | newletter = desc[newrow, newcol] 103 | done = bytes(newletter) in b'GH' 104 | rew = float(newletter == b'G') 105 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 106 | else: 107 | newrow, newcol = inc(row, col, a) 108 | newstate = to_s(newrow, newcol) 109 | newletter = desc[newrow, newcol] 110 | done = bytes(newletter) in b'GH' 111 | rew = float(newletter == b'G') 112 | li.append((1.0, newstate, rew, done)) 113 | 114 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 115 | 116 | def _render(self, mode='human', close=False): 117 | if close: 118 | return 119 | outfile = StringIO() if mode == 'ansi' else sys.stdout 120 | 121 | row, col = self.s // self.ncol, self.s % self.ncol 122 | desc = self.desc.tolist() 123 | desc = [[c.decode('utf-8') for c in line] for line in desc] 124 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 125 | if self.lastaction is not None: 126 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 127 | else: 128 | outfile.write("\n") 129 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 130 | 131 | return outfile 132 | -------------------------------------------------------------------------------- /sp17_hw/hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /sp17_hw/hw3/atari_wrappers.py: 
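(Before the hw3 files that follow, a brief aside on the `DiscreteEnv`/`FrozenLakeEnv` code above.) Each entry of `P[s][a]` is a list of `(probability, nextstate, reward, done)` tuples, which is exactly the interface a tabular dynamic-programming method needs. The sketch below runs value iteration against that structure; the discount factor and tolerance are arbitrary illustrative choices, and it assumes it is run from the `sp17_hw/hw2` directory so that `frozen_lake` is importable.

```python
import numpy as np
from frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv(map_name="4x4", is_slippery=True)
gamma, tol = 0.99, 1e-8

def backup(V, s, a):
    # Expected return of taking action a in state s under the current value estimate
    return sum(p * (r + gamma * V[s2] * (not done)) for p, s2, r, done in env.P[s][a])

V = np.zeros(env.nS)
while True:
    V_new = np.array([max(backup(V, s, a) for a in range(env.nA)) for s in range(env.nS)])
    if np.max(np.abs(V_new - V)) < tol:
        break
    V = V_new

# Greedy policy with respect to the converged value function
policy = np.array([np.argmax([backup(V, s, a) for a in range(env.nA)]) for s in range(env.nS)])
```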
-------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /sp17_hw/hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | 11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 12 | 13 | def learn(env, 14 | q_func, 15 | optimizer_spec, 16 | session, 17 | exploration=LinearSchedule(1000000, 0.1), 18 | stopping_criterion=None, 19 | replay_buffer_size=1000000, 20 | batch_size=32, 21 | gamma=0.99, 22 | learning_starts=50000, 23 | learning_freq=4, 24 | frame_history_len=4, 25 | target_update_freq=10000, 26 | grad_norm_clipping=10): 27 | """Run Deep Q-learning algorithm. 28 | 29 | You can specify your own convnet using q_func. 30 | 31 | All schedules are w.r.t. total number of steps taken in the environment. 32 | 33 | Parameters 34 | ---------- 35 | env: gym.Env 36 | gym environment to train on. 37 | q_func: function 38 | Model to use for computing the q function. It should accept the 39 | following named arguments: 40 | img_in: tf.Tensor 41 | tensorflow tensor representing the input image 42 | num_actions: int 43 | number of actions 44 | scope: str 45 | scope in which all the model related variables 46 | should be created 47 | reuse: bool 48 | whether previously created variables should be reused. 49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 
54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | # YOUR CODE HERE 131 | 132 | ###### 133 | 134 | # construct optimization op (with gradient clipping) 135 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 136 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 137 | train_fn = minimize_and_clip(optimizer, total_error, 138 | var_list=q_func_vars, clip_val=grad_norm_clipping) 139 | 140 | # update_target_fn will be called periodically to copy Q network to target Q network 141 | update_target_fn = [] 142 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 143 | sorted(target_q_func_vars, key=lambda v: v.name)): 144 | update_target_fn.append(var_target.assign(var)) 145 | update_target_fn = tf.group(*update_target_fn) 146 | 147 | # construct the replay buffer 148 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 149 | 150 | ############### 151 | # RUN ENV # 152 | ############### 153 | model_initialized = False 154 | num_param_updates = 0 155 | mean_episode_reward = -float('nan') 156 | best_mean_episode_reward = -float('inf') 157 | last_obs = env.reset() 158 | LOG_EVERY_N_STEPS = 10000 159 | 160 | for t in itertools.count(): 161 | ### 1. Check stopping criterion 162 | if stopping_criterion is not None and stopping_criterion(env, t): 163 | break 164 | 165 | ### 2. Step the env and store the transition 166 | # At this point, "last_obs" contains the latest observation that was 167 | # recorded from the simulator. Here, your code needs to store this 168 | # observation and its outcome (reward, next observation, etc.) into 169 | # the replay buffer while stepping the simulator forward one step. 170 | # At the end of this block of code, the simulator should have been 171 | # advanced one step, and the replay buffer should contain one more 172 | # transition. 173 | # Specifically, last_obs must point to the new latest observation. 174 | # Useful functions you'll need to call: 175 | # obs, reward, done, info = env.step(action) 176 | # this steps the environment forward one step 177 | # obs = env.reset() 178 | # this resets the environment if you reached an episode boundary. 179 | # Don't forget to call env.reset() to get a new observation if done 180 | # is true!! 181 | # Note that you cannot use "last_obs" directly as input 182 | # into your network, since it needs to be processed to include context 183 | # from previous frames. You should check out the replay buffer 184 | # implementation in dqn_utils.py to see what functionality the replay 185 | # buffer exposes. The replay buffer has a function called 186 | # encode_recent_observation that will take the latest observation 187 | # that you pushed into the buffer and compute the corresponding 188 | # input that should be given to a Q network by appending some 189 | # previous frames. 190 | # Don't forget to include epsilon greedy exploration! 191 | # And remember that the first time you enter this loop, the model 192 | # may not yet have been initialized (but of course, the first step 193 | # might as well be random, since you haven't trained your net...) 
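For reference, here is a hedged sketch of one standard way to fill in the two "YOUR CODE HERE" blanks described by the comment blocks above; it is not the course's official solution, and names such as `q_t`, `q_tp1`, and `target` are this sketch's own choices.

```python
# Illustrative only -- a sketch of vanilla DQN, not the official solution.

# (a) Q-networks and Bellman error (belongs in the BUILD MODEL block above):
q_t = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
q_tp1 = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False)
q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="q_func")
target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_q_func")

q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), axis=1)
target = rew_t_ph + gamma * (1.0 - done_mask_ph) * tf.reduce_max(q_tp1, axis=1)
total_error = tf.reduce_mean(huber_loss(q_t_selected - tf.stop_gradient(target)))

# (b) Stepping the env with epsilon-greedy exploration and storing the transition (step 2):
idx = replay_buffer.store_frame(last_obs)
if not model_initialized or random.random() < exploration.value(t):
    action = env.action_space.sample()
else:
    recent_obs = replay_buffer.encode_recent_observation()
    action = np.argmax(session.run(q_t, {obs_t_ph: recent_obs[None]}))
obs, reward, done, info = env.step(action)
replay_buffer.store_effect(idx, action, reward, done)
last_obs = env.reset() if done else obs
```

In this sketch `tf.stop_gradient` keeps gradients from flowing into the target, mirroring the role of the separate target network: only `q_func_vars` are updated through `total_error`.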
194 | 195 | ##### 196 | 197 | # YOUR CODE HERE 198 | 199 | ##### 200 | 201 | # at this point, the environment should have been advanced one step (and 202 | # reset if done was true), and last_obs should point to the new latest 203 | # observation 204 | 205 | ### 3. Perform experience replay and train the network. 206 | # note that this is only done if the replay buffer contains enough samples 207 | # for us to learn something useful -- until then, the model will not be 208 | # initialized and random actions should be taken 209 | if (t > learning_starts and 210 | t % learning_freq == 0 and 211 | replay_buffer.can_sample(batch_size)): 212 | # Here, you should perform training. Training consists of four steps: 213 | # 3.a: use the replay buffer to sample a batch of transitions (see the 214 | # replay buffer code for function definition, each batch that you sample 215 | # should consist of current observations, current actions, rewards, 216 | # next observations, and done indicator). 217 | # 3.b: initialize the model if it has not been initialized yet; to do 218 | # that, call 219 | # initialize_interdependent_variables(session, tf.global_variables(), { 220 | # obs_t_ph: obs_t_batch, 221 | # obs_tp1_ph: obs_tp1_batch, 222 | # }) 223 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 224 | # the current and next time step. The boolean variable model_initialized 225 | # indicates whether or not the model has been initialized. 226 | # Remember that you have to update the target network too (see 3.d)! 227 | # 3.c: train the model. To do this, you'll need to use the train_fn and 228 | # total_error ops that were created earlier: total_error is what you 229 | # created to compute the total Bellman error in a batch, and train_fn 230 | # will actually perform a gradient step and update the network parameters 231 | # to reduce total_error. When calling session.run on these you'll need to 232 | # populate the following placeholders: 233 | # obs_t_ph 234 | # act_t_ph 235 | # rew_t_ph 236 | # obs_tp1_ph 237 | # done_mask_ph 238 | # (this is needed for computing total_error) 239 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 240 | # (this is needed by the optimizer to choose the learning rate) 241 | # 3.d: periodically update the target network by calling 242 | # session.run(update_target_fn) 243 | # you should update every target_update_freq steps, and you may find the 244 | # variable num_param_updates useful for this (it was initialized to 0) 245 | ##### 246 | 247 | # YOUR CODE HERE 248 | 249 | ##### 250 | 251 | ### 4. 
Log progress 252 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 253 | if len(episode_rewards) > 0: 254 | mean_episode_reward = np.mean(episode_rewards[-100:]) 255 | if len(episode_rewards) > 100: 256 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 257 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 258 | print("Timestep %d" % (t,)) 259 | print("mean reward (100 episodes) %f" % mean_episode_reward) 260 | print("best mean reward %f" % best_mean_episode_reward) 261 | print("episodes %d" % len(episode_rewards)) 262 | print("exploration %f" % exploration.value(t)) 263 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 264 | sys.stdout.flush() 265 | -------------------------------------------------------------------------------- /sp17_hw/hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 
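To make the interpolation rule described above concrete, here is a small worked example (the endpoint values are arbitrary):

```python
# Illustrative only: evaluating a PiecewiseSchedule at a few points.
sched = PiecewiseSchedule([(0, 1.0), (100, 0.1)], outside_value=0.1)
sched.value(0)     # 1.0  -> left endpoint
sched.value(50)    # 0.55 -> halfway between 1.0 and 0.1 (alpha = 0.5)
sched.value(1000)  # 0.1  -> outside every interval, so outside_value is returned
```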
67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. 
This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The sepecific memory optimizations use here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-performance 182 | to cast them back to float32 on GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the tipical use case in Atari Deep RL buffer with 1M frames the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning frame of zeros at the beginning 190 | of the episode, when there is less frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retried for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 228 | 229 | i-th sample transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the epsiode 234 | was done which is represented by `done_mask[i]` which is equal 235 | to 1 if episode has ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 
241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 
58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | 
env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /sp17_hw/hw4/homework.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | 3 | In `main.py` you will find an implementation of a "vanilla" policy gradient method, applied to an MDP with a discrete action space: an episodic version of the classic "cartpole" task. First, make sure the provided code works on your computer by running `python main.py`. We recommend reading through all of the code and comments in the function `main_cartpole`, starting at the top of the function. 4 | 5 | The code computes some useful diagnostics, which you may find helpful to look at while tuning hyperparameters: 6 | 7 | - **KL[policy before update || policy after update]**. Large spikes in KL divergence mean that the optimization took a large step, and sometimes these spikes cause a collapse in performance. 8 | - **Entropy of the policy**. If entropy goes down too fast, then you may not explore enough, but if it goes down too slowly, you'll probably not reach optimal performance. 9 | - **Explained variance of the value function**. If the value function perfectly explains the returns, then it will be 1; if you get a negative result, then it's worse than predicting a constant. 10 | 11 | Software dependencies: 12 | 13 | - tensorflow 14 | - numpy + scipy (Anaconda recommended) 15 | - gym (I'm using 0.8.0, `pip install gym==0.8.0`, but old versions should work just as well) 16 | 17 | ## Problem 1 18 | 19 | Here you will modify the `main_cartpole` policy gradient implementation to work on a continuous action space, specifically, the gym environment `Pendulum-v0`. In `main_cartpole`, note that the neural network outputs "logits" (i.e., log-probabilities up to an additive constant) that specify a categorical distribution. For the pendulum task, on the other hand, your neural network should output the mean of a Gaussian distribution, with a separate parameter vector parameterizing the log standard deviation. For example, you could use the following code: 20 | 21 | ```python 22 | 23 | mean_na = dense(h2, ac_dim, "mean", weight_init=normc_initializer(0.1)) # Mean control output ("mean" is just the variable-scope name required by dense()) 24 | logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer()) # Log standard deviation 25 | 26 | sy_sampled_ac = YOUR_CODE_HERE 27 | sy_logprob_n = YOUR_CODE_HERE 28 | 29 | ``` 30 | 31 | You should also compute differential entropy (replacing `sy_ent`) and KL-divergence (`sy_kl`) for the Gaussian distribution; one possible way to fill in these pieces is sketched in the notes appended at the end of this document. 32 | 33 | The pendulum problem is slightly harder, and using a fixed stepsize does not work reliably---thus, we instead recommend using an adaptive stepsize, where you adjust it based on the KL divergence between the new and old policy. Code for this stepsize adaptation is provided. 34 | 35 | You can plot your results using the script `plot_learning_curves.py` or your own plotting code. 36 | 37 | **Deliverables** 38 | 39 | - Show a plot with the pendulum converging to EpRewMean of at least `-300`.
Include EpRewMean, KL, Entropy in your plots. 40 | - Describe the hyperparameters used and how many timesteps your algorithm took to learn. 41 | 42 | ## Problem 2 43 | 44 | 1. Implement a neural network value function with the same interface as `LinearVF`. Add it to the provided cartpole solver, and compare the performance of the linear and neural network value function (i.e., baseline). 45 | 2. Perform the same comparison--linear vs neural network--for your pendulum solver from Problem 1. You should be able to obtain faster learning using the neural network. 46 | 47 | 48 | **Deliverables** 49 | 50 | - A comparison of linear vs neural network value function on the cartpole. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 51 | - A comparison of linear vs neural network value function on the pendulum. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 52 | 53 | In both cases, list the hyperparameters used for neural network training. 54 | 55 | ## Problem 3 (bonus) 56 | 57 | Implement a more advanced policy gradient method from lecture (such as TRPO, or the advantage function estimator used in A3C or generalized advantage estimation), and apply it to the gym environment `Hopper-v1`. See if you can learn a good gait in less than 500,000 timesteps. 58 | Hint: it may help to standardize your inputs using a running estimate of mean and standard deviation. 59 | 60 | ob_rescaled = (ob_raw - mean) / (stdev + epsilon) 61 | 62 | **Deliverables** 63 | 64 | A description of what you implemented, and learning curves on the Hopper-v1 environment. -------------------------------------------------------------------------------- /sp17_hw/hw4/logz.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | Assumes that each diagnostic gets logged each iteration 5 | 6 | Call logz.configure_output_dir() to start logging to a 7 | tab-separated-values file (some_folder_name/log.txt) 8 | 9 | To load the learning curves, you can do, for example 10 | 11 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 12 | A['EpRewMean'] 13 | 14 | """ 15 | 16 | import os.path as osp, shutil, time, atexit, os, subprocess 17 | 18 | color2num = dict( 19 | gray=30, 20 | red=31, 21 | green=32, 22 | yellow=33, 23 | blue=34, 24 | magenta=35, 25 | cyan=36, 26 | white=37, 27 | crimson=38 28 | ) 29 | 30 | def colorize(string, color, bold=False, highlight=False): 31 | attr = [] 32 | num = color2num[color] 33 | if highlight: num += 10 34 | attr.append(str(num)) 35 | if bold: attr.append('1') 36 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 37 | 38 | class G: 39 | output_dir = None 40 | output_file = None 41 | first_row = True 42 | log_headers = [] 43 | log_current_row = {} 44 | 45 | def configure_output_dir(d=None): 46 | """ 47 | Set output directory to d, or to /tmp/somerandomnumber if d is None 48 | """ 49 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 50 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 51 | os.makedirs(G.output_dir) 52 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 53 | atexit.register(G.output_file.close) 54 | try: 55 | cmd = "cd %s && git diff > %s 2>/dev/null"%(osp.dirname(__file__), osp.join(G.output_dir, "a.diff")) 56 | subprocess.check_call(cmd, shell=True) # Save git diff to experiment directory 57 | except subprocess.CalledProcessError: 58 | print("configure_output_dir: not storing the git diff, probably because you're not in a git repo") 59 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 60 | 61 | def log_tabular(key, val): 62 | """ 63 | Log a value of some diagnostic 64 | Call this once for each diagnostic quantity, each iteration 65 | """ 66 | if G.first_row: 67 | G.log_headers.append(key) 68 | else: 69 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 70 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 71 | G.log_current_row[key] = val 72 | 73 | def dump_tabular(): 74 | """ 75 | Write all of the diagnostics from the current iteration 76 | """ 77 | vals = [] 78 | print("-"*37) 79 | for key in G.log_headers: 80 | val = G.log_current_row.get(key, "") 81 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 82 | else: valstr = val 83 | print("| %15s | %15s |"%(key, valstr)) 84 | vals.append(val) 85 | print("-"*37) 86 | if G.output_file is not None: 87 | if G.first_row: 88 | G.output_file.write("\t".join(G.log_headers)) 89 | G.output_file.write("\n") 90 | G.output_file.write("\t".join(map(str,vals))) 91 | G.output_file.write("\n") 92 | G.output_file.flush() 93 | G.log_current_row.clear() 94 | G.first_row=False -------------------------------------------------------------------------------- /sp17_hw/hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import logz 5 | import scipy.signal 6 | 7 | def normc_initializer(std=1.0): 8 | """ 9 | Initialize array with normalized columns 10 | """ 11 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 12 | out = np.random.randn(*shape).astype(np.float32) 13 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 14 | return tf.constant(out) 15 | return _initializer 16 | 17 | 18 | def dense(x, size, name, weight_init=None): 19 | """ 20 | Dense (fully connected) layer 21 | """ 22 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 23 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 24 | return tf.matmul(x, w) + b 25 | 26 | def fancy_slice_2d(X, inds0, inds1): 27 | """ 28 | Like numpy's X[inds0, inds1] 29 | """ 30 | inds0 = tf.cast(inds0, tf.int64) 31 | inds1 = tf.cast(inds1, tf.int64) 32 | shape = tf.cast(tf.shape(X), tf.int64) 33 | ncols = shape[1] 34 | Xflat = tf.reshape(X, [-1]) 35 | return tf.gather(Xflat, inds0 * ncols + inds1) 36 | 37 | def discount(x, gamma): 38 | """ 39 | Compute discounted sum of future values 40 | out[i] = in[i] + gamma * in[i+1] + gamma^2 * in[i+2] + ... 41 | """ 42 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 43 | 44 | def explained_variance_1d(ypred,y): 45 | """ 46 | Computes 1 - Var[y - ypred] / Var[y], the proportion of the variance in y explained by ypred.
47 | https://www.quora.com/What-is-the-meaning-proportion-of-variance-explained-in-linear-regression 48 | """ 49 | assert y.ndim == 1 and ypred.ndim == 1 50 | vary = np.var(y) 51 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 52 | 53 | def categorical_sample_logits(logits): 54 | """ 55 | Samples (symbolically) from categorical distribution, where logits is an NxK 56 | matrix specifying N categorical distributions with K categories 57 | 58 | specifically, exp(logits) / sum( exp(logits), axis=1 ) is the 59 | probabilities of the different classes 60 | 61 | Cleverly uses the Gumbel trick, based on 62 | https://github.com/tensorflow/tensorflow/issues/456 63 | """ 64 | U = tf.random_uniform(tf.shape(logits)) 65 | return tf.argmax(logits - tf.log(-tf.log(U)), dimension=1) 66 | 67 | def pathlength(path): 68 | return len(path["reward"]) 69 | 70 | class LinearValueFunction(object): 71 | coef = None 72 | def fit(self, X, y): 73 | Xp = self.preproc(X) 74 | A = Xp.T.dot(Xp) 75 | nfeats = Xp.shape[1] 76 | A[np.arange(nfeats), np.arange(nfeats)] += 1e-3 # a little ridge regression 77 | b = Xp.T.dot(y) 78 | self.coef = np.linalg.solve(A, b) 79 | def predict(self, X): 80 | if self.coef is None: 81 | return np.zeros(X.shape[0]) 82 | else: 83 | return self.preproc(X).dot(self.coef) 84 | def preproc(self, X): 85 | return np.concatenate([np.ones([X.shape[0], 1]), X, np.square(X)/2.0], axis=1) 86 | 87 | class NnValueFunction(object): 88 | pass # YOUR CODE HERE 89 | 90 | def lrelu(x, leak=0.2): 91 | f1 = 0.5 * (1 + leak) 92 | f2 = 0.5 * (1 - leak) 93 | return f1 * x + f2 * abs(x) 94 | 95 | 96 | 97 | def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): 98 | env = gym.make("CartPole-v0") 99 | ob_dim = env.observation_space.shape[0] 100 | num_actions = env.action_space.n 101 | logz.configure_output_dir(logdir) 102 | vf = LinearValueFunction() 103 | 104 | # Symbolic variables have the prefix sy_, to distinguish them from the numerical values 105 | # that are computed later in this function 106 | sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations 107 | sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation 108 | sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate 109 | sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer 110 | sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer 111 | # we use a small initialization for the last layer, so the initial policy has maximal entropy 112 | sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) 113 | sy_logp_na = tf.nn.log_softmax(sy_logits_na) # log-probability of actions 114 | sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) 115 | sy_n = tf.shape(sy_ob_no)[0] 116 | sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation 117 | 118 | # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> 119 | sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) 120 | sy_oldp_na = 
tf.exp(sy_oldlogp_na) 121 | sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) 122 | sy_p_na = tf.exp(sy_logp_na) 123 | sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) 124 | # <<<<<<<<<<<<< 125 | 126 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 127 | 128 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 129 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 130 | 131 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 132 | # use single thread. on such a small problem, multithreading gives you a slowdown 133 | # this way, we can better use multiple cores for different experiments 134 | sess = tf.Session(config=tf_config) 135 | sess.__enter__() # equivalent to `with sess:` 136 | tf.global_variables_initializer().run() #pylint: disable=E1101 137 | 138 | total_timesteps = 0 139 | 140 | for i in range(n_iter): 141 | print("********** Iteration %i ************"%i) 142 | 143 | # Collect paths until we have enough timesteps 144 | timesteps_this_batch = 0 145 | paths = [] 146 | while True: 147 | ob = env.reset() 148 | terminated = False 149 | obs, acs, rewards = [], [], [] 150 | animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) 151 | while True: 152 | if animate_this_episode: 153 | env.render() 154 | obs.append(ob) 155 | ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) 156 | acs.append(ac) 157 | ob, rew, done, _ = env.step(ac) 158 | rewards.append(rew) 159 | if done: 160 | break 161 | path = {"observation" : np.array(obs), "terminated" : terminated, 162 | "reward" : np.array(rewards), "action" : np.array(acs)} 163 | paths.append(path) 164 | timesteps_this_batch += pathlength(path) 165 | if timesteps_this_batch > min_timesteps_per_batch: 166 | break 167 | total_timesteps += timesteps_this_batch 168 | # Estimate advantage function 169 | vtargs, vpreds, advs = [], [], [] 170 | for path in paths: 171 | rew_t = path["reward"] 172 | return_t = discount(rew_t, gamma) 173 | vpred_t = vf.predict(path["observation"]) 174 | adv_t = return_t - vpred_t 175 | advs.append(adv_t) 176 | vtargs.append(return_t) 177 | vpreds.append(vpred_t) 178 | 179 | # Build arrays for policy update 180 | ob_no = np.concatenate([path["observation"] for path in paths]) 181 | ac_n = np.concatenate([path["action"] for path in paths]) 182 | adv_n = np.concatenate(advs) 183 | standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) 184 | vtarg_n = np.concatenate(vtargs) 185 | vpred_n = np.concatenate(vpreds) 186 | vf.fit(ob_no, vtarg_n) 187 | 188 | # Policy update 189 | _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) 190 | kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) 191 | 192 | # Log diagnostics 193 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 194 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 195 | logz.log_tabular("KLOldNew", kl) 196 | logz.log_tabular("Entropy", ent) 197 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 198 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 199 | 
logz.log_tabular("TimestepsSoFar", total_timesteps) 200 | # If you're overfitting, EVAfter will be way larger than EVBefore. 201 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 202 | logz.dump_tabular() 203 | 204 | def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): 205 | tf.set_random_seed(seed) 206 | np.random.seed(seed) 207 | env = gym.make("Pendulum-v0") 208 | ob_dim = env.observation_space.shape[0] 209 | ac_dim = env.action_space.shape[0] 210 | logz.configure_output_dir(logdir) 211 | if vf_type == 'linear': 212 | vf = LinearValueFunction(**vf_params) 213 | elif vf_type == 'nn': 214 | vf = NnValueFunction(ob_dim=ob_dim, **vf_params) 215 | 216 | 217 | YOUR_CODE_HERE 218 | 219 | 220 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 221 | 222 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 223 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 224 | 225 | sess = tf.Session() 226 | sess.__enter__() # equivalent to `with sess:` 227 | tf.global_variables_initializer().run() #pylint: disable=E1101 228 | 229 | total_timesteps = 0 230 | stepsize = initial_stepsize 231 | 232 | for i in range(n_iter): 233 | print("********** Iteration %i ************"%i) 234 | 235 | YOUR_CODE_HERE 236 | 237 | if kl > desired_kl * 2: 238 | stepsize /= 1.5 239 | print('stepsize -> %s'%stepsize) 240 | elif kl < desired_kl / 2: 241 | stepsize *= 1.5 242 | print('stepsize -> %s'%stepsize) 243 | else: 244 | print('stepsize OK') 245 | 246 | 247 | # Log diagnostics 248 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 249 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 250 | logz.log_tabular("KLOldNew", kl) 251 | logz.log_tabular("Entropy", ent) 252 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 253 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 254 | logz.log_tabular("TimestepsSoFar", total_timesteps) 255 | # If you're overfitting, EVAfter will be way larger than EVBefore. 
256 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 257 | logz.dump_tabular() 258 | 259 | 260 | def main_pendulum1(d): 261 | return main_pendulum(**d) 262 | 263 | if __name__ == "__main__": 264 | if 1: 265 | main_cartpole(logdir=None) # when you want to start collecting results, set the logdir 266 | if 0: 267 | general_params = dict(gamma=0.97, animate=False, min_timesteps_per_batch=2500, n_iter=300, initial_stepsize=1e-3) 268 | params = [ 269 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 270 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 271 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 272 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 273 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 274 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 275 | ] 276 | import multiprocessing 277 | p = multiprocessing.Pool() 278 | p.map(main_pendulum1, params) 279 | -------------------------------------------------------------------------------- /sp17_hw/hw4/plot_learning_curves.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser() 3 | parser.add_argument("expdir", help="experiment dir, e.g., /tmp/experiments") 4 | args = parser.parse_args() 5 | 6 | from pylab import * 7 | import os 8 | from os.path import join 9 | 10 | dirnames = os.listdir(args.expdir) 11 | 12 | fig, axes = subplots(4) 13 | for dirname in dirnames: 14 | print(dirname) 15 | A = np.genfromtxt(join(args.expdir, dirname, 'log.txt'),delimiter='\t',dtype=None, names=True) 16 | # axes[0].plot(scipy.signal.savgol_filter(A['EpRewMean'] , 21, 3), '-x') 17 | x = A['TimestepsSoFar'] 18 | axes[0].plot(x, A['EpRewMean'], '-x') 19 | axes[1].plot(x, A['KLOldNew'], '-x') 20 | axes[2].plot(x, A['Entropy'], '-x') 21 | axes[3].plot(x, A['EVBefore'], '-x') 22 | legend(dirnames,loc='best').draggable() 23 | axes[0].set_ylabel("EpRewMean") 24 | axes[1].set_ylabel("KLOldNew") 25 | axes[2].set_ylabel("Entropy") 26 | axes[3].set_ylabel("EVBefore") 27 | axes[3].set_ylim(-1,1) 28 | axes[-1].set_xlabel("TimestepsSoFar") 29 | show() 30 | --------------------------------------------------------------------------------
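
Problem 1 in `sp17_hw/hw4/homework.md` leaves `sy_sampled_ac` and `sy_logprob_n` as placeholders and asks for Gaussian replacements for `sy_ent` and `sy_kl`. The block below is a minimal sketch of one way a diagonal-Gaussian policy head could be wired up, assuming the TF 1.x API used throughout this repo; the `sy_*` names, the hidden-layer size, and the use of `tf.contrib.layers` are illustrative assumptions, not the course's reference solution.

```python
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

ac_dim = 1   # Pendulum-v0 actions are 1-D
ob_dim = 3   # Pendulum-v0 observations are 3-D

sy_ob_no = tf.placeholder(tf.float32, [None, ob_dim], name="ob")
sy_ac_na = tf.placeholder(tf.float32, [None, ac_dim], name="ac")            # actions actually taken
sy_oldmean_na = tf.placeholder(tf.float32, [None, ac_dim], name="oldmean")  # pre-update mean (KL diagnostic)
sy_oldlogstd_a = tf.placeholder(tf.float32, [ac_dim], name="oldlogstd")     # pre-update log-std (KL diagnostic)

sy_h1 = layers.fully_connected(sy_ob_no, 32, activation_fn=tf.nn.tanh)
sy_mean_na = layers.fully_connected(sy_h1, ac_dim, activation_fn=None)      # state-dependent mean
sy_logstd_a = tf.get_variable("logstd", [ac_dim], initializer=tf.zeros_initializer())  # state-independent log-std
sy_std_a = tf.exp(sy_logstd_a)

# Sample an action as mean + std * z with z ~ N(0, I); [0] picks out a single action for env.step
sy_sampled_ac = (sy_mean_na + sy_std_a * tf.random_normal(tf.shape(sy_mean_na)))[0]

# Log-density of the taken actions under the current diagonal Gaussian
sy_z_na = (sy_ac_na - sy_mean_na) / sy_std_a
sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square(sy_z_na), axis=1)
                - tf.reduce_sum(sy_logstd_a)
                - 0.5 * ac_dim * np.log(2.0 * np.pi))

# Differential entropy of the policy (state-independent, since the std is)
sy_ent = tf.reduce_sum(sy_logstd_a + 0.5 * np.log(2.0 * np.pi * np.e))

# KL(old || new) between diagonal Gaussians, averaged over the batch
sy_oldstd_a = tf.exp(sy_oldlogstd_a)
sy_kl = tf.reduce_mean(tf.reduce_sum(
    sy_logstd_a - sy_oldlogstd_a
    + (tf.square(sy_oldstd_a) + tf.square(sy_oldmean_na - sy_mean_na)) / (2.0 * tf.square(sy_std_a))
    - 0.5, axis=1))
```

With pieces like these in place, the surrogate loss and the adaptive-stepsize loop already provided in `main_pendulum` can be reused as-is; `sy_oldmean_na` and `sy_oldlogstd_a` would be fed with the pre-update values, mirroring how `sy_oldlogits_na` is used in `main_cartpole`.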
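
Problem 2 asks for a neural-network value function with the same `fit`/`predict` interface as `LinearValueFunction`. Below is one possible shape for it, again only a hedged sketch: the hidden sizes, activation, and use of `tf.contrib.layers` are assumptions, while the constructor keywords (`ob_dim`, `n_epochs`, `stepsize`) mirror the `vf_params` dict in `main.py`'s `__main__` block. It assumes the object is constructed before `tf.global_variables_initializer()` runs and is used inside the default session that `main.py` enters.

```python
import tensorflow as tf
import tensorflow.contrib.layers as layers

class NnValueFunction(object):
    """Sketch of a neural-network baseline with LinearValueFunction's fit/predict interface."""
    def __init__(self, ob_dim, n_epochs=10, stepsize=1e-3):
        self.n_epochs = n_epochs
        with tf.variable_scope("nnvf"):
            self.sy_ob = tf.placeholder(tf.float32, shape=[None, ob_dim])
            self.sy_targ = tf.placeholder(tf.float32, shape=[None])
            h1 = layers.fully_connected(self.sy_ob, 64, activation_fn=tf.nn.relu)
            h2 = layers.fully_connected(h1, 64, activation_fn=tf.nn.relu)
            self.sy_vpred = layers.fully_connected(h2, 1, activation_fn=None)[:, 0]
            loss = tf.reduce_mean(tf.square(self.sy_vpred - self.sy_targ))
            self.train_op = tf.train.AdamOptimizer(stepsize).minimize(loss)

    def fit(self, X, y):
        # a few epochs of full-batch regression onto the empirical returns
        sess = tf.get_default_session()
        for _ in range(self.n_epochs):
            sess.run(self.train_op, feed_dict={self.sy_ob: X, self.sy_targ: y})

    def predict(self, X):
        sess = tf.get_default_session()
        return sess.run(self.sy_vpred, feed_dict={self.sy_ob: X})
```

A class along these lines plugs into the existing `vf_type='nn'` branch of `main_pendulum` without further changes, which is what the `EVBefore`/`EpRewMean` comparison in the deliverables relies on.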
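
The hint for Problem 3 suggests standardizing observations with running estimates of their mean and standard deviation. One way to keep such estimates is a Welford-style streaming update; the class below is an illustrative sketch only, and the name `RunningStandardizer` and its exact update rule are assumptions rather than provided course code.

```python
import numpy as np

class RunningStandardizer(object):
    """Keeps running per-dimension mean/std and rescales observations with them."""
    def __init__(self, shape, epsilon=1e-8):
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # running sum of squared deviations from the mean
        self.count = 0
        self.epsilon = epsilon

    def update(self, x):
        # Welford's online update for a single observation
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)

    def __call__(self, ob_raw):
        stdev = np.sqrt(self.m2 / max(self.count - 1, 1))
        return (ob_raw - self.mean) / (stdev + self.epsilon)

# Possible usage inside a rollout loop:
#   standardizer = RunningStandardizer(env.observation_space.shape)
#   standardizer.update(ob)
#   ob_rescaled = standardizer(ob)
```

Passing each raw observation through `update()` and then rescaling it before feeding the policy matches the `ob_rescaled = (ob_raw - mean) / (stdev + epsilon)` line in the homework hint.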