├── .gitignore
├── README.md
├── main.py
├── run.py
├── space_conversion.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
logs_*
*.pyc
*.swp
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TRPO

This repo implements a TRPO agent (Trust Region Policy Optimization, http://arxiv.org/abs/1502.05477).
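
To train on a single gym algorithmic task, pass the environment id as the first argument to `main.py` (it defaults to `RepeatCopy-v0`), e.g. `python main.py Copy-v0`; `run.py` launches the four tasks it lists in detached `screen` sessions. The code targets Python 2 with an early TensorFlow API (`tf.initialize_all_variables`, `tf.concat(0, ...)`), `prettytensor`, and a `gym` release old enough to still provide `env.monitor` and `gym.upload`.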

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from utils import *
import numpy as np
import random
import tensorflow as tf
import time
import os
import logging
import gym
from gym import envs, scoreboard
from gym.spaces import Discrete, Box
import prettytensor as pt
from space_conversion import SpaceConversionEnv
import tempfile
import sys

class TRPOAgent(object):

    config = dict2(**{
        "timesteps_per_batch": 1000,
        "max_pathlength": 10000,
        "max_kl": 0.01,
        "cg_damping": 0.1,
        "gamma": 0.95})

    def __init__(self, env):
        self.env = env
        if not isinstance(env.observation_space, Box) or \
           not isinstance(env.action_space, Discrete):
            print("Incompatible spaces.")
            exit(-1)
        print("Observation Space", env.observation_space)
        print("Action Space", env.action_space)
        self.session = tf.Session()
        self.end_count = 0
        self.train = True
        # Network input: current observation, previous observation and previous
        # one-hot action, hence 2 * obs_dim + n_actions features.
        self.obs = obs = tf.placeholder(
            dtype, shape=[
                None, 2 * env.observation_space.shape[0] + env.action_space.n], name="obs")
        self.prev_obs = np.zeros((1, env.observation_space.shape[0]))
        self.prev_action = np.zeros((1, env.action_space.n))
        self.action = action = tf.placeholder(tf.int64, shape=[None], name="action")
        self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant")
        self.oldaction_dist = oldaction_dist = tf.placeholder(dtype, shape=[None, env.action_space.n], name="oldaction_dist")

        # Create neural network.
        action_dist_n, _ = (pt.wrap(self.obs).
                            fully_connected(64, activation_fn=tf.nn.tanh).
                            softmax_classifier(env.action_space.n))
        eps = 1e-6
        self.action_dist_n = action_dist_n
        N = tf.shape(obs)[0]
        p_n = slice_2d(action_dist_n, tf.range(0, N), action)
        oldp_n = slice_2d(oldaction_dist, tf.range(0, N), action)
        ratio_n = p_n / oldp_n
        Nf = tf.cast(N, dtype)
        surr = -tf.reduce_mean(ratio_n * advant)  # Surrogate loss
        var_list = tf.trainable_variables()
        kl = tf.reduce_sum(oldaction_dist * tf.log((oldaction_dist + eps) / (action_dist_n + eps))) / Nf
        ent = tf.reduce_sum(-action_dist_n * tf.log(action_dist_n + eps)) / Nf

        self.losses = [surr, kl, ent]
        self.pg = flatgrad(surr, var_list)
        # KL divergence where the first argument is fixed:
        # replace "old" with tf.stop_gradient in the kl expression above.
        kl_firstfixed = tf.reduce_sum(tf.stop_gradient(
            action_dist_n) * tf.log(tf.stop_gradient(action_dist_n + eps) / (action_dist_n + eps))) / Nf
        grads = tf.gradients(kl_firstfixed, var_list)
        self.flat_tangent = tf.placeholder(dtype, shape=[None])
        shapes = map(var_shape, var_list)
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
            tangents.append(param)
            start += size
        gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        self.fvp = flatgrad(gvp, var_list)
        self.gf = GetFlat(self.session, var_list)
        self.sff = SetFromFlat(self.session, var_list)
        self.vf = VF(self.session)
        self.session.run(tf.initialize_all_variables())
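
    # How the pieces above fit together (summary of the TRPO update):
    #   surr = -E[ pi(a|s) / pi_old(a|s) * advantage ]  is the surrogate loss to minimize;
    #   kl   = KL(pi_old || pi), reported each iteration and used to reject steps
    #          whose realized KL exceeds 2 * max_kl;
    #   fvp  = Fisher-vector product: differentiating kl_firstfixed once gives grads,
    #          dotting them with a flat tangent and differentiating again yields
    #          F * tangent without ever forming the Fisher matrix F.
    # learn() combines self.pg (policy gradient), self.fvp and conjugate_gradient()
    # to take an approximate natural-gradient step subject to the max_kl constraint.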

    def act(self, obs, *args):
        obs = np.expand_dims(obs, 0)
        # Build the network input before overwriting prev_obs, so it really holds
        # the previous observation rather than a second copy of the current one.
        obs_new = np.concatenate([obs, self.prev_obs, self.prev_action], 1)
        self.prev_obs = obs

        action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs_new})

        if self.train:
            action = int(cat_sample(action_dist_n)[0])
        else:
            action = int(np.argmax(action_dist_n))
        self.prev_action *= 0.0
        self.prev_action[0, action] = 1.0
        return action, action_dist_n, np.squeeze(obs_new)

    def learn(self):
        config = self.config
        start_time = time.time()
        numeptotal = 0
        i = 0
        while True:
            # Generating paths.
            print("Rollout")
            paths = rollout(
                self.env,
                self,
                config.max_pathlength,
                config.timesteps_per_batch)

            # Computing returns and estimating advantage function.
            for path in paths:
                path["baseline"] = self.vf.predict(path)
                path["returns"] = discount(path["rewards"], config.gamma)
                path["advant"] = path["returns"] - path["baseline"]

            # Updating policy.
            action_dist_n = np.concatenate([path["action_dists"] for path in paths])
            obs_n = np.concatenate([path["obs"] for path in paths])
            action_n = np.concatenate([path["actions"] for path in paths])
            baseline_n = np.concatenate([path["baseline"] for path in paths])
            returns_n = np.concatenate([path["returns"] for path in paths])

            # Standardize the advantage function to have mean=0 and std=1.
            advant_n = np.concatenate([path["advant"] for path in paths])
            advant_n -= advant_n.mean()
            advant_n /= (advant_n.std() + 1e-8)

            feed = {self.obs: obs_n,
                    self.action: action_n,
                    self.advant: advant_n,
                    self.oldaction_dist: action_dist_n}

            episoderewards = np.array(
                [path["rewards"].sum() for path in paths])

            print("\n********** Iteration %i ************" % i)
            if episoderewards.mean() > 1.1 * self.env._env.spec.reward_threshold:
                self.train = False
            if not self.train:
                print("Episode mean: %f" % episoderewards.mean())
                self.end_count += 1
                if self.end_count > 100:
                    break
            if self.train:
                # Fit the baseline (value function) for the next iteration.
                self.vf.fit(paths)
                thprev = self.gf()

                def fisher_vector_product(p):
                    feed[self.flat_tangent] = p
                    return self.session.run(self.fvp, feed) + config.cg_damping * p

                g = self.session.run(self.pg, feed_dict=feed)
                stepdir = conjugate_gradient(fisher_vector_product, -g)
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / config.max_kl)
                fullstep = stepdir / lm
                neggdotstepdir = -g.dot(stepdir)
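
                # Step-size choice: under the quadratic approximation KL ~= 0.5 * s^T F s,
                # the largest step along stepdir that satisfies the constraint is
                # stepdir / lm with lm = sqrt(0.5 * stepdir^T F stepdir / max_kl),
                # i.e. sqrt(shs / max_kl) computed above. linesearch() then backtracks
                # from this full step until the surrogate loss actually improves, using
                # neggdotstepdir / lm as the expected improvement rate.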

                def loss(th):
                    self.sff(th)
                    return self.session.run(self.losses[0], feed_dict=feed)
                theta = linesearch(loss, thprev, fullstep, neggdotstepdir / lm)
                self.sff(theta)

                surrafter, kloldnew, entropy = self.session.run(
                    self.losses, feed_dict=feed)
                if kloldnew > 2.0 * config.max_kl:
                    self.sff(thprev)

                stats = {}

                numeptotal += len(episoderewards)
                stats["Total number of episodes"] = numeptotal
                stats["Average sum of rewards per episode"] = episoderewards.mean()
                stats["Entropy"] = entropy
                exp = explained_variance(np.array(baseline_n), np.array(returns_n))
                stats["Baseline explained"] = exp
                stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
                stats["KL between old and new distribution"] = kloldnew
                stats["Surrogate loss"] = surrafter
                for k, v in stats.iteritems():
                    print(k + ": " + " " * (40 - len(k)) + str(v))
                if entropy != entropy:  # NaN check: stop if the policy entropy diverged.
                    exit(-1)
                if exp > 0.8:
                    self.train = False
            i += 1

training_dir = tempfile.mkdtemp()
logging.getLogger().setLevel(logging.DEBUG)

if len(sys.argv) > 1:
    task = sys.argv[1]
else:
    task = "RepeatCopy-v0"

env = envs.make(task)
env.monitor.start(training_dir)

env = SpaceConversionEnv(env, Box, Discrete)

agent = TRPOAgent(env)
agent.learn()
env.monitor.close()
gym.upload(training_dir,
           algorithm_id='trpo_ff')

--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import os

tasks = ["Copy-v0", "DuplicatedInput-v0", "Reverse-v0", "RepeatCopy-v0"]

os.system("rm logs_*")
os.system("k screen")  # "k" is presumably a local shell alias that kills existing screen sessions.
os.system("screen -wipe")


for t in tasks:
    os.system("screen -dm -S trpo_%s bash -c '. ~/.profile; . ~/.bashrc; CUDA_VISIBLE_DEVICES=[] python main.py %s 2>&1 | tee logs_%s ; bash'" % (t, t, t))
--------------------------------------------------------------------------------
/space_conversion.py:
--------------------------------------------------------------------------------
"""
`SpaceConversionEnv` is a wrapper around any environment. It allows
converting certain action and observation spaces into others.
"""

import numpy as np
from gym.spaces import Discrete, Box, Tuple
from gym import Env


def box2box4obj(x, old_space_obj, new_space_obj):
    assert(old_space_obj.contains(x))
    action = np.reshape(x, new_space_obj.shape)
    assert(new_space_obj.contains(action))
    return action

def box2box4class(box_space):
    shape = np.prod(box_space.shape)
    low = box_space.low
    high = box_space.high
    if isinstance(low, np.ndarray):
        low = np.reshape(low, (shape, ))
    if isinstance(high, np.ndarray):
        high = np.reshape(high, (shape, ))
    return Box(low, high)

def discrete2tuple4obj(x, discrete_space, tuple_space):
    assert(discrete_space.contains(x))
    action = []
    for space in tuple_space.spaces:
        assert(isinstance(space, Discrete))
        action.append(x % space.n)
        x = int(x / space.n)
    action = tuple(action)
    assert(tuple_space.contains(action))
    return action

def tuple2discrete4obj(x, old_space_obj, new_space_obj):
    assert(False)

def tuple2discrete4class(tuple_space):
    n = 1
    for space in tuple_space.spaces:
        assert(isinstance(space, Discrete))
        n *= space.n
    return Discrete(n)

def box2discrete4obj(x, box_space, discrete_space):
    assert(False)

def discrete2box4obj(x, discrete_space, box_space):
    ret = np.zeros(discrete_space.n)
    ret[x] = 1.0
    return ret

def discrete2box4class(discrete_space):
    return Box(0.0, 1.0, discrete_space.n)

def ident4obj(x, old_space_obj, new_space_obj):
    return x
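
# Example of the conversions main.py relies on (illustrative values, not tied to a
# specific task): a Discrete(6) observation space becomes Box(0.0, 1.0, 6) and
# observation 2 becomes the one-hot vector [0, 0, 1, 0, 0, 0]; an action space
# Tuple(Discrete(2), Discrete(2), Discrete(5)) becomes Discrete(20), and the agent's
# integer action is decoded back into the tuple by the mixed-radix scheme in
# discrete2tuple4obj (take x % n for each component, then divide x by n).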

class SpaceConversionEnv(Env):
    convertable = {(Tuple, Discrete): (tuple2discrete4obj, discrete2tuple4obj, tuple2discrete4class),
                   (Discrete, Box): (discrete2box4obj, box2discrete4obj, discrete2box4class),
                   (Box, Box): (box2box4obj, box2box4obj, box2box4class)}

    def __init__(self, env, target_observation_space=None, target_action_space=None, verbose=False):
        self._verbose = verbose
        self._env = env
        self.action_convert = None
        self.observation_convert = None
        for pairs, convert in self.convertable.iteritems():
            if env.action_space.__class__ == pairs[0] and \
               target_action_space == pairs[1] and \
               self.action_convert is None:
                self.action_convert = convert[1]
                self._action_space_ = convert[2](env.action_space)
            if env.observation_space.__class__ == pairs[0] and \
               target_observation_space == pairs[1] and \
               self.observation_convert is None:
                self.observation_convert = convert[0]
                self._observation_space_ = convert[2](env.observation_space)

        if self.action_convert is None and \
           (env.action_space.__class__ == target_action_space or
                target_action_space is None):
            self.action_convert = ident4obj
            self._action_space_ = env.action_space
        if self.observation_convert is None and \
           (env.observation_space.__class__ == target_observation_space or
                target_observation_space is None):
            self.observation_convert = ident4obj
            self._observation_space_ = env.observation_space

        assert(self.action_convert is not None)
        assert(self.observation_convert is not None)

    def step(self, action, **kwargs):
        conv_action = self.action_convert(action, self.action_space, self._env.action_space)
        if self._verbose and self.action_convert != ident4obj:
            print("Input action: %s, converted action: %s" % (action, conv_action))
        step = self._env.step(conv_action, **kwargs)
        observation, reward, done, info = step

        conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space)

        if self._verbose and self.observation_convert != ident4obj:
            print("Input observation: %s, converted observation: %s" % (observation, conv_observation))
        return conv_observation, reward, done, {}

    def reset(self, **kwargs):
        observation = self._env.reset(**kwargs)
        conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space)

        if self._verbose and self.observation_convert != ident4obj:
            print("Input observation: %s, converted observation: %s" % (observation, conv_observation))
        return conv_observation

    @property
    def action_space(self):
        return self._action_space_

    @property
    def observation_space(self):
        return self._observation_space_

    def __getattr__(self, field):
        """
        Proxy everything else to the underlying env.
        """
        if hasattr(self._env, field):
            return getattr(self._env, field)
        raise AttributeError(field)

    def __repr__(self):
        if "object at" not in str(self._env):
            env_name = str(self._env)
        else:
            env_name = self._env.__class__.__name__
        return env_name
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
import random
import scipy.signal
import prettytensor as pt

seed = 1
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

dtype = tf.float32

def discount(x, gamma):
    assert x.ndim >= 1
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
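
# discount() computes discounted returns right-to-left with an IIR filter:
# reversing x and filtering with b=[1], a=[1, -gamma] realizes the recurrence
# ret[t] = x[t] + gamma * ret[t + 1]. For example (illustrative numbers),
# discount(np.array([1.0, 1.0, 1.0]), 0.95) gives
# [1 + 0.95 * 1.95, 1 + 0.95 * 1.0, 1.0] = [2.8525, 1.95, 1.0].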

def rollout(env, agent, max_pathlength, n_timesteps):
    paths = []
    timesteps_sofar = 0
    while timesteps_sofar < n_timesteps:
        obs, actions, rewards, action_dists = [], [], [], []
        ob = env.reset()
        agent.prev_action *= 0.0
        agent.prev_obs *= 0.0
        for _ in xrange(max_pathlength):
            action, action_dist, ob = agent.act(ob)
            obs.append(ob)
            actions.append(action)
            action_dists.append(action_dist)
            res = env.step(action)
            ob = res[0]
            rewards.append(res[1])
            if res[2]:
                path = {"obs": np.concatenate(np.expand_dims(obs, 0)),
                        "action_dists": np.concatenate(action_dists),
                        "rewards": np.array(rewards),
                        "actions": np.array(actions)}
                paths.append(path)
                agent.prev_action *= 0.0
                agent.prev_obs *= 0.0
                break
        # Count the steps actually taken; `path` may not exist if the episode
        # never terminated within max_pathlength.
        timesteps_sofar += len(rewards)
    return paths


class VF(object):
    coeffs = None

    def __init__(self, session):
        self.net = None
        self.session = session

    def create_net(self, shape):
        print(shape)
        self.x = tf.placeholder(tf.float32, shape=[None, shape], name="x")
        self.y = tf.placeholder(tf.float32, shape=[None], name="y")
        self.net = (pt.wrap(self.x).
                    fully_connected(64, activation_fn=tf.nn.relu).
                    fully_connected(64, activation_fn=tf.nn.relu).
                    fully_connected(1))
        self.net = tf.reshape(self.net, (-1, ))
        l2 = (self.net - self.y) * (self.net - self.y)
        self.train = tf.train.AdamOptimizer().minimize(l2)
        self.session.run(tf.initialize_all_variables())

    def _features(self, path):
        o = path["obs"].astype('float32')
        o = o.reshape(o.shape[0], -1)
        act = path["action_dists"].astype('float32')
        l = len(path["rewards"])
        al = np.arange(l).reshape(-1, 1) / 10.0
        ret = np.concatenate([o, act, al, np.ones((l, 1))], axis=1)
        return ret

    def fit(self, paths):
        featmat = np.concatenate([self._features(path) for path in paths])
        if self.net is None:
            self.create_net(featmat.shape[1])
        returns = np.concatenate([path["returns"] for path in paths])
        for _ in range(50):
            self.session.run(self.train, {self.x: featmat, self.y: returns})

    def predict(self, path):
        if self.net is None:
            return np.zeros(len(path["rewards"]))
        else:
            ret = self.session.run(self.net, {self.x: self._features(path)})
            return np.reshape(ret, (ret.shape[0], ))


def cat_sample(prob_nk):
    assert prob_nk.ndim == 2
    N = prob_nk.shape[0]
    csprob_nk = np.cumsum(prob_nk, axis=1)
    out = np.zeros(N, dtype='i')
    for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)):
        for (k, csprob) in enumerate(csprob_k):
            if csprob > r:
                out[n] = k
                break
    return out


def var_shape(x):
    out = [k.value for k in x.get_shape()]
    assert all(isinstance(a, int) for a in out), \
        "shape function assumes that shape is fully known"
    return out


def numel(x):
    return np.prod(var_shape(x))


def flatgrad(loss, var_list):
    grads = tf.gradients(loss, var_list)
    return tf.concat(0, [tf.reshape(grad, [numel(v)])
                         for (v, grad) in zip(var_list, grads)])


class SetFromFlat(object):

    def __init__(self, session, var_list):
        self.session = session
        shapes = map(var_shape, var_list)
        total_size = sum(np.prod(shape) for shape in shapes)
        self.theta = theta = tf.placeholder(dtype, [total_size])
        start = 0
        assigns = []
        for (shape, v) in zip(shapes, var_list):
            size = np.prod(shape)
            assigns.append(
                tf.assign(
                    v,
                    tf.reshape(theta[start:start + size], shape)))
            start += size
        self.op = tf.group(*assigns)

    def __call__(self, theta):
        self.session.run(self.op, feed_dict={self.theta: theta})


class GetFlat(object):

    def __init__(self, session, var_list):
        self.session = session
        self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list])

    def __call__(self):
        return self.op.eval(session=self.session)


def slice_2d(x, inds0, inds1):
    inds0 = tf.cast(inds0, tf.int64)
    inds1 = tf.cast(inds1, tf.int64)
    shape = tf.cast(tf.shape(x), tf.int64)
    ncols = shape[1]
    x_flat = tf.reshape(x, [-1])
    return tf.gather(x_flat, inds0 * ncols + inds1)


def linesearch(f, x, fullstep, expected_improve_rate):
    accept_ratio = .1
    max_backtracks = 10
    fval = f(x)
    for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
        xnew = x + stepfrac * fullstep
        newfval = f(xnew)
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        ratio = actual_improve / expected_improve
        if ratio > accept_ratio and actual_improve > 0:
            return xnew
    return x
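
# conjugate_gradient() below approximately solves A x = b using only
# matrix-vector products with A. In main.py, A is the (damped) Fisher matrix
# supplied through fisher_vector_product and b is the negative policy gradient,
# so the returned x is the natural-gradient step direction.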

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    p = b.copy()
    r = b.copy()
    x = np.zeros_like(b)
    rdotr = r.dot(r)
    for i in xrange(cg_iters):
        z = f_Ax(p)
        v = rdotr / p.dot(z)
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        mu = newrdotr / rdotr
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

class dict2(dict):
    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        self.__dict__ = self

def explained_variance(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary

--------------------------------------------------------------------------------