├── .gitignore
├── README.md
├── main.py
├── space_conversion.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.swp
*logs

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TRPO with continuous actions

This repo implements a TRPO agent (http://arxiv.org/abs/1502.05477) by modifying https://github.com/wojzaremba/trpo: the softmax action distributions are replaced with Gaussian distributions, plus a few small bells and whistles.

To run the code, type `python main.py --task $TASK_NAME`. Once training is complete, main.py uploads the run to the OpenAI Gym scoreboard using your account; the API key should be stored in the `OPENAI_GYM_API_KEY` environment variable.

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from __future__ import print_function, absolute_import, division
from utils import *
import numpy as np
import random
import tensorflow as tf
import time
import os
import logging
import gym
from gym import envs, scoreboard
from gym.spaces import Discrete, Box
import prettytensor as pt
from space_conversion import SpaceConversionEnv
import tempfile
from sys import argv
print ('python {}'.format(' '.join(argv)))

import argparse
parser = argparse.ArgumentParser(description='TRPO with continuous (Gaussian) actions.')
parser.add_argument("--task", type=str, default='InvertedDoublePendulum-v0')
parser.add_argument("--timesteps_per_batch", type=int, default=20000)
parser.add_argument("--max_pathlength", type=int, default=2000)
parser.add_argument("--n_iter", type=int, default=30)
parser.add_argument("--gamma", type=float, default=.99)
parser.add_argument("--max_kl", type=float, default=.001)
parser.add_argument("--cg_damping", type=float, default=1e-3)

args = parser.parse_args()

algo = 'continuous_action_TRPO_nIter={}_maxKl={}_gamma={}'.format(
    args.n_iter, args.max_kl, args.gamma)


class ContinTRPOAgent(object):

    config = dict2(timesteps_per_batch = args.timesteps_per_batch,
                   max_pathlength = args.max_pathlength,
                   gamma = args.gamma,
                   n_iter = args.n_iter,
                   max_kl = args.max_kl,
                   cg_damping = args.cg_damping)

    def __init__(self, env):
        self.env = env
        if not isinstance(env.observation_space, Box) or \
           not isinstance(env.action_space, Box):
            print("Both the input space and the output space should be continuous.")
            print("(Probably OK to remove the requirement for the input space).")
            exit(-1)
        self.session = tf.Session()
        self.obs = obs = tf.placeholder(
            dtype, shape=[
                None, env.observation_space.shape[0]])
        act_dim = np.prod(env.action_space.shape)
        self.action = action = tf.placeholder(tf.float32, shape=[None, act_dim])
        self.advant = advant = tf.placeholder(dtype, shape=[None])
        self.oldaction_dist_mu = oldaction_dist_mu = tf.placeholder(dtype, shape=[None, act_dim])
        self.oldaction_dist_logstd = oldaction_dist_logstd = tf.placeholder(dtype, shape=[None, act_dim])

        # Create the policy network.
        action_dist_mu = (pt.wrap(self.obs).
                          fully_connected(64, activation_fn=tf.nn.relu).
                          fully_connected(64, activation_fn=tf.nn.relu).
                          fully_connected(act_dim))  # outputs only the action means; log-stds are the separate variable below
        action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, act_dim)).astype(np.float32))
        action_dist_logstd = tf.tile(action_dist_logstd_param, tf.pack((tf.shape(action_dist_mu)[0], 1)))

        eps = 1e-8
        self.action_dist_mu = action_dist_mu
        self.action_dist_logstd = action_dist_logstd
        N = tf.shape(obs)[0]
        # log-probabilities of the sampled actions under the new and old policies
        log_p_n = gauss_log_prob(action_dist_mu, action_dist_logstd, action)
        log_oldp_n = gauss_log_prob(oldaction_dist_mu, oldaction_dist_logstd, action)

        # surrogate objective built from the likelihood ratio, as in the TRPO paper
        ratio_n = tf.exp(log_p_n - log_oldp_n)
        Nf = tf.cast(N, dtype)
        surr = -tf.reduce_mean(ratio_n * advant) # Surrogate loss
        var_list = tf.trainable_variables()

        # KL and entropy of the Gaussian policy, averaged over the batch:
        kl = gauss_KL(oldaction_dist_mu, oldaction_dist_logstd,
                      action_dist_mu, action_dist_logstd) / Nf
        ent = gauss_ent(action_dist_mu, action_dist_logstd) / Nf

        self.losses = [surr, kl, ent]
        self.pg = flatgrad(surr, var_list)
        # KL divergence where the first argument is held fixed (via tf.stop_gradient);
        # used to form the Fisher-vector product below.
        kl_firstfixed = gauss_selfKL_firstfixed(action_dist_mu, action_dist_logstd) / Nf
        grads = tf.gradients(kl_firstfixed, var_list)
        self.flat_tangent = tf.placeholder(dtype, shape=[None])
        shapes = map(var_shape, var_list)
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
            tangents.append(param)
            start += size
        gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        self.fvp = flatgrad(gvp, var_list)
        self.gf = GetFlat(self.session, var_list)
        self.sff = SetFromFlat(self.session, var_list)
        self.session.run(tf.initialize_variables(var_list))
        self.vf = LinearVF()

    def act(self, obs, *args):
        obs = np.expand_dims(obs, 0)
        action_dist_mu, action_dist_logstd = \
            self.session.run([self.action_dist_mu, self.action_dist_logstd], {self.obs: obs})

        act = action_dist_mu + np.exp(action_dist_logstd)*np.random.randn(*action_dist_logstd.shape)

        return act.ravel(), \
            dict2(action_dist_mu = action_dist_mu,
                  action_dist_logstd = action_dist_logstd)

    def learn(self, render_freq=50):
        config = self.config
        start_time = time.time()
        numeptotal = 0

        for i in xrange(1, config.n_iter):
            # Generating paths.
            paths = rollout_contin(
                self.env,
                self,
                config.max_pathlength,
                config.timesteps_per_batch,
                render = False) #(i % render_freq) == 0)

            # Computing returns and estimating advantage function.
            for path in paths:
                path["baseline"] = self.vf.predict(path)
                path["returns"] = discount(path["rewards"], config.gamma)
                path["advant"] = path["returns"] - path["baseline"]

            # Updating policy.
            action_dist_mu = np.concatenate([path["action_dists_mu"] for path in paths])
            action_dist_logstd = np.concatenate([path["action_dists_logstd"] for path in paths])
            obs_n = np.concatenate([path["obs"] for path in paths])
            action_n = np.concatenate([path["actions"] for path in paths])

            # Standardize the advantage function to have mean=0 and std=1.
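            # (Standardizing keeps the scale of the surrogate gradient roughly
            # constant from batch to batch, so the fixed max_kl trust region
            # behaves comparably across iterations; the 1e-8 below guards
            # against a zero standard deviation.)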
            advant_n = np.concatenate([path["advant"] for path in paths])
            advant_n -= advant_n.mean()
            advant_n /= (advant_n.std() + 1e-8)

            # Computing baseline function for next iter.
            self.vf.fit(paths)

            feed = {self.obs: obs_n,
                    self.action: action_n,
                    self.advant: advant_n,
                    self.oldaction_dist_mu: action_dist_mu,
                    self.oldaction_dist_logstd: action_dist_logstd}

            thprev = self.gf()

            def fisher_vector_product(p):
                feed[self.flat_tangent] = p
                return self.session.run(self.fvp, feed) + p * config.cg_damping

            g = self.session.run(self.pg, feed_dict=feed)
            stepdir = conjugate_gradient(fisher_vector_product, -g)
            shs = (.5 * stepdir.dot(fisher_vector_product(stepdir)) )
            assert shs > 0

            lm = np.sqrt(shs / config.max_kl)

            fullstep = stepdir / lm
            neggdotstepdir = -g.dot(stepdir)

            def loss(th):
                self.sff(th)
                return self.session.run(self.losses[0], feed_dict=feed)
            theta = linesearch(loss, thprev, fullstep, neggdotstepdir / lm)
            # NOTE: the next line discards the linesearch result and always
            # takes the full (max_kl-sized) step.
            theta = thprev + fullstep
            self.sff(theta)

            surrafter, kloldnew, entropy = self.session.run(
                self.losses, feed_dict=feed)

            episoderewards = np.array(
                [path["rewards"].sum() for path in paths])
            stats = {}
            numeptotal += len(episoderewards)
            stats["Total number of episodes"] = numeptotal
            stats["Average sum of rewards per episode"] = episoderewards.mean()
            stats["Entropy"] = entropy
            stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
            stats["KL between old and new distribution"] = kloldnew
            stats["Surrogate loss"] = surrafter
            print ("\n********** Iteration {} ************".format(i))
            for k, v in stats.iteritems():
                print(k + ": " + " " * (40 - len(k)) + str(v))
            if entropy != entropy:  # entropy is NaN: the policy has diverged, so bail out.
                exit(-1)

    def __call__(self, observation_n, reward_n, done_n):
        assert False
        env = self.env
        ret = []
        for o, r, d in zip(observation_n, reward_n, done_n):
            o = env.observation_convert(o, env._env.observation_space, env.observation_space)
            obs = np.expand_dims(o, 0)
            action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs})
            action = int(np.argmax(action_dist_n, 1)[0])
            action = env.action_convert(action, env.action_space, env._env.action_space)
            ret.append(action)
        return ret


experiment_dir = tempfile.mkdtemp()
logging.getLogger().setLevel(logging.DEBUG)
print ("task = {}".format(args.task))
env = envs.make(args.task)

env.monitor.start(experiment_dir)

agent = ContinTRPOAgent(env)
agent.learn()
env.monitor.close()
gym.upload(experiment_dir, algorithm_id=algo)

print (experiment_dir)

from sys import argv
print ('python {}'.format(' '.join(argv)))

--------------------------------------------------------------------------------
/space_conversion.py:
--------------------------------------------------------------------------------
"""
`SpaceConversionEnv` acts as a wrapper on
any environment. It converts some action and observation spaces into others.
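A hypothetical example (only the conversions registered in the `convertable`
table below are supported): wrapping an environment with Discrete observations,
such as gym's FrozenLake-v0, so that observations come out as one-hot Box vectors:

    env = SpaceConversionEnv(gym.make("FrozenLake-v0"),
                             target_observation_space=Box)

The action space is left untouched because no target_action_space is given.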
4 | """ 5 | 6 | import numpy as np 7 | from gym.spaces import Discrete, Box, Tuple 8 | from gym import Env 9 | 10 | 11 | def discrete2tuple4obj(x, discrete_space, tuple_space): 12 | assert(discrete_space.contains(x)) 13 | action = [] 14 | for space in tuple_space.spaces: 15 | assert(isinstance(space, Discrete)) 16 | action.append(x % space.n) 17 | x = int(x / space.n) 18 | action = tuple(action) 19 | assert(tuple_space.contains(action)) 20 | return action 21 | 22 | def tuple2discrete4obj(x, old_space_obj, new_space_obj): 23 | assert(False) 24 | 25 | def tuple2discrete4class(tuple_space): 26 | n = 1 27 | for space in tuple_space.spaces: 28 | assert(isinstance(space, Discrete)) 29 | n *= space.n 30 | return Discrete(n) 31 | 32 | def box2discrete4obj(x, box_space, discrete_space): 33 | assert(False) 34 | 35 | def discrete2box4obj(x, discrete_space, box_space): 36 | ret = np.zeros(discrete_space.n) 37 | ret[x] = 1.0 38 | return ret 39 | 40 | def discrete2box4class(discrete_space): 41 | return Box(0.0, 1.0, discrete_space.n) 42 | 43 | def ident4obj(x, old_space_obj, new_space_obj): 44 | return x 45 | 46 | class SpaceConversionEnv(Env): 47 | convertable = {(Tuple, Discrete): (tuple2discrete4obj, discrete2tuple4obj, tuple2discrete4class), \ 48 | (Discrete, Box): (discrete2box4obj, box2discrete4obj, discrete2box4class)} 49 | 50 | def __init__(self, env, target_observation_space=None, target_action_space=None, verbose=False): 51 | self._verbose = verbose 52 | self._env = env 53 | self.action_convert = None 54 | self.observation_convert = None 55 | if self.action_space.__class__ == target_action_space or \ 56 | target_action_space is None: 57 | self.action_convert = ident4obj 58 | self._action_space_ = env.action_space # another missing trailing _ 59 | if self.observation_space.__class__ == target_observation_space or \ 60 | target_observation_space is None: 61 | self.observation_convert = ident4obj 62 | # pretty sure the lack of the trailing _ in _observation_space_ 63 | # was a typo 64 | self._observation_space_ = env.observation_space 65 | 66 | for pairs, convert in self.convertable.iteritems(): 67 | if env.action_space.__class__ == pairs[0] and \ 68 | target_action_space == pairs[1] and \ 69 | self.action_convert is None: 70 | self.action_convert = convert[1] 71 | self._action_space_ = convert[2](env.action_space) 72 | if env.observation_space.__class__ == pairs[0] and \ 73 | target_observation_space == pairs[1] and \ 74 | self.observation_convert is None: 75 | self.observation_convert = convert[0] 76 | self._observation_space_ = convert[2](env.observation_space) 77 | assert(self.action_convert is not None) 78 | assert(self.observation_convert is not None) 79 | 80 | def step(self, action, **kwargs): 81 | conv_action = self.action_convert(action, self.action_space, self._env.action_space) 82 | if self._verbose and self.action_convert != ident4obj: 83 | print("Input action: %s, converted action: %s" % (action, conv_action)) 84 | step = self._env.step(conv_action, **kwargs) 85 | observation, reward, done, info = step 86 | 87 | conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space) 88 | 89 | if self._verbose and self.observation_convert != ident4obj: 90 | print("Input observation: %s, converted observation: %s" % (observation, conv_observation)) 91 | return conv_observation, reward, done, {} 92 | 93 | def reset(self, **kwargs): 94 | observation = self._env.reset(**kwargs) 95 | conv_observation = self.observation_convert(observation, 
        if self._verbose and self.observation_convert != ident4obj:
            print("Input observation: %s, converted observation: %s" % (observation, conv_observation))
        return conv_observation

    @property
    def action_space(self):
        return self._action_space_

    @property
    def observation_space(self):
        return self._observation_space_

    def __getattr__(self, field):
        """
        proxy everything to underlying env
        """
        if hasattr(self._env, field):
            return getattr(self._env, field)
        raise AttributeError(field)

    def __repr__(self):
        if "object at" not in str(self._env):
            env_name = str(self._env)
        else:
            env_name = self._env.__class__.__name__
        return env_name

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from __future__ import division
import numpy as np
import tensorflow as tf
import random
import scipy.signal


seed = 1
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

dtype = tf.float32

def discount(x, gamma):
    assert x.ndim >= 1
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

def gauss_prob(mu, logstd, x):
    std = tf.exp(logstd)
    var = tf.square(std)
    gp = tf.exp(-tf.square(x - mu)/(2*var)) / ((2*np.pi)**.5 * std)
    return tf.reduce_prod(gp, [1])

def gauss_log_prob(mu, logstd, x):
    var = tf.exp(2*logstd)
    gp = -tf.square(x - mu)/(2*var) - .5*tf.log(tf.constant(2*np.pi)) - logstd
    return tf.reduce_sum(gp, [1])


def gauss_selfKL_firstfixed(mu, logstd):
    # KL divergence of the distribution with itself, with the first argument
    # treated as a constant (via tf.stop_gradient).
    mu1, logstd1 = map(tf.stop_gradient, [mu, logstd])
    mu2, logstd2 = mu, logstd

    return gauss_KL(mu1, logstd1, mu2, logstd2)


def gauss_KL(mu1, logstd1, mu2, logstd2):
    # KL divergence KL(N(mu1, exp(logstd1)^2) || N(mu2, exp(logstd2)^2))
    # between diagonal Gaussians.
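    # Closed form used below, with s = exp(logstd), summed over dimensions and
    # over the batch (the caller divides by the batch size):
    #   KL(N(mu1, s1^2) || N(mu2, s2^2)) = log(s2/s1) + (s1^2 + (mu1 - mu2)^2) / (2 * s2^2) - 1/2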
    var1 = tf.exp(2*logstd1)
    var2 = tf.exp(2*logstd2)

    kl = tf.reduce_sum(logstd2 - logstd1 + (var1 + tf.square(mu1 - mu2))/(2*var2) - 0.5)
    return kl


def gauss_ent(mu, logstd):
    h = tf.reduce_sum(logstd + tf.constant(0.5*np.log(2*np.pi*np.e), tf.float32))
    return h

def gauss_sample(mu, logstd):
    return mu + tf.exp(logstd)*tf.random_normal(tf.shape(logstd))


def rollout(env, agent, max_pathlength, n_timesteps):
    paths = []
    timesteps_sofar = 0
    while timesteps_sofar < n_timesteps:
        obs, actions, rewards, action_dists = [], [], [], []
        if np.random.randint(0, 100) == 0:
            env.monitor.configure(video=True)
        else:
            env.monitor.configure(video=False)
        ob = env.reset()
        for _ in xrange(max_pathlength):
            timesteps_sofar += 1
            obs.append(ob)
            action, info = agent.act(ob)
            actions.append(action)
            action_dists.append(info.get("action_dist", []))
            res = env.step(action)
            ob = res[0]
            rewards.append(res[1])
            if res[2] or timesteps_sofar == n_timesteps: #i.e., if done
                path = {"obs": np.concatenate(np.expand_dims(obs, 0)),
                        "action_dists": np.concatenate(action_dists),
                        "rewards": np.array(rewards),
                        "actions": np.array(actions)}
                paths.append(path)
                break
    return paths



class Filter:
    def __init__(self, filter_mean=True):
        self.m1 = 0
        self.v = 0
        self.n = 0.
        self.filter_mean = filter_mean

    def __call__(self, o):
        self.m1 = self.m1 * (self.n / (self.n + 1)) + o * 1/(1 + self.n)
        self.v = self.v * (self.n / (self.n + 1)) + (o - self.m1)**2 * 1/(1 + self.n)
        self.std = (self.v + 1e-6)**.5 # std
        self.n += 1
        if self.filter_mean:
            o1 = (o - self.m1)/self.std
        else:
            o1 = o/self.std
        o1 = (o1 > 10) * 10 + (o1 < -10)* (-10) + (o1 < 10) * (o1 > -10) * o1
        return o1
filter = Filter()
filter_std = Filter()

def rollout_contin(env, agent, max_pathlength, n_timesteps, render=False):
    paths = []
    timesteps_sofar = 0
    first = True
    while timesteps_sofar < n_timesteps:
        obs, actions, rewards, action_dists_mu, action_dists_logstd = [], [], [], [], []
        ob = filter(env.reset())
        for _ in xrange(max_pathlength):
            timesteps_sofar += 1
            obs.append(ob)
            action, info = agent.act(ob)
            actions.append(action)
            action_dists_mu.append(info.get("action_dist_mu", []))
            action_dists_logstd.append(info.get("action_dist_logstd", []))
            res = env.step(action)
            ob = filter(res[0])
            rewards.append((res[1]))
            if render and first: env.render()
            if res[2] or timesteps_sofar == n_timesteps:
                # forceful termination if timesteps_sofar == n_timesteps
                # otherwise paths is empty, which also is bad.
                path = dict2(obs = np.concatenate(np.expand_dims(obs, 0)),
                             action_dists_mu = np.concatenate(action_dists_mu),
                             action_dists_logstd = np.concatenate(action_dists_logstd),
                             rewards = np.array(rewards),
                             actions = np.array(actions))
                paths.append(path)
                break
        first = False
    return paths




class LinearVF(object):
    coeffs = None

    def _features(self, path):
        o = path["obs"].astype('float32')
        o = o.reshape(o.shape[0], -1)
        l = len(path["rewards"])
        al = np.arange(l).reshape(-1, 1) / 100.0
        return np.concatenate([o, o**2, al, al**2, np.ones((l, 1))], axis=1)

    def fit(self, paths):
        featmat = np.concatenate([self._features(path) for path in paths])
        returns = np.concatenate([path["returns"] for path in paths])
        n_col = featmat.shape[1]
        lamb = 2.0
        self.coeffs = np.linalg.lstsq(featmat.T.dot(featmat) + lamb * np.identity(n_col), featmat.T.dot(returns))[0]

    def predict(self, path):
        return np.zeros(len(path["rewards"])) if self.coeffs is None else self._features(
            path).dot(self.coeffs)


def cat_sample(prob_nk):
    assert prob_nk.ndim == 2
    N = prob_nk.shape[0]
    csprob_nk = np.cumsum(prob_nk, axis=1)
    out = np.zeros(N, dtype='i')
    for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)):
        for (k, csprob) in enumerate(csprob_k):
            if csprob > r:
                out[n] = k
                break
    return out


def var_shape(x):
    out = [k.value for k in x.get_shape()]
    assert all(isinstance(a, int) for a in out), \
        "shape function assumes that shape is fully known"
    return out


def numel(x):
    return np.prod(var_shape(x))


def flatgrad(loss, var_list):
    grads = tf.gradients(loss, var_list)
    return tf.concat(0, [tf.reshape(grad, [numel(v)])
                         for (v, grad) in zip(var_list, grads)])


class SetFromFlat(object):

    def __init__(self, session, var_list):
        self.session = session
        shapes = map(var_shape, var_list)
        total_size = sum(np.prod(shape) for shape in shapes)
        self.theta = theta = tf.placeholder(dtype, [total_size])
        start = 0
        assigns = []
        for (shape, v) in zip(shapes, var_list):
            size = np.prod(shape)
            assigns.append(
                tf.assign(v, tf.reshape(theta[start:start + size], shape)))
            start += size
        self.op = tf.group(*assigns)

    def __call__(self, theta):
        self.session.run(self.op, feed_dict={self.theta: theta})


class GetFlat(object):

    def __init__(self, session, var_list):
        self.session = session
        self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list])

    def __call__(self):
        return self.op.eval(session=self.session)


def slice_2d(x, inds0, inds1):
    # in tf
    inds0 = tf.cast(inds0, tf.int64)
    inds1 = tf.cast(inds1, tf.int64)
    shape = tf.cast(tf.shape(x), tf.int64)
    ncols = shape[1]
    x_flat = tf.reshape(x, [-1])
    return tf.gather(x_flat, inds0 * ncols + inds1)


def linesearch(f, x, fullstep, expected_improve_rate):
    # in numpy
    accept_ratio = .1
    max_backtracks = 10
    fval = f(x)
    for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
        xnew = x + stepfrac * fullstep
        newfval = f(xnew)
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        ratio = actual_improve / expected_improve
        if ratio > accept_ratio and actual_improve > 0:
            return xnew
    return x


def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # in numpy
    p = b.copy()
    r = b.copy()
    x = np.zeros_like(b)
    rdotr = r.dot(r)
    for i in xrange(cg_iters):
        z = f_Ax(p)
        v = rdotr / p.dot(z)
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        mu = newrdotr / rdotr
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

class dict2(dict):
    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        self.__dict__ = self


--------------------------------------------------------------------------------
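A quick sanity check of the conjugate_gradient solver above (a sketch, not part of the repo): it assumes utils.py is importable from the current directory, which also pulls in the old TensorFlow this code targets; the matrix size, seed, and tolerance below are made up for illustration.

    import numpy as np
    from utils import conjugate_gradient

    np.random.seed(0)
    # A random symmetric positive-definite matrix stands in for the Fisher matrix.
    M = np.random.randn(10, 10)
    A = M.dot(M.T) + 10 * np.eye(10)
    b = np.random.randn(10)

    # conjugate_gradient only needs a callable computing A.dot(p), which is how
    # fisher_vector_product is passed to it in main.py.
    x = conjugate_gradient(lambda p: A.dot(p), b, cg_iters=50)
    print(np.allclose(A.dot(x), b, atol=1e-4))  # expected: True

main.py feeds the same solver fisher_vector_product, so this exercises the code path that computes the TRPO step direction.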