├── .gitignore
├── README.md
├── main.py
├── space_conversion.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | *logs
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # TRPO with continuous actions
 2 | 
 3 | This repo implements a TRPO agent ( http://arxiv.org/abs/1502.05477 ) by modifying https://github.com/wojzaremba/trpo and replacing the softmax distributions with Gaussian distributions, and adding a tiny bit of bells and whistles.
 4 | 
 5 | To run the code, simply type python main.py --task $TASK_NAME.  Once training is complete, main.py will upload the run using your OpenAI gym account which should be stored in OPENAI_GYM_API_KEY. 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function, absolute_import, division
  2 | from utils import *
  3 | import numpy as np
  4 | import random
  5 | import tensorflow as tf
  6 | import time
  7 | import os
  8 | import logging
  9 | import gym
 10 | from gym import envs, scoreboard
 11 | from gym.spaces import Discrete, Box
 12 | import prettytensor as pt
 13 | from space_conversion import SpaceConversionEnv
 14 | import tempfile
 15 | from sys import argv
 16 | print ('python main.py {}'.format(' '.join(argv)))
 17 | 
 18 | import argparse
 19 | parser = argparse.ArgumentParser(description='Test the new good lib.')
 20 | parser.add_argument("--task", type=str, default='InvertedDoublePendulum-v0')
 21 | parser.add_argument("--timesteps_per_batch", type=int, default=20000)
 22 | parser.add_argument("--max_pathlength", type=int, default=2000)
 23 | parser.add_argument("--n_iter", type=int, default=30)
 24 | parser.add_argument("--gamma", type=float, default=.99)
 25 | parser.add_argument("--max_kl", type=float, default=.001)
 26 | parser.add_argument("--cg_damping", type=float, default=1e-3)
 27 | 
 28 | args =  parser.parse_args()
 29 | 
 30 | algo = 'continuous_action_TRPO_nIter={}_maxKl={}_gamma={}'.format(
 31 |     args.n_iter, args.max_kl, args.gamma)
 32 | 
 33 | class ContinTRPOAgent(object):
 34 | 
 35 |     config = dict2(timesteps_per_batch = args.timesteps_per_batch,
 36 |                    max_pathlength = args.max_pathlength,
 37 |                    gamma = args.gamma,
 38 |                    n_iter = args.n_iter,
 39 |                    max_kl = args.max_kl,
 40 |                    cg_damping = args.cg_damping)
 41 | 
 42 |     def __init__(self, env):
 43 |         self.env = env
 44 |         if not isinstance(env.observation_space, Box) or \
 45 |            not isinstance(env.action_space, Box):
 46 |             print("Both the input space and the output space should be continous.")
 47 |             print("(Probably OK to remove the requirement for the input space).")
 48 |             exit(-1)
 49 |         self.session = tf.Session()
 50 |         self.obs = obs = tf.placeholder(
 51 |             dtype, shape=[
 52 |                 None, env.observation_space.shape[0]])
 53 |         act_dim = np.prod(env.action_space.shape)
 54 |         self.action = action = tf.placeholder(tf.float32, shape=[None, act_dim])  
 55 |         self.advant = advant = tf.placeholder(dtype, shape=[None])  
 56 |         self.oldaction_dist_mu = oldaction_dist_mu = tf.placeholder(dtype, shape=[None, act_dim])
 57 |         self.oldaction_dist_logstd = oldaction_dist_logstd = tf.placeholder(dtype, shape=[None, act_dim])
 58 | 
 59 |         # Create neural network.
 60 |         action_dist_mu = (pt.wrap(self.obs).
 61 |                          fully_connected(64, activation_fn=tf.nn.relu).
 62 |                          fully_connected(64, activation_fn=tf.nn.relu).
 63 |                          fully_connected(act_dim)) # output means and logstd's.  Good! 
 64 |         action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, act_dim)).astype(np.float32))
 65 |         action_dist_logstd = tf.tile(action_dist_logstd_param, tf.pack((tf.shape(action_dist_mu)[0], 1)))
 66 | 
 67 |         eps = 1e-8
 68 |         self.action_dist_mu = action_dist_mu
 69 |         self.action_dist_logstd = action_dist_logstd
 70 |         N = tf.shape(obs)[0]
 71 |         # compute probabilities of current actions and old action
 72 |         log_p_n = gauss_log_prob(action_dist_mu, action_dist_logstd, action)
 73 |         log_oldp_n = gauss_log_prob(oldaction_dist_mu, oldaction_dist_logstd, action)
 74 | 
 75 |         # proceed as before, good.
 76 |         ratio_n = tf.exp(log_p_n - log_oldp_n)
 77 |         Nf = tf.cast(N, dtype)
 78 |         surr = -tf.reduce_mean(ratio_n * advant)  # Surrogate loss
 79 |         var_list = tf.trainable_variables()
 80 |         
 81 |         # Introduced the change into here: 
 82 |         kl = gauss_KL(oldaction_dist_mu, oldaction_dist_logstd,
 83 |                       action_dist_mu, action_dist_logstd) / Nf
 84 |         ent = gauss_ent(action_dist_mu, action_dist_logstd) / Nf
 85 | 
 86 |         self.losses = [surr, kl, ent]
 87 |         self.pg = flatgrad(surr, var_list)
 88 |         # KL divergence where first arg is fixed
 89 |         # replace old->tf.stop_gradient from previous kl
 90 |         kl_firstfixed = gauss_selfKL_firstfixed(action_dist_mu, action_dist_logstd) / Nf
 91 |         grads = tf.gradients(kl_firstfixed, var_list)
 92 |         self.flat_tangent = tf.placeholder(dtype, shape=[None])
 93 |         shapes = map(var_shape, var_list)
 94 |         start = 0
 95 |         tangents = []
 96 |         for shape in shapes:
 97 |             size = np.prod(shape)
 98 |             param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
 99 |             tangents.append(param)
100 |             start += size
101 |         gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
102 |         self.fvp = flatgrad(gvp, var_list)
103 |         self.gf = GetFlat(self.session, var_list)
104 |         self.sff = SetFromFlat(self.session, var_list)
105 |         self.session.run(tf.initialize_variables(var_list))
106 |         self.vf = LinearVF()
107 | 
108 |     def act(self, obs, *args):
109 |         obs = np.expand_dims(obs, 0)
110 |         action_dist_mu, action_dist_logstd  = \
111 |             self.session.run([self.action_dist_mu, self.action_dist_logstd], {self.obs: obs})
112 |         
113 |         act = action_dist_mu + np.exp(action_dist_logstd)*np.random.randn(*action_dist_logstd.shape)
114 | 
115 |         return act.ravel(), \
116 |             dict2(action_dist_mu = action_dist_mu,
117 |                   action_dist_logstd = action_dist_logstd)
118 |     
119 | 
120 | 
121 |     def learn(self, render_freq=50):
122 |         config = self.config
123 |         start_time = time.time()
124 |         numeptotal = 0
125 | 
126 |         for i in xrange(1, config.n_iter):
127 |             # Generating paths.
128 |             paths = rollout_contin(
129 |                 self.env,
130 |                 self,
131 |                 config.max_pathlength,
132 |                 config.timesteps_per_batch,
133 |                 render = False) #(i % render_freq) == 0)
134 | 
135 |             # Computing returns and estimating advantage function.
136 |             for path in paths:
137 |                 path["baseline"] = self.vf.predict(path)
138 |                 path["returns"] = discount(path["rewards"], config.gamma)
139 |                 path["advant"] = path["returns"] - path["baseline"]
140 | 
141 |             # Updating policy.
142 |             action_dist_mu = np.concatenate([path["action_dists_mu"] for path in paths])
143 |             action_dist_logstd = np.concatenate([path["action_dists_logstd"] for path in paths])
144 |             obs_n = np.concatenate([path["obs"] for path in paths])
145 |             action_n = np.concatenate([path["actions"] for path in paths])
146 | 
147 |             # Standardize the advantage function to have mean=0 and std=1.
148 |             advant_n = np.concatenate([path["advant"] for path in paths])
149 |             advant_n -= advant_n.mean()
150 |             advant_n /= (advant_n.std() + 1e-8)
151 | 
152 |             # Computing baseline function for next iter.
153 |             self.vf.fit(paths)
154 | 
155 | 
156 | 
157 |             feed = {self.obs: obs_n,
158 |                     self.action: action_n,
159 |                     self.advant: advant_n,
160 |                     self.oldaction_dist_mu: action_dist_mu,
161 |                     self.oldaction_dist_logstd: action_dist_logstd}
162 | 
163 |             thprev = self.gf()
164 | 
165 |             def fisher_vector_product(p):
166 |                 feed[self.flat_tangent] = p
167 |                 return self.session.run(self.fvp, feed) + p * config.cg_damping
168 | 
169 | 
170 |             g = self.session.run(self.pg, feed_dict=feed)
171 |             stepdir = conjugate_gradient(fisher_vector_product, -g)
172 |             shs = (.5 * stepdir.dot(fisher_vector_product(stepdir)) )
173 |             assert shs > 0
174 | 
175 |             lm = np.sqrt(shs / config.max_kl) 
176 | 
177 | 
178 |             fullstep = stepdir / lm
179 |             neggdotstepdir = -g.dot(stepdir)
180 | 
181 |             def loss(th):
182 |                 self.sff(th)
183 |                 return self.session.run(self.losses[0], feed_dict=feed)
184 |             theta = linesearch(loss, thprev, fullstep, neggdotstepdir / lm)
185 |             theta = thprev + fullstep
186 |             self.sff(theta)
187 | 
188 |             surrafter, kloldnew, entropy = self.session.run(
189 |                 self.losses, feed_dict=feed)
190 | 
191 |             episoderewards = np.array(
192 |                 [path["rewards"].sum() for path in paths])
193 |             stats = {}
194 |             numeptotal += len(episoderewards)
195 |             stats["Total number of episodes"] = numeptotal
196 |             stats["Average sum of rewards per episode"] = episoderewards.mean()
197 |             stats["Entropy"] = entropy
198 |             stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
199 |             stats["KL between old and new distribution"] = kloldnew
200 |             stats["Surrogate loss"] = surrafter
201 |             print ("\n********** Iteration {} ************".format(i))
202 |             for k, v in stats.iteritems():
203 |                 print(k + ": " + " " * (40 - len(k)) + str(v))
204 |             if entropy != entropy:
205 |                 exit(-1)
206 | 
207 |     def __call__(self, observation_n, reward_n, done_n):
208 |         assert False
209 |         env = self.env
210 |         ret = []
211 |         for o, r, d in zip(observation_n, reward_n, done_n):
212 |             o = env.observation_convert(o, env._env.observation_space, env.observation_space)  
213 |             obs = np.expand_dims(o, 0)
214 |             action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs})
215 |             action = int(np.argmax(action_dist_n, 1)[0])
216 |             action = env.action_convert(action, env.action_space, env._env.action_space)
217 |             ret.append(action)
218 |         return ret
219 | 
220 | 
221 | experiment_dir = tempfile.mkdtemp()
222 | logging.getLogger().setLevel(logging.DEBUG)
223 | print ("taks = {}".format(args.task))
224 | env = envs.make(args.task)
225 | 
226 | 
227 | env.monitor.start(experiment_dir)
228 | 
229 | agent = ContinTRPOAgent(env)
230 | agent.learn()
231 | env.monitor.close()
232 | gym.upload(experiment_dir, algorithm_id=algo)
233 | 
234 | 
235 | print (experiment_dir)
236 | 
237 | from sys import argv
238 | print ('python {}'.format(' '.join(argv)))
239 | 


--------------------------------------------------------------------------------
/space_conversion.py:
--------------------------------------------------------------------------------
  1 | """
  2 | `SpaceConversionEnv` acts as a wrapper on
  3 | any environment. It allows to convert some action spaces, and observation spaces to others.
  4 | """
  5 | 
  6 | import numpy as np
  7 | from gym.spaces import Discrete, Box, Tuple
  8 | from gym import Env
  9 | 
 10 | 
 11 | def discrete2tuple4obj(x, discrete_space, tuple_space):
 12 |     assert(discrete_space.contains(x))
 13 |     action = []
 14 |     for space in tuple_space.spaces:
 15 |         assert(isinstance(space, Discrete))
 16 |         action.append(x % space.n)
 17 |         x = int(x / space.n)
 18 |     action = tuple(action)
 19 |     assert(tuple_space.contains(action))
 20 |     return action
 21 | 
 22 | def tuple2discrete4obj(x, old_space_obj, new_space_obj):
 23 |     assert(False)
 24 | 
 25 | def tuple2discrete4class(tuple_space):
 26 |     n = 1
 27 |     for space in tuple_space.spaces:
 28 |         assert(isinstance(space, Discrete))
 29 |         n *= space.n
 30 |     return Discrete(n)
 31 | 
 32 | def box2discrete4obj(x, box_space, discrete_space):
 33 |     assert(False)
 34 | 
 35 | def discrete2box4obj(x, discrete_space, box_space):
 36 |     ret = np.zeros(discrete_space.n)
 37 |     ret[x] = 1.0
 38 |     return ret
 39 | 
 40 | def discrete2box4class(discrete_space):
 41 |     return Box(0.0, 1.0, discrete_space.n)
 42 | 
 43 | def ident4obj(x, old_space_obj, new_space_obj):
 44 |     return x
 45 | 
 46 | class SpaceConversionEnv(Env):
 47 |     convertable = {(Tuple, Discrete): (tuple2discrete4obj, discrete2tuple4obj, tuple2discrete4class), \
 48 |                    (Discrete, Box): (discrete2box4obj, box2discrete4obj, discrete2box4class)}
 49 |     
 50 |     def __init__(self, env, target_observation_space=None, target_action_space=None, verbose=False):
 51 |         self._verbose = verbose
 52 |         self._env = env
 53 |         self.action_convert = None
 54 |         self.observation_convert = None
 55 |         if self.action_space.__class__ == target_action_space or \
 56 |            target_action_space is None:
 57 |             self.action_convert = ident4obj
 58 |             self._action_space_ = env.action_space # another missing trailing _
 59 |         if self.observation_space.__class__ == target_observation_space or \
 60 |            target_observation_space is None:
 61 |             self.observation_convert = ident4obj
 62 |             # pretty sure the lack of the trailing _ in _observation_space_
 63 |             # was a typo
 64 |             self._observation_space_ = env.observation_space
 65 | 
 66 |         for pairs, convert in self.convertable.iteritems():
 67 |             if env.action_space.__class__ == pairs[0] and \
 68 |                target_action_space == pairs[1] and \
 69 |                self.action_convert is None:
 70 |                 self.action_convert = convert[1]
 71 |                 self._action_space_ = convert[2](env.action_space)
 72 |             if env.observation_space.__class__ == pairs[0] and \
 73 |                target_observation_space == pairs[1] and \
 74 |                self.observation_convert is None:
 75 |                 self.observation_convert = convert[0]
 76 |                 self._observation_space_ = convert[2](env.observation_space)
 77 |         assert(self.action_convert is not None)
 78 |         assert(self.observation_convert is not None)
 79 | 
 80 |     def step(self, action, **kwargs):
 81 |         conv_action = self.action_convert(action, self.action_space, self._env.action_space)
 82 |         if self._verbose and self.action_convert != ident4obj:
 83 |             print("Input action: %s, converted action: %s" % (action, conv_action))
 84 |         step = self._env.step(conv_action, **kwargs)
 85 |         observation, reward, done, info = step
 86 | 
 87 |         conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space)  
 88 | 
 89 |         if self._verbose and self.observation_convert != ident4obj:
 90 |             print("Input observation: %s, converted observation: %s" % (observation, conv_observation))
 91 |         return conv_observation, reward, done, {}
 92 | 
 93 |     def reset(self, **kwargs):
 94 |         observation = self._env.reset(**kwargs)
 95 |         conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space)
 96 | 
 97 |         if self._verbose and self.observation_convert != ident4obj:
 98 |             print("Input observation: %s, converted observation: %s" % (observation, conv_observation))
 99 |         return conv_observation
100 |   
101 |     @property
102 |     def action_space(self):
103 |         return self._action_space_
104 | 
105 |     @property
106 |     def observation_space(self):
107 |         return self._observation_space_
108 | 
109 |     def __getattr__(self, field):
110 |         """
111 |         proxy everything to underlying env
112 |         """
113 |         if hasattr(self._env, field):
114 |             return getattr(self._env, field)
115 |         raise AttributeError(field)
116 |   
117 |     def __repr__(self):
118 |         if "object at" not in str(self._env):
119 |             env_name = str(env._env)
120 |         else:
121 |             env_name = self._env.__class__.__name__
122 |         return env_name
123 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division 
  2 | import numpy as np
  3 | import tensorflow as tf
  4 | import random
  5 | import scipy.signal
  6 | 
  7 | 
  8 | seed = 1
  9 | random.seed(seed)
 10 | np.random.seed(seed)
 11 | tf.set_random_seed(seed)
 12 | 
 13 | dtype = tf.float32
 14 | 
 15 | def discount(x, gamma):
 16 |     assert x.ndim >= 1
 17 |     return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
 18 | 
 19 | def gauss_prob(mu, logstd, x):
 20 |     std = tf.exp(logstd)
 21 |     var = tf.square(std)
 22 |     gp = tf.exp(-(x - mu)/(2*var)) / ((2*np.pi)**.5 * std)
 23 |     return  tf.reduce_prod(gp, [1])
 24 | 
 25 | def gauss_log_prob(mu, logstd, x):
 26 |     var = tf.exp(2*logstd)
 27 |     gp = -tf.square(x - mu)/(2*var) - .5*tf.log(tf.constant(2*np.pi)) - logstd
 28 |     return  tf.reduce_sum(gp, [1])
 29 | 
 30 | 
 31 | def gauss_selfKL_firstfixed(mu, logstd):
 32 |     # basically:  compute KL with yourself, where the first
 33 |     # argument is treated as a constant.  I can do it, totally.
 34 |     mu1, logstd1 = map(tf.stop_gradient, [mu, logstd])
 35 |     mu2, logstd2 = mu, logstd
 36 | 
 37 |     return gauss_KL(mu1, logstd1, mu2, logstd2)
 38 | 
 39 | 
 40 | def gauss_KL(mu1, logstd1, mu2, logstd2):
 41 |     # basically:  compute KL with yourself, where the first
 42 |     # argument is treated as a constant.  I can do it, totally.
 43 |     var1 = tf.exp(2*logstd1)
 44 |     var2 = tf.exp(2*logstd2)
 45 | 
 46 |     kl = tf.reduce_sum(logstd2 - logstd1 + (var1 + tf.square(mu1 - mu2))/(2*var2) - 0.5) 
 47 |     return kl
 48 | 
 49 | 
 50 | def gauss_ent(mu, logstd):
 51 |     h = tf.reduce_sum(logstd + tf.constant(0.5*np.log(2*np.pi*np.e), tf.float32))
 52 |     return h
 53 | 
 54 | def gauss_sample(mu, logstd):
 55 |     return mu + tf.exp(logstd)*tf.random_normal(tf.shape(logstd))
 56 | 
 57 | 
 58 | def rollout(env, agent, max_pathlength, n_timesteps):
 59 |     paths = []
 60 |     timesteps_sofar = 0
 61 |     while timesteps_sofar < n_timesteps:
 62 |         obs, actions, rewards, action_dists = [], [], [], []
 63 |         if np.random.randint(0, 100) == 0:
 64 |             env.monitor.configure(video=True)
 65 |         else:
 66 |             env.monitor.configure(video=False)
 67 |         ob = env.reset()
 68 |         for _ in xrange(max_pathlength):
 69 |             timesteps_sofar += 1
 70 |             obs.append(ob)
 71 |             action, info = agent.act(ob)
 72 |             actions.append(action)
 73 |             action_dists.append(info.get("action_dist", []))
 74 |             res = env.step(action)
 75 |             ob = res[0]
 76 |             rewards.append(res[1])
 77 |             if res[2] or timesteps_sofar == n_timesteps: #i.e., if done
 78 |                 path = {"obs": np.concatenate(np.expand_dims(obs, 0)),
 79 |                         "action_dists": np.concatenate(action_dists),
 80 |                         "rewards": np.array(rewards),
 81 |                         "actions": np.array(actions)}
 82 |                 paths.append(path)
 83 |                 break
 84 |     return paths
 85 | 
 86 | 
 87 | 
 88 | class Filter:
 89 |     def __init__(self, filter_mean=True):
 90 |         self.m1 = 0
 91 |         self.v = 0
 92 |         self.n = 0.
 93 |         self.filter_mean = filter_mean
 94 | 
 95 |     def __call__(self, o):
 96 |         self.m1 = self.m1 * (self.n / (self.n + 1)) + o    * 1/(1 + self.n)
 97 |         self.v = self.v * (self.n / (self.n + 1)) + (o - self.m1)**2 * 1/(1 + self.n)
 98 |         self.std = (self.v + 1e-6)**.5 # std
 99 |         self.n += 1
100 |         if self.filter_mean: 
101 |             o1 =  (o - self.m1)/self.std
102 |         else:
103 |             o1 =  o/self.std
104 |         o1 = (o1 > 10) * 10 + (o1 < -10)* (-10) + (o1 < 10) * (o1 > -10) * o1 
105 |         return o1
106 | filter = Filter()
107 | filter_std = Filter()
108 | 
109 | def rollout_contin(env, agent, max_pathlength, n_timesteps, render=False):
110 |     paths = []
111 |     timesteps_sofar = 0
112 |     first = True
113 |     while timesteps_sofar < n_timesteps:
114 |         obs, actions, rewards, action_dists_mu, action_dists_logstd = [], [], [], [], []
115 |         ob = filter(env.reset())
116 |         for _ in xrange(max_pathlength):
117 |             timesteps_sofar += 1
118 |             obs.append(ob)
119 |             action, info = agent.act(ob)
120 |             actions.append(action)
121 |             action_dists_mu.append(info.get("action_dist_mu", []))
122 |             action_dists_logstd.append(info.get("action_dist_logstd", []))
123 |             res = env.step(action)
124 |             ob = filter(res[0])
125 |             rewards.append((res[1]))
126 |             if render and first: env.render()
127 |             if res[2] or timesteps_sofar == n_timesteps:
128 |                 # forceful termination if timesteps_sofar == n_timesteps
129 |                 # otherwise paths is empty, which also is bad. 
130 |                 path = dict2(obs = np.concatenate(np.expand_dims(obs, 0)),
131 |                              action_dists_mu = np.concatenate(action_dists_mu),
132 |                              action_dists_logstd = np.concatenate(action_dists_logstd),
133 |                              rewards = np.array(rewards),
134 |                              actions =  np.array(actions))
135 |                 paths.append(path)
136 |                 break
137 |         first = False
138 |     return paths
139 | 
140 | 
141 | 
142 | 
143 | class LinearVF(object):
144 |     coeffs = None
145 | 
146 |     def _features(self, path):
147 |         o = path["obs"].astype('float32')
148 |         o = o.reshape(o.shape[0], -1)
149 |         l = len(path["rewards"])
150 |         al = np.arange(l).reshape(-1, 1) / 100.0
151 |         return np.concatenate([o, o**2, al, al**2, np.ones((l, 1))], axis=1)
152 | 
153 |     def fit(self, paths):
154 |         featmat = np.concatenate([self._features(path) for path in paths])
155 |         returns = np.concatenate([path["returns"] for path in paths])
156 |         n_col = featmat.shape[1]
157 |         lamb = 2.0
158 |         self.coeffs = np.linalg.lstsq(featmat.T.dot(featmat) + lamb * np.identity(n_col), featmat.T.dot(returns))[0]
159 | 
160 |     def predict(self, path):
161 |         return np.zeros(len(path["rewards"])) if self.coeffs is None else self._features(
162 |             path).dot(self.coeffs)
163 | 
164 | 
165 | def cat_sample(prob_nk):
166 |     assert prob_nk.ndim == 2
167 |     N = prob_nk.shape[0]
168 |     csprob_nk = np.cumsum(prob_nk, axis=1)
169 |     out = np.zeros(N, dtype='i')
170 |     for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)):
171 |         for (k, csprob) in enumerate(csprob_k):
172 |             if csprob > r:
173 |                 out[n] = k
174 |                 break
175 |     return out
176 | 
177 | 
178 | def var_shape(x):
179 |     out = [k.value for k in x.get_shape()]
180 |     assert all(isinstance(a, int) for a in out), \
181 |         "shape function assumes that shape is fully known"
182 |     return out
183 | 
184 | 
185 | def numel(x):
186 |     return np.prod(var_shape(x))
187 | 
188 | 
189 | def flatgrad(loss, var_list):
190 |     grads = tf.gradients(loss, var_list)
191 |     return tf.concat(0, [tf.reshape(grad, [numel(v)])
192 |                          for (v, grad) in zip(var_list, grads)])
193 | 
194 | 
195 | class SetFromFlat(object):
196 | 
197 |     def __init__(self, session, var_list):
198 |         self.session = session
199 |         assigns = []
200 |         shapes = map(var_shape, var_list)
201 |         total_size = sum(np.prod(shape) for shape in shapes)
202 |         self.theta = theta = tf.placeholder(dtype, [total_size])
203 |         start = 0
204 |         assigns = []
205 |         for (shape, v) in zip(shapes, var_list):
206 |             size = np.prod(shape)
207 |             assigns.append(
208 |                 tf.assign(
209 |                     v,
210 |                     tf.reshape(
211 |                         theta[
212 |                             start:start +
213 |                             size],
214 |                         shape)))
215 |             start += size
216 |         self.op = tf.group(*assigns)
217 | 
218 |     def __call__(self, theta):
219 |         self.session.run(self.op, feed_dict={self.theta: theta})
220 | 
221 | 
222 | class GetFlat(object):
223 | 
224 |     def __init__(self, session, var_list):
225 |         self.session = session
226 |         self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list])
227 | 
228 |     def __call__(self):
229 |         return self.op.eval(session=self.session)
230 | 
231 | 
232 | def slice_2d(x, inds0, inds1):
233 |     # in tf
234 |     inds0 = tf.cast(inds0, tf.int64)
235 |     inds1 = tf.cast(inds1, tf.int64)
236 |     shape = tf.cast(tf.shape(x), tf.int64)
237 |     ncols = shape[1]
238 |     x_flat = tf.reshape(x, [-1])
239 |     return tf.gather(x_flat, inds0 * ncols + inds1)
240 | 
241 | 
242 | def linesearch(f, x, fullstep, expected_improve_rate):
243 |     # in numpy
244 |     accept_ratio = .1
245 |     max_backtracks = 10
246 |     fval = f(x)
247 |     for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
248 |         xnew = x + stepfrac * fullstep
249 |         newfval = f(xnew)
250 |         actual_improve = fval - newfval
251 |         expected_improve = expected_improve_rate * stepfrac
252 |         ratio = actual_improve / expected_improve
253 |         if ratio > accept_ratio and actual_improve > 0:
254 |             return xnew
255 |     return x
256 | 
257 | 
258 | def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
259 |     # in numpy
260 |     p = b.copy()
261 |     r = b.copy()
262 |     x = np.zeros_like(b)
263 |     rdotr = r.dot(r)
264 |     for i in xrange(cg_iters):
265 |         z = f_Ax(p)
266 |         v = rdotr / p.dot(z)
267 |         x += v * p
268 |         r -= v * z
269 |         newrdotr = r.dot(r)
270 |         mu = newrdotr / rdotr
271 |         p = r + mu * p
272 |         rdotr = newrdotr
273 |         if rdotr < residual_tol:
274 |             break
275 |     return x
276 | 
277 | class dict2(dict):
278 |     def __init__(self, **kwargs):
279 |         dict.__init__(self, kwargs)
280 |         self.__dict__ = self
281 | 
282 | 
283 | 


--------------------------------------------------------------------------------