├── README.md
├── gym_experiment.py
├── policy_gradient.py
├── test_pg.py
└── tf_util.py
/README.md:
--------------------------------------------------------------------------------
1 | # tensorflow-policy-gradient
2 | 
3 | Still under construction...
4 | 
5 | ## Dependencies
6 | - Python 2.7
7 | - TensorFlow >= 0.8.0
8 | - NumPy >= 1.10.0
9 | - OpenAI Gym
10 | - matplotlib
11 | 
12 | ## Quick try
13 | Run
14 | ```bash
15 | python gym_experiment.py
16 | ```
17 | to train a softmax policy (without a bias term) using vanilla policy gradient on the [CartPole task](https://gym.openai.com/envs/CartPole-v0). The return increases stochastically until it reaches the maximum episode return of 200.
18 | 
--------------------------------------------------------------------------------
/gym_experiment.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import sys
5 | import time
6 | 
7 | import gym
8 | import tensorflow as tf
9 | import numpy as np
10 | import policy_gradient as pg
11 | import matplotlib.pyplot as plt
12 | 
13 | 
14 | def main():
15 |     parser = argparse.ArgumentParser()
16 | 
17 |     parser.add_argument('--env', type=str,
18 |                         default='CartPole-v0',
19 |                         help='Environment name.')
20 | 
21 |     parser.add_argument('--save_path', type=str,
22 |                         default='',
23 |                         help='Path to save experiments.')
24 | 
25 |     parser.add_argument('--config_file', type=str,
26 |                         default='',
27 |                         help='JSON file containing configurations.')
28 | 
29 |     parser.add_argument('--n_iters', type=int,
30 |                         default=200,
31 |                         help='Number of iterations.')
32 | 
33 |     parser.add_argument('--evaluate_freq', type=int,
34 |                         default=10,
35 |                         help='How often to evaluate on a game.')
36 | 
37 |     parser.add_argument('--max_steps', type=int,
38 |                         default=200,
39 |                         help='Upper limit of episode length.')
40 | 
41 | 
42 |     # Parameters for unittesting the implementation.
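    # --record (below) wraps the environment with gym's monitor (env.monitor.start),
    # which writes episode results under save_path for later inspection or upload.
    # A --config_file, when given, is a JSON object whose keys mirror NNAgent's
    # keyword arguments (a sketch, not a fixed schema), e.g.
    #   {"learning_rate": 100.0, "discount": 0.98,
    #    "use_softmax_bias": false, "use_rnn": false}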
43 | parser.add_argument('--record', dest='record', action='store_true', 44 | help='Whether to record and save this experiment.') 45 | parser.set_defaults(record=False) 46 | 47 | args = parser.parse_args() 48 | 49 | if not args.save_path: 50 | args.save_path = os.path.join('/tmp/', args.env + '-tmp-experiment') 51 | 52 | env = gym.make(args.env) 53 | 54 | if args.record: 55 | env.monitor.start(args.save_path, force=True) 56 | 57 | if args.config_file: 58 | with open(args.config_file, 'r') as f: 59 | config = json.load(f) 60 | agent = pg.NNAgent(env.action_space, env.observation_space, 61 | max_steps=args.max_steps, **config) 62 | else: 63 | agent = pg.NNAgent(env.action_space, env.observation_space, 64 | max_steps=args.max_steps, 65 | learning_rate=100.0, discount=0.98, 66 | use_softmax_bias=False, 67 | use_rnn=False) 68 | 69 | n_iters = args.n_iters 70 | iter_num = range(n_iters) 71 | returns = [] 72 | t1 = time.time() 73 | for i in xrange(n_iters): 74 | returns.append(agent.train_batch(env, total_steps=2000, batch_size=None)[:2]) 75 | # print agent.session.run(agent.train_graph.learning_rate) 76 | m_return = returns[-1][0] 77 | m_ep_len = returns[-1][1] 78 | print "Iteration %s:" % i 79 | print " average return {}\n average episode length {}".format(m_return, m_ep_len) 80 | 81 | if i % args.evaluate_freq == 0: 82 | evaluate(env, agent, 5, args.max_steps) 83 | 84 | t2 = time.time() 85 | print '{} sec used, {} sec per iteration.'.format(t2 - t1, (t2 - t1) / n_iters) 86 | 87 | if args.record: 88 | env.monitor.close() 89 | 90 | plt.plot(iter_num, [r[0] for r in returns]) 91 | plt.xlabel('Number of iterations') 92 | plt.ylabel('Average return') 93 | plt.show() 94 | plt.plot(iter_num, [r[1] for r in returns]) 95 | plt.ylabel('Average episode length') 96 | plt.show() 97 | 98 | 99 | def evaluate(env, agent, n_eps, max_steps): 100 | for i_episode in range(n_eps): 101 | observation = env.reset() 102 | for t in range(max_steps): 103 | env.render() 104 | action = agent.get_action(observation) 105 | observation, reward, done, info = env.step(action) 106 | if done: 107 | print("Episode finished after {} timesteps".format(t+1)) 108 | break 109 | else: 110 | print("Episode reached maximum length after {} timesteps".format(t+1)) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /policy_gradient.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tensorflow as tf 3 | import numpy as np 4 | import tf_util 5 | 6 | 7 | class NNAgent(object): 8 | # An reinforcement learning agent using vanilla policy gradient. 
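    # Vanilla policy gradient (REINFORCE): sample trajectories with the current
    # policy, then follow the return-weighted score function
    #   grad J(theta) ~ mean_t [ grad log pi_theta(a_t | s_t) * R_t ],
    # where R_t is the discounted return from step t onward.  NNGraph below
    # realizes this by minimizing the return-weighted negative log-likelihood
    # of the sampled actions.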
9 | def __init__(self, action_space, observation_space, 10 | use_rnn=False, use_fnn=False, 11 | max_steps=100, discount=0.9, learning_rate=0.01, 12 | use_softmax_bias=True, 13 | rnn_model='rnn', rnn_hidden_size=32, rnn_num_layers=1, 14 | fnn_hidden_sizes=[32, 32], 15 | fnn_activation_fns=[tf.nn.relu, tf.nn.relu], 16 | fnn_l2_scale=0.0): 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | with tf.name_scope('Training'): 20 | self.train_graph = NNGraph(action_space, observation_space, 21 | learning_rate=learning_rate, 22 | use_softmax_bias=use_softmax_bias, 23 | rnn_model=rnn_model, 24 | rnn_hidden_size=rnn_hidden_size, 25 | rnn_num_layers=rnn_num_layers, 26 | fnn_hidden_sizes=fnn_hidden_sizes, 27 | fnn_activation_fns=fnn_activation_fns, 28 | fnn_l2_scale=fnn_l2_scale, 29 | use_rnn=use_rnn, use_fnn=use_fnn) 30 | self.inference_graph = self.train_graph 31 | saver = tf.train.Saver(name='checkpoint_saver') 32 | init_op = tf.initialize_all_variables() 33 | self.session = tf.Session(graph=self.graph) 34 | self.session.run(init_op) 35 | self.n_actions = self.inference_graph.n_actions 36 | self.use_rnn = use_rnn 37 | self.max_steps = max_steps 38 | self.discount = discount 39 | if self.use_rnn: 40 | self.last_state = None 41 | self.need_reset = False 42 | 43 | def reset(self): 44 | if self.use_rnn: 45 | self.need_reset = True 46 | 47 | def get_actions(self, obs): 48 | "Given a batch of observations, produce a batch of actions." 49 | if self.use_rnn: 50 | if self.need_reset: 51 | self.last_state = self.session.run( 52 | self.inference_graph.zero_state, 53 | feed_dict={self.inference_graph.obs: obs}) 54 | self.need_reset = False 55 | 56 | probs, self.last_state = self.session.run( 57 | [self.inference_graph.probs, 58 | self.inference_graph.final_state], 59 | feed_dict={self.inference_graph.obs: obs, 60 | self.inference_graph.initial_state: 61 | self.last_state, 62 | self.inference_graph.seq_lens: [1] * obs.shape[1]}) 63 | else: 64 | probs = self.session.run(self.inference_graph.probs, 65 | feed_dict={self.inference_graph.obs: 66 | obs}) 67 | 68 | actions = [] 69 | for prob in probs: 70 | actions.append(np.random.choice(self.n_actions, 1, p=prob)[0]) 71 | return actions 72 | 73 | def get_action(self, ob): 74 | "Given one observation, produce one action." 75 | return self.get_actions(np.array([[ob]]))[0] 76 | 77 | def get_batch(self, env, batch_size=None, 78 | total_steps=2000): 79 | paths = [] 80 | if batch_size is None: 81 | batch_size = np.inf 82 | if total_steps is None: 83 | total_steps = np.inf 84 | if ((batch_size is None) and 85 | (total_steps is None)): 86 | raise ValueError("batch_size and total_steps can't all be None.") 87 | 88 | steps = 0 89 | i = 0 90 | # for _ in xrange(batch_size): 91 | while True: 92 | obs = [] 93 | actions = [] 94 | rewards = [] 95 | paddings = [] 96 | ob = env.reset() 97 | self.reset() 98 | for _ in xrange(self.max_steps): 99 | if isinstance(ob, np.ndarray): 100 | ob = np.reshape(ob, [-1]) 101 | action = self.get_action(ob) 102 | next_ob, reward, done, _ = env.step(action) 103 | obs.append(ob) 104 | actions.append(action) 105 | rewards.append(reward) 106 | ob = next_ob 107 | if done: 108 | break 109 | # We need to compute the empirical return for each 110 | # time step along the trajectory. 
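            # The backward pass implements
            #   returns[t] = rewards[t] + discount * returns[t + 1].
            # E.g. rewards [1, 1, 1] with discount 0.9 give return_so_far values
            # 1.0, 1.9, 2.71, which become returns = [2.71, 1.9, 1.0] after the
            # reversal below.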
111 | returns = [] 112 | return_so_far = 0.0 113 | for t in xrange(len(rewards) - 1, -1, -1): 114 | return_so_far = rewards[t] + self.discount * return_so_far 115 | returns.append(return_so_far) 116 | # The returns are stored backwards in time, so we need to revert it. 117 | returns = returns[::-1] 118 | 119 | steps += len(actions) 120 | i += 1 121 | if ((steps > total_steps) or 122 | (i > batch_size)): 123 | break 124 | 125 | paths.append(dict( 126 | observations=np.array(obs), 127 | actions=np.array(actions), 128 | rewards=np.array(rewards), 129 | returns=np.array(returns), 130 | ep_len=len(actions))) 131 | 132 | mean_return=np.mean([np.sum(path['rewards']) for path in paths]) 133 | mean_ep_len=np.mean([path['ep_len'] for path in paths]) 134 | return paths, mean_return, mean_ep_len 135 | 136 | def train_batch(self, env, batch_size=None, 137 | total_steps=2000): 138 | paths, mean_return, mean_ep_len = self.get_batch(env, batch_size=batch_size, 139 | total_steps=total_steps) 140 | obs_list = [path['observations'] for path in paths] 141 | actions_list = [path['actions'] for path in paths] 142 | returns_list = [path['returns'] for path in paths] 143 | 144 | if self.use_rnn: 145 | seq_lens = [path['ep_len'] for path in paths] 146 | max_ep_len = np.max(seq_lens) 147 | obs = pad_batch(obs_list, max_ep_len) 148 | actions = pad_batch(actions_list, max_ep_len) 149 | returns = pad_batch(returns_list, max_ep_len) 150 | 151 | self.last_state = self.session.run( 152 | self.inference_graph.zero_state, 153 | feed_dict={self.inference_graph.obs: obs}) 154 | 155 | # print actions.shape 156 | # print obs.shape 157 | # print self.train_graph.actions.get_shape() 158 | _, outputs = self.session.run( 159 | [self.train_graph.train_op, self.train_graph.outputs], 160 | feed_dict={self.train_graph.obs: obs, 161 | self.train_graph.initial_state: 162 | self.last_state, 163 | self.train_graph.seq_lens: seq_lens, 164 | self.train_graph.returns: returns, 165 | self.train_graph.actions: actions}) 166 | # print outputs.shape 167 | else: 168 | # If not useing RNN, just concatenate every 169 | # steps into one large list. 170 | obs = np.array([np.concatenate(obs_list)]) 171 | actions = np.array([np.concatenate(actions_list)]) 172 | returns = np.array([np.concatenate(returns_list)]) 173 | 174 | feed_dict = {self.train_graph.actions: actions, 175 | self.train_graph.returns: returns, 176 | self.train_graph.obs: obs} 177 | 178 | self.session.run([self.train_graph.train_op], 179 | feed_dict=feed_dict) 180 | 181 | return mean_return, mean_ep_len 182 | 183 | 184 | def pad_batch(batch, max_ep_len): 185 | num_dim = len(batch[0].shape) 186 | new_batch = [] 187 | for ep in batch: 188 | # the first dimension, number of steps in the 189 | # episode is padded to be the same as max_ep_len, 190 | # the rest dimensions are not touched. 
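        # E.g. an observation array of shape (ep_len, ob_dim) = (3, 4) with
        # max_ep_len = 5 is zero-padded to (5, 4); the swapaxes call at the end
        # of this function then stacks the episodes into a time-major
        # (max_ep_len, batch, ...) array.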
191 | padded_ep = np.pad(ep, ([(0, max_ep_len - ep.shape[0])] + 192 | [(0, 0)] * (num_dim - 1)), 193 | 'constant', constant_values=0) 194 | new_batch.append(padded_ep) 195 | new_batch = np.array(new_batch) 196 | time_major_batch = np.swapaxes(new_batch, 0, 1) 197 | return time_major_batch 198 | 199 | 200 | class NNGraph(object): 201 | def __init__(self, action_space, observation_space, 202 | learning_rate=0.001, use_rnn=False, use_fnn=False, 203 | max_grad_norm=5.0, rnn_model='lstm', 204 | rnn_hidden_size=128, rnn_num_layers=2, 205 | fnn_hidden_sizes=[128, 128], 206 | fnn_activation_fns=[tf.nn.relu, tf.nn.relu], 207 | fnn_l2_scale=0.0, 208 | use_softmax_bias=True, 209 | is_training=True): 210 | 211 | self.n_actions = action_space.n 212 | 213 | try: 214 | # observation is an instance of Box. 215 | self.ob_dim = np.product(observation_space.shape) 216 | self.is_discrete_ob = False 217 | except AttributeError: 218 | # observation space is an instance of Discrete. 219 | self.ob_dim = observation_space.n 220 | self.is_discrete_ob = True 221 | 222 | self.global_step = tf.get_variable( 223 | 'global_step', [], 224 | initializer=tf.constant_initializer(0.0), 225 | trainable=False) 226 | 227 | if use_rnn: 228 | shape = [None, None] 229 | else: 230 | shape = [1, None] 231 | 232 | # Placeholder to feed in observations, actions and returns. 233 | if self.is_discrete_ob: 234 | # if observation_space is an instance of Discrete, then 235 | # should use embeddings to expand it. 236 | self.obs = tf.placeholder(tf.int64, shape, 237 | name='Observation') 238 | # Embeddings layers. 239 | with tf.name_scope('Embeddings'): 240 | self.embedding = tf.constant(np.eye(self.ob_dim), dtype=tf.float32) 241 | self.inputs = tf.nn.embedding_lookup(self.embedding, self.obs) 242 | input_size = self.ob_dim 243 | else: 244 | # if observation_space is an instance of Box, 245 | # then just use itself. 246 | self.obs = tf.placeholder(tf.float32, 247 | shape + [self.ob_dim], # list(observation_space.shape), 248 | name='Observation') 249 | 250 | self.inputs = self.obs # tf.reshape(self.obs, [-1, ]) 251 | input_size = self.ob_dim 252 | 253 | if use_fnn: 254 | self.processed_inputs = tf_util.create_fnn_ops(self.inputs, input_size, 255 | hidden_sizes=fnn_hidden_sizes, 256 | activation_fns=fnn_activation_fns, 257 | l2_scale=fnn_l2_scale) 258 | self.processed_input_size = fnn_hidden_sizes[-1] 259 | else: 260 | self.processed_inputs = self.inputs 261 | self.processed_input_size = self.ob_dim 262 | 263 | if use_rnn: 264 | with tf.name_scope('Dynamic_RNN'): 265 | in_ops, out_ops = tf_util.create_rnn_ops(self.processed_inputs, 266 | self.processed_input_size, 267 | rnn_model=rnn_model, 268 | hidden_size=rnn_hidden_size, 269 | num_layers=rnn_num_layers) 270 | self.zero_state = in_ops[0] 271 | self.initial_state = in_ops[1] 272 | self.seq_lens = in_ops[2] 273 | self.outputs, self.final_state = out_ops 274 | output_dim = rnn_hidden_size 275 | else: 276 | self.outputs = self.processed_inputs 277 | output_dim = self.processed_input_size 278 | 279 | flat_outputs = tf.reshape(self.outputs, [-1, output_dim]) 280 | 281 | self.logits, self.probs = tf_util.create_softmax_ops(flat_outputs, 282 | output_dim, self.n_actions, 283 | use_softmax_bias=use_softmax_bias) 284 | 285 | with tf.name_scope('Training'): 286 | # actions and returns. 
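            # Both placeholders follow the observation layout: [time, batch]
            # when use_rnn is True, [1, n_steps] otherwise.  They are flattened
            # below so the same softmax / cross-entropy ops serve both cases,
            # with the returns acting as per-step example weights.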
287 | self.actions = tf.placeholder(tf.int64, 288 | [None, None], 289 | name='actions') 290 | 291 | self.returns = tf.placeholder(tf.float32, 292 | [None, None], 293 | name='returns') 294 | 295 | flat_actions = tf.reshape(self.actions, [-1]) 296 | flat_returns = tf.reshape(self.returns, [-1]) 297 | 298 | if use_rnn: 299 | self.mean_weighted_neg_ll = tf_util.rnn_weighted_neg_ll( 300 | self.logits, flat_actions, flat_returns, self.seq_lens) 301 | else: 302 | self.mean_weighted_neg_ll = tf_util.weighted_neg_ll( 303 | self.logits, flat_actions, flat_returns) 304 | 305 | with tf.name_scope('Optimization'): 306 | # self.learning_rate = tf.constant(learning_rate) 307 | self.learning_rate = tf.train.exponential_decay( 308 | learning_rate, self.global_step, 100, 1.0, staircase=True) 309 | 310 | tvars = tf.trainable_variables() 311 | 312 | # print [tvar.name for tvar in tvars] 313 | self.model_size = np.sum([np.product(tvar.get_shape().as_list()) 314 | for tvar in tvars]) 315 | print('model size is %s' % self.model_size) 316 | grads = tf.gradients(self.mean_weighted_neg_ll, tvars) 317 | self.grads = grads 318 | 319 | if use_rnn: 320 | grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) 321 | 322 | # self.grad_1 = grads[0] 323 | # self.grad_2 = grads[1] 324 | 325 | optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) 326 | 327 | # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate) 328 | # optimizer = tf.train.AdamOptimizer(self.learning_rate) 329 | 330 | self.train_op = optimizer.apply_gradients(zip(grads, tvars), 331 | global_step=self.global_step) 332 | 333 | 334 | -------------------------------------------------------------------------------- /test_pg.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | import numpy as np 4 | import gym 5 | 6 | import policy_gradient as pg 7 | 8 | class TestPG(unittest.TestCase): 9 | def setUp(self): 10 | self.env = gym.make('FrozenLake-v0') 11 | self.agent = pg.NNAgent(self.env.action_space, 12 | self.env.observation_space, 13 | max_steps=100, learning_rate=100.0, 14 | discount=0.98) 15 | 16 | def test_get_batch(self): 17 | paths, mean_return, mean_ep_len = self.agent.get_batch( 18 | self.env, total_steps=2000) 19 | self.assertTrue(np.sum([path['ep_len'] for path in paths]) <= 2000) 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def create_rnn_ops(inputs, input_size, rnn_model='lstm', 6 | hidden_size=128, num_layers=2, use_dropout=False, 7 | dropout_rate=0.0, time_major=True): 8 | "Utility function to create multi-layer RNN." 9 | if rnn_model == 'rnn': 10 | cell_fn = tf.nn.rnn_cell.BasicRNNCell 11 | elif rnn_model == 'lstm': 12 | cell_fn = tf.nn.rnn_cell.BasicLSTMCell 13 | elif rnn_model == 'gru': 14 | cell_fn = tf.nn.rnn_cell.GRUCell 15 | 16 | params = {'input_size': input_size} 17 | if rnn_model == 'lstm': 18 | # add bias to forget gate in lstm. 19 | params['forget_bias'] = 0.0 20 | 21 | # Create multilayer cell. 
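    # The first layer consumes input_size-dimensional inputs; each higher layer
    # consumes the hidden_size-dimensional output of the layer below, which is
    # why params['input_size'] is rebound before the remaining cells are built.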
22 | cell = cell_fn(hidden_size, 23 | **params) 24 | cells = [cell] 25 | params['input_size'] = hidden_size 26 | # more explicit way to create cells for MultiRNNCell than 27 | # [higher_layer_cell] * (self.num_layers - 1) 28 | for i in range(num_layers-1): 29 | higher_layer_cell = cell_fn(hidden_size, 30 | **params) 31 | cells.append(higher_layer_cell) 32 | 33 | if use_dropout and (dropout_rate > 0.0): 34 | # dropout_rate = tf.placeholder(tf.float32, [], 'dropout_rate') 35 | cells = [tf.nn.rnn_cell.DropoutWrapper( 36 | cell, output_keep_prob=1.0-dropout_rate) 37 | for cell in cells] 38 | 39 | multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells) 40 | 41 | # batch_size = tf.placeholder(tf.int32, 42 | # name='batch_size') 43 | 44 | batch_size = tf.shape(inputs)[1] 45 | 46 | with tf.name_scope('initial_state'): 47 | # zero_state is used to compute the intial state for cell. 48 | zero_state = multi_cell.zero_state(batch_size, tf.float32) 49 | # Placeholder to feed in initial state. 50 | initial_state = tf.placeholder(tf.float32, 51 | [None, multi_cell.state_size], 52 | 'initial_state') 53 | 54 | seq_lens = tf.placeholder(tf.int64, None, 'sequence_lengths') 55 | 56 | outputs, final_state = tf.nn.dynamic_rnn(multi_cell, inputs, seq_lens, 57 | initial_state=initial_state, 58 | time_major=time_major) 59 | 60 | return ((zero_state, initial_state, seq_lens), 61 | (outputs, final_state)) 62 | 63 | 64 | def create_fnn_ops(inputs, input_dim, 65 | hidden_sizes, activation_fns, 66 | l2_scale=0.0): 67 | "Utility function to create multi-layer FNN with l2 regularization." 68 | x_dim = input_dim 69 | x = inputs 70 | for i, h in enumerate(hidden_sizes): 71 | if activation_fns[i] == tf.nn.relu: 72 | init_b = 0.1 73 | else: 74 | init_b = 0.0 75 | 76 | a = tf.contrib.layers.fully_connected( 77 | x, h, activation_fn=activation_fns[i], 78 | weight_init=tf.truncated_normal_initializer(mean=0.0, stddev=0.1), 79 | bias_init=tf.constant_initializer(value=init_b), 80 | weight_regularizer=tf.contrib.layers.l2_regularizer(l2_scale)) 81 | x = a 82 | outputs = a 83 | return outputs 84 | 85 | 86 | def create_softmax_ops(inputs, input_dim, n_classes, use_softmax_bias=True): 87 | "Uitlity function to create softmax operations." 88 | with tf.name_scope('Softmax'): 89 | softmax_w = tf.get_variable("weights", #[output_dim, self.n_actions], 90 | initializer=tf.zeros_initializer( 91 | [input_dim, n_classes])) 92 | 93 | if use_softmax_bias: 94 | softmax_b = tf.get_variable( 95 | "bias", #[1, self.n_actions], 96 | initializer=tf.zeros_initializer([n_classes])) 97 | logits = tf.matmul(inputs, softmax_w) + softmax_b 98 | else: 99 | logits = tf.matmul(inputs, softmax_w) 100 | 101 | probs = tf.nn.softmax(logits) 102 | 103 | return logits, probs 104 | 105 | 106 | def weighted_neg_ll(logits, labels, example_weights): 107 | with tf.name_scope('weighted_neg_ll'): 108 | # Compute mean cross entropy loss for each output. 109 | neg_log_likelihood = tf.nn.sparse_softmax_cross_entropy_with_logits( 110 | logits, labels) 111 | mean_weighted_neg_ll = tf.reduce_mean( 112 | neg_log_likelihood * example_weights) 113 | return mean_weighted_neg_ll 114 | 115 | 116 | def rnn_weighted_neg_ll(logits, labels, example_weights, seq_lens): 117 | with tf.name_scope('rnn_weighted_neg_ll'): 118 | # Compute mean cross entropy loss for each output. 
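        # Unlike weighted_neg_ll above, the loss here is normalized by the
        # number of real (unpadded) steps, sum(seq_lens); padded positions have
        # zero example_weights, so they contribute nothing to the numerator.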
119 | neg_log_likelihood = tf.nn.sparse_softmax_cross_entropy_with_logits( 120 | logits, labels) 121 | 122 | mean_weighted_neg_ll = (tf.reduce_sum(neg_log_likelihood * example_weights) / 123 | tf.to_float(tf.reduce_sum(seq_lens))) 124 | 125 | return mean_weighted_neg_ll 126 | --------------------------------------------------------------------------------
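For reference, a minimal sketch of driving the agent directly (outside `gym_experiment.py`): it assumes the legacy Python 2 / gym / TensorFlow versions listed in the README and simply reuses the defaults that `gym_experiment.py` passes to `NNAgent`.

```python
import gym
import policy_gradient as pg

env = gym.make('CartPole-v0')

# Softmax policy without a bias term, trained with vanilla policy gradient.
agent = pg.NNAgent(env.action_space, env.observation_space,
                   max_steps=200, learning_rate=100.0, discount=0.98,
                   use_softmax_bias=False, use_rnn=False)

for i in xrange(100):
    # Collect roughly 2000 environment steps, then take one gradient step.
    mean_return, mean_ep_len = agent.train_batch(env, total_steps=2000)
    print 'iter %d: return %.1f, episode length %.1f' % (i, mean_return, mean_ep_len)
```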