├── .gitignore ├── LICENSE ├── README.md ├── feudal_networks ├── __init__.py ├── algos │ ├── __init__.py │ ├── feudal_policy_optimizer.py │ └── policy_optimizer.py ├── envs │ ├── __init__.py │ ├── debug_envs.py │ └── vision_maze.py ├── models │ ├── __init__.py │ └── models.py └── policies │ ├── __init__.py │ ├── configs │ ├── __init__.py │ ├── feudal_config.py │ └── lstm_config.py │ ├── feudal_batch_processor.py │ ├── feudal_policy.py │ ├── lstm_policy.py │ ├── policy.py │ └── policy_utils.py ├── scripts └── training │ ├── README.md │ ├── __init__.py │ ├── envs.py │ ├── train.py │ └── worker.py └── tests ├── __init__.py ├── run_tests.py ├── test_algos └── test_feudal_policy_optimizer.py ├── test_envs └── test_vision_maze.py └── test_policies ├── __init__.py ├── test_feudal_batch_processor.py ├── test_feudal_policy.py └── test_lstm_policy.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # mac 56 | .DS_Store 57 | 58 | # binaries 59 | *.bin 60 | *.out 61 | 62 | # emacs 63 | *~ 64 | 65 | # compile 66 | /compile 67 | 68 | # aws keys 69 | *.key 70 | 71 | # data 72 | *.weights 73 | *.meta 74 | *.p 75 | *.csv 76 | *checkpoint 77 | *.key 78 | *.npz 79 | *.dat 80 | *.jld 81 | *.idx 82 | *.png 83 | *.h5 84 | *.dat 85 | *.mat 86 | *.zip 87 | 88 | # notebook checkpoints 89 | *checkpoint* 90 | 91 | # cmake 92 | CMakeCache.txt 93 | CMakeFiles 94 | CMakeScripts 95 | Makefile 96 | cmake_install.cmake 97 | install_manifest.txt 98 | CTestTestfile.cmake 99 | 100 | 101 | # julia temp files 102 | *tmp_* 103 | 104 | # venv 105 | *venv* 106 | 107 | # emacs 108 | *#*#* 109 | 110 | # batch byproducts 111 | *.err 112 | 113 | # gym 114 | *.json 115 | 116 | # snapshots 117 | *.pkl 118 | *.npz 119 | 120 | # summaries 121 | *events* 122 | 123 | # media 124 | *.pdf 125 | *.gif 126 | 127 | # snapshots 128 | *snapshots* 129 | *media* 130 | *visualizations* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 dmakian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # feudal_networks 2 | An implementation of FeUdal Networks for Hierarchical Learning as published : https://arxiv.org/abs/1703.01161 3 | 4 | Implementation and training framework derived from the OpenAI starter agent: https://github.com/openai/universe-starter-agent 5 | -------------------------------------------------------------------------------- /feudal_networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/__init__.py -------------------------------------------------------------------------------- /feudal_networks/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/algos/__init__.py -------------------------------------------------------------------------------- /feudal_networks/algos/feudal_policy_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Much of the code in this file was originally developed as part of the 3 | universe starter agent: https://github.com/openai/universe-starter-agent 4 | """ 5 | from collections import namedtuple 6 | import numpy as np 7 | import scipy.signal 8 | import tensorflow as tf 9 | import threading 10 | import six.moves.queue as queue 11 | 12 | from feudal_networks.policies.lstm_policy import LSTMPolicy 13 | from feudal_networks.policies.feudal_policy import FeudalPolicy 14 | 15 | def discount(x, gamma): 16 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 17 | 18 | def process_rollout(rollout, gamma, lambda_=1.0): 19 | """ 20 | given a rollout, compute its returns and the advantage 21 | """ 22 | batch_si = np.asarray(rollout.states) 23 | batch_a = np.asarray(rollout.actions) 24 | 25 | rewards = np.asarray(rollout.rewards) 26 | vpred_t = np.asarray(rollout.values + [rollout.r]) 27 | rewards_plus_v = np.asarray(rollout.rewards + [rollout.r]) 28 | batch_r = discount(rewards_plus_v, gamma)[:-1] 29 | 30 | batch_s = np.asarray(rollout.ss) 31 | batch_g = np.asarray(rollout.gs) 32 | features = rollout.features 33 | return Batch(batch_si, batch_a, batch_r, rollout.terminal,batch_s,batch_g, features) 34 | 35 | Batch = namedtuple("Batch", ["obs", "a", "returns", "terminal", "s", "g", "features"]) 36 | # Batch = namedtuple("Batch", ["si", "a", "adv", "r", "terminal", "features"]) 37 | 38 | class PartialRollout(object): 39 | """ 40 | a piece of a complete rollout. We run our agent, and process its experience 41 | once it has processed enough steps. 
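    In this FeUdal variant the rollout additionally records, at every step, the
    manager's internal state s (self.ss), the manager's goal g (self.gs), and the
    recurrent features fed to the policy, so that the FeudalBatchProcessor can
    later derive s_diff, the intrinsic reward, and the goal sums.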
42 | """ 43 | def __init__(self): 44 | self.states = [] 45 | self.actions = [] 46 | self.rewards = [] 47 | self.values = [] 48 | self.ss = [] 49 | self.gs = [] 50 | self.features = [] 51 | self.r = 0.0 52 | self.terminal = False 53 | 54 | def add(self, state, action, reward, value,g,s, terminal, features): 55 | self.states += [state] 56 | self.actions += [action] 57 | self.rewards += [reward] 58 | self.values += [value] 59 | self.terminal = terminal 60 | self.features += [features] 61 | self.gs += [g] 62 | self.ss += [s] 63 | 64 | def extend(self, other): 65 | assert not self.terminal 66 | self.states.extend(other.states) 67 | self.actions.extend(other.actions) 68 | self.rewards.extend(other.rewards) 69 | self.values.extend(other.values) 70 | self.gs.extend(other.gs) 71 | self.ss.extend(other.ss) 72 | self.r = other.r 73 | self.terminal = other.terminal 74 | self.features.extend(other.features) 75 | 76 | class RunnerThread(threading.Thread): 77 | """ 78 | One of the key distinctions between a normal environment and a universe environment 79 | is that a universe environment is _real time_. This means that there should be a thread 80 | that would constantly interact with the environment and tell it what to do. This thread is here. 81 | """ 82 | def __init__(self, env, policy, num_local_steps, visualise): 83 | threading.Thread.__init__(self) 84 | self.queue = queue.Queue(5) 85 | self.num_local_steps = num_local_steps 86 | self.env = env 87 | self.last_features = None 88 | self.policy = policy 89 | self.daemon = True 90 | self.sess = None 91 | self.summary_writer = None 92 | self.visualise = visualise 93 | 94 | def start_runner(self, sess, summary_writer): 95 | self.sess = sess 96 | self.summary_writer = summary_writer 97 | self.start() 98 | 99 | def run(self): 100 | with self.sess.as_default(): 101 | self._run() 102 | 103 | def _run(self): 104 | rollout_provider = env_runner(self.env, self.policy, self.num_local_steps, self.summary_writer, self.visualise) 105 | while True: 106 | # the timeout variable exists because apparently, if one worker dies, the other workers 107 | # won't die with it, unless the timeout is set to some large number. This is an empirical 108 | # observation. 109 | 110 | self.queue.put(next(rollout_provider), timeout=600.0) 111 | 112 | def env_runner(env, policy, num_local_steps, summary_writer,visualise): 113 | """ 114 | The logic of the thread runner. In brief, it constantly keeps on running 115 | the policy, and as long as the rollout exceeds a certain length, the thread 116 | runner appends the policy to the queue. 
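    More precisely, env_runner yields a PartialRollout once num_local_steps steps
    (or a terminal state) have been collected; it is the RunnerThread above that
    places each yielded rollout on its queue.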
117 | """ 118 | last_state = env.reset() 119 | last_c_g,last_features = policy.get_initial_features() 120 | # print last_c_g 121 | length = 0 122 | rewards = 0 123 | 124 | while True: 125 | terminal_end = False 126 | rollout = PartialRollout() 127 | 128 | for _ in range(num_local_steps): 129 | # print last_c_g.shape 130 | fetched = policy.act(last_state,last_c_g, *last_features) 131 | action, value_, g,s,last_c_g,features = fetched[0], fetched[1], \ 132 | fetched[2], fetched[3], \ 133 | fetched[4], fetched[5:] 134 | action_to_take = action.argmax() 135 | # print action_to_take 136 | # print action 137 | # print g 138 | # print s 139 | # # exit(0) 140 | state, reward, terminal, info = env.step(action_to_take) 141 | 142 | # collect the experience 143 | rollout.add(last_state, action, reward, value_, g, s, terminal, last_features) 144 | length += 1 145 | rewards += reward 146 | 147 | last_state = state 148 | last_features = features 149 | 150 | if info: 151 | summary = tf.Summary() 152 | for k, v in info.items(): 153 | summary.value.add(tag=k, simple_value=float(v)) 154 | summary_writer.add_summary(summary, policy.global_step.eval()) 155 | summary_writer.flush() 156 | 157 | timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') 158 | if terminal or length >= timestep_limit: 159 | terminal_end = True 160 | if length >= timestep_limit or not env.metadata.get('semantics.autoreset'): 161 | last_state = env.reset() 162 | last_c_g,last_features = policy.get_initial_features() 163 | print("Episode finished. Sum of rewards: %f. Length: %d" % (rewards, length)) 164 | length = 0 165 | rewards = 0 166 | break 167 | 168 | if not terminal_end: 169 | rollout.r = policy.value(last_state, last_c_g, *last_features) 170 | 171 | # once we have enough experience, yield it, and have the ThreadRunner place it on a queue 172 | yield rollout 173 | 174 | class FeudalPolicyOptimizer(object): 175 | def __init__(self, env, task, policy,visualise): 176 | self.env = env 177 | self.task = task 178 | 179 | worker_device = "/job:worker/task:{}/cpu:0".format(task) 180 | with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)): 181 | with tf.variable_scope("global"): 182 | self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), 183 | trainable=False) 184 | self.network = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 185 | 186 | with tf.device(worker_device): 187 | with tf.variable_scope("local"): 188 | self.local_network = pi = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 189 | pi.global_step = self.global_step 190 | self.policy = pi 191 | # build runner thread for collecting rollouts 192 | self.runner = RunnerThread(env, self.policy, 20,visualise) 193 | 194 | # formulate gradients 195 | grads = tf.gradients(pi.loss, pi.var_list) 196 | grads, _ = tf.clip_by_global_norm(grads, 40) 197 | 198 | # build sync 199 | # copy weights from the parameter server to the local model 200 | self.sync = tf.group(*[v1.assign(v2) 201 | for v1, v2 in zip(pi.var_list, self.network.var_list)]) 202 | grads_and_vars = list(zip(grads, self.network.var_list)) 203 | # for g,v in grads_and_vars: 204 | # print g.name,v.name 205 | inc_step = self.global_step.assign_add(tf.shape(pi.obs)[0]) 206 | 207 | # build train op 208 | opt = tf.train.AdamOptimizer(1e-4) 209 | self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) 210 | self.summary_writer = None 211 | 
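            # note: the clipped gradients are computed from the local network's
            # loss but applied to the global network's variables; self.sync
            # (built above) copies the global weights back into the local copy
            # at the start of every train() call.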
self.local_steps = 0 212 | 213 | def start(self, sess, summary_writer): 214 | self.runner.start_runner(sess, summary_writer) 215 | self.summary_writer = summary_writer 216 | 217 | def pull_batch_from_queue(self): 218 | """ 219 | self explanatory: take a rollout from the queue of the thread runner. 220 | """ 221 | rollout = self.runner.queue.get(timeout=600.0) 222 | while not rollout.terminal: 223 | try: 224 | rollout.extend(self.runner.queue.get_nowait()) 225 | except queue.Empty: 226 | break 227 | return rollout 228 | 229 | def train(self, sess): 230 | """ 231 | This first runs the sync op so that the gradients are computed wrt the 232 | current global weights. It then takes a rollout from the runner's queue, 233 | converts it to a batch, and passes that batch and the train op to the 234 | policy to perform an update. 235 | """ 236 | # copy weights from shared to local 237 | # this should be run first so that the updates are for the most 238 | # recent global weights 239 | sess.run(self.sync) 240 | rollout = self.pull_batch_from_queue() 241 | batch = process_rollout(rollout, gamma=.99) 242 | batch = self.policy.update_batch(batch) 243 | compute_summary = self.task == 0 and self.local_steps % 11 == 0 244 | # should_compute_summary = True 245 | should_compute_summary = self.task == 0 and self.local_steps % 11 == 0 246 | 247 | if should_compute_summary: 248 | fetches = [self.policy.summary_op, self.train_op, self.global_step] 249 | else: 250 | fetches = [self.train_op, self.global_step] 251 | 252 | feed_dict = { 253 | self.policy.obs: batch.obs, 254 | self.network.obs: batch.obs, 255 | 256 | self.policy.ac: batch.a, 257 | self.network.ac: batch.a, 258 | 259 | self.policy.r: batch.returns, 260 | self.network.r: batch.returns, 261 | 262 | self.policy.s_diff: batch.s_diff, 263 | self.network.s_diff: batch.s_diff, 264 | 265 | self.policy.prev_g: batch.gsum, 266 | self.network.prev_g: batch.gsum, 267 | 268 | self.policy.ri: batch.ri, 269 | self.network.ri: batch.ri 270 | } 271 | 272 | for i in range(len(self.policy.state_in)): 273 | feed_dict[self.policy.state_in[i]] = batch.features[i] 274 | feed_dict[self.network.state_in[i]] = batch.features[i] 275 | 276 | 277 | fetched = sess.run(fetches, feed_dict=feed_dict) 278 | 279 | if should_compute_summary: 280 | self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1]) 281 | self.summary_writer.flush() 282 | self.local_steps += 1 283 | -------------------------------------------------------------------------------- /feudal_networks/algos/policy_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Much of the code in this file was originally developed as part of the 3 | universe starter agent: https://github.com/openai/universe-starter-agent 4 | """ 5 | from collections import namedtuple 6 | import numpy as np 7 | import scipy.signal 8 | import tensorflow as tf 9 | import threading 10 | import six.moves.queue as queue 11 | 12 | from feudal_networks.policies.lstm_policy import LSTMPolicy 13 | from feudal_networks.policies.feudal_policy import FeudalPolicy 14 | 15 | def discount(x, gamma): 16 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 17 | 18 | def process_rollout(rollout, gamma, lambda_=1.0): 19 | """ 20 | given a rollout, compute its returns and the advantage 21 | """ 22 | batch_si = np.asarray(rollout.states) 23 | batch_a = np.asarray(rollout.actions) 24 | rewards = np.asarray(rollout.rewards) 25 | vpred_t = np.asarray(rollout.values + [rollout.r]) 
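    # a worked example of discount() above, as a comment-only sketch:
    #     discount([r0, r1, r2], g) == [r0 + g*r1 + g**2*r2,  r1 + g*r2,  r2]
    # vpred_t appends the bootstrap value rollout.r so that the TD residuals
    # delta_t below can be formed for Generalized Advantage Estimation.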
26 | 27 | rewards_plus_v = np.asarray(rollout.rewards + [rollout.r]) 28 | batch_r = discount(rewards_plus_v, gamma)[:-1] 29 | delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1] 30 | # this formula for the advantage comes "Generalized Advantage Estimation": 31 | # https://arxiv.org/abs/1506.02438 32 | batch_adv = discount(delta_t, gamma * lambda_) 33 | 34 | features = rollout.features[0] 35 | # print features 36 | return Batch(batch_si, batch_a, batch_adv, batch_r, rollout.terminal, features) 37 | 38 | # Batch = namedtuple("Batch", ["obs", "a", "returns", "terminal", "s", "g", "features"]) 39 | Batch = namedtuple("Batch", ["si", "a", "adv", "r", "terminal", "features"]) 40 | 41 | class PartialRollout(object): 42 | """ 43 | a piece of a complete rollout. We run our agent, and process its experience 44 | once it has processed enough steps. 45 | """ 46 | def __init__(self): 47 | self.states = [] 48 | self.actions = [] 49 | self.rewards = [] 50 | self.values = [] 51 | self.r = 0.0 52 | self.terminal = False 53 | self.features = [] 54 | 55 | def add(self, state, action, reward, value, terminal, features): 56 | self.states += [state] 57 | self.actions += [action] 58 | self.rewards += [reward] 59 | self.values += [value] 60 | self.terminal = terminal 61 | self.features += [features] 62 | 63 | def extend(self, other): 64 | assert not self.terminal 65 | self.states.extend(other.states) 66 | self.actions.extend(other.actions) 67 | self.rewards.extend(other.rewards) 68 | self.values.extend(other.values) 69 | self.r = other.r 70 | self.terminal = other.terminal 71 | self.features.extend(other.features) 72 | 73 | class RunnerThread(threading.Thread): 74 | """ 75 | One of the key distinctions between a normal environment and a universe environment 76 | is that a universe environment is _real time_. This means that there should be a thread 77 | that would constantly interact with the environment and tell it what to do. This thread is here. 78 | """ 79 | def __init__(self, env, policy, num_local_steps, visualise): 80 | threading.Thread.__init__(self) 81 | self.queue = queue.Queue(5) 82 | self.num_local_steps = num_local_steps 83 | self.env = env 84 | self.last_features = None 85 | self.policy = policy 86 | self.daemon = True 87 | self.sess = None 88 | self.summary_writer = None 89 | self.visualise = visualise 90 | 91 | def start_runner(self, sess, summary_writer): 92 | self.sess = sess 93 | self.summary_writer = summary_writer 94 | self.start() 95 | 96 | def run(self): 97 | with self.sess.as_default(): 98 | self._run() 99 | 100 | def _run(self): 101 | rollout_provider = env_runner(self.env, self.policy, self.num_local_steps, self.summary_writer, self.visualise) 102 | while True: 103 | # the timeout variable exists because apparently, if one worker dies, the other workers 104 | # won't die with it, unless the timeout is set to some large number. This is an empirical 105 | # observation. 106 | 107 | self.queue.put(next(rollout_provider), timeout=600.0) 108 | 109 | def env_runner(env, policy, num_local_steps, summary_writer,visualise): 110 | """ 111 | The logic of the thread runner. In brief, it constantly keeps on running 112 | the policy, and as long as the rollout exceeds a certain length, the thread 113 | runner appends the policy to the queue. 
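    (In practice it yields PartialRollout objects, which the RunnerThread above
    places on its queue.)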
114 | """ 115 | last_state = env.reset() 116 | last_features = policy.get_initial_features() 117 | length = 0 118 | rewards = 0 119 | 120 | while True: 121 | terminal_end = False 122 | rollout = PartialRollout() 123 | 124 | for _ in range(num_local_steps): 125 | fetched = policy.act(last_state, *last_features) 126 | action, value_, features = fetched[0], fetched[1], fetched[2:] 127 | action_to_take = action.argmax() 128 | state, reward, terminal, info = env.step(action_to_take) 129 | 130 | # collect the experience 131 | rollout.add(last_state, action, reward, value_, terminal, last_features) 132 | length += 1 133 | rewards += reward 134 | 135 | last_state = state 136 | last_features = features 137 | 138 | if info: 139 | summary = tf.Summary() 140 | for k, v in info.items(): 141 | summary.value.add(tag=k, simple_value=float(v)) 142 | summary_writer.add_summary(summary, policy.global_step.eval()) 143 | summary_writer.flush() 144 | 145 | timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') 146 | if terminal or length >= timestep_limit: 147 | terminal_end = True 148 | if length >= timestep_limit or not env.metadata.get('semantics.autoreset'): 149 | last_state = env.reset() 150 | last_features = policy.get_initial_features() 151 | print("Episode finished. Sum of rewards: %f. Length: %d" % (rewards, length)) 152 | length = 0 153 | rewards = 0 154 | break 155 | 156 | if not terminal_end: 157 | rollout.r = policy.value(last_state, *last_features) 158 | 159 | # once we have enough experience, yield it, and have the ThreadRunner place it on a queue 160 | yield rollout 161 | 162 | class PolicyOptimizer(object): 163 | def __init__(self, env, task, policy,visualise): 164 | self.env = env 165 | self.task = task 166 | 167 | worker_device = "/job:worker/task:{}/cpu:0".format(task) 168 | with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)): 169 | with tf.variable_scope("global"): 170 | self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), 171 | trainable=False) 172 | if policy == 'lstm': 173 | self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 174 | elif policy == 'feudal': 175 | self.network = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 176 | else: 177 | print("Policy type unknown") 178 | exit(0) 179 | 180 | with tf.device(worker_device): 181 | with tf.variable_scope("local"): 182 | if policy == 'lstm': 183 | self.local_network = pi = LSTMPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 184 | elif policy == 'feudal': 185 | self.local_network = pi = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 186 | else: 187 | print("Policy type unknown") 188 | exit(0) 189 | pi.global_step = self.global_step 190 | self.policy = pi 191 | # build runner thread for collecting rollouts 192 | self.runner = RunnerThread(env, self.policy, 20,visualise) 193 | 194 | # formulate gradients 195 | grads = tf.gradients(pi.loss, pi.var_list) 196 | grads, _ = tf.clip_by_global_norm(grads, 40) 197 | 198 | # build sync 199 | # copy weights from the parameter server to the local model 200 | self.sync = tf.group(*[v1.assign(v2) 201 | for v1, v2 in zip(pi.var_list, self.network.var_list)]) 202 | grads_and_vars = list(zip(grads, self.network.var_list)) 203 | # for g,v in grads_and_vars: 204 | # print g.name,v.name 205 | inc_step = self.global_step.assign_add(tf.shape(pi.obs)[0]) 206 | 207 | # 
build train op 208 | opt = tf.train.AdamOptimizer(1e-4) 209 | self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) 210 | self.summary_writer = None 211 | self.local_steps = 0 212 | 213 | def start(self, sess, summary_writer): 214 | self.runner.start_runner(sess, summary_writer) 215 | self.summary_writer = summary_writer 216 | 217 | def pull_batch_from_queue(self): 218 | """ 219 | self explanatory: take a rollout from the queue of the thread runner. 220 | """ 221 | rollout = self.runner.queue.get(timeout=600.0) 222 | while not rollout.terminal: 223 | try: 224 | rollout.extend(self.runner.queue.get_nowait()) 225 | except queue.Empty: 226 | break 227 | return rollout 228 | 229 | def train(self, sess): 230 | """ 231 | This first runs the sync op so that the gradients are computed wrt the 232 | current global weights. It then takes a rollout from the runner's queue, 233 | converts it to a batch, and passes that batch and the train op to the 234 | policy to perform an update. 235 | """ 236 | # copy weights from shared to local 237 | # this should be run first so that the updates are for the most 238 | # recent global weights 239 | sess.run(self.sync) 240 | rollout = self.pull_batch_from_queue() 241 | batch = process_rollout(rollout, gamma=.99) 242 | batch = self.policy.update_batch(batch) 243 | compute_summary = self.task == 0 and self.local_steps % 11 == 0 244 | should_compute_summary = self.task == 0 and self.local_steps % 11 == 0 245 | 246 | if should_compute_summary: 247 | fetches = [self.policy.summary_op, self.train_op, self.global_step] 248 | else: 249 | fetches = [self.train_op, self.global_step] 250 | 251 | feed_dict = { 252 | self.policy.obs: batch.si, 253 | self.network.obs: batch.si, 254 | 255 | self.policy.ac: batch.a, 256 | self.network.ac: batch.a, 257 | 258 | self.policy.adv: batch.adv, 259 | self.network.adv: batch.adv, 260 | 261 | self.policy.r: batch.r, 262 | self.network.r: batch.r, 263 | } 264 | 265 | for i in range(len(self.policy.state_in)): 266 | feed_dict[self.policy.state_in[i]] = batch.features[i] 267 | feed_dict[self.network.state_in[i]] = batch.features[i] 268 | 269 | 270 | fetched = sess.run(fetches, feed_dict=feed_dict) 271 | 272 | if should_compute_summary: 273 | self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1]) 274 | self.summary_writer.flush() 275 | self.local_steps += 1 276 | -------------------------------------------------------------------------------- /feudal_networks/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='OneRoundDeterministicRewardBoxObs-v0', 5 | entry_point='feudal_networks.envs.debug_envs:OneRoundDeterministicRewardBoxObsEnv', 6 | max_episode_steps=1, 7 | tags = { 8 | 'feudal': True 9 | } 10 | ) 11 | 12 | register( 13 | id='VisionMaze-v0', 14 | entry_point='feudal_networks.envs.vision_maze:VisionMazeEnv', 15 | max_episode_steps=200, 16 | kwargs = { 17 | 'room_length': 3, 18 | 'num_rooms_per_side': 2 19 | }, 20 | tags = { 21 | 'feudal': True 22 | } 23 | ) 24 | -------------------------------------------------------------------------------- /feudal_networks/envs/debug_envs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: adapted from the original debugging environment to have Box obs space 3 | 4 | Simple environment with known optimal policy and value function. 5 | 6 | This environment has just two actions. 
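(Observations are a constant all-zeros array of shape obs_shape exposed through a
Box space, so only the reward carries information.)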
7 | Action 0 yields 0 reward and then terminates the session. 8 | Action 1 yields 1 reward and then terminates the session. 9 | 10 | Optimal policy: action 1. 11 | 12 | Optimal value function: v(0)=1 (there is only one state, state 0) 13 | """ 14 | 15 | import numpy as np 16 | import gym 17 | from gym import spaces 18 | 19 | class OneRoundDeterministicRewardBoxObsEnv(gym.Env): 20 | def __init__(self, obs_shape=(64,64,1)): 21 | self.action_space = spaces.Discrete(2) 22 | self.observation_space = spaces.Box(low=0, high=0, shape=obs_shape) 23 | self._obs = np.zeros(obs_shape) 24 | 25 | def _step(self, action): 26 | assert self.action_space.contains(action) 27 | reward = 1 if action == 1 else 0 28 | return self._obs, reward, True, {} 29 | 30 | def _reset(self): 31 | return self._obs -------------------------------------------------------------------------------- /feudal_networks/envs/vision_maze.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | from gym import spaces 4 | import numpy as np 5 | 6 | class VisionMazeEnv(gym.Env): 7 | def __init__(self, room_length=3, num_rooms_per_side=2): 8 | assert room_length % 2 == 1, "room_length must be odd" 9 | assert room_length >= 3, "room_length must be greater than 3" 10 | assert num_rooms_per_side >= 1, "must have at least 1 room" 11 | 12 | self.room_length = room_length 13 | self.num_rooms_per_side = num_rooms_per_side 14 | # 0 = up, 1 = right, 2 = down, 3 = left 15 | self.action_space = spaces.Discrete(4) 16 | self.max_pos = room_length * num_rooms_per_side - 1 17 | obs_space = (self.max_pos + 1, self.max_pos + 1, 1) 18 | self.observation_space = spaces.Box(low=0, high=1, shape=obs_space) 19 | self.goal_reward = 1 20 | self.goal_state = [self.max_pos, self.max_pos] 21 | self._obs = np.zeros(obs_space) 22 | self._reset() 23 | 24 | def _get_obs(self): 25 | self._obs.fill(0) 26 | self._obs[self.state[0], self.state[1], :] = 1 27 | return self._obs 28 | 29 | def _reset(self): 30 | # start in random state in the maze 31 | x = np.random.randint(self.max_pos) 32 | y = np.random.randint(self.max_pos) 33 | self.state = np.array([x, y]) 34 | return self._get_obs() 35 | 36 | def _step(self, a): 37 | assert self.action_space.contains(a) 38 | x, y = self.state 39 | 40 | # up 41 | if a == 0: 42 | y = self._step_up(x, y) 43 | # right 44 | elif a == 1: 45 | x = self._step_right(x, y) 46 | # down 47 | elif a == 2: 48 | y = self._step_down(x, y) 49 | # left 50 | else: 51 | x = self._step_left(x, y) 52 | 53 | r, done = 0, False 54 | if x == self.goal_state[0] and y == self.goal_state[1]: 55 | r, done = self.goal_reward, True 56 | 57 | self.state = np.array([x, y]) 58 | return self._get_obs(), r, done, {} 59 | 60 | def _step_up(self, x, y): 61 | ny = y + 1 62 | 63 | # convert to single room format 64 | local_ny = ny % self.room_length 65 | 66 | # this condition True indicates passing through wall 67 | if local_ny == 0: 68 | 69 | # this is only allowed if passing through doorway 70 | if not (x % self.room_length == self.room_length // 2): 71 | ny = y 72 | 73 | ny = min(ny, self.max_pos) 74 | return ny 75 | 76 | def _step_right(self, x, y): 77 | nx = x + 1 78 | 79 | # convert to single room format 80 | local_nx = nx % self.room_length 81 | 82 | # this condition True indicates passing through wall 83 | if local_nx == 0: 84 | 85 | # this is only allowed if passing through doorway 86 | if not (y % self.room_length == self.room_length // 2): 87 | nx = x 88 | 89 | nx = min(nx, self.max_pos) 90 | return nx 91 | 92 | def 
_step_down(self, x, y): 93 | ny = y - 1 94 | 95 | # convert to single room format 96 | local_ny = ny % self.room_length 97 | 98 | # this condition True indicates passing through wall 99 | if local_ny == self.room_length - 1: 100 | 101 | # this is only allowed if passing through doorway 102 | if not (x % self.room_length == self.room_length // 2): 103 | ny = y 104 | 105 | ny = max(0, ny) 106 | return ny 107 | 108 | def _step_left(self, x, y): 109 | nx = x - 1 110 | 111 | # convert to single room format 112 | local_nx = nx % self.room_length 113 | 114 | # this condition True indicates passing through wall 115 | if local_nx == self.room_length - 1: 116 | 117 | # this is only allowed if passing through doorway 118 | if not (y % self.room_length == self.room_length // 2): 119 | nx = x 120 | 121 | nx = max(0, nx) 122 | return nx 123 | -------------------------------------------------------------------------------- /feudal_networks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/models/__init__.py -------------------------------------------------------------------------------- /feudal_networks/models/models.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.contrib.rnn as rnn 5 | 6 | def normalized_columns_initializer(std=1.0): 7 | def _initializer(shape, dtype=None, partition_info=None): 8 | out = np.random.randn(*shape).astype(np.float32) 9 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 10 | return tf.constant(out) 11 | return _initializer 12 | 13 | def linear(x, size, name, initializer=None, bias_init=0): 14 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], 15 | initializer=initializer) 16 | b = tf.get_variable(name + "/b", [size], 17 | initializer=tf.constant_initializer(bias_init)) 18 | return tf.matmul(x, w) + b 19 | 20 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", 21 | dtype=tf.float32, collections=None): 22 | with tf.variable_scope(name): 23 | stride_shape = [1, stride[0], stride[1], 1] 24 | filter_shape = [filter_size[0], filter_size[1], 25 | int(x.get_shape()[3]), num_filters] 26 | 27 | # there are "num input feature maps * filter height * filter width" 28 | # inputs to each hidden unit 29 | fan_in = np.prod(filter_shape[:3]) 30 | # each unit in the lower layer receives a gradient from: 31 | # "num output feature maps * filter height * filter width" / 32 | # pooling size 33 | fan_out = np.prod(filter_shape[:2]) * num_filters 34 | # initialize weights with random weights 35 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 36 | 37 | w = tf.get_variable("W", filter_shape, dtype, 38 | tf.random_uniform_initializer(-w_bound, w_bound), 39 | collections=collections) 40 | b = tf.get_variable("b", [1, 1, 1, num_filters], 41 | initializer=tf.constant_initializer(0.0), 42 | collections=collections) 43 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 44 | 45 | def build_lstm(x, size, name, step_size): 46 | lstm = rnn.BasicLSTMCell(size, state_is_tuple=True) 47 | 48 | c_init = np.zeros((1, lstm.state_size.c), np.float32) 49 | h_init = np.zeros((1, lstm.state_size.h), np.float32) 50 | state_init = [c_init, h_init] 51 | 52 | c_in = tf.placeholder(tf.float32, 53 | shape=[1, lstm.state_size.c], 54 | name='c_in') 55 | h_in = tf.placeholder(tf.float32, 56 | shape=[1, lstm.state_size.h], 57 | name='h_in') 58 | state_in = [c_in, h_in] 59 | 60 | state_in = rnn.LSTMStateTuple(c_in, h_in) 61 | 62 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn( 63 | lstm, x, initial_state=state_in, sequence_length=step_size, 64 | time_major=False) 65 | lstm_outputs = tf.reshape(lstm_outputs, [-1, size]) 66 | 67 | lstm_c, lstm_h = lstm_state 68 | state_out = [lstm_c[:1, :], lstm_h[:1, :]] 69 | return lstm_outputs, state_init, state_in, state_out 70 | 71 | class SingleStepLSTM(object): 72 | 73 | def __init__(self,x,size,step_size): 74 | lstm = rnn.BasicLSTMCell(size, state_is_tuple=True) 75 | 76 | c_init = np.zeros((1, lstm.state_size.c), np.float32) 77 | h_init = np.zeros((1, lstm.state_size.h), np.float32) 78 | self.state_init = [c_init, h_init] 79 | 80 | c_in = tf.placeholder(tf.float32, 81 | shape=[1, lstm.state_size.c], 82 | name='c_in') 83 | h_in = tf.placeholder(tf.float32, 84 | shape=[1, lstm.state_size.h], 85 | name='h_in') 86 | self.state_in = [c_in, h_in] 87 | 88 | state_in = rnn.LSTMStateTuple(c_in, h_in) 89 | 90 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn( 91 | lstm, x, initial_state=state_in, sequence_length=step_size, 92 | time_major=False) 93 | lstm_outputs = tf.reshape(lstm_outputs, [-1, size]) 94 | 95 | lstm_c, lstm_h = lstm_state 96 | self.state_out = [lstm_c[:1, :], lstm_h[:1, :]] 97 | self.output = lstm_outputs 98 | -------------------------------------------------------------------------------- /feudal_networks/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/policies/__init__.py -------------------------------------------------------------------------------- /feudal_networks/policies/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/policies/configs/__init__.py -------------------------------------------------------------------------------- /feudal_networks/policies/configs/feudal_config.py: -------------------------------------------------------------------------------- 1 | class config(): 2 | alpha = .5 3 | vf_hidden_size = 128 4 | k = 16 #Dimensionality of w 5 | g_dim = 256 6 | c = 10 7 | beta_start = .01 8 | beta_end = .001 9 | decay_steps = 50000000 10 | -------------------------------------------------------------------------------- /feudal_networks/policies/configs/lstm_config.py: -------------------------------------------------------------------------------- 1 | class config(): 2 | size = 256 3 | n_percept_hidden_layer = 4 4 | n_percept_filters = 32 5 
| beta_start = .01 6 | beta_end = .001 7 | decay_steps = 50000000 8 | summary_steps = 10 9 | -------------------------------------------------------------------------------- /feudal_networks/policies/feudal_batch_processor.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from collections import namedtuple 4 | 5 | def cosine_similarity(u, v): 6 | return np.dot(np.squeeze(u),np.squeeze(v)) / (np.linalg.norm(u) * np.linalg.norm(v)) 7 | 8 | Batch = namedtuple("Batch", ["obs", "a", "returns", "s_diff", "ri", "gsum", "features"]) 9 | 10 | class FeudalBatch(object): 11 | def __init__(self): 12 | self.obs = [] 13 | self.a = [] 14 | self.returns = [] 15 | self.s_diff = [] 16 | self.ri = [] 17 | self.gsum = [] 18 | self.features = None 19 | 20 | def add(self, obs, a, returns, s_diff, ri, gsum, features): 21 | self.obs += [obs] 22 | self.a += [a] 23 | self.returns += [returns] 24 | self.s_diff += [s_diff] 25 | self.ri += [ri] 26 | self.gsum += [gsum] 27 | if not self.features: 28 | self.features = features 29 | 30 | def get_batch(self): 31 | batch_obs = np.asarray(self.obs) 32 | batch_a = np.asarray(self.a) 33 | batch_r = np.asarray(self.returns) 34 | batch_sd = np.squeeze(np.asarray(self.s_diff)) 35 | batch_ri = np.asarray(self.ri) 36 | batch_gs = np.asarray(self.gsum) 37 | return Batch(batch_obs,batch_a,batch_r,batch_sd,batch_ri,batch_gs,self.features) 38 | 39 | 40 | 41 | class FeudalBatchProcessor(object): 42 | """ 43 | This class adapts the batch of PolicyOptimizer to a batch useable by 44 | the FeudalPolicy. 45 | """ 46 | def __init__(self, c): 47 | self.c = c 48 | self.last_terminal = True 49 | 50 | def _extend(self, batch): 51 | if self.last_terminal: 52 | self.last_terminal = False 53 | self.s = [batch.s[0] for _ in range(self.c)] 54 | self.g = [batch.g[0] for _ in range(self.c)] 55 | # prepend with dummy values so indexing is the same 56 | self.obs = [None for _ in range(self.c)] 57 | self.a = [None for _ in range(self.c)] 58 | self.returns = [None for _ in range(self.c)] 59 | self.features = [None for _ in range(self.c)] 60 | 61 | # extend with the actual values 62 | self.obs.extend(batch.obs) 63 | self.a.extend(batch.a) 64 | self.returns.extend(batch.returns) 65 | self.s.extend(batch.s) 66 | self.g.extend(batch.g) 67 | self.features.extend(batch.features) 68 | 69 | # if this is a terminal batch, then append the final s and g c times 70 | # note that both this and the above case can occur at the same time 71 | if batch.terminal: 72 | self.s.extend([batch.s[-1] for _ in range(self.c)]) 73 | self.g.extend([batch.g[-1] for _ in range(self.c)]) 74 | 75 | def process_batch(self, batch): 76 | """ 77 | Converts a normal batch into one used by the FeudalPolicy update. 78 | 79 | FeudalPolicy requires a batch of the form: 80 | 81 | c previous timesteps - batch size timesteps - c future timesteps 82 | 83 | This class handles the tracking the leading and following timesteps over 84 | time. Additionally, it also computes values across timesteps from the 85 | batch to provide to FeudalPolicy. 
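        Concretely, for each emitted index t the loop below computes (a plain-text
        sketch of the code that follows; zero-norm state differences are skipped in
        the intrinsic reward):

            s_diff = s[t + c] - s[t]
            ri     = (1 / c) * sum over i in 1..c of cos(s[t] - s[t - i], g[t - i])
            gsum   = g[t - c] + g[t - c + 1] + ... + g[t]

        where cos(u, v) is the cosine_similarity helper defined at the top of this
        file.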
86 | """ 87 | # extend with current batch 88 | self._extend(batch) 89 | 90 | # unpack and compute bounds 91 | length = len(self.obs) 92 | c = self.c 93 | 94 | # normally we cannot compute samples for the last c elements, but 95 | # in the terminal case, we halluciante values where necessary 96 | end = length if batch.terminal else length - c 97 | 98 | # collect samples to return in a FeudalBatch 99 | feudal_batch = FeudalBatch() 100 | for t in range(c, end): 101 | 102 | # state difference 103 | s_diff = self.s[t + c] - self.s[t] 104 | 105 | # intrinsic reward 106 | ri = 0 107 | # note that this for loop considers s and g values 108 | # 1 timestep to c timesteps (inclusively) ago 109 | for i in range(1, c + 1): 110 | ri_s_diff = self.s[t] - self.s[t - i] 111 | if np.linalg.norm(ri_s_diff) != 0: 112 | ri += cosine_similarity(ri_s_diff, self.g[t - i]) 113 | ri /= c 114 | 115 | # sum of g values used to derive w, input to the linear transform 116 | gsum = np.zeros_like(self.g[t - c]) 117 | for i in range(t - c, t + 1): 118 | gsum += self.g[i] 119 | 120 | # add to the batch 121 | feudal_batch.add(self.obs[t], self.a[t], self.returns[t], s_diff, 122 | ri, gsum, self.features[t]) 123 | 124 | # in the terminal case, set reset flag 125 | if batch.terminal: 126 | self.last_terminal = True 127 | # in the general case, forget all but the last 2 * c elements 128 | # reason being that the first c of those we have already computed 129 | # a batch for, and the second c need those first c 130 | else: 131 | twoc = 2 * self.c 132 | self.obs = self.obs[-twoc:] 133 | self.a = self.a[-twoc:] 134 | self.returns = self.returns[-twoc:] 135 | self.s = self.s[-twoc:] 136 | self.g = self.g[-twoc:] 137 | self.features = self.features[-twoc:] 138 | 139 | return feudal_batch.get_batch() 140 | -------------------------------------------------------------------------------- /feudal_networks/policies/feudal_policy.py: -------------------------------------------------------------------------------- 1 | 2 | import distutils.version 3 | import numpy as np 4 | import tensorflow as tf 5 | import tensorflow.contrib.rnn as rnn 6 | 7 | import feudal_networks.policies.policy as policy 8 | import feudal_networks.policies.policy_utils as policy_utils 9 | from feudal_networks.models.models import SingleStepLSTM 10 | from feudal_networks.policies.configs.feudal_config import config 11 | from feudal_networks.policies.feudal_batch_processor import FeudalBatchProcessor 12 | 13 | class FeudalPolicy(policy.Policy): 14 | """ 15 | Policy of the Feudal network architecture. 16 | """ 17 | 18 | def __init__(self, obs_space, act_space,global_step): 19 | self.global_step = global_step 20 | self.obs_space = obs_space 21 | self.act_space = act_space 22 | self.config = config 23 | self.k = config.k #Dimensionality of w 24 | self.g_dim = config.g_dim 25 | self.c = config.c 26 | self.batch_processor = FeudalBatchProcessor(self.c) 27 | self._build_model() 28 | 29 | def _build_model(self): 30 | """ 31 | Builds the manager and worker models. 
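        The shared perception stack (two conv layers plus a dense layer producing z)
        feeds both the manager, whose LSTM emits the internal state s and the
        normalized goal g, and the worker, whose LSTM output U is combined with the
        goal-derived vector w to form the policy logits. state_in / state_out list
        the worker LSTM state followed by the manager LSTM state, matching
        get_initial_features() and act() below.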
32 | """ 33 | with tf.variable_scope('FeUdal'): 34 | self._build_placeholders() 35 | self._build_perception() 36 | self._build_manager() 37 | self._build_worker() 38 | self._build_loss() 39 | self.var_list = tf.get_collection( 40 | tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 41 | # for v in self.var_list: 42 | # print v.name 43 | 44 | self.state_in = [self.worker_lstm.state_in[0],\ 45 | self.worker_lstm.state_in[1],\ 46 | self.manager_lstm.state_in[0],\ 47 | self.manager_lstm.state_in[1]\ 48 | ] 49 | self.state_out = [self.worker_lstm.state_out[0],\ 50 | self.worker_lstm.state_out[1],\ 51 | self.manager_lstm.state_out[0],\ 52 | self.manager_lstm.state_out[1]\ 53 | ] 54 | # for v in self.var_list: 55 | # print v 56 | 57 | def _build_placeholders(self): 58 | #standard for all policies 59 | self.obs = tf.placeholder(tf.float32, [None] + list(self.obs_space)) 60 | self.r = tf.placeholder(tf.float32,(None,)) 61 | self.ac = tf.placeholder(tf.float32,(None,self.act_space)) 62 | self.adv = tf.placeholder(tf.float32, [None]) #unused 63 | 64 | #specific to FeUdal 65 | self.prev_g = tf.placeholder(tf.float32, (None,None,self.g_dim)) 66 | self.ri = tf.placeholder(tf.float32,(None,)) 67 | self.s_diff = tf.placeholder(tf.float32,(None,self.g_dim)) 68 | 69 | 70 | def _build_perception(self): 71 | conv1 = tf.layers.conv2d(inputs=self.obs, 72 | filters=16, 73 | kernel_size=[8, 8], 74 | activation=tf.nn.elu, 75 | strides=4) 76 | conv2 = tf.layers.conv2d(inputs=conv1, 77 | filters=32, 78 | kernel_size=[4,4], 79 | activation=tf.nn.elu, 80 | strides=2) 81 | 82 | flattened_filters = policy_utils.flatten(conv2) 83 | self.z = tf.layers.dense(inputs=flattened_filters,\ 84 | units=256,\ 85 | activation=tf.nn.elu) 86 | 87 | def _build_manager(self): 88 | with tf.variable_scope('manager'): 89 | # Calculate manager internal state 90 | self.s = tf.layers.dense(inputs=self.z,\ 91 | units=self.g_dim,\ 92 | activation=tf.nn.elu) 93 | 94 | # Calculate manager output g 95 | x = tf.expand_dims(self.s, [0]) 96 | self.manager_lstm = SingleStepLSTM(x,\ 97 | self.g_dim,\ 98 | step_size=tf.shape(self.obs)[:1]) 99 | g_hat = self.manager_lstm.output 100 | self.g = tf.nn.l2_normalize(g_hat, dim=1) 101 | 102 | self.manager_vf = self._build_value(g_hat) 103 | # self.manager_vf = tf.Print(self.manager_vf,[self.manager_vf]) 104 | 105 | def _build_worker(self): 106 | with tf.variable_scope('worker'): 107 | num_acts = self.act_space 108 | 109 | # Calculate U 110 | self.worker_lstm = SingleStepLSTM(tf.expand_dims(self.z, [0]),\ 111 | size=num_acts * self.k, 112 | step_size=tf.shape(self.obs)[:1]) 113 | flat_logits = self.worker_lstm.output 114 | 115 | self.worker_vf = self._build_value(flat_logits) 116 | 117 | U = tf.reshape(flat_logits,[-1,num_acts,self.k]) 118 | 119 | # Calculate w 120 | cut_g = tf.stop_gradient(self.g) 121 | cut_g = tf.expand_dims(cut_g, [1]) 122 | gstack = tf.concat([self.prev_g,cut_g], axis=1) 123 | 124 | self.last_c_g = gstack[:,1:] 125 | # print self.last_c_g 126 | gsum = tf.reduce_sum(gstack, axis=1) 127 | phi = tf.get_variable("phi", (self.g_dim, self.k)) 128 | w = tf.matmul(gsum,phi) 129 | w = tf.expand_dims(w,[2]) 130 | # Calculate policy and sample 131 | logits = tf.reshape(tf.matmul(U,w),[-1,num_acts]) 132 | self.pi = tf.nn.softmax(logits) 133 | self.log_pi = tf.nn.log_softmax(logits) 134 | self.sample = policy_utils.categorical_sample( 135 | tf.reshape(logits,[-1,num_acts]), num_acts)[0, :] 136 | 137 | def _build_value(self,input): 138 | with tf.variable_scope('VF'): 139 | hidden = 
tf.layers.dense(inputs=input,\ 140 | units=self.config.vf_hidden_size,\ 141 | activation=tf.nn.elu) 142 | 143 | w = tf.get_variable("weights", (self.config.vf_hidden_size, 1)) 144 | return tf.matmul(hidden,w) 145 | 146 | def _build_loss(self): 147 | cutoff_vf_manager = tf.reshape(tf.stop_gradient(self.manager_vf),[-1]) 148 | dot = tf.reduce_sum(tf.multiply(self.s_diff,self.g ),axis=1) 149 | gcut = tf.stop_gradient(self.g) 150 | mag = tf.norm(self.s_diff,axis=1)*tf.norm(gcut,axis=1)+.0001 151 | dcos = dot/mag 152 | manager_loss = -tf.reduce_sum((self.r-cutoff_vf_manager)*dcos) 153 | 154 | cutoff_vf_worker = tf.reshape(tf.stop_gradient(self.worker_vf),[-1]) 155 | log_p = tf.reduce_sum(self.log_pi*self.ac,[1]) 156 | worker_loss = (self.r + self.config.alpha*self.ri - cutoff_vf_worker)*log_p 157 | worker_loss = -tf.reduce_sum(worker_loss,axis=0) 158 | 159 | Am = self.r-self.manager_vf 160 | manager_vf_loss = .5*tf.reduce_sum(tf.square(Am)) 161 | 162 | Aw = (self.r + self.config.alpha*self.ri)-self.worker_vf 163 | worker_vf_loss = .5*tf.reduce_sum(tf.square(Aw)) 164 | 165 | entropy = -tf.reduce_sum(self.pi * self.log_pi) 166 | 167 | beta = tf.train.polynomial_decay(config.beta_start, self.global_step, 168 | end_learning_rate=config.beta_end, 169 | decay_steps=config.decay_steps, 170 | power=1) 171 | 172 | # worker_loss = tf.Print(worker_loss,[manager_loss,worker_loss,manager_vf_loss,worker_vf_loss,entropy]) 173 | self.loss = worker_loss+manager_loss+\ 174 | worker_vf_loss + manager_vf_loss-\ 175 | entropy*beta 176 | 177 | bs = tf.to_float(tf.shape(self.obs)[0]) 178 | tf.summary.scalar("model/manager_loss", manager_loss / bs) 179 | tf.summary.scalar("model/worker_loss", worker_loss / bs) 180 | tf.summary.scalar("model/value_mean", tf.reduce_mean(self.manager_vf)) 181 | tf.summary.scalar("model/value_loss", manager_vf_loss / bs) 182 | tf.summary.scalar("model/value_loss_scaled", manager_vf_loss / bs * .5) 183 | tf.summary.scalar("model/entropy", entropy / bs) 184 | tf.summary.scalar("model/entropy_loss_scaleed", -entropy / bs * beta) 185 | # tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) 186 | tf.summary.scalar("model/var_global_norm", tf.global_norm(tf.get_collection(\ 187 | tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name))) 188 | tf.summary.scalar("model/beta", beta) 189 | tf.summary.image("model/state", self.obs) 190 | self.summary_op = tf.summary.merge_all() 191 | 192 | 193 | def get_initial_features(self): 194 | return np.zeros((1,1,self.g_dim),np.float32),self.worker_lstm.state_init+self.manager_lstm.state_init 195 | 196 | 197 | def act(self, ob, g,cw,hw,cm,hm): 198 | sess = tf.get_default_session() 199 | return sess.run([self.sample, self.manager_vf, self.g, self.s, self.last_c_g] + self.state_out, 200 | {self.obs: [ob], self.state_in[0]: cw, self.state_in[1]: hw,\ 201 | self.state_in[2]: cm, self.state_in[3]: hm,\ 202 | self.prev_g: g}) 203 | 204 | def value(self, ob, g, cw, hw, cm, hm): 205 | sess = tf.get_default_session() 206 | return sess.run(self.manager_vf, 207 | {self.obs: [ob], self.state_in[0]: cw, self.state_in[1]: hw,\ 208 | self.state_in[2]: cm, self.state_in[3]: hm,\ 209 | self.prev_g: g})[0] 210 | 211 | def update_batch(self,batch): 212 | return self.batch_processor.process_batch(batch) 213 | -------------------------------------------------------------------------------- /feudal_networks/policies/lstm_policy.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | 
import tensorflow as tf 5 | 6 | from feudal_networks.models.models import (linear, conv2d, build_lstm, 7 | normalized_columns_initializer) 8 | import feudal_networks.policies.policy_utils as policy_utils 9 | 10 | from feudal_networks.policies.configs.lstm_config import config 11 | 12 | class LSTMPolicy(object): 13 | def __init__(self, obs_space, act_space,global_step): 14 | self.global_step = global_step 15 | self.obs_space = obs_space 16 | self.act_space = act_space 17 | self.config = config 18 | self.local_steps = 0 19 | # build placeholders 20 | self.obs = x = tf.placeholder(tf.float32, 21 | [None] + list(obs_space), 22 | name='state') 23 | self.adv = tf.placeholder(tf.float32, 24 | [None], 25 | name="adv") 26 | self.ac = tf.placeholder(tf.float32, 27 | [None, act_space], 28 | name="ac") 29 | self.r = tf.placeholder(tf.float32, 30 | [None], 31 | name="r") 32 | 33 | print(self.r) 34 | # build perception 35 | for i in range(config.n_percept_hidden_layer): 36 | x = tf.nn.elu(conv2d(x, config.n_percept_filters, 37 | "l{}".format(i + 1), [3, 3], [2, 2])) 38 | 39 | # introduce a "fake" batch dimension of 1 after flatten so that we 40 | # can do LSTM over time dim 41 | x = tf.expand_dims(policy_utils.flatten(x), [0]) 42 | x, self.state_init, self.state_in, self.state_out = build_lstm( 43 | x, config.size, 'lstm', tf.shape(self.obs)[:1]) 44 | 45 | # on the lstm to output values for both the policy and value function 46 | # add hidden layer to value output so that less of a burden is placed 47 | vfhid = tf.nn.elu(linear(x, config.size, "value_hidden", 48 | normalized_columns_initializer(0.01))) 49 | self.vf = tf.reshape(linear(vfhid, 1, "value", 50 | normalized_columns_initializer(1.0)), [-1]) 51 | 52 | # retrieve logits, sampling op 53 | self.logits = linear(x, act_space, "action", 54 | normalized_columns_initializer(0.01)) 55 | self.sample = policy_utils.categorical_sample( 56 | self.logits, act_space)[0, :] 57 | self.var_list = tf.get_collection( 58 | tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 59 | 60 | # build loss 61 | log_prob_tf = tf.nn.log_softmax(self.logits) 62 | prob_tf = tf.nn.softmax(self.logits) 63 | pi_loss = - tf.reduce_sum( 64 | tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) 65 | entropy = - tf.reduce_sum(prob_tf * log_prob_tf) 66 | vf_loss = 0.5 * tf.reduce_sum(tf.square(self.vf - self.r)) 67 | beta = tf.train.polynomial_decay(config.beta_start, self.global_step, 68 | end_learning_rate=config.beta_end, 69 | decay_steps=config.decay_steps, 70 | power=1) 71 | self.loss = pi_loss + 0.5 * vf_loss - entropy * beta 72 | 73 | # summaries 74 | bs = tf.to_float(tf.shape(self.obs)[0]) 75 | tf.summary.scalar("model/policy_loss", pi_loss / bs) 76 | tf.summary.scalar("model/value_mean", tf.reduce_mean(self.vf)) 77 | tf.summary.scalar("model/value_loss", vf_loss / bs) 78 | tf.summary.scalar("model/value_loss_scaled", vf_loss / bs * .5) 79 | tf.summary.scalar("model/entropy", entropy / bs) 80 | tf.summary.scalar("model/entropy_loss_scaleed", -entropy / bs * beta) 81 | # tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) 82 | tf.summary.scalar("model/var_global_norm", tf.global_norm(self.var_list)) 83 | tf.summary.scalar("model/beta", beta) 84 | tf.summary.image("model/state", self.obs) 85 | self.summary_op = tf.summary.merge_all() 86 | 87 | def get_initial_features(self): 88 | return self.state_init 89 | 90 | def act(self, ob, c, h): 91 | sess = tf.get_default_session() 92 | return sess.run([self.sample, self.vf] + self.state_out, 93 | {self.obs: 
[ob], self.state_in[0]: c, self.state_in[1]: h}) 94 | 95 | def value(self, ob, c, h): 96 | sess = tf.get_default_session() 97 | return sess.run(self.vf, {self.obs: [ob], self.state_in[0]: c, self.state_in[1]: h})[0] 98 | 99 | def update_batch(self,batch): 100 | return batch 101 | -------------------------------------------------------------------------------- /feudal_networks/policies/policy.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Policy(object): 4 | """ 5 | An abstract class defining a learned policy to be used for a Reinforcment 6 | Learning problem. This class interfaces with a policy optimizer class 7 | that oversees training the policy on some environment. 8 | 9 | The policy needs three externally facing methods: 10 | act() 11 | value() 12 | update() 13 | Which are further documented below. 14 | 15 | Further, upon initialization the following member variables should be 16 | defined: 17 | loss - The tensorflow operation defining the loss function of the 18 | policy with respect to a batch of training data 19 | var_list - The variables that should be trained by the optimizer 20 | internals_in- A list of placeholder variables needed at runtime 21 | in order to calculate act(), value() or update() 22 | (e.g. internal LSTM state) 23 | """ 24 | def __init__(self,obs_space,act_space,config): 25 | raise NotImplementedError("Please Implement this method") 26 | 27 | def _build_model(self): 28 | raise NotImplementedError("Please Implement this method") 29 | 30 | def _build_placeholders(self): 31 | raise NotImplementedError("Please Implement this method") 32 | 33 | def _build_loss(self): 34 | """ 35 | Should initialize self.loss to be a tensorflow operation that calculates 36 | the loss funtion for the current policy 37 | """ 38 | raise NotImplementedError("Please Implement this method") 39 | 40 | def act(self, obs, prev_internal): 41 | raise NotImplementedError("Please Implement this method") 42 | 43 | def value(self, obs,prev_internal): 44 | raise NotImplementedError("Please Implement this method") 45 | 46 | def update(self, sess, train_op, batch): 47 | raise NotImplementedError("Please Implement this method") 48 | -------------------------------------------------------------------------------- /feudal_networks/policies/policy_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | def flatten(x): 6 | return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])]) 7 | 8 | def categorical_sample(logits, d): 9 | value = tf.squeeze(tf.multinomial(logits - tf.reduce_max( 10 | logits, [1], keep_dims=True), 1), [1]) 11 | return tf.one_hot(value, d) 12 | -------------------------------------------------------------------------------- /scripts/training/README.md: -------------------------------------------------------------------------------- 1 | scripts for executing training -------------------------------------------------------------------------------- /scripts/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/scripts/training/__init__.py -------------------------------------------------------------------------------- /scripts/training/envs.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from gym.spaces.box import Box 3 | import 
numpy as np 4 | import gym 5 | from gym import spaces 6 | import logging 7 | import universe 8 | from universe import vectorized 9 | from universe.wrappers import BlockingReset, GymCoreAction, EpisodeID, Unvectorize, Vectorize, Vision, Logger 10 | from universe import spaces as vnc_spaces 11 | from universe.spaces.vnc_event import keycode 12 | import time 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | universe.configure_logging() 16 | 17 | # for environments 18 | import feudal_networks.envs 19 | 20 | def create_env(env_id, client_id, remotes, **kwargs): 21 | spec = gym.spec(env_id) 22 | 23 | if spec.tags.get('feudal', False): 24 | return create_feudal_env(env_id, client_id, remotes, **kwargs) 25 | elif spec.tags.get('flashgames', False): 26 | return create_flash_env(env_id, client_id, remotes, **kwargs) 27 | elif spec.tags.get('atari', False) and spec.tags.get('vnc', False): 28 | return create_vncatari_env(env_id, client_id, remotes, **kwargs) 29 | else: 30 | # Assume atari. 31 | assert "." not in env_id # universe environments have dots in names. 32 | return create_atari_env(env_id) 33 | 34 | def create_feudal_env(env_id, client_id, remotes, **_): 35 | env = gym.make(env_id) 36 | return env 37 | 38 | def create_flash_env(env_id, client_id, remotes, **_): 39 | env = gym.make(env_id) 40 | env = Vision(env) 41 | env = Logger(env) 42 | env = BlockingReset(env) 43 | 44 | reg = universe.runtime_spec('flashgames').server_registry 45 | height = reg[env_id]["height"] 46 | width = reg[env_id]["width"] 47 | env = CropScreen(env, height, width, 84, 18) 48 | env = FlashRescale(env) 49 | 50 | keys = ['left', 'right', 'up', 'down', 'x'] 51 | if env_id == 'flashgames.NeonRace-v0': 52 | # Better key space for this game. 53 | keys = ['left', 'right', 'up', 'left up', 'right up', 'down', 'up x'] 54 | logger.info('create_flash_env(%s): keys=%s', env_id, keys) 55 | 56 | env = DiscreteToFixedKeysVNCActions(env, keys) 57 | env = EpisodeID(env) 58 | env = DiagnosticsInfo(env) 59 | env = Unvectorize(env) 60 | env.configure(fps=5.0, remotes=remotes, start_timeout=15 * 60, client_id=client_id, 61 | vnc_driver='go', vnc_kwargs={ 62 | 'encoding': 'tight', 'compress_level': 0, 63 | 'fine_quality_level': 50, 'subsample_level': 3}) 64 | return env 65 | 66 | def create_vncatari_env(env_id, client_id, remotes, **_): 67 | env = gym.make(env_id) 68 | env = Vision(env) 69 | env = Logger(env) 70 | env = BlockingReset(env) 71 | env = GymCoreAction(env) 72 | env = AtariRescale42x42(env) 73 | env = EpisodeID(env) 74 | env = DiagnosticsInfo(env) 75 | env = Unvectorize(env) 76 | 77 | logger.info('Connecting to remotes: %s', remotes) 78 | fps = env.metadata['video.frames_per_second'] 79 | env.configure(remotes=remotes, start_timeout=15 * 60, fps=fps, client_id=client_id) 80 | return env 81 | 82 | def create_atari_env(env_id): 83 | env = gym.make(env_id) 84 | env = Vectorize(env) 85 | env = AtariRescale42x42(env) 86 | env = DiagnosticsInfo(env) 87 | env = Unvectorize(env) 88 | return env 89 | 90 | def DiagnosticsInfo(env, *args, **kwargs): 91 | return vectorized.VectorizeFilter(env, DiagnosticsInfoI, *args, **kwargs) 92 | 93 | class DiagnosticsInfoI(vectorized.Filter): 94 | def __init__(self, log_interval=503): 95 | super(DiagnosticsInfoI, self).__init__() 96 | 97 | self._episode_time = time.time() 98 | self._last_time = time.time() 99 | self._local_t = 0 100 | self._log_interval = log_interval 101 | self._episode_reward = 0 102 | self._episode_length = 0 103 | self._all_rewards = [] 104 | 
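        # Counters used by _after_step below: _local_t counts environment steps across
        # episodes and triggers a diagnostics dump every _log_interval steps (wall-clock
        # fps, action/observation lag, clock skew), while _num_vnc_updates accumulates
        # "stats.vnc.updates.n" between dumps so a per-second update rate can be reported.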
self._num_vnc_updates = 0 105 | self._last_episode_id = -1 106 | 107 | def _after_reset(self, observation): 108 | logger.info('Resetting environment') 109 | self._episode_reward = 0 110 | self._episode_length = 0 111 | self._all_rewards = [] 112 | return observation 113 | 114 | def _after_step(self, observation, reward, done, info): 115 | to_log = {} 116 | if self._episode_length == 0: 117 | self._episode_time = time.time() 118 | 119 | self._local_t += 1 120 | if info.get("stats.vnc.updates.n") is not None: 121 | self._num_vnc_updates += info.get("stats.vnc.updates.n") 122 | 123 | if self._local_t % self._log_interval == 0: 124 | cur_time = time.time() 125 | elapsed = cur_time - self._last_time 126 | fps = self._log_interval / elapsed 127 | self._last_time = cur_time 128 | cur_episode_id = info.get('vectorized.episode_id', 0) 129 | to_log["diagnostics/fps"] = fps 130 | if self._last_episode_id == cur_episode_id: 131 | to_log["diagnostics/fps_within_episode"] = fps 132 | self._last_episode_id = cur_episode_id 133 | if info.get("stats.gauges.diagnostics.lag.action") is not None: 134 | to_log["diagnostics/action_lag_lb"] = info["stats.gauges.diagnostics.lag.action"][0] 135 | to_log["diagnostics/action_lag_ub"] = info["stats.gauges.diagnostics.lag.action"][1] 136 | if info.get("reward.count") is not None: 137 | to_log["diagnostics/reward_count"] = info["reward.count"] 138 | if info.get("stats.gauges.diagnostics.clock_skew") is not None: 139 | to_log["diagnostics/clock_skew_lb"] = info["stats.gauges.diagnostics.clock_skew"][0] 140 | to_log["diagnostics/clock_skew_ub"] = info["stats.gauges.diagnostics.clock_skew"][1] 141 | if info.get("stats.gauges.diagnostics.lag.observation") is not None: 142 | to_log["diagnostics/observation_lag_lb"] = info["stats.gauges.diagnostics.lag.observation"][0] 143 | to_log["diagnostics/observation_lag_ub"] = info["stats.gauges.diagnostics.lag.observation"][1] 144 | 145 | if info.get("stats.vnc.updates.n") is not None: 146 | to_log["diagnostics/vnc_updates_n"] = info["stats.vnc.updates.n"] 147 | to_log["diagnostics/vnc_updates_n_ps"] = self._num_vnc_updates / elapsed 148 | self._num_vnc_updates = 0 149 | if info.get("stats.vnc.updates.bytes") is not None: 150 | to_log["diagnostics/vnc_updates_bytes"] = info["stats.vnc.updates.bytes"] 151 | if info.get("stats.vnc.updates.pixels") is not None: 152 | to_log["diagnostics/vnc_updates_pixels"] = info["stats.vnc.updates.pixels"] 153 | if info.get("stats.vnc.updates.rectangles") is not None: 154 | to_log["diagnostics/vnc_updates_rectangles"] = info["stats.vnc.updates.rectangles"] 155 | if info.get("env_status.state_id") is not None: 156 | to_log["diagnostics/env_state_id"] = info["env_status.state_id"] 157 | 158 | if reward is not None: 159 | self._episode_reward += reward 160 | if observation is not None: 161 | self._episode_length += 1 162 | self._all_rewards.append(reward) 163 | 164 | if done: 165 | logger.info('Episode terminating: episode_reward=%s episode_length=%s', self._episode_reward, self._episode_length) 166 | total_time = time.time() - self._episode_time 167 | to_log["global/episode_reward"] = self._episode_reward 168 | to_log["global/episode_length"] = self._episode_length 169 | to_log["global/episode_time"] = total_time 170 | to_log["global/reward_per_time"] = self._episode_reward / total_time 171 | self._episode_reward = 0 172 | self._episode_length = 0 173 | self._all_rewards = [] 174 | 175 | return observation, reward, done, to_log 176 | 177 | def _process_frame42(frame): 178 | frame = frame[34:34+160, 
:160] 179 | # Resize by half, then down to 42x42 (essentially mipmapping). If 180 | # we resize directly we lose pixels that, when mapped to 42x42, 181 | # aren't close enough to the pixel boundary. 182 | frame = cv2.resize(frame, (80, 80)) 183 | frame = cv2.resize(frame, (42, 42)) 184 | frame = frame.mean(2) 185 | frame = frame.astype(np.float32) 186 | frame *= (1.0 / 255.0) 187 | frame = np.reshape(frame, [42, 42, 1]) 188 | return frame 189 | 190 | class AtariRescale42x42(vectorized.ObservationWrapper): 191 | def __init__(self, env=None): 192 | super(AtariRescale42x42, self).__init__(env) 193 | self.observation_space = Box(0.0, 1.0, [42, 42, 1]) 194 | 195 | def _observation(self, observation_n): 196 | return [_process_frame42(observation) for observation in observation_n] 197 | 198 | class FixedKeyState(object): 199 | def __init__(self, keys): 200 | self._keys = [keycode(key) for key in keys] 201 | self._down_keysyms = set() 202 | 203 | def apply_vnc_actions(self, vnc_actions): 204 | for event in vnc_actions: 205 | if isinstance(event, vnc_spaces.KeyEvent): 206 | if event.down: 207 | self._down_keysyms.add(event.key) 208 | else: 209 | self._down_keysyms.discard(event.key) 210 | 211 | def to_index(self): 212 | action_n = 0 213 | for key in self._down_keysyms: 214 | if key in self._keys: 215 | # If multiple keys are pressed, just use the first one 216 | action_n = self._keys.index(key) + 1 217 | break 218 | return action_n 219 | 220 | class DiscreteToFixedKeysVNCActions(vectorized.ActionWrapper): 221 | """ 222 | Define a fixed action space. Action 0 is all keys up. Each element of keys can be a single key or a space-separated list of keys 223 | 224 | For example, 225 | e=DiscreteToFixedKeysVNCActions(e, ['left', 'right']) 226 | will have 3 actions: [none, left, right] 227 | 228 | You can define a state with more than one key down by separating with spaces. For example, 229 | e=DiscreteToFixedKeysVNCActions(e, ['left', 'right', 'space', 'left space', 'right space']) 230 | will have 6 actions: [none, left, right, space, left space, right space] 231 | """ 232 | def __init__(self, env, keys): 233 | super(DiscreteToFixedKeysVNCActions, self).__init__(env) 234 | 235 | self._keys = keys 236 | self._generate_actions() 237 | self.action_space = spaces.Discrete(len(self._actions)) 238 | 239 | def _generate_actions(self): 240 | self._actions = [] 241 | uniq_keys = set() 242 | for key in self._keys: 243 | for cur_key in key.split(' '): 244 | uniq_keys.add(cur_key) 245 | 246 | for key in [''] + self._keys: 247 | split_keys = key.split(' ') 248 | cur_action = [] 249 | for cur_key in uniq_keys: 250 | cur_action.append(vnc_spaces.KeyEvent.by_name(cur_key, down=(cur_key in split_keys))) 251 | self._actions.append(cur_action) 252 | self.key_state = FixedKeyState(uniq_keys) 253 | 254 | def _action(self, action_n): 255 | # Each action might be a length-1 np.array. Cast to int to 256 | # avoid warnings. 
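        # For example, with DiscreteToFixedKeysVNCActions(env, ['left', 'right']) the
        # wrapper exposes Discrete(3): index 0 maps to every key up, and index 2 maps to a
        # KeyEvent list in which only 'right' is down; each discrete index expands to one
        # KeyEvent per unique key named in self._keys (see _generate_actions above).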
257 | return [self._actions[int(action)] for action in action_n] 258 | 259 | class CropScreen(vectorized.ObservationWrapper): 260 | """Crops out a [height]x[width] area starting from (top,left) """ 261 | def __init__(self, env, height, width, top=0, left=0): 262 | super(CropScreen, self).__init__(env) 263 | self.height = height 264 | self.width = width 265 | self.top = top 266 | self.left = left 267 | self.observation_space = Box(0, 255, shape=(height, width, 3)) 268 | 269 | def _observation(self, observation_n): 270 | return [ob[self.top:self.top+self.height, self.left:self.left+self.width, :] if ob is not None else None 271 | for ob in observation_n] 272 | 273 | def _process_frame_flash(frame): 274 | frame = cv2.resize(frame, (200, 128)) 275 | frame = frame.mean(2).astype(np.float32) 276 | frame *= (1.0 / 255.0) 277 | frame = np.reshape(frame, [128, 200, 1]) 278 | return frame 279 | 280 | class FlashRescale(vectorized.ObservationWrapper): 281 | def __init__(self, env=None): 282 | super(FlashRescale, self).__init__(env) 283 | self.observation_space = Box(0.0, 1.0, [128, 200, 1]) 284 | 285 | def _observation(self, observation_n): 286 | return [_process_frame_flash(observation) for observation in observation_n] 287 | -------------------------------------------------------------------------------- /scripts/training/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from six.moves import shlex_quote 5 | 6 | parser = argparse.ArgumentParser(description="Run commands") 7 | parser.add_argument('-w', '--num-workers', default=1, type=int, 8 | help="Number of workers") 9 | parser.add_argument('-r', '--remotes', default=None, 10 | help='The address of pre-existing VNC servers and ' 11 | 'rewarders to use (e.g. -r vnc://localhost:5900+15900,vnc://localhost:5901+15901).') 12 | parser.add_argument('-e', '--env-id', type=str, default="PongDeterministic-v4", 13 | help="Environment id") 14 | parser.add_argument('-l', '--log-dir', type=str, default="/tmp/pong", 15 | help="Log directory path") 16 | parser.add_argument('-n', '--dry-run', action='store_true', 17 | help="Print out commands rather than executing them") 18 | parser.add_argument('-m', '--mode', type=str, default='tmux', 19 | help="tmux: run workers in a tmux session. nohup: run workers with nohup. child: run workers as child processes") 20 | parser.add_argument('-p', '--policy', type=str, default='lstm', 21 | help="lstm or feudal policy") 22 | 23 | # Add visualise tag 24 | parser.add_argument('--visualise', action='store_true', 25 | help="Visualise the gym environment by running env.render() between each timestep") 26 | 27 | 28 | def new_cmd(session, name, cmd, mode, logdir, shell): 29 | if isinstance(cmd, (list, tuple)): 30 | cmd = " ".join(shlex_quote(str(v)) for v in cmd) 31 | if mode == 'tmux': 32 | return name, "tmux send-keys -t {}:{} {} Enter".format(session, name, shlex_quote(cmd)) 33 | elif mode == 'child': 34 | return name, "{} >{}/{}.{}.out 2>&1 & echo kill $! >>{}/kill.sh".format(cmd, logdir, session, name, logdir) 35 | elif mode == 'nohup': 36 | return name, "nohup {} -c {} >{}/{}.{}.out 2>&1 & echo kill $! 
>>{}/kill.sh".format(shell, shlex_quote(cmd), logdir, session, name, logdir) 37 | 38 | 39 | def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash', 40 | policy='lstm', mode='tmux', visualise=False): 41 | # for launching the TF workers and for launching tensorboard 42 | base_cmd = [ 43 | 'CUDA_VISIBLE_DEVICES=', 44 | sys.executable, 'worker.py', 45 | '--log-dir', logdir, 46 | '--env-id', env_id, 47 | '--num-workers', str(num_workers)] 48 | 49 | if visualise: 50 | base_cmd += ['--visualise'] 51 | 52 | if remotes is None: 53 | remotes = ["1"] * num_workers 54 | else: 55 | remotes = remotes.split(',') 56 | assert len(remotes) == num_workers 57 | 58 | cmds_map = [new_cmd(session, "ps", base_cmd + ["--job-name", "ps"], mode, logdir, shell)] 59 | for i in range(num_workers): 60 | cmds_map += [new_cmd(session, 61 | "w-%d" % i, base_cmd + ["--job-name", "worker", "--task", str(i), "--remotes", remotes[i], "--policy", policy], mode, logdir, shell)] 62 | 63 | cmds_map += [new_cmd(session, "tb", ["tensorboard", "--logdir", logdir, "--port", "12345"], mode, logdir, shell)] 64 | if mode == 'tmux': 65 | cmds_map += [new_cmd(session, "htop", ["htop"], mode, logdir, shell)] 66 | 67 | windows = [v[0] for v in cmds_map] 68 | 69 | notes = [] 70 | cmds = [ 71 | "mkdir -p {}".format(logdir), 72 | "echo {} {} > {}/cmd.sh".format(sys.executable, ' '.join([shlex_quote(arg) for arg in sys.argv if arg != '-n']), logdir), 73 | ] 74 | if mode == 'nohup' or mode == 'child': 75 | cmds += ["echo '#!/bin/sh' >{}/kill.sh".format(logdir)] 76 | notes += ["Run `source {}/kill.sh` to kill the job".format(logdir)] 77 | if mode == 'tmux': 78 | notes += ["Use `tmux attach -t {}` to watch process output".format(session)] 79 | notes += ["Use `tmux kill-session -t {}` to kill the job".format(session)] 80 | else: 81 | notes += ["Use `tail -f {}/*.out` to watch process output".format(logdir)] 82 | notes += ["Point your browser to http://localhost:12345 to see Tensorboard"] 83 | 84 | if mode == 'tmux': 85 | cmds += [ 86 | "kill $( lsof -i:12345 -t ) > /dev/null 2>&1", # kill any process using tensorboard's port 87 | "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(num_workers+12222), # kill any processes using ps / worker ports 88 | "tmux kill-session -t {}".format(session), 89 | "tmux new-session -s {} -n {} -d {}".format(session, windows[0], shell) 90 | ] 91 | for w in windows[1:]: 92 | cmds += ["tmux new-window -t {} -n {} {}".format(session, w, shell)] 93 | cmds += ["sleep 1"] 94 | for window, cmd in cmds_map: 95 | cmds += [cmd] 96 | 97 | return cmds, notes 98 | 99 | 100 | def run(): 101 | args = parser.parse_args() 102 | cmds, notes = create_commands("a3c", args.num_workers, args.remotes, args.env_id, args.log_dir, policy=args.policy, mode=args.mode, visualise=args.visualise) 103 | if args.dry_run: 104 | print("Dry-run mode due to -n flag, otherwise the following commands would be executed:") 105 | else: 106 | print("Executing the following commands:") 107 | print("\n".join(cmds)) 108 | print("") 109 | if not args.dry_run: 110 | if args.mode == "tmux": 111 | os.environ["TMUX"] = "" 112 | os.system("\n".join(cmds)) 113 | print('\n'.join(notes)) 114 | 115 | 116 | if __name__ == "__main__": 117 | run() -------------------------------------------------------------------------------- /scripts/training/worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import cv2 3 | import go_vncdriver 4 | import tensorflow as tf 5 | import argparse 6 
| import logging 7 | import sys, signal 8 | import time 9 | import os 10 | from envs import create_env 11 | from feudal_networks.algos.policy_optimizer import PolicyOptimizer 12 | from feudal_networks.algos.feudal_policy_optimizer import FeudalPolicyOptimizer 13 | import distutils.version 14 | use_tf12_api = distutils.version.LooseVersion(tf.VERSION) >= distutils.version.LooseVersion('0.12.0') 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | # Disables write_meta_graph argument, which freezes entire process and is mostly useless. 20 | class FastSaver(tf.train.Saver): 21 | def save(self, sess, save_path, global_step=None, latest_filename=None, 22 | meta_graph_suffix="meta", write_meta_graph=True): 23 | super(FastSaver, self).save(sess, save_path, global_step, latest_filename, 24 | meta_graph_suffix, False) 25 | 26 | def run(args, server): 27 | env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes) 28 | if args.policy == 'lstm': 29 | trainer = PolicyOptimizer(env, args.task, args.policy,args.visualise) 30 | elif args.policy == 'feudal': 31 | trainer = FeudalPolicyOptimizer(env, args.task, args.policy,args.visualise) 32 | else: 33 | print('Invalid policy type') 34 | exit(0) 35 | 36 | 37 | # Variable names that start with "local" are not saved in checkpoints. 38 | if use_tf12_api: 39 | variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")] 40 | init_op = tf.variables_initializer(variables_to_save) 41 | init_all_op = tf.global_variables_initializer() 42 | else: 43 | variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")] 44 | init_op = tf.initialize_variables(variables_to_save) 45 | init_all_op = tf.initialize_all_variables() 46 | saver = FastSaver(variables_to_save) 47 | 48 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 49 | logger.info('Trainable vars:') 50 | for v in var_list: 51 | logger.info(' %s %s', v.name, v.get_shape()) 52 | 53 | def init_fn(ses): 54 | logger.info("Initializing all parameters.") 55 | ses.run(init_all_op) 56 | 57 | config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)]) 58 | logdir = os.path.join(args.log_dir, 'train') 59 | 60 | if use_tf12_api: 61 | summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task) 62 | else: 63 | summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task) 64 | 65 | logger.info("Events directory: %s_%s", logdir, args.task) 66 | sv = tf.train.Supervisor(is_chief=(args.task == 0), 67 | logdir=logdir, 68 | saver=saver, 69 | summary_op=None, 70 | init_op=init_op, 71 | init_fn=init_fn, 72 | summary_writer=summary_writer, 73 | ready_op=tf.report_uninitialized_variables(variables_to_save), 74 | global_step=trainer.global_step, 75 | save_model_secs=30, 76 | save_summaries_secs=30) 77 | 78 | num_global_steps = 100000000 79 | 80 | logger.info( 81 | "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. 
" + 82 | "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.") 83 | with sv.managed_session(server.target, config=config) as sess, sess.as_default(): 84 | sess.run(trainer.sync) 85 | trainer.start(sess, summary_writer) 86 | global_step = sess.run(trainer.global_step) 87 | logger.info("Starting training at step=%d", global_step) 88 | while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps): 89 | trainer.train(sess) 90 | global_step = sess.run(trainer.global_step) 91 | 92 | # Ask for all the services to stop. 93 | sv.stop() 94 | logger.info('reached %s steps. worker stopped.', global_step) 95 | 96 | def cluster_spec(num_workers, num_ps): 97 | """ 98 | More tensorflow setup for data parallelism 99 | """ 100 | cluster = {} 101 | port = 12222 102 | 103 | all_ps = [] 104 | host = '127.0.0.1' 105 | for _ in range(num_ps): 106 | all_ps.append('{}:{}'.format(host, port)) 107 | port += 1 108 | cluster['ps'] = all_ps 109 | 110 | all_workers = [] 111 | for _ in range(num_workers): 112 | all_workers.append('{}:{}'.format(host, port)) 113 | port += 1 114 | cluster['worker'] = all_workers 115 | return cluster 116 | 117 | def main(_): 118 | """ 119 | Setting up Tensorflow for data parallel work 120 | """ 121 | 122 | parser = argparse.ArgumentParser(description=None) 123 | parser.add_argument('-v', '--verbose', action='count', dest='verbosity', default=0, help='Set verbosity.') 124 | parser.add_argument('--task', default=0, type=int, help='Task index') 125 | parser.add_argument('--job-name', default="worker", help='worker or ps') 126 | parser.add_argument('--num-workers', default=1, type=int, help='Number of workers') 127 | parser.add_argument('--log-dir', default="/tmp/pong", help='Log directory path') 128 | parser.add_argument('--env-id', default="PongDeterministic-v4", help='Environment id') 129 | parser.add_argument('--policy', type=str, default='lstm', help="lstm or feudal policy") 130 | parser.add_argument('-r', '--remotes', default=None, 131 | help='References to environments to create (e.g. -r 20), ' 132 | 'or the address of pre-existing VNC servers and ' 133 | 'rewarders to use (e.g. 
-r vnc://localhost:5900+15900,vnc://localhost:5901+15901)') 134 | 135 | # Add visualisation argument 136 | parser.add_argument('--visualise', action='store_true', 137 | help="Visualise the gym environment by running env.render() between each timestep") 138 | 139 | args = parser.parse_args() 140 | spec = cluster_spec(args.num_workers, 1) 141 | cluster = tf.train.ClusterSpec(spec).as_cluster_def() 142 | 143 | def shutdown(signal, frame): 144 | logger.warn('Received signal %s: exiting', signal) 145 | sys.exit(128+signal) 146 | signal.signal(signal.SIGHUP, shutdown) 147 | signal.signal(signal.SIGINT, shutdown) 148 | signal.signal(signal.SIGTERM, shutdown) 149 | 150 | if args.job_name == "worker": 151 | server = tf.train.Server(cluster, job_name="worker", task_index=args.task, 152 | config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)) 153 | run(args, server) 154 | else: 155 | server = tf.train.Server(cluster, job_name="ps", task_index=args.task, 156 | config=tf.ConfigProto(device_filters=["/job:ps"])) 157 | while True: 158 | time.sleep(1000) 159 | 160 | if __name__ == "__main__": 161 | tf.app.run() 162 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/tests/__init__.py -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | test_loader = unittest.defaultTestLoader.discover( '.' ) 4 | test_runner = unittest.TextTestRunner(verbosity=2) 5 | test_runner.run(test_loader) -------------------------------------------------------------------------------- /tests/test_algos/test_feudal_policy_optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import unittest 4 | import tensorflow as tf 5 | 6 | from feudal_networks.algos.feudal_policy_optimizer import FeudalPolicyOptimizer 7 | from feudal_networks.policies.feudal_policy import FeudalPolicy 8 | 9 | import feudal_networks.envs.debug_envs 10 | 11 | class TestFeudalPolicyOptimizer(unittest.TestCase): 12 | 13 | def test_init(self): 14 | env = gym.make('OneRoundDeterministicRewardBoxObs-v0') 15 | with tf.Session() as session: 16 | feudal_opt = FeudalPolicyOptimizer(env, 0, 'feudal', False) 17 | 18 | if __name__ == '__main__': 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /tests/test_envs/test_vision_maze.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import unittest 4 | 5 | 6 | 7 | from feudal_networks.envs.vision_maze import VisionMazeEnv 8 | 9 | def to_coords(x): 10 | return list(v[0] for v in np.where(x)[:2]) 11 | 12 | class TestVisionMaze(unittest.TestCase): 13 | 14 | def test_step(self): 15 | maze = VisionMazeEnv(room_length=3, num_rooms_per_side=2) 16 | 17 | # up until wall 18 | a = 0 19 | maze.state = np.array([0, 0]) 20 | nx, _, _, _ = maze.step(a) 21 | nx, _, _, _ = maze.step(a) 22 | np.testing.assert_array_equal(to_coords(nx), [0,2]) 23 | nx, _, _, _ = maze.step(a) 24 | np.testing.assert_array_equal(to_coords(nx), [0,2]) 25 | 26 | # down until wall 27 | a = 2 28 | maze.state = np.array([0, 2]) 29 | nx, _, _, _ = maze.step(a) 30 | nx, _, _, 
_ = maze.step(a) 31 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 32 | nx, _, _, _ = maze.step(a) 33 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 34 | 35 | # right until wall 36 | maze.state = np.array([0, 0]) 37 | a = 1 38 | nx, _, _, _ = maze.step(a) 39 | nx, _, _, _ = maze.step(a) 40 | np.testing.assert_array_equal(to_coords(nx), [2,0]) 41 | nx, _, _, _ = maze.step(a) 42 | np.testing.assert_array_equal(to_coords(nx), [2,0]) 43 | 44 | # left until wall 45 | maze.state = np.array([2, 0]) 46 | a = 3 47 | nx, _, _, _ = maze.step(a) 48 | nx, _, _, _ = maze.step(a) 49 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 50 | nx, _, _, _ = maze.step(a) 51 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 52 | 53 | # through doorway to the right until wall 54 | maze.state = np.array([0, 0]) 55 | nx, _, _, _ = maze.step(0) # up 56 | nx, _, _, _ = maze.step(1) # right 57 | nx, _, _, _ = maze.step(1) # right 58 | nx, _, _, _ = maze.step(1) # right 59 | nx, _, _, _ = maze.step(1) # right 60 | nx, _, _, _ = maze.step(1) # right 61 | np.testing.assert_array_equal(to_coords(nx), [5,1]) 62 | nx, _, _, _ = maze.step(1) # right 63 | np.testing.assert_array_equal(to_coords(nx), [5,1]) 64 | 65 | # back through the doorway I came, and then up through the other doorway 66 | maze.state = np.array([5, 1]) 67 | nx, _, _, _ = maze.step(3) # left 68 | nx, _, _, _ = maze.step(3) # left 69 | nx, _, _, _ = maze.step(3) # left 70 | nx, _, _, _ = maze.step(3) # left 71 | nx, _, _, _ = maze.step(0) # up 72 | nx, _, _, _ = maze.step(0) # up 73 | nx, _, _, _ = maze.step(0) # up 74 | nx, _, _, _ = maze.step(0) # up 75 | np.testing.assert_array_equal(to_coords(nx), [1,5]) 76 | nx, _, _, _ = maze.step(0) # up 77 | np.testing.assert_array_equal(to_coords(nx), [1,5]) 78 | 79 | # to the goal state 80 | maze.state = np.array([1, 5]) 81 | nx, _, _, _ = maze.step(1) # right 82 | nx, _, _, _ = maze.step(2) # down 83 | nx, _, _, _ = maze.step(1) # right 84 | nx, _, _, _ = maze.step(1) # right 85 | nx, _, _, _ = maze.step(1) # right 86 | nx, _, _, _ = maze.step(0) # up 87 | np.testing.assert_array_equal(to_coords(nx), [5,5]) 88 | 89 | # down until wall 90 | maze.state = np.array([5, 5]) 91 | nx, _, _, _ = maze.step(2) # down 92 | nx, _, _, _ = maze.step(2) # down 93 | np.testing.assert_array_equal(to_coords(nx), [5,3]) 94 | nx, _, _, _ = maze.step(2) # down 95 | np.testing.assert_array_equal(to_coords(nx), [5,3]) 96 | 97 | if __name__ == '__main__': 98 | unittest.main() -------------------------------------------------------------------------------- /tests/test_policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/tests/test_policies/__init__.py -------------------------------------------------------------------------------- /tests/test_policies/test_feudal_batch_processor.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import unittest 4 | 5 | from feudal_networks.policies.feudal_batch_processor import FeudalBatchProcessor, FeudalBatch 6 | from feudal_networks.algos.policy_optimizer import Batch 7 | 8 | class TestFeudalBatchProcessor(unittest.TestCase): 9 | 10 | def test_simple_c_1(self): 11 | # simple case ignoring the fact that the different list have 12 | # elements with different types 13 | c = 1 14 | fbp = FeudalBatchProcessor(c) 15 | 16 | obs = [1,2] 17 | a = [1,2] 18 | returns = [1,2] 19 
| terminal = False 20 | g = [1,2] 21 | s = [1,2] 22 | features = [1,2] 23 | b = Batch(obs, a, returns, terminal, g, s, features) 24 | fb = fbp.process_batch(b) 25 | np.testing.assert_array_equal(fb.obs, [1]) 26 | np.testing.assert_array_equal(fb.a, [1]) 27 | np.testing.assert_array_equal(fb.returns, [1]) 28 | np.testing.assert_array_equal(fb.s_diff, [1]) 29 | np.testing.assert_array_equal(fb.ri, [0]) 30 | np.testing.assert_array_equal(fb.gsum, [2]) 31 | np.testing.assert_array_equal(fb.features, [1]) 32 | 33 | obs = [3,4] 34 | a = [3,4] 35 | returns = [3,4] 36 | terminal = False 37 | g = [3,4] 38 | s = [3,4] 39 | features = [3,4] 40 | b = Batch(obs, a, returns, terminal, g, s, features) 41 | fb = fbp.process_batch(b) 42 | np.testing.assert_array_equal(fb.obs, [2,3]) 43 | np.testing.assert_array_equal(fb.a, [2,3]) 44 | np.testing.assert_array_equal(fb.returns, [2,3]) 45 | np.testing.assert_array_equal(fb.s_diff, [1,1]) 46 | self.assertEqual(len(fb.ri), 2) 47 | np.testing.assert_array_equal(fb.gsum, [3, 5]) 48 | np.testing.assert_array_equal(fb.features, [2,3]) 49 | 50 | obs = [5] 51 | a = [5] 52 | returns = [5] 53 | terminal = True 54 | g = [5] 55 | s = [5] 56 | features = [5] 57 | b = Batch(obs, a, returns, terminal, g, s, features) 58 | fb = fbp.process_batch(b) 59 | np.testing.assert_array_equal(fb.obs, [4,5]) 60 | np.testing.assert_array_equal(fb.a, [4,5]) 61 | np.testing.assert_array_equal(fb.returns, [4,5]) 62 | np.testing.assert_array_equal(fb.s_diff, [1,0]) 63 | self.assertEqual(len(fb.ri), 2) 64 | np.testing.assert_array_equal(fb.gsum, [7,9]) 65 | np.testing.assert_array_equal(fb.features, [4,5]) 66 | 67 | def test_simple_c_2(self): 68 | # simple case ignoring the fact that the different list have 69 | # elements with different types 70 | c = 2 71 | obs = [1,2] 72 | a = [1,2] 73 | returns = [1,2] 74 | terminal = False 75 | g = [1,2] 76 | s = [1,2] 77 | features = [1,2] 78 | b = Batch(obs, a, returns, terminal, g, s, features) 79 | 80 | fbp = FeudalBatchProcessor(c) 81 | fb = fbp.process_batch(b) 82 | 83 | np.testing.assert_array_equal(fb.obs, []) 84 | np.testing.assert_array_equal(fb.a, []) 85 | np.testing.assert_array_equal(fb.returns, []) 86 | np.testing.assert_array_equal(fb.s_diff, []) 87 | np.testing.assert_array_equal(fb.ri, []) 88 | np.testing.assert_array_equal(fb.gsum, []) 89 | np.testing.assert_array_equal(fb.features, []) 90 | 91 | obs = [3,4] 92 | a = [3,4] 93 | returns = [3,4] 94 | terminal = False 95 | g = [3,4] 96 | s = [3,4] 97 | features = [3,4] 98 | b = Batch(obs, a, returns, terminal, g, s, features) 99 | fb = fbp.process_batch(b) 100 | np.testing.assert_array_equal(fb.obs, [1,2]) 101 | np.testing.assert_array_equal(fb.a, [1,2]) 102 | np.testing.assert_array_equal(fb.returns, [1,2]) 103 | np.testing.assert_array_equal(fb.s_diff, [2,2]) 104 | self.assertEqual(len(fb.ri), 2) 105 | np.testing.assert_array_equal(fb.gsum, [3,4]) 106 | np.testing.assert_array_equal(fb.features, [1,2]) 107 | 108 | obs = [5] 109 | a = [5] 110 | returns = [5] 111 | terminal = True 112 | g = [5] 113 | s = [5] 114 | features = [5] 115 | b = Batch(obs, a, returns, terminal, g, s, features) 116 | fb = fbp.process_batch(b) 117 | np.testing.assert_array_equal(fb.obs, [3,4,5]) 118 | np.testing.assert_array_equal(fb.a, [3,4,5]) 119 | np.testing.assert_array_equal(fb.returns, [3,4,5]) 120 | np.testing.assert_array_equal(fb.s_diff, [2,1,0]) 121 | self.assertEqual(len(fb.ri), 3) 122 | np.testing.assert_array_equal(fb.gsum, [6,9,12]) 123 | np.testing.assert_array_equal(fb.features, [3,4,5]) 124 | 125 
| def test_simple_terminal_on_start(self): 126 | c = 2 127 | fbp = FeudalBatchProcessor(c) 128 | 129 | obs = [1,2] 130 | a = [1,2] 131 | returns = [1,2] 132 | terminal = True 133 | g = [1,2] 134 | s = [1,2] 135 | features = [1,2] 136 | b = Batch(obs, a, returns, terminal, g, s, features) 137 | fb = fbp.process_batch(b) 138 | np.testing.assert_array_equal(fb.obs, [1,2]) 139 | np.testing.assert_array_equal(fb.a, [1,2]) 140 | np.testing.assert_array_equal(fb.returns, [1,2]) 141 | np.testing.assert_array_equal(fb.s_diff, [1,0]) 142 | self.assertEqual(len(fb.ri), 2) 143 | np.testing.assert_array_equal(fb.gsum, [3,4]) 144 | np.testing.assert_array_equal(fb.features, [1,2]) 145 | 146 | def test_intrinsic_reward_and_gsum_calculation(self): 147 | c = 2 148 | fbp = FeudalBatchProcessor(c) 149 | 150 | obs = a = returns = features = [None, None, None] 151 | terminal = True 152 | s = [np.array([2,1]), np.array([1,2]), np.array([2,3])] 153 | g = [np.array([1,1]), np.array([2,2]), np.array([3,3])] 154 | b = Batch(obs, a, returns, terminal, s, g, features) 155 | fb = fbp.process_batch(b) 156 | last_ri = (1. + 1. / np.sqrt(2)) / 2 157 | np.testing.assert_array_almost_equal(fb.ri, [0,0,last_ri]) 158 | np.testing.assert_array_equal(fb.gsum, 159 | [np.array([3,3]), np.array([4,4]), np.array([6,6])]) 160 | 161 | if __name__ == '__main__': 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /tests/test_policies/test_feudal_policy.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | np.set_printoptions(suppress=True, precision=6) 4 | import unittest 5 | 6 | from feudal_networks.policies.feudal_policy import FeudalPolicy 7 | import tensorflow as tf 8 | 9 | class TestFeudalPolicy(unittest.TestCase): 10 | 11 | def setUp(self): 12 | # reset graph before each test case 13 | tf.reset_default_graph() 14 | 15 | def test_init(self): 16 | global_step = tf.get_variable("global_step", [], tf.int32,\ 17 | initializer=tf.constant_initializer(0, dtype=tf.int32), 18 | trainable=False) 19 | feudal = FeudalPolicy((80,80,3), 4, global_step) 20 | 21 | def test_fit_simple_dataset(self): 22 | with tf.Session() as session: 23 | global_step = tf.get_variable("global_step", [], tf.int32,\ 24 | initializer=tf.constant_initializer(0, dtype=tf.int32), 25 | trainable=False) 26 | obs_space = (80,80,3) 27 | act_space = 2 28 | lr = 1e-5 29 | g_dim = 256 30 | worker_hid_dim = 32 31 | manager_hid_dim = 256 32 | pi = FeudalPolicy(obs_space, act_space, global_step) 33 | 34 | grads = tf.gradients(pi.loss, pi.var_list) 35 | 36 | prints = [] 37 | for g in grads: 38 | prints.append(g.op.name) 39 | prints.append(g) 40 | # grads[0] = tf.Print(grads[0],prints) 41 | grads, _ = tf.clip_by_global_norm(grads, 40) 42 | grads_and_vars = list(zip(grads, pi.var_list)) 43 | opt = tf.train.AdamOptimizer(lr) 44 | train_op = opt.apply_gradients(grads_and_vars) 45 | 46 | # train_op = tf.train.AdamOptimizer(lr).minimize(pi.loss,var_list=pi.var_list) 47 | session.run(tf.global_variables_initializer()) 48 | 49 | obs = [np.zeros(obs_space), np.zeros(obs_space)] 50 | a = [[1,0], [0,1]] 51 | returns = [0, 1] 52 | s_diff = [np.ones(g_dim), np.ones(g_dim)] 53 | gsum = [np.zeros((1,g_dim)), np.ones((1,g_dim))] 54 | ri = [0, 0] 55 | 56 | _,features = pi.get_initial_features() 57 | worker_features = features[0:2] 58 | manager_features = features[2:] 59 | 60 | feed_dict = { 61 | pi.obs: obs, 62 | pi.ac: a, 63 | pi.r: returns, 64 | pi.s_diff: s_diff, 65 | 
pi.prev_g: gsum, 66 | pi.ri: ri, 67 | pi.state_in[0]: worker_features[0], 68 | pi.state_in[1]: worker_features[1], 69 | pi.state_in[2]: manager_features[0], 70 | pi.state_in[3]: manager_features[1] 71 | } 72 | 73 | n_updates = 1000 74 | verbose = True 75 | for i in range(n_updates): 76 | loss, vf, policy, _ = session.run([pi.loss,pi.manager_vf,pi.pi, train_op], feed_dict=feed_dict) 77 | if verbose: 78 | print('loss: {}\npolicy: {}\nvalue: {}\n-------'.format( 79 | loss, policy, vf)) 80 | 81 | def test_simple_manager_behavior(self): 82 | with tf.Session() as session: 83 | global_step = tf.get_variable("global_step", [], tf.int32,\ 84 | initializer=tf.constant_initializer(0, dtype=tf.int32), 85 | trainable=False) 86 | obs_space = (80,80,3) 87 | act_space = 2 88 | lr = 5e-4 89 | g_dim = 256 90 | worker_hid_dim = 32 91 | manager_hid_dim = 256 92 | pi = FeudalPolicy(obs_space, act_space, global_step) 93 | 94 | train_op = tf.train.AdamOptimizer(lr).minimize(pi.loss) 95 | 96 | worker_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 97 | worker_vars = [v for v in worker_vars if 'worker' in v.name] 98 | worker_assign = tf.group(*[tf.assign(v, tf.zeros_like(v)) 99 | for v in worker_vars]) 100 | 101 | session.run(tf.global_variables_initializer()) 102 | 103 | obs = [np.zeros(obs_space), np.zeros(obs_space)] 104 | a = [[1,0], [0,1]] 105 | returns = [0, 1] 106 | s_diff = [np.ones(g_dim), np.ones(g_dim)] 107 | gsum = [np.zeros((1,g_dim)), np.ones((1,g_dim))] 108 | ri = [0, 0] 109 | 110 | _, features = pi.get_initial_features() 111 | worker_features = features[0:2] 112 | manager_features = features[2:] 113 | 114 | feed_dict = { 115 | pi.obs: obs, 116 | pi.ac: a, 117 | pi.r: returns, 118 | pi.s_diff: s_diff, 119 | pi.prev_g: gsum, 120 | pi.ri: ri, 121 | pi.state_in[0]: worker_features[0], 122 | pi.state_in[1]: worker_features[1], 123 | pi.state_in[2]: manager_features[0], 124 | pi.state_in[3]: manager_features[1] 125 | } 126 | 127 | n_updates = 1000 128 | verbose = True 129 | for i in range(n_updates): 130 | loss, vf, policy, _, _ = session.run( 131 | [pi.loss, pi.manager_vf, pi.pi, train_op, worker_assign], 132 | feed_dict=feed_dict) 133 | 134 | if verbose: 135 | print('loss: {}\npolicy: {}\nvalue: {}\n-------'.format( 136 | loss, policy, vf)) 137 | 138 | worker_var_values = session.run(worker_vars) 139 | print(worker_var_values) 140 | U = session.run(pi.U, feed_dict=feed_dict) 141 | print(U) 142 | input() 143 | 144 | 145 | 146 | if __name__ == '__main__': 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/test_policies/test_lstm_policy.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from feudal_networks.policies.lstm_policy import LSTMPolicy 4 | import tensorflow as tf 5 | 6 | class TestLSTMPolicy(unittest.TestCase): 7 | 8 | def test_init(self): 9 | global_step = tf.get_variable("global_step", [], tf.int32,\ 10 | initializer=tf.constant_initializer(0, dtype=tf.int32), 11 | trainable=False) 12 | lstm_pi = LSTMPolicy((80,80,3), 4,global_step) 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | --------------------------------------------------------------------------------
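A minimal smoke-test sketch of the LSTM policy, not part of the repository itself: it assumes the package and a TF1-style TensorFlow are importable, and that get_initial_features() unpacks to the LSTM's (c, h) pair as built by models.build_lstm. It mirrors tests/test_policies/test_lstm_policy.py above while also stepping the policy once.

import numpy as np
import tensorflow as tf

from feudal_networks.policies.lstm_policy import LSTMPolicy

with tf.Session() as sess:
    global_step = tf.get_variable("global_step", [], tf.int32,
                                  initializer=tf.constant_initializer(0, dtype=tf.int32),
                                  trainable=False)
    pi = LSTMPolicy((80, 80, 3), 4, global_step)   # obs_space, act_space
    sess.run(tf.global_variables_initializer())

    c, h = pi.get_initial_features()               # initial LSTM state
    ob = np.zeros((80, 80, 3), dtype=np.float32)   # dummy observation
    action, value, c, h = pi.act(ob, c, h)         # one-hot action, value estimate, new state
    print(action, value)

To exercise the full distributed training stack instead, scripts/training/train.py generates the tmux/nohup/child commands that launch worker.py for the parameter server and each worker task.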