├── .gitignore ├── LICENSE ├── README.md ├── feudal_networks ├── __init__.py ├── algos │ ├── __init__.py │ ├── feudal_policy_optimizer.py │ └── policy_optimizer.py ├── envs │ ├── __init__.py │ ├── debug_envs.py │ └── vision_maze.py ├── models │ ├── __init__.py │ └── models.py └── policies │ ├── __init__.py │ ├── configs │ ├── __init__.py │ ├── feudal_config.py │ └── lstm_config.py │ ├── feudal_batch_processor.py │ ├── feudal_policy.py │ ├── lstm_policy.py │ ├── policy.py │ └── policy_utils.py ├── scripts └── training │ ├── README.md │ ├── __init__.py │ ├── envs.py │ ├── train.py │ └── worker.py └── tests ├── __init__.py ├── run_tests.py ├── test_algos └── test_feudal_policy_optimizer.py ├── test_envs └── test_vision_maze.py └── test_policies ├── __init__.py ├── test_feudal_batch_processor.py ├── test_feudal_policy.py └── test_lstm_policy.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # mac 56 | .DS_Store 57 | 58 | # binaries 59 | *.bin 60 | *.out 61 | 62 | # emacs 63 | *~ 64 | 65 | # compile 66 | /compile 67 | 68 | # aws keys 69 | *.key 70 | 71 | # data 72 | *.weights 73 | *.meta 74 | *.p 75 | *.csv 76 | *checkpoint 77 | *.key 78 | *.npz 79 | *.dat 80 | *.jld 81 | *.idx 82 | *.png 83 | *.h5 84 | *.dat 85 | *.mat 86 | *.zip 87 | 88 | # notebook checkpoints 89 | *checkpoint* 90 | 91 | # cmake 92 | CMakeCache.txt 93 | CMakeFiles 94 | CMakeScripts 95 | Makefile 96 | cmake_install.cmake 97 | install_manifest.txt 98 | CTestTestfile.cmake 99 | 100 | 101 | # julia temp files 102 | *tmp_* 103 | 104 | # venv 105 | *venv* 106 | 107 | # emacs 108 | *#*#* 109 | 110 | # batch byproducts 111 | *.err 112 | 113 | # gym 114 | *.json 115 | 116 | # snapshots 117 | *.pkl 118 | *.npz 119 | 120 | # summaries 121 | *events* 122 | 123 | # media 124 | *.pdf 125 | *.gif 126 | 127 | # snapshots 128 | *snapshots* 129 | *media* 130 | *visualizations* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 dmakian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # feudal_networks 2 | An implementation of FeUdal Networks for Hierarchical Learning as published : https://arxiv.org/abs/1703.01161 3 | 4 | Implementation and training framework derived from the OpenAI starter agent: https://github.com/openai/universe-starter-agent 5 | -------------------------------------------------------------------------------- /feudal_networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/__init__.py -------------------------------------------------------------------------------- /feudal_networks/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/algos/__init__.py -------------------------------------------------------------------------------- /feudal_networks/algos/feudal_policy_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Much of the code in this file was originally developed as part of the 3 | universe starter agent: https://github.com/openai/universe-starter-agent 4 | """ 5 | from collections import namedtuple 6 | import numpy as np 7 | import scipy.signal 8 | import tensorflow as tf 9 | import threading 10 | import six.moves.queue as queue 11 | 12 | from feudal_networks.policies.lstm_policy import LSTMPolicy 13 | from feudal_networks.policies.feudal_policy import FeudalPolicy 14 | 15 | def discount(x, gamma): 16 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 17 | 18 | def process_rollout(rollout, gamma, lambda_=1.0): 19 | """ 20 | given a rollout, compute its returns and the advantage 21 | """ 22 | batch_si = np.asarray(rollout.states) 23 | batch_a = np.asarray(rollout.actions) 24 | 25 | rewards = np.asarray(rollout.rewards) 26 | vpred_t = np.asarray(rollout.values + [rollout.r]) 27 | rewards_plus_v = np.asarray(rollout.rewards + [rollout.r]) 28 | batch_r = discount(rewards_plus_v, gamma)[:-1] 29 | 30 | batch_s = np.asarray(rollout.ss) 31 | batch_g = np.asarray(rollout.gs) 32 | features = rollout.features 33 | return Batch(batch_si, batch_a, batch_r, rollout.terminal,batch_s,batch_g, features) 34 | 35 | Batch = namedtuple("Batch", ["obs", "a", "returns", "terminal", "s", "g", "features"]) 36 | # Batch = namedtuple("Batch", ["si", "a", "adv", "r", "terminal", "features"]) 37 | 38 | class PartialRollout(object): 39 | """ 40 | a piece of a complete rollout. We run our agent, and process its experience 41 | once it has processed enough steps. 
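    In this FeUdal variant the rollout additionally records, at every step, the
    manager's internal state s (self.ss), the manager's goal g (self.gs), and the
    recurrent features fed to the policy, so that the FeudalBatchProcessor can
    later derive s_diff, the intrinsic reward, and the goal sums.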
42 | """ 43 | def __init__(self): 44 | self.states = [] 45 | self.actions = [] 46 | self.rewards = [] 47 | self.values = [] 48 | self.ss = [] 49 | self.gs = [] 50 | self.features = [] 51 | self.r = 0.0 52 | self.terminal = False 53 | 54 | def add(self, state, action, reward, value,g,s, terminal, features): 55 | self.states += [state] 56 | self.actions += [action] 57 | self.rewards += [reward] 58 | self.values += [value] 59 | self.terminal = terminal 60 | self.features += [features] 61 | self.gs += [g] 62 | self.ss += [s] 63 | 64 | def extend(self, other): 65 | assert not self.terminal 66 | self.states.extend(other.states) 67 | self.actions.extend(other.actions) 68 | self.rewards.extend(other.rewards) 69 | self.values.extend(other.values) 70 | self.gs.extend(other.gs) 71 | self.ss.extend(other.ss) 72 | self.r = other.r 73 | self.terminal = other.terminal 74 | self.features.extend(other.features) 75 | 76 | class RunnerThread(threading.Thread): 77 | """ 78 | One of the key distinctions between a normal environment and a universe environment 79 | is that a universe environment is _real time_. This means that there should be a thread 80 | that would constantly interact with the environment and tell it what to do. This thread is here. 81 | """ 82 | def __init__(self, env, policy, num_local_steps, visualise): 83 | threading.Thread.__init__(self) 84 | self.queue = queue.Queue(5) 85 | self.num_local_steps = num_local_steps 86 | self.env = env 87 | self.last_features = None 88 | self.policy = policy 89 | self.daemon = True 90 | self.sess = None 91 | self.summary_writer = None 92 | self.visualise = visualise 93 | 94 | def start_runner(self, sess, summary_writer): 95 | self.sess = sess 96 | self.summary_writer = summary_writer 97 | self.start() 98 | 99 | def run(self): 100 | with self.sess.as_default(): 101 | self._run() 102 | 103 | def _run(self): 104 | rollout_provider = env_runner(self.env, self.policy, self.num_local_steps, self.summary_writer, self.visualise) 105 | while True: 106 | # the timeout variable exists because apparently, if one worker dies, the other workers 107 | # won't die with it, unless the timeout is set to some large number. This is an empirical 108 | # observation. 109 | 110 | self.queue.put(next(rollout_provider), timeout=600.0) 111 | 112 | def env_runner(env, policy, num_local_steps, summary_writer,visualise): 113 | """ 114 | The logic of the thread runner. In brief, it constantly keeps on running 115 | the policy, and as long as the rollout exceeds a certain length, the thread 116 | runner appends the policy to the queue. 
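    More precisely, env_runner yields a PartialRollout once num_local_steps steps
    (or a terminal state) have been collected; it is the RunnerThread above that
    places each yielded rollout on its queue.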
117 | """ 118 | last_state = env.reset() 119 | last_c_g,last_features = policy.get_initial_features() 120 | # print last_c_g 121 | length = 0 122 | rewards = 0 123 | 124 | while True: 125 | terminal_end = False 126 | rollout = PartialRollout() 127 | 128 | for _ in range(num_local_steps): 129 | # print last_c_g.shape 130 | fetched = policy.act(last_state,last_c_g, *last_features) 131 | action, value_, g,s,last_c_g,features = fetched[0], fetched[1], \ 132 | fetched[2], fetched[3], \ 133 | fetched[4], fetched[5:] 134 | action_to_take = action.argmax() 135 | # print action_to_take 136 | # print action 137 | # print g 138 | # print s 139 | # # exit(0) 140 | state, reward, terminal, info = env.step(action_to_take) 141 | 142 | # collect the experience 143 | rollout.add(last_state, action, reward, value_, g, s, terminal, last_features) 144 | length += 1 145 | rewards += reward 146 | 147 | last_state = state 148 | last_features = features 149 | 150 | if info: 151 | summary = tf.Summary() 152 | for k, v in info.items(): 153 | summary.value.add(tag=k, simple_value=float(v)) 154 | summary_writer.add_summary(summary, policy.global_step.eval()) 155 | summary_writer.flush() 156 | 157 | timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') 158 | if terminal or length >= timestep_limit: 159 | terminal_end = True 160 | if length >= timestep_limit or not env.metadata.get('semantics.autoreset'): 161 | last_state = env.reset() 162 | last_c_g,last_features = policy.get_initial_features() 163 | print("Episode finished. Sum of rewards: %f. Length: %d" % (rewards, length)) 164 | length = 0 165 | rewards = 0 166 | break 167 | 168 | if not terminal_end: 169 | rollout.r = policy.value(last_state, last_c_g, *last_features) 170 | 171 | # once we have enough experience, yield it, and have the ThreadRunner place it on a queue 172 | yield rollout 173 | 174 | class FeudalPolicyOptimizer(object): 175 | def __init__(self, env, task, policy,visualise): 176 | self.env = env 177 | self.task = task 178 | 179 | worker_device = "/job:worker/task:{}/cpu:0".format(task) 180 | with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)): 181 | with tf.variable_scope("global"): 182 | self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), 183 | trainable=False) 184 | self.network = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 185 | 186 | with tf.device(worker_device): 187 | with tf.variable_scope("local"): 188 | self.local_network = pi = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 189 | pi.global_step = self.global_step 190 | self.policy = pi 191 | # build runner thread for collecting rollouts 192 | self.runner = RunnerThread(env, self.policy, 20,visualise) 193 | 194 | # formulate gradients 195 | grads = tf.gradients(pi.loss, pi.var_list) 196 | grads, _ = tf.clip_by_global_norm(grads, 40) 197 | 198 | # build sync 199 | # copy weights from the parameter server to the local model 200 | self.sync = tf.group(*[v1.assign(v2) 201 | for v1, v2 in zip(pi.var_list, self.network.var_list)]) 202 | grads_and_vars = list(zip(grads, self.network.var_list)) 203 | # for g,v in grads_and_vars: 204 | # print g.name,v.name 205 | inc_step = self.global_step.assign_add(tf.shape(pi.obs)[0]) 206 | 207 | # build train op 208 | opt = tf.train.AdamOptimizer(1e-4) 209 | self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) 210 | self.summary_writer = None 211 | 
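            # note: the clipped gradients are computed from the local network's
            # loss but applied to the global network's variables; self.sync
            # (built above) copies the global weights back into the local copy
            # at the start of every train() call.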
self.local_steps = 0 212 | 213 | def start(self, sess, summary_writer): 214 | self.runner.start_runner(sess, summary_writer) 215 | self.summary_writer = summary_writer 216 | 217 | def pull_batch_from_queue(self): 218 | """ 219 | self explanatory: take a rollout from the queue of the thread runner. 220 | """ 221 | rollout = self.runner.queue.get(timeout=600.0) 222 | while not rollout.terminal: 223 | try: 224 | rollout.extend(self.runner.queue.get_nowait()) 225 | except queue.Empty: 226 | break 227 | return rollout 228 | 229 | def train(self, sess): 230 | """ 231 | This first runs the sync op so that the gradients are computed wrt the 232 | current global weights. It then takes a rollout from the runner's queue, 233 | converts it to a batch, and passes that batch and the train op to the 234 | policy to perform an update. 235 | """ 236 | # copy weights from shared to local 237 | # this should be run first so that the updates are for the most 238 | # recent global weights 239 | sess.run(self.sync) 240 | rollout = self.pull_batch_from_queue() 241 | batch = process_rollout(rollout, gamma=.99) 242 | batch = self.policy.update_batch(batch) 243 | compute_summary = self.task == 0 and self.local_steps % 11 == 0 244 | # should_compute_summary = True 245 | should_compute_summary = self.task == 0 and self.local_steps % 11 == 0 246 | 247 | if should_compute_summary: 248 | fetches = [self.policy.summary_op, self.train_op, self.global_step] 249 | else: 250 | fetches = [self.train_op, self.global_step] 251 | 252 | feed_dict = { 253 | self.policy.obs: batch.obs, 254 | self.network.obs: batch.obs, 255 | 256 | self.policy.ac: batch.a, 257 | self.network.ac: batch.a, 258 | 259 | self.policy.r: batch.returns, 260 | self.network.r: batch.returns, 261 | 262 | self.policy.s_diff: batch.s_diff, 263 | self.network.s_diff: batch.s_diff, 264 | 265 | self.policy.prev_g: batch.gsum, 266 | self.network.prev_g: batch.gsum, 267 | 268 | self.policy.ri: batch.ri, 269 | self.network.ri: batch.ri 270 | } 271 | 272 | for i in range(len(self.policy.state_in)): 273 | feed_dict[self.policy.state_in[i]] = batch.features[i] 274 | feed_dict[self.network.state_in[i]] = batch.features[i] 275 | 276 | 277 | fetched = sess.run(fetches, feed_dict=feed_dict) 278 | 279 | if should_compute_summary: 280 | self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1]) 281 | self.summary_writer.flush() 282 | self.local_steps += 1 283 | -------------------------------------------------------------------------------- /feudal_networks/algos/policy_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Much of the code in this file was originally developed as part of the 3 | universe starter agent: https://github.com/openai/universe-starter-agent 4 | """ 5 | from collections import namedtuple 6 | import numpy as np 7 | import scipy.signal 8 | import tensorflow as tf 9 | import threading 10 | import six.moves.queue as queue 11 | 12 | from feudal_networks.policies.lstm_policy import LSTMPolicy 13 | from feudal_networks.policies.feudal_policy import FeudalPolicy 14 | 15 | def discount(x, gamma): 16 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 17 | 18 | def process_rollout(rollout, gamma, lambda_=1.0): 19 | """ 20 | given a rollout, compute its returns and the advantage 21 | """ 22 | batch_si = np.asarray(rollout.states) 23 | batch_a = np.asarray(rollout.actions) 24 | rewards = np.asarray(rollout.rewards) 25 | vpred_t = np.asarray(rollout.values + [rollout.r]) 
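    # a worked example of discount() above, as a comment-only sketch:
    #     discount([r0, r1, r2], g) == [r0 + g*r1 + g**2*r2,  r1 + g*r2,  r2]
    # vpred_t appends the bootstrap value rollout.r so that the TD residuals
    # delta_t below can be formed for Generalized Advantage Estimation.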
26 | 27 | rewards_plus_v = np.asarray(rollout.rewards + [rollout.r]) 28 | batch_r = discount(rewards_plus_v, gamma)[:-1] 29 | delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1] 30 | # this formula for the advantage comes "Generalized Advantage Estimation": 31 | # https://arxiv.org/abs/1506.02438 32 | batch_adv = discount(delta_t, gamma * lambda_) 33 | 34 | features = rollout.features[0] 35 | # print features 36 | return Batch(batch_si, batch_a, batch_adv, batch_r, rollout.terminal, features) 37 | 38 | # Batch = namedtuple("Batch", ["obs", "a", "returns", "terminal", "s", "g", "features"]) 39 | Batch = namedtuple("Batch", ["si", "a", "adv", "r", "terminal", "features"]) 40 | 41 | class PartialRollout(object): 42 | """ 43 | a piece of a complete rollout. We run our agent, and process its experience 44 | once it has processed enough steps. 45 | """ 46 | def __init__(self): 47 | self.states = [] 48 | self.actions = [] 49 | self.rewards = [] 50 | self.values = [] 51 | self.r = 0.0 52 | self.terminal = False 53 | self.features = [] 54 | 55 | def add(self, state, action, reward, value, terminal, features): 56 | self.states += [state] 57 | self.actions += [action] 58 | self.rewards += [reward] 59 | self.values += [value] 60 | self.terminal = terminal 61 | self.features += [features] 62 | 63 | def extend(self, other): 64 | assert not self.terminal 65 | self.states.extend(other.states) 66 | self.actions.extend(other.actions) 67 | self.rewards.extend(other.rewards) 68 | self.values.extend(other.values) 69 | self.r = other.r 70 | self.terminal = other.terminal 71 | self.features.extend(other.features) 72 | 73 | class RunnerThread(threading.Thread): 74 | """ 75 | One of the key distinctions between a normal environment and a universe environment 76 | is that a universe environment is _real time_. This means that there should be a thread 77 | that would constantly interact with the environment and tell it what to do. This thread is here. 78 | """ 79 | def __init__(self, env, policy, num_local_steps, visualise): 80 | threading.Thread.__init__(self) 81 | self.queue = queue.Queue(5) 82 | self.num_local_steps = num_local_steps 83 | self.env = env 84 | self.last_features = None 85 | self.policy = policy 86 | self.daemon = True 87 | self.sess = None 88 | self.summary_writer = None 89 | self.visualise = visualise 90 | 91 | def start_runner(self, sess, summary_writer): 92 | self.sess = sess 93 | self.summary_writer = summary_writer 94 | self.start() 95 | 96 | def run(self): 97 | with self.sess.as_default(): 98 | self._run() 99 | 100 | def _run(self): 101 | rollout_provider = env_runner(self.env, self.policy, self.num_local_steps, self.summary_writer, self.visualise) 102 | while True: 103 | # the timeout variable exists because apparently, if one worker dies, the other workers 104 | # won't die with it, unless the timeout is set to some large number. This is an empirical 105 | # observation. 106 | 107 | self.queue.put(next(rollout_provider), timeout=600.0) 108 | 109 | def env_runner(env, policy, num_local_steps, summary_writer,visualise): 110 | """ 111 | The logic of the thread runner. In brief, it constantly keeps on running 112 | the policy, and as long as the rollout exceeds a certain length, the thread 113 | runner appends the policy to the queue. 
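    (In practice it yields PartialRollout objects, which the RunnerThread above
    places on its queue.)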
114 | """ 115 | last_state = env.reset() 116 | last_features = policy.get_initial_features() 117 | length = 0 118 | rewards = 0 119 | 120 | while True: 121 | terminal_end = False 122 | rollout = PartialRollout() 123 | 124 | for _ in range(num_local_steps): 125 | fetched = policy.act(last_state, *last_features) 126 | action, value_, features = fetched[0], fetched[1], fetched[2:] 127 | action_to_take = action.argmax() 128 | state, reward, terminal, info = env.step(action_to_take) 129 | 130 | # collect the experience 131 | rollout.add(last_state, action, reward, value_, terminal, last_features) 132 | length += 1 133 | rewards += reward 134 | 135 | last_state = state 136 | last_features = features 137 | 138 | if info: 139 | summary = tf.Summary() 140 | for k, v in info.items(): 141 | summary.value.add(tag=k, simple_value=float(v)) 142 | summary_writer.add_summary(summary, policy.global_step.eval()) 143 | summary_writer.flush() 144 | 145 | timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') 146 | if terminal or length >= timestep_limit: 147 | terminal_end = True 148 | if length >= timestep_limit or not env.metadata.get('semantics.autoreset'): 149 | last_state = env.reset() 150 | last_features = policy.get_initial_features() 151 | print("Episode finished. Sum of rewards: %f. Length: %d" % (rewards, length)) 152 | length = 0 153 | rewards = 0 154 | break 155 | 156 | if not terminal_end: 157 | rollout.r = policy.value(last_state, *last_features) 158 | 159 | # once we have enough experience, yield it, and have the ThreadRunner place it on a queue 160 | yield rollout 161 | 162 | class PolicyOptimizer(object): 163 | def __init__(self, env, task, policy,visualise): 164 | self.env = env 165 | self.task = task 166 | 167 | worker_device = "/job:worker/task:{}/cpu:0".format(task) 168 | with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)): 169 | with tf.variable_scope("global"): 170 | self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), 171 | trainable=False) 172 | if policy == 'lstm': 173 | self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 174 | elif policy == 'feudal': 175 | self.network = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 176 | else: 177 | print("Policy type unknown") 178 | exit(0) 179 | 180 | with tf.device(worker_device): 181 | with tf.variable_scope("local"): 182 | if policy == 'lstm': 183 | self.local_network = pi = LSTMPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 184 | elif policy == 'feudal': 185 | self.local_network = pi = FeudalPolicy(env.observation_space.shape, env.action_space.n,self.global_step) 186 | else: 187 | print("Policy type unknown") 188 | exit(0) 189 | pi.global_step = self.global_step 190 | self.policy = pi 191 | # build runner thread for collecting rollouts 192 | self.runner = RunnerThread(env, self.policy, 20,visualise) 193 | 194 | # formulate gradients 195 | grads = tf.gradients(pi.loss, pi.var_list) 196 | grads, _ = tf.clip_by_global_norm(grads, 40) 197 | 198 | # build sync 199 | # copy weights from the parameter server to the local model 200 | self.sync = tf.group(*[v1.assign(v2) 201 | for v1, v2 in zip(pi.var_list, self.network.var_list)]) 202 | grads_and_vars = list(zip(grads, self.network.var_list)) 203 | # for g,v in grads_and_vars: 204 | # print g.name,v.name 205 | inc_step = self.global_step.assign_add(tf.shape(pi.obs)[0]) 206 | 207 | # 
build train op 208 | opt = tf.train.AdamOptimizer(1e-4) 209 | self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) 210 | self.summary_writer = None 211 | self.local_steps = 0 212 | 213 | def start(self, sess, summary_writer): 214 | self.runner.start_runner(sess, summary_writer) 215 | self.summary_writer = summary_writer 216 | 217 | def pull_batch_from_queue(self): 218 | """ 219 | self explanatory: take a rollout from the queue of the thread runner. 220 | """ 221 | rollout = self.runner.queue.get(timeout=600.0) 222 | while not rollout.terminal: 223 | try: 224 | rollout.extend(self.runner.queue.get_nowait()) 225 | except queue.Empty: 226 | break 227 | return rollout 228 | 229 | def train(self, sess): 230 | """ 231 | This first runs the sync op so that the gradients are computed wrt the 232 | current global weights. It then takes a rollout from the runner's queue, 233 | converts it to a batch, and passes that batch and the train op to the 234 | policy to perform an update. 235 | """ 236 | # copy weights from shared to local 237 | # this should be run first so that the updates are for the most 238 | # recent global weights 239 | sess.run(self.sync) 240 | rollout = self.pull_batch_from_queue() 241 | batch = process_rollout(rollout, gamma=.99) 242 | batch = self.policy.update_batch(batch) 243 | compute_summary = self.task == 0 and self.local_steps % 11 == 0 244 | should_compute_summary = self.task == 0 and self.local_steps % 11 == 0 245 | 246 | if should_compute_summary: 247 | fetches = [self.policy.summary_op, self.train_op, self.global_step] 248 | else: 249 | fetches = [self.train_op, self.global_step] 250 | 251 | feed_dict = { 252 | self.policy.obs: batch.si, 253 | self.network.obs: batch.si, 254 | 255 | self.policy.ac: batch.a, 256 | self.network.ac: batch.a, 257 | 258 | self.policy.adv: batch.adv, 259 | self.network.adv: batch.adv, 260 | 261 | self.policy.r: batch.r, 262 | self.network.r: batch.r, 263 | } 264 | 265 | for i in range(len(self.policy.state_in)): 266 | feed_dict[self.policy.state_in[i]] = batch.features[i] 267 | feed_dict[self.network.state_in[i]] = batch.features[i] 268 | 269 | 270 | fetched = sess.run(fetches, feed_dict=feed_dict) 271 | 272 | if should_compute_summary: 273 | self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1]) 274 | self.summary_writer.flush() 275 | self.local_steps += 1 276 | -------------------------------------------------------------------------------- /feudal_networks/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='OneRoundDeterministicRewardBoxObs-v0', 5 | entry_point='feudal_networks.envs.debug_envs:OneRoundDeterministicRewardBoxObsEnv', 6 | max_episode_steps=1, 7 | tags = { 8 | 'feudal': True 9 | } 10 | ) 11 | 12 | register( 13 | id='VisionMaze-v0', 14 | entry_point='feudal_networks.envs.vision_maze:VisionMazeEnv', 15 | max_episode_steps=200, 16 | kwargs = { 17 | 'room_length': 3, 18 | 'num_rooms_per_side': 2 19 | }, 20 | tags = { 21 | 'feudal': True 22 | } 23 | ) 24 | -------------------------------------------------------------------------------- /feudal_networks/envs/debug_envs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: adapted from the original debugging environment to have Box obs space 3 | 4 | Simple environment with known optimal policy and value function. 5 | 6 | This environment has just two actions. 
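(Observations are a constant all-zeros array of shape obs_shape exposed through a
Box space, so only the reward carries information.)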
7 | Action 0 yields 0 reward and then terminates the session. 8 | Action 1 yields 1 reward and then terminates the session. 9 | 10 | Optimal policy: action 1. 11 | 12 | Optimal value function: v(0)=1 (there is only one state, state 0) 13 | """ 14 | 15 | import numpy as np 16 | import gym 17 | from gym import spaces 18 | 19 | class OneRoundDeterministicRewardBoxObsEnv(gym.Env): 20 | def __init__(self, obs_shape=(64,64,1)): 21 | self.action_space = spaces.Discrete(2) 22 | self.observation_space = spaces.Box(low=0, high=0, shape=obs_shape) 23 | self._obs = np.zeros(obs_shape) 24 | 25 | def _step(self, action): 26 | assert self.action_space.contains(action) 27 | reward = 1 if action == 1 else 0 28 | return self._obs, reward, True, {} 29 | 30 | def _reset(self): 31 | return self._obs -------------------------------------------------------------------------------- /feudal_networks/envs/vision_maze.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | from gym import spaces 4 | import numpy as np 5 | 6 | class VisionMazeEnv(gym.Env): 7 | def __init__(self, room_length=3, num_rooms_per_side=2): 8 | assert room_length % 2 == 1, "room_length must be odd" 9 | assert room_length >= 3, "room_length must be greater than 3" 10 | assert num_rooms_per_side >= 1, "must have at least 1 room" 11 | 12 | self.room_length = room_length 13 | self.num_rooms_per_side = num_rooms_per_side 14 | # 0 = up, 1 = right, 2 = down, 3 = left 15 | self.action_space = spaces.Discrete(4) 16 | self.max_pos = room_length * num_rooms_per_side - 1 17 | obs_space = (self.max_pos + 1, self.max_pos + 1, 1) 18 | self.observation_space = spaces.Box(low=0, high=1, shape=obs_space) 19 | self.goal_reward = 1 20 | self.goal_state = [self.max_pos, self.max_pos] 21 | self._obs = np.zeros(obs_space) 22 | self._reset() 23 | 24 | def _get_obs(self): 25 | self._obs.fill(0) 26 | self._obs[self.state[0], self.state[1], :] = 1 27 | return self._obs 28 | 29 | def _reset(self): 30 | # start in random state in the maze 31 | x = np.random.randint(self.max_pos) 32 | y = np.random.randint(self.max_pos) 33 | self.state = np.array([x, y]) 34 | return self._get_obs() 35 | 36 | def _step(self, a): 37 | assert self.action_space.contains(a) 38 | x, y = self.state 39 | 40 | # up 41 | if a == 0: 42 | y = self._step_up(x, y) 43 | # right 44 | elif a == 1: 45 | x = self._step_right(x, y) 46 | # down 47 | elif a == 2: 48 | y = self._step_down(x, y) 49 | # left 50 | else: 51 | x = self._step_left(x, y) 52 | 53 | r, done = 0, False 54 | if x == self.goal_state[0] and y == self.goal_state[1]: 55 | r, done = self.goal_reward, True 56 | 57 | self.state = np.array([x, y]) 58 | return self._get_obs(), r, done, {} 59 | 60 | def _step_up(self, x, y): 61 | ny = y + 1 62 | 63 | # convert to single room format 64 | local_ny = ny % self.room_length 65 | 66 | # this condition True indicates passing through wall 67 | if local_ny == 0: 68 | 69 | # this is only allowed if passing through doorway 70 | if not (x % self.room_length == self.room_length // 2): 71 | ny = y 72 | 73 | ny = min(ny, self.max_pos) 74 | return ny 75 | 76 | def _step_right(self, x, y): 77 | nx = x + 1 78 | 79 | # convert to single room format 80 | local_nx = nx % self.room_length 81 | 82 | # this condition True indicates passing through wall 83 | if local_nx == 0: 84 | 85 | # this is only allowed if passing through doorway 86 | if not (y % self.room_length == self.room_length // 2): 87 | nx = x 88 | 89 | nx = min(nx, self.max_pos) 90 | return nx 91 | 92 | def 
_step_down(self, x, y): 93 | ny = y - 1 94 | 95 | # convert to single room format 96 | local_ny = ny % self.room_length 97 | 98 | # this condition True indicates passing through wall 99 | if local_ny == self.room_length - 1: 100 | 101 | # this is only allowed if passing through doorway 102 | if not (x % self.room_length == self.room_length // 2): 103 | ny = y 104 | 105 | ny = max(0, ny) 106 | return ny 107 | 108 | def _step_left(self, x, y): 109 | nx = x - 1 110 | 111 | # convert to single room format 112 | local_nx = nx % self.room_length 113 | 114 | # this condition True indicates passing through wall 115 | if local_nx == self.room_length - 1: 116 | 117 | # this is only allowed if passing through doorway 118 | if not (y % self.room_length == self.room_length // 2): 119 | nx = x 120 | 121 | nx = max(0, nx) 122 | return nx 123 | -------------------------------------------------------------------------------- /feudal_networks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/models/__init__.py -------------------------------------------------------------------------------- /feudal_networks/models/models.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.contrib.rnn as rnn 5 | 6 | def normalized_columns_initializer(std=1.0): 7 | def _initializer(shape, dtype=None, partition_info=None): 8 | out = np.random.randn(*shape).astype(np.float32) 9 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 10 | return tf.constant(out) 11 | return _initializer 12 | 13 | def linear(x, size, name, initializer=None, bias_init=0): 14 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], 15 | initializer=initializer) 16 | b = tf.get_variable(name + "/b", [size], 17 | initializer=tf.constant_initializer(bias_init)) 18 | return tf.matmul(x, w) + b 19 | 20 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", 21 | dtype=tf.float32, collections=None): 22 | with tf.variable_scope(name): 23 | stride_shape = [1, stride[0], stride[1], 1] 24 | filter_shape = [filter_size[0], filter_size[1], 25 | int(x.get_shape()[3]), num_filters] 26 | 27 | # there are "num input feature maps * filter height * filter width" 28 | # inputs to each hidden unit 29 | fan_in = np.prod(filter_shape[:3]) 30 | # each unit in the lower layer receives a gradient from: 31 | # "num output feature maps * filter height * filter width" / 32 | # pooling size 33 | fan_out = np.prod(filter_shape[:2]) * num_filters 34 | # initialize weights with random weights 35 | w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 36 | 37 | w = tf.get_variable("W", filter_shape, dtype, 38 | tf.random_uniform_initializer(-w_bound, w_bound), 39 | collections=collections) 40 | b = tf.get_variable("b", [1, 1, 1, num_filters], 41 | initializer=tf.constant_initializer(0.0), 42 | collections=collections) 43 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 44 | 45 | def build_lstm(x, size, name, step_size): 46 | lstm = rnn.BasicLSTMCell(size, state_is_tuple=True) 47 | 48 | c_init = np.zeros((1, lstm.state_size.c), np.float32) 49 | h_init = np.zeros((1, lstm.state_size.h), np.float32) 50 | state_init = [c_init, h_init] 51 | 52 | c_in = tf.placeholder(tf.float32, 53 | shape=[1, lstm.state_size.c], 54 | name='c_in') 55 | h_in = tf.placeholder(tf.float32, 56 | shape=[1, lstm.state_size.h], 57 | name='h_in') 58 | state_in = [c_in, h_in] 59 | 60 | state_in = rnn.LSTMStateTuple(c_in, h_in) 61 | 62 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn( 63 | lstm, x, initial_state=state_in, sequence_length=step_size, 64 | time_major=False) 65 | lstm_outputs = tf.reshape(lstm_outputs, [-1, size]) 66 | 67 | lstm_c, lstm_h = lstm_state 68 | state_out = [lstm_c[:1, :], lstm_h[:1, :]] 69 | return lstm_outputs, state_init, state_in, state_out 70 | 71 | class SingleStepLSTM(object): 72 | 73 | def __init__(self,x,size,step_size): 74 | lstm = rnn.BasicLSTMCell(size, state_is_tuple=True) 75 | 76 | c_init = np.zeros((1, lstm.state_size.c), np.float32) 77 | h_init = np.zeros((1, lstm.state_size.h), np.float32) 78 | self.state_init = [c_init, h_init] 79 | 80 | c_in = tf.placeholder(tf.float32, 81 | shape=[1, lstm.state_size.c], 82 | name='c_in') 83 | h_in = tf.placeholder(tf.float32, 84 | shape=[1, lstm.state_size.h], 85 | name='h_in') 86 | self.state_in = [c_in, h_in] 87 | 88 | state_in = rnn.LSTMStateTuple(c_in, h_in) 89 | 90 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn( 91 | lstm, x, initial_state=state_in, sequence_length=step_size, 92 | time_major=False) 93 | lstm_outputs = tf.reshape(lstm_outputs, [-1, size]) 94 | 95 | lstm_c, lstm_h = lstm_state 96 | self.state_out = [lstm_c[:1, :], lstm_h[:1, :]] 97 | self.output = lstm_outputs 98 | -------------------------------------------------------------------------------- /feudal_networks/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/policies/__init__.py -------------------------------------------------------------------------------- /feudal_networks/policies/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/feudal_networks/policies/configs/__init__.py -------------------------------------------------------------------------------- /feudal_networks/policies/configs/feudal_config.py: -------------------------------------------------------------------------------- 1 | class config(): 2 | alpha = .5 3 | vf_hidden_size = 128 4 | k = 16 #Dimensionality of w 5 | g_dim = 256 6 | c = 10 7 | beta_start = .01 8 | beta_end = .001 9 | decay_steps = 50000000 10 | -------------------------------------------------------------------------------- /feudal_networks/policies/configs/lstm_config.py: -------------------------------------------------------------------------------- 1 | class config(): 2 | size = 256 3 | n_percept_hidden_layer = 4 4 | n_percept_filters = 32 5 
| beta_start = .01 6 | beta_end = .001 7 | decay_steps = 50000000 8 | summary_steps = 10 9 | -------------------------------------------------------------------------------- /feudal_networks/policies/feudal_batch_processor.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from collections import namedtuple 4 | 5 | def cosine_similarity(u, v): 6 | return np.dot(np.squeeze(u),np.squeeze(v)) / (np.linalg.norm(u) * np.linalg.norm(v)) 7 | 8 | Batch = namedtuple("Batch", ["obs", "a", "returns", "s_diff", "ri", "gsum", "features"]) 9 | 10 | class FeudalBatch(object): 11 | def __init__(self): 12 | self.obs = [] 13 | self.a = [] 14 | self.returns = [] 15 | self.s_diff = [] 16 | self.ri = [] 17 | self.gsum = [] 18 | self.features = None 19 | 20 | def add(self, obs, a, returns, s_diff, ri, gsum, features): 21 | self.obs += [obs] 22 | self.a += [a] 23 | self.returns += [returns] 24 | self.s_diff += [s_diff] 25 | self.ri += [ri] 26 | self.gsum += [gsum] 27 | if not self.features: 28 | self.features = features 29 | 30 | def get_batch(self): 31 | batch_obs = np.asarray(self.obs) 32 | batch_a = np.asarray(self.a) 33 | batch_r = np.asarray(self.returns) 34 | batch_sd = np.squeeze(np.asarray(self.s_diff)) 35 | batch_ri = np.asarray(self.ri) 36 | batch_gs = np.asarray(self.gsum) 37 | return Batch(batch_obs,batch_a,batch_r,batch_sd,batch_ri,batch_gs,self.features) 38 | 39 | 40 | 41 | class FeudalBatchProcessor(object): 42 | """ 43 | This class adapts the batch of PolicyOptimizer to a batch useable by 44 | the FeudalPolicy. 45 | """ 46 | def __init__(self, c): 47 | self.c = c 48 | self.last_terminal = True 49 | 50 | def _extend(self, batch): 51 | if self.last_terminal: 52 | self.last_terminal = False 53 | self.s = [batch.s[0] for _ in range(self.c)] 54 | self.g = [batch.g[0] for _ in range(self.c)] 55 | # prepend with dummy values so indexing is the same 56 | self.obs = [None for _ in range(self.c)] 57 | self.a = [None for _ in range(self.c)] 58 | self.returns = [None for _ in range(self.c)] 59 | self.features = [None for _ in range(self.c)] 60 | 61 | # extend with the actual values 62 | self.obs.extend(batch.obs) 63 | self.a.extend(batch.a) 64 | self.returns.extend(batch.returns) 65 | self.s.extend(batch.s) 66 | self.g.extend(batch.g) 67 | self.features.extend(batch.features) 68 | 69 | # if this is a terminal batch, then append the final s and g c times 70 | # note that both this and the above case can occur at the same time 71 | if batch.terminal: 72 | self.s.extend([batch.s[-1] for _ in range(self.c)]) 73 | self.g.extend([batch.g[-1] for _ in range(self.c)]) 74 | 75 | def process_batch(self, batch): 76 | """ 77 | Converts a normal batch into one used by the FeudalPolicy update. 78 | 79 | FeudalPolicy requires a batch of the form: 80 | 81 | c previous timesteps - batch size timesteps - c future timesteps 82 | 83 | This class handles the tracking the leading and following timesteps over 84 | time. Additionally, it also computes values across timesteps from the 85 | batch to provide to FeudalPolicy. 
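        Concretely, for each emitted index t the loop below computes (a plain-text
        sketch of the code that follows; zero-norm state differences are skipped in
        the intrinsic reward):

            s_diff = s[t + c] - s[t]
            ri     = (1 / c) * sum over i in 1..c of cos(s[t] - s[t - i], g[t - i])
            gsum   = g[t - c] + g[t - c + 1] + ... + g[t]

        where cos(u, v) is the cosine_similarity helper defined at the top of this
        file.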
86 | """ 87 | # extend with current batch 88 | self._extend(batch) 89 | 90 | # unpack and compute bounds 91 | length = len(self.obs) 92 | c = self.c 93 | 94 | # normally we cannot compute samples for the last c elements, but 95 | # in the terminal case, we halluciante values where necessary 96 | end = length if batch.terminal else length - c 97 | 98 | # collect samples to return in a FeudalBatch 99 | feudal_batch = FeudalBatch() 100 | for t in range(c, end): 101 | 102 | # state difference 103 | s_diff = self.s[t + c] - self.s[t] 104 | 105 | # intrinsic reward 106 | ri = 0 107 | # note that this for loop considers s and g values 108 | # 1 timestep to c timesteps (inclusively) ago 109 | for i in range(1, c + 1): 110 | ri_s_diff = self.s[t] - self.s[t - i] 111 | if np.linalg.norm(ri_s_diff) != 0: 112 | ri += cosine_similarity(ri_s_diff, self.g[t - i]) 113 | ri /= c 114 | 115 | # sum of g values used to derive w, input to the linear transform 116 | gsum = np.zeros_like(self.g[t - c]) 117 | for i in range(t - c, t + 1): 118 | gsum += self.g[i] 119 | 120 | # add to the batch 121 | feudal_batch.add(self.obs[t], self.a[t], self.returns[t], s_diff, 122 | ri, gsum, self.features[t]) 123 | 124 | # in the terminal case, set reset flag 125 | if batch.terminal: 126 | self.last_terminal = True 127 | # in the general case, forget all but the last 2 * c elements 128 | # reason being that the first c of those we have already computed 129 | # a batch for, and the second c need those first c 130 | else: 131 | twoc = 2 * self.c 132 | self.obs = self.obs[-twoc:] 133 | self.a = self.a[-twoc:] 134 | self.returns = self.returns[-twoc:] 135 | self.s = self.s[-twoc:] 136 | self.g = self.g[-twoc:] 137 | self.features = self.features[-twoc:] 138 | 139 | return feudal_batch.get_batch() 140 | -------------------------------------------------------------------------------- /feudal_networks/policies/feudal_policy.py: -------------------------------------------------------------------------------- 1 | 2 | import distutils.version 3 | import numpy as np 4 | import tensorflow as tf 5 | import tensorflow.contrib.rnn as rnn 6 | 7 | import feudal_networks.policies.policy as policy 8 | import feudal_networks.policies.policy_utils as policy_utils 9 | from feudal_networks.models.models import SingleStepLSTM 10 | from feudal_networks.policies.configs.feudal_config import config 11 | from feudal_networks.policies.feudal_batch_processor import FeudalBatchProcessor 12 | 13 | class FeudalPolicy(policy.Policy): 14 | """ 15 | Policy of the Feudal network architecture. 16 | """ 17 | 18 | def __init__(self, obs_space, act_space,global_step): 19 | self.global_step = global_step 20 | self.obs_space = obs_space 21 | self.act_space = act_space 22 | self.config = config 23 | self.k = config.k #Dimensionality of w 24 | self.g_dim = config.g_dim 25 | self.c = config.c 26 | self.batch_processor = FeudalBatchProcessor(self.c) 27 | self._build_model() 28 | 29 | def _build_model(self): 30 | """ 31 | Builds the manager and worker models. 
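        The shared perception stack (two conv layers plus a dense layer producing z)
        feeds both the manager, whose LSTM emits the internal state s and the
        normalized goal g, and the worker, whose LSTM output U is combined with the
        goal-derived vector w to form the policy logits. state_in / state_out list
        the worker LSTM state followed by the manager LSTM state, matching
        get_initial_features() and act() below.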
32 | """ 33 | with tf.variable_scope('FeUdal'): 34 | self._build_placeholders() 35 | self._build_perception() 36 | self._build_manager() 37 | self._build_worker() 38 | self._build_loss() 39 | self.var_list = tf.get_collection( 40 | tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 41 | # for v in self.var_list: 42 | # print v.name 43 | 44 | self.state_in = [self.worker_lstm.state_in[0],\ 45 | self.worker_lstm.state_in[1],\ 46 | self.manager_lstm.state_in[0],\ 47 | self.manager_lstm.state_in[1]\ 48 | ] 49 | self.state_out = [self.worker_lstm.state_out[0],\ 50 | self.worker_lstm.state_out[1],\ 51 | self.manager_lstm.state_out[0],\ 52 | self.manager_lstm.state_out[1]\ 53 | ] 54 | # for v in self.var_list: 55 | # print v 56 | 57 | def _build_placeholders(self): 58 | #standard for all policies 59 | self.obs = tf.placeholder(tf.float32, [None] + list(self.obs_space)) 60 | self.r = tf.placeholder(tf.float32,(None,)) 61 | self.ac = tf.placeholder(tf.float32,(None,self.act_space)) 62 | self.adv = tf.placeholder(tf.float32, [None]) #unused 63 | 64 | #specific to FeUdal 65 | self.prev_g = tf.placeholder(tf.float32, (None,None,self.g_dim)) 66 | self.ri = tf.placeholder(tf.float32,(None,)) 67 | self.s_diff = tf.placeholder(tf.float32,(None,self.g_dim)) 68 | 69 | 70 | def _build_perception(self): 71 | conv1 = tf.layers.conv2d(inputs=self.obs, 72 | filters=16, 73 | kernel_size=[8, 8], 74 | activation=tf.nn.elu, 75 | strides=4) 76 | conv2 = tf.layers.conv2d(inputs=conv1, 77 | filters=32, 78 | kernel_size=[4,4], 79 | activation=tf.nn.elu, 80 | strides=2) 81 | 82 | flattened_filters = policy_utils.flatten(conv2) 83 | self.z = tf.layers.dense(inputs=flattened_filters,\ 84 | units=256,\ 85 | activation=tf.nn.elu) 86 | 87 | def _build_manager(self): 88 | with tf.variable_scope('manager'): 89 | # Calculate manager internal state 90 | self.s = tf.layers.dense(inputs=self.z,\ 91 | units=self.g_dim,\ 92 | activation=tf.nn.elu) 93 | 94 | # Calculate manager output g 95 | x = tf.expand_dims(self.s, [0]) 96 | self.manager_lstm = SingleStepLSTM(x,\ 97 | self.g_dim,\ 98 | step_size=tf.shape(self.obs)[:1]) 99 | g_hat = self.manager_lstm.output 100 | self.g = tf.nn.l2_normalize(g_hat, dim=1) 101 | 102 | self.manager_vf = self._build_value(g_hat) 103 | # self.manager_vf = tf.Print(self.manager_vf,[self.manager_vf]) 104 | 105 | def _build_worker(self): 106 | with tf.variable_scope('worker'): 107 | num_acts = self.act_space 108 | 109 | # Calculate U 110 | self.worker_lstm = SingleStepLSTM(tf.expand_dims(self.z, [0]),\ 111 | size=num_acts * self.k, 112 | step_size=tf.shape(self.obs)[:1]) 113 | flat_logits = self.worker_lstm.output 114 | 115 | self.worker_vf = self._build_value(flat_logits) 116 | 117 | U = tf.reshape(flat_logits,[-1,num_acts,self.k]) 118 | 119 | # Calculate w 120 | cut_g = tf.stop_gradient(self.g) 121 | cut_g = tf.expand_dims(cut_g, [1]) 122 | gstack = tf.concat([self.prev_g,cut_g], axis=1) 123 | 124 | self.last_c_g = gstack[:,1:] 125 | # print self.last_c_g 126 | gsum = tf.reduce_sum(gstack, axis=1) 127 | phi = tf.get_variable("phi", (self.g_dim, self.k)) 128 | w = tf.matmul(gsum,phi) 129 | w = tf.expand_dims(w,[2]) 130 | # Calculate policy and sample 131 | logits = tf.reshape(tf.matmul(U,w),[-1,num_acts]) 132 | self.pi = tf.nn.softmax(logits) 133 | self.log_pi = tf.nn.log_softmax(logits) 134 | self.sample = policy_utils.categorical_sample( 135 | tf.reshape(logits,[-1,num_acts]), num_acts)[0, :] 136 | 137 | def _build_value(self,input): 138 | with tf.variable_scope('VF'): 139 | hidden = 
tf.layers.dense(inputs=input,\ 140 | units=self.config.vf_hidden_size,\ 141 | activation=tf.nn.elu) 142 | 143 | w = tf.get_variable("weights", (self.config.vf_hidden_size, 1)) 144 | return tf.matmul(hidden,w) 145 | 146 | def _build_loss(self): 147 | cutoff_vf_manager = tf.reshape(tf.stop_gradient(self.manager_vf),[-1]) 148 | dot = tf.reduce_sum(tf.multiply(self.s_diff,self.g ),axis=1) 149 | gcut = tf.stop_gradient(self.g) 150 | mag = tf.norm(self.s_diff,axis=1)*tf.norm(gcut,axis=1)+.0001 151 | dcos = dot/mag 152 | manager_loss = -tf.reduce_sum((self.r-cutoff_vf_manager)*dcos) 153 | 154 | cutoff_vf_worker = tf.reshape(tf.stop_gradient(self.worker_vf),[-1]) 155 | log_p = tf.reduce_sum(self.log_pi*self.ac,[1]) 156 | worker_loss = (self.r + self.config.alpha*self.ri - cutoff_vf_worker)*log_p 157 | worker_loss = -tf.reduce_sum(worker_loss,axis=0) 158 | 159 | Am = self.r-self.manager_vf 160 | manager_vf_loss = .5*tf.reduce_sum(tf.square(Am)) 161 | 162 | Aw = (self.r + self.config.alpha*self.ri)-self.worker_vf 163 | worker_vf_loss = .5*tf.reduce_sum(tf.square(Aw)) 164 | 165 | entropy = -tf.reduce_sum(self.pi * self.log_pi) 166 | 167 | beta = tf.train.polynomial_decay(config.beta_start, self.global_step, 168 | end_learning_rate=config.beta_end, 169 | decay_steps=config.decay_steps, 170 | power=1) 171 | 172 | # worker_loss = tf.Print(worker_loss,[manager_loss,worker_loss,manager_vf_loss,worker_vf_loss,entropy]) 173 | self.loss = worker_loss+manager_loss+\ 174 | worker_vf_loss + manager_vf_loss-\ 175 | entropy*beta 176 | 177 | bs = tf.to_float(tf.shape(self.obs)[0]) 178 | tf.summary.scalar("model/manager_loss", manager_loss / bs) 179 | tf.summary.scalar("model/worker_loss", worker_loss / bs) 180 | tf.summary.scalar("model/value_mean", tf.reduce_mean(self.manager_vf)) 181 | tf.summary.scalar("model/value_loss", manager_vf_loss / bs) 182 | tf.summary.scalar("model/value_loss_scaled", manager_vf_loss / bs * .5) 183 | tf.summary.scalar("model/entropy", entropy / bs) 184 | tf.summary.scalar("model/entropy_loss_scaleed", -entropy / bs * beta) 185 | # tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) 186 | tf.summary.scalar("model/var_global_norm", tf.global_norm(tf.get_collection(\ 187 | tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name))) 188 | tf.summary.scalar("model/beta", beta) 189 | tf.summary.image("model/state", self.obs) 190 | self.summary_op = tf.summary.merge_all() 191 | 192 | 193 | def get_initial_features(self): 194 | return np.zeros((1,1,self.g_dim),np.float32),self.worker_lstm.state_init+self.manager_lstm.state_init 195 | 196 | 197 | def act(self, ob, g,cw,hw,cm,hm): 198 | sess = tf.get_default_session() 199 | return sess.run([self.sample, self.manager_vf, self.g, self.s, self.last_c_g] + self.state_out, 200 | {self.obs: [ob], self.state_in[0]: cw, self.state_in[1]: hw,\ 201 | self.state_in[2]: cm, self.state_in[3]: hm,\ 202 | self.prev_g: g}) 203 | 204 | def value(self, ob, g, cw, hw, cm, hm): 205 | sess = tf.get_default_session() 206 | return sess.run(self.manager_vf, 207 | {self.obs: [ob], self.state_in[0]: cw, self.state_in[1]: hw,\ 208 | self.state_in[2]: cm, self.state_in[3]: hm,\ 209 | self.prev_g: g})[0] 210 | 211 | def update_batch(self,batch): 212 | return self.batch_processor.process_batch(batch) 213 | -------------------------------------------------------------------------------- /feudal_networks/policies/lstm_policy.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | 
import tensorflow as tf 5 | 6 | from feudal_networks.models.models import (linear, conv2d, build_lstm, 7 | normalized_columns_initializer) 8 | import feudal_networks.policies.policy_utils as policy_utils 9 | 10 | from feudal_networks.policies.configs.lstm_config import config 11 | 12 | class LSTMPolicy(object): 13 | def __init__(self, obs_space, act_space,global_step): 14 | self.global_step = global_step 15 | self.obs_space = obs_space 16 | self.act_space = act_space 17 | self.config = config 18 | self.local_steps = 0 19 | # build placeholders 20 | self.obs = x = tf.placeholder(tf.float32, 21 | [None] + list(obs_space), 22 | name='state') 23 | self.adv = tf.placeholder(tf.float32, 24 | [None], 25 | name="adv") 26 | self.ac = tf.placeholder(tf.float32, 27 | [None, act_space], 28 | name="ac") 29 | self.r = tf.placeholder(tf.float32, 30 | [None], 31 | name="r") 32 | 33 | print(self.r) 34 | # build perception 35 | for i in range(config.n_percept_hidden_layer): 36 | x = tf.nn.elu(conv2d(x, config.n_percept_filters, 37 | "l{}".format(i + 1), [3, 3], [2, 2])) 38 | 39 | # introduce a "fake" batch dimension of 1 after flatten so that we 40 | # can do LSTM over time dim 41 | x = tf.expand_dims(policy_utils.flatten(x), [0]) 42 | x, self.state_init, self.state_in, self.state_out = build_lstm( 43 | x, config.size, 'lstm', tf.shape(self.obs)[:1]) 44 | 45 | # on the lstm to output values for both the policy and value function 46 | # add hidden layer to value output so that less of a burden is placed 47 | vfhid = tf.nn.elu(linear(x, config.size, "value_hidden", 48 | normalized_columns_initializer(0.01))) 49 | self.vf = tf.reshape(linear(vfhid, 1, "value", 50 | normalized_columns_initializer(1.0)), [-1]) 51 | 52 | # retrieve logits, sampling op 53 | self.logits = linear(x, act_space, "action", 54 | normalized_columns_initializer(0.01)) 55 | self.sample = policy_utils.categorical_sample( 56 | self.logits, act_space)[0, :] 57 | self.var_list = tf.get_collection( 58 | tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 59 | 60 | # build loss 61 | log_prob_tf = tf.nn.log_softmax(self.logits) 62 | prob_tf = tf.nn.softmax(self.logits) 63 | pi_loss = - tf.reduce_sum( 64 | tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) 65 | entropy = - tf.reduce_sum(prob_tf * log_prob_tf) 66 | vf_loss = 0.5 * tf.reduce_sum(tf.square(self.vf - self.r)) 67 | beta = tf.train.polynomial_decay(config.beta_start, self.global_step, 68 | end_learning_rate=config.beta_end, 69 | decay_steps=config.decay_steps, 70 | power=1) 71 | self.loss = pi_loss + 0.5 * vf_loss - entropy * beta 72 | 73 | # summaries 74 | bs = tf.to_float(tf.shape(self.obs)[0]) 75 | tf.summary.scalar("model/policy_loss", pi_loss / bs) 76 | tf.summary.scalar("model/value_mean", tf.reduce_mean(self.vf)) 77 | tf.summary.scalar("model/value_loss", vf_loss / bs) 78 | tf.summary.scalar("model/value_loss_scaled", vf_loss / bs * .5) 79 | tf.summary.scalar("model/entropy", entropy / bs) 80 | tf.summary.scalar("model/entropy_loss_scaleed", -entropy / bs * beta) 81 | # tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) 82 | tf.summary.scalar("model/var_global_norm", tf.global_norm(self.var_list)) 83 | tf.summary.scalar("model/beta", beta) 84 | tf.summary.image("model/state", self.obs) 85 | self.summary_op = tf.summary.merge_all() 86 | 87 | def get_initial_features(self): 88 | return self.state_init 89 | 90 | def act(self, ob, c, h): 91 | sess = tf.get_default_session() 92 | return sess.run([self.sample, self.vf] + self.state_out, 93 | {self.obs: 
[ob], self.state_in[0]: c, self.state_in[1]: h}) 94 | 95 | def value(self, ob, c, h): 96 | sess = tf.get_default_session() 97 | return sess.run(self.vf, {self.obs: [ob], self.state_in[0]: c, self.state_in[1]: h})[0] 98 | 99 | def update_batch(self,batch): 100 | return batch 101 | -------------------------------------------------------------------------------- /feudal_networks/policies/policy.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Policy(object): 4 | """ 5 | An abstract class defining a learned policy to be used for a Reinforcment 6 | Learning problem. This class interfaces with a policy optimizer class 7 | that oversees training the policy on some environment. 8 | 9 | The policy needs three externally facing methods: 10 | act() 11 | value() 12 | update() 13 | Which are further documented below. 14 | 15 | Further, upon initialization the following member variables should be 16 | defined: 17 | loss - The tensorflow operation defining the loss function of the 18 | policy with respect to a batch of training data 19 | var_list - The variables that should be trained by the optimizer 20 | internals_in- A list of placeholder variables needed at runtime 21 | in order to calculate act(), value() or update() 22 | (e.g. internal LSTM state) 23 | """ 24 | def __init__(self,obs_space,act_space,config): 25 | raise NotImplementedError("Please Implement this method") 26 | 27 | def _build_model(self): 28 | raise NotImplementedError("Please Implement this method") 29 | 30 | def _build_placeholders(self): 31 | raise NotImplementedError("Please Implement this method") 32 | 33 | def _build_loss(self): 34 | """ 35 | Should initialize self.loss to be a tensorflow operation that calculates 36 | the loss funtion for the current policy 37 | """ 38 | raise NotImplementedError("Please Implement this method") 39 | 40 | def act(self, obs, prev_internal): 41 | raise NotImplementedError("Please Implement this method") 42 | 43 | def value(self, obs,prev_internal): 44 | raise NotImplementedError("Please Implement this method") 45 | 46 | def update(self, sess, train_op, batch): 47 | raise NotImplementedError("Please Implement this method") 48 | -------------------------------------------------------------------------------- /feudal_networks/policies/policy_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | def flatten(x): 6 | return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])]) 7 | 8 | def categorical_sample(logits, d): 9 | value = tf.squeeze(tf.multinomial(logits - tf.reduce_max( 10 | logits, [1], keep_dims=True), 1), [1]) 11 | return tf.one_hot(value, d) 12 | -------------------------------------------------------------------------------- /scripts/training/README.md: -------------------------------------------------------------------------------- 1 | scripts for executing training -------------------------------------------------------------------------------- /scripts/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/scripts/training/__init__.py -------------------------------------------------------------------------------- /scripts/training/envs.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from gym.spaces.box import Box 3 | import 
numpy as np 4 | import gym 5 | from gym import spaces 6 | import logging 7 | import universe 8 | from universe import vectorized 9 | from universe.wrappers import BlockingReset, GymCoreAction, EpisodeID, Unvectorize, Vectorize, Vision, Logger 10 | from universe import spaces as vnc_spaces 11 | from universe.spaces.vnc_event import keycode 12 | import time 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | universe.configure_logging() 16 | 17 | # for environments 18 | import feudal_networks.envs 19 | 20 | def create_env(env_id, client_id, remotes, **kwargs): 21 | spec = gym.spec(env_id) 22 | 23 | if spec.tags.get('feudal', False): 24 | return create_feudal_env(env_id, client_id, remotes, **kwargs) 25 | elif spec.tags.get('flashgames', False): 26 | return create_flash_env(env_id, client_id, remotes, **kwargs) 27 | elif spec.tags.get('atari', False) and spec.tags.get('vnc', False): 28 | return create_vncatari_env(env_id, client_id, remotes, **kwargs) 29 | else: 30 | # Assume atari. 31 | assert "." not in env_id # universe environments have dots in names. 32 | return create_atari_env(env_id) 33 | 34 | def create_feudal_env(env_id, client_id, remotes, **_): 35 | env = gym.make(env_id) 36 | return env 37 | 38 | def create_flash_env(env_id, client_id, remotes, **_): 39 | env = gym.make(env_id) 40 | env = Vision(env) 41 | env = Logger(env) 42 | env = BlockingReset(env) 43 | 44 | reg = universe.runtime_spec('flashgames').server_registry 45 | height = reg[env_id]["height"] 46 | width = reg[env_id]["width"] 47 | env = CropScreen(env, height, width, 84, 18) 48 | env = FlashRescale(env) 49 | 50 | keys = ['left', 'right', 'up', 'down', 'x'] 51 | if env_id == 'flashgames.NeonRace-v0': 52 | # Better key space for this game. 53 | keys = ['left', 'right', 'up', 'left up', 'right up', 'down', 'up x'] 54 | logger.info('create_flash_env(%s): keys=%s', env_id, keys) 55 | 56 | env = DiscreteToFixedKeysVNCActions(env, keys) 57 | env = EpisodeID(env) 58 | env = DiagnosticsInfo(env) 59 | env = Unvectorize(env) 60 | env.configure(fps=5.0, remotes=remotes, start_timeout=15 * 60, client_id=client_id, 61 | vnc_driver='go', vnc_kwargs={ 62 | 'encoding': 'tight', 'compress_level': 0, 63 | 'fine_quality_level': 50, 'subsample_level': 3}) 64 | return env 65 | 66 | def create_vncatari_env(env_id, client_id, remotes, **_): 67 | env = gym.make(env_id) 68 | env = Vision(env) 69 | env = Logger(env) 70 | env = BlockingReset(env) 71 | env = GymCoreAction(env) 72 | env = AtariRescale42x42(env) 73 | env = EpisodeID(env) 74 | env = DiagnosticsInfo(env) 75 | env = Unvectorize(env) 76 | 77 | logger.info('Connecting to remotes: %s', remotes) 78 | fps = env.metadata['video.frames_per_second'] 79 | env.configure(remotes=remotes, start_timeout=15 * 60, fps=fps, client_id=client_id) 80 | return env 81 | 82 | def create_atari_env(env_id): 83 | env = gym.make(env_id) 84 | env = Vectorize(env) 85 | env = AtariRescale42x42(env) 86 | env = DiagnosticsInfo(env) 87 | env = Unvectorize(env) 88 | return env 89 | 90 | def DiagnosticsInfo(env, *args, **kwargs): 91 | return vectorized.VectorizeFilter(env, DiagnosticsInfoI, *args, **kwargs) 92 | 93 | class DiagnosticsInfoI(vectorized.Filter): 94 | def __init__(self, log_interval=503): 95 | super(DiagnosticsInfoI, self).__init__() 96 | 97 | self._episode_time = time.time() 98 | self._last_time = time.time() 99 | self._local_t = 0 100 | self._log_interval = log_interval 101 | self._episode_reward = 0 102 | self._episode_length = 0 103 | self._all_rewards = [] 104 | 
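        # Counters used by _after_step below: _local_t counts environment steps across
        # episodes and triggers a diagnostics dump every _log_interval steps (wall-clock
        # fps, action/observation lag, clock skew), while _num_vnc_updates accumulates
        # "stats.vnc.updates.n" between dumps so a per-second update rate can be reported.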
self._num_vnc_updates = 0 105 | self._last_episode_id = -1 106 | 107 | def _after_reset(self, observation): 108 | logger.info('Resetting environment') 109 | self._episode_reward = 0 110 | self._episode_length = 0 111 | self._all_rewards = [] 112 | return observation 113 | 114 | def _after_step(self, observation, reward, done, info): 115 | to_log = {} 116 | if self._episode_length == 0: 117 | self._episode_time = time.time() 118 | 119 | self._local_t += 1 120 | if info.get("stats.vnc.updates.n") is not None: 121 | self._num_vnc_updates += info.get("stats.vnc.updates.n") 122 | 123 | if self._local_t % self._log_interval == 0: 124 | cur_time = time.time() 125 | elapsed = cur_time - self._last_time 126 | fps = self._log_interval / elapsed 127 | self._last_time = cur_time 128 | cur_episode_id = info.get('vectorized.episode_id', 0) 129 | to_log["diagnostics/fps"] = fps 130 | if self._last_episode_id == cur_episode_id: 131 | to_log["diagnostics/fps_within_episode"] = fps 132 | self._last_episode_id = cur_episode_id 133 | if info.get("stats.gauges.diagnostics.lag.action") is not None: 134 | to_log["diagnostics/action_lag_lb"] = info["stats.gauges.diagnostics.lag.action"][0] 135 | to_log["diagnostics/action_lag_ub"] = info["stats.gauges.diagnostics.lag.action"][1] 136 | if info.get("reward.count") is not None: 137 | to_log["diagnostics/reward_count"] = info["reward.count"] 138 | if info.get("stats.gauges.diagnostics.clock_skew") is not None: 139 | to_log["diagnostics/clock_skew_lb"] = info["stats.gauges.diagnostics.clock_skew"][0] 140 | to_log["diagnostics/clock_skew_ub"] = info["stats.gauges.diagnostics.clock_skew"][1] 141 | if info.get("stats.gauges.diagnostics.lag.observation") is not None: 142 | to_log["diagnostics/observation_lag_lb"] = info["stats.gauges.diagnostics.lag.observation"][0] 143 | to_log["diagnostics/observation_lag_ub"] = info["stats.gauges.diagnostics.lag.observation"][1] 144 | 145 | if info.get("stats.vnc.updates.n") is not None: 146 | to_log["diagnostics/vnc_updates_n"] = info["stats.vnc.updates.n"] 147 | to_log["diagnostics/vnc_updates_n_ps"] = self._num_vnc_updates / elapsed 148 | self._num_vnc_updates = 0 149 | if info.get("stats.vnc.updates.bytes") is not None: 150 | to_log["diagnostics/vnc_updates_bytes"] = info["stats.vnc.updates.bytes"] 151 | if info.get("stats.vnc.updates.pixels") is not None: 152 | to_log["diagnostics/vnc_updates_pixels"] = info["stats.vnc.updates.pixels"] 153 | if info.get("stats.vnc.updates.rectangles") is not None: 154 | to_log["diagnostics/vnc_updates_rectangles"] = info["stats.vnc.updates.rectangles"] 155 | if info.get("env_status.state_id") is not None: 156 | to_log["diagnostics/env_state_id"] = info["env_status.state_id"] 157 | 158 | if reward is not None: 159 | self._episode_reward += reward 160 | if observation is not None: 161 | self._episode_length += 1 162 | self._all_rewards.append(reward) 163 | 164 | if done: 165 | logger.info('Episode terminating: episode_reward=%s episode_length=%s', self._episode_reward, self._episode_length) 166 | total_time = time.time() - self._episode_time 167 | to_log["global/episode_reward"] = self._episode_reward 168 | to_log["global/episode_length"] = self._episode_length 169 | to_log["global/episode_time"] = total_time 170 | to_log["global/reward_per_time"] = self._episode_reward / total_time 171 | self._episode_reward = 0 172 | self._episode_length = 0 173 | self._all_rewards = [] 174 | 175 | return observation, reward, done, to_log 176 | 177 | def _process_frame42(frame): 178 | frame = frame[34:34+160, 
:160] 179 | # Resize by half, then down to 42x42 (essentially mipmapping). If 180 | # we resize directly we lose pixels that, when mapped to 42x42, 181 | # aren't close enough to the pixel boundary. 182 | frame = cv2.resize(frame, (80, 80)) 183 | frame = cv2.resize(frame, (42, 42)) 184 | frame = frame.mean(2) 185 | frame = frame.astype(np.float32) 186 | frame *= (1.0 / 255.0) 187 | frame = np.reshape(frame, [42, 42, 1]) 188 | return frame 189 | 190 | class AtariRescale42x42(vectorized.ObservationWrapper): 191 | def __init__(self, env=None): 192 | super(AtariRescale42x42, self).__init__(env) 193 | self.observation_space = Box(0.0, 1.0, [42, 42, 1]) 194 | 195 | def _observation(self, observation_n): 196 | return [_process_frame42(observation) for observation in observation_n] 197 | 198 | class FixedKeyState(object): 199 | def __init__(self, keys): 200 | self._keys = [keycode(key) for key in keys] 201 | self._down_keysyms = set() 202 | 203 | def apply_vnc_actions(self, vnc_actions): 204 | for event in vnc_actions: 205 | if isinstance(event, vnc_spaces.KeyEvent): 206 | if event.down: 207 | self._down_keysyms.add(event.key) 208 | else: 209 | self._down_keysyms.discard(event.key) 210 | 211 | def to_index(self): 212 | action_n = 0 213 | for key in self._down_keysyms: 214 | if key in self._keys: 215 | # If multiple keys are pressed, just use the first one 216 | action_n = self._keys.index(key) + 1 217 | break 218 | return action_n 219 | 220 | class DiscreteToFixedKeysVNCActions(vectorized.ActionWrapper): 221 | """ 222 | Define a fixed action space. Action 0 is all keys up. Each element of keys can be a single key or a space-separated list of keys 223 | 224 | For example, 225 | e=DiscreteToFixedKeysVNCActions(e, ['left', 'right']) 226 | will have 3 actions: [none, left, right] 227 | 228 | You can define a state with more than one key down by separating with spaces. For example, 229 | e=DiscreteToFixedKeysVNCActions(e, ['left', 'right', 'space', 'left space', 'right space']) 230 | will have 6 actions: [none, left, right, space, left space, right space] 231 | """ 232 | def __init__(self, env, keys): 233 | super(DiscreteToFixedKeysVNCActions, self).__init__(env) 234 | 235 | self._keys = keys 236 | self._generate_actions() 237 | self.action_space = spaces.Discrete(len(self._actions)) 238 | 239 | def _generate_actions(self): 240 | self._actions = [] 241 | uniq_keys = set() 242 | for key in self._keys: 243 | for cur_key in key.split(' '): 244 | uniq_keys.add(cur_key) 245 | 246 | for key in [''] + self._keys: 247 | split_keys = key.split(' ') 248 | cur_action = [] 249 | for cur_key in uniq_keys: 250 | cur_action.append(vnc_spaces.KeyEvent.by_name(cur_key, down=(cur_key in split_keys))) 251 | self._actions.append(cur_action) 252 | self.key_state = FixedKeyState(uniq_keys) 253 | 254 | def _action(self, action_n): 255 | # Each action might be a length-1 np.array. Cast to int to 256 | # avoid warnings. 
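        # For example, with DiscreteToFixedKeysVNCActions(env, ['left', 'right']) the
        # wrapper exposes Discrete(3): index 0 maps to every key up, and index 2 maps to a
        # KeyEvent list in which only 'right' is down; each discrete index expands to one
        # KeyEvent per unique key named in self._keys (see _generate_actions above).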
257 | return [self._actions[int(action)] for action in action_n] 258 | 259 | class CropScreen(vectorized.ObservationWrapper): 260 | """Crops out a [height]x[width] area starting from (top,left) """ 261 | def __init__(self, env, height, width, top=0, left=0): 262 | super(CropScreen, self).__init__(env) 263 | self.height = height 264 | self.width = width 265 | self.top = top 266 | self.left = left 267 | self.observation_space = Box(0, 255, shape=(height, width, 3)) 268 | 269 | def _observation(self, observation_n): 270 | return [ob[self.top:self.top+self.height, self.left:self.left+self.width, :] if ob is not None else None 271 | for ob in observation_n] 272 | 273 | def _process_frame_flash(frame): 274 | frame = cv2.resize(frame, (200, 128)) 275 | frame = frame.mean(2).astype(np.float32) 276 | frame *= (1.0 / 255.0) 277 | frame = np.reshape(frame, [128, 200, 1]) 278 | return frame 279 | 280 | class FlashRescale(vectorized.ObservationWrapper): 281 | def __init__(self, env=None): 282 | super(FlashRescale, self).__init__(env) 283 | self.observation_space = Box(0.0, 1.0, [128, 200, 1]) 284 | 285 | def _observation(self, observation_n): 286 | return [_process_frame_flash(observation) for observation in observation_n] 287 | -------------------------------------------------------------------------------- /scripts/training/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from six.moves import shlex_quote 5 | 6 | parser = argparse.ArgumentParser(description="Run commands") 7 | parser.add_argument('-w', '--num-workers', default=1, type=int, 8 | help="Number of workers") 9 | parser.add_argument('-r', '--remotes', default=None, 10 | help='The address of pre-existing VNC servers and ' 11 | 'rewarders to use (e.g. -r vnc://localhost:5900+15900,vnc://localhost:5901+15901).') 12 | parser.add_argument('-e', '--env-id', type=str, default="PongDeterministic-v4", 13 | help="Environment id") 14 | parser.add_argument('-l', '--log-dir', type=str, default="/tmp/pong", 15 | help="Log directory path") 16 | parser.add_argument('-n', '--dry-run', action='store_true', 17 | help="Print out commands rather than executing them") 18 | parser.add_argument('-m', '--mode', type=str, default='tmux', 19 | help="tmux: run workers in a tmux session. nohup: run workers with nohup. child: run workers as child processes") 20 | parser.add_argument('-p', '--policy', type=str, default='lstm', 21 | help="lstm or feudal policy") 22 | 23 | # Add visualise tag 24 | parser.add_argument('--visualise', action='store_true', 25 | help="Visualise the gym environment by running env.render() between each timestep") 26 | 27 | 28 | def new_cmd(session, name, cmd, mode, logdir, shell): 29 | if isinstance(cmd, (list, tuple)): 30 | cmd = " ".join(shlex_quote(str(v)) for v in cmd) 31 | if mode == 'tmux': 32 | return name, "tmux send-keys -t {}:{} {} Enter".format(session, name, shlex_quote(cmd)) 33 | elif mode == 'child': 34 | return name, "{} >{}/{}.{}.out 2>&1 & echo kill $! >>{}/kill.sh".format(cmd, logdir, session, name, logdir) 35 | elif mode == 'nohup': 36 | return name, "nohup {} -c {} >{}/{}.{}.out 2>&1 & echo kill $! 
>>{}/kill.sh".format(shell, shlex_quote(cmd), logdir, session, name, logdir) 37 | 38 | 39 | def create_commands(session, num_workers, remotes, env_id, logdir, shell='bash', 40 | policy='lstm', mode='tmux', visualise=False): 41 | # for launching the TF workers and for launching tensorboard 42 | base_cmd = [ 43 | 'CUDA_VISIBLE_DEVICES=', 44 | sys.executable, 'worker.py', 45 | '--log-dir', logdir, 46 | '--env-id', env_id, 47 | '--num-workers', str(num_workers)] 48 | 49 | if visualise: 50 | base_cmd += ['--visualise'] 51 | 52 | if remotes is None: 53 | remotes = ["1"] * num_workers 54 | else: 55 | remotes = remotes.split(',') 56 | assert len(remotes) == num_workers 57 | 58 | cmds_map = [new_cmd(session, "ps", base_cmd + ["--job-name", "ps"], mode, logdir, shell)] 59 | for i in range(num_workers): 60 | cmds_map += [new_cmd(session, 61 | "w-%d" % i, base_cmd + ["--job-name", "worker", "--task", str(i), "--remotes", remotes[i], "--policy", policy], mode, logdir, shell)] 62 | 63 | cmds_map += [new_cmd(session, "tb", ["tensorboard", "--logdir", logdir, "--port", "12345"], mode, logdir, shell)] 64 | if mode == 'tmux': 65 | cmds_map += [new_cmd(session, "htop", ["htop"], mode, logdir, shell)] 66 | 67 | windows = [v[0] for v in cmds_map] 68 | 69 | notes = [] 70 | cmds = [ 71 | "mkdir -p {}".format(logdir), 72 | "echo {} {} > {}/cmd.sh".format(sys.executable, ' '.join([shlex_quote(arg) for arg in sys.argv if arg != '-n']), logdir), 73 | ] 74 | if mode == 'nohup' or mode == 'child': 75 | cmds += ["echo '#!/bin/sh' >{}/kill.sh".format(logdir)] 76 | notes += ["Run `source {}/kill.sh` to kill the job".format(logdir)] 77 | if mode == 'tmux': 78 | notes += ["Use `tmux attach -t {}` to watch process output".format(session)] 79 | notes += ["Use `tmux kill-session -t {}` to kill the job".format(session)] 80 | else: 81 | notes += ["Use `tail -f {}/*.out` to watch process output".format(logdir)] 82 | notes += ["Point your browser to http://localhost:12345 to see Tensorboard"] 83 | 84 | if mode == 'tmux': 85 | cmds += [ 86 | "kill $( lsof -i:12345 -t ) > /dev/null 2>&1", # kill any process using tensorboard's port 87 | "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(num_workers+12222), # kill any processes using ps / worker ports 88 | "tmux kill-session -t {}".format(session), 89 | "tmux new-session -s {} -n {} -d {}".format(session, windows[0], shell) 90 | ] 91 | for w in windows[1:]: 92 | cmds += ["tmux new-window -t {} -n {} {}".format(session, w, shell)] 93 | cmds += ["sleep 1"] 94 | for window, cmd in cmds_map: 95 | cmds += [cmd] 96 | 97 | return cmds, notes 98 | 99 | 100 | def run(): 101 | args = parser.parse_args() 102 | cmds, notes = create_commands("a3c", args.num_workers, args.remotes, args.env_id, args.log_dir, policy=args.policy, mode=args.mode, visualise=args.visualise) 103 | if args.dry_run: 104 | print("Dry-run mode due to -n flag, otherwise the following commands would be executed:") 105 | else: 106 | print("Executing the following commands:") 107 | print("\n".join(cmds)) 108 | print("") 109 | if not args.dry_run: 110 | if args.mode == "tmux": 111 | os.environ["TMUX"] = "" 112 | os.system("\n".join(cmds)) 113 | print('\n'.join(notes)) 114 | 115 | 116 | if __name__ == "__main__": 117 | run() -------------------------------------------------------------------------------- /scripts/training/worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import cv2 3 | import go_vncdriver 4 | import tensorflow as tf 5 | import argparse 6 
| import logging 7 | import sys, signal 8 | import time 9 | import os 10 | from envs import create_env 11 | from feudal_networks.algos.policy_optimizer import PolicyOptimizer 12 | from feudal_networks.algos.feudal_policy_optimizer import FeudalPolicyOptimizer 13 | import distutils.version 14 | use_tf12_api = distutils.version.LooseVersion(tf.VERSION) >= distutils.version.LooseVersion('0.12.0') 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | # Disables write_meta_graph argument, which freezes entire process and is mostly useless. 20 | class FastSaver(tf.train.Saver): 21 | def save(self, sess, save_path, global_step=None, latest_filename=None, 22 | meta_graph_suffix="meta", write_meta_graph=True): 23 | super(FastSaver, self).save(sess, save_path, global_step, latest_filename, 24 | meta_graph_suffix, False) 25 | 26 | def run(args, server): 27 | env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes) 28 | if args.policy == 'lstm': 29 | trainer = PolicyOptimizer(env, args.task, args.policy,args.visualise) 30 | elif args.policy == 'feudal': 31 | trainer = FeudalPolicyOptimizer(env, args.task, args.policy,args.visualise) 32 | else: 33 | print('Invalid policy type') 34 | exit(0) 35 | 36 | 37 | # Variable names that start with "local" are not saved in checkpoints. 38 | if use_tf12_api: 39 | variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")] 40 | init_op = tf.variables_initializer(variables_to_save) 41 | init_all_op = tf.global_variables_initializer() 42 | else: 43 | variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")] 44 | init_op = tf.initialize_variables(variables_to_save) 45 | init_all_op = tf.initialize_all_variables() 46 | saver = FastSaver(variables_to_save) 47 | 48 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 49 | logger.info('Trainable vars:') 50 | for v in var_list: 51 | logger.info(' %s %s', v.name, v.get_shape()) 52 | 53 | def init_fn(ses): 54 | logger.info("Initializing all parameters.") 55 | ses.run(init_all_op) 56 | 57 | config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)]) 58 | logdir = os.path.join(args.log_dir, 'train') 59 | 60 | if use_tf12_api: 61 | summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task) 62 | else: 63 | summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task) 64 | 65 | logger.info("Events directory: %s_%s", logdir, args.task) 66 | sv = tf.train.Supervisor(is_chief=(args.task == 0), 67 | logdir=logdir, 68 | saver=saver, 69 | summary_op=None, 70 | init_op=init_op, 71 | init_fn=init_fn, 72 | summary_writer=summary_writer, 73 | ready_op=tf.report_uninitialized_variables(variables_to_save), 74 | global_step=trainer.global_step, 75 | save_model_secs=30, 76 | save_summaries_secs=30) 77 | 78 | num_global_steps = 100000000 79 | 80 | logger.info( 81 | "Starting session. If this hangs, we're mostly likely waiting to connect to the parameter server. 
" + 82 | "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.") 83 | with sv.managed_session(server.target, config=config) as sess, sess.as_default(): 84 | sess.run(trainer.sync) 85 | trainer.start(sess, summary_writer) 86 | global_step = sess.run(trainer.global_step) 87 | logger.info("Starting training at step=%d", global_step) 88 | while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps): 89 | trainer.train(sess) 90 | global_step = sess.run(trainer.global_step) 91 | 92 | # Ask for all the services to stop. 93 | sv.stop() 94 | logger.info('reached %s steps. worker stopped.', global_step) 95 | 96 | def cluster_spec(num_workers, num_ps): 97 | """ 98 | More tensorflow setup for data parallelism 99 | """ 100 | cluster = {} 101 | port = 12222 102 | 103 | all_ps = [] 104 | host = '127.0.0.1' 105 | for _ in range(num_ps): 106 | all_ps.append('{}:{}'.format(host, port)) 107 | port += 1 108 | cluster['ps'] = all_ps 109 | 110 | all_workers = [] 111 | for _ in range(num_workers): 112 | all_workers.append('{}:{}'.format(host, port)) 113 | port += 1 114 | cluster['worker'] = all_workers 115 | return cluster 116 | 117 | def main(_): 118 | """ 119 | Setting up Tensorflow for data parallel work 120 | """ 121 | 122 | parser = argparse.ArgumentParser(description=None) 123 | parser.add_argument('-v', '--verbose', action='count', dest='verbosity', default=0, help='Set verbosity.') 124 | parser.add_argument('--task', default=0, type=int, help='Task index') 125 | parser.add_argument('--job-name', default="worker", help='worker or ps') 126 | parser.add_argument('--num-workers', default=1, type=int, help='Number of workers') 127 | parser.add_argument('--log-dir', default="/tmp/pong", help='Log directory path') 128 | parser.add_argument('--env-id', default="PongDeterministic-v4", help='Environment id') 129 | parser.add_argument('--policy', type=str, default='lstm', help="lstm or feudal policy") 130 | parser.add_argument('-r', '--remotes', default=None, 131 | help='References to environments to create (e.g. -r 20), ' 132 | 'or the address of pre-existing VNC servers and ' 133 | 'rewarders to use (e.g. 
-r vnc://localhost:5900+15900,vnc://localhost:5901+15901)') 134 | 135 | # Add visualisation argument 136 | parser.add_argument('--visualise', action='store_true', 137 | help="Visualise the gym environment by running env.render() between each timestep") 138 | 139 | args = parser.parse_args() 140 | spec = cluster_spec(args.num_workers, 1) 141 | cluster = tf.train.ClusterSpec(spec).as_cluster_def() 142 | 143 | def shutdown(signal, frame): 144 | logger.warn('Received signal %s: exiting', signal) 145 | sys.exit(128+signal) 146 | signal.signal(signal.SIGHUP, shutdown) 147 | signal.signal(signal.SIGINT, shutdown) 148 | signal.signal(signal.SIGTERM, shutdown) 149 | 150 | if args.job_name == "worker": 151 | server = tf.train.Server(cluster, job_name="worker", task_index=args.task, 152 | config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)) 153 | run(args, server) 154 | else: 155 | server = tf.train.Server(cluster, job_name="ps", task_index=args.task, 156 | config=tf.ConfigProto(device_filters=["/job:ps"])) 157 | while True: 158 | time.sleep(1000) 159 | 160 | if __name__ == "__main__": 161 | tf.app.run() 162 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/tests/__init__.py -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | test_loader = unittest.defaultTestLoader.discover( '.' ) 4 | test_runner = unittest.TextTestRunner(verbosity=2) 5 | test_runner.run(test_loader) -------------------------------------------------------------------------------- /tests/test_algos/test_feudal_policy_optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import unittest 4 | import tensorflow as tf 5 | 6 | from feudal_networks.algos.feudal_policy_optimizer import FeudalPolicyOptimizer 7 | from feudal_networks.policies.feudal_policy import FeudalPolicy 8 | 9 | import feudal_networks.envs.debug_envs 10 | 11 | class TestFeudalPolicyOptimizer(unittest.TestCase): 12 | 13 | def test_init(self): 14 | env = gym.make('OneRoundDeterministicRewardBoxObs-v0') 15 | with tf.Session() as session: 16 | feudal_opt = FeudalPolicyOptimizer(env, 0, 'feudal', False) 17 | 18 | if __name__ == '__main__': 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /tests/test_envs/test_vision_maze.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import unittest 4 | 5 | 6 | 7 | from feudal_networks.envs.vision_maze import VisionMazeEnv 8 | 9 | def to_coords(x): 10 | return list(v[0] for v in np.where(x)[:2]) 11 | 12 | class TestVisionMaze(unittest.TestCase): 13 | 14 | def test_step(self): 15 | maze = VisionMazeEnv(room_length=3, num_rooms_per_side=2) 16 | 17 | # up until wall 18 | a = 0 19 | maze.state = np.array([0, 0]) 20 | nx, _, _, _ = maze.step(a) 21 | nx, _, _, _ = maze.step(a) 22 | np.testing.assert_array_equal(to_coords(nx), [0,2]) 23 | nx, _, _, _ = maze.step(a) 24 | np.testing.assert_array_equal(to_coords(nx), [0,2]) 25 | 26 | # down until wall 27 | a = 2 28 | maze.state = np.array([0, 2]) 29 | nx, _, _, _ = maze.step(a) 30 | nx, _, _, 
_ = maze.step(a) 31 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 32 | nx, _, _, _ = maze.step(a) 33 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 34 | 35 | # right until wall 36 | maze.state = np.array([0, 0]) 37 | a = 1 38 | nx, _, _, _ = maze.step(a) 39 | nx, _, _, _ = maze.step(a) 40 | np.testing.assert_array_equal(to_coords(nx), [2,0]) 41 | nx, _, _, _ = maze.step(a) 42 | np.testing.assert_array_equal(to_coords(nx), [2,0]) 43 | 44 | # left until wall 45 | maze.state = np.array([2, 0]) 46 | a = 3 47 | nx, _, _, _ = maze.step(a) 48 | nx, _, _, _ = maze.step(a) 49 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 50 | nx, _, _, _ = maze.step(a) 51 | np.testing.assert_array_equal(to_coords(nx), [0,0]) 52 | 53 | # through doorway to the right until wall 54 | maze.state = np.array([0, 0]) 55 | nx, _, _, _ = maze.step(0) # up 56 | nx, _, _, _ = maze.step(1) # right 57 | nx, _, _, _ = maze.step(1) # right 58 | nx, _, _, _ = maze.step(1) # right 59 | nx, _, _, _ = maze.step(1) # right 60 | nx, _, _, _ = maze.step(1) # right 61 | np.testing.assert_array_equal(to_coords(nx), [5,1]) 62 | nx, _, _, _ = maze.step(1) # right 63 | np.testing.assert_array_equal(to_coords(nx), [5,1]) 64 | 65 | # back through the doorway I came, and then up through the other doorway 66 | maze.state = np.array([5, 1]) 67 | nx, _, _, _ = maze.step(3) # left 68 | nx, _, _, _ = maze.step(3) # left 69 | nx, _, _, _ = maze.step(3) # left 70 | nx, _, _, _ = maze.step(3) # left 71 | nx, _, _, _ = maze.step(0) # up 72 | nx, _, _, _ = maze.step(0) # up 73 | nx, _, _, _ = maze.step(0) # up 74 | nx, _, _, _ = maze.step(0) # up 75 | np.testing.assert_array_equal(to_coords(nx), [1,5]) 76 | nx, _, _, _ = maze.step(0) # up 77 | np.testing.assert_array_equal(to_coords(nx), [1,5]) 78 | 79 | # to the goal state 80 | maze.state = np.array([1, 5]) 81 | nx, _, _, _ = maze.step(1) # right 82 | nx, _, _, _ = maze.step(2) # down 83 | nx, _, _, _ = maze.step(1) # right 84 | nx, _, _, _ = maze.step(1) # right 85 | nx, _, _, _ = maze.step(1) # right 86 | nx, _, _, _ = maze.step(0) # up 87 | np.testing.assert_array_equal(to_coords(nx), [5,5]) 88 | 89 | # down until wall 90 | maze.state = np.array([5, 5]) 91 | nx, _, _, _ = maze.step(2) # down 92 | nx, _, _, _ = maze.step(2) # down 93 | np.testing.assert_array_equal(to_coords(nx), [5,3]) 94 | nx, _, _, _ = maze.step(2) # down 95 | np.testing.assert_array_equal(to_coords(nx), [5,3]) 96 | 97 | if __name__ == '__main__': 98 | unittest.main() -------------------------------------------------------------------------------- /tests/test_policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidhershey/feudal_networks/5c988023d87206a739b44cd59f60556c7de4e289/tests/test_policies/__init__.py -------------------------------------------------------------------------------- /tests/test_policies/test_feudal_batch_processor.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import unittest 4 | 5 | from feudal_networks.policies.feudal_batch_processor import FeudalBatchProcessor, FeudalBatch 6 | from feudal_networks.algos.policy_optimizer import Batch 7 | 8 | class TestFeudalBatchProcessor(unittest.TestCase): 9 | 10 | def test_simple_c_1(self): 11 | # simple case ignoring the fact that the different list have 12 | # elements with different types 13 | c = 1 14 | fbp = FeudalBatchProcessor(c) 15 | 16 | obs = [1,2] 17 | a = [1,2] 18 | returns = [1,2] 19 
| terminal = False 20 | g = [1,2] 21 | s = [1,2] 22 | features = [1,2] 23 | b = Batch(obs, a, returns, terminal, g, s, features) 24 | fb = fbp.process_batch(b) 25 | np.testing.assert_array_equal(fb.obs, [1]) 26 | np.testing.assert_array_equal(fb.a, [1]) 27 | np.testing.assert_array_equal(fb.returns, [1]) 28 | np.testing.assert_array_equal(fb.s_diff, [1]) 29 | np.testing.assert_array_equal(fb.ri, [0]) 30 | np.testing.assert_array_equal(fb.gsum, [2]) 31 | np.testing.assert_array_equal(fb.features, [1]) 32 | 33 | obs = [3,4] 34 | a = [3,4] 35 | returns = [3,4] 36 | terminal = False 37 | g = [3,4] 38 | s = [3,4] 39 | features = [3,4] 40 | b = Batch(obs, a, returns, terminal, g, s, features) 41 | fb = fbp.process_batch(b) 42 | np.testing.assert_array_equal(fb.obs, [2,3]) 43 | np.testing.assert_array_equal(fb.a, [2,3]) 44 | np.testing.assert_array_equal(fb.returns, [2,3]) 45 | np.testing.assert_array_equal(fb.s_diff, [1,1]) 46 | self.assertEqual(len(fb.ri), 2) 47 | np.testing.assert_array_equal(fb.gsum, [3, 5]) 48 | np.testing.assert_array_equal(fb.features, [2,3]) 49 | 50 | obs = [5] 51 | a = [5] 52 | returns = [5] 53 | terminal = True 54 | g = [5] 55 | s = [5] 56 | features = [5] 57 | b = Batch(obs, a, returns, terminal, g, s, features) 58 | fb = fbp.process_batch(b) 59 | np.testing.assert_array_equal(fb.obs, [4,5]) 60 | np.testing.assert_array_equal(fb.a, [4,5]) 61 | np.testing.assert_array_equal(fb.returns, [4,5]) 62 | np.testing.assert_array_equal(fb.s_diff, [1,0]) 63 | self.assertEqual(len(fb.ri), 2) 64 | np.testing.assert_array_equal(fb.gsum, [7,9]) 65 | np.testing.assert_array_equal(fb.features, [4,5]) 66 | 67 | def test_simple_c_2(self): 68 | # simple case ignoring the fact that the different list have 69 | # elements with different types 70 | c = 2 71 | obs = [1,2] 72 | a = [1,2] 73 | returns = [1,2] 74 | terminal = False 75 | g = [1,2] 76 | s = [1,2] 77 | features = [1,2] 78 | b = Batch(obs, a, returns, terminal, g, s, features) 79 | 80 | fbp = FeudalBatchProcessor(c) 81 | fb = fbp.process_batch(b) 82 | 83 | np.testing.assert_array_equal(fb.obs, []) 84 | np.testing.assert_array_equal(fb.a, []) 85 | np.testing.assert_array_equal(fb.returns, []) 86 | np.testing.assert_array_equal(fb.s_diff, []) 87 | np.testing.assert_array_equal(fb.ri, []) 88 | np.testing.assert_array_equal(fb.gsum, []) 89 | np.testing.assert_array_equal(fb.features, []) 90 | 91 | obs = [3,4] 92 | a = [3,4] 93 | returns = [3,4] 94 | terminal = False 95 | g = [3,4] 96 | s = [3,4] 97 | features = [3,4] 98 | b = Batch(obs, a, returns, terminal, g, s, features) 99 | fb = fbp.process_batch(b) 100 | np.testing.assert_array_equal(fb.obs, [1,2]) 101 | np.testing.assert_array_equal(fb.a, [1,2]) 102 | np.testing.assert_array_equal(fb.returns, [1,2]) 103 | np.testing.assert_array_equal(fb.s_diff, [2,2]) 104 | self.assertEqual(len(fb.ri), 2) 105 | np.testing.assert_array_equal(fb.gsum, [3,4]) 106 | np.testing.assert_array_equal(fb.features, [1,2]) 107 | 108 | obs = [5] 109 | a = [5] 110 | returns = [5] 111 | terminal = True 112 | g = [5] 113 | s = [5] 114 | features = [5] 115 | b = Batch(obs, a, returns, terminal, g, s, features) 116 | fb = fbp.process_batch(b) 117 | np.testing.assert_array_equal(fb.obs, [3,4,5]) 118 | np.testing.assert_array_equal(fb.a, [3,4,5]) 119 | np.testing.assert_array_equal(fb.returns, [3,4,5]) 120 | np.testing.assert_array_equal(fb.s_diff, [2,1,0]) 121 | self.assertEqual(len(fb.ri), 3) 122 | np.testing.assert_array_equal(fb.gsum, [6,9,12]) 123 | np.testing.assert_array_equal(fb.features, [3,4,5]) 124 | 125 
| def test_simple_terminal_on_start(self): 126 | c = 2 127 | fbp = FeudalBatchProcessor(c) 128 | 129 | obs = [1,2] 130 | a = [1,2] 131 | returns = [1,2] 132 | terminal = True 133 | g = [1,2] 134 | s = [1,2] 135 | features = [1,2] 136 | b = Batch(obs, a, returns, terminal, g, s, features) 137 | fb = fbp.process_batch(b) 138 | np.testing.assert_array_equal(fb.obs, [1,2]) 139 | np.testing.assert_array_equal(fb.a, [1,2]) 140 | np.testing.assert_array_equal(fb.returns, [1,2]) 141 | np.testing.assert_array_equal(fb.s_diff, [1,0]) 142 | self.assertEqual(len(fb.ri), 2) 143 | np.testing.assert_array_equal(fb.gsum, [3,4]) 144 | np.testing.assert_array_equal(fb.features, [1,2]) 145 | 146 | def test_intrinsic_reward_and_gsum_calculation(self): 147 | c = 2 148 | fbp = FeudalBatchProcessor(c) 149 | 150 | obs = a = returns = features = [None, None, None] 151 | terminal = True 152 | s = [np.array([2,1]), np.array([1,2]), np.array([2,3])] 153 | g = [np.array([1,1]), np.array([2,2]), np.array([3,3])] 154 | b = Batch(obs, a, returns, terminal, s, g, features) 155 | fb = fbp.process_batch(b) 156 | last_ri = (1. + 1. / np.sqrt(2)) / 2 157 | np.testing.assert_array_almost_equal(fb.ri, [0,0,last_ri]) 158 | np.testing.assert_array_equal(fb.gsum, 159 | [np.array([3,3]), np.array([4,4]), np.array([6,6])]) 160 | 161 | if __name__ == '__main__': 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /tests/test_policies/test_feudal_policy.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | np.set_printoptions(suppress=True, precision=6) 4 | import unittest 5 | 6 | from feudal_networks.policies.feudal_policy import FeudalPolicy 7 | import tensorflow as tf 8 | 9 | class TestFeudalPolicy(unittest.TestCase): 10 | 11 | def setUp(self): 12 | # reset graph before each test case 13 | tf.reset_default_graph() 14 | 15 | def test_init(self): 16 | global_step = tf.get_variable("global_step", [], tf.int32,\ 17 | initializer=tf.constant_initializer(0, dtype=tf.int32), 18 | trainable=False) 19 | feudal = FeudalPolicy((80,80,3), 4, global_step) 20 | 21 | def test_fit_simple_dataset(self): 22 | with tf.Session() as session: 23 | global_step = tf.get_variable("global_step", [], tf.int32,\ 24 | initializer=tf.constant_initializer(0, dtype=tf.int32), 25 | trainable=False) 26 | obs_space = (80,80,3) 27 | act_space = 2 28 | lr = 1e-5 29 | g_dim = 256 30 | worker_hid_dim = 32 31 | manager_hid_dim = 256 32 | pi = FeudalPolicy(obs_space, act_space, global_step) 33 | 34 | grads = tf.gradients(pi.loss, pi.var_list) 35 | 36 | prints = [] 37 | for g in grads: 38 | prints.append(g.op.name) 39 | prints.append(g) 40 | # grads[0] = tf.Print(grads[0],prints) 41 | grads, _ = tf.clip_by_global_norm(grads, 40) 42 | grads_and_vars = list(zip(grads, pi.var_list)) 43 | opt = tf.train.AdamOptimizer(lr) 44 | train_op = opt.apply_gradients(grads_and_vars) 45 | 46 | # train_op = tf.train.AdamOptimizer(lr).minimize(pi.loss,var_list=pi.var_list) 47 | session.run(tf.global_variables_initializer()) 48 | 49 | obs = [np.zeros(obs_space), np.zeros(obs_space)] 50 | a = [[1,0], [0,1]] 51 | returns = [0, 1] 52 | s_diff = [np.ones(g_dim), np.ones(g_dim)] 53 | gsum = [np.zeros((1,g_dim)), np.ones((1,g_dim))] 54 | ri = [0, 0] 55 | 56 | _,features = pi.get_initial_features() 57 | worker_features = features[0:2] 58 | manager_features = features[2:] 59 | 60 | feed_dict = { 61 | pi.obs: obs, 62 | pi.ac: a, 63 | pi.r: returns, 64 | pi.s_diff: s_diff, 65 | 
pi.prev_g: gsum, 66 | pi.ri: ri, 67 | pi.state_in[0]: worker_features[0], 68 | pi.state_in[1]: worker_features[1], 69 | pi.state_in[2]: manager_features[0], 70 | pi.state_in[3]: manager_features[1] 71 | } 72 | 73 | n_updates = 1000 74 | verbose = True 75 | for i in range(n_updates): 76 | loss, vf, policy, _ = session.run([pi.loss,pi.manager_vf,pi.pi, train_op], feed_dict=feed_dict) 77 | if verbose: 78 | print('loss: {}\npolicy: {}\nvalue: {}\n-------'.format( 79 | loss, policy, vf)) 80 | 81 | def test_simple_manager_behavior(self): 82 | with tf.Session() as session: 83 | global_step = tf.get_variable("global_step", [], tf.int32,\ 84 | initializer=tf.constant_initializer(0, dtype=tf.int32), 85 | trainable=False) 86 | obs_space = (80,80,3) 87 | act_space = 2 88 | lr = 5e-4 89 | g_dim = 256 90 | worker_hid_dim = 32 91 | manager_hid_dim = 256 92 | pi = FeudalPolicy(obs_space, act_space, global_step) 93 | 94 | train_op = tf.train.AdamOptimizer(lr).minimize(pi.loss) 95 | 96 | worker_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 97 | worker_vars = [v for v in worker_vars if 'worker' in v.name] 98 | worker_assign = tf.group(*[tf.assign(v, tf.zeros_like(v)) 99 | for v in worker_vars]) 100 | 101 | session.run(tf.global_variables_initializer()) 102 | 103 | obs = [np.zeros(obs_space), np.zeros(obs_space)] 104 | a = [[1,0], [0,1]] 105 | returns = [0, 1] 106 | s_diff = [np.ones(g_dim), np.ones(g_dim)] 107 | gsum = [np.zeros((1,g_dim)), np.ones((1,g_dim))] 108 | ri = [0, 0] 109 | 110 | _, features = pi.get_initial_features() 111 | worker_features = features[0:2] 112 | manager_features = features[2:] 113 | 114 | feed_dict = { 115 | pi.obs: obs, 116 | pi.ac: a, 117 | pi.r: returns, 118 | pi.s_diff: s_diff, 119 | pi.prev_g: gsum, 120 | pi.ri: ri, 121 | pi.state_in[0]: worker_features[0], 122 | pi.state_in[1]: worker_features[1], 123 | pi.state_in[2]: manager_features[0], 124 | pi.state_in[3]: manager_features[1] 125 | } 126 | 127 | n_updates = 1000 128 | verbose = True 129 | for i in range(n_updates): 130 | loss, vf, policy, _, _ = session.run( 131 | [pi.loss, pi.manager_vf, pi.pi, train_op, worker_assign], 132 | feed_dict=feed_dict) 133 | 134 | if verbose: 135 | print('loss: {}\npolicy: {}\nvalue: {}\n-------'.format( 136 | loss, policy, vf)) 137 | 138 | worker_var_values = session.run(worker_vars) 139 | print(worker_var_values) 140 | U = session.run(pi.U, feed_dict=feed_dict) 141 | print(U) 142 | input() 143 | 144 | 145 | 146 | if __name__ == '__main__': 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/test_policies/test_lstm_policy.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from feudal_networks.policies.lstm_policy import LSTMPolicy 4 | import tensorflow as tf 5 | 6 | class TestLSTMPolicy(unittest.TestCase): 7 | 8 | def test_init(self): 9 | global_step = tf.get_variable("global_step", [], tf.int32,\ 10 | initializer=tf.constant_initializer(0, dtype=tf.int32), 11 | trainable=False) 12 | lstm_pi = LSTMPolicy((80,80,3), 4,global_step) 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | --------------------------------------------------------------------------------
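A minimal smoke-test sketch of the LSTM policy, not part of the repository itself: it assumes the package and a TF1-style TensorFlow are importable, and that get_initial_features() unpacks to the LSTM's (c, h) pair as built by models.build_lstm. It mirrors tests/test_policies/test_lstm_policy.py above while also stepping the policy once.

import numpy as np
import tensorflow as tf

from feudal_networks.policies.lstm_policy import LSTMPolicy

with tf.Session() as sess:
    global_step = tf.get_variable("global_step", [], tf.int32,
                                  initializer=tf.constant_initializer(0, dtype=tf.int32),
                                  trainable=False)
    pi = LSTMPolicy((80, 80, 3), 4, global_step)   # obs_space, act_space
    sess.run(tf.global_variables_initializer())

    c, h = pi.get_initial_features()               # initial LSTM state
    ob = np.zeros((80, 80, 3), dtype=np.float32)   # dummy observation
    action, value, c, h = pi.act(ob, c, h)         # one-hot action, value estimate, new state
    print(action, value)

To exercise the full distributed training stack instead, scripts/training/train.py generates the tmux/nohup/child commands that launch worker.py for the parameter server and each worker task.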