├── .gitignore ├── README.md ├── caesar.py ├── hyper.py ├── main.py ├── orbitvm ├── __init__.py ├── p1.py └── p1_env.py ├── orbitvm_solver.py ├── pg_agent.py ├── space_conversion.py ├── trpo_agent.py └── value_function.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # reinforcement_learning_playground 2 | Playground for reinforcement learning algorithms implemented in TensorFlow. 3 | 4 | ## OpenAI gym problems solved 5 | ### Vanilla Policy Gradients 6 | 7 | Vanilla policy gradients with a ValueFunction that estimates the value of a given state (I use the current observation, 8 | the previous observation and the previous action as the state). The same algorithm also works without the ValueFunction if you 9 | don't stop the learning process at step 200 but keep training beyond that point. OpenAI Gym's monitor stops the game 10 | at step 200, so you can't use the monitor while training on more than 200 steps. (Inspiration for the use of the ValueFunction comes from https://github.com/wojzaremba/trpo.) 11 | 12 | [Gym evaluation - CartPole-v0](https://gym.openai.com/evaluations/eval_dWo7uqR2Ti6RX7naakndQ) 13 | 14 | [Gym evaluation - CartPole-v1](https://gym.openai.com/evaluations/eval_eB5PuUG8QfyRSNjDK7xTA) 15 | 16 | Reproducing: 17 | * Consider changing the API key :) 18 | * `python pg_agent.py` 19 | * `python pg_agent.py CartPole-v1` 20 | 21 | ### Policy Gradients with TRPO 22 | 23 | The same as above, but using the conjugate gradient + line search method described in the [TRPO paper](http://arxiv.org/abs/1502.05477). Inspiration for the implementation again comes from https://github.com/wojzaremba/trpo, but I tried to make it more readable and closer to the paper. 24 | 25 | Please also note that this agent doesn't use dropout, because TRPO doesn't work well with it: with a high dropout rate the KL divergence may be very large even between two exactly equal sets of parameters, due to the randomized nature of dropout. 26 | 27 | [Gym evaluation - CartPole-v0](https://gym.openai.com/evaluations/eval_hVkf4zsITBaLFLxVzhbJwg) 28 | 29 | [Gym evaluation - CartPole-v1](https://gym.openai.com/evaluations/eval_S5aAzDRtSjGFJgEycVqymw) 30 | 31 | [Gym evaluation - Copy-v0](https://gym.openai.com/evaluations/eval_y90gKlfdR7u9w0NzEbnw) 32 | 33 | Reproducing: 34 | * Consider changing the API key :) 35 | * `python trpo_agent.py` 36 | * `python trpo_agent.py CartPole-v1` 37 | * `python trpo_agent.py Copy-v0` 38 | 39 | ## New environments solved 40 | ### Caesar cipher 41 | 42 | This introduces a new environment that is a fork of the "Copy-v0" environment: instead of copying the input tape to the output tape, the agent has to decode Caesar-ciphered text onto the output tape. The same algorithm that works with CartPole-v0 and Copy-v0 also works here; the only difference is the number of hidden units. 43 | 44 | [![asciicast](https://asciinema.org/a/84292.png)](https://asciinema.org/a/84292) 45 | 46 | Reproducing: `python trpo_caesar.py` 47 | 48 | -------------------------------------------------------------------------------- /caesar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Task is to decode text from the input tape to 3 | the output tape. 
http://arxiv.org/abs/1511.07275 4 | """ 5 | import this 6 | import collections 7 | 8 | import numpy as np 9 | from gym.envs.algorithmic import algorithmic_env 10 | from gym.envs.algorithmic.algorithmic_env import ha 11 | from gym.envs.registration import register 12 | 13 | LEN_TO_WORD = collections.defaultdict(list) 14 | MAX_LEN = 0 15 | 16 | for word in this.s.split(): 17 | encoded = '' 18 | decoded = '' 19 | for c in word.lower(): 20 | dec = this.d.get(c, None) 21 | if dec: 22 | encoded += c 23 | decoded += dec 24 | LEN_TO_WORD[len(encoded)].append(encoded) 25 | MAX_LEN = max(MAX_LEN, len(encoded)) 26 | 27 | class CaesarEnv(algorithmic_env.AlgorithmicEnv): 28 | def __init__(self): 29 | algorithmic_env.AlgorithmicEnv.__init__(self, 30 | inp_dim=1, 31 | base=26, 32 | chars=True) 33 | def set_data(self): 34 | self.content = {} 35 | self.target = {} 36 | len_left = self.total_len 37 | i = 0 38 | while len_left > 0: 39 | rand_len = self.np_random.randint(1, min(len_left, MAX_LEN) + 1) 40 | if not LEN_TO_WORD[rand_len]: 41 | continue 42 | encoded = self.np_random.choice(LEN_TO_WORD[rand_len]) 43 | for c in encoded: 44 | enc_val = ord(c) - ord('a') 45 | dec_val = ord(this.d[c]) - ord('a') 46 | self.content[ha(np.array([i]))] = enc_val 47 | self.target[i] = dec_val 48 | i += 1 49 | len_left -= rand_len 50 | 51 | self.total_reward = self.total_len 52 | 53 | register( 54 | id='Caesar-v0', 55 | entry_point='caesar:CaesarEnv', 56 | timestep_limit=200, 57 | reward_threshold=25.0, 58 | ) 59 | -------------------------------------------------------------------------------- /hyper.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import csv 3 | import gym 4 | import random 5 | import numpy as np 6 | import tensorflow as tf 7 | import logging 8 | 9 | from pg_agent import PGAgent 10 | 11 | # logging.getLogger('pg_agent').setLevel(logging.WARNING) 12 | 13 | def main(): 14 | seed = 1 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | tf.set_random_seed(seed) 18 | 19 | env = gym.make("CartPole-v0") 20 | f = open('/Users/tilarids/Downloads/study_774036965_trials.csv') 21 | reader = csv.DictReader(f) 22 | for experiment in reader: 23 | if experiment['Status'] != 'PENDING': 24 | continue 25 | agent = PGAgent(env, 26 | win_step=199, 27 | H=int(experiment['H']), 28 | timesteps_per_batch=int(experiment['timesteps_per_batch']), 29 | learning_rate=float(experiment['learning_rate']), 30 | gamma=float(experiment['gamma']), 31 | epochs=int(experiment['epochs']), 32 | dropout=float(experiment['dropout']), 33 | win_reward=float(experiment['win_reward'])) 34 | time_before = dt.datetime.now() 35 | validation_mean_rewards, train_mean_rewards = agent.learn() 36 | elapsed_secs = (dt.datetime.now() - time_before).seconds 37 | print "For TrialId=%s validation result is %s and train result is %s in %s secs" % (experiment['TrialId'], validation_mean_rewards, train_mean_rewards, elapsed_secs) 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import time 5 | import os 6 | import logging 7 | import gym 8 | from gym import envs, scoreboard 9 | from gym.spaces import Discrete, Box 10 | import prettytensor as pt 11 | import tempfile 12 | import csv 13 | import datetime as dt 14 | import sys 15 | 16 | seed = 1 17 | 
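# Seed the Python, NumPy and TensorFlow RNGs so that rollouts and weight initialization are reproducible across runs.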
random.seed(seed) 18 | np.random.seed(seed) 19 | tf.set_random_seed(seed) 20 | 21 | dtype = tf.float32 22 | 23 | 24 | 25 | # hyperparameters 26 | #H = 83 # number of hidden layer neurons 27 | # learning_rate = 1e-3 28 | GAMMA = 0.99 # discount factor for reward 29 | # decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2 30 | render = False 31 | monitor = False 32 | # epochs = 15 33 | 34 | 35 | def cat_sample(prob_nk): 36 | assert prob_nk.ndim == 2 37 | N = prob_nk.shape[0] 38 | csprob_nk = np.cumsum(prob_nk, axis=1) 39 | out = np.zeros(N, dtype='i') 40 | for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)): 41 | for (k, csprob) in enumerate(csprob_k): 42 | if csprob > r: 43 | out[n] = k 44 | break 45 | return out 46 | 47 | def softmax(w, t = 1.0): 48 | e = np.exp(w / t) 49 | dist = e / np.sum(e) 50 | return dist 51 | 52 | def discount_rewards(r, gamma = GAMMA): 53 | """ take 1D float array of rewards and compute discounted reward """ 54 | discounted_r = np.zeros_like(r) 55 | running_add = 0 56 | for t in reversed(xrange(0, r.size)): 57 | #if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!) 58 | running_add = running_add * gamma + r[t] 59 | discounted_r[t] = running_add 60 | return discounted_r 61 | 62 | 63 | class LearningAgent(object): 64 | def __init__(self, env, H, timesteps_per_batch, learning_rate, gamma, epochs, dropout, win_reward): 65 | if not isinstance(env.observation_space, Box) or \ 66 | not isinstance(env.action_space, Discrete): 67 | print("Incompatible spaces.") 68 | exit(-1) 69 | 70 | self.H = H 71 | self.timesteps_per_batch = timesteps_per_batch 72 | self.learning_rate = learning_rate 73 | self.gamma = gamma 74 | self.epochs = epochs 75 | self.dropout = dropout 76 | self.win_reward = win_reward 77 | 78 | self.env = env 79 | self.session = tf.Session() 80 | self.obs = tf.placeholder( 81 | dtype, shape=[ 82 | None, 2 * env.observation_space.shape[0] + env.action_space.n], name="obs") 83 | self.prev_obs = np.zeros(env.observation_space.shape[0]) 84 | self.prev_action = np.zeros(env.action_space.n) 85 | self.action = action = tf.placeholder(dtype, shape=[None, env.action_space.n], name="action") 86 | self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant") 87 | self.prev_policy = action = tf.placeholder(dtype, shape=[None, env.action_space.n], name="prev_policy") 88 | 89 | self.policy_network, _ = ( 90 | pt.wrap(self.obs) 91 | .fully_connected(H, activation_fn=tf.nn.tanh) 92 | .dropout(self.dropout) 93 | .softmax_classifier(env.action_space.n)) 94 | self.returns = tf.placeholder(dtype, shape=[None, env.action_space.n], name="returns") 95 | 96 | loss = - tf.reduce_sum(tf.mul(self.action, tf.div(self.policy_network, self.prev_policy)), 1) * self.advant 97 | self.train = tf.train.AdamOptimizer().minimize(loss) 98 | 99 | self.session.run(tf.initialize_all_variables()) 100 | 101 | def rollout(self, max_pathlength, timesteps_per_batch): 102 | paths = [] 103 | timesteps_sofar = 0 104 | while timesteps_sofar < timesteps_per_batch: 105 | 106 | obs, actions, rewards, action_dists, actions_one_hot = [], [], [], [], [] 107 | ob = self.env.reset() 108 | self.prev_action *= 0.0 109 | self.prev_obs *= 0.0 110 | for x in xrange(max_pathlength): 111 | if render and 0==x % 20: env.render() 112 | # import pdb; pdb.set_trace() 113 | obs_new = np.expand_dims(np.concatenate([ob, self.prev_obs, self.prev_action], 0), 0) 114 | 115 | action_dist_n = self.session.run(self.policy_network, {self.obs: obs_new}) 116 | 
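# action_dist_n holds the softmax action probabilities for the augmented observation;
# cat_sample draws one index from that categorical distribution
# (e.g. a row [0.9, 0.1] yields action 0 about 90% of the time).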
117 | action = int(cat_sample(action_dist_n)[0]) 118 | self.prev_obs = ob 119 | self.prev_action *= 0.0 120 | self.prev_action[action] = 1.0 121 | 122 | obs.append(ob) 123 | actions.append(action) 124 | action_dists.append(action_dist_n) 125 | actions_one_hot.append(np.copy(self.prev_action)) 126 | 127 | res = list(self.env.step(action)) 128 | if 199==len(rewards): 129 | rewards.append(self.win_reward) 130 | res[2] = True 131 | else: 132 | rewards.append(res[1]) 133 | ob = res[0] 134 | 135 | if res[2]: 136 | path = {"obs": np.concatenate(np.expand_dims(obs, 0)), 137 | "action_dists": np.concatenate(action_dists), 138 | "rewards": np.array(rewards), 139 | "actions": np.array(actions), 140 | "actions_one_hot": np.array(actions_one_hot)} 141 | paths.append(path) 142 | self.prev_action *= 0.0 143 | self.prev_obs *= 0.0 144 | timesteps_sofar += len(path["rewards"]) 145 | break 146 | else: 147 | timesteps_sofar += max_pathlength 148 | return paths 149 | 150 | def prepare_features(self, path): 151 | # import pdb; pdb.set_trace() 152 | obs = path["obs"] 153 | prev_obs = np.concatenate([np.zeros((1,obs.shape[1])), path["obs"][1:]], 0) 154 | prev_action = path['action_dists'] 155 | return np.concatenate([obs, prev_obs, prev_action], 1) 156 | 157 | def predict_for_path(self, path): 158 | features = self.prepare_features(path) 159 | return self.session.run(self.policy_network, {self.obs: features}) 160 | 161 | def learn(self): 162 | self.current_observation = env.reset() 163 | # self.prev_x = None # used in computing the difference frame 164 | 165 | xs,hs,dlogps,drs = [],[],[],[] 166 | 167 | running_reward = None 168 | reward_sum = 0 169 | episode_number = 0 170 | current_step = 0.0 171 | iteration_number = 0 172 | discounted_eprs = [] 173 | mean_path_lens = [] 174 | while True: 175 | 176 | paths = self.rollout(max_pathlength=10000, timesteps_per_batch=self.timesteps_per_batch) 177 | 178 | for path in paths: 179 | path["prev_policy"] = self.predict_for_path(path) 180 | path["returns"] = discount_rewards(path["rewards"], self.gamma) 181 | # path["advant"] = path["returns"] - path["baseline"] 182 | path["advant"] = path["returns"] 183 | 184 | features = np.concatenate([self.prepare_features(path) for path in paths]) 185 | 186 | advant = np.concatenate([path["advant"] for path in paths]) 187 | advant -= advant.mean() 188 | advant /= (advant.std() + 1e-8) 189 | 190 | actions = np.concatenate([path["actions_one_hot"] for path in paths]) 191 | prev_policy = np.concatenate([path["prev_policy"] for path in paths]) 192 | 193 | # predictions = np.concatenate([self.predict_for_path(path) for path in paths]) 194 | for _ in range(self.epochs): 195 | self.session.run(self.train, {self.obs: features, 196 | self.advant: advant, 197 | self.action: actions, 198 | self.prev_policy: prev_policy}) 199 | iteration_number += 1 200 | 201 | mean_path_len = np.mean([len(path['rewards']) for path in paths]) 202 | mean_path_lens.append(mean_path_len) 203 | if iteration_number > 100: 204 | paths = self.rollout(max_pathlength=10000, timesteps_per_batch=40000) 205 | return np.mean([len(path['rewards']) for path in paths]), np.mean(mean_path_lens) 206 | # if 0 == iteration_number % 25: 207 | # print "Iteration %s finished. 
Mean path len: %s" % (iteration_number, mean_path_lens[-10:]) 208 | 209 | env = gym.make("CartPole-v0") 210 | training_dir = tempfile.mkdtemp() 211 | 212 | # f = open('/Users/tilarids/Downloads/study_599437103_trials.csv') 213 | # reader = csv.DictReader(f) 214 | # for experiment in reader: 215 | # if experiment['Status'] != 'PENDING': 216 | # continue 217 | # agent = LearningAgent(env, 218 | # int(experiment['H']), 219 | # int(experiment['timesteps_per_batch']), 220 | # float(experiment['learning_rate']), 221 | # float(experiment['gamma']), 222 | # int(experiment['epochs']), 223 | # float(experiment['dropout']), 224 | # float(experiment['win_reward'])) 225 | # time_before = dt.datetime.now() 226 | # validation_mean_rewards, train_mean_rewards = agent.learn() 227 | # elapsed_secs = (dt.datetime.now() - time_before).seconds 228 | # print "For TrialId=%s validation result is %s and train result is %s in %s secs" % (experiment['TrialId'], validation_mean_rewards, train_mean_rewards, elapsed_secs) 229 | 230 | 231 | env.monitor.start(training_dir) 232 | 233 | # H, timesteps_per_batch, learning_rate, gamma, epochs, dropout, win_reward 234 | agent = LearningAgent(env, 235 | 87, 236 | 1793, 237 | 0.0000022134469959712048, 238 | 0.9709822161334865, 239 | 52, 240 | 0.7600433610799155, 241 | 317.75621177926485) 242 | agent.learn() 243 | 244 | env.monitor.close() 245 | gym.upload(training_dir, api_key='sk_lgS7sCv1Qxq5HFMdQXR6Sw') 246 | -------------------------------------------------------------------------------- /orbitvm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tilarids/reinforcement_learning_playground/c11a4fb403b3193497f0fb721999a908da1e1d0d/orbitvm/__init__.py -------------------------------------------------------------------------------- /orbitvm/p1.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | # class Wrapper(object): 4 | # def __init__(self, name, val): 5 | # self.name = name 6 | # self.val = val 7 | 8 | # def __get__(self, obj, objtype=None): 9 | # print "get %s -> %s" % (self.name, self.val) 10 | # return self.val 11 | 12 | # def __set__(self, obj, value): 13 | # print "set %s <- %s, prev: %s" % (self.name, value, self.val) 14 | # self.val = value 15 | 16 | # def __delete__(self, obj): 17 | # pass 18 | 19 | class P1(object): 20 | def __init__(self): 21 | self.d0 = 1.0 22 | self.d1 = 0.0 23 | self.d2 = 30.0 24 | self.d3 = 0.0 25 | self.d4 = 0.0 26 | self.d5 = 0.0 27 | self.d7 = 0.0 28 | self.d8 = 0.0 29 | self.d9 = 0.0 30 | self.d11 = 0.0 31 | self.d12 = 0.0 32 | self.d14 = 0.0 33 | self.d15 = 0.0 34 | self.d16 = 1000.0 35 | self.d18 = 0.0 36 | self.d19 = 2.0 37 | self.d20 = 0.0 38 | self.d22 = 0.0 39 | self.d24 = 0.0 40 | self.d25 = 0.0 41 | self.d26 = 0.0 42 | self.d27 = 1.1 43 | self.d28 = 42164.0 44 | self.d29 = 0.0 45 | self.d30 = 0.0 46 | self.d31 = 1004.0 47 | self.d32 = 0.0 48 | self.d33 = 0.0 49 | self.d35 = 0.0 50 | self.d36 = 1.5 51 | self.d37 = 0.0 52 | self.d38 = 1003.0 53 | self.d39 = 0.0 54 | self.d41 = 0.0 55 | self.d42 = 0.0 56 | self.d43 = 1002.0 57 | self.d44 = 0.0 58 | self.d46 = 0.0 59 | self.d47 = 1001.0 60 | self.d48 = 0.0 61 | self.d50 = 0.0 62 | self.d52 = 0.0 63 | self.d53 = 0.0 64 | self.d54 = 0.0 65 | self.d55 = 6.457e+06 66 | self.d57 = 0.0 67 | self.d59 = 0.0 68 | self.d60 = -6.35e+06 69 | self.d62 = 0.0 70 | self.d64 = 0.0 71 | self.d66 = 0.0 72 | self.d67 = 0.0 73 | self.d69 = 0.0 74 | self.d70 = 0.0 75 | 
self.d71 = 0.0 76 | self.d72 = 0.0 77 | self.d74 = 0.0 78 | self.d75 = 0.0 79 | self.d77 = 0.0 80 | self.d78 = 8.357e+06 81 | self.d80 = 0.0 82 | self.d81 = 6.357e+06 83 | self.d83 = 0.0 84 | self.d84 = 6.557e+06 85 | self.d86 = 0.0 86 | self.d88 = 0.0 87 | self.d89 = 0.0 88 | self.d90 = 0.0 89 | self.d91 = 0.0 90 | self.d92 = 0.0 91 | self.d93 = 0.0 92 | self.d94 = 0.0 93 | self.d95 = 0.0 94 | self.d96 = 6e+24 95 | self.d98 = 0.0 96 | self.d99 = 6.67428e-11 97 | self.d100 = 0.0 98 | self.d101 = 0.0 99 | self.d102 = 0.0 100 | self.d103 = 0.0 101 | self.d104 = 0.0 102 | self.d105 = 0.0 103 | self.d106 = 0.0 104 | self.d107 = 0.0 105 | self.d108 = 0.0 106 | self.d109 = 0.0 107 | self.d110 = 0.0 108 | self.d111 = -6922.34 109 | self.d113 = 0.0 110 | self.d114 = -4719.32 111 | self.d116 = 0.0 112 | self.d117 = -7814.93 113 | self.d119 = 0.0 114 | self.d121 = 0.0 115 | self.d122 = 0.0 116 | self.d123 = 0.0 117 | self.d124 = 0.0 118 | self.d125 = 0.0 119 | self.d126 = 0.0 120 | self.d127 = 0.0 121 | self.d128 = 0.0 122 | self.d129 = 0.0 123 | self.d130 = 0.0 124 | self.d131 = 0.0 125 | self.d132 = 0.0 126 | self.d133 = -7875.22 127 | self.d135 = 0.0 128 | self.d137 = 0.0 129 | self.d139 = 0.0 130 | self.d141 = 0.0 131 | self.d143 = 0.0 132 | self.d144 = 0.0 133 | self.d145 = 0.0 134 | self.d146 = 0.0 135 | self.d147 = 0.0 136 | self.d148 = 0.0 137 | self.d149 = 0.0 138 | self.d150 = 0.0 139 | self.d151 = 0.0 140 | self.d152 = 0.0 141 | self.d153 = 0.0 142 | self.d155 = 0.0 143 | self.d156 = 0.0 144 | self.d157 = 0.0 145 | self.d158 = 0.0 146 | self.d159 = 0.0 147 | self.d160 = 0.0 148 | self.d161 = 0.0 149 | self.d163 = 0.0 150 | self.d164 = 0.0 151 | self.d166 = 0.0 152 | self.d167 = 0.0 153 | self.d168 = 0.0 154 | self.d169 = 0.0 155 | self.d170 = 0.0 156 | self.d171 = 0.0 157 | self.d172 = 0.0 158 | self.d173 = 0.0 159 | self.d174 = 0.0 160 | self.d175 = 0.0 161 | self.d176 = 0.0 162 | self.d177 = 0.0 163 | self.d178 = 0.0 164 | self.d179 = 0.0 165 | self.d180 = 0.0 166 | self.d181 = 0.0 167 | self.d182 = 0.0 168 | self.d183 = 0.0 169 | self.d184 = 0.0 170 | self.d185 = 0.0 171 | self.d186 = 0.0 172 | self.d187 = 0.0 173 | self.d188 = 0.0 174 | self.d189 = 1.0 175 | self.d191 = 0.0 176 | self.d193 = 0.0 177 | self.d195 = 0.0 178 | self.d197 = 0.0 179 | self.d199 = 0.0 180 | self.d200 = 0.0 181 | self.d202 = 0.0 182 | self.d203 = 0.0 183 | self.d205 = 0.0 184 | self.d206 = 0.0 185 | self.d207 = 0.0 186 | self.d208 = 0.0 187 | self.d210 = 0.0 188 | self.d212 = 0.0 189 | self.d213 = 0.0 190 | self.d214 = 10000.0 191 | self.d216 = 0.0 192 | self.d217 = 0.0 193 | self.d218 = 0.0 194 | self.d219 = 25.0 195 | self.d220 = 45.0 196 | self.d221 = 0.0 197 | self.d222 = 0.0 198 | self.d223 = 0.0 199 | self.d224 = 0.0 200 | self.d225 = 0.0 201 | self.d226 = 900.0 202 | self.d227 = 0.0 203 | self.d228 = 0.0 204 | self.d230 = 0.0 205 | self.d231 = 0.0 206 | self.d232 = 0.0 207 | self.d234 = 0.0 208 | self.d235 = 0.0 209 | self.d237 = 0.0 210 | self.d238 = 6.357e+06 211 | self.d239 = 0.0 212 | self.d241 = 0.0 213 | self.d242 = 0.0 214 | self.d248 = 0.0 215 | self.d249 = 0.0 216 | self.d250 = 0.0 217 | self.d251 = 0.0 218 | self.d252 = 0.0 219 | self.d253 = 0.0 220 | self.d254 = 0.0 221 | self.d255 = 0.0 222 | self.d256 = 0.0 223 | self.d257 = 0.0 224 | self.d258 = 0.0 225 | self.d259 = 0.0 226 | self.d260 = 0.0 227 | self.d261 = 0.0 228 | self.d262 = 0.0 229 | self.d263 = 0.0 230 | self.d264 = 0.0 231 | self.d265 = 0.0 232 | 233 | def set_target_orbit(self, r): 234 | self.d28 = r / 1000 235 | 236 | def 
set_start_orbit(self, r): 237 | G = 6.67428e-11 # self.d99 238 | M = 6e+24 # self.d96 239 | v = (G * M / r) ** 0.5 240 | 241 | dx = (r ** 2 - v ** 2) ** 0.5 242 | dy = -v 243 | 244 | self.d84 = dx 245 | self.d117 = dy 246 | 247 | def step(self, input, output): 248 | """ 249 | Make one step of the VM for the problem 1. input and output are the 250 | lists with port values.""" 251 | 252 | self.d1 = self.d265 # copy 253 | self.d4 = self.d248 # copy 254 | self.d5 = self.d4 - self.d3 # sub 255 | self.d7 = (self.d2 if self.d5 == 0 else self.d1) # Phi 256 | self.d8 = self.d7 - self.d0 # sub 257 | self.d9 = self.d263 # copy 258 | self.d11 = (self.d0 if self.d5 == 0 else self.d9) # Phi 259 | self.d12 = self.d11 - self.d0 # sub 260 | self.d14 = (self.d8 if self.d12 == 0 else self.d7) # Phi 261 | self.d15 = self.d264 # copy 262 | self.d18 = (self.d16 if self.d5 == 0 else self.d15) # Phi 263 | self.d20 = self.d18 * self.d19 # mult 264 | self.d22 = (self.d20 if self.d12 == 0 else self.d18) # Phi 265 | self.d24 = (self.d18 if self.d12 == 0 else self.d12) # Phi 266 | self.d25 = self.d260 # copy 267 | self.d26 = self.d262 # copy 268 | self.d29 = self.d28 * self.d16 # mult 269 | self.d30 = (self.d29 / self.d27 if self.d27 != 0.0 else 0) # self.div 270 | self.d32 = input[16000] # input 271 | self.d33 = self.d32 - self.d31 # sub 272 | self.d35 = (self.d30 if self.d33 == 0 else self.d3) # Phi 273 | self.d37 = (self.d29 / self.d36 if self.d36 != 0.0 else 0) # self.div 274 | self.d39 = self.d32 - self.d38 # sub 275 | self.d41 = (self.d37 if self.d39 == 0 else self.d35) # Phi 276 | self.d42 = (self.d29 / self.d19 if self.d19 != 0.0 else 0) # self.div 277 | self.d44 = self.d32 - self.d43 # sub 278 | self.d46 = (self.d42 if self.d44 == 0 else self.d41) # Phi 279 | self.d48 = self.d32 - self.d47 # sub 280 | self.d50 = (self.d29 if self.d48 == 0 else self.d46) # Phi 281 | self.d52 = (self.d50 if self.d5 == 0 else self.d26) # Phi 282 | self.d53 = self.d255 # copy 283 | self.d57 = (self.d55 if self.d33 == 0 else self.d3) # Phi 284 | self.d59 = (self.d54 if self.d39 == 0 else self.d57) # Phi 285 | self.d62 = (self.d60 if self.d44 == 0 else self.d59) # Phi 286 | self.d64 = (self.d54 if self.d48 == 0 else self.d62) # Phi 287 | self.d66 = (self.d64 if self.d5 == 0 else self.d53) # Phi 288 | self.d67 = self.d250 # copy 289 | self.d69 = (self.d3 if self.d5 == 0 else self.d67) # Phi 290 | self.d70 = self.d69 - self.d66 # sub 291 | self.d71 = self.d70 * self.d70 # mult 292 | self.d72 = self.d249 # copy 293 | self.d74 = (self.d3 if self.d5 == 0 else self.d72) # Phi 294 | self.d75 = self.d254 # copy 295 | self.d77 = (self.d54 if self.d33 == 0 else self.d3) # Phi 296 | self.d80 = (self.d78 if self.d39 == 0 else self.d77) # Phi 297 | self.d83 = (self.d81 if self.d44 == 0 else self.d80) # Phi 298 | self.d86 = (self.d84 if self.d48 == 0 else self.d83) # Phi 299 | self.d88 = (self.d86 if self.d5 == 0 else self.d75) # Phi 300 | self.d89 = self.d74 - self.d88 # sub 301 | self.d90 = self.d89 * self.d89 # mult 302 | self.d91 = self.d90 + self.d71 # add 303 | self.d92 = math.sqrt(self.d91) # sqrt 304 | self.d93 = self.d92 * self.d92 # mult 305 | self.d94 = self.d93 * self.d92 # mult 306 | self.d95 = self.d251 # copy 307 | self.d98 = (self.d96 if self.d5 == 0 else self.d95) # Phi 308 | self.d100 = self.d99 * self.d98 # mult 309 | self.d101 = (self.d100 / self.d94 if self.d94 != 0.0 else 0) # self.div 310 | self.d102 = self.d70 * self.d101 # mult 311 | self.d103 = input[3] # input 312 | self.d104 = (self.d0 / self.d0 if self.d0 != 0.0 else 
0) # self.div 313 | self.d105 = (self.d103 / self.d104 if self.d104 != 0.0 else 0) # self.div 314 | self.d106 = self.d105 + self.d102 # add 315 | self.d107 = self.d104 * self.d104 # mult 316 | self.d108 = (self.d107 / self.d19 if self.d19 != 0.0 else 0) # self.div 317 | self.d109 = self.d106 * self.d108 # mult 318 | self.d110 = self.d258 # copy 319 | self.d113 = (self.d111 if self.d39 == 0 else self.d77) # Phi 320 | self.d116 = (self.d114 if self.d44 == 0 else self.d113) # Phi 321 | self.d119 = (self.d117 if self.d48 == 0 else self.d116) # Phi 322 | self.d121 = (self.d119 if self.d5 == 0 else self.d110) # Phi 323 | self.d122 = self.d121 * self.d104 # mult 324 | self.d123 = self.d66 + self.d122 # add 325 | self.d124 = self.d123 + self.d109 # add 326 | self.d125 = self.d124 - self.d69 # sub 327 | self.d126 = self.d125 * self.d125 # mult 328 | self.d127 = self.d89 * self.d101 # mult 329 | self.d128 = input[2] # input 330 | self.d129 = (self.d128 / self.d104 if self.d104 != 0.0 else 0) # self.div 331 | self.d130 = self.d129 + self.d127 # add 332 | self.d131 = self.d130 * self.d108 # mult 333 | self.d132 = self.d257 # copy 334 | self.d135 = (self.d133 if self.d33 == 0 else self.d3) # Phi 335 | self.d137 = (self.d54 if self.d39 == 0 else self.d135) # Phi 336 | self.d139 = (self.d114 if self.d44 == 0 else self.d137) # Phi 337 | self.d141 = (self.d54 if self.d48 == 0 else self.d139) # Phi 338 | self.d143 = (self.d141 if self.d5 == 0 else self.d132) # Phi 339 | self.d144 = self.d143 * self.d104 # mult 340 | self.d145 = self.d88 + self.d144 # add 341 | self.d146 = self.d145 + self.d131 # add 342 | self.d147 = self.d146 - self.d74 # sub 343 | self.d148 = self.d147 * self.d147 # mult 344 | self.d149 = self.d148 + self.d126 # add 345 | self.d150 = math.sqrt(self.d149) # sqrt 346 | self.d151 = self.d150 - self.d52 # sub 347 | self.d152 = self.d3 - self.d151 # sub 348 | self.d153 = self.d151 - self.d3 # sub 349 | self.d155 = (self.d152 if self.d153 < 0 else self.d151) # Phi 350 | self.d156 = self.d25 + self.d155 # add 351 | self.d157 = self.d105 * self.d105 # mult 352 | self.d158 = self.d129 * self.d129 # mult 353 | self.d159 = self.d158 + self.d157 # add 354 | self.d160 = math.sqrt(self.d159) # sqrt 355 | self.d161 = self.d160 - self.d3 # sub 356 | self.d163 = (self.d156 if self.d161 == 0 else self.d3) # Phi 357 | self.d164 = self.d155 - self.d16 # sub 358 | self.d166 = (self.d163 if self.d164 < 0 else self.d3) # Phi 359 | self.d167 = self.d69 - self.d124 # sub 360 | self.d168 = self.d167 * self.d167 # mult 361 | self.d169 = self.d74 - self.d146 # sub 362 | self.d170 = self.d169 * self.d169 # mult 363 | self.d171 = self.d170 + self.d168 # add 364 | self.d172 = math.sqrt(self.d171) # sqrt 365 | self.d173 = self.d172 * self.d172 # mult 366 | self.d174 = self.d173 * self.d172 # mult 367 | self.d175 = (self.d100 / self.d174 if self.d174 != 0.0 else 0) # self.div 368 | self.d176 = self.d167 * self.d175 # mult 369 | self.d177 = self.d176 + self.d102 # add 370 | self.d178 = (self.d177 / self.d19 if self.d19 != 0.0 else 0) # self.div 371 | self.d179 = self.d105 + self.d178 # add 372 | self.d180 = self.d179 * self.d104 # mult 373 | self.d181 = self.d121 + self.d180 # add 374 | self.d182 = self.d169 * self.d175 # mult 375 | self.d183 = self.d182 + self.d127 # add 376 | self.d184 = (self.d183 / self.d19 if self.d19 != 0.0 else 0) # self.div 377 | self.d185 = self.d129 + self.d184 # add 378 | self.d186 = self.d185 * self.d104 # mult 379 | self.d187 = self.d143 + self.d186 # add 380 | self.d188 = self.d256 # copy 
381 | self.d191 = (self.d189 if self.d33 == 0 else self.d3) # Phi 382 | self.d193 = (self.d189 if self.d39 == 0 else self.d191) # Phi 383 | self.d195 = (self.d189 if self.d44 == 0 else self.d193) # Phi 384 | self.d197 = (self.d189 if self.d48 == 0 else self.d195) # Phi 385 | self.d199 = (self.d197 if self.d5 == 0 else self.d188) # Phi 386 | self.d200 = self.d253 # copy 387 | self.d202 = (self.d3 if self.d5 == 0 else self.d200) # Phi 388 | self.d203 = self.d252 # copy 389 | self.d205 = (self.d3 if self.d5 == 0 else self.d203) # Phi 390 | self.d206 = self.d4 + self.d0 # add 391 | self.d207 = self.d259 # copy 392 | self.d208 = self.d207 + self.d0 # add 393 | self.d210 = (self.d208 if self.d161 == 0 else self.d3) # Phi 394 | self.d212 = (self.d210 if self.d164 < 0 else self.d3) # Phi 395 | self.d213 = self.d261 # copy 396 | self.d216 = (self.d214 if self.d5 == 0 else self.d213) # Phi 397 | self.d217 = self.d160 * self.d104 # mult 398 | self.d218 = self.d216 - self.d217 # sub 399 | self.d221 = self.d214 - self.d218 # sub 400 | self.d222 = (self.d221 / self.d214 if self.d214 != 0.0 else 0) # self.div 401 | self.d223 = self.d222 * self.d220 # mult 402 | self.d224 = self.d7 + self.d223 # add 403 | self.d225 = self.d224 + self.d219 # add 404 | self.d227 = (self.d226 / self.d104 if self.d104 != 0.0 else 0) # self.div 405 | self.d228 = self.d227 - self.d212 # sub 406 | self.d230 = (self.d225 if self.d228 < 0 else self.d3) # Phi 407 | self.d231 = self.d218 - self.d3 # sub 408 | self.d232 = self.d242 - self.d0 # sub 409 | self.d234 = (self.d232 if self.d231 < 0 else self.d230) # Phi 410 | self.d235 = self.d214 - self.d217 # sub 411 | self.d237 = (self.d232 if self.d235 < 0 else self.d234) # Phi 412 | self.d239 = self.d150 - self.d238 # sub 413 | self.d241 = (self.d232 if self.d239 < 0 else self.d237) # Phi 414 | output[0] = self.d241 # out 415 | output[1] = self.d218 # out 416 | output[2] = self.d169 # out 417 | output[3] = self.d167 # out 418 | output[4] = self.d52 # out 419 | self.d248 = self.d206 # copy 420 | self.d249 = self.d74 # copy 421 | self.d250 = self.d69 # copy 422 | self.d251 = self.d98 # copy 423 | self.d252 = self.d205 # copy 424 | self.d253 = self.d202 # copy 425 | self.d254 = self.d146 # copy 426 | self.d255 = self.d124 # copy 427 | self.d256 = self.d199 # copy 428 | self.d257 = self.d187 # copy 429 | self.d258 = self.d181 # copy 430 | self.d259 = self.d212 # copy 431 | self.d260 = self.d166 # copy 432 | self.d261 = self.d218 # copy 433 | self.d262 = self.d52 # copy 434 | self.d263 = self.d24 # copy 435 | self.d264 = self.d22 # copy 436 | self.d265 = self.d14 # copy 437 | 438 | -------------------------------------------------------------------------------- /orbitvm/p1_env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | import numpy as np 7 | from gym.envs.registration import register 8 | 9 | from orbitvm.p1 import P1 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | MAX_STEPS = 100000 14 | class OrbitP1Env(gym.Env): 15 | metadata = { 16 | 'render.modes': ['human', 'rgb_array'], 17 | 'video.frames_per_second' : 50 18 | } 19 | 20 | def __init__(self): 21 | high2 = np.array([np.finfo(np.float32).max] * 2) 22 | high5 = np.array([np.finfo(np.float32).max] * 4) 23 | self.action_space = spaces.Box(-high2, high2) 24 | self.observation_space = spaces.Box(-high5, high5) 25 | self._seed() 26 | self.reset() 27 | self.viewer = None 28 | 29 | 
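# Each call to _step() below runs one tick of the orbit VM (orbitvm.p1.P1): input port 0x3E80 selects
# the scenario (1001), the 2-D action is written to VM input ports 0x2/0x3 (the solver treats it as a
# velocity change dVx, dVy), and the observation is the VM output port vector -- the score, what appears
# to be the remaining fuel, the satellite position (x, y) relative to Earth, and the target orbit radius.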
# Just need to initialize the relevant attributes 30 | self._configure() 31 | 32 | def _configure(self, display=None): 33 | self.display = display 34 | 35 | def _seed(self, seed=None): 36 | self.np_random, seed = seeding.np_random(seed) 37 | return [seed] 38 | 39 | def _step(self, action): 40 | assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) 41 | self.vm_input[0x3E80] = 1001 42 | self.vm_input[0x2] = action[0] 43 | self.vm_input[0x3] = action[1] 44 | self.orbitvm.step(self.vm_input, self.vm_output) 45 | reward = self.vm_output[0x0] + 1.0 - abs(math.sqrt(self.vm_output[0x2]**2 + self.vm_output[0x3]**2) - self.vm_output[0x4]) / 2e+08 46 | self._current_step += 1 47 | done = self._current_step > MAX_STEPS 48 | return np.array(self.vm_output), reward, done, {} 49 | 50 | def _reset(self): 51 | self.orbitvm = P1() 52 | self.vm_input = [0.0] * 16384 53 | self.vm_output = [0.0] * 6 54 | self._current_step = 0 55 | 56 | def _render(self, mode='human', close=False): 57 | if close: 58 | if self.viewer is not None: 59 | self.viewer.close() 60 | self.viewer = None 61 | return 62 | 63 | screen_width = 600 64 | screen_height = 400 65 | 66 | world_width = 2e+08 67 | scale = screen_width / world_width 68 | 69 | if self.viewer is None: 70 | from gym.envs.classic_control import rendering 71 | self.viewer = rendering.Viewer(screen_width, screen_height, display=self.display) 72 | 73 | earth = rendering.make_circle(radius=scale * 6.357e+06, filled=True) 74 | earth_trans = rendering.Transform(translation=(screen_width / 2, screen_height / 2)) 75 | earth.set_color(.1,.1,.8) 76 | earth.add_attr(earth_trans) 77 | 78 | satellite = rendering.make_circle(radius=scale * 6.357e+05, filled=True) 79 | satellite.set_color(.8,.6,.4) 80 | self.satellite_trans = rendering.Transform() 81 | satellite.add_attr(self.satellite_trans) 82 | 83 | orbit = rendering.make_circle(radius=scale * self.vm_output[0x4], filled=False) 84 | orbit_trans = rendering.Transform(translation=(screen_width / 2, screen_height / 2)) 85 | orbit.add_attr(orbit_trans) 86 | 87 | self.viewer.add_geom(orbit) 88 | self.viewer.add_geom(earth) 89 | self.viewer.add_geom(satellite) 90 | 91 | 92 | x, y = self.vm_output[0x2], self.vm_output[0x3] 93 | satx = x * scale + screen_width / 2.0 94 | saty = y * scale + screen_height / 2.0 95 | 96 | self.satellite_trans.set_translation(satx, saty) 97 | 98 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 99 | 100 | 101 | register( 102 | id='OrbitP1-v0', 103 | entry_point='orbitvm.p1_env:OrbitP1Env') 104 | -------------------------------------------------------------------------------- /orbitvm_solver.py: -------------------------------------------------------------------------------- 1 | import random 2 | import tempfile 3 | import math 4 | 5 | import gym 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import orbitvm.p1_env 10 | 11 | G = 6.67428e-11 12 | M = 6e+24 13 | 14 | def find_hohmann_impulse1(r1, r2): 15 | return math.sqrt(G * M / r1) * (math.sqrt(2 * r2 / (r1 + r2)) - 1) 16 | 17 | def find_hohmann_impulse2(r1, r2): 18 | return math.sqrt(G * M / r2) * (1 - math.sqrt(2 * r1 / (r1 + r2))) 19 | 20 | def find_transfer_time(r1, r2): 21 | rr = r1 + r2 22 | return math.pi * math.sqrt(rr * rr * rr / (8.0 * G * M)) 23 | 24 | def hohmann_transfer(env): 25 | state, reward_sum, _, _ = env.step(np.array([0,0])) 26 | x1, y1, rtarget = state[2:5] 27 | rcurrent = math.sqrt(x1 * x1 + y1 * y1) 28 | impulse1 = find_hohmann_impulse1(rcurrent, rtarget) 29 | impulse2 = 
find_hohmann_impulse2(rcurrent, rtarget) 30 | transfer_time = int(find_transfer_time(rcurrent, rtarget)) 31 | dx = - impulse1 * y1 / rcurrent 32 | dy = impulse1 * x1 / rcurrent 33 | 34 | # first pulse! 35 | _, reward, _, _ = env.step(np.array([dx,dy])) 36 | reward_sum += reward 37 | 38 | print "Transfer time: %s" % transfer_time 39 | # don't breathe! 40 | for x in xrange(transfer_time - 1): 41 | state, reward, done, _ = env.step(np.array([0,0])) 42 | reward_sum += reward 43 | if 0 == x % 100: 44 | env.render() 45 | 46 | state, reward, _, _ = env.step(np.array([0,0])) 47 | reward_sum += reward 48 | x2, y2, rtarget = state[2:5] 49 | rcurrent = math.sqrt(x2 * x2 + y2 * y2) 50 | 51 | dx = - impulse2 * y2 / rcurrent 52 | dy = impulse2 * x2 / rcurrent 53 | 54 | # second pulse! 55 | state, reward, _, _ = env.step(np.array([dx,dy])) 56 | reward_sum += reward 57 | 58 | for x in xrange(100000): 59 | state, reward, done, _ = env.step(np.array([0,0])) 60 | reward_sum += reward 61 | if 0 == x % 100: 62 | #print "State: %s" % state 63 | env.render() 64 | if done: 65 | print "It's done. breaking. " 66 | break 67 | print "Total reward: %s" % reward_sum 68 | 69 | if __name__ == '__main__': 70 | seed = 1 71 | random.seed(seed) 72 | np.random.seed(seed) 73 | tf.set_random_seed(seed) 74 | 75 | env_name = "OrbitP1-v0" 76 | max_iterations = 100 77 | 78 | env = gym.make(env_name) 79 | # training_dir = tempfile.mkdtemp() 80 | # env.monitor.start(training_dir) 81 | 82 | env.reset() 83 | 84 | target = 42164000.0 85 | start = target * 0.3 86 | 87 | env.orbitvm.set_target_orbit(target) 88 | env.orbitvm.set_start_orbit(start) 89 | 90 | hohmann_transfer(env) 91 | 92 | # env.monitor.close() 93 | -------------------------------------------------------------------------------- /pg_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import os 4 | import logging 5 | import csv 6 | import sys 7 | import tempfile 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import gym 12 | from gym import envs, scoreboard 13 | from gym.spaces import Discrete, Box 14 | import prettytensor as pt 15 | 16 | from value_function import ValueFunction 17 | from space_conversion import SpaceConversionEnv 18 | 19 | DTYPE = tf.float32 20 | RENDER_EVERY = None 21 | MONITOR = True 22 | 23 | logger = logging.getLogger('pg_agent') 24 | logger.setLevel(logging.INFO) 25 | 26 | # Sample from the probability distribution. 27 | def cat_sample(prob_nk): 28 | assert prob_nk.ndim == 2 29 | N = prob_nk.shape[0] 30 | csprob_nk = np.cumsum(prob_nk, axis=1) 31 | out = np.zeros(N, dtype='i') 32 | for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)): 33 | for (k, csprob) in enumerate(csprob_k): 34 | if csprob > r: 35 | out[n] = k 36 | break 37 | return out 38 | 39 | def write_csv(file_name, *arrays): 40 | with open(file_name, 'wb') as csvfile: 41 | writer = csv.writer(csvfile, delimiter=',') 42 | for row in zip(*arrays): 43 | writer.writerow(row) 44 | 45 | def discount_rewards(r, gamma): 46 | """ take 1D float array of rewards and compute discounted reward """ 47 | discounted_r = np.zeros_like(r) 48 | running_add = 0 49 | for t in reversed(xrange(0, r.size)): 50 | #if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!) 51 | running_add = running_add * gamma + r[t] 52 | discounted_r[t] = running_add 53 | return discounted_r 54 | 55 | # Learning agent. Encapsulates training and prediction. 
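# Training maximizes an importance-weighted policy-gradient surrogate: for each visited state s and
# sampled action a the loss is -(pi_new(a|s) / pi_old(a|s)) * advantage(s, a), where
# advantage = discounted returns - ValueFunction baseline, normalized to zero mean and unit variance
# per batch, and the loss is minimized with Adam for several epochs per batch of rollouts.
#
# A minimal usage sketch (the __main__ block at the bottom of this file does the same with tuned
# hyperparameters; the values here are only illustrative):
#   env = SpaceConversionEnv(gym.make("CartPole-v0"), Box, Discrete)
#   agent = PGAgent(env, H=109, timesteps_per_batch=1369, learning_rate=1e-2,
#                   gamma=0.99, epochs=4, dropout=0.5)
#   validation_mean_len, train_mean_len = agent.learn()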
56 | class PGAgent(object): 57 | def __init__(self, env, H, timesteps_per_batch, learning_rate, gamma, epochs, dropout): 58 | if not isinstance(env.observation_space, Box) or \ 59 | not isinstance(env.action_space, Discrete): 60 | logger.error("Incompatible spaces.") 61 | exit(-1) 62 | 63 | self.H = H 64 | self.timesteps_per_batch = timesteps_per_batch 65 | self.learning_rate = learning_rate 66 | self.gamma = gamma 67 | self.epochs = epochs 68 | self.dropout = dropout 69 | self.env = env 70 | self.session = tf.Session() 71 | 72 | # Full state used for next action prediction. Contains current 73 | # observation, previous observation and previous action. 74 | self.obs = tf.placeholder( 75 | DTYPE, 76 | shape=[None, 2 * env.observation_space.shape[0] + env.action_space.n], 77 | name="obs") 78 | self.prev_obs = np.zeros(env.observation_space.shape[0]) 79 | self.prev_action = np.zeros(env.action_space.n) 80 | 81 | # One hot encoding of the actual action taken. 82 | self.action = tf.placeholder(DTYPE, 83 | shape=[None, env.action_space.n], 84 | name="action") 85 | # Advantage, obviously. 86 | self.advant = tf.placeholder(DTYPE, shape=[None], name="advant") 87 | # Old policy prediction. 88 | self.prev_policy = tf.placeholder(DTYPE, 89 | shape=[None, env.action_space.n], 90 | name="prev_policy") 91 | 92 | self.policy_network, _ = ( 93 | pt.wrap(self.obs) 94 | .fully_connected(H, activation_fn=tf.nn.tanh) 95 | .dropout(self.dropout) 96 | .softmax_classifier(env.action_space.n)) 97 | self.returns = tf.placeholder(DTYPE, 98 | shape=[None, env.action_space.n], 99 | name="returns") 100 | 101 | loss = - tf.reduce_sum(tf.mul(self.action, 102 | tf.div(self.policy_network, 103 | self.prev_policy)), 1) * self.advant 104 | self.train = tf.train.AdamOptimizer().minimize(loss) 105 | 106 | features_count = 2 * env.observation_space.shape[0] + env.action_space.n + 2 107 | self.value_function = ValueFunction(self.session, 108 | features_count, 109 | learning_rate=1e-3, 110 | epochs=50, 111 | dropout=0.5) 112 | self.session.run(tf.initialize_all_variables()) 113 | 114 | def rollout(self, max_pathlength, timesteps_per_batch, render=False): 115 | paths = [] 116 | timesteps_sofar = 0 117 | while timesteps_sofar < timesteps_per_batch: 118 | 119 | obs, actions, rewards, action_dists, actions_one_hot = [], [], [], [], [] 120 | ob = self.env.reset() 121 | self.prev_action *= 0.0 122 | self.prev_obs *= 0.0 123 | for x in xrange(max_pathlength): 124 | if render: 125 | env.render() 126 | obs_new = np.expand_dims( 127 | np.concatenate([ob, self.prev_obs, self.prev_action], 0), 0) 128 | 129 | action_dist_n = self.session.run(self.policy_network, {self.obs: obs_new}) 130 | 131 | action = int(cat_sample(action_dist_n)[0]) 132 | self.prev_obs = ob 133 | self.prev_action *= 0.0 134 | self.prev_action[action] = 1.0 135 | 136 | obs.append(ob) 137 | actions.append(action) 138 | action_dists.append(action_dist_n) 139 | actions_one_hot.append(np.copy(self.prev_action)) 140 | 141 | res = list(self.env.step(action)) 142 | rewards.append(res[1]) 143 | ob = res[0] 144 | 145 | if res[2]: 146 | path = {"obs": np.concatenate(np.expand_dims(obs, 0)), 147 | "action_dists": np.concatenate(action_dists), 148 | "rewards": np.array(rewards), 149 | "actions": np.array(actions), 150 | "actions_one_hot": np.array(actions_one_hot)} 151 | paths.append(path) 152 | self.prev_action *= 0.0 153 | self.prev_obs *= 0.0 154 | timesteps_sofar += len(path["rewards"]) 155 | break 156 | else: 157 | timesteps_sofar += max_pathlength 158 | return paths 159 | 160 | 
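# prepare_features() below rebuilds the augmented network input for every timestep of a finished path
# by concatenating the recorded observations, a shifted copy of the observations and the recorded
# action distributions (the rollout itself feeds the previous observation together with the one-hot
# previous action).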
def prepare_features(self, path): 161 | obs = path["obs"] 162 | prev_obs = np.concatenate([np.zeros((1,obs.shape[1])), path["obs"][1:]], 0) 163 | prev_action = path['action_dists'] 164 | return np.concatenate([obs, prev_obs, prev_action], axis=1) 165 | 166 | def predict(self, path): 167 | features = self.prepare_features(path) 168 | return self.session.run(self.policy_network, {self.obs: features}) 169 | 170 | def learn(self): 171 | self.current_observation = self.env.reset() 172 | 173 | xs,hs,dlogps,drs = [],[],[],[] 174 | 175 | running_reward = None 176 | reward_sum = 0 177 | episode_number = 0 178 | current_step = 0.0 179 | iteration_number = 0 180 | discounted_eprs = [] 181 | mean_path_lens = [] 182 | value_function_losses = [] 183 | 184 | while True: 185 | render = not RENDER_EVERY is None and 0 == iteration_number % RENDER_EVERY 186 | paths = self.rollout(max_pathlength=10000, 187 | timesteps_per_batch=self.timesteps_per_batch, 188 | render=render) 189 | 190 | for path in paths: 191 | path["baseline"] = self.value_function.predict(path) 192 | path["prev_policy"] = self.predict(path) 193 | path["returns"] = discount_rewards(path["rewards"], self.gamma) 194 | path["advant"] = path["returns"] - path["baseline"] 195 | 196 | value_function_losses.append(self.value_function.validate(paths)) 197 | self.value_function.fit(paths) 198 | features = np.concatenate([self.prepare_features(path) for path in paths]) 199 | 200 | advant = np.concatenate([path["advant"] for path in paths]) 201 | advant -= advant.mean() 202 | advant /= (advant.std() + 1e-8) 203 | 204 | actions = np.concatenate([path["actions_one_hot"] for path in paths]) 205 | prev_policy = np.concatenate([path["prev_policy"] for path in paths]) 206 | 207 | for _ in range(self.epochs): 208 | self.session.run(self.train, {self.obs: features, 209 | self.advant: advant, 210 | self.action: actions, 211 | self.prev_policy: prev_policy}) 212 | iteration_number += 1 213 | 214 | mean_path_len = np.mean([len(path['rewards']) for path in paths]) 215 | mean_path_lens.append(mean_path_len) 216 | logger.info("Iteration %s mean_path_len: %s", iteration_number, mean_path_len) 217 | if iteration_number > 100: 218 | paths = self.rollout(max_pathlength=10000, timesteps_per_batch=40000) 219 | ret = np.mean([len(path['rewards']) for path in paths]), np.mean(mean_path_lens) 220 | logger.info("Validation result: %s", ret[0]) 221 | if not MONITOR: 222 | write_csv('/tmp/out.csv', mean_path_lens, value_function_losses) 223 | return ret 224 | 225 | if __name__ == '__main__': 226 | seed = 1 227 | random.seed(seed) 228 | np.random.seed(seed) 229 | tf.set_random_seed(seed) 230 | env_name = "CartPole-v0" if len(sys.argv) < 2 else sys.argv[1] 231 | 232 | env = gym.make(env_name) 233 | env = SpaceConversionEnv(env, Box, Discrete) 234 | if MONITOR: 235 | training_dir = tempfile.mkdtemp() 236 | env.monitor.start(training_dir) 237 | 238 | agent = PGAgent(env, 239 | H=109, 240 | timesteps_per_batch=1369, 241 | learning_rate=0.028609296254614544, 242 | gamma=0.9914327475117531, 243 | epochs=4, 244 | dropout=0.5043049954791183) 245 | agent.learn() 246 | if MONITOR: 247 | env.monitor.close() 248 | gym.upload(training_dir, api_key='sk_lgS7sCv1Qxq5HFMdQXR6Sw') 249 | -------------------------------------------------------------------------------- /space_conversion.py: -------------------------------------------------------------------------------- 1 | """ 2 | `SpaceConversionEnv` acts as a wrapper on 3 | any environment. 
It allows to convert some action spaces, and observation spaces to others. 4 | 5 | Copied from https://github.com/wojzaremba/trpo/blob/master/space_conversion.py 6 | Thx! 7 | """ 8 | 9 | import numpy as np 10 | from gym.spaces import Discrete, Box, Tuple 11 | from gym import Env 12 | 13 | 14 | def box2box4obj(x, old_space_obj, new_space_obj): 15 | assert(old_space_obj.contains(x)) 16 | action = np.reshape(x, new_space_obj.shape) 17 | assert(new_space_obj.contains(action)) 18 | return action 19 | 20 | def box2box4class(box_space): 21 | shape = np.prod(box_space.shape) 22 | low = box_space.low 23 | high = box_space.high 24 | if isinstance(low, np.ndarray): 25 | low = np.reshape(low, (shape, )) 26 | if isinstance(high, np.ndarray): 27 | high = np.reshape(high, (shape, )) 28 | return Box(low, high) 29 | 30 | def discrete2tuple4obj(x, discrete_space, tuple_space): 31 | assert(discrete_space.contains(x)) 32 | action = [] 33 | for space in tuple_space.spaces: 34 | assert(isinstance(space, Discrete)) 35 | action.append(x % space.n) 36 | x = int(x / space.n) 37 | action = tuple(action) 38 | assert(tuple_space.contains(action)) 39 | return action 40 | 41 | def tuple2discrete4obj(x, old_space_obj, new_space_obj): 42 | assert(False) 43 | 44 | def tuple2discrete4class(tuple_space): 45 | n = 1 46 | for space in tuple_space.spaces: 47 | assert(isinstance(space, Discrete)) 48 | n *= space.n 49 | return Discrete(n) 50 | 51 | def box2discrete4obj(x, box_space, discrete_space): 52 | assert(False) 53 | 54 | def discrete2box4obj(x, discrete_space, box_space): 55 | ret = np.zeros(discrete_space.n) 56 | ret[x] = 1.0 57 | return ret 58 | 59 | def discrete2box4class(discrete_space): 60 | return Box(0.0, 1.0, discrete_space.n) 61 | 62 | def ident4obj(x, old_space_obj, new_space_obj): 63 | return x 64 | 65 | class SpaceConversionEnv(Env): 66 | convertable = {(Tuple, Discrete): (tuple2discrete4obj, discrete2tuple4obj, tuple2discrete4class), \ 67 | (Discrete, Box): (discrete2box4obj, box2discrete4obj, discrete2box4class), \ 68 | (Box, Box): (box2box4obj, box2box4obj, box2box4class)} 69 | 70 | def __init__(self, env, target_observation_space=None, target_action_space=None, verbose=False): 71 | self._verbose = verbose 72 | self._env = env 73 | self.action_convert = None 74 | self.observation_convert = None 75 | for pairs, convert in self.convertable.iteritems(): 76 | if env.action_space.__class__ == pairs[0] and \ 77 | target_action_space == pairs[1] and \ 78 | self.action_convert is None: 79 | self.action_convert = convert[1] 80 | self._action_space_ = convert[2](env.action_space) 81 | if env.observation_space.__class__ == pairs[0] and \ 82 | target_observation_space == pairs[1] and \ 83 | self.observation_convert is None: 84 | self.observation_convert = convert[0] 85 | self._observation_space_ = convert[2](env.observation_space) 86 | 87 | if self.action_convert is None and \ 88 | (self.action_space.__class__ == target_action_space or 89 | target_action_space is None): 90 | self.action_convert = ident4obj 91 | self._action_space = env.action_space 92 | if self.observation_convert is None and \ 93 | (self.observation_space.__class__ == target_observation_space or \ 94 | target_observation_space is None): 95 | self.observation_convert = ident4obj 96 | self._observation_space = env.observation_space 97 | 98 | assert(self.action_convert is not None) 99 | assert(self.observation_convert is not None) 100 | 101 | def step(self, action, **kwargs): 102 | conv_action = self.action_convert(action, self.action_space, 
self._env.action_space) 103 | if self._verbose and self.action_convert != ident4obj: 104 | print("Input action: %s, converted action: %s" % (action, conv_action)) 105 | step = self._env.step(conv_action, **kwargs) 106 | observation, reward, done, info = step 107 | 108 | conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space) 109 | 110 | if self._verbose and self.observation_convert != ident4obj: 111 | print("Input observation: %s, converted observation: %s" % (observation, conv_observation)) 112 | return conv_observation, reward, done, {} 113 | 114 | def reset(self, **kwargs): 115 | observation = self._env.reset(**kwargs) 116 | conv_observation = self.observation_convert(observation, self._env.observation_space, self.observation_space) 117 | 118 | if self._verbose and self.observation_convert != ident4obj: 119 | print("Input observation: %s, converted observation: %s" % (observation, conv_observation)) 120 | return conv_observation 121 | 122 | @property 123 | def action_space(self): 124 | return self._action_space_ 125 | 126 | @property 127 | def observation_space(self): 128 | return self._observation_space_ 129 | 130 | def __getattr__(self, field): 131 | """ 132 | proxy everything to underlying env 133 | """ 134 | if hasattr(self._env, field): 135 | return getattr(self._env, field) 136 | raise AttributeError(field) 137 | 138 | def __repr__(self): 139 | if "object at" not in str(self._env): 140 | env_name = str(env._env) 141 | else: 142 | env_name = self._env.__class__.__name__ 143 | return env_name 144 | -------------------------------------------------------------------------------- /trpo_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import os 4 | import logging 5 | import csv 6 | import sys 7 | import tempfile 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import gym 12 | from gym import envs, scoreboard 13 | from gym.spaces import Discrete, Box 14 | import prettytensor as pt 15 | 16 | from value_function import ValueFunction 17 | from space_conversion import SpaceConversionEnv 18 | 19 | DTYPE = tf.float32 20 | RENDER_EVERY = None 21 | MONITOR = True 22 | 23 | logger = logging.getLogger('trpo_agent') 24 | logger.setLevel(logging.INFO) 25 | 26 | # Sample from the probability distribution. 27 | def cat_sample(prob_nk): 28 | assert prob_nk.ndim == 2 29 | N = prob_nk.shape[0] 30 | csprob_nk = np.cumsum(prob_nk, axis=1) 31 | out = np.zeros(N, dtype='i') 32 | for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)): 33 | for (k, csprob) in enumerate(csprob_k): 34 | if csprob > r: 35 | out[n] = k 36 | break 37 | return out 38 | 39 | def write_csv(file_name, *arrays): 40 | with open(file_name, 'wb') as csvfile: 41 | writer = csv.writer(csvfile, delimiter=',') 42 | for row in zip(*arrays): 43 | writer.writerow(row) 44 | 45 | # same as tf.gradients but returns flat tensor. 46 | def flat_gradients(loss, var_list): 47 | grads = tf.gradients(loss, var_list) 48 | return tf.concat(0, [tf.reshape(grad, [np.prod(var_shape(v))]) 49 | for (v, grad) in zip(var_list, grads)]) 50 | 51 | def discount_rewards(r, gamma): 52 | """ take 1D float array of rewards and compute discounted reward """ 53 | discounted_r = np.zeros_like(r) 54 | running_add = 0 55 | for t in reversed(xrange(0, r.size)): 56 | #if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!) 
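# discounted_r[t] = r[t] + gamma * discounted_r[t + 1];
# e.g. gamma = 0.99 and r = [1, 1, 1] gives [2.9701, 1.99, 1.0]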
57 | running_add = running_add * gamma + r[t] 58 | discounted_r[t] = running_add 59 | return discounted_r 60 | 61 | 62 | # Math craziness. Implements a conjugate gradient algorithm. In short, solves 63 | # Ax = b for x having only a function x -> Ax (f_Ax) and b. 64 | def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10): 65 | p = b.copy() 66 | r = b.copy() 67 | x = np.zeros_like(b) 68 | rdotr = r.dot(r) 69 | for i in xrange(cg_iters): 70 | z = f_Ax(p) 71 | v = rdotr / p.dot(z) 72 | x += v * p 73 | r -= v * z 74 | newrdotr = r.dot(r) 75 | mu = newrdotr / rdotr 76 | p = r + mu * p 77 | rdotr = newrdotr 78 | if rdotr < residual_tol: 79 | break 80 | return x 81 | 82 | # Simple line search algorithm. That is, having objective f and initial value 83 | # x search along the max_step vector (shrinking it exponentially) until we find 84 | # an improvement in f. Start with a max step and shrink it exponentially until 85 | # there is an improvement. 86 | def line_search(f, x, max_step): 87 | max_shrinks = 100 88 | shrink_multiplier = 0.9 89 | fval = f(x) 90 | step_frac = 1.0 91 | while max_shrinks > 0: 92 | xnew = x + step_frac * max_step 93 | newfval = f(xnew) 94 | if fval - newfval > 0: 95 | return xnew 96 | else: 97 | max_shrinks -= 1 98 | step_frac *= shrink_multiplier 99 | logger.info("Can not find an improvement with line search") 100 | return x 101 | 102 | def var_shape(x): 103 | out = [k.value for k in x.get_shape()] 104 | assert all(isinstance(a, int) for a in out), \ 105 | "shape function assumes that shape is fully known" 106 | return out 107 | 108 | # Learning agent. Encapsulates training and prediction. 109 | class TRPOAgent(object): 110 | def __init__(self, env, H, timesteps_per_batch, learning_rate, gamma, layers, dropout, max_iterations): 111 | if not isinstance(env.observation_space, Box) or \ 112 | not isinstance(env.action_space, Discrete): 113 | logger.error("Incompatible spaces.") 114 | exit(-1) 115 | 116 | self.H = H 117 | self.timesteps_per_batch = timesteps_per_batch 118 | self.learning_rate = learning_rate 119 | self.gamma = gamma 120 | self.layers = layers 121 | self.dropout = dropout 122 | self.env = env 123 | self.max_iterations = max_iterations 124 | self.session = tf.Session() 125 | 126 | # Full state used for next action prediction. Contains current 127 | # observation, previous observation and previous action. 128 | self.obs = tf.placeholder( 129 | DTYPE, 130 | shape=[None, 2 * env.observation_space.shape[0] + env.action_space.n], 131 | name="obs") 132 | self.prev_obs = np.zeros(env.observation_space.shape[0]) 133 | self.prev_action = np.zeros(env.action_space.n) 134 | 135 | # One hot encoding of the actual action taken. 136 | self.action = tf.placeholder(DTYPE, 137 | shape=[None, env.action_space.n], 138 | name="action") 139 | # Advantage, obviously. 140 | self.advant = tf.placeholder(DTYPE, shape=[None], name="advant") 141 | # Old policy prediction. 
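# (the action distribution recorded when the batch was collected; it is the denominator of the
# importance-sampling ratio in the surrogate loss and the reference distribution in the
# KL-divergence term that constrains the size of the update)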
142 | self.prev_policy = tf.placeholder(DTYPE, 143 | shape=[None, env.action_space.n], 144 | name="prev_policy") 145 | 146 | self.policy_network = pt.wrap(self.obs) 147 | for _ in xrange(self.layers): 148 | self.policy_network = (self.policy_network 149 | .fully_connected(H, activation_fn=tf.nn.tanh) 150 | # .dropout(self.dropout) 151 | ) 152 | 153 | self.policy_network, _ = (self.policy_network 154 | .softmax_classifier(env.action_space.n)) 155 | self.returns = tf.placeholder(DTYPE, 156 | shape=[None, env.action_space.n], 157 | name="returns") 158 | 159 | loss = - tf.reduce_mean( 160 | tf.reduce_sum(tf.mul(self.action, 161 | tf.div(self.policy_network, 162 | self.prev_policy)), 1) * self.advant) 163 | self.loss = loss 164 | self.train = tf.train.AdamOptimizer().minimize(loss) 165 | 166 | # Start of TRPO/Fisher/conjugate gradients util code 167 | 168 | # get all trainable variable names. 169 | var_list = tf.trainable_variables() 170 | 171 | # define a function to get all trainable variables in a flat form. 172 | def get_variables_flat_form(): 173 | op = tf.concat( 174 | 0, [tf.reshape(v, [np.prod(var_shape(v))]) for v in var_list]) 175 | return op.eval(session=self.session) 176 | self.get_variables_flat_form = get_variables_flat_form 177 | 178 | # define a function to set all trainable variables from a flat tensor theta. 179 | def create_set_variables_from_flat_form_function(): 180 | assigns = [] 181 | shapes = map(var_shape, var_list) 182 | total_size = sum(np.prod(shape) for shape in shapes) 183 | theta_in = tf.placeholder(DTYPE, [total_size]) 184 | start = 0 185 | assigns = [] 186 | for (shape, v) in zip(shapes, var_list): 187 | size = np.prod(shape) 188 | assigns.append( 189 | tf.assign( 190 | v, 191 | tf.reshape( 192 | theta_in[ 193 | start:start + 194 | size], 195 | shape))) 196 | start += size 197 | op = tf.group(*assigns) 198 | 199 | def set_variables_from_flat_form(theta): 200 | return self.session.run(op, feed_dict={theta_in: theta}) 201 | return set_variables_from_flat_form 202 | 203 | self.set_variables_from_flat_form = create_set_variables_from_flat_form_function() 204 | 205 | # get operation to calculate all gradients (that is, find derivatives with 206 | # respect to var_list). 207 | self.policy_gradients_op = flat_gradients(loss, var_list) 208 | 209 | # get a KL divergence. Please note that we use stop_gradients here because 210 | # we don't care about prev_policy gradients and shouldn't update the 211 | # the prev_policy at all. 212 | self.kl_divergence_op = tf.reduce_sum( 213 | tf.stop_gradient(self.prev_policy) * 214 | tf.log((tf.stop_gradient(self.prev_policy) + 1e-8) / 215 | (self.policy_network + 1e-8))) / tf.cast(tf.shape(self.obs)[0], DTYPE) 216 | 217 | # As before, get an op to find derivatives. 218 | kl_divergence_gradients_op = tf.gradients(self.kl_divergence_op, var_list) 219 | 220 | # this is a flat representation of the variable that we are going to use in 221 | # our Fisher product (that is, in function y -> A*y where A is Fisher matrix, 222 | # flat_multiplier_tensor is our y) 223 | self.flat_multiplier_tensor = tf.placeholder(DTYPE, shape=[None]) 224 | 225 | # Do the actual multiplication. Some shape shifting magic. 
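# Split the flat multiplier y back into per-variable tensors, multiply elementwise with the
# corresponding KL gradients and sum, giving the scalar (grad KL) . y; differentiating that scalar
# once more below yields the Fisher-vector product F*y without ever forming F explicitly.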
227 |         start = 0
228 |         multiplier_parts = []
229 |         for var in var_list:
230 |             shape = var_shape(var)
231 |             size = np.prod(shape)
232 |             part = tf.reshape(self.flat_multiplier_tensor[start:(start + size)], shape)
233 |             multiplier_parts.append(part)
234 |             start += size
235 |
236 |         product_op_list = [tf.reduce_sum(kl_gradient * multiplier) for (kl_gradient, multiplier) in zip(kl_divergence_gradients_op, multiplier_parts)]
237 |
238 |         # Second derivative: differentiating sum(grad(KL) * y) again gives the Fisher-vector product A*y.
239 |         self.fisher_product_op_list = flat_gradients(product_op_list, var_list)
240 |
241 |         # End of TRPO/Fisher/conjugate gradients util code
242 |
243 |
244 |
245 |         features_count = 2 * env.observation_space.shape[0] + env.action_space.n + 2
246 |         self.value_function = ValueFunction(self.session,
247 |                                             features_count,
248 |                                             learning_rate=1e-3,
249 |                                             epochs=50,
250 |                                             dropout=0.5)
251 |         self.session.run(tf.initialize_all_variables())
252 |
253 |     def rollout(self, max_pathlength, timesteps_per_batch, render=False):
254 |         paths = []
255 |         timesteps_sofar = 0
256 |         while timesteps_sofar < timesteps_per_batch:
257 |
258 |             obs, actions, rewards, action_dists, actions_one_hot = [], [], [], [], []
259 |             features_list = []
260 |             ob = self.env.reset()
261 |             self.prev_action *= 0.0
262 |             self.prev_obs *= 0.0
263 |             for x in xrange(max_pathlength):
264 |                 if render:
265 |                     self.env.render()
266 |                 features = np.concatenate([ob, self.prev_obs, self.prev_action], 0)
267 |                 action_dist_n = self.session.run(self.policy_network,
268 |                                                  {self.obs: np.expand_dims(features, 0)})
269 |
270 |                 action = int(cat_sample(action_dist_n)[0])
271 |                 self.prev_obs = ob
272 |                 self.prev_action *= 0.0
273 |                 self.prev_action[action] = 1.0
274 |
275 |                 obs.append(ob)
276 |                 actions.append(action)
277 |                 action_dists.append(action_dist_n)
278 |                 actions_one_hot.append(np.copy(self.prev_action))
279 |                 features_list.append(features)
280 |
281 |                 res = list(self.env.step(action))
282 |                 rewards.append(res[1])
283 |                 ob = res[0]
284 |
285 |                 if res[2]:
286 |                     path = {"obs": np.concatenate(np.expand_dims(obs, 0)),
287 |                             "action_dists": np.concatenate(action_dists),
288 |                             "rewards": np.array(rewards),
289 |                             "actions": np.array(actions),
290 |                             "actions_one_hot": np.array(actions_one_hot),
291 |                             "features": np.array(features_list)}
292 |                     paths.append(path)
293 |                     self.prev_action *= 0.0
294 |                     self.prev_obs *= 0.0
295 |                     timesteps_sofar += len(path["rewards"])
296 |                     break
297 |             else:
298 |                 timesteps_sofar += max_pathlength
299 |         return paths
300 |
301 |     def learn(self):
302 |         self.current_observation = self.env.reset()
303 |
304 |         xs,hs,dlogps,drs = [],[],[],[]
305 |
306 |         running_reward = None
307 |         reward_sum = 0
308 |         episode_number = 0
309 |         current_step = 0.0
310 |         iteration_number = 0
311 |         discounted_eprs = []
312 |         mean_path_lens = []
313 |         value_function_losses = []
314 |
315 |         while True:
316 |             render = RENDER_EVERY is not None and iteration_number % RENDER_EVERY == 0
317 |             paths = self.rollout(max_pathlength=10000,
318 |                                  timesteps_per_batch=self.timesteps_per_batch,
319 |                                  render=render)
320 |
321 |             for path in paths:
322 |                 path["baseline"] = self.value_function.predict(path)
323 |                 path["returns"] = discount_rewards(path["rewards"], self.gamma)
324 |                 path["advant"] = path["returns"] - path["baseline"]
325 |
326 |             value_function_losses.append(self.value_function.validate(paths))
327 |             # features = np.concatenate([self.prepare_features(path) for path in paths])
328 |             features = np.concatenate([path["features"] for path in paths])
329 |
330 |             advant = np.concatenate([path["advant"] for path in paths])
331 |             advant -= advant.mean()
332 |             advant /= (advant.std() + 1e-8)
333 |
334 |             actions = np.concatenate([path["actions_one_hot"] for path in paths])
335 |
336 |             prev_policy = np.concatenate([path["action_dists"] for path in paths])
337 |             self.value_function.fit(paths)
338 |
339 |             # Start of conjugate gradient magic.
340 |
341 |             # Get current theta (weights).
342 |             previous_parameters_flat = self.get_variables_flat_form()
343 |
344 |             feed_dict = {self.obs: features,
345 |                          self.advant: advant,
346 |                          self.action: actions,
347 |                          self.prev_policy: prev_policy}
348 |
349 |             # This is a function that multiplies a vector by the (damped) Fisher
350 |             # matrix. Used by the conjugate gradient algorithm.
351 |             def fisher_vector_product(multiplier):
352 |                 feed_dict[self.flat_multiplier_tensor] = multiplier
353 |                 conjugate_gradients_damping = 0.1
354 |                 return self.session.run(self.fisher_product_op_list, feed_dict) + conjugate_gradients_damping * multiplier
355 |
356 |             policy_gradients = self.session.run(self.policy_gradients_op, feed_dict)
357 |
358 |             # Run the conjugate gradient algorithm to get the step direction s, i.e. solve A*s = -g.
359 |             step_direction = conjugate_gradient(fisher_vector_product, -policy_gradients)
360 |
361 |             # Calculate $s^{T}As$.
362 |             hessian_vector_product = step_direction.dot(fisher_vector_product(step_direction))
363 |             max_kl = 0.01
364 |
365 |             # This is our \beta: from 0.5 * \beta^2 * s^T A s <= max_kl we get \beta = sqrt(2 * max_kl / (s^T A s)).
366 |             max_step_length = np.sqrt(2 * max_kl / hessian_vector_product)
367 |             max_step = max_step_length * step_direction
368 |
369 |             def get_loss_for(weights_flat):
370 |                 self.set_variables_from_flat_form(weights_flat)
371 |                 loss = self.session.run(self.loss, feed_dict)
372 |                 kl_divergence = self.session.run(self.kl_divergence_op, feed_dict)
373 |                 if kl_divergence > max_kl:
374 |                     logger.info("Hit the safeguard: %s", kl_divergence)
375 |                     return float('inf')
376 |                 else:
377 |                     return loss
378 |
379 |             # Line search along the step direction, shrinking until the surrogate loss improves.
380 |             new_weights = line_search(get_loss_for, previous_parameters_flat, max_step)
381 |
382 |             self.set_variables_from_flat_form(new_weights)
383 |
384 |             # End of conjugate gradient magic.
385 |
386 |             iteration_number += 1
387 |
388 |             mean_path_len = np.mean([len(path['rewards']) for path in paths])
389 |             mean_path_lens.append(mean_path_len)
390 |             logger.info("Iteration %s mean_path_len: %s", iteration_number, mean_path_len)
391 |             if iteration_number > self.max_iterations:
392 |                 paths = self.rollout(max_pathlength=10000, timesteps_per_batch=40000)
393 |                 ret = np.mean([len(path['rewards']) for path in paths]), np.mean(mean_path_lens)
394 |                 logger.info("Validation result: %s", ret[0])
395 |                 if not MONITOR:
396 |                     write_csv('/tmp/out.csv', mean_path_lens, value_function_losses)
397 |                 return ret
398 |
399 | if __name__ == '__main__':
400 |     seed = 1
401 |     random.seed(seed)
402 |     np.random.seed(seed)
403 |     tf.set_random_seed(seed)
404 |
405 |     env_name = "CartPole-v0" if len(sys.argv) < 2 else sys.argv[1]
406 |     max_iterations = 100 if len(sys.argv) < 3 else int(sys.argv[2])
407 |
408 |     env = gym.make(env_name)
409 |     env = SpaceConversionEnv(env, Box, Discrete)
410 |
411 |     if MONITOR:
412 |         training_dir = tempfile.mkdtemp()
413 |         env.monitor.start(training_dir)
414 |
415 |     agent = TRPOAgent(env,
416 |                       H=109,
417 |                       timesteps_per_batch=1369,
418 |                       learning_rate=0.028609296254614544,
419 |                       gamma=0.9914327475117531,
420 |                       layers=1,
421 |                       dropout=0.5043049954791183,
422 |                       max_iterations=max_iterations)
423 |     agent.learn()
424 |     if MONITOR:
425 |         env.monitor.close()
426 |         gym.upload(training_dir, api_key='sk_lgS7sCv1Qxq5HFMdQXR6Sw')
427 |
--------------------------------------------------------------------------------
/trpo_caesar.py:
--------------------------------------------------------------------------------
1 | import random
2 | import tempfile
3 |
4 | import gym
5 | import numpy as np
6 | import tensorflow as tf
7 | from gym.envs.algorithmic.algorithmic_env import ha, AlgorithmicEnv
8 |
9 | import trpo_agent
10 | import caesar
11 | import space_conversion
12 |
13 | class PredefinedStringEnv(AlgorithmicEnv):
14 |     def __init__(self, input_data_string, output_data_string):
15 |         self.input_data_string = input_data_string
16 |         self.output_data_string = output_data_string
17 |         AlgorithmicEnv.__init__(self,
18 |                                 inp_dim=1,
19 |                                 base=26,
20 |                                 chars=True)
21 |
22 |
23 |
24 |     def set_data(self):
25 |         self.total_len = len(self.input_data_string)
26 |         self.content = {}
27 |         self.target = {}
28 |         for i in range(self.total_len):
29 |             self.content[ha(np.array([i]))] = ord(self.input_data_string[i])-ord('a')
30 |             self.target[i] = ord(self.output_data_string[i]) - ord('a')
31 |         self.total_reward = self.total_len
32 |
33 | def use_agent_for_decoding(agent):
34 |     training_dir = tempfile.mkdtemp()
35 |
36 |     for line in caesar.this.s.lower().split('\n'):
37 |         cleaned_line = ''.join(x for x in line if ord('a') <= ord(x) <= ord('z'))
38 |         decoded_cleaned_line = ''.join(caesar.this.d[x] for x in line if ord('a') <= ord(x) <= ord('z'))
39 |
40 |         env = PredefinedStringEnv(cleaned_line, decoded_cleaned_line)
41 |         env = space_conversion.SpaceConversionEnv(env,
42 |                                                   gym.spaces.Box,
43 |                                                   gym.spaces.Discrete)
44 |         env.monitor.start(training_dir, resume=True, video_callable=lambda _: True)
45 |         agent.env = env
46 |         agent.rollout(10000, len(cleaned_line))
47 |         env.monitor.close()
48 |
49 |
50 | if __name__ == '__main__':
51 |     seed = 1
52 |     random.seed(seed)
53 |     np.random.seed(seed)
54 |     tf.set_random_seed(seed)
55 |
56 |     env_name = "Caesar-v0"
57 |     max_iterations = 1000
58 |
59 |     env = gym.make(env_name)
60 |     env = space_conversion.SpaceConversionEnv(env,
61 |                                               gym.spaces.Box,
62 |                                               gym.spaces.Discrete)
63 |
64 |     training_dir = tempfile.mkdtemp()
65 |     env.monitor.start(training_dir)
66 |
67 |     agent = trpo_agent.TRPOAgent(
68 |         env,
69 |         H=309,
70 |         timesteps_per_batch=1369,
71 |         learning_rate=0.028609296254614544,
72 |         gamma=0.9914327475117531,
73 |         layers=1,
74 |         dropout=0.5043049954791183,
75 |         max_iterations=max_iterations)
76 |     agent.learn()
77 |     env.monitor.close()
78 |
79 |     use_agent_for_decoding(agent)
80 |
--------------------------------------------------------------------------------
/value_function.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import prettytensor as pt
3 | import numpy as np
4 |
5 |
6 | # Separate network to approximate the value function. Variables need to be
7 | # initialized after this is instantiated.
8 | class ValueFunction(object):
9 |     def __init__(self, session, features_count, learning_rate, epochs, dropout):
10 |         self.features_count = features_count
11 |         self.learning_rate = learning_rate
12 |         self.epochs = epochs
13 |         self.session = session
14 |         self.dropout = dropout
15 |         self.x = tf.placeholder(tf.float32, shape=[None, features_count], name="x")
16 |         self.y = tf.placeholder(tf.float32, shape=[None], name="y")
17 |         self.net = (pt.wrap(self.x)
18 |                     .fully_connected(64, activation_fn=tf.nn.relu)
19 |                     .dropout(self.dropout)
20 |                     .fully_connected(64, activation_fn=tf.nn.relu)
21 |                     .dropout(self.dropout)
22 |                     .fully_connected(1))
23 |         self.net = tf.reshape(self.net, (-1, ))
24 |         self.l2 = (self.net - self.y) * (self.net - self.y)
25 |         self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.l2)
26 |
27 |     def prepare_features(self, path):
28 |         obs = path["obs"]
29 |         prev_obs = np.concatenate([np.zeros((1, obs.shape[1])), obs[:-1]], 0)  # previous observation: shift by one step
30 |         prev_action = np.concatenate([np.zeros((1, path['action_dists'].shape[1])), path['action_dists'][:-1]], 0)  # previous action distribution
31 |         l = len(path["rewards"])
32 |         arange = np.arange(l).reshape(-1, 1)
33 |         ret = np.concatenate([obs, prev_obs, prev_action, arange, np.ones((l, 1))], axis=1)
34 |         return ret
35 |
36 |     def fit(self, paths):
37 |         features = np.concatenate([self.prepare_features(path) for path in paths])
38 |         returns = np.concatenate([path["returns"] for path in paths])
39 |         for _ in range(self.epochs):
40 |             self.session.run(self.train, {self.x: features, self.y: returns})
41 |
42 |     def validate(self, paths):
43 |         features = np.concatenate([self.prepare_features(path) for path in paths])
44 |         returns = np.concatenate([path["returns"] for path in paths])
45 |         return np.mean(self.session.run(self.l2, {self.x: features, self.y: returns}))
46 |
47 |     def predict(self, path):
48 |         features = self.prepare_features(path)
49 |         return self.session.run(self.net, {self.x: features})
50 |
--------------------------------------------------------------------------------
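
A minimal standalone sketch of the two numerical pieces trpo_agent.py relies on: conjugate_gradient solves A*s = -g using only a matrix-vector product, and the TRPO step size is beta = sqrt(2 * max_kl / s^T A s). This is not part of the repository; a random symmetric positive-definite matrix stands in for the damped Fisher matrix, and names such as check_trpo_step, a, and g are illustrative only.

import numpy as np


def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Mirrors the routine in trpo_agent.py so the sketch is self-contained.
    p = b.copy()
    r = b.copy()
    x = np.zeros_like(b)
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        v = rdotr / p.dot(z)
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        mu = newrdotr / rdotr
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x


def check_trpo_step(dim=5, max_kl=0.01, seed=0):
    rng = np.random.RandomState(seed)
    m = rng.randn(dim, dim)
    a = m.dot(m.T) + 0.1 * np.eye(dim)  # symmetric positive definite, standing in for the damped Fisher matrix
    g = rng.randn(dim)                  # stands in for the flattened policy gradient

    # Solve A * s = -g using only matrix-vector products.
    s = conjugate_gradient(lambda v: a.dot(v), -g, cg_iters=dim)
    print("CG residual norm: %s" % np.linalg.norm(a.dot(s) + g))

    # Step size from the quadratic KL constraint: 0.5 * beta^2 * s^T A s <= max_kl.
    beta = np.sqrt(2 * max_kl / s.dot(a.dot(s)))
    print("max step length beta: %s" % beta)
    return beta * s  # corresponds to max_step, which is handed to line_search


if __name__ == '__main__':
    check_trpo_step()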