├── agent ├── __init__.py ├── agent_parallel.py ├── agent_continous_image.py ├── agent_continous.py ├── agent_continous_rnn.py ├── agent_cotinous_single_thread.py ├── AC_agent_continous.py ├── agent_base.py ├── agent_discrete.py ├── agent_continous_parallel_storage.py ├── agent_continous_single_process.py └── agent_continous_image_parallel_image.py ├── logger ├── __init__.py └── logger.py ├── baseline ├── __init__.py ├── baseline_zeros.py ├── baseline_average_reward.py ├── baseline_lstsq.py ├── baseline_tensorflow.py └── baseline_tf_image.py ├── experiment ├── __init__.py ├── main.py ├── main_lstm.py ├── main_ac.py ├── main_discrete.py ├── main_image.py ├── main_tf_parallel.py ├── main_multi_thread.py ├── main_image_multi_process.py └── main_multi_process.py ├── network ├── __init__.py ├── network_descrete.py ├── network_continous.py ├── network_continous_image.py └── network_continous_rnn.py ├── storage ├── __init__.py ├── storage.py ├── storage_image.py ├── storage_continous.py ├── storage_continous_parallel.py └── storage_continous_parallel_image.py ├── distribution ├── __init__.py ├── diagonal_category.py └── diagonal_gaussian.py ├── .gitignore ├── run.py ├── dealImage.py ├── parameters.py~ ├── README.md ├── environment.py ├── parameters.py ├── utils.py └── krylov.py /agent/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /network/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /storage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /distribution/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /network/network_descrete.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs_* 2 | *.pyc 3 | *.swp 4 | checkpoint/ 5 | checkpoint_parallel/ 6 | log/ 7 | .idea/ 8 | .idea 9 | -------------------------------------------------------------------------------- /baseline/baseline_zeros.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Baseline(object): 5 | def fit(self, paths): 6 | self.temp = 0 7 | 8 | def predict(self, path): 9 | return np.zeros(len(path["rewards"])) -------------------------------------------------------------------------------- /baseline/baseline_average_reward.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaselineAverageReward(object): 5 | def fit(self, paths): 6 | self.temp = 0 7 | 8 | def predict(self, path): 9 | rewards = path["rewards"] 10 | mean_rewards = np.mean(rewards) 11 | return mean_rewards -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | tasks = ["Copy-v0", "DuplicatedInput-v0", "Reverse-v0", "RepeatCopy-v0"] 4 | 5 | os.system("rm logs_*") 6 | os.system("k screen") 7 | os.system("screen -wipe") 8 | 9 | 10 | for t in tasks: 11 | os.system("screen -dm -S trpo_%s bash -c '. ~/.profile; . ~/.bashrc; CUDA_VISIBLE_DEVICES=[] python main.py %s 2>&1 | tee logs_%s ; bash'" % (t, t, t)) 12 | -------------------------------------------------------------------------------- /experiment/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | from environment import Environment 4 | from agent.agent_continous import TRPOAgent 5 | from parameters import pms 6 | 7 | if not os.path.isdir("./checkpoint"): 8 | os.makedirs("./checkpoint") 9 | if not os.path.isdir("./log"): 10 | os.makedirs("./log") 11 | env = Environment(gym.make(pms.environment_name)) 12 | agent = TRPOAgent(env) 13 | 14 | if pms.train_flag: 15 | agent.learn() 16 | else: 17 | agent.test(pms.checkpoint_file) 18 | # env.monitor.close() 19 | # gym.upload(training_dir, 20 | # algorithm_id='trpo_ff') 21 | -------------------------------------------------------------------------------- /experiment/main_lstm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | from environment import Environment 4 | from agent.agent_continous_rnn import TRPOAgent 5 | from parameters import pms 6 | 7 | if not os.path.isdir("./checkpoint"): 8 | os.makedirs("./checkpoint") 9 | if not os.path.isdir("./log"): 10 | os.makedirs("./log") 11 | env = Environment(gym.make(pms.environment_name)) 12 | agent = TRPOAgent(env) 13 | 14 | if pms.train_flag: 15 | agent.learn() 16 | else: 17 | agent.test(pms.checkpoint_file) 18 | # env.monitor.close() 19 | # gym.upload(training_dir, 20 | # algorithm_id='trpo_ff') 21 | -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import time 3 | 4 | class Logger(object): 5 | def __init__(self, head): 6 | self.head = [] 7 | self.file_name = self.get_file_name() 8 | self.csvfile = file("log/"+self.file_name , 'wb') 9 | self.csv_writer = csv.writer(self.csvfile) 10 | self.log_row(head) 11 | 12 | def log_row(self, data): 13 | self.csv_writer.writerow(data) 14 | 15 | def get_file_name(self): 16 | file_time = time.strftime("%Y-%m-%d-%H:%M:%S",time.localtime(time.time())) 17 | file_name = file_time+".csv" 18 | return file_name 19 | 20 | def __del__(self): 21 | self.csvfile.close() -------------------------------------------------------------------------------- /experiment/main_ac.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import gym 4 | from gym import envs, scoreboard 5 | from gym.spaces import Discrete, Box 6 | import tempfile 7 | import sys 8 | from environment import Environment 9 | from agent.AC_agent_continous import ACAgent 
10 | from parameters import pms 11 | 12 | if not os.path.isdir("./checkpoint"): 13 | os.makedirs("./checkpoint") 14 | if not os.path.isdir("./log"): 15 | os.makedirs("./log") 16 | env = Environment(gym.make(pms.environment_name)) 17 | agent = ACAgent(env) 18 | 19 | if pms.train_flag: 20 | agent.learn() 21 | else: 22 | agent.test(pms.checkpoint_file) 23 | # env.monitor.close() 24 | # gym.upload(training_dir, 25 | # algorithm_id='trpo_ff') 26 | -------------------------------------------------------------------------------- /experiment/main_discrete.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import gym 4 | from gym import envs, scoreboard 5 | from gym.spaces import Discrete, Box 6 | import tempfile 7 | import sys 8 | from environment import Environment 9 | from agent.agent_discrete import TRPOAgent 10 | from parameters import pms 11 | 12 | if not os.path.isdir("./checkpoint"): 13 | os.makedirs("./checkpoint") 14 | if not os.path.isdir("./log"): 15 | os.makedirs("./log") 16 | env = Environment(gym.make(pms.environment_name)) 17 | agent = TRPOAgent(env) 18 | 19 | if pms.train_flag: 20 | agent.learn() 21 | else: 22 | agent.test(pms.checkpoint_file) 23 | # env.monitor.close() 24 | # gym.upload(training_dir, 25 | # algorithm_id='trpo_ff') 26 | -------------------------------------------------------------------------------- /experiment/main_image.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import gym 4 | from gym import envs, scoreboard 5 | from gym.spaces import Discrete, Box 6 | import tempfile 7 | import sys 8 | from environment import Environment 9 | from agent.agent_continous_image import TRPOAgent 10 | from parameters import pms 11 | 12 | if not os.path.isdir("./checkpoint"): 13 | os.makedirs("./checkpoint") 14 | if not os.path.isdir("./log"): 15 | os.makedirs("./log") 16 | env = Environment(gym.make(pms.environment_name)) 17 | agent = TRPOAgent(env) 18 | 19 | if pms.train_flag: 20 | agent.learn() 21 | else: 22 | agent.test(pms.checkpoint_file) 23 | # env.monitor.close() 24 | # gym.upload(training_dir, 25 | # algorithm_id='trpo_ff') 26 | -------------------------------------------------------------------------------- /dealImage.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Qt4Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def vis_square(data, padsize=1, padval=0): 8 | data -= data.min() 9 | data /= data.max() 10 | 11 | 12 | n = int(np.ceil(np.sqrt(data.shape[0]))) 13 | padding = ((0, n ** 2 - data.shape[0]), (0, padsize), (0, padsize)) + ((0, 0),) * (data.ndim - 3) 14 | data = np.pad(data, padding, mode='constant', constant_values=(padval, padval)) 15 | 16 | 17 | data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1))) 18 | data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:]) 19 | print data.shape 20 | plt.imshow(data) 21 | plt.show() -------------------------------------------------------------------------------- /parameters.py~: -------------------------------------------------------------------------------- 1 | # for image 2 | dims = (100, 100) 3 | obs_height = 100 4 | obs_width = 100 5 | obs_channel = 1 6 | history_number = 2 7 | 8 | # for trainning 9 | <<<<<<< HEAD 10 | jobs = 4 11 | 12 | ======= 13 | jobs = 2 14 | >>>>>>> 
0356c098856467ec6db97061e73187c6a18a25a7 15 | max_iter_number = 10000 16 | paths_number = 1 17 | max_path_length = 199 18 | batch_size = max_path_length 19 | max_kl = 0.01 20 | gae_lambda = 1.0 21 | subsample_factor = 0.8 22 | cg_damping = 0.1 23 | discount = 0.99 24 | cg_iters = 10 25 | deviation = 0.1 26 | render = True 27 | train_flag = False 28 | iter_num_per_train = 1 29 | checkpoint_file = "checkpoint/iter240865.ckpt" 30 | record_movie = False 31 | upload_to_gym = False 32 | 33 | # for environment 34 | 35 | environment_name = "Pendulum-v0" 36 | 37 | # for continous action 38 | min_std = 1e-6 39 | center_adv = True 40 | positive_adv = False 41 | use_std_network = False 42 | std = 1.1 43 | obs_shape = 3 44 | action_shape = 1 45 | min_a = -2.0 46 | max_a = 2.0 47 | 48 | -------------------------------------------------------------------------------- /distribution/diagonal_category.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | class DiagonalCategory(object): 7 | def __init__(self, dim=0): 8 | self._dim = dim 9 | 10 | @property 11 | def dim(self): 12 | return self._dim 13 | 14 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 15 | return tf.reduce_mean(old_dist_info_vars * tf.log((old_dist_info_vars + 1e-8) / (new_dist_info_vars + 1e-8))) 16 | 17 | def likelihood_ratio_sym(self, x_var, new_dist_info_vars, old_dist_info_vars): 18 | """ 19 | \frac{\pi_\theta}{\pi_{old}} 20 | :param x_var: actions 21 | :param new_dist_info_vars: means + logstds 22 | :param old_dist_info_vars: old_means + old_logstds 23 | :return: 24 | """ 25 | N = tf.shape(x_var)[0] 26 | p_n = slice_2d(new_dist_info_vars, tf.range(0, N), x_var) 27 | oldp_n = slice_2d(old_dist_info_vars, tf.range(0, N), x_var) 28 | return p_n / oldp_n 29 | 30 | def entropy(self, dist_infos): 31 | return tf.reduce_mean(-dist_infos * tf.log(dist_infos + 1e-8)) -------------------------------------------------------------------------------- /baseline/baseline_lstsq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | class Baseline(object): 3 | def __init__(self, reg_coeff=1e-5): 4 | self._coeffs = None 5 | self._reg_coeff = reg_coeff 6 | 7 | def get_param_values(self, **tags): 8 | return self._coeffs 9 | 10 | def set_param_values(self, val, **tags): 11 | self._coeffs = val 12 | 13 | def _features(self, path): 14 | o = path["observations"].astype('float32') 15 | o = o.reshape(o.shape[0], -1) 16 | l = len(path["rewards"]) 17 | al = np.arange(l).reshape(-1 , 1) / 100.0 18 | return np.concatenate([o, o ** 2, al, al ** 2, np.ones((l, 1))], axis=1) 19 | 20 | def fit(self, paths): 21 | featmat = np.concatenate([self._features(path) for path in paths]) 22 | returns = np.concatenate([path["returns"] for path in paths]) 23 | self._coeffs = np.linalg.lstsq( 24 | featmat.T.dot(featmat) + self._reg_coeff * np.identity(featmat.shape[1]), 25 | featmat.T.dot(returns) 26 | )[0] 27 | 28 | def predict(self, path): 29 | if self._coeffs is None: 30 | return np.zeros(len(path["rewards"])) 31 | return self._features(path).dot(self._coeffs) 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # recently the algorithm has been moved to https://github.com/jjkke88/RL_toolbox 2 | 3 | # trpo 4 | trust region policy optimitztion base on gym and tensorflow 
5 | 6 |

There are three versions of TRPO: one for discrete action spaces such as MountainCar, one for discrete action-space tasks that take images as input such as Atari games, and one for continuous action spaces such as Pendulum.
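Each version has its own entry script under `experiment/`. As a minimal sketch (it simply mirrors `experiment/main.py` from this repository; every hyperparameter, including `environment_name` and `train_flag`, comes from the flags defined in `parameters.py`), the continuous-action version is wired up like this:

```python
# Sketch of the continuous-action entry point (see experiment/main.py).
# For the discrete and image variants, import TRPOAgent from
# agent.agent_discrete or agent.agent_continous_image instead
# (see experiment/main_discrete.py and experiment/main_image.py).
import os
import gym

from environment import Environment
from agent.agent_continous import TRPOAgent
from parameters import pms

# the agents and the logger expect these directories to exist
for d in ("checkpoint", "log"):
    if not os.path.isdir(d):
        os.makedirs(d)

env = Environment(gym.make(pms.environment_name))   # default: Pendulum-v0
agent = TRPOAgent(env)

if pms.train_flag:
    agent.learn()                     # train until interrupted
else:
    agent.test(pms.checkpoint_file)   # roll out a saved checkpoint
```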

7 |

The environment is based on OpenAI Gym.
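As a small illustration (a sketch based on `environment.py`, which appears further down in this listing), the `Environment` wrapper converts the gym spaces, clips observations to the space bounds, and can optionally return resized grayscale frames for the image-based agents:

```python
# Sketch: using the project's Environment wrapper around a gym task.
import gym
from environment import Environment

# type="origin" returns the raw (clipped) observation vector;
# type="gray_image" instead returns a grayscale frame resized to pms.dims,
# which the image-based agents rely on.
env = Environment(gym.make("Pendulum-v0"), type="origin")

ob = env.reset()
for _ in range(5):
    action = env.action_space.sample()         # Box converted from the gym space
    ob, reward, done, info = env.step(action)
    if done:
        ob = env.reset()
```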

8 |

Part of the code is adapted from rllab.
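For readers cross-checking against the agent code below, the quantities built in `init_network` (`surr`, `kl`, the policy gradient `pg`, and the Fisher-vector product `fvp`) correspond to the standard TRPO step, sketched here for reference; the trust-region size is the `max_kl` flag, and the constrained step is solved with conjugate gradient, controlled by `cg_iters` and `cg_damping`:

```latex
% TRPO update as implemented by the agents in this repository (a reference
% sketch; notation follows the TRPO paper rather than code identifiers):
\max_{\theta}\;
  \mathbb{E}\!\left[
    \frac{\pi_\theta(a \mid s)}{\pi_{\theta_{\mathrm{old}}}(a \mid s)}\, A(s,a)
  \right]
\qquad \text{s.t.} \qquad
  \mathbb{E}\!\left[
    D_{\mathrm{KL}}\!\left(
      \pi_{\theta_{\mathrm{old}}}(\cdot \mid s)\,\middle\|\,\pi_\theta(\cdot \mid s)
    \right)
  \right] \le \delta
```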

9 | 10 | # dependency 11 | 16 | 17 | # constructure for code 18 | 31 | 32 | # recent work 33 | 38 | 39 | # future work 40 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /baseline/baseline_tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import prettytensor as pt 4 | 5 | class Baseline(object): 6 | coeffs = None 7 | 8 | def __init__(self , session=None): 9 | self.net = None 10 | self.session = session 11 | 12 | def create_net(self , shape): 13 | print(shape) 14 | self.x = tf.placeholder(tf.float32 , shape=[None , shape] , name="x") 15 | self.y = tf.placeholder(tf.float32 , shape=[None] , name="y") 16 | self.net = (pt.wrap(self.x). 17 | fully_connected(64 , activation_fn=tf.nn.tanh). 18 | fully_connected(1)) 19 | self.net = tf.reshape(self.net , (-1 ,)) 20 | self.l2 = (self.net - self.y) * (self.net - self.y) 21 | self.train = tf.train.AdamOptimizer().minimize(self.l2) 22 | self.session.run(tf.initialize_all_variables()) 23 | 24 | def _features(self, path): 25 | o = path["observations"].astype('float32') 26 | o = o.reshape(o.shape[0] , -1) 27 | l = len(path["rewards"]) 28 | al = np.arange(l).reshape(-1 , 1) / 100.0 29 | return np.concatenate([o , o ** 2 , al , al ** 2 , np.ones((l , 1))] , axis=1) 30 | 31 | def fit(self, paths): 32 | featmat = np.concatenate([self._features(path) for path in paths]) 33 | if self.net is None: 34 | self.create_net(featmat.shape[1]) 35 | returns = np.concatenate([path["returns"] for path in paths]) 36 | for _ in range(10): 37 | loss, _ = self.session.run([self.l2, self.train], {self.x: featmat , self.y: returns}) 38 | 39 | def predict(self, path): 40 | if self.net is None: 41 | return np.zeros(len(path["rewards"])) 42 | else: 43 | ret = self.session.run(self.net , {self.x: self._features(path)}) 44 | return np.reshape(ret , (ret.shape[0] ,)) 45 | -------------------------------------------------------------------------------- /baseline/baseline_tf_image.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import prettytensor as pt 4 | from parameters import pms 5 | 6 | class BaselineTfImage(object): 7 | coeffs = None 8 | 9 | def __init__(self, session): 10 | self.net = None 11 | self.session = session 12 | 13 | def create_net(self, shape): 14 | self.x = tf.placeholder(tf.float32, shape=[None, shape[1], shape[2], shape[3]], name="x") 15 | self.y = tf.placeholder(tf.float32, shape=[None], name="y") 16 | self.net = (pt.wrap(self.x). 17 | conv2d(1, 16, stride=2, batch_normalize=True). 18 | conv2d(1, 16, stride=2, batch_normalize=True). 19 | flatten(). 20 | fully_connected(32, activation_fn=tf.nn.relu). 21 | fully_connected(32, activation_fn=tf.nn.relu). 
22 | fully_connected(1)) 23 | self.net = tf.reshape(self.net, (-1, )) 24 | l2 = (self.net - self.y) * (self.net - self.y) 25 | self.train = tf.train.AdamOptimizer().minimize(l2) 26 | self.session.run(tf.initialize_all_variables()) 27 | 28 | def _features(self, path): 29 | ret = path["observations"].astype('float32') 30 | return ret 31 | 32 | def fit(self, paths): 33 | featmat = np.concatenate([self._features(path) for path in paths]) 34 | if self.net is None: 35 | self.create_net(featmat.shape) 36 | returns = np.concatenate([path["returns"] for path in paths]) 37 | for _ in range(100): 38 | self.session.run(self.train, {self.x: featmat, self.y: returns}) 39 | 40 | def predict(self, path): 41 | if self.net is None: 42 | return np.zeros(len(path["rewards"])) 43 | else: 44 | ret = self.session.run(self.net, {self.x: self._features(path)}) 45 | return np.reshape(ret, (ret.shape[0], )) -------------------------------------------------------------------------------- /network/network_continous.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | import prettytensor as pt 5 | from parameters import pms 6 | 7 | seed = 1 8 | np.random.seed(seed) 9 | tf.set_random_seed(seed) 10 | 11 | class NetworkContinous(object): 12 | def __init__(self, scope): 13 | with tf.variable_scope("%s_shared" % scope): 14 | self.obs = obs = tf.placeholder( 15 | tf.float32, shape=[None, pms.obs_shape], name="%s_obs"%scope) 16 | self.action_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], name="%s_action"%scope) 17 | self.advant = tf.placeholder(tf.float32, shape=[None], name="%s_advant"%scope) 18 | self.old_dist_means_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], 19 | name="%s_oldaction_dist_means"%scope) 20 | self.old_dist_logstds_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], 21 | name="%s_oldaction_dist_logstds"%scope) 22 | self.action_dist_means_n = (pt.wrap(self.obs). 23 | fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0), 24 | name="%s_fc1"%scope). 25 | fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0), 26 | name="%s_fc2"%scope). 27 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0), 28 | name="%s_fc3"%scope)) 29 | 30 | self.N = tf.shape(obs)[0] 31 | Nf = tf.cast(self.N, tf.float32) 32 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), name="%spolicy_logstd"%scope) 33 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param, 34 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1))) 35 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)] 36 | 37 | def get_action_dist_means_n(self, session, obs): 38 | return session.run(self.action_dist_means_n, 39 | {self.obs: obs}) 40 | 41 | -------------------------------------------------------------------------------- /environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | `SpaceConversionEnv` acts as a wrapper on 3 | any environment. It allows to convert some action spaces, and observation spaces to others. 
4 | """ 5 | 6 | import numpy as np 7 | from gym.spaces import Discrete, Box, Tuple 8 | from gym import Env 9 | import cv2 10 | from parameters import pms 11 | import gym 12 | from gym.monitoring import monitor 13 | 14 | def convert_gym_space(space): 15 | if isinstance(space, gym.spaces.Box): 16 | return Box(low=space.low, high=space.high) 17 | elif isinstance(space, gym.spaces.Discrete): 18 | return Discrete(n=space.n) 19 | else: 20 | raise NotImplementedError 21 | 22 | class CappedCubicVideoSchedule(object): 23 | def __call__(self, count): 24 | return monitor.capped_cubic_video_schedule(count) 25 | 26 | class NoVideoSchedule(object): 27 | def __call__(self , count): 28 | return False 29 | 30 | class Environment(Env): 31 | 32 | def __init__(self, env, type="origin"): 33 | self.env = env 34 | self.type = type 35 | self.video_schedule = None 36 | if not pms.record_movie: 37 | self.video_schedule = NoVideoSchedule() 38 | else: 39 | if self.video_schedule is not None: 40 | self.video_schedule = CappedCubicVideoSchedule() 41 | self.env.monitor.start("log/trpo" ,self.video_schedule, force=True) 42 | self.monitoring = True 43 | 44 | def step(self, action, **kwargs): 45 | self._observation, reward, done, info = self.env.step(action) 46 | self._observation = np.clip(self._observation, self.env.observation_space.low, self.env.observation_space.high) 47 | return self.observation, reward, done, info 48 | 49 | def reset(self, **kwargs): 50 | self._observation = self.env.reset() 51 | return self.observation 52 | 53 | def render(self, mode="human", close=False): 54 | return self.env.render(mode) 55 | 56 | @property 57 | def observation(self): 58 | if self.type == "origin": 59 | return self._observation 60 | elif self.type == "gray_image": 61 | return cv2.resize(cv2.cvtColor(self._observation, cv2.COLOR_RGB2GRAY)/255., pms.dims) 62 | 63 | @property 64 | def action_space(self): 65 | return convert_gym_space(self.env.action_space) 66 | 67 | 68 | @property 69 | def observation_space(self): 70 | if self.type == "origin": 71 | return convert_gym_space(self.env.observation_space) 72 | else: 73 | return pms.dims 74 | 75 | # @property 76 | # def obs_dims(self): 77 | # if self.type == "origin": 78 | # return self.env.observation_space.shape 79 | # else: 80 | # return pms.dims -------------------------------------------------------------------------------- /network/network_continous_image.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | import prettytensor as pt 5 | from parameters import pms 6 | 7 | seed = 1 8 | np.random.seed(seed) 9 | tf.set_random_seed(seed) 10 | 11 | class NetworkContinousImage(object): 12 | def __init__(self, scope): 13 | with tf.variable_scope("%s_shared" % scope): 14 | self.obs = obs = tf.placeholder( 15 | dtype, shape=[None, pms.obs_height, pms.obs_width, pms.obs_channel], name="%s_obs"%scope) 16 | self.action_n = tf.placeholder(dtype, shape=[None, pms.action_shape], name="%s_action"%scope) 17 | self.advant = tf.placeholder(dtype, shape=[None], name="%s_advant"%scope) 18 | self.old_dist_means_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 19 | name="%s_oldaction_dist_means"%scope) 20 | self.old_dist_logstds_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 21 | name="%s_oldaction_dist_logstds"%scope) 22 | self.action_dist_means_n = (pt.wrap(self.obs). 23 | conv2d(8 , 32 , stride=4 , batch_normalize=True). 
24 | conv2d(4 , 64 , stride=2 , batch_normalize=True). 25 | conv2d(3 , 64 , stride=1 , batch_normalize=True). 26 | flatten(). 27 | fully_connected(128, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 28 | name="%s_fc1"%scope). 29 | fully_connected(128, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 30 | name="%s_fc2"%scope). 31 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), 32 | name="%s_fc3"%scope)) 33 | 34 | self.N = tf.shape(obs)[0] 35 | Nf = tf.cast(self.N, dtype) 36 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), name="%spolicy_logstd"%scope) 37 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param, 38 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1))) 39 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)] 40 | 41 | def get_action_dist_means_n(self, session, obs): 42 | return session.run(self.action_dist_means_n, 43 | {self.obs: obs}) 44 | 45 | -------------------------------------------------------------------------------- /parameters.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | flags = tf.app.flags 4 | flags.DEFINE_integer('obs_height', 100, 'image height') 5 | flags.DEFINE_integer('obs_width', 100, 'image width') 6 | flags.DEFINE_integer('obs_channel', 3, 'image channel') 7 | flags.DEFINE_integer('history_number', 2, 'image history number') 8 | flags.DEFINE_integer('jobs', 4, 'thread or process number') 9 | flags.DEFINE_integer('max_iter_number', 400, 'control the max iteration number for trainning') 10 | flags.DEFINE_integer('paths_number', 10, 'number of paths in each rollout') 11 | flags.DEFINE_integer('max_path_length',200, 'timesteps in each path') 12 | flags.DEFINE_integer('batch_size', 100, 'batch size for trainning') 13 | flags.DEFINE_float('max_kl', 0.01, 'the largest kl distance, \sigma in paper') 14 | flags.DEFINE_float('gae_lambda', 1.0, 'fix number') 15 | flags.DEFINE_float('subsample_factor', 0.5, 'ratio of the samples used in training process') 16 | flags.DEFINE_float('cg_damping', 0.001, 'conjugate gradient damping') 17 | flags.DEFINE_float('discount', 0.99, 'discount') 18 | flags.DEFINE_integer('cg_iters', 20, 'iteration number in conjugate gradient') 19 | flags.DEFINE_float('deviation', 0.1, 'fixed') 20 | flags.DEFINE_boolean('render', False, 'whether to render image') 21 | flags.DEFINE_boolean('train_flag', True, 'true for train and False for test') 22 | flags.DEFINE_integer('iter_num_per_train', 1, 'iteration number in each trainning process') 23 | flags.DEFINE_string('checkpoint_file', '', 'checkpoint file path, if empty then will load the latest one') 24 | flags.DEFINE_integer('save_model_times', 1, 'iteration number to save model, if 1, then model would be saved in each iteration') 25 | flags.DEFINE_boolean('record_movie', False, 'whether record the video in gym') 26 | flags.DEFINE_boolean('upload_to_gym', False, 'whether upload the result to gym') 27 | flags.DEFINE_string('checkpoint_dir', 'checkpoint/', 'checkpoint save and load path, for parallel, it should be checkpoint_parallel') 28 | flags.DEFINE_string('environment_name', 'Pendulum-v0', 'environment name') 29 | flags.DEFINE_float('min_std', 0.2, 'the smallest std') 30 | flags.DEFINE_boolean('center_adv', True, 'whether center advantage, fixed') 31 | flags.DEFINE_boolean('positive_adv', False, 'whether positive advantage, fixed') 
32 | flags.DEFINE_boolean('use_std_network', False, 'whether use network to train std, it is not supported, fixed') 33 | flags.DEFINE_float('std', 1.1, 'if the std is set to constant, then this value will be used') 34 | flags.DEFINE_integer('obs_shape', 3, 'dimensions of observation') 35 | flags.DEFINE_integer('action_shape', 1, 'dimensions of action') 36 | flags.DEFINE_float('min_a', -2.0, 'the smallest action value') 37 | flags.DEFINE_float('max_a', 2.0, 'the largest action value') 38 | flags.DEFINE_string("decay_method", "adaptive", "decay_method:adaptive, linear, exponential") # adaptive, linear, exponential 39 | flags.DEFINE_integer("timestep_adapt", 600, "timestep to adapt kl") 40 | flags.DEFINE_float("kl_adapt", 0.0005, "kl adapt rate") 41 | pms = flags.FLAGS 42 | pms.checkpoint_file = None 43 | pms.batch_size = int(pms.subsample_factor * pms.paths_number * pms.max_path_length) -------------------------------------------------------------------------------- /experiment/main_tf_parallel.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import tensorflow as tf 3 | from agent.agent_parallel import TRPOAgentParallel 4 | from parameters import pms 5 | import gym 6 | import numpy as np 7 | from environment import Environment 8 | 9 | # Flags for defining the tf.train.ClusterSpec 10 | tf.app.flags.DEFINE_string("ps_hosts", "166.111.138.113:2223", 11 | "Comma-separated list of hostname:port pairs") 12 | tf.app.flags.DEFINE_string("worker_hosts", "166.111.138.137:2226,166.111.138.137:2227,166.111.138.137:2228", 13 | "Comma-separated list of hostname:port pairs") 14 | 15 | # Flags for defining the tf.train.Server 16 | tf.app.flags.DEFINE_string("job_name", "worker", "ps or worker") 17 | tf.app.flags.DEFINE_integer("task_index",2, "Index of task within the job") 18 | 19 | FLAGS = tf.app.flags.FLAGS 20 | 21 | seed = 1 22 | np.random.seed(seed) 23 | tf.set_random_seed(seed) 24 | 25 | def main(_): 26 | ps_hosts = FLAGS.ps_hosts.split(',') 27 | worker_hosts = FLAGS.worker_hosts.split(',') 28 | 29 | # Create a cluster from the parameter server and worker hosts. 30 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) 31 | 32 | # Create and start a server for the local task. 33 | # 创建并启动服务 34 | # 其参数中使用task_index 指定任务的编号 35 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0) 36 | server = tf.train.Server(cluster, 37 | job_name=FLAGS.job_name, 38 | task_index=FLAGS.task_index, 39 | config=tf.ConfigProto(gpu_options=gpu_options)) 40 | 41 | if FLAGS.job_name == "ps": 42 | server.join() 43 | elif FLAGS.job_name == "worker": 44 | # 将op 挂载到各个本地的worker上 45 | env = Environment(gym.make(pms.environment_name)) 46 | with tf.device(tf.train.replica_device_setter( 47 | worker_device="/job:worker/task:%d" % (FLAGS.task_index), 48 | cluster=cluster)): 49 | agent = TRPOAgentParallel(env) 50 | saver = tf.train.Saver(max_to_keep=10) 51 | init_op = tf.initialize_all_variables() 52 | summary_op = tf.merge_all_summaries() 53 | # Create a "supervisor", which oversees the training process. 54 | sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), 55 | logdir="./checkpoint_parallel", 56 | init_op=init_op, 57 | global_step=agent.global_step, 58 | saver=saver, 59 | summary_op=None, 60 | save_model_secs=60) 61 | 62 | # The supervisor takes care of session initialization, restoring from 63 | # a checkpoint, and closing when done or an error occurs. 
64 | with sv.managed_session(server.target) as sess: 65 | agent.session = sess 66 | agent.gf.session = sess 67 | agent.sff.session =sess 68 | agent.supervisor = sv 69 | 70 | if pms.train_flag: 71 | agent.learn() 72 | elif FLAGS.task_index == 0: 73 | agent.test(pms.checkpoint_file) 74 | # Ask for all the services to stop. 75 | sv.stop() 76 | 77 | if __name__ == "__main__": 78 | tf.app.run() -------------------------------------------------------------------------------- /experiment/main_multi_thread.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import tempfile 4 | import sys 5 | from utils import * 6 | import numpy as np 7 | import tensorflow as tf 8 | import signal 9 | from parameters import pms 10 | from logger.logger import Logger 11 | from agent.agent_cotinous_single_thread import TRPOAgentContinousSingleThread 12 | from network.network_continous import NetworkContinous 13 | 14 | seed = 1 15 | np.random.seed(seed) 16 | tf.set_random_seed(seed) 17 | 18 | training_dir = tempfile.mkdtemp() 19 | logging.getLogger().setLevel(logging.DEBUG) 20 | 21 | 22 | class MasterContinous(object): 23 | def __init__(self): 24 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0) 25 | self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 26 | 27 | self.network = NetworkContinous("master") 28 | self.gf = GetFlat(self.network.var_list) # get theta from var_list 29 | self.gf.session = self.session 30 | self.sff = SetFromFlat(self.network.var_list) # set theta from var_List 31 | self.sff.session = self.session 32 | self.session.run(tf.initialize_all_variables()) 33 | self.saver = tf.train.Saver(max_to_keep=10) 34 | 35 | self.init_jobs() 36 | if pms.train_flag: 37 | self.init_logger() 38 | 39 | def init_jobs(self): 40 | self.jobs = [] 41 | for thread_id in xrange(pms.jobs): 42 | job = TRPOAgentContinousSingleThread(thread_id, self) 43 | self.jobs.append(job) 44 | 45 | def init_logger(self): 46 | head = ["average_episode_std", "sum steps episode number" "total number of episodes", 47 | "Average sum of rewards per episode", 48 | "KL between old and new distribution", "Surrogate loss", "Surrogate loss prev", "ds", "entropy", 49 | "mean_advant"] 50 | self.logger = Logger(head) 51 | 52 | def get_parameters(self): 53 | return self.gf() 54 | 55 | def apply_gradient(self, gradient): 56 | theta_prev = self.gf() 57 | theta_after = theta_prev + gradient 58 | self.sff(theta_after) 59 | 60 | def train(self): 61 | signal.signal(signal.SIGINT, signal_handler) 62 | for job in self.jobs: 63 | job.start() 64 | for job in self.jobs: 65 | job.join() 66 | 67 | def test(self): 68 | self.load_model(pms.checkpoint_file) 69 | self.jobs[0].test() 70 | 71 | def save_model(self, model_name): 72 | self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt") 73 | 74 | def load_model(self , model_name): 75 | try: 76 | if model_name is not None: 77 | self.saver.restore(self.session , model_name) 78 | else: 79 | self.saver.restore(self.session , tf.train.latest_checkpoint("checkpoint/")) 80 | except: 81 | print "load model %s fail" % (model_name) 82 | 83 | def signal_handler(): 84 | sys.exit(0) 85 | 86 | 87 | if not os.path.isdir("./checkpoint"): 88 | os.makedirs("./checkpoint") 89 | if not os.path.isdir("./log"): 90 | os.makedirs("./log") 91 | master = MasterContinous() 92 | if pms.train_flag: 93 | master.train() 94 | else: 95 | master.test() 96 | # env.monitor.close() 97 | # gym.upload(training_dir, 98 | # 
algorithm_id='trpo_ff') 99 | -------------------------------------------------------------------------------- /storage/storage.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from parameters import pms 3 | 4 | 5 | class Storage(object): 6 | def __init__(self, agent, env, baseline): 7 | self.paths = [] 8 | self.env = env 9 | self.agent = agent 10 | self.baseline = baseline 11 | 12 | def get_single_path(self): 13 | self.obs, actions, rewards, action_dists = [], [], [], [] 14 | ob = self.env.reset() 15 | episode_steps = 0 16 | for _ in xrange(pms.max_path_length): 17 | action, action_dist, ob = self.agent.act(ob) 18 | self.obs.append(ob) 19 | actions.append(action) 20 | action_dists.append(action_dist) 21 | res = self.env.step(action) # res 22 | if pms.render: 23 | self.env.render() 24 | ob = res[0] 25 | rewards.append([res[1]]) 26 | episode_steps += 1 27 | if res[2]: 28 | break 29 | path = dict( 30 | observations=np.concatenate(np.expand_dims(self.obs, 0)), 31 | agent_infos=np.concatenate(action_dists), 32 | rewards=np.array(rewards), 33 | actions=np.array(actions), 34 | episode_steps=episode_steps 35 | ) 36 | self.paths.append(path) 37 | 38 | def get_paths(self): 39 | paths = self.paths 40 | self.paths = [] 41 | return paths 42 | 43 | def process_paths(self, paths): 44 | sum_episode_steps = 0 45 | for path in paths: 46 | sum_episode_steps += path['episode_steps'] 47 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline 48 | # path_baselines = np.append(self.baseline.predict(path) , 0) 49 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 50 | # path["advantages"] = np.concatenate(path["rewards"]) + \ 51 | # pms.discount * path_baselines[1:] - \ 52 | # path_baselines[:-1] 53 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 54 | path_baselines = np.append(self.baseline.predict(path) , 0) 55 | deltas = np.concatenate(path["rewards"]) + \ 56 | pms.discount * path_baselines[1:] - \ 57 | path_baselines[:-1] 58 | path["advantages"] = discount( 59 | deltas , pms.discount * pms.gae_lambda) 60 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 61 | 62 | # Updating policy. 
63 | action_dist_n = np.concatenate([path["agent_infos"] for path in paths]) 64 | obs_n = np.concatenate([path["observations"] for path in paths]) 65 | action_n = np.concatenate([path["actions"] for path in paths]) 66 | rewards = np.concatenate([path["rewards"] for path in paths]) 67 | advantages = np.concatenate([path["advantages"] for path in paths]) 68 | 69 | if pms.center_adv: 70 | advantages = (advantages - np.mean(advantages)) / (advantages.std() + 1e-8) 71 | 72 | self.baseline.fit(paths) 73 | 74 | samples_data = dict( 75 | observations=obs_n, 76 | actions=action_n, 77 | rewards=rewards, 78 | advantages=advantages, 79 | agent_infos=action_dist_n, 80 | paths=paths, 81 | sum_episode_steps=sum_episode_steps 82 | ) 83 | return samples_data 84 | -------------------------------------------------------------------------------- /distribution/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class DiagonalGaussian(object): 6 | def __init__(self, dim): 7 | self._dim = dim 8 | 9 | @property 10 | def dim(self): 11 | return self._dim 12 | 13 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 14 | old_means = old_dist_info_vars["mean"] 15 | old_log_stds = old_dist_info_vars["log_std"] 16 | new_means = new_dist_info_vars["mean"] 17 | new_log_stds = new_dist_info_vars["log_std"] 18 | """ 19 | Compute the KL divergence of two multivariate Gaussian distribution with 20 | diagonal covariance matrices 21 | """ 22 | old_std = tf.exp(old_log_stds) 23 | new_std = tf.exp(new_log_stds) 24 | # means: (N*A) 25 | # std: (N*A) 26 | # formula: 27 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 28 | # ln(\sigma_2/\sigma_1) 29 | numerator = tf.square(old_means - new_means) + \ 30 | tf.square(old_std) - tf.square(new_std) 31 | denominator = 2 * tf.square(new_std) + 1e-8 32 | return tf.reduce_sum( 33 | numerator / denominator + new_log_stds - old_log_stds, -1) 34 | 35 | def likelihood_ratio_sym(self, x_var, new_dist_info_vars, old_dist_info_vars): 36 | """ 37 | \frac{\pi_\theta}{\pi_{old}} 38 | :param x_var: actions 39 | :param new_dist_info_vars: means + logstds 40 | :param old_dist_info_vars: old_means + old_logstds 41 | :return: 42 | """ 43 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) 44 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) 45 | return tf.exp(logli_new - logli_old) 46 | 47 | def log_likelihood_sym(self, x_var, dist_info_vars): 48 | """ 49 | \frac{1}{(2\pi)^{\frac{n}{2}}\sigma_\theta}exp(-(\frac{a-\mu_{\pi_\theta}}{2\sigma_\theta})^2) 50 | :param x_var: 51 | :param dist_info_vars: 52 | :return: 53 | """ 54 | means = dist_info_vars["mean"] 55 | log_stds = dist_info_vars["log_std"] 56 | zs = (x_var - means) / tf.exp(log_stds) 57 | return - tf.reduce_sum(log_stds, -1) - \ 58 | 0.5 * tf.reduce_sum(tf.square(zs), -1) - \ 59 | 0.5 *means.get_shape()[-1].value * np.log(2 * np.pi) 60 | 61 | def kl_sym_firstfixed(self, old_dist_info_vars): 62 | mu = old_dist_info_vars["mean"] 63 | logstd = old_dist_info_vars["log_std"] 64 | mu1 , logstd1 = map(tf.stop_gradient , [mu , logstd]) 65 | mu2 , logstd2 = mu , logstd 66 | 67 | return self.kl_sym(dict(mean=mu1, log_std=logstd1), dict(mean=mu2, log_std=logstd2)) 68 | 69 | def sample(self, dist_info): 70 | means = dist_info["mean"] 71 | log_stds = dist_info["log_std"] 72 | rnd = np.random.normal(size=means.shape) 73 | return rnd * np.exp(log_stds) + means 74 | 75 | def log_likelihood(self, xs, 
dist_info): 76 | means = dist_info["mean"] 77 | log_stds = dist_info["log_std"] 78 | zs = (xs - means) / np.exp(log_stds) 79 | return - np.sum(log_stds, axis=-1) - \ 80 | 0.5 * np.sum(np.square(zs), axis=-1) - \ 81 | 0.5 * means.shape[-1] * np.log(2 * np.pi) 82 | 83 | def entropy(self, dist_info): 84 | log_stds = dist_info["log_std"] 85 | return tf.reduce_sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e))) 86 | 87 | @property 88 | def dist_info_keys(self): 89 | return ["mean", "log_std"] 90 | -------------------------------------------------------------------------------- /agent/agent_parallel.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | 8 | seed = 1 9 | np.random.seed(seed) 10 | tf.set_random_seed(seed) 11 | class TRPOAgentParallel(TRPOAgentBase): 12 | 13 | def __init__(self, env): 14 | super(TRPOAgentParallel, self).__init__(env) 15 | self.init_network() 16 | # self.saver = tf.train.Saver(max_to_keep=10) 17 | 18 | def init_network(self): 19 | """ 20 | [input] 21 | self.obs 22 | self.action_n 23 | self.advant 24 | self.old_dist_means_n 25 | self.old_dist_logstds_n 26 | [output] 27 | self.action_dist_means_n 28 | self.action_dist_logstds_n 29 | var_list 30 | """ 31 | self.net = NetworkContinous("network_continous") 32 | self.global_step = tf.Variable(0 , trainable=False) 33 | self.step_op = tf.assign_add(self.global_step , 1 , use_locking=True) 34 | if pms.min_std is not None: 35 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 36 | self.action_dist_stds_n = tf.exp(log_std_var) 37 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 38 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 39 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 40 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 41 | self.old_dist_info_vars) 42 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 43 | batch_size = tf.shape(self.net.obs)[0] 44 | batch_size_float = tf.cast(batch_size , tf.float32) 45 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 46 | ent = self.distribution.entropy(self.old_dist_info_vars) 47 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 48 | self.losses = [surr, kl, ent] 49 | var_list = self.net.var_list 50 | self.gf = GetFlat(var_list) # get theta from var_list 51 | self.gf.session = self.session 52 | self.sff = SetFromFlat(var_list) # set theta from var_List 53 | self.sff.session = self.session 54 | # get g 55 | self.pg = flatgrad(surr, var_list) 56 | # get A 57 | # KL divergence where first arg is fixed 58 | # replace old->tf.stop_gradient from previous kl 59 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 60 | grads = tf.gradients(kl_firstfixed, var_list) 61 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 62 | shapes = map(var_shape, var_list) 63 | start = 0 64 | tangents = [] 65 | for shape in shapes: 66 | size = np.prod(shape) 67 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 68 | tangents.append(param) 69 | start += size 70 | self.gvp = 
[tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 71 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 72 | # self.saver = tf.train.Saver(max_to_keep=10) 73 | # self.load_model(pms.checkpoint_file) 74 | 75 | def learn(self): 76 | iter_num = 0 77 | while True: 78 | print "\n********** Iteration %i ************" % iter_num 79 | print self.gf().mean() 80 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 81 | self.sff(theta) 82 | for k , v in stats.iteritems(): 83 | print(k + ": " + " " * (40 - len(k)) + str(v)) 84 | # if iter_num % pms.save_model_times == 0: 85 | # self.save_model(pms.environment_name + "-" + str(iter_num)) 86 | self.session.run(self.step_op) 87 | iter_num += 1 88 | -------------------------------------------------------------------------------- /experiment/main_image_multi_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | if not os.path.isdir("./checkpoint"): 3 | os.makedirs("./checkpoint") 4 | if not os.path.isdir("./log"): 5 | os.makedirs("./log") 6 | 7 | 8 | import gym 9 | import multiprocessing 10 | import time 11 | from agent.agent_continous_image_parallel_image import TRPOAgentParallelImage 12 | from parameters import pms 13 | from storage.storage_continous_parallel_image import ParallelStorageImage 14 | 15 | args = pms 16 | args.max_pathlength = gym.spec(args.environment_name).timestep_limit 17 | 18 | learner_tasks = multiprocessing.JoinableQueue() 19 | learner_results = multiprocessing.Queue() 20 | learner_env = gym.make(args.environment_name) 21 | 22 | learner = TRPOAgentParallelImage(learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results) 23 | learner.start() 24 | rollouts = ParallelStorageImage() 25 | 26 | learner_tasks.put(1) 27 | learner_tasks.join() 28 | starting_weights = learner_results.get() 29 | rollouts.set_policy_weights(starting_weights) 30 | 31 | start_time = time.time() 32 | history = {} 33 | history["rollout_time"] = [] 34 | history["learn_time"] = [] 35 | history["mean_reward"] = [] 36 | history["timesteps"] = [] 37 | 38 | # start it off with a big negative number 39 | last_reward = -1000000 40 | recent_total_reward = 0 41 | 42 | if pms.train_flag is True: 43 | for iteration in xrange(args.max_iter_number): 44 | # runs a bunch of async processes that collect rollouts 45 | paths = rollouts.get_paths() 46 | # Why is the learner in an async process? 47 | # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread 48 | # and an async process creates another tf.Session, it will freeze up. 49 | # To solve this, we just make the learner's tf.Session in its own async process, 50 | # and wait until the learner's done before continuing the main thread. 
51 | learn_start = time.time() 52 | if iteration%20 == 0: 53 | learner_tasks.put((2 , args.max_kl, 1, iteration)) 54 | else: 55 | learner_tasks.put((2, args.max_kl, 0, iteration)) 56 | learner_tasks.put(paths) 57 | learner_tasks.join() 58 | stats , theta , thprev = learner_results.get() 59 | learn_time = (time.time() - learn_start) / 60.0 60 | print 61 | print "-------- Iteration %d ----------" % iteration 62 | # print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0) 63 | # 64 | # history["rollout_time"].append(rollout_time) 65 | # history["learn_time"].append(learn_time) 66 | # history["mean_reward"].append(mean_reward) 67 | # history["timesteps"].append(args.timesteps_per_batch) 68 | for k , v in stats.iteritems(): 69 | print(k + ": " + " " * (40 - len(k)) + str(v)) 70 | recent_total_reward += stats["Average sum of rewards per episode"] 71 | 72 | if args.decay_method == "adaptive": 73 | if iteration % 10 == 0: 74 | if recent_total_reward < last_reward: 75 | print "Policy is not improving. Decrease KL and increase steps." 76 | if args.max_kl > 0.001: 77 | args.max_kl -= args.kl_adapt 78 | else: 79 | print "Policy is improving. Increase KL and decrease steps." 80 | if args.max_kl < 0.01: 81 | args.max_kl += args.kl_adapt 82 | last_reward = recent_total_reward 83 | recent_total_reward = 0 84 | 85 | if args.decay_method == "linear": 86 | if args.max_kl > 0.001: 87 | args.max_kl -= args.kl_adapt 88 | 89 | if args.decay_method == "exponential": 90 | if args.max_kl > 0.001: 91 | args.max_kl *= args.kl_adapt 92 | rollouts.set_policy_weights(theta) 93 | else: 94 | from agent.agent_continous import TRPOAgent 95 | from environment import Environment 96 | env = Environment(gym.make(pms.environment_name)) 97 | agent = TRPOAgent(env) 98 | agent.test(pms.checkpoint_file) 99 | 100 | 101 | rollouts.end() 102 | -------------------------------------------------------------------------------- /agent/agent_continous_image.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous_image import NetworkContinousImage 5 | from baseline.baseline_tf_image import BaselineTfImage 6 | from storage.storage_image import Storage 7 | from parameters import pms 8 | from agent.agent_base import TRPOAgentBase 9 | 10 | seed = 1 11 | np.random.seed(seed) 12 | tf.set_random_seed(seed) 13 | 14 | """ 15 | class for continoust action space with image as input 16 | """ 17 | class TRPOAgent(TRPOAgentBase): 18 | 19 | def __init__(self, env): 20 | super(TRPOAgent, self).__init__(env) 21 | self.init_network() 22 | self.saver = tf.train.Saver(max_to_keep=10) 23 | self.baseline = BaselineTfImage(self.session) 24 | self.storage = Storage(self, env, self.baseline) 25 | 26 | def init_network(self): 27 | """ 28 | [input] 29 | self.obs 30 | self.action_n 31 | self.advant 32 | self.old_dist_means_n 33 | self.old_dist_logstds_n 34 | [output] 35 | self.action_dist_means_n 36 | self.action_dist_logstds_n 37 | var_list 38 | """ 39 | self.net = NetworkContinousImage("network_continous") 40 | if pms.min_std is not None: 41 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 42 | self.action_dist_stds_n = tf.exp(log_std_var) 43 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 44 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 45 | self.likehood_action_dist = 
self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 46 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 47 | self.old_dist_info_vars) 48 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 49 | batch_size = tf.shape(self.net.obs)[0] 50 | batch_size_float = tf.cast(batch_size , tf.float32) 51 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 52 | ent = self.distribution.entropy(self.old_dist_info_vars) 53 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 54 | self.losses = [surr, kl, ent] 55 | var_list = self.net.var_list 56 | self.gf = GetFlat(var_list) # get theta from var_list 57 | self.gf.session = self.session 58 | self.sff = SetFromFlat(var_list) # set theta from var_List 59 | self.sff.session = self.session 60 | # get g 61 | self.pg = flatgrad(surr, var_list) 62 | # get A 63 | # KL divergence where first arg is fixed 64 | # replace old->tf.stop_gradient from previous kl 65 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 66 | grads = tf.gradients(kl_firstfixed, var_list) 67 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 68 | shapes = map(var_shape, var_list) 69 | start = 0 70 | tangents = [] 71 | for shape in shapes: 72 | size = np.prod(shape) 73 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 74 | tangents.append(param) 75 | start += size 76 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 77 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 78 | self.session.run(tf.initialize_all_variables()) 79 | # self.saver = tf.train.Saver(max_to_keep=10) 80 | # self.load_model(pms.checkpoint_file) 81 | 82 | def learn(self): 83 | iter_num = 0 84 | while True: 85 | print "\n********** Iteration %i ************" % iter_num 86 | print self.gf().mean() 87 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 88 | self.sff(theta) 89 | for k , v in stats.iteritems(): 90 | print(k + ": " + " " * (40 - len(k)) + str(v)) 91 | if iter_num % pms.save_model_times == 0: 92 | self.save_model(pms.environment_name + "-" + str(iter_num)) 93 | iter_num += 1 94 | -------------------------------------------------------------------------------- /experiment/main_multi_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | if not os.path.isdir("./checkpoint"): 3 | os.makedirs("./checkpoint") 4 | if not os.path.isdir("./log"): 5 | os.makedirs("./log") 6 | 7 | 8 | import gym 9 | import multiprocessing 10 | import time 11 | from agent.agent_continous_parallel_storage import TRPOAgentParallel 12 | <<<<<<< HEAD 13 | import argparse 14 | import multiprocessing 15 | import time 16 | import json 17 | ======= 18 | >>>>>>> rnn 19 | from parameters import pms 20 | from storage.storage_continous_parallel import ParallelStorage 21 | 22 | args = pms 23 | args.max_pathlength = gym.spec(args.environment_name).timestep_limit 24 | 25 | learner_tasks = multiprocessing.JoinableQueue() 26 | learner_results = multiprocessing.Queue() 27 | learner_env = gym.make(args.environment_name) 28 | 29 | learner = TRPOAgentParallel(learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results) 30 | learner.start() 31 | rollouts = ParallelStorage() 32 | 33 | learner_tasks.put(1) 34 | learner_tasks.join() 35 | starting_weights = learner_results.get() 36 | 
rollouts.set_policy_weights(starting_weights) 37 | 38 | start_time = time.time() 39 | history = {} 40 | history["rollout_time"] = [] 41 | history["learn_time"] = [] 42 | history["mean_reward"] = [] 43 | history["timesteps"] = [] 44 | 45 | # start it off with a big negative number 46 | last_reward = -1000000 47 | recent_total_reward = 0 48 | 49 | if pms.train_flag is True: 50 | for iteration in xrange(args.max_iter_number): 51 | # runs a bunch of async processes that collect rollouts 52 | paths = rollouts.get_paths() 53 | # Why is the learner in an async process? 54 | # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread 55 | # and an async process creates another tf.Session, it will freeze up. 56 | # To solve this, we just make the learner's tf.Session in its own async process, 57 | # and wait until the learner's done before continuing the main thread. 58 | learn_start = time.time() 59 | if iteration%20 == 0: 60 | learner_tasks.put((2 , args.max_kl, 1, iteration)) 61 | else: 62 | learner_tasks.put((2, args.max_kl, 0, iteration)) 63 | learner_tasks.put(paths) 64 | learner_tasks.join() 65 | stats , theta , thprev = learner_results.get() 66 | learn_time = (time.time() - learn_start) / 60.0 67 | print 68 | print "-------- Iteration %d ----------" % iteration 69 | # print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0) 70 | # 71 | # history["rollout_time"].append(rollout_time) 72 | # history["learn_time"].append(learn_time) 73 | # history["mean_reward"].append(mean_reward) 74 | # history["timesteps"].append(args.timesteps_per_batch) 75 | for k , v in stats.iteritems(): 76 | print(k + ": " + " " * (40 - len(k)) + str(v)) 77 | recent_total_reward += stats["Average sum of rewards per episode"] 78 | 79 | if args.decay_method == "adaptive": 80 | if iteration % 10 == 0: 81 | if recent_total_reward < last_reward: 82 | print "Policy is not improving. Decrease KL and increase steps." 83 | if args.max_kl > 0.001: 84 | args.max_kl -= args.kl_adapt 85 | else: 86 | print "Policy is improving. Increase KL and decrease steps." 
87 | if args.max_kl < 0.01: 88 | args.max_kl += args.kl_adapt 89 | last_reward = recent_total_reward 90 | recent_total_reward = 0 91 | 92 | if args.decay_method == "linear": 93 | if args.max_kl > 0.001: 94 | args.max_kl -= args.kl_adapt 95 | 96 | if args.decay_method == "exponential": 97 | if args.max_kl > 0.001: 98 | args.max_kl *= args.kl_adapt 99 | rollouts.set_policy_weights(theta) 100 | else: 101 | from agent.agent_continous import TRPOAgent 102 | from environment import Environment 103 | env = Environment(gym.make(pms.environment_name)) 104 | agent = TRPOAgent(env) 105 | agent.test(pms.checkpoint_file) 106 | 107 | 108 | rollouts.end() 109 | -------------------------------------------------------------------------------- /agent/agent_continous.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | from logger.logger import Logger 8 | from storage.storage_continous_parallel import ParallelStorage 9 | 10 | seed = 1 11 | np.random.seed(seed) 12 | tf.set_random_seed(seed) 13 | 14 | """ 15 | class for continoust action space 16 | """ 17 | class TRPOAgent(TRPOAgentBase): 18 | def __init__(self, env): 19 | super(TRPOAgent, self).__init__(env) 20 | self.init_network() 21 | self.saver = tf.train.Saver(max_to_keep=10) 22 | 23 | def init_network(self): 24 | """ 25 | [input] 26 | self.obs 27 | self.action_n 28 | self.advant 29 | self.old_dist_means_n 30 | self.old_dist_logstds_n 31 | [output] 32 | self.action_dist_means_n 33 | self.action_dist_logstds_n 34 | var_list 35 | """ 36 | self.net = NetworkContinous("network_continous") 37 | if pms.min_std is not None: 38 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 39 | self.action_dist_stds_n = tf.exp(log_std_var) 40 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 41 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 42 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 43 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 44 | self.old_dist_info_vars) 45 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss 46 | batch_size = tf.shape(self.net.obs)[0] 47 | batch_size_float = tf.cast(batch_size , tf.float32) 48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 49 | ent = self.distribution.entropy(self.old_dist_info_vars) 50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 51 | self.losses = [surr, kl, ent] 52 | var_list = self.net.var_list 53 | self.gf = GetFlat(var_list) # get theta from var_list 54 | self.gf.session = self.session 55 | self.sff = SetFromFlat(var_list) # set theta from var_List 56 | self.sff.session = self.session 57 | # get g 58 | self.pg = flatgrad(surr, var_list) 59 | # get A 60 | # KL divergence where first arg is fixed 61 | # replace old->tf.stop_gradient from previous kl 62 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 63 | grads = tf.gradients(kl_firstfixed, var_list) 64 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 65 | shapes = map(var_shape, var_list) 66 | start = 0 67 | tangents = [] 68 | for shape 
in shapes: 69 | size = np.prod(shape) 70 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 71 | tangents.append(param) 72 | start += size 73 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 74 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 75 | self.session.run(tf.initialize_all_variables()) 76 | # self.saver = tf.train.Saver(max_to_keep=10) 77 | # self.load_model(pms.checkpoint_file) 78 | 79 | def init_logger(self): 80 | head = ["rewards", "std"] 81 | self.logger = Logger(head) 82 | 83 | def learn(self): 84 | self.init_logger() 85 | iter_num = 0 86 | while True: 87 | print "\n********** Iteration %i ************" % iter_num 88 | print self.gf().mean() 89 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 90 | self.sff(theta) 91 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 92 | for k , v in stats.iteritems(): 93 | print(k + ": " + " " * (40 - len(k)) + str(v)) 94 | if iter_num % pms.save_model_times == 0: 95 | self.save_model(pms.environment_name + "-" + str(iter_num)) 96 | iter_num += 1 97 | -------------------------------------------------------------------------------- /agent/agent_continous_rnn.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous_rnn import NetworkContinousLSTM 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | from logger.logger import Logger 8 | from storage.storage_continous_parallel import ParallelStorage 9 | 10 | seed = 1 11 | np.random.seed(seed) 12 | tf.set_random_seed(seed) 13 | 14 | """ 15 | class for continoust action space 16 | """ 17 | class TRPOAgent(TRPOAgentBase): 18 | def __init__(self, env): 19 | super(TRPOAgent, self).__init__(env) 20 | self.init_network() 21 | self.saver = tf.train.Saver(max_to_keep=10) 22 | 23 | def init_network(self): 24 | """ 25 | [input] 26 | self.obs 27 | self.action_n 28 | self.advant 29 | self.old_dist_means_n 30 | self.old_dist_logstds_n 31 | [output] 32 | self.action_dist_means_n 33 | self.action_dist_logstds_n 34 | var_list 35 | """ 36 | self.net = NetworkContinousLSTM("network_continous") 37 | if pms.min_std is not None: 38 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 39 | self.action_dist_stds_n = tf.exp(log_std_var) 40 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 41 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 42 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 43 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 44 | self.old_dist_info_vars) 45 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss 46 | batch_size = tf.shape(self.net.obs)[0] 47 | batch_size_float = tf.cast(batch_size , tf.float32) 48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 49 | ent = self.distribution.entropy(self.old_dist_info_vars) 50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 51 | self.losses = [surr, kl, ent] 52 | var_list = self.net.var_list 53 | self.gf = GetFlat(var_list) # get theta from var_list 54 | self.gf.session = self.session 55 | self.sff = 
SetFromFlat(var_list) # set theta from var_List 56 | self.sff.session = self.session 57 | # get g 58 | self.pg = flatgrad(surr, var_list) 59 | # get A 60 | # KL divergence where first arg is fixed 61 | # replace old->tf.stop_gradient from previous kl 62 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 63 | grads = tf.gradients(kl_firstfixed, var_list) 64 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 65 | shapes = map(var_shape, var_list) 66 | start = 0 67 | tangents = [] 68 | for shape in shapes: 69 | size = np.prod(shape) 70 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 71 | tangents.append(param) 72 | start += size 73 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 74 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 75 | self.session.run(tf.initialize_all_variables()) 76 | # self.saver = tf.train.Saver(max_to_keep=10) 77 | # self.load_model(pms.checkpoint_file) 78 | 79 | def init_logger(self): 80 | head = ["rewards", "std"] 81 | self.logger = Logger(head) 82 | 83 | def learn(self): 84 | self.init_logger() 85 | iter_num = 0 86 | while True: 87 | print "\n********** Iteration %i ************" % iter_num 88 | print self.gf().mean() 89 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 90 | self.sff(theta) 91 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 92 | for k , v in stats.iteritems(): 93 | print(k + ": " + " " * (40 - len(k)) + str(v)) 94 | if iter_num % pms.save_model_times == 0: 95 | self.save_model(pms.environment_name + "-" + str(iter_num)) 96 | iter_num += 1 97 | -------------------------------------------------------------------------------- /storage/storage_image.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from utils import * 3 | from parameters import pms 4 | 5 | 6 | class Storage(object): 7 | def __init__(self, agent, env, baseline): 8 | self.paths = [] 9 | self.env = env 10 | self.agent = agent 11 | self.obs = [] 12 | self.obs_origin = [] 13 | self.baseline = baseline 14 | 15 | def get_single_path(self): 16 | self.obs_origin, self.obs, actions, rewards, action_dists = [], [], [], [], [] 17 | ob = self.env.reset() 18 | ob = self.env.render('rgb_array') 19 | # self.agent.prev_action *= 0.0 20 | # self.agent.prev_obs *= 0.0 21 | episode_steps = 0 22 | for _ in xrange(pms.max_path_length): 23 | self.obs_origin.append(ob) 24 | deal_ob = self.deal_image(ob) 25 | action, action_dist = self.agent.get_action(deal_ob) 26 | self.obs.append(deal_ob) 27 | actions.append(action) 28 | action_dists.append(action_dist) 29 | res = self.env.step(action) # res 30 | if pms.render: 31 | self.env.render() 32 | ob = res[0] 33 | ob = self.env.render('rgb_array') 34 | rewards.append([res[1]]) 35 | episode_steps += 1 36 | if res[2]: 37 | break 38 | path = dict( 39 | observations=np.concatenate([self.obs]), 40 | agent_infos=np.concatenate([action_dists]), 41 | rewards=np.array(rewards), 42 | actions=np.array(actions), 43 | episode_steps=episode_steps 44 | ) 45 | self.paths.append(path) 46 | # self.agent.prev_action *= 0.0 47 | # self.agent.prev_obs *= 0.0 48 | return path 49 | 50 | def get_paths(self): 51 | paths = self.paths 52 | self.paths = [] 53 | return paths 54 | 55 | def process_paths(self, paths): 56 | sum_episode_steps = 0 57 | for path in paths: 58 | sum_episode_steps += path['episode_steps'] 59 | # 
r_t+V(S_{t+1})-V(S_t) = returns-baseline 60 | # path_baselines = np.append(self.baseline.predict(path) , 0) 61 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 62 | # path["advantages"] = np.concatenate(path["rewards"]) + \ 63 | # pms.discount * path_baselines[1:] - \ 64 | # path_baselines[:-1] 65 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 66 | path_baselines = np.append(self.baseline.predict(path) , 0) 67 | deltas = np.concatenate(path["rewards"]) + \ 68 | pms.discount * path_baselines[1:] - \ 69 | path_baselines[:-1] 70 | path["advantages"] = discount( 71 | deltas , pms.discount * pms.gae_lambda) 72 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 73 | # Updating policy. 74 | action_dist_n = np.concatenate([path["agent_infos"] for path in paths]) 75 | obs_n = np.concatenate([path["observations"] for path in paths]) 76 | action_n = np.concatenate([path["actions"] for path in paths]) 77 | rewards = np.concatenate([path["rewards"] for path in paths]) 78 | advantages = np.concatenate([path["advantages"] for path in paths]) 79 | 80 | if pms.center_adv: 81 | advantages = (advantages - np.mean(advantages)) / (advantages.std() + 1e-8) 82 | 83 | self.baseline.fit(paths) 84 | 85 | samples_data = dict( 86 | observations=obs_n, 87 | actions=action_n, 88 | rewards=rewards, 89 | advantages=advantages, 90 | agent_infos=action_dist_n, 91 | paths=paths, 92 | sum_episode_steps=sum_episode_steps 93 | ) 94 | return samples_data 95 | 96 | def deal_image(self, image): 97 | index = len(self.obs_origin) 98 | image_end = [] 99 | if index 0: 113 | return 0 114 | else: 115 | return 1 116 | if abs(1 - np.var(y - ypred) / (vary + 1e-8)) > 1e5: 117 | import ipdb; 118 | ipdb.set_trace() 119 | return 1 - np.var(y - ypred) / (vary + 1e-8) 120 | 121 | 122 | class Rollout(threading.Thread): 123 | def __init__(self, thread_number, agent, env, baseline): 124 | super(Rollout, self).__init__() 125 | self.thread_number = thread_number 126 | self.storage = Storage(agent, env, baseline) 127 | 128 | def run(self): 129 | self.storage.get_single_path() 130 | -------------------------------------------------------------------------------- /agent/agent_cotinous_single_thread.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import threading 3 | import gym 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import time 8 | import threading 9 | import prettytensor as pt 10 | 11 | from storage.storage_continous import Storage 12 | from storage.storage_continous import Rollout 13 | import math 14 | from parameters import pms 15 | import krylov 16 | from logger.logger import Logger 17 | from distribution.diagonal_gaussian import DiagonalGaussian 18 | from baseline.baseline_lstsq import Baseline 19 | from environment import Environment 20 | from network.network_continous import NetworkContinous 21 | from agent.agent_base import TRPOAgentBase 22 | 23 | seed = 1 24 | np.random.seed(seed) 25 | tf.set_random_seed(seed) 26 | 27 | 28 | class TRPOAgentContinousSingleThread(TRPOAgentBase, threading.Thread): 29 | 30 | def __init__(self, thread_id, master): 31 | print "create thread %d"%(thread_id) 32 | self.thread_id = thread_id 33 | threading.Thread.__init__(self, name="thread_%d" % thread_id) 34 | self.master = master 35 | self.env = env = Environment(gym.make(pms.environment_name)) 36 | TRPOAgentBase.__init__(self, env) 37 | 38 | self.session = self.master.session 39 | self.init_network() 40 | 
self.saver = tf.train.Saver(max_to_keep=10) 41 | 42 | 43 | def init_network(self): 44 | """ 45 | [input] 46 | self.obs 47 | self.action_n 48 | self.advant 49 | self.old_dist_means_n 50 | self.old_dist_logstds_n 51 | [output] 52 | self.action_dist_means_n 53 | self.action_dist_logstds_n 54 | var_list 55 | """ 56 | self.net = NetworkContinous(str(self.thread_id)) 57 | if pms.min_std is not None: 58 | log_std_var = tf.maximum(self.net.action_dist_logstds_n , np.log(pms.min_std)) 59 | self.action_dist_stds_n = tf.exp(log_std_var) 60 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n , log_std=self.net.old_dist_logstds_n) 61 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n , log_std=self.net.action_dist_logstds_n) 62 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n , self.new_dist_info_vars) 63 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n , self.new_dist_info_vars , 64 | self.old_dist_info_vars) 65 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars , self.new_dist_info_vars)) 67 | ent = self.distribution.entropy(self.old_dist_info_vars) 68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 69 | self.losses = [surr , kl , ent] 70 | var_list = self.net.var_list 71 | self.gf = GetFlat(var_list) # get theta from var_list 72 | self.gf.session = self.session 73 | self.sff = SetFromFlat(var_list) # set theta from var_List 74 | self.sff.session = self.session 75 | # get g 76 | self.pg = flatgrad(surr , var_list) 77 | # get A 78 | # KL divergence where first arg is fixed 79 | # replace old->tf.stop_gradient from previous kl 80 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) 81 | grads = tf.gradients(kl_firstfixed , var_list) 82 | self.flat_tangent = tf.placeholder(dtype , shape=[None]) 83 | shapes = map(var_shape , var_list) 84 | start = 0 85 | tangents = [] 86 | for shape in shapes: 87 | size = np.prod(shape) 88 | param = tf.reshape(self.flat_tangent[start:(start + size)] , shape) 89 | tangents.append(param) 90 | start += size 91 | self.gvp = [tf.reduce_sum(g * t) for (g , t) in zip(grads , tangents)] 92 | self.fvp = flatgrad(tf.reduce_sum(self.gvp) , var_list) # get kl''*p 93 | 94 | def run(self): 95 | self.learn() 96 | 97 | def learn(self): 98 | i = 0 99 | sum_gradient = 0 100 | while True: 101 | self.sff(self.master.get_parameters()) 102 | 103 | # Generating paths. 
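# Note on the update loop below: each worker first re-syncs with the master's
# parameters (the sff call above), then runs one TRPO mini-batch locally and
# accumulates the resulting parameter delta (theta - theta_prev) into
# sum_gradient, which is pushed back to the shared master via apply_gradient.
# Only thread 1 writes checkpoints, so the master holds the single shared policy.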
104 | stats, theta, theprev = self.train_mini_batch(parallel=False) 105 | sum_gradient += theta-theprev 106 | self.master.apply_gradient(sum_gradient) 107 | print "\n********** Iteration %i ************" % i 108 | for k , v in stats.iteritems(): 109 | print(k + ": " + " " * (40 - len(k)) + str(v)) 110 | sum_gradient = 0 111 | if self.thread_id==1 and i%pms.save_model_times==0: 112 | self.save_model(pms.environment_name + "-" + str(i)) 113 | i += 1 114 | 115 | 116 | def test(self): 117 | self.sff(self.master.get_parameters()) 118 | for i in range(50): 119 | self.storage.get_single_path() 120 | -------------------------------------------------------------------------------- /agent/AC_agent_continous.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | from logger.logger import Logger 8 | import math 9 | import time 10 | 11 | seed = 1 12 | np.random.seed(seed) 13 | tf.set_random_seed(seed) 14 | class ACAgent(TRPOAgentBase): 15 | 16 | def __init__(self, env): 17 | super(ACAgent, self).__init__(env) 18 | self.init_network() 19 | self.saver = tf.train.Saver(max_to_keep=10) 20 | 21 | 22 | def init_network(self): 23 | """ 24 | [input] 25 | self.obs 26 | self.action_n 27 | self.advant 28 | self.old_dist_means_n 29 | self.old_dist_logstds_n 30 | [output] 31 | self.action_dist_means_n 32 | self.action_dist_logstds_n 33 | var_list 34 | """ 35 | self.net = NetworkContinous("network_continous_ac") 36 | if pms.min_std is not None: 37 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 38 | self.action_dist_stds_n = tf.exp(log_std_var) 39 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 40 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 41 | self.likehood_new_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 42 | # surr = - log(\pi_\theta)*(Q^\pi-V^\pi) 43 | value_loss = 0.5*tf.square(self.net.advant) 44 | surr = -tf.reduce_sum(self.likehood_new_action_dist*tf.stop_gradient(self.net.advant)+value_loss) # Surrogate loss 45 | 46 | batch_size = tf.shape(self.net.obs)[0] 47 | batch_size_float = tf.cast(batch_size , tf.float32) 48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 49 | ent = self.distribution.entropy(self.old_dist_info_vars) 50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 51 | self.losses = [surr, kl, ent] 52 | var_list = self.net.var_list 53 | self.gf = GetFlat(var_list) # get theta from var_list 54 | self.gf.session = self.session 55 | self.sff = SetFromFlat(var_list) # set theta from var_List 56 | self.sff.session = self.session 57 | # get g 58 | self.pg = flatgrad(surr, var_list) 59 | 60 | self.session.run(tf.initialize_all_variables()) 61 | # self.saver = tf.train.Saver(max_to_keep=10) 62 | # self.load_model(pms.checkpoint_file) 63 | 64 | def init_logger(self): 65 | head = ["std", "rewards"] 66 | self.logger = Logger(head) 67 | 68 | def train_mini_batch(self, parallel=False, linear_search=True): 69 | # Generating paths. 
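# Unlike the TRPO agents, this actor-critic variant takes a plain gradient step:
# g below is the gradient of the surrogate built in init_network() (the
# log-likelihood * stop_gradient(advantage) term plus the 0.5 * advant^2 value
# loss), and the parameters are updated as theta = theta_prev + 0.01 * g,
# i.e. a fixed step size with no trust-region constraint or line search.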
70 | print("Rollout") 71 | start_time = time.time() 72 | self.get_samples(pms.paths_number) 73 | paths = self.storage.get_paths() # get_paths 74 | # Computing returns and estimating advantage function. 75 | sample_data = self.storage.process_paths(paths) 76 | agent_infos = sample_data["agent_infos"] 77 | obs_n = sample_data["observations"] 78 | action_n = sample_data["actions"] 79 | advant_n = sample_data["advantages"] 80 | n_samples = len(obs_n) 81 | inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False) 82 | # inds = range(n_samples) 83 | obs_n = obs_n[inds] 84 | action_n = action_n[inds] 85 | advant_n = advant_n[inds] 86 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]]) 87 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]]) 88 | feed = {self.net.obs: obs_n, 89 | self.net.advant: advant_n, 90 | self.net.old_dist_means_n: action_dist_means_n, 91 | self.net.old_dist_logstds_n: action_dist_logstds_n, 92 | self.net.action_n: action_n 93 | } 94 | 95 | episoderewards = np.array([path["rewards"].sum() for path in paths]) 96 | thprev = self.gf() # get theta_old 97 | 98 | g = self.session.run(self.pg, feed_dict=feed) 99 | theta = thprev+0.01*g 100 | stats = {} 101 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"] 102 | stats["Average sum of rewards per episode"] = episoderewards.mean() 103 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 104 | return stats, theta, thprev 105 | 106 | def learn(self): 107 | self.init_logger() 108 | iter_num = 0 109 | while True: 110 | print "\n********** Iteration %i ************" % iter_num 111 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 112 | self.sff(theta) 113 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)]) 114 | for k , v in stats.iteritems(): 115 | print(k + ": " + " " * (40 - len(k)) + str(v)) 116 | if iter_num % pms.save_model_times == 0: 117 | self.save_model(pms.environment_name + "-" + str(iter_num)) 118 | iter_num += 1 119 | -------------------------------------------------------------------------------- /network/network_continous_rnn.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | 4 | import tensorflow as tf 5 | import prettytensor as pt 6 | from parameters import pms 7 | 8 | seed = 1 9 | np.random.seed(seed) 10 | tf.set_random_seed(seed) 11 | 12 | class InnerLSTMCell(tf.nn.rnn_cell.BasicLSTMCell): 13 | def __init__(self , num_units , forget_bias=1.0 , input_size=None): 14 | tf.nn.rnn_cell.BasicLSTMCell.__init__(self , num_units , forget_bias=forget_bias , input_size=input_size) 15 | self.matrix , self.bias = None , None 16 | 17 | 18 | def __call__(self , inputs , state , scope=None): 19 | """ 20 | Long short-term memory cell (LSTM). 21 | implement from BasicLSTMCell.__call__ 22 | """ 23 | with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" 24 | # Parameters of gates are concatenated into one multiply for efficiency. 
25 | c , h = tf.split(1 , 2 , state) 26 | concat = self.linear([inputs , h] , 4 * self._num_units , True) 27 | 28 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 29 | i , j , f , o = tf.split(1 , 4 , concat) 30 | 31 | new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j) 32 | new_h = tf.tanh(new_c) * tf.sigmoid(o) 33 | 34 | return new_h , tf.concat(1 , [new_c , new_h]) 35 | 36 | 37 | def linear(self , args , output_size , bias , bias_start=0.0 , scope=None): 38 | """ 39 | Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 40 | implement from function of tensorflow.python.ops.rnn_cell.linear() 41 | """ 42 | if args is None or (isinstance(args , (list , tuple)) and not args): 43 | raise ValueError("`args` must be specified") 44 | if not isinstance(args , (list , tuple)): 45 | args = [args] 46 | 47 | # Calculate the total size of arguments on dimension 1. 48 | total_arg_size = 0 49 | shapes = [a.get_shape().as_list() for a in args] 50 | for shape in shapes: 51 | if len(shape) != 2: 52 | raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) 53 | if not shape[1]: 54 | raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) 55 | else: 56 | total_arg_size += shape[1] 57 | 58 | # Now the computation. 59 | with tf.variable_scope(scope or "Linear"): 60 | matrix = tf.get_variable("Matrix" , [total_arg_size , output_size]) 61 | if len(args) == 1: 62 | res = tf.matmul(args[0] , matrix) 63 | else: 64 | res = tf.matmul(tf.concat(1 , args) , matrix) 65 | if not bias: 66 | return res 67 | bias_term = tf.get_variable( 68 | "Bias" , [output_size] , 69 | initializer=tf.constant_initializer(bias_start)) 70 | self.matrix = matrix 71 | self.bias = bias_term 72 | return res + bias_term 73 | 74 | class NetworkContinousLSTM(object): 75 | def __init__(self, scope): 76 | with tf.variable_scope("%s_shared" % scope): 77 | self.obs = obs = tf.placeholder( 78 | dtype, shape=[None, pms.obs_shape], name="%s_obs"%scope) 79 | self.action_n = tf.placeholder(dtype, shape=[None, pms.action_shape], name="%s_action"%scope) 80 | self.advant = tf.placeholder(dtype, shape=[None], name="%s_advant"%scope) 81 | self.old_dist_means_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 82 | name="%s_oldaction_dist_means"%scope) 83 | self.old_dist_logstds_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 84 | name="%s_oldaction_dist_logstds"%scope) 85 | # self.obs_reshape = tf.reshape(self.obs, [None, 1, pms.obs_shape]) 86 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(3, forget_bias=1.0, state_is_tuple=True) 87 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper( 88 | lstm_cell, output_keep_prob=0.5) 89 | rnn = tf.nn.rnn_cell.MultiRNNCell([lstm_cell], state_is_tuple=True) 90 | # rnn = tf.nn.rnn_cell.BasicRNNCell(3) 91 | self.initial_state = state = rnn.zero_state(tf.shape(self.obs)[0], tf.float32) 92 | # output , state = tf.nn.dynamic_rnn(rnn, self.obs) 93 | output, state = rnn(self.obs, state) 94 | self.action_dist_means_n = (pt.wrap(output). 95 | # fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 96 | # name="%s_fc1"%scope). 97 | # fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 98 | # name="%s_fc2"%scope). 
99 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), 100 | name="%s_fc3"%scope)) 101 | self.N = tf.shape(obs)[0] 102 | Nf = tf.cast(self.N, dtype) 103 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), trainable=False, name="%spolicy_logstd"%scope) 104 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param, 105 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1))) 106 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)] 107 | 108 | def get_action_dist_means_n(self, session, obs): 109 | return session.run(self.action_dist_means_n, 110 | {self.obs: obs}) 111 | 112 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import random 4 | import scipy.signal 5 | import prettytensor as pt 6 | from parameters import pms 7 | import threading 8 | from tensorflow.contrib.layers.python.layers import initializers 9 | 10 | seed = 1 11 | random.seed(seed) 12 | np.random.seed(seed) 13 | tf.set_random_seed(seed) 14 | 15 | dtype = tf.float32 16 | 17 | def discount(x, gamma): 18 | """ 19 | scipy.signal.lfilter(b, a, x, axis=-1, zi=None)[source] 20 | a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M] 21 | - a[1]*y[n-1] - ... - a[N]*y[n-N] 22 | :param x: 23 | :param gamma: 24 | :return: 25 | """ 26 | assert x.ndim >= 1 27 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 28 | 29 | 30 | 31 | 32 | 33 | def cat_sample(prob_nk): 34 | assert prob_nk.ndim == 2 35 | N = prob_nk.shape[0] 36 | csprob_nk = np.cumsum(prob_nk, axis=1) 37 | out = np.zeros(N, dtype='i') 38 | for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)): 39 | for (k, csprob) in enumerate(csprob_k): 40 | if csprob > r: 41 | out[n] = k 42 | break 43 | return out 44 | 45 | 46 | def var_shape(x): 47 | out = [k.value for k in x.get_shape()] 48 | assert all(isinstance(a, int) for a in out), \ 49 | "shape function assumes that shape is fully known" 50 | return out 51 | 52 | 53 | def numel(x): 54 | return np.prod(var_shape(x)) 55 | 56 | 57 | def flatgrad(loss, var_list): 58 | grads = tf.gradients(loss, var_list) 59 | return tf.concat(0, [tf.reshape(grad, [np.prod(var_shape(v))]) 60 | for (grad, v) in zip( grads, var_list)]) 61 | 62 | # set theta 63 | class SetFromFlat(object): 64 | def __init__(self, var_list): 65 | assigns = [] 66 | shapes = map(var_shape, var_list) 67 | total_size = sum(np.prod(shape) for shape in shapes) 68 | self.theta = theta = tf.placeholder(tf.float32, [total_size]) 69 | start = 0 70 | assigns = [] 71 | for (shape, v) in zip(shapes, var_list): 72 | size = np.prod(shape) 73 | assigns.append( 74 | tf.assign( 75 | v, 76 | tf.reshape( 77 | theta[ 78 | start:start + 79 | size], 80 | shape))) 81 | start += size 82 | self.op = tf.group(*assigns) 83 | 84 | def __call__(self, theta): 85 | self.session.run(self.op, feed_dict={self.theta: theta}) 86 | 87 | # get theta 88 | class GetFlat(object): 89 | def __init__(self, var_list): 90 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 91 | 92 | def __call__(self): 93 | return self.op.eval(session=self.session) 94 | 95 | 96 | def slice_2d(x, inds0, inds1): 97 | # assume that a path have 1000 vector, then ncols=action dims, inds0=1000,inds1= 98 | inds0 = tf.cast(inds0, tf.int64) 99 | inds1 = tf.cast(inds1, tf.int64) 100 | shape = 
tf.cast(tf.shape(x), tf.int64) 101 | ncols = shape[1] 102 | x_flat = tf.reshape(x, [-1]) 103 | return tf.gather(x_flat, inds0 * ncols + inds1) 104 | 105 | 106 | # def linesearch(f, x, fullstep, expected_improve_rate): 107 | # accept_ratio = .1 108 | # max_backtracks = 10 109 | # fval, old_kl, entropy = f(x) 110 | # for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)): 111 | # xnew = x + stepfrac * fullstep 112 | # newfval, new_kl, new_ent= f(xnew) 113 | # # actual_improve = newfval - fval # minimize target object 114 | # # expected_improve = expected_improve_rate * stepfrac 115 | # # ratio = actual_improve / expected_improve 116 | # # if ratio > accept_ratio and actual_improve > 0: 117 | # # return xnew 118 | # if newfval accept_ratio and actual_improve > 0: 133 | # pms.max_kl *= 1.002 134 | # return xnew 135 | if newfval 1 105 | 106 | alphas = [] 107 | betas = [] 108 | qs = [] 109 | 110 | q = b / np.linalg.norm(b) 111 | beta = 0 112 | qm = np.zeros_like(b) 113 | for j in xrange(k): 114 | qs.append(q) 115 | 116 | z = f_Ax(q) 117 | 118 | alpha = q.dot(z) 119 | alphas.append(alpha) 120 | z -= alpha * q + beta * qm 121 | 122 | beta = np.linalg.norm(z) 123 | betas.append(beta) 124 | 125 | print "beta", beta 126 | if beta < 1e-9: 127 | print "lanczos: early after %i/%i dimensions" % (j + 1, k) 128 | break 129 | else: 130 | qm = q 131 | q = z / beta 132 | 133 | return np.array(qs, 'float64').T, np.array(alphas, 'float64'), np.array(betas[:-1], 'float64') 134 | 135 | 136 | def lanczos2(f_Ax, b, k, residual_thresh=1e-9): 137 | """ 138 | Runs Lanczos algorithm to generate a orthogonal basis for the Krylov subspace 139 | b, Ab, A^2b, ... 140 | as well as the upper hessenberg matrix T = Q^T A Q 141 | from Demmel ch 6 142 | """ 143 | b = b.astype('float64') 144 | assert k > 1 145 | H = np.zeros((k, k)) 146 | qs = [] 147 | 148 | q = b / np.linalg.norm(b) 149 | beta = 0 150 | 151 | for j in xrange(k): 152 | qs.append(q) 153 | 154 | z = f_Ax(q.astype('float64')).astype('float64') 155 | for (i, q) in enumerate(qs): 156 | H[j, i] = H[i, j] = h = q.dot(z) 157 | z -= h * q 158 | 159 | beta = np.linalg.norm(z) 160 | if beta < residual_thresh: 161 | print "lanczos2: stopping early after %i/%i dimensions residual %f < %f" % (j + 1, k, beta, residual_thresh) 162 | break 163 | else: 164 | q = z / beta 165 | 166 | return np.array(qs).T, H[:len(qs), :len(qs)] 167 | 168 | 169 | def make_tridiagonal(alphas, betas): 170 | assert len(alphas) == len(betas) + 1 171 | N = alphas.size 172 | out = np.zeros((N, N), 'float64') 173 | out.flat[0:N ** 2:N + 1] = alphas 174 | out.flat[1:N ** 2 - N:N + 1] = betas 175 | out.flat[N:N ** 2 - 1:N + 1] = betas 176 | return out 177 | 178 | 179 | def tridiagonal_eigenvalues(alphas, betas): 180 | T = make_tridiagonal(alphas, betas) 181 | return np.linalg.eigvalsh(T) 182 | 183 | 184 | def test_lanczos(): 185 | np.set_printoptions(precision=4) 186 | 187 | A = np.random.randn(5, 5) 188 | A = A.T.dot(A) 189 | b = np.random.randn(5) 190 | f_Ax = lambda x: A.dot(x) # pylint: disable=W0108 191 | Q, alphas, betas = lanczos(f_Ax, b, 10) 192 | H = make_tridiagonal(alphas, betas) 193 | assert np.allclose(Q.T.dot(A).dot(Q), H) 194 | assert np.allclose(Q.dot(H).dot(Q.T), A) 195 | assert np.allclose(np.linalg.eigvalsh(H), np.linalg.eigvalsh(A)) 196 | 197 | Q, H1 = lanczos2(f_Ax, b, 10) 198 | assert np.allclose(H, H1, atol=1e-6) 199 | 200 | print "ritz eigvals:" 201 | for i in xrange(1, 6): 202 | Qi = Q[:, :i] 203 | Hi = Qi.T.dot(A).dot(Qi) 204 | print np.linalg.eigvalsh(Hi)[::-1] 
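# The eigenvalues of H_i = Q_i^T A Q_i (the Ritz values) printed above should
# approach the extreme eigenvalues of A as i grows, so the rows converge toward
# the true spectrum printed next.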
205 | print "true eigvals:" 206 | print np.linalg.eigvalsh(A)[::-1] 207 | 208 | print "lanczos on ill-conditioned problem" 209 | A = np.diag(10 ** np.arange(5)) 210 | Q, H1 = lanczos2(f_Ax, b, 10) 211 | print np.linalg.eigvalsh(H1) 212 | 213 | print "lanczos on ill-conditioned problem with noise" 214 | 215 | def f_Ax_noisy(x): 216 | return A.dot(x) + np.random.randn(x.size) * 1e-3 217 | 218 | Q, H1 = lanczos2(f_Ax_noisy, b, 10) 219 | print np.linalg.eigvalsh(H1) 220 | 221 | def compute_hessian(fn, vars): 222 | mat = [] 223 | for v1 in vars: 224 | temp = [] 225 | for v2 in vars: 226 | # computing derivative twice, first w.r.t v2 and then w.r.t v1 227 | temp.append(tf.gradients(tf.gradients(fn, v2)[0], v1)[0]) 228 | temp = [tf.cons(0) if t == None else t for t in temp] # tensorflow returns None when there is no gradient, so we replace None with 0 229 | temp = tf.pack(temp) 230 | mat.append(temp) 231 | mat = tf.pack(mat) 232 | return mat 233 | 234 | if __name__ == "__main__": 235 | test_lanczos() 236 | test_cg() 237 | 238 | -------------------------------------------------------------------------------- /storage/storage_continous_parallel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import multiprocessing 4 | from utils import * 5 | import gym 6 | import time 7 | import copy 8 | from random import randint 9 | from parameters import pms 10 | import math 11 | from network.network_continous import NetworkContinous 12 | 13 | 14 | class Actor(multiprocessing.Process): 15 | def __init__(self, args, task_q, result_q, actor_id, monitor): 16 | multiprocessing.Process.__init__(self) 17 | self.actor_id = actor_id 18 | self.task_q = task_q 19 | self.result_q = result_q 20 | self.args = args 21 | self.monitor = monitor 22 | # pms.max_path_length = gym.spec(args.environment_name).timestep_limit 23 | 24 | 25 | def get_action(self, obs): 26 | if self.net == None: 27 | raise NameError("network have not been defined") 28 | obs = np.expand_dims(obs , 0) 29 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 30 | action_dist_means_n , action_dist_logstds_n = self.session.run( 31 | [self.net.action_dist_means_n, self.net.action_dist_logstds_n], feed_dict={self.net.obs: obs}) 32 | if pms.train_flag: 33 | rnd = np.random.normal(size=action_dist_means_n[0].shape) 34 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0] 35 | else: 36 | action = action_dist_means_n[0] 37 | # action = np.clip(action, pms.min_a, pms.max_a) 38 | return action , dict(mean=action_dist_means_n[0] , log_std=np.exp(action_dist_logstds_n[0])) 39 | 40 | def run(self): 41 | self.env = gym.make(self.args.environment_name) 42 | self.env.seed(randint(0, 999999)) 43 | if self.monitor: 44 | self.env.monitor.start('monitor/', force=True) 45 | 46 | self.net = NetworkContinous("rollout_network" + str(self.actor_id)) 47 | config = tf.ConfigProto( 48 | device_count={'GPU': 0} 49 | ) 50 | self.session = tf.Session(config=config) 51 | var_list = self.net.var_list 52 | self.session.run(tf.initialize_all_variables()) 53 | self.set_policy = SetFromFlat(var_list) 54 | self.set_policy.session = self.session 55 | while True: 56 | # get a task, or wait until it gets one 57 | next_task = self.task_q.get(block=True) 58 | if type(next_task) is int and next_task == 1: 59 | # the task is an actor request to collect experience 60 | path = self.rollout() 61 | # print "single rollout time:"+str(end-start) 62 | self.task_q.task_done() 63 | 
self.result_q.put(path) 64 | elif type(next_task) is int and next_task == 2: 65 | print "kill message" 66 | if self.monitor: 67 | self.env.monitor.close() 68 | self.task_q.task_done() 69 | break 70 | else: 71 | # the task is to set parameters of the actor policy 72 | next_task = np.array(next_task) 73 | self.set_policy(next_task) 74 | # super hacky method to make sure when we fill the queue with set parameter tasks, 75 | # an actor doesn't finish updating before the other actors can accept their own tasks. 76 | time.sleep(0.1) 77 | self.task_q.task_done() 78 | return 79 | 80 | def rollout(self): 81 | """ 82 | :param:observations:obs list 83 | :param:actions:action list 84 | :param:rewards:reward list 85 | :param:agent_infos: mean+log_std dictlist 86 | :param:env_infos: no use, just information about environment 87 | :return: a path, list 88 | """ 89 | # if pms.record_movie: 90 | # outdir = 'log/trpo' 91 | # self.env.monitor.start(outdir , force=True) 92 | observations = [] 93 | actions = [] 94 | rewards = [] 95 | agent_infos = [] 96 | env_infos = [] 97 | if pms.render: 98 | self.env.render() 99 | o = self.env.reset() 100 | episode_steps = 0 101 | for i in xrange(pms.max_path_length - 1): 102 | a, agent_info = self.get_action(o) 103 | next_o, reward, terminal, env_info = self.env.step(a) 104 | observations.append(o) 105 | rewards.append(np.array([reward])) 106 | actions.append(a) 107 | agent_infos.append([agent_info]) 108 | env_infos.append([]) 109 | episode_steps += 1 110 | if terminal: 111 | break 112 | o = next_o 113 | if pms.render: 114 | self.env.render() 115 | path = dict( 116 | observations=np.array(observations) , 117 | actions=np.array(actions) , 118 | rewards=np.array(rewards) , 119 | agent_infos=np.concatenate(agent_infos) , 120 | env_infos=np.concatenate(env_infos) , 121 | episode_steps=episode_steps 122 | ) 123 | return path 124 | 125 | class ParallelStorage(): 126 | def __init__(self): 127 | self.args = pms 128 | self.tasks = multiprocessing.JoinableQueue() 129 | self.results = multiprocessing.Queue() 130 | self.actors = [] 131 | self.actors.append(Actor(self.args, self.tasks, self.results, 9999, self.args.record_movie)) 132 | for i in xrange(self.args.jobs-1): 133 | self.actors.append(Actor(self.args, self.tasks, self.results, 37*(i+3), False)) 134 | for a in self.actors: 135 | a.start() 136 | # we will start by running 20,000 / 1000 = 20 episodes for the first ieration 137 | self.average_timesteps_in_episode = 1000 138 | 139 | def get_paths(self): 140 | # keep 20,000 timesteps per update 141 | num_rollouts = self.args.paths_number 142 | # print "rollout_number:"+str(num_rollouts) 143 | for i in xrange(num_rollouts): 144 | self.tasks.put(1) 145 | start = time.time() 146 | self.tasks.join() 147 | end = time.time() 148 | # print "rollout real time"+str(end-start) 149 | paths = [] 150 | while num_rollouts: 151 | num_rollouts -= 1 152 | paths.append(self.results.get()) 153 | return paths 154 | 155 | # def process_paths(self, paths): 156 | # sum_episode_steps = 0 157 | # for path in paths: 158 | # sum_episode_steps += path['episode_steps'] 159 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 160 | # # path_baselines = np.append(self.baseline.predict(path) , 0) 161 | # # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 162 | # # path["advantages"] = np.concatenate(path["rewards"]) + \ 163 | # # pms.discount * path_baselines[1:] - \ 164 | # # path_baselines[:-1] 165 | # # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 166 | # path_baselines = 
np.append(self.baseline.predict(path) , 0) 167 | # deltas = np.concatenate(path["rewards"]) + \ 168 | # pms.discount * path_baselines[1:] - \ 169 | # path_baselines[:-1] 170 | # path["advantages"] = discount( 171 | # deltas , pms.discount * pms.gae_lambda) 172 | # path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 173 | # observations = np.concatenate([path["observations"] for path in paths]) 174 | # actions = np.concatenate([path["actions"] for path in paths]) 175 | # rewards = np.concatenate([path["rewards"] for path in paths]) 176 | # advantages = np.concatenate([path["advantages"] for path in paths]) 177 | # env_infos = np.concatenate([path["env_infos"] for path in paths]) 178 | # agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 179 | # if pms.center_adv: 180 | # advantages -= np.mean(advantages) 181 | # advantages /= (advantages.std() + 1e-8) 182 | # samples_data = dict( 183 | # observations=observations , 184 | # actions=actions , 185 | # rewards=rewards , 186 | # advantages=advantages , 187 | # env_infos=env_infos , 188 | # agent_infos=agent_infos , 189 | # paths=paths , 190 | # sum_episode_steps=sum_episode_steps 191 | # ) 192 | # self.baseline.fit(paths) 193 | # return samples_data 194 | 195 | def set_policy_weights(self, parameters): 196 | for i in xrange(self.args.jobs): 197 | self.tasks.put(parameters) 198 | self.tasks.join() 199 | 200 | def end(self): 201 | for i in xrange(self.args.jobs): 202 | self.tasks.put(2) -------------------------------------------------------------------------------- /agent/agent_discrete.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from dealImage import * 3 | from logger.logger import Logger 4 | import krylov 5 | import numpy as np 6 | import random 7 | import tensorflow as tf 8 | import time 9 | 10 | import prettytensor as pt 11 | 12 | from storage.storage import Storage 13 | from parameters import pms 14 | from distribution.diagonal_category import DiagonalCategory 15 | from baseline.baseline_lstsq import Baseline 16 | import gym 17 | from environment import Environment 18 | 19 | class TRPOAgent(object): 20 | def __init__(self, env): 21 | self.env = env 22 | # if not isinstance(env.observation_space, Box) or \ 23 | # not isinstance(env.action_space, Discrete): 24 | # print("Incompatible spaces.") 25 | # exit(-1) 26 | print("Observation Space", env.observation_space) 27 | print("Action Space", env.action_space) 28 | self.distribution = DiagonalCategory() 29 | self.session = tf.Session() 30 | self.baseline = Baseline() 31 | self.end_count = 0 32 | self.paths = [] 33 | self.train = True 34 | self.storage = Storage(self, self.env, self.baseline) 35 | self.init_network() 36 | if pms.train_flag: 37 | self.init_logger() 38 | 39 | def init_logger(self): 40 | head = ["average_episode_std", "total number of episodes", "Average sum of rewards per episode", 41 | "KL between old and new distribution", "Surrogate loss", "Surrogate loss prev", "ds", "entropy", 42 | "mean_advant", "sum_episode_steps"] 43 | self.logger = Logger(head) 44 | 45 | def init_network(self): 46 | self.obs = obs = tf.placeholder( 47 | dtype, shape=[None, self.env.observation_space.shape[0]], name="obs") 48 | self.action = action = tf.placeholder(tf.int64, shape=[None], name="action") 49 | self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant") 50 | self.oldaction_dist = oldaction_dist = tf.placeholder(dtype, shape=[None, self.env.action_space.n], 51 | 
name="oldaction_dist") 52 | 53 | # Create neural network. 54 | action_dist_n, _ = (pt.wrap(self.obs). 55 | fully_connected(32, activation_fn=tf.nn.relu). 56 | fully_connected(32, activation_fn=tf.nn.relu). 57 | softmax_classifier(self.env.action_space.n)) 58 | eps = 1e-6 59 | self.action_dist_n = action_dist_n 60 | N = tf.shape(obs)[0] 61 | ratio_n = self.distribution.likelihood_ratio_sym(action, action_dist_n, oldaction_dist) 62 | Nf = tf.cast(N, dtype) 63 | surr = -tf.reduce_mean(ratio_n * advant) # Surrogate loss 64 | kl = self.distribution.kl_sym(oldaction_dist, action_dist_n) 65 | ent = self.distribution.entropy(action_dist_n) 66 | 67 | self.losses = [surr, kl, ent] 68 | 69 | var_list = tf.trainable_variables() 70 | self.pg = flatgrad(surr, var_list) 71 | # KL divergence where first arg is fixed 72 | # replace old->tf.stop_gradient from previous kl 73 | kl_firstfixed = tf.reduce_sum(tf.stop_gradient( 74 | action_dist_n) * tf.log(tf.stop_gradient(action_dist_n + eps) / (action_dist_n + eps))) / Nf 75 | grads = tf.gradients(kl_firstfixed, var_list) 76 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 77 | shapes = map(var_shape, var_list) 78 | start = 0 79 | tangents = [] 80 | for shape in shapes: 81 | size = np.prod(shape) 82 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 83 | tangents.append(param) 84 | start += size 85 | gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 86 | self.fvp = flatgrad(gvp, var_list) 87 | self.gf = GetFlat(var_list) 88 | self.gf.session = self.session 89 | self.sff = SetFromFlat(var_list) 90 | self.sff.session = self.session 91 | self.saver = tf.train.Saver(max_to_keep=10) 92 | self.session.run(tf.initialize_all_variables()) 93 | 94 | # self.load_model(pms.checkpoint_file) 95 | 96 | def get_samples(self, path_number): 97 | for i in range(path_number): 98 | self.storage.get_single_path() 99 | 100 | def act(self, obs, *args): 101 | obs = np.expand_dims(obs, 0) 102 | action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs}) 103 | 104 | if self.train: 105 | action = int(cat_sample(action_dist_n)[0]) 106 | else: 107 | action = int(np.argmax(action_dist_n)) 108 | return action, action_dist_n, np.squeeze(obs) 109 | 110 | def learn(self): 111 | start_time = time.time() 112 | numeptotal = 0 113 | for iteration in range(pms.max_iter_number): 114 | # Generating paths. 115 | print("Rollout") 116 | self.get_samples(pms.paths_number) 117 | paths = self.storage.get_paths() # get_paths 118 | # Computing returns and estimating advantage function. 
119 | 120 | sample_data = self.storage.process_paths(paths) 121 | # shape = sample_data["observations"].shape 122 | # vis_square(np.reshape(sample_data["observations"],(shape[0], shape[2], shape[3]))[1:10]) 123 | feed = {self.obs: sample_data["observations"], 124 | self.action: sample_data["actions"], 125 | self.advant: sample_data["advantages"], 126 | self.oldaction_dist: sample_data["agent_infos"]} 127 | 128 | print "\n********** Iteration %i ************" % iteration 129 | if self.train: 130 | thprev = self.gf() 131 | def fisher_vector_product(p): 132 | feed[self.flat_tangent] = p 133 | return self.session.run(self.fvp, feed) + pms.cg_damping * p 134 | 135 | g = self.session.run(self.pg, feed_dict=feed) 136 | stepdir = krylov.cg(fisher_vector_product, -g) 137 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta 138 | fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs) 139 | neggdotstepdir = -g.dot(stepdir) 140 | 141 | def loss(th): 142 | self.sff(th) 143 | return self.session.run(self.losses, feed_dict=feed) 144 | 145 | surr_prev, kl_prev, entropy = loss(thprev) 146 | theta = linesearch(loss, thprev, fullstep, neggdotstepdir) 147 | self.sff(theta) 148 | 149 | surrafter, kloldnew, entropy = self.session.run( 150 | self.losses, feed_dict=feed) 151 | 152 | stats = {} 153 | episoderewards = np.sum(sample_data["rewards"]) 154 | numeptotal += len(sample_data["rewards"]) 155 | mean_advant = np.mean(sample_data["advantages"]) 156 | stats["Total number of episodes"] = numeptotal 157 | stats["Average sum of rewards per episode"] = np.mean(sample_data["rewards"]) 158 | # stats["Entropy"] = entropy 159 | # exp = explained_variance(np.array(sample_data[""]), np.array(returns_n)) 160 | # stats["Baseline explained"] = exp 161 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 162 | stats["KL between old and new distribution"] = kloldnew 163 | stats["Surrogate loss"] = surrafter 164 | stats['Sum episode steps'] = sample_data["sum_episode_steps"] 165 | log_data = [0, numeptotal, episoderewards.mean(), kloldnew, surrafter, surr_prev, 166 | surrafter - surr_prev, 167 | entropy, mean_advant, sample_data["sum_episode_steps"]] 168 | if pms.train_flag: 169 | self.logger.log_row(log_data) 170 | for k, v in stats.iteritems(): 171 | print(k + ": " + " " * (40 - len(k)) + str(v)) 172 | if iteration % pms.save_model_times == 0: 173 | self.save_model(pms.environment_name + "-" + str(iteration)) 174 | 175 | def test(self, model_name): 176 | self.load_model(model_name) 177 | if pms.record_movie: 178 | for i in range(100): 179 | self.storage.get_single_path() 180 | self.env.env.monitor.close() 181 | if pms.upload_to_gym: 182 | gym.upload("log/trpo" , algorithm_id='alg_8BgjkAsQRNiWu11xAhS4Hg' , api_key='sk_IJhy3b2QkqL3LWzgBXoVA') 183 | else: 184 | for i in range(50): 185 | self.storage.get_single_path() 186 | 187 | def save_model(self, model_name): 188 | self.saver.save(self.session, pms.checkpoint_dir + model_name + ".ckpt") 189 | 190 | def load_model(self, model_name): 191 | try: 192 | if model_name is not None: 193 | self.saver.restore(self.session, model_name) 194 | else: 195 | self.saver.restore(self.session, tf.train.latest_checkpoint(pms.checkpoint_dir)) 196 | except: 197 | print "load model %s fail" % (model_name) 198 | -------------------------------------------------------------------------------- /storage/storage_continous_parallel_image.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from utils import 
* 3 | import gym 4 | import time 5 | from random import randint 6 | from parameters import pms 7 | from network.network_continous_image import NetworkContinousImage 8 | import cv2 9 | 10 | 11 | class Actor(multiprocessing.Process): 12 | def __init__(self, args, task_q, result_q, actor_id, monitor): 13 | multiprocessing.Process.__init__(self) 14 | self.actor_id = actor_id 15 | self.task_q = task_q 16 | self.result_q = result_q 17 | self.args = args 18 | self.monitor = monitor 19 | # pms.max_path_length = gym.spec(args.environment_name).timestep_limit 20 | 21 | def get_action(self, obs): 22 | if self.net == None: 23 | raise NameError("network have not been defined") 24 | obs = np.expand_dims(obs , 0) 25 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 26 | action_dist_means_n , action_dist_logstds_n = self.session.run( 27 | [self.net.action_dist_means_n, self.net.action_dist_logstds_n], feed_dict={self.net.obs: obs}) 28 | if pms.train_flag: 29 | rnd = np.random.normal(size=action_dist_means_n[0].shape) 30 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0] 31 | else: 32 | action = action_dist_means_n[0] 33 | # action = np.clip(action, pms.min_a, pms.max_a) 34 | return action, dict(mean=action_dist_means_n[0] , log_std=np.exp(action_dist_logstds_n[0])) 35 | 36 | def run(self): 37 | self.env = gym.make(self.args.environment_name) 38 | self.env.seed(randint(0, 999999)) 39 | if self.monitor: 40 | self.env.monitor.start('monitor/', force=True) 41 | 42 | self.net = NetworkContinousImage("rollout_network" + str(self.actor_id)) 43 | config = tf.ConfigProto( 44 | device_count={'GPU': 0} 45 | ) 46 | self.session = tf.Session(config=config) 47 | var_list = self.net.var_list 48 | self.session.run(tf.initialize_all_variables()) 49 | self.set_policy = SetFromFlat(var_list) 50 | self.set_policy.session = self.session 51 | while True: 52 | # get a task, or wait until it gets one 53 | next_task = self.task_q.get(block=True) 54 | if type(next_task) is int and next_task == 1: 55 | # the task is an actor request to collect experience 56 | path = self.rollout() 57 | # print "single rollout time:"+str(end-start) 58 | self.task_q.task_done() 59 | self.result_q.put(path) 60 | elif type(next_task) is int and next_task == 2: 61 | print "kill message" 62 | if self.monitor: 63 | self.env.monitor.close() 64 | self.task_q.task_done() 65 | break 66 | else: 67 | # the task is to set parameters of the actor policy 68 | next_task = np.array(next_task) 69 | self.set_policy(next_task) 70 | # super hacky method to make sure when we fill the queue with set parameter tasks, 71 | # an actor doesn't finish updating before the other actors can accept their own tasks. 
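# Task protocol used by ParallelStorageImage: an integer 1 asks this actor for
# one rollout (the path is pushed onto result_q), an integer 2 tells it to shut
# down, and any other message is treated as a flat parameter vector to load
# into the local policy copy via set_policy.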
72 | time.sleep(0.1) 73 | self.task_q.task_done() 74 | return 75 | 76 | def rollout(self): 77 | """ 78 | :param:observations:obs list 79 | :param:actions:action list 80 | :param:rewards:reward list 81 | :param:agent_infos: mean+log_std dictlist 82 | :param:env_infos: no use, just information about environment 83 | :return: a path, list 84 | """ 85 | # if pms.record_movie: 86 | # outdir = 'log/trpo' 87 | # self.env.monitor.start(outdir , force=True) 88 | observations = [] 89 | actions = [] 90 | rewards = [] 91 | agent_infos = [] 92 | env_infos = [] 93 | if pms.render: 94 | self.env.render() 95 | o = self.env.reset() 96 | 97 | episode_steps = 0 98 | for i in xrange(pms.max_path_length - 1): 99 | o = self.env.render('rgb_array') 100 | o = self.deal_image(o) 101 | a, agent_info = self.get_action(o) 102 | next_o, reward, terminal, env_info = self.env.step(a) 103 | observations.append(o) 104 | rewards.append(np.array([reward])) 105 | actions.append(a) 106 | agent_infos.append([agent_info]) 107 | env_infos.append([]) 108 | episode_steps += 1 109 | if terminal: 110 | break 111 | o = next_o 112 | if pms.render: 113 | self.env.render() 114 | path = dict( 115 | observations=np.array(observations) , 116 | actions=np.array(actions) , 117 | rewards=np.array(rewards) , 118 | agent_infos=np.concatenate(agent_infos) , 119 | env_infos=np.concatenate(env_infos) , 120 | episode_steps=episode_steps 121 | ) 122 | return path 123 | 124 | def deal_image(self , image): 125 | # index = len(self.obs_origin) 126 | # image_end = [] 127 | # if index < pms.history_number: 128 | # image_end = self.obs_origin[0:index] 129 | # for i in range(pms.history_number - index): 130 | # image_end.append(image) 131 | # else: 132 | # image_end = self.obs_origin[index - pms.history_number:index] 133 | # 134 | # image_end = np.concatenate(image_end) 135 | # # image_end = image_end.reshape((pms.obs_height, pms.obs_width, pms.history_number)) 136 | # obs = cv2.resize(cv2.cvtColor(image_end , cv2.COLOR_RGB2GRAY) / 255. 
, (pms.obs_height , pms.obs_width)) 137 | obs = cv2.resize(image, (pms.obs_height, pms.obs_width)) 138 | # obs = np.transpose(np.array(obs), (2, 0, 1)) 139 | return obs 140 | 141 | class ParallelStorageImage(): 142 | def __init__(self): 143 | self.args = pms 144 | self.tasks = multiprocessing.JoinableQueue() 145 | self.results = multiprocessing.Queue() 146 | self.actors = [] 147 | self.actors.append(Actor(self.args, self.tasks, self.results, 9999, self.args.record_movie)) 148 | for i in xrange(self.args.jobs-1): 149 | self.actors.append(Actor(self.args, self.tasks, self.results, 37*(i+3), False)) 150 | for a in self.actors: 151 | a.start() 152 | # we will start by running 20,000 / 1000 = 20 episodes for the first ieration 153 | self.average_timesteps_in_episode = 1000 154 | 155 | def get_paths(self): 156 | # keep 20,000 timesteps per update 157 | num_rollouts = self.args.paths_number 158 | # print "rollout_number:"+str(num_rollouts) 159 | for i in xrange(num_rollouts): 160 | self.tasks.put(1) 161 | start = time.time() 162 | self.tasks.join() 163 | end = time.time() 164 | # print "rollout real time"+str(end-start) 165 | paths = [] 166 | while num_rollouts: 167 | num_rollouts -= 1 168 | paths.append(self.results.get()) 169 | return paths 170 | 171 | # def process_paths(self, paths): 172 | # sum_episode_steps = 0 173 | # for path in paths: 174 | # sum_episode_steps += path['episode_steps'] 175 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 176 | # # path_baselines = np.append(self.baseline.predict(path) , 0) 177 | # # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 178 | # # path["advantages"] = np.concatenate(path["rewards"]) + \ 179 | # # pms.discount * path_baselines[1:] - \ 180 | # # path_baselines[:-1] 181 | # # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 182 | # path_baselines = np.append(self.baseline.predict(path) , 0) 183 | # deltas = np.concatenate(path["rewards"]) + \ 184 | # pms.discount * path_baselines[1:] - \ 185 | # path_baselines[:-1] 186 | # path["advantages"] = discount( 187 | # deltas , pms.discount * pms.gae_lambda) 188 | # path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 189 | # observations = np.concatenate([path["observations"] for path in paths]) 190 | # actions = np.concatenate([path["actions"] for path in paths]) 191 | # rewards = np.concatenate([path["rewards"] for path in paths]) 192 | # advantages = np.concatenate([path["advantages"] for path in paths]) 193 | # env_infos = np.concatenate([path["env_infos"] for path in paths]) 194 | # agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 195 | # if pms.center_adv: 196 | # advantages -= np.mean(advantages) 197 | # advantages /= (advantages.std() + 1e-8) 198 | # samples_data = dict( 199 | # observations=observations , 200 | # actions=actions , 201 | # rewards=rewards , 202 | # advantages=advantages , 203 | # env_infos=env_infos , 204 | # agent_infos=agent_infos , 205 | # paths=paths , 206 | # sum_episode_steps=sum_episode_steps 207 | # ) 208 | # self.baseline.fit(paths) 209 | # return samples_data 210 | 211 | def set_policy_weights(self, parameters): 212 | for i in xrange(self.args.jobs): 213 | self.tasks.put(parameters) 214 | self.tasks.join() 215 | 216 | def end(self): 217 | for i in xrange(self.args.jobs): 218 | self.tasks.put(2) -------------------------------------------------------------------------------- /agent/agent_continous_parallel_storage.py: -------------------------------------------------------------------------------- 1 | from utils 
import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | 7 | import multiprocessing 8 | import krylov 9 | from baseline.baseline_lstsq import Baseline 10 | from distribution.diagonal_gaussian import DiagonalGaussian 11 | import time 12 | import math 13 | from logger.logger import Logger 14 | 15 | seed = 1 16 | np.random.seed(seed) 17 | tf.set_random_seed(seed) 18 | 19 | 20 | """ 21 | class for continoust action space in multi process 22 | """ 23 | class TRPOAgentParallel(multiprocessing.Process): 24 | 25 | 26 | def __init__(self , observation_space , action_space , task_q , result_q): 27 | multiprocessing.Process.__init__(self) 28 | self.task_q = task_q 29 | self.result_q = result_q 30 | self.observation_space = observation_space 31 | self.action_space = action_space 32 | self.args = pms 33 | self.baseline = Baseline() 34 | self.distribution = DiagonalGaussian(pms.action_shape) 35 | self.init_logger() 36 | 37 | def init_network(self): 38 | """ 39 | [input] 40 | self.obs 41 | self.action_n 42 | self.advant 43 | self.old_dist_means_n 44 | self.old_dist_logstds_n 45 | [output] 46 | self.action_dist_means_n 47 | self.action_dist_logstds_n 48 | var_list 49 | """ 50 | config = tf.ConfigProto( 51 | device_count={'GPU': 0} 52 | ) 53 | self.session = tf.Session(config=config) 54 | self.net = NetworkContinous("network_continous") 55 | if pms.min_std is not None: 56 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 57 | self.action_dist_stds_n = tf.exp(log_std_var) 58 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 59 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 60 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 61 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 62 | self.old_dist_info_vars) 63 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 64 | batch_size = tf.shape(self.net.obs)[0] 65 | batch_size_float = tf.cast(batch_size , tf.float32) 66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 67 | ent = self.distribution.entropy(self.old_dist_info_vars) 68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 69 | self.losses = [surr, kl, ent] 70 | var_list = self.net.var_list 71 | 72 | self.gf = GetFlat(var_list) # get theta from var_list 73 | self.gf.session = self.session 74 | self.sff = SetFromFlat(var_list) # set theta from var_List 75 | self.sff.session = self.session 76 | # get g 77 | self.pg = flatgrad(surr, var_list) 78 | # get A 79 | # KL divergence where first arg is fixed 80 | # replace old->tf.stop_gradient from previous kl 81 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 82 | grads = tf.gradients(kl_firstfixed, var_list) 83 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 84 | shapes = map(var_shape, var_list) 85 | start = 0 86 | tangents = [] 87 | for shape in shapes: 88 | size = np.prod(shape) 89 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 90 | tangents.append(param) 91 | start += size 92 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 93 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 94 | 
self.session.run(tf.initialize_all_variables()) 95 | self.saver = tf.train.Saver(max_to_keep=5) 96 | 97 | def init_logger(self): 98 | head = ["factor", "rewards", "std"] 99 | self.logger = Logger(head) 100 | 101 | def run(self): 102 | self.init_network() 103 | while True: 104 | paths = self.task_q.get() 105 | if paths is None: 106 | # kill the learner 107 | self.task_q.task_done() 108 | break 109 | elif paths == 1: 110 | # just get params, no learn 111 | self.task_q.task_done() 112 | self.result_q.put(self.gf()) 113 | elif paths[0] == 2: 114 | # adjusting the max KL. 115 | self.args.max_kl = paths[1] 116 | if paths[2] == 1: 117 | print "saving checkpoint..." 118 | self.save_model(pms.environment_name + "-" + str(paths[3])) 119 | self.task_q.task_done() 120 | else: 121 | stats , theta, thprev = self.learn(paths, linear_search=False) 122 | self.sff(theta) 123 | self.task_q.task_done() 124 | self.result_q.put((stats, theta, thprev)) 125 | return 126 | 127 | def learn(self, paths, parallel=False, linear_search=False): 128 | start_time = time.time() 129 | sample_data = self.process_paths(paths) 130 | agent_infos = sample_data["agent_infos"] 131 | obs_all = sample_data["observations"] 132 | action_all = sample_data["actions"] 133 | advant_all = sample_data["advantages"] 134 | n_samples = len(obs_all) 135 | batch = int(1/pms.subsample_factor) 136 | batch_size = int(math.floor(n_samples * pms.subsample_factor)) 137 | accum_fullstep = 0.0 138 | for iteration in range(batch): 139 | print "batch: %d, batch_size: %d"%(iteration+1, batch_size) 140 | inds = np.random.choice(n_samples , batch_size , replace=False) 141 | obs_n = obs_all[inds] 142 | action_n = action_all[inds] 143 | advant_n = advant_all[inds] 144 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]]) 145 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]]) 146 | feed = {self.net.obs: obs_n , 147 | self.net.advant: advant_n , 148 | self.net.old_dist_means_n: action_dist_means_n , 149 | self.net.old_dist_logstds_n: action_dist_logstds_n , 150 | self.net.action_n: action_n 151 | } 152 | 153 | episoderewards = np.array([path["rewards"].sum() for path in paths]) 154 | thprev = self.gf() # get theta_old 155 | 156 | def fisher_vector_product(p): 157 | feed[self.flat_tangent] = p 158 | return self.session.run(self.fvp , feed) + pms.cg_damping * p 159 | 160 | g = self.session.run(self.pg , feed_dict=feed) 161 | stepdir = krylov.cg(fisher_vector_product , -g , cg_iters=pms.cg_iters) 162 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta 163 | # if shs<0, then the nan error would appear 164 | lm = np.sqrt(shs / pms.max_kl) 165 | fullstep = stepdir / lm 166 | neggdotstepdir = -g.dot(stepdir) 167 | 168 | def loss(th): 169 | self.sff(th) 170 | return self.session.run(self.losses , feed_dict=feed) 171 | 172 | if parallel is True: 173 | theta = linesearch_parallel(loss , thprev , fullstep , neggdotstepdir / lm) 174 | else: 175 | if linear_search: 176 | theta = linesearch(loss , thprev , fullstep , neggdotstepdir / lm) 177 | else: 178 | theta = thprev + fullstep 179 | accum_fullstep += (theta - thprev) 180 | theta = thprev + accum_fullstep * pms.subsample_factor 181 | stats = {} 182 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"] 183 | stats["Average sum of rewards per episode"] = episoderewards.mean() 184 | stats["surr loss"] = loss(theta)[0] 185 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 186 | 
self.logger.log_row([pms.subsample_factor, stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 187 | return stats , theta , thprev 188 | 189 | def process_paths(self, paths): 190 | sum_episode_steps = 0 191 | for path in paths: 192 | sum_episode_steps += path['episode_steps'] 193 | path['baselines'] = self.baseline.predict(path) 194 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 195 | path["advantages"] = path['returns'] - path['baselines'] 196 | 197 | observations = np.concatenate([path["observations"] for path in paths]) 198 | actions = np.concatenate([path["actions"] for path in paths]) 199 | rewards = np.concatenate([path["rewards"] for path in paths]) 200 | advantages = np.concatenate([path["advantages"] for path in paths]) 201 | env_infos = np.concatenate([path["env_infos"] for path in paths]) 202 | agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 203 | if pms.center_adv: 204 | advantages -= advantages.mean() 205 | advantages /= (advantages.std() + 1e-8) 206 | 207 | # for some unknown reaseon, it can not be used 208 | # if pms.positive_adv: 209 | # advantages = (advantages - np.min(advantages)) + 1e-8 210 | 211 | # average_discounted_return = \ 212 | # np.mean([path["returns"][0] for path in paths]) 213 | # 214 | # undiscounted_returns = [sum(path["rewards"]) for path in paths] 215 | 216 | 217 | # ev = self.explained_variance_1d( 218 | # np.concatenate(baselines), 219 | # np.concatenate(returns) 220 | # ) 221 | samples_data = dict( 222 | observations=observations , 223 | actions=actions , 224 | rewards=rewards , 225 | advantages=advantages , 226 | env_infos=env_infos , 227 | agent_infos=agent_infos , 228 | paths=paths , 229 | sum_episode_steps=sum_episode_steps 230 | ) 231 | self.baseline.fit(paths) 232 | return samples_data 233 | 234 | def save_model(self , model_name): 235 | self.saver.save(self.session , "checkpoint/" + model_name + ".ckpt") 236 | 237 | def load_model(self , model_name): 238 | try: 239 | if model_name is not None: 240 | self.saver.restore(self.session , model_name) 241 | else: 242 | self.saver.restore(self.session , tf.train.latest_checkpoint(pms.checkpoint_dir)) 243 | except: 244 | print "load model %s fail" % (model_name) -------------------------------------------------------------------------------- /agent/agent_continous_single_process.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import threading 3 | import gym 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import time 8 | import threading 9 | import multiprocessing 10 | import prettytensor as pt 11 | 12 | from storage.storage_continous import Storage 13 | from storage.storage_continous import Rollout 14 | import math 15 | from parameters import pms 16 | import krylov 17 | from logger.logger import Logger 18 | from distribution.diagonal_gaussian import DiagonalGaussian 19 | from baseline.baseline_lstsq import Baseline 20 | from environment import Environment 21 | from network.network_continous import NetworkContinous 22 | 23 | seed = 1 24 | np.random.seed(seed) 25 | tf.set_random_seed(seed) 26 | 27 | 28 | class TRPOAgentContinousSingleProcess(object): 29 | 30 | def __init__(self, thread_id): 31 | print "create worker %d"%(thread_id) 32 | self.thread_id = thread_id 33 | self.env = env = Environment(gym.make(pms.environment_name)) 34 | # print("Observation Space", env.observation_space) 35 | # print("Action Space", 
env.action_space) 36 | # print("Action area, high:%f, low%f" % (env.action_space.high, env.action_space.low)) 37 | self.end_count = 0 38 | self.paths = [] 39 | self.train = True 40 | self.baseline = Baseline() 41 | self.storage = Storage(self, self.env, self.baseline) 42 | self.distribution = DiagonalGaussian(pms.action_shape) 43 | 44 | self.session = self.master.session 45 | self.init_network() 46 | 47 | 48 | def init_network(self): 49 | self.network = NetworkContinous(str(self.thread_id)) 50 | if pms.min_std is not None: 51 | log_std_var = tf.maximum(self.network.action_dist_logstds_n, np.log(pms.min_std)) 52 | self.action_dist_stds_n = tf.exp(log_std_var) 53 | 54 | self.old_dist_info_vars = dict(mean=self.network.old_dist_means_n, log_std=self.network.old_dist_logstds_n) 55 | self.new_dist_info_vars = dict(mean=self.network.action_dist_means_n, log_std=self.network.action_dist_logstds_n) 56 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.network.action_n, self.new_dist_info_vars) 57 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.network.action_n, self.new_dist_info_vars, 58 | self.old_dist_info_vars) 59 | 60 | surr = -tf.reduce_mean(self.ratio_n * self.network.advant) # Surrogate loss 61 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 62 | ent = self.distribution.entropy(self.old_dist_info_vars) 63 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 64 | self.losses = [surr, kl, ent] 65 | var_list = self.network.var_list 66 | self.gf = GetFlat(self.session, var_list) # get theta from var_list 67 | self.sff = SetFromFlat(self.session, var_list) # set theta from var_List 68 | # get g 69 | self.pg = flatgrad(surr, var_list) 70 | # get A 71 | 72 | # KL divergence where first arg is fixed 73 | # replace old->tf.stop_gradient from previous kl 74 | kl_firstfixed = kl_sym_gradient(self.network.old_dist_means_n, self.network.old_dist_logstds_n, self.network.action_dist_means_n, 75 | self.network.action_dist_logstds_n) 76 | 77 | grads = tf.gradients(kl, var_list) 78 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 79 | shapes = map(var_shape, var_list) 80 | start = 0 81 | tangents = [] 82 | for shape in shapes: 83 | size = np.prod(shape) 84 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 85 | tangents.append(param) 86 | start += size 87 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 88 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 89 | # self.load_model() 90 | 91 | def get_samples(self, path_number): 92 | for i in range(pms.paths_number): 93 | self.storage.get_single_path() 94 | 95 | def get_action(self, obs, *args): 96 | obs = np.expand_dims(obs, 0) 97 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 98 | if pms.use_std_network: 99 | action_dist_means_n, action_dist_logstds_n = self.session.run( 100 | [self.action_dist_means_n, self.action_dist_logstds_n], 101 | {self.obs: obs}) 102 | if pms.train_flag: 103 | rnd = np.random.normal(size=action_dist_means_n[0].shape) 104 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0] 105 | else: 106 | action = action_dist_means_n[0] 107 | # action = np.clip(action, pms.min_a, pms.max_a) 108 | return action, dict(mean=action_dist_means_n[0], log_std=action_dist_logstds_n[0]) 109 | else: 110 | action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 111 | action_dist_means_n = self.network.get_action_dist_means_n(self.session, obs) 112 | if pms.train_flag: 
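# During training the action is sampled from the diagonal Gaussian policy,
#   a = mu(s) + exp(log_std) * eps,  eps ~ N(0, I),
# where mu(s) is action_dist_means_n from the policy network and log_std is the
# fixed np.log(pms.std) set just above; when pms.train_flag is off, the
# deterministic mean action is returned instead (else branch below).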
113 | rnd = np.random.normal(size=action_dist_means_n[0].shape)
114 | action = rnd * np.exp(action_dist_logstd[0]) + action_dist_means_n[0]
115 | else:
116 | action = action_dist_means_n[0]
117 | # action = np.clip(action, pms.min_a, pms.max_a)
118 | return action, dict(mean=action_dist_means_n[0], log_std=action_dist_logstd[0])
119 |
120 | def run(self):
121 | self.learn()
122 |
123 | def learn(self):
124 | start_time = time.time()
125 |
126 | numeptotal = 0
127 | while True:
128 | i = 0
129 | # Generating paths.
130 | # print("Rollout")
131 | self.get_samples(pms.paths_number)
132 | paths = self.storage.get_paths() # get_paths
133 | # Computing returns and estimating advantage function.
134 | sample_data = self.storage.process_paths(paths)
135 |
136 | agent_infos = sample_data["agent_infos"]
137 | obs_n = sample_data["observations"]
138 | action_n = sample_data["actions"]
139 | advant_n = sample_data["advantages"]
140 | n_samples = len(obs_n)
141 | inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False)
142 | obs_n = obs_n[inds]
143 | action_n = action_n[inds]
144 | advant_n = advant_n[inds]
145 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
146 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
147 | feed = {self.network.obs: obs_n,
148 | self.network.advant: advant_n,
149 | self.network.old_dist_means_n: action_dist_means_n,
150 | self.network.old_dist_logstds_n: action_dist_logstds_n,
151 | self.network.action_dist_logstds_n: action_dist_logstds_n,
152 | self.network.action_n: action_n
153 | }
154 |
155 | episoderewards = np.array([path["rewards"].sum() for path in paths])
156 | average_episode_std = np.mean(np.exp(action_dist_logstds_n))
157 |
158 | # print "\n********** Iteration %i ************" % i
159 | for iter_num_per_train in range(pms.iter_num_per_train):
160 | # if not self.train:
161 | # print("Episode mean: %f" % episoderewards.mean())
162 | # self.end_count += 1
163 | # if self.end_count > 100:
164 | # break
165 | if self.train:
166 | thprev = self.gf() # get theta_old
167 |
168 | def fisher_vector_product(p):
169 | feed[self.flat_tangent] = p
170 | return self.session.run(self.fvp, feed) + pms.cg_damping * p
171 |
172 | g = self.session.run(self.pg, feed_dict=feed)
173 | stepdir = krylov.cg(fisher_vector_product, -g, cg_iters=pms.cg_iters) # natural gradient direction: solve F*stepdir = -g
174 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta
175 | fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
176 | neggdotstepdir = -g.dot(stepdir)
177 |
178 | def loss(th):
179 | self.sff(th)
180 | return self.session.run(self.losses, feed_dict=feed)
181 |
182 | surr_prev, kl_prev, ent_prev = loss(thprev)
183 | mean_advant = np.mean(advant_n)
184 | theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
185 | self.sff(theta)
186 | surrafter, kloldnew, entnew = self.session.run(self.losses, feed_dict=feed)
187 | stats = {}
188 | numeptotal += len(episoderewards)
189 | stats["average_episode_std"] = average_episode_std
190 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
191 | stats["Total number of episodes"] = numeptotal
192 | stats["Average sum of rewards per episode"] = episoderewards.mean()
193 | # stats["Entropy"] = entropy
194 | # exp = explained_variance(np.array(baseline_n), np.array(returns_n))
195 | # stats["Baseline explained"] = exp
196 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
197 | stats["KL between old and new
distribution"] = kloldnew 198 | stats["Surrogate loss"] = surrafter 199 | stats["Surrogate loss prev"] = surr_prev 200 | stats["entropy"] = ent_prev 201 | stats["mean_advant"] = mean_advant 202 | log_data = [average_episode_std, len(episoderewards), numeptotal, episoderewards.mean(), kloldnew, surrafter, surr_prev, 203 | surrafter - surr_prev, 204 | ent_prev, mean_advant] 205 | self.master.logger.log_row(log_data) 206 | # for k, v in stats.iteritems(): 207 | # print(k + ": " + " " * (40 - len(k)) + str(v)) 208 | # # if entropy != entropy: 209 | # # exit(-1) 210 | # # if exp > 0.95: 211 | # # self.train = False 212 | if self.thread_id==1: 213 | self.master.save_model("iter" + str(i)) 214 | print episoderewards.mean() 215 | i += 1 216 | 217 | def test(self, model_name): 218 | self.load_model(model_name) 219 | for i in range(50): 220 | self.storage.get_single_path() 221 | 222 | def save_model(self, model_name): 223 | self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt") 224 | 225 | def load_model(self, model_name): 226 | try: 227 | self.saver.restore(self.session, model_name) 228 | except: 229 | print "load model %s fail" % (model_name) 230 | -------------------------------------------------------------------------------- /agent/agent_continous_image_parallel_image.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous_image import NetworkContinousImage 5 | from parameters import pms 6 | 7 | import multiprocessing 8 | import krylov 9 | from baseline.baseline_zeros import Baseline 10 | from distribution.diagonal_gaussian import DiagonalGaussian 11 | import time 12 | import math 13 | from logger.logger import Logger 14 | 15 | seed = 1 16 | np.random.seed(seed) 17 | tf.set_random_seed(seed) 18 | 19 | 20 | """ 21 | class for continoust action space in multi process 22 | """ 23 | class TRPOAgentParallelImage(multiprocessing.Process): 24 | 25 | 26 | def __init__(self , observation_space , action_space , task_q , result_q): 27 | multiprocessing.Process.__init__(self) 28 | self.task_q = task_q 29 | self.result_q = result_q 30 | self.observation_space = observation_space 31 | self.action_space = action_space 32 | self.args = pms 33 | self.baseline = Baseline() 34 | self.distribution = DiagonalGaussian(pms.action_shape) 35 | self.init_logger() 36 | 37 | def init_network(self): 38 | """ 39 | [input] 40 | self.obs 41 | self.action_n 42 | self.advant 43 | self.old_dist_means_n 44 | self.old_dist_logstds_n 45 | [output] 46 | self.action_dist_means_n 47 | self.action_dist_logstds_n 48 | var_list 49 | """ 50 | config = tf.ConfigProto( 51 | device_count={'GPU': 0} 52 | ) 53 | self.session = tf.Session(config=config) 54 | self.net = NetworkContinousImage("network_continous_image") 55 | if pms.min_std is not None: 56 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 57 | self.action_dist_stds_n = tf.exp(log_std_var) 58 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 59 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 60 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 61 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 62 | self.old_dist_info_vars) 63 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # 
Surrogate loss 64 | batch_size = tf.shape(self.net.obs)[0] 65 | batch_size_float = tf.cast(batch_size , tf.float32) 66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 67 | ent = self.distribution.entropy(self.old_dist_info_vars) 68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 69 | self.losses = [surr, kl, ent] 70 | var_list = self.net.var_list 71 | 72 | self.gf = GetFlat(var_list) # get theta from var_list 73 | self.gf.session = self.session 74 | self.sff = SetFromFlat(var_list) # set theta from var_List 75 | self.sff.session = self.session 76 | # get g 77 | self.pg = flatgrad(surr, var_list) 78 | # get A 79 | # KL divergence where first arg is fixed 80 | # replace old->tf.stop_gradient from previous kl 81 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 82 | grads = tf.gradients(kl_firstfixed, var_list) 83 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 84 | shapes = map(var_shape, var_list) 85 | start = 0 86 | tangents = [] 87 | for shape in shapes: 88 | size = np.prod(shape) 89 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 90 | tangents.append(param) 91 | start += size 92 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 93 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 94 | self.session.run(tf.initialize_all_variables()) 95 | self.saver = tf.train.Saver(max_to_keep=5) 96 | 97 | def init_logger(self): 98 | head = ["factor", "rewards", "std"] 99 | self.logger = Logger(head) 100 | 101 | def run(self): 102 | self.init_network() 103 | while True: 104 | paths = self.task_q.get() 105 | if paths is None: 106 | # kill the learner 107 | self.task_q.task_done() 108 | break 109 | elif paths == 1: 110 | # just get params, no learn 111 | self.task_q.task_done() 112 | self.result_q.put(self.gf()) 113 | elif paths[0] == 2: 114 | # adjusting the max KL. 115 | self.args.max_kl = paths[1] 116 | if paths[2] == 1: 117 | print "saving checkpoint..." 
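# This branch handles a control tuple of the form (2, new_max_kl, save_flag, iteration):
# paths[1] has just replaced self.args.max_kl, and when paths[2] == 1 the current
# parameters are checkpointed below under "<environment_name>-<iteration>" (paths[3]).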
118 | self.save_model(pms.environment_name + "-" + str(paths[3])) 119 | self.task_q.task_done() 120 | else: 121 | stats , theta, thprev = self.learn(paths) 122 | self.sff(theta) 123 | self.task_q.task_done() 124 | self.result_q.put((stats, theta, thprev)) 125 | return 126 | 127 | def learn(self, paths, parallel=False, linear_search=False): 128 | start_time = time.time() 129 | sample_data = self.process_paths(paths) 130 | agent_infos = sample_data["agent_infos"] 131 | obs_all = sample_data["observations"] 132 | action_all = sample_data["actions"] 133 | advant_all = sample_data["advantages"] 134 | n_samples = len(obs_all) 135 | batch = int(1/pms.subsample_factor) 136 | batch_size = int(math.floor(n_samples * pms.subsample_factor)) 137 | accum_fullstep = 0.0 138 | for iteration in range(batch): 139 | print "batch: %d, batch_size: %d"%(iteration+1, batch_size) 140 | inds = np.random.choice(n_samples , batch_size , replace=False) 141 | obs_n = obs_all[inds] 142 | action_n = action_all[inds] 143 | advant_n = advant_all[inds] 144 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]]) 145 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]]) 146 | feed = {self.net.obs: obs_n , 147 | self.net.advant: advant_n , 148 | self.net.old_dist_means_n: action_dist_means_n , 149 | self.net.old_dist_logstds_n: action_dist_logstds_n , 150 | self.net.action_n: action_n 151 | } 152 | 153 | episoderewards = np.array([path["rewards"].sum() for path in paths]) 154 | thprev = self.gf() # get theta_old 155 | 156 | def fisher_vector_product(p): 157 | feed[self.flat_tangent] = p 158 | return self.session.run(self.fvp , feed) + pms.cg_damping * p 159 | 160 | g = self.session.run(self.pg , feed_dict=feed) 161 | stepdir = krylov.cg(fisher_vector_product , -g , cg_iters=pms.cg_iters) 162 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta 163 | # if shs<0, then the nan error would appear 164 | lm = np.sqrt(shs / pms.max_kl) 165 | fullstep = stepdir / lm 166 | neggdotstepdir = -g.dot(stepdir) 167 | 168 | def loss(th): 169 | self.sff(th) 170 | return self.session.run(self.losses , feed_dict=feed) 171 | 172 | if parallel is True: 173 | theta = linesearch_parallel(loss , thprev , fullstep , neggdotstepdir / lm) 174 | else: 175 | if linear_search: 176 | theta = linesearch(loss , thprev , fullstep , neggdotstepdir / lm) 177 | else: 178 | theta = thprev + fullstep 179 | if math.isnan(theta.mean()): 180 | print shs is None 181 | theta = thprev 182 | accum_fullstep += (theta - thprev) 183 | theta = thprev + accum_fullstep * pms.subsample_factor 184 | stats = {} 185 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"] 186 | stats["Average sum of rewards per episode"] = episoderewards.mean() 187 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 188 | self.logger.log_row([pms.subsample_factor, stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 189 | return stats , theta , thprev 190 | 191 | def process_paths(self, paths): 192 | sum_episode_steps = 0 193 | for path in paths: 194 | sum_episode_steps += path['episode_steps'] 195 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline 196 | # path_baselines = np.append(self.baseline.predict(path) , 0) 197 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 198 | # path["advantages"] = np.concatenate(path["rewards"]) + \ 199 | # pms.discount * path_baselines[1:] - \ 200 | # path_baselines[:-1] 201 | # path["returns"] = 
np.concatenate(discount(path["rewards"], pms.discount)) 202 | path_baselines = np.append(self.baseline.predict(path) , 0) 203 | deltas = np.concatenate(path["rewards"]) + \ 204 | pms.discount * path_baselines[1:] - \ 205 | path_baselines[:-1] 206 | path["advantages"] = discount( 207 | deltas , pms.discount * pms.gae_lambda) 208 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 209 | observations = np.concatenate([path["observations"] for path in paths]) 210 | actions = np.concatenate([path["actions"] for path in paths]) 211 | rewards = np.concatenate([path["rewards"] for path in paths]) 212 | advantages = np.concatenate([path["advantages"] for path in paths]) 213 | env_infos = np.concatenate([path["env_infos"] for path in paths]) 214 | agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 215 | if pms.center_adv: 216 | advantages -= np.mean(advantages) 217 | advantages /= (advantages.std() + 1e-8) 218 | 219 | # for some unknown reaseon, it can not be used 220 | # if pms.positive_adv: 221 | # advantages = (advantages - np.min(advantages)) + 1e-8 222 | 223 | # average_discounted_return = \ 224 | # np.mean([path["returns"][0] for path in paths]) 225 | # 226 | # undiscounted_returns = [sum(path["rewards"]) for path in paths] 227 | 228 | 229 | # ev = self.explained_variance_1d( 230 | # np.concatenate(baselines), 231 | # np.concatenate(returns) 232 | # ) 233 | samples_data = dict( 234 | observations=observations , 235 | actions=actions , 236 | rewards=rewards , 237 | advantages=advantages , 238 | env_infos=env_infos , 239 | agent_infos=agent_infos , 240 | paths=paths , 241 | sum_episode_steps=sum_episode_steps 242 | ) 243 | self.baseline.fit(paths) 244 | return samples_data 245 | 246 | def save_model(self , model_name): 247 | self.saver.save(self.session , "checkpoint/" + model_name + ".ckpt") 248 | 249 | def load_model(self , model_name): 250 | try: 251 | if model_name is not None: 252 | self.saver.restore(self.session , model_name) 253 | else: 254 | self.saver.restore(self.session , tf.train.latest_checkpoint(pms.checkpoint_dir)) 255 | except: 256 | print "load model %s fail" % (model_name) --------------------------------------------------------------------------------