├── agent ├── __init__.py ├── agent_parallel.py ├── agent_continous_image.py ├── agent_continous.py ├── agent_continous_rnn.py ├── agent_cotinous_single_thread.py ├── AC_agent_continous.py ├── agent_base.py ├── agent_discrete.py ├── agent_continous_parallel_storage.py ├── agent_continous_single_process.py └── agent_continous_image_parallel_image.py ├── logger ├── __init__.py └── logger.py ├── baseline ├── __init__.py ├── baseline_zeros.py ├── baseline_average_reward.py ├── baseline_lstsq.py ├── baseline_tensorflow.py └── baseline_tf_image.py ├── experiment ├── __init__.py ├── main.py ├── main_lstm.py ├── main_ac.py ├── main_discrete.py ├── main_image.py ├── main_tf_parallel.py ├── main_multi_thread.py ├── main_image_multi_process.py └── main_multi_process.py ├── network ├── __init__.py ├── network_descrete.py ├── network_continous.py ├── network_continous_image.py └── network_continous_rnn.py ├── storage ├── __init__.py ├── storage.py ├── storage_image.py ├── storage_continous.py ├── storage_continous_parallel.py └── storage_continous_parallel_image.py ├── distribution ├── __init__.py ├── diagonal_category.py └── diagonal_gaussian.py ├── .gitignore ├── run.py ├── dealImage.py ├── parameters.py~ ├── README.md ├── environment.py ├── parameters.py ├── utils.py └── krylov.py /agent/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /network/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /storage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /distribution/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /network/network_descrete.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs_* 2 | *.pyc 3 | *.swp 4 | checkpoint/ 5 | checkpoint_parallel/ 6 | log/ 7 | .idea/ 8 | .idea 9 | -------------------------------------------------------------------------------- /baseline/baseline_zeros.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Baseline(object): 5 | def fit(self, paths): 6 | self.temp = 0 7 | 8 | def predict(self, path): 9 | return np.zeros(len(path["rewards"])) -------------------------------------------------------------------------------- /baseline/baseline_average_reward.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaselineAverageReward(object): 5 | def fit(self, paths): 6 | self.temp = 0 7 | 8 | def predict(self, path): 9 | rewards = path["rewards"] 10 | mean_rewards = np.mean(rewards) 11 | return mean_rewards -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | tasks = ["Copy-v0", "DuplicatedInput-v0", "Reverse-v0", "RepeatCopy-v0"] 4 | 5 | os.system("rm logs_*") 6 | os.system("k screen") 7 | os.system("screen -wipe") 8 | 9 | 10 | for t in tasks: 11 | os.system("screen -dm -S trpo_%s bash -c '. ~/.profile; . ~/.bashrc; CUDA_VISIBLE_DEVICES=[] python main.py %s 2>&1 | tee logs_%s ; bash'" % (t, t, t)) 12 | -------------------------------------------------------------------------------- /experiment/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | from environment import Environment 4 | from agent.agent_continous import TRPOAgent 5 | from parameters import pms 6 | 7 | if not os.path.isdir("./checkpoint"): 8 | os.makedirs("./checkpoint") 9 | if not os.path.isdir("./log"): 10 | os.makedirs("./log") 11 | env = Environment(gym.make(pms.environment_name)) 12 | agent = TRPOAgent(env) 13 | 14 | if pms.train_flag: 15 | agent.learn() 16 | else: 17 | agent.test(pms.checkpoint_file) 18 | # env.monitor.close() 19 | # gym.upload(training_dir, 20 | # algorithm_id='trpo_ff') 21 | -------------------------------------------------------------------------------- /experiment/main_lstm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | from environment import Environment 4 | from agent.agent_continous_rnn import TRPOAgent 5 | from parameters import pms 6 | 7 | if not os.path.isdir("./checkpoint"): 8 | os.makedirs("./checkpoint") 9 | if not os.path.isdir("./log"): 10 | os.makedirs("./log") 11 | env = Environment(gym.make(pms.environment_name)) 12 | agent = TRPOAgent(env) 13 | 14 | if pms.train_flag: 15 | agent.learn() 16 | else: 17 | agent.test(pms.checkpoint_file) 18 | # env.monitor.close() 19 | # gym.upload(training_dir, 20 | # algorithm_id='trpo_ff') 21 | -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import time 3 | 4 | class Logger(object): 5 | def __init__(self, head): 6 | self.head = [] 7 | self.file_name = self.get_file_name() 8 | self.csvfile = file("log/"+self.file_name , 'wb') 9 | self.csv_writer = csv.writer(self.csvfile) 10 | self.log_row(head) 11 | 12 | def log_row(self, data): 13 | self.csv_writer.writerow(data) 14 | 15 | def get_file_name(self): 16 | file_time = time.strftime("%Y-%m-%d-%H:%M:%S",time.localtime(time.time())) 17 | file_name = file_time+".csv" 18 | return file_name 19 | 20 | def __del__(self): 21 | self.csvfile.close() -------------------------------------------------------------------------------- /experiment/main_ac.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import gym 4 | from gym import envs, scoreboard 5 | from gym.spaces import Discrete, Box 6 | import tempfile 7 | import sys 8 | from environment import Environment 9 | from agent.AC_agent_continous import ACAgent 
10 | from parameters import pms 11 | 12 | if not os.path.isdir("./checkpoint"): 13 | os.makedirs("./checkpoint") 14 | if not os.path.isdir("./log"): 15 | os.makedirs("./log") 16 | env = Environment(gym.make(pms.environment_name)) 17 | agent = ACAgent(env) 18 | 19 | if pms.train_flag: 20 | agent.learn() 21 | else: 22 | agent.test(pms.checkpoint_file) 23 | # env.monitor.close() 24 | # gym.upload(training_dir, 25 | # algorithm_id='trpo_ff') 26 | -------------------------------------------------------------------------------- /experiment/main_discrete.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import gym 4 | from gym import envs, scoreboard 5 | from gym.spaces import Discrete, Box 6 | import tempfile 7 | import sys 8 | from environment import Environment 9 | from agent.agent_discrete import TRPOAgent 10 | from parameters import pms 11 | 12 | if not os.path.isdir("./checkpoint"): 13 | os.makedirs("./checkpoint") 14 | if not os.path.isdir("./log"): 15 | os.makedirs("./log") 16 | env = Environment(gym.make(pms.environment_name)) 17 | agent = TRPOAgent(env) 18 | 19 | if pms.train_flag: 20 | agent.learn() 21 | else: 22 | agent.test(pms.checkpoint_file) 23 | # env.monitor.close() 24 | # gym.upload(training_dir, 25 | # algorithm_id='trpo_ff') 26 | -------------------------------------------------------------------------------- /experiment/main_image.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import gym 4 | from gym import envs, scoreboard 5 | from gym.spaces import Discrete, Box 6 | import tempfile 7 | import sys 8 | from environment import Environment 9 | from agent.agent_continous_image import TRPOAgent 10 | from parameters import pms 11 | 12 | if not os.path.isdir("./checkpoint"): 13 | os.makedirs("./checkpoint") 14 | if not os.path.isdir("./log"): 15 | os.makedirs("./log") 16 | env = Environment(gym.make(pms.environment_name)) 17 | agent = TRPOAgent(env) 18 | 19 | if pms.train_flag: 20 | agent.learn() 21 | else: 22 | agent.test(pms.checkpoint_file) 23 | # env.monitor.close() 24 | # gym.upload(training_dir, 25 | # algorithm_id='trpo_ff') 26 | -------------------------------------------------------------------------------- /dealImage.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Qt4Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def vis_square(data, padsize=1, padval=0): 8 | data -= data.min() 9 | data /= data.max() 10 | 11 | 12 | n = int(np.ceil(np.sqrt(data.shape[0]))) 13 | padding = ((0, n ** 2 - data.shape[0]), (0, padsize), (0, padsize)) + ((0, 0),) * (data.ndim - 3) 14 | data = np.pad(data, padding, mode='constant', constant_values=(padval, padval)) 15 | 16 | 17 | data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1))) 18 | data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:]) 19 | print data.shape 20 | plt.imshow(data) 21 | plt.show() -------------------------------------------------------------------------------- /parameters.py~: -------------------------------------------------------------------------------- 1 | # for image 2 | dims = (100, 100) 3 | obs_height = 100 4 | obs_width = 100 5 | obs_channel = 1 6 | history_number = 2 7 | 8 | # for trainning 9 | <<<<<<< HEAD 10 | jobs = 4 11 | 12 | ======= 13 | jobs = 2 14 | >>>>>>> 
0356c098856467ec6db97061e73187c6a18a25a7 15 | max_iter_number = 10000 16 | paths_number = 1 17 | max_path_length = 199 18 | batch_size = max_path_length 19 | max_kl = 0.01 20 | gae_lambda = 1.0 21 | subsample_factor = 0.8 22 | cg_damping = 0.1 23 | discount = 0.99 24 | cg_iters = 10 25 | deviation = 0.1 26 | render = True 27 | train_flag = False 28 | iter_num_per_train = 1 29 | checkpoint_file = "checkpoint/iter240865.ckpt" 30 | record_movie = False 31 | upload_to_gym = False 32 | 33 | # for environment 34 | 35 | environment_name = "Pendulum-v0" 36 | 37 | # for continous action 38 | min_std = 1e-6 39 | center_adv = True 40 | positive_adv = False 41 | use_std_network = False 42 | std = 1.1 43 | obs_shape = 3 44 | action_shape = 1 45 | min_a = -2.0 46 | max_a = 2.0 47 | 48 | -------------------------------------------------------------------------------- /distribution/diagonal_category.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | class DiagonalCategory(object): 7 | def __init__(self, dim=0): 8 | self._dim = dim 9 | 10 | @property 11 | def dim(self): 12 | return self._dim 13 | 14 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 15 | return tf.reduce_mean(old_dist_info_vars * tf.log((old_dist_info_vars + 1e-8) / (new_dist_info_vars + 1e-8))) 16 | 17 | def likelihood_ratio_sym(self, x_var, new_dist_info_vars, old_dist_info_vars): 18 | """ 19 | \frac{\pi_\theta}{\pi_{old}} 20 | :param x_var: actions 21 | :param new_dist_info_vars: means + logstds 22 | :param old_dist_info_vars: old_means + old_logstds 23 | :return: 24 | """ 25 | N = tf.shape(x_var)[0] 26 | p_n = slice_2d(new_dist_info_vars, tf.range(0, N), x_var) 27 | oldp_n = slice_2d(old_dist_info_vars, tf.range(0, N), x_var) 28 | return p_n / oldp_n 29 | 30 | def entropy(self, dist_infos): 31 | return tf.reduce_mean(-dist_infos * tf.log(dist_infos + 1e-8)) -------------------------------------------------------------------------------- /baseline/baseline_lstsq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | class Baseline(object): 3 | def __init__(self, reg_coeff=1e-5): 4 | self._coeffs = None 5 | self._reg_coeff = reg_coeff 6 | 7 | def get_param_values(self, **tags): 8 | return self._coeffs 9 | 10 | def set_param_values(self, val, **tags): 11 | self._coeffs = val 12 | 13 | def _features(self, path): 14 | o = path["observations"].astype('float32') 15 | o = o.reshape(o.shape[0], -1) 16 | l = len(path["rewards"]) 17 | al = np.arange(l).reshape(-1 , 1) / 100.0 18 | return np.concatenate([o, o ** 2, al, al ** 2, np.ones((l, 1))], axis=1) 19 | 20 | def fit(self, paths): 21 | featmat = np.concatenate([self._features(path) for path in paths]) 22 | returns = np.concatenate([path["returns"] for path in paths]) 23 | self._coeffs = np.linalg.lstsq( 24 | featmat.T.dot(featmat) + self._reg_coeff * np.identity(featmat.shape[1]), 25 | featmat.T.dot(returns) 26 | )[0] 27 | 28 | def predict(self, path): 29 | if self._coeffs is None: 30 | return np.zeros(len(path["rewards"])) 31 | return self._features(path).dot(self._coeffs) 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # recently the algorithm has been moved to https://github.com/jjkke88/RL_toolbox 2 | 3 | # trpo 4 | trust region policy optimitztion base on gym and tensorflow 
5 | 6 |

There are three versions of TRPO: one for discrete action spaces such as MountainCar, one for discrete action-space tasks that take images as input such as Atari games, and one for continuous action spaces such as Pendulum.
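Each version has its own entry script under `experiment/`. As a minimal sketch (it simply mirrors `experiment/main.py` from this repository; every hyperparameter, including `environment_name` and `train_flag`, comes from the flags defined in `parameters.py`), the continuous-action version is wired up like this:

```python
# Sketch of the continuous-action entry point (see experiment/main.py).
# For the discrete and image variants, import TRPOAgent from
# agent.agent_discrete or agent.agent_continous_image instead
# (see experiment/main_discrete.py and experiment/main_image.py).
import os
import gym

from environment import Environment
from agent.agent_continous import TRPOAgent
from parameters import pms

# the agents and the logger expect these directories to exist
for d in ("checkpoint", "log"):
    if not os.path.isdir(d):
        os.makedirs(d)

env = Environment(gym.make(pms.environment_name))   # default: Pendulum-v0
agent = TRPOAgent(env)

if pms.train_flag:
    agent.learn()                     # train until interrupted
else:
    agent.test(pms.checkpoint_file)   # roll out a saved checkpoint
```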

7 |

The environment is based on OpenAI Gym.
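As a small illustration (a sketch based on `environment.py`, which appears further down in this listing), the `Environment` wrapper converts the gym spaces, clips observations to the space bounds, and can optionally return resized grayscale frames for the image-based agents:

```python
# Sketch: using the project's Environment wrapper around a gym task.
import gym
from environment import Environment

# type="origin" returns the raw (clipped) observation vector;
# type="gray_image" instead returns a grayscale frame resized to pms.dims,
# which the image-based agents rely on.
env = Environment(gym.make("Pendulum-v0"), type="origin")

ob = env.reset()
for _ in range(5):
    action = env.action_space.sample()         # Box converted from the gym space
    ob, reward, done, info = env.step(action)
    if done:
        ob = env.reset()
```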

8 |

Part of the code is adapted from rllab.
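For readers cross-checking against the agent code below, the quantities built in `init_network` (`surr`, `kl`, the policy gradient `pg`, and the Fisher-vector product `fvp`) correspond to the standard TRPO step, sketched here for reference; the trust-region size is the `max_kl` flag, and the constrained step is solved with conjugate gradient, controlled by `cg_iters` and `cg_damping`:

```latex
% TRPO update as implemented by the agents in this repository (a reference
% sketch; notation follows the TRPO paper rather than code identifiers):
\max_{\theta}\;
  \mathbb{E}\!\left[
    \frac{\pi_\theta(a \mid s)}{\pi_{\theta_{\mathrm{old}}}(a \mid s)}\, A(s,a)
  \right]
\qquad \text{s.t.} \qquad
  \mathbb{E}\!\left[
    D_{\mathrm{KL}}\!\left(
      \pi_{\theta_{\mathrm{old}}}(\cdot \mid s)\,\middle\|\,\pi_\theta(\cdot \mid s)
    \right)
  \right] \le \delta
```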

9 | 10 | # dependency 11 | 16 | 17 | # constructure for code 18 | 31 | 32 | # recent work 33 | 38 | 39 | # future work 40 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /baseline/baseline_tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import prettytensor as pt 4 | 5 | class Baseline(object): 6 | coeffs = None 7 | 8 | def __init__(self , session=None): 9 | self.net = None 10 | self.session = session 11 | 12 | def create_net(self , shape): 13 | print(shape) 14 | self.x = tf.placeholder(tf.float32 , shape=[None , shape] , name="x") 15 | self.y = tf.placeholder(tf.float32 , shape=[None] , name="y") 16 | self.net = (pt.wrap(self.x). 17 | fully_connected(64 , activation_fn=tf.nn.tanh). 18 | fully_connected(1)) 19 | self.net = tf.reshape(self.net , (-1 ,)) 20 | self.l2 = (self.net - self.y) * (self.net - self.y) 21 | self.train = tf.train.AdamOptimizer().minimize(self.l2) 22 | self.session.run(tf.initialize_all_variables()) 23 | 24 | def _features(self, path): 25 | o = path["observations"].astype('float32') 26 | o = o.reshape(o.shape[0] , -1) 27 | l = len(path["rewards"]) 28 | al = np.arange(l).reshape(-1 , 1) / 100.0 29 | return np.concatenate([o , o ** 2 , al , al ** 2 , np.ones((l , 1))] , axis=1) 30 | 31 | def fit(self, paths): 32 | featmat = np.concatenate([self._features(path) for path in paths]) 33 | if self.net is None: 34 | self.create_net(featmat.shape[1]) 35 | returns = np.concatenate([path["returns"] for path in paths]) 36 | for _ in range(10): 37 | loss, _ = self.session.run([self.l2, self.train], {self.x: featmat , self.y: returns}) 38 | 39 | def predict(self, path): 40 | if self.net is None: 41 | return np.zeros(len(path["rewards"])) 42 | else: 43 | ret = self.session.run(self.net , {self.x: self._features(path)}) 44 | return np.reshape(ret , (ret.shape[0] ,)) 45 | -------------------------------------------------------------------------------- /baseline/baseline_tf_image.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import prettytensor as pt 4 | from parameters import pms 5 | 6 | class BaselineTfImage(object): 7 | coeffs = None 8 | 9 | def __init__(self, session): 10 | self.net = None 11 | self.session = session 12 | 13 | def create_net(self, shape): 14 | self.x = tf.placeholder(tf.float32, shape=[None, shape[1], shape[2], shape[3]], name="x") 15 | self.y = tf.placeholder(tf.float32, shape=[None], name="y") 16 | self.net = (pt.wrap(self.x). 17 | conv2d(1, 16, stride=2, batch_normalize=True). 18 | conv2d(1, 16, stride=2, batch_normalize=True). 19 | flatten(). 20 | fully_connected(32, activation_fn=tf.nn.relu). 21 | fully_connected(32, activation_fn=tf.nn.relu). 
22 | fully_connected(1)) 23 | self.net = tf.reshape(self.net, (-1, )) 24 | l2 = (self.net - self.y) * (self.net - self.y) 25 | self.train = tf.train.AdamOptimizer().minimize(l2) 26 | self.session.run(tf.initialize_all_variables()) 27 | 28 | def _features(self, path): 29 | ret = path["observations"].astype('float32') 30 | return ret 31 | 32 | def fit(self, paths): 33 | featmat = np.concatenate([self._features(path) for path in paths]) 34 | if self.net is None: 35 | self.create_net(featmat.shape) 36 | returns = np.concatenate([path["returns"] for path in paths]) 37 | for _ in range(100): 38 | self.session.run(self.train, {self.x: featmat, self.y: returns}) 39 | 40 | def predict(self, path): 41 | if self.net is None: 42 | return np.zeros(len(path["rewards"])) 43 | else: 44 | ret = self.session.run(self.net, {self.x: self._features(path)}) 45 | return np.reshape(ret, (ret.shape[0], )) -------------------------------------------------------------------------------- /network/network_continous.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | import prettytensor as pt 5 | from parameters import pms 6 | 7 | seed = 1 8 | np.random.seed(seed) 9 | tf.set_random_seed(seed) 10 | 11 | class NetworkContinous(object): 12 | def __init__(self, scope): 13 | with tf.variable_scope("%s_shared" % scope): 14 | self.obs = obs = tf.placeholder( 15 | tf.float32, shape=[None, pms.obs_shape], name="%s_obs"%scope) 16 | self.action_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], name="%s_action"%scope) 17 | self.advant = tf.placeholder(tf.float32, shape=[None], name="%s_advant"%scope) 18 | self.old_dist_means_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], 19 | name="%s_oldaction_dist_means"%scope) 20 | self.old_dist_logstds_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], 21 | name="%s_oldaction_dist_logstds"%scope) 22 | self.action_dist_means_n = (pt.wrap(self.obs). 23 | fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0), 24 | name="%s_fc1"%scope). 25 | fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0), 26 | name="%s_fc2"%scope). 27 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0), 28 | name="%s_fc3"%scope)) 29 | 30 | self.N = tf.shape(obs)[0] 31 | Nf = tf.cast(self.N, tf.float32) 32 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), name="%spolicy_logstd"%scope) 33 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param, 34 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1))) 35 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)] 36 | 37 | def get_action_dist_means_n(self, session, obs): 38 | return session.run(self.action_dist_means_n, 39 | {self.obs: obs}) 40 | 41 | -------------------------------------------------------------------------------- /environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | `SpaceConversionEnv` acts as a wrapper on 3 | any environment. It allows to convert some action spaces, and observation spaces to others. 
4 | """ 5 | 6 | import numpy as np 7 | from gym.spaces import Discrete, Box, Tuple 8 | from gym import Env 9 | import cv2 10 | from parameters import pms 11 | import gym 12 | from gym.monitoring import monitor 13 | 14 | def convert_gym_space(space): 15 | if isinstance(space, gym.spaces.Box): 16 | return Box(low=space.low, high=space.high) 17 | elif isinstance(space, gym.spaces.Discrete): 18 | return Discrete(n=space.n) 19 | else: 20 | raise NotImplementedError 21 | 22 | class CappedCubicVideoSchedule(object): 23 | def __call__(self, count): 24 | return monitor.capped_cubic_video_schedule(count) 25 | 26 | class NoVideoSchedule(object): 27 | def __call__(self , count): 28 | return False 29 | 30 | class Environment(Env): 31 | 32 | def __init__(self, env, type="origin"): 33 | self.env = env 34 | self.type = type 35 | self.video_schedule = None 36 | if not pms.record_movie: 37 | self.video_schedule = NoVideoSchedule() 38 | else: 39 | if self.video_schedule is not None: 40 | self.video_schedule = CappedCubicVideoSchedule() 41 | self.env.monitor.start("log/trpo" ,self.video_schedule, force=True) 42 | self.monitoring = True 43 | 44 | def step(self, action, **kwargs): 45 | self._observation, reward, done, info = self.env.step(action) 46 | self._observation = np.clip(self._observation, self.env.observation_space.low, self.env.observation_space.high) 47 | return self.observation, reward, done, info 48 | 49 | def reset(self, **kwargs): 50 | self._observation = self.env.reset() 51 | return self.observation 52 | 53 | def render(self, mode="human", close=False): 54 | return self.env.render(mode) 55 | 56 | @property 57 | def observation(self): 58 | if self.type == "origin": 59 | return self._observation 60 | elif self.type == "gray_image": 61 | return cv2.resize(cv2.cvtColor(self._observation, cv2.COLOR_RGB2GRAY)/255., pms.dims) 62 | 63 | @property 64 | def action_space(self): 65 | return convert_gym_space(self.env.action_space) 66 | 67 | 68 | @property 69 | def observation_space(self): 70 | if self.type == "origin": 71 | return convert_gym_space(self.env.observation_space) 72 | else: 73 | return pms.dims 74 | 75 | # @property 76 | # def obs_dims(self): 77 | # if self.type == "origin": 78 | # return self.env.observation_space.shape 79 | # else: 80 | # return pms.dims -------------------------------------------------------------------------------- /network/network_continous_image.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | import prettytensor as pt 5 | from parameters import pms 6 | 7 | seed = 1 8 | np.random.seed(seed) 9 | tf.set_random_seed(seed) 10 | 11 | class NetworkContinousImage(object): 12 | def __init__(self, scope): 13 | with tf.variable_scope("%s_shared" % scope): 14 | self.obs = obs = tf.placeholder( 15 | dtype, shape=[None, pms.obs_height, pms.obs_width, pms.obs_channel], name="%s_obs"%scope) 16 | self.action_n = tf.placeholder(dtype, shape=[None, pms.action_shape], name="%s_action"%scope) 17 | self.advant = tf.placeholder(dtype, shape=[None], name="%s_advant"%scope) 18 | self.old_dist_means_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 19 | name="%s_oldaction_dist_means"%scope) 20 | self.old_dist_logstds_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 21 | name="%s_oldaction_dist_logstds"%scope) 22 | self.action_dist_means_n = (pt.wrap(self.obs). 23 | conv2d(8 , 32 , stride=4 , batch_normalize=True). 
24 | conv2d(4 , 64 , stride=2 , batch_normalize=True). 25 | conv2d(3 , 64 , stride=1 , batch_normalize=True). 26 | flatten(). 27 | fully_connected(128, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 28 | name="%s_fc1"%scope). 29 | fully_connected(128, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 30 | name="%s_fc2"%scope). 31 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), 32 | name="%s_fc3"%scope)) 33 | 34 | self.N = tf.shape(obs)[0] 35 | Nf = tf.cast(self.N, dtype) 36 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), name="%spolicy_logstd"%scope) 37 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param, 38 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1))) 39 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)] 40 | 41 | def get_action_dist_means_n(self, session, obs): 42 | return session.run(self.action_dist_means_n, 43 | {self.obs: obs}) 44 | 45 | -------------------------------------------------------------------------------- /parameters.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | flags = tf.app.flags 4 | flags.DEFINE_integer('obs_height', 100, 'image height') 5 | flags.DEFINE_integer('obs_width', 100, 'image width') 6 | flags.DEFINE_integer('obs_channel', 3, 'image channel') 7 | flags.DEFINE_integer('history_number', 2, 'image history number') 8 | flags.DEFINE_integer('jobs', 4, 'thread or process number') 9 | flags.DEFINE_integer('max_iter_number', 400, 'control the max iteration number for trainning') 10 | flags.DEFINE_integer('paths_number', 10, 'number of paths in each rollout') 11 | flags.DEFINE_integer('max_path_length',200, 'timesteps in each path') 12 | flags.DEFINE_integer('batch_size', 100, 'batch size for trainning') 13 | flags.DEFINE_float('max_kl', 0.01, 'the largest kl distance, \sigma in paper') 14 | flags.DEFINE_float('gae_lambda', 1.0, 'fix number') 15 | flags.DEFINE_float('subsample_factor', 0.5, 'ratio of the samples used in training process') 16 | flags.DEFINE_float('cg_damping', 0.001, 'conjugate gradient damping') 17 | flags.DEFINE_float('discount', 0.99, 'discount') 18 | flags.DEFINE_integer('cg_iters', 20, 'iteration number in conjugate gradient') 19 | flags.DEFINE_float('deviation', 0.1, 'fixed') 20 | flags.DEFINE_boolean('render', False, 'whether to render image') 21 | flags.DEFINE_boolean('train_flag', True, 'true for train and False for test') 22 | flags.DEFINE_integer('iter_num_per_train', 1, 'iteration number in each trainning process') 23 | flags.DEFINE_string('checkpoint_file', '', 'checkpoint file path, if empty then will load the latest one') 24 | flags.DEFINE_integer('save_model_times', 1, 'iteration number to save model, if 1, then model would be saved in each iteration') 25 | flags.DEFINE_boolean('record_movie', False, 'whether record the video in gym') 26 | flags.DEFINE_boolean('upload_to_gym', False, 'whether upload the result to gym') 27 | flags.DEFINE_string('checkpoint_dir', 'checkpoint/', 'checkpoint save and load path, for parallel, it should be checkpoint_parallel') 28 | flags.DEFINE_string('environment_name', 'Pendulum-v0', 'environment name') 29 | flags.DEFINE_float('min_std', 0.2, 'the smallest std') 30 | flags.DEFINE_boolean('center_adv', True, 'whether center advantage, fixed') 31 | flags.DEFINE_boolean('positive_adv', False, 'whether positive advantage, fixed') 
32 | flags.DEFINE_boolean('use_std_network', False, 'whether use network to train std, it is not supported, fixed') 33 | flags.DEFINE_float('std', 1.1, 'if the std is set to constant, then this value will be used') 34 | flags.DEFINE_integer('obs_shape', 3, 'dimensions of observation') 35 | flags.DEFINE_integer('action_shape', 1, 'dimensions of action') 36 | flags.DEFINE_float('min_a', -2.0, 'the smallest action value') 37 | flags.DEFINE_float('max_a', 2.0, 'the largest action value') 38 | flags.DEFINE_string("decay_method", "adaptive", "decay_method:adaptive, linear, exponential") # adaptive, linear, exponential 39 | flags.DEFINE_integer("timestep_adapt", 600, "timestep to adapt kl") 40 | flags.DEFINE_float("kl_adapt", 0.0005, "kl adapt rate") 41 | pms = flags.FLAGS 42 | pms.checkpoint_file = None 43 | pms.batch_size = int(pms.subsample_factor * pms.paths_number * pms.max_path_length) -------------------------------------------------------------------------------- /experiment/main_tf_parallel.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import tensorflow as tf 3 | from agent.agent_parallel import TRPOAgentParallel 4 | from parameters import pms 5 | import gym 6 | import numpy as np 7 | from environment import Environment 8 | 9 | # Flags for defining the tf.train.ClusterSpec 10 | tf.app.flags.DEFINE_string("ps_hosts", "166.111.138.113:2223", 11 | "Comma-separated list of hostname:port pairs") 12 | tf.app.flags.DEFINE_string("worker_hosts", "166.111.138.137:2226,166.111.138.137:2227,166.111.138.137:2228", 13 | "Comma-separated list of hostname:port pairs") 14 | 15 | # Flags for defining the tf.train.Server 16 | tf.app.flags.DEFINE_string("job_name", "worker", "ps or worker") 17 | tf.app.flags.DEFINE_integer("task_index",2, "Index of task within the job") 18 | 19 | FLAGS = tf.app.flags.FLAGS 20 | 21 | seed = 1 22 | np.random.seed(seed) 23 | tf.set_random_seed(seed) 24 | 25 | def main(_): 26 | ps_hosts = FLAGS.ps_hosts.split(',') 27 | worker_hosts = FLAGS.worker_hosts.split(',') 28 | 29 | # Create a cluster from the parameter server and worker hosts. 30 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) 31 | 32 | # Create and start a server for the local task. 33 | # 创建并启动服务 34 | # 其参数中使用task_index 指定任务的编号 35 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0) 36 | server = tf.train.Server(cluster, 37 | job_name=FLAGS.job_name, 38 | task_index=FLAGS.task_index, 39 | config=tf.ConfigProto(gpu_options=gpu_options)) 40 | 41 | if FLAGS.job_name == "ps": 42 | server.join() 43 | elif FLAGS.job_name == "worker": 44 | # 将op 挂载到各个本地的worker上 45 | env = Environment(gym.make(pms.environment_name)) 46 | with tf.device(tf.train.replica_device_setter( 47 | worker_device="/job:worker/task:%d" % (FLAGS.task_index), 48 | cluster=cluster)): 49 | agent = TRPOAgentParallel(env) 50 | saver = tf.train.Saver(max_to_keep=10) 51 | init_op = tf.initialize_all_variables() 52 | summary_op = tf.merge_all_summaries() 53 | # Create a "supervisor", which oversees the training process. 54 | sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), 55 | logdir="./checkpoint_parallel", 56 | init_op=init_op, 57 | global_step=agent.global_step, 58 | saver=saver, 59 | summary_op=None, 60 | save_model_secs=60) 61 | 62 | # The supervisor takes care of session initialization, restoring from 63 | # a checkpoint, and closing when done or an error occurs. 
64 | with sv.managed_session(server.target) as sess: 65 | agent.session = sess 66 | agent.gf.session = sess 67 | agent.sff.session =sess 68 | agent.supervisor = sv 69 | 70 | if pms.train_flag: 71 | agent.learn() 72 | elif FLAGS.task_index == 0: 73 | agent.test(pms.checkpoint_file) 74 | # Ask for all the services to stop. 75 | sv.stop() 76 | 77 | if __name__ == "__main__": 78 | tf.app.run() -------------------------------------------------------------------------------- /experiment/main_multi_thread.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import tempfile 4 | import sys 5 | from utils import * 6 | import numpy as np 7 | import tensorflow as tf 8 | import signal 9 | from parameters import pms 10 | from logger.logger import Logger 11 | from agent.agent_cotinous_single_thread import TRPOAgentContinousSingleThread 12 | from network.network_continous import NetworkContinous 13 | 14 | seed = 1 15 | np.random.seed(seed) 16 | tf.set_random_seed(seed) 17 | 18 | training_dir = tempfile.mkdtemp() 19 | logging.getLogger().setLevel(logging.DEBUG) 20 | 21 | 22 | class MasterContinous(object): 23 | def __init__(self): 24 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0) 25 | self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 26 | 27 | self.network = NetworkContinous("master") 28 | self.gf = GetFlat(self.network.var_list) # get theta from var_list 29 | self.gf.session = self.session 30 | self.sff = SetFromFlat(self.network.var_list) # set theta from var_List 31 | self.sff.session = self.session 32 | self.session.run(tf.initialize_all_variables()) 33 | self.saver = tf.train.Saver(max_to_keep=10) 34 | 35 | self.init_jobs() 36 | if pms.train_flag: 37 | self.init_logger() 38 | 39 | def init_jobs(self): 40 | self.jobs = [] 41 | for thread_id in xrange(pms.jobs): 42 | job = TRPOAgentContinousSingleThread(thread_id, self) 43 | self.jobs.append(job) 44 | 45 | def init_logger(self): 46 | head = ["average_episode_std", "sum steps episode number" "total number of episodes", 47 | "Average sum of rewards per episode", 48 | "KL between old and new distribution", "Surrogate loss", "Surrogate loss prev", "ds", "entropy", 49 | "mean_advant"] 50 | self.logger = Logger(head) 51 | 52 | def get_parameters(self): 53 | return self.gf() 54 | 55 | def apply_gradient(self, gradient): 56 | theta_prev = self.gf() 57 | theta_after = theta_prev + gradient 58 | self.sff(theta_after) 59 | 60 | def train(self): 61 | signal.signal(signal.SIGINT, signal_handler) 62 | for job in self.jobs: 63 | job.start() 64 | for job in self.jobs: 65 | job.join() 66 | 67 | def test(self): 68 | self.load_model(pms.checkpoint_file) 69 | self.jobs[0].test() 70 | 71 | def save_model(self, model_name): 72 | self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt") 73 | 74 | def load_model(self , model_name): 75 | try: 76 | if model_name is not None: 77 | self.saver.restore(self.session , model_name) 78 | else: 79 | self.saver.restore(self.session , tf.train.latest_checkpoint("checkpoint/")) 80 | except: 81 | print "load model %s fail" % (model_name) 82 | 83 | def signal_handler(): 84 | sys.exit(0) 85 | 86 | 87 | if not os.path.isdir("./checkpoint"): 88 | os.makedirs("./checkpoint") 89 | if not os.path.isdir("./log"): 90 | os.makedirs("./log") 91 | master = MasterContinous() 92 | if pms.train_flag: 93 | master.train() 94 | else: 95 | master.test() 96 | # env.monitor.close() 97 | # gym.upload(training_dir, 98 | # 
algorithm_id='trpo_ff') 99 | -------------------------------------------------------------------------------- /storage/storage.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from parameters import pms 3 | 4 | 5 | class Storage(object): 6 | def __init__(self, agent, env, baseline): 7 | self.paths = [] 8 | self.env = env 9 | self.agent = agent 10 | self.baseline = baseline 11 | 12 | def get_single_path(self): 13 | self.obs, actions, rewards, action_dists = [], [], [], [] 14 | ob = self.env.reset() 15 | episode_steps = 0 16 | for _ in xrange(pms.max_path_length): 17 | action, action_dist, ob = self.agent.act(ob) 18 | self.obs.append(ob) 19 | actions.append(action) 20 | action_dists.append(action_dist) 21 | res = self.env.step(action) # res 22 | if pms.render: 23 | self.env.render() 24 | ob = res[0] 25 | rewards.append([res[1]]) 26 | episode_steps += 1 27 | if res[2]: 28 | break 29 | path = dict( 30 | observations=np.concatenate(np.expand_dims(self.obs, 0)), 31 | agent_infos=np.concatenate(action_dists), 32 | rewards=np.array(rewards), 33 | actions=np.array(actions), 34 | episode_steps=episode_steps 35 | ) 36 | self.paths.append(path) 37 | 38 | def get_paths(self): 39 | paths = self.paths 40 | self.paths = [] 41 | return paths 42 | 43 | def process_paths(self, paths): 44 | sum_episode_steps = 0 45 | for path in paths: 46 | sum_episode_steps += path['episode_steps'] 47 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline 48 | # path_baselines = np.append(self.baseline.predict(path) , 0) 49 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 50 | # path["advantages"] = np.concatenate(path["rewards"]) + \ 51 | # pms.discount * path_baselines[1:] - \ 52 | # path_baselines[:-1] 53 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 54 | path_baselines = np.append(self.baseline.predict(path) , 0) 55 | deltas = np.concatenate(path["rewards"]) + \ 56 | pms.discount * path_baselines[1:] - \ 57 | path_baselines[:-1] 58 | path["advantages"] = discount( 59 | deltas , pms.discount * pms.gae_lambda) 60 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 61 | 62 | # Updating policy. 
63 | action_dist_n = np.concatenate([path["agent_infos"] for path in paths]) 64 | obs_n = np.concatenate([path["observations"] for path in paths]) 65 | action_n = np.concatenate([path["actions"] for path in paths]) 66 | rewards = np.concatenate([path["rewards"] for path in paths]) 67 | advantages = np.concatenate([path["advantages"] for path in paths]) 68 | 69 | if pms.center_adv: 70 | advantages = (advantages - np.mean(advantages)) / (advantages.std() + 1e-8) 71 | 72 | self.baseline.fit(paths) 73 | 74 | samples_data = dict( 75 | observations=obs_n, 76 | actions=action_n, 77 | rewards=rewards, 78 | advantages=advantages, 79 | agent_infos=action_dist_n, 80 | paths=paths, 81 | sum_episode_steps=sum_episode_steps 82 | ) 83 | return samples_data 84 | -------------------------------------------------------------------------------- /distribution/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class DiagonalGaussian(object): 6 | def __init__(self, dim): 7 | self._dim = dim 8 | 9 | @property 10 | def dim(self): 11 | return self._dim 12 | 13 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 14 | old_means = old_dist_info_vars["mean"] 15 | old_log_stds = old_dist_info_vars["log_std"] 16 | new_means = new_dist_info_vars["mean"] 17 | new_log_stds = new_dist_info_vars["log_std"] 18 | """ 19 | Compute the KL divergence of two multivariate Gaussian distribution with 20 | diagonal covariance matrices 21 | """ 22 | old_std = tf.exp(old_log_stds) 23 | new_std = tf.exp(new_log_stds) 24 | # means: (N*A) 25 | # std: (N*A) 26 | # formula: 27 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 28 | # ln(\sigma_2/\sigma_1) 29 | numerator = tf.square(old_means - new_means) + \ 30 | tf.square(old_std) - tf.square(new_std) 31 | denominator = 2 * tf.square(new_std) + 1e-8 32 | return tf.reduce_sum( 33 | numerator / denominator + new_log_stds - old_log_stds, -1) 34 | 35 | def likelihood_ratio_sym(self, x_var, new_dist_info_vars, old_dist_info_vars): 36 | """ 37 | \frac{\pi_\theta}{\pi_{old}} 38 | :param x_var: actions 39 | :param new_dist_info_vars: means + logstds 40 | :param old_dist_info_vars: old_means + old_logstds 41 | :return: 42 | """ 43 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) 44 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) 45 | return tf.exp(logli_new - logli_old) 46 | 47 | def log_likelihood_sym(self, x_var, dist_info_vars): 48 | """ 49 | \frac{1}{(2\pi)^{\frac{n}{2}}\sigma_\theta}exp(-(\frac{a-\mu_{\pi_\theta}}{2\sigma_\theta})^2) 50 | :param x_var: 51 | :param dist_info_vars: 52 | :return: 53 | """ 54 | means = dist_info_vars["mean"] 55 | log_stds = dist_info_vars["log_std"] 56 | zs = (x_var - means) / tf.exp(log_stds) 57 | return - tf.reduce_sum(log_stds, -1) - \ 58 | 0.5 * tf.reduce_sum(tf.square(zs), -1) - \ 59 | 0.5 *means.get_shape()[-1].value * np.log(2 * np.pi) 60 | 61 | def kl_sym_firstfixed(self, old_dist_info_vars): 62 | mu = old_dist_info_vars["mean"] 63 | logstd = old_dist_info_vars["log_std"] 64 | mu1 , logstd1 = map(tf.stop_gradient , [mu , logstd]) 65 | mu2 , logstd2 = mu , logstd 66 | 67 | return self.kl_sym(dict(mean=mu1, log_std=logstd1), dict(mean=mu2, log_std=logstd2)) 68 | 69 | def sample(self, dist_info): 70 | means = dist_info["mean"] 71 | log_stds = dist_info["log_std"] 72 | rnd = np.random.normal(size=means.shape) 73 | return rnd * np.exp(log_stds) + means 74 | 75 | def log_likelihood(self, xs, 
dist_info): 76 | means = dist_info["mean"] 77 | log_stds = dist_info["log_std"] 78 | zs = (xs - means) / np.exp(log_stds) 79 | return - np.sum(log_stds, axis=-1) - \ 80 | 0.5 * np.sum(np.square(zs), axis=-1) - \ 81 | 0.5 * means.shape[-1] * np.log(2 * np.pi) 82 | 83 | def entropy(self, dist_info): 84 | log_stds = dist_info["log_std"] 85 | return tf.reduce_sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e))) 86 | 87 | @property 88 | def dist_info_keys(self): 89 | return ["mean", "log_std"] 90 | -------------------------------------------------------------------------------- /agent/agent_parallel.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | 8 | seed = 1 9 | np.random.seed(seed) 10 | tf.set_random_seed(seed) 11 | class TRPOAgentParallel(TRPOAgentBase): 12 | 13 | def __init__(self, env): 14 | super(TRPOAgentParallel, self).__init__(env) 15 | self.init_network() 16 | # self.saver = tf.train.Saver(max_to_keep=10) 17 | 18 | def init_network(self): 19 | """ 20 | [input] 21 | self.obs 22 | self.action_n 23 | self.advant 24 | self.old_dist_means_n 25 | self.old_dist_logstds_n 26 | [output] 27 | self.action_dist_means_n 28 | self.action_dist_logstds_n 29 | var_list 30 | """ 31 | self.net = NetworkContinous("network_continous") 32 | self.global_step = tf.Variable(0 , trainable=False) 33 | self.step_op = tf.assign_add(self.global_step , 1 , use_locking=True) 34 | if pms.min_std is not None: 35 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 36 | self.action_dist_stds_n = tf.exp(log_std_var) 37 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 38 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 39 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 40 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 41 | self.old_dist_info_vars) 42 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 43 | batch_size = tf.shape(self.net.obs)[0] 44 | batch_size_float = tf.cast(batch_size , tf.float32) 45 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 46 | ent = self.distribution.entropy(self.old_dist_info_vars) 47 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 48 | self.losses = [surr, kl, ent] 49 | var_list = self.net.var_list 50 | self.gf = GetFlat(var_list) # get theta from var_list 51 | self.gf.session = self.session 52 | self.sff = SetFromFlat(var_list) # set theta from var_List 53 | self.sff.session = self.session 54 | # get g 55 | self.pg = flatgrad(surr, var_list) 56 | # get A 57 | # KL divergence where first arg is fixed 58 | # replace old->tf.stop_gradient from previous kl 59 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 60 | grads = tf.gradients(kl_firstfixed, var_list) 61 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 62 | shapes = map(var_shape, var_list) 63 | start = 0 64 | tangents = [] 65 | for shape in shapes: 66 | size = np.prod(shape) 67 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 68 | tangents.append(param) 69 | start += size 70 | self.gvp = 
[tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 71 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 72 | # self.saver = tf.train.Saver(max_to_keep=10) 73 | # self.load_model(pms.checkpoint_file) 74 | 75 | def learn(self): 76 | iter_num = 0 77 | while True: 78 | print "\n********** Iteration %i ************" % iter_num 79 | print self.gf().mean() 80 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 81 | self.sff(theta) 82 | for k , v in stats.iteritems(): 83 | print(k + ": " + " " * (40 - len(k)) + str(v)) 84 | # if iter_num % pms.save_model_times == 0: 85 | # self.save_model(pms.environment_name + "-" + str(iter_num)) 86 | self.session.run(self.step_op) 87 | iter_num += 1 88 | -------------------------------------------------------------------------------- /experiment/main_image_multi_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | if not os.path.isdir("./checkpoint"): 3 | os.makedirs("./checkpoint") 4 | if not os.path.isdir("./log"): 5 | os.makedirs("./log") 6 | 7 | 8 | import gym 9 | import multiprocessing 10 | import time 11 | from agent.agent_continous_image_parallel_image import TRPOAgentParallelImage 12 | from parameters import pms 13 | from storage.storage_continous_parallel_image import ParallelStorageImage 14 | 15 | args = pms 16 | args.max_pathlength = gym.spec(args.environment_name).timestep_limit 17 | 18 | learner_tasks = multiprocessing.JoinableQueue() 19 | learner_results = multiprocessing.Queue() 20 | learner_env = gym.make(args.environment_name) 21 | 22 | learner = TRPOAgentParallelImage(learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results) 23 | learner.start() 24 | rollouts = ParallelStorageImage() 25 | 26 | learner_tasks.put(1) 27 | learner_tasks.join() 28 | starting_weights = learner_results.get() 29 | rollouts.set_policy_weights(starting_weights) 30 | 31 | start_time = time.time() 32 | history = {} 33 | history["rollout_time"] = [] 34 | history["learn_time"] = [] 35 | history["mean_reward"] = [] 36 | history["timesteps"] = [] 37 | 38 | # start it off with a big negative number 39 | last_reward = -1000000 40 | recent_total_reward = 0 41 | 42 | if pms.train_flag is True: 43 | for iteration in xrange(args.max_iter_number): 44 | # runs a bunch of async processes that collect rollouts 45 | paths = rollouts.get_paths() 46 | # Why is the learner in an async process? 47 | # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread 48 | # and an async process creates another tf.Session, it will freeze up. 49 | # To solve this, we just make the learner's tf.Session in its own async process, 50 | # and wait until the learner's done before continuing the main thread. 
51 | learn_start = time.time() 52 | if iteration%20 == 0: 53 | learner_tasks.put((2 , args.max_kl, 1, iteration)) 54 | else: 55 | learner_tasks.put((2, args.max_kl, 0, iteration)) 56 | learner_tasks.put(paths) 57 | learner_tasks.join() 58 | stats , theta , thprev = learner_results.get() 59 | learn_time = (time.time() - learn_start) / 60.0 60 | print 61 | print "-------- Iteration %d ----------" % iteration 62 | # print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0) 63 | # 64 | # history["rollout_time"].append(rollout_time) 65 | # history["learn_time"].append(learn_time) 66 | # history["mean_reward"].append(mean_reward) 67 | # history["timesteps"].append(args.timesteps_per_batch) 68 | for k , v in stats.iteritems(): 69 | print(k + ": " + " " * (40 - len(k)) + str(v)) 70 | recent_total_reward += stats["Average sum of rewards per episode"] 71 | 72 | if args.decay_method == "adaptive": 73 | if iteration % 10 == 0: 74 | if recent_total_reward < last_reward: 75 | print "Policy is not improving. Decrease KL and increase steps." 76 | if args.max_kl > 0.001: 77 | args.max_kl -= args.kl_adapt 78 | else: 79 | print "Policy is improving. Increase KL and decrease steps." 80 | if args.max_kl < 0.01: 81 | args.max_kl += args.kl_adapt 82 | last_reward = recent_total_reward 83 | recent_total_reward = 0 84 | 85 | if args.decay_method == "linear": 86 | if args.max_kl > 0.001: 87 | args.max_kl -= args.kl_adapt 88 | 89 | if args.decay_method == "exponential": 90 | if args.max_kl > 0.001: 91 | args.max_kl *= args.kl_adapt 92 | rollouts.set_policy_weights(theta) 93 | else: 94 | from agent.agent_continous import TRPOAgent 95 | from environment import Environment 96 | env = Environment(gym.make(pms.environment_name)) 97 | agent = TRPOAgent(env) 98 | agent.test(pms.checkpoint_file) 99 | 100 | 101 | rollouts.end() 102 | -------------------------------------------------------------------------------- /agent/agent_continous_image.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous_image import NetworkContinousImage 5 | from baseline.baseline_tf_image import BaselineTfImage 6 | from storage.storage_image import Storage 7 | from parameters import pms 8 | from agent.agent_base import TRPOAgentBase 9 | 10 | seed = 1 11 | np.random.seed(seed) 12 | tf.set_random_seed(seed) 13 | 14 | """ 15 | class for continoust action space with image as input 16 | """ 17 | class TRPOAgent(TRPOAgentBase): 18 | 19 | def __init__(self, env): 20 | super(TRPOAgent, self).__init__(env) 21 | self.init_network() 22 | self.saver = tf.train.Saver(max_to_keep=10) 23 | self.baseline = BaselineTfImage(self.session) 24 | self.storage = Storage(self, env, self.baseline) 25 | 26 | def init_network(self): 27 | """ 28 | [input] 29 | self.obs 30 | self.action_n 31 | self.advant 32 | self.old_dist_means_n 33 | self.old_dist_logstds_n 34 | [output] 35 | self.action_dist_means_n 36 | self.action_dist_logstds_n 37 | var_list 38 | """ 39 | self.net = NetworkContinousImage("network_continous") 40 | if pms.min_std is not None: 41 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 42 | self.action_dist_stds_n = tf.exp(log_std_var) 43 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 44 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 45 | self.likehood_action_dist = 
self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 46 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 47 | self.old_dist_info_vars) 48 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 49 | batch_size = tf.shape(self.net.obs)[0] 50 | batch_size_float = tf.cast(batch_size , tf.float32) 51 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 52 | ent = self.distribution.entropy(self.old_dist_info_vars) 53 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 54 | self.losses = [surr, kl, ent] 55 | var_list = self.net.var_list 56 | self.gf = GetFlat(var_list) # get theta from var_list 57 | self.gf.session = self.session 58 | self.sff = SetFromFlat(var_list) # set theta from var_List 59 | self.sff.session = self.session 60 | # get g 61 | self.pg = flatgrad(surr, var_list) 62 | # get A 63 | # KL divergence where first arg is fixed 64 | # replace old->tf.stop_gradient from previous kl 65 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 66 | grads = tf.gradients(kl_firstfixed, var_list) 67 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 68 | shapes = map(var_shape, var_list) 69 | start = 0 70 | tangents = [] 71 | for shape in shapes: 72 | size = np.prod(shape) 73 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 74 | tangents.append(param) 75 | start += size 76 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 77 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 78 | self.session.run(tf.initialize_all_variables()) 79 | # self.saver = tf.train.Saver(max_to_keep=10) 80 | # self.load_model(pms.checkpoint_file) 81 | 82 | def learn(self): 83 | iter_num = 0 84 | while True: 85 | print "\n********** Iteration %i ************" % iter_num 86 | print self.gf().mean() 87 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 88 | self.sff(theta) 89 | for k , v in stats.iteritems(): 90 | print(k + ": " + " " * (40 - len(k)) + str(v)) 91 | if iter_num % pms.save_model_times == 0: 92 | self.save_model(pms.environment_name + "-" + str(iter_num)) 93 | iter_num += 1 94 | -------------------------------------------------------------------------------- /experiment/main_multi_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | if not os.path.isdir("./checkpoint"): 3 | os.makedirs("./checkpoint") 4 | if not os.path.isdir("./log"): 5 | os.makedirs("./log") 6 | 7 | 8 | import gym 9 | import multiprocessing 10 | import time 11 | from agent.agent_continous_parallel_storage import TRPOAgentParallel 12 | <<<<<<< HEAD 13 | import argparse 14 | import multiprocessing 15 | import time 16 | import json 17 | ======= 18 | >>>>>>> rnn 19 | from parameters import pms 20 | from storage.storage_continous_parallel import ParallelStorage 21 | 22 | args = pms 23 | args.max_pathlength = gym.spec(args.environment_name).timestep_limit 24 | 25 | learner_tasks = multiprocessing.JoinableQueue() 26 | learner_results = multiprocessing.Queue() 27 | learner_env = gym.make(args.environment_name) 28 | 29 | learner = TRPOAgentParallel(learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results) 30 | learner.start() 31 | rollouts = ParallelStorage() 32 | 33 | learner_tasks.put(1) 34 | learner_tasks.join() 35 | starting_weights = learner_results.get() 36 | 
rollouts.set_policy_weights(starting_weights) 37 | 38 | start_time = time.time() 39 | history = {} 40 | history["rollout_time"] = [] 41 | history["learn_time"] = [] 42 | history["mean_reward"] = [] 43 | history["timesteps"] = [] 44 | 45 | # start it off with a big negative number 46 | last_reward = -1000000 47 | recent_total_reward = 0 48 | 49 | if pms.train_flag is True: 50 | for iteration in xrange(args.max_iter_number): 51 | # runs a bunch of async processes that collect rollouts 52 | paths = rollouts.get_paths() 53 | # Why is the learner in an async process? 54 | # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread 55 | # and an async process creates another tf.Session, it will freeze up. 56 | # To solve this, we just make the learner's tf.Session in its own async process, 57 | # and wait until the learner's done before continuing the main thread. 58 | learn_start = time.time() 59 | if iteration%20 == 0: 60 | learner_tasks.put((2 , args.max_kl, 1, iteration)) 61 | else: 62 | learner_tasks.put((2, args.max_kl, 0, iteration)) 63 | learner_tasks.put(paths) 64 | learner_tasks.join() 65 | stats , theta , thprev = learner_results.get() 66 | learn_time = (time.time() - learn_start) / 60.0 67 | print 68 | print "-------- Iteration %d ----------" % iteration 69 | # print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0) 70 | # 71 | # history["rollout_time"].append(rollout_time) 72 | # history["learn_time"].append(learn_time) 73 | # history["mean_reward"].append(mean_reward) 74 | # history["timesteps"].append(args.timesteps_per_batch) 75 | for k , v in stats.iteritems(): 76 | print(k + ": " + " " * (40 - len(k)) + str(v)) 77 | recent_total_reward += stats["Average sum of rewards per episode"] 78 | 79 | if args.decay_method == "adaptive": 80 | if iteration % 10 == 0: 81 | if recent_total_reward < last_reward: 82 | print "Policy is not improving. Decrease KL and increase steps." 83 | if args.max_kl > 0.001: 84 | args.max_kl -= args.kl_adapt 85 | else: 86 | print "Policy is improving. Increase KL and decrease steps." 
87 | if args.max_kl < 0.01: 88 | args.max_kl += args.kl_adapt 89 | last_reward = recent_total_reward 90 | recent_total_reward = 0 91 | 92 | if args.decay_method == "linear": 93 | if args.max_kl > 0.001: 94 | args.max_kl -= args.kl_adapt 95 | 96 | if args.decay_method == "exponential": 97 | if args.max_kl > 0.001: 98 | args.max_kl *= args.kl_adapt 99 | rollouts.set_policy_weights(theta) 100 | else: 101 | from agent.agent_continous import TRPOAgent 102 | from environment import Environment 103 | env = Environment(gym.make(pms.environment_name)) 104 | agent = TRPOAgent(env) 105 | agent.test(pms.checkpoint_file) 106 | 107 | 108 | rollouts.end() 109 | -------------------------------------------------------------------------------- /agent/agent_continous.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | from logger.logger import Logger 8 | from storage.storage_continous_parallel import ParallelStorage 9 | 10 | seed = 1 11 | np.random.seed(seed) 12 | tf.set_random_seed(seed) 13 | 14 | """ 15 | class for continoust action space 16 | """ 17 | class TRPOAgent(TRPOAgentBase): 18 | def __init__(self, env): 19 | super(TRPOAgent, self).__init__(env) 20 | self.init_network() 21 | self.saver = tf.train.Saver(max_to_keep=10) 22 | 23 | def init_network(self): 24 | """ 25 | [input] 26 | self.obs 27 | self.action_n 28 | self.advant 29 | self.old_dist_means_n 30 | self.old_dist_logstds_n 31 | [output] 32 | self.action_dist_means_n 33 | self.action_dist_logstds_n 34 | var_list 35 | """ 36 | self.net = NetworkContinous("network_continous") 37 | if pms.min_std is not None: 38 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 39 | self.action_dist_stds_n = tf.exp(log_std_var) 40 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 41 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 42 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 43 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 44 | self.old_dist_info_vars) 45 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss 46 | batch_size = tf.shape(self.net.obs)[0] 47 | batch_size_float = tf.cast(batch_size , tf.float32) 48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 49 | ent = self.distribution.entropy(self.old_dist_info_vars) 50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 51 | self.losses = [surr, kl, ent] 52 | var_list = self.net.var_list 53 | self.gf = GetFlat(var_list) # get theta from var_list 54 | self.gf.session = self.session 55 | self.sff = SetFromFlat(var_list) # set theta from var_List 56 | self.sff.session = self.session 57 | # get g 58 | self.pg = flatgrad(surr, var_list) 59 | # get A 60 | # KL divergence where first arg is fixed 61 | # replace old->tf.stop_gradient from previous kl 62 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 63 | grads = tf.gradients(kl_firstfixed, var_list) 64 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 65 | shapes = map(var_shape, var_list) 66 | start = 0 67 | tangents = [] 68 | for shape 
in shapes: 69 | size = np.prod(shape) 70 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 71 | tangents.append(param) 72 | start += size 73 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 74 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 75 | self.session.run(tf.initialize_all_variables()) 76 | # self.saver = tf.train.Saver(max_to_keep=10) 77 | # self.load_model(pms.checkpoint_file) 78 | 79 | def init_logger(self): 80 | head = ["rewards", "std"] 81 | self.logger = Logger(head) 82 | 83 | def learn(self): 84 | self.init_logger() 85 | iter_num = 0 86 | while True: 87 | print "\n********** Iteration %i ************" % iter_num 88 | print self.gf().mean() 89 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 90 | self.sff(theta) 91 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 92 | for k , v in stats.iteritems(): 93 | print(k + ": " + " " * (40 - len(k)) + str(v)) 94 | if iter_num % pms.save_model_times == 0: 95 | self.save_model(pms.environment_name + "-" + str(iter_num)) 96 | iter_num += 1 97 | -------------------------------------------------------------------------------- /agent/agent_continous_rnn.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous_rnn import NetworkContinousLSTM 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | from logger.logger import Logger 8 | from storage.storage_continous_parallel import ParallelStorage 9 | 10 | seed = 1 11 | np.random.seed(seed) 12 | tf.set_random_seed(seed) 13 | 14 | """ 15 | class for continoust action space 16 | """ 17 | class TRPOAgent(TRPOAgentBase): 18 | def __init__(self, env): 19 | super(TRPOAgent, self).__init__(env) 20 | self.init_network() 21 | self.saver = tf.train.Saver(max_to_keep=10) 22 | 23 | def init_network(self): 24 | """ 25 | [input] 26 | self.obs 27 | self.action_n 28 | self.advant 29 | self.old_dist_means_n 30 | self.old_dist_logstds_n 31 | [output] 32 | self.action_dist_means_n 33 | self.action_dist_logstds_n 34 | var_list 35 | """ 36 | self.net = NetworkContinousLSTM("network_continous") 37 | if pms.min_std is not None: 38 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 39 | self.action_dist_stds_n = tf.exp(log_std_var) 40 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 41 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 42 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 43 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 44 | self.old_dist_info_vars) 45 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss 46 | batch_size = tf.shape(self.net.obs)[0] 47 | batch_size_float = tf.cast(batch_size , tf.float32) 48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 49 | ent = self.distribution.entropy(self.old_dist_info_vars) 50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 51 | self.losses = [surr, kl, ent] 52 | var_list = self.net.var_list 53 | self.gf = GetFlat(var_list) # get theta from var_list 54 | self.gf.session = self.session 55 | self.sff = 
SetFromFlat(var_list) # set theta from var_List 56 | self.sff.session = self.session 57 | # get g 58 | self.pg = flatgrad(surr, var_list) 59 | # get A 60 | # KL divergence where first arg is fixed 61 | # replace old->tf.stop_gradient from previous kl 62 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 63 | grads = tf.gradients(kl_firstfixed, var_list) 64 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 65 | shapes = map(var_shape, var_list) 66 | start = 0 67 | tangents = [] 68 | for shape in shapes: 69 | size = np.prod(shape) 70 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 71 | tangents.append(param) 72 | start += size 73 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 74 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 75 | self.session.run(tf.initialize_all_variables()) 76 | # self.saver = tf.train.Saver(max_to_keep=10) 77 | # self.load_model(pms.checkpoint_file) 78 | 79 | def init_logger(self): 80 | head = ["rewards", "std"] 81 | self.logger = Logger(head) 82 | 83 | def learn(self): 84 | self.init_logger() 85 | iter_num = 0 86 | while True: 87 | print "\n********** Iteration %i ************" % iter_num 88 | print self.gf().mean() 89 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 90 | self.sff(theta) 91 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 92 | for k , v in stats.iteritems(): 93 | print(k + ": " + " " * (40 - len(k)) + str(v)) 94 | if iter_num % pms.save_model_times == 0: 95 | self.save_model(pms.environment_name + "-" + str(iter_num)) 96 | iter_num += 1 97 | -------------------------------------------------------------------------------- /storage/storage_image.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from utils import * 3 | from parameters import pms 4 | 5 | 6 | class Storage(object): 7 | def __init__(self, agent, env, baseline): 8 | self.paths = [] 9 | self.env = env 10 | self.agent = agent 11 | self.obs = [] 12 | self.obs_origin = [] 13 | self.baseline = baseline 14 | 15 | def get_single_path(self): 16 | self.obs_origin, self.obs, actions, rewards, action_dists = [], [], [], [], [] 17 | ob = self.env.reset() 18 | ob = self.env.render('rgb_array') 19 | # self.agent.prev_action *= 0.0 20 | # self.agent.prev_obs *= 0.0 21 | episode_steps = 0 22 | for _ in xrange(pms.max_path_length): 23 | self.obs_origin.append(ob) 24 | deal_ob = self.deal_image(ob) 25 | action, action_dist = self.agent.get_action(deal_ob) 26 | self.obs.append(deal_ob) 27 | actions.append(action) 28 | action_dists.append(action_dist) 29 | res = self.env.step(action) # res 30 | if pms.render: 31 | self.env.render() 32 | ob = res[0] 33 | ob = self.env.render('rgb_array') 34 | rewards.append([res[1]]) 35 | episode_steps += 1 36 | if res[2]: 37 | break 38 | path = dict( 39 | observations=np.concatenate([self.obs]), 40 | agent_infos=np.concatenate([action_dists]), 41 | rewards=np.array(rewards), 42 | actions=np.array(actions), 43 | episode_steps=episode_steps 44 | ) 45 | self.paths.append(path) 46 | # self.agent.prev_action *= 0.0 47 | # self.agent.prev_obs *= 0.0 48 | return path 49 | 50 | def get_paths(self): 51 | paths = self.paths 52 | self.paths = [] 53 | return paths 54 | 55 | def process_paths(self, paths): 56 | sum_episode_steps = 0 57 | for path in paths: 58 | sum_episode_steps += path['episode_steps'] 59 | # 
r_t+V(S_{t+1})-V(S_t) = returns-baseline 60 | # path_baselines = np.append(self.baseline.predict(path) , 0) 61 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 62 | # path["advantages"] = np.concatenate(path["rewards"]) + \ 63 | # pms.discount * path_baselines[1:] - \ 64 | # path_baselines[:-1] 65 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 66 | path_baselines = np.append(self.baseline.predict(path) , 0) 67 | deltas = np.concatenate(path["rewards"]) + \ 68 | pms.discount * path_baselines[1:] - \ 69 | path_baselines[:-1] 70 | path["advantages"] = discount( 71 | deltas , pms.discount * pms.gae_lambda) 72 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 73 | # Updating policy. 74 | action_dist_n = np.concatenate([path["agent_infos"] for path in paths]) 75 | obs_n = np.concatenate([path["observations"] for path in paths]) 76 | action_n = np.concatenate([path["actions"] for path in paths]) 77 | rewards = np.concatenate([path["rewards"] for path in paths]) 78 | advantages = np.concatenate([path["advantages"] for path in paths]) 79 | 80 | if pms.center_adv: 81 | advantages = (advantages - np.mean(advantages)) / (advantages.std() + 1e-8) 82 | 83 | self.baseline.fit(paths) 84 | 85 | samples_data = dict( 86 | observations=obs_n, 87 | actions=action_n, 88 | rewards=rewards, 89 | advantages=advantages, 90 | agent_infos=action_dist_n, 91 | paths=paths, 92 | sum_episode_steps=sum_episode_steps 93 | ) 94 | return samples_data 95 | 96 | def deal_image(self, image): 97 | index = len(self.obs_origin) 98 | image_end = [] 99 | if index 0: 113 | return 0 114 | else: 115 | return 1 116 | if abs(1 - np.var(y - ypred) / (vary + 1e-8)) > 1e5: 117 | import ipdb; 118 | ipdb.set_trace() 119 | return 1 - np.var(y - ypred) / (vary + 1e-8) 120 | 121 | 122 | class Rollout(threading.Thread): 123 | def __init__(self, thread_number, agent, env, baseline): 124 | super(Rollout, self).__init__() 125 | self.thread_number = thread_number 126 | self.storage = Storage(agent, env, baseline) 127 | 128 | def run(self): 129 | self.storage.get_single_path() 130 | -------------------------------------------------------------------------------- /agent/agent_cotinous_single_thread.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import threading 3 | import gym 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import time 8 | import threading 9 | import prettytensor as pt 10 | 11 | from storage.storage_continous import Storage 12 | from storage.storage_continous import Rollout 13 | import math 14 | from parameters import pms 15 | import krylov 16 | from logger.logger import Logger 17 | from distribution.diagonal_gaussian import DiagonalGaussian 18 | from baseline.baseline_lstsq import Baseline 19 | from environment import Environment 20 | from network.network_continous import NetworkContinous 21 | from agent.agent_base import TRPOAgentBase 22 | 23 | seed = 1 24 | np.random.seed(seed) 25 | tf.set_random_seed(seed) 26 | 27 | 28 | class TRPOAgentContinousSingleThread(TRPOAgentBase, threading.Thread): 29 | 30 | def __init__(self, thread_id, master): 31 | print "create thread %d"%(thread_id) 32 | self.thread_id = thread_id 33 | threading.Thread.__init__(self, name="thread_%d" % thread_id) 34 | self.master = master 35 | self.env = env = Environment(gym.make(pms.environment_name)) 36 | TRPOAgentBase.__init__(self, env) 37 | 38 | self.session = self.master.session 39 | self.init_network() 40 | 
self.saver = tf.train.Saver(max_to_keep=10) 41 | 42 | 43 | def init_network(self): 44 | """ 45 | [input] 46 | self.obs 47 | self.action_n 48 | self.advant 49 | self.old_dist_means_n 50 | self.old_dist_logstds_n 51 | [output] 52 | self.action_dist_means_n 53 | self.action_dist_logstds_n 54 | var_list 55 | """ 56 | self.net = NetworkContinous(str(self.thread_id)) 57 | if pms.min_std is not None: 58 | log_std_var = tf.maximum(self.net.action_dist_logstds_n , np.log(pms.min_std)) 59 | self.action_dist_stds_n = tf.exp(log_std_var) 60 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n , log_std=self.net.old_dist_logstds_n) 61 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n , log_std=self.net.action_dist_logstds_n) 62 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n , self.new_dist_info_vars) 63 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n , self.new_dist_info_vars , 64 | self.old_dist_info_vars) 65 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars , self.new_dist_info_vars)) 67 | ent = self.distribution.entropy(self.old_dist_info_vars) 68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 69 | self.losses = [surr , kl , ent] 70 | var_list = self.net.var_list 71 | self.gf = GetFlat(var_list) # get theta from var_list 72 | self.gf.session = self.session 73 | self.sff = SetFromFlat(var_list) # set theta from var_List 74 | self.sff.session = self.session 75 | # get g 76 | self.pg = flatgrad(surr , var_list) 77 | # get A 78 | # KL divergence where first arg is fixed 79 | # replace old->tf.stop_gradient from previous kl 80 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) 81 | grads = tf.gradients(kl_firstfixed , var_list) 82 | self.flat_tangent = tf.placeholder(dtype , shape=[None]) 83 | shapes = map(var_shape , var_list) 84 | start = 0 85 | tangents = [] 86 | for shape in shapes: 87 | size = np.prod(shape) 88 | param = tf.reshape(self.flat_tangent[start:(start + size)] , shape) 89 | tangents.append(param) 90 | start += size 91 | self.gvp = [tf.reduce_sum(g * t) for (g , t) in zip(grads , tangents)] 92 | self.fvp = flatgrad(tf.reduce_sum(self.gvp) , var_list) # get kl''*p 93 | 94 | def run(self): 95 | self.learn() 96 | 97 | def learn(self): 98 | i = 0 99 | sum_gradient = 0 100 | while True: 101 | self.sff(self.master.get_parameters()) 102 | 103 | # Generating paths. 
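# Note on the update loop below: each worker first re-syncs with the master's
# parameters (the sff call above), then runs one TRPO mini-batch locally and
# accumulates the resulting parameter delta (theta - theta_prev) into
# sum_gradient, which is pushed back to the shared master via apply_gradient.
# Only thread 1 writes checkpoints, so the master holds the single shared policy.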
104 | stats, theta, theprev = self.train_mini_batch(parallel=False) 105 | sum_gradient += theta-theprev 106 | self.master.apply_gradient(sum_gradient) 107 | print "\n********** Iteration %i ************" % i 108 | for k , v in stats.iteritems(): 109 | print(k + ": " + " " * (40 - len(k)) + str(v)) 110 | sum_gradient = 0 111 | if self.thread_id==1 and i%pms.save_model_times==0: 112 | self.save_model(pms.environment_name + "-" + str(i)) 113 | i += 1 114 | 115 | 116 | def test(self): 117 | self.sff(self.master.get_parameters()) 118 | for i in range(50): 119 | self.storage.get_single_path() 120 | -------------------------------------------------------------------------------- /agent/AC_agent_continous.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | from agent.agent_base import TRPOAgentBase 7 | from logger.logger import Logger 8 | import math 9 | import time 10 | 11 | seed = 1 12 | np.random.seed(seed) 13 | tf.set_random_seed(seed) 14 | class ACAgent(TRPOAgentBase): 15 | 16 | def __init__(self, env): 17 | super(ACAgent, self).__init__(env) 18 | self.init_network() 19 | self.saver = tf.train.Saver(max_to_keep=10) 20 | 21 | 22 | def init_network(self): 23 | """ 24 | [input] 25 | self.obs 26 | self.action_n 27 | self.advant 28 | self.old_dist_means_n 29 | self.old_dist_logstds_n 30 | [output] 31 | self.action_dist_means_n 32 | self.action_dist_logstds_n 33 | var_list 34 | """ 35 | self.net = NetworkContinous("network_continous_ac") 36 | if pms.min_std is not None: 37 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 38 | self.action_dist_stds_n = tf.exp(log_std_var) 39 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 40 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 41 | self.likehood_new_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 42 | # surr = - log(\pi_\theta)*(Q^\pi-V^\pi) 43 | value_loss = 0.5*tf.square(self.net.advant) 44 | surr = -tf.reduce_sum(self.likehood_new_action_dist*tf.stop_gradient(self.net.advant)+value_loss) # Surrogate loss 45 | 46 | batch_size = tf.shape(self.net.obs)[0] 47 | batch_size_float = tf.cast(batch_size , tf.float32) 48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 49 | ent = self.distribution.entropy(self.old_dist_info_vars) 50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 51 | self.losses = [surr, kl, ent] 52 | var_list = self.net.var_list 53 | self.gf = GetFlat(var_list) # get theta from var_list 54 | self.gf.session = self.session 55 | self.sff = SetFromFlat(var_list) # set theta from var_List 56 | self.sff.session = self.session 57 | # get g 58 | self.pg = flatgrad(surr, var_list) 59 | 60 | self.session.run(tf.initialize_all_variables()) 61 | # self.saver = tf.train.Saver(max_to_keep=10) 62 | # self.load_model(pms.checkpoint_file) 63 | 64 | def init_logger(self): 65 | head = ["std", "rewards"] 66 | self.logger = Logger(head) 67 | 68 | def train_mini_batch(self, parallel=False, linear_search=True): 69 | # Generating paths. 
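# Unlike the TRPO agents, this actor-critic variant takes a plain gradient step:
# g below is the gradient of the surrogate built in init_network() (the
# log-likelihood * stop_gradient(advantage) term plus the 0.5 * advant^2 value
# loss), and the parameters are updated as theta = theta_prev + 0.01 * g,
# i.e. a fixed step size with no trust-region constraint or line search.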
70 | print("Rollout") 71 | start_time = time.time() 72 | self.get_samples(pms.paths_number) 73 | paths = self.storage.get_paths() # get_paths 74 | # Computing returns and estimating advantage function. 75 | sample_data = self.storage.process_paths(paths) 76 | agent_infos = sample_data["agent_infos"] 77 | obs_n = sample_data["observations"] 78 | action_n = sample_data["actions"] 79 | advant_n = sample_data["advantages"] 80 | n_samples = len(obs_n) 81 | inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False) 82 | # inds = range(n_samples) 83 | obs_n = obs_n[inds] 84 | action_n = action_n[inds] 85 | advant_n = advant_n[inds] 86 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]]) 87 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]]) 88 | feed = {self.net.obs: obs_n, 89 | self.net.advant: advant_n, 90 | self.net.old_dist_means_n: action_dist_means_n, 91 | self.net.old_dist_logstds_n: action_dist_logstds_n, 92 | self.net.action_n: action_n 93 | } 94 | 95 | episoderewards = np.array([path["rewards"].sum() for path in paths]) 96 | thprev = self.gf() # get theta_old 97 | 98 | g = self.session.run(self.pg, feed_dict=feed) 99 | theta = thprev+0.01*g 100 | stats = {} 101 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"] 102 | stats["Average sum of rewards per episode"] = episoderewards.mean() 103 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 104 | return stats, theta, thprev 105 | 106 | def learn(self): 107 | self.init_logger() 108 | iter_num = 0 109 | while True: 110 | print "\n********** Iteration %i ************" % iter_num 111 | stats, theta, thprev = self.train_mini_batch(linear_search=False) 112 | self.sff(theta) 113 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)]) 114 | for k , v in stats.iteritems(): 115 | print(k + ": " + " " * (40 - len(k)) + str(v)) 116 | if iter_num % pms.save_model_times == 0: 117 | self.save_model(pms.environment_name + "-" + str(iter_num)) 118 | iter_num += 1 119 | -------------------------------------------------------------------------------- /network/network_continous_rnn.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | 4 | import tensorflow as tf 5 | import prettytensor as pt 6 | from parameters import pms 7 | 8 | seed = 1 9 | np.random.seed(seed) 10 | tf.set_random_seed(seed) 11 | 12 | class InnerLSTMCell(tf.nn.rnn_cell.BasicLSTMCell): 13 | def __init__(self , num_units , forget_bias=1.0 , input_size=None): 14 | tf.nn.rnn_cell.BasicLSTMCell.__init__(self , num_units , forget_bias=forget_bias , input_size=input_size) 15 | self.matrix , self.bias = None , None 16 | 17 | 18 | def __call__(self , inputs , state , scope=None): 19 | """ 20 | Long short-term memory cell (LSTM). 21 | implement from BasicLSTMCell.__call__ 22 | """ 23 | with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" 24 | # Parameters of gates are concatenated into one multiply for efficiency. 
25 | c , h = tf.split(1 , 2 , state) 26 | concat = self.linear([inputs , h] , 4 * self._num_units , True) 27 | 28 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 29 | i , j , f , o = tf.split(1 , 4 , concat) 30 | 31 | new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j) 32 | new_h = tf.tanh(new_c) * tf.sigmoid(o) 33 | 34 | return new_h , tf.concat(1 , [new_c , new_h]) 35 | 36 | 37 | def linear(self , args , output_size , bias , bias_start=0.0 , scope=None): 38 | """ 39 | Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 40 | implement from function of tensorflow.python.ops.rnn_cell.linear() 41 | """ 42 | if args is None or (isinstance(args , (list , tuple)) and not args): 43 | raise ValueError("`args` must be specified") 44 | if not isinstance(args , (list , tuple)): 45 | args = [args] 46 | 47 | # Calculate the total size of arguments on dimension 1. 48 | total_arg_size = 0 49 | shapes = [a.get_shape().as_list() for a in args] 50 | for shape in shapes: 51 | if len(shape) != 2: 52 | raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) 53 | if not shape[1]: 54 | raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) 55 | else: 56 | total_arg_size += shape[1] 57 | 58 | # Now the computation. 59 | with tf.variable_scope(scope or "Linear"): 60 | matrix = tf.get_variable("Matrix" , [total_arg_size , output_size]) 61 | if len(args) == 1: 62 | res = tf.matmul(args[0] , matrix) 63 | else: 64 | res = tf.matmul(tf.concat(1 , args) , matrix) 65 | if not bias: 66 | return res 67 | bias_term = tf.get_variable( 68 | "Bias" , [output_size] , 69 | initializer=tf.constant_initializer(bias_start)) 70 | self.matrix = matrix 71 | self.bias = bias_term 72 | return res + bias_term 73 | 74 | class NetworkContinousLSTM(object): 75 | def __init__(self, scope): 76 | with tf.variable_scope("%s_shared" % scope): 77 | self.obs = obs = tf.placeholder( 78 | dtype, shape=[None, pms.obs_shape], name="%s_obs"%scope) 79 | self.action_n = tf.placeholder(dtype, shape=[None, pms.action_shape], name="%s_action"%scope) 80 | self.advant = tf.placeholder(dtype, shape=[None], name="%s_advant"%scope) 81 | self.old_dist_means_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 82 | name="%s_oldaction_dist_means"%scope) 83 | self.old_dist_logstds_n = tf.placeholder(dtype, shape=[None, pms.action_shape], 84 | name="%s_oldaction_dist_logstds"%scope) 85 | # self.obs_reshape = tf.reshape(self.obs, [None, 1, pms.obs_shape]) 86 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(3, forget_bias=1.0, state_is_tuple=True) 87 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper( 88 | lstm_cell, output_keep_prob=0.5) 89 | rnn = tf.nn.rnn_cell.MultiRNNCell([lstm_cell], state_is_tuple=True) 90 | # rnn = tf.nn.rnn_cell.BasicRNNCell(3) 91 | self.initial_state = state = rnn.zero_state(tf.shape(self.obs)[0], tf.float32) 92 | # output , state = tf.nn.dynamic_rnn(rnn, self.obs) 93 | output, state = rnn(self.obs, state) 94 | self.action_dist_means_n = (pt.wrap(output). 95 | # fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 96 | # name="%s_fc1"%scope). 97 | # fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), 98 | # name="%s_fc2"%scope). 
99 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), 100 | name="%s_fc3"%scope)) 101 | self.N = tf.shape(obs)[0] 102 | Nf = tf.cast(self.N, dtype) 103 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), trainable=False, name="%spolicy_logstd"%scope) 104 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param, 105 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1))) 106 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)] 107 | 108 | def get_action_dist_means_n(self, session, obs): 109 | return session.run(self.action_dist_means_n, 110 | {self.obs: obs}) 111 | 112 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import random 4 | import scipy.signal 5 | import prettytensor as pt 6 | from parameters import pms 7 | import threading 8 | from tensorflow.contrib.layers.python.layers import initializers 9 | 10 | seed = 1 11 | random.seed(seed) 12 | np.random.seed(seed) 13 | tf.set_random_seed(seed) 14 | 15 | dtype = tf.float32 16 | 17 | def discount(x, gamma): 18 | """ 19 | scipy.signal.lfilter(b, a, x, axis=-1, zi=None)[source] 20 | a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M] 21 | - a[1]*y[n-1] - ... - a[N]*y[n-N] 22 | :param x: 23 | :param gamma: 24 | :return: 25 | """ 26 | assert x.ndim >= 1 27 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 28 | 29 | 30 | 31 | 32 | 33 | def cat_sample(prob_nk): 34 | assert prob_nk.ndim == 2 35 | N = prob_nk.shape[0] 36 | csprob_nk = np.cumsum(prob_nk, axis=1) 37 | out = np.zeros(N, dtype='i') 38 | for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)): 39 | for (k, csprob) in enumerate(csprob_k): 40 | if csprob > r: 41 | out[n] = k 42 | break 43 | return out 44 | 45 | 46 | def var_shape(x): 47 | out = [k.value for k in x.get_shape()] 48 | assert all(isinstance(a, int) for a in out), \ 49 | "shape function assumes that shape is fully known" 50 | return out 51 | 52 | 53 | def numel(x): 54 | return np.prod(var_shape(x)) 55 | 56 | 57 | def flatgrad(loss, var_list): 58 | grads = tf.gradients(loss, var_list) 59 | return tf.concat(0, [tf.reshape(grad, [np.prod(var_shape(v))]) 60 | for (grad, v) in zip( grads, var_list)]) 61 | 62 | # set theta 63 | class SetFromFlat(object): 64 | def __init__(self, var_list): 65 | assigns = [] 66 | shapes = map(var_shape, var_list) 67 | total_size = sum(np.prod(shape) for shape in shapes) 68 | self.theta = theta = tf.placeholder(tf.float32, [total_size]) 69 | start = 0 70 | assigns = [] 71 | for (shape, v) in zip(shapes, var_list): 72 | size = np.prod(shape) 73 | assigns.append( 74 | tf.assign( 75 | v, 76 | tf.reshape( 77 | theta[ 78 | start:start + 79 | size], 80 | shape))) 81 | start += size 82 | self.op = tf.group(*assigns) 83 | 84 | def __call__(self, theta): 85 | self.session.run(self.op, feed_dict={self.theta: theta}) 86 | 87 | # get theta 88 | class GetFlat(object): 89 | def __init__(self, var_list): 90 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 91 | 92 | def __call__(self): 93 | return self.op.eval(session=self.session) 94 | 95 | 96 | def slice_2d(x, inds0, inds1): 97 | # assume that a path have 1000 vector, then ncols=action dims, inds0=1000,inds1= 98 | inds0 = tf.cast(inds0, tf.int64) 99 | inds1 = tf.cast(inds1, tf.int64) 100 | shape = 
tf.cast(tf.shape(x), tf.int64) 101 | ncols = shape[1] 102 | x_flat = tf.reshape(x, [-1]) 103 | return tf.gather(x_flat, inds0 * ncols + inds1) 104 | 105 | 106 | # def linesearch(f, x, fullstep, expected_improve_rate): 107 | # accept_ratio = .1 108 | # max_backtracks = 10 109 | # fval, old_kl, entropy = f(x) 110 | # for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)): 111 | # xnew = x + stepfrac * fullstep 112 | # newfval, new_kl, new_ent= f(xnew) 113 | # # actual_improve = newfval - fval # minimize target object 114 | # # expected_improve = expected_improve_rate * stepfrac 115 | # # ratio = actual_improve / expected_improve 116 | # # if ratio > accept_ratio and actual_improve > 0: 117 | # # return xnew 118 | # if newfval accept_ratio and actual_improve > 0: 133 | # pms.max_kl *= 1.002 134 | # return xnew 135 | if newfval 1 105 | 106 | alphas = [] 107 | betas = [] 108 | qs = [] 109 | 110 | q = b / np.linalg.norm(b) 111 | beta = 0 112 | qm = np.zeros_like(b) 113 | for j in xrange(k): 114 | qs.append(q) 115 | 116 | z = f_Ax(q) 117 | 118 | alpha = q.dot(z) 119 | alphas.append(alpha) 120 | z -= alpha * q + beta * qm 121 | 122 | beta = np.linalg.norm(z) 123 | betas.append(beta) 124 | 125 | print "beta", beta 126 | if beta < 1e-9: 127 | print "lanczos: early after %i/%i dimensions" % (j + 1, k) 128 | break 129 | else: 130 | qm = q 131 | q = z / beta 132 | 133 | return np.array(qs, 'float64').T, np.array(alphas, 'float64'), np.array(betas[:-1], 'float64') 134 | 135 | 136 | def lanczos2(f_Ax, b, k, residual_thresh=1e-9): 137 | """ 138 | Runs Lanczos algorithm to generate a orthogonal basis for the Krylov subspace 139 | b, Ab, A^2b, ... 140 | as well as the upper hessenberg matrix T = Q^T A Q 141 | from Demmel ch 6 142 | """ 143 | b = b.astype('float64') 144 | assert k > 1 145 | H = np.zeros((k, k)) 146 | qs = [] 147 | 148 | q = b / np.linalg.norm(b) 149 | beta = 0 150 | 151 | for j in xrange(k): 152 | qs.append(q) 153 | 154 | z = f_Ax(q.astype('float64')).astype('float64') 155 | for (i, q) in enumerate(qs): 156 | H[j, i] = H[i, j] = h = q.dot(z) 157 | z -= h * q 158 | 159 | beta = np.linalg.norm(z) 160 | if beta < residual_thresh: 161 | print "lanczos2: stopping early after %i/%i dimensions residual %f < %f" % (j + 1, k, beta, residual_thresh) 162 | break 163 | else: 164 | q = z / beta 165 | 166 | return np.array(qs).T, H[:len(qs), :len(qs)] 167 | 168 | 169 | def make_tridiagonal(alphas, betas): 170 | assert len(alphas) == len(betas) + 1 171 | N = alphas.size 172 | out = np.zeros((N, N), 'float64') 173 | out.flat[0:N ** 2:N + 1] = alphas 174 | out.flat[1:N ** 2 - N:N + 1] = betas 175 | out.flat[N:N ** 2 - 1:N + 1] = betas 176 | return out 177 | 178 | 179 | def tridiagonal_eigenvalues(alphas, betas): 180 | T = make_tridiagonal(alphas, betas) 181 | return np.linalg.eigvalsh(T) 182 | 183 | 184 | def test_lanczos(): 185 | np.set_printoptions(precision=4) 186 | 187 | A = np.random.randn(5, 5) 188 | A = A.T.dot(A) 189 | b = np.random.randn(5) 190 | f_Ax = lambda x: A.dot(x) # pylint: disable=W0108 191 | Q, alphas, betas = lanczos(f_Ax, b, 10) 192 | H = make_tridiagonal(alphas, betas) 193 | assert np.allclose(Q.T.dot(A).dot(Q), H) 194 | assert np.allclose(Q.dot(H).dot(Q.T), A) 195 | assert np.allclose(np.linalg.eigvalsh(H), np.linalg.eigvalsh(A)) 196 | 197 | Q, H1 = lanczos2(f_Ax, b, 10) 198 | assert np.allclose(H, H1, atol=1e-6) 199 | 200 | print "ritz eigvals:" 201 | for i in xrange(1, 6): 202 | Qi = Q[:, :i] 203 | Hi = Qi.T.dot(A).dot(Qi) 204 | print np.linalg.eigvalsh(Hi)[::-1] 
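# The eigenvalues of H_i = Q_i^T A Q_i (the Ritz values) printed above should
# approach the extreme eigenvalues of A as i grows, so the rows converge toward
# the true spectrum printed next.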
205 | print "true eigvals:" 206 | print np.linalg.eigvalsh(A)[::-1] 207 | 208 | print "lanczos on ill-conditioned problem" 209 | A = np.diag(10 ** np.arange(5)) 210 | Q, H1 = lanczos2(f_Ax, b, 10) 211 | print np.linalg.eigvalsh(H1) 212 | 213 | print "lanczos on ill-conditioned problem with noise" 214 | 215 | def f_Ax_noisy(x): 216 | return A.dot(x) + np.random.randn(x.size) * 1e-3 217 | 218 | Q, H1 = lanczos2(f_Ax_noisy, b, 10) 219 | print np.linalg.eigvalsh(H1) 220 | 221 | def compute_hessian(fn, vars): 222 | mat = [] 223 | for v1 in vars: 224 | temp = [] 225 | for v2 in vars: 226 | # computing derivative twice, first w.r.t v2 and then w.r.t v1 227 | temp.append(tf.gradients(tf.gradients(fn, v2)[0], v1)[0]) 228 | temp = [tf.cons(0) if t == None else t for t in temp] # tensorflow returns None when there is no gradient, so we replace None with 0 229 | temp = tf.pack(temp) 230 | mat.append(temp) 231 | mat = tf.pack(mat) 232 | return mat 233 | 234 | if __name__ == "__main__": 235 | test_lanczos() 236 | test_cg() 237 | 238 | -------------------------------------------------------------------------------- /storage/storage_continous_parallel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import multiprocessing 4 | from utils import * 5 | import gym 6 | import time 7 | import copy 8 | from random import randint 9 | from parameters import pms 10 | import math 11 | from network.network_continous import NetworkContinous 12 | 13 | 14 | class Actor(multiprocessing.Process): 15 | def __init__(self, args, task_q, result_q, actor_id, monitor): 16 | multiprocessing.Process.__init__(self) 17 | self.actor_id = actor_id 18 | self.task_q = task_q 19 | self.result_q = result_q 20 | self.args = args 21 | self.monitor = monitor 22 | # pms.max_path_length = gym.spec(args.environment_name).timestep_limit 23 | 24 | 25 | def get_action(self, obs): 26 | if self.net == None: 27 | raise NameError("network have not been defined") 28 | obs = np.expand_dims(obs , 0) 29 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 30 | action_dist_means_n , action_dist_logstds_n = self.session.run( 31 | [self.net.action_dist_means_n, self.net.action_dist_logstds_n], feed_dict={self.net.obs: obs}) 32 | if pms.train_flag: 33 | rnd = np.random.normal(size=action_dist_means_n[0].shape) 34 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0] 35 | else: 36 | action = action_dist_means_n[0] 37 | # action = np.clip(action, pms.min_a, pms.max_a) 38 | return action , dict(mean=action_dist_means_n[0] , log_std=np.exp(action_dist_logstds_n[0])) 39 | 40 | def run(self): 41 | self.env = gym.make(self.args.environment_name) 42 | self.env.seed(randint(0, 999999)) 43 | if self.monitor: 44 | self.env.monitor.start('monitor/', force=True) 45 | 46 | self.net = NetworkContinous("rollout_network" + str(self.actor_id)) 47 | config = tf.ConfigProto( 48 | device_count={'GPU': 0} 49 | ) 50 | self.session = tf.Session(config=config) 51 | var_list = self.net.var_list 52 | self.session.run(tf.initialize_all_variables()) 53 | self.set_policy = SetFromFlat(var_list) 54 | self.set_policy.session = self.session 55 | while True: 56 | # get a task, or wait until it gets one 57 | next_task = self.task_q.get(block=True) 58 | if type(next_task) is int and next_task == 1: 59 | # the task is an actor request to collect experience 60 | path = self.rollout() 61 | # print "single rollout time:"+str(end-start) 62 | self.task_q.task_done() 63 | 
self.result_q.put(path) 64 | elif type(next_task) is int and next_task == 2: 65 | print "kill message" 66 | if self.monitor: 67 | self.env.monitor.close() 68 | self.task_q.task_done() 69 | break 70 | else: 71 | # the task is to set parameters of the actor policy 72 | next_task = np.array(next_task) 73 | self.set_policy(next_task) 74 | # super hacky method to make sure when we fill the queue with set parameter tasks, 75 | # an actor doesn't finish updating before the other actors can accept their own tasks. 76 | time.sleep(0.1) 77 | self.task_q.task_done() 78 | return 79 | 80 | def rollout(self): 81 | """ 82 | :param:observations:obs list 83 | :param:actions:action list 84 | :param:rewards:reward list 85 | :param:agent_infos: mean+log_std dictlist 86 | :param:env_infos: no use, just information about environment 87 | :return: a path, list 88 | """ 89 | # if pms.record_movie: 90 | # outdir = 'log/trpo' 91 | # self.env.monitor.start(outdir , force=True) 92 | observations = [] 93 | actions = [] 94 | rewards = [] 95 | agent_infos = [] 96 | env_infos = [] 97 | if pms.render: 98 | self.env.render() 99 | o = self.env.reset() 100 | episode_steps = 0 101 | for i in xrange(pms.max_path_length - 1): 102 | a, agent_info = self.get_action(o) 103 | next_o, reward, terminal, env_info = self.env.step(a) 104 | observations.append(o) 105 | rewards.append(np.array([reward])) 106 | actions.append(a) 107 | agent_infos.append([agent_info]) 108 | env_infos.append([]) 109 | episode_steps += 1 110 | if terminal: 111 | break 112 | o = next_o 113 | if pms.render: 114 | self.env.render() 115 | path = dict( 116 | observations=np.array(observations) , 117 | actions=np.array(actions) , 118 | rewards=np.array(rewards) , 119 | agent_infos=np.concatenate(agent_infos) , 120 | env_infos=np.concatenate(env_infos) , 121 | episode_steps=episode_steps 122 | ) 123 | return path 124 | 125 | class ParallelStorage(): 126 | def __init__(self): 127 | self.args = pms 128 | self.tasks = multiprocessing.JoinableQueue() 129 | self.results = multiprocessing.Queue() 130 | self.actors = [] 131 | self.actors.append(Actor(self.args, self.tasks, self.results, 9999, self.args.record_movie)) 132 | for i in xrange(self.args.jobs-1): 133 | self.actors.append(Actor(self.args, self.tasks, self.results, 37*(i+3), False)) 134 | for a in self.actors: 135 | a.start() 136 | # we will start by running 20,000 / 1000 = 20 episodes for the first ieration 137 | self.average_timesteps_in_episode = 1000 138 | 139 | def get_paths(self): 140 | # keep 20,000 timesteps per update 141 | num_rollouts = self.args.paths_number 142 | # print "rollout_number:"+str(num_rollouts) 143 | for i in xrange(num_rollouts): 144 | self.tasks.put(1) 145 | start = time.time() 146 | self.tasks.join() 147 | end = time.time() 148 | # print "rollout real time"+str(end-start) 149 | paths = [] 150 | while num_rollouts: 151 | num_rollouts -= 1 152 | paths.append(self.results.get()) 153 | return paths 154 | 155 | # def process_paths(self, paths): 156 | # sum_episode_steps = 0 157 | # for path in paths: 158 | # sum_episode_steps += path['episode_steps'] 159 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 160 | # # path_baselines = np.append(self.baseline.predict(path) , 0) 161 | # # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 162 | # # path["advantages"] = np.concatenate(path["rewards"]) + \ 163 | # # pms.discount * path_baselines[1:] - \ 164 | # # path_baselines[:-1] 165 | # # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 166 | # path_baselines = 
np.append(self.baseline.predict(path) , 0) 167 | # deltas = np.concatenate(path["rewards"]) + \ 168 | # pms.discount * path_baselines[1:] - \ 169 | # path_baselines[:-1] 170 | # path["advantages"] = discount( 171 | # deltas , pms.discount * pms.gae_lambda) 172 | # path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 173 | # observations = np.concatenate([path["observations"] for path in paths]) 174 | # actions = np.concatenate([path["actions"] for path in paths]) 175 | # rewards = np.concatenate([path["rewards"] for path in paths]) 176 | # advantages = np.concatenate([path["advantages"] for path in paths]) 177 | # env_infos = np.concatenate([path["env_infos"] for path in paths]) 178 | # agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 179 | # if pms.center_adv: 180 | # advantages -= np.mean(advantages) 181 | # advantages /= (advantages.std() + 1e-8) 182 | # samples_data = dict( 183 | # observations=observations , 184 | # actions=actions , 185 | # rewards=rewards , 186 | # advantages=advantages , 187 | # env_infos=env_infos , 188 | # agent_infos=agent_infos , 189 | # paths=paths , 190 | # sum_episode_steps=sum_episode_steps 191 | # ) 192 | # self.baseline.fit(paths) 193 | # return samples_data 194 | 195 | def set_policy_weights(self, parameters): 196 | for i in xrange(self.args.jobs): 197 | self.tasks.put(parameters) 198 | self.tasks.join() 199 | 200 | def end(self): 201 | for i in xrange(self.args.jobs): 202 | self.tasks.put(2) -------------------------------------------------------------------------------- /agent/agent_discrete.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from dealImage import * 3 | from logger.logger import Logger 4 | import krylov 5 | import numpy as np 6 | import random 7 | import tensorflow as tf 8 | import time 9 | 10 | import prettytensor as pt 11 | 12 | from storage.storage import Storage 13 | from parameters import pms 14 | from distribution.diagonal_category import DiagonalCategory 15 | from baseline.baseline_lstsq import Baseline 16 | import gym 17 | from environment import Environment 18 | 19 | class TRPOAgent(object): 20 | def __init__(self, env): 21 | self.env = env 22 | # if not isinstance(env.observation_space, Box) or \ 23 | # not isinstance(env.action_space, Discrete): 24 | # print("Incompatible spaces.") 25 | # exit(-1) 26 | print("Observation Space", env.observation_space) 27 | print("Action Space", env.action_space) 28 | self.distribution = DiagonalCategory() 29 | self.session = tf.Session() 30 | self.baseline = Baseline() 31 | self.end_count = 0 32 | self.paths = [] 33 | self.train = True 34 | self.storage = Storage(self, self.env, self.baseline) 35 | self.init_network() 36 | if pms.train_flag: 37 | self.init_logger() 38 | 39 | def init_logger(self): 40 | head = ["average_episode_std", "total number of episodes", "Average sum of rewards per episode", 41 | "KL between old and new distribution", "Surrogate loss", "Surrogate loss prev", "ds", "entropy", 42 | "mean_advant", "sum_episode_steps"] 43 | self.logger = Logger(head) 44 | 45 | def init_network(self): 46 | self.obs = obs = tf.placeholder( 47 | dtype, shape=[None, self.env.observation_space.shape[0]], name="obs") 48 | self.action = action = tf.placeholder(tf.int64, shape=[None], name="action") 49 | self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant") 50 | self.oldaction_dist = oldaction_dist = tf.placeholder(dtype, shape=[None, self.env.action_space.n], 51 | 
name="oldaction_dist") 52 | 53 | # Create neural network. 54 | action_dist_n, _ = (pt.wrap(self.obs). 55 | fully_connected(32, activation_fn=tf.nn.relu). 56 | fully_connected(32, activation_fn=tf.nn.relu). 57 | softmax_classifier(self.env.action_space.n)) 58 | eps = 1e-6 59 | self.action_dist_n = action_dist_n 60 | N = tf.shape(obs)[0] 61 | ratio_n = self.distribution.likelihood_ratio_sym(action, action_dist_n, oldaction_dist) 62 | Nf = tf.cast(N, dtype) 63 | surr = -tf.reduce_mean(ratio_n * advant) # Surrogate loss 64 | kl = self.distribution.kl_sym(oldaction_dist, action_dist_n) 65 | ent = self.distribution.entropy(action_dist_n) 66 | 67 | self.losses = [surr, kl, ent] 68 | 69 | var_list = tf.trainable_variables() 70 | self.pg = flatgrad(surr, var_list) 71 | # KL divergence where first arg is fixed 72 | # replace old->tf.stop_gradient from previous kl 73 | kl_firstfixed = tf.reduce_sum(tf.stop_gradient( 74 | action_dist_n) * tf.log(tf.stop_gradient(action_dist_n + eps) / (action_dist_n + eps))) / Nf 75 | grads = tf.gradients(kl_firstfixed, var_list) 76 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 77 | shapes = map(var_shape, var_list) 78 | start = 0 79 | tangents = [] 80 | for shape in shapes: 81 | size = np.prod(shape) 82 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 83 | tangents.append(param) 84 | start += size 85 | gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 86 | self.fvp = flatgrad(gvp, var_list) 87 | self.gf = GetFlat(var_list) 88 | self.gf.session = self.session 89 | self.sff = SetFromFlat(var_list) 90 | self.sff.session = self.session 91 | self.saver = tf.train.Saver(max_to_keep=10) 92 | self.session.run(tf.initialize_all_variables()) 93 | 94 | # self.load_model(pms.checkpoint_file) 95 | 96 | def get_samples(self, path_number): 97 | for i in range(path_number): 98 | self.storage.get_single_path() 99 | 100 | def act(self, obs, *args): 101 | obs = np.expand_dims(obs, 0) 102 | action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs}) 103 | 104 | if self.train: 105 | action = int(cat_sample(action_dist_n)[0]) 106 | else: 107 | action = int(np.argmax(action_dist_n)) 108 | return action, action_dist_n, np.squeeze(obs) 109 | 110 | def learn(self): 111 | start_time = time.time() 112 | numeptotal = 0 113 | for iteration in range(pms.max_iter_number): 114 | # Generating paths. 115 | print("Rollout") 116 | self.get_samples(pms.paths_number) 117 | paths = self.storage.get_paths() # get_paths 118 | # Computing returns and estimating advantage function. 
119 | 120 | sample_data = self.storage.process_paths(paths) 121 | # shape = sample_data["observations"].shape 122 | # vis_square(np.reshape(sample_data["observations"],(shape[0], shape[2], shape[3]))[1:10]) 123 | feed = {self.obs: sample_data["observations"], 124 | self.action: sample_data["actions"], 125 | self.advant: sample_data["advantages"], 126 | self.oldaction_dist: sample_data["agent_infos"]} 127 | 128 | print "\n********** Iteration %i ************" % iteration 129 | if self.train: 130 | thprev = self.gf() 131 | def fisher_vector_product(p): 132 | feed[self.flat_tangent] = p 133 | return self.session.run(self.fvp, feed) + pms.cg_damping * p 134 | 135 | g = self.session.run(self.pg, feed_dict=feed) 136 | stepdir = krylov.cg(fisher_vector_product, -g) 137 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta 138 | fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs) 139 | neggdotstepdir = -g.dot(stepdir) 140 | 141 | def loss(th): 142 | self.sff(th) 143 | return self.session.run(self.losses, feed_dict=feed) 144 | 145 | surr_prev, kl_prev, entropy = loss(thprev) 146 | theta = linesearch(loss, thprev, fullstep, neggdotstepdir) 147 | self.sff(theta) 148 | 149 | surrafter, kloldnew, entropy = self.session.run( 150 | self.losses, feed_dict=feed) 151 | 152 | stats = {} 153 | episoderewards = np.sum(sample_data["rewards"]) 154 | numeptotal += len(sample_data["rewards"]) 155 | mean_advant = np.mean(sample_data["advantages"]) 156 | stats["Total number of episodes"] = numeptotal 157 | stats["Average sum of rewards per episode"] = np.mean(sample_data["rewards"]) 158 | # stats["Entropy"] = entropy 159 | # exp = explained_variance(np.array(sample_data[""]), np.array(returns_n)) 160 | # stats["Baseline explained"] = exp 161 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 162 | stats["KL between old and new distribution"] = kloldnew 163 | stats["Surrogate loss"] = surrafter 164 | stats['Sum episode steps'] = sample_data["sum_episode_steps"] 165 | log_data = [0, numeptotal, episoderewards.mean(), kloldnew, surrafter, surr_prev, 166 | surrafter - surr_prev, 167 | entropy, mean_advant, sample_data["sum_episode_steps"]] 168 | if pms.train_flag: 169 | self.logger.log_row(log_data) 170 | for k, v in stats.iteritems(): 171 | print(k + ": " + " " * (40 - len(k)) + str(v)) 172 | if iteration % pms.save_model_times == 0: 173 | self.save_model(pms.environment_name + "-" + str(iteration)) 174 | 175 | def test(self, model_name): 176 | self.load_model(model_name) 177 | if pms.record_movie: 178 | for i in range(100): 179 | self.storage.get_single_path() 180 | self.env.env.monitor.close() 181 | if pms.upload_to_gym: 182 | gym.upload("log/trpo" , algorithm_id='alg_8BgjkAsQRNiWu11xAhS4Hg' , api_key='sk_IJhy3b2QkqL3LWzgBXoVA') 183 | else: 184 | for i in range(50): 185 | self.storage.get_single_path() 186 | 187 | def save_model(self, model_name): 188 | self.saver.save(self.session, pms.checkpoint_dir + model_name + ".ckpt") 189 | 190 | def load_model(self, model_name): 191 | try: 192 | if model_name is not None: 193 | self.saver.restore(self.session, model_name) 194 | else: 195 | self.saver.restore(self.session, tf.train.latest_checkpoint(pms.checkpoint_dir)) 196 | except: 197 | print "load model %s fail" % (model_name) 198 | -------------------------------------------------------------------------------- /storage/storage_continous_parallel_image.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from utils import 
* 3 | import gym 4 | import time 5 | from random import randint 6 | from parameters import pms 7 | from network.network_continous_image import NetworkContinousImage 8 | import cv2 9 | 10 | 11 | class Actor(multiprocessing.Process): 12 | def __init__(self, args, task_q, result_q, actor_id, monitor): 13 | multiprocessing.Process.__init__(self) 14 | self.actor_id = actor_id 15 | self.task_q = task_q 16 | self.result_q = result_q 17 | self.args = args 18 | self.monitor = monitor 19 | # pms.max_path_length = gym.spec(args.environment_name).timestep_limit 20 | 21 | def get_action(self, obs): 22 | if self.net == None: 23 | raise NameError("network have not been defined") 24 | obs = np.expand_dims(obs , 0) 25 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 26 | action_dist_means_n , action_dist_logstds_n = self.session.run( 27 | [self.net.action_dist_means_n, self.net.action_dist_logstds_n], feed_dict={self.net.obs: obs}) 28 | if pms.train_flag: 29 | rnd = np.random.normal(size=action_dist_means_n[0].shape) 30 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0] 31 | else: 32 | action = action_dist_means_n[0] 33 | # action = np.clip(action, pms.min_a, pms.max_a) 34 | return action, dict(mean=action_dist_means_n[0] , log_std=np.exp(action_dist_logstds_n[0])) 35 | 36 | def run(self): 37 | self.env = gym.make(self.args.environment_name) 38 | self.env.seed(randint(0, 999999)) 39 | if self.monitor: 40 | self.env.monitor.start('monitor/', force=True) 41 | 42 | self.net = NetworkContinousImage("rollout_network" + str(self.actor_id)) 43 | config = tf.ConfigProto( 44 | device_count={'GPU': 0} 45 | ) 46 | self.session = tf.Session(config=config) 47 | var_list = self.net.var_list 48 | self.session.run(tf.initialize_all_variables()) 49 | self.set_policy = SetFromFlat(var_list) 50 | self.set_policy.session = self.session 51 | while True: 52 | # get a task, or wait until it gets one 53 | next_task = self.task_q.get(block=True) 54 | if type(next_task) is int and next_task == 1: 55 | # the task is an actor request to collect experience 56 | path = self.rollout() 57 | # print "single rollout time:"+str(end-start) 58 | self.task_q.task_done() 59 | self.result_q.put(path) 60 | elif type(next_task) is int and next_task == 2: 61 | print "kill message" 62 | if self.monitor: 63 | self.env.monitor.close() 64 | self.task_q.task_done() 65 | break 66 | else: 67 | # the task is to set parameters of the actor policy 68 | next_task = np.array(next_task) 69 | self.set_policy(next_task) 70 | # super hacky method to make sure when we fill the queue with set parameter tasks, 71 | # an actor doesn't finish updating before the other actors can accept their own tasks. 
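# Task protocol used by ParallelStorageImage: an integer 1 asks this actor for
# one rollout (the path is pushed onto result_q), an integer 2 tells it to shut
# down, and any other message is treated as a flat parameter vector to load
# into the local policy copy via set_policy.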
72 | time.sleep(0.1) 73 | self.task_q.task_done() 74 | return 75 | 76 | def rollout(self): 77 | """ 78 | :param:observations:obs list 79 | :param:actions:action list 80 | :param:rewards:reward list 81 | :param:agent_infos: mean+log_std dictlist 82 | :param:env_infos: no use, just information about environment 83 | :return: a path, list 84 | """ 85 | # if pms.record_movie: 86 | # outdir = 'log/trpo' 87 | # self.env.monitor.start(outdir , force=True) 88 | observations = [] 89 | actions = [] 90 | rewards = [] 91 | agent_infos = [] 92 | env_infos = [] 93 | if pms.render: 94 | self.env.render() 95 | o = self.env.reset() 96 | 97 | episode_steps = 0 98 | for i in xrange(pms.max_path_length - 1): 99 | o = self.env.render('rgb_array') 100 | o = self.deal_image(o) 101 | a, agent_info = self.get_action(o) 102 | next_o, reward, terminal, env_info = self.env.step(a) 103 | observations.append(o) 104 | rewards.append(np.array([reward])) 105 | actions.append(a) 106 | agent_infos.append([agent_info]) 107 | env_infos.append([]) 108 | episode_steps += 1 109 | if terminal: 110 | break 111 | o = next_o 112 | if pms.render: 113 | self.env.render() 114 | path = dict( 115 | observations=np.array(observations) , 116 | actions=np.array(actions) , 117 | rewards=np.array(rewards) , 118 | agent_infos=np.concatenate(agent_infos) , 119 | env_infos=np.concatenate(env_infos) , 120 | episode_steps=episode_steps 121 | ) 122 | return path 123 | 124 | def deal_image(self , image): 125 | # index = len(self.obs_origin) 126 | # image_end = [] 127 | # if index < pms.history_number: 128 | # image_end = self.obs_origin[0:index] 129 | # for i in range(pms.history_number - index): 130 | # image_end.append(image) 131 | # else: 132 | # image_end = self.obs_origin[index - pms.history_number:index] 133 | # 134 | # image_end = np.concatenate(image_end) 135 | # # image_end = image_end.reshape((pms.obs_height, pms.obs_width, pms.history_number)) 136 | # obs = cv2.resize(cv2.cvtColor(image_end , cv2.COLOR_RGB2GRAY) / 255. 
, (pms.obs_height , pms.obs_width)) 137 | obs = cv2.resize(image, (pms.obs_height, pms.obs_width)) 138 | # obs = np.transpose(np.array(obs), (2, 0, 1)) 139 | return obs 140 | 141 | class ParallelStorageImage(): 142 | def __init__(self): 143 | self.args = pms 144 | self.tasks = multiprocessing.JoinableQueue() 145 | self.results = multiprocessing.Queue() 146 | self.actors = [] 147 | self.actors.append(Actor(self.args, self.tasks, self.results, 9999, self.args.record_movie)) 148 | for i in xrange(self.args.jobs-1): 149 | self.actors.append(Actor(self.args, self.tasks, self.results, 37*(i+3), False)) 150 | for a in self.actors: 151 | a.start() 152 | # we will start by running 20,000 / 1000 = 20 episodes for the first ieration 153 | self.average_timesteps_in_episode = 1000 154 | 155 | def get_paths(self): 156 | # keep 20,000 timesteps per update 157 | num_rollouts = self.args.paths_number 158 | # print "rollout_number:"+str(num_rollouts) 159 | for i in xrange(num_rollouts): 160 | self.tasks.put(1) 161 | start = time.time() 162 | self.tasks.join() 163 | end = time.time() 164 | # print "rollout real time"+str(end-start) 165 | paths = [] 166 | while num_rollouts: 167 | num_rollouts -= 1 168 | paths.append(self.results.get()) 169 | return paths 170 | 171 | # def process_paths(self, paths): 172 | # sum_episode_steps = 0 173 | # for path in paths: 174 | # sum_episode_steps += path['episode_steps'] 175 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 176 | # # path_baselines = np.append(self.baseline.predict(path) , 0) 177 | # # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 178 | # # path["advantages"] = np.concatenate(path["rewards"]) + \ 179 | # # pms.discount * path_baselines[1:] - \ 180 | # # path_baselines[:-1] 181 | # # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount)) 182 | # path_baselines = np.append(self.baseline.predict(path) , 0) 183 | # deltas = np.concatenate(path["rewards"]) + \ 184 | # pms.discount * path_baselines[1:] - \ 185 | # path_baselines[:-1] 186 | # path["advantages"] = discount( 187 | # deltas , pms.discount * pms.gae_lambda) 188 | # path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 189 | # observations = np.concatenate([path["observations"] for path in paths]) 190 | # actions = np.concatenate([path["actions"] for path in paths]) 191 | # rewards = np.concatenate([path["rewards"] for path in paths]) 192 | # advantages = np.concatenate([path["advantages"] for path in paths]) 193 | # env_infos = np.concatenate([path["env_infos"] for path in paths]) 194 | # agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 195 | # if pms.center_adv: 196 | # advantages -= np.mean(advantages) 197 | # advantages /= (advantages.std() + 1e-8) 198 | # samples_data = dict( 199 | # observations=observations , 200 | # actions=actions , 201 | # rewards=rewards , 202 | # advantages=advantages , 203 | # env_infos=env_infos , 204 | # agent_infos=agent_infos , 205 | # paths=paths , 206 | # sum_episode_steps=sum_episode_steps 207 | # ) 208 | # self.baseline.fit(paths) 209 | # return samples_data 210 | 211 | def set_policy_weights(self, parameters): 212 | for i in xrange(self.args.jobs): 213 | self.tasks.put(parameters) 214 | self.tasks.join() 215 | 216 | def end(self): 217 | for i in xrange(self.args.jobs): 218 | self.tasks.put(2) -------------------------------------------------------------------------------- /agent/agent_continous_parallel_storage.py: -------------------------------------------------------------------------------- 1 | from utils 
import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous import NetworkContinous 5 | from parameters import pms 6 | 7 | import multiprocessing 8 | import krylov 9 | from baseline.baseline_lstsq import Baseline 10 | from distribution.diagonal_gaussian import DiagonalGaussian 11 | import time 12 | import math 13 | from logger.logger import Logger 14 | 15 | seed = 1 16 | np.random.seed(seed) 17 | tf.set_random_seed(seed) 18 | 19 | 20 | """ 21 | class for continoust action space in multi process 22 | """ 23 | class TRPOAgentParallel(multiprocessing.Process): 24 | 25 | 26 | def __init__(self , observation_space , action_space , task_q , result_q): 27 | multiprocessing.Process.__init__(self) 28 | self.task_q = task_q 29 | self.result_q = result_q 30 | self.observation_space = observation_space 31 | self.action_space = action_space 32 | self.args = pms 33 | self.baseline = Baseline() 34 | self.distribution = DiagonalGaussian(pms.action_shape) 35 | self.init_logger() 36 | 37 | def init_network(self): 38 | """ 39 | [input] 40 | self.obs 41 | self.action_n 42 | self.advant 43 | self.old_dist_means_n 44 | self.old_dist_logstds_n 45 | [output] 46 | self.action_dist_means_n 47 | self.action_dist_logstds_n 48 | var_list 49 | """ 50 | config = tf.ConfigProto( 51 | device_count={'GPU': 0} 52 | ) 53 | self.session = tf.Session(config=config) 54 | self.net = NetworkContinous("network_continous") 55 | if pms.min_std is not None: 56 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 57 | self.action_dist_stds_n = tf.exp(log_std_var) 58 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 59 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 60 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 61 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 62 | self.old_dist_info_vars) 63 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss 64 | batch_size = tf.shape(self.net.obs)[0] 65 | batch_size_float = tf.cast(batch_size , tf.float32) 66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 67 | ent = self.distribution.entropy(self.old_dist_info_vars) 68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 69 | self.losses = [surr, kl, ent] 70 | var_list = self.net.var_list 71 | 72 | self.gf = GetFlat(var_list) # get theta from var_list 73 | self.gf.session = self.session 74 | self.sff = SetFromFlat(var_list) # set theta from var_List 75 | self.sff.session = self.session 76 | # get g 77 | self.pg = flatgrad(surr, var_list) 78 | # get A 79 | # KL divergence where first arg is fixed 80 | # replace old->tf.stop_gradient from previous kl 81 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 82 | grads = tf.gradients(kl_firstfixed, var_list) 83 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 84 | shapes = map(var_shape, var_list) 85 | start = 0 86 | tangents = [] 87 | for shape in shapes: 88 | size = np.prod(shape) 89 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 90 | tangents.append(param) 91 | start += size 92 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 93 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 94 | 
self.session.run(tf.initialize_all_variables()) 95 | self.saver = tf.train.Saver(max_to_keep=5) 96 | 97 | def init_logger(self): 98 | head = ["factor", "rewards", "std"] 99 | self.logger = Logger(head) 100 | 101 | def run(self): 102 | self.init_network() 103 | while True: 104 | paths = self.task_q.get() 105 | if paths is None: 106 | # kill the learner 107 | self.task_q.task_done() 108 | break 109 | elif paths == 1: 110 | # just get params, no learn 111 | self.task_q.task_done() 112 | self.result_q.put(self.gf()) 113 | elif paths[0] == 2: 114 | # adjusting the max KL. 115 | self.args.max_kl = paths[1] 116 | if paths[2] == 1: 117 | print "saving checkpoint..." 118 | self.save_model(pms.environment_name + "-" + str(paths[3])) 119 | self.task_q.task_done() 120 | else: 121 | stats , theta, thprev = self.learn(paths, linear_search=False) 122 | self.sff(theta) 123 | self.task_q.task_done() 124 | self.result_q.put((stats, theta, thprev)) 125 | return 126 | 127 | def learn(self, paths, parallel=False, linear_search=False): 128 | start_time = time.time() 129 | sample_data = self.process_paths(paths) 130 | agent_infos = sample_data["agent_infos"] 131 | obs_all = sample_data["observations"] 132 | action_all = sample_data["actions"] 133 | advant_all = sample_data["advantages"] 134 | n_samples = len(obs_all) 135 | batch = int(1/pms.subsample_factor) 136 | batch_size = int(math.floor(n_samples * pms.subsample_factor)) 137 | accum_fullstep = 0.0 138 | for iteration in range(batch): 139 | print "batch: %d, batch_size: %d"%(iteration+1, batch_size) 140 | inds = np.random.choice(n_samples , batch_size , replace=False) 141 | obs_n = obs_all[inds] 142 | action_n = action_all[inds] 143 | advant_n = advant_all[inds] 144 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]]) 145 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]]) 146 | feed = {self.net.obs: obs_n , 147 | self.net.advant: advant_n , 148 | self.net.old_dist_means_n: action_dist_means_n , 149 | self.net.old_dist_logstds_n: action_dist_logstds_n , 150 | self.net.action_n: action_n 151 | } 152 | 153 | episoderewards = np.array([path["rewards"].sum() for path in paths]) 154 | thprev = self.gf() # get theta_old 155 | 156 | def fisher_vector_product(p): 157 | feed[self.flat_tangent] = p 158 | return self.session.run(self.fvp , feed) + pms.cg_damping * p 159 | 160 | g = self.session.run(self.pg , feed_dict=feed) 161 | stepdir = krylov.cg(fisher_vector_product , -g , cg_iters=pms.cg_iters) 162 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta 163 | # if shs<0, then the nan error would appear 164 | lm = np.sqrt(shs / pms.max_kl) 165 | fullstep = stepdir / lm 166 | neggdotstepdir = -g.dot(stepdir) 167 | 168 | def loss(th): 169 | self.sff(th) 170 | return self.session.run(self.losses , feed_dict=feed) 171 | 172 | if parallel is True: 173 | theta = linesearch_parallel(loss , thprev , fullstep , neggdotstepdir / lm) 174 | else: 175 | if linear_search: 176 | theta = linesearch(loss , thprev , fullstep , neggdotstepdir / lm) 177 | else: 178 | theta = thprev + fullstep 179 | accum_fullstep += (theta - thprev) 180 | theta = thprev + accum_fullstep * pms.subsample_factor 181 | stats = {} 182 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"] 183 | stats["Average sum of rewards per episode"] = episoderewards.mean() 184 | stats["surr loss"] = loss(theta)[0] 185 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 186 | 
self.logger.log_row([pms.subsample_factor, stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 187 | return stats , theta , thprev 188 | 189 | def process_paths(self, paths): 190 | sum_episode_steps = 0 191 | for path in paths: 192 | sum_episode_steps += path['episode_steps'] 193 | path['baselines'] = self.baseline.predict(path) 194 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 195 | path["advantages"] = path['returns'] - path['baselines'] 196 | 197 | observations = np.concatenate([path["observations"] for path in paths]) 198 | actions = np.concatenate([path["actions"] for path in paths]) 199 | rewards = np.concatenate([path["rewards"] for path in paths]) 200 | advantages = np.concatenate([path["advantages"] for path in paths]) 201 | env_infos = np.concatenate([path["env_infos"] for path in paths]) 202 | agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 203 | if pms.center_adv: 204 | advantages -= advantages.mean() 205 | advantages /= (advantages.std() + 1e-8) 206 | 207 | # for some unknown reaseon, it can not be used 208 | # if pms.positive_adv: 209 | # advantages = (advantages - np.min(advantages)) + 1e-8 210 | 211 | # average_discounted_return = \ 212 | # np.mean([path["returns"][0] for path in paths]) 213 | # 214 | # undiscounted_returns = [sum(path["rewards"]) for path in paths] 215 | 216 | 217 | # ev = self.explained_variance_1d( 218 | # np.concatenate(baselines), 219 | # np.concatenate(returns) 220 | # ) 221 | samples_data = dict( 222 | observations=observations , 223 | actions=actions , 224 | rewards=rewards , 225 | advantages=advantages , 226 | env_infos=env_infos , 227 | agent_infos=agent_infos , 228 | paths=paths , 229 | sum_episode_steps=sum_episode_steps 230 | ) 231 | self.baseline.fit(paths) 232 | return samples_data 233 | 234 | def save_model(self , model_name): 235 | self.saver.save(self.session , "checkpoint/" + model_name + ".ckpt") 236 | 237 | def load_model(self , model_name): 238 | try: 239 | if model_name is not None: 240 | self.saver.restore(self.session , model_name) 241 | else: 242 | self.saver.restore(self.session , tf.train.latest_checkpoint(pms.checkpoint_dir)) 243 | except: 244 | print "load model %s fail" % (model_name) -------------------------------------------------------------------------------- /agent/agent_continous_single_process.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import threading 3 | import gym 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import time 8 | import threading 9 | import multiprocessing 10 | import prettytensor as pt 11 | 12 | from storage.storage_continous import Storage 13 | from storage.storage_continous import Rollout 14 | import math 15 | from parameters import pms 16 | import krylov 17 | from logger.logger import Logger 18 | from distribution.diagonal_gaussian import DiagonalGaussian 19 | from baseline.baseline_lstsq import Baseline 20 | from environment import Environment 21 | from network.network_continous import NetworkContinous 22 | 23 | seed = 1 24 | np.random.seed(seed) 25 | tf.set_random_seed(seed) 26 | 27 | 28 | class TRPOAgentContinousSingleProcess(object): 29 | 30 | def __init__(self, thread_id): 31 | print "create worker %d"%(thread_id) 32 | self.thread_id = thread_id 33 | self.env = env = Environment(gym.make(pms.environment_name)) 34 | # print("Observation Space", env.observation_space) 35 | # print("Action Space", 
env.action_space) 36 | # print("Action area, high:%f, low%f" % (env.action_space.high, env.action_space.low)) 37 | self.end_count = 0 38 | self.paths = [] 39 | self.train = True 40 | self.baseline = Baseline() 41 | self.storage = Storage(self, self.env, self.baseline) 42 | self.distribution = DiagonalGaussian(pms.action_shape) 43 | 44 | self.session = self.master.session 45 | self.init_network() 46 | 47 | 48 | def init_network(self): 49 | self.network = NetworkContinous(str(self.thread_id)) 50 | if pms.min_std is not None: 51 | log_std_var = tf.maximum(self.network.action_dist_logstds_n, np.log(pms.min_std)) 52 | self.action_dist_stds_n = tf.exp(log_std_var) 53 | 54 | self.old_dist_info_vars = dict(mean=self.network.old_dist_means_n, log_std=self.network.old_dist_logstds_n) 55 | self.new_dist_info_vars = dict(mean=self.network.action_dist_means_n, log_std=self.network.action_dist_logstds_n) 56 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.network.action_n, self.new_dist_info_vars) 57 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.network.action_n, self.new_dist_info_vars, 58 | self.old_dist_info_vars) 59 | 60 | surr = -tf.reduce_mean(self.ratio_n * self.network.advant) # Surrogate loss 61 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 62 | ent = self.distribution.entropy(self.old_dist_info_vars) 63 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 64 | self.losses = [surr, kl, ent] 65 | var_list = self.network.var_list 66 | self.gf = GetFlat(self.session, var_list) # get theta from var_list 67 | self.sff = SetFromFlat(self.session, var_list) # set theta from var_List 68 | # get g 69 | self.pg = flatgrad(surr, var_list) 70 | # get A 71 | 72 | # KL divergence where first arg is fixed 73 | # replace old->tf.stop_gradient from previous kl 74 | kl_firstfixed = kl_sym_gradient(self.network.old_dist_means_n, self.network.old_dist_logstds_n, self.network.action_dist_means_n, 75 | self.network.action_dist_logstds_n) 76 | 77 | grads = tf.gradients(kl, var_list) 78 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 79 | shapes = map(var_shape, var_list) 80 | start = 0 81 | tangents = [] 82 | for shape in shapes: 83 | size = np.prod(shape) 84 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 85 | tangents.append(param) 86 | start += size 87 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 88 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 89 | # self.load_model() 90 | 91 | def get_samples(self, path_number): 92 | for i in range(pms.paths_number): 93 | self.storage.get_single_path() 94 | 95 | def get_action(self, obs, *args): 96 | obs = np.expand_dims(obs, 0) 97 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 98 | if pms.use_std_network: 99 | action_dist_means_n, action_dist_logstds_n = self.session.run( 100 | [self.action_dist_means_n, self.action_dist_logstds_n], 101 | {self.obs: obs}) 102 | if pms.train_flag: 103 | rnd = np.random.normal(size=action_dist_means_n[0].shape) 104 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0] 105 | else: 106 | action = action_dist_means_n[0] 107 | # action = np.clip(action, pms.min_a, pms.max_a) 108 | return action, dict(mean=action_dist_means_n[0], log_std=action_dist_logstds_n[0]) 109 | else: 110 | action_dist_logstd = np.expand_dims([np.log(pms.std)], 0) 111 | action_dist_means_n = self.network.get_action_dist_means_n(self.session, obs) 112 | if pms.train_flag: 
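# During training the action is sampled from the diagonal Gaussian policy,
#   a = mu(s) + exp(log_std) * eps,  eps ~ N(0, I),
# where mu(s) is action_dist_means_n from the policy network and log_std is the
# fixed np.log(pms.std) set just above; when pms.train_flag is off, the
# deterministic mean action is returned instead (else branch below).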
113 | rnd = np.random.normal(size=action_dist_means_n[0].shape)
114 | action = rnd * np.exp(action_dist_logstd[0]) + action_dist_means_n[0]
115 | else:
116 | action = action_dist_means_n[0]
117 | # action = np.clip(action, pms.min_a, pms.max_a)
118 | return action, dict(mean=action_dist_means_n[0], log_std=action_dist_logstd[0])
119 |
120 | def run(self):
121 | self.learn()
122 |
123 | def learn(self):
124 | start_time = time.time()
125 |
126 | numeptotal = 0
127 | while True:
128 | i = 0
129 | # Generating paths.
130 | # print("Rollout")
131 | self.get_samples(pms.paths_number)
132 | paths = self.storage.get_paths() # get_paths
133 | # Computing returns and estimating advantage function.
134 | sample_data = self.storage.process_paths(paths)
135 |
136 | agent_infos = sample_data["agent_infos"]
137 | obs_n = sample_data["observations"]
138 | action_n = sample_data["actions"]
139 | advant_n = sample_data["advantages"]
140 | n_samples = len(obs_n)
141 | inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False)
142 | obs_n = obs_n[inds]
143 | action_n = action_n[inds]
144 | advant_n = advant_n[inds]
145 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
146 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
147 | feed = {self.network.obs: obs_n,
148 | self.network.advant: advant_n,
149 | self.network.old_dist_means_n: action_dist_means_n,
150 | self.network.old_dist_logstds_n: action_dist_logstds_n,
151 | self.network.action_dist_logstds_n: action_dist_logstds_n,
152 | self.network.action_n: action_n
153 | }
154 |
155 | episoderewards = np.array([path["rewards"].sum() for path in paths])
156 | average_episode_std = np.mean(np.exp(action_dist_logstds_n))
157 |
158 | # print "\n********** Iteration %i ************" % i
159 | for iter_num_per_train in range(pms.iter_num_per_train):
160 | # if not self.train:
161 | # print("Episode mean: %f" % episoderewards.mean())
162 | # self.end_count += 1
163 | # if self.end_count > 100:
164 | # break
165 | if self.train:
166 | thprev = self.gf() # get theta_old
167 |
168 | def fisher_vector_product(p):
169 | feed[self.flat_tangent] = p
170 | return self.session.run(self.fvp, feed) + pms.cg_damping * p
171 |
172 | g = self.session.run(self.pg, feed_dict=feed)
173 | stepdir = krylov.cg(fisher_vector_product, -g, cg_iters=pms.cg_iters) # natural gradient direction: solve F*stepdir = -g
174 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta
175 | fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
176 | neggdotstepdir = -g.dot(stepdir)
177 |
178 | def loss(th):
179 | self.sff(th)
180 | return self.session.run(self.losses, feed_dict=feed)
181 |
182 | surr_prev, kl_prev, ent_prev = loss(thprev)
183 | mean_advant = np.mean(advant_n)
184 | theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
185 | self.sff(theta)
186 | surrafter, kloldnew, entnew = self.session.run(self.losses, feed_dict=feed)
187 | stats = {}
188 | numeptotal += len(episoderewards)
189 | stats["average_episode_std"] = average_episode_std
190 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
191 | stats["Total number of episodes"] = numeptotal
192 | stats["Average sum of rewards per episode"] = episoderewards.mean()
193 | # stats["Entropy"] = entropy
194 | # exp = explained_variance(np.array(baseline_n), np.array(returns_n))
195 | # stats["Baseline explained"] = exp
196 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
197 | stats["KL between old and new
distribution"] = kloldnew 198 | stats["Surrogate loss"] = surrafter 199 | stats["Surrogate loss prev"] = surr_prev 200 | stats["entropy"] = ent_prev 201 | stats["mean_advant"] = mean_advant 202 | log_data = [average_episode_std, len(episoderewards), numeptotal, episoderewards.mean(), kloldnew, surrafter, surr_prev, 203 | surrafter - surr_prev, 204 | ent_prev, mean_advant] 205 | self.master.logger.log_row(log_data) 206 | # for k, v in stats.iteritems(): 207 | # print(k + ": " + " " * (40 - len(k)) + str(v)) 208 | # # if entropy != entropy: 209 | # # exit(-1) 210 | # # if exp > 0.95: 211 | # # self.train = False 212 | if self.thread_id==1: 213 | self.master.save_model("iter" + str(i)) 214 | print episoderewards.mean() 215 | i += 1 216 | 217 | def test(self, model_name): 218 | self.load_model(model_name) 219 | for i in range(50): 220 | self.storage.get_single_path() 221 | 222 | def save_model(self, model_name): 223 | self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt") 224 | 225 | def load_model(self, model_name): 226 | try: 227 | self.saver.restore(self.session, model_name) 228 | except: 229 | print "load model %s fail" % (model_name) 230 | -------------------------------------------------------------------------------- /agent/agent_continous_image_parallel_image.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import numpy as np 3 | import tensorflow as tf 4 | from network.network_continous_image import NetworkContinousImage 5 | from parameters import pms 6 | 7 | import multiprocessing 8 | import krylov 9 | from baseline.baseline_zeros import Baseline 10 | from distribution.diagonal_gaussian import DiagonalGaussian 11 | import time 12 | import math 13 | from logger.logger import Logger 14 | 15 | seed = 1 16 | np.random.seed(seed) 17 | tf.set_random_seed(seed) 18 | 19 | 20 | """ 21 | class for continoust action space in multi process 22 | """ 23 | class TRPOAgentParallelImage(multiprocessing.Process): 24 | 25 | 26 | def __init__(self , observation_space , action_space , task_q , result_q): 27 | multiprocessing.Process.__init__(self) 28 | self.task_q = task_q 29 | self.result_q = result_q 30 | self.observation_space = observation_space 31 | self.action_space = action_space 32 | self.args = pms 33 | self.baseline = Baseline() 34 | self.distribution = DiagonalGaussian(pms.action_shape) 35 | self.init_logger() 36 | 37 | def init_network(self): 38 | """ 39 | [input] 40 | self.obs 41 | self.action_n 42 | self.advant 43 | self.old_dist_means_n 44 | self.old_dist_logstds_n 45 | [output] 46 | self.action_dist_means_n 47 | self.action_dist_logstds_n 48 | var_list 49 | """ 50 | config = tf.ConfigProto( 51 | device_count={'GPU': 0} 52 | ) 53 | self.session = tf.Session(config=config) 54 | self.net = NetworkContinousImage("network_continous_image") 55 | if pms.min_std is not None: 56 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std)) 57 | self.action_dist_stds_n = tf.exp(log_std_var) 58 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n) 59 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n) 60 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars) 61 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars, 62 | self.old_dist_info_vars) 63 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # 
Surrogate loss 64 | batch_size = tf.shape(self.net.obs)[0] 65 | batch_size_float = tf.cast(batch_size , tf.float32) 66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars)) 67 | ent = self.distribution.entropy(self.old_dist_info_vars) 68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf 69 | self.losses = [surr, kl, ent] 70 | var_list = self.net.var_list 71 | 72 | self.gf = GetFlat(var_list) # get theta from var_list 73 | self.gf.session = self.session 74 | self.sff = SetFromFlat(var_list) # set theta from var_List 75 | self.sff.session = self.session 76 | # get g 77 | self.pg = flatgrad(surr, var_list) 78 | # get A 79 | # KL divergence where first arg is fixed 80 | # replace old->tf.stop_gradient from previous kl 81 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float 82 | grads = tf.gradients(kl_firstfixed, var_list) 83 | self.flat_tangent = tf.placeholder(dtype, shape=[None]) 84 | shapes = map(var_shape, var_list) 85 | start = 0 86 | tangents = [] 87 | for shape in shapes: 88 | size = np.prod(shape) 89 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape) 90 | tangents.append(param) 91 | start += size 92 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] 93 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p 94 | self.session.run(tf.initialize_all_variables()) 95 | self.saver = tf.train.Saver(max_to_keep=5) 96 | 97 | def init_logger(self): 98 | head = ["factor", "rewards", "std"] 99 | self.logger = Logger(head) 100 | 101 | def run(self): 102 | self.init_network() 103 | while True: 104 | paths = self.task_q.get() 105 | if paths is None: 106 | # kill the learner 107 | self.task_q.task_done() 108 | break 109 | elif paths == 1: 110 | # just get params, no learn 111 | self.task_q.task_done() 112 | self.result_q.put(self.gf()) 113 | elif paths[0] == 2: 114 | # adjusting the max KL. 115 | self.args.max_kl = paths[1] 116 | if paths[2] == 1: 117 | print "saving checkpoint..." 
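# This branch handles a control tuple of the form (2, new_max_kl, save_flag, iteration):
# paths[1] has just replaced self.args.max_kl, and when paths[2] == 1 the current
# parameters are checkpointed below under "<environment_name>-<iteration>" (paths[3]).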
118 | self.save_model(pms.environment_name + "-" + str(paths[3])) 119 | self.task_q.task_done() 120 | else: 121 | stats , theta, thprev = self.learn(paths) 122 | self.sff(theta) 123 | self.task_q.task_done() 124 | self.result_q.put((stats, theta, thprev)) 125 | return 126 | 127 | def learn(self, paths, parallel=False, linear_search=False): 128 | start_time = time.time() 129 | sample_data = self.process_paths(paths) 130 | agent_infos = sample_data["agent_infos"] 131 | obs_all = sample_data["observations"] 132 | action_all = sample_data["actions"] 133 | advant_all = sample_data["advantages"] 134 | n_samples = len(obs_all) 135 | batch = int(1/pms.subsample_factor) 136 | batch_size = int(math.floor(n_samples * pms.subsample_factor)) 137 | accum_fullstep = 0.0 138 | for iteration in range(batch): 139 | print "batch: %d, batch_size: %d"%(iteration+1, batch_size) 140 | inds = np.random.choice(n_samples , batch_size , replace=False) 141 | obs_n = obs_all[inds] 142 | action_n = action_all[inds] 143 | advant_n = advant_all[inds] 144 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]]) 145 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]]) 146 | feed = {self.net.obs: obs_n , 147 | self.net.advant: advant_n , 148 | self.net.old_dist_means_n: action_dist_means_n , 149 | self.net.old_dist_logstds_n: action_dist_logstds_n , 150 | self.net.action_n: action_n 151 | } 152 | 153 | episoderewards = np.array([path["rewards"].sum() for path in paths]) 154 | thprev = self.gf() # get theta_old 155 | 156 | def fisher_vector_product(p): 157 | feed[self.flat_tangent] = p 158 | return self.session.run(self.fvp , feed) + pms.cg_damping * p 159 | 160 | g = self.session.run(self.pg , feed_dict=feed) 161 | stepdir = krylov.cg(fisher_vector_product , -g , cg_iters=pms.cg_iters) 162 | shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # theta 163 | # if shs<0, then the nan error would appear 164 | lm = np.sqrt(shs / pms.max_kl) 165 | fullstep = stepdir / lm 166 | neggdotstepdir = -g.dot(stepdir) 167 | 168 | def loss(th): 169 | self.sff(th) 170 | return self.session.run(self.losses , feed_dict=feed) 171 | 172 | if parallel is True: 173 | theta = linesearch_parallel(loss , thprev , fullstep , neggdotstepdir / lm) 174 | else: 175 | if linear_search: 176 | theta = linesearch(loss , thprev , fullstep , neggdotstepdir / lm) 177 | else: 178 | theta = thprev + fullstep 179 | if math.isnan(theta.mean()): 180 | print shs is None 181 | theta = thprev 182 | accum_fullstep += (theta - thprev) 183 | theta = thprev + accum_fullstep * pms.subsample_factor 184 | stats = {} 185 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"] 186 | stats["Average sum of rewards per episode"] = episoderewards.mean() 187 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0) 188 | self.logger.log_row([pms.subsample_factor, stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]]) 189 | return stats , theta , thprev 190 | 191 | def process_paths(self, paths): 192 | sum_episode_steps = 0 193 | for path in paths: 194 | sum_episode_steps += path['episode_steps'] 195 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline 196 | # path_baselines = np.append(self.baseline.predict(path) , 0) 197 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline 198 | # path["advantages"] = np.concatenate(path["rewards"]) + \ 199 | # pms.discount * path_baselines[1:] - \ 200 | # path_baselines[:-1] 201 | # path["returns"] = 
np.concatenate(discount(path["rewards"], pms.discount)) 202 | path_baselines = np.append(self.baseline.predict(path) , 0) 203 | deltas = np.concatenate(path["rewards"]) + \ 204 | pms.discount * path_baselines[1:] - \ 205 | path_baselines[:-1] 206 | path["advantages"] = discount( 207 | deltas , pms.discount * pms.gae_lambda) 208 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount)) 209 | observations = np.concatenate([path["observations"] for path in paths]) 210 | actions = np.concatenate([path["actions"] for path in paths]) 211 | rewards = np.concatenate([path["rewards"] for path in paths]) 212 | advantages = np.concatenate([path["advantages"] for path in paths]) 213 | env_infos = np.concatenate([path["env_infos"] for path in paths]) 214 | agent_infos = np.concatenate([path["agent_infos"] for path in paths]) 215 | if pms.center_adv: 216 | advantages -= np.mean(advantages) 217 | advantages /= (advantages.std() + 1e-8) 218 | 219 | # for some unknown reaseon, it can not be used 220 | # if pms.positive_adv: 221 | # advantages = (advantages - np.min(advantages)) + 1e-8 222 | 223 | # average_discounted_return = \ 224 | # np.mean([path["returns"][0] for path in paths]) 225 | # 226 | # undiscounted_returns = [sum(path["rewards"]) for path in paths] 227 | 228 | 229 | # ev = self.explained_variance_1d( 230 | # np.concatenate(baselines), 231 | # np.concatenate(returns) 232 | # ) 233 | samples_data = dict( 234 | observations=observations , 235 | actions=actions , 236 | rewards=rewards , 237 | advantages=advantages , 238 | env_infos=env_infos , 239 | agent_infos=agent_infos , 240 | paths=paths , 241 | sum_episode_steps=sum_episode_steps 242 | ) 243 | self.baseline.fit(paths) 244 | return samples_data 245 | 246 | def save_model(self , model_name): 247 | self.saver.save(self.session , "checkpoint/" + model_name + ".ckpt") 248 | 249 | def load_model(self , model_name): 250 | try: 251 | if model_name is not None: 252 | self.saver.restore(self.session , model_name) 253 | else: 254 | self.saver.restore(self.session , tf.train.latest_checkpoint(pms.checkpoint_dir)) 255 | except: 256 | print "load model %s fail" % (model_name) --------------------------------------------------------------------------------