├── agent
│   ├── __init__.py
│   ├── agent_parallel.py
│   ├── agent_continous_image.py
│   ├── agent_continous.py
│   ├── agent_continous_rnn.py
│   ├── agent_cotinous_single_thread.py
│   ├── AC_agent_continous.py
│   ├── agent_base.py
│   ├── agent_discrete.py
│   ├── agent_continous_parallel_storage.py
│   ├── agent_continous_single_process.py
│   └── agent_continous_image_parallel_image.py
├── logger
│   ├── __init__.py
│   └── logger.py
├── baseline
│   ├── __init__.py
│   ├── baseline_zeros.py
│   ├── baseline_average_reward.py
│   ├── baseline_lstsq.py
│   ├── baseline_tensorflow.py
│   └── baseline_tf_image.py
├── experiment
│   ├── __init__.py
│   ├── main.py
│   ├── main_lstm.py
│   ├── main_ac.py
│   ├── main_discrete.py
│   ├── main_image.py
│   ├── main_tf_parallel.py
│   ├── main_multi_thread.py
│   ├── main_image_multi_process.py
│   └── main_multi_process.py
├── network
│   ├── __init__.py
│   ├── network_descrete.py
│   ├── network_continous.py
│   ├── network_continous_image.py
│   └── network_continous_rnn.py
├── storage
│   ├── __init__.py
│   ├── storage.py
│   ├── storage_image.py
│   ├── storage_continous.py
│   ├── storage_continous_parallel.py
│   └── storage_continous_parallel_image.py
├── distribution
│   ├── __init__.py
│   ├── diagonal_category.py
│   └── diagonal_gaussian.py
├── .gitignore
├── run.py
├── dealImage.py
├── parameters.py~
├── README.md
├── environment.py
├── parameters.py
├── utils.py
└── krylov.py
/agent/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/logger/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baseline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiment/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/network/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/storage/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/distribution/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/network/network_descrete.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | logs_*
2 | *.pyc
3 | *.swp
4 | checkpoint/
5 | checkpoint_parallel/
6 | log/
7 | .idea/
8 | .idea
9 |
--------------------------------------------------------------------------------
/baseline/baseline_zeros.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Baseline(object):
5 | def fit(self, paths):
6 | self.temp = 0
7 |
8 | def predict(self, path):
9 | return np.zeros(len(path["rewards"]))
--------------------------------------------------------------------------------
/baseline/baseline_average_reward.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class BaselineAverageReward(object):
5 | def fit(self, paths):
6 | self.temp = 0
7 |
8 | def predict(self, path):
9 | rewards = path["rewards"]
10 | mean_rewards = np.mean(rewards)
11 | return mean_rewards
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | tasks = ["Copy-v0", "DuplicatedInput-v0", "Reverse-v0", "RepeatCopy-v0"]
4 |
5 | os.system("rm logs_*")
6 | os.system("k screen")
7 | os.system("screen -wipe")
8 |
9 |
10 | for t in tasks:
11 | os.system("screen -dm -S trpo_%s bash -c '. ~/.profile; . ~/.bashrc; CUDA_VISIBLE_DEVICES=[] python main.py %s 2>&1 | tee logs_%s ; bash'" % (t, t, t))
12 |
--------------------------------------------------------------------------------
/experiment/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | from environment import Environment
4 | from agent.agent_continous import TRPOAgent
5 | from parameters import pms
6 |
7 | if not os.path.isdir("./checkpoint"):
8 | os.makedirs("./checkpoint")
9 | if not os.path.isdir("./log"):
10 | os.makedirs("./log")
11 | env = Environment(gym.make(pms.environment_name))
12 | agent = TRPOAgent(env)
13 |
14 | if pms.train_flag:
15 | agent.learn()
16 | else:
17 | agent.test(pms.checkpoint_file)
18 | # env.monitor.close()
19 | # gym.upload(training_dir,
20 | # algorithm_id='trpo_ff')
21 |
--------------------------------------------------------------------------------
/experiment/main_lstm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | from environment import Environment
4 | from agent.agent_continous_rnn import TRPOAgent
5 | from parameters import pms
6 |
7 | if not os.path.isdir("./checkpoint"):
8 | os.makedirs("./checkpoint")
9 | if not os.path.isdir("./log"):
10 | os.makedirs("./log")
11 | env = Environment(gym.make(pms.environment_name))
12 | agent = TRPOAgent(env)
13 |
14 | if pms.train_flag:
15 | agent.learn()
16 | else:
17 | agent.test(pms.checkpoint_file)
18 | # env.monitor.close()
19 | # gym.upload(training_dir,
20 | # algorithm_id='trpo_ff')
21 |
--------------------------------------------------------------------------------
/logger/logger.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import time
3 |
4 | class Logger(object):
5 | def __init__(self, head):
6 | self.head = head
7 | self.file_name = self.get_file_name()
8 | self.csvfile = file("log/"+self.file_name , 'wb')
9 | self.csv_writer = csv.writer(self.csvfile)
10 | self.log_row(head)
11 |
12 | def log_row(self, data):
13 | self.csv_writer.writerow(data)
14 |
15 | def get_file_name(self):
16 | file_time = time.strftime("%Y-%m-%d-%H:%M:%S",time.localtime(time.time()))
17 | file_name = file_time+".csv"
18 | return file_name
19 |
20 | def __del__(self):
21 | self.csvfile.close()
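# Example usage (illustrative only):
#   logger = Logger(["iteration", "reward"])  # writes the header row to log/<timestamp>.csv
#   logger.log_row([0, -1234.5])              # appends one data row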
--------------------------------------------------------------------------------
/experiment/main_ac.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import gym
4 | from gym import envs, scoreboard
5 | from gym.spaces import Discrete, Box
6 | import tempfile
7 | import sys
8 | from environment import Environment
9 | from agent.AC_agent_continous import ACAgent
10 | from parameters import pms
11 |
12 | if not os.path.isdir("./checkpoint"):
13 | os.makedirs("./checkpoint")
14 | if not os.path.isdir("./log"):
15 | os.makedirs("./log")
16 | env = Environment(gym.make(pms.environment_name))
17 | agent = ACAgent(env)
18 |
19 | if pms.train_flag:
20 | agent.learn()
21 | else:
22 | agent.test(pms.checkpoint_file)
23 | # env.monitor.close()
24 | # gym.upload(training_dir,
25 | # algorithm_id='trpo_ff')
26 |
--------------------------------------------------------------------------------
/experiment/main_discrete.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import gym
4 | from gym import envs, scoreboard
5 | from gym.spaces import Discrete, Box
6 | import tempfile
7 | import sys
8 | from environment import Environment
9 | from agent.agent_discrete import TRPOAgent
10 | from parameters import pms
11 |
12 | if not os.path.isdir("./checkpoint"):
13 | os.makedirs("./checkpoint")
14 | if not os.path.isdir("./log"):
15 | os.makedirs("./log")
16 | env = Environment(gym.make(pms.environment_name))
17 | agent = TRPOAgent(env)
18 |
19 | if pms.train_flag:
20 | agent.learn()
21 | else:
22 | agent.test(pms.checkpoint_file)
23 | # env.monitor.close()
24 | # gym.upload(training_dir,
25 | # algorithm_id='trpo_ff')
26 |
--------------------------------------------------------------------------------
/experiment/main_image.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import gym
4 | from gym import envs, scoreboard
5 | from gym.spaces import Discrete, Box
6 | import tempfile
7 | import sys
8 | from environment import Environment
9 | from agent.agent_continous_image import TRPOAgent
10 | from parameters import pms
11 |
12 | if not os.path.isdir("./checkpoint"):
13 | os.makedirs("./checkpoint")
14 | if not os.path.isdir("./log"):
15 | os.makedirs("./log")
16 | env = Environment(gym.make(pms.environment_name))
17 | agent = TRPOAgent(env)
18 |
19 | if pms.train_flag:
20 | agent.learn()
21 | else:
22 | agent.test(pms.checkpoint_file)
23 | # env.monitor.close()
24 | # gym.upload(training_dir,
25 | # algorithm_id='trpo_ff')
26 |
--------------------------------------------------------------------------------
/dealImage.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Qt4Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 |
7 | def vis_square(data, padsize=1, padval=0):
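# vis_square: takes an array of shape (n, height, width) or (n, height, width, channels),
# rescales it to [0, 1], and tiles the n images into a roughly square grid
# (ceil(sqrt(n)) per side) separated by `padsize` pixels of value `padval`.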
8 | data -= data.min()
9 | data /= data.max()
10 |
11 |
12 | n = int(np.ceil(np.sqrt(data.shape[0])))
13 | padding = ((0, n ** 2 - data.shape[0]), (0, padsize), (0, padsize)) + ((0, 0),) * (data.ndim - 3)
14 | data = np.pad(data, padding, mode='constant', constant_values=(padval, padval))
15 |
16 |
17 | data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1)))
18 | data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
19 | print data.shape
20 | plt.imshow(data)
21 | plt.show()
--------------------------------------------------------------------------------
/parameters.py~:
--------------------------------------------------------------------------------
1 | # for image
2 | dims = (100, 100)
3 | obs_height = 100
4 | obs_width = 100
5 | obs_channel = 1
6 | history_number = 2
7 |
8 | # for training
9 | jobs = 4
15 | max_iter_number = 10000
16 | paths_number = 1
17 | max_path_length = 199
18 | batch_size = max_path_length
19 | max_kl = 0.01
20 | gae_lambda = 1.0
21 | subsample_factor = 0.8
22 | cg_damping = 0.1
23 | discount = 0.99
24 | cg_iters = 10
25 | deviation = 0.1
26 | render = True
27 | train_flag = False
28 | iter_num_per_train = 1
29 | checkpoint_file = "checkpoint/iter240865.ckpt"
30 | record_movie = False
31 | upload_to_gym = False
32 |
33 | # for environment
34 |
35 | environment_name = "Pendulum-v0"
36 |
37 | # for continous action
38 | min_std = 1e-6
39 | center_adv = True
40 | positive_adv = False
41 | use_std_network = False
42 | std = 1.1
43 | obs_shape = 3
44 | action_shape = 1
45 | min_a = -2.0
46 | max_a = 2.0
47 |
48 |
--------------------------------------------------------------------------------
/distribution/diagonal_category.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import tensorflow as tf
3 | import numpy as np
4 |
5 |
6 | class DiagonalCategory(object):
7 | def __init__(self, dim=0):
8 | self._dim = dim
9 |
10 | @property
11 | def dim(self):
12 | return self._dim
13 |
14 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
15 | return tf.reduce_mean(old_dist_info_vars * tf.log((old_dist_info_vars + 1e-8) / (new_dist_info_vars + 1e-8)))
16 |
17 | def likelihood_ratio_sym(self, x_var, new_dist_info_vars, old_dist_info_vars):
18 | """
19 | \frac{\pi_\theta}{\pi_{old}}
20 | :param x_var: actions
21 | :param new_dist_info_vars: new action probabilities, shape (N, dim)
22 | :param old_dist_info_vars: old action probabilities, shape (N, dim)
23 | :return: per-sample probability ratio p_new(a) / p_old(a)
24 | """
25 | N = tf.shape(x_var)[0]
26 | p_n = slice_2d(new_dist_info_vars, tf.range(0, N), x_var)
27 | oldp_n = slice_2d(old_dist_info_vars, tf.range(0, N), x_var)
28 | return p_n / oldp_n
29 |
30 | def entropy(self, dist_infos):
31 | return tf.reduce_mean(-dist_infos * tf.log(dist_infos + 1e-8))
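# Note: dist_info_vars here are per-action probabilities of shape (N, dim);
# slice_2d (imported from utils, not shown here) presumably selects the probability
# of the chosen action x_var[i] from row i, giving the per-sample ratio p_new(a) / p_old(a).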
--------------------------------------------------------------------------------
/baseline/baseline_lstsq.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | class Baseline(object):
3 | def __init__(self, reg_coeff=1e-5):
4 | self._coeffs = None
5 | self._reg_coeff = reg_coeff
6 |
7 | def get_param_values(self, **tags):
8 | return self._coeffs
9 |
10 | def set_param_values(self, val, **tags):
11 | self._coeffs = val
12 |
13 | def _features(self, path):
14 | o = path["observations"].astype('float32')
15 | o = o.reshape(o.shape[0], -1)
16 | l = len(path["rewards"])
17 | al = np.arange(l).reshape(-1 , 1) / 100.0
18 | return np.concatenate([o, o ** 2, al, al ** 2, np.ones((l, 1))], axis=1)
19 |
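# fit() solves the regularized normal equations
#   (X^T X + reg_coeff * I) coeffs = X^T y
# where X is the feature matrix built by _features and y are the empirical path returns.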
20 | def fit(self, paths):
21 | featmat = np.concatenate([self._features(path) for path in paths])
22 | returns = np.concatenate([path["returns"] for path in paths])
23 | self._coeffs = np.linalg.lstsq(
24 | featmat.T.dot(featmat) + self._reg_coeff * np.identity(featmat.shape[1]),
25 | featmat.T.dot(returns)
26 | )[0]
27 |
28 | def predict(self, path):
29 | if self._coeffs is None:
30 | return np.zeros(len(path["rewards"]))
31 | return self._features(path).dot(self._coeffs)
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Note: the algorithm has recently been moved to https://github.com/jjkke88/RL_toolbox
2 |
3 | # trpo
4 | trust region policy optimization based on gym and tensorflow
5 |
6 | There are three versions of TRPO: one for discrete action spaces such as MountainCar, one for discrete action space tasks with images as input such as Atari games, and one for continuous action spaces such as Pendulum.
7 | The environment is based on OpenAI gym.
8 | Part of the code refers to rllab.
9 |
10 | # dependency
11 |
12 | - tensorflow 0.10
13 | - prettytensor
14 | - latest openai gym
15 |
16 |
17 | # code structure
18 |
19 | - baseline: estimation of the baseline (value) function
20 | - checkpoint: folder that stores model files; it must not be deleted, or errors will occur
21 | - distribution: distribution base classes used to compute probabilities under distributions, for example Gaussian
22 | - logger: contains a Logger class that logs data to a .csv file
23 | - agent: agents for discrete and continuous action spaces
24 | - log: stores log files
25 | - experiment: contains the different main files; running a main file starts training or testing (see the sketch below)
26 | - environment.py: environment wrapper
27 | - krylov.py: implementation of some math methods: conjugate gradient descent, calculating the Hessian matrix
28 | - parameters.py: config file
29 | - utils.py: implementation of some basic functions: GetFlat, SetFromFlat, line search
30 |
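The pieces are wired together by the scripts in experiment/; the sketch below mirrors experiment/main.py and uses only names that appear in this repository:

```python
import gym
from environment import Environment
from agent.agent_continous import TRPOAgent
from parameters import pms

env = Environment(gym.make(pms.environment_name))  # wrap the gym environment (environment.py)
agent = TRPOAgent(env)                             # continuous-action TRPO agent
if pms.train_flag:
    agent.learn()                                  # train and save checkpoints
else:
    agent.test(pms.checkpoint_file)                # roll out a saved checkpoint
```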
31 |
32 | # recent work
33 |
34 | - implemented multi-thread TRPO; run python main_multi_thread.py to try it
35 | - implemented TensorFlow distributed TRPO
36 | - implemented multi-process TRPO
37 |
38 |
39 | # future work
40 |
41 | - complete TRPO with images as input
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/baseline/baseline_tensorflow.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import prettytensor as pt
4 |
5 | class Baseline(object):
6 | coeffs = None
7 |
8 | def __init__(self , session=None):
9 | self.net = None
10 | self.session = session
11 |
12 | def create_net(self , shape):
13 | print(shape)
14 | self.x = tf.placeholder(tf.float32 , shape=[None , shape] , name="x")
15 | self.y = tf.placeholder(tf.float32 , shape=[None] , name="y")
16 | self.net = (pt.wrap(self.x).
17 | fully_connected(64 , activation_fn=tf.nn.tanh).
18 | fully_connected(1))
19 | self.net = tf.reshape(self.net , (-1 ,))
20 | self.l2 = (self.net - self.y) * (self.net - self.y)
21 | self.train = tf.train.AdamOptimizer().minimize(self.l2)
22 | self.session.run(tf.initialize_all_variables())
23 |
24 | def _features(self, path):
25 | o = path["observations"].astype('float32')
26 | o = o.reshape(o.shape[0] , -1)
27 | l = len(path["rewards"])
28 | al = np.arange(l).reshape(-1 , 1) / 100.0
29 | return np.concatenate([o , o ** 2 , al , al ** 2 , np.ones((l , 1))] , axis=1)
30 |
31 | def fit(self, paths):
32 | featmat = np.concatenate([self._features(path) for path in paths])
33 | if self.net is None:
34 | self.create_net(featmat.shape[1])
35 | returns = np.concatenate([path["returns"] for path in paths])
36 | for _ in range(10):
37 | loss, _ = self.session.run([self.l2, self.train], {self.x: featmat , self.y: returns})
38 |
39 | def predict(self, path):
40 | if self.net is None:
41 | return np.zeros(len(path["rewards"]))
42 | else:
43 | ret = self.session.run(self.net , {self.x: self._features(path)})
44 | return np.reshape(ret , (ret.shape[0] ,))
45 |
--------------------------------------------------------------------------------
/baseline/baseline_tf_image.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import prettytensor as pt
4 | from parameters import pms
5 |
6 | class BaselineTfImage(object):
7 | coeffs = None
8 |
9 | def __init__(self, session):
10 | self.net = None
11 | self.session = session
12 |
13 | def create_net(self, shape):
14 | self.x = tf.placeholder(tf.float32, shape=[None, shape[1], shape[2], shape[3]], name="x")
15 | self.y = tf.placeholder(tf.float32, shape=[None], name="y")
16 | self.net = (pt.wrap(self.x).
17 | conv2d(1, 16, stride=2, batch_normalize=True).
18 | conv2d(1, 16, stride=2, batch_normalize=True).
19 | flatten().
20 | fully_connected(32, activation_fn=tf.nn.relu).
21 | fully_connected(32, activation_fn=tf.nn.relu).
22 | fully_connected(1))
23 | self.net = tf.reshape(self.net, (-1, ))
24 | l2 = (self.net - self.y) * (self.net - self.y)
25 | self.train = tf.train.AdamOptimizer().minimize(l2)
26 | self.session.run(tf.initialize_all_variables())
27 |
28 | def _features(self, path):
29 | ret = path["observations"].astype('float32')
30 | return ret
31 |
32 | def fit(self, paths):
33 | featmat = np.concatenate([self._features(path) for path in paths])
34 | if self.net is None:
35 | self.create_net(featmat.shape)
36 | returns = np.concatenate([path["returns"] for path in paths])
37 | for _ in range(100):
38 | self.session.run(self.train, {self.x: featmat, self.y: returns})
39 |
40 | def predict(self, path):
41 | if self.net is None:
42 | return np.zeros(len(path["rewards"]))
43 | else:
44 | ret = self.session.run(self.net, {self.x: self._features(path)})
45 | return np.reshape(ret, (ret.shape[0], ))
--------------------------------------------------------------------------------
/network/network_continous.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | import prettytensor as pt
5 | from parameters import pms
6 |
7 | seed = 1
8 | np.random.seed(seed)
9 | tf.set_random_seed(seed)
10 |
11 | class NetworkContinous(object):
12 | def __init__(self, scope):
13 | with tf.variable_scope("%s_shared" % scope):
14 | self.obs = obs = tf.placeholder(
15 | tf.float32, shape=[None, pms.obs_shape], name="%s_obs"%scope)
16 | self.action_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape], name="%s_action"%scope)
17 | self.advant = tf.placeholder(tf.float32, shape=[None], name="%s_advant"%scope)
18 | self.old_dist_means_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape],
19 | name="%s_oldaction_dist_means"%scope)
20 | self.old_dist_logstds_n = tf.placeholder(tf.float32, shape=[None, pms.action_shape],
21 | name="%s_oldaction_dist_logstds"%scope)
22 | self.action_dist_means_n = (pt.wrap(self.obs).
23 | fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0),
24 | name="%s_fc1"%scope).
25 | fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0),
26 | name="%s_fc2"%scope).
27 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05), bias_init=tf.constant_initializer(0),
28 | name="%s_fc3"%scope))
29 |
30 | self.N = tf.shape(obs)[0]
31 | Nf = tf.cast(self.N, tf.float32)
32 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), name="%spolicy_logstd"%scope)
33 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param,
34 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1)))
35 | self.var_list = [v for v in tf.trainable_variables() if v.name.startswith(scope)]
36 |
37 | def get_action_dist_means_n(self, session, obs):
38 | return session.run(self.action_dist_means_n,
39 | {self.obs: obs})
40 |
41 |
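# The policy is a diagonal Gaussian: actions ~ N(mean(obs), exp(logstd)^2), where mean(obs)
# comes from the 64-64 MLP above and logstd (action_dist_logstd_param) is a free parameter
# shared across all states rather than a function of the observation.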
--------------------------------------------------------------------------------
/environment.py:
--------------------------------------------------------------------------------
1 | """
2 | `Environment` acts as a wrapper on
3 | any gym environment. It allows converting some action spaces and observation spaces to others.
4 | """
5 |
6 | import numpy as np
7 | from gym.spaces import Discrete, Box, Tuple
8 | from gym import Env
9 | import cv2
10 | from parameters import pms
11 | import gym
12 | from gym.monitoring import monitor
13 |
14 | def convert_gym_space(space):
15 | if isinstance(space, gym.spaces.Box):
16 | return Box(low=space.low, high=space.high)
17 | elif isinstance(space, gym.spaces.Discrete):
18 | return Discrete(n=space.n)
19 | else:
20 | raise NotImplementedError
21 |
22 | class CappedCubicVideoSchedule(object):
23 | def __call__(self, count):
24 | return monitor.capped_cubic_video_schedule(count)
25 |
26 | class NoVideoSchedule(object):
27 | def __call__(self , count):
28 | return False
29 |
30 | class Environment(Env):
31 |
32 | def __init__(self, env, type="origin"):
33 | self.env = env
34 | self.type = type
35 | self.video_schedule = None
36 | if not pms.record_movie:
37 | self.video_schedule = NoVideoSchedule()
38 | else:
39 | if self.video_schedule is None:
40 | self.video_schedule = CappedCubicVideoSchedule()
41 | self.env.monitor.start("log/trpo" ,self.video_schedule, force=True)
42 | self.monitoring = True
43 |
44 | def step(self, action, **kwargs):
45 | self._observation, reward, done, info = self.env.step(action)
46 | self._observation = np.clip(self._observation, self.env.observation_space.low, self.env.observation_space.high)
47 | return self.observation, reward, done, info
48 |
49 | def reset(self, **kwargs):
50 | self._observation = self.env.reset()
51 | return self.observation
52 |
53 | def render(self, mode="human", close=False):
54 | return self.env.render(mode)
55 |
56 | @property
57 | def observation(self):
58 | if self.type == "origin":
59 | return self._observation
60 | elif self.type == "gray_image":
61 | return cv2.resize(cv2.cvtColor(self._observation, cv2.COLOR_RGB2GRAY)/255., pms.dims)
62 |
63 | @property
64 | def action_space(self):
65 | return convert_gym_space(self.env.action_space)
66 |
67 |
68 | @property
69 | def observation_space(self):
70 | if self.type == "origin":
71 | return convert_gym_space(self.env.observation_space)
72 | else:
73 | return pms.dims
74 |
75 | # @property
76 | # def obs_dims(self):
77 | # if self.type == "origin":
78 | # return self.env.observation_space.shape
79 | # else:
80 | # return pms.dims
--------------------------------------------------------------------------------
/network/network_continous_image.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | import prettytensor as pt
5 | from parameters import pms
6 |
7 | seed = 1
8 | np.random.seed(seed)
9 | tf.set_random_seed(seed)
10 |
11 | class NetworkContinousImage(object):
12 | def __init__(self, scope):
13 | with tf.variable_scope("%s_shared" % scope):
14 | self.obs = obs = tf.placeholder(
15 | dtype, shape=[None, pms.obs_height, pms.obs_width, pms.obs_channel], name="%s_obs"%scope)
16 | self.action_n = tf.placeholder(dtype, shape=[None, pms.action_shape], name="%s_action"%scope)
17 | self.advant = tf.placeholder(dtype, shape=[None], name="%s_advant"%scope)
18 | self.old_dist_means_n = tf.placeholder(dtype, shape=[None, pms.action_shape],
19 | name="%s_oldaction_dist_means"%scope)
20 | self.old_dist_logstds_n = tf.placeholder(dtype, shape=[None, pms.action_shape],
21 | name="%s_oldaction_dist_logstds"%scope)
22 | self.action_dist_means_n = (pt.wrap(self.obs).
23 | conv2d(8 , 32 , stride=4 , batch_normalize=True).
24 | conv2d(4 , 64 , stride=2 , batch_normalize=True).
25 | conv2d(3 , 64 , stride=1 , batch_normalize=True).
26 | flatten().
27 | fully_connected(128, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05),
28 | name="%s_fc1"%scope).
29 | fully_connected(128, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05),
30 | name="%s_fc2"%scope).
31 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05),
32 | name="%s_fc3"%scope))
33 |
34 | self.N = tf.shape(obs)[0]
35 | Nf = tf.cast(self.N, dtype)
36 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), name="%spolicy_logstd"%scope)
37 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param,
38 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1)))
39 | self.var_list = [v for v in tf.trainable_variables() if v.name.startswith(scope)]
40 |
41 | def get_action_dist_means_n(self, session, obs):
42 | return session.run(self.action_dist_means_n,
43 | {self.obs: obs})
44 |
45 |
--------------------------------------------------------------------------------
/parameters.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | flags = tf.app.flags
4 | flags.DEFINE_integer('obs_height', 100, 'image height')
5 | flags.DEFINE_integer('obs_width', 100, 'image width')
6 | flags.DEFINE_integer('obs_channel', 3, 'image channel')
7 | flags.DEFINE_integer('history_number', 2, 'image history number')
8 | flags.DEFINE_integer('jobs', 4, 'thread or process number')
9 | flags.DEFINE_integer('max_iter_number', 400, 'control the max iteration number for training')
10 | flags.DEFINE_integer('paths_number', 10, 'number of paths in each rollout')
11 | flags.DEFINE_integer('max_path_length',200, 'timesteps in each path')
12 | flags.DEFINE_integer('batch_size', 100, 'batch size for training')
13 | flags.DEFINE_float('max_kl', 0.01, 'the largest KL divergence, \delta in the paper')
14 | flags.DEFINE_float('gae_lambda', 1.0, 'fix number')
15 | flags.DEFINE_float('subsample_factor', 0.5, 'ratio of the samples used in training process')
16 | flags.DEFINE_float('cg_damping', 0.001, 'conjugate gradient damping')
17 | flags.DEFINE_float('discount', 0.99, 'discount')
18 | flags.DEFINE_integer('cg_iters', 20, 'iteration number in conjugate gradient')
19 | flags.DEFINE_float('deviation', 0.1, 'fixed')
20 | flags.DEFINE_boolean('render', False, 'whether to render image')
21 | flags.DEFINE_boolean('train_flag', True, 'true for train and False for test')
22 | flags.DEFINE_integer('iter_num_per_train', 1, 'iteration number in each training process')
23 | flags.DEFINE_string('checkpoint_file', '', 'checkpoint file path, if empty then will load the latest one')
24 | flags.DEFINE_integer('save_model_times', 1, 'iteration number to save model, if 1, then model would be saved in each iteration')
25 | flags.DEFINE_boolean('record_movie', False, 'whether record the video in gym')
26 | flags.DEFINE_boolean('upload_to_gym', False, 'whether upload the result to gym')
27 | flags.DEFINE_string('checkpoint_dir', 'checkpoint/', 'checkpoint save and load path, for parallel, it should be checkpoint_parallel')
28 | flags.DEFINE_string('environment_name', 'Pendulum-v0', 'environment name')
29 | flags.DEFINE_float('min_std', 0.2, 'the smallest std')
30 | flags.DEFINE_boolean('center_adv', True, 'whether center advantage, fixed')
31 | flags.DEFINE_boolean('positive_adv', False, 'whether positive advantage, fixed')
32 | flags.DEFINE_boolean('use_std_network', False, 'whether use network to train std, it is not supported, fixed')
33 | flags.DEFINE_float('std', 1.1, 'if the std is set to constant, then this value will be used')
34 | flags.DEFINE_integer('obs_shape', 3, 'dimensions of observation')
35 | flags.DEFINE_integer('action_shape', 1, 'dimensions of action')
36 | flags.DEFINE_float('min_a', -2.0, 'the smallest action value')
37 | flags.DEFINE_float('max_a', 2.0, 'the largest action value')
38 | flags.DEFINE_string("decay_method", "adaptive", "decay_method:adaptive, linear, exponential") # adaptive, linear, exponential
39 | flags.DEFINE_integer("timestep_adapt", 600, "timestep to adapt kl")
40 | flags.DEFINE_float("kl_adapt", 0.0005, "kl adapt rate")
41 | pms = flags.FLAGS
42 | pms.checkpoint_file = None
43 | pms.batch_size = int(pms.subsample_factor * pms.paths_number * pms.max_path_length)
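# With the defaults above: batch_size = int(0.5 * 10 * 200) = 1000 samples per policy update.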
--------------------------------------------------------------------------------
/experiment/main_tf_parallel.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import tensorflow as tf
3 | from agent.agent_parallel import TRPOAgentParallel
4 | from parameters import pms
5 | import gym
6 | import numpy as np
7 | from environment import Environment
8 |
9 | # Flags for defining the tf.train.ClusterSpec
10 | tf.app.flags.DEFINE_string("ps_hosts", "166.111.138.113:2223",
11 | "Comma-separated list of hostname:port pairs")
12 | tf.app.flags.DEFINE_string("worker_hosts", "166.111.138.137:2226,166.111.138.137:2227,166.111.138.137:2228",
13 | "Comma-separated list of hostname:port pairs")
14 |
15 | # Flags for defining the tf.train.Server
16 | tf.app.flags.DEFINE_string("job_name", "worker", "ps or worker")
17 | tf.app.flags.DEFINE_integer("task_index",2, "Index of task within the job")
18 |
19 | FLAGS = tf.app.flags.FLAGS
20 |
21 | seed = 1
22 | np.random.seed(seed)
23 | tf.set_random_seed(seed)
24 |
25 | def main(_):
26 | ps_hosts = FLAGS.ps_hosts.split(',')
27 | worker_hosts = FLAGS.worker_hosts.split(',')
28 |
29 | # Create a cluster from the parameter server and worker hosts.
30 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
31 |
32 | # Create and start a server for the local task.
33 | # Create and start the server.
34 | # task_index specifies the index of this task within the job.
35 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0)
36 | server = tf.train.Server(cluster,
37 | job_name=FLAGS.job_name,
38 | task_index=FLAGS.task_index,
39 | config=tf.ConfigProto(gpu_options=gpu_options))
40 |
41 | if FLAGS.job_name == "ps":
42 | server.join()
43 | elif FLAGS.job_name == "worker":
44 | # Assign the ops to each local worker.
45 | env = Environment(gym.make(pms.environment_name))
46 | with tf.device(tf.train.replica_device_setter(
47 | worker_device="/job:worker/task:%d" % (FLAGS.task_index),
48 | cluster=cluster)):
49 | agent = TRPOAgentParallel(env)
50 | saver = tf.train.Saver(max_to_keep=10)
51 | init_op = tf.initialize_all_variables()
52 | summary_op = tf.merge_all_summaries()
53 | # Create a "supervisor", which oversees the training process.
54 | sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
55 | logdir="./checkpoint_parallel",
56 | init_op=init_op,
57 | global_step=agent.global_step,
58 | saver=saver,
59 | summary_op=None,
60 | save_model_secs=60)
61 |
62 | # The supervisor takes care of session initialization, restoring from
63 | # a checkpoint, and closing when done or an error occurs.
64 | with sv.managed_session(server.target) as sess:
65 | agent.session = sess
66 | agent.gf.session = sess
67 | agent.sff.session =sess
68 | agent.supervisor = sv
69 |
70 | if pms.train_flag:
71 | agent.learn()
72 | elif FLAGS.task_index == 0:
73 | agent.test(pms.checkpoint_file)
74 | # Ask for all the services to stop.
75 | sv.stop()
76 |
77 | if __name__ == "__main__":
78 | tf.app.run()
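# Typical launch, one process per host:port listed in the ps_hosts/worker_hosts flags above
# (the default addresses are only examples; adjust them to your cluster):
#   python main_tf_parallel.py --job_name=ps --task_index=0
#   python main_tf_parallel.py --job_name=worker --task_index=0
#   python main_tf_parallel.py --job_name=worker --task_index=1
#   python main_tf_parallel.py --job_name=worker --task_index=2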
--------------------------------------------------------------------------------
/experiment/main_multi_thread.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import tempfile
4 | import sys
5 | from utils import *
6 | import numpy as np
7 | import tensorflow as tf
8 | import signal
9 | from parameters import pms
10 | from logger.logger import Logger
11 | from agent.agent_cotinous_single_thread import TRPOAgentContinousSingleThread
12 | from network.network_continous import NetworkContinous
13 |
14 | seed = 1
15 | np.random.seed(seed)
16 | tf.set_random_seed(seed)
17 |
18 | training_dir = tempfile.mkdtemp()
19 | logging.getLogger().setLevel(logging.DEBUG)
20 |
21 |
22 | class MasterContinous(object):
23 | def __init__(self):
24 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0)
25 | self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
26 |
27 | self.network = NetworkContinous("master")
28 | self.gf = GetFlat(self.network.var_list) # get theta from var_list
29 | self.gf.session = self.session
30 | self.sff = SetFromFlat(self.network.var_list) # set theta from var_List
31 | self.sff.session = self.session
32 | self.session.run(tf.initialize_all_variables())
33 | self.saver = tf.train.Saver(max_to_keep=10)
34 |
35 | self.init_jobs()
36 | if pms.train_flag:
37 | self.init_logger()
38 |
39 | def init_jobs(self):
40 | self.jobs = []
41 | for thread_id in xrange(pms.jobs):
42 | job = TRPOAgentContinousSingleThread(thread_id, self)
43 | self.jobs.append(job)
44 |
45 | def init_logger(self):
46 | head = ["average_episode_std", "sum steps episode number" "total number of episodes",
47 | "Average sum of rewards per episode",
48 | "KL between old and new distribution", "Surrogate loss", "Surrogate loss prev", "ds", "entropy",
49 | "mean_advant"]
50 | self.logger = Logger(head)
51 |
52 | def get_parameters(self):
53 | return self.gf()
54 |
55 | def apply_gradient(self, gradient):
56 | theta_prev = self.gf()
57 | theta_after = theta_prev + gradient
58 | self.sff(theta_after)
59 |
60 | def train(self):
61 | signal.signal(signal.SIGINT, signal_handler)
62 | for job in self.jobs:
63 | job.start()
64 | for job in self.jobs:
65 | job.join()
66 |
67 | def test(self):
68 | self.load_model(pms.checkpoint_file)
69 | self.jobs[0].test()
70 |
71 | def save_model(self, model_name):
72 | self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt")
73 |
74 | def load_model(self , model_name):
75 | try:
76 | if model_name is not None:
77 | self.saver.restore(self.session , model_name)
78 | else:
79 | self.saver.restore(self.session , tf.train.latest_checkpoint("checkpoint/"))
80 | except:
81 | print "load model %s fail" % (model_name)
82 |
83 | def signal_handler(signum, frame):
84 | sys.exit(0)
85 |
86 |
87 | if not os.path.isdir("./checkpoint"):
88 | os.makedirs("./checkpoint")
89 | if not os.path.isdir("./log"):
90 | os.makedirs("./log")
91 | master = MasterContinous()
92 | if pms.train_flag:
93 | master.train()
94 | else:
95 | master.test()
96 | # env.monitor.close()
97 | # gym.upload(training_dir,
98 | # algorithm_id='trpo_ff')
99 |
--------------------------------------------------------------------------------
/storage/storage.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | from parameters import pms
3 |
4 |
5 | class Storage(object):
6 | def __init__(self, agent, env, baseline):
7 | self.paths = []
8 | self.env = env
9 | self.agent = agent
10 | self.baseline = baseline
11 |
12 | def get_single_path(self):
13 | self.obs, actions, rewards, action_dists = [], [], [], []
14 | ob = self.env.reset()
15 | episode_steps = 0
16 | for _ in xrange(pms.max_path_length):
17 | action, action_dist, ob = self.agent.act(ob)
18 | self.obs.append(ob)
19 | actions.append(action)
20 | action_dists.append(action_dist)
21 | res = self.env.step(action) # res
22 | if pms.render:
23 | self.env.render()
24 | ob = res[0]
25 | rewards.append([res[1]])
26 | episode_steps += 1
27 | if res[2]:
28 | break
29 | path = dict(
30 | observations=np.concatenate(np.expand_dims(self.obs, 0)),
31 | agent_infos=np.concatenate(action_dists),
32 | rewards=np.array(rewards),
33 | actions=np.array(actions),
34 | episode_steps=episode_steps
35 | )
36 | self.paths.append(path)
37 |
38 | def get_paths(self):
39 | paths = self.paths
40 | self.paths = []
41 | return paths
42 |
43 | def process_paths(self, paths):
44 | sum_episode_steps = 0
45 | for path in paths:
46 | sum_episode_steps += path['episode_steps']
47 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline
48 | # path_baselines = np.append(self.baseline.predict(path) , 0)
49 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
50 | # path["advantages"] = np.concatenate(path["rewards"]) + \
51 | # pms.discount * path_baselines[1:] - \
52 | # path_baselines[:-1]
53 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount))
54 | path_baselines = np.append(self.baseline.predict(path) , 0)
55 | deltas = np.concatenate(path["rewards"]) + \
56 | pms.discount * path_baselines[1:] - \
57 | path_baselines[:-1]
58 | path["advantages"] = discount(
59 | deltas , pms.discount * pms.gae_lambda)
60 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount))
61 |
62 | # Updating policy.
63 | action_dist_n = np.concatenate([path["agent_infos"] for path in paths])
64 | obs_n = np.concatenate([path["observations"] for path in paths])
65 | action_n = np.concatenate([path["actions"] for path in paths])
66 | rewards = np.concatenate([path["rewards"] for path in paths])
67 | advantages = np.concatenate([path["advantages"] for path in paths])
68 |
69 | if pms.center_adv:
70 | advantages = (advantages - np.mean(advantages)) / (advantages.std() + 1e-8)
71 |
72 | self.baseline.fit(paths)
73 |
74 | samples_data = dict(
75 | observations=obs_n,
76 | actions=action_n,
77 | rewards=rewards,
78 | advantages=advantages,
79 | agent_infos=action_dist_n,
80 | paths=paths,
81 | sum_episode_steps=sum_episode_steps
82 | )
83 | return samples_data
84 |
--------------------------------------------------------------------------------
/distribution/diagonal_gaussian.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 |
5 | class DiagonalGaussian(object):
6 | def __init__(self, dim):
7 | self._dim = dim
8 |
9 | @property
10 | def dim(self):
11 | return self._dim
12 |
13 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
14 | old_means = old_dist_info_vars["mean"]
15 | old_log_stds = old_dist_info_vars["log_std"]
16 | new_means = new_dist_info_vars["mean"]
17 | new_log_stds = new_dist_info_vars["log_std"]
18 | """
19 | Compute the KL divergence of two multivariate Gaussian distribution with
20 | diagonal covariance matrices
21 | """
22 | old_std = tf.exp(old_log_stds)
23 | new_std = tf.exp(new_log_stds)
24 | # means: (N*A)
25 | # std: (N*A)
26 | # formula:
27 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
28 | # ln(\sigma_2/\sigma_1)
29 | numerator = tf.square(old_means - new_means) + \
30 | tf.square(old_std) - tf.square(new_std)
31 | denominator = 2 * tf.square(new_std) + 1e-8
32 | return tf.reduce_sum(
33 | numerator / denominator + new_log_stds - old_log_stds, -1)
34 |
35 | def likelihood_ratio_sym(self, x_var, new_dist_info_vars, old_dist_info_vars):
36 | """
37 | \frac{\pi_\theta}{\pi_{old}}
38 | :param x_var: actions
39 | :param new_dist_info_vars: means + logstds
40 | :param old_dist_info_vars: old_means + old_logstds
41 | :return:
42 | """
43 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars)
44 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars)
45 | return tf.exp(logli_new - logli_old)
46 |
47 | def log_likelihood_sym(self, x_var, dist_info_vars):
48 | """
49 | \frac{1}{(2\pi)^{\frac{n}{2}}\sigma_\theta}exp(-(\frac{a-\mu_{\pi_\theta}}{2\sigma_\theta})^2)
50 | :param x_var:
51 | :param dist_info_vars:
52 | :return:
53 | """
54 | means = dist_info_vars["mean"]
55 | log_stds = dist_info_vars["log_std"]
56 | zs = (x_var - means) / tf.exp(log_stds)
57 | return - tf.reduce_sum(log_stds, -1) - \
58 | 0.5 * tf.reduce_sum(tf.square(zs), -1) - \
59 | 0.5 *means.get_shape()[-1].value * np.log(2 * np.pi)
60 |
61 | def kl_sym_firstfixed(self, old_dist_info_vars):
62 | mu = old_dist_info_vars["mean"]
63 | logstd = old_dist_info_vars["log_std"]
64 | mu1 , logstd1 = map(tf.stop_gradient , [mu , logstd])
65 | mu2 , logstd2 = mu , logstd
66 |
67 | return self.kl_sym(dict(mean=mu1, log_std=logstd1), dict(mean=mu2, log_std=logstd2))
68 |
69 | def sample(self, dist_info):
70 | means = dist_info["mean"]
71 | log_stds = dist_info["log_std"]
72 | rnd = np.random.normal(size=means.shape)
73 | return rnd * np.exp(log_stds) + means
74 |
75 | def log_likelihood(self, xs, dist_info):
76 | means = dist_info["mean"]
77 | log_stds = dist_info["log_std"]
78 | zs = (xs - means) / np.exp(log_stds)
79 | return - np.sum(log_stds, axis=-1) - \
80 | 0.5 * np.sum(np.square(zs), axis=-1) - \
81 | 0.5 * means.shape[-1] * np.log(2 * np.pi)
82 |
83 | def entropy(self, dist_info):
84 | log_stds = dist_info["log_std"]
85 | return tf.reduce_sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)))
86 |
87 | @property
88 | def dist_info_keys(self):
89 | return ["mean", "log_std"]
90 |
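# Example usage (illustrative only):
#   dist = DiagonalGaussian(dim=1)
#   info = {"mean": np.zeros((2, 1)), "log_std": np.zeros((2, 1))}
#   samples = dist.sample(info)                # unit-Gaussian samples, shape (2, 1)
#   logp = dist.log_likelihood(samples, info)  # per-sample log densities, shape (2,)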
--------------------------------------------------------------------------------
/agent/agent_parallel.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous import NetworkContinous
5 | from parameters import pms
6 | from agent.agent_base import TRPOAgentBase
7 |
8 | seed = 1
9 | np.random.seed(seed)
10 | tf.set_random_seed(seed)
11 | class TRPOAgentParallel(TRPOAgentBase):
12 |
13 | def __init__(self, env):
14 | super(TRPOAgentParallel, self).__init__(env)
15 | self.init_network()
16 | # self.saver = tf.train.Saver(max_to_keep=10)
17 |
18 | def init_network(self):
19 | """
20 | [input]
21 | self.obs
22 | self.action_n
23 | self.advant
24 | self.old_dist_means_n
25 | self.old_dist_logstds_n
26 | [output]
27 | self.action_dist_means_n
28 | self.action_dist_logstds_n
29 | var_list
30 | """
31 | self.net = NetworkContinous("network_continous")
32 | self.global_step = tf.Variable(0 , trainable=False)
33 | self.step_op = tf.assign_add(self.global_step , 1 , use_locking=True)
34 | if pms.min_std is not None:
35 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
36 | self.action_dist_stds_n = tf.exp(log_std_var)
37 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
38 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
39 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
40 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars,
41 | self.old_dist_info_vars)
42 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss
43 | batch_size = tf.shape(self.net.obs)[0]
44 | batch_size_float = tf.cast(batch_size , tf.float32)
45 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
46 | ent = self.distribution.entropy(self.old_dist_info_vars)
47 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
48 | self.losses = [surr, kl, ent]
49 | var_list = self.net.var_list
50 | self.gf = GetFlat(var_list) # get theta from var_list
51 | self.gf.session = self.session
52 | self.sff = SetFromFlat(var_list) # set theta from var_List
53 | self.sff.session = self.session
54 | # get g
55 | self.pg = flatgrad(surr, var_list)
56 | # get A
57 | # KL divergence where first arg is fixed
58 | # replace old->tf.stop_gradient from previous kl
59 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float
60 | grads = tf.gradients(kl_firstfixed, var_list)
61 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
62 | shapes = map(var_shape, var_list)
63 | start = 0
64 | tangents = []
65 | for shape in shapes:
66 | size = np.prod(shape)
67 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
68 | tangents.append(param)
69 | start += size
70 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
71 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
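# self.fvp is the Fisher-vector product F*p computed without forming F explicitly:
# gvp = grad(KL) . tangent, and differentiating gvp again w.r.t. the parameters gives
# (d^2 KL / d theta^2) * tangent, which conjugate gradient uses to solve F*x = g.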
72 | # self.saver = tf.train.Saver(max_to_keep=10)
73 | # self.load_model(pms.checkpoint_file)
74 |
75 | def learn(self):
76 | iter_num = 0
77 | while True:
78 | print "\n********** Iteration %i ************" % iter_num
79 | print self.gf().mean()
80 | stats, theta, thprev = self.train_mini_batch(linear_search=False)
81 | self.sff(theta)
82 | for k , v in stats.iteritems():
83 | print(k + ": " + " " * (40 - len(k)) + str(v))
84 | # if iter_num % pms.save_model_times == 0:
85 | # self.save_model(pms.environment_name + "-" + str(iter_num))
86 | self.session.run(self.step_op)
87 | iter_num += 1
88 |
--------------------------------------------------------------------------------
/experiment/main_image_multi_process.py:
--------------------------------------------------------------------------------
1 | import os
2 | if not os.path.isdir("./checkpoint"):
3 | os.makedirs("./checkpoint")
4 | if not os.path.isdir("./log"):
5 | os.makedirs("./log")
6 |
7 |
8 | import gym
9 | import multiprocessing
10 | import time
11 | from agent.agent_continous_image_parallel_image import TRPOAgentParallelImage
12 | from parameters import pms
13 | from storage.storage_continous_parallel_image import ParallelStorageImage
14 |
15 | args = pms
16 | args.max_pathlength = gym.spec(args.environment_name).timestep_limit
17 |
18 | learner_tasks = multiprocessing.JoinableQueue()
19 | learner_results = multiprocessing.Queue()
20 | learner_env = gym.make(args.environment_name)
21 |
22 | learner = TRPOAgentParallelImage(learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results)
23 | learner.start()
24 | rollouts = ParallelStorageImage()
25 |
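# Task protocol for the learner process (inferred from the calls below):
#   1                                 -> build the network and return the initial policy weights
#   (2, max_kl, save_flag, iteration) -> header of a training step, followed by a second put() with the rollout paths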
26 | learner_tasks.put(1)
27 | learner_tasks.join()
28 | starting_weights = learner_results.get()
29 | rollouts.set_policy_weights(starting_weights)
30 |
31 | start_time = time.time()
32 | history = {}
33 | history["rollout_time"] = []
34 | history["learn_time"] = []
35 | history["mean_reward"] = []
36 | history["timesteps"] = []
37 |
38 | # start it off with a big negative number
39 | last_reward = -1000000
40 | recent_total_reward = 0
41 |
42 | if pms.train_flag is True:
43 | for iteration in xrange(args.max_iter_number):
44 | # runs a bunch of async processes that collect rollouts
45 | paths = rollouts.get_paths()
46 | # Why is the learner in an async process?
47 | # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread
48 | # and an async process creates another tf.Session, it will freeze up.
49 | # To solve this, we just make the learner's tf.Session in its own async process,
50 | # and wait until the learner's done before continuing the main thread.
51 | learn_start = time.time()
52 | if iteration%20 == 0:
53 | learner_tasks.put((2 , args.max_kl, 1, iteration))
54 | else:
55 | learner_tasks.put((2, args.max_kl, 0, iteration))
56 | learner_tasks.put(paths)
57 | learner_tasks.join()
58 | stats , theta , thprev = learner_results.get()
59 | learn_time = (time.time() - learn_start) / 60.0
60 | print
61 | print "-------- Iteration %d ----------" % iteration
62 | # print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0)
63 | #
64 | # history["rollout_time"].append(rollout_time)
65 | # history["learn_time"].append(learn_time)
66 | # history["mean_reward"].append(mean_reward)
67 | # history["timesteps"].append(args.timesteps_per_batch)
68 | for k , v in stats.iteritems():
69 | print(k + ": " + " " * (40 - len(k)) + str(v))
70 | recent_total_reward += stats["Average sum of rewards per episode"]
71 |
72 | if args.decay_method == "adaptive":
73 | if iteration % 10 == 0:
74 | if recent_total_reward < last_reward:
75 | print "Policy is not improving. Decrease KL and increase steps."
76 | if args.max_kl > 0.001:
77 | args.max_kl -= args.kl_adapt
78 | else:
79 | print "Policy is improving. Increase KL and decrease steps."
80 | if args.max_kl < 0.01:
81 | args.max_kl += args.kl_adapt
82 | last_reward = recent_total_reward
83 | recent_total_reward = 0
84 |
85 | if args.decay_method == "linear":
86 | if args.max_kl > 0.001:
87 | args.max_kl -= args.kl_adapt
88 |
89 | if args.decay_method == "exponential":
90 | if args.max_kl > 0.001:
91 | args.max_kl *= args.kl_adapt
92 | rollouts.set_policy_weights(theta)
93 | else:
94 | from agent.agent_continous import TRPOAgent
95 | from environment import Environment
96 | env = Environment(gym.make(pms.environment_name))
97 | agent = TRPOAgent(env)
98 | agent.test(pms.checkpoint_file)
99 |
100 |
101 | rollouts.end()
102 |
--------------------------------------------------------------------------------
/agent/agent_continous_image.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous_image import NetworkContinousImage
5 | from baseline.baseline_tf_image import BaselineTfImage
6 | from storage.storage_image import Storage
7 | from parameters import pms
8 | from agent.agent_base import TRPOAgentBase
9 |
10 | seed = 1
11 | np.random.seed(seed)
12 | tf.set_random_seed(seed)
13 |
14 | """
15 | class for continuous action space with image as input
16 | """
17 | class TRPOAgent(TRPOAgentBase):
18 |
19 | def __init__(self, env):
20 | super(TRPOAgent, self).__init__(env)
21 | self.init_network()
22 | self.saver = tf.train.Saver(max_to_keep=10)
23 | self.baseline = BaselineTfImage(self.session)
24 | self.storage = Storage(self, env, self.baseline)
25 |
26 | def init_network(self):
27 | """
28 | [input]
29 | self.obs
30 | self.action_n
31 | self.advant
32 | self.old_dist_means_n
33 | self.old_dist_logstds_n
34 | [output]
35 | self.action_dist_means_n
36 | self.action_dist_logstds_n
37 | var_list
38 | """
39 | self.net = NetworkContinousImage("network_continous")
40 | if pms.min_std is not None:
41 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
42 | self.action_dist_stds_n = tf.exp(log_std_var)
43 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
44 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
45 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
46 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars,
47 | self.old_dist_info_vars)
48 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss
49 | batch_size = tf.shape(self.net.obs)[0]
50 | batch_size_float = tf.cast(batch_size , tf.float32)
51 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
52 | ent = self.distribution.entropy(self.old_dist_info_vars)
53 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
54 | self.losses = [surr, kl, ent]
55 | var_list = self.net.var_list
56 | self.gf = GetFlat(var_list) # get theta from var_list
57 | self.gf.session = self.session
58 | self.sff = SetFromFlat(var_list) # set theta from var_List
59 | self.sff.session = self.session
60 | # get g
61 | self.pg = flatgrad(surr, var_list)
62 | # get A
63 | # KL divergence where first arg is fixed
64 | # replace old->tf.stop_gradient from previous kl
65 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float
66 | grads = tf.gradients(kl_firstfixed, var_list)
67 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
68 | shapes = map(var_shape, var_list)
69 | start = 0
70 | tangents = []
71 | for shape in shapes:
72 | size = np.prod(shape)
73 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
74 | tangents.append(param)
75 | start += size
76 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
77 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
78 | self.session.run(tf.initialize_all_variables())
79 | # self.saver = tf.train.Saver(max_to_keep=10)
80 | # self.load_model(pms.checkpoint_file)
81 |
82 | def learn(self):
83 | iter_num = 0
84 | while True:
85 | print "\n********** Iteration %i ************" % iter_num
86 | print self.gf().mean()
87 | stats, theta, thprev = self.train_mini_batch(linear_search=False)
88 | self.sff(theta)
89 | for k , v in stats.iteritems():
90 | print(k + ": " + " " * (40 - len(k)) + str(v))
91 | if iter_num % pms.save_model_times == 0:
92 | self.save_model(pms.environment_name + "-" + str(iter_num))
93 | iter_num += 1
94 |
--------------------------------------------------------------------------------
/experiment/main_multi_process.py:
--------------------------------------------------------------------------------
1 | import os
2 | if not os.path.isdir("./checkpoint"):
3 | os.makedirs("./checkpoint")
4 | if not os.path.isdir("./log"):
5 | os.makedirs("./log")
6 |
7 |
8 | import gym
9 | import multiprocessing
10 | import time
11 | from agent.agent_continous_parallel_storage import TRPOAgentParallel
19 | from parameters import pms
20 | from storage.storage_continous_parallel import ParallelStorage
21 |
22 | args = pms
23 | args.max_pathlength = gym.spec(args.environment_name).timestep_limit
24 |
25 | learner_tasks = multiprocessing.JoinableQueue()
26 | learner_results = multiprocessing.Queue()
27 | learner_env = gym.make(args.environment_name)
28 |
29 | learner = TRPOAgentParallel(learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results)
30 | learner.start()
31 | rollouts = ParallelStorage()
32 |
33 | learner_tasks.put(1)
34 | learner_tasks.join()
35 | starting_weights = learner_results.get()
36 | rollouts.set_policy_weights(starting_weights)
37 |
38 | start_time = time.time()
39 | history = {}
40 | history["rollout_time"] = []
41 | history["learn_time"] = []
42 | history["mean_reward"] = []
43 | history["timesteps"] = []
44 |
45 | # start it off with a big negative number
46 | last_reward = -1000000
47 | recent_total_reward = 0
48 |
49 | if pms.train_flag is True:
50 | for iteration in xrange(args.max_iter_number):
51 | # runs a bunch of async processes that collect rollouts
52 | paths = rollouts.get_paths()
53 | # Why is the learner in an async process?
54 | # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread
55 | # and an async process creates another tf.Session, it will freeze up.
56 | # To solve this, we just make the learner's tf.Session in its own async process,
57 | # and wait until the learner's done before continuing the main thread.
58 | learn_start = time.time()
59 | if iteration%20 == 0:
60 | learner_tasks.put((2 , args.max_kl, 1, iteration))
61 | else:
62 | learner_tasks.put((2, args.max_kl, 0, iteration))
63 | learner_tasks.put(paths)
64 | learner_tasks.join()
65 | stats , theta , thprev = learner_results.get()
66 | learn_time = (time.time() - learn_start) / 60.0
67 | print
68 | print "-------- Iteration %d ----------" % iteration
69 | # print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0)
70 | #
71 | # history["rollout_time"].append(rollout_time)
72 | # history["learn_time"].append(learn_time)
73 | # history["mean_reward"].append(mean_reward)
74 | # history["timesteps"].append(args.timesteps_per_batch)
75 | for k , v in stats.iteritems():
76 | print(k + ": " + " " * (40 - len(k)) + str(v))
77 | recent_total_reward += stats["Average sum of rewards per episode"]
78 |
79 | if args.decay_method == "adaptive":
80 | if iteration % 10 == 0:
81 | if recent_total_reward < last_reward:
82 | print "Policy is not improving. Decrease KL and increase steps."
83 | if args.max_kl > 0.001:
84 | args.max_kl -= args.kl_adapt
85 | else:
86 | print "Policy is improving. Increase KL and decrease steps."
87 | if args.max_kl < 0.01:
88 | args.max_kl += args.kl_adapt
89 | last_reward = recent_total_reward
90 | recent_total_reward = 0
91 |
92 | if args.decay_method == "linear":
93 | if args.max_kl > 0.001:
94 | args.max_kl -= args.kl_adapt
95 |
96 | if args.decay_method == "exponential":
97 | if args.max_kl > 0.001:
98 | args.max_kl *= args.kl_adapt
99 | rollouts.set_policy_weights(theta)
100 | else:
101 | from agent.agent_continous import TRPOAgent
102 | from environment import Environment
103 | env = Environment(gym.make(pms.environment_name))
104 | agent = TRPOAgent(env)
105 | agent.test(pms.checkpoint_file)
106 |
107 |
108 | rollouts.end()
109 |
--------------------------------------------------------------------------------
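The loop above drives the learner process through a JoinableQueue/Queue pair: put a task, join until the learner marks it done, then read the result. A minimal sketch of that handshake using only the standard library; the worker body is a stand-in for the real TRPO learner, not the actual training code.

import multiprocessing

def worker(tasks, results):
    while True:
        task = tasks.get(block=True)
        if task is None:              # poison pill: shut the worker down
            tasks.task_done()
            break
        results.put(task * 2)         # stand-in for "train on a batch of paths"
        tasks.task_done()

if __name__ == "__main__":
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    p = multiprocessing.Process(target=worker, args=(tasks, results))
    p.start()
    for i in range(3):
        tasks.put(i)
        tasks.join()                  # block until the worker calls task_done()
        print(results.get())          # prints 0, 2, 4
    tasks.put(None)
    tasks.join()
    p.join()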
/agent/agent_continous.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous import NetworkContinous
5 | from parameters import pms
6 | from agent.agent_base import TRPOAgentBase
7 | from logger.logger import Logger
8 | from storage.storage_continous_parallel import ParallelStorage
9 |
10 | seed = 1
11 | np.random.seed(seed)
12 | tf.set_random_seed(seed)
13 |
14 | """
15 | class for continuous action space
16 | """
17 | class TRPOAgent(TRPOAgentBase):
18 | def __init__(self, env):
19 | super(TRPOAgent, self).__init__(env)
20 | self.init_network()
21 | self.saver = tf.train.Saver(max_to_keep=10)
22 |
23 | def init_network(self):
24 | """
25 | [input]
26 | self.obs
27 | self.action_n
28 | self.advant
29 | self.old_dist_means_n
30 | self.old_dist_logstds_n
31 | [output]
32 | self.action_dist_means_n
33 | self.action_dist_logstds_n
34 | var_list
35 | """
36 | self.net = NetworkContinous("network_continous")
37 | if pms.min_std is not None:
38 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
39 | self.action_dist_stds_n = tf.exp(log_std_var)
40 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
41 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
42 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
43 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars,
44 | self.old_dist_info_vars)
45 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss
46 | batch_size = tf.shape(self.net.obs)[0]
47 | batch_size_float = tf.cast(batch_size , tf.float32)
48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
49 | ent = self.distribution.entropy(self.old_dist_info_vars)
50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
51 | self.losses = [surr, kl, ent]
52 | var_list = self.net.var_list
53 | self.gf = GetFlat(var_list) # get theta from var_list
54 | self.gf.session = self.session
55 |         self.sff = SetFromFlat(var_list) # write a flat theta back into var_list
56 | self.sff.session = self.session
57 | # get g
58 | self.pg = flatgrad(surr, var_list)
59 | # get A
60 | # KL divergence where first arg is fixed
61 | # replace old->tf.stop_gradient from previous kl
62 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float
63 | grads = tf.gradients(kl_firstfixed, var_list)
64 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
65 | shapes = map(var_shape, var_list)
66 | start = 0
67 | tangents = []
68 | for shape in shapes:
69 | size = np.prod(shape)
70 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
71 | tangents.append(param)
72 | start += size
73 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
74 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
75 | self.session.run(tf.initialize_all_variables())
76 | # self.saver = tf.train.Saver(max_to_keep=10)
77 | # self.load_model(pms.checkpoint_file)
78 |
79 | def init_logger(self):
80 | head = ["rewards", "std"]
81 | self.logger = Logger(head)
82 |
83 | def learn(self):
84 | self.init_logger()
85 | iter_num = 0
86 | while True:
87 | print "\n********** Iteration %i ************" % iter_num
88 | print self.gf().mean()
89 | stats, theta, thprev = self.train_mini_batch(linear_search=False)
90 | self.sff(theta)
91 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]])
92 | for k , v in stats.iteritems():
93 | print(k + ": " + " " * (40 - len(k)) + str(v))
94 | if iter_num % pms.save_model_times == 0:
95 | self.save_model(pms.environment_name + "-" + str(iter_num))
96 | iter_num += 1
97 |
--------------------------------------------------------------------------------
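init_network above builds the ingredients of the TRPO update (the surrogate gradient self.pg and the Fisher-vector product self.fvp), while the code that combines them lives in TRPOAgentBase.train_mini_batch, which is not shown in this file. Below is a hedged numpy sketch of the usual way those pieces fit together: conjugate gradient solves A s = -g using only Fisher-vector products, and the step is scaled so the quadratic KL model equals max_kl. The real method may add CG damping and a backtracking line search.

import numpy as np

def conjugate_gradient(f_Av, b, iters=10, tol=1e-10):
    # solve A x = b given only a function that computes A.dot(v)
    x = np.zeros_like(b)
    r = b.copy()
    p = r.copy()
    rdotr = r.dot(r)
    for _ in range(iters):
        Ap = f_Av(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

def trpo_step(theta_old, g_loss, f_Av, max_kl):
    s = conjugate_gradient(f_Av, -g_loss)             # natural-gradient direction A^-1 (-g)
    beta = np.sqrt(2.0 * max_kl / s.dot(f_Av(s)))     # so that 0.5 * (beta*s)^T A (beta*s) = max_kl
    return theta_old + beta * s

if __name__ == "__main__":
    A = np.array([[2.0, 0.3], [0.3, 1.0]])            # stand-in for the averaged Fisher matrix
    g = np.array([0.5, -1.0])                         # stand-in for the surrogate-loss gradient
    print(trpo_step(np.zeros(2), g, A.dot, max_kl=0.01))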
/agent/agent_continous_rnn.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous_rnn import NetworkContinousLSTM
5 | from parameters import pms
6 | from agent.agent_base import TRPOAgentBase
7 | from logger.logger import Logger
8 | from storage.storage_continous_parallel import ParallelStorage
9 |
10 | seed = 1
11 | np.random.seed(seed)
12 | tf.set_random_seed(seed)
13 |
14 | """
15 | class for continuous action space
16 | """
17 | class TRPOAgent(TRPOAgentBase):
18 | def __init__(self, env):
19 | super(TRPOAgent, self).__init__(env)
20 | self.init_network()
21 | self.saver = tf.train.Saver(max_to_keep=10)
22 |
23 | def init_network(self):
24 | """
25 | [input]
26 | self.obs
27 | self.action_n
28 | self.advant
29 | self.old_dist_means_n
30 | self.old_dist_logstds_n
31 | [output]
32 | self.action_dist_means_n
33 | self.action_dist_logstds_n
34 | var_list
35 | """
36 | self.net = NetworkContinousLSTM("network_continous")
37 | if pms.min_std is not None:
38 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
39 | self.action_dist_stds_n = tf.exp(log_std_var)
40 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
41 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
42 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
43 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars,
44 | self.old_dist_info_vars)
45 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss
46 | batch_size = tf.shape(self.net.obs)[0]
47 | batch_size_float = tf.cast(batch_size , tf.float32)
48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
49 | ent = self.distribution.entropy(self.old_dist_info_vars)
50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
51 | self.losses = [surr, kl, ent]
52 | var_list = self.net.var_list
53 | self.gf = GetFlat(var_list) # get theta from var_list
54 | self.gf.session = self.session
56 |         self.sff = SetFromFlat(var_list) # write a flat theta back into var_list
56 | self.sff.session = self.session
57 | # get g
58 | self.pg = flatgrad(surr, var_list)
59 | # get A
60 | # KL divergence where first arg is fixed
61 | # replace old->tf.stop_gradient from previous kl
62 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float
63 | grads = tf.gradients(kl_firstfixed, var_list)
64 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
65 | shapes = map(var_shape, var_list)
66 | start = 0
67 | tangents = []
68 | for shape in shapes:
69 | size = np.prod(shape)
70 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
71 | tangents.append(param)
72 | start += size
73 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
74 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
75 | self.session.run(tf.initialize_all_variables())
76 | # self.saver = tf.train.Saver(max_to_keep=10)
77 | # self.load_model(pms.checkpoint_file)
78 |
79 | def init_logger(self):
80 | head = ["rewards", "std"]
81 | self.logger = Logger(head)
82 |
83 | def learn(self):
84 | self.init_logger()
85 | iter_num = 0
86 | while True:
87 | print "\n********** Iteration %i ************" % iter_num
88 | print self.gf().mean()
89 | stats, theta, thprev = self.train_mini_batch(linear_search=False)
90 | self.sff(theta)
91 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]])
92 | for k , v in stats.iteritems():
93 | print(k + ": " + " " * (40 - len(k)) + str(v))
94 | if iter_num % pms.save_model_times == 0:
95 | self.save_model(pms.environment_name + "-" + str(iter_num))
96 | iter_num += 1
97 |
--------------------------------------------------------------------------------
/storage/storage_image.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from utils import *
3 | from parameters import pms
4 |
5 |
6 | class Storage(object):
7 | def __init__(self, agent, env, baseline):
8 | self.paths = []
9 | self.env = env
10 | self.agent = agent
11 | self.obs = []
12 | self.obs_origin = []
13 | self.baseline = baseline
14 |
15 | def get_single_path(self):
16 | self.obs_origin, self.obs, actions, rewards, action_dists = [], [], [], [], []
17 | ob = self.env.reset()
18 | ob = self.env.render('rgb_array')
19 | # self.agent.prev_action *= 0.0
20 | # self.agent.prev_obs *= 0.0
21 | episode_steps = 0
22 | for _ in xrange(pms.max_path_length):
23 | self.obs_origin.append(ob)
24 | deal_ob = self.deal_image(ob)
25 | action, action_dist = self.agent.get_action(deal_ob)
26 | self.obs.append(deal_ob)
27 | actions.append(action)
28 | action_dists.append(action_dist)
29 | res = self.env.step(action) # res
30 | if pms.render:
31 | self.env.render()
32 | ob = res[0]
33 | ob = self.env.render('rgb_array')
34 | rewards.append([res[1]])
35 | episode_steps += 1
36 | if res[2]:
37 | break
38 | path = dict(
39 | observations=np.concatenate([self.obs]),
40 | agent_infos=np.concatenate([action_dists]),
41 | rewards=np.array(rewards),
42 | actions=np.array(actions),
43 | episode_steps=episode_steps
44 | )
45 | self.paths.append(path)
46 | # self.agent.prev_action *= 0.0
47 | # self.agent.prev_obs *= 0.0
48 | return path
49 |
50 | def get_paths(self):
51 | paths = self.paths
52 | self.paths = []
53 | return paths
54 |
55 | def process_paths(self, paths):
56 | sum_episode_steps = 0
57 | for path in paths:
58 | sum_episode_steps += path['episode_steps']
59 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline
60 | # path_baselines = np.append(self.baseline.predict(path) , 0)
61 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
62 | # path["advantages"] = np.concatenate(path["rewards"]) + \
63 | # pms.discount * path_baselines[1:] - \
64 | # path_baselines[:-1]
65 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount))
66 | path_baselines = np.append(self.baseline.predict(path) , 0)
67 | deltas = np.concatenate(path["rewards"]) + \
68 | pms.discount * path_baselines[1:] - \
69 | path_baselines[:-1]
70 | path["advantages"] = discount(
71 | deltas , pms.discount * pms.gae_lambda)
72 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount))
73 | # Updating policy.
74 | action_dist_n = np.concatenate([path["agent_infos"] for path in paths])
75 | obs_n = np.concatenate([path["observations"] for path in paths])
76 | action_n = np.concatenate([path["actions"] for path in paths])
77 | rewards = np.concatenate([path["rewards"] for path in paths])
78 | advantages = np.concatenate([path["advantages"] for path in paths])
79 |
80 | if pms.center_adv:
81 | advantages = (advantages - np.mean(advantages)) / (advantages.std() + 1e-8)
82 |
83 | self.baseline.fit(paths)
84 |
85 | samples_data = dict(
86 | observations=obs_n,
87 | actions=action_n,
88 | rewards=rewards,
89 | advantages=advantages,
90 | agent_infos=action_dist_n,
91 | paths=paths,
92 | sum_episode_steps=sum_episode_steps
93 | )
94 | return samples_data
95 |
96 | def deal_image(self, image):
97 | index = len(self.obs_origin)
98 | image_end = []
99 |         if index < pms.history_number:
100 |             image_end = self.obs_origin[0:index]
101 |             for i in range(pms.history_number - index):
102 |                 image_end.append(image)
103 |         else:
104 |             image_end = self.obs_origin[index - pms.history_number:index]
105 |         image_end = np.concatenate(image_end)
106 |         return cv2.resize(cv2.cvtColor(image_end, cv2.COLOR_RGB2GRAY) / 255., (pms.obs_height, pms.obs_width))
107 | 
108 | 
109 | def explained_variance(ypred, y):
110 |     vary = np.var(y)
111 |     if np.abs(vary) < 1e-8:
112 |         if np.var(ypred) > 0:
113 | return 0
114 | else:
115 | return 1
116 | if abs(1 - np.var(y - ypred) / (vary + 1e-8)) > 1e5:
117 | import ipdb;
118 | ipdb.set_trace()
119 | return 1 - np.var(y - ypred) / (vary + 1e-8)
120 |
121 |
122 | class Rollout(threading.Thread):
123 | def __init__(self, thread_number, agent, env, baseline):
124 | super(Rollout, self).__init__()
125 | self.thread_number = thread_number
126 | self.storage = Storage(agent, env, baseline)
127 |
128 | def run(self):
129 | self.storage.get_single_path()
130 |
--------------------------------------------------------------------------------
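process_paths above turns one-step TD residuals into GAE advantages with utils.discount and also computes discounted returns for the baseline fit. A small numpy sketch of the same computation on a toy four-step episode; the reward and baseline values are made up.

import numpy as np
import scipy.signal

def discount(x, gamma):
    # same filter as utils.discount: y[t] = x[t] + gamma * y[t + 1]
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

rewards = np.array([1.0, 1.0, 1.0, 0.0])
baselines = np.array([0.5, 0.5, 0.5, 0.5])       # V(s_t) predicted by the baseline
gamma, lam = 0.99, 0.97

v = np.append(baselines, 0.0)                    # V(s_T) = 0 after the terminal step
deltas = rewards + gamma * v[1:] - v[:-1]        # r_t + gamma * V(s_{t+1}) - V(s_t)
advantages = discount(deltas, gamma * lam)       # GAE(lambda) advantages
returns = discount(rewards, gamma)               # discounted returns
print(advantages)
print(returns)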
/agent/agent_cotinous_single_thread.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import threading
3 | import gym
4 | import numpy as np
5 | import random
6 | import tensorflow as tf
7 | import time
8 | import threading
9 | import prettytensor as pt
10 |
11 | from storage.storage_continous import Storage
12 | from storage.storage_continous import Rollout
13 | import math
14 | from parameters import pms
15 | import krylov
16 | from logger.logger import Logger
17 | from distribution.diagonal_gaussian import DiagonalGaussian
18 | from baseline.baseline_lstsq import Baseline
19 | from environment import Environment
20 | from network.network_continous import NetworkContinous
21 | from agent.agent_base import TRPOAgentBase
22 |
23 | seed = 1
24 | np.random.seed(seed)
25 | tf.set_random_seed(seed)
26 |
27 |
28 | class TRPOAgentContinousSingleThread(TRPOAgentBase, threading.Thread):
29 |
30 | def __init__(self, thread_id, master):
31 | print "create thread %d"%(thread_id)
32 | self.thread_id = thread_id
33 | threading.Thread.__init__(self, name="thread_%d" % thread_id)
34 | self.master = master
35 | self.env = env = Environment(gym.make(pms.environment_name))
36 | TRPOAgentBase.__init__(self, env)
37 |
38 | self.session = self.master.session
39 | self.init_network()
40 | self.saver = tf.train.Saver(max_to_keep=10)
41 |
42 |
43 | def init_network(self):
44 | """
45 | [input]
46 | self.obs
47 | self.action_n
48 | self.advant
49 | self.old_dist_means_n
50 | self.old_dist_logstds_n
51 | [output]
52 | self.action_dist_means_n
53 | self.action_dist_logstds_n
54 | var_list
55 | """
56 | self.net = NetworkContinous(str(self.thread_id))
57 | if pms.min_std is not None:
58 | log_std_var = tf.maximum(self.net.action_dist_logstds_n , np.log(pms.min_std))
59 | self.action_dist_stds_n = tf.exp(log_std_var)
60 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n , log_std=self.net.old_dist_logstds_n)
61 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n , log_std=self.net.action_dist_logstds_n)
62 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n , self.new_dist_info_vars)
63 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n , self.new_dist_info_vars ,
64 | self.old_dist_info_vars)
65 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss
66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars , self.new_dist_info_vars))
67 | ent = self.distribution.entropy(self.old_dist_info_vars)
68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
69 | self.losses = [surr , kl , ent]
70 | var_list = self.net.var_list
71 | self.gf = GetFlat(var_list) # get theta from var_list
72 | self.gf.session = self.session
74 |         self.sff = SetFromFlat(var_list) # write a flat theta back into var_list
74 | self.sff.session = self.session
75 | # get g
76 | self.pg = flatgrad(surr , var_list)
77 | # get A
78 | # KL divergence where first arg is fixed
79 | # replace old->tf.stop_gradient from previous kl
80 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars)
81 | grads = tf.gradients(kl_firstfixed , var_list)
82 | self.flat_tangent = tf.placeholder(dtype , shape=[None])
83 | shapes = map(var_shape , var_list)
84 | start = 0
85 | tangents = []
86 | for shape in shapes:
87 | size = np.prod(shape)
88 | param = tf.reshape(self.flat_tangent[start:(start + size)] , shape)
89 | tangents.append(param)
90 | start += size
91 | self.gvp = [tf.reduce_sum(g * t) for (g , t) in zip(grads , tangents)]
92 | self.fvp = flatgrad(tf.reduce_sum(self.gvp) , var_list) # get kl''*p
93 |
94 | def run(self):
95 | self.learn()
96 |
97 | def learn(self):
98 | i = 0
99 | sum_gradient = 0
100 | while True:
101 | self.sff(self.master.get_parameters())
102 |
103 | # Generating paths.
104 |             stats, theta, thprev = self.train_mini_batch(parallel=False)
105 |             sum_gradient += theta - thprev
106 | self.master.apply_gradient(sum_gradient)
107 | print "\n********** Iteration %i ************" % i
108 | for k , v in stats.iteritems():
109 | print(k + ": " + " " * (40 - len(k)) + str(v))
110 | sum_gradient = 0
111 | if self.thread_id==1 and i%pms.save_model_times==0:
112 | self.save_model(pms.environment_name + "-" + str(i))
113 | i += 1
114 |
115 |
116 | def test(self):
117 | self.sff(self.master.get_parameters())
118 | for i in range(50):
119 | self.storage.get_single_path()
120 |
--------------------------------------------------------------------------------
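The Master object this thread reads from (get_parameters) and writes to (apply_gradient) is defined outside this file and is not shown here; the sketch below is only an assumed minimal version of that interface, with a lock so several rollout threads can apply their parameter deltas safely. The real master may differ.

import threading
import numpy as np

class Master(object):
    """Hypothetical parameter server matching the calls made by the thread above."""

    def __init__(self, theta_init):
        self.theta = np.array(theta_init, dtype=np.float64)
        self.lock = threading.Lock()

    def get_parameters(self):
        with self.lock:
            return self.theta.copy()

    def apply_gradient(self, delta):
        # each worker sends theta_new - theta_old; the master simply accumulates it
        with self.lock:
            self.theta += delta

master = Master(np.zeros(3))
master.apply_gradient(np.array([0.1, -0.2, 0.0]))
print(master.get_parameters())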
/agent/AC_agent_continous.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous import NetworkContinous
5 | from parameters import pms
6 | from agent.agent_base import TRPOAgentBase
7 | from logger.logger import Logger
8 | import math
9 | import time
10 |
11 | seed = 1
12 | np.random.seed(seed)
13 | tf.set_random_seed(seed)
14 | class ACAgent(TRPOAgentBase):
15 |
16 | def __init__(self, env):
17 | super(ACAgent, self).__init__(env)
18 | self.init_network()
19 | self.saver = tf.train.Saver(max_to_keep=10)
20 |
21 |
22 | def init_network(self):
23 | """
24 | [input]
25 | self.obs
26 | self.action_n
27 | self.advant
28 | self.old_dist_means_n
29 | self.old_dist_logstds_n
30 | [output]
31 | self.action_dist_means_n
32 | self.action_dist_logstds_n
33 | var_list
34 | """
35 | self.net = NetworkContinous("network_continous_ac")
36 | if pms.min_std is not None:
37 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
38 | self.action_dist_stds_n = tf.exp(log_std_var)
39 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
40 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
41 | self.likehood_new_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
42 | # surr = - log(\pi_\theta)*(Q^\pi-V^\pi)
43 | value_loss = 0.5*tf.square(self.net.advant)
44 | surr = -tf.reduce_sum(self.likehood_new_action_dist*tf.stop_gradient(self.net.advant)+value_loss) # Surrogate loss
45 |
46 | batch_size = tf.shape(self.net.obs)[0]
47 | batch_size_float = tf.cast(batch_size , tf.float32)
48 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
49 | ent = self.distribution.entropy(self.old_dist_info_vars)
50 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
51 | self.losses = [surr, kl, ent]
52 | var_list = self.net.var_list
53 | self.gf = GetFlat(var_list) # get theta from var_list
54 | self.gf.session = self.session
56 |         self.sff = SetFromFlat(var_list) # write a flat theta back into var_list
56 | self.sff.session = self.session
57 | # get g
58 | self.pg = flatgrad(surr, var_list)
59 |
60 | self.session.run(tf.initialize_all_variables())
61 | # self.saver = tf.train.Saver(max_to_keep=10)
62 | # self.load_model(pms.checkpoint_file)
63 |
64 | def init_logger(self):
65 |         head = ["rewards", "std"]
66 | self.logger = Logger(head)
67 |
68 | def train_mini_batch(self, parallel=False, linear_search=True):
69 | # Generating paths.
70 | print("Rollout")
71 | start_time = time.time()
72 | self.get_samples(pms.paths_number)
73 | paths = self.storage.get_paths() # get_paths
74 | # Computing returns and estimating advantage function.
75 | sample_data = self.storage.process_paths(paths)
76 | agent_infos = sample_data["agent_infos"]
77 | obs_n = sample_data["observations"]
78 | action_n = sample_data["actions"]
79 | advant_n = sample_data["advantages"]
80 | n_samples = len(obs_n)
81 | inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False)
82 | # inds = range(n_samples)
83 | obs_n = obs_n[inds]
84 | action_n = action_n[inds]
85 | advant_n = advant_n[inds]
86 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
87 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
88 | feed = {self.net.obs: obs_n,
89 | self.net.advant: advant_n,
90 | self.net.old_dist_means_n: action_dist_means_n,
91 | self.net.old_dist_logstds_n: action_dist_logstds_n,
92 | self.net.action_n: action_n
93 | }
94 |
95 | episoderewards = np.array([path["rewards"].sum() for path in paths])
96 | thprev = self.gf() # get theta_old
97 |
98 | g = self.session.run(self.pg, feed_dict=feed)
99 |         theta = thprev - 0.01 * g  # step against the surrogate-loss gradient, i.e. ascend the expected return
100 | stats = {}
101 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
102 | stats["Average sum of rewards per episode"] = episoderewards.mean()
103 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
104 | return stats, theta, thprev
105 |
106 | def learn(self):
107 | self.init_logger()
108 | iter_num = 0
109 | while True:
110 | print "\n********** Iteration %i ************" % iter_num
111 | stats, theta, thprev = self.train_mini_batch(linear_search=False)
112 | self.sff(theta)
113 | self.logger.log_row([stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)])
114 | for k , v in stats.iteritems():
115 | print(k + ": " + " " * (40 - len(k)) + str(v))
116 | if iter_num % pms.save_model_times == 0:
117 | self.save_model(pms.environment_name + "-" + str(iter_num))
118 | iter_num += 1
119 |
--------------------------------------------------------------------------------
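Instead of a trust-region update, train_mini_batch here takes a single fixed-size step along the policy-gradient direction (the 0.5 * advant**2 term contributes no gradient because the advantage is a placeholder). A toy numpy sketch of that REINFORCE-style update for a linear diagonal-Gaussian policy; the sizes and data are made up, only the update rule mirrors the agent above.

import numpy as np

np.random.seed(0)
obs = np.random.randn(5, 3)               # 5 timesteps, 3-dim observations
acts = np.random.randn(5, 2)              # 2-dim continuous actions
adv = np.random.randn(5)                  # advantages A_t
W = np.zeros((3, 2))                      # policy mean: mu_t = obs_t . W
log_std = np.zeros(2)

mu = obs.dot(W)
# d/d mu of log N(a; mu, exp(log_std)^2) is (a - mu) / std^2
g_mu = (acts - mu) / np.exp(2.0 * log_std)
grad_W = obs.T.dot(g_mu * adv[:, None])   # sum_t obs_t^T (g_mu_t * A_t)
W += 0.01 * grad_W                        # fixed 0.01 step, as in the agent above
print(W)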
/network/network_continous_rnn.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 |
4 | import tensorflow as tf
5 | import prettytensor as pt
6 | from parameters import pms
7 |
8 | seed = 1
9 | np.random.seed(seed)
10 | tf.set_random_seed(seed)
11 |
12 | class InnerLSTMCell(tf.nn.rnn_cell.BasicLSTMCell):
13 | def __init__(self , num_units , forget_bias=1.0 , input_size=None):
14 | tf.nn.rnn_cell.BasicLSTMCell.__init__(self , num_units , forget_bias=forget_bias , input_size=input_size)
15 | self.matrix , self.bias = None , None
16 |
17 |
18 | def __call__(self , inputs , state , scope=None):
19 | """
20 | Long short-term memory cell (LSTM).
21 |         adapted from BasicLSTMCell.__call__
22 | """
23 | with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell"
24 | # Parameters of gates are concatenated into one multiply for efficiency.
25 | c , h = tf.split(1 , 2 , state)
26 | concat = self.linear([inputs , h] , 4 * self._num_units , True)
27 |
28 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate
29 | i , j , f , o = tf.split(1 , 4 , concat)
30 |
31 | new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j)
32 | new_h = tf.tanh(new_c) * tf.sigmoid(o)
33 |
34 | return new_h , tf.concat(1 , [new_c , new_h])
35 |
36 |
37 | def linear(self , args , output_size , bias , bias_start=0.0 , scope=None):
38 | """
39 | Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
40 | implement from function of tensorflow.python.ops.rnn_cell.linear()
41 | """
42 | if args is None or (isinstance(args , (list , tuple)) and not args):
43 | raise ValueError("`args` must be specified")
44 | if not isinstance(args , (list , tuple)):
45 | args = [args]
46 |
47 | # Calculate the total size of arguments on dimension 1.
48 | total_arg_size = 0
49 | shapes = [a.get_shape().as_list() for a in args]
50 | for shape in shapes:
51 | if len(shape) != 2:
52 | raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes))
53 | if not shape[1]:
54 | raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes))
55 | else:
56 | total_arg_size += shape[1]
57 |
58 | # Now the computation.
59 | with tf.variable_scope(scope or "Linear"):
60 | matrix = tf.get_variable("Matrix" , [total_arg_size , output_size])
61 | if len(args) == 1:
62 | res = tf.matmul(args[0] , matrix)
63 | else:
64 | res = tf.matmul(tf.concat(1 , args) , matrix)
65 | if not bias:
66 | return res
67 | bias_term = tf.get_variable(
68 | "Bias" , [output_size] ,
69 | initializer=tf.constant_initializer(bias_start))
70 | self.matrix = matrix
71 | self.bias = bias_term
72 | return res + bias_term
73 |
74 | class NetworkContinousLSTM(object):
75 | def __init__(self, scope):
76 | with tf.variable_scope("%s_shared" % scope):
77 | self.obs = obs = tf.placeholder(
78 | dtype, shape=[None, pms.obs_shape], name="%s_obs"%scope)
79 | self.action_n = tf.placeholder(dtype, shape=[None, pms.action_shape], name="%s_action"%scope)
80 | self.advant = tf.placeholder(dtype, shape=[None], name="%s_advant"%scope)
81 | self.old_dist_means_n = tf.placeholder(dtype, shape=[None, pms.action_shape],
82 | name="%s_oldaction_dist_means"%scope)
83 | self.old_dist_logstds_n = tf.placeholder(dtype, shape=[None, pms.action_shape],
84 | name="%s_oldaction_dist_logstds"%scope)
85 | # self.obs_reshape = tf.reshape(self.obs, [None, 1, pms.obs_shape])
86 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(3, forget_bias=1.0, state_is_tuple=True)
87 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
88 | lstm_cell, output_keep_prob=0.5)
89 | rnn = tf.nn.rnn_cell.MultiRNNCell([lstm_cell], state_is_tuple=True)
90 | # rnn = tf.nn.rnn_cell.BasicRNNCell(3)
91 | self.initial_state = state = rnn.zero_state(tf.shape(self.obs)[0], tf.float32)
92 | # output , state = tf.nn.dynamic_rnn(rnn, self.obs)
93 | output, state = rnn(self.obs, state)
94 | self.action_dist_means_n = (pt.wrap(output).
95 | # fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05),
96 | # name="%s_fc1"%scope).
97 | # fully_connected(64, activation_fn=tf.nn.relu, init=tf.random_normal_initializer(-0.05, 0.05),
98 | # name="%s_fc2"%scope).
99 | fully_connected(pms.action_shape, init=tf.random_normal_initializer(-0.05, 0.05),
100 | name="%s_fc3"%scope))
101 | self.N = tf.shape(obs)[0]
102 | Nf = tf.cast(self.N, dtype)
103 | self.action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, pms.action_shape)).astype(np.float32), trainable=False, name="%spolicy_logstd"%scope)
104 | self.action_dist_logstds_n = tf.tile(self.action_dist_logstd_param,
105 | tf.pack((tf.shape(self.action_dist_means_n)[0], 1)))
106 | self.var_list = [v for v in tf.trainable_variables()if v.name.startswith(scope)]
107 |
108 | def get_action_dist_means_n(self, session, obs):
109 | return session.run(self.action_dist_means_n,
110 | {self.obs: obs})
111 |
112 |
--------------------------------------------------------------------------------
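InnerLSTMCell above re-implements the basic LSTM step mainly so that the weight matrix and bias of its linear map stay accessible. The numpy sketch below spells out the same gate arithmetic for a single step, with the (i, j, f, o) layout matching the tf.split in __call__.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, c, h, W, b, forget_bias=1.0):
    concat = np.concatenate([x, h], axis=-1).dot(W) + b    # the linear() call
    i, j, f, o = np.split(concat, 4, axis=-1)              # input, new input, forget, output gates
    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
    new_h = np.tanh(new_c) * sigmoid(o)
    return new_h, new_c

num_units = 3
x = np.random.randn(1, 4)                                  # one timestep, 4-dim input
c = h = np.zeros((1, num_units))
W = 0.1 * np.random.randn(4 + num_units, 4 * num_units)
b = np.zeros(4 * num_units)
h, c = lstm_step(x, c, h, W, b)
print(h)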
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import random
4 | import scipy.signal
5 | import prettytensor as pt
6 | from parameters import pms
7 | import threading
8 | from tensorflow.contrib.layers.python.layers import initializers
9 |
10 | seed = 1
11 | random.seed(seed)
12 | np.random.seed(seed)
13 | tf.set_random_seed(seed)
14 |
15 | dtype = tf.float32
16 |
17 | def discount(x, gamma):
18 | """
19 | scipy.signal.lfilter(b, a, x, axis=-1, zi=None)[source]
20 | a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
21 | - a[1]*y[n-1] - ... - a[N]*y[n-N]
22 | :param x:
23 | :param gamma:
24 | :return:
25 | """
26 | assert x.ndim >= 1
27 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
28 |
29 |
30 |
31 |
32 |
33 | def cat_sample(prob_nk):
34 | assert prob_nk.ndim == 2
35 | N = prob_nk.shape[0]
36 | csprob_nk = np.cumsum(prob_nk, axis=1)
37 | out = np.zeros(N, dtype='i')
38 | for (n, csprob_k, r) in zip(xrange(N), csprob_nk, np.random.rand(N)):
39 | for (k, csprob) in enumerate(csprob_k):
40 | if csprob > r:
41 | out[n] = k
42 | break
43 | return out
44 |
45 |
46 | def var_shape(x):
47 | out = [k.value for k in x.get_shape()]
48 | assert all(isinstance(a, int) for a in out), \
49 | "shape function assumes that shape is fully known"
50 | return out
51 |
52 |
53 | def numel(x):
54 | return np.prod(var_shape(x))
55 |
56 |
57 | def flatgrad(loss, var_list):
58 | grads = tf.gradients(loss, var_list)
59 | return tf.concat(0, [tf.reshape(grad, [np.prod(var_shape(v))])
60 | for (grad, v) in zip( grads, var_list)])
61 |
62 | # set theta
63 | class SetFromFlat(object):
64 | def __init__(self, var_list):
65 | assigns = []
66 | shapes = map(var_shape, var_list)
67 | total_size = sum(np.prod(shape) for shape in shapes)
68 | self.theta = theta = tf.placeholder(tf.float32, [total_size])
69 | start = 0
70 | assigns = []
71 | for (shape, v) in zip(shapes, var_list):
72 | size = np.prod(shape)
73 | assigns.append(
74 | tf.assign(
75 | v,
76 | tf.reshape(
77 | theta[
78 | start:start +
79 | size],
80 | shape)))
81 | start += size
82 | self.op = tf.group(*assigns)
83 |
84 | def __call__(self, theta):
85 | self.session.run(self.op, feed_dict={self.theta: theta})
86 |
87 | # get theta
88 | class GetFlat(object):
89 | def __init__(self, var_list):
90 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list])
91 |
92 | def __call__(self):
93 | return self.op.eval(session=self.session)
94 |
95 |
96 | def slice_2d(x, inds0, inds1):
97 |     # gather x[inds0, inds1] from a 2-D tensor x, e.g. inds0 = timestep indices and inds1 = chosen-action indices
98 | inds0 = tf.cast(inds0, tf.int64)
99 | inds1 = tf.cast(inds1, tf.int64)
100 | shape = tf.cast(tf.shape(x), tf.int64)
101 | ncols = shape[1]
102 | x_flat = tf.reshape(x, [-1])
103 | return tf.gather(x_flat, inds0 * ncols + inds1)
104 |
105 |
106 | # def linesearch(f, x, fullstep, expected_improve_rate):
107 | # accept_ratio = .1
108 | # max_backtracks = 10
109 | # fval, old_kl, entropy = f(x)
110 | # for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
111 | # xnew = x + stepfrac * fullstep
112 | # newfval, new_kl, new_ent= f(xnew)
113 | # # actual_improve = newfval - fval # minimize target object
114 | # # expected_improve = expected_improve_rate * stepfrac
115 | # # ratio = actual_improve / expected_improve
116 | # # if ratio > accept_ratio and actual_improve > 0:
117 | # # return xnew
118 | #         if newfval < fval:
132 | #             if ratio > accept_ratio and actual_improve > 0:
133 | # pms.max_kl *= 1.002
134 | # return xnew
135 |         if newfval < fval:
136 |             return xnew
137 |     return x
--------------------------------------------------------------------------------
/krylov.py:
--------------------------------------------------------------------------------
96 | def lanczos(f_Ax, b, k):
97 |     """
98 |     Runs Lanczos algorithm to generate an orthogonal basis for the Krylov subspace
99 |     b, Ab, A^2b, ...
100 |     as well as the upper hessenberg matrix T = Q^T A Q
101 |     from Demmel ch 6
102 |     """
103 |     b = b.astype('float64')
104 |     assert k > 1
105 |
106 | alphas = []
107 | betas = []
108 | qs = []
109 |
110 | q = b / np.linalg.norm(b)
111 | beta = 0
112 | qm = np.zeros_like(b)
113 | for j in xrange(k):
114 | qs.append(q)
115 |
116 | z = f_Ax(q)
117 |
118 | alpha = q.dot(z)
119 | alphas.append(alpha)
120 | z -= alpha * q + beta * qm
121 |
122 | beta = np.linalg.norm(z)
123 | betas.append(beta)
124 |
125 | print "beta", beta
126 | if beta < 1e-9:
127 |             print "lanczos: stopping early after %i/%i dimensions" % (j + 1, k)
128 | break
129 | else:
130 | qm = q
131 | q = z / beta
132 |
133 | return np.array(qs, 'float64').T, np.array(alphas, 'float64'), np.array(betas[:-1], 'float64')
134 |
135 |
136 | def lanczos2(f_Ax, b, k, residual_thresh=1e-9):
137 | """
138 |     Runs Lanczos algorithm to generate an orthogonal basis for the Krylov subspace
139 | b, Ab, A^2b, ...
140 | as well as the upper hessenberg matrix T = Q^T A Q
141 | from Demmel ch 6
142 | """
143 | b = b.astype('float64')
144 | assert k > 1
145 | H = np.zeros((k, k))
146 | qs = []
147 |
148 | q = b / np.linalg.norm(b)
149 | beta = 0
150 |
151 | for j in xrange(k):
152 | qs.append(q)
153 |
154 | z = f_Ax(q.astype('float64')).astype('float64')
155 | for (i, q) in enumerate(qs):
156 | H[j, i] = H[i, j] = h = q.dot(z)
157 | z -= h * q
158 |
159 | beta = np.linalg.norm(z)
160 | if beta < residual_thresh:
161 | print "lanczos2: stopping early after %i/%i dimensions residual %f < %f" % (j + 1, k, beta, residual_thresh)
162 | break
163 | else:
164 | q = z / beta
165 |
166 | return np.array(qs).T, H[:len(qs), :len(qs)]
167 |
168 |
169 | def make_tridiagonal(alphas, betas):
170 | assert len(alphas) == len(betas) + 1
171 | N = alphas.size
172 | out = np.zeros((N, N), 'float64')
173 | out.flat[0:N ** 2:N + 1] = alphas
174 | out.flat[1:N ** 2 - N:N + 1] = betas
175 | out.flat[N:N ** 2 - 1:N + 1] = betas
176 | return out
177 |
178 |
179 | def tridiagonal_eigenvalues(alphas, betas):
180 | T = make_tridiagonal(alphas, betas)
181 | return np.linalg.eigvalsh(T)
182 |
183 |
184 | def test_lanczos():
185 | np.set_printoptions(precision=4)
186 |
187 | A = np.random.randn(5, 5)
188 | A = A.T.dot(A)
189 | b = np.random.randn(5)
190 | f_Ax = lambda x: A.dot(x) # pylint: disable=W0108
191 | Q, alphas, betas = lanczos(f_Ax, b, 10)
192 | H = make_tridiagonal(alphas, betas)
193 | assert np.allclose(Q.T.dot(A).dot(Q), H)
194 | assert np.allclose(Q.dot(H).dot(Q.T), A)
195 | assert np.allclose(np.linalg.eigvalsh(H), np.linalg.eigvalsh(A))
196 |
197 | Q, H1 = lanczos2(f_Ax, b, 10)
198 | assert np.allclose(H, H1, atol=1e-6)
199 |
200 | print "ritz eigvals:"
201 | for i in xrange(1, 6):
202 | Qi = Q[:, :i]
203 | Hi = Qi.T.dot(A).dot(Qi)
204 | print np.linalg.eigvalsh(Hi)[::-1]
205 | print "true eigvals:"
206 | print np.linalg.eigvalsh(A)[::-1]
207 |
208 | print "lanczos on ill-conditioned problem"
209 | A = np.diag(10 ** np.arange(5))
210 | Q, H1 = lanczos2(f_Ax, b, 10)
211 | print np.linalg.eigvalsh(H1)
212 |
213 | print "lanczos on ill-conditioned problem with noise"
214 |
215 | def f_Ax_noisy(x):
216 | return A.dot(x) + np.random.randn(x.size) * 1e-3
217 |
218 | Q, H1 = lanczos2(f_Ax_noisy, b, 10)
219 | print np.linalg.eigvalsh(H1)
220 |
221 | def compute_hessian(fn, vars):
222 | mat = []
223 | for v1 in vars:
224 | temp = []
225 | for v2 in vars:
226 | # computing derivative twice, first w.r.t v2 and then w.r.t v1
227 | temp.append(tf.gradients(tf.gradients(fn, v2)[0], v1)[0])
228 |             temp = [tf.constant(0) if t is None else t for t in temp]  # tensorflow returns None when there is no gradient, so we replace None with 0
229 | temp = tf.pack(temp)
230 | mat.append(temp)
231 | mat = tf.pack(mat)
232 | return mat
233 |
234 | if __name__ == "__main__":
235 | test_lanczos()
236 | test_cg()
237 |
238 |
--------------------------------------------------------------------------------
/storage/storage_continous_parallel.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import multiprocessing
4 | from utils import *
5 | import gym
6 | import time
7 | import copy
8 | from random import randint
9 | from parameters import pms
10 | import math
11 | from network.network_continous import NetworkContinous
12 |
13 |
14 | class Actor(multiprocessing.Process):
15 | def __init__(self, args, task_q, result_q, actor_id, monitor):
16 | multiprocessing.Process.__init__(self)
17 | self.actor_id = actor_id
18 | self.task_q = task_q
19 | self.result_q = result_q
20 | self.args = args
21 | self.monitor = monitor
22 | # pms.max_path_length = gym.spec(args.environment_name).timestep_limit
23 |
24 |
25 | def get_action(self, obs):
26 |         if self.net is None:
27 |             raise NameError("network has not been defined")
28 | obs = np.expand_dims(obs , 0)
29 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0)
30 | action_dist_means_n , action_dist_logstds_n = self.session.run(
31 | [self.net.action_dist_means_n, self.net.action_dist_logstds_n], feed_dict={self.net.obs: obs})
32 | if pms.train_flag:
33 | rnd = np.random.normal(size=action_dist_means_n[0].shape)
34 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0]
35 | else:
36 | action = action_dist_means_n[0]
37 | # action = np.clip(action, pms.min_a, pms.max_a)
38 |         return action , dict(mean=action_dist_means_n[0] , log_std=action_dist_logstds_n[0])
39 |
40 | def run(self):
41 | self.env = gym.make(self.args.environment_name)
42 | self.env.seed(randint(0, 999999))
43 | if self.monitor:
44 | self.env.monitor.start('monitor/', force=True)
45 |
46 | self.net = NetworkContinous("rollout_network" + str(self.actor_id))
47 | config = tf.ConfigProto(
48 | device_count={'GPU': 0}
49 | )
50 | self.session = tf.Session(config=config)
51 | var_list = self.net.var_list
52 | self.session.run(tf.initialize_all_variables())
53 | self.set_policy = SetFromFlat(var_list)
54 | self.set_policy.session = self.session
55 | while True:
56 | # get a task, or wait until it gets one
57 | next_task = self.task_q.get(block=True)
58 | if type(next_task) is int and next_task == 1:
59 | # the task is an actor request to collect experience
60 | path = self.rollout()
61 | # print "single rollout time:"+str(end-start)
62 | self.task_q.task_done()
63 | self.result_q.put(path)
64 | elif type(next_task) is int and next_task == 2:
65 | print "kill message"
66 | if self.monitor:
67 | self.env.monitor.close()
68 | self.task_q.task_done()
69 | break
70 | else:
71 | # the task is to set parameters of the actor policy
72 | next_task = np.array(next_task)
73 | self.set_policy(next_task)
74 | # super hacky method to make sure when we fill the queue with set parameter tasks,
75 | # an actor doesn't finish updating before the other actors can accept their own tasks.
76 | time.sleep(0.1)
77 | self.task_q.task_done()
78 | return
79 |
80 | def rollout(self):
81 | """
82 |         observations: list of observations
83 |         actions: list of actions
84 |         rewards: list of per-step rewards
85 |         agent_infos: list of dicts holding the policy mean and log_std
86 |         env_infos: unused, extra information returned by the environment
87 |         :return: a path (dict of numpy arrays)
88 | """
89 | # if pms.record_movie:
90 | # outdir = 'log/trpo'
91 | # self.env.monitor.start(outdir , force=True)
92 | observations = []
93 | actions = []
94 | rewards = []
95 | agent_infos = []
96 | env_infos = []
97 | if pms.render:
98 | self.env.render()
99 | o = self.env.reset()
100 | episode_steps = 0
101 | for i in xrange(pms.max_path_length - 1):
102 | a, agent_info = self.get_action(o)
103 | next_o, reward, terminal, env_info = self.env.step(a)
104 | observations.append(o)
105 | rewards.append(np.array([reward]))
106 | actions.append(a)
107 | agent_infos.append([agent_info])
108 | env_infos.append([])
109 | episode_steps += 1
110 | if terminal:
111 | break
112 | o = next_o
113 | if pms.render:
114 | self.env.render()
115 | path = dict(
116 | observations=np.array(observations) ,
117 | actions=np.array(actions) ,
118 | rewards=np.array(rewards) ,
119 | agent_infos=np.concatenate(agent_infos) ,
120 | env_infos=np.concatenate(env_infos) ,
121 | episode_steps=episode_steps
122 | )
123 | return path
124 |
125 | class ParallelStorage():
126 | def __init__(self):
127 | self.args = pms
128 | self.tasks = multiprocessing.JoinableQueue()
129 | self.results = multiprocessing.Queue()
130 | self.actors = []
131 | self.actors.append(Actor(self.args, self.tasks, self.results, 9999, self.args.record_movie))
132 | for i in xrange(self.args.jobs-1):
133 | self.actors.append(Actor(self.args, self.tasks, self.results, 37*(i+3), False))
134 | for a in self.actors:
135 | a.start()
136 |         # we will start by running 20,000 / 1000 = 20 episodes for the first iteration
137 | self.average_timesteps_in_episode = 1000
138 |
139 | def get_paths(self):
140 | # keep 20,000 timesteps per update
141 | num_rollouts = self.args.paths_number
142 | # print "rollout_number:"+str(num_rollouts)
143 | for i in xrange(num_rollouts):
144 | self.tasks.put(1)
145 | start = time.time()
146 | self.tasks.join()
147 | end = time.time()
148 | # print "rollout real time"+str(end-start)
149 | paths = []
150 | while num_rollouts:
151 | num_rollouts -= 1
152 | paths.append(self.results.get())
153 | return paths
154 |
155 | # def process_paths(self, paths):
156 | # sum_episode_steps = 0
157 | # for path in paths:
158 | # sum_episode_steps += path['episode_steps']
159 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
160 | # # path_baselines = np.append(self.baseline.predict(path) , 0)
161 | # # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
162 | # # path["advantages"] = np.concatenate(path["rewards"]) + \
163 | # # pms.discount * path_baselines[1:] - \
164 | # # path_baselines[:-1]
165 | # # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount))
166 | # path_baselines = np.append(self.baseline.predict(path) , 0)
167 | # deltas = np.concatenate(path["rewards"]) + \
168 | # pms.discount * path_baselines[1:] - \
169 | # path_baselines[:-1]
170 | # path["advantages"] = discount(
171 | # deltas , pms.discount * pms.gae_lambda)
172 | # path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount))
173 | # observations = np.concatenate([path["observations"] for path in paths])
174 | # actions = np.concatenate([path["actions"] for path in paths])
175 | # rewards = np.concatenate([path["rewards"] for path in paths])
176 | # advantages = np.concatenate([path["advantages"] for path in paths])
177 | # env_infos = np.concatenate([path["env_infos"] for path in paths])
178 | # agent_infos = np.concatenate([path["agent_infos"] for path in paths])
179 | # if pms.center_adv:
180 | # advantages -= np.mean(advantages)
181 | # advantages /= (advantages.std() + 1e-8)
182 | # samples_data = dict(
183 | # observations=observations ,
184 | # actions=actions ,
185 | # rewards=rewards ,
186 | # advantages=advantages ,
187 | # env_infos=env_infos ,
188 | # agent_infos=agent_infos ,
189 | # paths=paths ,
190 | # sum_episode_steps=sum_episode_steps
191 | # )
192 | # self.baseline.fit(paths)
193 | # return samples_data
194 |
195 | def set_policy_weights(self, parameters):
196 | for i in xrange(self.args.jobs):
197 | self.tasks.put(parameters)
198 | self.tasks.join()
199 |
200 | def end(self):
201 | for i in xrange(self.args.jobs):
202 | self.tasks.put(2)
--------------------------------------------------------------------------------
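Actor.get_action above samples actions by reparameterising a diagonal Gaussian: mean plus exp(log_std) times unit noise. A small numpy sketch of that sampling together with the diagonal-Gaussian log-likelihood that the distribution object evaluates for the surrogate loss; the numbers are made up.

import numpy as np

mean = np.array([0.2, -0.1])
log_std = np.array([-0.5, -0.5])

noise = np.random.normal(size=mean.shape)
action = mean + np.exp(log_std) * noise        # same reparameterised sample as get_action

def gaussian_log_likelihood(a, mean, log_std):
    z = (a - mean) / np.exp(log_std)
    return -0.5 * np.sum(z ** 2) - np.sum(log_std) - 0.5 * a.size * np.log(2.0 * np.pi)

print(action)
print(gaussian_log_likelihood(action, mean, log_std))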
/agent/agent_discrete.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | from dealImage import *
3 | from logger.logger import Logger
4 | import krylov
5 | import numpy as np
6 | import random
7 | import tensorflow as tf
8 | import time
9 |
10 | import prettytensor as pt
11 |
12 | from storage.storage import Storage
13 | from parameters import pms
14 | from distribution.diagonal_category import DiagonalCategory
15 | from baseline.baseline_lstsq import Baseline
16 | import gym
17 | from environment import Environment
18 |
19 | class TRPOAgent(object):
20 | def __init__(self, env):
21 | self.env = env
22 | # if not isinstance(env.observation_space, Box) or \
23 | # not isinstance(env.action_space, Discrete):
24 | # print("Incompatible spaces.")
25 | # exit(-1)
26 | print("Observation Space", env.observation_space)
27 | print("Action Space", env.action_space)
28 | self.distribution = DiagonalCategory()
29 | self.session = tf.Session()
30 | self.baseline = Baseline()
31 | self.end_count = 0
32 | self.paths = []
33 | self.train = True
34 | self.storage = Storage(self, self.env, self.baseline)
35 | self.init_network()
36 | if pms.train_flag:
37 | self.init_logger()
38 |
39 | def init_logger(self):
40 | head = ["average_episode_std", "total number of episodes", "Average sum of rewards per episode",
41 | "KL between old and new distribution", "Surrogate loss", "Surrogate loss prev", "ds", "entropy",
42 | "mean_advant", "sum_episode_steps"]
43 | self.logger = Logger(head)
44 |
45 | def init_network(self):
46 | self.obs = obs = tf.placeholder(
47 | dtype, shape=[None, self.env.observation_space.shape[0]], name="obs")
48 | self.action = action = tf.placeholder(tf.int64, shape=[None], name="action")
49 | self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant")
50 | self.oldaction_dist = oldaction_dist = tf.placeholder(dtype, shape=[None, self.env.action_space.n],
51 | name="oldaction_dist")
52 |
53 | # Create neural network.
54 | action_dist_n, _ = (pt.wrap(self.obs).
55 | fully_connected(32, activation_fn=tf.nn.relu).
56 | fully_connected(32, activation_fn=tf.nn.relu).
57 | softmax_classifier(self.env.action_space.n))
58 | eps = 1e-6
59 | self.action_dist_n = action_dist_n
60 | N = tf.shape(obs)[0]
61 | ratio_n = self.distribution.likelihood_ratio_sym(action, action_dist_n, oldaction_dist)
62 | Nf = tf.cast(N, dtype)
63 | surr = -tf.reduce_mean(ratio_n * advant) # Surrogate loss
64 | kl = self.distribution.kl_sym(oldaction_dist, action_dist_n)
65 | ent = self.distribution.entropy(action_dist_n)
66 |
67 | self.losses = [surr, kl, ent]
68 |
69 | var_list = tf.trainable_variables()
70 | self.pg = flatgrad(surr, var_list)
71 | # KL divergence where first arg is fixed
72 | # replace old->tf.stop_gradient from previous kl
73 | kl_firstfixed = tf.reduce_sum(tf.stop_gradient(
74 | action_dist_n) * tf.log(tf.stop_gradient(action_dist_n + eps) / (action_dist_n + eps))) / Nf
75 | grads = tf.gradients(kl_firstfixed, var_list)
76 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
77 | shapes = map(var_shape, var_list)
78 | start = 0
79 | tangents = []
80 | for shape in shapes:
81 | size = np.prod(shape)
82 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
83 | tangents.append(param)
84 | start += size
85 | gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
86 | self.fvp = flatgrad(gvp, var_list)
87 | self.gf = GetFlat(var_list)
88 | self.gf.session = self.session
89 | self.sff = SetFromFlat(var_list)
90 | self.sff.session = self.session
91 | self.saver = tf.train.Saver(max_to_keep=10)
92 | self.session.run(tf.initialize_all_variables())
93 |
94 | # self.load_model(pms.checkpoint_file)
95 |
96 | def get_samples(self, path_number):
97 | for i in range(path_number):
98 | self.storage.get_single_path()
99 |
100 | def act(self, obs, *args):
101 | obs = np.expand_dims(obs, 0)
102 | action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs})
103 |
104 | if self.train:
105 | action = int(cat_sample(action_dist_n)[0])
106 | else:
107 | action = int(np.argmax(action_dist_n))
108 | return action, action_dist_n, np.squeeze(obs)
109 |
110 | def learn(self):
111 | start_time = time.time()
112 | numeptotal = 0
113 | for iteration in range(pms.max_iter_number):
114 | # Generating paths.
115 | print("Rollout")
116 | self.get_samples(pms.paths_number)
117 | paths = self.storage.get_paths() # get_paths
118 | # Computing returns and estimating advantage function.
119 |
120 | sample_data = self.storage.process_paths(paths)
121 | # shape = sample_data["observations"].shape
122 | # vis_square(np.reshape(sample_data["observations"],(shape[0], shape[2], shape[3]))[1:10])
123 | feed = {self.obs: sample_data["observations"],
124 | self.action: sample_data["actions"],
125 | self.advant: sample_data["advantages"],
126 | self.oldaction_dist: sample_data["agent_infos"]}
127 |
128 | print "\n********** Iteration %i ************" % iteration
129 | if self.train:
130 | thprev = self.gf()
131 | def fisher_vector_product(p):
132 | feed[self.flat_tangent] = p
133 | return self.session.run(self.fvp, feed) + pms.cg_damping * p
134 |
135 | g = self.session.run(self.pg, feed_dict=feed)
136 | stepdir = krylov.cg(fisher_vector_product, -g)
137 |                 shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))  # 0.5 * s^T A s
138 | fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
139 | neggdotstepdir = -g.dot(stepdir)
140 |
141 | def loss(th):
142 | self.sff(th)
143 | return self.session.run(self.losses, feed_dict=feed)
144 |
145 | surr_prev, kl_prev, entropy = loss(thprev)
146 | theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
147 | self.sff(theta)
148 |
149 | surrafter, kloldnew, entropy = self.session.run(
150 | self.losses, feed_dict=feed)
151 |
152 | stats = {}
153 | episoderewards = np.sum(sample_data["rewards"])
154 |                 numeptotal += len(paths)
155 | mean_advant = np.mean(sample_data["advantages"])
156 | stats["Total number of episodes"] = numeptotal
157 | stats["Average sum of rewards per episode"] = np.mean(sample_data["rewards"])
158 | # stats["Entropy"] = entropy
159 | # exp = explained_variance(np.array(sample_data[""]), np.array(returns_n))
160 | # stats["Baseline explained"] = exp
161 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
162 | stats["KL between old and new distribution"] = kloldnew
163 | stats["Surrogate loss"] = surrafter
164 | stats['Sum episode steps'] = sample_data["sum_episode_steps"]
165 | log_data = [0, numeptotal, episoderewards.mean(), kloldnew, surrafter, surr_prev,
166 | surrafter - surr_prev,
167 | entropy, mean_advant, sample_data["sum_episode_steps"]]
168 | if pms.train_flag:
169 | self.logger.log_row(log_data)
170 | for k, v in stats.iteritems():
171 | print(k + ": " + " " * (40 - len(k)) + str(v))
172 | if iteration % pms.save_model_times == 0:
173 | self.save_model(pms.environment_name + "-" + str(iteration))
174 |
175 | def test(self, model_name):
176 | self.load_model(model_name)
177 | if pms.record_movie:
178 | for i in range(100):
179 | self.storage.get_single_path()
180 | self.env.env.monitor.close()
181 | if pms.upload_to_gym:
182 | gym.upload("log/trpo" , algorithm_id='alg_8BgjkAsQRNiWu11xAhS4Hg' , api_key='sk_IJhy3b2QkqL3LWzgBXoVA')
183 | else:
184 | for i in range(50):
185 | self.storage.get_single_path()
186 |
187 | def save_model(self, model_name):
188 | self.saver.save(self.session, pms.checkpoint_dir + model_name + ".ckpt")
189 |
190 | def load_model(self, model_name):
191 | try:
192 | if model_name is not None:
193 | self.saver.restore(self.session, model_name)
194 | else:
195 | self.saver.restore(self.session, tf.train.latest_checkpoint(pms.checkpoint_dir))
196 | except:
197 |             print "failed to load model %s" % model_name
198 |
--------------------------------------------------------------------------------
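learn() above scales the conjugate-gradient direction to the KL budget and then calls utils.linesearch, whose exact acceptance test is not fully visible in utils.py above. Below is a hedged sketch of the usual backtracking recipe such a line search follows: try decreasing fractions of the full step and keep the first one that lowers the surrogate loss while respecting the KL constraint. The real function may use a different acceptance test, for example the expected-improvement ratio that appears commented out in utils.py.

import numpy as np

def linesearch(f, x, fullstep, max_kl, max_backtracks=10):
    # f returns (surrogate_loss, kl, entropy) at a parameter vector, as self.losses does
    fval, kl, _ = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        newfval, new_kl, _ = f(xnew)
        if newfval < fval and new_kl <= max_kl:
            return xnew
    return x                                  # no acceptable step found: keep the old parameters

# toy check: a quadratic loss with zero KL, so the full step is accepted immediately
f = lambda th: (float(th.dot(th)), 0.0, 0.0)
print(linesearch(f, np.ones(2), np.array([-1.0, -1.0]), max_kl=0.01))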
/storage/storage_continous_parallel_image.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | from utils import *
3 | import gym
4 | import time
5 | from random import randint
6 | from parameters import pms
7 | from network.network_continous_image import NetworkContinousImage
8 | import cv2
9 |
10 |
11 | class Actor(multiprocessing.Process):
12 | def __init__(self, args, task_q, result_q, actor_id, monitor):
13 | multiprocessing.Process.__init__(self)
14 | self.actor_id = actor_id
15 | self.task_q = task_q
16 | self.result_q = result_q
17 | self.args = args
18 | self.monitor = monitor
19 | # pms.max_path_length = gym.spec(args.environment_name).timestep_limit
20 |
21 | def get_action(self, obs):
22 |         if self.net is None:
23 |             raise NameError("network has not been defined")
24 | obs = np.expand_dims(obs , 0)
25 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0)
26 | action_dist_means_n , action_dist_logstds_n = self.session.run(
27 | [self.net.action_dist_means_n, self.net.action_dist_logstds_n], feed_dict={self.net.obs: obs})
28 | if pms.train_flag:
29 | rnd = np.random.normal(size=action_dist_means_n[0].shape)
30 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0]
31 | else:
32 | action = action_dist_means_n[0]
33 | # action = np.clip(action, pms.min_a, pms.max_a)
34 |         return action, dict(mean=action_dist_means_n[0] , log_std=action_dist_logstds_n[0])
35 |
36 | def run(self):
37 | self.env = gym.make(self.args.environment_name)
38 | self.env.seed(randint(0, 999999))
39 | if self.monitor:
40 | self.env.monitor.start('monitor/', force=True)
41 |
42 | self.net = NetworkContinousImage("rollout_network" + str(self.actor_id))
43 | config = tf.ConfigProto(
44 | device_count={'GPU': 0}
45 | )
46 | self.session = tf.Session(config=config)
47 | var_list = self.net.var_list
48 | self.session.run(tf.initialize_all_variables())
49 | self.set_policy = SetFromFlat(var_list)
50 | self.set_policy.session = self.session
51 | while True:
52 | # get a task, or wait until it gets one
53 | next_task = self.task_q.get(block=True)
54 | if type(next_task) is int and next_task == 1:
55 | # the task is an actor request to collect experience
56 | path = self.rollout()
57 | # print "single rollout time:"+str(end-start)
58 | self.task_q.task_done()
59 | self.result_q.put(path)
60 | elif type(next_task) is int and next_task == 2:
61 | print "kill message"
62 | if self.monitor:
63 | self.env.monitor.close()
64 | self.task_q.task_done()
65 | break
66 | else:
67 | # the task is to set parameters of the actor policy
68 | next_task = np.array(next_task)
69 | self.set_policy(next_task)
70 | # super hacky method to make sure when we fill the queue with set parameter tasks,
71 | # an actor doesn't finish updating before the other actors can accept their own tasks.
72 | time.sleep(0.1)
73 | self.task_q.task_done()
74 | return
75 |
76 | def rollout(self):
77 | """
78 |         observations: list of observations
79 |         actions: list of actions
80 |         rewards: list of per-step rewards
81 |         agent_infos: list of dicts holding the policy mean and log_std
82 |         env_infos: unused, extra information returned by the environment
83 |         :return: a path (dict of numpy arrays)
84 | """
85 | # if pms.record_movie:
86 | # outdir = 'log/trpo'
87 | # self.env.monitor.start(outdir , force=True)
88 | observations = []
89 | actions = []
90 | rewards = []
91 | agent_infos = []
92 | env_infos = []
93 | if pms.render:
94 | self.env.render()
95 | o = self.env.reset()
96 |
97 | episode_steps = 0
98 | for i in xrange(pms.max_path_length - 1):
99 | o = self.env.render('rgb_array')
100 | o = self.deal_image(o)
101 | a, agent_info = self.get_action(o)
102 | next_o, reward, terminal, env_info = self.env.step(a)
103 | observations.append(o)
104 | rewards.append(np.array([reward]))
105 | actions.append(a)
106 | agent_infos.append([agent_info])
107 | env_infos.append([])
108 | episode_steps += 1
109 | if terminal:
110 | break
111 | o = next_o
112 | if pms.render:
113 | self.env.render()
114 | path = dict(
115 | observations=np.array(observations) ,
116 | actions=np.array(actions) ,
117 | rewards=np.array(rewards) ,
118 | agent_infos=np.concatenate(agent_infos) ,
119 | env_infos=np.concatenate(env_infos) ,
120 | episode_steps=episode_steps
121 | )
122 | return path
123 |
124 | def deal_image(self , image):
125 | # index = len(self.obs_origin)
126 | # image_end = []
127 | # if index < pms.history_number:
128 | # image_end = self.obs_origin[0:index]
129 | # for i in range(pms.history_number - index):
130 | # image_end.append(image)
131 | # else:
132 | # image_end = self.obs_origin[index - pms.history_number:index]
133 | #
134 | # image_end = np.concatenate(image_end)
135 | # # image_end = image_end.reshape((pms.obs_height, pms.obs_width, pms.history_number))
136 | # obs = cv2.resize(cv2.cvtColor(image_end , cv2.COLOR_RGB2GRAY) / 255. , (pms.obs_height , pms.obs_width))
137 | obs = cv2.resize(image, (pms.obs_height, pms.obs_width))
138 | # obs = np.transpose(np.array(obs), (2, 0, 1))
139 | return obs
140 |
141 | class ParallelStorageImage():
142 | def __init__(self):
143 | self.args = pms
144 | self.tasks = multiprocessing.JoinableQueue()
145 | self.results = multiprocessing.Queue()
146 | self.actors = []
147 | self.actors.append(Actor(self.args, self.tasks, self.results, 9999, self.args.record_movie))
148 | for i in xrange(self.args.jobs-1):
149 | self.actors.append(Actor(self.args, self.tasks, self.results, 37*(i+3), False))
150 | for a in self.actors:
151 | a.start()
152 |         # we will start by running 20,000 / 1000 = 20 episodes for the first iteration
153 | self.average_timesteps_in_episode = 1000
154 |
155 | def get_paths(self):
156 | # keep 20,000 timesteps per update
157 | num_rollouts = self.args.paths_number
158 | # print "rollout_number:"+str(num_rollouts)
159 | for i in xrange(num_rollouts):
160 | self.tasks.put(1)
161 | start = time.time()
162 | self.tasks.join()
163 | end = time.time()
164 | # print "rollout real time"+str(end-start)
165 | paths = []
166 | while num_rollouts:
167 | num_rollouts -= 1
168 | paths.append(self.results.get())
169 | return paths
170 |
171 | # def process_paths(self, paths):
172 | # sum_episode_steps = 0
173 | # for path in paths:
174 | # sum_episode_steps += path['episode_steps']
175 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
176 | # # path_baselines = np.append(self.baseline.predict(path) , 0)
177 | # # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
178 | # # path["advantages"] = np.concatenate(path["rewards"]) + \
179 | # # pms.discount * path_baselines[1:] - \
180 | # # path_baselines[:-1]
181 | # # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount))
182 | # path_baselines = np.append(self.baseline.predict(path) , 0)
183 | # deltas = np.concatenate(path["rewards"]) + \
184 | # pms.discount * path_baselines[1:] - \
185 | # path_baselines[:-1]
186 | # path["advantages"] = discount(
187 | # deltas , pms.discount * pms.gae_lambda)
188 | # path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount))
189 | # observations = np.concatenate([path["observations"] for path in paths])
190 | # actions = np.concatenate([path["actions"] for path in paths])
191 | # rewards = np.concatenate([path["rewards"] for path in paths])
192 | # advantages = np.concatenate([path["advantages"] for path in paths])
193 | # env_infos = np.concatenate([path["env_infos"] for path in paths])
194 | # agent_infos = np.concatenate([path["agent_infos"] for path in paths])
195 | # if pms.center_adv:
196 | # advantages -= np.mean(advantages)
197 | # advantages /= (advantages.std() + 1e-8)
198 | # samples_data = dict(
199 | # observations=observations ,
200 | # actions=actions ,
201 | # rewards=rewards ,
202 | # advantages=advantages ,
203 | # env_infos=env_infos ,
204 | # agent_infos=agent_infos ,
205 | # paths=paths ,
206 | # sum_episode_steps=sum_episode_steps
207 | # )
208 | # self.baseline.fit(paths)
209 | # return samples_data
210 |
211 | def set_policy_weights(self, parameters):
212 | for i in xrange(self.args.jobs):
213 | self.tasks.put(parameters)
214 | self.tasks.join()
215 |
216 | def end(self):
217 | for i in xrange(self.args.jobs):
218 | self.tasks.put(2)
--------------------------------------------------------------------------------
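The ParallelStorageImage class above drives its Actor workers through a small message protocol on a JoinableQueue: one token per requested rollout, a join() that blocks until every worker has called task_done(), a broadcast of the new policy weights to all workers, and the value 2 as a shutdown sentinel. Below is a minimal, self-contained sketch of that queue pattern; the Worker class and the fake path dict are illustrative stand-ins, not code from this repository.

import multiprocessing


class Worker(multiprocessing.Process):
    def __init__(self, tasks, results):
        multiprocessing.Process.__init__(self)
        self.tasks = tasks
        self.results = results

    def run(self):
        while True:
            task = self.tasks.get()
            if task == 2:                      # shutdown sentinel, mirrors end()
                self.tasks.task_done()
                break
            # stand-in for one rollout; a real Actor would return a path dict
            self.results.put({"rewards": [float(task)]})
            self.tasks.task_done()


if __name__ == "__main__":
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    workers = [Worker(tasks, results) for _ in range(2)]
    for w in workers:
        w.start()
    for _ in range(4):                         # request 4 rollouts, as get_paths does
        tasks.put(1)
    tasks.join()                               # block until every rollout is task_done()
    paths = [results.get() for _ in range(4)]
    for _ in workers:                          # one sentinel per worker, as end() does
        tasks.put(2)
    tasks.join()
    print len(paths)

Keeping a JoinableQueue for tasks and a plain Queue for results lets the collector block on join() instead of polling, and lets results be drained in whatever order the workers finish.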
/agent/agent_continous_parallel_storage.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous import NetworkContinous
5 | from parameters import pms
6 |
7 | import multiprocessing
8 | import krylov
9 | from baseline.baseline_lstsq import Baseline
10 | from distribution.diagonal_gaussian import DiagonalGaussian
11 | import time
12 | import math
13 | from logger.logger import Logger
14 |
15 | seed = 1
16 | np.random.seed(seed)
17 | tf.set_random_seed(seed)
18 |
19 |
20 | """
21 | class for continuous action space, multi-process version
22 | """
23 | class TRPOAgentParallel(multiprocessing.Process):
24 |
25 |
26 | def __init__(self , observation_space , action_space , task_q , result_q):
27 | multiprocessing.Process.__init__(self)
28 | self.task_q = task_q
29 | self.result_q = result_q
30 | self.observation_space = observation_space
31 | self.action_space = action_space
32 | self.args = pms
33 | self.baseline = Baseline()
34 | self.distribution = DiagonalGaussian(pms.action_shape)
35 | self.init_logger()
36 |
37 | def init_network(self):
38 | """
39 | [input]
40 | self.obs
41 | self.action_n
42 | self.advant
43 | self.old_dist_means_n
44 | self.old_dist_logstds_n
45 | [output]
46 | self.action_dist_means_n
47 | self.action_dist_logstds_n
48 | var_list
49 | """
50 | config = tf.ConfigProto(
51 | device_count={'GPU': 0}
52 | )
53 | self.session = tf.Session(config=config)
54 | self.net = NetworkContinous("network_continous")
55 | if pms.min_std is not None:
56 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
57 | self.action_dist_stds_n = tf.exp(log_std_var)
58 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
59 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
60 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
61 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars,
62 | self.old_dist_info_vars)
63 | surr = -tf.reduce_mean(self.ratio_n * self.net.advant) # Surrogate loss
64 | batch_size = tf.shape(self.net.obs)[0]
65 | batch_size_float = tf.cast(batch_size , tf.float32)
66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
67 | ent = self.distribution.entropy(self.old_dist_info_vars)
68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
69 | self.losses = [surr, kl, ent]
70 | var_list = self.net.var_list
71 |
72 | self.gf = GetFlat(var_list) # get theta from var_list
73 | self.gf.session = self.session
74 |         self.sff = SetFromFlat(var_list) # set theta from var_list
75 | self.sff.session = self.session
76 | # get g
77 | self.pg = flatgrad(surr, var_list)
78 |         # get A (Fisher-vector product)
79 |         # KL divergence where the first argument is fixed:
80 |         # the old distribution is replaced by tf.stop_gradient of the new one
81 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float
82 | grads = tf.gradients(kl_firstfixed, var_list)
83 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
84 | shapes = map(var_shape, var_list)
85 | start = 0
86 | tangents = []
87 | for shape in shapes:
88 | size = np.prod(shape)
89 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
90 | tangents.append(param)
91 | start += size
92 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
93 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
94 | self.session.run(tf.initialize_all_variables())
95 | self.saver = tf.train.Saver(max_to_keep=5)
96 |
97 | def init_logger(self):
98 | head = ["factor", "rewards", "std"]
99 | self.logger = Logger(head)
100 |
101 | def run(self):
102 | self.init_network()
103 | while True:
104 | paths = self.task_q.get()
105 | if paths is None:
106 | # kill the learner
107 | self.task_q.task_done()
108 | break
109 | elif paths == 1:
110 | # just get params, no learn
111 | self.task_q.task_done()
112 | self.result_q.put(self.gf())
113 | elif paths[0] == 2:
114 | # adjusting the max KL.
115 | self.args.max_kl = paths[1]
116 | if paths[2] == 1:
117 | print "saving checkpoint..."
118 | self.save_model(pms.environment_name + "-" + str(paths[3]))
119 | self.task_q.task_done()
120 | else:
121 | stats , theta, thprev = self.learn(paths, linear_search=False)
122 | self.sff(theta)
123 | self.task_q.task_done()
124 | self.result_q.put((stats, theta, thprev))
125 | return
126 |
127 | def learn(self, paths, parallel=False, linear_search=False):
128 | start_time = time.time()
129 | sample_data = self.process_paths(paths)
130 | agent_infos = sample_data["agent_infos"]
131 | obs_all = sample_data["observations"]
132 | action_all = sample_data["actions"]
133 | advant_all = sample_data["advantages"]
134 | n_samples = len(obs_all)
135 | batch = int(1/pms.subsample_factor)
136 | batch_size = int(math.floor(n_samples * pms.subsample_factor))
137 | accum_fullstep = 0.0
138 | for iteration in range(batch):
139 | print "batch: %d, batch_size: %d"%(iteration+1, batch_size)
140 | inds = np.random.choice(n_samples , batch_size , replace=False)
141 | obs_n = obs_all[inds]
142 | action_n = action_all[inds]
143 | advant_n = advant_all[inds]
144 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
145 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
146 | feed = {self.net.obs: obs_n ,
147 | self.net.advant: advant_n ,
148 | self.net.old_dist_means_n: action_dist_means_n ,
149 | self.net.old_dist_logstds_n: action_dist_logstds_n ,
150 | self.net.action_n: action_n
151 | }
152 |
153 | episoderewards = np.array([path["rewards"].sum() for path in paths])
154 | thprev = self.gf() # get theta_old
155 |
156 | def fisher_vector_product(p):
157 | feed[self.flat_tangent] = p
158 | return self.session.run(self.fvp , feed) + pms.cg_damping * p
159 |
160 | g = self.session.run(self.pg , feed_dict=feed)
161 | stepdir = krylov.cg(fisher_vector_product , -g , cg_iters=pms.cg_iters)
162 |             shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # 0.5 * s^T A s
163 |             # if shs < 0 (numerical error), the sqrt below produces NaN
164 | lm = np.sqrt(shs / pms.max_kl)
165 | fullstep = stepdir / lm
166 | neggdotstepdir = -g.dot(stepdir)
167 |
168 | def loss(th):
169 | self.sff(th)
170 | return self.session.run(self.losses , feed_dict=feed)
171 |
172 | if parallel is True:
173 | theta = linesearch_parallel(loss , thprev , fullstep , neggdotstepdir / lm)
174 | else:
175 | if linear_search:
176 | theta = linesearch(loss , thprev , fullstep , neggdotstepdir / lm)
177 | else:
178 | theta = thprev + fullstep
179 | accum_fullstep += (theta - thprev)
180 | theta = thprev + accum_fullstep * pms.subsample_factor
181 | stats = {}
182 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
183 | stats["Average sum of rewards per episode"] = episoderewards.mean()
184 | stats["surr loss"] = loss(theta)[0]
185 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
186 | self.logger.log_row([pms.subsample_factor, stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]])
187 | return stats , theta , thprev
188 |
189 | def process_paths(self, paths):
190 | sum_episode_steps = 0
191 | for path in paths:
192 | sum_episode_steps += path['episode_steps']
193 | path['baselines'] = self.baseline.predict(path)
194 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount))
195 | path["advantages"] = path['returns'] - path['baselines']
196 |
197 | observations = np.concatenate([path["observations"] for path in paths])
198 | actions = np.concatenate([path["actions"] for path in paths])
199 | rewards = np.concatenate([path["rewards"] for path in paths])
200 | advantages = np.concatenate([path["advantages"] for path in paths])
201 | env_infos = np.concatenate([path["env_infos"] for path in paths])
202 | agent_infos = np.concatenate([path["agent_infos"] for path in paths])
203 | if pms.center_adv:
204 | advantages -= advantages.mean()
205 | advantages /= (advantages.std() + 1e-8)
206 |
207 |         # for some unknown reason, it cannot be used
208 | # if pms.positive_adv:
209 | # advantages = (advantages - np.min(advantages)) + 1e-8
210 |
211 | # average_discounted_return = \
212 | # np.mean([path["returns"][0] for path in paths])
213 | #
214 | # undiscounted_returns = [sum(path["rewards"]) for path in paths]
215 |
216 |
217 | # ev = self.explained_variance_1d(
218 | # np.concatenate(baselines),
219 | # np.concatenate(returns)
220 | # )
221 | samples_data = dict(
222 | observations=observations ,
223 | actions=actions ,
224 | rewards=rewards ,
225 | advantages=advantages ,
226 | env_infos=env_infos ,
227 | agent_infos=agent_infos ,
228 | paths=paths ,
229 | sum_episode_steps=sum_episode_steps
230 | )
231 | self.baseline.fit(paths)
232 | return samples_data
233 |
234 | def save_model(self , model_name):
235 | self.saver.save(self.session , "checkpoint/" + model_name + ".ckpt")
236 |
237 | def load_model(self , model_name):
238 | try:
239 | if model_name is not None:
240 | self.saver.restore(self.session , model_name)
241 | else:
242 | self.saver.restore(self.session , tf.train.latest_checkpoint(pms.checkpoint_dir))
243 | except:
244 |             print "failed to load model %s" % model_name
--------------------------------------------------------------------------------
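Both parallel learners compute the natural-gradient step the same way: conjugate gradient solves A s = -g with the Fisher matrix A applied only through fisher_vector_product, shs = 0.5 * s^T A s estimates the KL cost of the raw step, and dividing by lm = sqrt(shs / max_kl) rescales the step so that the quadratic KL estimate equals max_kl. The following NumPy sketch reproduces that arithmetic in isolation; the cg helper and the explicit 2x2 matrix are illustrative, and the repository's own solver is krylov.cg, which is not listed in this section.

import numpy as np


def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # textbook conjugate gradient for A x = b, with A given implicitly via f_Ax
    x = np.zeros_like(b)
    r = b.copy()
    p = b.copy()
    rdotr = r.dot(r)
    for _ in xrange(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x


# toy stand-ins for the damped Fisher matrix A and the policy gradient g
A = np.array([[2.0, 0.3], [0.3, 1.0]])
g = np.array([0.5, -1.2])
max_kl = 0.01

fisher_vector_product = lambda p: A.dot(p)
stepdir = cg(fisher_vector_product, -g)                     # s = A^{-1} (-g)
shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))     # 0.5 * s^T A s
lm = np.sqrt(shs / max_kl)                                  # Lagrange multiplier
fullstep = stepdir / lm
print 0.5 * fullstep.dot(A.dot(fullstep))                   # ~= max_kl by construction

When linear_search is enabled, learn() additionally shrinks fullstep with a backtracking search; the non-search branch applies fullstep directly and then averages the accumulated steps over the sub-sampled batches via accum_fullstep * pms.subsample_factor.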
/agent/agent_continous_single_process.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import threading
3 | import gym
4 | import numpy as np
5 | import random
6 | import tensorflow as tf
7 | import time
8 | import threading
9 | import multiprocessing
10 | import prettytensor as pt
11 |
12 | from storage.storage_continous import Storage
13 | from storage.storage_continous import Rollout
14 | import math
15 | from parameters import pms
16 | import krylov
17 | from logger.logger import Logger
18 | from distribution.diagonal_gaussian import DiagonalGaussian
19 | from baseline.baseline_lstsq import Baseline
20 | from environment import Environment
21 | from network.network_continous import NetworkContinous
22 |
23 | seed = 1
24 | np.random.seed(seed)
25 | tf.set_random_seed(seed)
26 |
27 |
28 | class TRPOAgentContinousSingleProcess(object):
29 |
30 | def __init__(self, thread_id):
31 | print "create worker %d"%(thread_id)
32 | self.thread_id = thread_id
33 | self.env = env = Environment(gym.make(pms.environment_name))
34 | # print("Observation Space", env.observation_space)
35 | # print("Action Space", env.action_space)
36 | # print("Action area, high:%f, low%f" % (env.action_space.high, env.action_space.low))
37 | self.end_count = 0
38 | self.paths = []
39 | self.train = True
40 | self.baseline = Baseline()
41 | self.storage = Storage(self, self.env, self.baseline)
42 | self.distribution = DiagonalGaussian(pms.action_shape)
43 |
44 |         self.session = self.master.session  # note: self.master is never assigned in this class; it must be attached externally before use
45 | self.init_network()
46 |
47 |
48 | def init_network(self):
49 | self.network = NetworkContinous(str(self.thread_id))
50 | if pms.min_std is not None:
51 | log_std_var = tf.maximum(self.network.action_dist_logstds_n, np.log(pms.min_std))
52 | self.action_dist_stds_n = tf.exp(log_std_var)
53 |
54 | self.old_dist_info_vars = dict(mean=self.network.old_dist_means_n, log_std=self.network.old_dist_logstds_n)
55 | self.new_dist_info_vars = dict(mean=self.network.action_dist_means_n, log_std=self.network.action_dist_logstds_n)
56 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.network.action_n, self.new_dist_info_vars)
57 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.network.action_n, self.new_dist_info_vars,
58 | self.old_dist_info_vars)
59 |
60 | surr = -tf.reduce_mean(self.ratio_n * self.network.advant) # Surrogate loss
61 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
62 | ent = self.distribution.entropy(self.old_dist_info_vars)
63 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
64 | self.losses = [surr, kl, ent]
65 | var_list = self.network.var_list
66 | self.gf = GetFlat(self.session, var_list) # get theta from var_list
67 |         self.sff = SetFromFlat(self.session, var_list) # set theta from var_list
68 | # get g
69 | self.pg = flatgrad(surr, var_list)
70 |         # get A (Fisher-vector product)
71 | 
72 |         # KL divergence where the first argument is fixed:
73 |         # the old distribution is replaced by tf.stop_gradient of the new one
74 | kl_firstfixed = kl_sym_gradient(self.network.old_dist_means_n, self.network.old_dist_logstds_n, self.network.action_dist_means_n,
75 | self.network.action_dist_logstds_n)
76 |
77 | grads = tf.gradients(kl, var_list)
78 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
79 | shapes = map(var_shape, var_list)
80 | start = 0
81 | tangents = []
82 | for shape in shapes:
83 | size = np.prod(shape)
84 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
85 | tangents.append(param)
86 | start += size
87 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
88 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
89 | # self.load_model()
90 |
91 | def get_samples(self, path_number):
92 | for i in range(pms.paths_number):
93 | self.storage.get_single_path()
94 |
95 | def get_action(self, obs, *args):
96 | obs = np.expand_dims(obs, 0)
97 | # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0)
98 | if pms.use_std_network:
99 | action_dist_means_n, action_dist_logstds_n = self.session.run(
100 |                 [self.network.action_dist_means_n, self.network.action_dist_logstds_n],
101 |                 {self.network.obs: obs})
102 | if pms.train_flag:
103 | rnd = np.random.normal(size=action_dist_means_n[0].shape)
104 | action = rnd * np.exp(action_dist_logstds_n[0]) + action_dist_means_n[0]
105 | else:
106 | action = action_dist_means_n[0]
107 | # action = np.clip(action, pms.min_a, pms.max_a)
108 | return action, dict(mean=action_dist_means_n[0], log_std=action_dist_logstds_n[0])
109 | else:
110 | action_dist_logstd = np.expand_dims([np.log(pms.std)], 0)
111 | action_dist_means_n = self.network.get_action_dist_means_n(self.session, obs)
112 | if pms.train_flag:
113 | rnd = np.random.normal(size=action_dist_means_n[0].shape)
114 | action = rnd * np.exp(action_dist_logstd[0]) + action_dist_means_n[0]
115 | else:
116 | action = action_dist_means_n[0]
117 | # action = np.clip(action, pms.min_a, pms.max_a)
118 | return action, dict(mean=action_dist_means_n[0], log_std=action_dist_logstd[0])
119 |
120 | def run(self):
121 | self.learn()
122 |
123 | def learn(self):
124 | start_time = time.time()
125 |
126 | numeptotal = 0
127 |         i = 0
128 |         while True:
129 | # Generating paths.
130 | # print("Rollout")
131 | self.get_samples(pms.paths_number)
132 | paths = self.storage.get_paths() # get_paths
133 | # Computing returns and estimating advantage function.
134 | sample_data = self.storage.process_paths(paths)
135 |
136 | agent_infos = sample_data["agent_infos"]
137 | obs_n = sample_data["observations"]
138 | action_n = sample_data["actions"]
139 | advant_n = sample_data["advantages"]
140 | n_samples = len(obs_n)
141 |             inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False)
142 | obs_n = obs_n[inds]
143 | action_n = action_n[inds]
144 | advant_n = advant_n[inds]
145 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
146 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
147 | feed = {self.network.obs: obs_n,
148 | self.network.advant: advant_n,
149 | self.network.old_dist_means_n: action_dist_means_n,
150 | self.network.old_dist_logstds_n: action_dist_logstds_n,
151 | self.network.action_dist_logstds_n: action_dist_logstds_n,
152 | self.network.action_n: action_n
153 | }
154 |
155 | episoderewards = np.array([path["rewards"].sum() for path in paths])
156 | average_episode_std = np.mean(np.exp(action_dist_logstds_n))
157 |
158 | # print "\n********** Iteration %i ************" % i
159 | for iter_num_per_train in range(pms.iter_num_per_train):
160 | # if not self.train:
161 | # print("Episode mean: %f" % episoderewards.mean())
162 | # self.end_count += 1
163 | # if self.end_count > 100:
164 | # break
165 | if self.train:
166 | thprev = self.gf() # get theta_old
167 |
168 | def fisher_vector_product(p):
169 | feed[self.flat_tangent] = p
170 | return self.session.run(self.fvp, feed) + pms.cg_damping * p
171 |
172 | g = self.session.run(self.pg, feed_dict=feed)
173 |                     stepdir = krylov.cg(fisher_vector_product, -g, cg_iters=pms.cg_iters)
174 |                     shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # 0.5 * s^T A s
175 | fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
176 | neggdotstepdir = -g.dot(stepdir)
177 |
178 | def loss(th):
179 | self.sff(th)
180 | return self.session.run(self.losses, feed_dict=feed)
181 |
182 | surr_prev, kl_prev, ent_prev = loss(thprev)
183 | mean_advant = np.mean(advant_n)
184 | theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
185 | self.sff(theta)
186 | surrafter, kloldnew, entnew = self.session.run(self.losses, feed_dict=feed)
187 | stats = {}
188 | numeptotal += len(episoderewards)
189 | stats["average_episode_std"] = average_episode_std
190 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
191 | stats["Total number of episodes"] = numeptotal
192 | stats["Average sum of rewards per episode"] = episoderewards.mean()
193 | # stats["Entropy"] = entropy
194 | # exp = explained_variance(np.array(baseline_n), np.array(returns_n))
195 | # stats["Baseline explained"] = exp
196 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
197 | stats["KL between old and new distribution"] = kloldnew
198 | stats["Surrogate loss"] = surrafter
199 | stats["Surrogate loss prev"] = surr_prev
200 | stats["entropy"] = ent_prev
201 | stats["mean_advant"] = mean_advant
202 | log_data = [average_episode_std, len(episoderewards), numeptotal, episoderewards.mean(), kloldnew, surrafter, surr_prev,
203 | surrafter - surr_prev,
204 | ent_prev, mean_advant]
205 | self.master.logger.log_row(log_data)
206 | # for k, v in stats.iteritems():
207 | # print(k + ": " + " " * (40 - len(k)) + str(v))
208 | # # if entropy != entropy:
209 | # # exit(-1)
210 | # # if exp > 0.95:
211 | # # self.train = False
212 | if self.thread_id==1:
213 | self.master.save_model("iter" + str(i))
214 | print episoderewards.mean()
215 | i += 1
216 |
217 | def test(self, model_name):
218 | self.load_model(model_name)
219 | for i in range(50):
220 | self.storage.get_single_path()
221 |
222 | def save_model(self, model_name):
223 | self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt")
224 |
225 | def load_model(self, model_name):
226 | try:
227 | self.saver.restore(self.session, model_name)
228 | except:
229 |             print "failed to load model %s" % model_name
230 |
--------------------------------------------------------------------------------
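The linesearch helper imported from utils is not listed in this section. From the way it is called above, with a loss callback returning [surr, kl, ent], a full natural-gradient step, and the expected-improvement rate neggdotstepdir, it behaves like a standard backtracking line search. The sketch below is written under that assumption; linesearch_sketch and the toy quadratic loss are illustrative names, not repository code.

import numpy as np


def linesearch_sketch(loss, theta_old, fullstep, expected_improve_rate,
                      max_backtracks=10, accept_ratio=0.1):
    # shrink the step by powers of two until the actual drop in the surrogate
    # loss is a reasonable fraction of the first-order prediction
    fval = loss(theta_old)[0]                 # loss() returns [surr, kl, ent]
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        theta_new = theta_old + stepfrac * fullstep
        newfval = loss(theta_new)[0]
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return theta_new
    return theta_old                          # no acceptable step: keep old parameters


if __name__ == "__main__":
    # toy quadratic surrogate loss, minimised at target
    target = np.array([1.0, -2.0])
    loss = lambda th: [np.sum((th - target) ** 2), 0.0, 0.0]
    theta0 = np.zeros(2)
    fullstep = np.array([2.0, -4.0])          # deliberately overshoots the minimum
    expected_rate = 20.0                      # -g . fullstep at theta0 for this loss
    print linesearch_sketch(loss, theta0, fullstep, expected_rate)

Rejecting steps whose actual improvement falls far short of the linear prediction keeps an over-optimistic conjugate-gradient step from pushing the policy outside the trust region; the parallel agents pass neggdotstepdir / lm for the same purpose.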
/agent/agent_continous_image_parallel_image.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 | import numpy as np
3 | import tensorflow as tf
4 | from network.network_continous_image import NetworkContinousImage
5 | from parameters import pms
6 |
7 | import multiprocessing
8 | import krylov
9 | from baseline.baseline_zeros import Baseline
10 | from distribution.diagonal_gaussian import DiagonalGaussian
11 | import time
12 | import math
13 | from logger.logger import Logger
14 |
15 | seed = 1
16 | np.random.seed(seed)
17 | tf.set_random_seed(seed)
18 |
19 |
20 | """
21 | class for continuous action space with image observations, multi-process version
22 | """
23 | class TRPOAgentParallelImage(multiprocessing.Process):
24 |
25 |
26 | def __init__(self , observation_space , action_space , task_q , result_q):
27 | multiprocessing.Process.__init__(self)
28 | self.task_q = task_q
29 | self.result_q = result_q
30 | self.observation_space = observation_space
31 | self.action_space = action_space
32 | self.args = pms
33 | self.baseline = Baseline()
34 | self.distribution = DiagonalGaussian(pms.action_shape)
35 | self.init_logger()
36 |
37 | def init_network(self):
38 | """
39 | [input]
40 | self.obs
41 | self.action_n
42 | self.advant
43 | self.old_dist_means_n
44 | self.old_dist_logstds_n
45 | [output]
46 | self.action_dist_means_n
47 | self.action_dist_logstds_n
48 | var_list
49 | """
50 | config = tf.ConfigProto(
51 | device_count={'GPU': 0}
52 | )
53 | self.session = tf.Session(config=config)
54 | self.net = NetworkContinousImage("network_continous_image")
55 | if pms.min_std is not None:
56 | log_std_var = tf.maximum(self.net.action_dist_logstds_n, np.log(pms.min_std))
57 | self.action_dist_stds_n = tf.exp(log_std_var)
58 | self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n, log_std=self.net.old_dist_logstds_n)
59 | self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n, log_std=self.net.action_dist_logstds_n)
60 | self.likehood_action_dist = self.distribution.log_likelihood_sym(self.net.action_n, self.new_dist_info_vars)
61 | self.ratio_n = self.distribution.likelihood_ratio_sym(self.net.action_n, self.new_dist_info_vars,
62 | self.old_dist_info_vars)
63 | surr = -tf.reduce_sum(self.ratio_n * self.net.advant) # Surrogate loss
64 | batch_size = tf.shape(self.net.obs)[0]
65 | batch_size_float = tf.cast(batch_size , tf.float32)
66 | kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars, self.new_dist_info_vars))
67 | ent = self.distribution.entropy(self.old_dist_info_vars)
68 | # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
69 | self.losses = [surr, kl, ent]
70 | var_list = self.net.var_list
71 |
72 | self.gf = GetFlat(var_list) # get theta from var_list
73 | self.gf.session = self.session
74 |         self.sff = SetFromFlat(var_list) # set theta from var_list
75 | self.sff.session = self.session
76 | # get g
77 | self.pg = flatgrad(surr, var_list)
78 |         # get A (Fisher-vector product)
79 |         # KL divergence where the first argument is fixed:
80 |         # the old distribution is replaced by tf.stop_gradient of the new one
81 | kl_firstfixed = self.distribution.kl_sym_firstfixed(self.new_dist_info_vars) / batch_size_float
82 | grads = tf.gradients(kl_firstfixed, var_list)
83 | self.flat_tangent = tf.placeholder(dtype, shape=[None])
84 | shapes = map(var_shape, var_list)
85 | start = 0
86 | tangents = []
87 | for shape in shapes:
88 | size = np.prod(shape)
89 | param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
90 | tangents.append(param)
91 | start += size
92 | self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
93 | self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list) # get kl''*p
94 | self.session.run(tf.initialize_all_variables())
95 | self.saver = tf.train.Saver(max_to_keep=5)
96 |
97 | def init_logger(self):
98 | head = ["factor", "rewards", "std"]
99 | self.logger = Logger(head)
100 |
101 | def run(self):
102 | self.init_network()
103 | while True:
104 | paths = self.task_q.get()
105 | if paths is None:
106 | # kill the learner
107 | self.task_q.task_done()
108 | break
109 | elif paths == 1:
110 | # just get params, no learn
111 | self.task_q.task_done()
112 | self.result_q.put(self.gf())
113 | elif paths[0] == 2:
114 | # adjusting the max KL.
115 | self.args.max_kl = paths[1]
116 | if paths[2] == 1:
117 | print "saving checkpoint..."
118 | self.save_model(pms.environment_name + "-" + str(paths[3]))
119 | self.task_q.task_done()
120 | else:
121 | stats , theta, thprev = self.learn(paths)
122 | self.sff(theta)
123 | self.task_q.task_done()
124 | self.result_q.put((stats, theta, thprev))
125 | return
126 |
127 | def learn(self, paths, parallel=False, linear_search=False):
128 | start_time = time.time()
129 | sample_data = self.process_paths(paths)
130 | agent_infos = sample_data["agent_infos"]
131 | obs_all = sample_data["observations"]
132 | action_all = sample_data["actions"]
133 | advant_all = sample_data["advantages"]
134 | n_samples = len(obs_all)
135 | batch = int(1/pms.subsample_factor)
136 | batch_size = int(math.floor(n_samples * pms.subsample_factor))
137 | accum_fullstep = 0.0
138 | for iteration in range(batch):
139 | print "batch: %d, batch_size: %d"%(iteration+1, batch_size)
140 | inds = np.random.choice(n_samples , batch_size , replace=False)
141 | obs_n = obs_all[inds]
142 | action_n = action_all[inds]
143 | advant_n = advant_all[inds]
144 | action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
145 | action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
146 | feed = {self.net.obs: obs_n ,
147 | self.net.advant: advant_n ,
148 | self.net.old_dist_means_n: action_dist_means_n ,
149 | self.net.old_dist_logstds_n: action_dist_logstds_n ,
150 | self.net.action_n: action_n
151 | }
152 |
153 | episoderewards = np.array([path["rewards"].sum() for path in paths])
154 | thprev = self.gf() # get theta_old
155 |
156 | def fisher_vector_product(p):
157 | feed[self.flat_tangent] = p
158 | return self.session.run(self.fvp , feed) + pms.cg_damping * p
159 |
160 | g = self.session.run(self.pg , feed_dict=feed)
161 | stepdir = krylov.cg(fisher_vector_product , -g , cg_iters=pms.cg_iters)
162 |             shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir)) # 0.5 * s^T A s
163 |             # if shs < 0 (numerical error), the sqrt below produces NaN
164 | lm = np.sqrt(shs / pms.max_kl)
165 | fullstep = stepdir / lm
166 | neggdotstepdir = -g.dot(stepdir)
167 |
168 | def loss(th):
169 | self.sff(th)
170 | return self.session.run(self.losses , feed_dict=feed)
171 |
172 | if parallel is True:
173 | theta = linesearch_parallel(loss , thprev , fullstep , neggdotstepdir / lm)
174 | else:
175 | if linear_search:
176 | theta = linesearch(loss , thprev , fullstep , neggdotstepdir / lm)
177 | else:
178 | theta = thprev + fullstep
179 | if math.isnan(theta.mean()):
180 |                     print "theta contains NaN (shs=%s), keeping previous parameters" % str(shs)
181 | theta = thprev
182 | accum_fullstep += (theta - thprev)
183 | theta = thprev + accum_fullstep * pms.subsample_factor
184 | stats = {}
185 | stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
186 | stats["Average sum of rewards per episode"] = episoderewards.mean()
187 | stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
188 | self.logger.log_row([pms.subsample_factor, stats["Average sum of rewards per episode"], self.session.run(self.net.action_dist_logstd_param)[0][0]])
189 | return stats , theta , thprev
190 |
191 | def process_paths(self, paths):
192 | sum_episode_steps = 0
193 | for path in paths:
194 | sum_episode_steps += path['episode_steps']
195 | # r_t+V(S_{t+1})-V(S_t) = returns-baseline
196 | # path_baselines = np.append(self.baseline.predict(path) , 0)
197 | # # r_t+V(S_{t+1})-V(S_t) = returns-baseline
198 | # path["advantages"] = np.concatenate(path["rewards"]) + \
199 | # pms.discount * path_baselines[1:] - \
200 | # path_baselines[:-1]
201 | # path["returns"] = np.concatenate(discount(path["rewards"], pms.discount))
202 | path_baselines = np.append(self.baseline.predict(path) , 0)
203 | deltas = np.concatenate(path["rewards"]) + \
204 | pms.discount * path_baselines[1:] - \
205 | path_baselines[:-1]
206 | path["advantages"] = discount(
207 | deltas , pms.discount * pms.gae_lambda)
208 | path["returns"] = np.concatenate(discount(path["rewards"] , pms.discount))
209 | observations = np.concatenate([path["observations"] for path in paths])
210 | actions = np.concatenate([path["actions"] for path in paths])
211 | rewards = np.concatenate([path["rewards"] for path in paths])
212 | advantages = np.concatenate([path["advantages"] for path in paths])
213 | env_infos = np.concatenate([path["env_infos"] for path in paths])
214 | agent_infos = np.concatenate([path["agent_infos"] for path in paths])
215 | if pms.center_adv:
216 | advantages -= np.mean(advantages)
217 | advantages /= (advantages.std() + 1e-8)
218 |
219 |         # for some unknown reason, it cannot be used
220 | # if pms.positive_adv:
221 | # advantages = (advantages - np.min(advantages)) + 1e-8
222 |
223 | # average_discounted_return = \
224 | # np.mean([path["returns"][0] for path in paths])
225 | #
226 | # undiscounted_returns = [sum(path["rewards"]) for path in paths]
227 |
228 |
229 | # ev = self.explained_variance_1d(
230 | # np.concatenate(baselines),
231 | # np.concatenate(returns)
232 | # )
233 | samples_data = dict(
234 | observations=observations ,
235 | actions=actions ,
236 | rewards=rewards ,
237 | advantages=advantages ,
238 | env_infos=env_infos ,
239 | agent_infos=agent_infos ,
240 | paths=paths ,
241 | sum_episode_steps=sum_episode_steps
242 | )
243 | self.baseline.fit(paths)
244 | return samples_data
245 |
246 | def save_model(self , model_name):
247 | self.saver.save(self.session , "checkpoint/" + model_name + ".ckpt")
248 |
249 | def load_model(self , model_name):
250 | try:
251 | if model_name is not None:
252 | self.saver.restore(self.session , model_name)
253 | else:
254 | self.saver.restore(self.session , tf.train.latest_checkpoint(pms.checkpoint_dir))
255 | except:
256 |             print "failed to load model %s" % model_name
--------------------------------------------------------------------------------
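The image variant of process_paths estimates advantages with GAE rather than the plain returns-minus-baseline used in agent_continous_parallel_storage.py: it pads each path's baseline with a terminal zero, forms the TD residuals, and discounts those residuals with gamma * lambda. The discount helper comes from utils and is not listed in this section; the sketch below uses an equivalent implementation, and the toy rewards and baseline values are made up for illustration.

import numpy as np


def discount(x, gamma):
    # discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1]
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(xrange(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y


# toy path: per-step rewards and baseline predictions V(s_t)
rewards = np.array([1.0, 0.0, 0.0, 1.0])
baselines = np.array([0.5, 0.4, 0.3, 0.2])
gamma, gae_lambda = 0.99, 0.97

# as in process_paths: pad V with a terminal 0, build TD residuals,
# then discount the residuals with gamma * lambda to get the advantages
path_baselines = np.append(baselines, 0)
deltas = rewards + gamma * path_baselines[1:] - path_baselines[:-1]
advantages = discount(deltas, gamma * gae_lambda)
returns = discount(rewards, gamma)
print advantages
print returns

With Baseline imported from baseline.baseline_zeros, the predicted values are all zero, so the deltas reduce to the raw rewards and the advantages are simply the rewards discounted with the shorter gamma * gae_lambda horizon.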