├── pictures ├── sac1.png ├── dsac1.png ├── ddrlframework.jpg └── rapid-architecture@2x--1-.png ├── tutorial ├── Pictures │ ├── dsac1w-sac.png │ └── ddrlframework.jpg ├── Parallelize your algorithm by Ray (1).md ├── Parallelize your algorithm by Ray (3).md └── Parallelize your algorithm by Ray (2).md ├── algos ├── test_scripts │ ├── Testing.py │ ├── Testing1.py │ └── dense_bn.py ├── dqn │ ├── core.py │ ├── hyperparams.py │ ├── actor_learner.py │ └── train.py ├── sac1 │ ├── render_test.py │ ├── hyperparams.py │ ├── core.py │ ├── sac1.py │ ├── actor_learner.py │ └── sac_ray.py ├── sqn │ ├── hyperparams.py │ ├── core.py │ ├── actor_learner.py │ └── train.py └── trading_env.py ├── README.md └── example ├── core.py ├── model.py ├── dsac.py └── sac.py /pictures/sac1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/sac1.png -------------------------------------------------------------------------------- /pictures/dsac1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/dsac1.png -------------------------------------------------------------------------------- /pictures/ddrlframework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/ddrlframework.jpg -------------------------------------------------------------------------------- /tutorial/Pictures/dsac1w-sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/tutorial/Pictures/dsac1w-sac.png -------------------------------------------------------------------------------- /tutorial/Pictures/ddrlframework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/tutorial/Pictures/ddrlframework.jpg -------------------------------------------------------------------------------- /pictures/rapid-architecture@2x--1-.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/rapid-architecture@2x--1-.png -------------------------------------------------------------------------------- /algos/test_scripts/Testing.py: -------------------------------------------------------------------------------- 1 | 2 | import ray 3 | import time 4 | 5 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 6 | 7 | y=111 8 | y_id = ray.put(y) 9 | 10 | 11 | @ray.remote 12 | class Cat: 13 | def __init__(self): 14 | self.cnt = 0 15 | global y_id 16 | y_id = ray.put(2) 17 | def incre(self): 18 | print('done.') 19 | time.sleep(1) 20 | self.cnt += ray.get(y_id) 21 | def get_cnt(self): 22 | return self.cnt 23 | 24 | cat = Cat.remote() 25 | 26 | 27 | 28 | class Dog: 29 | def __init__(self): 30 | self.cnt = 0 31 | global y_id 32 | y_id = ray.put(2) 33 | def incre(self): 34 | print('done.') 35 | time.sleep(1) 36 | self.cnt += ray.get(y_id) 37 | def get_cnt(self): 38 | return self.cnt 39 | 40 | dog = Dog() 41 | 42 | 43 | @ray.remote 44 | def remote_cat(cls1): 45 | cls1.incre.remote() # self.cnt will increase 46 | return 1 # cls1.get_cnt.remote() 47 | 48 | @ray.remote 49 | def remote_dog(cls1): 50 | cls1.incre() # self.cnt will not increase 51 | 
return 1 # cls1.get_cnt.remote() 52 | 53 | 54 | 55 | result_id = [remote_dog.remote(dog) for _ in range(5)] 56 | 57 | result = ray.get(result_id) 58 | 59 | print(result) 60 | 61 | time.sleep(5) 62 | # print(ray.get(cat.get_cnt.remote())) 63 | print(dog.get_cnt()) -------------------------------------------------------------------------------- /algos/dqn/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def gaussian_likelihood(x, mu, log_std): 31 | pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) 32 | return tf.reduce_sum(pre_sum, axis=1) 33 | 34 | 35 | """ 36 | Actor-Critics 37 | """ 38 | 39 | 40 | def q_function(x, x2, hidden_sizes, act_dim, activation=tf.nn.relu, output_activation=None): 41 | 42 | vf_mlp = lambda x: mlp(x, list(hidden_sizes) + [act_dim], activation, None) 43 | # Q 44 | q_tp = tf.make_template('q1', vf_mlp, create_scope_now_=True) 45 | 46 | q = q_tp(x) 47 | q_x2 = q_tp(x2) 48 | 49 | return q, q_x2 50 | -------------------------------------------------------------------------------- /algos/test_scripts/Testing1.py: -------------------------------------------------------------------------------- 1 | 2 | import ray, os, time 3 | 4 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 5 | 6 | print('main.pid:', os.getpid()) 7 | 8 | @ray.remote 9 | def f(x): 10 | print('f.pid:', os.getpid()) 11 | return x 12 | 13 | @ray.remote 14 | class Foo(): 15 | def __init__(self, f): 16 | self.x = ray.get(f.remote(100)) 17 | 18 | # @ray.remote # AttributeError: 'ActorHandle' object has no attribute 'bar' 19 | def bar(self): 20 | print('bar.pid:', os.getpid()) 21 | return 1 22 | 23 | foo = Foo.remote(f) 24 | 25 | obj_id1 = foo.bar.remote() 26 | 27 | print(ray.get(obj_id1)) 28 | 29 | 30 | ''' outputs: 31 | main.pid: 9521 32 | 1 33 | (pid=9593) bar.pid: 9593 34 | (pid=9602) f.pid: 9602 35 | ''' 36 | 37 | 38 | 39 | @ray.remote 40 | class Counter(object): 41 | def __init__(self): 42 | self.counter = 0 43 | 44 | def inc(self): 45 | self.counter += 1 46 | 47 | def get_counter(self): 48 | return self.counter 49 | 50 | @ray.remote 51 | def g(counter): 52 | print('g.pid:', os.getpid()) 53 | for _ in range(1000): 54 | time.sleep(0.1) 55 | counter.inc.remote() 56 | 57 | counter = Counter.remote() 58 | 59 | # Start some tasks that use the actor. 60 | [g.remote(counter) for _ in range(3)] 61 | 62 | # Print the counter value. 
63 | for _ in range(10): 64 | time.sleep(1) 65 | print(ray.get(counter.get_counter.remote())) -------------------------------------------------------------------------------- /algos/sac1/render_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | import ray 5 | import gym 6 | 7 | from hyperparams import HyperParameters 8 | from actor_learner import Actor, Learner 9 | 10 | import os 11 | import pickle 12 | import multiprocessing 13 | import copy 14 | import signal 15 | 16 | 17 | flags = tf.app.flags 18 | FLAGS = tf.app.flags.FLAGS 19 | 20 | # "Pendulum-v0" 'BipedalWalker-v2' 'LunarLanderContinuous-v2' 21 | flags.DEFINE_string("env_name", "BipedalWalkerHardcore-v2", "game env") 22 | flags.DEFINE_integer("total_epochs", 500, "total_epochs") 23 | flags.DEFINE_integer("num_workers", 1, "number of workers") 24 | flags.DEFINE_integer("num_learners", 1, "number of learners") 25 | flags.DEFINE_string("is_restore", "False", "True or False. True means restore weights from pickle file.") 26 | flags.DEFINE_float("a_l_ratio", 10, "steps / sample_times") 27 | 28 | opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers, FLAGS.a_l_ratio) 29 | 30 | agent = Actor(opt, job="main") 31 | keys, weights = agent.get_weights() 32 | pickle_in = open("weights.pickle", "rb") 33 | weights = pickle.load(pickle_in) 34 | 35 | 36 | weights = [weights[key] for key in keys] 37 | 38 | agent.set_weights(keys, weights) 39 | 40 | test_env = gym.make(opt.env_name) 41 | 42 | n = 2 43 | 44 | rew = [] 45 | for j in range(n): 46 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 47 | while not (d or (ep_len == opt.max_ep_len)): 48 | # Take deterministic actions at test time 49 | test_env.render() 50 | action = agent.get_action(o, True) 51 | print(action) 52 | o, r, d, _ = test_env.step(action) 53 | ep_ret += r 54 | ep_len += 1 55 | rew.append(ep_ret) 56 | print("test_reward:", sum(rew)/n) 57 | -------------------------------------------------------------------------------- /algos/dqn/hyperparams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from gym.spaces import Box 5 | import datetime 6 | import gym 7 | from math import ceil 8 | 9 | 10 | class HyperParameters: 11 | def __init__(self, env, env_name, exp_name, num_nodes, num_workers, a_l_ratio, weights_file): 12 | # parameters set 13 | 14 | self.exp_name = exp_name 15 | self.env_name = env_name 16 | 17 | self.model = "mlp" 18 | assert self.model in ["mlp", "cnn"], "model must be mlp or cnn!" 
19 | 20 | self.num_nodes = num_nodes 21 | self.num_workers = num_workers 22 | self.num_learners = 1 23 | 24 | self.push_freq = 100 25 | 26 | self.gamma = 0.99 27 | 28 | self.a_l_ratio = a_l_ratio 29 | self.weights_file = weights_file 30 | 31 | self.recover = False 32 | self.checkpoint_freq = 21600 # 21600s = 6h 33 | 34 | # gpu memory fraction 35 | self.gpu_fraction = 0.3 36 | 37 | self.hidden_size = [400, 300] 38 | 39 | self.obs_dim = env.observation_space.shape[0] 40 | self.obs_space = env.observation_space 41 | self.obs_shape = self.obs_space.shape 42 | 43 | self.act_dim = env.action_space.n 44 | self.act_space = env.action_space 45 | self.act_shape = self.act_space.shape 46 | 47 | # self.num_buffers = 1 48 | self.buffer_size = int(1e6) 49 | self.num_buffers = self.num_workers // 25 + 1 50 | self.buffer_size = self.buffer_size // self.num_buffers 51 | 52 | self.start_steps = int(1e4) // self.num_buffers 53 | 54 | if self.weights_file: 55 | self.start_steps = self.buffer_size 56 | 57 | self.lr = 1e-3 58 | self.polyak = 0.995 59 | 60 | self.batch_size = 128 61 | 62 | # n-step 63 | self.Ln = 1 64 | 65 | self.save_freq = 1 66 | 67 | self.seed = 0 68 | 69 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 70 | 71 | self.summary_dir = ROOT_DIR + '/tboard_ray' # Directory for storing tensorboard summary results 72 | self.save_dir = ROOT_DIR + '/' + self.exp_name # Directory for storing trained model 73 | self.save_interval = int(5e5) 74 | 75 | self.log_dir = self.summary_dir + "/" + str(datetime.datetime.now()) + "-workers_num:" + \ 76 | str(self.num_workers) + "%" + str(self.a_l_ratio) + self.env_name + "-" + self.exp_name 77 | -------------------------------------------------------------------------------- /algos/sqn/hyperparams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from gym.spaces import Box 5 | import datetime 6 | import gym 7 | from math import ceil 8 | 9 | 10 | class HyperParameters: 11 | def __init__(self, env, env_name, exp_name, num_nodes, num_workers, a_l_ratio, weights_file): 12 | # parameters set 13 | 14 | self.exp_name = exp_name 15 | self.env_name = env_name 16 | 17 | self.model = "mlp" 18 | assert self.model in ["mlp", "cnn"], "model must be mlp or cnn!" 
19 | 20 | self.num_nodes = num_nodes 21 | self.num_workers = num_workers 22 | self.num_learners = 1 23 | 24 | self.push_freq = 100 25 | 26 | # alpha need > 0 27 | self.alpha = 0.1 28 | self.gamma = 0.99 29 | 30 | self.a_l_ratio = a_l_ratio 31 | self.weights_file = weights_file 32 | 33 | self.recover = False 34 | self.checkpoint_freq = 21600 # 21600s = 6h 35 | 36 | # gpu memory fraction 37 | self.gpu_fraction = 0.3 38 | 39 | self.hidden_size = [400, 300] 40 | 41 | self.obs_dim = env.observation_space.shape[0] 42 | self.obs_space = env.observation_space 43 | self.obs_shape = self.obs_space.shape 44 | 45 | self.act_dim = env.action_space.n 46 | self.act_space = env.action_space 47 | self.act_shape = self.act_space.shape 48 | 49 | # self.num_buffers = 1 50 | self.buffer_size = int(1e6) 51 | self.num_buffers = self.num_workers // 25 + 1 52 | self.buffer_size = self.buffer_size // self.num_buffers 53 | 54 | self.start_steps = int(1e4) // self.num_buffers 55 | 56 | if self.weights_file: 57 | self.start_steps = self.buffer_size 58 | 59 | self.lr = 1e-3 60 | self.polyak = 0.995 61 | 62 | self.batch_size = 128 63 | 64 | # n-step 65 | self.Ln = 1 66 | 67 | self.save_freq = 1 68 | 69 | self.seed = 0 70 | 71 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 72 | 73 | self.summary_dir = ROOT_DIR + '/tboard_ray' # Directory for storing tensorboard summary results 74 | self.save_dir = ROOT_DIR + '/' + self.exp_name # Directory for storing trained model 75 | self.save_interval = int(5e5) 76 | 77 | self.log_dir = self.summary_dir + "/" + str(datetime.datetime.now()) + "-workers_num:" + \ 78 | str(self.num_workers) + "%" + str(self.a_l_ratio) + self.env_name + "-" + self.exp_name 79 | -------------------------------------------------------------------------------- /algos/sqn/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def softmax_policy(alpha, q, act_dim): 31 | 32 | pi_log = tf.nn.log_softmax(q/alpha, axis=1) 33 | mu = tf.argmax(pi_log, axis=1) 34 | 35 | # tf.random.multinomial( logits, num_samples, seed=None, name=None, output_dtype=None ) 36 | # logits: 2-D Tensor with shape [batch_size, num_classes]. Each slice [i, :] represents the unnormalized log-probabilities for all classes. 37 | # num_samples: 0-D. Number of independent samples to draw for each row slice. 
38 | pi = tf.squeeze(tf.random.multinomial(pi_log, 1), axis=1) 39 | 40 | # logp_pi = tf.reduce_sum(tf.one_hot(mu, depth=act_dim) * pi_log, axis=1) # use max Q(s,a) 41 | # logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * pi_log, axis=1) 42 | logp_pi = tf.reduce_sum(tf.exp(pi_log)*pi_log, axis=1) # exact entropy 43 | 44 | return mu, pi, logp_pi 45 | 46 | 47 | """ 48 | Actor-Critics 49 | """ 50 | 51 | 52 | def q_function(x, x2, alpha, hidden_sizes, act_dim, activation=tf.nn.relu, 53 | output_activation=None, policy=softmax_policy, action_space=None): 54 | 55 | vf_mlp = lambda x: mlp(x, list(hidden_sizes) + [act_dim], activation, None) 56 | 57 | # Q1 58 | q1_tp = tf.make_template('q1', vf_mlp, create_scope_now_=True) 59 | 60 | q1 = q1_tp(x) 61 | 62 | # policy 63 | mu, pi, entropy = policy(alpha, q1, act_dim) 64 | q1_mu = tf.reduce_sum(q1 * tf.one_hot(mu, depth=act_dim), axis=1) 65 | 66 | q1_x2 = q1_tp(x2) 67 | 68 | # policy 69 | mu_x2, pi_x2, entropy_x2 = policy(alpha, q1_x2, act_dim) 70 | 71 | # Q2 72 | q2_tp = tf.make_template('q2', vf_mlp, create_scope_now_=True) 73 | q2 = q2_tp(x) 74 | 75 | # policy 76 | mu2, pi2, entropy2 = policy(alpha, q2, act_dim) 77 | q2_mu = tf.reduce_sum(q2 * tf.one_hot(mu2, depth=act_dim), axis=1) 78 | 79 | return mu, pi, entropy_x2, q1, q2, q1_mu, q2_mu 80 | -------------------------------------------------------------------------------- /algos/test_scripts/dense_bn.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | config = tf.ConfigProto() 8 | config.gpu_options.allow_growth = True 9 | session = tf.Session(config=config) 10 | 11 | 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | 14 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 15 | 16 | 17 | regularizer_l2 = tf.contrib.layers.l2_regularizer(0.00) 18 | 19 | def dense(x, size, scope): 20 | return tf.layers.dense(x, size, activation=None, kernel_regularizer=regularizer_l2, bias_regularizer=regularizer_l2,) 21 | 22 | def dense_batch_relu(x, phase, scope): 23 | with tf.variable_scope(scope): 24 | h1 = tf.layers.dense(x, 100, activation=None, kernel_regularizer=regularizer_l2, bias_regularizer=regularizer_l2,) 25 | h2 = tf.contrib.layers.batch_norm(h1, 26 | center=True, scale=True, 27 | is_training=phase, fused=False, 28 | scope='bn') 29 | return tf.nn.relu(h2, 'relu') 30 | 31 | 32 | tf.reset_default_graph() 33 | x = tf.placeholder('float32', (None, 784), name='x') 34 | y = tf.placeholder('float32', (None, 10), name='y') 35 | phase = tf.placeholder(tf.bool, name='phase') 36 | 37 | h1 = dense_batch_relu(x, phase,'layer1') 38 | h2 = dense_batch_relu(h1, phase, 'layer2') 39 | logits = dense(h2, 10, 'logits') 40 | 41 | with tf.name_scope('accuracy'): 42 | accuracy = tf.reduce_mean(tf.cast( 43 | tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), 44 | 'float32')) 45 | 46 | with tf.name_scope('loss'): 47 | loss = tf.reduce_mean( 48 | tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits,)) 49 | 50 | 51 | def train(): 52 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 53 | with tf.control_dependencies(update_ops): 54 | # Ensures that we execute the update_ops before performing the train_step 55 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss) 56 | sess = tf.Session() 57 | sess.run(tf.global_variables_initializer()) 58 | 59 | history = [] 60 | iterep = 500 61 | for i in range(iterep * 30): 62 | x_train, y_train = 
mnist.train.next_batch(100) 63 | sess.run(train_step, 64 | feed_dict={'x:0': x_train, 65 | 'y:0': y_train, 66 | 'phase:0': 1}) 67 | if (i + 1) % iterep == 0: 68 | epoch = (i + 1) / iterep 69 | tr = sess.run([loss, accuracy], 70 | feed_dict={'x:0': mnist.train.images, 71 | 'y:0': mnist.train.labels, 72 | 'phase:0': 1}) 73 | t = sess.run([loss, accuracy], 74 | feed_dict={'x:0': mnist.test.images, 75 | 'y:0': mnist.test.labels, 76 | 'phase:0': 0}) 77 | history += [[epoch] + tr + t] 78 | print(history[-1]) 79 | return history 80 | 81 | 82 | if __name__=="__main__": 83 | train() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed-DRL 2 | Distributed Deep Reinforcement Learning 3 | 4 | This framework is inspired by **Rapid**, the general-purpose RL training system from OpenAI. 5 | 6 | Rapid framework: 7 | ![rapid-architecture@2x--1-](./pictures/rapid-architecture@2x--1-.png) 8 | Our framework: 9 | ![ddrlframework](./pictures/ddrlframework.jpg) 10 | 11 | --- 12 | 13 | ### Tutorial 14 | 15 | - [Parallelize your algorithm by Ray (1)](tutorial/Parallelize%20your%20algorithm%20by%20Ray%20(1).md) 16 | - [Parallelize your algorithm by Ray (2)](tutorial/Parallelize%20your%20algorithm%20by%20Ray%20(2).md) 17 | - [Parallelize your algorithm by Ray (3)](tutorial/Parallelize%20your%20algorithm%20by%20Ray%20(3).md) 18 | 19 | --- 20 | 21 | This framework divides the reinforcement learning process into five parts: 22 | 23 | - Replay buffer (optional) 24 | - Parameter server 25 | - train (learn) 26 | - rollout 27 | - test 28 | 29 | ```python 30 | @ray.remote 31 | class ReplayBuffer: 32 | ... 33 | # replay buffer 34 | 35 | @ray.remote 36 | class ParameterServer(object): 37 | ... 38 | # keep the newest network weights here 39 | # could pull and push the weights 40 | # also could save the weights to local 41 | 42 | @ray.remote(num_gpus=1, max_calls=1) 43 | def worker_train(ps, replay_buffer, opt, learner_index): 44 | ... 45 | # build a learner network 46 | # pull weights from ps 47 | # for loop: 48 | # get a sample batch from the replay buffer 49 | # update network and push new weights to ps 50 | 51 | @ray.remote 52 | def worker_rollout(ps, replay_buffer, opt, worker_index): 53 | ... 54 | # build a rollout network 55 | # pull weights from ps 56 | # for loop: 57 | # interact with the environment 58 | # store experience in the replay buffer 59 | # if end of episode: 60 | # pull weights from ps 61 | 62 | @ray.remote 63 | def worker_test(ps, replay_buffer, opt, worker_index=0): 64 | ... 65 | # build a test network, usually the same as the rollout network 66 | # while: 67 | # pull weights from ps 68 | # do test 69 | # might save model here 70 | 71 | if __name__ == '__main__': 72 | 73 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 74 | 75 | opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers) 76 | 77 | # create the parameter server 78 | if FLAGS.is_restore == "True": 79 | ps = ParameterServer.remote([], [], is_restore=True) 80 | else: 81 | net = Learner(opt, job="main") 82 | all_keys, all_values = net.get_weights() 83 | ps = ParameterServer.remote(all_keys, all_values) 84 | 85 | # create replay buffer 86 | replay_buffer = ReplayBuffer.remote(obs_dim=opt.obs_dim, act_dim=opt.act_dim, size=opt.replay_size) 87 | 88 | # Start some rollout tasks.
89 | task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 90 | 91 | time.sleep(5) 92 | 93 | # start training tasks 94 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_learners)] 95 | 96 | # start testing 97 | task_test = worker_test.remote(ps, replay_buffer, opt) 98 | 99 | # wait util task test end 100 | # Keep the main process running. Otherwise everything will shut down when main process finished. 101 | ray.wait([task_test, ]) 102 | ``` 103 | 104 | 105 | 106 | ### Result: 107 | 108 | Env: LunarLanderContinuous-v2 109 | GPU:GTX1060 x1 110 | 111 | **SAC1 without distribution:** gets 200+ in 1200s 112 | ![sac1](./pictures/sac1.png) 113 | **Distributed SAC1:** gets 200+ in 360s 114 | ![dsac1](./pictures/dsac1.png) 115 | -------------------------------------------------------------------------------- /algos/sac1/hyperparams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from gym.spaces import Box 5 | import datetime 6 | import gym 7 | from numbers import Number 8 | 9 | 10 | class HyperParameters: 11 | def __init__(self, env_name, exp_name, num_workers, a_l_ratio, weights_file): 12 | # parameters set 13 | 14 | self.exp_name = exp_name 15 | 16 | self.env_name = env_name 17 | # "_random", "_d_True", "" 18 | self.rollout_env_name = self.env_name + "" 19 | 20 | self.model = "mlp" 21 | assert self.model in ["mlp", "cnn"], "model must be mlp or cnn!" 22 | if self.model == "cnn": 23 | self.representation = "extracted" 24 | self.stacked = True 25 | else: 26 | self.representation = 'simple115' 27 | self.stacked = False 28 | 29 | self.a_l_ratio = a_l_ratio 30 | self.weights_file = weights_file 31 | self.start_steps = int(5e4) 32 | if self.weights_file: 33 | self.start_steps = int(10e6) 34 | 35 | # gpu memory fraction 36 | self.gpu_fraction = 0.3 37 | 38 | self.hidden_size = (300, 400, 300) 39 | 40 | self.obs_noise = 0 41 | self.act_noise = 0.3 42 | self.reward_scale = 5 43 | env = Wrapper(gym.make(self.env_name), self.obs_noise, self.act_noise, self.reward_scale, 3) 44 | 45 | # env = FootballWrapper(env_football) 46 | 47 | # self.obs_space = Box(low=-1.0, high=1.0, shape=self.obs_dim, dtype=np.float32) 48 | self.obs_dim = env.observation_space.shape 49 | self.obs_space = env.observation_space 50 | self.obs_shape = self.obs_space.shape 51 | 52 | self.act_dim = env.action_space.shape 53 | self.act_space = env.action_space 54 | self.act_shape = self.act_space.shape 55 | 56 | self.num_workers = num_workers 57 | self.num_learners = 1 58 | 59 | self.use_max = False 60 | self.alpha = 0.1 61 | # self.alpha = "auto" 62 | self.target_entropy = 0.5 63 | 64 | self.use_bn = False 65 | self.c_regularizer = 0.0 66 | 67 | self.gamma = 0.997 68 | 69 | # self.num_buffers = 1 70 | self.num_buffers = self.num_workers // 25 + 1 71 | if self.model == 'cnn': 72 | self.buffer_size = int(3e4) 73 | else: 74 | self.buffer_size = int(3e6) 75 | 76 | self.buffer_size = self.buffer_size // self.num_buffers 77 | 78 | self.lr = 5e-5 79 | self.polyak = 0.995 80 | 81 | self.steps_per_epoch = 5000 82 | self.batch_size = 256 83 | 84 | self.Ln = 8 85 | self.action_repeat = 2 86 | 87 | self.max_ep_len = 2900 88 | self.save_freq = 1 89 | 90 | self.max_ret = 0 91 | 92 | self.epsilon = 0 93 | self.epsilon_alpha = 7 94 | 95 | self.seed = 0 96 | 97 | cwd = os.getcwd() 98 | 99 | self.summary_dir = cwd + '/tboard_ray' # Directory for storing tensorboard summary results 
100 | self.save_dir = cwd + '/' + self.exp_name # Directory for storing trained model 101 | self.save_interval = int(5e5) 102 | 103 | self.log_dir = self.summary_dir + "/" + str(datetime.datetime.now()) + "-workers_num:" + \ 104 | str(self.num_workers) + "%" + str(self.a_l_ratio) + self.env_name + "-" + self.exp_name 105 | 106 | 107 | class Wrapper(object): 108 | 109 | def __init__(self, env, obs_noise, act_noise, reward_scale, action_repeat=3): 110 | self._env = env 111 | self.action_repeat = action_repeat 112 | self.act_noise = act_noise 113 | self.obs_noise = obs_noise 114 | self.reward_scale = reward_scale 115 | 116 | def __getattr__(self, name): 117 | return getattr(self._env, name) 118 | 119 | def reset(self): 120 | obs = self._env.reset() + self.obs_noise * (-2 * np.random.random(24) + 1) 121 | return obs 122 | 123 | def step(self, action): 124 | action += self.act_noise * (-2 * np.random.random(4) + 1) 125 | r = 0.0 126 | for _ in range(self.action_repeat): 127 | obs_, reward_, done_, info_ = self._env.step(action) 128 | r = r + reward_ 129 | # r -= 0.001 130 | if done_ and self.action_repeat != 1: 131 | return obs_ + self.obs_noise * (-2 * np.random.random(24) + 1), 0.0, done_, info_ 132 | if self.action_repeat == 1: 133 | return obs_, r, done_, info_ 134 | return obs_ + self.obs_noise * (-2 * np.random.random(24) + 1), self.reward_scale * r, done_, info_ 135 | -------------------------------------------------------------------------------- /example/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def gaussian_likelihood(x, mu, log_std): 31 | pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) 32 | return tf.reduce_sum(pre_sum, axis=1) 33 | 34 | 35 | def clip_but_pass_gradient(x, l=-1., u=1.): 36 | clip_up = tf.cast(x > u, tf.float32) 37 | clip_low = tf.cast(x < l, tf.float32) 38 | return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low) 39 | 40 | 41 | """ 42 | Policies 43 | """ 44 | 45 | LOG_STD_MAX = 2 46 | LOG_STD_MIN = -20 47 | 48 | 49 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 50 | act_dim = a.shape.as_list()[-1] 51 | net = mlp(x, list(hidden_sizes), activation, activation) 52 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 53 | 54 | """ 55 | Because algorithm maximizes trade-off of reward and entropy, 56 | entropy must be unique to state---and therefore log_stds need 57 | to be a neural network output instead of a shared-across-states 58 | learnable parameter vector. 
But for deep Relu and other nets, 59 | simply sticking an activationless dense layer at the end would 60 | be quite bad---at the beginning of training, a randomly initialized 61 | net could produce extremely large values for the log_stds, which 62 | would result in some actions being either entirely deterministic 63 | or too random to come back to earth. Either of these introduces 64 | numerical instability which could break the algorithm. To 65 | protect against that, we'll constrain the output range of the 66 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is 67 | slightly different from the trick used by the original authors of 68 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 69 | I prefer this approach because it allows gradient propagation 70 | through log_std where clipping wouldn't, but I don't know if 71 | it makes much of a difference. 72 | """ 73 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 74 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 75 | 76 | std = tf.exp(log_std) 77 | pi = mu + tf.random_normal(tf.shape(mu)) * std 78 | logp_pi = gaussian_likelihood(pi, mu, log_std) 79 | return mu, pi, logp_pi 80 | 81 | 82 | def apply_squashing_func(mu, pi, logp_pi): 83 | mu = tf.tanh(mu) 84 | pi = tf.tanh(pi) 85 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 86 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi ** 2, l=0, u=1) + 1e-6), axis=1) 87 | return mu, pi, logp_pi 88 | 89 | 90 | """ 91 | Actor-Critics 92 | """ 93 | 94 | 95 | def mlp_actor_critic(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu, 96 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 97 | # policy 98 | with tf.variable_scope('pi'): 99 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 100 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 101 | 102 | # make sure actions are in correct range 103 | action_scale = action_space.high[0] 104 | mu *= action_scale 105 | pi *= action_scale 106 | 107 | # vfs 108 | vf_mlp = lambda x: tf.squeeze(mlp(x, list(hidden_sizes) + [1], activation, None), axis=1) 109 | with tf.variable_scope('q1'): 110 | q1 = vf_mlp(tf.concat([x, a], axis=-1)) 111 | with tf.variable_scope('q1', reuse=True): 112 | q1_pi = vf_mlp(tf.concat([x, pi], axis=-1)) 113 | with tf.variable_scope('q2'): 114 | q2 = vf_mlp(tf.concat([x, a], axis=-1)) 115 | with tf.variable_scope('q2', reuse=True): 116 | q2_pi = vf_mlp(tf.concat([x, pi], axis=-1)) 117 | with tf.variable_scope('v'): 118 | v = vf_mlp(x) 119 | return mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v 120 | -------------------------------------------------------------------------------- /algos/sac1/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = 
get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def gaussian_likelihood(x, mu, log_std): 31 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 32 | return tf.reduce_sum(pre_sum, axis=1) 33 | 34 | 35 | def clip_but_pass_gradient(x, l=-1., u=1.): 36 | clip_up = tf.cast(x > u, tf.float32) 37 | clip_low = tf.cast(x < l, tf.float32) 38 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 39 | 40 | 41 | """ 42 | Policies 43 | """ 44 | 45 | LOG_STD_MAX = 2 46 | LOG_STD_MIN = -20 47 | 48 | 49 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 50 | act_dim = a.shape.as_list()[-1] 51 | net = mlp(x, list(hidden_sizes), activation, activation) 52 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 53 | 54 | """ 55 | Because algorithm maximizes trade-off of reward and entropy, 56 | entropy must be unique to state---and therefore log_stds need 57 | to be a neural network output instead of a shared-across-states 58 | learnable parameter vector. But for deep Relu and other nets, 59 | simply sticking an activationless dense layer at the end would 60 | be quite bad---at the beginning of training, a randomly initialized 61 | net could produce extremely large values for the log_stds, which 62 | would result in some actions being either entirely deterministic 63 | or too random to come back to earth. Either of these introduces 64 | numerical instability which could break the algorithm. To 65 | protect against that, we'll constrain the output range of the 66 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is 67 | slightly different from the trick used by the original authors of 68 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 69 | I prefer this approach because it allows gradient propagation 70 | through log_std where clipping wouldn't, but I don't know if 71 | it makes much of a difference. 72 | """ 73 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 74 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 75 | 76 | std = tf.exp(log_std) 77 | pi = mu + tf.random_normal(tf.shape(mu)) * std 78 | logp_pi = gaussian_likelihood(pi, mu, log_std) 79 | return mu, pi, logp_pi 80 | 81 | 82 | def apply_squashing_func(mu, pi, logp_pi): 83 | mu = tf.tanh(mu) 84 | pi = tf.tanh(pi) 85 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
86 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 87 | return mu, pi, logp_pi 88 | 89 | 90 | # Actor-Critics 91 | def mlp_actor_critic(x, x2, a, hidden_sizes=(400,300), activation=tf.nn.relu, 92 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 93 | 94 | # policy 95 | with tf.variable_scope('pi'): 96 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 97 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 98 | 99 | with tf.variable_scope('pi', reuse=True): 100 | mu2, pi2, logp_pi2 = policy(x2, a, hidden_sizes, activation, output_activation) 101 | mu2, pi2, logp_pi2 = apply_squashing_func(mu2, pi2, logp_pi2) 102 | 103 | # make sure actions are in correct range 104 | action_scale = action_space.high[0] 105 | mu *= action_scale 106 | pi *= action_scale 107 | 108 | # vfs 109 | # tf.squeeze( shape(?,1), axis=1 ) = shape(?,) 110 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 111 | 112 | with tf.variable_scope('q1'): 113 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 114 | with tf.variable_scope('q1', reuse=True): 115 | q1_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 116 | with tf.variable_scope('q2'): 117 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 118 | with tf.variable_scope('q2', reuse=True): 119 | q2_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 120 | 121 | return mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi 122 | -------------------------------------------------------------------------------- /example/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.sac import core 6 | from spinup.algos.sac.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | from core import mlp_actor_critic as actor_critic 9 | import ray.experimental.tf_utils 10 | 11 | 12 | class Model(object): 13 | 14 | def __init__(self, args): 15 | 16 | # Inputs to computation graph 17 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(args.obs_dim, args.act_dim, 18 | args.obs_dim, None, None) 19 | 20 | # Main outputs from computation graph 21 | with tf.variable_scope('main'): 22 | self.mu, self.pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(self.x_ph, self.a_ph, **args.ac_kwargs) 23 | 24 | # Target value network 25 | with tf.variable_scope('target'): 26 | _, _, _, _, _, _, _, v_targ = actor_critic(self.x2_ph, self.a_ph, **args.ac_kwargs) 27 | 28 | # Count variables 29 | var_counts = tuple(core.count_vars(scope) for scope in 30 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 31 | print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 32 | 33 | # Min Double-Q: 34 | min_q_pi = tf.minimum(q1_pi, q2_pi) 35 | 36 | # Targets for Q and V regression 37 | q_backup = tf.stop_gradient(self.r_ph + args.gamma * (1 - self.d_ph) * v_targ) 38 | v_backup = tf.stop_gradient(min_q_pi - args.alpha * logp_pi) 39 | 40 | # Soft actor-critic losses 41 | pi_loss = tf.reduce_mean(args.alpha * logp_pi - q1_pi) 42 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 43 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 44 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 45 | self.value_loss = q1_loss + q2_loss + v_loss 46 | 47 | # Policy train op 48 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 49 | pi_optimizer = 
tf.train.AdamOptimizer(learning_rate=args.lr) 50 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 51 | 52 | # Value train op 53 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 54 | value_optimizer = tf.train.AdamOptimizer(learning_rate=args.lr) 55 | value_params = get_vars('main/q') + get_vars('main/v') 56 | with tf.control_dependencies([train_pi_op]): 57 | train_value_op = value_optimizer.minimize(self.value_loss, var_list=value_params) 58 | 59 | # Polyak averaging for target variables 60 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 61 | with tf.control_dependencies([train_value_op]): 62 | target_update = tf.group([tf.assign(v_targ, args.polyak * v_targ + (1 - args.polyak) * v_main) 63 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 64 | 65 | # All ops to call during one training step 66 | self.step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 67 | train_pi_op, train_value_op, target_update] 68 | 69 | # Initializing targets to match main variables 70 | self.target_init = tf.group([tf.assign(v_targ, v_main) 71 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 72 | 73 | self.sess = tf.Session() 74 | self.sess.run(tf.global_variables_initializer()) 75 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 76 | self.value_loss, self.sess) 77 | 78 | def set_weights(self, variable_names, weights): 79 | self.variables.set_weights(dict(zip(variable_names, weights))) 80 | self.sess.run(self.target_init) 81 | 82 | def get_weights(self): 83 | weights = self.variables.get_weights() 84 | keys = [key for key in list(weights.keys()) if "main" in key] 85 | values = [weights[key] for key in keys] 86 | return keys, values 87 | 88 | def get_action(self, o, deterministic=False): 89 | act_op = self.mu if deterministic else self.pi 90 | return self.sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0] 91 | 92 | def train(self, replay_buffer, args): 93 | 94 | batch = ray.get(replay_buffer.sample_batch.remote(args.batch_size)) 95 | feed_dict = {self.x_ph: batch['obs1'], 96 | self.x2_ph: batch['obs2'], 97 | self.a_ph: batch['acts'], 98 | self.r_ph: batch['rews'], 99 | self.d_ph: batch['done'], 100 | } 101 | outs = self.sess.run(self.step_ops, feed_dict) 102 | # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 103 | # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 104 | # VVals=outs[6], LogPi=outs[7]) 105 | 106 | def test_agent(self, test_env, args, n=10): 107 | test_ret = [] 108 | for j in range(n): 109 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 110 | while not (d or (ep_len == args.max_ep_len)): 111 | # Take deterministic actions at test time 112 | o, r, d, _ = test_env.step(self.get_action(o, True)) 113 | ep_ret += r 114 | ep_len += 1 115 | test_ret.append(ep_ret) 116 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 117 | ave_ret = sum(test_ret)/len(test_ret) 118 | return ave_ret 119 | -------------------------------------------------------------------------------- /example/dsac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.sac import core 6 | from spinup.algos.sac.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | 9 | import ray 10 | import pickle 11 | from model import Model 12 | 13 | 14 | @ray.remote 15 | class ReplayBuffer: 16 | 
""" 17 | A simple FIFO experience replay buffer for SAC agents. 18 | """ 19 | 20 | def __init__(self, obs_dim, act_dim, size): 21 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 22 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 24 | self.rews_buf = np.zeros(size, dtype=np.float32) 25 | self.done_buf = np.zeros(size, dtype=np.float32) 26 | self.ptr, self.size, self.max_size = 0, 0, size 27 | self.rollout_steps = 0 28 | 29 | def store(self, obs, act, rew, next_obs, done): 30 | self.obs1_buf[self.ptr] = obs 31 | self.obs2_buf[self.ptr] = next_obs 32 | self.acts_buf[self.ptr] = act 33 | self.rews_buf[self.ptr] = rew 34 | self.done_buf[self.ptr] = done 35 | self.ptr = (self.ptr + 1) % self.max_size 36 | self.size = min(self.size + 1, self.max_size) 37 | self.rollout_steps += 1 38 | 39 | def sample_batch(self, batch_size=32): 40 | idxs = np.random.randint(0, self.size, size=batch_size) 41 | return dict(obs1=self.obs1_buf[idxs], 42 | obs2=self.obs2_buf[idxs], 43 | acts=self.acts_buf[idxs], 44 | rews=self.rews_buf[idxs], 45 | done=self.done_buf[idxs]) 46 | 47 | def get_counts(self): 48 | return self.rollout_steps 49 | 50 | 51 | @ray.remote 52 | class ParameterServer(object): 53 | def __init__(self, keys, values): 54 | # These values will be mutated, so we must create a copy that is not 55 | # backed by the object store. 56 | values = [value.copy() for value in values] 57 | self.weights = dict(zip(keys, values)) 58 | 59 | def push(self, keys, values): 60 | values = [value.copy() for value in values] 61 | for key, value in zip(keys, values): 62 | self.weights[key] = value 63 | 64 | def pull(self, keys): 65 | return [self.weights[key] for key in keys] 66 | 67 | def get_weights(self): 68 | return self.weights 69 | 70 | # save weights to disk 71 | def save_weights(self, name): 72 | with open(name + "weights.pickle", "wb") as pickle_out: 73 | pickle.dump(self.weights, pickle_out) 74 | 75 | 76 | @ray.remote 77 | def worker_rollout(ps, replay_buffer, args): 78 | env = gym.make(args.env) 79 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 80 | total_steps = args.steps_per_epoch * args.epochs 81 | 82 | agent = Model(args) 83 | keys = agent.get_weights()[0] 84 | 85 | weights = ray.get(ps.pull.remote(keys)) 86 | agent.set_weights(keys, weights) 87 | 88 | # Main loop: collect experience in env and update/log each epoch 89 | for t in range(total_steps): 90 | 91 | """ 92 | Until start_steps have elapsed, randomly sample actions 93 | from a uniform distribution for better exploration. Afterwards, 94 | use the learned policy. 95 | """ 96 | if t > args.start_steps: 97 | a = agent.get_action(o) 98 | else: 99 | a = env.action_space.sample() 100 | 101 | # Step the env 102 | o2, r, d, _ = env.step(a) 103 | ep_ret += r 104 | ep_len += 1 105 | 106 | # Ignore the "done" signal if it comes from hitting the time 107 | # horizon (that is, when it's an artificial terminal signal 108 | # that isn't based on the agent's state) 109 | d = False if ep_len == args.max_ep_len else d 110 | 111 | # Store experience to replay buffer 112 | replay_buffer.store.remote(o, a, r, o2, d) 113 | 114 | # Super critical, easy to overlook step: make sure to update 115 | # most recent observation! 116 | o = o2 117 | 118 | if d or (ep_len == args.max_ep_len): 119 | """ 120 | Perform all SAC updates at the end of the trajectory. 121 | This is a slight difference from the SAC specified in the 122 | original paper. 
123 | """ 124 | 125 | # print(ep_len, ep_ret) 126 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 127 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 128 | 129 | weights = ray.get(ps.pull.remote(keys)) 130 | agent.set_weights(keys, weights) 131 | 132 | 133 | @ray.remote(num_gpus=1, max_calls=1) 134 | def worker_train(ps, replay_buffer, args): 135 | agent = Model(args) 136 | keys = agent.get_weights()[0] 137 | 138 | weights = ray.get(ps.pull.remote(keys)) 139 | agent.set_weights(keys, weights) 140 | 141 | cnt = 1 142 | while True: 143 | 144 | agent.train(replay_buffer, args) 145 | 146 | if cnt % 300 == 0: 147 | keys, values = agent.get_weights() 148 | ps.push.remote(keys, values) 149 | 150 | cnt += 1 151 | 152 | 153 | @ray.remote 154 | def worker_test(ps, start_time): 155 | 156 | from spinup.utils.run_utils import setup_logger_kwargs 157 | 158 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 159 | logger = EpochLogger(**logger_kwargs) 160 | config = locals() 161 | del config['ps'] 162 | logger.save_config(config) 163 | 164 | agent = Model(args) 165 | keys = agent.get_weights()[0] 166 | 167 | weights = ray.get(ps.pull.remote(keys)) 168 | agent.set_weights(keys, weights) 169 | test_env = gym.make(args.env) 170 | while True: 171 | ave_ret = agent.test_agent(test_env, args) 172 | # print("test Average Ret:", ave_ret, "time:", time.time()-start_time) 173 | logger.log_tabular('AverageTestEpRet', ave_ret) 174 | logger.log_tabular('Time', time.time() - start_time) 175 | logger.dump_tabular() 176 | weights = ray.get(ps.pull.remote(keys)) 177 | agent.set_weights(keys, weights) 178 | 179 | 180 | if __name__ == '__main__': 181 | import argparse 182 | 183 | parser = argparse.ArgumentParser() 184 | parser.add_argument('--env', type=str, default='LunarLanderContinuous-v2') 185 | parser.add_argument('--hid', type=int, default=300) 186 | parser.add_argument('--l', type=int, default=2) 187 | parser.add_argument('--gamma', type=float, default=0.99) 188 | parser.add_argument('--seed', '-s', type=int, default=0) 189 | parser.add_argument('--epochs', type=int, default=600) 190 | parser.add_argument('--exp_name', type=str, default='dsac_6worker_E1') 191 | args = parser.parse_args() 192 | 193 | # ac_kwargs = dict() 194 | args.seed = 0 195 | args.steps_per_epoch = 5000 196 | args.epochs = 100 197 | args.replay_size = int(1e6) 198 | args.gamma = 0.99, 199 | args.polyak = 0.995 200 | args.lr = 1e-3 201 | args.alpha = 0.2 202 | args.batch_size = 100 203 | args.start_steps = 10000 204 | args.max_ep_len = 1000 205 | args.logger_kwargs = dict() 206 | args.save_freq = 1 207 | args.ac_kwargs = dict(hidden_sizes=[args.hid] * args.l) 208 | 209 | env = gym.make(args.env) 210 | args.obs_dim = env.observation_space.shape[0] 211 | args.act_dim = env.action_space.shape[0] 212 | # Share information about action space with policy architecture 213 | args.ac_kwargs['action_space'] = env.action_space 214 | 215 | args.num_workers = 6 216 | args.num_learners = 1 217 | 218 | ray.init() 219 | 220 | net = Model(args) 221 | all_keys, all_values = net.get_weights() 222 | ps = ParameterServer.remote(all_keys, all_values) 223 | 224 | replay_buffer = ReplayBuffer.remote(args.obs_dim, args.act_dim, args.replay_size) 225 | 226 | start_time = time.time() 227 | 228 | # Start some training tasks. 
229 | task_rollout = [worker_rollout.remote(ps, replay_buffer, args) for i in range(args.num_workers)] 230 | 231 | time.sleep(20) 232 | 233 | task_train = [worker_train.remote(ps, replay_buffer, args) for i in range(args.num_learners)] 234 | 235 | time.sleep(10) 236 | 237 | task_test = worker_test.remote(ps, start_time) 238 | ray.wait(task_rollout) 239 | -------------------------------------------------------------------------------- /tutorial/Parallelize your algorithm by Ray (1).md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Ray是一个实现分布式python程序的通用框架。Ray提供了统一的任务并行和actor抽象,并通过共享内存、零拷贝序列化和分布式调度实现了高性能。 4 | 5 | Ray里面还有用来调超参数的库[Tune](http://ray.readthedocs.io/en/latest/tune.html)和可扩展规模的强化学习库[Rllib](http://ray.readthedocs.io/en/latest/rllib.html)。 6 | 7 | ray的必备知识: 8 | 9 | 1. 使用远程方程(任务) [`ray.remote`] 10 | 2. 通过object IDs获取结果 [`ray.put`, `ray.get`, `ray.wait`] 11 | 3. 使用远程类 (actors) [`ray.remote`] 12 | 13 | 使用Ray,可以使你的代码从单机运行轻松地扩展到大集群上运行。 14 | 15 | 使用该命令安装Ray:`pip install -U ray` 16 | 17 | 18 | 19 | 开始使用ray,导入ray,然后初始化。 20 | 21 | ```python 22 | import ray 23 | 24 | # Start Ray. If you're connecting to an existing cluster, you would use 25 | # ray.init(address=) instead. 26 | ray.init() 27 | ``` 28 | 29 | 30 | 31 | 1. 使用远程方程(任务) [`ray.remote`] 32 | 33 | 将python函数转换为远程函数的标准方法使在函数上面添加一个`@ray.remote`装饰器。下面看一个例子。 34 | 35 | ```python 36 | # A regular Python function. 37 | def regular_function(): 38 | return 1 39 | 40 | # A Ray remote function. 41 | @ray.remote 42 | def remote_function(): 43 | return 1 44 | ``` 45 | 46 | ```python 47 | assert regular_function() == 1 48 | 49 | object_id = remote_function.remote() 50 | 51 | # The value of the original `regular_function` 52 | assert ray.get(object_id) == 1 53 | ``` 54 | 55 | **Parallelism:** Invocations of `regular_function` happen **serially**, for example 56 | 57 | 在调用的时候,普通函数将串行运行。 58 | 59 | ```python 60 | # These happen serially. 61 | for _ in range(4): 62 | regular_function() 63 | ``` 64 | 65 | 66 | 67 | whereas invocations of `remote_function` happen in **parallel**, for example 68 | 69 | 调用远程函数时,程序将并行运行。 70 | 71 | ```python 72 | # These happen in parallel. 73 | for _ in range(4): 74 | remote_function.remote() 75 | ``` 76 | 77 | 78 | 79 | Oftentimes, you may want to specify a task’s resource requirements (for example 80 | one task may require a GPU). The `ray.init()` command will automatically 81 | detect the available GPUs and CPUs on the machine. However, you can override 82 | this default behavior by passing in specific resources, e.g. 83 | 84 | 运行`ray.init()`后,ray将自动检查可用的GPU和CPU。我们也可以给我们传入参数设置特定的资源需求量。 85 | 86 | `ray.init(num_cpus=8, num_gpus=4, resources={'Custom': 2})` 87 | 88 | 远程函数/类也可以设置资源需求量,像这样`@ray.remote(num_cpus=2, num_gpus)` 89 | 90 | 如果没有设置,默认设置为1个CPU。 91 | 92 | If you do not specify any resources in the `@ray.remote` decorator, the 93 | default is 1 CPU resource and no other resources. 94 | 95 | 96 | 97 | 远程函数执行后并不会直接返回结果,而是会立即返回一个object ID。远程函数会在后台并行处理,等执行得到最终结果后,可以通过返回的object ID取得这个结果。 98 | 99 | `ray.put(*value*)`也会返回object ID 100 | 101 | put操作将对象存入object store里,然后返回它的object ID。 102 | 103 | Store an object in the object store. return: The object ID assigned to this value. 
104 | 105 | ```python 106 | y = 1 107 | object_id = ray.put(y) 108 | ``` 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 通过object IDs获取结果 [`ray.put`, `ray.get`, `ray.wait`] 117 | 118 | ray.get(obj_id) 119 | 120 | 从object store获取远程对象或者一个列表的远程对象。 121 | 122 | Get a remote object or a list of remote objects from the object store. 123 | 124 | Then, if the object is a numpy array or a collection of numpy arrays, the `get` call is zero-copy and returns arrays backed by shared object store memory. 125 | Otherwise, we deserialize the object data into a Python object. 126 | 127 | This method blocks until the object corresponding to the object ID is 128 | available in the local object store. 129 | 130 | 需要注意的是,使用get方法时会锁,直到要取得的对象在本地的object store里可用。 131 | 132 | 调用remote操作是异步的,他们会返回object IDs而不是结果。想要得到真的的结果我们需要使用ray.get()。 133 | 134 | 我们之前写的这段语句,实际上results是一个由object IDs组成的列表。 135 | 136 | `results = [do_some_work.remote(x) for x in range(4)]` 137 | 138 | 如果改为下面,ray.get()将通过object ID取得真实的结果。 139 | 140 | `results = [ray.get(do_some_work.remote(x)) for x in range(4)]` 141 | 142 | 但是,这样写会有一个问题。ray.get()会锁进程,这意味着,ray.get()会一直等到do_some_work这个函数执行完返回结果后才执行结束然后进入下一个循环。这样的话,4次调用do_some_work函数就不再是并行运行的了。 143 | 144 | 为了可以并行运算,我们需要在调用完所有的任务后再调用ray.get()。像下面这样。 145 | 146 | `results = ray.get([do_some_work.remote(x) for x in range(4)])` 147 | 148 | 所以,需要小心使用ray.get()。因为它是一个锁进程的操作。如果太频繁调用ray.get(),将会影响并行性能。同时,尽可能的晚使用ray.get()以防止不必要的等待。 149 | 150 | 151 | 152 | Recall that remote operations are asynchronous and they return futures (i.e., object IDs) instead of the results themselves.To get the actual results, we need to use ray.get(), and here the first instinct is to just call ray.get() on the remote operation invocation i.e., replace line “results = [do_some_work.remote(x) for x in range(4)]” with: results = [ray.get(do_some_work.remote(x)) for x in range(4)] 153 | 154 | The observant reader will already have the answer: ray.get() is blocking, so calling it after each remote operation means that we wait for that operation to complete, which essentially means that we execute one operation at a time, hence no parallelism! 155 | 156 | To enable parallelism, we need to call ray.get() *after* invoking all tasks. We can easily do so in our example by replacing line “results = [do_some_work.remote(x) for x in range(4)]” with: 157 | 158 | 159 | 160 | ```python 161 | results = ray.get([do_some_work.remote(x) for x in range(4)]) 162 | ``` 163 | 164 | always keep in mind that ray.get() is a blocking operation, and thus if called eagerly it can hurt the parallelism. Instead, you should try to write your program such that ray.get() is called as late as possible. 165 | 166 | **Tip 1:** ***Delay calling ray.get() as much as possible.*** 167 | 168 | 169 | 170 | 远程类 171 | 172 | 通过远程类,我们可以实现一个共享的参数服务器。 173 | 174 | remote classes (Actors) 175 | 176 | 我们在类的定义上面加上修饰器ray.remote。这个类的实例就会是一个Ray的actor。每一个actor运行在自己的python进程上。 177 | 178 | Actors extend the Ray API from functions (tasks) to classes. The `ray.remote` decorator indicates that instances of the `Counter` class will be actors. An actor is essentially a stateful worker. Each actor runs in its own Python process. 179 | 180 | 181 | 182 | ```python 183 | @ray.remote 184 | class Counter(object): 185 | def __init__(self): 186 | self.value = 0 187 | 188 | def increment(self): 189 | self.value += 1 190 | return self.value 191 | ``` 192 | 193 | You can specify resource requirements in Actors too (see the [Actors section](https://ray.readthedocs.io/en/latest/actors.html) for more details.) 
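To make the actor pattern above concrete, here is a minimal end-to-end sketch. This is not code from this repo: it is a stripped-down version of the `ParameterServer` idea that appears later in this tutorial and in `example/dsac.py`, and the names `ParameterServer` and `worker` are chosen only for illustration. One actor process owns the weights, and any number of remote tasks read and update them through the actor handle:

```python
import numpy as np
import ray

ray.init()


@ray.remote
class ParameterServer(object):
    """A toy parameter server: a single actor process that owns the weights."""

    def __init__(self, weights):
        self.weights = weights

    def push(self, weights):
        self.weights = weights

    def pull(self):
        return self.weights


@ray.remote
def worker(ps, worker_index):
    # The actor handle `ps` is passed in as an ordinary argument.
    for _ in range(3):
        weights = ray.get(ps.pull.remote())  # read the shared state
        ps.push.remote(weights + 1)          # write back updated state
    return worker_index


ps = ParameterServer.remote(np.zeros(10))
ray.get([worker.remote(ps, i) for i in range(4)])
print(ray.get(ps.pull.remote()))  # weights after being mutated by the workers
```

Note that the read-modify-write in `worker` is not atomic (pull and push are two separate calls), so concurrent updates can overwrite each other in this toy example. In the actual training framework this is not an issue, because only the learner pushes weights while the rollout and test workers only pull. Because every `push`/`pull` goes through one actor process, updates are serialized there, which is exactly why, as discussed below, a very large cluster may need to shard parameters across several parameter-server actors.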
194 | 195 | 同样可以给actor设置资源请求量。 196 | 197 | ```python 198 | @ray.remote(num_cpus=2, num_gpus=0.5) 199 | class Actor(object): 200 | pass 201 | ``` 202 | 203 | 204 | 205 | We can interact with the actor by calling its methods with the `.remote` operator. We can then call `ray.get` on the object ID to retrieve the actual value. 206 | 207 | 在调用类的方法时加上`.remote`,然后使用`ray.get`获取实际的值。 208 | 209 | ``` 210 | obj_id = a1.increment.remote() 211 | ray.get(obj_id) == 1 212 | ``` 213 | 214 | Actor handles can be passed into other tasks. To illustrate this with a 215 | simple example, consider a simple actor definition. 216 | 217 | Actor可以作为参数传给别的任务,下面的例子就是实现一个参数服务器。不同的参数就可以公用一个参数服务器了。 218 | 219 | 220 | 221 | ps 222 | 223 | The @ray.remote decorator defines a service. It takes the 224 | `ParameterServer` class and allows it to be instantiated as a remote service or 225 | actor. 226 | 227 | 228 | 229 | **Sharding Across Multiple Parameter Servers:** When your parameters are large and your cluster is large, a single parameter server may not suffice because the application could be bottlenecked by the network bandwidth into and out of the machine that the parameter server is on (especially if there are many workers). 230 | 231 | 当你的参数特别大,而且你的集群也很大,一个parameter server可能就不够了。特别是有很多worker的时候,因为向一个parameter server的数据传输就会成为瓶颈。 232 | 233 | 简单的解决办法就是把参数分散在多个parameter server上。可以通过创建多个actor来实现。 234 | 235 | A natural solution in this case is to shard the parameters across multiple parameter servers. This can be achieved by simply starting up multiple parameter server actors. An example of how to do this is shown in the code example at the bottom. 236 | 237 | 238 | 239 | 为了保证ray并行的性能,远程任务应该花费至少几毫秒的时间。 240 | 241 | 当需要重复向不同远程任务传入相同对象时,可以先用ray.put()把类存入object store,然后传入它的object id。 242 | 243 | **Tip 2:** **For exploiting Ray’s parallelism, remote tasks should take at least several milliseconds.** 244 | 245 | **Tip 3:** ***When passing the same object repeatedly as an argument to a remote operation, use ray.put() to store it once in the object store and then pass its ID.*** 246 | 247 | **Tip 4:** **Use ray.wait() to process results as soon as they become available.** 248 | 249 | 250 | 251 | 252 | 253 | https://rise.cs.berkeley.edu/blog/ray-tips-for-first-time-users/ 254 | 255 | https://ray-project.github.io/2018/07/15/parameter-server-in-fifteen-lines.html -------------------------------------------------------------------------------- /algos/sac1/sac1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | import ray 5 | import gym 6 | 7 | from hyperparams import HyperParameters, Wrapper 8 | from actor_learner import Actor, Learner 9 | 10 | import os 11 | import pickle 12 | import multiprocessing 13 | import copy 14 | import signal 15 | 16 | flags = tf.app.flags 17 | FLAGS = tf.app.flags.FLAGS 18 | 19 | # "Pendulum-v0" 'BipedalWalker-v2' 'LunarLanderContinuous-v2' 20 | flags.DEFINE_string("env_name", "BipedalWalkerHardcore-v2", "game env") 21 | flags.DEFINE_integer("total_epochs", 500, "total_epochs") 22 | flags.DEFINE_integer("num_workers", 1, "number of workers") 23 | flags.DEFINE_integer("num_learners", 1, "number of learners") 24 | flags.DEFINE_string("is_restore", "False", "True or False. True means restore weights from pickle file.") 25 | flags.DEFINE_float("a_l_ratio", 2, "steps / sample_times") 26 | 27 | 28 | @ray.remote 29 | class ReplayBuffer: 30 | """ 31 | A simple FIFO experience replay buffer for SAC agents. 
32 | """ 33 | 34 | def __init__(self, obs_dim, act_dim, size): 35 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 36 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 37 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 38 | self.rews_buf = np.zeros(size, dtype=np.float32) 39 | self.done_buf = np.zeros(size, dtype=np.float32) 40 | self.ptr, self.size, self.max_size = 0, 0, size 41 | self.steps, self.sample_times = 0, 0 42 | 43 | def store(self, obs, act, rew, next_obs, done): 44 | self.obs1_buf[self.ptr] = obs 45 | self.obs2_buf[self.ptr] = next_obs 46 | self.acts_buf[self.ptr] = act 47 | self.rews_buf[self.ptr] = rew 48 | self.done_buf[self.ptr] = done 49 | self.ptr = (self.ptr+1) % self.max_size 50 | self.size = min(self.size+1, self.max_size) 51 | self.steps += 1 52 | 53 | def sample_batch(self, batch_size=128): 54 | idxs = np.random.randint(0, self.size, size=batch_size) 55 | self.sample_times += 1 56 | return dict(obs1=self.obs1_buf[idxs], 57 | obs2=self.obs2_buf[idxs], 58 | acts=self.acts_buf[idxs], 59 | rews=self.rews_buf[idxs], 60 | done=self.done_buf[idxs]) 61 | 62 | def get_counts(self): 63 | return self.sample_times, self.steps, self.size 64 | 65 | 66 | @ray.remote 67 | class ParameterServer(object): 68 | def __init__(self, keys, values, weights_file=""): 69 | # These values will be mutated, so we must create a copy that is not 70 | # backed by the object store. 71 | 72 | if weights_file: 73 | try: 74 | with open(weights_file, "rb") as pickle_in: 75 | self.weights = pickle.load(pickle_in) 76 | print("****** weights restored! ******") 77 | except: 78 | print("------------------------------------------------") 79 | print(weights_file) 80 | print("------ error: weights file doesn't exist! ------") 81 | exit() 82 | else: 83 | values = [value.copy() for value in values] 84 | self.weights = dict(zip(keys, values)) 85 | 86 | def push(self, keys, values): 87 | values = [value.copy() for value in values] 88 | for key, value in zip(keys, values): 89 | self.weights[key] = value 90 | 91 | def pull(self, keys): 92 | return [self.weights[key] for key in keys] 93 | 94 | def get_weights(self): 95 | return self.weights 96 | 97 | # save weights to disk 98 | def save_weights(self, name): 99 | with open(name + "weights.pickle", "wb") as pickle_out: 100 | pickle.dump(self.weights, pickle_out) 101 | 102 | 103 | class Cache(object): 104 | 105 | def __init__(self, replay_buffer): 106 | # cache for training data and model weights 107 | print('os.pid:', os.getpid()) 108 | self.replay_buffer = replay_buffer 109 | self.q1 = multiprocessing.Queue(10) 110 | self.q2 = multiprocessing.Queue(5) 111 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.replay_buffer)) 112 | 113 | def ps_update(self, q1, q2, replay_buffer): 114 | print('os.pid of put_data():', os.getpid()) 115 | 116 | q1.put(copy.deepcopy(ray.get(replay_buffer.sample_batch.remote(opt.batch_size)))) 117 | 118 | while True: 119 | q1.put(copy.deepcopy(ray.get(replay_buffer.sample_batch.remote(opt.batch_size)))) 120 | 121 | if not q2.empty(): 122 | keys, values = q2.get() 123 | ps.push.remote(keys, values) 124 | 125 | def start(self): 126 | self.p1.start() 127 | self.p1.join(10) 128 | 129 | def end(self): 130 | self.p1.terminate() 131 | 132 | 133 | @ray.remote(num_gpus=1, max_calls=1) 134 | def worker_train(ps, replay_buffer, opt, learner_index): 135 | 136 | agent = Learner(opt, job="learner") 137 | keys = agent.get_weights()[0] 138 | weights = ray.get(ps.pull.remote(keys)) 139 | 
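    # Load the pulled weights into the learner graph; note that Learner.set_weights
    # also re-runs target_init, so the target network starts in sync with the main network.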
agent.set_weights(keys, weights) 140 | 141 | cache = Cache(replay_buffer) 142 | 143 | cache.start() 144 | 145 | cnt = 1 146 | while True: 147 | batch = cache.q1.get() 148 | agent.train(batch) 149 | if cnt % 300 == 0: 150 | # print('q1.qsize():', q1.qsize(), 'q2.qsize():', q2.qsize()) 151 | cache.q2.put(agent.get_weights()) 152 | # keys, values = agent.get_weights() 153 | # ps.push.remote(copy.deepcopy(keys), copy.deepcopy(values)) 154 | cnt += 1 155 | 156 | 157 | @ray.remote 158 | def worker_rollout(ps, replay_buffer, opt, worker_index): 159 | 160 | # env = gym.make(opt.env_name) 161 | 162 | env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise, opt.reward_scale, 3) 163 | 164 | agent = Actor(opt, job="worker") 165 | keys = agent.get_weights()[0] 166 | 167 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 168 | 169 | # epochs = opt.total_epochs // opt.num_workers 170 | total_steps = opt.steps_per_epoch * opt.total_epochs 171 | 172 | weights = ray.get(ps.pull.remote(keys)) 173 | agent.set_weights(keys, weights) 174 | 175 | # TODO opt.start_steps 176 | # for t in range(total_steps): 177 | t = 0 178 | while True: 179 | if t > opt.start_steps: 180 | a = agent.get_action(o) 181 | else: 182 | a = env.action_space.sample() 183 | t += 1 184 | # Step the env 185 | o2, r, d, _ = env.step(a) 186 | ep_ret += r 187 | ep_len += 1 188 | 189 | # Ignore the "done" signal if it comes from hitting the time 190 | # horizon (that is, when it's an artificial terminal signal 191 | # that isn't based on the agent's state) 192 | d = False if ep_len == opt.max_ep_len else d 193 | 194 | # Store experience to replay buffer 195 | replay_buffer.store.remote(o, a, r, o2, d) 196 | 197 | # Super critical, easy to overlook step: make sure to update 198 | # most recent observation! 199 | o = o2 200 | 201 | # End of episode. Training (ep_len times). 202 | if d or (ep_len == opt.max_ep_len): 203 | sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote()) 204 | 205 | while sample_times > 0 and steps / sample_times > opt.a_l_ratio: 206 | sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote()) 207 | time.sleep(0.1) 208 | 209 | # update parameters every episode 210 | weights = ray.get(ps.pull.remote(keys)) 211 | agent.set_weights(keys, weights) 212 | 213 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 214 | 215 | 216 | @ray.remote 217 | def worker_test(ps, replay_buffer, opt): 218 | 219 | agent = Actor(opt, job="main") 220 | 221 | keys, weights = agent.get_weights() 222 | 223 | time0 = time1 = time.time() 224 | sample_times1, steps, size = ray.get(replay_buffer.get_counts.remote()) 225 | max_ret = -1000 226 | 227 | env = gym.make(opt.env_name) 228 | 229 | while True: 230 | weights = ray.get(ps.pull.remote(keys)) 231 | agent.set_weights(keys, weights) 232 | 233 | ep_ret = agent.test(env, replay_buffer) 234 | sample_times2, steps, size = ray.get(replay_buffer.get_counts.remote()) 235 | time2 = time.time() 236 | print("test_reward:", ep_ret, "sample_times:", sample_times2, "steps:", steps, "buffer_size:", size) 237 | print('update frequency:', (sample_times2-sample_times1)/(time2-time1), 'total time:', time2 - time0) 238 | 239 | if ep_ret > max_ret: 240 | ps.save_weights.remote() 241 | print("****** weights saved! 
******") 242 | max_ret = ep_ret 243 | 244 | time1 = time2 245 | sample_times1 = sample_times2 246 | 247 | # if steps >= opt.total_epochs * opt.steps_per_epoch: 248 | # exit(0) 249 | # if time2 - time0 > 30: 250 | # exit(0) 251 | 252 | time.sleep(5) 253 | 254 | 255 | if __name__ == '__main__': 256 | 257 | ray.init() 258 | 259 | opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers, FLAGS.a_l_ratio) 260 | 261 | # Create a parameter server with some random weights. 262 | if FLAGS.is_restore == "True": 263 | ps = ParameterServer.remote([], [], is_restore=True) 264 | else: 265 | net = Learner(opt, job="main") 266 | all_keys, all_values = net.get_weights() 267 | ps = ParameterServer.remote(all_keys, all_values) 268 | 269 | replay_buffer = ReplayBuffer.remote(obs_dim=opt.obs_dim, act_dim=opt.act_dim, size=opt.replay_size) 270 | 271 | # Start some training tasks. 272 | task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 273 | 274 | time.sleep(5) 275 | 276 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_learners)] 277 | 278 | task_test = worker_test.remote(ps, replay_buffer, opt) 279 | 280 | ray.wait([task_test, ]) 281 | -------------------------------------------------------------------------------- /algos/sac1/actor_learner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from numbers import Number 8 | import gym 9 | import datetime 10 | import time 11 | import ray 12 | import ray.experimental.tf_utils 13 | 14 | import core 15 | from core import get_vars 16 | from core import mlp_actor_critic as actor_critic 17 | 18 | 19 | class Learner(object): 20 | def __init__(self, opt, job): 21 | self.opt = opt 22 | with tf.Graph().as_default(): 23 | tf.set_random_seed(opt.seed) 24 | np.random.seed(opt.seed) 25 | 26 | # Inputs to computation graph 27 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \ 28 | core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim, None, None) 29 | 30 | # Main outputs from computation graph 31 | with tf.variable_scope('main'): 32 | self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \ 33 | actor_critic(self.x_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"]) 34 | 35 | # Target value network 36 | with tf.variable_scope('target'): 37 | _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = \ 38 | actor_critic(self.x2_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"]) 39 | 40 | # Count variables 41 | var_counts = tuple(core.count_vars(scope) for scope in 42 | ['main/pi', 'main/q1', 'main/q2', 'main']) 43 | print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) 44 | 45 | ###### 46 | if opt.alpha == 'auto': 47 | target_entropy = (-np.prod(opt.action_space.shape)) 48 | 49 | log_alpha = tf.get_variable( 'log_alpha', dtype=tf.float32, initializer=0.0) 50 | alpha = tf.exp(log_alpha) 51 | 52 | alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) 53 | 54 | alpha_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr, name='alpha_optimizer') 55 | train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) 56 | ###### 57 | 58 | # Min Double-Q: 59 | min_q_pi = tf.minimum(q1_pi_, q2_pi_) 60 | 61 | # Targets for Q and V regression 62 
| v_backup = tf.stop_gradient(min_q_pi - opt.alpha * logp_pi2) 63 | q_backup = self.r_ph + opt.gamma*(1-self.d_ph)*v_backup 64 | 65 | # Soft actor-critic losses 66 | pi_loss = tf.reduce_mean(opt.alpha * logp_pi - q1_pi) 67 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) 68 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) 69 | self.value_loss = q1_loss + q2_loss 70 | 71 | # Policy train op 72 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 73 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 74 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 75 | 76 | # Value train op 77 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 78 | value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 79 | value_params = get_vars('main/q') 80 | with tf.control_dependencies([train_pi_op]): 81 | train_value_op = value_optimizer.minimize(self.value_loss, var_list=value_params) 82 | 83 | # Polyak averaging for target variables 84 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 85 | with tf.control_dependencies([train_value_op]): 86 | self.target_update = tf.group([tf.assign(v_targ, opt.polyak*v_targ + (1-opt.polyak)*v_main) 87 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 88 | 89 | # TODO 90 | # self.grads = self.optimizer.compute_gradients(self.cross_entropy) 91 | # self.grads_placeholder = [(tf.placeholder( 92 | # "float", shape=grad[1].get_shape()), grad[1]) 93 | # for grad in self.grads] 94 | 95 | # All ops to call during one training step 96 | if isinstance(opt.alpha, Number): 97 | self.step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(opt.alpha), 98 | train_pi_op, train_value_op, self.target_update] 99 | else: 100 | self.step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, opt.alpha, 101 | train_pi_op, train_value_op, self.target_update, train_alpha_op] 102 | 103 | # Initializing targets to match main variables 104 | self.target_init = tf.group([tf.assign(v_targ, v_main) 105 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 106 | 107 | if job == "learner": 108 | config = tf.ConfigProto() 109 | config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 110 | config.inter_op_parallelism_threads = 1 111 | config.intra_op_parallelism_threads = 1 112 | self.sess = tf.Session(config=config) 113 | else: 114 | self.sess = tf.Session( 115 | config=tf.ConfigProto( 116 | device_count={'GPU': 0}, 117 | intra_op_parallelism_threads=1, 118 | inter_op_parallelism_threads=1)) 119 | 120 | self.sess.run(tf.global_variables_initializer()) 121 | 122 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 123 | self.value_loss, self.sess) 124 | 125 | def set_weights(self, variable_names, weights): 126 | self.variables.set_weights(dict(zip(variable_names, weights))) 127 | self.sess.run(self.target_init) 128 | 129 | def get_weights(self): 130 | weights = self.variables.get_weights() 131 | keys = [key for key in list(weights.keys()) if "main" in key] 132 | values = [weights[key] for key in keys] 133 | return keys, values 134 | 135 | def train(self, batch): 136 | feed_dict = {self.x_ph: batch['obs1'], 137 | self.x2_ph: batch['obs2'], 138 | self.a_ph: batch['acts'], 139 | self.r_ph: batch['rews'], 140 | self.d_ph: batch['done'], 141 | } 142 | self.sess.run(self.step_ops, feed_dict) 143 | 144 | def compute_gradients(self, x, y): 145 | pass 146 | 147 | def apply_gradients(self, gradients): 
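        # This project exchanges full network weights through the ParameterServer rather
        # than gradients, so compute_gradients/apply_gradients are left as empty stubs.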
148 | pass 149 | 150 | 151 | class Actor(object): 152 | def __init__(self, opt, job): 153 | self.opt = opt 154 | with tf.Graph().as_default(): 155 | tf.set_random_seed(opt.seed) 156 | np.random.seed(opt.seed) 157 | 158 | # Inputs to computation graph 159 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \ 160 | core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim, None, None) 161 | 162 | # Main outputs from computation graph 163 | with tf.variable_scope('main'): 164 | self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \ 165 | actor_critic(self.x_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"]) 166 | 167 | # Set up summary Ops 168 | self.test_ops, self.test_vars = self.build_summaries() 169 | 170 | self.sess = tf.Session( 171 | config=tf.ConfigProto( 172 | device_count={'GPU': 0}, 173 | intra_op_parallelism_threads=1, 174 | inter_op_parallelism_threads=1)) 175 | 176 | self.sess.run(tf.global_variables_initializer()) 177 | 178 | if job == "main": 179 | self.writer = tf.summary.FileWriter( 180 | opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-workers_num:" + str( 181 | opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 182 | 183 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 184 | self.pi, self.sess) 185 | 186 | def set_weights(self, variable_names, weights): 187 | self.variables.set_weights(dict(zip(variable_names, weights))) 188 | 189 | def get_weights(self): 190 | weights = self.variables.get_weights() 191 | keys = [key for key in list(weights.keys()) if "main" in key] 192 | values = [weights[key] for key in keys] 193 | return keys, values 194 | 195 | def get_action(self, o, deterministic=False): 196 | act_op = self.mu if deterministic else self.pi 197 | return self.sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0] 198 | 199 | def test(self, test_env, replay_buffer, n=25): 200 | 201 | rew = [] 202 | for j in range(n): 203 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 204 | while not(d or (ep_len == self.opt.max_ep_len)): 205 | # Take deterministic actions at test time 206 | o, r, d, _ = test_env.step(self.get_action(o, True)) 207 | ep_ret += r 208 | ep_len += 1 209 | rew.append(ep_ret) 210 | 211 | sample_times, _, _ = ray.get(replay_buffer.get_counts.remote()) 212 | summary_str = self.sess.run(self.test_ops, feed_dict={ 213 | self.test_vars[0]: sum(rew)/25 214 | }) 215 | 216 | self.writer.add_summary(summary_str, sample_times) 217 | self.writer.flush() 218 | return sum(rew)/n 219 | 220 | # Tensorflow Summary Ops 221 | def build_summaries(self): 222 | test_summaries = [] 223 | episode_reward = tf.Variable(0.) 
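        # episode_reward is only a feed slot for TensorBoard: test() feeds the averaged
        # test return into it and writes the merged summary via self.test_ops/self.test_vars.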
224 | test_summaries.append(tf.summary.scalar("Reward", episode_reward)) 225 | 226 | test_ops = tf.summary.merge(test_summaries) 227 | test_vars = [episode_reward] 228 | 229 | return test_ops, test_vars 230 | -------------------------------------------------------------------------------- /algos/dqn/actor_learner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from numbers import Number 8 | import pickle 9 | 10 | import time 11 | import datetime 12 | import ray 13 | import ray.experimental.tf_utils 14 | 15 | import core 16 | from core import get_vars 17 | 18 | 19 | class Learner(object): 20 | def __init__(self, opt, job): 21 | self.opt = opt 22 | with tf.Graph().as_default(): 23 | tf.set_random_seed(opt.seed) 24 | np.random.seed(opt.seed) 25 | 26 | # Inputs to computation graph 27 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None) 28 | 29 | # Main outputs from computation graph 30 | with tf.variable_scope('main'): 31 | self.q, self.q_x2 = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim) 32 | 33 | # Target value network 34 | with tf.variable_scope('target'): 35 | self.q_next, _ = core.q_function(self.x2_ph, self.x2_ph, opt.hidden_size, opt.act_dim) 36 | 37 | # Count variables 38 | var_counts = tuple(core.count_vars(scope) for scope in ['main']) 39 | print('\nNumber of parameters: total: %d\n' % var_counts) 40 | 41 | a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim) 42 | q_value = tf.reduce_sum(self.q * a_one_hot, axis=1) 43 | 44 | # DDQN 45 | online_q_x2_a_one_hot = tf.one_hot(tf.argmax(self.q_x2, axis=1), depth=opt.act_dim) 46 | q_target = tf.reduce_sum(self.q_next * online_q_x2_a_one_hot, axis=1) 47 | 48 | # DQN 49 | # q_target = tf.reduce_max(self.q_next, axis=1) 50 | 51 | # Bellman backup for Q functions, using Clipped Double-Q targets 52 | q_backup = tf.stop_gradient(self.r_ph + opt.gamma * (1 - self.d_ph) * q_target) 53 | 54 | # q losses 55 | q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2) 56 | 57 | # Value train op 58 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 59 | value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 60 | value_params = get_vars('main/q') 61 | train_value_op = value_optimizer.minimize(q_loss, var_list=value_params) 62 | 63 | # Polyak averaging for target variables 64 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 65 | with tf.control_dependencies([train_value_op]): 66 | target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) 67 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 68 | 69 | # All ops to call during one training step 70 | self.step_ops = [q_loss, self.q, train_value_op, target_update] 71 | 72 | # Initializing targets to match main variables 73 | self.target_init = tf.group([tf.assign(v_targ, v_main) 74 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 75 | 76 | if job == "learner": 77 | config = tf.ConfigProto() 78 | config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 79 | config.inter_op_parallelism_threads = 1 80 | config.intra_op_parallelism_threads = 1 81 | self.sess = tf.Session(config=config) 82 | else: 83 | self.sess = tf.Session( 
84 | config=tf.ConfigProto( 85 | # device_count={'GPU': 0}, 86 | intra_op_parallelism_threads=1, 87 | inter_op_parallelism_threads=1)) 88 | 89 | self.sess.run(tf.global_variables_initializer()) 90 | 91 | if job == "learner": 92 | # Set up summary Ops 93 | self.train_ops, self.train_vars = self.build_summaries() 94 | self.writer = tf.summary.FileWriter( 95 | opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + 96 | opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 97 | 98 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 99 | q_loss, self.sess) 100 | 101 | def set_weights(self, variable_names, weights): 102 | self.variables.set_weights(dict(zip(variable_names, weights))) 103 | self.sess.run(self.target_init) 104 | 105 | def get_weights(self): 106 | weights = self.variables.get_weights() 107 | keys = [key for key in list(weights.keys()) if "main" in key] 108 | values = [weights[key] for key in keys] 109 | return keys, values 110 | 111 | def train(self, batch, cnt): 112 | 113 | feed_dict = {self.x_ph: batch['obs1'], 114 | self.x2_ph: batch['obs2'], 115 | self.a_ph: batch['acts'], 116 | self.r_ph: batch['rews'], 117 | self.d_ph: batch['done'], 118 | } 119 | 120 | outs = self.sess.run(self.step_ops, feed_dict) 121 | if cnt % 500 == 0: 122 | summary_str = self.sess.run(self.train_ops, feed_dict={ 123 | self.train_vars[0]: outs[0], 124 | self.train_vars[1]: np.mean(outs[1]) 125 | }) 126 | 127 | self.writer.add_summary(summary_str, cnt) 128 | self.writer.flush() 129 | 130 | def compute_gradients(self, x, y): 131 | pass 132 | 133 | def apply_gradients(self, gradients): 134 | pass 135 | 136 | # Tensorflow Summary Ops 137 | def build_summaries(self): 138 | train_summaries = [] 139 | LossQ = tf.Variable(0.) 140 | train_summaries.append(tf.summary.scalar("LossQ", LossQ)) 141 | QVals = tf.Variable(0.) 
142 | train_summaries.append(tf.summary.scalar("QVals", QVals)) 143 | train_ops = tf.summary.merge(train_summaries) 144 | train_vars = [LossQ, QVals] 145 | 146 | return train_ops, train_vars 147 | 148 | 149 | class Actor(object): 150 | def __init__(self, opt, job): 151 | self.opt = opt 152 | with tf.Graph().as_default(): 153 | tf.set_random_seed(opt.seed) 154 | np.random.seed(opt.seed) 155 | 156 | # Inputs to computation graph 157 | self.x_ph, self.a_ph, self.x2_ph, = core.placeholders(opt.obs_dim, None, opt.obs_dim) 158 | 159 | # Main outputs from computation graph 160 | with tf.variable_scope('main'): 161 | self.q, _ = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim) 162 | 163 | # Set up summary Ops 164 | self.test_ops, self.test_vars = self.build_summaries() 165 | 166 | self.sess = tf.Session( 167 | config=tf.ConfigProto( 168 | device_count={'GPU': 0}, 169 | intra_op_parallelism_threads=1, 170 | inter_op_parallelism_threads=1)) 171 | 172 | self.sess.run(tf.global_variables_initializer()) 173 | 174 | if job == "test": 175 | self.writer = tf.summary.FileWriter( 176 | opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-" + opt.exp_name + 177 | "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 178 | 179 | variables_all = tf.contrib.framework.get_variables_to_restore() 180 | variables_bn = [v for v in variables_all if 'moving_mean' in v.name or 'moving_variance' in v.name] 181 | 182 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 183 | self.q, self.sess, input_variables=variables_bn) 184 | 185 | def set_weights(self, variable_names, weights): 186 | self.variables.set_weights(dict(zip(variable_names, weights))) 187 | 188 | def get_weights(self): 189 | weights = self.variables.get_weights() 190 | keys = [key for key in list(weights.keys()) if "main" in key] 191 | values = [weights[key] for key in keys] 192 | return keys, values 193 | 194 | def get_action(self, o): 195 | if np.random.uniform() < 0.97: 196 | o = o[np.newaxis, :] 197 | actions_value = self.sess.run(self.q, feed_dict={self.x_ph: o}) 198 | action = np.argmax(actions_value) 199 | else: 200 | action = np.random.randint(0, self.opt.act_dim) 201 | return action 202 | 203 | # Tensorflow Summary Ops 204 | def build_summaries(self): 205 | test_summaries = [] 206 | episode_reward = tf.Variable(0.) 207 | episode_score = tf.Variable(0.) 208 | a_l_ratio = tf.Variable(0.) 209 | update_frequency = tf.Variable(0.) 
210 | test_summaries.append(tf.summary.scalar("Reward", episode_reward)) 211 | test_summaries.append(tf.summary.scalar("score", episode_score)) 212 | test_summaries.append(tf.summary.scalar("a_l_ratio", a_l_ratio)) 213 | test_summaries.append(tf.summary.scalar("update_frequency", update_frequency)) 214 | test_ops = tf.summary.merge(test_summaries) 215 | test_vars = [episode_reward, episode_score, a_l_ratio, update_frequency] 216 | 217 | return test_ops, test_vars 218 | 219 | def write_tb(self, ave_test_reward, ave_score, alratio, update_frequency, total_learner_step): 220 | summary_str = self.sess.run(self.test_ops, feed_dict={ 221 | self.test_vars[0]: ave_test_reward, 222 | self.test_vars[1]: ave_score, 223 | self.test_vars[2]: alratio, 224 | self.test_vars[3]: update_frequency 225 | }) 226 | 227 | self.writer.add_summary(summary_str, total_learner_step) 228 | self.writer.flush() 229 | 230 | def test(self, test_env, n=10): 231 | 232 | test_rets = [] 233 | scores = [] 234 | 235 | for _ in range(n): 236 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 237 | 238 | while True: 239 | a = self.get_action(o) 240 | # Step the env 241 | o, r, d, _ = test_env.step(a) 242 | 243 | ep_ret += r 244 | ep_len += 1 245 | 246 | if d: 247 | test_rets.append(ep_ret) 248 | scores.append(test_env.rewards[0]) 249 | # print('test_ep_len:', ep_len, 'test_ep_ret:', ep_ret) 250 | break 251 | return np.mean(test_rets), np.mean(scores) 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /algos/sqn/actor_learner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from numbers import Number 8 | import pickle 9 | 10 | import time 11 | import datetime 12 | import ray 13 | import ray.experimental.tf_utils 14 | 15 | import core 16 | from core import get_vars 17 | 18 | 19 | class Learner(object): 20 | def __init__(self, opt, job): 21 | self.opt = opt 22 | with tf.Graph().as_default(): 23 | tf.set_random_seed(opt.seed) 24 | np.random.seed(opt.seed) 25 | 26 | # Inputs to computation graph 27 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None) 28 | 29 | # Main outputs from computation graph 30 | with tf.variable_scope('main'): 31 | self.mu, self.pi, entropy_x2, q1, q2, q1_mu, q2_mu = core.q_function(self.x_ph, self.x2_ph, opt.alpha, 32 | opt.hidden_size, opt.act_dim) 33 | 34 | # Target value network 35 | with tf.variable_scope('target'): 36 | mu_, pi_, entropy_x2_, q1_, q2_, q1_mu_, q2_mu_ = core.q_function(self.x2_ph, self.x2_ph, opt.alpha, 37 | opt.hidden_size, opt.act_dim) 38 | 39 | # Count variables 40 | var_counts = tuple(core.count_vars(scope) for scope in ['main']) 41 | print('\nNumber of parameters: total: %d\n' % var_counts) 42 | 43 | a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim) 44 | q1_a = tf.reduce_sum(q1 * a_one_hot, axis=1) 45 | q2_a = tf.reduce_sum(q2 * a_one_hot, axis=1) 46 | 47 | # Min Double-Q: 48 | min_q_target = tf.minimum(q1_mu_, q2_mu_) 49 | 50 | # Bellman backup for Q functions 51 | v_backup = tf.stop_gradient(min_q_target - opt.alpha * entropy_x2) 52 | q_backup = self.r_ph + opt.gamma * (1 - self.d_ph) * v_backup 53 | 54 | # q losses 55 | # q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2) 56 | q1_loss = 0.5 * 
tf.reduce_mean((q_backup - q1_a) ** 2) 57 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a) ** 2) 58 | q_loss = q1_loss + q2_loss 59 | 60 | # Value train op 61 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 62 | value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 63 | value_params = get_vars('main/q') 64 | train_value_op = value_optimizer.minimize(q_loss, var_list=value_params) 65 | 66 | # Polyak averaging for target variables 67 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 68 | with tf.control_dependencies([train_value_op]): 69 | target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) 70 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 71 | 72 | # All ops to call during one training step 73 | self.step_ops = [q_loss, q1, q2, train_value_op, target_update] 74 | 75 | # Initializing targets to match main variables 76 | self.target_init = tf.group([tf.assign(v_targ, v_main) 77 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 78 | 79 | if job == "learner": 80 | config = tf.ConfigProto() 81 | config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 82 | config.inter_op_parallelism_threads = 1 83 | config.intra_op_parallelism_threads = 1 84 | self.sess = tf.Session(config=config) 85 | else: 86 | self.sess = tf.Session( 87 | config=tf.ConfigProto( 88 | # device_count={'GPU': 0}, 89 | intra_op_parallelism_threads=1, 90 | inter_op_parallelism_threads=1)) 91 | 92 | self.sess.run(tf.global_variables_initializer()) 93 | 94 | if job == "learner": 95 | # Set up summary Ops 96 | self.train_ops, self.train_vars = self.build_summaries() 97 | self.writer = tf.summary.FileWriter( 98 | opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + 99 | opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 100 | 101 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 102 | q_loss, self.sess) 103 | 104 | def set_weights(self, variable_names, weights): 105 | self.variables.set_weights(dict(zip(variable_names, weights))) 106 | self.sess.run(self.target_init) 107 | 108 | def get_weights(self): 109 | weights = self.variables.get_weights() 110 | keys = [key for key in list(weights.keys()) if "main" in key] 111 | values = [weights[key] for key in keys] 112 | return keys, values 113 | 114 | def train(self, batch, cnt): 115 | 116 | feed_dict = {self.x_ph: batch['obs1'], 117 | self.x2_ph: batch['obs2'], 118 | self.a_ph: batch['acts'], 119 | self.r_ph: batch['rews'], 120 | self.d_ph: batch['done'], 121 | } 122 | 123 | outs = self.sess.run(self.step_ops, feed_dict) 124 | if cnt % 500 == 0: 125 | summary_str = self.sess.run(self.train_ops, feed_dict={ 126 | self.train_vars[0]: outs[0], 127 | self.train_vars[1]: np.mean(outs[1]) 128 | }) 129 | 130 | self.writer.add_summary(summary_str, cnt) 131 | self.writer.flush() 132 | 133 | def compute_gradients(self, x, y): 134 | pass 135 | 136 | def apply_gradients(self, gradients): 137 | pass 138 | 139 | # Tensorflow Summary Ops 140 | def build_summaries(self): 141 | train_summaries = [] 142 | LossQ = tf.Variable(0.) 143 | train_summaries.append(tf.summary.scalar("LossQ", LossQ)) 144 | QVals = tf.Variable(0.) 
145 | train_summaries.append(tf.summary.scalar("QVals", QVals)) 146 | train_ops = tf.summary.merge(train_summaries) 147 | train_vars = [LossQ, QVals] 148 | 149 | return train_ops, train_vars 150 | 151 | 152 | class Actor(object): 153 | def __init__(self, opt, job): 154 | self.opt = opt 155 | with tf.Graph().as_default(): 156 | tf.set_random_seed(opt.seed) 157 | np.random.seed(opt.seed) 158 | 159 | # Inputs to computation graph 160 | self.x_ph, self.a_ph, self.x2_ph, = core.placeholders(opt.obs_dim, None, opt.obs_dim) 161 | 162 | # Main outputs from computation graph 163 | with tf.variable_scope('main'): 164 | self.mu, self.pi, entropy_x2, q1, q2, q1_mu, q2_mu = core.q_function(self.x_ph, self.x2_ph, opt.alpha, 165 | opt.hidden_size, opt.act_dim) 166 | 167 | # Set up summary Ops 168 | self.test_ops, self.test_vars = self.build_summaries() 169 | 170 | self.sess = tf.Session( 171 | config=tf.ConfigProto( 172 | device_count={'GPU': 0}, 173 | intra_op_parallelism_threads=1, 174 | inter_op_parallelism_threads=1)) 175 | 176 | self.sess.run(tf.global_variables_initializer()) 177 | 178 | if job == "test": 179 | self.writer = tf.summary.FileWriter( 180 | opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-" + opt.exp_name + 181 | "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 182 | 183 | variables_all = tf.contrib.framework.get_variables_to_restore() 184 | variables_bn = [v for v in variables_all if 'moving_mean' in v.name or 'moving_variance' in v.name] 185 | 186 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 187 | self.mu, self.sess, input_variables=variables_bn) 188 | 189 | def set_weights(self, variable_names, weights): 190 | self.variables.set_weights(dict(zip(variable_names, weights))) 191 | 192 | def get_weights(self): 193 | weights = self.variables.get_weights() 194 | keys = [key for key in list(weights.keys()) if "main" in key] 195 | values = [weights[key] for key in keys] 196 | return keys, values 197 | 198 | def get_action(self, o, deterministic=False): 199 | act_op = self.mu if deterministic else self.pi 200 | return self.sess.run(act_op, feed_dict={self.x_ph: np.expand_dims(o, axis=0)})[0] 201 | 202 | # Tensorflow Summary Ops 203 | def build_summaries(self): 204 | test_summaries = [] 205 | episode_reward = tf.Variable(0.) 206 | episode_score = tf.Variable(0.) 207 | a_l_ratio = tf.Variable(0.) 208 | update_frequency = tf.Variable(0.) 
209 | test_summaries.append(tf.summary.scalar("Reward", episode_reward)) 210 | test_summaries.append(tf.summary.scalar("score", episode_score)) 211 | test_summaries.append(tf.summary.scalar("a_l_ratio", a_l_ratio)) 212 | test_summaries.append(tf.summary.scalar("update_frequency", update_frequency)) 213 | test_ops = tf.summary.merge(test_summaries) 214 | test_vars = [episode_reward, episode_score, a_l_ratio, update_frequency] 215 | 216 | return test_ops, test_vars 217 | 218 | def write_tb(self, ave_test_reward, ave_score, alratio, update_frequency, total_learner_step): 219 | summary_str = self.sess.run(self.test_ops, feed_dict={ 220 | self.test_vars[0]: ave_test_reward, 221 | self.test_vars[1]: ave_score, 222 | self.test_vars[2]: alratio, 223 | self.test_vars[3]: update_frequency 224 | }) 225 | 226 | self.writer.add_summary(summary_str, total_learner_step) 227 | self.writer.flush() 228 | 229 | def test(self, test_env, n=10): 230 | 231 | test_rets = [] 232 | scores = [] 233 | 234 | for _ in range(n): 235 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 236 | 237 | while True: 238 | a = self.get_action(o, deterministic=True) 239 | # Step the env 240 | o, r, d, _ = test_env.step(a) 241 | 242 | ep_ret += r 243 | ep_len += 1 244 | 245 | if d: 246 | test_rets.append(ep_ret) 247 | scores.append(test_env.rewards[0]) 248 | # print('test_ep_len:', ep_len, 'test_ep_ret:', ep_ret) 249 | break 250 | return np.mean(test_rets), np.mean(scores) 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /tutorial/Parallelize your algorithm by Ray (3).md: -------------------------------------------------------------------------------- 1 | # 使用Ray并行化你的强化学习算法(三) 2 | 3 | ## SAC并行版本实现 4 | 5 | 这一章,我们将上节分好的各部分代码放入并行框架中。 6 | 7 | 我们的并行框架结构图(内容仅涉及到白色线条部分): 8 | 9 | ![ddrlframework](.\Pictures\ddrlframework.jpg) 10 | 11 | 下面是用ray实现的框架。 12 | 13 | ```python 14 | @ray.remote 15 | class ReplayBuffer: 16 | ... 17 | # replay buffer 18 | 19 | @ray.remote 20 | class ParameterServer(object): 21 | ... 22 | # keep the newest network weights here 23 | # could pull and push the weights 24 | # also could save the weights to local 25 | 26 | @ray.remote 27 | def worker_rollout(ps, replay_buffer, opt, worker_index): 28 | ... 29 | # bulid a rollout network 30 | # pull weights from ps 31 | # for loop: 32 | # interactive with environment 33 | # store experience to replay buffer 34 | # if end of episode: 35 | # pull weights from ps 36 | 37 | @ray.remote(num_gpus=1, max_calls=1) 38 | def worker_train(ps, replay_buffer, opt, learner_index): 39 | ... 40 | # build a learner network 41 | # pull weights from ps 42 | # for loop: 43 | # get sample batch from replaybuffer 44 | # update network and push new weights to ps 45 | 46 | @ray.remote 47 | def worker_test(ps, replay_buffer, opt, worker_index=0): 48 | ... 
49 | # bulid a test network usually same as rollout 50 | # while: 51 | # pull weights from ps 52 | # do test 53 | # might save model here 54 | 55 | if __name__ == '__main__': 56 | 57 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 58 | 59 | # create the parameter server 60 | ps = ParameterServer.remote([], [], is_restore=True) 61 | 62 | # create replay buffer 63 | replay_buffer = ReplayBuffer.remote(obs_dim=opt.obs_dim, act_dim=opt.act_dim, size=opt.replay_size) 64 | 65 | # Start some rollout tasks 66 | task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 67 | 68 | time.sleep(5) 69 | 70 | # start training tasks 71 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_learners)] 72 | 73 | # start testing 74 | task_test = worker_test.remote(ps, replay_buffer, opt) 75 | 76 | # wait util task test end 77 | # Keep the main process running. Otherwise everything will shut down when main process finished. 78 | ray.wait([task_test, ]) 79 | ``` 80 | 81 | --- 82 | 83 | 0. model 84 | 85 | 我们先看算法的核心部分:model,包含了TensorFlow建图,计算loss,训练和测试。 86 | 87 | 新建一个的文件,将之前model部分,训练部分和测试部分的代码都放入Model类中去。之后我们建立一个实例后,就可以调用方法生成动作,训练更新参数,测试评估参数。 88 | 89 | ```python 90 | class Model(object): 91 | 92 | def __init__(self, args): 93 | # model part code 94 | def get_action(self, o, deterministic=False): 95 | # get_action method 96 | def train(self, replay_buffer, args): 97 | # train part code 98 | def test_agent(self, test_env, args, n=10): 99 | # test method copy 100 | 101 | ``` 102 | 103 | --- 104 | 105 | 将代码放入对应位置。 106 | 107 | ```python 108 | import numpy as np 109 | import tensorflow as tf 110 | import gym 111 | import time 112 | from spinup.algos.sac import core 113 | from spinup.algos.sac.core import get_vars 114 | from spinup.utils.logx import EpochLogger 115 | from core import mlp_actor_critic as actor_critic 116 | import ray.experimental.tf_utils 117 | 118 | 119 | class Model(object): 120 | 121 | def __init__(self, args): 122 | 123 | # Inputs to computation graph 124 | 125 | 126 | def get_action(self, o, deterministic=False): 127 | act_op = mu if deterministic else pi 128 | return sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0] 129 | 130 | def train(self, replay_buffer, args): 131 | 132 | for j in range(args.ep_len): 133 | batch = replay_buffer.sample_batch(args.batch_size) 134 | feed_dict = {self.x_ph: batch['obs1'], 135 | self.x2_ph: batch['obs2'], 136 | self.a_ph: batch['acts'], 137 | self.r_ph: batch['rews'], 138 | self.d_ph: batch['done'], 139 | } 140 | outs = sess.run(self.step_ops, feed_dict) 141 | # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 142 | # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 143 | # VVals=outs[6], LogPi=outs[7]) 144 | 145 | def test_agent(self, test_env, args, n=10): 146 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 147 | for j in range(n): 148 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 149 | while not (d or (ep_len == args.max_ep_len)): 150 | # Take deterministic actions at test time 151 | o, r, d, _ = test_env.step(self.get_action(o, True)) 152 | ep_ret += r 153 | ep_len += 1 154 | print(ep_len, ep_ret) 155 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 156 | 157 | ``` 158 | 159 | --- 160 | 161 | 之外,我们还需要额外添加几个有用的方法。learner不断更新权重,需要把最新的权重导出到ps server上去。rollout需要不断从ps上下载最新权重并更换为自己的权重。 162 | 163 | ray中已经有写好的类。方便我们导入和导出权重。 164 | 165 | ```python 166 | def __init__(self, args): 167 | 168 | ... 
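        # TensorFlowVariables gathers the variables that value_loss depends on and exposes
        # get_weights()/set_weights() keyed by variable name -- exactly the dictionary that
        # the parameter server stores, pushes and pulls.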
169 | 170 | self.variables = ray.experimental.tf_utils.TensorFlowVariables(self.value_loss, self.sess) 171 | ``` 172 | 173 | 目标函数的权重在导入权重以后做初始化才有意义,所以把它放在更新权重方法里。 174 | 175 | ```python 176 | def set_weights(self, variable_names, weights): 177 | self.variables.set_weights(dict(zip(variable_names, weights))) 178 | self.sess.run(self.target_init) 179 | 180 | def get_weights(self): 181 | weights = self.variables.get_weights() 182 | keys = [key for key in list(weights.keys()) if "main" in key] 183 | values = [weights[key] for key in keys] 184 | return keys, values 185 | ``` 186 | 187 | --- 188 | 189 | 1. Replay Buffer,只要在上面加上ray的修饰器就行了。 190 | 191 | ```python 192 | @ray.remote 193 | class ReplayBuffer: 194 | ... 195 | # replay buffer 196 | ``` 197 | 198 | --- 199 | 200 | 2. Parameter Server 201 | 202 | 参数保存在字典里面。Parameter Server的主要功能就是给worker返回最新的权重,接收learner传来的最新的权重。 203 | 204 | ```python 205 | @ray.remote 206 | class ParameterServer(object): 207 | def __init__(self, keys, values): 208 | # These values will be mutated, so we must create a copy that is not 209 | # backed by the object store. 210 | values = [value.copy() for value in values] 211 | self.weights = dict(zip(keys, values)) 212 | 213 | def push(self, keys, values): 214 | values = [value.copy() for value in values] 215 | for key, value in zip(keys, values): 216 | self.weights[key] = value 217 | 218 | def pull(self, keys): 219 | return [self.weights[key] for key in keys] 220 | 221 | def get_weights(self): 222 | return self.weights 223 | 224 | # save weights to disk 225 | def save_weights(self, name): 226 | with open(name + "weights.pickle", "wb") as pickle_out: 227 | pickle.dump(self.weights, pickle_out) 228 | ``` 229 | 230 | --- 231 | 232 | 3. rollout 233 | 234 | rollout (worker) 与环境交互,产生数据并存入Replay Buffer。每个episode结束会从Parameter Server得到最新权重来更新自己。 235 | 236 | ```python 237 | @ray.remote 238 | def worker_rollout(ps, replay_buffer, args): 239 | env = gym.make(args.env) 240 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 241 | total_steps = args.steps_per_epoch * args.epochs 242 | 243 | agent = Model(args) 244 | keys = agent.get_weights()[0] 245 | 246 | weights = ray.get(ps.pull.remote(keys)) 247 | agent.set_weights(keys, weights) 248 | 249 | # Main loop: collect experience in env and update/log each epoch 250 | for t in range(total_steps): 251 | 252 | """ 253 | Until start_steps have elapsed, randomly sample actions 254 | from a uniform distribution for better exploration. Afterwards, 255 | use the learned policy. 256 | """ 257 | if t > args.start_steps: 258 | a = agent.get_action(o) 259 | else: 260 | a = env.action_space.sample() 261 | 262 | # Step the env 263 | o2, r, d, _ = env.step(a) 264 | ep_ret += r 265 | ep_len += 1 266 | 267 | # Ignore the "done" signal if it comes from hitting the time 268 | # horizon (that is, when it's an artificial terminal signal 269 | # that isn't based on the agent's state) 270 | d = False if ep_len == args.max_ep_len else d 271 | 272 | # Store experience to replay buffer 273 | replay_buffer.store.remote(o, a, r, o2, d) 274 | 275 | # Super critical, easy to overlook step: make sure to update 276 | # most recent observation! 277 | o = o2 278 | 279 | if d or (ep_len == args.max_ep_len): 280 | """ 281 | Perform all SAC updates at the end of the trajectory. 282 | This is a slight difference from the SAC specified in the 283 | original paper. 
284 | """ 285 | 286 | # print(ep_len, ep_ret) 287 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 288 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 289 | 290 | weights = ray.get(ps.pull.remote(keys)) 291 | agent.set_weights(keys, weights) 292 | ``` 293 | 294 | --- 295 | 296 | 4. train 297 | 298 | 我们使用一个GPU进行训练。所有在ray修饰器里我们设置资源请求量。 299 | 300 | 当使用GPU执行任务时,任务会在GPU上分配内存,而且有可能在执行结束后不释放。在设置中写入`max_calls=1`可以让任务运行结束后自动退出并释放GPU内存。 301 | 302 | ```python 303 | @ray.remote(num_gpus=1, max_calls=1) 304 | def worker_train(ps, replay_buffer, args): 305 | agent = Model(args) 306 | keys = agent.get_weights()[0] 307 | 308 | weights = ray.get(ps.pull.remote(keys)) 309 | agent.set_weights(keys, weights) 310 | 311 | cnt = 1 312 | while True: 313 | 314 | agent.train(replay_buffer, args) 315 | 316 | if cnt % 300 == 0: 317 | keys, values = agent.get_weights() 318 | ps.push.remote(keys, values) 319 | 320 | cnt += 1 321 | ``` 322 | 323 | --- 324 | 325 | 5. test 326 | 327 | ```python 328 | @ray.remote 329 | def worker_test(ps, start_time): 330 | 331 | from spinup.utils.run_utils import setup_logger_kwargs 332 | 333 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 334 | logger = EpochLogger(**logger_kwargs) 335 | # print(locals()) 336 | # logger.save_config(locals()) 337 | 338 | agent = Model(args) 339 | keys = agent.get_weights()[0] 340 | 341 | weights = ray.get(ps.pull.remote(keys)) 342 | agent.set_weights(keys, weights) 343 | test_env = gym.make(args.env) 344 | while True: 345 | ave_ret = agent.test_agent(test_env, args) 346 | # print("test Average Ret:", ave_ret, "time:", time.time()-start_time) 347 | logger.log_tabular('test Average Ret', ave_ret) 348 | logger.log_tabular('Time', time.time() - start_time) 349 | logger.dump_tabular() 350 | weights = ray.get(ps.pull.remote(keys)) 351 | agent.set_weights(keys, weights) 352 | 353 | ``` 354 | 355 | --- 356 | 357 | 主程序调用 358 | 359 | ```python 360 | if __name__ == '__main__': 361 | 362 | ... 363 | 364 | ray.init() 365 | 366 | net = Model(args) 367 | all_keys, all_values = net.get_weights() 368 | ps = ParameterServer.remote(all_keys, all_values) 369 | 370 | replay_buffer = ReplayBuffer.remote(args.obs_dim, args.act_dim, args.replay_size) 371 | 372 | # Start some training tasks. 
373 | task_rollout = [worker_rollout.remote(ps, replay_buffer, args) for i in range(args.num_workers)] 374 | 375 | time.sleep(20) 376 | 377 | task_train = [worker_train.remote(ps, replay_buffer, args) for i in range(args.num_learners)] 378 | 379 | time.sleep(10) 380 | 381 | task_test = worker_test.remote(ps) 382 | ray.wait([task_test, ]) 383 | ``` 384 | 385 | 本节完。 386 | 387 | 本文展示的代码是实现分布式算法的最小改动版本,还有许多地方可以优化。 388 | 389 | 简单实验对比: 390 | 391 | 实验:LunarLanderContinuous-v2 392 | 393 | 未调参,sac和dsac参数相同,dsac的worker数量:1。GPU:GTX1060 394 | 395 | ![dsac1w-sac](.\Pictures\dsac1w-sac.png) 396 | 397 | 完整代码链接: 398 | 399 | 参考资料: 400 | 401 | https://ray.readthedocs.io/en/master/auto_examples/plot_parameter_server.html -------------------------------------------------------------------------------- /algos/sac1/sac_ray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | import ray 5 | import gym 6 | 7 | from hyperparams import HyperParameters, Wrapper 8 | from actor_learner import Actor, Learner 9 | 10 | import os 11 | import pickle 12 | import multiprocessing 13 | import copy 14 | 15 | from collections import deque 16 | 17 | import inspect 18 | import json 19 | from ray.rllib.utils.compression import pack, unpack 20 | 21 | 22 | flags = tf.app.flags 23 | FLAGS = tf.app.flags.FLAGS 24 | 25 | 26 | flags.DEFINE_string("env_name", "BipedalWalker-v2", "game env") 27 | flags.DEFINE_string("exp_name", "Exp1", "experiments name") 28 | flags.DEFINE_integer("num_workers", 16, "number of workers") 29 | flags.DEFINE_string("weights_file", "", "empty means False. " 30 | "[Maxret_weights.pickle] means restore weights from this pickle file.") 31 | flags.DEFINE_float("a_l_ratio", 2, "steps / sample_times") 32 | 33 | 34 | @ray.remote(num_cpus=2) 35 | class ReplayBuffer: 36 | """ 37 | A simple FIFO experience replay buffer for SQN_N_STEP agents. 
38 | """ 39 | 40 | def __init__(self, opt): 41 | self.opt = opt 42 | if opt.obs_shape != (115,): 43 | self.buffer_o = np.array([['0' * 2000] * (opt.Ln + 1)] * opt.buffer_size, dtype=np.str) 44 | else: 45 | self.buffer_o = np.zeros((opt.buffer_size, opt.Ln + 1) + opt.obs_shape, dtype=np.float32) 46 | self.buffer_a = np.zeros((opt.buffer_size, opt.Ln) + opt.act_shape, dtype=np.float32) 47 | self.buffer_r = np.zeros((opt.buffer_size, opt.Ln), dtype=np.float32) 48 | self.buffer_d = np.zeros((opt.buffer_size, opt.Ln), dtype=np.float32) 49 | self.ptr, self.size, self.max_size = 0, 0, opt.buffer_size 50 | self.steps, self.sample_times = 0, 0 51 | 52 | def store(self, o_queue, a_r_d_queue, worker_index): 53 | 54 | obs, = np.stack(o_queue, axis=1) 55 | 56 | if self.opt.obs_shape != (115,): 57 | self.buffer_o[self.ptr] = obs 58 | else: 59 | self.buffer_o[self.ptr] = np.array(list(obs), dtype=np.float32) 60 | 61 | a, r, d, = np.stack(a_r_d_queue, axis=1) 62 | self.buffer_a[self.ptr] = np.array(list(a), dtype=np.float32) 63 | self.buffer_r[self.ptr] = np.array(list(r), dtype=np.float32) 64 | self.buffer_d[self.ptr] = np.array(list(d), dtype=np.float32) 65 | 66 | self.ptr = (self.ptr + 1) % self.max_size 67 | self.size = min(self.size + 1, self.max_size) 68 | # TODO 69 | self.steps += 1 * self.opt.num_buffers 70 | # self.steps += opt.Ln * opt.action_repeat 71 | 72 | def sample_batch(self): 73 | idxs = np.random.randint(0, self.size, size=self.opt.batch_size) 74 | # TODO 75 | self.sample_times += 1 * self.opt.num_buffers 76 | 77 | return dict(obs=self.buffer_o[idxs], 78 | acts=self.buffer_a[idxs], 79 | rews=self.buffer_r[idxs], 80 | done=self.buffer_d[idxs], ) 81 | 82 | def get_counts(self): 83 | return self.sample_times, self.steps, self.size 84 | 85 | 86 | @ray.remote 87 | class ParameterServer(object): 88 | def __init__(self, keys, values, weights_file=""): 89 | # These values will be mutated, so we must create a copy that is not 90 | # backed by the object store. 91 | 92 | if weights_file: 93 | try: 94 | with open(weights_file, "rb") as pickle_in: 95 | self.weights = pickle.load(pickle_in) 96 | print("****** weights restored! ******") 97 | except: 98 | print("------------------------------------------------") 99 | print(weights_file) 100 | print("------ error: weights file doesn't exist! 
------") 101 | exit() 102 | else: 103 | values = [value.copy() for value in values] 104 | self.weights = dict(zip(keys, values)) 105 | 106 | def push(self, keys, values): 107 | values = [value.copy() for value in values] 108 | for key, value in zip(keys, values): 109 | self.weights[key] = value 110 | 111 | def pull(self, keys): 112 | return [self.weights[key] for key in keys] 113 | 114 | def get_weights(self): 115 | return self.weights 116 | 117 | # save weights to disk 118 | def save_weights(self, name): 119 | with open(name + "weights.pickle", "wb") as pickle_out: 120 | pickle.dump(self.weights, pickle_out) 121 | 122 | 123 | class Cache(object): 124 | 125 | def __init__(self, replay_buffer): 126 | # cache for training data and model weights 127 | print('os.pid:', os.getpid()) 128 | self.replay_buffer = replay_buffer 129 | self.q1 = multiprocessing.Queue(10) 130 | self.q2 = multiprocessing.Queue(5) 131 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.replay_buffer)) 132 | self.p1.daemon = True 133 | 134 | def ps_update(self, q1, q2, replay_buffer): 135 | print('os.pid of put_data():', os.getpid()) 136 | 137 | q1.put(copy.deepcopy(ray.get(replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].sample_batch.remote()))) 138 | 139 | while True: 140 | if q1.qsize() < 10: 141 | q1.put(copy.deepcopy(ray.get(replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].sample_batch.remote()))) 142 | 143 | if not q2.empty(): 144 | keys, values = q2.get() 145 | ps.push.remote(keys, values) 146 | 147 | def start(self): 148 | self.p1.start() 149 | self.p1.join(10) 150 | 151 | def end(self): 152 | self.p1.terminate() 153 | 154 | # TODO 155 | @ray.remote(num_cpus=2, num_gpus=1, max_calls=1) 156 | def worker_train(ps, replay_buffer, opt, learner_index): 157 | agent = Learner(opt, job="learner") 158 | keys = agent.get_weights()[0] 159 | weights = ray.get(ps.pull.remote(keys)) 160 | agent.set_weights(keys, weights) 161 | 162 | cache = Cache(replay_buffer) 163 | 164 | cache.start() 165 | 166 | cnt = 1 167 | while True: 168 | batch = cache.q1.get() 169 | if opt.model == "cnn": 170 | batch['obs'] = np.array([[unpack(o) for o in lno] for lno in batch['obs']]) 171 | agent.train(batch, cnt) 172 | # TODO cnt % 300 == 0 before 173 | if cnt % 100 == 0: 174 | cache.q2.put(agent.get_weights()) 175 | cnt += 1 176 | 177 | 178 | @ray.remote 179 | def worker_rollout(ps, replay_buffer, opt, worker_index): 180 | 181 | agent = Actor(opt, job="worker") 182 | keys = agent.get_weights()[0] 183 | 184 | filling_steps = 0 185 | while True: 186 | # ------ env set up ------ 187 | env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise, opt.reward_scale, 3) 188 | # ------ env set up end ------ 189 | 190 | ################################## deques 191 | 192 | o_queue = deque([], maxlen=opt.Ln + 1) 193 | a_r_d_queue = deque([], maxlen=opt.Ln) 194 | 195 | ################################## deques 196 | 197 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 198 | 199 | ################################## deques reset 200 | t_queue = 1 201 | if opt.model == "cnn": 202 | compressed_o = pack(o) 203 | o_queue.append((compressed_o,)) 204 | else: 205 | o_queue.append((o,)) 206 | 207 | ################################## deques reset 208 | 209 | weights = ray.get(ps.pull.remote(keys)) 210 | agent.set_weights(keys, weights) 211 | 212 | while True: 213 | 214 | # don't need to random sample action if load weights from local. 
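            # Uniform random actions are used for the first start_steps environment steps
            # to improve early exploration; if pretrained weights were restored via
            # opt.weights_file, the learned policy is used from the very first step instead.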
215 | if filling_steps > opt.start_steps or opt.weights_file: 216 | a = agent.get_action(o, deterministic=False) 217 | else: 218 | a = env.action_space.sample() 219 | filling_steps += 1 220 | # Step the env 221 | o2, r, d, _ = env.step(a) 222 | 223 | ep_ret += r 224 | ep_len += 1 225 | 226 | # Ignore the "done" signal if it comes from hitting the time 227 | # horizon (that is, when it's an artificial terminal signal 228 | # that isn't based on the agent's state) 229 | # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d 230 | 231 | o = o2 232 | 233 | #################################### deques store 234 | 235 | a_r_d_queue.append((a, r, d,)) 236 | if opt.model == "cnn": 237 | compressed_o2 = pack(o2) 238 | o_queue.append((compressed_o2,)) 239 | else: 240 | o_queue.append((o2,)) 241 | 242 | # scheme 1: 243 | # TODO and t_queue % 2 == 0: %1 lead to q smaller 244 | # TODO 245 | if t_queue >= opt.Ln and t_queue % opt.save_freq == 0: 246 | replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(o_queue, a_r_d_queue, worker_index) 247 | 248 | t_queue += 1 249 | 250 | #################################### deques store 251 | 252 | # End of episode. Training (ep_len times). 253 | if d or (ep_len * opt.action_repeat >= opt.max_ep_len): 254 | # TODO 255 | sample_times, steps, _ = ray.get(replay_buffer[0].get_counts.remote()) 256 | 257 | print('rollout_ep_len:', ep_len * opt.action_repeat, 'rollout_ep_ret:', ep_ret) 258 | 259 | if steps > opt.start_steps: 260 | # update parameters every episode 261 | weights = ray.get(ps.pull.remote(keys)) 262 | agent.set_weights(keys, weights) 263 | 264 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 265 | 266 | ################################## deques reset 267 | t_queue = 1 268 | if opt.model == "cnn": 269 | compressed_o = pack(o) 270 | o_queue.append((compressed_o,)) 271 | else: 272 | o_queue.append((o,)) 273 | 274 | ################################## deques reset 275 | 276 | 277 | @ray.remote 278 | def worker_test(ps, replay_buffer, opt): 279 | agent = Actor(opt, job="main") 280 | 281 | test_env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise, opt.reward_scale, 3) 282 | 283 | agent.test(ps, replay_buffer, opt, test_env) 284 | 285 | 286 | if __name__ == '__main__': 287 | 288 | # ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 289 | ray.init() 290 | 291 | # ------ HyperParameters ------ 292 | opt = HyperParameters(FLAGS.env_name, FLAGS.exp_name, FLAGS.num_workers, FLAGS.a_l_ratio, 293 | FLAGS.weights_file) 294 | All_Parameters = copy.deepcopy(vars(opt)) 295 | All_Parameters["wrapper"] = inspect.getsource(Wrapper) 296 | All_Parameters["obs_space"] = "" 297 | All_Parameters["act_space"] = "" 298 | 299 | try: 300 | os.makedirs(opt.save_dir) 301 | except OSError: 302 | pass 303 | with open(opt.save_dir + "/" + 'All_Parameters.json', 'w') as fp: 304 | json.dump(All_Parameters, fp, indent=4, sort_keys=True) 305 | 306 | # ------ end ------ 307 | 308 | if FLAGS.weights_file: 309 | ps = ParameterServer.remote([], [], weights_file=FLAGS.weights_file) 310 | else: 311 | net = Learner(opt, job="main") 312 | all_keys, all_values = net.get_weights() 313 | ps = ParameterServer.remote(all_keys, all_values) 314 | 315 | # Experience buffer 316 | # Methods called on different actors can execute in parallel, 317 | # and methods called on the same actor are executed serially in the order that they are called. 318 | # we need more buffer for more workers to keep high store speed. 
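    # Sharded buffer: each store()/sample_batch() call elsewhere in this file picks one of
    # these buffer actors at random (np.random.choice(opt.num_buffers, 1)), so writes from
    # many rollout workers do not serialize on a single actor.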
319 | replay_buffer = [ReplayBuffer.remote(opt) for i in range(opt.num_buffers)] 320 | 321 | # Start some training tasks. 322 | for i in range(FLAGS.num_workers): 323 | worker_rollout.remote(ps, replay_buffer, opt, i) 324 | time.sleep(0.05) 325 | # task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 326 | 327 | if opt.weights_file: 328 | fill_steps = opt.start_steps / 100 329 | else: 330 | fill_steps = opt.start_steps 331 | # store at least start_steps in buffer before training 332 | _, steps, _ = ray.get(replay_buffer[0].get_counts.remote()) 333 | while steps < fill_steps: 334 | _, steps, _ = ray.get(replay_buffer[0].get_counts.remote()) 335 | print('fill steps before learn:', steps) 336 | time.sleep(1) 337 | 338 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(opt.num_learners)] 339 | 340 | time.sleep(10) 341 | while True: 342 | task_test = worker_test.remote(ps, replay_buffer, opt) 343 | ray.wait([task_test, ]) 344 | -------------------------------------------------------------------------------- /example/sac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.sac import core 6 | from spinup.algos.sac.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | 9 | 10 | class ReplayBuffer: 11 | """ 12 | A simple FIFO experience replay buffer for SAC agents. 13 | """ 14 | 15 | def __init__(self, obs_dim, act_dim, size): 16 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 17 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 19 | self.rews_buf = np.zeros(size, dtype=np.float32) 20 | self.done_buf = np.zeros(size, dtype=np.float32) 21 | self.ptr, self.size, self.max_size = 0, 0, size 22 | 23 | def store(self, obs, act, rew, next_obs, done): 24 | self.obs1_buf[self.ptr] = obs 25 | self.obs2_buf[self.ptr] = next_obs 26 | self.acts_buf[self.ptr] = act 27 | self.rews_buf[self.ptr] = rew 28 | self.done_buf[self.ptr] = done 29 | self.ptr = (self.ptr+1) % self.max_size 30 | self.size = min(self.size+1, self.max_size) 31 | 32 | def sample_batch(self, batch_size=32): 33 | idxs = np.random.randint(0, self.size, size=batch_size) 34 | return dict(obs1=self.obs1_buf[idxs], 35 | obs2=self.obs2_buf[idxs], 36 | acts=self.acts_buf[idxs], 37 | rews=self.rews_buf[idxs], 38 | done=self.done_buf[idxs]) 39 | 40 | """ 41 | 42 | Soft Actor-Critic 43 | 44 | (With slight variations that bring it closer to TD3) 45 | 46 | """ 47 | def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 48 | steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 50 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 51 | """ 52 | 53 | Args: 54 | env_fn : A function which creates a copy of the environment. 55 | The environment must satisfy the OpenAI Gym API. 56 | 57 | actor_critic: A function which takes in placeholder symbols 58 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 59 | outputs from the agent's Tensorflow computation graph: 60 | 61 | =========== ================ ====================================== 62 | Symbol Shape Description 63 | =========== ================ ====================================== 64 | ``mu`` (batch, act_dim) | Computes mean actions from policy 65 | | given states. 
66 | ``pi`` (batch, act_dim) | Samples actions from policy given 67 | | states. 68 | ``logp_pi`` (batch,) | Gives log probability, according to 69 | | the policy, of the action sampled by 70 | | ``pi``. Critical: must be differentiable 71 | | with respect to policy parameters all 72 | | the way through action sampling. 73 | ``q1`` (batch,) | Gives one estimate of Q* for 74 | | states in ``x_ph`` and actions in 75 | | ``a_ph``. 76 | ``q2`` (batch,) | Gives another estimate of Q* for 77 | | states in ``x_ph`` and actions in 78 | | ``a_ph``. 79 | ``q1_pi`` (batch,) | Gives the composition of ``q1`` and 80 | | ``pi`` for states in ``x_ph``: 81 | | q1(x, pi(x)). 82 | ``q2_pi`` (batch,) | Gives the composition of ``q2`` and 83 | | ``pi`` for states in ``x_ph``: 84 | | q2(x, pi(x)). 85 | ``v`` (batch,) | Gives the value estimate for states 86 | | in ``x_ph``. 87 | =========== ================ ====================================== 88 | 89 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 90 | function you provided to SAC. 91 | 92 | seed (int): Seed for random number generators. 93 | 94 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 95 | for the agent and the environment in each epoch. 96 | 97 | epochs (int): Number of epochs to run and train agent. 98 | 99 | replay_size (int): Maximum length of replay buffer. 100 | 101 | gamma (float): Discount factor. (Always between 0 and 1.) 102 | 103 | polyak (float): Interpolation factor in polyak averaging for target 104 | networks. Target networks are updated towards main networks 105 | according to: 106 | 107 | .. math:: \\theta_{\\text{targ}} \\leftarrow 108 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 109 | 110 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 111 | close to 1.) 112 | 113 | lr (float): Learning rate (used for both policy and value learning). 114 | 115 | alpha (float): Entropy regularization coefficient. (Equivalent to 116 | inverse of reward scale in the original SAC paper.) 117 | 118 | batch_size (int): Minibatch size for SGD. 119 | 120 | start_steps (int): Number of steps for uniform-random action selection, 121 | before running real policy. Helps exploration. 122 | 123 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 124 | 125 | logger_kwargs (dict): Keyword args for EpochLogger. 126 | 127 | save_freq (int): How often (in terms of gap between epochs) to save 128 | the current policy and value function. 129 | 130 | """ 131 | 132 | logger = EpochLogger(**logger_kwargs) 133 | logger.save_config(locals()) 134 | 135 | tf.set_random_seed(seed) 136 | np.random.seed(seed) 137 | 138 | env, test_env = env_fn(), env_fn() 139 | obs_dim = env.observation_space.shape[0] 140 | act_dim = env.action_space.shape[0] 141 | 142 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 
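# Note: gym Box spaces may define a different bound for every action dimension;
# using high[0] is only valid because this environment's action limits are identical
# (and symmetric) across dimensions. A per-dimension variant (illustrative only, not
# used in this script) would be:
#     act_scale = (env.action_space.high - env.action_space.low) / 2.0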
143 | act_limit = env.action_space.high[0] 144 | 145 | # Share information about action space with policy architecture 146 | ac_kwargs['action_space'] = env.action_space 147 | 148 | # Inputs to computation graph 149 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 150 | 151 | # Main outputs from computation graph 152 | with tf.variable_scope('main'): 153 | mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) 154 | 155 | # Target value network 156 | with tf.variable_scope('target'): 157 | _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 158 | 159 | # Experience buffer 160 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 161 | 162 | # Count variables 163 | var_counts = tuple(core.count_vars(scope) for scope in 164 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 165 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 166 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) 167 | 168 | # Min Double-Q: 169 | min_q_pi = tf.minimum(q1_pi, q2_pi) 170 | 171 | # Targets for Q and V regression 172 | q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) 173 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 174 | 175 | # Soft actor-critic losses 176 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 177 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) 178 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) 179 | v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) 180 | value_loss = q1_loss + q2_loss + v_loss 181 | 182 | # Policy train op 183 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 184 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 185 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 186 | 187 | # Value train op 188 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 189 | value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 190 | value_params = get_vars('main/q') + get_vars('main/v') 191 | with tf.control_dependencies([train_pi_op]): 192 | train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 193 | 194 | # Polyak averaging for target variables 195 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 196 | with tf.control_dependencies([train_value_op]): 197 | target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) 198 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 199 | 200 | # All ops to call during one training step 201 | step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 202 | train_pi_op, train_value_op, target_update] 203 | 204 | # Initializing targets to match main variables 205 | target_init = tf.group([tf.assign(v_targ, v_main) 206 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 207 | 208 | sess = tf.Session() 209 | sess.run(tf.global_variables_initializer()) 210 | sess.run(target_init) 211 | 212 | # Setup model saving 213 | logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 214 | outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) 215 | 216 | def get_action(o, deterministic=False): 217 | act_op = mu if deterministic else pi 218 | return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] 219 | 220 | def test_agent(n=10): 221 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 222 | for j in range(n): 223 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 224 | while 
not(d or (ep_len == max_ep_len)): 225 | # Take deterministic actions at test time 226 | o, r, d, _ = test_env.step(get_action(o, True)) 227 | ep_ret += r 228 | ep_len += 1 229 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 230 | 231 | start_time = time.time() 232 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 233 | total_steps = steps_per_epoch * epochs 234 | 235 | # Main loop: collect experience in env and update/log each epoch 236 | for t in range(total_steps): 237 | 238 | """ 239 | Until start_steps have elapsed, randomly sample actions 240 | from a uniform distribution for better exploration. Afterwards, 241 | use the learned policy. 242 | """ 243 | if t > start_steps: 244 | a = get_action(o) 245 | else: 246 | a = env.action_space.sample() 247 | 248 | # Step the env 249 | o2, r, d, _ = env.step(a) 250 | ep_ret += r 251 | ep_len += 1 252 | 253 | # Ignore the "done" signal if it comes from hitting the time 254 | # horizon (that is, when it's an artificial terminal signal 255 | # that isn't based on the agent's state) 256 | d = False if ep_len==max_ep_len else d 257 | 258 | # Store experience to replay buffer 259 | replay_buffer.store(o, a, r, o2, d) 260 | 261 | # Super critical, easy to overlook step: make sure to update 262 | # most recent observation! 263 | o = o2 264 | 265 | if d or (ep_len == max_ep_len): 266 | """ 267 | Perform all SAC updates at the end of the trajectory. 268 | This is a slight difference from the SAC specified in the 269 | original paper. 270 | """ 271 | for j in range(ep_len): 272 | batch = replay_buffer.sample_batch(batch_size) 273 | feed_dict = {x_ph: batch['obs1'], 274 | x2_ph: batch['obs2'], 275 | a_ph: batch['acts'], 276 | r_ph: batch['rews'], 277 | d_ph: batch['done'], 278 | } 279 | outs = sess.run(step_ops, feed_dict) 280 | logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 281 | LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 282 | VVals=outs[6], LogPi=outs[7]) 283 | 284 | logger.store(EpRet=ep_ret, EpLen=ep_len) 285 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 286 | 287 | 288 | # End of epoch wrap-up 289 | if t > 0 and t % steps_per_epoch == 0: 290 | epoch = t // steps_per_epoch 291 | 292 | # Save model 293 | if (epoch % save_freq == 0) or (epoch == epochs-1): 294 | logger.save_state({'env': env}, None) 295 | 296 | # Test the performance of the deterministic version of the agent. 
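# Evaluation runs the deterministic policy: test_agent() calls get_action(o, True),
# which returns the policy mean `mu` instead of a sample from `pi`, so test returns
# reflect the learned policy without exploration noise.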
297 | test_agent() 298 | 299 | # Log info about epoch 300 | logger.log_tabular('Epoch', epoch) 301 | logger.log_tabular('EpRet', with_min_and_max=True) 302 | logger.log_tabular('TestEpRet', with_min_and_max=True) 303 | logger.log_tabular('EpLen', average_only=True) 304 | logger.log_tabular('TestEpLen', average_only=True) 305 | logger.log_tabular('TotalEnvInteracts', t) 306 | logger.log_tabular('Q1Vals', with_min_and_max=True) 307 | logger.log_tabular('Q2Vals', with_min_and_max=True) 308 | logger.log_tabular('VVals', with_min_and_max=True) 309 | logger.log_tabular('LogPi', with_min_and_max=True) 310 | logger.log_tabular('LossPi', average_only=True) 311 | logger.log_tabular('LossQ1', average_only=True) 312 | logger.log_tabular('LossQ2', average_only=True) 313 | logger.log_tabular('LossV', average_only=True) 314 | logger.log_tabular('Time', time.time()-start_time) 315 | logger.dump_tabular() 316 | 317 | if __name__ == '__main__': 318 | import argparse 319 | parser = argparse.ArgumentParser() 320 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 321 | parser.add_argument('--hid', type=int, default=300) 322 | parser.add_argument('--l', type=int, default=1) 323 | parser.add_argument('--gamma', type=float, default=0.99) 324 | parser.add_argument('--seed', '-s', type=int, default=0) 325 | parser.add_argument('--epochs', type=int, default=50) 326 | parser.add_argument('--exp_name', type=str, default='sac') 327 | args = parser.parse_args() 328 | 329 | from spinup.utils.run_utils import setup_logger_kwargs 330 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 331 | 332 | sac(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, 333 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 334 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 335 | logger_kwargs=logger_kwargs) -------------------------------------------------------------------------------- /algos/trading_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | from gym.spaces import Box 5 | import ctypes 6 | import json 7 | import os 8 | import sys 9 | from collections import deque 10 | import pandas as pd 11 | import pickle 12 | import time 13 | 14 | info_names = [ 15 | "Done", "LastPrice", "BidPrice1", "BidVolume1", "AskPrice1", "AskVolume1", "BidPrice2", "BidVolume2", 16 | "AskPrice2", "AskVolume2", "BidPrice3", "BidVolume3", "AskPrice3", "AskVolume3", "BidPrice4", 17 | "BidVolume4", "AskPrice4", "AskVolume4", "BidPrice5", "BidVolume5", "AskPrice5", "AskVolume5", "Volume", 18 | "HighestPrice", "LowestPrice", "TradingDay", "Target_Num", "Actual_Num", "AliveBidPrice1", 19 | "AliveBidVolume1", "AliveBidPrice2", "AliveBidVolume2", "AliveBidPrice3", "AliveBidVolume3", 20 | "AliveAskPrice1", "AliveAskVolume1", "AliveAskPrice2", "AliveAskVolume2", "AliveAskPrice3", 21 | "AliveAskVolume3", "score", "profit", "total_profit", "baseline_profit", "action", "designed_reward" 22 | ] 23 | 24 | data_v19_len = [ 25 | 225013, 225015, 225015, 225015, 225015, 225017, 225015, 225015, 225017, 225015, 225015, 225015, 225015, 225015, 26 | 225015, 225015, 225015, 225015, 225015, 225015, 225015, 225015, 225010, 225015, 225015, 135002, 225015, 225015, 27 | 225015, 225015, 225015, 225017, 225015, 225017, 225015, 225017, 225015, 225015, 225015, 225015, 225017, 225015, 28 | 225015, 225015, 225017, 225017, 225016, 225017, 225015, 225013, 225015, 225015, 225017, 225017, 225014, 225017, 29 | 225015, 225013, 225015, 225017, 
225015, 225015, 225015, 225017, 225015, 225017, 225017, 225015, 225015, 225015, 30 | 225017, 225017, 225015, 225015, 225017, 225015, 225015, 225017, 225015, 225015, 225014, 225015, 225015, 225015, 31 | 225015, 225015, 225017, 225017, 225015, 225015, 225015, 225015, 225017, 225015, 225017, 225015, 225015, 225015, 32 | 225015, 99005, 225015, 225017, 99009, 225015, 225015, 225009, 225017, 225015, 225015, 225015, 225013, 225013, 33 | 225015, 225015, 225013, 225015, 225015, 225017, 225015, 126016 34 | ] # 120days 35 | 36 | 37 | class TradingEnv(gym.Env): 38 | 39 | def __init__(self, action_scheme_id, obs_dim, auto_follow=0, max_ep_len=3000, render=False): 40 | super(TradingEnv, self).__init__() 41 | 42 | self.data_len = data_v19_len 43 | self.trainning_set = 90 44 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 45 | os.chdir(ROOT_DIR + "/rl_game/game/") 46 | so_file = "./game.so" 47 | self.expso = ctypes.cdll.LoadLibrary(so_file) 48 | arr_len = 100 49 | arr1 = ctypes.c_int * arr_len 50 | arr = ctypes.c_int * 1 51 | 52 | self.ctx = None 53 | 54 | self.actions = arr1() 55 | self.action_len = arr() 56 | self.raw_obs = arr1() 57 | self.raw_obs_len = arr() 58 | self.rewards = arr1() 59 | self.rewards_len = arr() 60 | 61 | self._step = self._action_schemes(action_scheme_id) 62 | self.auto_follow = auto_follow 63 | 64 | self.obs_dim = obs_dim 65 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32) 66 | 67 | self.step_len = 0 68 | 69 | self.max_ep_len = max_ep_len 70 | self.render = render 71 | 72 | self.his_price = deque(maxlen=30) 73 | 74 | def reset(self, start_day=None, start_skip=None, burn_in=0): 75 | 76 | # random start_day if no start_day 77 | if start_day is None: 78 | start_day = np.random.randint(1, self.trainning_set + 1, 1)[0] # first self.trainning_set days 79 | 80 | # random start_skip if no start_skip 81 | if start_skip is None: 82 | day_index = start_day - 1 83 | max_point = self.data_len[day_index] - self.max_ep_len - burn_in - 50 84 | start_skip = int(np.random.randint(0, max_point, 1)[0]) 85 | 86 | start_info = {"date_index": "{} - {}".format(start_day, start_day), "skip_steps": start_skip} 87 | # print(start_info) 88 | if self.ctx: 89 | self.close_env() 90 | self.ctx = self.expso.CreateContext(json.dumps(start_info).encode()) 91 | self.expso.GetActions(self.ctx, self.actions, self.action_len) 92 | self.expso.GetInfo(self.ctx, self.raw_obs, self.raw_obs_len) 93 | self.expso.GetReward(self.ctx, self.rewards, self.rewards_len) 94 | 95 | self.step_len = 0 96 | 97 | obs = self._get_obs(self.raw_obs) 98 | 99 | if self.render: 100 | self.rendering() 101 | 102 | return obs 103 | 104 | def step(self, action): 105 | target_num = self.raw_obs[26] 106 | actual_num = self.raw_obs[27] 107 | 108 | if self.auto_follow is not 0: 109 | if abs(actual_num - target_num) > self.auto_follow: 110 | if target_num > actual_num: 111 | action = 6 112 | else: 113 | action = 9 114 | 115 | self._step(action) 116 | self.expso.Step(self.ctx) 117 | self.expso.GetInfo(self.ctx, self.raw_obs, self.raw_obs_len) 118 | self.expso.GetReward(self.ctx, self.rewards, self.rewards_len) 119 | 120 | self.step_len += 1 121 | 122 | target_bias = abs(self.raw_obs[27] - self.raw_obs[26]) 123 | 124 | obs = self._get_obs(self.raw_obs) 125 | reward = -target_bias 126 | done = bool(self.raw_obs[0]) or self.max_ep_len == self.step_len 127 | 128 | info = {"TradingDay": self.raw_obs[25], 129 | "score": self.rewards[0], 130 | "profit": self.rewards[1], 131 | 
"target_bias": target_bias} 132 | 133 | if self.render and self.obs_dim == 38: 134 | self.rendering(action) 135 | 136 | self.his_price.append(obs[0]) 137 | obs[22] = max(self.his_price) 138 | obs[23] = min(self.his_price) 139 | 140 | return obs, reward, done, info 141 | 142 | def _get_obs(self, raw_obs): 143 | 144 | price_mean = 26440.28 145 | price_max = 27952.0 146 | bid_ask_volume_log_mean = 1.97 147 | bid_ask_volume_log_max = 6.42 148 | total_volume_mean = 120755.66 149 | total_volume_max = 321988.0 150 | # target_abs_mean = 51.018 151 | target_mean = 2.55 152 | target_max = 311.0 153 | 154 | price_filter = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 24, 28, 30, 32, 36, 38, 40] 155 | bid_ask_volume_filter = [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 29, 31, 33, 37, 39, 41] 156 | total_volume_filter = [22] 157 | target_filter = [26, 27] 158 | obs = np.array(raw_obs[:44], dtype=np.float32) 159 | 160 | obs[price_filter] = (obs[price_filter] - price_mean) / (price_max - price_mean) 161 | obs[bid_ask_volume_filter] = (np.log(obs[bid_ask_volume_filter]) - bid_ask_volume_log_mean) / ( 162 | bid_ask_volume_log_max - bid_ask_volume_log_mean) 163 | obs[total_volume_filter] = (obs[total_volume_filter] - total_volume_mean) / ( 164 | total_volume_max - total_volume_mean) 165 | obs[target_filter] = (obs[target_filter] - target_mean) / (target_max - target_mean) 166 | 167 | if self.obs_dim == 38: 168 | obs = np.delete(obs, [0, 25, 34, 35, 42, 43]) 169 | elif self.obs_dim == 26: 170 | obs = obs[:28] 171 | obs = np.delete(obs, [0, 25]) 172 | elif self.obs_dim == 24: 173 | obs = obs[:25] 174 | obs = np.delete(obs, [0]) 175 | else: 176 | assert False, "incorrect obs_dim!" 177 | obs[obs < -1] = -1 178 | obs[obs > 1] = 1 179 | 180 | return obs 181 | 182 | def _action_schemes(self, action_scheme_id): 183 | 184 | schemes = {} 185 | 186 | def scheme3(action): 187 | assert 0 <= action <= 2 or action == 6 or action == 9, "action should be 0,1,2" 188 | if action == 1: 189 | self.expso.Action(self.ctx, self.actions[18]) # 如果是买动作,卖方向全撤。 190 | self.expso.Action(self.ctx, self.actions[6]) 191 | elif action == 2: 192 | self.expso.Action(self.ctx, self.actions[15]) # 如果是卖动作,买方向全撤。 193 | self.expso.Action(self.ctx, self.actions[9]) 194 | elif action == 0: 195 | self.expso.Action(self.ctx, self.actions[action]) 196 | # for auto_clip 197 | elif action == 6: 198 | self.expso.Action(self.ctx, self.actions[18]) 199 | self.expso.Action(self.ctx, self.actions[6]) 200 | elif action == 9: 201 | self.expso.Action(self.ctx, self.actions[15]) 202 | self.expso.Action(self.ctx, self.actions[9]) 203 | 204 | schemes[3] = scheme3 205 | 206 | # 根据买卖方向进行自动反方向撤单操作 207 | def scheme15(action): 208 | assert 0 <= action <= 14, "action should be 0,1,...,14" 209 | if 1 <= action <= 7: 210 | self.expso.Action(self.ctx, self.actions[18]) # 如果是买动作,卖方向全撤。 211 | elif 8 <= action <= 14: 212 | self.expso.Action(self.ctx, self.actions[15]) # 如果是卖动作,买方向全撤。 213 | # 执行action 214 | self.expso.Action(self.ctx, self.actions[action]) 215 | 216 | schemes[15] = scheme15 217 | 218 | # 学习全撤单操作 219 | def scheme17(action): 220 | assert 0 <= action <= 16, "action should <=16" 221 | if action <= 14: 222 | self.expso.Action(self.ctx, self.actions[action]) 223 | elif action == 15: 224 | self.expso.Action(self.ctx, self.actions[15]) 225 | elif action == 16: 226 | self.expso.Action(self.ctx, self.actions[18]) 227 | 228 | schemes[17] = scheme17 229 | 230 | # 全部操作 231 | def scheme21(action): 232 | assert 0 <= action <= 20, "action should be 0,1,...,20" 233 | 
self.expso.Action(self.ctx, self.actions[action]) 234 | 235 | schemes[21] = scheme21 236 | 237 | # add new action schemes here... 238 | # def scheme0(action): 239 | # pass 240 | # schemes[0] = scheme0 241 | 242 | self.action_dim = action_scheme_id 243 | self.action_space = spaces.Discrete(self.action_dim) 244 | 245 | return schemes[action_scheme_id] 246 | 247 | def policy_069(self): # actions: 0,6,9 248 | if self.raw_obs[26] > self.raw_obs[27]: 249 | action = 6 250 | elif self.raw_obs[26] < self.raw_obs[27]: 251 | action = 9 252 | else: 253 | action = 0 254 | return action 255 | 256 | def rendering(self, action=None): 257 | print("-----------------------") 258 | print("Action:", action) 259 | print("AliveAskPriceNUM:", self.raw_obs[42]) 260 | print("AliveAskVolumeNUM:", self.raw_obs[43]) 261 | print("AliveAskPrice3:", self.raw_obs[40]) 262 | print("AliveAskVolume3:", self.raw_obs[41]) 263 | print("AliveAskPrice2:", self.raw_obs[38]) 264 | print("AliveAskVolume2:", self.raw_obs[39]) 265 | print("AliveAskPrice1:", self.raw_obs[36]) 266 | print("AliveAskVolume1:", self.raw_obs[37]) 267 | print("AskPrice1:", self.raw_obs[4]) 268 | print("AskVolume1:", self.raw_obs[5]) 269 | print(".....") 270 | print("LastPrice:", self.raw_obs[1]) 271 | print("Actual_Num:", self.raw_obs[27]) 272 | print(".....") 273 | print("BidPrice1:", self.raw_obs[2]) 274 | print("BidVolume1:", self.raw_obs[3]) 275 | print("AliveBidPrice1:", self.raw_obs[28]) 276 | print("AliveBidVolume1:", self.raw_obs[29]) 277 | print("AliveBidPrice2:", self.raw_obs[30]) 278 | print("AliveBidVolume2:", self.raw_obs[31]) 279 | print("AliveBidPrice3:", self.raw_obs[32]) 280 | print("AliveBidVolume3:", self.raw_obs[33]) 281 | print("AliveBidPriceNUM:", self.raw_obs[34]) 282 | print("AliveBidVolumeNUM:", self.raw_obs[35]) 283 | print("-----------------------") 284 | 285 | def close_env(self): 286 | self.expso.ReleaseContext(self.ctx) 287 | 288 | 289 | class FrameStack(gym.Wrapper): 290 | def __init__(self, env, frame_stack, jump=1, model='mlp'): 291 | super(FrameStack, self).__init__(env) 292 | self.frame_stack = frame_stack 293 | self.jump = jump 294 | self.model = model 295 | self.total_frame = frame_stack * jump 296 | self.frames = deque([], maxlen=self.total_frame) 297 | if model == 'mlp': 298 | self.obs_dim = self.env.observation_space.shape[0] * frame_stack 299 | self.observation_space = Box(-np.inf, np.inf, shape=(self.obs_dim,), dtype=np.float32) 300 | else: 301 | self.observation_space = Box(-np.inf, np.inf, shape=(frame_stack, self.env.observation_space.shape[0]), 302 | dtype=np.float32) 303 | 304 | def reset(self, start_day=None, start_skip=None, duration=None, burn_in=0): 305 | ob = self.env.reset(start_day=start_day, start_skip=start_skip, burn_in=burn_in) # TradingEnv.reset() takes no duration argument, so it is not forwarded 306 | ob = np.float32(ob) 307 | for _ in range(self.total_frame): 308 | self.frames.append(ob) 309 | return self.observation() 310 | 311 | def step(self, action): 312 | ob, reward, done, info = self.env.step(action) 313 | ob = np.float32(ob) 314 | self.frames.append(ob) 315 | return self.observation(), reward, done, info 316 | 317 | def observation(self): 318 | assert len(self.frames) == self.total_frame 319 | obs_stack = np.array(self.frames) 320 | idx = np.arange(0, self.total_frame, self.jump) 321 | obs = obs_stack[idx] 322 | if self.model == 'mlp': 323 | return np.stack(obs, axis=0).reshape((self.obs_dim,)) 324 | else: 325 | return obs 326 | 327 | 328 | if __name__ == "__main__": 329 | 330 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 331 | # env = FrameStack(env, 
frame_stack=3, jump=3, model='cnn') 332 | 333 | cnt = 0 334 | 335 | for i in range(1): 336 | 337 | obs = env.reset() 338 | 339 | # burn-in 340 | # while env.target_diffs < 50: 341 | # action = env.baseline_policy(obs) 342 | # obs, reward, done, info = env.step(action) 343 | # cnt += 1 344 | # print("burn-in steps:", cnt) 345 | 346 | # print(env.raw_obs[26], env.raw_obs[27]) 347 | print(obs) 348 | step = 1 349 | t0 = time.time() 350 | price = 0.0 351 | while True: 352 | action = env.action_space.sample() 353 | # action = env.baseline_policy(obs) 354 | # action = 0 355 | obs, reward, done, info = env.step(action) 356 | step += 1 357 | if step % 10 == 0: 358 | # print(step, env.raw_obs[26], env.raw_obs[27], 359 | # (info["profit"], info["total_profit"], info["baseline_profit"]), 360 | # (info["baseline_profit"] - info["profit"]) * 10 / info["target_diffs"], info["score"], 361 | # (info["reward_score"], info["reward_target"], info["reward_action"],)) 362 | print(obs) 363 | # if price != info["price"]: 364 | # print('='*66) 365 | # price = info["price"] 366 | if done: 367 | print("Done!", done, cnt, step, 'time:', time.time() - t0) 368 | # all_data = env.all_data 369 | # all_data_df = pd.DataFrame(all_data) 370 | # print(all_data_df.tail()) 371 | break 372 | 373 | env.close_env() 374 | -------------------------------------------------------------------------------- /tutorial/Parallelize your algorithm by Ray (2).md: -------------------------------------------------------------------------------- 1 | # Parallelize Your Reinforcement Learning Algorithm with Ray (2) 2 | 3 | ## Decomposing the SAC Code 4 | 5 | spinningup provides reference implementations of several important algorithms that are very useful to newcomers. Apart from SAC, the other on-policy algorithms are all parallelized with MPI; SAC alone has no parallel implementation. So we use Ray to build a parallel implementation of SAC. 6 | 7 | This section is straightforward: we split the SAC implementation from spinningup into parts. In the next section, we place each part into its corresponding position in the parallel framework. 8 | 9 | The architecture of our parallel framework: 10 | 11 | ![ddrlframework](C:\Users\Shuai\Documents\GitHub\Markdown\RL\Pictures\ddrlframework.jpg) 12 | 13 | Following this framework, we decompose SAC into the five parts below: 14 | 15 | - Replay buffer 16 | - Parameter server 17 | - train (learn) 18 | - rollout 19 | - test 20 | 21 | Each part is marked with comments in the code below. 22 | 23 | ```python 24 | import numpy as np 25 | import tensorflow as tf 26 | import gym 27 | import time 28 | from spinup.algos.sac import core 29 | from spinup.algos.sac.core import get_vars 30 | from spinup.utils.logx import EpochLogger 31 | 32 | # ********************** replaybuffer part below ********************** 33 | class ReplayBuffer: 34 | """ 35 | A simple FIFO experience replay buffer for SAC agents. 
36 | """ 37 | 38 | def __init__(self, obs_dim, act_dim, size): 39 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 40 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 41 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 42 | self.rews_buf = np.zeros(size, dtype=np.float32) 43 | self.done_buf = np.zeros(size, dtype=np.float32) 44 | self.ptr, self.size, self.max_size = 0, 0, size 45 | 46 | def store(self, obs, act, rew, next_obs, done): 47 | self.obs1_buf[self.ptr] = obs 48 | self.obs2_buf[self.ptr] = next_obs 49 | self.acts_buf[self.ptr] = act 50 | self.rews_buf[self.ptr] = rew 51 | self.done_buf[self.ptr] = done 52 | self.ptr = (self.ptr + 1) % self.max_size 53 | self.size = min(self.size + 1, self.max_size) 54 | 55 | def sample_batch(self, batch_size=32): 56 | idxs = np.random.randint(0, self.size, size=batch_size) 57 | return dict(obs1=self.obs1_buf[idxs], 58 | obs2=self.obs2_buf[idxs], 59 | acts=self.acts_buf[idxs], 60 | rews=self.rews_buf[idxs], 61 | done=self.done_buf[idxs]) 62 | # ********************** replaybuffer part above ********************** 63 | 64 | """ 65 | 66 | Soft Actor-Critic 67 | 68 | (With slight variations that bring it closer to TD3) 69 | 70 | """ 71 | 72 | 73 | def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 74 | steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, 75 | polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 76 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 77 | """ 78 | 79 | Args: 80 | env_fn : A function which creates a copy of the environment. 81 | The environment must satisfy the OpenAI Gym API. 82 | 83 | actor_critic: A function which takes in placeholder symbols 84 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 85 | outputs from the agent's Tensorflow computation graph: 86 | 87 | =========== ================ ====================================== 88 | Symbol Shape Description 89 | =========== ================ ====================================== 90 | ``mu`` (batch, act_dim) | Computes mean actions from policy 91 | | given states. 92 | ``pi`` (batch, act_dim) | Samples actions from policy given 93 | | states. 94 | ``logp_pi`` (batch,) | Gives log probability, according to 95 | | the policy, of the action sampled by 96 | | ``pi``. Critical: must be differentiable 97 | | with respect to policy parameters all 98 | | the way through action sampling. 99 | ``q1`` (batch,) | Gives one estimate of Q* for 100 | | states in ``x_ph`` and actions in 101 | | ``a_ph``. 102 | ``q2`` (batch,) | Gives another estimate of Q* for 103 | | states in ``x_ph`` and actions in 104 | | ``a_ph``. 105 | ``q1_pi`` (batch,) | Gives the composition of ``q1`` and 106 | | ``pi`` for states in ``x_ph``: 107 | | q1(x, pi(x)). 108 | ``q2_pi`` (batch,) | Gives the composition of ``q2`` and 109 | | ``pi`` for states in ``x_ph``: 110 | | q2(x, pi(x)). 111 | ``v`` (batch,) | Gives the value estimate for states 112 | | in ``x_ph``. 113 | =========== ================ ====================================== 114 | 115 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 116 | function you provided to SAC. 117 | 118 | seed (int): Seed for random number generators. 119 | 120 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 121 | for the agent and the environment in each epoch. 122 | 123 | epochs (int): Number of epochs to run and train agent. 124 | 125 | replay_size (int): Maximum length of replay buffer. 
126 | 127 | gamma (float): Discount factor. (Always between 0 and 1.) 128 | 129 | polyak (float): Interpolation factor in polyak averaging for target 130 | networks. Target networks are updated towards main networks 131 | according to: 132 | 133 | .. math:: \\theta_{\\text{targ}} \\leftarrow 134 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 135 | 136 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 137 | close to 1.) 138 | 139 | lr (float): Learning rate (used for both policy and value learning). 140 | 141 | alpha (float): Entropy regularization coefficient. (Equivalent to 142 | inverse of reward scale in the original SAC paper.) 143 | 144 | batch_size (int): Minibatch size for SGD. 145 | 146 | start_steps (int): Number of steps for uniform-random action selection, 147 | before running real policy. Helps exploration. 148 | 149 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 150 | 151 | logger_kwargs (dict): Keyword args for EpochLogger. 152 | 153 | save_freq (int): How often (in terms of gap between epochs) to save 154 | the current policy and value function. 155 | 156 | """ 157 | 158 | # logger = EpochLogger(**logger_kwargs) 159 | # logger.save_config(locals()) 160 | 161 | tf.set_random_seed(seed) 162 | np.random.seed(seed) 163 | 164 | env, test_env = env_fn(), env_fn() 165 | obs_dim = env.observation_space.shape[0] 166 | act_dim = env.action_space.shape[0] 167 | 168 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 169 | act_limit = env.action_space.high[0] 170 | 171 | # Share information about action space with policy architecture 172 | ac_kwargs['action_space'] = env.action_space 173 | 174 | # ********************** model part below ********************** 175 | 176 | # Inputs to computation graph 177 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 178 | 179 | # Main outputs from computation graph 180 | with tf.variable_scope('main'): 181 | mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) 182 | 183 | # Target value network 184 | with tf.variable_scope('target'): 185 | _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 186 | 187 | # ********************** model part above ********************** 188 | 189 | # Experience buffer 190 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 191 | 192 | # ********************** model part below ********************** 193 | 194 | # Count variables 195 | var_counts = tuple(core.count_vars(scope) for scope in 196 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 197 | print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 198 | 199 | # Min Double-Q: 200 | min_q_pi = tf.minimum(q1_pi, q2_pi) 201 | 202 | # Targets for Q and V regression 203 | q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) 204 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 205 | 206 | # Soft actor-critic losses 207 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 208 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 209 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 210 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 211 | value_loss = q1_loss + q2_loss + v_loss 212 | 213 | # Policy train op 214 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 215 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 216 | train_pi_op = 
pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 217 | 218 | # Value train op 219 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 220 | value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 221 | value_params = get_vars('main/q') + get_vars('main/v') 222 | with tf.control_dependencies([train_pi_op]): 223 | train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 224 | 225 | # Polyak averaging for target variables 226 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 227 | with tf.control_dependencies([train_value_op]): 228 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 229 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 230 | 231 | # All ops to call during one training step 232 | step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 233 | train_pi_op, train_value_op, target_update] 234 | 235 | # Initializing targets to match main variables 236 | target_init = tf.group([tf.assign(v_targ, v_main) 237 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 238 | 239 | sess = tf.Session() 240 | sess.run(tf.global_variables_initializer()) 241 | sess.run(target_init) 242 | 243 | # ********************** model part above ********************** 244 | 245 | # Setup model saving 246 | # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 247 | # outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) 248 | 249 | def get_action(o, deterministic=False): 250 | act_op = mu if deterministic else pi 251 | return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] 252 | 253 | def test_agent(n=10): 254 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 255 | for j in range(n): 256 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 257 | while not (d or (ep_len == max_ep_len)): 258 | # Take deterministic actions at test time 259 | o, r, d, _ = test_env.step(get_action(o, True)) 260 | ep_ret += r 261 | ep_len += 1 262 | print(ep_len, ep_ret) 263 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 264 | 265 | # ********************** rollout part below ********************** 266 | 267 | start_time = time.time() 268 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 269 | total_steps = steps_per_epoch * epochs 270 | 271 | # Main loop: collect experience in env and update/log each epoch 272 | for t in range(total_steps): 273 | 274 | """ 275 | Until start_steps have elapsed, randomly sample actions 276 | from a uniform distribution for better exploration. Afterwards, 277 | use the learned policy. 278 | """ 279 | if t > start_steps: 280 | a = get_action(o) 281 | else: 282 | a = env.action_space.sample() 283 | 284 | # Step the env 285 | o2, r, d, _ = env.step(a) 286 | ep_ret += r 287 | ep_len += 1 288 | 289 | # Ignore the "done" signal if it comes from hitting the time 290 | # horizon (that is, when it's an artificial terminal signal 291 | # that isn't based on the agent's state) 292 | d = False if ep_len == max_ep_len else d 293 | 294 | # Store experience to replay buffer 295 | replay_buffer.store(o, a, r, o2, d) 296 | 297 | # Super critical, easy to overlook step: make sure to update 298 | # most recent observation! 299 | o = o2 300 | 301 | if d or (ep_len == max_ep_len): 302 | """ 303 | Perform all SAC updates at the end of the trajectory. 304 | This is a slight difference from the SAC specified in the 305 | original paper. 
306 | """ 307 | 308 | # ********************** train part below ********************** 309 | 310 | for j in range(ep_len): 311 | batch = replay_buffer.sample_batch(batch_size) 312 | feed_dict = {x_ph: batch['obs1'], 313 | x2_ph: batch['obs2'], 314 | a_ph: batch['acts'], 315 | r_ph: batch['rews'], 316 | d_ph: batch['done'], 317 | } 318 | outs = sess.run(step_ops, feed_dict) 319 | # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 320 | # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 321 | # VVals=outs[6], LogPi=outs[7]) 322 | 323 | # ********************** train part above ********************** 324 | 325 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 326 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 327 | 328 | # ********************** rollout part above ********************** 329 | 330 | # End of epoch wrap-up 331 | if t > 0 and t % steps_per_epoch == 0: 332 | epoch = t // steps_per_epoch 333 | 334 | # Save model 335 | # if (epoch % save_freq == 0) or (epoch == epochs - 1): 336 | # logger.save_state({'env': env}, None) 337 | 338 | # Test the performance of the deterministic version of the agent. 339 | test_agent() 340 | 341 | # Log info about epoch 342 | # logger.log_tabular('Epoch', epoch) 343 | # logger.log_tabular('EpRet', with_min_and_max=True) 344 | # logger.log_tabular('TestEpRet', with_min_and_max=True) 345 | # logger.log_tabular('EpLen', average_only=True) 346 | # logger.log_tabular('TestEpLen', average_only=True) 347 | # logger.log_tabular('TotalEnvInteracts', t) 348 | # logger.log_tabular('Q1Vals', with_min_and_max=True) 349 | # logger.log_tabular('Q2Vals', with_min_and_max=True) 350 | # logger.log_tabular('VVals', with_min_and_max=True) 351 | # logger.log_tabular('LogPi', with_min_and_max=True) 352 | # logger.log_tabular('LossPi', average_only=True) 353 | # logger.log_tabular('LossQ1', average_only=True) 354 | # logger.log_tabular('LossQ2', average_only=True) 355 | # logger.log_tabular('LossV', average_only=True) 356 | # logger.log_tabular('Time', time.time() - start_time) 357 | # logger.dump_tabular() 358 | 359 | 360 | if __name__ == '__main__': 361 | import argparse 362 | 363 | parser = argparse.ArgumentParser() 364 | parser.add_argument('--env', type=str, default='BipedalWalker-v2') 365 | parser.add_argument('--hid', type=int, default=300) 366 | parser.add_argument('--l', type=int, default=1) 367 | parser.add_argument('--gamma', type=float, default=0.99) 368 | parser.add_argument('--seed', '-s', type=int, default=0) 369 | parser.add_argument('--epochs', type=int, default=50) 370 | parser.add_argument('--exp_name', type=str, default='sac') 371 | args = parser.parse_args() 372 | 373 | # from spinup.utils.run_utils import setup_logger_kwargs 374 | # 375 | # logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 376 | 377 | sac(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 378 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 379 | gamma=args.gamma, seed=args.seed, epochs=args.epochs,) 380 | # logger_kwargs=logger_kwargs) 381 | 382 | ``` 383 | 384 | 本节完。 -------------------------------------------------------------------------------- /algos/dqn/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import ray 6 | 7 | import os 8 | import sys 9 | 10 | from hyperparams import HyperParameters 11 | from actor_learner import Actor, Learner 12 | 13 | import os 14 | import pickle 15 | import multiprocessing 16 | import 
copy 17 | import json 18 | 19 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 20 | sys.path.append(ROOT) 21 | from trading_env import TradingEnv, FrameStack 22 | 23 | 24 | flags = tf.app.flags 25 | FLAGS = tf.app.flags.FLAGS 26 | 27 | flags.DEFINE_string("env_name", "Trading", "game env") 28 | flags.DEFINE_string("exp_name", "ddqn-trading", "experiments name") 29 | flags.DEFINE_integer("num_nodes", 1, "number of nodes") 30 | flags.DEFINE_integer("num_workers", 6, "number of workers") 31 | flags.DEFINE_string("weights_file", "", "empty means False.") 32 | flags.DEFINE_float("a_l_ratio", 10, "actor_steps / learner_steps") 33 | flags.DEFINE_bool("recover", False, "back training from last checkpoint") 34 | flags.DEFINE_string("checkpoint_path", "", "empty means opt.save_dir. ") 35 | 36 | 37 | @ray.remote(num_cpus=2) 38 | class ReplayBuffer: 39 | """ 40 | A simple FIFO experience replay buffer for SQN_N_STEP agents. 41 | """ 42 | 43 | def __init__(self, opt, buffer_index): 44 | self.opt = opt 45 | self.buffer_index = buffer_index 46 | self.obs1_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 47 | self.obs2_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 48 | self.acts_buf = np.zeros(opt.buffer_size, dtype=np.float32) 49 | self.rews_buf = np.zeros(opt.buffer_size, dtype=np.float32) 50 | self.done_buf = np.zeros(opt.buffer_size, dtype=np.float32) 51 | self.ptr, self.size, self.max_size = 0, 0, opt.buffer_size 52 | self.actor_steps, self.learner_steps = 0, 0 53 | 54 | def store(self, obs, act, rew, next_obs, done, worker_index): 55 | 56 | self.obs1_buf[self.ptr] = obs 57 | self.obs2_buf[self.ptr] = next_obs 58 | self.acts_buf[self.ptr] = act 59 | self.rews_buf[self.ptr] = rew 60 | self.done_buf[self.ptr] = done 61 | 62 | self.ptr = (self.ptr + 1) % self.max_size 63 | self.size = min(self.size + 1, self.max_size) 64 | self.actor_steps += 1 65 | 66 | def sample_batch(self): 67 | idxs = np.random.randint(0, self.size, size=self.opt.batch_size) 68 | self.learner_steps += 1 69 | return dict(obs1=self.obs1_buf[idxs], 70 | obs2=self.obs2_buf[idxs], 71 | acts=self.acts_buf[idxs], 72 | rews=self.rews_buf[idxs], 73 | done=self.done_buf[idxs]) 74 | 75 | def get_counts(self): 76 | return self.learner_steps, self.actor_steps, self.size 77 | 78 | # debug 79 | def show(self): 80 | return self.obs1_buf, self.ptr, self.size, self.max_size 81 | 82 | def save(self): 83 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs1_buf-' + str(self.buffer_index), self.obs1_buf) 84 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs2_buf-' + str(self.buffer_index), self.obs2_buf) 85 | np.save(self.opt.save_dir + "/checkpoint/" + 'acts_buf-' + str(self.buffer_index), self.acts_buf) 86 | np.save(self.opt.save_dir + "/checkpoint/" + 'rews_buf-' + str(self.buffer_index), self.rews_buf) 87 | np.save(self.opt.save_dir + "/checkpoint/" + 'done_buf-' + str(self.buffer_index), self.done_buf) 88 | buffer_infos = np.array((self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps)) 89 | np.save(self.opt.save_dir + "/checkpoint/" + 'buffer_infos-' + str(self.buffer_index), buffer_infos) 90 | print("****** buffer " + str(self.buffer_index) + " saved! 
******") 91 | 92 | def load(self, checkpoint_path): 93 | if not checkpoint_path: 94 | checkpoint_path = self.opt.save_dir + "/checkpoint" 95 | 96 | self.obs1_buf = np.load(checkpoint_path + '/obs1_buf-' + str(self.buffer_index) + '.npy') 97 | self.obs2_buf = np.load(checkpoint_path + '/obs2_buf-' + str(self.buffer_index) + '.npy') 98 | self.acts_buf = np.load(checkpoint_path + '/acts_buf-' + str(self.buffer_index) + '.npy') 99 | self.rews_buf = np.load(checkpoint_path + '/rews_buf-' + str(self.buffer_index) + '.npy') 100 | self.done_buf = np.load(checkpoint_path + '/done_buf-' + str(self.buffer_index) + '.npy') 101 | buffer_infos = np.load(checkpoint_path + '/buffer_infos-' + str(self.buffer_index) + '.npy') 102 | 103 | self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps = buffer_infos[0], buffer_infos[1], \ 104 | buffer_infos[2], buffer_infos[3], \ 105 | buffer_infos[4] 106 | print("****** buffer number " + str(self.buffer_index) + " restored! ******") 107 | print("****** buffer number " + str(self.buffer_index) + " infos:", self.ptr, self.size, self.max_size, 108 | self.actor_steps, self.learner_steps) 109 | 110 | 111 | @ray.remote(num_cpus=2) 112 | class ParameterServer: 113 | def __init__(self, opt, weights_file, checkpoint_path, ps_index): 114 | # each node will have a Parameter Server 115 | 116 | self.opt = opt 117 | self.learner_step = 0 118 | net = Learner(opt, job="ps") 119 | keys, values = net.get_weights() 120 | 121 | # --- make dir for all nodes and save parameters --- 122 | try: 123 | os.makedirs(opt.save_dir) 124 | os.makedirs(opt.save_dir + '/checkpoint') 125 | except OSError: 126 | pass 127 | all_parameters = copy.deepcopy(vars(opt)) 128 | all_parameters["obs_space"] = "" 129 | all_parameters["act_space"] = "" 130 | with open(opt.save_dir + "/" + 'All_Parameters.json', 'w') as fp: 131 | json.dump(all_parameters, fp, indent=4, sort_keys=True) 132 | # --- end --- 133 | 134 | self.weights = None 135 | 136 | if not checkpoint_path: 137 | checkpoint_path = opt.save_dir + "/checkpoint" 138 | 139 | if opt.recover: 140 | with open(checkpoint_path + "/checkpoint_weights.pickle", "rb") as pickle_in: 141 | self.weights = pickle.load(pickle_in) 142 | print("****** weights restored! ******") 143 | 144 | if weights_file: 145 | try: 146 | with open(weights_file, "rb") as pickle_in: 147 | self.weights = pickle.load(pickle_in) 148 | print("****** weights restored! ******") 149 | except: 150 | print("------------------------------------------------") 151 | print(weights_file) 152 | print("------ error: weights file doesn't exist! 
------") 153 | exit() 154 | 155 | if not opt.recover and not weights_file: 156 | values = [value.copy() for value in values] 157 | self.weights = dict(zip(keys, values)) 158 | 159 | def push(self, keys, values): 160 | values = [value.copy() for value in values] 161 | for key, value in zip(keys, values): 162 | self.weights[key] = value 163 | self.learner_step += opt.push_freq 164 | 165 | def pull(self, keys): 166 | return [self.weights[key] for key in keys] 167 | 168 | def get_weights(self): 169 | return copy.deepcopy(self.weights) 170 | 171 | # save weights to disk 172 | def save_weights(self): 173 | with open(self.opt.save_dir + "/checkpoint/" + "checkpoint_weights.pickle", "wb") as pickle_out: 174 | pickle.dump(self.weights, pickle_out) 175 | 176 | 177 | class Cache(object): 178 | 179 | def __init__(self, node_buffer): 180 | # cache for training data and model weights 181 | print('os.pid:', os.getpid()) 182 | self.node_buffer = node_buffer 183 | self.q1 = multiprocessing.Queue(12) 184 | self.q2 = multiprocessing.Queue(5) 185 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.node_buffer)) 186 | self.p1.daemon = True 187 | 188 | def ps_update(self, q1, q2, node_buffer): 189 | print('os.pid of put_data():', os.getpid()) 190 | 191 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 192 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 193 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 194 | 195 | while True: 196 | if q1.qsize() < 10: 197 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 198 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 199 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 200 | 201 | if not q2.empty(): 202 | keys, values = q2.get() 203 | [node_ps[i].push.remote(keys, values) for i in range(opt.num_nodes)] 204 | 205 | def start(self): 206 | self.p1.start() 207 | self.p1.join(10) 208 | 209 | def end(self): 210 | self.p1.terminate() 211 | 212 | 213 | @ray.remote(num_cpus=2, num_gpus=1, max_calls=1) 214 | def worker_train(ps, node_buffer, opt, learner_index): 215 | agent = Learner(opt, job="learner") 216 | keys = agent.get_weights()[0] 217 | weights = ray.get(ps.pull.remote(keys)) 218 | agent.set_weights(keys, weights) 219 | 220 | cache = Cache(node_buffer) 221 | 222 | cache.start() 223 | 224 | cnt = 1 225 | while True: 226 | batch = cache.q1.get() 227 | agent.train(batch, cnt) 228 | 229 | if cnt % opt.push_freq == 0: 230 | cache.q2.put(agent.get_weights()) 231 | cnt += 1 232 | 233 | 234 | @ray.remote 235 | def worker_rollout(ps, replay_buffer, opt, worker_index): 236 | agent = Actor(opt, job="worker") 237 | keys = agent.get_weights()[0] 238 | np.random.seed() 239 | 240 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 241 | sys.path.append(ROOT) 242 | from trading_env import TradingEnv, FrameStack 243 | # ------ env set up ------ 244 | # env = gym.make(opt.env_name) 245 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 246 | 247 | while True: 248 | 249 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 250 | 251 | weights = ray.get(ps.pull.remote(keys)) 252 | agent.set_weights(keys, weights) 253 | 254 | # for a_l_ratio control 255 | np.random.seed() 256 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 257 | last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote()) 258 | 259 | while True: 260 | 261 | # don't need to random sample action if load weights from local. 
262 | if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover: 263 | a = agent.get_action(o) 264 | else: 265 | a = env.action_space.sample() 266 | # Step the env 267 | o2, r, d, _ = env.step(a) 268 | 269 | ep_ret += r 270 | ep_len += 1 271 | 272 | # Ignore the "done" signal if it comes from hitting the time 273 | # horizon (that is, when it's an artificial terminal signal 274 | # that isn't based on the agent's state) 275 | # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d 276 | 277 | np.random.seed() 278 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 279 | replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index) 280 | 281 | o = o2 282 | 283 | # End of episode. Training (ep_len times). 284 | # if d or (ep_len * opt.action_repeat >= opt.max_ep_len): 285 | if d: 286 | break 287 | 288 | 289 | @ray.remote 290 | def worker_test(ps, node_buffer, opt): 291 | 292 | agent = Actor(opt, job="test") 293 | keys = agent.get_weights()[0] 294 | 295 | # test_env = gym.make(opt.env_name) 296 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 297 | sys.path.append(ROOT) 298 | from trading_env import TradingEnv, FrameStack 299 | test_env = TradingEnv(action_scheme_id=3, obs_dim=38) 300 | 301 | init_time = time.time() 302 | save_times = 0 303 | checkpoint_times = 0 304 | 305 | while True: 306 | # weights_all for save it to local 307 | weights_all = ray.get(ps.get_weights.remote()) 308 | weights = [weights_all[key] for key in keys] 309 | agent.set_weights(keys, weights) 310 | 311 | start_actor_step, start_learner_step, _ = get_al_status(node_buffer) 312 | start_time = time.time() 313 | 314 | ave_test_reward, ave_score = agent.test(test_env, 10) 315 | 316 | last_actor_step, last_learner_step, _ = get_al_status(node_buffer) 317 | actor_step = np.sum(last_actor_step) - np.sum(start_actor_step) 318 | learner_step = np.sum(last_learner_step) - np.sum(start_learner_step) 319 | alratio = actor_step / (learner_step + 1) 320 | update_frequency = int(learner_step / (time.time() - start_time)) 321 | total_learner_step = np.sum(last_learner_step) 322 | 323 | print("---------------------------------------------------") 324 | print("average test reward:", ave_test_reward) 325 | print("average test score:", ave_score) 326 | print("frame freq:", np.round((last_actor_step - start_actor_step) / (time.time() - start_time))) 327 | print("actor_steps:", np.sum(last_actor_step), "learner_step:", total_learner_step) 328 | print("actor leaner ratio: %.2f" % alratio) 329 | print("learner freq:", update_frequency) 330 | print("Ray total resources:", ray.cluster_resources()) 331 | print("available resources:", ray.available_resources()) 332 | print("---------------------------------------------------") 333 | if learner_step < 100: 334 | alratio = 0 335 | agent.write_tb(ave_test_reward, ave_score, alratio, update_frequency, total_learner_step) 336 | 337 | total_time = time.time() - init_time 338 | 339 | if total_learner_step // opt.save_interval > save_times: 340 | with open(opt.save_dir + "/" + str(total_learner_step / 1e6) + "M_" + str(ave_test_reward) + "_weights.pickle", "wb") as pickle_out: 341 | pickle.dump(weights_all, pickle_out) 342 | print("****** Weights saved by time! 
******") 343 | save_times = total_learner_step // opt.save_interval 344 | 345 | # save everything every checkpoint_freq s 346 | if total_time // opt.checkpoint_freq > checkpoint_times: 347 | print("save everything!") 348 | save_start_time = time.time() 349 | 350 | ps_save_op = [node_ps[i].save_weights.remote() for i in range(opt.num_nodes)] 351 | buffer_save_op = [node_buffer[node_index][i].save.remote() for i in range(opt.num_buffers) for node_index in range(opt.num_nodes)] 352 | ray.wait(buffer_save_op + ps_save_op, num_returns=opt.num_nodes*opt.num_buffers + 1) 353 | 354 | print("total time for saving :", time.time() - save_start_time) 355 | checkpoint_times = total_time // opt.checkpoint_freq 356 | 357 | 358 | def get_al_status(node_buffer): 359 | 360 | buffer_learner_step = [] 361 | buffer_actor_step = [] 362 | buffer_cur_size = [] 363 | 364 | for node_index in range(opt.num_nodes): 365 | for i in range(opt.num_buffers): 366 | learner_step, actor_step, cur_size = ray.get(node_buffer[node_index][i].get_counts.remote()) 367 | buffer_learner_step.append(learner_step) 368 | buffer_actor_step.append(actor_step) 369 | buffer_cur_size.append(cur_size) 370 | 371 | return np.array(buffer_actor_step), np.array(buffer_learner_step), np.array(buffer_cur_size) 372 | 373 | 374 | if __name__ == '__main__': 375 | 376 | # ray.init() 377 | ray.init(resources={"node0": 256}) 378 | 379 | # env = gym.make(FLAGS.env_name) 380 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 381 | 382 | # ------ HyperParameters ------ 383 | opt = HyperParameters(env, FLAGS.env_name, FLAGS.exp_name, FLAGS.num_nodes, FLAGS.num_workers, FLAGS.a_l_ratio, FLAGS.weights_file) 384 | 385 | if FLAGS.recover: 386 | opt.recover = True 387 | # ------ end ------ 388 | 389 | node_ps = [] 390 | node_buffer = [] 391 | 392 | for node_index in range(FLAGS.num_nodes): 393 | 394 | # ------ Parameter Server (ray actor) ------ 395 | # create model to get weights and create a parameter server 396 | node_ps.append(ParameterServer._remote(args=[opt, FLAGS.weights_file, FLAGS.checkpoint_path, node_index], resources={"node"+str(node_index): 1})) 397 | print(f"Node{node_index} Parameter Server all set.") 398 | # ------ Parameter Server end ------ 399 | 400 | # ------ Experience buffer (ray actor) ------ 401 | node_buffer.append([ReplayBuffer._remote(args=[opt, i+node_index*opt.num_buffers], resources={"node"+str(node_index): 1}) for i in range(opt.num_buffers)]) 402 | 403 | if FLAGS.recover: 404 | buffer_load_op = [node_buffer[node_index][i].load.remote(FLAGS.checkpoint_path) for i in range(opt.num_buffers)] 405 | ray.wait(buffer_load_op, num_returns=opt.num_buffers) 406 | print(f"Node{node_index} Experience buffer all set.") 407 | # ------ Experience buffer end ------ 408 | 409 | # ------ roll out worker (ray task) ------ 410 | for i in range(FLAGS.num_workers): 411 | worker_rollout._remote(args=[node_ps[node_index], node_buffer[node_index], opt, i+node_index*FLAGS.num_workers], resources={"node"+str(node_index): 1}) 412 | time.sleep(0.19) 413 | 414 | print(f"Node{node_index} roll out worker all up.") 415 | # ------ roll out worker end ------ 416 | 417 | print(f"num of ps up: {len(node_ps)}, num of buffer up: {len(node_buffer)*len(node_buffer[0])}") 418 | 419 | print("Ray total resources:", ray.cluster_resources()) 420 | print("available resources:", ray.available_resources()) 421 | 422 | # --- save nodes info --- 423 | nodes_info = { 424 | "node_buffer": np.array(node_buffer), 425 | "num_nodes": opt.num_nodes, 426 | "num_buffers": opt.num_buffers 
427 | } 428 | f_name = './nodes_info.pickle' 429 | with open(f_name, "wb") as pickle_out: 430 | pickle.dump(nodes_info, pickle_out) 431 | print("****** save nodes_info ******") 432 | # --- end --- 433 | 434 | # control learner start time 435 | if not opt.recover: 436 | 437 | start_time = time.time() 438 | 439 | total_cur_size = 0 440 | while total_cur_size < opt.start_steps: 441 | 442 | buffer_actor_step, buffer_learner_step, buffer_cur_size = get_al_status(node_buffer) 443 | total_cur_size = np.sum(buffer_cur_size) 444 | 445 | print("---------------------------------------------------") 446 | print("learner_step:", buffer_learner_step, "actor_steps:", buffer_actor_step) 447 | print("frame freq:", np.round(buffer_actor_step/(time.time()-start_time))) 448 | print("total frame freq:", int(np.sum(buffer_actor_step)/(time.time()-start_time))) 449 | print('start steps before learning:', total_cur_size, '/', opt.start_steps) 450 | print("Ray total resources:", ray.cluster_resources()) 451 | print("available resources:", ray.available_resources()) 452 | print("---------------------------------------------------") 453 | time.sleep(10) 454 | else: 455 | time.sleep(0.0) 456 | 457 | # ------ learner ------ 458 | task_train = worker_train._remote(args=[node_ps[0], node_buffer, opt, 0], resources={"node0": 1}) 459 | # ------ learner end ------ 460 | 461 | task_test = worker_test.remote(node_ps[0], node_buffer, opt) 462 | ray.wait([task_test]) 463 | -------------------------------------------------------------------------------- /algos/sqn/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import ray 6 | 7 | import os 8 | import sys 9 | 10 | from hyperparams import HyperParameters 11 | from actor_learner import Actor, Learner 12 | 13 | import os 14 | import pickle 15 | import multiprocessing 16 | import copy 17 | import json 18 | 19 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 20 | sys.path.append(ROOT) 21 | from trading_env import TradingEnv, FrameStack 22 | 23 | 24 | flags = tf.app.flags 25 | FLAGS = tf.app.flags.FLAGS 26 | 27 | flags.DEFINE_string("env_name", "Trading", "game env") 28 | flags.DEFINE_string("exp_name", "sqn-trading", "experiments name") 29 | flags.DEFINE_integer("num_nodes", 1, "number of nodes") 30 | flags.DEFINE_integer("num_workers", 12, "number of workers") 31 | flags.DEFINE_string("weights_file", "", "empty means False.") 32 | flags.DEFINE_float("a_l_ratio", 10, "actor_steps / learner_steps") 33 | flags.DEFINE_bool("recover", False, "back training from last checkpoint") 34 | flags.DEFINE_string("checkpoint_path", "", "empty means opt.save_dir. ") 35 | 36 | 37 | @ray.remote(num_cpus=2) 38 | class ReplayBuffer: 39 | """ 40 | A simple FIFO experience replay buffer for SQN_N_STEP agents. 
41 | """ 42 | 43 | def __init__(self, opt, buffer_index): 44 | self.opt = opt 45 | self.buffer_index = buffer_index 46 | self.obs1_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 47 | self.obs2_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 48 | self.acts_buf = np.zeros(opt.buffer_size, dtype=np.float32) 49 | self.rews_buf = np.zeros(opt.buffer_size, dtype=np.float32) 50 | self.done_buf = np.zeros(opt.buffer_size, dtype=np.float32) 51 | self.ptr, self.size, self.max_size = 0, 0, opt.buffer_size 52 | self.actor_steps, self.learner_steps = 0, 0 53 | 54 | def store(self, obs, act, rew, next_obs, done, worker_index): 55 | 56 | self.obs1_buf[self.ptr] = obs 57 | self.obs2_buf[self.ptr] = next_obs 58 | self.acts_buf[self.ptr] = act 59 | self.rews_buf[self.ptr] = rew 60 | self.done_buf[self.ptr] = done 61 | 62 | self.ptr = (self.ptr + 1) % self.max_size 63 | self.size = min(self.size + 1, self.max_size) 64 | self.actor_steps += 1 65 | 66 | def sample_batch(self): 67 | idxs = np.random.randint(0, self.size, size=self.opt.batch_size) 68 | self.learner_steps += 1 69 | return dict(obs1=self.obs1_buf[idxs], 70 | obs2=self.obs2_buf[idxs], 71 | acts=self.acts_buf[idxs], 72 | rews=self.rews_buf[idxs], 73 | done=self.done_buf[idxs]) 74 | 75 | def get_counts(self): 76 | return self.learner_steps, self.actor_steps, self.size 77 | 78 | # debug 79 | def show(self): 80 | return self.obs1_buf, self.ptr, self.size, self.max_size 81 | 82 | def save(self): 83 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs1_buf-' + str(self.buffer_index), self.obs1_buf) 84 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs2_buf-' + str(self.buffer_index), self.obs2_buf) 85 | np.save(self.opt.save_dir + "/checkpoint/" + 'acts_buf-' + str(self.buffer_index), self.acts_buf) 86 | np.save(self.opt.save_dir + "/checkpoint/" + 'rews_buf-' + str(self.buffer_index), self.rews_buf) 87 | np.save(self.opt.save_dir + "/checkpoint/" + 'done_buf-' + str(self.buffer_index), self.done_buf) 88 | buffer_infos = np.array((self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps)) 89 | np.save(self.opt.save_dir + "/checkpoint/" + 'buffer_infos-' + str(self.buffer_index), buffer_infos) 90 | print("****** buffer " + str(self.buffer_index) + " saved! ******") 91 | 92 | def load(self, checkpoint_path): 93 | if not checkpoint_path: 94 | checkpoint_path = self.opt.save_dir + "/checkpoint" 95 | 96 | self.obs1_buf = np.load(checkpoint_path + '/obs1_buf-' + str(self.buffer_index) + '.npy') 97 | self.obs2_buf = np.load(checkpoint_path + '/obs2_buf-' + str(self.buffer_index) + '.npy') 98 | self.acts_buf = np.load(checkpoint_path + '/acts_buf-' + str(self.buffer_index) + '.npy') 99 | self.rews_buf = np.load(checkpoint_path + '/rews_buf-' + str(self.buffer_index) + '.npy') 100 | self.done_buf = np.load(checkpoint_path + '/done_buf-' + str(self.buffer_index) + '.npy') 101 | buffer_infos = np.load(checkpoint_path + '/buffer_infos-' + str(self.buffer_index) + '.npy') 102 | 103 | self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps = buffer_infos[0], buffer_infos[1], \ 104 | buffer_infos[2], buffer_infos[3], \ 105 | buffer_infos[4] 106 | print("****** buffer number " + str(self.buffer_index) + " restored! 
******") 107 | print("****** buffer number " + str(self.buffer_index) + " infos:", self.ptr, self.size, self.max_size, 108 | self.actor_steps, self.learner_steps) 109 | 110 | 111 | @ray.remote(num_cpus=2) 112 | class ParameterServer: 113 | def __init__(self, opt, weights_file, checkpoint_path, ps_index): 114 | # each node will have a Parameter Server 115 | 116 | self.opt = opt 117 | self.learner_step = 0 118 | net = Learner(opt, job="ps") 119 | keys, values = net.get_weights() 120 | 121 | # --- make dir for all nodes and save parameters --- 122 | try: 123 | os.makedirs(opt.save_dir) 124 | os.makedirs(opt.save_dir + '/checkpoint') 125 | except OSError: 126 | pass 127 | all_parameters = copy.deepcopy(vars(opt)) 128 | all_parameters["obs_space"] = "" 129 | all_parameters["act_space"] = "" 130 | with open(opt.save_dir + "/" + 'All_Parameters.json', 'w') as fp: 131 | json.dump(all_parameters, fp, indent=4, sort_keys=True) 132 | # --- end --- 133 | 134 | self.weights = None 135 | 136 | if not checkpoint_path: 137 | checkpoint_path = opt.save_dir + "/checkpoint" 138 | 139 | if opt.recover: 140 | with open(checkpoint_path + "/checkpoint_weights.pickle", "rb") as pickle_in: 141 | self.weights = pickle.load(pickle_in) 142 | print("****** weights restored! ******") 143 | 144 | if weights_file: 145 | try: 146 | with open(weights_file, "rb") as pickle_in: 147 | self.weights = pickle.load(pickle_in) 148 | print("****** weights restored! ******") 149 | except: 150 | print("------------------------------------------------") 151 | print(weights_file) 152 | print("------ error: weights file doesn't exist! ------") 153 | exit() 154 | 155 | if not opt.recover and not weights_file: 156 | values = [value.copy() for value in values] 157 | self.weights = dict(zip(keys, values)) 158 | 159 | def push(self, keys, values): 160 | values = [value.copy() for value in values] 161 | for key, value in zip(keys, values): 162 | self.weights[key] = value 163 | self.learner_step += opt.push_freq 164 | 165 | def pull(self, keys): 166 | return [self.weights[key] for key in keys] 167 | 168 | def get_weights(self): 169 | return copy.deepcopy(self.weights) 170 | 171 | # save weights to disk 172 | def save_weights(self): 173 | with open(self.opt.save_dir + "/checkpoint/" + "checkpoint_weights.pickle", "wb") as pickle_out: 174 | pickle.dump(self.weights, pickle_out) 175 | 176 | 177 | class Cache(object): 178 | 179 | def __init__(self, node_buffer): 180 | # cache for training data and model weights 181 | print('os.pid:', os.getpid()) 182 | self.node_buffer = node_buffer 183 | self.q1 = multiprocessing.Queue(12) 184 | self.q2 = multiprocessing.Queue(5) 185 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.node_buffer)) 186 | self.p1.daemon = True 187 | 188 | def ps_update(self, q1, q2, node_buffer): 189 | print('os.pid of put_data():', os.getpid()) 190 | 191 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 192 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 193 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 194 | 195 | while True: 196 | if q1.qsize() < 10: 197 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 198 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 199 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 200 | 201 | if not q2.empty(): 202 | keys, values = q2.get() 203 | [node_ps[i].push.remote(keys, values) for i in range(opt.num_nodes)] 204 | 205 | def start(self): 206 | self.p1.start() 207 
| self.p1.join(10) 208 | 209 | def end(self): 210 | self.p1.terminate() 211 | 212 | 213 | @ray.remote(num_cpus=2, num_gpus=1, max_calls=1) 214 | def worker_train(ps, node_buffer, opt, learner_index): 215 | agent = Learner(opt, job="learner") 216 | keys = agent.get_weights()[0] 217 | weights = ray.get(ps.pull.remote(keys)) 218 | agent.set_weights(keys, weights) 219 | 220 | cache = Cache(node_buffer) 221 | 222 | cache.start() 223 | 224 | cnt = 1 225 | while True: 226 | batch = cache.q1.get() 227 | agent.train(batch, cnt) 228 | 229 | if cnt % opt.push_freq == 0: 230 | cache.q2.put(agent.get_weights()) 231 | cnt += 1 232 | 233 | 234 | @ray.remote 235 | def worker_rollout(ps, replay_buffer, opt, worker_index): 236 | agent = Actor(opt, job="worker") 237 | keys = agent.get_weights()[0] 238 | np.random.seed() 239 | 240 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 241 | sys.path.append(ROOT) 242 | from trading_env import TradingEnv, FrameStack 243 | # ------ env set up ------ 244 | # env = gym.make(opt.env_name) 245 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 246 | 247 | while True: 248 | 249 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 250 | 251 | weights = ray.get(ps.pull.remote(keys)) 252 | agent.set_weights(keys, weights) 253 | 254 | # for a_l_ratio control 255 | np.random.seed() 256 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 257 | last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote()) 258 | 259 | while True: 260 | 261 | # don't need to random sample action if load weights from local. 262 | if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover: 263 | a = agent.get_action(o) 264 | else: 265 | a = env.action_space.sample() 266 | # Step the env 267 | o2, r, d, _ = env.step(a) 268 | 269 | ep_ret += r 270 | ep_len += 1 271 | 272 | # Ignore the "done" signal if it comes from hitting the time 273 | # horizon (that is, when it's an artificial terminal signal 274 | # that isn't based on the agent's state) 275 | # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d 276 | 277 | np.random.seed() 278 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 279 | replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index) 280 | 281 | o = o2 282 | 283 | # End of episode. Training (ep_len times). 
284 | # if d or (ep_len * opt.action_repeat >= opt.max_ep_len): 285 | if d: 286 | break 287 | 288 | 289 | @ray.remote 290 | def worker_test(ps, node_buffer, opt): 291 | 292 | agent = Actor(opt, job="test") 293 | keys = agent.get_weights()[0] 294 | 295 | # test_env = gym.make(opt.env_name) 296 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 297 | sys.path.append(ROOT) 298 | from trading_env import TradingEnv, FrameStack 299 | test_env = TradingEnv(action_scheme_id=3, obs_dim=38) 300 | 301 | init_time = time.time() 302 | save_times = 0 303 | checkpoint_times = 0 304 | 305 | while True: 306 | # weights_all for save it to local 307 | weights_all = ray.get(ps.get_weights.remote()) 308 | weights = [weights_all[key] for key in keys] 309 | agent.set_weights(keys, weights) 310 | 311 | start_actor_step, start_learner_step, _ = get_al_status(node_buffer) 312 | start_time = time.time() 313 | 314 | ave_test_reward, ave_score = agent.test(test_env, 10) 315 | 316 | last_actor_step, last_learner_step, _ = get_al_status(node_buffer) 317 | actor_step = np.sum(last_actor_step) - np.sum(start_actor_step) 318 | learner_step = np.sum(last_learner_step) - np.sum(start_learner_step) 319 | alratio = actor_step / (learner_step + 1) 320 | update_frequency = int(learner_step / (time.time() - start_time)) 321 | total_learner_step = np.sum(last_learner_step) 322 | 323 | print("---------------------------------------------------") 324 | print("average test reward:", ave_test_reward) 325 | print("average test score:", ave_score) 326 | print("frame freq:", np.round((last_actor_step - start_actor_step) / (time.time() - start_time))) 327 | print("actor_steps:", np.sum(last_actor_step), "learner_step:", total_learner_step) 328 | print("actor leaner ratio: %.2f" % alratio) 329 | print("learner freq:", update_frequency) 330 | print("Ray total resources:", ray.cluster_resources()) 331 | print("available resources:", ray.available_resources()) 332 | print("---------------------------------------------------") 333 | if learner_step < 100: 334 | alratio = 0 335 | agent.write_tb(ave_test_reward, ave_score, alratio, update_frequency, total_learner_step) 336 | 337 | total_time = time.time() - init_time 338 | 339 | if total_learner_step // opt.save_interval > save_times: 340 | with open(opt.save_dir + "/" + str(total_learner_step / 1e6) + "M_" + str(ave_test_reward) + "_weights.pickle", "wb") as pickle_out: 341 | pickle.dump(weights_all, pickle_out) 342 | print("****** Weights saved by time! 
******") 343 | save_times = total_learner_step // opt.save_interval 344 | 345 | # save everything every checkpoint_freq s 346 | if total_time // opt.checkpoint_freq > checkpoint_times: 347 | print("save everything!") 348 | save_start_time = time.time() 349 | 350 | ps_save_op = [node_ps[i].save_weights.remote() for i in range(opt.num_nodes)] 351 | buffer_save_op = [node_buffer[node_index][i].save.remote() for i in range(opt.num_buffers) for node_index in range(opt.num_nodes)] 352 | ray.wait(buffer_save_op + ps_save_op, num_returns=opt.num_nodes*opt.num_buffers + 1) 353 | 354 | print("total time for saving :", time.time() - save_start_time) 355 | checkpoint_times = total_time // opt.checkpoint_freq 356 | 357 | 358 | def get_al_status(node_buffer): 359 | 360 | buffer_learner_step = [] 361 | buffer_actor_step = [] 362 | buffer_cur_size = [] 363 | 364 | for node_index in range(opt.num_nodes): 365 | for i in range(opt.num_buffers): 366 | learner_step, actor_step, cur_size = ray.get(node_buffer[node_index][i].get_counts.remote()) 367 | buffer_learner_step.append(learner_step) 368 | buffer_actor_step.append(actor_step) 369 | buffer_cur_size.append(cur_size) 370 | 371 | return np.array(buffer_actor_step), np.array(buffer_learner_step), np.array(buffer_cur_size) 372 | 373 | 374 | if __name__ == '__main__': 375 | 376 | # ray.init() 377 | ray.init(resources={"node0": 256}) 378 | 379 | # env = gym.make(FLAGS.env_name) 380 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 381 | 382 | # ------ HyperParameters ------ 383 | opt = HyperParameters(env, FLAGS.env_name, FLAGS.exp_name, FLAGS.num_nodes, FLAGS.num_workers, FLAGS.a_l_ratio, FLAGS.weights_file) 384 | 385 | if FLAGS.recover: 386 | opt.recover = True 387 | # ------ end ------ 388 | 389 | node_ps = [] 390 | node_buffer = [] 391 | 392 | for node_index in range(FLAGS.num_nodes): 393 | 394 | # ------ Parameter Server (ray actor) ------ 395 | # create model to get weights and create a parameter server 396 | node_ps.append(ParameterServer._remote(args=[opt, FLAGS.weights_file, FLAGS.checkpoint_path, node_index], resources={"node"+str(node_index): 1})) 397 | print(f"Node{node_index} Parameter Server all set.") 398 | # ------ Parameter Server end ------ 399 | 400 | # ------ Experience buffer (ray actor) ------ 401 | node_buffer.append([ReplayBuffer._remote(args=[opt, i+node_index*opt.num_buffers], resources={"node"+str(node_index): 1}) for i in range(opt.num_buffers)]) 402 | 403 | if FLAGS.recover: 404 | buffer_load_op = [node_buffer[node_index][i].load.remote(FLAGS.checkpoint_path) for i in range(opt.num_buffers)] 405 | ray.wait(buffer_load_op, num_returns=opt.num_buffers) 406 | print(f"Node{node_index} Experience buffer all set.") 407 | # ------ Experience buffer end ------ 408 | 409 | # ------ roll out worker (ray task) ------ 410 | for i in range(FLAGS.num_workers): 411 | worker_rollout._remote(args=[node_ps[node_index], node_buffer[node_index], opt, i+node_index*FLAGS.num_workers], resources={"node"+str(node_index): 1}) 412 | time.sleep(0.19) 413 | 414 | print(f"Node{node_index} roll out worker all up.") 415 | # ------ roll out worker end ------ 416 | 417 | print(f"num of ps up: {len(node_ps)}, num of buffer up: {len(node_buffer)*len(node_buffer[0])}") 418 | 419 | print("Ray total resources:", ray.cluster_resources()) 420 | print("available resources:", ray.available_resources()) 421 | 422 | # --- save nodes info --- 423 | nodes_info = { 424 | "node_buffer": np.array(node_buffer), 425 | "num_nodes": opt.num_nodes, 426 | "num_buffers": opt.num_buffers 
427 | } 428 | f_name = './nodes_info.pickle' 429 | with open(f_name, "wb") as pickle_out: 430 | pickle.dump(nodes_info, pickle_out) 431 | print("****** nodes_info saved ******") 432 | # --- end --- 433 | 434 | # hold the learner until the buffers contain opt.start_steps transitions 435 | if not opt.recover: 436 | 437 | start_time = time.time() 438 | 439 | total_cur_size = 0 440 | while total_cur_size < opt.start_steps: 441 | 442 | buffer_actor_step, buffer_learner_step, buffer_cur_size = get_al_status(node_buffer) 443 | total_cur_size = np.sum(buffer_cur_size) 444 | 445 | print("---------------------------------------------------") 446 | print("learner_step:", buffer_learner_step, "actor_steps:", buffer_actor_step) 447 | print("frame freq:", np.round(buffer_actor_step/(time.time()-start_time))) 448 | print("total frame freq:", int(np.sum(buffer_actor_step)/(time.time()-start_time))) 449 | print('start steps before learning:', total_cur_size, '/', opt.start_steps) 450 | print("Ray total resources:", ray.cluster_resources()) 451 | print("available resources:", ray.available_resources()) 452 | print("---------------------------------------------------") 453 | time.sleep(10) 454 | else: 455 | time.sleep(0.0)  # buffers were reloaded from the checkpoint, no warm-up needed 456 | 457 | # ------ learner ------ 458 | task_train = worker_train._remote(args=[node_ps[0], node_buffer, opt, 0], resources={"node0": 1}) 459 | # ------ learner end ------ 460 | 461 | task_test = worker_test.remote(node_ps[0], node_buffer, opt) 462 | ray.wait([task_test]) 463 | --------------------------------------------------------------------------------
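
The `algos/sqn/train.py` file above wires four pieces together through Ray: `ParameterServer` actors that hold the network weights, `ReplayBuffer` actors that store transitions, `worker_rollout` tasks that fill the buffers, and a single `worker_train` task that samples batches and pushes updated weights back to every parameter server. The sketch below is a deliberately stripped-down illustration of that data flow only; it reuses the names from `train.py` but replaces the TensorFlow model with a plain NumPy vector and the trading environment with a stub random walk, so it is an assumption-laden toy, not the repository's training code. The real script is launched with the `tf.app.flags` defined at its top, e.g. something like `python algos/sqn/train.py --num_nodes=1 --num_workers=12 --a_l_ratio=10`; for more than one node, the custom resources `node0`, `node1`, ... that it requests would presumably have to be provided when starting Ray on each machine (e.g. `ray start --resources='{"node1": 256}'`) rather than through the single-machine `ray.init(resources={"node0": 256})` call at the bottom of the file.

# --- minimal data-flow sketch (illustrative only, not part of this repo) ---
import time

import numpy as np
import ray

ray.init()


@ray.remote
class ParameterServer:
    """Holds the latest weights; the learner pushes, rollout workers pull."""

    def __init__(self, dim):
        self.weights = np.zeros(dim, dtype=np.float32)

    def push(self, weights):
        self.weights = np.array(weights, dtype=np.float32)

    def pull(self):
        return self.weights


@ray.remote
class ReplayBuffer:
    """FIFO buffer of (obs, act, rew, next_obs, done) tuples."""

    def __init__(self, max_size):
        self.storage, self.max_size = [], max_size

    def store(self, transition):
        self.storage.append(transition)
        if len(self.storage) > self.max_size:
            self.storage.pop(0)

    def sample_batch(self, batch_size):
        idxs = np.random.randint(0, len(self.storage), size=batch_size)
        return [self.storage[i] for i in idxs]

    def get_counts(self):
        return len(self.storage)


@ray.remote
def worker_rollout(ps, replay_buffer, steps):
    # Stub environment: a 1-D random walk with a random "policy"; the pull
    # only illustrates where train.py fetches weights before acting.
    o = 0.0
    for _ in range(steps):
        _ = ray.get(ps.pull.remote())
        a = np.random.randint(2)
        o2 = o + np.random.randn()
        replay_buffer.store.remote((o, a, 0.0, o2, False))
        o = o2


@ray.remote
def worker_train(ps, replay_buffer, updates, batch_size=32, start_steps=100):
    for _ in range(updates):
        # wait for a warm-up phase, analogous to opt.start_steps in train.py
        while ray.get(replay_buffer.get_counts.remote()) < start_steps:
            time.sleep(0.1)
        _batch = ray.get(replay_buffer.sample_batch.remote(batch_size))
        w = ray.get(ps.pull.remote())
        ps.push.remote(w + 1e-3)  # placeholder for a real gradient step
    return updates


ps = ParameterServer.remote(dim=4)
replay_buffer = ReplayBuffer.remote(max_size=10_000)
rollouts = [worker_rollout.remote(ps, replay_buffer, 1_000) for _ in range(2)]
print("learner finished", ray.get(worker_train.remote(ps, replay_buffer, 50)), "updates")
ray.get(rollouts)
# --- end of sketch ---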