├── pictures ├── sac1.png ├── dsac1.png ├── ddrlframework.jpg └── rapid-architecture@2x--1-.png ├── tutorial ├── Pictures │ ├── dsac1w-sac.png │ └── ddrlframework.jpg ├── Parallelize your algorithm by Ray (1).md ├── Parallelize your algorithm by Ray (3).md └── Parallelize your algorithm by Ray (2).md ├── algos ├── test_scripts │ ├── Testing.py │ ├── Testing1.py │ └── dense_bn.py ├── dqn │ ├── core.py │ ├── hyperparams.py │ ├── actor_learner.py │ └── train.py ├── sac1 │ ├── render_test.py │ ├── hyperparams.py │ ├── core.py │ ├── sac1.py │ ├── actor_learner.py │ └── sac_ray.py ├── sqn │ ├── hyperparams.py │ ├── core.py │ ├── actor_learner.py │ └── train.py └── trading_env.py ├── README.md └── example ├── core.py ├── model.py ├── dsac.py └── sac.py /pictures/sac1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/sac1.png -------------------------------------------------------------------------------- /pictures/dsac1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/dsac1.png -------------------------------------------------------------------------------- /pictures/ddrlframework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/ddrlframework.jpg -------------------------------------------------------------------------------- /tutorial/Pictures/dsac1w-sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/tutorial/Pictures/dsac1w-sac.png -------------------------------------------------------------------------------- /tutorial/Pictures/ddrlframework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/tutorial/Pictures/ddrlframework.jpg -------------------------------------------------------------------------------- /pictures/rapid-architecture@2x--1-.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/createamind/Distributed-DRL/HEAD/pictures/rapid-architecture@2x--1-.png -------------------------------------------------------------------------------- /algos/test_scripts/Testing.py: -------------------------------------------------------------------------------- 1 | 2 | import ray 3 | import time 4 | 5 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 6 | 7 | y=111 8 | y_id = ray.put(y) 9 | 10 | 11 | @ray.remote 12 | class Cat: 13 | def __init__(self): 14 | self.cnt = 0 15 | global y_id 16 | y_id = ray.put(2) 17 | def incre(self): 18 | print('done.') 19 | time.sleep(1) 20 | self.cnt += ray.get(y_id) 21 | def get_cnt(self): 22 | return self.cnt 23 | 24 | cat = Cat.remote() 25 | 26 | 27 | 28 | class Dog: 29 | def __init__(self): 30 | self.cnt = 0 31 | global y_id 32 | y_id = ray.put(2) 33 | def incre(self): 34 | print('done.') 35 | time.sleep(1) 36 | self.cnt += ray.get(y_id) 37 | def get_cnt(self): 38 | return self.cnt 39 | 40 | dog = Dog() 41 | 42 | 43 | @ray.remote 44 | def remote_cat(cls1): 45 | cls1.incre.remote() # self.cnt will increase 46 | return 1 # cls1.get_cnt.remote() 47 | 48 | @ray.remote 49 | def remote_dog(cls1): 50 | cls1.incre() # self.cnt will not increase 51 | 
return 1 # cls1.get_cnt.remote() 52 | 53 | 54 | 55 | result_id = [remote_dog.remote(dog) for _ in range(5)] 56 | 57 | result = ray.get(result_id) 58 | 59 | print(result) 60 | 61 | time.sleep(5) 62 | # print(ray.get(cat.get_cnt.remote())) 63 | print(dog.get_cnt()) -------------------------------------------------------------------------------- /algos/dqn/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def gaussian_likelihood(x, mu, log_std): 31 | pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) 32 | return tf.reduce_sum(pre_sum, axis=1) 33 | 34 | 35 | """ 36 | Actor-Critics 37 | """ 38 | 39 | 40 | def q_function(x, x2, hidden_sizes, act_dim, activation=tf.nn.relu, output_activation=None): 41 | 42 | vf_mlp = lambda x: mlp(x, list(hidden_sizes) + [act_dim], activation, None) 43 | # Q 44 | q_tp = tf.make_template('q1', vf_mlp, create_scope_now_=True) 45 | 46 | q = q_tp(x) 47 | q_x2 = q_tp(x2) 48 | 49 | return q, q_x2 50 | -------------------------------------------------------------------------------- /algos/test_scripts/Testing1.py: -------------------------------------------------------------------------------- 1 | 2 | import ray, os, time 3 | 4 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 5 | 6 | print('main.pid:', os.getpid()) 7 | 8 | @ray.remote 9 | def f(x): 10 | print('f.pid:', os.getpid()) 11 | return x 12 | 13 | @ray.remote 14 | class Foo(): 15 | def __init__(self, f): 16 | self.x = ray.get(f.remote(100)) 17 | 18 | # @ray.remote # AttributeError: 'ActorHandle' object has no attribute 'bar' 19 | def bar(self): 20 | print('bar.pid:', os.getpid()) 21 | return 1 22 | 23 | foo = Foo.remote(f) 24 | 25 | obj_id1 = foo.bar.remote() 26 | 27 | print(ray.get(obj_id1)) 28 | 29 | 30 | ''' outputs: 31 | main.pid: 9521 32 | 1 33 | (pid=9593) bar.pid: 9593 34 | (pid=9602) f.pid: 9602 35 | ''' 36 | 37 | 38 | 39 | @ray.remote 40 | class Counter(object): 41 | def __init__(self): 42 | self.counter = 0 43 | 44 | def inc(self): 45 | self.counter += 1 46 | 47 | def get_counter(self): 48 | return self.counter 49 | 50 | @ray.remote 51 | def g(counter): 52 | print('g.pid:', os.getpid()) 53 | for _ in range(1000): 54 | time.sleep(0.1) 55 | counter.inc.remote() 56 | 57 | counter = Counter.remote() 58 | 59 | # Start some tasks that use the actor. 60 | [g.remote(counter) for _ in range(3)] 61 | 62 | # Print the counter value. 
63 | for _ in range(10): 64 | time.sleep(1) 65 | print(ray.get(counter.get_counter.remote())) -------------------------------------------------------------------------------- /algos/sac1/render_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | import ray 5 | import gym 6 | 7 | from hyperparams import HyperParameters 8 | from actor_learner import Actor, Learner 9 | 10 | import os 11 | import pickle 12 | import multiprocessing 13 | import copy 14 | import signal 15 | 16 | 17 | flags = tf.app.flags 18 | FLAGS = tf.app.flags.FLAGS 19 | 20 | # "Pendulum-v0" 'BipedalWalker-v2' 'LunarLanderContinuous-v2' 21 | flags.DEFINE_string("env_name", "BipedalWalkerHardcore-v2", "game env") 22 | flags.DEFINE_integer("total_epochs", 500, "total_epochs") 23 | flags.DEFINE_integer("num_workers", 1, "number of workers") 24 | flags.DEFINE_integer("num_learners", 1, "number of learners") 25 | flags.DEFINE_string("is_restore", "False", "True or False. True means restore weights from pickle file.") 26 | flags.DEFINE_float("a_l_ratio", 10, "steps / sample_times") 27 | 28 | opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers, FLAGS.a_l_ratio) 29 | 30 | agent = Actor(opt, job="main") 31 | keys, weights = agent.get_weights() 32 | pickle_in = open("weights.pickle", "rb") 33 | weights = pickle.load(pickle_in) 34 | 35 | 36 | weights = [weights[key] for key in keys] 37 | 38 | agent.set_weights(keys, weights) 39 | 40 | test_env = gym.make(opt.env_name) 41 | 42 | n = 2 43 | 44 | rew = [] 45 | for j in range(n): 46 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 47 | while not (d or (ep_len == opt.max_ep_len)): 48 | # Take deterministic actions at test time 49 | test_env.render() 50 | action = agent.get_action(o, True) 51 | print(action) 52 | o, r, d, _ = test_env.step(action) 53 | ep_ret += r 54 | ep_len += 1 55 | rew.append(ep_ret) 56 | print("test_reward:", sum(rew)/n) 57 | -------------------------------------------------------------------------------- /algos/dqn/hyperparams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from gym.spaces import Box 5 | import datetime 6 | import gym 7 | from math import ceil 8 | 9 | 10 | class HyperParameters: 11 | def __init__(self, env, env_name, exp_name, num_nodes, num_workers, a_l_ratio, weights_file): 12 | # parameters set 13 | 14 | self.exp_name = exp_name 15 | self.env_name = env_name 16 | 17 | self.model = "mlp" 18 | assert self.model in ["mlp", "cnn"], "model must be mlp or cnn!" 
19 | 20 | self.num_nodes = num_nodes 21 | self.num_workers = num_workers 22 | self.num_learners = 1 23 | 24 | self.push_freq = 100 25 | 26 | self.gamma = 0.99 27 | 28 | self.a_l_ratio = a_l_ratio 29 | self.weights_file = weights_file 30 | 31 | self.recover = False 32 | self.checkpoint_freq = 21600 # 21600s = 6h 33 | 34 | # gpu memory fraction 35 | self.gpu_fraction = 0.3 36 | 37 | self.hidden_size = [400, 300] 38 | 39 | self.obs_dim = env.observation_space.shape[0] 40 | self.obs_space = env.observation_space 41 | self.obs_shape = self.obs_space.shape 42 | 43 | self.act_dim = env.action_space.n 44 | self.act_space = env.action_space 45 | self.act_shape = self.act_space.shape 46 | 47 | # self.num_buffers = 1 48 | self.buffer_size = int(1e6) 49 | self.num_buffers = self.num_workers // 25 + 1 50 | self.buffer_size = self.buffer_size // self.num_buffers 51 | 52 | self.start_steps = int(1e4) // self.num_buffers 53 | 54 | if self.weights_file: 55 | self.start_steps = self.buffer_size 56 | 57 | self.lr = 1e-3 58 | self.polyak = 0.995 59 | 60 | self.batch_size = 128 61 | 62 | # n-step 63 | self.Ln = 1 64 | 65 | self.save_freq = 1 66 | 67 | self.seed = 0 68 | 69 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 70 | 71 | self.summary_dir = ROOT_DIR + '/tboard_ray' # Directory for storing tensorboard summary results 72 | self.save_dir = ROOT_DIR + '/' + self.exp_name # Directory for storing trained model 73 | self.save_interval = int(5e5) 74 | 75 | self.log_dir = self.summary_dir + "/" + str(datetime.datetime.now()) + "-workers_num:" + \ 76 | str(self.num_workers) + "%" + str(self.a_l_ratio) + self.env_name + "-" + self.exp_name 77 | -------------------------------------------------------------------------------- /algos/sqn/hyperparams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from gym.spaces import Box 5 | import datetime 6 | import gym 7 | from math import ceil 8 | 9 | 10 | class HyperParameters: 11 | def __init__(self, env, env_name, exp_name, num_nodes, num_workers, a_l_ratio, weights_file): 12 | # parameters set 13 | 14 | self.exp_name = exp_name 15 | self.env_name = env_name 16 | 17 | self.model = "mlp" 18 | assert self.model in ["mlp", "cnn"], "model must be mlp or cnn!" 
19 | 20 | self.num_nodes = num_nodes 21 | self.num_workers = num_workers 22 | self.num_learners = 1 23 | 24 | self.push_freq = 100 25 | 26 | # alpha need > 0 27 | self.alpha = 0.1 28 | self.gamma = 0.99 29 | 30 | self.a_l_ratio = a_l_ratio 31 | self.weights_file = weights_file 32 | 33 | self.recover = False 34 | self.checkpoint_freq = 21600 # 21600s = 6h 35 | 36 | # gpu memory fraction 37 | self.gpu_fraction = 0.3 38 | 39 | self.hidden_size = [400, 300] 40 | 41 | self.obs_dim = env.observation_space.shape[0] 42 | self.obs_space = env.observation_space 43 | self.obs_shape = self.obs_space.shape 44 | 45 | self.act_dim = env.action_space.n 46 | self.act_space = env.action_space 47 | self.act_shape = self.act_space.shape 48 | 49 | # self.num_buffers = 1 50 | self.buffer_size = int(1e6) 51 | self.num_buffers = self.num_workers // 25 + 1 52 | self.buffer_size = self.buffer_size // self.num_buffers 53 | 54 | self.start_steps = int(1e4) // self.num_buffers 55 | 56 | if self.weights_file: 57 | self.start_steps = self.buffer_size 58 | 59 | self.lr = 1e-3 60 | self.polyak = 0.995 61 | 62 | self.batch_size = 128 63 | 64 | # n-step 65 | self.Ln = 1 66 | 67 | self.save_freq = 1 68 | 69 | self.seed = 0 70 | 71 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 72 | 73 | self.summary_dir = ROOT_DIR + '/tboard_ray' # Directory for storing tensorboard summary results 74 | self.save_dir = ROOT_DIR + '/' + self.exp_name # Directory for storing trained model 75 | self.save_interval = int(5e5) 76 | 77 | self.log_dir = self.summary_dir + "/" + str(datetime.datetime.now()) + "-workers_num:" + \ 78 | str(self.num_workers) + "%" + str(self.a_l_ratio) + self.env_name + "-" + self.exp_name 79 | -------------------------------------------------------------------------------- /algos/sqn/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def softmax_policy(alpha, q, act_dim): 31 | 32 | pi_log = tf.nn.log_softmax(q/alpha, axis=1) 33 | mu = tf.argmax(pi_log, axis=1) 34 | 35 | # tf.random.multinomial( logits, num_samples, seed=None, name=None, output_dtype=None ) 36 | # logits: 2-D Tensor with shape [batch_size, num_classes]. Each slice [i, :] represents the unnormalized log-probabilities for all classes. 37 | # num_samples: 0-D. Number of independent samples to draw for each row slice. 
38 | pi = tf.squeeze(tf.random.multinomial(pi_log, 1), axis=1) 39 | 40 | # logp_pi = tf.reduce_sum(tf.one_hot(mu, depth=act_dim) * pi_log, axis=1) # use max Q(s,a) 41 | # logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * pi_log, axis=1) 42 | logp_pi = tf.reduce_sum(tf.exp(pi_log)*pi_log, axis=1) # exact entropy 43 | 44 | return mu, pi, logp_pi 45 | 46 | 47 | """ 48 | Actor-Critics 49 | """ 50 | 51 | 52 | def q_function(x, x2, alpha, hidden_sizes, act_dim, activation=tf.nn.relu, 53 | output_activation=None, policy=softmax_policy, action_space=None): 54 | 55 | vf_mlp = lambda x: mlp(x, list(hidden_sizes) + [act_dim], activation, None) 56 | 57 | # Q1 58 | q1_tp = tf.make_template('q1', vf_mlp, create_scope_now_=True) 59 | 60 | q1 = q1_tp(x) 61 | 62 | # policy 63 | mu, pi, entropy = policy(alpha, q1, act_dim) 64 | q1_mu = tf.reduce_sum(q1 * tf.one_hot(mu, depth=act_dim), axis=1) 65 | 66 | q1_x2 = q1_tp(x2) 67 | 68 | # policy 69 | mu_x2, pi_x2, entropy_x2 = policy(alpha, q1_x2, act_dim) 70 | 71 | # Q2 72 | q2_tp = tf.make_template('q2', vf_mlp, create_scope_now_=True) 73 | q2 = q2_tp(x) 74 | 75 | # policy 76 | mu2, pi2, entropy2 = policy(alpha, q2, act_dim) 77 | q2_mu = tf.reduce_sum(q2 * tf.one_hot(mu2, depth=act_dim), axis=1) 78 | 79 | return mu, pi, entropy_x2, q1, q2, q1_mu, q2_mu 80 | -------------------------------------------------------------------------------- /algos/test_scripts/dense_bn.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | config = tf.ConfigProto() 8 | config.gpu_options.allow_growth = True 9 | session = tf.Session(config=config) 10 | 11 | 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | 14 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 15 | 16 | 17 | regularizer_l2 = tf.contrib.layers.l2_regularizer(0.00) 18 | 19 | def dense(x, size, scope): 20 | return tf.layers.dense(x, size, activation=None, kernel_regularizer=regularizer_l2, bias_regularizer=regularizer_l2,) 21 | 22 | def dense_batch_relu(x, phase, scope): 23 | with tf.variable_scope(scope): 24 | h1 = tf.layers.dense(x, 100, activation=None, kernel_regularizer=regularizer_l2, bias_regularizer=regularizer_l2,) 25 | h2 = tf.contrib.layers.batch_norm(h1, 26 | center=True, scale=True, 27 | is_training=phase, fused=False, 28 | scope='bn') 29 | return tf.nn.relu(h2, 'relu') 30 | 31 | 32 | tf.reset_default_graph() 33 | x = tf.placeholder('float32', (None, 784), name='x') 34 | y = tf.placeholder('float32', (None, 10), name='y') 35 | phase = tf.placeholder(tf.bool, name='phase') 36 | 37 | h1 = dense_batch_relu(x, phase,'layer1') 38 | h2 = dense_batch_relu(h1, phase, 'layer2') 39 | logits = dense(h2, 10, 'logits') 40 | 41 | with tf.name_scope('accuracy'): 42 | accuracy = tf.reduce_mean(tf.cast( 43 | tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), 44 | 'float32')) 45 | 46 | with tf.name_scope('loss'): 47 | loss = tf.reduce_mean( 48 | tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits,)) 49 | 50 | 51 | def train(): 52 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 53 | with tf.control_dependencies(update_ops): 54 | # Ensures that we execute the update_ops before performing the train_step 55 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss) 56 | sess = tf.Session() 57 | sess.run(tf.global_variables_initializer()) 58 | 59 | history = [] 60 | iterep = 500 61 | for i in range(iterep * 30): 62 | x_train, y_train = 
mnist.train.next_batch(100) 63 | sess.run(train_step, 64 | feed_dict={'x:0': x_train, 65 | 'y:0': y_train, 66 | 'phase:0': 1}) 67 | if (i + 1) % iterep == 0: 68 | epoch = (i + 1) / iterep 69 | tr = sess.run([loss, accuracy], 70 | feed_dict={'x:0': mnist.train.images, 71 | 'y:0': mnist.train.labels, 72 | 'phase:0': 1}) 73 | t = sess.run([loss, accuracy], 74 | feed_dict={'x:0': mnist.test.images, 75 | 'y:0': mnist.test.labels, 76 | 'phase:0': 0}) 77 | history += [[epoch] + tr + t] 78 | print(history[-1]) 79 | return history 80 | 81 | 82 | if __name__=="__main__": 83 | train() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed-DRL 2 | Distributed Deep Reinforcement Learning 3 | 4 | This framework is inspired by **Rapid**, the general-purpose RL training system from OpenAI. 5 | 6 | Rapid framework: 7 | ![rapid-architecture@2x--1-](./pictures/rapid-architecture@2x--1-.png) 8 | Our framework: 9 | ![ddrlframework](./pictures/ddrlframework.jpg) 10 | 11 | --- 12 | 13 | ### Tutorial 14 | 15 | - [Parallelize your algorithm by Ray (1)](tutorial/Parallelize%20your%20algorithm%20by%20Ray%20(1).md) 16 | - [Parallelize your algorithm by Ray (2)](tutorial/Parallelize%20your%20algorithm%20by%20Ray%20(2).md) 17 | - [Parallelize your algorithm by Ray (3)](tutorial/Parallelize%20your%20algorithm%20by%20Ray%20(3).md) 18 | 19 | --- 20 | 21 | This framework divides the reinforcement learning process into five parts: 22 | 23 | - Replay buffer (optional) 24 | - Parameter server 25 | - train (learn) 26 | - rollout 27 | - test 28 | 29 | ```python 30 | @ray.remote 31 | class ReplayBuffer: 32 | ... 33 | # replay buffer 34 | 35 | @ray.remote 36 | class ParameterServer(object): 37 | ... 38 | # keep the newest network weights here 39 | # could pull and push the weights 40 | # also could save the weights to local 41 | 42 | @ray.remote(num_gpus=1, max_calls=1) 43 | def worker_train(ps, replay_buffer, opt, learner_index): 44 | ... 45 | # build a learner network 46 | # pull weights from ps 47 | # for loop: 48 | # get a sample batch from the replay buffer 49 | # update network and push new weights to ps 50 | 51 | @ray.remote 52 | def worker_rollout(ps, replay_buffer, opt, worker_index): 53 | ... 54 | # build a rollout network 55 | # pull weights from ps 56 | # for loop: 57 | # interact with the environment 58 | # store experience in the replay buffer 59 | # if end of episode: 60 | # pull weights from ps 61 | 62 | @ray.remote 63 | def worker_test(ps, replay_buffer, opt, worker_index=0): 64 | ... 65 | # build a test network, usually the same as the rollout network 66 | # while: 67 | # pull weights from ps 68 | # do test 69 | # might save model here 70 | 71 | if __name__ == '__main__': 72 | 73 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 74 | 75 | opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers) 76 | 77 | # create the parameter server 78 | if FLAGS.is_restore == "True": 79 | ps = ParameterServer.remote([], [], is_restore=True) 80 | else: 81 | net = Learner(opt, job="main") 82 | all_keys, all_values = net.get_weights() 83 | ps = ParameterServer.remote(all_keys, all_values) 84 | 85 | # create replay buffer 86 | replay_buffer = ReplayBuffer.remote(obs_dim=opt.obs_dim, act_dim=opt.act_dim, size=opt.replay_size) 87 | 88 | # Start some rollout tasks.
89 | task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 90 | 91 | time.sleep(5) 92 | 93 | # start training tasks 94 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_learners)] 95 | 96 | # start testing 97 | task_test = worker_test.remote(ps, replay_buffer, opt) 98 | 99 | # wait util task test end 100 | # Keep the main process running. Otherwise everything will shut down when main process finished. 101 | ray.wait([task_test, ]) 102 | ``` 103 | 104 | 105 | 106 | ### Result: 107 | 108 | Env: LunarLanderContinuous-v2 109 | GPU:GTX1060 x1 110 | 111 | **SAC1 without distribution:** gets 200+ in 1200s 112 | ![sac1](./pictures/sac1.png) 113 | **Distributed SAC1:** gets 200+ in 360s 114 | ![dsac1](./pictures/dsac1.png) 115 | -------------------------------------------------------------------------------- /algos/sac1/hyperparams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | from gym.spaces import Box 5 | import datetime 6 | import gym 7 | from numbers import Number 8 | 9 | 10 | class HyperParameters: 11 | def __init__(self, env_name, exp_name, num_workers, a_l_ratio, weights_file): 12 | # parameters set 13 | 14 | self.exp_name = exp_name 15 | 16 | self.env_name = env_name 17 | # "_random", "_d_True", "" 18 | self.rollout_env_name = self.env_name + "" 19 | 20 | self.model = "mlp" 21 | assert self.model in ["mlp", "cnn"], "model must be mlp or cnn!" 22 | if self.model == "cnn": 23 | self.representation = "extracted" 24 | self.stacked = True 25 | else: 26 | self.representation = 'simple115' 27 | self.stacked = False 28 | 29 | self.a_l_ratio = a_l_ratio 30 | self.weights_file = weights_file 31 | self.start_steps = int(5e4) 32 | if self.weights_file: 33 | self.start_steps = int(10e6) 34 | 35 | # gpu memory fraction 36 | self.gpu_fraction = 0.3 37 | 38 | self.hidden_size = (300, 400, 300) 39 | 40 | self.obs_noise = 0 41 | self.act_noise = 0.3 42 | self.reward_scale = 5 43 | env = Wrapper(gym.make(self.env_name), self.obs_noise, self.act_noise, self.reward_scale, 3) 44 | 45 | # env = FootballWrapper(env_football) 46 | 47 | # self.obs_space = Box(low=-1.0, high=1.0, shape=self.obs_dim, dtype=np.float32) 48 | self.obs_dim = env.observation_space.shape 49 | self.obs_space = env.observation_space 50 | self.obs_shape = self.obs_space.shape 51 | 52 | self.act_dim = env.action_space.shape 53 | self.act_space = env.action_space 54 | self.act_shape = self.act_space.shape 55 | 56 | self.num_workers = num_workers 57 | self.num_learners = 1 58 | 59 | self.use_max = False 60 | self.alpha = 0.1 61 | # self.alpha = "auto" 62 | self.target_entropy = 0.5 63 | 64 | self.use_bn = False 65 | self.c_regularizer = 0.0 66 | 67 | self.gamma = 0.997 68 | 69 | # self.num_buffers = 1 70 | self.num_buffers = self.num_workers // 25 + 1 71 | if self.model == 'cnn': 72 | self.buffer_size = int(3e4) 73 | else: 74 | self.buffer_size = int(3e6) 75 | 76 | self.buffer_size = self.buffer_size // self.num_buffers 77 | 78 | self.lr = 5e-5 79 | self.polyak = 0.995 80 | 81 | self.steps_per_epoch = 5000 82 | self.batch_size = 256 83 | 84 | self.Ln = 8 85 | self.action_repeat = 2 86 | 87 | self.max_ep_len = 2900 88 | self.save_freq = 1 89 | 90 | self.max_ret = 0 91 | 92 | self.epsilon = 0 93 | self.epsilon_alpha = 7 94 | 95 | self.seed = 0 96 | 97 | cwd = os.getcwd() 98 | 99 | self.summary_dir = cwd + '/tboard_ray' # Directory for storing tensorboard summary results 
100 | self.save_dir = cwd + '/' + self.exp_name # Directory for storing trained model 101 | self.save_interval = int(5e5) 102 | 103 | self.log_dir = self.summary_dir + "/" + str(datetime.datetime.now()) + "-workers_num:" + \ 104 | str(self.num_workers) + "%" + str(self.a_l_ratio) + self.env_name + "-" + self.exp_name 105 | 106 | 107 | class Wrapper(object): 108 | 109 | def __init__(self, env, obs_noise, act_noise, reward_scale, action_repeat=3): 110 | self._env = env 111 | self.action_repeat = action_repeat 112 | self.act_noise = act_noise 113 | self.obs_noise = obs_noise 114 | self.reward_scale = reward_scale 115 | 116 | def __getattr__(self, name): 117 | return getattr(self._env, name) 118 | 119 | def reset(self): 120 | obs = self._env.reset() + self.obs_noise * (-2 * np.random.random(24) + 1) 121 | return obs 122 | 123 | def step(self, action): 124 | action += self.act_noise * (-2 * np.random.random(4) + 1) 125 | r = 0.0 126 | for _ in range(self.action_repeat): 127 | obs_, reward_, done_, info_ = self._env.step(action) 128 | r = r + reward_ 129 | # r -= 0.001 130 | if done_ and self.action_repeat != 1: 131 | return obs_ + self.obs_noise * (-2 * np.random.random(24) + 1), 0.0, done_, info_ 132 | if self.action_repeat == 1: 133 | return obs_, r, done_, info_ 134 | return obs_ + self.obs_noise * (-2 * np.random.random(24) + 1), self.reward_scale * r, done_, info_ 135 | -------------------------------------------------------------------------------- /example/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def gaussian_likelihood(x, mu, log_std): 31 | pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) 32 | return tf.reduce_sum(pre_sum, axis=1) 33 | 34 | 35 | def clip_but_pass_gradient(x, l=-1., u=1.): 36 | clip_up = tf.cast(x > u, tf.float32) 37 | clip_low = tf.cast(x < l, tf.float32) 38 | return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low) 39 | 40 | 41 | """ 42 | Policies 43 | """ 44 | 45 | LOG_STD_MAX = 2 46 | LOG_STD_MIN = -20 47 | 48 | 49 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 50 | act_dim = a.shape.as_list()[-1] 51 | net = mlp(x, list(hidden_sizes), activation, activation) 52 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 53 | 54 | """ 55 | Because algorithm maximizes trade-off of reward and entropy, 56 | entropy must be unique to state---and therefore log_stds need 57 | to be a neural network output instead of a shared-across-states 58 | learnable parameter vector. 
But for deep Relu and other nets, 59 | simply sticking an activationless dense layer at the end would 60 | be quite bad---at the beginning of training, a randomly initialized 61 | net could produce extremely large values for the log_stds, which 62 | would result in some actions being either entirely deterministic 63 | or too random to come back to earth. Either of these introduces 64 | numerical instability which could break the algorithm. To 65 | protect against that, we'll constrain the output range of the 66 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is 67 | slightly different from the trick used by the original authors of 68 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 69 | I prefer this approach because it allows gradient propagation 70 | through log_std where clipping wouldn't, but I don't know if 71 | it makes much of a difference. 72 | """ 73 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 74 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 75 | 76 | std = tf.exp(log_std) 77 | pi = mu + tf.random_normal(tf.shape(mu)) * std 78 | logp_pi = gaussian_likelihood(pi, mu, log_std) 79 | return mu, pi, logp_pi 80 | 81 | 82 | def apply_squashing_func(mu, pi, logp_pi): 83 | mu = tf.tanh(mu) 84 | pi = tf.tanh(pi) 85 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 86 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi ** 2, l=0, u=1) + 1e-6), axis=1) 87 | return mu, pi, logp_pi 88 | 89 | 90 | """ 91 | Actor-Critics 92 | """ 93 | 94 | 95 | def mlp_actor_critic(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu, 96 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 97 | # policy 98 | with tf.variable_scope('pi'): 99 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 100 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 101 | 102 | # make sure actions are in correct range 103 | action_scale = action_space.high[0] 104 | mu *= action_scale 105 | pi *= action_scale 106 | 107 | # vfs 108 | vf_mlp = lambda x: tf.squeeze(mlp(x, list(hidden_sizes) + [1], activation, None), axis=1) 109 | with tf.variable_scope('q1'): 110 | q1 = vf_mlp(tf.concat([x, a], axis=-1)) 111 | with tf.variable_scope('q1', reuse=True): 112 | q1_pi = vf_mlp(tf.concat([x, pi], axis=-1)) 113 | with tf.variable_scope('q2'): 114 | q2 = vf_mlp(tf.concat([x, a], axis=-1)) 115 | with tf.variable_scope('q2', reuse=True): 116 | q2_pi = vf_mlp(tf.concat([x, pi], axis=-1)) 117 | with tf.variable_scope('v'): 118 | v = vf_mlp(x) 119 | return mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v 120 | -------------------------------------------------------------------------------- /algos/sac1/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | 7 | def placeholder(dim=None): 8 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 9 | 10 | 11 | def placeholders(*args): 12 | return [placeholder(dim) for dim in args] 13 | 14 | 15 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 16 | for h in hidden_sizes[:-1]: 17 | x = tf.layers.dense(x, units=h, activation=activation) 18 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 19 | 20 | 21 | def get_vars(scope): 22 | return [x for x in tf.global_variables() if scope in x.name] 23 | 24 | 25 | def count_vars(scope): 26 | v = 
get_vars(scope) 27 | return sum([np.prod(var.shape.as_list()) for var in v]) 28 | 29 | 30 | def gaussian_likelihood(x, mu, log_std): 31 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 32 | return tf.reduce_sum(pre_sum, axis=1) 33 | 34 | 35 | def clip_but_pass_gradient(x, l=-1., u=1.): 36 | clip_up = tf.cast(x > u, tf.float32) 37 | clip_low = tf.cast(x < l, tf.float32) 38 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 39 | 40 | 41 | """ 42 | Policies 43 | """ 44 | 45 | LOG_STD_MAX = 2 46 | LOG_STD_MIN = -20 47 | 48 | 49 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 50 | act_dim = a.shape.as_list()[-1] 51 | net = mlp(x, list(hidden_sizes), activation, activation) 52 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 53 | 54 | """ 55 | Because algorithm maximizes trade-off of reward and entropy, 56 | entropy must be unique to state---and therefore log_stds need 57 | to be a neural network output instead of a shared-across-states 58 | learnable parameter vector. But for deep Relu and other nets, 59 | simply sticking an activationless dense layer at the end would 60 | be quite bad---at the beginning of training, a randomly initialized 61 | net could produce extremely large values for the log_stds, which 62 | would result in some actions being either entirely deterministic 63 | or too random to come back to earth. Either of these introduces 64 | numerical instability which could break the algorithm. To 65 | protect against that, we'll constrain the output range of the 66 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is 67 | slightly different from the trick used by the original authors of 68 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 69 | I prefer this approach because it allows gradient propagation 70 | through log_std where clipping wouldn't, but I don't know if 71 | it makes much of a difference. 72 | """ 73 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 74 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 75 | 76 | std = tf.exp(log_std) 77 | pi = mu + tf.random_normal(tf.shape(mu)) * std 78 | logp_pi = gaussian_likelihood(pi, mu, log_std) 79 | return mu, pi, logp_pi 80 | 81 | 82 | def apply_squashing_func(mu, pi, logp_pi): 83 | mu = tf.tanh(mu) 84 | pi = tf.tanh(pi) 85 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
86 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 87 | return mu, pi, logp_pi 88 | 89 | 90 | # Actor-Critics 91 | def mlp_actor_critic(x, x2, a, hidden_sizes=(400,300), activation=tf.nn.relu, 92 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 93 | 94 | # policy 95 | with tf.variable_scope('pi'): 96 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 97 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 98 | 99 | with tf.variable_scope('pi', reuse=True): 100 | mu2, pi2, logp_pi2 = policy(x2, a, hidden_sizes, activation, output_activation) 101 | mu2, pi2, logp_pi2 = apply_squashing_func(mu2, pi2, logp_pi2) 102 | 103 | # make sure actions are in correct range 104 | action_scale = action_space.high[0] 105 | mu *= action_scale 106 | pi *= action_scale 107 | 108 | # vfs 109 | # tf.squeeze( shape(?,1), axis=1 ) = shape(?,) 110 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 111 | 112 | with tf.variable_scope('q1'): 113 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 114 | with tf.variable_scope('q1', reuse=True): 115 | q1_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 116 | with tf.variable_scope('q2'): 117 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 118 | with tf.variable_scope('q2', reuse=True): 119 | q2_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 120 | 121 | return mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi 122 | -------------------------------------------------------------------------------- /example/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.sac import core 6 | from spinup.algos.sac.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | from core import mlp_actor_critic as actor_critic 9 | import ray.experimental.tf_utils 10 | 11 | 12 | class Model(object): 13 | 14 | def __init__(self, args): 15 | 16 | # Inputs to computation graph 17 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(args.obs_dim, args.act_dim, 18 | args.obs_dim, None, None) 19 | 20 | # Main outputs from computation graph 21 | with tf.variable_scope('main'): 22 | self.mu, self.pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(self.x_ph, self.a_ph, **args.ac_kwargs) 23 | 24 | # Target value network 25 | with tf.variable_scope('target'): 26 | _, _, _, _, _, _, _, v_targ = actor_critic(self.x2_ph, self.a_ph, **args.ac_kwargs) 27 | 28 | # Count variables 29 | var_counts = tuple(core.count_vars(scope) for scope in 30 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 31 | print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 32 | 33 | # Min Double-Q: 34 | min_q_pi = tf.minimum(q1_pi, q2_pi) 35 | 36 | # Targets for Q and V regression 37 | q_backup = tf.stop_gradient(self.r_ph + args.gamma * (1 - self.d_ph) * v_targ) 38 | v_backup = tf.stop_gradient(min_q_pi - args.alpha * logp_pi) 39 | 40 | # Soft actor-critic losses 41 | pi_loss = tf.reduce_mean(args.alpha * logp_pi - q1_pi) 42 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 43 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 44 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 45 | self.value_loss = q1_loss + q2_loss + v_loss 46 | 47 | # Policy train op 48 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 49 | pi_optimizer = 
tf.train.AdamOptimizer(learning_rate=args.lr) 50 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 51 | 52 | # Value train op 53 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 54 | value_optimizer = tf.train.AdamOptimizer(learning_rate=args.lr) 55 | value_params = get_vars('main/q') + get_vars('main/v') 56 | with tf.control_dependencies([train_pi_op]): 57 | train_value_op = value_optimizer.minimize(self.value_loss, var_list=value_params) 58 | 59 | # Polyak averaging for target variables 60 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 61 | with tf.control_dependencies([train_value_op]): 62 | target_update = tf.group([tf.assign(v_targ, args.polyak * v_targ + (1 - args.polyak) * v_main) 63 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 64 | 65 | # All ops to call during one training step 66 | self.step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 67 | train_pi_op, train_value_op, target_update] 68 | 69 | # Initializing targets to match main variables 70 | self.target_init = tf.group([tf.assign(v_targ, v_main) 71 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 72 | 73 | self.sess = tf.Session() 74 | self.sess.run(tf.global_variables_initializer()) 75 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 76 | self.value_loss, self.sess) 77 | 78 | def set_weights(self, variable_names, weights): 79 | self.variables.set_weights(dict(zip(variable_names, weights))) 80 | self.sess.run(self.target_init) 81 | 82 | def get_weights(self): 83 | weights = self.variables.get_weights() 84 | keys = [key for key in list(weights.keys()) if "main" in key] 85 | values = [weights[key] for key in keys] 86 | return keys, values 87 | 88 | def get_action(self, o, deterministic=False): 89 | act_op = self.mu if deterministic else self.pi 90 | return self.sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0] 91 | 92 | def train(self, replay_buffer, args): 93 | 94 | batch = ray.get(replay_buffer.sample_batch.remote(args.batch_size)) 95 | feed_dict = {self.x_ph: batch['obs1'], 96 | self.x2_ph: batch['obs2'], 97 | self.a_ph: batch['acts'], 98 | self.r_ph: batch['rews'], 99 | self.d_ph: batch['done'], 100 | } 101 | outs = self.sess.run(self.step_ops, feed_dict) 102 | # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 103 | # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 104 | # VVals=outs[6], LogPi=outs[7]) 105 | 106 | def test_agent(self, test_env, args, n=10): 107 | test_ret = [] 108 | for j in range(n): 109 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 110 | while not (d or (ep_len == args.max_ep_len)): 111 | # Take deterministic actions at test time 112 | o, r, d, _ = test_env.step(self.get_action(o, True)) 113 | ep_ret += r 114 | ep_len += 1 115 | test_ret.append(ep_ret) 116 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 117 | ave_ret = sum(test_ret)/len(test_ret) 118 | return ave_ret 119 | -------------------------------------------------------------------------------- /example/dsac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.sac import core 6 | from spinup.algos.sac.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | 9 | import ray 10 | import pickle 11 | from model import Model 12 | 13 | 14 | @ray.remote 15 | class ReplayBuffer: 16 | 
""" 17 | A simple FIFO experience replay buffer for SAC agents. 18 | """ 19 | 20 | def __init__(self, obs_dim, act_dim, size): 21 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 22 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 24 | self.rews_buf = np.zeros(size, dtype=np.float32) 25 | self.done_buf = np.zeros(size, dtype=np.float32) 26 | self.ptr, self.size, self.max_size = 0, 0, size 27 | self.rollout_steps = 0 28 | 29 | def store(self, obs, act, rew, next_obs, done): 30 | self.obs1_buf[self.ptr] = obs 31 | self.obs2_buf[self.ptr] = next_obs 32 | self.acts_buf[self.ptr] = act 33 | self.rews_buf[self.ptr] = rew 34 | self.done_buf[self.ptr] = done 35 | self.ptr = (self.ptr + 1) % self.max_size 36 | self.size = min(self.size + 1, self.max_size) 37 | self.rollout_steps += 1 38 | 39 | def sample_batch(self, batch_size=32): 40 | idxs = np.random.randint(0, self.size, size=batch_size) 41 | return dict(obs1=self.obs1_buf[idxs], 42 | obs2=self.obs2_buf[idxs], 43 | acts=self.acts_buf[idxs], 44 | rews=self.rews_buf[idxs], 45 | done=self.done_buf[idxs]) 46 | 47 | def get_counts(self): 48 | return self.rollout_steps 49 | 50 | 51 | @ray.remote 52 | class ParameterServer(object): 53 | def __init__(self, keys, values): 54 | # These values will be mutated, so we must create a copy that is not 55 | # backed by the object store. 56 | values = [value.copy() for value in values] 57 | self.weights = dict(zip(keys, values)) 58 | 59 | def push(self, keys, values): 60 | values = [value.copy() for value in values] 61 | for key, value in zip(keys, values): 62 | self.weights[key] = value 63 | 64 | def pull(self, keys): 65 | return [self.weights[key] for key in keys] 66 | 67 | def get_weights(self): 68 | return self.weights 69 | 70 | # save weights to disk 71 | def save_weights(self, name): 72 | with open(name + "weights.pickle", "wb") as pickle_out: 73 | pickle.dump(self.weights, pickle_out) 74 | 75 | 76 | @ray.remote 77 | def worker_rollout(ps, replay_buffer, args): 78 | env = gym.make(args.env) 79 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 80 | total_steps = args.steps_per_epoch * args.epochs 81 | 82 | agent = Model(args) 83 | keys = agent.get_weights()[0] 84 | 85 | weights = ray.get(ps.pull.remote(keys)) 86 | agent.set_weights(keys, weights) 87 | 88 | # Main loop: collect experience in env and update/log each epoch 89 | for t in range(total_steps): 90 | 91 | """ 92 | Until start_steps have elapsed, randomly sample actions 93 | from a uniform distribution for better exploration. Afterwards, 94 | use the learned policy. 95 | """ 96 | if t > args.start_steps: 97 | a = agent.get_action(o) 98 | else: 99 | a = env.action_space.sample() 100 | 101 | # Step the env 102 | o2, r, d, _ = env.step(a) 103 | ep_ret += r 104 | ep_len += 1 105 | 106 | # Ignore the "done" signal if it comes from hitting the time 107 | # horizon (that is, when it's an artificial terminal signal 108 | # that isn't based on the agent's state) 109 | d = False if ep_len == args.max_ep_len else d 110 | 111 | # Store experience to replay buffer 112 | replay_buffer.store.remote(o, a, r, o2, d) 113 | 114 | # Super critical, easy to overlook step: make sure to update 115 | # most recent observation! 116 | o = o2 117 | 118 | if d or (ep_len == args.max_ep_len): 119 | """ 120 | Perform all SAC updates at the end of the trajectory. 121 | This is a slight difference from the SAC specified in the 122 | original paper. 
123 | """ 124 | 125 | # print(ep_len, ep_ret) 126 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 127 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 128 | 129 | weights = ray.get(ps.pull.remote(keys)) 130 | agent.set_weights(keys, weights) 131 | 132 | 133 | @ray.remote(num_gpus=1, max_calls=1) 134 | def worker_train(ps, replay_buffer, args): 135 | agent = Model(args) 136 | keys = agent.get_weights()[0] 137 | 138 | weights = ray.get(ps.pull.remote(keys)) 139 | agent.set_weights(keys, weights) 140 | 141 | cnt = 1 142 | while True: 143 | 144 | agent.train(replay_buffer, args) 145 | 146 | if cnt % 300 == 0: 147 | keys, values = agent.get_weights() 148 | ps.push.remote(keys, values) 149 | 150 | cnt += 1 151 | 152 | 153 | @ray.remote 154 | def worker_test(ps, start_time): 155 | 156 | from spinup.utils.run_utils import setup_logger_kwargs 157 | 158 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 159 | logger = EpochLogger(**logger_kwargs) 160 | config = locals() 161 | del config['ps'] 162 | logger.save_config(config) 163 | 164 | agent = Model(args) 165 | keys = agent.get_weights()[0] 166 | 167 | weights = ray.get(ps.pull.remote(keys)) 168 | agent.set_weights(keys, weights) 169 | test_env = gym.make(args.env) 170 | while True: 171 | ave_ret = agent.test_agent(test_env, args) 172 | # print("test Average Ret:", ave_ret, "time:", time.time()-start_time) 173 | logger.log_tabular('AverageTestEpRet', ave_ret) 174 | logger.log_tabular('Time', time.time() - start_time) 175 | logger.dump_tabular() 176 | weights = ray.get(ps.pull.remote(keys)) 177 | agent.set_weights(keys, weights) 178 | 179 | 180 | if __name__ == '__main__': 181 | import argparse 182 | 183 | parser = argparse.ArgumentParser() 184 | parser.add_argument('--env', type=str, default='LunarLanderContinuous-v2') 185 | parser.add_argument('--hid', type=int, default=300) 186 | parser.add_argument('--l', type=int, default=2) 187 | parser.add_argument('--gamma', type=float, default=0.99) 188 | parser.add_argument('--seed', '-s', type=int, default=0) 189 | parser.add_argument('--epochs', type=int, default=600) 190 | parser.add_argument('--exp_name', type=str, default='dsac_6worker_E1') 191 | args = parser.parse_args() 192 | 193 | # ac_kwargs = dict() 194 | args.seed = 0 195 | args.steps_per_epoch = 5000 196 | args.epochs = 100 197 | args.replay_size = int(1e6) 198 | args.gamma = 0.99, 199 | args.polyak = 0.995 200 | args.lr = 1e-3 201 | args.alpha = 0.2 202 | args.batch_size = 100 203 | args.start_steps = 10000 204 | args.max_ep_len = 1000 205 | args.logger_kwargs = dict() 206 | args.save_freq = 1 207 | args.ac_kwargs = dict(hidden_sizes=[args.hid] * args.l) 208 | 209 | env = gym.make(args.env) 210 | args.obs_dim = env.observation_space.shape[0] 211 | args.act_dim = env.action_space.shape[0] 212 | # Share information about action space with policy architecture 213 | args.ac_kwargs['action_space'] = env.action_space 214 | 215 | args.num_workers = 6 216 | args.num_learners = 1 217 | 218 | ray.init() 219 | 220 | net = Model(args) 221 | all_keys, all_values = net.get_weights() 222 | ps = ParameterServer.remote(all_keys, all_values) 223 | 224 | replay_buffer = ReplayBuffer.remote(args.obs_dim, args.act_dim, args.replay_size) 225 | 226 | start_time = time.time() 227 | 228 | # Start some training tasks. 
229 | task_rollout = [worker_rollout.remote(ps, replay_buffer, args) for i in range(args.num_workers)] 230 | 231 | time.sleep(20) 232 | 233 | task_train = [worker_train.remote(ps, replay_buffer, args) for i in range(args.num_learners)] 234 | 235 | time.sleep(10) 236 | 237 | task_test = worker_test.remote(ps, start_time) 238 | ray.wait(task_rollout) 239 | -------------------------------------------------------------------------------- /tutorial/Parallelize your algorithm by Ray (1).md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Ray是一个实现分布式python程序的通用框架。Ray提供了统一的任务并行和actor抽象,并通过共享内存、零拷贝序列化和分布式调度实现了高性能。 4 | 5 | Ray里面还有用来调超参数的库[Tune](http://ray.readthedocs.io/en/latest/tune.html)和可扩展规模的强化学习库[Rllib](http://ray.readthedocs.io/en/latest/rllib.html)。 6 | 7 | ray的必备知识: 8 | 9 | 1. 使用远程方程(任务) [`ray.remote`] 10 | 2. 通过object IDs获取结果 [`ray.put`, `ray.get`, `ray.wait`] 11 | 3. 使用远程类 (actors) [`ray.remote`] 12 | 13 | 使用Ray,可以使你的代码从单机运行轻松地扩展到大集群上运行。 14 | 15 | 使用该命令安装Ray:`pip install -U ray` 16 | 17 | 18 | 19 | 开始使用ray,导入ray,然后初始化。 20 | 21 | ```python 22 | import ray 23 | 24 | # Start Ray. If you're connecting to an existing cluster, you would use 25 | # ray.init(address=) instead. 26 | ray.init() 27 | ``` 28 | 29 | 30 | 31 | 1. 使用远程方程(任务) [`ray.remote`] 32 | 33 | 将python函数转换为远程函数的标准方法使在函数上面添加一个`@ray.remote`装饰器。下面看一个例子。 34 | 35 | ```python 36 | # A regular Python function. 37 | def regular_function(): 38 | return 1 39 | 40 | # A Ray remote function. 41 | @ray.remote 42 | def remote_function(): 43 | return 1 44 | ``` 45 | 46 | ```python 47 | assert regular_function() == 1 48 | 49 | object_id = remote_function.remote() 50 | 51 | # The value of the original `regular_function` 52 | assert ray.get(object_id) == 1 53 | ``` 54 | 55 | **Parallelism:** Invocations of `regular_function` happen **serially**, for example 56 | 57 | 在调用的时候,普通函数将串行运行。 58 | 59 | ```python 60 | # These happen serially. 61 | for _ in range(4): 62 | regular_function() 63 | ``` 64 | 65 | 66 | 67 | whereas invocations of `remote_function` happen in **parallel**, for example 68 | 69 | 调用远程函数时,程序将并行运行。 70 | 71 | ```python 72 | # These happen in parallel. 73 | for _ in range(4): 74 | remote_function.remote() 75 | ``` 76 | 77 | 78 | 79 | Oftentimes, you may want to specify a task’s resource requirements (for example 80 | one task may require a GPU). The `ray.init()` command will automatically 81 | detect the available GPUs and CPUs on the machine. However, you can override 82 | this default behavior by passing in specific resources, e.g. 83 | 84 | 运行`ray.init()`后,ray将自动检查可用的GPU和CPU。我们也可以给我们传入参数设置特定的资源需求量。 85 | 86 | `ray.init(num_cpus=8, num_gpus=4, resources={'Custom': 2})` 87 | 88 | 远程函数/类也可以设置资源需求量,像这样`@ray.remote(num_cpus=2, num_gpus)` 89 | 90 | 如果没有设置,默认设置为1个CPU。 91 | 92 | If you do not specify any resources in the `@ray.remote` decorator, the 93 | default is 1 CPU resource and no other resources. 94 | 95 | 96 | 97 | 远程函数执行后并不会直接返回结果,而是会立即返回一个object ID。远程函数会在后台并行处理,等执行得到最终结果后,可以通过返回的object ID取得这个结果。 98 | 99 | `ray.put(*value*)`也会返回object ID 100 | 101 | put操作将对象存入object store里,然后返回它的object ID。 102 | 103 | Store an object in the object store. return: The object ID assigned to this value. 
104 | 105 | ```python 106 | y = 1 107 | object_id = ray.put(y) 108 | ``` 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 通过object IDs获取结果 [`ray.put`, `ray.get`, `ray.wait`] 117 | 118 | ray.get(obj_id) 119 | 120 | 从object store获取远程对象或者一个列表的远程对象。 121 | 122 | Get a remote object or a list of remote objects from the object store. 123 | 124 | Then, if the object is a numpy array or a collection of numpy arrays, the `get` call is zero-copy and returns arrays backed by shared object store memory. 125 | Otherwise, we deserialize the object data into a Python object. 126 | 127 | This method blocks until the object corresponding to the object ID is 128 | available in the local object store. 129 | 130 | 需要注意的是,使用get方法时会锁,直到要取得的对象在本地的object store里可用。 131 | 132 | 调用remote操作是异步的,他们会返回object IDs而不是结果。想要得到真的的结果我们需要使用ray.get()。 133 | 134 | 我们之前写的这段语句,实际上results是一个由object IDs组成的列表。 135 | 136 | `results = [do_some_work.remote(x) for x in range(4)]` 137 | 138 | 如果改为下面,ray.get()将通过object ID取得真实的结果。 139 | 140 | `results = [ray.get(do_some_work.remote(x)) for x in range(4)]` 141 | 142 | 但是,这样写会有一个问题。ray.get()会锁进程,这意味着,ray.get()会一直等到do_some_work这个函数执行完返回结果后才执行结束然后进入下一个循环。这样的话,4次调用do_some_work函数就不再是并行运行的了。 143 | 144 | 为了可以并行运算,我们需要在调用完所有的任务后再调用ray.get()。像下面这样。 145 | 146 | `results = ray.get([do_some_work.remote(x) for x in range(4)])` 147 | 148 | 所以,需要小心使用ray.get()。因为它是一个锁进程的操作。如果太频繁调用ray.get(),将会影响并行性能。同时,尽可能的晚使用ray.get()以防止不必要的等待。 149 | 150 | 151 | 152 | Recall that remote operations are asynchronous and they return futures (i.e., object IDs) instead of the results themselves.To get the actual results, we need to use ray.get(), and here the first instinct is to just call ray.get() on the remote operation invocation i.e., replace line “results = [do_some_work.remote(x) for x in range(4)]” with: results = [ray.get(do_some_work.remote(x)) for x in range(4)] 153 | 154 | The observant reader will already have the answer: ray.get() is blocking, so calling it after each remote operation means that we wait for that operation to complete, which essentially means that we execute one operation at a time, hence no parallelism! 155 | 156 | To enable parallelism, we need to call ray.get() *after* invoking all tasks. We can easily do so in our example by replacing line “results = [do_some_work.remote(x) for x in range(4)]” with: 157 | 158 | 159 | 160 | ```python 161 | results = ray.get([do_some_work.remote(x) for x in range(4)]) 162 | ``` 163 | 164 | always keep in mind that ray.get() is a blocking operation, and thus if called eagerly it can hurt the parallelism. Instead, you should try to write your program such that ray.get() is called as late as possible. 165 | 166 | **Tip 1:** ***Delay calling ray.get() as much as possible.*** 167 | 168 | 169 | 170 | 远程类 171 | 172 | 通过远程类,我们可以实现一个共享的参数服务器。 173 | 174 | remote classes (Actors) 175 | 176 | 我们在类的定义上面加上修饰器ray.remote。这个类的实例就会是一个Ray的actor。每一个actor运行在自己的python进程上。 177 | 178 | Actors extend the Ray API from functions (tasks) to classes. The `ray.remote` decorator indicates that instances of the `Counter` class will be actors. An actor is essentially a stateful worker. Each actor runs in its own Python process. 179 | 180 | 181 | 182 | ```python 183 | @ray.remote 184 | class Counter(object): 185 | def __init__(self): 186 | self.value = 0 187 | 188 | def increment(self): 189 | self.value += 1 190 | return self.value 191 | ``` 192 | 193 | You can specify resource requirements in Actors too (see the [Actors section](https://ray.readthedocs.io/en/latest/actors.html) for more details.) 
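To make the actor pattern above concrete, here is a minimal end-to-end sketch. This is not code from this repo: it is a stripped-down version of the `ParameterServer` idea that appears later in this tutorial and in `example/dsac.py`, and the names `ParameterServer` and `worker` are chosen only for illustration. One actor process owns the weights, and any number of remote tasks read and update them through the actor handle:

```python
import numpy as np
import ray

ray.init()


@ray.remote
class ParameterServer(object):
    """A toy parameter server: a single actor process that owns the weights."""

    def __init__(self, weights):
        self.weights = weights

    def push(self, weights):
        self.weights = weights

    def pull(self):
        return self.weights


@ray.remote
def worker(ps, worker_index):
    # The actor handle `ps` is passed in as an ordinary argument.
    for _ in range(3):
        weights = ray.get(ps.pull.remote())  # read the shared state
        ps.push.remote(weights + 1)          # write back updated state
    return worker_index


ps = ParameterServer.remote(np.zeros(10))
ray.get([worker.remote(ps, i) for i in range(4)])
print(ray.get(ps.pull.remote()))  # weights after being mutated by the workers
```

Note that the read-modify-write in `worker` is not atomic (pull and push are two separate calls), so concurrent updates can overwrite each other in this toy example. In the actual training framework this is not an issue, because only the learner pushes weights while the rollout and test workers only pull. Because every `push`/`pull` goes through one actor process, updates are serialized there, which is exactly why, as discussed below, a very large cluster may need to shard parameters across several parameter-server actors.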
194 | 195 | 同样可以给actor设置资源请求量。 196 | 197 | ```python 198 | @ray.remote(num_cpus=2, num_gpus=0.5) 199 | class Actor(object): 200 | pass 201 | ``` 202 | 203 | 204 | 205 | We can interact with the actor by calling its methods with the `.remote` operator. We can then call `ray.get` on the object ID to retrieve the actual value. 206 | 207 | 在调用类的方法时加上`.remote`,然后使用`ray.get`获取实际的值。 208 | 209 | ``` 210 | obj_id = a1.increment.remote() 211 | ray.get(obj_id) == 1 212 | ``` 213 | 214 | Actor handles can be passed into other tasks. To illustrate this with a 215 | simple example, consider a simple actor definition. 216 | 217 | Actor可以作为参数传给别的任务,下面的例子就是实现一个参数服务器。不同的参数就可以公用一个参数服务器了。 218 | 219 | 220 | 221 | ps 222 | 223 | The @ray.remote decorator defines a service. It takes the 224 | `ParameterServer` class and allows it to be instantiated as a remote service or 225 | actor. 226 | 227 | 228 | 229 | **Sharding Across Multiple Parameter Servers:** When your parameters are large and your cluster is large, a single parameter server may not suffice because the application could be bottlenecked by the network bandwidth into and out of the machine that the parameter server is on (especially if there are many workers). 230 | 231 | 当你的参数特别大,而且你的集群也很大,一个parameter server可能就不够了。特别是有很多worker的时候,因为向一个parameter server的数据传输就会成为瓶颈。 232 | 233 | 简单的解决办法就是把参数分散在多个parameter server上。可以通过创建多个actor来实现。 234 | 235 | A natural solution in this case is to shard the parameters across multiple parameter servers. This can be achieved by simply starting up multiple parameter server actors. An example of how to do this is shown in the code example at the bottom. 236 | 237 | 238 | 239 | 为了保证ray并行的性能,远程任务应该花费至少几毫秒的时间。 240 | 241 | 当需要重复向不同远程任务传入相同对象时,可以先用ray.put()把类存入object store,然后传入它的object id。 242 | 243 | **Tip 2:** **For exploiting Ray’s parallelism, remote tasks should take at least several milliseconds.** 244 | 245 | **Tip 3:** ***When passing the same object repeatedly as an argument to a remote operation, use ray.put() to store it once in the object store and then pass its ID.*** 246 | 247 | **Tip 4:** **Use ray.wait() to process results as soon as they become available.** 248 | 249 | 250 | 251 | 252 | 253 | https://rise.cs.berkeley.edu/blog/ray-tips-for-first-time-users/ 254 | 255 | https://ray-project.github.io/2018/07/15/parameter-server-in-fifteen-lines.html -------------------------------------------------------------------------------- /algos/sac1/sac1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | import ray 5 | import gym 6 | 7 | from hyperparams import HyperParameters, Wrapper 8 | from actor_learner import Actor, Learner 9 | 10 | import os 11 | import pickle 12 | import multiprocessing 13 | import copy 14 | import signal 15 | 16 | flags = tf.app.flags 17 | FLAGS = tf.app.flags.FLAGS 18 | 19 | # "Pendulum-v0" 'BipedalWalker-v2' 'LunarLanderContinuous-v2' 20 | flags.DEFINE_string("env_name", "BipedalWalkerHardcore-v2", "game env") 21 | flags.DEFINE_integer("total_epochs", 500, "total_epochs") 22 | flags.DEFINE_integer("num_workers", 1, "number of workers") 23 | flags.DEFINE_integer("num_learners", 1, "number of learners") 24 | flags.DEFINE_string("is_restore", "False", "True or False. True means restore weights from pickle file.") 25 | flags.DEFINE_float("a_l_ratio", 2, "steps / sample_times") 26 | 27 | 28 | @ray.remote 29 | class ReplayBuffer: 30 | """ 31 | A simple FIFO experience replay buffer for SAC agents. 
32 | """ 33 | 34 | def __init__(self, obs_dim, act_dim, size): 35 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 36 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 37 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 38 | self.rews_buf = np.zeros(size, dtype=np.float32) 39 | self.done_buf = np.zeros(size, dtype=np.float32) 40 | self.ptr, self.size, self.max_size = 0, 0, size 41 | self.steps, self.sample_times = 0, 0 42 | 43 | def store(self, obs, act, rew, next_obs, done): 44 | self.obs1_buf[self.ptr] = obs 45 | self.obs2_buf[self.ptr] = next_obs 46 | self.acts_buf[self.ptr] = act 47 | self.rews_buf[self.ptr] = rew 48 | self.done_buf[self.ptr] = done 49 | self.ptr = (self.ptr+1) % self.max_size 50 | self.size = min(self.size+1, self.max_size) 51 | self.steps += 1 52 | 53 | def sample_batch(self, batch_size=128): 54 | idxs = np.random.randint(0, self.size, size=batch_size) 55 | self.sample_times += 1 56 | return dict(obs1=self.obs1_buf[idxs], 57 | obs2=self.obs2_buf[idxs], 58 | acts=self.acts_buf[idxs], 59 | rews=self.rews_buf[idxs], 60 | done=self.done_buf[idxs]) 61 | 62 | def get_counts(self): 63 | return self.sample_times, self.steps, self.size 64 | 65 | 66 | @ray.remote 67 | class ParameterServer(object): 68 | def __init__(self, keys, values, weights_file=""): 69 | # These values will be mutated, so we must create a copy that is not 70 | # backed by the object store. 71 | 72 | if weights_file: 73 | try: 74 | with open(weights_file, "rb") as pickle_in: 75 | self.weights = pickle.load(pickle_in) 76 | print("****** weights restored! ******") 77 | except: 78 | print("------------------------------------------------") 79 | print(weights_file) 80 | print("------ error: weights file doesn't exist! ------") 81 | exit() 82 | else: 83 | values = [value.copy() for value in values] 84 | self.weights = dict(zip(keys, values)) 85 | 86 | def push(self, keys, values): 87 | values = [value.copy() for value in values] 88 | for key, value in zip(keys, values): 89 | self.weights[key] = value 90 | 91 | def pull(self, keys): 92 | return [self.weights[key] for key in keys] 93 | 94 | def get_weights(self): 95 | return self.weights 96 | 97 | # save weights to disk 98 | def save_weights(self, name): 99 | with open(name + "weights.pickle", "wb") as pickle_out: 100 | pickle.dump(self.weights, pickle_out) 101 | 102 | 103 | class Cache(object): 104 | 105 | def __init__(self, replay_buffer): 106 | # cache for training data and model weights 107 | print('os.pid:', os.getpid()) 108 | self.replay_buffer = replay_buffer 109 | self.q1 = multiprocessing.Queue(10) 110 | self.q2 = multiprocessing.Queue(5) 111 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.replay_buffer)) 112 | 113 | def ps_update(self, q1, q2, replay_buffer): 114 | print('os.pid of put_data():', os.getpid()) 115 | 116 | q1.put(copy.deepcopy(ray.get(replay_buffer.sample_batch.remote(opt.batch_size)))) 117 | 118 | while True: 119 | q1.put(copy.deepcopy(ray.get(replay_buffer.sample_batch.remote(opt.batch_size)))) 120 | 121 | if not q2.empty(): 122 | keys, values = q2.get() 123 | ps.push.remote(keys, values) 124 | 125 | def start(self): 126 | self.p1.start() 127 | self.p1.join(10) 128 | 129 | def end(self): 130 | self.p1.terminate() 131 | 132 | 133 | @ray.remote(num_gpus=1, max_calls=1) 134 | def worker_train(ps, replay_buffer, opt, learner_index): 135 | 136 | agent = Learner(opt, job="learner") 137 | keys = agent.get_weights()[0] 138 | weights = ray.get(ps.pull.remote(keys)) 139 | 
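    # Load the pulled weights into the learner graph; note that Learner.set_weights
    # also re-runs target_init, so the target network starts in sync with the main network.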
agent.set_weights(keys, weights) 140 | 141 | cache = Cache(replay_buffer) 142 | 143 | cache.start() 144 | 145 | cnt = 1 146 | while True: 147 | batch = cache.q1.get() 148 | agent.train(batch) 149 | if cnt % 300 == 0: 150 | # print('q1.qsize():', q1.qsize(), 'q2.qsize():', q2.qsize()) 151 | cache.q2.put(agent.get_weights()) 152 | # keys, values = agent.get_weights() 153 | # ps.push.remote(copy.deepcopy(keys), copy.deepcopy(values)) 154 | cnt += 1 155 | 156 | 157 | @ray.remote 158 | def worker_rollout(ps, replay_buffer, opt, worker_index): 159 | 160 | # env = gym.make(opt.env_name) 161 | 162 | env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise, opt.reward_scale, 3) 163 | 164 | agent = Actor(opt, job="worker") 165 | keys = agent.get_weights()[0] 166 | 167 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 168 | 169 | # epochs = opt.total_epochs // opt.num_workers 170 | total_steps = opt.steps_per_epoch * opt.total_epochs 171 | 172 | weights = ray.get(ps.pull.remote(keys)) 173 | agent.set_weights(keys, weights) 174 | 175 | # TODO opt.start_steps 176 | # for t in range(total_steps): 177 | t = 0 178 | while True: 179 | if t > opt.start_steps: 180 | a = agent.get_action(o) 181 | else: 182 | a = env.action_space.sample() 183 | t += 1 184 | # Step the env 185 | o2, r, d, _ = env.step(a) 186 | ep_ret += r 187 | ep_len += 1 188 | 189 | # Ignore the "done" signal if it comes from hitting the time 190 | # horizon (that is, when it's an artificial terminal signal 191 | # that isn't based on the agent's state) 192 | d = False if ep_len == opt.max_ep_len else d 193 | 194 | # Store experience to replay buffer 195 | replay_buffer.store.remote(o, a, r, o2, d) 196 | 197 | # Super critical, easy to overlook step: make sure to update 198 | # most recent observation! 199 | o = o2 200 | 201 | # End of episode. Training (ep_len times). 202 | if d or (ep_len == opt.max_ep_len): 203 | sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote()) 204 | 205 | while sample_times > 0 and steps / sample_times > opt.a_l_ratio: 206 | sample_times, steps, _ = ray.get(replay_buffer.get_counts.remote()) 207 | time.sleep(0.1) 208 | 209 | # update parameters every episode 210 | weights = ray.get(ps.pull.remote(keys)) 211 | agent.set_weights(keys, weights) 212 | 213 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 214 | 215 | 216 | @ray.remote 217 | def worker_test(ps, replay_buffer, opt): 218 | 219 | agent = Actor(opt, job="main") 220 | 221 | keys, weights = agent.get_weights() 222 | 223 | time0 = time1 = time.time() 224 | sample_times1, steps, size = ray.get(replay_buffer.get_counts.remote()) 225 | max_ret = -1000 226 | 227 | env = gym.make(opt.env_name) 228 | 229 | while True: 230 | weights = ray.get(ps.pull.remote(keys)) 231 | agent.set_weights(keys, weights) 232 | 233 | ep_ret = agent.test(env, replay_buffer) 234 | sample_times2, steps, size = ray.get(replay_buffer.get_counts.remote()) 235 | time2 = time.time() 236 | print("test_reward:", ep_ret, "sample_times:", sample_times2, "steps:", steps, "buffer_size:", size) 237 | print('update frequency:', (sample_times2-sample_times1)/(time2-time1), 'total time:', time2 - time0) 238 | 239 | if ep_ret > max_ret: 240 | ps.save_weights.remote() 241 | print("****** weights saved! 
******") 242 | max_ret = ep_ret 243 | 244 | time1 = time2 245 | sample_times1 = sample_times2 246 | 247 | # if steps >= opt.total_epochs * opt.steps_per_epoch: 248 | # exit(0) 249 | # if time2 - time0 > 30: 250 | # exit(0) 251 | 252 | time.sleep(5) 253 | 254 | 255 | if __name__ == '__main__': 256 | 257 | ray.init() 258 | 259 | opt = HyperParameters(FLAGS.env_name, FLAGS.total_epochs, FLAGS.num_workers, FLAGS.a_l_ratio) 260 | 261 | # Create a parameter server with some random weights. 262 | if FLAGS.is_restore == "True": 263 | ps = ParameterServer.remote([], [], is_restore=True) 264 | else: 265 | net = Learner(opt, job="main") 266 | all_keys, all_values = net.get_weights() 267 | ps = ParameterServer.remote(all_keys, all_values) 268 | 269 | replay_buffer = ReplayBuffer.remote(obs_dim=opt.obs_dim, act_dim=opt.act_dim, size=opt.replay_size) 270 | 271 | # Start some training tasks. 272 | task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 273 | 274 | time.sleep(5) 275 | 276 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_learners)] 277 | 278 | task_test = worker_test.remote(ps, replay_buffer, opt) 279 | 280 | ray.wait([task_test, ]) 281 | -------------------------------------------------------------------------------- /algos/sac1/actor_learner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from numbers import Number 8 | import gym 9 | import datetime 10 | import time 11 | import ray 12 | import ray.experimental.tf_utils 13 | 14 | import core 15 | from core import get_vars 16 | from core import mlp_actor_critic as actor_critic 17 | 18 | 19 | class Learner(object): 20 | def __init__(self, opt, job): 21 | self.opt = opt 22 | with tf.Graph().as_default(): 23 | tf.set_random_seed(opt.seed) 24 | np.random.seed(opt.seed) 25 | 26 | # Inputs to computation graph 27 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \ 28 | core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim, None, None) 29 | 30 | # Main outputs from computation graph 31 | with tf.variable_scope('main'): 32 | self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \ 33 | actor_critic(self.x_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"]) 34 | 35 | # Target value network 36 | with tf.variable_scope('target'): 37 | _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = \ 38 | actor_critic(self.x2_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"]) 39 | 40 | # Count variables 41 | var_counts = tuple(core.count_vars(scope) for scope in 42 | ['main/pi', 'main/q1', 'main/q2', 'main']) 43 | print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) 44 | 45 | ###### 46 | if opt.alpha == 'auto': 47 | target_entropy = (-np.prod(opt.action_space.shape)) 48 | 49 | log_alpha = tf.get_variable( 'log_alpha', dtype=tf.float32, initializer=0.0) 50 | alpha = tf.exp(log_alpha) 51 | 52 | alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) 53 | 54 | alpha_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr, name='alpha_optimizer') 55 | train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) 56 | ###### 57 | 58 | # Min Double-Q: 59 | min_q_pi = tf.minimum(q1_pi_, q2_pi_) 60 | 61 | # Targets for Q and V regression 62 
| v_backup = tf.stop_gradient(min_q_pi - opt.alpha * logp_pi2) 63 | q_backup = self.r_ph + opt.gamma*(1-self.d_ph)*v_backup 64 | 65 | # Soft actor-critic losses 66 | pi_loss = tf.reduce_mean(opt.alpha * logp_pi - q1_pi) 67 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) 68 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) 69 | self.value_loss = q1_loss + q2_loss 70 | 71 | # Policy train op 72 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 73 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 74 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 75 | 76 | # Value train op 77 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 78 | value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 79 | value_params = get_vars('main/q') 80 | with tf.control_dependencies([train_pi_op]): 81 | train_value_op = value_optimizer.minimize(self.value_loss, var_list=value_params) 82 | 83 | # Polyak averaging for target variables 84 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 85 | with tf.control_dependencies([train_value_op]): 86 | self.target_update = tf.group([tf.assign(v_targ, opt.polyak*v_targ + (1-opt.polyak)*v_main) 87 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 88 | 89 | # TODO 90 | # self.grads = self.optimizer.compute_gradients(self.cross_entropy) 91 | # self.grads_placeholder = [(tf.placeholder( 92 | # "float", shape=grad[1].get_shape()), grad[1]) 93 | # for grad in self.grads] 94 | 95 | # All ops to call during one training step 96 | if isinstance(opt.alpha, Number): 97 | self.step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(opt.alpha), 98 | train_pi_op, train_value_op, self.target_update] 99 | else: 100 | self.step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, opt.alpha, 101 | train_pi_op, train_value_op, self.target_update, train_alpha_op] 102 | 103 | # Initializing targets to match main variables 104 | self.target_init = tf.group([tf.assign(v_targ, v_main) 105 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 106 | 107 | if job == "learner": 108 | config = tf.ConfigProto() 109 | config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 110 | config.inter_op_parallelism_threads = 1 111 | config.intra_op_parallelism_threads = 1 112 | self.sess = tf.Session(config=config) 113 | else: 114 | self.sess = tf.Session( 115 | config=tf.ConfigProto( 116 | device_count={'GPU': 0}, 117 | intra_op_parallelism_threads=1, 118 | inter_op_parallelism_threads=1)) 119 | 120 | self.sess.run(tf.global_variables_initializer()) 121 | 122 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 123 | self.value_loss, self.sess) 124 | 125 | def set_weights(self, variable_names, weights): 126 | self.variables.set_weights(dict(zip(variable_names, weights))) 127 | self.sess.run(self.target_init) 128 | 129 | def get_weights(self): 130 | weights = self.variables.get_weights() 131 | keys = [key for key in list(weights.keys()) if "main" in key] 132 | values = [weights[key] for key in keys] 133 | return keys, values 134 | 135 | def train(self, batch): 136 | feed_dict = {self.x_ph: batch['obs1'], 137 | self.x2_ph: batch['obs2'], 138 | self.a_ph: batch['acts'], 139 | self.r_ph: batch['rews'], 140 | self.d_ph: batch['done'], 141 | } 142 | self.sess.run(self.step_ops, feed_dict) 143 | 144 | def compute_gradients(self, x, y): 145 | pass 146 | 147 | def apply_gradients(self, gradients): 
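        # This project exchanges full network weights through the ParameterServer rather
        # than gradients, so compute_gradients/apply_gradients are left as empty stubs.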
148 | pass 149 | 150 | 151 | class Actor(object): 152 | def __init__(self, opt, job): 153 | self.opt = opt 154 | with tf.Graph().as_default(): 155 | tf.set_random_seed(opt.seed) 156 | np.random.seed(opt.seed) 157 | 158 | # Inputs to computation graph 159 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \ 160 | core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim, None, None) 161 | 162 | # Main outputs from computation graph 163 | with tf.variable_scope('main'): 164 | self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \ 165 | actor_critic(self.x_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"]) 166 | 167 | # Set up summary Ops 168 | self.test_ops, self.test_vars = self.build_summaries() 169 | 170 | self.sess = tf.Session( 171 | config=tf.ConfigProto( 172 | device_count={'GPU': 0}, 173 | intra_op_parallelism_threads=1, 174 | inter_op_parallelism_threads=1)) 175 | 176 | self.sess.run(tf.global_variables_initializer()) 177 | 178 | if job == "main": 179 | self.writer = tf.summary.FileWriter( 180 | opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-workers_num:" + str( 181 | opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 182 | 183 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 184 | self.pi, self.sess) 185 | 186 | def set_weights(self, variable_names, weights): 187 | self.variables.set_weights(dict(zip(variable_names, weights))) 188 | 189 | def get_weights(self): 190 | weights = self.variables.get_weights() 191 | keys = [key for key in list(weights.keys()) if "main" in key] 192 | values = [weights[key] for key in keys] 193 | return keys, values 194 | 195 | def get_action(self, o, deterministic=False): 196 | act_op = self.mu if deterministic else self.pi 197 | return self.sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0] 198 | 199 | def test(self, test_env, replay_buffer, n=25): 200 | 201 | rew = [] 202 | for j in range(n): 203 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 204 | while not(d or (ep_len == self.opt.max_ep_len)): 205 | # Take deterministic actions at test time 206 | o, r, d, _ = test_env.step(self.get_action(o, True)) 207 | ep_ret += r 208 | ep_len += 1 209 | rew.append(ep_ret) 210 | 211 | sample_times, _, _ = ray.get(replay_buffer.get_counts.remote()) 212 | summary_str = self.sess.run(self.test_ops, feed_dict={ 213 | self.test_vars[0]: sum(rew)/25 214 | }) 215 | 216 | self.writer.add_summary(summary_str, sample_times) 217 | self.writer.flush() 218 | return sum(rew)/n 219 | 220 | # Tensorflow Summary Ops 221 | def build_summaries(self): 222 | test_summaries = [] 223 | episode_reward = tf.Variable(0.) 
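        # episode_reward is only a feed slot for TensorBoard: test() feeds the averaged
        # test return into it and writes the merged summary via self.test_ops/self.test_vars.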
224 | test_summaries.append(tf.summary.scalar("Reward", episode_reward)) 225 | 226 | test_ops = tf.summary.merge(test_summaries) 227 | test_vars = [episode_reward] 228 | 229 | return test_ops, test_vars 230 | -------------------------------------------------------------------------------- /algos/dqn/actor_learner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from numbers import Number 8 | import pickle 9 | 10 | import time 11 | import datetime 12 | import ray 13 | import ray.experimental.tf_utils 14 | 15 | import core 16 | from core import get_vars 17 | 18 | 19 | class Learner(object): 20 | def __init__(self, opt, job): 21 | self.opt = opt 22 | with tf.Graph().as_default(): 23 | tf.set_random_seed(opt.seed) 24 | np.random.seed(opt.seed) 25 | 26 | # Inputs to computation graph 27 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None) 28 | 29 | # Main outputs from computation graph 30 | with tf.variable_scope('main'): 31 | self.q, self.q_x2 = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim) 32 | 33 | # Target value network 34 | with tf.variable_scope('target'): 35 | self.q_next, _ = core.q_function(self.x2_ph, self.x2_ph, opt.hidden_size, opt.act_dim) 36 | 37 | # Count variables 38 | var_counts = tuple(core.count_vars(scope) for scope in ['main']) 39 | print('\nNumber of parameters: total: %d\n' % var_counts) 40 | 41 | a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim) 42 | q_value = tf.reduce_sum(self.q * a_one_hot, axis=1) 43 | 44 | # DDQN 45 | online_q_x2_a_one_hot = tf.one_hot(tf.argmax(self.q_x2, axis=1), depth=opt.act_dim) 46 | q_target = tf.reduce_sum(self.q_next * online_q_x2_a_one_hot, axis=1) 47 | 48 | # DQN 49 | # q_target = tf.reduce_max(self.q_next, axis=1) 50 | 51 | # Bellman backup for Q functions, using Clipped Double-Q targets 52 | q_backup = tf.stop_gradient(self.r_ph + opt.gamma * (1 - self.d_ph) * q_target) 53 | 54 | # q losses 55 | q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2) 56 | 57 | # Value train op 58 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 59 | value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 60 | value_params = get_vars('main/q') 61 | train_value_op = value_optimizer.minimize(q_loss, var_list=value_params) 62 | 63 | # Polyak averaging for target variables 64 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 65 | with tf.control_dependencies([train_value_op]): 66 | target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) 67 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 68 | 69 | # All ops to call during one training step 70 | self.step_ops = [q_loss, self.q, train_value_op, target_update] 71 | 72 | # Initializing targets to match main variables 73 | self.target_init = tf.group([tf.assign(v_targ, v_main) 74 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 75 | 76 | if job == "learner": 77 | config = tf.ConfigProto() 78 | config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 79 | config.inter_op_parallelism_threads = 1 80 | config.intra_op_parallelism_threads = 1 81 | self.sess = tf.Session(config=config) 82 | else: 83 | self.sess = tf.Session( 
84 | config=tf.ConfigProto( 85 | # device_count={'GPU': 0}, 86 | intra_op_parallelism_threads=1, 87 | inter_op_parallelism_threads=1)) 88 | 89 | self.sess.run(tf.global_variables_initializer()) 90 | 91 | if job == "learner": 92 | # Set up summary Ops 93 | self.train_ops, self.train_vars = self.build_summaries() 94 | self.writer = tf.summary.FileWriter( 95 | opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + 96 | opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 97 | 98 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 99 | q_loss, self.sess) 100 | 101 | def set_weights(self, variable_names, weights): 102 | self.variables.set_weights(dict(zip(variable_names, weights))) 103 | self.sess.run(self.target_init) 104 | 105 | def get_weights(self): 106 | weights = self.variables.get_weights() 107 | keys = [key for key in list(weights.keys()) if "main" in key] 108 | values = [weights[key] for key in keys] 109 | return keys, values 110 | 111 | def train(self, batch, cnt): 112 | 113 | feed_dict = {self.x_ph: batch['obs1'], 114 | self.x2_ph: batch['obs2'], 115 | self.a_ph: batch['acts'], 116 | self.r_ph: batch['rews'], 117 | self.d_ph: batch['done'], 118 | } 119 | 120 | outs = self.sess.run(self.step_ops, feed_dict) 121 | if cnt % 500 == 0: 122 | summary_str = self.sess.run(self.train_ops, feed_dict={ 123 | self.train_vars[0]: outs[0], 124 | self.train_vars[1]: np.mean(outs[1]) 125 | }) 126 | 127 | self.writer.add_summary(summary_str, cnt) 128 | self.writer.flush() 129 | 130 | def compute_gradients(self, x, y): 131 | pass 132 | 133 | def apply_gradients(self, gradients): 134 | pass 135 | 136 | # Tensorflow Summary Ops 137 | def build_summaries(self): 138 | train_summaries = [] 139 | LossQ = tf.Variable(0.) 140 | train_summaries.append(tf.summary.scalar("LossQ", LossQ)) 141 | QVals = tf.Variable(0.) 
142 | train_summaries.append(tf.summary.scalar("QVals", QVals)) 143 | train_ops = tf.summary.merge(train_summaries) 144 | train_vars = [LossQ, QVals] 145 | 146 | return train_ops, train_vars 147 | 148 | 149 | class Actor(object): 150 | def __init__(self, opt, job): 151 | self.opt = opt 152 | with tf.Graph().as_default(): 153 | tf.set_random_seed(opt.seed) 154 | np.random.seed(opt.seed) 155 | 156 | # Inputs to computation graph 157 | self.x_ph, self.a_ph, self.x2_ph, = core.placeholders(opt.obs_dim, None, opt.obs_dim) 158 | 159 | # Main outputs from computation graph 160 | with tf.variable_scope('main'): 161 | self.q, _ = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim) 162 | 163 | # Set up summary Ops 164 | self.test_ops, self.test_vars = self.build_summaries() 165 | 166 | self.sess = tf.Session( 167 | config=tf.ConfigProto( 168 | device_count={'GPU': 0}, 169 | intra_op_parallelism_threads=1, 170 | inter_op_parallelism_threads=1)) 171 | 172 | self.sess.run(tf.global_variables_initializer()) 173 | 174 | if job == "test": 175 | self.writer = tf.summary.FileWriter( 176 | opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-" + opt.exp_name + 177 | "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 178 | 179 | variables_all = tf.contrib.framework.get_variables_to_restore() 180 | variables_bn = [v for v in variables_all if 'moving_mean' in v.name or 'moving_variance' in v.name] 181 | 182 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 183 | self.q, self.sess, input_variables=variables_bn) 184 | 185 | def set_weights(self, variable_names, weights): 186 | self.variables.set_weights(dict(zip(variable_names, weights))) 187 | 188 | def get_weights(self): 189 | weights = self.variables.get_weights() 190 | keys = [key for key in list(weights.keys()) if "main" in key] 191 | values = [weights[key] for key in keys] 192 | return keys, values 193 | 194 | def get_action(self, o): 195 | if np.random.uniform() < 0.97: 196 | o = o[np.newaxis, :] 197 | actions_value = self.sess.run(self.q, feed_dict={self.x_ph: o}) 198 | action = np.argmax(actions_value) 199 | else: 200 | action = np.random.randint(0, self.opt.act_dim) 201 | return action 202 | 203 | # Tensorflow Summary Ops 204 | def build_summaries(self): 205 | test_summaries = [] 206 | episode_reward = tf.Variable(0.) 207 | episode_score = tf.Variable(0.) 208 | a_l_ratio = tf.Variable(0.) 209 | update_frequency = tf.Variable(0.) 
210 | test_summaries.append(tf.summary.scalar("Reward", episode_reward)) 211 | test_summaries.append(tf.summary.scalar("score", episode_score)) 212 | test_summaries.append(tf.summary.scalar("a_l_ratio", a_l_ratio)) 213 | test_summaries.append(tf.summary.scalar("update_frequency", update_frequency)) 214 | test_ops = tf.summary.merge(test_summaries) 215 | test_vars = [episode_reward, episode_score, a_l_ratio, update_frequency] 216 | 217 | return test_ops, test_vars 218 | 219 | def write_tb(self, ave_test_reward, ave_score, alratio, update_frequency, total_learner_step): 220 | summary_str = self.sess.run(self.test_ops, feed_dict={ 221 | self.test_vars[0]: ave_test_reward, 222 | self.test_vars[1]: ave_score, 223 | self.test_vars[2]: alratio, 224 | self.test_vars[3]: update_frequency 225 | }) 226 | 227 | self.writer.add_summary(summary_str, total_learner_step) 228 | self.writer.flush() 229 | 230 | def test(self, test_env, n=10): 231 | 232 | test_rets = [] 233 | scores = [] 234 | 235 | for _ in range(n): 236 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 237 | 238 | while True: 239 | a = self.get_action(o) 240 | # Step the env 241 | o, r, d, _ = test_env.step(a) 242 | 243 | ep_ret += r 244 | ep_len += 1 245 | 246 | if d: 247 | test_rets.append(ep_ret) 248 | scores.append(test_env.rewards[0]) 249 | # print('test_ep_len:', ep_len, 'test_ep_ret:', ep_ret) 250 | break 251 | return np.mean(test_rets), np.mean(scores) 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /algos/sqn/actor_learner.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from numbers import Number 8 | import pickle 9 | 10 | import time 11 | import datetime 12 | import ray 13 | import ray.experimental.tf_utils 14 | 15 | import core 16 | from core import get_vars 17 | 18 | 19 | class Learner(object): 20 | def __init__(self, opt, job): 21 | self.opt = opt 22 | with tf.Graph().as_default(): 23 | tf.set_random_seed(opt.seed) 24 | np.random.seed(opt.seed) 25 | 26 | # Inputs to computation graph 27 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None) 28 | 29 | # Main outputs from computation graph 30 | with tf.variable_scope('main'): 31 | self.mu, self.pi, entropy_x2, q1, q2, q1_mu, q2_mu = core.q_function(self.x_ph, self.x2_ph, opt.alpha, 32 | opt.hidden_size, opt.act_dim) 33 | 34 | # Target value network 35 | with tf.variable_scope('target'): 36 | mu_, pi_, entropy_x2_, q1_, q2_, q1_mu_, q2_mu_ = core.q_function(self.x2_ph, self.x2_ph, opt.alpha, 37 | opt.hidden_size, opt.act_dim) 38 | 39 | # Count variables 40 | var_counts = tuple(core.count_vars(scope) for scope in ['main']) 41 | print('\nNumber of parameters: total: %d\n' % var_counts) 42 | 43 | a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim) 44 | q1_a = tf.reduce_sum(q1 * a_one_hot, axis=1) 45 | q2_a = tf.reduce_sum(q2 * a_one_hot, axis=1) 46 | 47 | # Min Double-Q: 48 | min_q_target = tf.minimum(q1_mu_, q2_mu_) 49 | 50 | # Bellman backup for Q functions 51 | v_backup = tf.stop_gradient(min_q_target - opt.alpha * entropy_x2) 52 | q_backup = self.r_ph + opt.gamma * (1 - self.d_ph) * v_backup 53 | 54 | # q losses 55 | # q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2) 56 | q1_loss = 0.5 * 
tf.reduce_mean((q_backup - q1_a) ** 2) 57 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a) ** 2) 58 | q_loss = q1_loss + q2_loss 59 | 60 | # Value train op 61 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 62 | value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) 63 | value_params = get_vars('main/q') 64 | train_value_op = value_optimizer.minimize(q_loss, var_list=value_params) 65 | 66 | # Polyak averaging for target variables 67 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 68 | with tf.control_dependencies([train_value_op]): 69 | target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) 70 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 71 | 72 | # All ops to call during one training step 73 | self.step_ops = [q_loss, q1, q2, train_value_op, target_update] 74 | 75 | # Initializing targets to match main variables 76 | self.target_init = tf.group([tf.assign(v_targ, v_main) 77 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 78 | 79 | if job == "learner": 80 | config = tf.ConfigProto() 81 | config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 82 | config.inter_op_parallelism_threads = 1 83 | config.intra_op_parallelism_threads = 1 84 | self.sess = tf.Session(config=config) 85 | else: 86 | self.sess = tf.Session( 87 | config=tf.ConfigProto( 88 | # device_count={'GPU': 0}, 89 | intra_op_parallelism_threads=1, 90 | inter_op_parallelism_threads=1)) 91 | 92 | self.sess.run(tf.global_variables_initializer()) 93 | 94 | if job == "learner": 95 | # Set up summary Ops 96 | self.train_ops, self.train_vars = self.build_summaries() 97 | self.writer = tf.summary.FileWriter( 98 | opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + 99 | opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 100 | 101 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 102 | q_loss, self.sess) 103 | 104 | def set_weights(self, variable_names, weights): 105 | self.variables.set_weights(dict(zip(variable_names, weights))) 106 | self.sess.run(self.target_init) 107 | 108 | def get_weights(self): 109 | weights = self.variables.get_weights() 110 | keys = [key for key in list(weights.keys()) if "main" in key] 111 | values = [weights[key] for key in keys] 112 | return keys, values 113 | 114 | def train(self, batch, cnt): 115 | 116 | feed_dict = {self.x_ph: batch['obs1'], 117 | self.x2_ph: batch['obs2'], 118 | self.a_ph: batch['acts'], 119 | self.r_ph: batch['rews'], 120 | self.d_ph: batch['done'], 121 | } 122 | 123 | outs = self.sess.run(self.step_ops, feed_dict) 124 | if cnt % 500 == 0: 125 | summary_str = self.sess.run(self.train_ops, feed_dict={ 126 | self.train_vars[0]: outs[0], 127 | self.train_vars[1]: np.mean(outs[1]) 128 | }) 129 | 130 | self.writer.add_summary(summary_str, cnt) 131 | self.writer.flush() 132 | 133 | def compute_gradients(self, x, y): 134 | pass 135 | 136 | def apply_gradients(self, gradients): 137 | pass 138 | 139 | # Tensorflow Summary Ops 140 | def build_summaries(self): 141 | train_summaries = [] 142 | LossQ = tf.Variable(0.) 143 | train_summaries.append(tf.summary.scalar("LossQ", LossQ)) 144 | QVals = tf.Variable(0.) 
145 | train_summaries.append(tf.summary.scalar("QVals", QVals)) 146 | train_ops = tf.summary.merge(train_summaries) 147 | train_vars = [LossQ, QVals] 148 | 149 | return train_ops, train_vars 150 | 151 | 152 | class Actor(object): 153 | def __init__(self, opt, job): 154 | self.opt = opt 155 | with tf.Graph().as_default(): 156 | tf.set_random_seed(opt.seed) 157 | np.random.seed(opt.seed) 158 | 159 | # Inputs to computation graph 160 | self.x_ph, self.a_ph, self.x2_ph, = core.placeholders(opt.obs_dim, None, opt.obs_dim) 161 | 162 | # Main outputs from computation graph 163 | with tf.variable_scope('main'): 164 | self.mu, self.pi, entropy_x2, q1, q2, q1_mu, q2_mu = core.q_function(self.x_ph, self.x2_ph, opt.alpha, 165 | opt.hidden_size, opt.act_dim) 166 | 167 | # Set up summary Ops 168 | self.test_ops, self.test_vars = self.build_summaries() 169 | 170 | self.sess = tf.Session( 171 | config=tf.ConfigProto( 172 | device_count={'GPU': 0}, 173 | intra_op_parallelism_threads=1, 174 | inter_op_parallelism_threads=1)) 175 | 176 | self.sess.run(tf.global_variables_initializer()) 177 | 178 | if job == "test": 179 | self.writer = tf.summary.FileWriter( 180 | opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-" + opt.exp_name + 181 | "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) 182 | 183 | variables_all = tf.contrib.framework.get_variables_to_restore() 184 | variables_bn = [v for v in variables_all if 'moving_mean' in v.name or 'moving_variance' in v.name] 185 | 186 | self.variables = ray.experimental.tf_utils.TensorFlowVariables( 187 | self.mu, self.sess, input_variables=variables_bn) 188 | 189 | def set_weights(self, variable_names, weights): 190 | self.variables.set_weights(dict(zip(variable_names, weights))) 191 | 192 | def get_weights(self): 193 | weights = self.variables.get_weights() 194 | keys = [key for key in list(weights.keys()) if "main" in key] 195 | values = [weights[key] for key in keys] 196 | return keys, values 197 | 198 | def get_action(self, o, deterministic=False): 199 | act_op = self.mu if deterministic else self.pi 200 | return self.sess.run(act_op, feed_dict={self.x_ph: np.expand_dims(o, axis=0)})[0] 201 | 202 | # Tensorflow Summary Ops 203 | def build_summaries(self): 204 | test_summaries = [] 205 | episode_reward = tf.Variable(0.) 206 | episode_score = tf.Variable(0.) 207 | a_l_ratio = tf.Variable(0.) 208 | update_frequency = tf.Variable(0.) 
209 | test_summaries.append(tf.summary.scalar("Reward", episode_reward)) 210 | test_summaries.append(tf.summary.scalar("score", episode_score)) 211 | test_summaries.append(tf.summary.scalar("a_l_ratio", a_l_ratio)) 212 | test_summaries.append(tf.summary.scalar("update_frequency", update_frequency)) 213 | test_ops = tf.summary.merge(test_summaries) 214 | test_vars = [episode_reward, episode_score, a_l_ratio, update_frequency] 215 | 216 | return test_ops, test_vars 217 | 218 | def write_tb(self, ave_test_reward, ave_score, alratio, update_frequency, total_learner_step): 219 | summary_str = self.sess.run(self.test_ops, feed_dict={ 220 | self.test_vars[0]: ave_test_reward, 221 | self.test_vars[1]: ave_score, 222 | self.test_vars[2]: alratio, 223 | self.test_vars[3]: update_frequency 224 | }) 225 | 226 | self.writer.add_summary(summary_str, total_learner_step) 227 | self.writer.flush() 228 | 229 | def test(self, test_env, n=10): 230 | 231 | test_rets = [] 232 | scores = [] 233 | 234 | for _ in range(n): 235 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 236 | 237 | while True: 238 | a = self.get_action(o, deterministic=True) 239 | # Step the env 240 | o, r, d, _ = test_env.step(a) 241 | 242 | ep_ret += r 243 | ep_len += 1 244 | 245 | if d: 246 | test_rets.append(ep_ret) 247 | scores.append(test_env.rewards[0]) 248 | # print('test_ep_len:', ep_len, 'test_ep_ret:', ep_ret) 249 | break 250 | return np.mean(test_rets), np.mean(scores) 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /tutorial/Parallelize your algorithm by Ray (3).md: -------------------------------------------------------------------------------- 1 | # 使用Ray并行化你的强化学习算法(三) 2 | 3 | ## SAC并行版本实现 4 | 5 | 这一章,我们将上节分好的各部分代码放入并行框架中。 6 | 7 | 我们的并行框架结构图(内容仅涉及到白色线条部分): 8 | 9 | ![ddrlframework](.\Pictures\ddrlframework.jpg) 10 | 11 | 下面是用ray实现的框架。 12 | 13 | ```python 14 | @ray.remote 15 | class ReplayBuffer: 16 | ... 17 | # replay buffer 18 | 19 | @ray.remote 20 | class ParameterServer(object): 21 | ... 22 | # keep the newest network weights here 23 | # could pull and push the weights 24 | # also could save the weights to local 25 | 26 | @ray.remote 27 | def worker_rollout(ps, replay_buffer, opt, worker_index): 28 | ... 29 | # bulid a rollout network 30 | # pull weights from ps 31 | # for loop: 32 | # interactive with environment 33 | # store experience to replay buffer 34 | # if end of episode: 35 | # pull weights from ps 36 | 37 | @ray.remote(num_gpus=1, max_calls=1) 38 | def worker_train(ps, replay_buffer, opt, learner_index): 39 | ... 40 | # build a learner network 41 | # pull weights from ps 42 | # for loop: 43 | # get sample batch from replaybuffer 44 | # update network and push new weights to ps 45 | 46 | @ray.remote 47 | def worker_test(ps, replay_buffer, opt, worker_index=0): 48 | ... 
49 | # bulid a test network usually same as rollout 50 | # while: 51 | # pull weights from ps 52 | # do test 53 | # might save model here 54 | 55 | if __name__ == '__main__': 56 | 57 | ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 58 | 59 | # create the parameter server 60 | ps = ParameterServer.remote([], [], is_restore=True) 61 | 62 | # create replay buffer 63 | replay_buffer = ReplayBuffer.remote(obs_dim=opt.obs_dim, act_dim=opt.act_dim, size=opt.replay_size) 64 | 65 | # Start some rollout tasks 66 | task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 67 | 68 | time.sleep(5) 69 | 70 | # start training tasks 71 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_learners)] 72 | 73 | # start testing 74 | task_test = worker_test.remote(ps, replay_buffer, opt) 75 | 76 | # wait util task test end 77 | # Keep the main process running. Otherwise everything will shut down when main process finished. 78 | ray.wait([task_test, ]) 79 | ``` 80 | 81 | --- 82 | 83 | 0. model 84 | 85 | 我们先看算法的核心部分:model,包含了TensorFlow建图,计算loss,训练和测试。 86 | 87 | 新建一个的文件,将之前model部分,训练部分和测试部分的代码都放入Model类中去。之后我们建立一个实例后,就可以调用方法生成动作,训练更新参数,测试评估参数。 88 | 89 | ```python 90 | class Model(object): 91 | 92 | def __init__(self, args): 93 | # model part code 94 | def get_action(self, o, deterministic=False): 95 | # get_action method 96 | def train(self, replay_buffer, args): 97 | # train part code 98 | def test_agent(self, test_env, args, n=10): 99 | # test method copy 100 | 101 | ``` 102 | 103 | --- 104 | 105 | 将代码放入对应位置。 106 | 107 | ```python 108 | import numpy as np 109 | import tensorflow as tf 110 | import gym 111 | import time 112 | from spinup.algos.sac import core 113 | from spinup.algos.sac.core import get_vars 114 | from spinup.utils.logx import EpochLogger 115 | from core import mlp_actor_critic as actor_critic 116 | import ray.experimental.tf_utils 117 | 118 | 119 | class Model(object): 120 | 121 | def __init__(self, args): 122 | 123 | # Inputs to computation graph 124 | 125 | 126 | def get_action(self, o, deterministic=False): 127 | act_op = mu if deterministic else pi 128 | return sess.run(act_op, feed_dict={self.x_ph: o.reshape(1, -1)})[0] 129 | 130 | def train(self, replay_buffer, args): 131 | 132 | for j in range(args.ep_len): 133 | batch = replay_buffer.sample_batch(args.batch_size) 134 | feed_dict = {self.x_ph: batch['obs1'], 135 | self.x2_ph: batch['obs2'], 136 | self.a_ph: batch['acts'], 137 | self.r_ph: batch['rews'], 138 | self.d_ph: batch['done'], 139 | } 140 | outs = sess.run(self.step_ops, feed_dict) 141 | # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 142 | # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 143 | # VVals=outs[6], LogPi=outs[7]) 144 | 145 | def test_agent(self, test_env, args, n=10): 146 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 147 | for j in range(n): 148 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 149 | while not (d or (ep_len == args.max_ep_len)): 150 | # Take deterministic actions at test time 151 | o, r, d, _ = test_env.step(self.get_action(o, True)) 152 | ep_ret += r 153 | ep_len += 1 154 | print(ep_len, ep_ret) 155 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 156 | 157 | ``` 158 | 159 | --- 160 | 161 | 之外,我们还需要额外添加几个有用的方法。learner不断更新权重,需要把最新的权重导出到ps server上去。rollout需要不断从ps上下载最新权重并更换为自己的权重。 162 | 163 | ray中已经有写好的类。方便我们导入和导出权重。 164 | 165 | ```python 166 | def __init__(self, args): 167 | 168 | ... 
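        # TensorFlowVariables gathers the variables that value_loss depends on and exposes
        # get_weights()/set_weights() keyed by variable name -- exactly the dictionary that
        # the parameter server stores, pushes and pulls.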
169 | 170 | self.variables = ray.experimental.tf_utils.TensorFlowVariables(self.value_loss, self.sess) 171 | ``` 172 | 173 | 目标函数的权重在导入权重以后做初始化才有意义,所以把它放在更新权重方法里。 174 | 175 | ```python 176 | def set_weights(self, variable_names, weights): 177 | self.variables.set_weights(dict(zip(variable_names, weights))) 178 | self.sess.run(self.target_init) 179 | 180 | def get_weights(self): 181 | weights = self.variables.get_weights() 182 | keys = [key for key in list(weights.keys()) if "main" in key] 183 | values = [weights[key] for key in keys] 184 | return keys, values 185 | ``` 186 | 187 | --- 188 | 189 | 1. Replay Buffer,只要在上面加上ray的修饰器就行了。 190 | 191 | ```python 192 | @ray.remote 193 | class ReplayBuffer: 194 | ... 195 | # replay buffer 196 | ``` 197 | 198 | --- 199 | 200 | 2. Parameter Server 201 | 202 | 参数保存在字典里面。Parameter Server的主要功能就是给worker返回最新的权重,接收learner传来的最新的权重。 203 | 204 | ```python 205 | @ray.remote 206 | class ParameterServer(object): 207 | def __init__(self, keys, values): 208 | # These values will be mutated, so we must create a copy that is not 209 | # backed by the object store. 210 | values = [value.copy() for value in values] 211 | self.weights = dict(zip(keys, values)) 212 | 213 | def push(self, keys, values): 214 | values = [value.copy() for value in values] 215 | for key, value in zip(keys, values): 216 | self.weights[key] = value 217 | 218 | def pull(self, keys): 219 | return [self.weights[key] for key in keys] 220 | 221 | def get_weights(self): 222 | return self.weights 223 | 224 | # save weights to disk 225 | def save_weights(self, name): 226 | with open(name + "weights.pickle", "wb") as pickle_out: 227 | pickle.dump(self.weights, pickle_out) 228 | ``` 229 | 230 | --- 231 | 232 | 3. rollout 233 | 234 | rollout (worker) 与环境交互,产生数据并存入Replay Buffer。每个episode结束会从Parameter Server得到最新权重来更新自己。 235 | 236 | ```python 237 | @ray.remote 238 | def worker_rollout(ps, replay_buffer, args): 239 | env = gym.make(args.env) 240 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 241 | total_steps = args.steps_per_epoch * args.epochs 242 | 243 | agent = Model(args) 244 | keys = agent.get_weights()[0] 245 | 246 | weights = ray.get(ps.pull.remote(keys)) 247 | agent.set_weights(keys, weights) 248 | 249 | # Main loop: collect experience in env and update/log each epoch 250 | for t in range(total_steps): 251 | 252 | """ 253 | Until start_steps have elapsed, randomly sample actions 254 | from a uniform distribution for better exploration. Afterwards, 255 | use the learned policy. 256 | """ 257 | if t > args.start_steps: 258 | a = agent.get_action(o) 259 | else: 260 | a = env.action_space.sample() 261 | 262 | # Step the env 263 | o2, r, d, _ = env.step(a) 264 | ep_ret += r 265 | ep_len += 1 266 | 267 | # Ignore the "done" signal if it comes from hitting the time 268 | # horizon (that is, when it's an artificial terminal signal 269 | # that isn't based on the agent's state) 270 | d = False if ep_len == args.max_ep_len else d 271 | 272 | # Store experience to replay buffer 273 | replay_buffer.store.remote(o, a, r, o2, d) 274 | 275 | # Super critical, easy to overlook step: make sure to update 276 | # most recent observation! 277 | o = o2 278 | 279 | if d or (ep_len == args.max_ep_len): 280 | """ 281 | Perform all SAC updates at the end of the trajectory. 282 | This is a slight difference from the SAC specified in the 283 | original paper. 
284 | """ 285 | 286 | # print(ep_len, ep_ret) 287 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 288 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 289 | 290 | weights = ray.get(ps.pull.remote(keys)) 291 | agent.set_weights(keys, weights) 292 | ``` 293 | 294 | --- 295 | 296 | 4. train 297 | 298 | 我们使用一个GPU进行训练。所有在ray修饰器里我们设置资源请求量。 299 | 300 | 当使用GPU执行任务时,任务会在GPU上分配内存,而且有可能在执行结束后不释放。在设置中写入`max_calls=1`可以让任务运行结束后自动退出并释放GPU内存。 301 | 302 | ```python 303 | @ray.remote(num_gpus=1, max_calls=1) 304 | def worker_train(ps, replay_buffer, args): 305 | agent = Model(args) 306 | keys = agent.get_weights()[0] 307 | 308 | weights = ray.get(ps.pull.remote(keys)) 309 | agent.set_weights(keys, weights) 310 | 311 | cnt = 1 312 | while True: 313 | 314 | agent.train(replay_buffer, args) 315 | 316 | if cnt % 300 == 0: 317 | keys, values = agent.get_weights() 318 | ps.push.remote(keys, values) 319 | 320 | cnt += 1 321 | ``` 322 | 323 | --- 324 | 325 | 5. test 326 | 327 | ```python 328 | @ray.remote 329 | def worker_test(ps, start_time): 330 | 331 | from spinup.utils.run_utils import setup_logger_kwargs 332 | 333 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 334 | logger = EpochLogger(**logger_kwargs) 335 | # print(locals()) 336 | # logger.save_config(locals()) 337 | 338 | agent = Model(args) 339 | keys = agent.get_weights()[0] 340 | 341 | weights = ray.get(ps.pull.remote(keys)) 342 | agent.set_weights(keys, weights) 343 | test_env = gym.make(args.env) 344 | while True: 345 | ave_ret = agent.test_agent(test_env, args) 346 | # print("test Average Ret:", ave_ret, "time:", time.time()-start_time) 347 | logger.log_tabular('test Average Ret', ave_ret) 348 | logger.log_tabular('Time', time.time() - start_time) 349 | logger.dump_tabular() 350 | weights = ray.get(ps.pull.remote(keys)) 351 | agent.set_weights(keys, weights) 352 | 353 | ``` 354 | 355 | --- 356 | 357 | 主程序调用 358 | 359 | ```python 360 | if __name__ == '__main__': 361 | 362 | ... 363 | 364 | ray.init() 365 | 366 | net = Model(args) 367 | all_keys, all_values = net.get_weights() 368 | ps = ParameterServer.remote(all_keys, all_values) 369 | 370 | replay_buffer = ReplayBuffer.remote(args.obs_dim, args.act_dim, args.replay_size) 371 | 372 | # Start some training tasks. 
373 | task_rollout = [worker_rollout.remote(ps, replay_buffer, args) for i in range(args.num_workers)] 374 | 375 | time.sleep(20) 376 | 377 | task_train = [worker_train.remote(ps, replay_buffer, args) for i in range(args.num_learners)] 378 | 379 | time.sleep(10) 380 | 381 | task_test = worker_test.remote(ps) 382 | ray.wait([task_test, ]) 383 | ``` 384 | 385 | 本节完。 386 | 387 | 本文展示的代码是实现分布式算法的最小改动版本,还有许多地方可以优化。 388 | 389 | 简单实验对比: 390 | 391 | 实验:LunarLanderContinuous-v2 392 | 393 | 未调参,sac和dsac参数相同,dsac的worker数量:1。GPU:GTX1060 394 | 395 | ![dsac1w-sac](.\Pictures\dsac1w-sac.png) 396 | 397 | 完整代码链接: 398 | 399 | 参考资料: 400 | 401 | https://ray.readthedocs.io/en/master/auto_examples/plot_parameter_server.html -------------------------------------------------------------------------------- /algos/sac1/sac_ray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | import ray 5 | import gym 6 | 7 | from hyperparams import HyperParameters, Wrapper 8 | from actor_learner import Actor, Learner 9 | 10 | import os 11 | import pickle 12 | import multiprocessing 13 | import copy 14 | 15 | from collections import deque 16 | 17 | import inspect 18 | import json 19 | from ray.rllib.utils.compression import pack, unpack 20 | 21 | 22 | flags = tf.app.flags 23 | FLAGS = tf.app.flags.FLAGS 24 | 25 | 26 | flags.DEFINE_string("env_name", "BipedalWalker-v2", "game env") 27 | flags.DEFINE_string("exp_name", "Exp1", "experiments name") 28 | flags.DEFINE_integer("num_workers", 16, "number of workers") 29 | flags.DEFINE_string("weights_file", "", "empty means False. " 30 | "[Maxret_weights.pickle] means restore weights from this pickle file.") 31 | flags.DEFINE_float("a_l_ratio", 2, "steps / sample_times") 32 | 33 | 34 | @ray.remote(num_cpus=2) 35 | class ReplayBuffer: 36 | """ 37 | A simple FIFO experience replay buffer for SQN_N_STEP agents. 
38 | """ 39 | 40 | def __init__(self, opt): 41 | self.opt = opt 42 | if opt.obs_shape != (115,): 43 | self.buffer_o = np.array([['0' * 2000] * (opt.Ln + 1)] * opt.buffer_size, dtype=np.str) 44 | else: 45 | self.buffer_o = np.zeros((opt.buffer_size, opt.Ln + 1) + opt.obs_shape, dtype=np.float32) 46 | self.buffer_a = np.zeros((opt.buffer_size, opt.Ln) + opt.act_shape, dtype=np.float32) 47 | self.buffer_r = np.zeros((opt.buffer_size, opt.Ln), dtype=np.float32) 48 | self.buffer_d = np.zeros((opt.buffer_size, opt.Ln), dtype=np.float32) 49 | self.ptr, self.size, self.max_size = 0, 0, opt.buffer_size 50 | self.steps, self.sample_times = 0, 0 51 | 52 | def store(self, o_queue, a_r_d_queue, worker_index): 53 | 54 | obs, = np.stack(o_queue, axis=1) 55 | 56 | if self.opt.obs_shape != (115,): 57 | self.buffer_o[self.ptr] = obs 58 | else: 59 | self.buffer_o[self.ptr] = np.array(list(obs), dtype=np.float32) 60 | 61 | a, r, d, = np.stack(a_r_d_queue, axis=1) 62 | self.buffer_a[self.ptr] = np.array(list(a), dtype=np.float32) 63 | self.buffer_r[self.ptr] = np.array(list(r), dtype=np.float32) 64 | self.buffer_d[self.ptr] = np.array(list(d), dtype=np.float32) 65 | 66 | self.ptr = (self.ptr + 1) % self.max_size 67 | self.size = min(self.size + 1, self.max_size) 68 | # TODO 69 | self.steps += 1 * self.opt.num_buffers 70 | # self.steps += opt.Ln * opt.action_repeat 71 | 72 | def sample_batch(self): 73 | idxs = np.random.randint(0, self.size, size=self.opt.batch_size) 74 | # TODO 75 | self.sample_times += 1 * self.opt.num_buffers 76 | 77 | return dict(obs=self.buffer_o[idxs], 78 | acts=self.buffer_a[idxs], 79 | rews=self.buffer_r[idxs], 80 | done=self.buffer_d[idxs], ) 81 | 82 | def get_counts(self): 83 | return self.sample_times, self.steps, self.size 84 | 85 | 86 | @ray.remote 87 | class ParameterServer(object): 88 | def __init__(self, keys, values, weights_file=""): 89 | # These values will be mutated, so we must create a copy that is not 90 | # backed by the object store. 91 | 92 | if weights_file: 93 | try: 94 | with open(weights_file, "rb") as pickle_in: 95 | self.weights = pickle.load(pickle_in) 96 | print("****** weights restored! ******") 97 | except: 98 | print("------------------------------------------------") 99 | print(weights_file) 100 | print("------ error: weights file doesn't exist! 
------") 101 | exit() 102 | else: 103 | values = [value.copy() for value in values] 104 | self.weights = dict(zip(keys, values)) 105 | 106 | def push(self, keys, values): 107 | values = [value.copy() for value in values] 108 | for key, value in zip(keys, values): 109 | self.weights[key] = value 110 | 111 | def pull(self, keys): 112 | return [self.weights[key] for key in keys] 113 | 114 | def get_weights(self): 115 | return self.weights 116 | 117 | # save weights to disk 118 | def save_weights(self, name): 119 | with open(name + "weights.pickle", "wb") as pickle_out: 120 | pickle.dump(self.weights, pickle_out) 121 | 122 | 123 | class Cache(object): 124 | 125 | def __init__(self, replay_buffer): 126 | # cache for training data and model weights 127 | print('os.pid:', os.getpid()) 128 | self.replay_buffer = replay_buffer 129 | self.q1 = multiprocessing.Queue(10) 130 | self.q2 = multiprocessing.Queue(5) 131 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.replay_buffer)) 132 | self.p1.daemon = True 133 | 134 | def ps_update(self, q1, q2, replay_buffer): 135 | print('os.pid of put_data():', os.getpid()) 136 | 137 | q1.put(copy.deepcopy(ray.get(replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].sample_batch.remote()))) 138 | 139 | while True: 140 | if q1.qsize() < 10: 141 | q1.put(copy.deepcopy(ray.get(replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].sample_batch.remote()))) 142 | 143 | if not q2.empty(): 144 | keys, values = q2.get() 145 | ps.push.remote(keys, values) 146 | 147 | def start(self): 148 | self.p1.start() 149 | self.p1.join(10) 150 | 151 | def end(self): 152 | self.p1.terminate() 153 | 154 | # TODO 155 | @ray.remote(num_cpus=2, num_gpus=1, max_calls=1) 156 | def worker_train(ps, replay_buffer, opt, learner_index): 157 | agent = Learner(opt, job="learner") 158 | keys = agent.get_weights()[0] 159 | weights = ray.get(ps.pull.remote(keys)) 160 | agent.set_weights(keys, weights) 161 | 162 | cache = Cache(replay_buffer) 163 | 164 | cache.start() 165 | 166 | cnt = 1 167 | while True: 168 | batch = cache.q1.get() 169 | if opt.model == "cnn": 170 | batch['obs'] = np.array([[unpack(o) for o in lno] for lno in batch['obs']]) 171 | agent.train(batch, cnt) 172 | # TODO cnt % 300 == 0 before 173 | if cnt % 100 == 0: 174 | cache.q2.put(agent.get_weights()) 175 | cnt += 1 176 | 177 | 178 | @ray.remote 179 | def worker_rollout(ps, replay_buffer, opt, worker_index): 180 | 181 | agent = Actor(opt, job="worker") 182 | keys = agent.get_weights()[0] 183 | 184 | filling_steps = 0 185 | while True: 186 | # ------ env set up ------ 187 | env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise, opt.reward_scale, 3) 188 | # ------ env set up end ------ 189 | 190 | ################################## deques 191 | 192 | o_queue = deque([], maxlen=opt.Ln + 1) 193 | a_r_d_queue = deque([], maxlen=opt.Ln) 194 | 195 | ################################## deques 196 | 197 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 198 | 199 | ################################## deques reset 200 | t_queue = 1 201 | if opt.model == "cnn": 202 | compressed_o = pack(o) 203 | o_queue.append((compressed_o,)) 204 | else: 205 | o_queue.append((o,)) 206 | 207 | ################################## deques reset 208 | 209 | weights = ray.get(ps.pull.remote(keys)) 210 | agent.set_weights(keys, weights) 211 | 212 | while True: 213 | 214 | # don't need to random sample action if load weights from local. 
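            # Uniform random actions are used for the first start_steps environment steps
            # to improve early exploration; if pretrained weights were restored via
            # opt.weights_file, the learned policy is used from the very first step instead.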
215 | if filling_steps > opt.start_steps or opt.weights_file: 216 | a = agent.get_action(o, deterministic=False) 217 | else: 218 | a = env.action_space.sample() 219 | filling_steps += 1 220 | # Step the env 221 | o2, r, d, _ = env.step(a) 222 | 223 | ep_ret += r 224 | ep_len += 1 225 | 226 | # Ignore the "done" signal if it comes from hitting the time 227 | # horizon (that is, when it's an artificial terminal signal 228 | # that isn't based on the agent's state) 229 | # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d 230 | 231 | o = o2 232 | 233 | #################################### deques store 234 | 235 | a_r_d_queue.append((a, r, d,)) 236 | if opt.model == "cnn": 237 | compressed_o2 = pack(o2) 238 | o_queue.append((compressed_o2,)) 239 | else: 240 | o_queue.append((o2,)) 241 | 242 | # scheme 1: 243 | # TODO and t_queue % 2 == 0: %1 lead to q smaller 244 | # TODO 245 | if t_queue >= opt.Ln and t_queue % opt.save_freq == 0: 246 | replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(o_queue, a_r_d_queue, worker_index) 247 | 248 | t_queue += 1 249 | 250 | #################################### deques store 251 | 252 | # End of episode. Training (ep_len times). 253 | if d or (ep_len * opt.action_repeat >= opt.max_ep_len): 254 | # TODO 255 | sample_times, steps, _ = ray.get(replay_buffer[0].get_counts.remote()) 256 | 257 | print('rollout_ep_len:', ep_len * opt.action_repeat, 'rollout_ep_ret:', ep_ret) 258 | 259 | if steps > opt.start_steps: 260 | # update parameters every episode 261 | weights = ray.get(ps.pull.remote(keys)) 262 | agent.set_weights(keys, weights) 263 | 264 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 265 | 266 | ################################## deques reset 267 | t_queue = 1 268 | if opt.model == "cnn": 269 | compressed_o = pack(o) 270 | o_queue.append((compressed_o,)) 271 | else: 272 | o_queue.append((o,)) 273 | 274 | ################################## deques reset 275 | 276 | 277 | @ray.remote 278 | def worker_test(ps, replay_buffer, opt): 279 | agent = Actor(opt, job="main") 280 | 281 | test_env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise, opt.reward_scale, 3) 282 | 283 | agent.test(ps, replay_buffer, opt, test_env) 284 | 285 | 286 | if __name__ == '__main__': 287 | 288 | # ray.init(object_store_memory=1000000000, redis_max_memory=1000000000) 289 | ray.init() 290 | 291 | # ------ HyperParameters ------ 292 | opt = HyperParameters(FLAGS.env_name, FLAGS.exp_name, FLAGS.num_workers, FLAGS.a_l_ratio, 293 | FLAGS.weights_file) 294 | All_Parameters = copy.deepcopy(vars(opt)) 295 | All_Parameters["wrapper"] = inspect.getsource(Wrapper) 296 | All_Parameters["obs_space"] = "" 297 | All_Parameters["act_space"] = "" 298 | 299 | try: 300 | os.makedirs(opt.save_dir) 301 | except OSError: 302 | pass 303 | with open(opt.save_dir + "/" + 'All_Parameters.json', 'w') as fp: 304 | json.dump(All_Parameters, fp, indent=4, sort_keys=True) 305 | 306 | # ------ end ------ 307 | 308 | if FLAGS.weights_file: 309 | ps = ParameterServer.remote([], [], weights_file=FLAGS.weights_file) 310 | else: 311 | net = Learner(opt, job="main") 312 | all_keys, all_values = net.get_weights() 313 | ps = ParameterServer.remote(all_keys, all_values) 314 | 315 | # Experience buffer 316 | # Methods called on different actors can execute in parallel, 317 | # and methods called on the same actor are executed serially in the order that they are called. 318 | # we need more buffer for more workers to keep high store speed. 
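    # Sharded buffer: each store()/sample_batch() call elsewhere in this file picks one of
    # these buffer actors at random (np.random.choice(opt.num_buffers, 1)), so writes from
    # many rollout workers do not serialize on a single actor.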
319 | replay_buffer = [ReplayBuffer.remote(opt) for i in range(opt.num_buffers)] 320 | 321 | # Start some training tasks. 322 | for i in range(FLAGS.num_workers): 323 | worker_rollout.remote(ps, replay_buffer, opt, i) 324 | time.sleep(0.05) 325 | # task_rollout = [worker_rollout.remote(ps, replay_buffer, opt, i) for i in range(FLAGS.num_workers)] 326 | 327 | if opt.weights_file: 328 | fill_steps = opt.start_steps / 100 329 | else: 330 | fill_steps = opt.start_steps 331 | # store at least start_steps in buffer before training 332 | _, steps, _ = ray.get(replay_buffer[0].get_counts.remote()) 333 | while steps < fill_steps: 334 | _, steps, _ = ray.get(replay_buffer[0].get_counts.remote()) 335 | print('fill steps before learn:', steps) 336 | time.sleep(1) 337 | 338 | task_train = [worker_train.remote(ps, replay_buffer, opt, i) for i in range(opt.num_learners)] 339 | 340 | time.sleep(10) 341 | while True: 342 | task_test = worker_test.remote(ps, replay_buffer, opt) 343 | ray.wait([task_test, ]) 344 | -------------------------------------------------------------------------------- /example/sac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | from spinup.algos.sac import core 6 | from spinup.algos.sac.core import get_vars 7 | from spinup.utils.logx import EpochLogger 8 | 9 | 10 | class ReplayBuffer: 11 | """ 12 | A simple FIFO experience replay buffer for SAC agents. 13 | """ 14 | 15 | def __init__(self, obs_dim, act_dim, size): 16 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 17 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 19 | self.rews_buf = np.zeros(size, dtype=np.float32) 20 | self.done_buf = np.zeros(size, dtype=np.float32) 21 | self.ptr, self.size, self.max_size = 0, 0, size 22 | 23 | def store(self, obs, act, rew, next_obs, done): 24 | self.obs1_buf[self.ptr] = obs 25 | self.obs2_buf[self.ptr] = next_obs 26 | self.acts_buf[self.ptr] = act 27 | self.rews_buf[self.ptr] = rew 28 | self.done_buf[self.ptr] = done 29 | self.ptr = (self.ptr+1) % self.max_size 30 | self.size = min(self.size+1, self.max_size) 31 | 32 | def sample_batch(self, batch_size=32): 33 | idxs = np.random.randint(0, self.size, size=batch_size) 34 | return dict(obs1=self.obs1_buf[idxs], 35 | obs2=self.obs2_buf[idxs], 36 | acts=self.acts_buf[idxs], 37 | rews=self.rews_buf[idxs], 38 | done=self.done_buf[idxs]) 39 | 40 | """ 41 | 42 | Soft Actor-Critic 43 | 44 | (With slight variations that bring it closer to TD3) 45 | 46 | """ 47 | def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 48 | steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 50 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 51 | """ 52 | 53 | Args: 54 | env_fn : A function which creates a copy of the environment. 55 | The environment must satisfy the OpenAI Gym API. 56 | 57 | actor_critic: A function which takes in placeholder symbols 58 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 59 | outputs from the agent's Tensorflow computation graph: 60 | 61 | =========== ================ ====================================== 62 | Symbol Shape Description 63 | =========== ================ ====================================== 64 | ``mu`` (batch, act_dim) | Computes mean actions from policy 65 | | given states. 
66 | ``pi`` (batch, act_dim) | Samples actions from policy given 67 | | states. 68 | ``logp_pi`` (batch,) | Gives log probability, according to 69 | | the policy, of the action sampled by 70 | | ``pi``. Critical: must be differentiable 71 | | with respect to policy parameters all 72 | | the way through action sampling. 73 | ``q1`` (batch,) | Gives one estimate of Q* for 74 | | states in ``x_ph`` and actions in 75 | | ``a_ph``. 76 | ``q2`` (batch,) | Gives another estimate of Q* for 77 | | states in ``x_ph`` and actions in 78 | | ``a_ph``. 79 | ``q1_pi`` (batch,) | Gives the composition of ``q1`` and 80 | | ``pi`` for states in ``x_ph``: 81 | | q1(x, pi(x)). 82 | ``q2_pi`` (batch,) | Gives the composition of ``q2`` and 83 | | ``pi`` for states in ``x_ph``: 84 | | q2(x, pi(x)). 85 | ``v`` (batch,) | Gives the value estimate for states 86 | | in ``x_ph``. 87 | =========== ================ ====================================== 88 | 89 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 90 | function you provided to SAC. 91 | 92 | seed (int): Seed for random number generators. 93 | 94 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 95 | for the agent and the environment in each epoch. 96 | 97 | epochs (int): Number of epochs to run and train agent. 98 | 99 | replay_size (int): Maximum length of replay buffer. 100 | 101 | gamma (float): Discount factor. (Always between 0 and 1.) 102 | 103 | polyak (float): Interpolation factor in polyak averaging for target 104 | networks. Target networks are updated towards main networks 105 | according to: 106 | 107 | .. math:: \\theta_{\\text{targ}} \\leftarrow 108 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 109 | 110 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 111 | close to 1.) 112 | 113 | lr (float): Learning rate (used for both policy and value learning). 114 | 115 | alpha (float): Entropy regularization coefficient. (Equivalent to 116 | inverse of reward scale in the original SAC paper.) 117 | 118 | batch_size (int): Minibatch size for SGD. 119 | 120 | start_steps (int): Number of steps for uniform-random action selection, 121 | before running real policy. Helps exploration. 122 | 123 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 124 | 125 | logger_kwargs (dict): Keyword args for EpochLogger. 126 | 127 | save_freq (int): How often (in terms of gap between epochs) to save 128 | the current policy and value function. 129 | 130 | """ 131 | 132 | logger = EpochLogger(**logger_kwargs) 133 | logger.save_config(locals()) 134 | 135 | tf.set_random_seed(seed) 136 | np.random.seed(seed) 137 | 138 | env, test_env = env_fn(), env_fn() 139 | obs_dim = env.observation_space.shape[0] 140 | act_dim = env.action_space.shape[0] 141 | 142 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 
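# Note: gym Box spaces may define a different bound for every action dimension;
# using high[0] is only valid because this environment's action limits are identical
# (and symmetric) across dimensions. A per-dimension variant (illustrative only, not
# used in this script) would be:
#     act_scale = (env.action_space.high - env.action_space.low) / 2.0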
143 | act_limit = env.action_space.high[0] 144 | 145 | # Share information about action space with policy architecture 146 | ac_kwargs['action_space'] = env.action_space 147 | 148 | # Inputs to computation graph 149 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 150 | 151 | # Main outputs from computation graph 152 | with tf.variable_scope('main'): 153 | mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) 154 | 155 | # Target value network 156 | with tf.variable_scope('target'): 157 | _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 158 | 159 | # Experience buffer 160 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 161 | 162 | # Count variables 163 | var_counts = tuple(core.count_vars(scope) for scope in 164 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 165 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 166 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) 167 | 168 | # Min Double-Q: 169 | min_q_pi = tf.minimum(q1_pi, q2_pi) 170 | 171 | # Targets for Q and V regression 172 | q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) 173 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 174 | 175 | # Soft actor-critic losses 176 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 177 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) 178 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) 179 | v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) 180 | value_loss = q1_loss + q2_loss + v_loss 181 | 182 | # Policy train op 183 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 184 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 185 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 186 | 187 | # Value train op 188 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 189 | value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 190 | value_params = get_vars('main/q') + get_vars('main/v') 191 | with tf.control_dependencies([train_pi_op]): 192 | train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 193 | 194 | # Polyak averaging for target variables 195 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 196 | with tf.control_dependencies([train_value_op]): 197 | target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) 198 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 199 | 200 | # All ops to call during one training step 201 | step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 202 | train_pi_op, train_value_op, target_update] 203 | 204 | # Initializing targets to match main variables 205 | target_init = tf.group([tf.assign(v_targ, v_main) 206 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 207 | 208 | sess = tf.Session() 209 | sess.run(tf.global_variables_initializer()) 210 | sess.run(target_init) 211 | 212 | # Setup model saving 213 | logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 214 | outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) 215 | 216 | def get_action(o, deterministic=False): 217 | act_op = mu if deterministic else pi 218 | return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] 219 | 220 | def test_agent(n=10): 221 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 222 | for j in range(n): 223 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 224 | while 
not(d or (ep_len == max_ep_len)): 225 | # Take deterministic actions at test time 226 | o, r, d, _ = test_env.step(get_action(o, True)) 227 | ep_ret += r 228 | ep_len += 1 229 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 230 | 231 | start_time = time.time() 232 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 233 | total_steps = steps_per_epoch * epochs 234 | 235 | # Main loop: collect experience in env and update/log each epoch 236 | for t in range(total_steps): 237 | 238 | """ 239 | Until start_steps have elapsed, randomly sample actions 240 | from a uniform distribution for better exploration. Afterwards, 241 | use the learned policy. 242 | """ 243 | if t > start_steps: 244 | a = get_action(o) 245 | else: 246 | a = env.action_space.sample() 247 | 248 | # Step the env 249 | o2, r, d, _ = env.step(a) 250 | ep_ret += r 251 | ep_len += 1 252 | 253 | # Ignore the "done" signal if it comes from hitting the time 254 | # horizon (that is, when it's an artificial terminal signal 255 | # that isn't based on the agent's state) 256 | d = False if ep_len==max_ep_len else d 257 | 258 | # Store experience to replay buffer 259 | replay_buffer.store(o, a, r, o2, d) 260 | 261 | # Super critical, easy to overlook step: make sure to update 262 | # most recent observation! 263 | o = o2 264 | 265 | if d or (ep_len == max_ep_len): 266 | """ 267 | Perform all SAC updates at the end of the trajectory. 268 | This is a slight difference from the SAC specified in the 269 | original paper. 270 | """ 271 | for j in range(ep_len): 272 | batch = replay_buffer.sample_batch(batch_size) 273 | feed_dict = {x_ph: batch['obs1'], 274 | x2_ph: batch['obs2'], 275 | a_ph: batch['acts'], 276 | r_ph: batch['rews'], 277 | d_ph: batch['done'], 278 | } 279 | outs = sess.run(step_ops, feed_dict) 280 | logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 281 | LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 282 | VVals=outs[6], LogPi=outs[7]) 283 | 284 | logger.store(EpRet=ep_ret, EpLen=ep_len) 285 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 286 | 287 | 288 | # End of epoch wrap-up 289 | if t > 0 and t % steps_per_epoch == 0: 290 | epoch = t // steps_per_epoch 291 | 292 | # Save model 293 | if (epoch % save_freq == 0) or (epoch == epochs-1): 294 | logger.save_state({'env': env}, None) 295 | 296 | # Test the performance of the deterministic version of the agent. 
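# Evaluation runs the deterministic policy: test_agent() calls get_action(o, True),
# which returns the policy mean `mu` instead of a sample from `pi`, so test returns
# reflect the learned policy without exploration noise.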
297 | test_agent() 298 | 299 | # Log info about epoch 300 | logger.log_tabular('Epoch', epoch) 301 | logger.log_tabular('EpRet', with_min_and_max=True) 302 | logger.log_tabular('TestEpRet', with_min_and_max=True) 303 | logger.log_tabular('EpLen', average_only=True) 304 | logger.log_tabular('TestEpLen', average_only=True) 305 | logger.log_tabular('TotalEnvInteracts', t) 306 | logger.log_tabular('Q1Vals', with_min_and_max=True) 307 | logger.log_tabular('Q2Vals', with_min_and_max=True) 308 | logger.log_tabular('VVals', with_min_and_max=True) 309 | logger.log_tabular('LogPi', with_min_and_max=True) 310 | logger.log_tabular('LossPi', average_only=True) 311 | logger.log_tabular('LossQ1', average_only=True) 312 | logger.log_tabular('LossQ2', average_only=True) 313 | logger.log_tabular('LossV', average_only=True) 314 | logger.log_tabular('Time', time.time()-start_time) 315 | logger.dump_tabular() 316 | 317 | if __name__ == '__main__': 318 | import argparse 319 | parser = argparse.ArgumentParser() 320 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 321 | parser.add_argument('--hid', type=int, default=300) 322 | parser.add_argument('--l', type=int, default=1) 323 | parser.add_argument('--gamma', type=float, default=0.99) 324 | parser.add_argument('--seed', '-s', type=int, default=0) 325 | parser.add_argument('--epochs', type=int, default=50) 326 | parser.add_argument('--exp_name', type=str, default='sac') 327 | args = parser.parse_args() 328 | 329 | from spinup.utils.run_utils import setup_logger_kwargs 330 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 331 | 332 | sac(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, 333 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 334 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 335 | logger_kwargs=logger_kwargs) -------------------------------------------------------------------------------- /algos/trading_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | from gym.spaces import Box 5 | import ctypes 6 | import json 7 | import os 8 | import sys 9 | from collections import deque 10 | import pandas as pd 11 | import pickle 12 | import time 13 | 14 | info_names = [ 15 | "Done", "LastPrice", "BidPrice1", "BidVolume1", "AskPrice1", "AskVolume1", "BidPrice2", "BidVolume2", 16 | "AskPrice2", "AskVolume2", "BidPrice3", "BidVolume3", "AskPrice3", "AskVolume3", "BidPrice4", 17 | "BidVolume4", "AskPrice4", "AskVolume4", "BidPrice5", "BidVolume5", "AskPrice5", "AskVolume5", "Volume", 18 | "HighestPrice", "LowestPrice", "TradingDay", "Target_Num", "Actual_Num", "AliveBidPrice1", 19 | "AliveBidVolume1", "AliveBidPrice2", "AliveBidVolume2", "AliveBidPrice3", "AliveBidVolume3", 20 | "AliveAskPrice1", "AliveAskVolume1", "AliveAskPrice2", "AliveAskVolume2", "AliveAskPrice3", 21 | "AliveAskVolume3", "score", "profit", "total_profit", "baseline_profit", "action", "designed_reward" 22 | ] 23 | 24 | data_v19_len = [ 25 | 225013, 225015, 225015, 225015, 225015, 225017, 225015, 225015, 225017, 225015, 225015, 225015, 225015, 225015, 26 | 225015, 225015, 225015, 225015, 225015, 225015, 225015, 225015, 225010, 225015, 225015, 135002, 225015, 225015, 27 | 225015, 225015, 225015, 225017, 225015, 225017, 225015, 225017, 225015, 225015, 225015, 225015, 225017, 225015, 28 | 225015, 225015, 225017, 225017, 225016, 225017, 225015, 225013, 225015, 225015, 225017, 225017, 225014, 225017, 29 | 225015, 225013, 225015, 225017, 
225015, 225015, 225015, 225017, 225015, 225017, 225017, 225015, 225015, 225015, 30 | 225017, 225017, 225015, 225015, 225017, 225015, 225015, 225017, 225015, 225015, 225014, 225015, 225015, 225015, 31 | 225015, 225015, 225017, 225017, 225015, 225015, 225015, 225015, 225017, 225015, 225017, 225015, 225015, 225015, 32 | 225015, 99005, 225015, 225017, 99009, 225015, 225015, 225009, 225017, 225015, 225015, 225015, 225013, 225013, 33 | 225015, 225015, 225013, 225015, 225015, 225017, 225015, 126016 34 | ] # 120days 35 | 36 | 37 | class TradingEnv(gym.Env): 38 | 39 | def __init__(self, action_scheme_id, obs_dim, auto_follow=0, max_ep_len=3000, render=False): 40 | super(TradingEnv, self).__init__() 41 | 42 | self.data_len = data_v19_len 43 | self.trainning_set = 90 44 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 45 | os.chdir(ROOT_DIR + "/rl_game/game/") 46 | so_file = "./game.so" 47 | self.expso = ctypes.cdll.LoadLibrary(so_file) 48 | arr_len = 100 49 | arr1 = ctypes.c_int * arr_len 50 | arr = ctypes.c_int * 1 51 | 52 | self.ctx = None 53 | 54 | self.actions = arr1() 55 | self.action_len = arr() 56 | self.raw_obs = arr1() 57 | self.raw_obs_len = arr() 58 | self.rewards = arr1() 59 | self.rewards_len = arr() 60 | 61 | self._step = self._action_schemes(action_scheme_id) 62 | self.auto_follow = auto_follow 63 | 64 | self.obs_dim = obs_dim 65 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32) 66 | 67 | self.step_len = 0 68 | 69 | self.max_ep_len = max_ep_len 70 | self.render = render 71 | 72 | self.his_price = deque(maxlen=30) 73 | 74 | def reset(self, start_day=None, start_skip=None, burn_in=0): 75 | 76 | # random start_day if no start_day 77 | if start_day is None: 78 | start_day = np.random.randint(1, self.trainning_set + 1, 1)[0] # first self.trainning_set days 79 | 80 | # random start_skip if no start_skip 81 | if start_skip is None: 82 | day_index = start_day - 1 83 | max_point = self.data_len[day_index] - self.max_ep_len - burn_in - 50 84 | start_skip = int(np.random.randint(0, max_point, 1)[0]) 85 | 86 | start_info = {"date_index": "{} - {}".format(start_day, start_day), "skip_steps": start_skip} 87 | # print(start_info) 88 | if self.ctx: 89 | self.close_env() 90 | self.ctx = self.expso.CreateContext(json.dumps(start_info).encode()) 91 | self.expso.GetActions(self.ctx, self.actions, self.action_len) 92 | self.expso.GetInfo(self.ctx, self.raw_obs, self.raw_obs_len) 93 | self.expso.GetReward(self.ctx, self.rewards, self.rewards_len) 94 | 95 | self.step_len = 0 96 | 97 | obs = self._get_obs(self.raw_obs) 98 | 99 | if self.render: 100 | self.rendering() 101 | 102 | return obs 103 | 104 | def step(self, action): 105 | target_num = self.raw_obs[26] 106 | actual_num = self.raw_obs[27] 107 | 108 | if self.auto_follow is not 0: 109 | if abs(actual_num - target_num) > self.auto_follow: 110 | if target_num > actual_num: 111 | action = 6 112 | else: 113 | action = 9 114 | 115 | self._step(action) 116 | self.expso.Step(self.ctx) 117 | self.expso.GetInfo(self.ctx, self.raw_obs, self.raw_obs_len) 118 | self.expso.GetReward(self.ctx, self.rewards, self.rewards_len) 119 | 120 | self.step_len += 1 121 | 122 | target_bias = abs(self.raw_obs[27] - self.raw_obs[26]) 123 | 124 | obs = self._get_obs(self.raw_obs) 125 | reward = -target_bias 126 | done = bool(self.raw_obs[0]) or self.max_ep_len == self.step_len 127 | 128 | info = {"TradingDay": self.raw_obs[25], 129 | "score": self.rewards[0], 130 | "profit": self.rewards[1], 131 | 
"target_bias": target_bias} 132 | 133 | if self.render and self.obs_dim == 38: 134 | self.rendering(action) 135 | 136 | self.his_price.append(obs[0]) 137 | obs[22] = max(self.his_price) 138 | obs[23] = min(self.his_price) 139 | 140 | return obs, reward, done, info 141 | 142 | def _get_obs(self, raw_obs): 143 | 144 | price_mean = 26440.28 145 | price_max = 27952.0 146 | bid_ask_volume_log_mean = 1.97 147 | bid_ask_volume_log_max = 6.42 148 | total_volume_mean = 120755.66 149 | total_volume_max = 321988.0 150 | # target_abs_mean = 51.018 151 | target_mean = 2.55 152 | target_max = 311.0 153 | 154 | price_filter = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 24, 28, 30, 32, 36, 38, 40] 155 | bid_ask_volume_filter = [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 29, 31, 33, 37, 39, 41] 156 | total_volume_filter = [22] 157 | target_filter = [26, 27] 158 | obs = np.array(raw_obs[:44], dtype=np.float32) 159 | 160 | obs[price_filter] = (obs[price_filter] - price_mean) / (price_max - price_mean) 161 | obs[bid_ask_volume_filter] = (np.log(obs[bid_ask_volume_filter]) - bid_ask_volume_log_mean) / ( 162 | bid_ask_volume_log_max - bid_ask_volume_log_mean) 163 | obs[total_volume_filter] = (obs[total_volume_filter] - total_volume_mean) / ( 164 | total_volume_max - total_volume_mean) 165 | obs[target_filter] = (obs[target_filter] - target_mean) / (target_max - target_mean) 166 | 167 | if self.obs_dim == 38: 168 | obs = np.delete(obs, [0, 25, 34, 35, 42, 43]) 169 | elif self.obs_dim == 26: 170 | obs = obs[:28] 171 | obs = np.delete(obs, [0, 25]) 172 | elif self.obs_dim == 24: 173 | obs = obs[:25] 174 | obs = np.delete(obs, [0]) 175 | else: 176 | assert False, "incorrect obs_dim!" 177 | obs[obs < -1] = -1 178 | obs[obs > 1] = 1 179 | 180 | return obs 181 | 182 | def _action_schemes(self, action_scheme_id): 183 | 184 | schemes = {} 185 | 186 | def scheme3(action): 187 | assert 0 <= action <= 2 or action == 6 or action == 9, "action should be 0,1,2" 188 | if action == 1: 189 | self.expso.Action(self.ctx, self.actions[18]) # 如果是买动作,卖方向全撤。 190 | self.expso.Action(self.ctx, self.actions[6]) 191 | elif action == 2: 192 | self.expso.Action(self.ctx, self.actions[15]) # 如果是卖动作,买方向全撤。 193 | self.expso.Action(self.ctx, self.actions[9]) 194 | elif action == 0: 195 | self.expso.Action(self.ctx, self.actions[action]) 196 | # for auto_clip 197 | elif action == 6: 198 | self.expso.Action(self.ctx, self.actions[18]) 199 | self.expso.Action(self.ctx, self.actions[6]) 200 | elif action == 9: 201 | self.expso.Action(self.ctx, self.actions[15]) 202 | self.expso.Action(self.ctx, self.actions[9]) 203 | 204 | schemes[3] = scheme3 205 | 206 | # 根据买卖方向进行自动反方向撤单操作 207 | def scheme15(action): 208 | assert 0 <= action <= 14, "action should be 0,1,...,14" 209 | if 1 <= action <= 7: 210 | self.expso.Action(self.ctx, self.actions[18]) # 如果是买动作,卖方向全撤。 211 | elif 8 <= action <= 14: 212 | self.expso.Action(self.ctx, self.actions[15]) # 如果是卖动作,买方向全撤。 213 | # 执行action 214 | self.expso.Action(self.ctx, self.actions[action]) 215 | 216 | schemes[15] = scheme15 217 | 218 | # 学习全撤单操作 219 | def scheme17(action): 220 | assert 0 <= action <= 16, "action should <=16" 221 | if action <= 14: 222 | self.expso.Action(self.ctx, self.actions[action]) 223 | elif action == 15: 224 | self.expso.Action(self.ctx, self.actions[15]) 225 | elif action == 16: 226 | self.expso.Action(self.ctx, self.actions[18]) 227 | 228 | schemes[17] = scheme17 229 | 230 | # 全部操作 231 | def scheme21(action): 232 | assert 0 <= action <= 20, "action should be 0,1,...,20" 233 | 
self.expso.Action(self.ctx, self.actions[action]) 234 | 235 | schemes[21] = scheme21 236 | 237 | # add new action schemes here... 238 | # def scheme0(action): 239 | # pass 240 | # schemes[0] = scheme0 241 | 242 | self.action_dim = action_scheme_id 243 | self.action_space = spaces.Discrete(self.action_dim) 244 | 245 | return schemes[action_scheme_id] 246 | 247 | def policy_069(self): # actions: 0,6,9 248 | if self.raw_obs[26] > self.raw_obs[27]: 249 | action = 6 250 | elif self.raw_obs[26] < self.raw_obs[27]: 251 | action = 9 252 | else: 253 | action = 0 254 | return action 255 | 256 | def rendering(self, action=None): 257 | print("-----------------------") 258 | print("Action:", action) 259 | print("AliveAskPriceNUM:", self.raw_obs[42]) 260 | print("AliveAskVolumeNUM:", self.raw_obs[43]) 261 | print("AliveAskPrice3:", self.raw_obs[40]) 262 | print("AliveAskVolume3:", self.raw_obs[41]) 263 | print("AliveAskPrice2:", self.raw_obs[38]) 264 | print("AliveAskVolume2:", self.raw_obs[39]) 265 | print("AliveAskPrice1:", self.raw_obs[36]) 266 | print("AliveAskVolume1:", self.raw_obs[37]) 267 | print("AskPrice1:", self.raw_obs[4]) 268 | print("AskVolume1:", self.raw_obs[5]) 269 | print(".....") 270 | print("LastPrice:", self.raw_obs[1]) 271 | print("Actual_Num:", self.raw_obs[27]) 272 | print(".....") 273 | print("BidPrice1:", self.raw_obs[2]) 274 | print("BidVolume1:", self.raw_obs[3]) 275 | print("AliveBidPrice1:", self.raw_obs[28]) 276 | print("AliveBidVolume1:", self.raw_obs[29]) 277 | print("AliveBidPrice2:", self.raw_obs[30]) 278 | print("AliveBidVolume2:", self.raw_obs[31]) 279 | print("AliveBidPrice3:", self.raw_obs[32]) 280 | print("AliveBidVolume3:", self.raw_obs[33]) 281 | print("AliveBidPriceNUM:", self.raw_obs[34]) 282 | print("AliveBidVolumeNUM:", self.raw_obs[35]) 283 | print("-----------------------") 284 | 285 | def close_env(self): 286 | self.expso.ReleaseContext(self.ctx) 287 | 288 | 289 | class FrameStack(gym.Wrapper): 290 | def __init__(self, env, frame_stack, jump=1, model='mlp'): 291 | super(FrameStack, self).__init__(env) 292 | self.frame_stack = frame_stack 293 | self.jump = jump 294 | self.model = model 295 | self.total_frame = frame_stack * jump 296 | self.frames = deque([], maxlen=self.total_frame) 297 | if model == 'mlp': 298 | self.obs_dim = self.env.observation_space.shape[0] * frame_stack 299 | self.observation_space = Box(-np.inf, np.inf, shape=(self.obs_dim,), dtype=np.float32) 300 | else: 301 | self.observation_space = Box(-np.inf, np.inf, shape=(frame_stack, self.env.observation_space.shape[0]), 302 | dtype=np.float32) 303 | 304 | def reset(self, start_day=None, start_skip=None, duration=None, burn_in=0): 305 | ob = self.env.reset(start_day=start_day, start_skip=start_skip, burn_in=burn_in) # TradingEnv.reset() takes no duration argument, so it is not forwarded 306 | ob = np.float32(ob) 307 | for _ in range(self.total_frame): 308 | self.frames.append(ob) 309 | return self.observation() 310 | 311 | def step(self, action): 312 | ob, reward, done, info = self.env.step(action) 313 | ob = np.float32(ob) 314 | self.frames.append(ob) 315 | return self.observation(), reward, done, info 316 | 317 | def observation(self): 318 | assert len(self.frames) == self.total_frame 319 | obs_stack = np.array(self.frames) 320 | idx = np.arange(0, self.total_frame, self.jump) 321 | obs = obs_stack[idx] 322 | if self.model == 'mlp': 323 | return np.stack(obs, axis=0).reshape((self.obs_dim,)) 324 | else: 325 | return obs 326 | 327 | 328 | if __name__ == "__main__": 329 | 330 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 331 | # env = FrameStack(env, 
frame_stack=3, jump=3, model='cnn') 332 | 333 | cnt = 0 334 | 335 | for i in range(1): 336 | 337 | obs = env.reset() 338 | 339 | # burn-in 340 | # while env.target_diffs < 50: 341 | # action = env.baseline_policy(obs) 342 | # obs, reward, done, info = env.step(action) 343 | # cnt += 1 344 | # print("burn-in steps:", cnt) 345 | 346 | # print(env.raw_obs[26], env.raw_obs[27]) 347 | print(obs) 348 | step = 1 349 | t0 = time.time() 350 | price = 0.0 351 | while True: 352 | action = env.action_space.sample() 353 | # action = env.baseline_policy(obs) 354 | # action = 0 355 | obs, reward, done, info = env.step(action) 356 | step += 1 357 | if step % 10 == 0: 358 | # print(step, env.raw_obs[26], env.raw_obs[27], 359 | # (info["profit"], info["total_profit"], info["baseline_profit"]), 360 | # (info["baseline_profit"] - info["profit"]) * 10 / info["target_diffs"], info["score"], 361 | # (info["reward_score"], info["reward_target"], info["reward_action"],)) 362 | print(obs) 363 | # if price != info["price"]: 364 | # print('='*66) 365 | # price = info["price"] 366 | if done: 367 | print("Done!", done, cnt, step, 'time:', time.time() - t0) 368 | # all_data = env.all_data 369 | # all_data_df = pd.DataFrame(all_data) 370 | # print(all_data_df.tail()) 371 | break 372 | 373 | env.close_env() 374 | -------------------------------------------------------------------------------- /tutorial/Parallelize your algorithm by Ray (2).md: -------------------------------------------------------------------------------- 1 | # Parallelize Your Reinforcement Learning Algorithm with Ray (2) 2 | 3 | ## Decomposing the SAC Code 4 | 5 | spinningup provides reference implementations of several important algorithms that are very useful to newcomers. Apart from SAC, the other on-policy algorithms are all parallelized with MPI; SAC alone has no parallel implementation. So we use Ray to build a parallel implementation of SAC. 6 | 7 | This section is straightforward: we split the SAC implementation from spinningup into parts. In the next section, we place each part into its corresponding position in the parallel framework. 8 | 9 | The architecture of our parallel framework: 10 | 11 | ![ddrlframework](C:\Users\Shuai\Documents\GitHub\Markdown\RL\Pictures\ddrlframework.jpg) 12 | 13 | Following this framework, we decompose SAC into the five parts below: 14 | 15 | - Replay buffer 16 | - Parameter server 17 | - train (learn) 18 | - rollout 19 | - test 20 | 21 | Each part is marked with comments in the code below. 22 | 23 | ```python 24 | import numpy as np 25 | import tensorflow as tf 26 | import gym 27 | import time 28 | from spinup.algos.sac import core 29 | from spinup.algos.sac.core import get_vars 30 | from spinup.utils.logx import EpochLogger 31 | 32 | # ********************** replaybuffer part below ********************** 33 | class ReplayBuffer: 34 | """ 35 | A simple FIFO experience replay buffer for SAC agents. 
36 | """ 37 | 38 | def __init__(self, obs_dim, act_dim, size): 39 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 40 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 41 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 42 | self.rews_buf = np.zeros(size, dtype=np.float32) 43 | self.done_buf = np.zeros(size, dtype=np.float32) 44 | self.ptr, self.size, self.max_size = 0, 0, size 45 | 46 | def store(self, obs, act, rew, next_obs, done): 47 | self.obs1_buf[self.ptr] = obs 48 | self.obs2_buf[self.ptr] = next_obs 49 | self.acts_buf[self.ptr] = act 50 | self.rews_buf[self.ptr] = rew 51 | self.done_buf[self.ptr] = done 52 | self.ptr = (self.ptr + 1) % self.max_size 53 | self.size = min(self.size + 1, self.max_size) 54 | 55 | def sample_batch(self, batch_size=32): 56 | idxs = np.random.randint(0, self.size, size=batch_size) 57 | return dict(obs1=self.obs1_buf[idxs], 58 | obs2=self.obs2_buf[idxs], 59 | acts=self.acts_buf[idxs], 60 | rews=self.rews_buf[idxs], 61 | done=self.done_buf[idxs]) 62 | # ********************** replaybuffer part above ********************** 63 | 64 | """ 65 | 66 | Soft Actor-Critic 67 | 68 | (With slight variations that bring it closer to TD3) 69 | 70 | """ 71 | 72 | 73 | def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 74 | steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, 75 | polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 76 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 77 | """ 78 | 79 | Args: 80 | env_fn : A function which creates a copy of the environment. 81 | The environment must satisfy the OpenAI Gym API. 82 | 83 | actor_critic: A function which takes in placeholder symbols 84 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 85 | outputs from the agent's Tensorflow computation graph: 86 | 87 | =========== ================ ====================================== 88 | Symbol Shape Description 89 | =========== ================ ====================================== 90 | ``mu`` (batch, act_dim) | Computes mean actions from policy 91 | | given states. 92 | ``pi`` (batch, act_dim) | Samples actions from policy given 93 | | states. 94 | ``logp_pi`` (batch,) | Gives log probability, according to 95 | | the policy, of the action sampled by 96 | | ``pi``. Critical: must be differentiable 97 | | with respect to policy parameters all 98 | | the way through action sampling. 99 | ``q1`` (batch,) | Gives one estimate of Q* for 100 | | states in ``x_ph`` and actions in 101 | | ``a_ph``. 102 | ``q2`` (batch,) | Gives another estimate of Q* for 103 | | states in ``x_ph`` and actions in 104 | | ``a_ph``. 105 | ``q1_pi`` (batch,) | Gives the composition of ``q1`` and 106 | | ``pi`` for states in ``x_ph``: 107 | | q1(x, pi(x)). 108 | ``q2_pi`` (batch,) | Gives the composition of ``q2`` and 109 | | ``pi`` for states in ``x_ph``: 110 | | q2(x, pi(x)). 111 | ``v`` (batch,) | Gives the value estimate for states 112 | | in ``x_ph``. 113 | =========== ================ ====================================== 114 | 115 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 116 | function you provided to SAC. 117 | 118 | seed (int): Seed for random number generators. 119 | 120 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 121 | for the agent and the environment in each epoch. 122 | 123 | epochs (int): Number of epochs to run and train agent. 124 | 125 | replay_size (int): Maximum length of replay buffer. 
126 | 127 | gamma (float): Discount factor. (Always between 0 and 1.) 128 | 129 | polyak (float): Interpolation factor in polyak averaging for target 130 | networks. Target networks are updated towards main networks 131 | according to: 132 | 133 | .. math:: \\theta_{\\text{targ}} \\leftarrow 134 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 135 | 136 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 137 | close to 1.) 138 | 139 | lr (float): Learning rate (used for both policy and value learning). 140 | 141 | alpha (float): Entropy regularization coefficient. (Equivalent to 142 | inverse of reward scale in the original SAC paper.) 143 | 144 | batch_size (int): Minibatch size for SGD. 145 | 146 | start_steps (int): Number of steps for uniform-random action selection, 147 | before running real policy. Helps exploration. 148 | 149 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 150 | 151 | logger_kwargs (dict): Keyword args for EpochLogger. 152 | 153 | save_freq (int): How often (in terms of gap between epochs) to save 154 | the current policy and value function. 155 | 156 | """ 157 | 158 | # logger = EpochLogger(**logger_kwargs) 159 | # logger.save_config(locals()) 160 | 161 | tf.set_random_seed(seed) 162 | np.random.seed(seed) 163 | 164 | env, test_env = env_fn(), env_fn() 165 | obs_dim = env.observation_space.shape[0] 166 | act_dim = env.action_space.shape[0] 167 | 168 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 169 | act_limit = env.action_space.high[0] 170 | 171 | # Share information about action space with policy architecture 172 | ac_kwargs['action_space'] = env.action_space 173 | 174 | # ********************** model part below ********************** 175 | 176 | # Inputs to computation graph 177 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 178 | 179 | # Main outputs from computation graph 180 | with tf.variable_scope('main'): 181 | mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) 182 | 183 | # Target value network 184 | with tf.variable_scope('target'): 185 | _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 186 | 187 | # ********************** model part above ********************** 188 | 189 | # Experience buffer 190 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 191 | 192 | # ********************** model part below ********************** 193 | 194 | # Count variables 195 | var_counts = tuple(core.count_vars(scope) for scope in 196 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 197 | print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 198 | 199 | # Min Double-Q: 200 | min_q_pi = tf.minimum(q1_pi, q2_pi) 201 | 202 | # Targets for Q and V regression 203 | q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) 204 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 205 | 206 | # Soft actor-critic losses 207 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 208 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 209 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 210 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 211 | value_loss = q1_loss + q2_loss + v_loss 212 | 213 | # Policy train op 214 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 215 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 216 | train_pi_op = 
pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 217 | 218 | # Value train op 219 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 220 | value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 221 | value_params = get_vars('main/q') + get_vars('main/v') 222 | with tf.control_dependencies([train_pi_op]): 223 | train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 224 | 225 | # Polyak averaging for target variables 226 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 227 | with tf.control_dependencies([train_value_op]): 228 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 229 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 230 | 231 | # All ops to call during one training step 232 | step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 233 | train_pi_op, train_value_op, target_update] 234 | 235 | # Initializing targets to match main variables 236 | target_init = tf.group([tf.assign(v_targ, v_main) 237 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 238 | 239 | sess = tf.Session() 240 | sess.run(tf.global_variables_initializer()) 241 | sess.run(target_init) 242 | 243 | # ********************** model part above ********************** 244 | 245 | # Setup model saving 246 | # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 247 | # outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) 248 | 249 | def get_action(o, deterministic=False): 250 | act_op = mu if deterministic else pi 251 | return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] 252 | 253 | def test_agent(n=10): 254 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 255 | for j in range(n): 256 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 257 | while not (d or (ep_len == max_ep_len)): 258 | # Take deterministic actions at test time 259 | o, r, d, _ = test_env.step(get_action(o, True)) 260 | ep_ret += r 261 | ep_len += 1 262 | print(ep_len, ep_ret) 263 | # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 264 | 265 | # ********************** rollout part below ********************** 266 | 267 | start_time = time.time() 268 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 269 | total_steps = steps_per_epoch * epochs 270 | 271 | # Main loop: collect experience in env and update/log each epoch 272 | for t in range(total_steps): 273 | 274 | """ 275 | Until start_steps have elapsed, randomly sample actions 276 | from a uniform distribution for better exploration. Afterwards, 277 | use the learned policy. 278 | """ 279 | if t > start_steps: 280 | a = get_action(o) 281 | else: 282 | a = env.action_space.sample() 283 | 284 | # Step the env 285 | o2, r, d, _ = env.step(a) 286 | ep_ret += r 287 | ep_len += 1 288 | 289 | # Ignore the "done" signal if it comes from hitting the time 290 | # horizon (that is, when it's an artificial terminal signal 291 | # that isn't based on the agent's state) 292 | d = False if ep_len == max_ep_len else d 293 | 294 | # Store experience to replay buffer 295 | replay_buffer.store(o, a, r, o2, d) 296 | 297 | # Super critical, easy to overlook step: make sure to update 298 | # most recent observation! 299 | o = o2 300 | 301 | if d or (ep_len == max_ep_len): 302 | """ 303 | Perform all SAC updates at the end of the trajectory. 304 | This is a slight difference from the SAC specified in the 305 | original paper. 
306 | """ 307 | 308 | # ********************** train part below ********************** 309 | 310 | for j in range(ep_len): 311 | batch = replay_buffer.sample_batch(batch_size) 312 | feed_dict = {x_ph: batch['obs1'], 313 | x2_ph: batch['obs2'], 314 | a_ph: batch['acts'], 315 | r_ph: batch['rews'], 316 | d_ph: batch['done'], 317 | } 318 | outs = sess.run(step_ops, feed_dict) 319 | # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], 320 | # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], 321 | # VVals=outs[6], LogPi=outs[7]) 322 | 323 | # ********************** train part above ********************** 324 | 325 | # logger.store(EpRet=ep_ret, EpLen=ep_len) 326 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 327 | 328 | # ********************** rollout part above ********************** 329 | 330 | # End of epoch wrap-up 331 | if t > 0 and t % steps_per_epoch == 0: 332 | epoch = t // steps_per_epoch 333 | 334 | # Save model 335 | # if (epoch % save_freq == 0) or (epoch == epochs - 1): 336 | # logger.save_state({'env': env}, None) 337 | 338 | # Test the performance of the deterministic version of the agent. 339 | test_agent() 340 | 341 | # Log info about epoch 342 | # logger.log_tabular('Epoch', epoch) 343 | # logger.log_tabular('EpRet', with_min_and_max=True) 344 | # logger.log_tabular('TestEpRet', with_min_and_max=True) 345 | # logger.log_tabular('EpLen', average_only=True) 346 | # logger.log_tabular('TestEpLen', average_only=True) 347 | # logger.log_tabular('TotalEnvInteracts', t) 348 | # logger.log_tabular('Q1Vals', with_min_and_max=True) 349 | # logger.log_tabular('Q2Vals', with_min_and_max=True) 350 | # logger.log_tabular('VVals', with_min_and_max=True) 351 | # logger.log_tabular('LogPi', with_min_and_max=True) 352 | # logger.log_tabular('LossPi', average_only=True) 353 | # logger.log_tabular('LossQ1', average_only=True) 354 | # logger.log_tabular('LossQ2', average_only=True) 355 | # logger.log_tabular('LossV', average_only=True) 356 | # logger.log_tabular('Time', time.time() - start_time) 357 | # logger.dump_tabular() 358 | 359 | 360 | if __name__ == '__main__': 361 | import argparse 362 | 363 | parser = argparse.ArgumentParser() 364 | parser.add_argument('--env', type=str, default='BipedalWalker-v2') 365 | parser.add_argument('--hid', type=int, default=300) 366 | parser.add_argument('--l', type=int, default=1) 367 | parser.add_argument('--gamma', type=float, default=0.99) 368 | parser.add_argument('--seed', '-s', type=int, default=0) 369 | parser.add_argument('--epochs', type=int, default=50) 370 | parser.add_argument('--exp_name', type=str, default='sac') 371 | args = parser.parse_args() 372 | 373 | # from spinup.utils.run_utils import setup_logger_kwargs 374 | # 375 | # logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 376 | 377 | sac(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 378 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 379 | gamma=args.gamma, seed=args.seed, epochs=args.epochs,) 380 | # logger_kwargs=logger_kwargs) 381 | 382 | ``` 383 | 384 | 本节完。 -------------------------------------------------------------------------------- /algos/dqn/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import ray 6 | 7 | import os 8 | import sys 9 | 10 | from hyperparams import HyperParameters 11 | from actor_learner import Actor, Learner 12 | 13 | import os 14 | import pickle 15 | import multiprocessing 16 | import 
copy 17 | import json 18 | 19 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 20 | sys.path.append(ROOT) 21 | from trading_env import TradingEnv, FrameStack 22 | 23 | 24 | flags = tf.app.flags 25 | FLAGS = tf.app.flags.FLAGS 26 | 27 | flags.DEFINE_string("env_name", "Trading", "game env") 28 | flags.DEFINE_string("exp_name", "ddqn-trading", "experiments name") 29 | flags.DEFINE_integer("num_nodes", 1, "number of nodes") 30 | flags.DEFINE_integer("num_workers", 6, "number of workers") 31 | flags.DEFINE_string("weights_file", "", "empty means False.") 32 | flags.DEFINE_float("a_l_ratio", 10, "actor_steps / learner_steps") 33 | flags.DEFINE_bool("recover", False, "back training from last checkpoint") 34 | flags.DEFINE_string("checkpoint_path", "", "empty means opt.save_dir. ") 35 | 36 | 37 | @ray.remote(num_cpus=2) 38 | class ReplayBuffer: 39 | """ 40 | A simple FIFO experience replay buffer for SQN_N_STEP agents. 41 | """ 42 | 43 | def __init__(self, opt, buffer_index): 44 | self.opt = opt 45 | self.buffer_index = buffer_index 46 | self.obs1_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 47 | self.obs2_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 48 | self.acts_buf = np.zeros(opt.buffer_size, dtype=np.float32) 49 | self.rews_buf = np.zeros(opt.buffer_size, dtype=np.float32) 50 | self.done_buf = np.zeros(opt.buffer_size, dtype=np.float32) 51 | self.ptr, self.size, self.max_size = 0, 0, opt.buffer_size 52 | self.actor_steps, self.learner_steps = 0, 0 53 | 54 | def store(self, obs, act, rew, next_obs, done, worker_index): 55 | 56 | self.obs1_buf[self.ptr] = obs 57 | self.obs2_buf[self.ptr] = next_obs 58 | self.acts_buf[self.ptr] = act 59 | self.rews_buf[self.ptr] = rew 60 | self.done_buf[self.ptr] = done 61 | 62 | self.ptr = (self.ptr + 1) % self.max_size 63 | self.size = min(self.size + 1, self.max_size) 64 | self.actor_steps += 1 65 | 66 | def sample_batch(self): 67 | idxs = np.random.randint(0, self.size, size=self.opt.batch_size) 68 | self.learner_steps += 1 69 | return dict(obs1=self.obs1_buf[idxs], 70 | obs2=self.obs2_buf[idxs], 71 | acts=self.acts_buf[idxs], 72 | rews=self.rews_buf[idxs], 73 | done=self.done_buf[idxs]) 74 | 75 | def get_counts(self): 76 | return self.learner_steps, self.actor_steps, self.size 77 | 78 | # debug 79 | def show(self): 80 | return self.obs1_buf, self.ptr, self.size, self.max_size 81 | 82 | def save(self): 83 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs1_buf-' + str(self.buffer_index), self.obs1_buf) 84 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs2_buf-' + str(self.buffer_index), self.obs2_buf) 85 | np.save(self.opt.save_dir + "/checkpoint/" + 'acts_buf-' + str(self.buffer_index), self.acts_buf) 86 | np.save(self.opt.save_dir + "/checkpoint/" + 'rews_buf-' + str(self.buffer_index), self.rews_buf) 87 | np.save(self.opt.save_dir + "/checkpoint/" + 'done_buf-' + str(self.buffer_index), self.done_buf) 88 | buffer_infos = np.array((self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps)) 89 | np.save(self.opt.save_dir + "/checkpoint/" + 'buffer_infos-' + str(self.buffer_index), buffer_infos) 90 | print("****** buffer " + str(self.buffer_index) + " saved! 
******") 91 | 92 | def load(self, checkpoint_path): 93 | if not checkpoint_path: 94 | checkpoint_path = self.opt.save_dir + "/checkpoint" 95 | 96 | self.obs1_buf = np.load(checkpoint_path + '/obs1_buf-' + str(self.buffer_index) + '.npy') 97 | self.obs2_buf = np.load(checkpoint_path + '/obs2_buf-' + str(self.buffer_index) + '.npy') 98 | self.acts_buf = np.load(checkpoint_path + '/acts_buf-' + str(self.buffer_index) + '.npy') 99 | self.rews_buf = np.load(checkpoint_path + '/rews_buf-' + str(self.buffer_index) + '.npy') 100 | self.done_buf = np.load(checkpoint_path + '/done_buf-' + str(self.buffer_index) + '.npy') 101 | buffer_infos = np.load(checkpoint_path + '/buffer_infos-' + str(self.buffer_index) + '.npy') 102 | 103 | self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps = buffer_infos[0], buffer_infos[1], \ 104 | buffer_infos[2], buffer_infos[3], \ 105 | buffer_infos[4] 106 | print("****** buffer number " + str(self.buffer_index) + " restored! ******") 107 | print("****** buffer number " + str(self.buffer_index) + " infos:", self.ptr, self.size, self.max_size, 108 | self.actor_steps, self.learner_steps) 109 | 110 | 111 | @ray.remote(num_cpus=2) 112 | class ParameterServer: 113 | def __init__(self, opt, weights_file, checkpoint_path, ps_index): 114 | # each node will have a Parameter Server 115 | 116 | self.opt = opt 117 | self.learner_step = 0 118 | net = Learner(opt, job="ps") 119 | keys, values = net.get_weights() 120 | 121 | # --- make dir for all nodes and save parameters --- 122 | try: 123 | os.makedirs(opt.save_dir) 124 | os.makedirs(opt.save_dir + '/checkpoint') 125 | except OSError: 126 | pass 127 | all_parameters = copy.deepcopy(vars(opt)) 128 | all_parameters["obs_space"] = "" 129 | all_parameters["act_space"] = "" 130 | with open(opt.save_dir + "/" + 'All_Parameters.json', 'w') as fp: 131 | json.dump(all_parameters, fp, indent=4, sort_keys=True) 132 | # --- end --- 133 | 134 | self.weights = None 135 | 136 | if not checkpoint_path: 137 | checkpoint_path = opt.save_dir + "/checkpoint" 138 | 139 | if opt.recover: 140 | with open(checkpoint_path + "/checkpoint_weights.pickle", "rb") as pickle_in: 141 | self.weights = pickle.load(pickle_in) 142 | print("****** weights restored! ******") 143 | 144 | if weights_file: 145 | try: 146 | with open(weights_file, "rb") as pickle_in: 147 | self.weights = pickle.load(pickle_in) 148 | print("****** weights restored! ******") 149 | except: 150 | print("------------------------------------------------") 151 | print(weights_file) 152 | print("------ error: weights file doesn't exist! 
------") 153 | exit() 154 | 155 | if not opt.recover and not weights_file: 156 | values = [value.copy() for value in values] 157 | self.weights = dict(zip(keys, values)) 158 | 159 | def push(self, keys, values): 160 | values = [value.copy() for value in values] 161 | for key, value in zip(keys, values): 162 | self.weights[key] = value 163 | self.learner_step += opt.push_freq 164 | 165 | def pull(self, keys): 166 | return [self.weights[key] for key in keys] 167 | 168 | def get_weights(self): 169 | return copy.deepcopy(self.weights) 170 | 171 | # save weights to disk 172 | def save_weights(self): 173 | with open(self.opt.save_dir + "/checkpoint/" + "checkpoint_weights.pickle", "wb") as pickle_out: 174 | pickle.dump(self.weights, pickle_out) 175 | 176 | 177 | class Cache(object): 178 | 179 | def __init__(self, node_buffer): 180 | # cache for training data and model weights 181 | print('os.pid:', os.getpid()) 182 | self.node_buffer = node_buffer 183 | self.q1 = multiprocessing.Queue(12) 184 | self.q2 = multiprocessing.Queue(5) 185 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.node_buffer)) 186 | self.p1.daemon = True 187 | 188 | def ps_update(self, q1, q2, node_buffer): 189 | print('os.pid of put_data():', os.getpid()) 190 | 191 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 192 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 193 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 194 | 195 | while True: 196 | if q1.qsize() < 10: 197 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 198 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 199 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 200 | 201 | if not q2.empty(): 202 | keys, values = q2.get() 203 | [node_ps[i].push.remote(keys, values) for i in range(opt.num_nodes)] 204 | 205 | def start(self): 206 | self.p1.start() 207 | self.p1.join(10) 208 | 209 | def end(self): 210 | self.p1.terminate() 211 | 212 | 213 | @ray.remote(num_cpus=2, num_gpus=1, max_calls=1) 214 | def worker_train(ps, node_buffer, opt, learner_index): 215 | agent = Learner(opt, job="learner") 216 | keys = agent.get_weights()[0] 217 | weights = ray.get(ps.pull.remote(keys)) 218 | agent.set_weights(keys, weights) 219 | 220 | cache = Cache(node_buffer) 221 | 222 | cache.start() 223 | 224 | cnt = 1 225 | while True: 226 | batch = cache.q1.get() 227 | agent.train(batch, cnt) 228 | 229 | if cnt % opt.push_freq == 0: 230 | cache.q2.put(agent.get_weights()) 231 | cnt += 1 232 | 233 | 234 | @ray.remote 235 | def worker_rollout(ps, replay_buffer, opt, worker_index): 236 | agent = Actor(opt, job="worker") 237 | keys = agent.get_weights()[0] 238 | np.random.seed() 239 | 240 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 241 | sys.path.append(ROOT) 242 | from trading_env import TradingEnv, FrameStack 243 | # ------ env set up ------ 244 | # env = gym.make(opt.env_name) 245 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 246 | 247 | while True: 248 | 249 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 250 | 251 | weights = ray.get(ps.pull.remote(keys)) 252 | agent.set_weights(keys, weights) 253 | 254 | # for a_l_ratio control 255 | np.random.seed() 256 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 257 | last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote()) 258 | 259 | while True: 260 | 261 | # don't need to random sample action if load weights from local. 
262 | if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover: 263 | a = agent.get_action(o) 264 | else: 265 | a = env.action_space.sample() 266 | # Step the env 267 | o2, r, d, _ = env.step(a) 268 | 269 | ep_ret += r 270 | ep_len += 1 271 | 272 | # Ignore the "done" signal if it comes from hitting the time 273 | # horizon (that is, when it's an artificial terminal signal 274 | # that isn't based on the agent's state) 275 | # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d 276 | 277 | np.random.seed() 278 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 279 | replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index) 280 | 281 | o = o2 282 | 283 | # End of episode. Training (ep_len times). 284 | # if d or (ep_len * opt.action_repeat >= opt.max_ep_len): 285 | if d: 286 | break 287 | 288 | 289 | @ray.remote 290 | def worker_test(ps, node_buffer, opt): 291 | 292 | agent = Actor(opt, job="test") 293 | keys = agent.get_weights()[0] 294 | 295 | # test_env = gym.make(opt.env_name) 296 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 297 | sys.path.append(ROOT) 298 | from trading_env import TradingEnv, FrameStack 299 | test_env = TradingEnv(action_scheme_id=3, obs_dim=38) 300 | 301 | init_time = time.time() 302 | save_times = 0 303 | checkpoint_times = 0 304 | 305 | while True: 306 | # weights_all for save it to local 307 | weights_all = ray.get(ps.get_weights.remote()) 308 | weights = [weights_all[key] for key in keys] 309 | agent.set_weights(keys, weights) 310 | 311 | start_actor_step, start_learner_step, _ = get_al_status(node_buffer) 312 | start_time = time.time() 313 | 314 | ave_test_reward, ave_score = agent.test(test_env, 10) 315 | 316 | last_actor_step, last_learner_step, _ = get_al_status(node_buffer) 317 | actor_step = np.sum(last_actor_step) - np.sum(start_actor_step) 318 | learner_step = np.sum(last_learner_step) - np.sum(start_learner_step) 319 | alratio = actor_step / (learner_step + 1) 320 | update_frequency = int(learner_step / (time.time() - start_time)) 321 | total_learner_step = np.sum(last_learner_step) 322 | 323 | print("---------------------------------------------------") 324 | print("average test reward:", ave_test_reward) 325 | print("average test score:", ave_score) 326 | print("frame freq:", np.round((last_actor_step - start_actor_step) / (time.time() - start_time))) 327 | print("actor_steps:", np.sum(last_actor_step), "learner_step:", total_learner_step) 328 | print("actor leaner ratio: %.2f" % alratio) 329 | print("learner freq:", update_frequency) 330 | print("Ray total resources:", ray.cluster_resources()) 331 | print("available resources:", ray.available_resources()) 332 | print("---------------------------------------------------") 333 | if learner_step < 100: 334 | alratio = 0 335 | agent.write_tb(ave_test_reward, ave_score, alratio, update_frequency, total_learner_step) 336 | 337 | total_time = time.time() - init_time 338 | 339 | if total_learner_step // opt.save_interval > save_times: 340 | with open(opt.save_dir + "/" + str(total_learner_step / 1e6) + "M_" + str(ave_test_reward) + "_weights.pickle", "wb") as pickle_out: 341 | pickle.dump(weights_all, pickle_out) 342 | print("****** Weights saved by time! 
******") 343 | save_times = total_learner_step // opt.save_interval 344 | 345 | # save everything every checkpoint_freq s 346 | if total_time // opt.checkpoint_freq > checkpoint_times: 347 | print("save everything!") 348 | save_start_time = time.time() 349 | 350 | ps_save_op = [node_ps[i].save_weights.remote() for i in range(opt.num_nodes)] 351 | buffer_save_op = [node_buffer[node_index][i].save.remote() for i in range(opt.num_buffers) for node_index in range(opt.num_nodes)] 352 | ray.wait(buffer_save_op + ps_save_op, num_returns=opt.num_nodes*opt.num_buffers + 1) 353 | 354 | print("total time for saving :", time.time() - save_start_time) 355 | checkpoint_times = total_time // opt.checkpoint_freq 356 | 357 | 358 | def get_al_status(node_buffer): 359 | 360 | buffer_learner_step = [] 361 | buffer_actor_step = [] 362 | buffer_cur_size = [] 363 | 364 | for node_index in range(opt.num_nodes): 365 | for i in range(opt.num_buffers): 366 | learner_step, actor_step, cur_size = ray.get(node_buffer[node_index][i].get_counts.remote()) 367 | buffer_learner_step.append(learner_step) 368 | buffer_actor_step.append(actor_step) 369 | buffer_cur_size.append(cur_size) 370 | 371 | return np.array(buffer_actor_step), np.array(buffer_learner_step), np.array(buffer_cur_size) 372 | 373 | 374 | if __name__ == '__main__': 375 | 376 | # ray.init() 377 | ray.init(resources={"node0": 256}) 378 | 379 | # env = gym.make(FLAGS.env_name) 380 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 381 | 382 | # ------ HyperParameters ------ 383 | opt = HyperParameters(env, FLAGS.env_name, FLAGS.exp_name, FLAGS.num_nodes, FLAGS.num_workers, FLAGS.a_l_ratio, FLAGS.weights_file) 384 | 385 | if FLAGS.recover: 386 | opt.recover = True 387 | # ------ end ------ 388 | 389 | node_ps = [] 390 | node_buffer = [] 391 | 392 | for node_index in range(FLAGS.num_nodes): 393 | 394 | # ------ Parameter Server (ray actor) ------ 395 | # create model to get weights and create a parameter server 396 | node_ps.append(ParameterServer._remote(args=[opt, FLAGS.weights_file, FLAGS.checkpoint_path, node_index], resources={"node"+str(node_index): 1})) 397 | print(f"Node{node_index} Parameter Server all set.") 398 | # ------ Parameter Server end ------ 399 | 400 | # ------ Experience buffer (ray actor) ------ 401 | node_buffer.append([ReplayBuffer._remote(args=[opt, i+node_index*opt.num_buffers], resources={"node"+str(node_index): 1}) for i in range(opt.num_buffers)]) 402 | 403 | if FLAGS.recover: 404 | buffer_load_op = [node_buffer[node_index][i].load.remote(FLAGS.checkpoint_path) for i in range(opt.num_buffers)] 405 | ray.wait(buffer_load_op, num_returns=opt.num_buffers) 406 | print(f"Node{node_index} Experience buffer all set.") 407 | # ------ Experience buffer end ------ 408 | 409 | # ------ roll out worker (ray task) ------ 410 | for i in range(FLAGS.num_workers): 411 | worker_rollout._remote(args=[node_ps[node_index], node_buffer[node_index], opt, i+node_index*FLAGS.num_workers], resources={"node"+str(node_index): 1}) 412 | time.sleep(0.19) 413 | 414 | print(f"Node{node_index} roll out worker all up.") 415 | # ------ roll out worker end ------ 416 | 417 | print(f"num of ps up: {len(node_ps)}, num of buffer up: {len(node_buffer)*len(node_buffer[0])}") 418 | 419 | print("Ray total resources:", ray.cluster_resources()) 420 | print("available resources:", ray.available_resources()) 421 | 422 | # --- save nodes info --- 423 | nodes_info = { 424 | "node_buffer": np.array(node_buffer), 425 | "num_nodes": opt.num_nodes, 426 | "num_buffers": opt.num_buffers 
427 | } 428 | f_name = './nodes_info.pickle' 429 | with open(f_name, "wb") as pickle_out: 430 | pickle.dump(nodes_info, pickle_out) 431 | print("****** save nodes_info ******") 432 | # --- end --- 433 | 434 | # control learner start time 435 | if not opt.recover: 436 | 437 | start_time = time.time() 438 | 439 | total_cur_size = 0 440 | while total_cur_size < opt.start_steps: 441 | 442 | buffer_actor_step, buffer_learner_step, buffer_cur_size = get_al_status(node_buffer) 443 | total_cur_size = np.sum(buffer_cur_size) 444 | 445 | print("---------------------------------------------------") 446 | print("learner_step:", buffer_learner_step, "actor_steps:", buffer_actor_step) 447 | print("frame freq:", np.round(buffer_actor_step/(time.time()-start_time))) 448 | print("total frame freq:", int(np.sum(buffer_actor_step)/(time.time()-start_time))) 449 | print('start steps before learning:', total_cur_size, '/', opt.start_steps) 450 | print("Ray total resources:", ray.cluster_resources()) 451 | print("available resources:", ray.available_resources()) 452 | print("---------------------------------------------------") 453 | time.sleep(10) 454 | else: 455 | time.sleep(0.0) 456 | 457 | # ------ learner ------ 458 | task_train = worker_train._remote(args=[node_ps[0], node_buffer, opt, 0], resources={"node0": 1}) 459 | # ------ learner end ------ 460 | 461 | task_test = worker_test.remote(node_ps[0], node_buffer, opt) 462 | ray.wait([task_test]) 463 | -------------------------------------------------------------------------------- /algos/sqn/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import ray 6 | 7 | import os 8 | import sys 9 | 10 | from hyperparams import HyperParameters 11 | from actor_learner import Actor, Learner 12 | 13 | import os 14 | import pickle 15 | import multiprocessing 16 | import copy 17 | import json 18 | 19 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 20 | sys.path.append(ROOT) 21 | from trading_env import TradingEnv, FrameStack 22 | 23 | 24 | flags = tf.app.flags 25 | FLAGS = tf.app.flags.FLAGS 26 | 27 | flags.DEFINE_string("env_name", "Trading", "game env") 28 | flags.DEFINE_string("exp_name", "sqn-trading", "experiments name") 29 | flags.DEFINE_integer("num_nodes", 1, "number of nodes") 30 | flags.DEFINE_integer("num_workers", 12, "number of workers") 31 | flags.DEFINE_string("weights_file", "", "empty means False.") 32 | flags.DEFINE_float("a_l_ratio", 10, "actor_steps / learner_steps") 33 | flags.DEFINE_bool("recover", False, "back training from last checkpoint") 34 | flags.DEFINE_string("checkpoint_path", "", "empty means opt.save_dir. ") 35 | 36 | 37 | @ray.remote(num_cpus=2) 38 | class ReplayBuffer: 39 | """ 40 | A simple FIFO experience replay buffer for SQN_N_STEP agents. 
41 | """ 42 | 43 | def __init__(self, opt, buffer_index): 44 | self.opt = opt 45 | self.buffer_index = buffer_index 46 | self.obs1_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 47 | self.obs2_buf = np.zeros([opt.buffer_size, opt.obs_dim], dtype=np.float32) 48 | self.acts_buf = np.zeros(opt.buffer_size, dtype=np.float32) 49 | self.rews_buf = np.zeros(opt.buffer_size, dtype=np.float32) 50 | self.done_buf = np.zeros(opt.buffer_size, dtype=np.float32) 51 | self.ptr, self.size, self.max_size = 0, 0, opt.buffer_size 52 | self.actor_steps, self.learner_steps = 0, 0 53 | 54 | def store(self, obs, act, rew, next_obs, done, worker_index): 55 | 56 | self.obs1_buf[self.ptr] = obs 57 | self.obs2_buf[self.ptr] = next_obs 58 | self.acts_buf[self.ptr] = act 59 | self.rews_buf[self.ptr] = rew 60 | self.done_buf[self.ptr] = done 61 | 62 | self.ptr = (self.ptr + 1) % self.max_size 63 | self.size = min(self.size + 1, self.max_size) 64 | self.actor_steps += 1 65 | 66 | def sample_batch(self): 67 | idxs = np.random.randint(0, self.size, size=self.opt.batch_size) 68 | self.learner_steps += 1 69 | return dict(obs1=self.obs1_buf[idxs], 70 | obs2=self.obs2_buf[idxs], 71 | acts=self.acts_buf[idxs], 72 | rews=self.rews_buf[idxs], 73 | done=self.done_buf[idxs]) 74 | 75 | def get_counts(self): 76 | return self.learner_steps, self.actor_steps, self.size 77 | 78 | # debug 79 | def show(self): 80 | return self.obs1_buf, self.ptr, self.size, self.max_size 81 | 82 | def save(self): 83 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs1_buf-' + str(self.buffer_index), self.obs1_buf) 84 | np.save(self.opt.save_dir + "/checkpoint/" + 'obs2_buf-' + str(self.buffer_index), self.obs2_buf) 85 | np.save(self.opt.save_dir + "/checkpoint/" + 'acts_buf-' + str(self.buffer_index), self.acts_buf) 86 | np.save(self.opt.save_dir + "/checkpoint/" + 'rews_buf-' + str(self.buffer_index), self.rews_buf) 87 | np.save(self.opt.save_dir + "/checkpoint/" + 'done_buf-' + str(self.buffer_index), self.done_buf) 88 | buffer_infos = np.array((self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps)) 89 | np.save(self.opt.save_dir + "/checkpoint/" + 'buffer_infos-' + str(self.buffer_index), buffer_infos) 90 | print("****** buffer " + str(self.buffer_index) + " saved! ******") 91 | 92 | def load(self, checkpoint_path): 93 | if not checkpoint_path: 94 | checkpoint_path = self.opt.save_dir + "/checkpoint" 95 | 96 | self.obs1_buf = np.load(checkpoint_path + '/obs1_buf-' + str(self.buffer_index) + '.npy') 97 | self.obs2_buf = np.load(checkpoint_path + '/obs2_buf-' + str(self.buffer_index) + '.npy') 98 | self.acts_buf = np.load(checkpoint_path + '/acts_buf-' + str(self.buffer_index) + '.npy') 99 | self.rews_buf = np.load(checkpoint_path + '/rews_buf-' + str(self.buffer_index) + '.npy') 100 | self.done_buf = np.load(checkpoint_path + '/done_buf-' + str(self.buffer_index) + '.npy') 101 | buffer_infos = np.load(checkpoint_path + '/buffer_infos-' + str(self.buffer_index) + '.npy') 102 | 103 | self.ptr, self.size, self.max_size, self.actor_steps, self.learner_steps = buffer_infos[0], buffer_infos[1], \ 104 | buffer_infos[2], buffer_infos[3], \ 105 | buffer_infos[4] 106 | print("****** buffer number " + str(self.buffer_index) + " restored! 
******") 107 | print("****** buffer number " + str(self.buffer_index) + " infos:", self.ptr, self.size, self.max_size, 108 | self.actor_steps, self.learner_steps) 109 | 110 | 111 | @ray.remote(num_cpus=2) 112 | class ParameterServer: 113 | def __init__(self, opt, weights_file, checkpoint_path, ps_index): 114 | # each node will have a Parameter Server 115 | 116 | self.opt = opt 117 | self.learner_step = 0 118 | net = Learner(opt, job="ps") 119 | keys, values = net.get_weights() 120 | 121 | # --- make dir for all nodes and save parameters --- 122 | try: 123 | os.makedirs(opt.save_dir) 124 | os.makedirs(opt.save_dir + '/checkpoint') 125 | except OSError: 126 | pass 127 | all_parameters = copy.deepcopy(vars(opt)) 128 | all_parameters["obs_space"] = "" 129 | all_parameters["act_space"] = "" 130 | with open(opt.save_dir + "/" + 'All_Parameters.json', 'w') as fp: 131 | json.dump(all_parameters, fp, indent=4, sort_keys=True) 132 | # --- end --- 133 | 134 | self.weights = None 135 | 136 | if not checkpoint_path: 137 | checkpoint_path = opt.save_dir + "/checkpoint" 138 | 139 | if opt.recover: 140 | with open(checkpoint_path + "/checkpoint_weights.pickle", "rb") as pickle_in: 141 | self.weights = pickle.load(pickle_in) 142 | print("****** weights restored! ******") 143 | 144 | if weights_file: 145 | try: 146 | with open(weights_file, "rb") as pickle_in: 147 | self.weights = pickle.load(pickle_in) 148 | print("****** weights restored! ******") 149 | except: 150 | print("------------------------------------------------") 151 | print(weights_file) 152 | print("------ error: weights file doesn't exist! ------") 153 | exit() 154 | 155 | if not opt.recover and not weights_file: 156 | values = [value.copy() for value in values] 157 | self.weights = dict(zip(keys, values)) 158 | 159 | def push(self, keys, values): 160 | values = [value.copy() for value in values] 161 | for key, value in zip(keys, values): 162 | self.weights[key] = value 163 | self.learner_step += opt.push_freq 164 | 165 | def pull(self, keys): 166 | return [self.weights[key] for key in keys] 167 | 168 | def get_weights(self): 169 | return copy.deepcopy(self.weights) 170 | 171 | # save weights to disk 172 | def save_weights(self): 173 | with open(self.opt.save_dir + "/checkpoint/" + "checkpoint_weights.pickle", "wb") as pickle_out: 174 | pickle.dump(self.weights, pickle_out) 175 | 176 | 177 | class Cache(object): 178 | 179 | def __init__(self, node_buffer): 180 | # cache for training data and model weights 181 | print('os.pid:', os.getpid()) 182 | self.node_buffer = node_buffer 183 | self.q1 = multiprocessing.Queue(12) 184 | self.q2 = multiprocessing.Queue(5) 185 | self.p1 = multiprocessing.Process(target=self.ps_update, args=(self.q1, self.q2, self.node_buffer)) 186 | self.p1.daemon = True 187 | 188 | def ps_update(self, q1, q2, node_buffer): 189 | print('os.pid of put_data():', os.getpid()) 190 | 191 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 192 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 193 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 194 | 195 | while True: 196 | if q1.qsize() < 10: 197 | node_idx = np.random.choice(opt.num_nodes, 1)[0] 198 | buffer_idx = np.random.choice(opt.num_buffers, 1)[0] 199 | q1.put(copy.deepcopy(ray.get(node_buffer[node_idx][buffer_idx].sample_batch.remote()))) 200 | 201 | if not q2.empty(): 202 | keys, values = q2.get() 203 | [node_ps[i].push.remote(keys, values) for i in range(opt.num_nodes)] 204 | 205 | def start(self): 206 | self.p1.start() 207 
| self.p1.join(10) 208 | 209 | def end(self): 210 | self.p1.terminate() 211 | 212 | 213 | @ray.remote(num_cpus=2, num_gpus=1, max_calls=1) 214 | def worker_train(ps, node_buffer, opt, learner_index): 215 | agent = Learner(opt, job="learner") 216 | keys = agent.get_weights()[0] 217 | weights = ray.get(ps.pull.remote(keys)) 218 | agent.set_weights(keys, weights) 219 | 220 | cache = Cache(node_buffer) 221 | 222 | cache.start() 223 | 224 | cnt = 1 225 | while True: 226 | batch = cache.q1.get() 227 | agent.train(batch, cnt) 228 | 229 | if cnt % opt.push_freq == 0: 230 | cache.q2.put(agent.get_weights()) 231 | cnt += 1 232 | 233 | 234 | @ray.remote 235 | def worker_rollout(ps, replay_buffer, opt, worker_index): 236 | agent = Actor(opt, job="worker") 237 | keys = agent.get_weights()[0] 238 | np.random.seed() 239 | 240 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 241 | sys.path.append(ROOT) 242 | from trading_env import TradingEnv, FrameStack 243 | # ------ env set up ------ 244 | # env = gym.make(opt.env_name) 245 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 246 | 247 | while True: 248 | 249 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 250 | 251 | weights = ray.get(ps.pull.remote(keys)) 252 | agent.set_weights(keys, weights) 253 | 254 | # for a_l_ratio control 255 | np.random.seed() 256 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 257 | last_learner_steps, last_actor_steps, _size = ray.get(replay_buffer[rand_buff].get_counts.remote()) 258 | 259 | while True: 260 | 261 | # don't need to random sample action if load weights from local. 262 | if last_actor_steps * opt.num_buffers > opt.start_steps or opt.recover: 263 | a = agent.get_action(o) 264 | else: 265 | a = env.action_space.sample() 266 | # Step the env 267 | o2, r, d, _ = env.step(a) 268 | 269 | ep_ret += r 270 | ep_len += 1 271 | 272 | # Ignore the "done" signal if it comes from hitting the time 273 | # horizon (that is, when it's an artificial terminal signal 274 | # that isn't based on the agent's state) 275 | # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d 276 | 277 | np.random.seed() 278 | rand_buff = np.random.choice(opt.num_buffers, 1)[0] 279 | replay_buffer[rand_buff].store.remote(o, a, r, o2, d, worker_index) 280 | 281 | o = o2 282 | 283 | # End of episode. Training (ep_len times). 
284 | # if d or (ep_len * opt.action_repeat >= opt.max_ep_len): 285 | if d: 286 | break 287 | 288 | 289 | @ray.remote 290 | def worker_test(ps, node_buffer, opt): 291 | 292 | agent = Actor(opt, job="test") 293 | keys = agent.get_weights()[0] 294 | 295 | # test_env = gym.make(opt.env_name) 296 | ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 297 | sys.path.append(ROOT) 298 | from trading_env import TradingEnv, FrameStack 299 | test_env = TradingEnv(action_scheme_id=3, obs_dim=38) 300 | 301 | init_time = time.time() 302 | save_times = 0 303 | checkpoint_times = 0 304 | 305 | while True: 306 | # weights_all for save it to local 307 | weights_all = ray.get(ps.get_weights.remote()) 308 | weights = [weights_all[key] for key in keys] 309 | agent.set_weights(keys, weights) 310 | 311 | start_actor_step, start_learner_step, _ = get_al_status(node_buffer) 312 | start_time = time.time() 313 | 314 | ave_test_reward, ave_score = agent.test(test_env, 10) 315 | 316 | last_actor_step, last_learner_step, _ = get_al_status(node_buffer) 317 | actor_step = np.sum(last_actor_step) - np.sum(start_actor_step) 318 | learner_step = np.sum(last_learner_step) - np.sum(start_learner_step) 319 | alratio = actor_step / (learner_step + 1) 320 | update_frequency = int(learner_step / (time.time() - start_time)) 321 | total_learner_step = np.sum(last_learner_step) 322 | 323 | print("---------------------------------------------------") 324 | print("average test reward:", ave_test_reward) 325 | print("average test score:", ave_score) 326 | print("frame freq:", np.round((last_actor_step - start_actor_step) / (time.time() - start_time))) 327 | print("actor_steps:", np.sum(last_actor_step), "learner_step:", total_learner_step) 328 | print("actor leaner ratio: %.2f" % alratio) 329 | print("learner freq:", update_frequency) 330 | print("Ray total resources:", ray.cluster_resources()) 331 | print("available resources:", ray.available_resources()) 332 | print("---------------------------------------------------") 333 | if learner_step < 100: 334 | alratio = 0 335 | agent.write_tb(ave_test_reward, ave_score, alratio, update_frequency, total_learner_step) 336 | 337 | total_time = time.time() - init_time 338 | 339 | if total_learner_step // opt.save_interval > save_times: 340 | with open(opt.save_dir + "/" + str(total_learner_step / 1e6) + "M_" + str(ave_test_reward) + "_weights.pickle", "wb") as pickle_out: 341 | pickle.dump(weights_all, pickle_out) 342 | print("****** Weights saved by time! 
******") 343 | save_times = total_learner_step // opt.save_interval 344 | 345 | # save everything every checkpoint_freq s 346 | if total_time // opt.checkpoint_freq > checkpoint_times: 347 | print("save everything!") 348 | save_start_time = time.time() 349 | 350 | ps_save_op = [node_ps[i].save_weights.remote() for i in range(opt.num_nodes)] 351 | buffer_save_op = [node_buffer[node_index][i].save.remote() for i in range(opt.num_buffers) for node_index in range(opt.num_nodes)] 352 | ray.wait(buffer_save_op + ps_save_op, num_returns=opt.num_nodes*opt.num_buffers + 1) 353 | 354 | print("total time for saving :", time.time() - save_start_time) 355 | checkpoint_times = total_time // opt.checkpoint_freq 356 | 357 | 358 | def get_al_status(node_buffer): 359 | 360 | buffer_learner_step = [] 361 | buffer_actor_step = [] 362 | buffer_cur_size = [] 363 | 364 | for node_index in range(opt.num_nodes): 365 | for i in range(opt.num_buffers): 366 | learner_step, actor_step, cur_size = ray.get(node_buffer[node_index][i].get_counts.remote()) 367 | buffer_learner_step.append(learner_step) 368 | buffer_actor_step.append(actor_step) 369 | buffer_cur_size.append(cur_size) 370 | 371 | return np.array(buffer_actor_step), np.array(buffer_learner_step), np.array(buffer_cur_size) 372 | 373 | 374 | if __name__ == '__main__': 375 | 376 | # ray.init() 377 | ray.init(resources={"node0": 256}) 378 | 379 | # env = gym.make(FLAGS.env_name) 380 | env = TradingEnv(action_scheme_id=3, obs_dim=38) 381 | 382 | # ------ HyperParameters ------ 383 | opt = HyperParameters(env, FLAGS.env_name, FLAGS.exp_name, FLAGS.num_nodes, FLAGS.num_workers, FLAGS.a_l_ratio, FLAGS.weights_file) 384 | 385 | if FLAGS.recover: 386 | opt.recover = True 387 | # ------ end ------ 388 | 389 | node_ps = [] 390 | node_buffer = [] 391 | 392 | for node_index in range(FLAGS.num_nodes): 393 | 394 | # ------ Parameter Server (ray actor) ------ 395 | # create model to get weights and create a parameter server 396 | node_ps.append(ParameterServer._remote(args=[opt, FLAGS.weights_file, FLAGS.checkpoint_path, node_index], resources={"node"+str(node_index): 1})) 397 | print(f"Node{node_index} Parameter Server all set.") 398 | # ------ Parameter Server end ------ 399 | 400 | # ------ Experience buffer (ray actor) ------ 401 | node_buffer.append([ReplayBuffer._remote(args=[opt, i+node_index*opt.num_buffers], resources={"node"+str(node_index): 1}) for i in range(opt.num_buffers)]) 402 | 403 | if FLAGS.recover: 404 | buffer_load_op = [node_buffer[node_index][i].load.remote(FLAGS.checkpoint_path) for i in range(opt.num_buffers)] 405 | ray.wait(buffer_load_op, num_returns=opt.num_buffers) 406 | print(f"Node{node_index} Experience buffer all set.") 407 | # ------ Experience buffer end ------ 408 | 409 | # ------ roll out worker (ray task) ------ 410 | for i in range(FLAGS.num_workers): 411 | worker_rollout._remote(args=[node_ps[node_index], node_buffer[node_index], opt, i+node_index*FLAGS.num_workers], resources={"node"+str(node_index): 1}) 412 | time.sleep(0.19) 413 | 414 | print(f"Node{node_index} roll out worker all up.") 415 | # ------ roll out worker end ------ 416 | 417 | print(f"num of ps up: {len(node_ps)}, num of buffer up: {len(node_buffer)*len(node_buffer[0])}") 418 | 419 | print("Ray total resources:", ray.cluster_resources()) 420 | print("available resources:", ray.available_resources()) 421 | 422 | # --- save nodes info --- 423 | nodes_info = { 424 | "node_buffer": np.array(node_buffer), 425 | "num_nodes": opt.num_nodes, 426 | "num_buffers": opt.num_buffers 
427 | } 428 | f_name = './nodes_info.pickle' 429 | with open(f_name, "wb") as pickle_out: 430 | pickle.dump(nodes_info, pickle_out) 431 | print("****** nodes_info saved ******") 432 | # --- end --- 433 | 434 | # hold the learner until the buffers contain opt.start_steps transitions 435 | if not opt.recover: 436 | 437 | start_time = time.time() 438 | 439 | total_cur_size = 0 440 | while total_cur_size < opt.start_steps: 441 | 442 | buffer_actor_step, buffer_learner_step, buffer_cur_size = get_al_status(node_buffer) 443 | total_cur_size = np.sum(buffer_cur_size) 444 | 445 | print("---------------------------------------------------") 446 | print("learner_step:", buffer_learner_step, "actor_steps:", buffer_actor_step) 447 | print("frame freq:", np.round(buffer_actor_step/(time.time()-start_time))) 448 | print("total frame freq:", int(np.sum(buffer_actor_step)/(time.time()-start_time))) 449 | print('start steps before learning:', total_cur_size, '/', opt.start_steps) 450 | print("Ray total resources:", ray.cluster_resources()) 451 | print("available resources:", ray.available_resources()) 452 | print("---------------------------------------------------") 453 | time.sleep(10) 454 | else: 455 | time.sleep(0.0)  # buffers were reloaded from the checkpoint, no warm-up needed 456 | 457 | # ------ learner ------ 458 | task_train = worker_train._remote(args=[node_ps[0], node_buffer, opt, 0], resources={"node0": 1}) 459 | # ------ learner end ------ 460 | 461 | task_test = worker_test.remote(node_ps[0], node_buffer, opt) 462 | ray.wait([task_test]) 463 | --------------------------------------------------------------------------------
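
The `algos/sqn/train.py` file above wires four pieces together through Ray: `ParameterServer` actors that hold the network weights, `ReplayBuffer` actors that store transitions, `worker_rollout` tasks that fill the buffers, and a single `worker_train` task that samples batches and pushes updated weights back to every parameter server. The sketch below is a deliberately stripped-down illustration of that data flow only; it reuses the names from `train.py` but replaces the TensorFlow model with a plain NumPy vector and the trading environment with a stub random walk, so it is an assumption-laden toy, not the repository's training code. The real script is launched with the `tf.app.flags` defined at its top, e.g. something like `python algos/sqn/train.py --num_nodes=1 --num_workers=12 --a_l_ratio=10`; for more than one node, the custom resources `node0`, `node1`, ... that it requests would presumably have to be provided when starting Ray on each machine (e.g. `ray start --resources='{"node1": 256}'`) rather than through the single-machine `ray.init(resources={"node0": 256})` call at the bottom of the file.

# --- minimal data-flow sketch (illustrative only, not part of this repo) ---
import time

import numpy as np
import ray

ray.init()


@ray.remote
class ParameterServer:
    """Holds the latest weights; the learner pushes, rollout workers pull."""

    def __init__(self, dim):
        self.weights = np.zeros(dim, dtype=np.float32)

    def push(self, weights):
        self.weights = np.array(weights, dtype=np.float32)

    def pull(self):
        return self.weights


@ray.remote
class ReplayBuffer:
    """FIFO buffer of (obs, act, rew, next_obs, done) tuples."""

    def __init__(self, max_size):
        self.storage, self.max_size = [], max_size

    def store(self, transition):
        self.storage.append(transition)
        if len(self.storage) > self.max_size:
            self.storage.pop(0)

    def sample_batch(self, batch_size):
        idxs = np.random.randint(0, len(self.storage), size=batch_size)
        return [self.storage[i] for i in idxs]

    def get_counts(self):
        return len(self.storage)


@ray.remote
def worker_rollout(ps, replay_buffer, steps):
    # Stub environment: a 1-D random walk with a random "policy"; the pull
    # only illustrates where train.py fetches weights before acting.
    o = 0.0
    for _ in range(steps):
        _ = ray.get(ps.pull.remote())
        a = np.random.randint(2)
        o2 = o + np.random.randn()
        replay_buffer.store.remote((o, a, 0.0, o2, False))
        o = o2


@ray.remote
def worker_train(ps, replay_buffer, updates, batch_size=32, start_steps=100):
    for _ in range(updates):
        # wait for a warm-up phase, analogous to opt.start_steps in train.py
        while ray.get(replay_buffer.get_counts.remote()) < start_steps:
            time.sleep(0.1)
        _batch = ray.get(replay_buffer.sample_batch.remote(batch_size))
        w = ray.get(ps.pull.remote())
        ps.push.remote(w + 1e-3)  # placeholder for a real gradient step
    return updates


ps = ParameterServer.remote(dim=4)
replay_buffer = ReplayBuffer.remote(max_size=10_000)
rollouts = [worker_rollout.remote(ps, replay_buffer, 1_000) for _ in range(2)]
print("learner finished", ray.get(worker_train.remote(ps, replay_buffer, 50)), "updates")
ray.get(rollouts)
# --- end of sketch ---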