├── .gitignore ├── README.md ├── Simulator ├── data_model.py ├── simrnn_model.py ├── baseline_model.py ├── baseline_main.py ├── simrnn_main.py └── simrnn_cell.py └── RL ├── pre_train.py ├── exploration.py ├── ou_noise.py ├── replay_buffer.py ├── env.py ├── train_primal_dual.py ├── reward_critic_network.py ├── cost_critic_network.py ├── primal_dual_ddpg.py ├── actor_network.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | Material/ 3 | BoilerData/ 4 | .DS_Store 5 | 6 | Simulator/__pycache__/ 7 | Simulator/.idea/ 8 | Simulator/data/ 9 | Simulator/logs/ 10 | Simulator/.DS_Store 11 | 12 | 13 | RL/__pycache__/ 14 | RL/.idea/ 15 | RL/model/ 16 | RL/saved_actor/ 17 | RL/result/ 18 | RL/logs/ 19 | RL/.DS_Store 20 | 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepThermal: Combustion Optimization for Thermal Power Generating Units Using Offline Reinforcement Learning 2 | 3 | This is the code of the paper DeepThermal: Combustion Optimization for Thermal Power Generating Units Using Offline Reinforcement Learning accepted at AAAI'2022. The paper can be found [here](https://arxiv.org/abs/2102.11492). 4 | 5 | ### Usage 6 | The code of combustion simulator is in `Simulator/simrnn_model.py`, the code of model-based offline RL framework, MORE, is in `RL/primal_dual_ddpg.py`. 7 | 8 | 9 | ### Bibtex 10 | ``` 11 | @inproceedings{zhan2022deepthermal, 12 | title={Deepthermal: Combustion optimization for thermal power generating units using offline reinforcement learning}, 13 | author={Zhan, Xianyuan and Xu, Haoran and Zhang, Yue and Zhu, Xiangyu and Yin, Honglei and Zheng, Yu}, 14 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, 15 | pages={4680--4688}, 16 | year={2022} 17 | } 18 | ``` 19 | 20 | -------------------------------------------------------------------------------- /Simulator/data_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pandas as pd 4 | import random 5 | 6 | 7 | class BoilerDataSet(object): 8 | """ 9 | first run data_preparation.py to generate data.csv 10 | prepare boiler training and validation dataset 11 | simple version(small action dimension) 12 | 13 | """ 14 | def __init__(self, num_steps, val_ratio=0.1): 15 | self.num_steps = num_steps 16 | self.val_ratio = val_ratio 17 | 18 | # Read csv file 19 | self.raw_seq = pd.read_csv(os.path.join("data", "sim_train.csv"), index_col='date') 20 | self.train_X, self.train_y, self.val_X, self.val_y = self._prepare_data(self.raw_seq) 21 | 22 | def _prepare_data(self, seq): 23 | # split into groups of num_steps 24 | X = np.array([seq.iloc[i: i + self.num_steps].values 25 | for i in range(len(seq) - self.num_steps)]) 26 | y = np.array([seq.ix[i + self.num_steps, 'A磨煤机料位':'1号机组下部水冷壁出口平均壁温'].values 27 | for i in range(len(seq) - self.num_steps)]) 28 | 29 | train_size = int(len(X) * (1.0 - self.val_ratio)) 30 | train_X, val_X = X[:train_size], X[train_size:] 31 | train_y, val_y = y[:train_size], y[train_size:] 32 | return train_X, train_y, val_X, val_y 33 | 34 | def generate_one_epoch(self, data_X, data_y, batch_size): 35 | num_batches = int(len(data_X)) // batch_size 36 | # if batch_size * num_batches < len(self.train_X): 37 | # num_batches += 1 38 | 39 | batch_indices = list(range(num_batches)) 40 | 
random.shuffle(batch_indices) 41 | for j in batch_indices: 42 | batch_X = data_X[j * batch_size: (j + 1) * batch_size] 43 | batch_y = data_y[j * batch_size: (j + 1) * batch_size] 44 | yield batch_X, batch_y 45 | 46 | -------------------------------------------------------------------------------- /RL/pre_train.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def pre_train_actor_network(agent, train_data, epochs=50, load_model=False): 4 | """ 5 | train critic network of agent 6 | data : from train_data_path (eg. origin data) 7 | """ 8 | input_config = InputConfig_RL() 9 | replay_buffer = agent.replay_buffer 10 | replay_buffer.read_from_csv(train_data) 11 | 12 | step = 0 13 | del_list = list(range(12, 24)) + list(range(36, 48)) 14 | for epoch in range(epochs): 15 | while 1: 16 | if replay_buffer.use_nums > replay_buffer.count(): 17 | replay_buffer.read_from_csv(train_data) 18 | step = 0 19 | break 20 | 21 | mini_batch = replay_buffer.get_batch(batch_size=input_config.batch_size) 22 | step += 1 23 | mini_batch = np.bmat(list(map(list, mini_batch))).A.flatten().reshape(-1, DONE_END) 24 | state_batch = mini_batch[:, :OUTER_END] 25 | action_batch = mini_batch[:, OUTER_END + 32:ACTION_END] 26 | for i in range(12): 27 | action_batch[:, i] = (action_batch[:, i] + action_batch[:, 23 - i]) / 2 28 | action_batch[:, 24 + i] = (action_batch[:, 24 + i] + action_batch[:, 47 - i]) / 2 29 | action_batch = np.delete(action_batch, del_list, axis=1) 30 | limit_batch = mini_batch[:, ACTION_END:LIMIT_LOAD_END] 31 | state_limit_batch = np.concatenate((state_batch, limit_batch), axis=1) 32 | 33 | mse, _ = agent.train_actor(state=state_limit_batch, action=action_batch) 34 | 35 | # display 36 | if step % 100 == 0: 37 | print(replay_buffer.use_nums) 38 | print('-----------------pretrain actor network-----------------') 39 | print('epoch = {} step = {} mse = {:.6f}'.format(epoch, step, mse)) -------------------------------------------------------------------------------- /RL/exploration.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class Exploration(object): 6 | 7 | def __init__(self, action_dim, kernel_num, sample_size): 8 | 9 | self.g = tf.Graph() 10 | with self.g.as_default(): 11 | # data format 12 | self.action_dim = action_dim 13 | self.mean = tf.placeholder(shape=[self.action_dim], dtype=tf.float32) 14 | self.stddev = tf.placeholder(shape=[self.action_dim], dtype=tf.float32) 15 | self.action = tf.placeholder(shape=[self.action_dim], dtype=tf.float32) 16 | self.weight = tf.placeholder(dtype=tf.float32) 17 | 18 | self.gaussian_exploration = None 19 | self.kernel_num = kernel_num 20 | self._sample_size = sample_size 21 | 22 | config = tf.ConfigProto(device_count={"CPU": self.kernel_num}, 23 | inter_op_parallelism_threads=0, 24 | intra_op_parallelism_threads=0, 25 | log_device_placement=True) 26 | self.sess = tf.Session(config=config, graph=self.g) 27 | 28 | # for sample_index in range(self._sample_size): 29 | # gaussian_noise = tf.random_normal(shape=[self.action_dim], mean=self.mean, stddev=self.stddev) 30 | # self.gaussian_exploration.append(self.action + self.weight * gaussian_noise) 31 | gaussian_noise = tf.random_normal(shape=[self.action_dim], mean=self.mean, stddev=self.stddev) 32 | self.gaussian_exploration = self.action + self.weight * gaussian_noise 33 | 34 | def get_gaussian_exploration(self, action, mean, stddev, weight=0.01): 35 | return 
self.sess.run(self.gaussian_exploration, feed_dict={self.action: action, 36 | self.mean: mean, 37 | self.stddev: stddev, 38 | self.weight: weight}) 39 | 40 | 41 | class Histogram(object): 42 | def __init__(self, csv_path): 43 | self.df = np.array(pd.read_csv(csv_path, header=None)).astype('float') 44 | self.threshold = self.df[:, -1] 45 | 46 | def get_probability(self, x): 47 | # print('value'+str(self.df[np.arange(len(x[:-1])).astype('int'), (x[:-1] * 20).astype('int')])) 48 | prob = np.array( 49 | self.df[np.arange(len(x[:-1])).astype('int'), (x[:-1] * 20).astype('int')[:]] > self.threshold).astype( 50 | 'int') 51 | return prob 52 | -------------------------------------------------------------------------------- /RL/ou_noise.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------- 2 | # Ornstein-Uhlenbeck Noise 3 | # Author: Flood Sung 4 | # Date: 2016.5.4 5 | # Reference: https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py 6 | # -------------------------------------- 7 | 8 | import numpy as np 9 | import numpy.random as nr 10 | 11 | 12 | class OUNoise: 13 | """docstring for OUNoise""" 14 | def __init__(self, action_dimension, mu=0.5, theta=0.4, sigma=0.2, weight_decay=0.9999): 15 | self.action_dimension = action_dimension 16 | self.mu = mu 17 | self.theta = theta 18 | self.sigma = sigma 19 | self.state = np.ones(self.action_dimension) * self.mu 20 | self.weight = 1 21 | self.weight_decay = weight_decay 22 | self.reset() 23 | 24 | def reset(self): 25 | self.state = np.ones(self.action_dimension) * self.mu 26 | 27 | def update_weight(self): 28 | self.weight *= self.weight_decay 29 | 30 | def noise(self): 31 | x = self.state 32 | dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x)) 33 | self.state = x + dx * self.weight 34 | return self.state 35 | 36 | 37 | 38 | # # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 39 | # class OrnsteinUhlenbeckActionNoise(ActionNoise): 40 | # def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None, weight_decay_factor=0.999): 41 | # self.theta = theta 42 | # self.mu = mu 43 | # self.sigma = sigma 44 | # self.dt = dt 45 | # self.x0 = x0 46 | # self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 47 | # 48 | # self.weight_decay_factor = weight_decay_factor 49 | # self.weight_decay = 1 50 | # 51 | # self.reset() 52 | # 53 | # def get_noise(self): 54 | # x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt( 55 | # self.dt) * np.random.normal(size=self.mu.shape) 56 | # self.x_prev = x 57 | # return x 58 | # 59 | # @property 60 | # def shape(self): 61 | # return self.mu.shape 62 | # 63 | # def reset(self): 64 | # self.weight_decay = 1 65 | # 66 | # def noise_decay(self): 67 | # self.weight_decay *= self.weight_decay_factor 68 | # 69 | # def __call__(self, action): 70 | # r = action + self.get_noise() * self.weight_decay 71 | # return r 72 | # 73 | # def __repr__(self): 74 | # return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={}, weight_decay_factor=)'.format(self.mu, self.sigma, 75 | # self.weight_decay_factor) 76 | 77 | 78 | if __name__ == '__main__': 79 | ou = OUNoise(3) 80 | states = [] 81 | for i in range(10000): 82 | ou.update_weight() 83 | states.append(ou.noise()) 84 | import matplotlib.pyplot as plt 85 | 86 | plt.plot(states) 87 | plt.show() -------------------------------------------------------------------------------- 
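# Minimal usage sketch of the OU exploration above: this mirrors how
# PrimalDualDDPG.noise_action in RL/primal_dual_ddpg.py perturbs the actor's
# output (the noise weight is decayed every 10 episodes, then the noise is
# added to the action). The zero vector here is a hypothetical stand-in for
# actor_network.action(state); 51 is the action_dim used in RL/train_primal_dual.py.
import numpy as np
from RL.ou_noise import OUNoise

ou = OUNoise(action_dimension=51)
for episode in range(100):
    if episode % 10 == 0:
        ou.update_weight()              # gradually shrink exploration
    action = np.zeros(51)               # stand-in for actor_network.action(state)
    noisy_action = action + ou.noise()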
/RL/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | import numpy as np 4 | from RL.util import compute_reward, compute_cost, compute_done 5 | 6 | 7 | class ReplayBuffer(object): 8 | """Using explorated data based on simulator""" 9 | def __init__(self, buffer_size): 10 | self.buffer_size = buffer_size 11 | self.num_experiences = 0 12 | self.buffer = deque() 13 | self.real_data = np.load('/Users/xhr/PycharmProjects/Boiler/Simulator/data/replay_buffer.npy') 14 | nums = len(self.real_data) 15 | self.num_indices = list(range(nums)) 16 | random.shuffle(self.num_indices) 17 | self.real_start_indice = 0 18 | 19 | def get_batch(self, batch_size): 20 | # Randomly sample batch_size examples 21 | return random.sample(self.buffer, batch_size) 22 | 23 | def get_real_batch(self, batch_size): 24 | return self.real_data[np.random.choice(self.real_data.shape[0], batch_size, replace=False), :] 25 | 26 | def size(self): 27 | return self.buffer_size 28 | 29 | def add(self, state, action, reward, cost, new_state, done, mix_ratio): 30 | experience = (state, action, reward, cost, new_state, done) 31 | if self.num_experiences < self.buffer_size: 32 | self.buffer.append(experience) 33 | for _ in range(mix_ratio): 34 | s, a, s_, done = self.generate_real() 35 | r = compute_reward(s) 36 | c = compute_cost(s) 37 | d = compute_done(s) 38 | e = (s, a, r, c, s_, d) 39 | # print('s-{}-a{}-ns{}'.format(s.shape, a.shape, s_.shape)) 40 | 41 | self.buffer.append(e) 42 | self.num_experiences += 1 43 | else: 44 | for _ in range(mix_ratio+1): 45 | self.buffer.popleft() 46 | self.buffer.append(experience) 47 | for _ in range(mix_ratio): 48 | s, a, s_, done = self.generate_real() 49 | r = compute_reward(s) 50 | c = compute_cost(s) 51 | d = compute_done(s) 52 | e = (s, a, r, c, s_, d) 53 | self.buffer.append(e) 54 | 55 | def generate_real(self): 56 | s = self.real_data[self.real_start_indice, :58] 57 | a = self.real_data[self.real_start_indice, 58:109] 58 | s_ = self.real_data[self.real_start_indice, 109:156] 59 | s_ = np.concatenate([s[:11], s_]) 60 | done = self.real_data[self.real_start_indice, -1] 61 | self.real_start_indice += 1 62 | if self.real_start_indice == len(self.real_data): 63 | self.real_start_indice = 0 64 | return s, a, s_, done 65 | 66 | def count(self): 67 | # if buffer is full, return buffer size 68 | # otherwise, return experience counter 69 | return self.num_experiences 70 | 71 | def erase(self): 72 | self.buffer = deque() 73 | self.num_experiences = 0 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /RL/env.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from collections import deque 4 | import random 5 | 6 | from Simulator.simrnn_model import RNNSimulatorModel 7 | from Simulator.simrnn_main import cell_config, FLAGS 8 | from RL.util import * 9 | 10 | OUTER_START_POS = 0 11 | OUTER_SIZE = 11 12 | STATE_SIZE = 47 13 | ACTION_SIZE = 51 14 | STATE_START_POS = OUTER_START_POS + OUTER_SIZE 15 | ACTION_START_POS = STATE_START_POS + STATE_SIZE 16 | NEW_STATE_START_POS = ACTION_START_POS + ACTION_SIZE 17 | 18 | 19 | class SimulatorEnvironment(object): 20 | def __init__(self, sess): 21 | self.sess = sess 22 | self.replay_buffer = np.load('../Simulator/data/replay_buffer.npy') 23 | self.state_buffer = deque() 24 | 25 | # model construction 26 | self.rnn_model = 
RNNSimulatorModel(cell_config(), FLAGS) 27 | 28 | self.sess.run(tf.global_variables_initializer()) 29 | 30 | # path 31 | model_name = "sim_rnn" 32 | model_path = '../Simulator/logs/{}-{}-{}-{}-{}-{:.2f}-{:.4f}-{:.2f}-{:.5f}/'.format( 33 | model_name, cell_config.num_units[0], cell_config.num_units[1], cell_config.num_units[2], 34 | FLAGS.num_steps, FLAGS.keep_prob, FLAGS.learning_rate, FLAGS.learning_rate_decay, FLAGS.l2_weight) 35 | model_path += 'saved_models/final_model.ckpt' 36 | 37 | saver = tf.train.Saver() 38 | saver.restore(self.sess, model_path) 39 | print("Model successfully restored from file: %s" % model_path) 40 | 41 | def reset(self): 42 | """ Resets the state of the environment and returns an initial observation. """ 43 | self.state_buffer = deque() 44 | nums = len(self.replay_buffer) 45 | init_state_indice = random.randint(10, nums) 46 | for i in range(10): 47 | self.state_buffer.append(self.replay_buffer[init_state_indice-(9-i), :NEW_STATE_START_POS]) 48 | self.new_state = init_state = self.replay_buffer[init_state_indice, :ACTION_START_POS] 49 | # self.new_state = init_state.reshape(1, -1) 50 | self.outer_state = self.replay_buffer[init_state_indice, OUTER_START_POS:STATE_START_POS] 51 | 52 | return self.new_state 53 | 54 | def step(self, action): 55 | """Run one timestep of the environment's dynamics. When end of 56 | episode is reached, you are responsible for calling `reset()` 57 | to reset this environment's state. 58 | 59 | Accepts an action and returns a tuple (observation, reward, cost, done, info). 60 | """ 61 | self.state_buffer.append(np.concatenate([self.new_state, action])) 62 | self.state_buffer.popleft() 63 | 64 | # transpose from 2D to 3D 65 | model_inputs_2D = np.array(self.state_buffer) 66 | num_step, dim = model_inputs_2D.shape 67 | model_inputs_3D = model_inputs_2D.reshape(1, num_step, dim) 68 | 69 | test_data_feed = { 70 | self.rnn_model.keep_prob: 1.0, 71 | self.rnn_model.inputs: model_inputs_3D, 72 | } 73 | new_state = self.sess.run(self.rnn_model.pred, test_data_feed) 74 | self.new_state = np.concatenate([self.outer_state, new_state[0]]) # (1, 47) -> (47, ) 75 | 76 | reward = compute_reward(self.new_state) 77 | cost = compute_cost(self.new_state) 78 | done = compute_done(self.new_state) 79 | 80 | return self.new_state, reward, cost, done 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /Simulator/simrnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import random 5 | import time 6 | from Simulator.simrnn_cell import SimulatorRNNCell 7 | 8 | 9 | class RNNSimulatorModel(object): 10 | def __init__(self, 11 | cell_config, 12 | FLAGS): 13 | """ Construct simulator model using self_designed cell """ 14 | self.coaler_cell_size, self.burner_cell_size, self.steamer_cell_size = cell_config.num_units 15 | self.input_size = FLAGS.input_size 16 | self.output_size = FLAGS.output_size 17 | self.coaler_output_size = cell_config.coaler_state_size 18 | self.burner_output_size = cell_config.burner_state_size 19 | self.steamer_output_size = cell_config.steamer_state_size 20 | 21 | self.batch_size = FLAGS.batch_size 22 | self.n_steps = FLAGS.num_steps 23 | self.l2_weight = FLAGS.l2_weight 24 | self.grad_clip = FLAGS.grad_clip 25 | 26 | # inputs.shape = (number of examples, number of input, dimension of each input). 
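        # Per cell_config in Simulator/simrnn_main.py, each 109-dim input step is
        # [external_input(11) | coaler_state(25) | burner_state(7) | steamer_state(15) |
        #  coaler_action(31) | burner_action(15) | steamer_action(5)],
        # and the 47-dim target is the next-step [coaler | burner | steamer] state.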
27 | self.inputs = tf.placeholder(tf.float32, [None, self.n_steps, self.input_size], name="inputs") 28 | self.targets = tf.placeholder(tf.float32, [None, self.output_size], name="targets") 29 | self.learning_rate = tf.placeholder(tf.float32, None, name="learning_rate") 30 | self.keep_prob = tf.placeholder(tf.float32, None, name="keep_prob") 31 | 32 | self.cell = SimulatorRNNCell(cell_config, self.keep_prob) 33 | # Run dynamic RNN 34 | self.cell_init_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) 35 | cell_outputs, cell_final_state = tf.nn.dynamic_rnn( 36 | self.cell, self.inputs, initial_state=self.cell_init_state, time_major=False, scope="dynamic_rnn") 37 | 38 | # outputs.get_shape() = (batch_size, num_steps, cell_size) 39 | coaler_output, burner_output, steamer_output = cell_outputs 40 | self.coaler_output = coaler_output[:, -1, :] 41 | self.burner_output = burner_output[:, -1, :] 42 | self.steamer_output = steamer_output[:, -1, :] 43 | 44 | # pred = W * out + b 45 | ws_out_coaler = tf.Variable( 46 | tf.truncated_normal([self.coaler_cell_size, self.coaler_output_size]), name="W_coaler") 47 | bs_out_coaler = tf.Variable( 48 | tf.constant(0.1, shape=[self.coaler_output_size]), name="bias_coaler") 49 | ws_out_burner = tf.Variable( 50 | tf.truncated_normal([self.burner_cell_size, self.burner_output_size]), name="W_burner") 51 | bs_out_burner = tf.Variable( 52 | tf.constant(0.1, shape=[self.burner_output_size]), name="bias_burner") 53 | ws_out_steamer = tf.Variable( 54 | tf.truncated_normal([self.steamer_cell_size, self.steamer_output_size]), name="W_steamer") 55 | bs_out_steamer = tf.Variable( 56 | tf.constant(0.1, shape=[self.steamer_output_size]), name="bias_steamer") 57 | 58 | self.coaler_pred = tf.matmul(self.coaler_output, ws_out_coaler) + bs_out_coaler 59 | self.burner_pred = tf.matmul(self.burner_output, ws_out_burner) + bs_out_burner 60 | self.steamer_pred = tf.matmul(self.steamer_output, ws_out_steamer) + bs_out_steamer 61 | self.pred = tf.concat([self.coaler_pred, self.burner_pred, self.steamer_pred], axis=1) 62 | self.pred = tf.sigmoid(self.pred) 63 | # self.pred_summ = tf.summary.histogram("pred", self.pred) 64 | 65 | 66 | # train loss 67 | self.tv = tf.trainable_variables() 68 | self.l2_loss = self.l2_weight * tf.reduce_sum( 69 | [tf.nn.l2_loss(v) for v in self.tv if not ("noreg" in v.name or "bias" in v.name)], name="l2_loss") 70 | self.mse = tf.reduce_mean(tf.square(self.pred - self.targets), name="loss_mse_train") 71 | self.loss = self.mse + self.l2_loss 72 | 73 | # gradients clip 74 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.tv), self.grad_clip) 75 | # optimizer = tf.train.MomentumOptimizer(self.learning_rate, 0.9) 76 | # optimizer = tf.train.RMSPropOptimizer(self.learning_rate) 77 | optimizer = tf.train.AdamOptimizer(self.learning_rate) 78 | self.train_opt = optimizer.apply_gradients(zip(grads, self.tv)) 79 | 80 | # summary 81 | self.loss_summ = tf.summary.scalar("loss_mse_train", self.loss) 82 | self.learning_rate_summ = tf.summary.scalar("learning_rate", self.learning_rate) 83 | # for var in tf.trainable_variables(): 84 | # tf.summary.histogram(var.name, var) 85 | self.merged_summ = tf.summary.merge_all() 86 | 87 | -------------------------------------------------------------------------------- /Simulator/baseline_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | from tensorflow.contrib.layers import fully_connected 4 | 5 | 
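# NOTE: BaseLineModel uses the same interface as Simulator/simrnn_model.py --
# a 109-dim input (external input + state + action) and a 47-dim next-state
# output -- so baseline_main.py can train it as a drop-in comparison. The
# architecture is chosen via FLAGS.model: 'lstm', 'gru', 'rnn' and 'nas' build
# a stacked recurrent network over num_steps time steps, while 'dnn' stacks
# fully connected layers on a single time step.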
6 | class BaseLineModel(object): 7 | def __init__(self, 8 | FLAGS, 9 | training=True): 10 | """ Construct baseline model, including stacked LSTM, GRU and DNN """ 11 | self.num_units = FLAGS.num_units 12 | self.num_layers = FLAGS.num_layers 13 | self.input_size = FLAGS.input_size 14 | self.output_size = FLAGS.output_size 15 | 16 | self.batch_size = FLAGS.batch_size 17 | self.n_steps = FLAGS.num_steps 18 | self.l2_weight = FLAGS.l2_weight 19 | self.grad_clip = FLAGS.grad_clip 20 | 21 | # inputs.shape = (number of examples, number of input, dimension of each input). 22 | if FLAGS.model == 'dnn': 23 | self.inputs = tf.placeholder(tf.float32, [None, self.input_size], name="inputs") 24 | else: 25 | self.inputs = tf.placeholder(tf.float32, [None, self.n_steps, self.input_size], name="inputs") 26 | self.targets = tf.placeholder(tf.float32, [None, self.output_size], name="targets") 27 | self.learning_rate = tf.placeholder(tf.float32, None, name="learning_rate") 28 | self.keep_prob = tf.placeholder(tf.float32, None, name="keep_prob") 29 | 30 | if training and FLAGS.keep_prob: 31 | self.inputs = tf.nn.dropout(self.inputs, FLAGS.keep_prob) 32 | 33 | if FLAGS.model == 'dnn': 34 | hidden = fully_connected(self.inputs, self.num_units) 35 | for _ in range(self.num_layers - 1): 36 | hidden = fully_connected(hidden, self.num_units) 37 | if training and FLAGS.keep_prob < 1.0: 38 | hidden = rnn.DropoutWrapper(hidden, 39 | input_keep_prob=FLAGS.keep_prob, 40 | output_keep_prob=FLAGS.keep_prob) 41 | self.cell_outputs = hidden 42 | else: # choose different rnn cell 43 | if FLAGS.model == 'rnn': 44 | cell_fn = rnn.RNNCell 45 | elif FLAGS.model == 'gru': 46 | cell_fn = rnn.GRUCell 47 | elif FLAGS.model == 'lstm': 48 | cell_fn = rnn.LSTMCell 49 | elif FLAGS.model == 'nas': 50 | cell_fn = rnn.NASCell 51 | else: 52 | raise Exception("model type not supported: {}".format(FLAGS.model)) 53 | 54 | # warp multi layered rnn cell into one cell with dropout 55 | cells = [] 56 | for _ in range(self.num_layers): 57 | cell = cell_fn(self.num_units) 58 | if training and FLAGS.keep_prob < 1.0: 59 | cell = rnn.DropoutWrapper(cell, 60 | input_keep_prob=FLAGS.keep_prob, 61 | output_keep_prob=FLAGS.keep_prob) 62 | cells.append(cell) 63 | self.cell = rnn.MultiRNNCell(cells, state_is_tuple=True) 64 | 65 | self.cell_init_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) 66 | cell_outputs, cell_final_state = tf.nn.dynamic_rnn( 67 | self.cell, self.inputs, initial_state=self.cell_init_state, time_major=False, scope="dynamic_rnn") 68 | 69 | # outputs.get_shape() = (batch_size, num_steps, cell_size) 70 | self.cell_outputs = cell_outputs[:, -1, :] 71 | 72 | # pred = W * out + b 73 | ws_out = tf.Variable( 74 | tf.truncated_normal([self.num_units, self.output_size]), name="W_out") 75 | bs_out = tf.Variable( 76 | tf.constant(0.1, shape=[self.output_size]), name="bias_out") 77 | self.pred = tf.matmul(self.cell_outputs, ws_out) + bs_out 78 | 79 | 80 | # train loss 81 | self.tv = tf.trainable_variables() 82 | self.l2_loss = self.l2_weight * tf.reduce_sum( 83 | [tf.nn.l2_loss(v) for v in self.tv if not ("noreg" in v.name or "bias" in v.name)], name="l2_loss") 84 | self.mse = tf.reduce_mean(tf.square(self.pred - self.targets), name="loss_mse_train") 85 | self.loss = self.mse + self.l2_loss 86 | 87 | # gradients clip 88 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.tv), self.grad_clip) 89 | optimizer = tf.train.AdamOptimizer(self.learning_rate) 90 | self.train_opt = optimizer.apply_gradients(zip(grads, self.tv)) 91 | 
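        # The loss is MSE(pred, targets) plus FLAGS.l2_weight * L2 over trainable
        # variables (biases and names containing "noreg" excluded); gradients are
        # clipped by global norm (FLAGS.grad_clip, 5.0 by default) before the Adam
        # update, matching the training setup in Simulator/simrnn_model.py.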
92 | # summary 93 | self.loss_summ = tf.summary.scalar("loss_mse_train", self.loss) 94 | self.learning_rate_summ = tf.summary.scalar("learning_rate", self.learning_rate) 95 | # for var in tf.trainable_variables(): 96 | # tf.summary.histogram(var.name, var) 97 | self.merged_summ = tf.summary.merge_all() 98 | 99 | -------------------------------------------------------------------------------- /RL/train_primal_dual.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('../') 4 | from RL.primal_dual_ddpg import * 5 | from RL.env import * 6 | 7 | 8 | 9 | MAX_EPISODES = 30000 10 | MAX_EP_STEPS = 10 11 | # TEST = 10 12 | SIM_REAL_RATIO = 1 13 | 14 | 15 | 16 | class input_config(): 17 | batch_size = 32 18 | init_dual_lambda = 1 19 | state_dim = 58 20 | action_dim = 51 21 | clip_norm = 5. 22 | train_display_iter = 200 23 | model_save_path = './models/' 24 | # model_name = "sim_ddpg" 25 | # logdir = './logs/{}-{}-{}-{:.2f}/'.format( 26 | # model_name, MAX_EP_STEPS, SIM_REAL_RATIO, init_dual_lambda) 27 | # log_path = logdir + 'saved_models/' 28 | log_path = "logs/nonpre_nonexp_" + str(SIM_REAL_RATIO) + "_pdddpg_summary" 29 | save_iter = 500 30 | log_iter = 100 31 | 32 | 33 | def pre_train_actor_network(agent, epochs=3): 34 | replay_buffer = agent.replay_buffer 35 | 36 | for epoch in range(epochs): 37 | step = 0 38 | while step < 1000: 39 | minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size) 40 | step += 1 41 | state_batch, action_batch, _, _ = convert_to_tuple(minibatch) 42 | 43 | _, mse = agent.actor_network.pretrain(state=state_batch, label=action_batch) 44 | 45 | # display 46 | if epoch % 1 == 0: 47 | print('-----------------pre-train actor network-----------------') 48 | print('epoch = {} mse = {:.4f}'.format(epoch, mse)) 49 | 50 | 51 | def pre_train_reward_critic_network(agent, epochs=3): 52 | replay_buffer = agent.replay_buffer 53 | for train_times in range(epochs): 54 | step = 0 55 | while step < 1000: 56 | minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size) 57 | step += 1 58 | state_batch, action_batch, next_state_batch, _ = convert_to_tuple(minibatch) 59 | reward_batch = compute_reward(state_batch) 60 | 61 | y_batch = [] 62 | target_action = agent.actor_network.target_actions(next_state_batch) 63 | target_value = agent.reward_critic_network.target_reward(next_state_batch, target_action) 64 | 65 | for i in range(len(minibatch)): 66 | y_batch.append(reward_batch[i] + agent.gamma * target_value[i]) 67 | 68 | # update critic network 69 | reward_critic_loss = agent.reward_critic_network.pretrain(y_batch, state_batch, action_batch) 70 | 71 | # display 72 | if train_times % 1 == 0: 73 | print('-----------------pre-train reward critic network-----------------') 74 | print("reward_critic: loss:{:.3f}".format(reward_critic_loss)) 75 | 76 | 77 | def pre_train_cost_critic_network(agent, epochs=3): 78 | replay_buffer = agent.replay_buffer 79 | step = 0 80 | for train_times in range(epochs): 81 | step = 0 82 | while step < 1000: 83 | minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size) 84 | step += 1 85 | state_batch, action_batch, next_state_batch, _ = convert_to_tuple(minibatch) 86 | cost_batch = compute_cost(state_batch) 87 | 88 | z_batch = [] 89 | target_action = agent.actor_network.target_actions(next_state_batch) 90 | target_value = agent.cost_critic_network.target_cost(next_state_batch, target_action) 91 | 92 | for i in range(len(minibatch)): 93 
| z_batch.append(cost_batch[i] + agent.gamma * target_value[i]) 94 | 95 | # update critic network 96 | cost_critic_loss = agent.cost_critic_network.pretrain(z_batch, state_batch, action_batch) 97 | 98 | # display 99 | if train_times % 1 == 0: 100 | print('-----------------pre-train cost critic network-----------------') 101 | print("reward_critic: loss:{:.3f}".format(cost_critic_loss)) 102 | 103 | 104 | def main(): 105 | # Set up summary writer 106 | summary_writer = tf.summary.FileWriter(input_config.log_path) 107 | 108 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 109 | config.gpu_options.allow_growth = True 110 | 111 | # build agent graph 112 | tf.reset_default_graph() 113 | agent_graph = tf.Graph() 114 | agent_sess = tf.Session(config=config, graph=agent_graph) 115 | with agent_graph.as_default(): 116 | agent = PrimalDualDDPG(sess=agent_sess, input_config=input_config, is_batch_norm=False, summ_writer=summary_writer) 117 | total_parameters = 0 118 | for variable in tf.trainable_variables(): 119 | # shape is an array of tf.Dimension 120 | shape = variable.get_shape() 121 | # print(shape) 122 | # print(len(shape)) 123 | variable_parameters = 1 124 | for dim in shape: 125 | # print(dim) 126 | variable_parameters *= dim.value 127 | # print(variable_parameters) 128 | total_parameters += variable_parameters 129 | print('total parameters: {}'.format(total_parameters)) 130 | 131 | # build environment graph 132 | env_graph = tf.Graph() 133 | env_sess = tf.Session(config=config, graph=env_graph) 134 | with env_graph.as_default(): 135 | env = SimulatorEnvironment(sess=env_sess) 136 | 137 | # pre_train 138 | # pre_train_actor_network(agent=agent, epochs=1) 139 | # pre_train_reward_critic_network(agent=agent, epochs=1) 140 | # pre_train_cost_critic_network(agent=agent, epochs=1) 141 | # agent.actor_network.update_target() 142 | # agent.reward_critic_network.update_target() 143 | # agent.cost_critic_network.update_target() 144 | 145 | for episode in range(MAX_EPISODES): 146 | dual_variable = input_config.init_dual_lambda 147 | ep_reward = 0 148 | ep_cost = 0 149 | state = env.reset() 150 | 151 | for step in range(MAX_EP_STEPS): 152 | # action = restrictive_action(agent.action(state), episode) 153 | action = agent.noise_action(state, episode) 154 | next_state, reward, cost, done = env.step(action) 155 | ep_reward += reward 156 | ep_cost += cost 157 | agent.perceive(state, action, reward, cost, next_state, done, mix_ratio=SIM_REAL_RATIO) 158 | dual_variable = agent.get_dual_lambda() 159 | state = next_state 160 | summary = tf.Summary() 161 | summary.value.add(tag='Steps_sum_Reward', simple_value=float(ep_reward/MAX_EP_STEPS)) 162 | summary.value.add(tag='Steps_sum_Cost', simple_value=float(ep_cost/MAX_EP_STEPS)) 163 | summary.value.add(tag='Dual_variable', simple_value=float(dual_variable)) 164 | summary_writer.add_summary(summary, episode) 165 | 166 | summary_writer.flush() 167 | 168 | print('Episode:{} | Reward: {:.2f} | Cost: {:.2f}'.format(episode, ep_reward/MAX_EP_STEPS, ep_cost/MAX_EP_STEPS)) 169 | 170 | if episode % 100 == 0 and episode >= 100: 171 | agent.save_model() 172 | 173 | print("-------------save model--------------------") 174 | agent.save_model() 175 | 176 | 177 | if __name__ == '__main__': 178 | main() -------------------------------------------------------------------------------- /Simulator/baseline_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pprint 4 | 
import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | from Simulator.baseline_model import BaseLineModel 7 | from Simulator.data_model import BoilerDataSet 8 | 9 | flags = tf.app.flags 10 | # Data and model checkpcheckpointsoints directories 11 | flags.DEFINE_integer("display_iter", 200, "display_iter") 12 | flags.DEFINE_integer("save_log_iter", 100, "save_log_iter") 13 | # Model params 14 | flags.DEFINE_string('model', 'lstm', 'Choose from lstm, gru, rnn, or dnn') 15 | flags.DEFINE_integer("input_size", 109, "Input size") # external_input + state + action 16 | flags.DEFINE_integer("output_size", 47, "Output size") # state size 17 | flags.DEFINE_integer("num_units", 128, "Num of hidden units") 18 | flags.DEFINE_integer("num_layers", 2, "Num of stacked layers") 19 | # Optimization 20 | flags.DEFINE_integer("num_steps", 5, "Num of steps") 21 | flags.DEFINE_integer("batch_size", 256, "The size of batch") 22 | flags.DEFINE_integer("max_epoch", 50, "Total training epoches") 23 | flags.DEFINE_float("grad_clip", 5., "Clip gradients at this value") 24 | flags.DEFINE_float("learning_rate", 0.001, "Initial learning rate at early stage. [0.001]") 25 | flags.DEFINE_float("learning_rate_decay", 0.95, "Decay rate of learning rate. [0.99]") 26 | flags.DEFINE_float("keep_prob", 1, "Keep probability of input data and dropout layer. [0.8]") 27 | flags.DEFINE_float("l2_weight", 0.0, "weight of l2 loss") 28 | 29 | 30 | FLAGS = flags.FLAGS 31 | 32 | 33 | pp = pprint.PrettyPrinter() 34 | 35 | if not os.path.exists("logs"): 36 | os.mkdir("logs") 37 | 38 | 39 | def show_all_variables(): 40 | model_vars = tf.trainable_variables() 41 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 42 | 43 | 44 | def main(_): 45 | np.random.seed(2019) 46 | 47 | pp.pprint(flags.FLAGS.__flags) 48 | 49 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 50 | run_config = tf.ConfigProto() 51 | run_config.gpu_options.allow_growth = True 52 | 53 | # read data 54 | if FLAGS.model == 'dnn': 55 | boiler_dataset = BoilerDataSet(num_steps=1) 56 | train_X = boiler_dataset.train_X.reshape([-1, FLAGS.input_size]) 57 | train_y = boiler_dataset.train_y.reshape([-1, FLAGS.output_size]) 58 | val_X = boiler_dataset.val_X.reshape([-1, FLAGS.input_size]) 59 | val_y = boiler_dataset.val_y.reshape([-1, FLAGS.output_size]) 60 | 61 | else: 62 | boiler_dataset = BoilerDataSet(num_steps=FLAGS.num_steps) 63 | train_X = boiler_dataset.train_X 64 | train_y = boiler_dataset.train_y 65 | val_X = boiler_dataset.val_X 66 | val_y = boiler_dataset.val_y 67 | # print dataset info 68 | num_train = len(train_X) 69 | num_valid = len(val_X) 70 | print('train samples: {0}'.format(num_train)) 71 | print('eval samples: {0}'.format(num_valid)) 72 | 73 | # model construction 74 | tf.reset_default_graph() 75 | baseline_model = BaseLineModel(FLAGS) 76 | 77 | # print trainable params 78 | for i in tf.trainable_variables(): 79 | print(i) 80 | # count the parameters in our model 81 | total_parameters = 0 82 | for variable in tf.trainable_variables(): 83 | # shape is an array of tf.Dimension 84 | shape = variable.get_shape() 85 | # print(shape) 86 | # print(len(shape)) 87 | variable_parameters = 1 88 | for dim in shape: 89 | # print(dim) 90 | variable_parameters *= dim.value 91 | # print(variable_parameters) 92 | total_parameters += variable_parameters 93 | print('total parameters: {}'.format(total_parameters)) 94 | 95 | # path for log saving 96 | model_name = "baseline_" + FLAGS.model 97 | logdir = './logs/{}-{}-{}-{}-{:.2f}-{:.4f}-{:.2f}-{:.5f}/'.format( 
98 | model_name, FLAGS.num_layers, FLAGS.num_units, FLAGS.num_steps, 99 | FLAGS.keep_prob, FLAGS.learning_rate, FLAGS.learning_rate_decay, FLAGS.l2_weight) 100 | model_dir = logdir + 'saved_models/' 101 | 102 | if not os.path.exists(logdir): 103 | os.mkdir(logdir) 104 | if not os.path.exists(model_dir): 105 | os.mkdir(model_dir) 106 | results_dir = logdir + 'results/' 107 | 108 | with tf.Session(config=run_config) as sess: 109 | summary_writer = tf.summary.FileWriter(logdir) 110 | 111 | sess.run(tf.global_variables_initializer()) 112 | saver = tf.train.Saver() 113 | 114 | iter = 0 115 | valid_losses = [np.inf] 116 | 117 | for i in range(FLAGS.max_epoch): 118 | print('----------epoch {}-----------'.format(i)) 119 | # learning_rate = FLAGS.learning_rate 120 | learning_rate = FLAGS.learning_rate * ( 121 | FLAGS.learning_rate_decay ** i 122 | ) 123 | 124 | for batch_X, batch_y in boiler_dataset.generate_one_epoch(train_X, train_y, FLAGS.batch_size): 125 | iter += 1 126 | train_data_feed = { 127 | baseline_model.learning_rate: learning_rate, 128 | baseline_model.keep_prob: FLAGS.keep_prob, 129 | baseline_model.inputs: batch_X, 130 | baseline_model.targets: batch_y, 131 | } 132 | train_loss, _, merged_summ = sess.run( 133 | [baseline_model.loss, baseline_model.train_opt, baseline_model.merged_summ], train_data_feed) 134 | if iter % FLAGS.save_log_iter == 0: 135 | summary_writer.add_summary(merged_summ, iter) 136 | if iter % FLAGS.display_iter == 0: 137 | valid_loss = 0 138 | for val_batch_X, val_batch_y in boiler_dataset.generate_one_epoch(val_X, val_y, FLAGS.batch_size): 139 | val_data_feed = { 140 | baseline_model.keep_prob: 1.0, 141 | baseline_model.inputs: val_batch_X, 142 | baseline_model.targets: val_batch_y, 143 | } 144 | batch_loss = sess.run(baseline_model.loss, val_data_feed) 145 | valid_loss += batch_loss 146 | num_batches = int(len(val_X)) // FLAGS.batch_size 147 | valid_loss /= num_batches 148 | valid_losses.append(valid_loss) 149 | valid_loss_sum = tf.Summary( 150 | value=[tf.Summary.Value(tag="valid_loss", simple_value=valid_loss)]) 151 | summary_writer.add_summary(valid_loss_sum, iter) 152 | 153 | if valid_loss < min(valid_losses[:-1]): 154 | print('iter {}\tvalid_loss = {:.6f}\tmodel saved!!'.format( 155 | iter, valid_loss)) 156 | saver.save(sess, model_dir + 157 | 'model_{}.ckpt'.format(iter)) 158 | saver.save(sess, model_dir + 'final_model.ckpt') 159 | else: 160 | print('iter {}\tvalid_loss = {:.6f}\t'.format( 161 | iter, valid_loss)) 162 | 163 | print('stop training !!!') 164 | 165 | 166 | if __name__ == '__main__': 167 | tf.app.run() -------------------------------------------------------------------------------- /Simulator/simrnn_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pprint 4 | import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | import sys 7 | sys.path.append('../') 8 | 9 | from Simulator.simrnn_model import RNNSimulatorModel 10 | from Simulator.data_model import BoilerDataSet 11 | 12 | flags = tf.app.flags 13 | # Data and model checkpcheckpointsoints directories 14 | flags.DEFINE_integer("display_iter", 200, "display_iter") 15 | flags.DEFINE_integer("save_log_iter", 100, "save_log_iter") 16 | # Model params 17 | flags.DEFINE_integer("input_size", 109, "Input size") # external_input + state + action 18 | flags.DEFINE_integer("output_size", 47, "Output size") # state size 19 | # Optimization 20 | flags.DEFINE_integer("num_steps", 10, "Num of steps") 21 | 
flags.DEFINE_integer("batch_size", 1, "The size of batch") 22 | flags.DEFINE_integer("max_epoch", 50, "Total training epoches") 23 | flags.DEFINE_float("grad_clip", 5., "Clip gradients at this value") 24 | flags.DEFINE_float("learning_rate", 0.001, "Initial learning rate at early stage. [0.001]") 25 | flags.DEFINE_float("learning_rate_decay", 0.95, "Decay rate of learning rate. [0.99]") 26 | flags.DEFINE_float("keep_prob", 1, "Keep probability of input data and dropout layer. [0.8]") 27 | flags.DEFINE_float("l2_weight", 0.0, "weight of l2 loss") 28 | 29 | FLAGS = flags.FLAGS 30 | 31 | 32 | class cell_config(object): 33 | """ Simulator Cell config """ 34 | # list, [coaler_num_units, burner_num_units, steamer_num_units] 35 | num_units = [128, 64, 64] 36 | 37 | # data is [external_input, state(coaler, burner, steamer), action(coaler, burner, steamer)] 38 | external_state_pos = 0 39 | external_state_size = 11 40 | coaler_state_pos = external_state_pos + external_state_size 41 | coaler_state_size = 25 42 | burner_state_pos = coaler_state_pos + coaler_state_size 43 | burner_state_size = 7 44 | steamer_state_pos = burner_state_pos + burner_state_size 45 | steamer_state_size = 15 46 | coaler_action_pos = steamer_state_pos + steamer_state_size 47 | coaler_action_size = 31 48 | burner_action_pos = coaler_action_pos + coaler_action_size 49 | burner_action_size = 15 50 | steamer_action_pos = burner_action_pos + burner_action_size 51 | steamer_action_size = 5 52 | 53 | 54 | pp = pprint.PrettyPrinter() 55 | 56 | if not os.path.exists("logs"): 57 | os.mkdir("logs") 58 | 59 | 60 | def show_all_variables(): 61 | model_vars = tf.trainable_variables() 62 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 63 | 64 | 65 | def main(_): 66 | np.random.seed(2019) 67 | 68 | pp.pprint(flags.FLAGS.__flags) 69 | 70 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 71 | run_config = tf.ConfigProto() 72 | run_config.gpu_options.allow_growth = True 73 | 74 | # read data 75 | boiler_dataset = BoilerDataSet(num_steps=FLAGS.num_steps) 76 | train_X = boiler_dataset.train_X 77 | train_y = boiler_dataset.train_y 78 | val_X = boiler_dataset.val_X 79 | val_y = boiler_dataset.val_y 80 | # print dataset info 81 | num_train = len(train_X) 82 | num_valid = len(val_X) 83 | print('train samples: {0}'.format(num_train)) 84 | print('eval samples: {0}'.format(num_valid)) 85 | 86 | # model construction 87 | tf.reset_default_graph() 88 | rnn_model = RNNSimulatorModel(cell_config(), FLAGS) 89 | 90 | # print trainable params 91 | for i in tf.trainable_variables(): 92 | print(i) 93 | # count the parameters in our model 94 | total_parameters = 0 95 | for variable in tf.trainable_variables(): 96 | # shape is an array of tf.Dimension 97 | shape = variable.get_shape() 98 | # print(shape) 99 | # print(len(shape)) 100 | variable_parameters = 1 101 | for dim in shape: 102 | # print(dim) 103 | variable_parameters *= dim.value 104 | # print(variable_parameters) 105 | total_parameters += variable_parameters 106 | print('total parameters: {}'.format(total_parameters)) 107 | 108 | # path for log saving 109 | model_name = "sim_rnn" 110 | logdir = './logs/{}-{}-{}-{}-{}-{:.2f}-{:.4f}-{:.2f}-{:.5f}/'.format( 111 | model_name, cell_config.num_units[0], cell_config.num_units[1], cell_config.num_units[2], 112 | FLAGS.num_steps, FLAGS.keep_prob, FLAGS.learning_rate, FLAGS.learning_rate_decay, FLAGS.l2_weight) 113 | model_dir = logdir + 'saved_models/' 114 | 115 | if not os.path.exists(logdir): 116 | os.mkdir(logdir) 117 | if not 
os.path.exists(model_dir): 118 | os.mkdir(model_dir) 119 | results_dir = logdir + 'results/' 120 | 121 | with tf.Session(config=run_config) as sess: 122 | summary_writer = tf.summary.FileWriter(logdir) 123 | 124 | sess.run(tf.global_variables_initializer()) 125 | saver = tf.train.Saver() 126 | 127 | iter = 0 128 | valid_losses = [np.inf] 129 | 130 | for i in range(FLAGS.max_epoch): 131 | print('----------epoch {}-----------'.format(i)) 132 | # learning_rate = FLAGS.learning_rate 133 | learning_rate = FLAGS.learning_rate * ( 134 | FLAGS.learning_rate_decay ** i 135 | ) 136 | 137 | for batch_X, batch_y in boiler_dataset.generate_one_epoch(train_X, train_y, FLAGS.batch_size): 138 | iter += 1 139 | train_data_feed = { 140 | rnn_model.learning_rate: learning_rate, 141 | rnn_model.keep_prob: FLAGS.keep_prob, 142 | rnn_model.inputs: batch_X, 143 | rnn_model.targets: batch_y, 144 | } 145 | train_loss, _, merged_summ = sess.run( 146 | [rnn_model.loss, rnn_model.train_opt, rnn_model.merged_summ], train_data_feed) 147 | if iter % FLAGS.save_log_iter == 0: 148 | summary_writer.add_summary(merged_summ, iter) 149 | if iter % FLAGS.display_iter == 0: 150 | valid_loss = 0 151 | for val_batch_X, val_batch_y in boiler_dataset.generate_one_epoch(val_X, val_y, FLAGS.batch_size): 152 | val_data_feed = { 153 | rnn_model.keep_prob: 1.0, 154 | rnn_model.inputs: val_batch_X, 155 | rnn_model.targets: val_batch_y, 156 | } 157 | batch_loss = sess.run(rnn_model.loss, val_data_feed) 158 | valid_loss += batch_loss 159 | num_batches = int(len(val_X)) // FLAGS.batch_size 160 | valid_loss /= num_batches 161 | valid_losses.append(valid_loss) 162 | valid_loss_sum = tf.Summary( 163 | value=[tf.Summary.Value(tag="valid_loss", simple_value=valid_loss)]) 164 | summary_writer.add_summary(valid_loss_sum, iter) 165 | 166 | if valid_loss < min(valid_losses[:-1]): 167 | print('iter {}\tvalid_loss = {:.6f}\tmodel saved!!'.format( 168 | iter, valid_loss)) 169 | saver.save(sess, model_dir + 170 | 'model_{}.ckpt'.format(iter)) 171 | saver.save(sess, model_dir + 'final_model.ckpt') 172 | else: 173 | print('iter {}\tvalid_loss = {:.6f}\t'.format( 174 | iter, valid_loss)) 175 | 176 | print('stop training !!!') 177 | 178 | 179 | if __name__ == '__main__': 180 | tf.app.run() -------------------------------------------------------------------------------- /RL/reward_critic_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | 5 | 6 | LAYER1_SIZE = 256 7 | LAYER2_SIZE = 256 8 | LEARNING_RATE = 0.0001 9 | TAU = 0.001 10 | L2 = 0.0001 11 | 12 | 13 | def weight_variable(shape): 14 | initial = tf.truncated_normal(shape, stddev=0.01) 15 | return tf.Variable(initial) 16 | 17 | 18 | def bias_variable(shape): 19 | initial = tf.constant(0.03, shape=shape) 20 | return tf.Variable(initial) 21 | 22 | 23 | class RewardCriticNetwork(object): 24 | def __init__(self, sess, input_config, summ_writer): 25 | self.time_step = 0 26 | self.sess = sess 27 | self.state_dim = input_config.state_dim 28 | self.action_dim = input_config.action_dim 29 | self.clip_norm = input_config.clip_norm 30 | self.step = 0 31 | self.log_iter = input_config.log_iter # logging interval in training phase 32 | self.log_path = input_config.log_path # logging interval in training phase 33 | 34 | self.train_writer = summ_writer 35 | 36 | # create reward network 37 | self.state_input, \ 38 | self.action_input, \ 39 | self.reward_value_output, \ 40 | self.net = 
self.create_reward_network(self.state_dim, self.action_dim) 41 | 42 | # create target reward network (the same structure with reward network) 43 | self.target_state_input, \ 44 | self.target_action_input, \ 45 | self.target_reward_value_output, \ 46 | self.target_update = self.create_target_reward_network(self.state_dim, self.action_dim, self.net) 47 | 48 | self.create_training_method() 49 | 50 | self.sess.run(tf.global_variables_initializer()) 51 | 52 | self.update_target() 53 | 54 | def create_training_method(self): 55 | # Define training optimizer 56 | self.y_input = tf.placeholder("float", [None, 1]) 57 | weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.net]) 58 | self.cost = tf.reduce_mean(tf.square(self.y_input - self.reward_value_output)) + weight_decay 59 | self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.cost) 60 | self.action_gradients = tf.gradients(self.reward_value_output, self.action_input) 61 | 62 | 63 | # def create_reward_network(self, state_dim, action_dim): 64 | # # the layer size could be changed 65 | # layer1_size = LAYER1_SIZE 66 | # layer2_size = LAYER2_SIZE 67 | # 68 | # state_input = tf.placeholder("float", [None, state_dim]) 69 | # action_input = tf.placeholder("float", [None, action_dim]) 70 | # 71 | # W1 = self.variable([state_dim, layer1_size], state_dim) 72 | # b1 = self.variable([layer1_size], state_dim) 73 | # W2 = self.variable([layer1_size, layer2_size], layer1_size + action_dim) 74 | # W2_action = self.variable([action_dim, layer2_size], layer1_size + action_dim) 75 | # b2 = self.variable([layer2_size], layer1_size + action_dim) 76 | # W3 = tf.Variable(tf.random_uniform([layer2_size, 1], -3e-3, 3e-3)) 77 | # b3 = tf.Variable(tf.random_uniform([1], -3e-3, 3e-3)) 78 | # 79 | # layer1 = tf.nn.relu(tf.matmul(state_input, W1) + b1) 80 | # layer2 = tf.nn.relu(tf.matmul(layer1, W2) + tf.matmul(action_input, W2_action) + b2) 81 | # q_value_output = tf.identity(tf.matmul(layer2, W3) + b3) 82 | # 83 | # return state_input, action_input, q_value_output, [W1, b1, W2, W2_action, b2, W3, b3] 84 | 85 | def create_reward_network(self, state_dim, action_dim): 86 | # the layer size could be changed 87 | layer1_size = LAYER1_SIZE 88 | layer2_size = LAYER2_SIZE 89 | 90 | state_input = tf.placeholder("float", [None, state_dim]) 91 | action_input = tf.placeholder("float", [None, action_dim]) 92 | 93 | # Input -> Hidden Layer 94 | w1 = weight_variable([state_dim, layer1_size]) 95 | b1 = bias_variable([layer1_size]) 96 | # Hidden Layer -> Hidden Layer + Action 97 | w2 = weight_variable([layer1_size, layer2_size]) 98 | w2a = weight_variable([action_dim, layer2_size]) 99 | b2 = bias_variable([layer2_size]) 100 | # Hidden Layer -> Output (Q) 101 | w3 = weight_variable([layer2_size, 1]) 102 | b3 = bias_variable([1]) 103 | 104 | # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 105 | h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 106 | # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 107 | # Action inserted here 108 | h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action_input, w2a) + b2) 109 | 110 | reward_value_output = tf.matmul(h2, w3) + b3 111 | 112 | return state_input, action_input, reward_value_output, [w1, b1, w2, w2a, b2, w3, b3] 113 | 114 | def create_target_reward_network(self, state_dim, action_dim, net): 115 | state_input = tf.placeholder("float", [None, state_dim]) 116 | action_input = tf.placeholder("float", [None, action_dim]) 117 | 118 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) 119 | target_update 
= ema.apply(net) 120 | target_net = [ema.average(x) for x in net] 121 | 122 | layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) 123 | layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + tf.matmul(action_input, target_net[3]) + target_net[4]) 124 | reward_value_output = tf.identity(tf.matmul(layer2, target_net[5]) + target_net[6]) 125 | 126 | return state_input, action_input, reward_value_output, target_update 127 | 128 | def update_target(self): 129 | self.sess.run(self.target_update) 130 | 131 | def train(self, y_batch, state_batch, action_batch): 132 | # r_loss_summ = tf.summary.scalar('reward_critic_loss', self.cost) 133 | # self.merged = tf.summary.merge([r_loss_summ]) 134 | 135 | train_feed_dict = { 136 | self.y_input: y_batch, 137 | self.state_input: state_batch, 138 | self.action_input: action_batch 139 | } 140 | _, reward_critic_loss, reward_action_grad_norm = \ 141 | self.sess.run([self.optimizer, self.cost, self.action_gradients], train_feed_dict) 142 | 143 | # if self.step % self.log_iter == 0: 144 | # self.train_writer.add_summary(merged_summ, global_step=self.step) 145 | 146 | self.step += 1 147 | 148 | return reward_critic_loss, reward_action_grad_norm 149 | 150 | def pretrain(self, y_batch, state_batch, action_batch): 151 | train_feed_dict = { 152 | self.y_input: y_batch, 153 | self.state_input: state_batch, 154 | self.action_input: action_batch 155 | } 156 | _, reward_critic_loss = self.sess.run([self.optimizer, self.cost], train_feed_dict) 157 | return reward_critic_loss 158 | 159 | def gradients(self, state_batch, action_batch): 160 | return self.sess.run(self.action_gradients, feed_dict={ 161 | self.state_input: state_batch, 162 | self.action_input: action_batch 163 | })[0] 164 | 165 | def target_reward(self, state_batch, action_batch): 166 | return self.sess.run(self.target_reward_value_output, feed_dict={ 167 | self.target_state_input: state_batch, 168 | self.target_action_input: action_batch 169 | }) 170 | 171 | def reward_value(self, state_batch, action_batch): 172 | return self.sess.run(self.reward_value_output, feed_dict={ 173 | self.state_input: state_batch, 174 | self.action_input: action_batch}) 175 | 176 | # f fan-in size 177 | def variable(self, shape, f): 178 | return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f))) 179 | 180 | ''' 181 | def load_network(self): 182 | self.saver = tf.train.Saver() 183 | checkpoint = tf.train.get_checkpoint_state("saved_reward_critic_networks") 184 | if checkpoint and checkpoint.model_checkpoint_path: 185 | self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 186 | print "Successfully loaded:", checkpoint.model_checkpoint_path 187 | else: 188 | print "Could not find old network weights" 189 | def save_network(self,time_step): 190 | print 'save reward-critic-network...',time_step 191 | self.saver.save(self.sess, 'saved_reward_critic_networks/' + 'reward-critic-network', global_step = time_step) 192 | ''' 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /RL/cost_critic_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | 5 | 6 | LAYER1_SIZE = 256 7 | LAYER2_SIZE = 256 8 | LEARNING_RATE = 0.0001 9 | TAU = 0.001 10 | L2 = 0.0001 11 | 12 | 13 | def weight_variable(shape): 14 | initial = tf.truncated_normal(shape, stddev=0.01) 15 | return tf.Variable(initial) 16 | 17 | 18 | def 
bias_variable(shape): 19 | initial = tf.constant(0.03, shape=shape) 20 | return tf.Variable(initial) 21 | 22 | 23 | class CostCriticNetwork(object): 24 | def __init__(self, sess, input_config, summ_writer): 25 | self.time_step = 0 26 | self.sess = sess 27 | self.state_dim = input_config.state_dim 28 | self.action_dim = input_config.action_dim 29 | self.clip_norm = input_config.clip_norm 30 | self.step = 0 31 | self.log_iter = input_config.log_iter # logging interval in training phase 32 | self.log_path = input_config.log_path # logging interval in training phase 33 | 34 | self.train_writer_cost = summ_writer 35 | 36 | 37 | # create cost network 38 | self.state_input, \ 39 | self.action_input, \ 40 | self.cost_value_output, \ 41 | self.cost_net = self.create_cost_network(self.state_dim, self.action_dim) 42 | 43 | # create target cost network (the same structure with cost network) 44 | self.target_state_input, \ 45 | self.target_action_input, \ 46 | self.target_cost_value_output, \ 47 | self.cost_target_update = self.create_target_cost_network(self.state_dim, self.action_dim, self.cost_net) 48 | 49 | self.create_training_method() 50 | 51 | self.sess.run(tf.global_variables_initializer()) 52 | 53 | self.update_target() 54 | 55 | 56 | def create_training_method(self): 57 | # Define training optimizer 58 | self.z_input = tf.placeholder("float", [None, 1]) 59 | weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.cost_net]) 60 | self.cost_cost = tf.reduce_mean(tf.square(self.z_input - self.cost_value_output)) + weight_decay 61 | self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.cost_cost) 62 | self.action_gradients_cost = tf.gradients(self.cost_value_output, self.action_input) 63 | 64 | 65 | 66 | # def create_cost_network(self, state_dim, action_dim): 67 | # # the layer size could be changed 68 | # layer1_size = LAYER1_SIZE 69 | # layer2_size = LAYER2_SIZE 70 | # 71 | # state_input = tf.placeholder("float", [None, state_dim]) 72 | # action_input = tf.placeholder("float", [None, action_dim]) 73 | # 74 | # W1 = self.variable([state_dim, layer1_size], state_dim) 75 | # b1 = self.variable([layer1_size], state_dim) 76 | # W2 = self.variable([layer1_size, layer2_size], layer1_size + action_dim) 77 | # W2_action = self.variable([action_dim, layer2_size], layer1_size + action_dim) 78 | # b2 = self.variable([layer2_size], layer1_size + action_dim) 79 | # W3 = tf.Variable(tf.random_uniform([layer2_size, 1], -3e-3, 3e-3)) 80 | # b3 = tf.Variable(tf.random_uniform([1], -3e-3, 3e-3)) 81 | # 82 | # layer1 = tf.nn.relu(tf.matmul(state_input, W1) + b1) 83 | # layer2 = tf.nn.relu(tf.matmul(layer1, W2) + tf.matmul(action_input, W2_action) + b2) 84 | # cost_value_output = tf.identity(tf.matmul(layer2, W3) + b3) 85 | # 86 | # return state_input, action_input, cost_value_output, [W1, b1, W2, W2_action, b2, W3, b3] 87 | 88 | def create_cost_network(self, state_dim, action_dim): 89 | # the layer size could be changed 90 | layer1_size = LAYER1_SIZE 91 | layer2_size = LAYER2_SIZE 92 | 93 | state_input = tf.placeholder("float", [None, state_dim]) 94 | action_input = tf.placeholder("float", [None, action_dim]) 95 | 96 | # Input -> Hidden Layer 97 | w1 = weight_variable([state_dim, layer1_size]) 98 | b1 = bias_variable([layer1_size]) 99 | # Hidden Layer -> Hidden Layer + Action 100 | w2 = weight_variable([layer1_size, layer2_size]) 101 | w2a = weight_variable([action_dim, layer2_size]) 102 | b2 = bias_variable([layer2_size]) 103 | # Hidden Layer -> Output (Q) 104 | w3 = 
weight_variable([layer2_size, 1]) 105 | b3 = bias_variable([1]) 106 | 107 | # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 108 | h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 109 | # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 110 | # Action inserted here 111 | h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action_input, w2a) + b2) 112 | 113 | cost_value_output = tf.matmul(h2, w3) + b3 114 | 115 | return state_input, action_input, cost_value_output, [w1, b1, w2, w2a, b2, w3, b3] 116 | 117 | def create_target_cost_network(self, state_dim, action_dim, net): 118 | state_input = tf.placeholder("float", [None, state_dim]) 119 | action_input = tf.placeholder("float", [None, action_dim]) 120 | 121 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) 122 | target_update = ema.apply(net) 123 | target_net = [ema.average(x) for x in net] 124 | 125 | layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) 126 | layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + tf.matmul(action_input, target_net[3]) + target_net[4]) 127 | cost_value_output = tf.identity(tf.matmul(layer2, target_net[5]) + target_net[6]) 128 | 129 | return state_input, action_input, cost_value_output, target_update 130 | 131 | def update_target(self): 132 | self.sess.run(self.cost_target_update) 133 | 134 | def train(self, z_batch, state_batch, action_batch): 135 | # c_loss_summ = tf.summary.scalar('cost_critic_loss', self.cost_cost) 136 | # self.merged_cost = tf.summary.merge([c_loss_summ]) 137 | 138 | train_feed_dict = { 139 | self.z_input: z_batch, 140 | self.state_input: state_batch, 141 | self.action_input: action_batch 142 | } 143 | _, cost_critic_loss, cost_action_grad_norm = \ 144 | self.sess.run([self.optimizer, self.cost_cost, self.action_gradients_cost], train_feed_dict) 145 | 146 | # if self.step % self.log_iter == 0: 147 | # self.train_writer_cost.add_summary(merged_summ_cost, global_step=self.step) 148 | 149 | self.step += 1 150 | 151 | return cost_critic_loss, cost_action_grad_norm 152 | 153 | def pretrain(self, z_batch, state_batch, action_batch): 154 | train_feed_dict = { 155 | self.z_input: z_batch, 156 | self.state_input: state_batch, 157 | self.action_input: action_batch 158 | } 159 | _, cost_critic_loss = self.sess.run([self.optimizer, self.cost_cost], train_feed_dict) 160 | return cost_critic_loss 161 | 162 | def gradients(self, state_batch, action_batch): 163 | return self.sess.run(self.action_gradients_cost, feed_dict={ 164 | self.state_input: state_batch, 165 | self.action_input: action_batch 166 | })[0] 167 | 168 | def target_cost(self, state_batch, action_batch): 169 | return self.sess.run(self.target_cost_value_output, feed_dict={ 170 | self.target_state_input: state_batch, 171 | self.target_action_input: action_batch 172 | }) 173 | 174 | def cost_value(self, state_batch, action_batch): 175 | return self.sess.run(self.cost_value_output, feed_dict={ 176 | self.state_input: state_batch, 177 | self.action_input: action_batch}) 178 | 179 | # f fan-in size 180 | def variable(self, shape, f): 181 | return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f))) 182 | 183 | ''' 184 | def load_network(self): 185 | self.saver = tf.train.Saver() 186 | checkpoint = tf.train.get_checkpoint_state("saved_cost_critic_networks") 187 | if checkpoint and checkpoint.model_checkpoint_path: 188 | self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 189 | print "Successfully loaded:", checkpoint.model_checkpoint_path 190 | else: 191 | print "Could 
not find old network weights" 192 | def save_network(self,time_step): 193 | print 'save cost-critic-network...',time_step 194 | self.saver.save(self.sess, 'saved_cost_critic_networks/' + 'cost-critic-network', global_step = time_step) 195 | ''' 196 | 197 | 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /RL/primal_dual_ddpg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | from RL.ou_noise import OUNoise 5 | from RL.reward_critic_network import RewardCriticNetwork 6 | from RL.cost_critic_network import CostCriticNetwork 7 | 8 | from RL.actor_network import ActorNetwork 9 | from RL.replay_buffer import ReplayBuffer 10 | from RL.util import * 11 | 12 | # EPSILON定义一个极小值 13 | EPSILON = 1e-5 14 | # Hyper Parameters: 15 | REPLAY_MEMORY_SIZE = 10000 16 | REPLAY_START_SIZE = 1000 17 | GAMMA = 0.9 18 | COST_EPSILON = 1 19 | DUAL_STEP_SIZE = 0.01 20 | is_grad_inverter = False 21 | 22 | 23 | class PrimalDualDDPG(object): 24 | """ Primal Dual Deep Deterministic Policy Gradient Algorithm""" 25 | 26 | def __init__(self, sess, input_config, is_batch_norm, summ_writer=None, load_model=False): 27 | self.state_dim = input_config.state_dim 28 | self.action_dim = input_config.action_dim 29 | self.dual_lambda = input_config.init_dual_lambda 30 | self.save_path = input_config.model_save_path 31 | self.train_display_iter = input_config.train_display_iter 32 | self.batch_size = input_config.batch_size 33 | self.gamma = GAMMA 34 | self.summay_writer = summ_writer 35 | 36 | self.sess = sess 37 | self.step = 0 38 | 39 | 40 | if is_batch_norm: 41 | self.rewward_critic_network = RewardCriticNetwork_bn(self.sess, self.state_dim, self.action_dim) 42 | self.cost_critic_network = CostCriticNetwork_bn(self.sess, self.state_dim, self.action_dim) 43 | self.actor_network = ActorNetwork_bn(self.sess, self.state_dim, self.action_dim) 44 | 45 | else: 46 | self.reward_critic_network = RewardCriticNetwork(self.sess, input_config, self.summay_writer) 47 | self.cost_critic_network = CostCriticNetwork(self.sess, input_config, self.summay_writer) 48 | self.actor_network = ActorNetwork(self.sess, input_config, load_model=False, summ_writer=self.summay_writer) 49 | 50 | # initialize replay buffer 51 | self.replay_buffer = ReplayBuffer(REPLAY_MEMORY_SIZE) 52 | 53 | # Initialize a random process the Ornstein-Uhlenbeck process for action exploration 54 | self.exploration_noise = OUNoise(self.action_dim) 55 | 56 | # for name in input_config.__dict__: 57 | # if isinstance(input_config.__dict__[name], int) or isinstance(input_config.__dict__[name], float): 58 | # self.log(f'parameter|input_config_{name}:{input_config.__dict__[name]}') 59 | 60 | # model saver 61 | self.saver = tf.train.Saver() 62 | if load_model: 63 | self.saver.restore(sess=self.sess, save_path=tf.train.latest_checkpoint(self.save_path)) 64 | 65 | 66 | # def __del__(self): 67 | # self.logfile.close() 68 | # 69 | # def log(self, *args): 70 | # self.logfile.write(*args) 71 | # self.logfile.write('\n') 72 | 73 | def train(self): 74 | # print "train step", self.time_step 75 | # Sample a random minibatch of N transitions from replay buffer 76 | minibatch = self.replay_buffer.get_batch(self.batch_size) 77 | state_batch = np.asarray([data[0] for data in minibatch]) 78 | action_batch = np.asarray([data[1] for data in minibatch]) 79 | reward_batch = np.asarray([data[2] for data in minibatch]) 80 | cost_batch = 
np.asarray([data[3] for data in minibatch]) 81 | next_state_batch = np.asarray([data[4] for data in minibatch]) 82 | done_batch = np.asarray([data[5] for data in minibatch]) 83 | 84 | # Calculate y_batch 85 | target_action_batch = self.actor_network.target_actions(next_state_batch) 86 | target_reward_value = self.reward_critic_network.target_reward(next_state_batch, target_action_batch) 87 | target_cost_value = self.cost_critic_network.target_cost(next_state_batch, target_action_batch) 88 | y_batch, z_batch = [], [] 89 | for i in range(len(minibatch)): 90 | if done_batch[i]: 91 | y_batch.append(reward_batch[i]) 92 | z_batch.append(cost_batch[i]) 93 | else: 94 | y_batch.append(reward_batch[i] + GAMMA * target_reward_value[i]) 95 | z_batch.append(cost_batch[i] + GAMMA * target_cost_value[i]) 96 | 97 | y_batch = np.resize(y_batch, [self.batch_size, 1]) 98 | z_batch = np.resize(z_batch, [self.batch_size, 1]) 99 | 100 | # Update reward critic by minimizing the loss L 101 | reward_critic_loss, reward_action_grad_norm = self.reward_critic_network.train(y_batch, state_batch, action_batch) 102 | # q_value = self.critic_network.get_q_value(state_limit_batch, action_batch) 103 | 104 | # Update cost critic by minimizing the loss L 105 | cost_critic_loss, cost_action_grad_norm = self.cost_critic_network.train(z_batch, state_batch, action_batch) 106 | 107 | # Update the actor policy using the sampled gradient 108 | if is_grad_inverter: 109 | action_batch_for_gradients = self.actor_network.actions(state_batch) 110 | action_batch_for_gradients = self.grad_inv.invert(action_batch_for_gradients, ) 111 | else: 112 | action_batch_for_gradients = self.actor_network.actions(state_batch) 113 | print('action_batch_for_gradients', action_batch_for_gradients) 114 | reward_gradient_batch = self.reward_critic_network.gradients(state_batch, action_batch_for_gradients) 115 | cost_gradient_batch = self.cost_critic_network.gradients(state_batch, action_batch_for_gradients) 116 | q_gradient_batch = reward_gradient_batch - self.dual_lambda * cost_gradient_batch 117 | self.actor_network.train(q_gradient_batch, state_batch) 118 | 119 | # Update the dual variable using the sample gradient 120 | cost_value_batch = self.cost_critic_network.cost_value(state_batch, action_batch_for_gradients) 121 | cost_limit_batch = np.array([[COST_EPSILON] for _ in range(self.batch_size)]) 122 | self.dual_gradients = np.mean(cost_value_batch - cost_limit_batch) 123 | self.dual_lambda += DUAL_STEP_SIZE * self.dual_gradients 124 | self.dual_lambda = np.max([EPSILON, self.dual_lambda]) # ensure dual >= 0 125 | 126 | if self.step % self.train_display_iter == 0: 127 | print("reward_critic: loss:{:.3f} action_grads_norm:{:.3f} " 128 | "| cost_critic: loss:{:.3f} action_grads_norm:{:.3f}" 129 | "| q_gradient:{:.3f}".format( 130 | reward_critic_loss, np.mean(reward_action_grad_norm), 131 | cost_critic_loss, np.mean(cost_action_grad_norm), np.mean(q_gradient_batch))) 132 | print("Dual lambda: {}".format(self.dual_lambda)) 133 | 134 | 135 | # Update the target networks 136 | self.reward_critic_network.update_target() 137 | self.cost_critic_network.update_target() 138 | self.actor_network.update_target() 139 | self.step += 1 140 | 141 | def noise_action(self, state, episode): 142 | # Select action a_t according to the current policy and exploration noise 143 | action = self.actor_network.action(state) 144 | if episode % 10 == 0: 145 | self.exploration_noise.update_weight() 146 | noise_action = action + self.exploration_noise.noise() 147 | noise_action = 
np.minimum(np.maximum(noise_action, 0), 1) # bound action to [0, 1] 148 | return noise_action 149 | 150 | def action(self, state): 151 | action = self.actor_network.action(state) 152 | return action 153 | 154 | def get_dual_lambda(self): 155 | return self.dual_lambda 156 | 157 | def perceive(self, state, action, reward, cost, next_state, done, mix_ratio): 158 | # Store transition (s_t,a_t,r_t,c_t,s_{t+1}) in replay buffer 159 | self.replay_buffer.add(state, action, reward, cost, next_state, done, mix_ratio) 160 | 161 | # Store transitions to replay start size then start training 162 | if self.replay_buffer.count() > REPLAY_START_SIZE: 163 | self.train() 164 | 165 | #if self.time_step % 10000 == 0: 166 | #self.actor_network.save_network(self.time_step) 167 | #self.critic_network.save_network(self.time_step) 168 | 169 | # Re-iniitialize the random process when an episode ends 170 | if done: 171 | self.exploration_noise.reset() 172 | 173 | def save_model(self): 174 | self.saver.save(sess=self.sess, save_path=self.save_path) #global_step=10,会自动生成名字-10 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /RL/actor_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import math 3 | 4 | 5 | # Hyper Parameters 6 | LAYER1_SIZE = 256 7 | LAYER2_SIZE = 256 8 | LAYER3_SIZE = 128 9 | LEARNING_RATE = 0.0001 10 | TAU = 0.001 11 | 12 | 13 | def weight_variable(shape, name): 14 | initial = tf.truncated_normal(shape, stddev=0.01) 15 | return tf.Variable(initial, name) 16 | 17 | 18 | def bias_variable(shape, name): 19 | initial = tf.constant(0.03, shape=shape) 20 | return tf.Variable(initial, name) 21 | 22 | 23 | class ActorNetwork(object): 24 | """ Map: state + limit_load -> action """ 25 | 26 | def __init__(self, sess, input_config, load_model, summ_writer): 27 | self.sess = sess 28 | self.state_dim = input_config.state_dim 29 | self.action_dim = input_config.action_dim 30 | self.save_iter = input_config.save_iter # interval of saving log 31 | self.save_path = input_config.model_save_path + "/actor" # interval of saving model 32 | self.log_iter = input_config.log_iter # logging interval in training phase 33 | self.log_path = input_config.log_path # log path 34 | self.clip_norm = input_config.clip_norm 35 | self.step = 0 36 | 37 | self.train_writer = summ_writer 38 | 39 | # create actor network 40 | self.state_input, self.action_output, self.net = self.create_network(self.state_dim, self.action_dim) 41 | # create target actor network 42 | self.target_state_input, self.target_action_output, self.target_update, self.target_net = self.create_target_network( 43 | self.state_dim, self.action_dim, self.net) 44 | self.create_training_method() 45 | 46 | self.saver = tf.train.Saver() 47 | # self.saver = tf.train.Saver(tf.global_variables(scope=scope)) 48 | if load_model: 49 | # restore actor network 50 | print('actor network restore weights') 51 | self.saver.restore(sess=self.sess, save_path=tf.train.latest_checkpoint(input_config.load_path)) 52 | else: 53 | self.sess.run(tf.global_variables_initializer()) 54 | 55 | self.update_target() 56 | 57 | 58 | def create_training_method(self): 59 | self.q_gradient_input = tf.placeholder("float", [None, self.action_dim]) 60 | self.unnormalized_actor_gradients = tf.gradients(self.action_output, self.net, -self.q_gradient_input) 61 | # self.actor_gradients = list(map(lambda x: 
tf.div(x, BATCH_SIZE), self.unnormalized_actor_gradients)) 62 | # gradients clip 63 | # self.actor_gradients, _ = tf.clip_by_global_norm(self.actor_gradients, clip_norm=self.clip_norm) 64 | 65 | # extra_ops = tf.get_collection('actor_parameters_extra_option') 66 | # apply_op = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients(zip(self.unnormalized_actor_gradients, self.net)) 67 | apply_op = tf.train.RMSPropOptimizer(LEARNING_RATE).apply_gradients(zip(self.unnormalized_actor_gradients, self.net)) 68 | 69 | # train_ops = [apply_op] + extra_ops 70 | # self.optimizer = tf.group(*apply_op) 71 | self.optimizer = apply_op 72 | 73 | diff = self.action_output - self.target_action_output 74 | self.mse = tf.reduce_mean(tf.square(diff)) 75 | pretrain_grad = tf.gradients(self.mse, self.net) 76 | self.pretrain_update = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients( 77 | zip(pretrain_grad, self.net)) 78 | 79 | 80 | 81 | def create_network(self, state_dim, action_dim): 82 | layer1_size = LAYER1_SIZE 83 | layer2_size = LAYER2_SIZE 84 | 85 | state_input = tf.placeholder("float", [None, state_dim]) 86 | 87 | w1 = self.variable([state_dim, layer1_size], state_dim) 88 | b1 = self.variable([layer1_size], state_dim) 89 | w2 = self.variable([layer1_size, layer2_size], layer1_size) 90 | b2 = self.variable([layer2_size], layer1_size) 91 | w3 = tf.Variable(tf.random_uniform([layer2_size, action_dim], -3e-3, 3e-3)) 92 | b3 = tf.Variable(tf.random_uniform([action_dim], -3e-3, 3e-3)) 93 | 94 | layer1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 95 | layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2) 96 | action_output = tf.sigmoid(tf.matmul(layer2, w3) + b3) 97 | out_summ = tf.summary.histogram('action_output', action_output) 98 | 99 | w1_summ = tf.summary.histogram('W1', values=w1) 100 | b1_summ = tf.summary.histogram('b1', values=b1) 101 | 102 | w2_summ = tf.summary.histogram('W2', values=w2) 103 | b2_summ = tf.summary.histogram('b2', values=b2) 104 | 105 | w3_summ = tf.summary.histogram('W3', values=w3) 106 | b3_summ = tf.summary.histogram('b3', values=b3) 107 | 108 | self.merged_summ = tf.summary.merge([out_summ, w1_summ, b1_summ, w2_summ, b2_summ, w3_summ, b3_summ]) 109 | # self.merged_summ = tf.summary.merge([out_summ]) 110 | 111 | return state_input, action_output, [w1, b1, w2, b2, w3, b3] 112 | 113 | # def create_network(self, state_dim, action_dim): 114 | # layer1_size = LAYER1_SIZE 115 | # layer2_size = LAYER2_SIZE 116 | # 117 | # state_input = tf.placeholder("float", [None, state_dim]) 118 | # 119 | # # Input -> Hidden Layer 120 | # w1 = weight_variable([self.state_dim, layer1_size], 'W1') 121 | # b1 = bias_variable([layer1_size], 'b1') 122 | # # Hidden Layer -> Hidden Layer 123 | # w2 = weight_variable([layer1_size, layer2_size], 'W2') 124 | # b2 = bias_variable([layer2_size], 'b2') 125 | # # Hidden Layer -> Output 126 | # w3 = weight_variable([layer2_size, self.action_dim], 'W3') 127 | # b3 = bias_variable([self.action_dim], 'b3') 128 | # 129 | # # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 130 | # h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 131 | # # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 132 | # h2 = tf.nn.relu(tf.matmul(h1, w2) + b2) 133 | # 134 | # # Run sigmoid on output to get 0 to 1 135 | # action_output = tf.nn.sigmoid(tf.matmul(h2, w3) + b3) 136 | # out_summ = tf.summary.histogram('action_output', action_output) 137 | # 138 | # w1_summ = tf.summary.histogram('W1', values=w1) 139 | # b1_summ = tf.summary.histogram('b1', values=b1) 140 | # 141 | # 
w2_summ = tf.summary.histogram('W2', values=w2) 142 | # b2_summ = tf.summary.histogram('b2', values=b2) 143 | # 144 | # w3_summ = tf.summary.histogram('W3', values=w3) 145 | # b3_summ = tf.summary.histogram('b3', values=b3) 146 | # 147 | # self.merged_summ = tf.summary.merge([out_summ, w1_summ, b1_summ, w2_summ, b2_summ, w3_summ, b3_summ]) 148 | # # self.merged_summ = tf.summary.merge([out_summ]) 149 | # 150 | # # scaled_out = tf.multiply(out, self.action_bound) # Scale output to -action_bound to action_bound 151 | # 152 | # return state_input, action_output, [w1, b1, w2, b2, w3, b3] 153 | 154 | 155 | def create_target_network(self, state_dim, action_dim, net): 156 | state_input = tf.placeholder("float", [None, state_dim]) 157 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) 158 | target_update = ema.apply(net) 159 | target_net = [ema.average(x) for x in net] 160 | 161 | layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) 162 | layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + target_net[3]) 163 | 164 | action_output = tf.tanh(tf.matmul(layer2, target_net[4]) + target_net[5]) 165 | 166 | return state_input, action_output, target_update, target_net 167 | 168 | def update_target(self): 169 | self.sess.run(self.target_update) 170 | 171 | def train(self, q_gradient_batch, state_batch): 172 | train_feed_dict = { 173 | self.q_gradient_input: q_gradient_batch, 174 | self.state_input: state_batch 175 | } 176 | summ, _ = self.sess.run([self.merged_summ, self.optimizer], feed_dict=train_feed_dict) 177 | # _ = self.sess.run([self.optimizer], feed_dict=train_feed_dict) 178 | 179 | # save actor network 180 | if self.step % self.save_iter == 0: 181 | self.saver.save(self.sess, save_path=self.save_path, global_step=self.step) 182 | 183 | if self.step % self.log_iter == 0: 184 | self.train_writer.add_summary(summ, global_step=self.step) 185 | 186 | self.step += 1 187 | 188 | def pretrain(self, state, label): 189 | # cost 190 | train_feed_dict = {self.state_input: state, self.target_action_output: label} 191 | _, net, mse = self.sess.run([self.pretrain_update, self.net, self.mse], feed_dict=train_feed_dict) 192 | # save actor network 193 | if self.step % self.save_iter == 0: 194 | self.saver.save(self.sess, save_path=self.save_path, global_step=self.step) 195 | 196 | self.step += 1 197 | return net, mse 198 | 199 | def actions(self, state_batch): 200 | return self.sess.run(self.action_output, feed_dict={ 201 | self.state_input: state_batch 202 | }) 203 | 204 | def action(self, state): 205 | return self.sess.run(self.action_output, feed_dict={ 206 | self.state_input: [state] 207 | })[0] 208 | 209 | def target_actions(self, state_batch): 210 | return self.sess.run(self.target_action_output, feed_dict={ 211 | self.target_state_input: state_batch 212 | }) 213 | 214 | # f fan-in size 215 | def variable(self, shape, f): 216 | return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f))) 217 | 218 | def save_network(self, episode): 219 | print('save actor-network...', episode) 220 | self.saver.save(self.sess, 'saved_actor_networks/' + 'actor-network', global_step=episode) 221 | 222 | ''' 223 | def load_network(self): 224 | self.saver = tf.train.Saver() 225 | checkpoint = tf.train.get_checkpoint_state("saved_actor_networks") 226 | if checkpoint and checkpoint.model_checkpoint_path: 227 | self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 228 | print "Successfully loaded:", checkpoint.model_checkpoint_path 229 | else: 230 | print "Could not find 
old network weights" 231 | 232 | ''' 233 | 234 | -------------------------------------------------------------------------------- /RL/util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | OUTER_START_POS = 0 5 | OUTER_SIZE = 11 6 | STATE_SIZE = 47 7 | ACTION_SIZE = 51 8 | STATE_START_POS = OUTER_START_POS + OUTER_SIZE 9 | ACTION_START_POS = STATE_START_POS + STATE_SIZE 10 | NEW_STATE_START_POS = ACTION_START_POS + ACTION_SIZE 11 | 12 | NOX_POS = 40 13 | STEAM_TEMP_POS = 48 14 | STEAM_PRES_POS = 49 15 | NEG_PRES_POS = 44 16 | LIM_LOAD_POS = 11 17 | LOAD_POS = 46 18 | EFFI_WEIGHT = 0.8 19 | 20 | # Read inv-normlization file 21 | inv_norm = pd.read_csv('/Users/xhr/PycharmProjects/Boiler/Simulator/data/反归一化_new.csv', index_col='name') 22 | inv_norm_min = inv_norm['min'].values # convert to ndarray 23 | inv_norm_max = inv_norm['max'].values # convert to ndarray 24 | 25 | 26 | def get_efficiency(state): 27 | if len(state.shape) == 1: 28 | # 主蒸汽流量 29 | h = state[47] * (inv_norm.loc['主蒸汽流量']['max'] - inv_norm.loc['主蒸汽流量']['min']) + inv_norm.loc['主蒸汽流量']['min'] 30 | # 排烟含氧量 31 | i = state[39] * (inv_norm.loc['排烟含氧量']['max'] - inv_norm.loc['排烟含氧量']['min']) + inv_norm.loc['排烟含氧量']['min'] 32 | # 引风机入口温度 33 | j = state[38] * (inv_norm.loc['引风机入口温度']['max'] - inv_norm.loc['引风机入口温度']['min']) + inv_norm.loc['引风机入口温度']['min'] 34 | # 送风机入口温度 35 | k = state[10] * (inv_norm.loc['送风机入口温度']['max'] - inv_norm.loc['送风机入口温度']['min']) + inv_norm.loc['送风机入口温度']['min'] 36 | # 低位发热量MJ/kg 37 | l = state[7] * (inv_norm.loc['低位发热量MJ/kg']['max'] - inv_norm.loc['低位发热量MJ/kg']['min']) + inv_norm.loc['低位发热量MJ/kg']['min'] 38 | # 收到基水份% 39 | m = state[4] * (inv_norm.loc['收到基水份%']['max'] - inv_norm.loc['收到基水份%']['min']) + inv_norm.loc['收到基水份%']['min'] 40 | # 收到基灰份% 41 | n = state[5] * (inv_norm.loc['收到基灰份%']['max'] - inv_norm.loc['收到基灰份%']['min']) + inv_norm.loc['收到基灰份%']['min'] 42 | # 飞灰% 43 | p = state[9] * (inv_norm.loc['飞灰%']['max'] - inv_norm.loc['飞灰%']['min']) + inv_norm.loc['飞灰%']['min'] 44 | # 渣% 45 | q = state[8] * (inv_norm.loc['渣%']['max'] - inv_norm.loc['渣%']['min']) + inv_norm.loc['渣%']['min'] 46 | # 分析基水份% 47 | analytical_base_moisture = state[1] * (inv_norm.loc['分析基水份%']['max'] - inv_norm.loc['分析基水份%']['min']) + inv_norm.loc['分析基水份%']['min'] 48 | # 分析基挥发分% 49 | analytical_base_volatile = state[3] * (inv_norm.loc['分析基挥发分%']['max'] - inv_norm.loc['分析基挥发分%']['min']) + inv_norm.loc['分析基挥发分%']['min'] 50 | else: 51 | # 主蒸汽流量 52 | h = state[:, 47] * (inv_norm.loc['主蒸汽流量']['max'] - inv_norm.loc['主蒸汽流量']['min']) + inv_norm.loc['主蒸汽流量']['min'] 53 | # 排烟含氧量 54 | i = state[:, 39] * (inv_norm.loc['排烟含氧量']['max'] - inv_norm.loc['排烟含氧量']['min']) + inv_norm.loc['排烟含氧量']['min'] 55 | # 引风机入口温度 56 | j = state[:, 38] * (inv_norm.loc['引风机入口温度']['max'] - inv_norm.loc['引风机入口温度']['min']) + inv_norm.loc['引风机入口温度']['min'] 57 | # 送风机入口温度 58 | k = state[:, 10] * (inv_norm.loc['送风机入口温度']['max'] - inv_norm.loc['送风机入口温度']['min']) + inv_norm.loc['送风机入口温度']['min'] 59 | # 低位发热量MJ/kg 60 | l = state[:, 7] * (inv_norm.loc['低位发热量MJ/kg']['max'] - inv_norm.loc['低位发热量MJ/kg']['min']) + inv_norm.loc['低位发热量MJ/kg']['min'] 61 | # 收到基水份% 62 | m = state[:, 4] * (inv_norm.loc['收到基水份%']['max'] - inv_norm.loc['收到基水份%']['min']) + inv_norm.loc['收到基水份%']['min'] 63 | # 收到基灰份% 64 | n = state[:, 5] * (inv_norm.loc['收到基灰份%']['max'] - inv_norm.loc['收到基灰份%']['min']) + inv_norm.loc['收到基灰份%']['min'] 65 | # 飞灰% 66 | p = state[:, 9] * (inv_norm.loc['飞灰%']['max'] - 
inv_norm.loc['飞灰%']['min']) + inv_norm.loc['飞灰%']['min'] 67 | # 渣% 68 | q = state[:, 8] * (inv_norm.loc['渣%']['max'] - inv_norm.loc['渣%']['min']) + inv_norm.loc['渣%']['min'] 69 | # 分析基水份% 70 | analytical_base_moisture = state[:, 1] * (inv_norm.loc['分析基水份%']['max'] - inv_norm.loc['分析基水份%']['min']) + inv_norm.loc['分析基水份%']['min'] 71 | # 分析基挥发分% 72 | analytical_base_volatile = state[:, 3] * (inv_norm.loc['分析基挥发分%']['max'] - inv_norm.loc['分析基挥发分%']['min']) + inv_norm.loc['分析基挥发分%']['min'] 73 | 74 | o = (100 - m) / (100 - analytical_base_moisture) * analytical_base_volatile 75 | l = l * 1000 76 | u = 10 * q / (100 - q) + 90 * p / (100 - p) 77 | v = 0.257 * (l - 3.3727 * n * u) / 1000 78 | w = 0.98 * v 79 | x = o * 100 / (100 - m - n) 80 | y = 2.1236 * x ** 0.2319 81 | z = y * (100 - m - n) / 100 82 | aa = 21 / (21 - i) 83 | ab = w + (aa - 1) * v 84 | ac = 1.24 * ((9 * z + m) / 100 + 1.293 * aa * v * 0.01) 85 | ad = 5.82 * 2141 ** (-0.38) 86 | s = ab * 1.38 * (j - k) 87 | t = ac * 1.51 * (j - k) 88 | 89 | c = (s + t) / l * 100 90 | # d = 126.36 * r * ab / l * 100 91 | e = 337.27 * n * u / l 92 | f = ad * (1095.4 / h) 93 | g = n * (10 * (800 - k) * 0.96 / (100 - q) + 90 * (j - k) * 0.82 / (100 - p)) / l 94 | 95 | effi = 100 - c - e - f - g 96 | norm_effi = (effi - inv_norm.loc['1号机组锅炉效率']['min']) / (inv_norm.loc['1号机组锅炉效率']['max'] - inv_norm.loc['1号机组锅炉效率']['min']) 97 | 98 | return norm_effi 99 | 100 | 101 | def get_emission(state): 102 | if len(state.shape) == 1: 103 | return state[NOX_POS] 104 | else: 105 | return state[:, NOX_POS] 106 | 107 | 108 | def get_steam_temp(state): 109 | if len(state.shape) == 1: 110 | return state[STEAM_TEMP_POS] 111 | else: 112 | return state[:, STEAM_TEMP_POS] 113 | 114 | 115 | def get_given_steam_pres(state, load): 116 | if len(state.shape) == 1: 117 | if load >= 560: 118 | given_steam_pres = 24.0 119 | else: 120 | given_steam_pres = 0.036072 * load + 3.89199 121 | else: 122 | given_steam_pres = np.ones([load.shape[0]]) * 24 123 | given_steam_pres[load < 560] = 0.036072 * load[load < 560] + 3.89199 124 | return given_steam_pres 125 | 126 | 127 | 128 | def get_steam_pres(state): 129 | if len(state.shape) == 1: 130 | return state[STEAM_PRES_POS] 131 | else: 132 | return state[:, STEAM_PRES_POS] 133 | 134 | 135 | def get_neg_pres(state): 136 | if len(state.shape) == 1: 137 | return state[NEG_PRES_POS] 138 | else: 139 | return state[:, NEG_PRES_POS] 140 | 141 | 142 | def get_lim_load(state): 143 | if len(state.shape) == 1: 144 | return state[LIM_LOAD_POS] 145 | else: 146 | return state[:, LIM_LOAD_POS] 147 | 148 | 149 | def get_load(state): 150 | if len(state.shape) == 1: 151 | return state[LOAD_POS] 152 | else: 153 | return state[:, LOAD_POS] 154 | 155 | 156 | def compute_reward(state): 157 | # coals = get_coals(action) 158 | efficiency = get_efficiency(state) 159 | emission = get_emission(state) 160 | # print('effi', EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission) 161 | reward = EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission 162 | if np.mean(reward) > 1: 163 | print(reward, efficiency, emission) 164 | return 10*(EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission) 165 | 166 | 167 | # def compute_cost(state): 168 | # lim_load = get_lim_load(state) * (inv_norm.loc['lim_load']['max'] - inv_norm.loc['lim_load']['min']) + inv_norm.loc['lim_load']['min'] 169 | # load = get_load(state) * (inv_norm.loc['#1机组锅炉负荷']['max'] - inv_norm.loc['#1机组锅炉负荷']['min']) + inv_norm.loc['#1机组锅炉负荷']['min'] 170 | # steam_temp = get_steam_temp(state) * 
(inv_norm.loc['锅炉主蒸汽温度']['max'] - inv_norm.loc['锅炉主蒸汽温度']['min']) + inv_norm.loc['锅炉主蒸汽温度']['min'] 171 | # given_steam_pres = get_given_steam_pres(load) 172 | # steam_pres = get_steam_pres(state) * (inv_norm.loc['主蒸汽压力']['max'] - inv_norm.loc['主蒸汽压力']['min']) + inv_norm.loc['主蒸汽压力']['min'] 173 | # # neg_pressure = get_neg_pres(state) * (inv_norm.loc['炉膛负压']['max'] - inv_norm.loc['炉膛负压']['min']) + inv_norm.loc['炉膛负压']['min'] 174 | # 175 | # # cost 1, 负荷:lim_load ~ limload+25 176 | # if len(state.shape) == 1: 177 | # if load - lim_load > 25 or load < lim_load: 178 | # cost_load = 1 179 | # else: 180 | # cost_load = 0 181 | # else: 182 | # cost_load = np.zeros([len(state), 1]) 183 | # cost_load[(load-lim_load > 25) | (load-lim_load < 0)] = 1 184 | # 185 | # # else: 186 | # # if diff < 0.01: 187 | # # return 0 188 | # # elif diff < 0.1: 189 | # # return 0.2 190 | # # elif diff < 0.5: 191 | # # return 0.5 192 | # # else: 193 | # # return 1 194 | # 195 | # # cost 2, 主蒸汽温度:569-10 ~ 569+5 196 | # if len(state.shape) == 1: 197 | # if steam_temp > 569+5 or steam_temp < 569-10: 198 | # cost_steam_temp = 1 199 | # else: 200 | # cost_steam_temp = 0 201 | # else: 202 | # cost_steam_temp = np.zeros([len(state), 1]) 203 | # cost_steam_temp[(steam_temp > 569+5) | (steam_temp < 569-10)] = 1 204 | # 205 | # # cost 3, 主蒸汽压力:given_pres-0.5 ~ given_pres+0.5 206 | # if len(state.shape) == 1: 207 | # if steam_pres > given_steam_pres+0.5 or steam_pres < given_steam_pres-0.5: 208 | # cost_steam_pres = 1 209 | # else: 210 | # cost_steam_pres = 0 211 | # else: 212 | # cost_steam_pres = np.zeros([len(state), 1]) 213 | # cost_steam_pres[(steam_pres > given_steam_pres+0.5) & (steam_pres < given_steam_pres-0.5)] = 1 214 | # 215 | # return 1/3*cost_load + 1/3*cost_steam_temp + 1/3*cost_steam_pres 216 | 217 | def compute_cost(state): 218 | lim_load = get_lim_load(state) * (inv_norm.loc['lim_load']['max'] - inv_norm.loc['lim_load']['min']) + inv_norm.loc['lim_load']['min'] 219 | load = get_load(state) * (inv_norm.loc['#1机组锅炉负荷']['max'] - inv_norm.loc['#1机组锅炉负荷']['min']) + inv_norm.loc['#1机组锅炉负荷']['min'] 220 | steam_temp = get_steam_temp(state) * (inv_norm.loc['锅炉主蒸汽温度']['max'] - inv_norm.loc['锅炉主蒸汽温度']['min']) + inv_norm.loc['锅炉主蒸汽温度']['min'] 221 | given_steam_pres = get_given_steam_pres(state, load) 222 | steam_pres = get_steam_pres(state) * (inv_norm.loc['主蒸汽压力']['max'] - inv_norm.loc['主蒸汽压力']['min']) + inv_norm.loc['主蒸汽压力']['min'] 223 | # neg_pressure = get_neg_pres(state) * (inv_norm.loc['炉膛负压']['max'] - inv_norm.loc['炉膛负压']['min']) + inv_norm.loc['炉膛负压']['min'] 224 | 225 | # cost 1, 负荷:lim_load ~ limload+25 226 | if len(state.shape) == 1: 227 | if load - lim_load > 25: 228 | cost_load = np.abs(load - lim_load - 25) / 10 229 | elif load < lim_load: 230 | cost_load = 1 231 | else: 232 | cost_load = 0 233 | else: 234 | cost_load = np.zeros([len(state)]) 235 | cost_load[load-lim_load > 25] = np.abs(load - lim_load - 25)[load-lim_load > 25] / 10 236 | cost_load[load-lim_load < 0] = 1 237 | 238 | 239 | # else: 240 | # if diff < 0.01: 241 | # return 0 242 | # elif diff < 0.1: 243 | # return 0.2 244 | # elif diff < 0.5: 245 | # return 0.5 246 | # else: 247 | # return 1 248 | 249 | # cost 2, 主蒸汽温度:569-10 ~ 569+5 250 | if len(state.shape) == 1: 251 | if steam_temp > 569+10: 252 | cost_steam_temp = np.abs(steam_temp - 569-10) / 10 253 | elif steam_temp < 569-10: 254 | cost_steam_temp = np.abs(steam_temp - 569+10) / 10 255 | else: 256 | cost_steam_temp = 0 257 | else: 258 | cost_steam_temp = np.zeros([len(state)]) 259 | 
cost_steam_temp[steam_temp > 569+10] = np.abs(steam_temp - 569-10)[steam_temp > 569+10] / 10 260 | cost_steam_temp[steam_temp < 569-10] = np.abs(steam_temp - 569+10)[steam_temp < 569-10] / 10 261 | 262 | # cost 3, 主蒸汽压力:given_pres-0.5 ~ given_pres+0.5 263 | if len(state.shape) == 1: 264 | if steam_pres > given_steam_pres+1: 265 | cost_steam_pres = np.abs(steam_pres - given_steam_pres-1) / 5 266 | elif steam_pres < given_steam_pres-1: 267 | cost_steam_pres = np.abs(steam_pres - given_steam_pres+1) / 5 268 | else: 269 | cost_steam_pres = 0 270 | else: 271 | cost_steam_pres = np.zeros([len(state)]) 272 | cost_steam_pres[steam_pres > given_steam_pres+1] = np.abs(steam_pres - given_steam_pres-1)[steam_pres > given_steam_pres+1] / 5 273 | cost_steam_pres[steam_pres < given_steam_pres-1] = np.abs(steam_pres - given_steam_pres+1)[steam_pres < given_steam_pres-1] / 5 274 | 275 | 276 | return 1/3*cost_load + 1/3*cost_steam_temp + 1/3*cost_steam_pres 277 | 278 | 279 | def compute_done(state): 280 | return False 281 | 282 | 283 | def convert_to_tuple(batch): 284 | outer = batch[:, OUTER_START_POS: OUTER_START_POS + OUTER_SIZE] 285 | state_with_outer = batch[:, OUTER_START_POS: STATE_START_POS + STATE_SIZE] 286 | action = batch[:, ACTION_START_POS: ACTION_START_POS + ACTION_SIZE] 287 | new_state = batch[:, NEW_STATE_START_POS: NEW_STATE_START_POS + STATE_SIZE] 288 | new_state_with_outer = np.concatenate([outer, new_state], axis=1) 289 | done = batch[:, -1] 290 | return (state_with_outer, action, new_state_with_outer, done) 291 | 292 | 293 | 294 | def restrictive_action(action, episode): 295 | action_histogram = np.array(pd.read_csv('../Simulator/data/action_histogram.csv', header=None)).astype('float') 296 | threshold = action_histogram[:, -1] 297 | noise_weight = 1 298 | 299 | if episode % 100 == 0 and episode > 0: 300 | noise_weight *= 0.99 301 | 302 | # print('value'+str(self.df[np.arange(len(x[:-1])).astype('int'), (x[:-1] * 20).astype('int')])) 303 | action_distri = np.array(action_histogram[np.arange(len(action)).astype('int'), (action * 20).astype('int')[:]] > threshold).astype('int') 304 | for i in range(100): 305 | if len(np.where(action_distri == 0)[0]) > 0: 306 | unsatisfied_index = np.where(action_distri == 0)[0] 307 | # print(f'unsatisfied index {unsatisfied_index}') 308 | # f'unsatisfied action {actions[unsatisfied_index]})') 309 | random_noise = np.random.normal(np.zeros(len(unsatisfied_index)), 310 | (0.1 + 0.01 * i) * np.ones(len(unsatisfied_index)), 311 | len(unsatisfied_index)) 312 | action[unsatisfied_index] += random_noise * noise_weight 313 | # print(f'fixed actions {actions[unsatisfied_index]}' 314 | else: 315 | # print(32 + np.where(action_distri[32:] == 0)[0]) 316 | # print('find action within '+str(i)+' times') 317 | break 318 | 319 | # if len(np.where(action_distri == 0)[0]) > 0: 320 | # unsatisfied_index = np.where(action_distri == 0)[0] 321 | # print(f'Break! 
dissatisfied actions is {len(np.where(action_distri == 0)[0])}, ' 322 | # f'index: {unsatisfied_index}, value: {action[unsatisfied_index]},') 323 | # break 324 | 325 | return action 326 | -------------------------------------------------------------------------------- /Simulator/simrnn_cell.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import collections 4 | import hashlib 5 | import numbers 6 | 7 | from tensorflow.python.eager import context 8 | from tensorflow.python.framework import constant_op 9 | from tensorflow.python.framework import ops 10 | from tensorflow.python.framework import dtypes 11 | from tensorflow.python.layers import base as base_layer 12 | from tensorflow.contrib.rnn import RNNCell 13 | from tensorflow.contrib.rnn import LSTMCell 14 | from tensorflow.contrib.rnn import GRUCell 15 | from tensorflow.python.ops import array_ops 16 | from tensorflow.python.ops import clip_ops 17 | from tensorflow.python.ops import init_ops 18 | from tensorflow.python.ops import math_ops 19 | from tensorflow.python.ops import nn_ops 20 | from tensorflow.python.ops import partitioned_variables 21 | from tensorflow.python.ops import random_ops 22 | from tensorflow.python.ops import tensor_array_ops 23 | from tensorflow.python.ops import variable_scope as vs 24 | from tensorflow.python.ops import variables as tf_variables 25 | from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple 26 | from tensorflow.python.util import nest 27 | from tensorflow.python.util.tf_export import tf_export 28 | from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors 29 | 30 | 31 | class _LayerRNNCell(RNNCell): 32 | """Subclass of RNNCells that act like proper `tf.Layer` objects. 33 | 34 | For backwards compatibility purposes, most `RNNCell` instances allow their 35 | `call` methods to instantiate variables via `tf.get_variable`. The underlying 36 | variable scope thus keeps track of any variables, and returning cached 37 | versions. This is atypical of `tf.layer` objects, which separate this 38 | part of layer building into a `build` method that is only called once. 39 | 40 | Here we provide a subclass for `RNNCell` objects that act exactly as 41 | `Layer` objects do. They must provide a `build` method and their 42 | `call` methods do not access Variables `tf.get_variable`. 43 | """ 44 | 45 | def __call__(self, inputs, state, scope=None, *args, **kwargs): 46 | """Run this RNN cell on inputs, starting from the given state. 47 | 48 | Args: 49 | inputs: `2-D` tensor with shape `[batch_size, input_size]`. 50 | state: if `self.state_size` is an integer, this should be a `2-D Tensor` 51 | with shape `[batch_size, self.state_size]`. Otherwise, if 52 | `self.state_size` is a tuple of integers, this should be a tuple 53 | with shapes `[batch_size, s] for s in self.state_size`. 54 | scope: optional cell scope. 55 | *args: Additional positional arguments. 56 | **kwargs: Additional keyword arguments. 57 | 58 | Returns: 59 | A pair containing: 60 | 61 | - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. 62 | - New state: Either a single `2-D` tensor, or a tuple of tensors matching 63 | the arity and shapes of `state`. 64 | """ 65 | # Bypass RNNCell's variable capturing semantics for LayerRNNCell. 66 | # Instead, it is up to subclasses to provide a proper build 67 | # method. See the class docstring for more details. 
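        # Delegating to tf.layers.Layer.__call__ means build() runs exactly once to
        # create the kernels/biases, and every later call reuses those variables
        # rather than fetching them through tf.get_variable.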
68 | return base_layer.Layer.__call__(self, inputs, state, scope=scope, 69 | *args, **kwargs) 70 | 71 | 72 | class SimulatorRNNCell(_LayerRNNCell): 73 | """ 74 | coaler RNN: (external_input_t, coaler_hidden_t-1 , coaler_action_t) --> (coaler_hidden_t, coaler_cell_t) 75 | burner RNN: (coaler_hidden_t, burner_hidden_t-1 , burner_action_t) --> (burner_hidden_t, burner_cell_t) 76 | steamer RNN: (burner_hidden_t, steamer_hidden_t-1 , steamer_action_t) --> (steamer_hidden_t, steamer_cell_t) 77 | 78 | loss: sum of three parts 79 | part1: coaler_hidden_t, coaler_state_t 80 | part2: burner_hidden_t, burner_state_t 81 | part3: steamer_hidden_t, steamer_state_t 82 | """ 83 | def __init__(self, cell_config, 84 | keep_prob, 85 | forget_bias=1.0, 86 | activation=None, 87 | reuse=None, 88 | name=None): 89 | """ 90 | Args: 91 | cell_config: simulator config 92 | num_units: list, [coaler_num_units, burner_num_units, steamer_num_units] 93 | """ 94 | super(SimulatorRNNCell, self).__init__(_reuse=reuse, name=name) 95 | # Inputs must be 2-dimensional. 96 | self.input_spec = base_layer.InputSpec(ndim=2) 97 | 98 | self._external_state_pos = cell_config.external_state_pos 99 | self._coaler_state_pos = cell_config.coaler_state_pos 100 | self._coaler_action_pos = cell_config.coaler_action_pos 101 | self._burner_state_pos = cell_config.burner_state_pos 102 | self._burner_action_pos = cell_config.burner_action_pos 103 | self._steamer_state_pos = cell_config.steamer_state_pos 104 | self._steamer_action_pos = cell_config.steamer_action_pos 105 | 106 | self._external_state_size = cell_config.external_state_size 107 | self._coaler_state_size = cell_config.coaler_state_size 108 | self._coaler_action_size = cell_config.coaler_action_size 109 | self._burner_state_size = cell_config.burner_state_size 110 | self._burner_action_size = cell_config.burner_action_size 111 | self._steamer_state_size = cell_config.steamer_state_size 112 | self._steamer_action_size = cell_config.steamer_action_size 113 | 114 | # num_units: list, [coaler_num_units, burner_num_units, steamer_num_units] 115 | _num_units = cell_config.num_units # TODO 116 | self._coaler_num_units = _num_units[0] 117 | self._burner_num_units = _num_units[1] 118 | self._steamer_num_units = _num_units[2] 119 | self._forget_bias = forget_bias 120 | self._activation = activation or math_ops.tanh 121 | self._input_keep_prob = self._output_keep_prob = keep_prob 122 | 123 | @property 124 | def state_size(self): 125 | c_tuple = tuple((self._coaler_num_units, self._burner_num_units, self._steamer_num_units)) 126 | h_tuple = tuple((self._coaler_num_units, self._burner_num_units, self._steamer_num_units)) 127 | return LSTMStateTuple(c_tuple, h_tuple) 128 | 129 | @property 130 | def output_size(self): 131 | return tuple((self._coaler_num_units, self._burner_num_units, self._steamer_num_units)) 132 | 133 | def get_coaler_inputs(self, inputs): 134 | # coaler inputs contains external_input, coaler_state and coaler_action 135 | # input: (batch_size, feature_nums) 136 | external_input = tf.slice(inputs, [0, self._external_state_pos], 137 | [-1, self._external_state_size]) 138 | 139 | coaler_state = tf.slice(inputs, [0, self._coaler_state_pos], 140 | [-1, self._coaler_state_size]) 141 | coaler_action = tf.slice(inputs, [0, self._coaler_action_pos], 142 | [-1, self._coaler_action_size]) 143 | return tf.concat([external_input, coaler_state, coaler_action], axis=1) 144 | 145 | def get_burner_inputs(self, inputs): 146 | # burner inputs contains burner_state and burner_action 147 | # input: 
(batch_size, feature_nums) 148 | burner_state = tf.slice(inputs, [0, self._burner_state_pos], 149 | [-1, self._burner_state_size]) 150 | burner_action = tf.slice(inputs, [0, self._burner_action_pos], 151 | [-1, self._burner_action_size]) 152 | return tf.concat([burner_state, burner_action], axis=1) 153 | 154 | def get_steamer_inputs(self, inputs): 155 | # steamer inputs contains steamer_state and steamer_action 156 | # input: (batch_size, feature_nums) 157 | steamer_state = tf.slice(inputs, [0, self._steamer_state_pos], 158 | [-1, self._steamer_state_size]) 159 | steamer_action = tf.slice(inputs, [0, self._steamer_action_pos], 160 | [-1, self._steamer_action_size]) 161 | return tf.concat([steamer_state, steamer_action], axis=1) 162 | 163 | def build(self, inputs_shape): 164 | # coaler 165 | external_input_depth = self._external_state_size 166 | coaler_input_depth = self._coaler_state_size + self._coaler_action_size 167 | self._coaler_kernel = self.add_variable( 168 | "coaler_kernel", 169 | shape=[external_input_depth + coaler_input_depth + self._coaler_num_units, 4 * self._coaler_num_units], 170 | initializer=orthogonal_lstm_initializer()) 171 | self._coaler_bias = self.add_variable( 172 | "coaler_bias", 173 | shape=[4 * self._coaler_num_units], 174 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 175 | # burner 176 | burner_input_depth = self._burner_state_size + self._burner_action_size 177 | self._burner_kernel = self.add_variable( 178 | "burner_kernel", 179 | shape=[burner_input_depth + self._burner_num_units + self._coaler_num_units, 4 * self._burner_num_units], 180 | initializer=orthogonal_lstm_initializer()) 181 | self._burner_bias = self.add_variable( 182 | "burner_bias", 183 | shape=[4 * self._burner_num_units], 184 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 185 | # steamer 186 | steamer_input_depth = self._steamer_state_size + self._steamer_action_size 187 | self._steamer_kernel = self.add_variable( 188 | "steamer_kernel", 189 | shape=[steamer_input_depth + self._steamer_num_units + self._burner_num_units, 4 * self._steamer_num_units], 190 | initializer=orthogonal_lstm_initializer()) 191 | self._steamer_bias = self.add_variable( 192 | "steamer_bias", 193 | shape=[4 * self._steamer_num_units], 194 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 195 | 196 | self.built = True 197 | 198 | def zero_state(self, batch_size, dtype): 199 | """Return zero-filled state tensor(s). 200 | 201 | Args: 202 | batch_size: int, float, or unit Tensor representing the batch size. 203 | dtype: the data type to use for the state. 204 | 205 | Returns: 206 | If `state_size` is an int or TensorShape, then the return value is a 207 | `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. 208 | 209 | If `state_size` is a nested list or tuple, then the return value is 210 | a nested list or tuple (of the same structure) of `2-D` tensors with 211 | the shapes `[batch_size, s]` for each s in `state_size`. 212 | """ 213 | # Try to use the last cached zero_state. This is done to avoid recreating 214 | # zeros, especially when eager execution is enabled. 
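        # In eager mode the cache is keyed on (state_size, batch_size, dtype); the stored
        # zero state is reused only when all three match the previous request, otherwise
        # fresh zero tensors are built below and the cached entry is replaced.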
215 | state_size = self.state_size 216 | is_eager = context.in_eager_mode() 217 | if is_eager and hasattr(self, "_last_zero_state"): 218 | (last_state_size, last_batch_size, last_dtype, 219 | last_output) = getattr(self, "_last_zero_state") 220 | if (last_batch_size == batch_size and 221 | last_dtype == dtype and 222 | last_state_size == state_size): 223 | return last_output 224 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 225 | output = _zero_state_tensors(state_size, batch_size, dtype) 226 | if is_eager: 227 | self._last_zero_state = (state_size, batch_size, dtype, output) 228 | return output 229 | 230 | def call(self, inputs, state): 231 | # inputs: (external_input, coaler_input, burner_input, steamer_input) 232 | # state: (c, h) is a 3-D tensor 233 | # c: (c_coaler, c_burner, c_steamer) 234 | # h: (h_coaler, h_burner, h_steamer) 235 | # self._state_is_tuple is True for simplicity 236 | def _should_dropout(p): 237 | return (not isinstance(p, float)) or p < 1 238 | 239 | # input dropout 240 | if _should_dropout(self._input_keep_prob): 241 | inputs = nn_ops.dropout(inputs, keep_prob=self._input_keep_prob) 242 | 243 | coaler_inputs = self.get_coaler_inputs(inputs) 244 | burner_inputs = self.get_burner_inputs(inputs) 245 | steamer_inputs = self.get_steamer_inputs(inputs) 246 | 247 | sigmoid = math_ops.sigmoid 248 | one = constant_op.constant(1, dtype=dtypes.int32) 249 | 250 | c, h = state 251 | coaler_h, burner_h, steamer_h = h 252 | coaler_c, burner_c, steamer_c = c 253 | 254 | # coal mill model 255 | with tf.variable_scope('coaler'): 256 | # inputs = self.batch_normalization(inputs, 'coal_mill_bn') 257 | coaler_gate_inputs = math_ops.matmul( 258 | array_ops.concat([coaler_inputs, coaler_h], 1), self._coaler_kernel) 259 | coaler_gate_inputs = nn_ops.bias_add(coaler_gate_inputs, self._coaler_bias) 260 | 261 | coaler_i, coaler_j, coaler_f, coaler_o = array_ops.split( 262 | value=coaler_gate_inputs, num_or_size_splits=4, axis=one) 263 | 264 | coaler_forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=coaler_f.dtype) 265 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 266 | # performance improvement. So using those at the cost of readability. 267 | add = math_ops.add 268 | multiply = math_ops.multiply 269 | coaler_new_c = add(multiply(coaler_c, sigmoid(add(coaler_f, coaler_forget_bias_tensor))), 270 | multiply(sigmoid(coaler_i), self._activation(coaler_j))) 271 | coaler_new_h = multiply(self._activation(coaler_new_c), sigmoid(coaler_o)) 272 | 273 | with tf.variable_scope('burner'): 274 | # inputs = self.batch_normalization(inputs, 'coal_mill_bn') 275 | # only dropout coaler output 276 | if _should_dropout(self._output_keep_prob): 277 | coaler_h = nn_ops.dropout(coaler_h, keep_prob=self._output_keep_prob) 278 | 279 | burner_gate_inputs = math_ops.matmul( 280 | array_ops.concat([burner_inputs, burner_h, coaler_h], 1), self._burner_kernel) 281 | burner_gate_inputs = nn_ops.bias_add(burner_gate_inputs, self._burner_bias) 282 | 283 | burner_i, burner_j, burner_f, burner_o = array_ops.split( 284 | value=burner_gate_inputs, num_or_size_splits=4, axis=one) 285 | 286 | burner_forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=burner_f.dtype) 287 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 288 | # performance improvement. So using those at the cost of readability. 
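            # The assignments below implement the standard LSTM state update for the burner:
            #   new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
            #   new_h = tanh(new_c) * sigmoid(o)
            # where the burner gates already mix burner_inputs, burner_h and the
            # (dropout-masked) coaler hidden state computed above.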
289 | add = math_ops.add 290 | multiply = math_ops.multiply 291 | burner_new_c = add(multiply(burner_c, sigmoid(add(burner_f, burner_forget_bias_tensor))), 292 | multiply(sigmoid(burner_i), self._activation(burner_j))) 293 | burner_new_h = multiply(self._activation(burner_new_c), sigmoid(burner_o)) 294 | 295 | with tf.variable_scope('steamer'): 296 | # inputs = self.batch_normalization(inputs, 'coal_mill_bn') 297 | # only dropout burner output 298 | if _should_dropout(self._output_keep_prob): 299 | burner_h = nn_ops.dropout(burner_h, keep_prob=self._output_keep_prob) 300 | 301 | steamer_gate_inputs = math_ops.matmul( 302 | array_ops.concat([steamer_inputs, steamer_h, burner_h], 1), self._steamer_kernel) 303 | steamer_gate_inputs = nn_ops.bias_add(steamer_gate_inputs, self._steamer_bias) 304 | 305 | steamer_i, steamer_j, steamer_f, steamer_o = array_ops.split( 306 | value=steamer_gate_inputs, num_or_size_splits=4, axis=one) 307 | 308 | steamer_forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=steamer_f.dtype) 309 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 310 | # performance improvement. So using those at the cost of readability. 311 | add = math_ops.add 312 | multiply = math_ops.multiply 313 | steamer_new_c = add(multiply(steamer_c, sigmoid(add(steamer_f, steamer_forget_bias_tensor))), 314 | multiply(sigmoid(steamer_i), self._activation(steamer_j))) 315 | steamer_new_h = multiply(self._activation(steamer_new_c), sigmoid(steamer_o)) 316 | 317 | new_c = tuple((coaler_new_c, burner_new_c, steamer_new_c)) 318 | new_h = tuple((coaler_new_h, burner_new_h, steamer_new_h)) 319 | # concat_h = array_ops.concat([coaler_new_h, burner_new_h, steamer_new_h], axis=1) 320 | new_state = LSTMStateTuple(new_c, new_h) 321 | return new_h, new_state 322 | 323 | 324 | def orthogonal_lstm_initializer(): 325 | def orthogonal(shape, dtype=tf.float32, partition_info=None): 326 | # taken from https://github.com/cooijmanstim/recurrent-batch-normalization 327 | # taken from https://gist.github.com/kastnerkyle/f7464d98fe8ca14f2a1a 328 | """ benanne lasagne ortho init (faster than qr approach)""" 329 | flat_shape = (shape[0], np.prod(shape[1:])) 330 | a = np.random.normal(0.0, 1.0, flat_shape) 331 | u, _, v = np.linalg.svd(a, full_matrices=False) 332 | q = u if u.shape == flat_shape else v # pick the one with the correct shape 333 | q = q.reshape(shape) 334 | return tf.constant(q[:shape[0], :shape[1]], dtype) 335 | return orthogonal 336 | 337 | 338 | --------------------------------------------------------------------------------
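The cell above is written against the TensorFlow 1.x RNN API, so it can be driven by `tf.nn.dynamic_rnn` once a configuration object carrying the slice positions/sizes and `num_units` is supplied. The sketch below is illustrative only: the `cell_config` values, the feature dimension, and the `from Simulator.simrnn_cell import SimulatorRNNCell` import path are assumptions made for the example (the simulator's real configuration is defined elsewhere in the repository and will differ).

```
# Hypothetical wiring of SimulatorRNNCell into tf.nn.dynamic_rnn (illustration only).
import tensorflow as tf
from types import SimpleNamespace
from Simulator.simrnn_cell import SimulatorRNNCell

# All positions/sizes below are made up for the example; they only need to be
# mutually consistent so the slices taken in get_*_inputs stay inside the input.
cell_config = SimpleNamespace(
    external_state_pos=0,  external_state_size=11,
    coaler_state_pos=11,   coaler_state_size=16,
    coaler_action_pos=27,  coaler_action_size=10,
    burner_state_pos=37,   burner_state_size=20,
    burner_action_pos=57,  burner_action_size=25,
    steamer_state_pos=82,  steamer_state_size=11,
    steamer_action_pos=93, steamer_action_size=16,
    num_units=[64, 64, 64],   # [coaler, burner, steamer] hidden sizes
)

num_steps, feature_dim = 10, 109          # feature_dim covers the last slice (93 + 16)
inputs = tf.placeholder(tf.float32, [None, num_steps, feature_dim])

cell = SimulatorRNNCell(cell_config, keep_prob=1.0)
# outputs is a (coaler_h, burner_h, steamer_h) tuple, each of shape [batch, time, units];
# final_state is the LSTMStateTuple of per-module (c, h) tuples defined by state_size.
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
```

Because the coaler, burner and steamer hidden states are returned separately, the training code can supervise each module against its own block of observed state variables, which is the three-part loss sketched in the SimulatorRNNCell docstring.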