├── .gitignore ├── README.md ├── Simulator ├── data_model.py ├── simrnn_model.py ├── baseline_model.py ├── baseline_main.py ├── simrnn_main.py └── simrnn_cell.py └── RL ├── pre_train.py ├── exploration.py ├── ou_noise.py ├── replay_buffer.py ├── env.py ├── train_primal_dual.py ├── reward_critic_network.py ├── cost_critic_network.py ├── primal_dual_ddpg.py ├── actor_network.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | Material/ 3 | BoilerData/ 4 | .DS_Store 5 | 6 | Simulator/__pycache__/ 7 | Simulator/.idea/ 8 | Simulator/data/ 9 | Simulator/logs/ 10 | Simulator/.DS_Store 11 | 12 | 13 | RL/__pycache__/ 14 | RL/.idea/ 15 | RL/model/ 16 | RL/saved_actor/ 17 | RL/result/ 18 | RL/logs/ 19 | RL/.DS_Store 20 | 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepThermal: Combustion Optimization for Thermal Power Generating Units Using Offline Reinforcement Learning 2 | 3 | This is the code of the paper DeepThermal: Combustion Optimization for Thermal Power Generating Units Using Offline Reinforcement Learning accepted at AAAI'2022. The paper can be found [here](https://arxiv.org/abs/2102.11492). 4 | 5 | ### Usage 6 | The code of combustion simulator is in `Simulator/simrnn_model.py`, the code of model-based offline RL framework, MORE, is in `RL/primal_dual_ddpg.py`. 7 | 8 | 9 | ### Bibtex 10 | ``` 11 | @inproceedings{zhan2022deepthermal, 12 | title={Deepthermal: Combustion optimization for thermal power generating units using offline reinforcement learning}, 13 | author={Zhan, Xianyuan and Xu, Haoran and Zhang, Yue and Zhu, Xiangyu and Yin, Honglei and Zheng, Yu}, 14 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, 15 | pages={4680--4688}, 16 | year={2022} 17 | } 18 | ``` 19 | 20 | -------------------------------------------------------------------------------- /Simulator/data_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pandas as pd 4 | import random 5 | 6 | 7 | class BoilerDataSet(object): 8 | """ 9 | first run data_preparation.py to generate data.csv 10 | prepare boiler training and validation dataset 11 | simple version(small action dimension) 12 | 13 | """ 14 | def __init__(self, num_steps, val_ratio=0.1): 15 | self.num_steps = num_steps 16 | self.val_ratio = val_ratio 17 | 18 | # Read csv file 19 | self.raw_seq = pd.read_csv(os.path.join("data", "sim_train.csv"), index_col='date') 20 | self.train_X, self.train_y, self.val_X, self.val_y = self._prepare_data(self.raw_seq) 21 | 22 | def _prepare_data(self, seq): 23 | # split into groups of num_steps 24 | X = np.array([seq.iloc[i: i + self.num_steps].values 25 | for i in range(len(seq) - self.num_steps)]) 26 | y = np.array([seq.ix[i + self.num_steps, 'A磨煤机料位':'1号机组下部水冷壁出口平均壁温'].values 27 | for i in range(len(seq) - self.num_steps)]) 28 | 29 | train_size = int(len(X) * (1.0 - self.val_ratio)) 30 | train_X, val_X = X[:train_size], X[train_size:] 31 | train_y, val_y = y[:train_size], y[train_size:] 32 | return train_X, train_y, val_X, val_y 33 | 34 | def generate_one_epoch(self, data_X, data_y, batch_size): 35 | num_batches = int(len(data_X)) // batch_size 36 | # if batch_size * num_batches < len(self.train_X): 37 | # num_batches += 1 38 | 39 | batch_indices = list(range(num_batches)) 40 | 
random.shuffle(batch_indices) 41 | for j in batch_indices: 42 | batch_X = data_X[j * batch_size: (j + 1) * batch_size] 43 | batch_y = data_y[j * batch_size: (j + 1) * batch_size] 44 | yield batch_X, batch_y 45 | 46 | -------------------------------------------------------------------------------- /RL/pre_train.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def pre_train_actor_network(agent, train_data, epochs=50, load_model=False): 4 | """ 5 | train critic network of agent 6 | data : from train_data_path (eg. origin data) 7 | """ 8 | input_config = InputConfig_RL() 9 | replay_buffer = agent.replay_buffer 10 | replay_buffer.read_from_csv(train_data) 11 | 12 | step = 0 13 | del_list = list(range(12, 24)) + list(range(36, 48)) 14 | for epoch in range(epochs): 15 | while 1: 16 | if replay_buffer.use_nums > replay_buffer.count(): 17 | replay_buffer.read_from_csv(train_data) 18 | step = 0 19 | break 20 | 21 | mini_batch = replay_buffer.get_batch(batch_size=input_config.batch_size) 22 | step += 1 23 | mini_batch = np.bmat(list(map(list, mini_batch))).A.flatten().reshape(-1, DONE_END) 24 | state_batch = mini_batch[:, :OUTER_END] 25 | action_batch = mini_batch[:, OUTER_END + 32:ACTION_END] 26 | for i in range(12): 27 | action_batch[:, i] = (action_batch[:, i] + action_batch[:, 23 - i]) / 2 28 | action_batch[:, 24 + i] = (action_batch[:, 24 + i] + action_batch[:, 47 - i]) / 2 29 | action_batch = np.delete(action_batch, del_list, axis=1) 30 | limit_batch = mini_batch[:, ACTION_END:LIMIT_LOAD_END] 31 | state_limit_batch = np.concatenate((state_batch, limit_batch), axis=1) 32 | 33 | mse, _ = agent.train_actor(state=state_limit_batch, action=action_batch) 34 | 35 | # display 36 | if step % 100 == 0: 37 | print(replay_buffer.use_nums) 38 | print('-----------------pretrain actor network-----------------') 39 | print('epoch = {} step = {} mse = {:.6f}'.format(epoch, step, mse)) -------------------------------------------------------------------------------- /RL/exploration.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class Exploration(object): 6 | 7 | def __init__(self, action_dim, kernel_num, sample_size): 8 | 9 | self.g = tf.Graph() 10 | with self.g.as_default(): 11 | # data format 12 | self.action_dim = action_dim 13 | self.mean = tf.placeholder(shape=[self.action_dim], dtype=tf.float32) 14 | self.stddev = tf.placeholder(shape=[self.action_dim], dtype=tf.float32) 15 | self.action = tf.placeholder(shape=[self.action_dim], dtype=tf.float32) 16 | self.weight = tf.placeholder(dtype=tf.float32) 17 | 18 | self.gaussian_exploration = None 19 | self.kernel_num = kernel_num 20 | self._sample_size = sample_size 21 | 22 | config = tf.ConfigProto(device_count={"CPU": self.kernel_num}, 23 | inter_op_parallelism_threads=0, 24 | intra_op_parallelism_threads=0, 25 | log_device_placement=True) 26 | self.sess = tf.Session(config=config, graph=self.g) 27 | 28 | # for sample_index in range(self._sample_size): 29 | # gaussian_noise = tf.random_normal(shape=[self.action_dim], mean=self.mean, stddev=self.stddev) 30 | # self.gaussian_exploration.append(self.action + self.weight * gaussian_noise) 31 | gaussian_noise = tf.random_normal(shape=[self.action_dim], mean=self.mean, stddev=self.stddev) 32 | self.gaussian_exploration = self.action + self.weight * gaussian_noise 33 | 34 | def get_gaussian_exploration(self, action, mean, stddev, weight=0.01): 35 | return 
self.sess.run(self.gaussian_exploration, feed_dict={self.action: action, 36 | self.mean: mean, 37 | self.stddev: stddev, 38 | self.weight: weight}) 39 | 40 | 41 | class Histogram(object): 42 | def __init__(self, csv_path): 43 | self.df = np.array(pd.read_csv(csv_path, header=None)).astype('float') 44 | self.threshold = self.df[:, -1] 45 | 46 | def get_probability(self, x): 47 | # print('value'+str(self.df[np.arange(len(x[:-1])).astype('int'), (x[:-1] * 20).astype('int')])) 48 | prob = np.array( 49 | self.df[np.arange(len(x[:-1])).astype('int'), (x[:-1] * 20).astype('int')[:]] > self.threshold).astype( 50 | 'int') 51 | return prob 52 | -------------------------------------------------------------------------------- /RL/ou_noise.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------- 2 | # Ornstein-Uhlenbeck Noise 3 | # Author: Flood Sung 4 | # Date: 2016.5.4 5 | # Reference: https://github.com/rllab/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py 6 | # -------------------------------------- 7 | 8 | import numpy as np 9 | import numpy.random as nr 10 | 11 | 12 | class OUNoise: 13 | """docstring for OUNoise""" 14 | def __init__(self, action_dimension, mu=0.5, theta=0.4, sigma=0.2, weight_decay=0.9999): 15 | self.action_dimension = action_dimension 16 | self.mu = mu 17 | self.theta = theta 18 | self.sigma = sigma 19 | self.state = np.ones(self.action_dimension) * self.mu 20 | self.weight = 1 21 | self.weight_decay = weight_decay 22 | self.reset() 23 | 24 | def reset(self): 25 | self.state = np.ones(self.action_dimension) * self.mu 26 | 27 | def update_weight(self): 28 | self.weight *= self.weight_decay 29 | 30 | def noise(self): 31 | x = self.state 32 | dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x)) 33 | self.state = x + dx * self.weight 34 | return self.state 35 | 36 | 37 | 38 | # # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 39 | # class OrnsteinUhlenbeckActionNoise(ActionNoise): 40 | # def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None, weight_decay_factor=0.999): 41 | # self.theta = theta 42 | # self.mu = mu 43 | # self.sigma = sigma 44 | # self.dt = dt 45 | # self.x0 = x0 46 | # self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 47 | # 48 | # self.weight_decay_factor = weight_decay_factor 49 | # self.weight_decay = 1 50 | # 51 | # self.reset() 52 | # 53 | # def get_noise(self): 54 | # x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt( 55 | # self.dt) * np.random.normal(size=self.mu.shape) 56 | # self.x_prev = x 57 | # return x 58 | # 59 | # @property 60 | # def shape(self): 61 | # return self.mu.shape 62 | # 63 | # def reset(self): 64 | # self.weight_decay = 1 65 | # 66 | # def noise_decay(self): 67 | # self.weight_decay *= self.weight_decay_factor 68 | # 69 | # def __call__(self, action): 70 | # r = action + self.get_noise() * self.weight_decay 71 | # return r 72 | # 73 | # def __repr__(self): 74 | # return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={}, weight_decay_factor=)'.format(self.mu, self.sigma, 75 | # self.weight_decay_factor) 76 | 77 | 78 | if __name__ == '__main__': 79 | ou = OUNoise(3) 80 | states = [] 81 | for i in range(10000): 82 | ou.update_weight() 83 | states.append(ou.noise()) 84 | import matplotlib.pyplot as plt 85 | 86 | plt.plot(states) 87 | plt.show() -------------------------------------------------------------------------------- 
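# Minimal usage sketch of the OU exploration above: this mirrors how
# PrimalDualDDPG.noise_action in RL/primal_dual_ddpg.py perturbs the actor's
# output (the noise weight is decayed every 10 episodes, then the noise is
# added to the action). The zero vector here is a hypothetical stand-in for
# actor_network.action(state); 51 is the action_dim used in RL/train_primal_dual.py.
import numpy as np
from RL.ou_noise import OUNoise

ou = OUNoise(action_dimension=51)
for episode in range(100):
    if episode % 10 == 0:
        ou.update_weight()              # gradually shrink exploration
    action = np.zeros(51)               # stand-in for actor_network.action(state)
    noisy_action = action + ou.noise()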
/RL/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | import numpy as np 4 | from RL.util import compute_reward, compute_cost, compute_done 5 | 6 | 7 | class ReplayBuffer(object): 8 | """Using explorated data based on simulator""" 9 | def __init__(self, buffer_size): 10 | self.buffer_size = buffer_size 11 | self.num_experiences = 0 12 | self.buffer = deque() 13 | self.real_data = np.load('/Users/xhr/PycharmProjects/Boiler/Simulator/data/replay_buffer.npy') 14 | nums = len(self.real_data) 15 | self.num_indices = list(range(nums)) 16 | random.shuffle(self.num_indices) 17 | self.real_start_indice = 0 18 | 19 | def get_batch(self, batch_size): 20 | # Randomly sample batch_size examples 21 | return random.sample(self.buffer, batch_size) 22 | 23 | def get_real_batch(self, batch_size): 24 | return self.real_data[np.random.choice(self.real_data.shape[0], batch_size, replace=False), :] 25 | 26 | def size(self): 27 | return self.buffer_size 28 | 29 | def add(self, state, action, reward, cost, new_state, done, mix_ratio): 30 | experience = (state, action, reward, cost, new_state, done) 31 | if self.num_experiences < self.buffer_size: 32 | self.buffer.append(experience) 33 | for _ in range(mix_ratio): 34 | s, a, s_, done = self.generate_real() 35 | r = compute_reward(s) 36 | c = compute_cost(s) 37 | d = compute_done(s) 38 | e = (s, a, r, c, s_, d) 39 | # print('s-{}-a{}-ns{}'.format(s.shape, a.shape, s_.shape)) 40 | 41 | self.buffer.append(e) 42 | self.num_experiences += 1 43 | else: 44 | for _ in range(mix_ratio+1): 45 | self.buffer.popleft() 46 | self.buffer.append(experience) 47 | for _ in range(mix_ratio): 48 | s, a, s_, done = self.generate_real() 49 | r = compute_reward(s) 50 | c = compute_cost(s) 51 | d = compute_done(s) 52 | e = (s, a, r, c, s_, d) 53 | self.buffer.append(e) 54 | 55 | def generate_real(self): 56 | s = self.real_data[self.real_start_indice, :58] 57 | a = self.real_data[self.real_start_indice, 58:109] 58 | s_ = self.real_data[self.real_start_indice, 109:156] 59 | s_ = np.concatenate([s[:11], s_]) 60 | done = self.real_data[self.real_start_indice, -1] 61 | self.real_start_indice += 1 62 | if self.real_start_indice == len(self.real_data): 63 | self.real_start_indice = 0 64 | return s, a, s_, done 65 | 66 | def count(self): 67 | # if buffer is full, return buffer size 68 | # otherwise, return experience counter 69 | return self.num_experiences 70 | 71 | def erase(self): 72 | self.buffer = deque() 73 | self.num_experiences = 0 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /RL/env.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from collections import deque 4 | import random 5 | 6 | from Simulator.simrnn_model import RNNSimulatorModel 7 | from Simulator.simrnn_main import cell_config, FLAGS 8 | from RL.util import * 9 | 10 | OUTER_START_POS = 0 11 | OUTER_SIZE = 11 12 | STATE_SIZE = 47 13 | ACTION_SIZE = 51 14 | STATE_START_POS = OUTER_START_POS + OUTER_SIZE 15 | ACTION_START_POS = STATE_START_POS + STATE_SIZE 16 | NEW_STATE_START_POS = ACTION_START_POS + ACTION_SIZE 17 | 18 | 19 | class SimulatorEnvironment(object): 20 | def __init__(self, sess): 21 | self.sess = sess 22 | self.replay_buffer = np.load('../Simulator/data/replay_buffer.npy') 23 | self.state_buffer = deque() 24 | 25 | # model construction 26 | self.rnn_model = 
RNNSimulatorModel(cell_config(), FLAGS) 27 | 28 | self.sess.run(tf.global_variables_initializer()) 29 | 30 | # path 31 | model_name = "sim_rnn" 32 | model_path = '../Simulator/logs/{}-{}-{}-{}-{}-{:.2f}-{:.4f}-{:.2f}-{:.5f}/'.format( 33 | model_name, cell_config.num_units[0], cell_config.num_units[1], cell_config.num_units[2], 34 | FLAGS.num_steps, FLAGS.keep_prob, FLAGS.learning_rate, FLAGS.learning_rate_decay, FLAGS.l2_weight) 35 | model_path += 'saved_models/final_model.ckpt' 36 | 37 | saver = tf.train.Saver() 38 | saver.restore(self.sess, model_path) 39 | print("Model successfully restored from file: %s" % model_path) 40 | 41 | def reset(self): 42 | """ Resets the state of the environment and returns an initial observation. """ 43 | self.state_buffer = deque() 44 | nums = len(self.replay_buffer) 45 | init_state_indice = random.randint(10, nums) 46 | for i in range(10): 47 | self.state_buffer.append(self.replay_buffer[init_state_indice-(9-i), :NEW_STATE_START_POS]) 48 | self.new_state = init_state = self.replay_buffer[init_state_indice, :ACTION_START_POS] 49 | # self.new_state = init_state.reshape(1, -1) 50 | self.outer_state = self.replay_buffer[init_state_indice, OUTER_START_POS:STATE_START_POS] 51 | 52 | return self.new_state 53 | 54 | def step(self, action): 55 | """Run one timestep of the environment's dynamics. When end of 56 | episode is reached, you are responsible for calling `reset()` 57 | to reset this environment's state. 58 | 59 | Accepts an action and returns a tuple (observation, reward, cost, done, info). 60 | """ 61 | self.state_buffer.append(np.concatenate([self.new_state, action])) 62 | self.state_buffer.popleft() 63 | 64 | # transpose from 2D to 3D 65 | model_inputs_2D = np.array(self.state_buffer) 66 | num_step, dim = model_inputs_2D.shape 67 | model_inputs_3D = model_inputs_2D.reshape(1, num_step, dim) 68 | 69 | test_data_feed = { 70 | self.rnn_model.keep_prob: 1.0, 71 | self.rnn_model.inputs: model_inputs_3D, 72 | } 73 | new_state = self.sess.run(self.rnn_model.pred, test_data_feed) 74 | self.new_state = np.concatenate([self.outer_state, new_state[0]]) # (1, 47) -> (47, ) 75 | 76 | reward = compute_reward(self.new_state) 77 | cost = compute_cost(self.new_state) 78 | done = compute_done(self.new_state) 79 | 80 | return self.new_state, reward, cost, done 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /Simulator/simrnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import random 5 | import time 6 | from Simulator.simrnn_cell import SimulatorRNNCell 7 | 8 | 9 | class RNNSimulatorModel(object): 10 | def __init__(self, 11 | cell_config, 12 | FLAGS): 13 | """ Construct simulator model using self_designed cell """ 14 | self.coaler_cell_size, self.burner_cell_size, self.steamer_cell_size = cell_config.num_units 15 | self.input_size = FLAGS.input_size 16 | self.output_size = FLAGS.output_size 17 | self.coaler_output_size = cell_config.coaler_state_size 18 | self.burner_output_size = cell_config.burner_state_size 19 | self.steamer_output_size = cell_config.steamer_state_size 20 | 21 | self.batch_size = FLAGS.batch_size 22 | self.n_steps = FLAGS.num_steps 23 | self.l2_weight = FLAGS.l2_weight 24 | self.grad_clip = FLAGS.grad_clip 25 | 26 | # inputs.shape = (number of examples, number of input, dimension of each input). 
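        # Per cell_config in Simulator/simrnn_main.py, each 109-dim input step is
        # [external_input(11) | coaler_state(25) | burner_state(7) | steamer_state(15) |
        #  coaler_action(31) | burner_action(15) | steamer_action(5)],
        # and the 47-dim target is the next-step [coaler | burner | steamer] state.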
27 | self.inputs = tf.placeholder(tf.float32, [None, self.n_steps, self.input_size], name="inputs") 28 | self.targets = tf.placeholder(tf.float32, [None, self.output_size], name="targets") 29 | self.learning_rate = tf.placeholder(tf.float32, None, name="learning_rate") 30 | self.keep_prob = tf.placeholder(tf.float32, None, name="keep_prob") 31 | 32 | self.cell = SimulatorRNNCell(cell_config, self.keep_prob) 33 | # Run dynamic RNN 34 | self.cell_init_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) 35 | cell_outputs, cell_final_state = tf.nn.dynamic_rnn( 36 | self.cell, self.inputs, initial_state=self.cell_init_state, time_major=False, scope="dynamic_rnn") 37 | 38 | # outputs.get_shape() = (batch_size, num_steps, cell_size) 39 | coaler_output, burner_output, steamer_output = cell_outputs 40 | self.coaler_output = coaler_output[:, -1, :] 41 | self.burner_output = burner_output[:, -1, :] 42 | self.steamer_output = steamer_output[:, -1, :] 43 | 44 | # pred = W * out + b 45 | ws_out_coaler = tf.Variable( 46 | tf.truncated_normal([self.coaler_cell_size, self.coaler_output_size]), name="W_coaler") 47 | bs_out_coaler = tf.Variable( 48 | tf.constant(0.1, shape=[self.coaler_output_size]), name="bias_coaler") 49 | ws_out_burner = tf.Variable( 50 | tf.truncated_normal([self.burner_cell_size, self.burner_output_size]), name="W_burner") 51 | bs_out_burner = tf.Variable( 52 | tf.constant(0.1, shape=[self.burner_output_size]), name="bias_burner") 53 | ws_out_steamer = tf.Variable( 54 | tf.truncated_normal([self.steamer_cell_size, self.steamer_output_size]), name="W_steamer") 55 | bs_out_steamer = tf.Variable( 56 | tf.constant(0.1, shape=[self.steamer_output_size]), name="bias_steamer") 57 | 58 | self.coaler_pred = tf.matmul(self.coaler_output, ws_out_coaler) + bs_out_coaler 59 | self.burner_pred = tf.matmul(self.burner_output, ws_out_burner) + bs_out_burner 60 | self.steamer_pred = tf.matmul(self.steamer_output, ws_out_steamer) + bs_out_steamer 61 | self.pred = tf.concat([self.coaler_pred, self.burner_pred, self.steamer_pred], axis=1) 62 | self.pred = tf.sigmoid(self.pred) 63 | # self.pred_summ = tf.summary.histogram("pred", self.pred) 64 | 65 | 66 | # train loss 67 | self.tv = tf.trainable_variables() 68 | self.l2_loss = self.l2_weight * tf.reduce_sum( 69 | [tf.nn.l2_loss(v) for v in self.tv if not ("noreg" in v.name or "bias" in v.name)], name="l2_loss") 70 | self.mse = tf.reduce_mean(tf.square(self.pred - self.targets), name="loss_mse_train") 71 | self.loss = self.mse + self.l2_loss 72 | 73 | # gradients clip 74 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.tv), self.grad_clip) 75 | # optimizer = tf.train.MomentumOptimizer(self.learning_rate, 0.9) 76 | # optimizer = tf.train.RMSPropOptimizer(self.learning_rate) 77 | optimizer = tf.train.AdamOptimizer(self.learning_rate) 78 | self.train_opt = optimizer.apply_gradients(zip(grads, self.tv)) 79 | 80 | # summary 81 | self.loss_summ = tf.summary.scalar("loss_mse_train", self.loss) 82 | self.learning_rate_summ = tf.summary.scalar("learning_rate", self.learning_rate) 83 | # for var in tf.trainable_variables(): 84 | # tf.summary.histogram(var.name, var) 85 | self.merged_summ = tf.summary.merge_all() 86 | 87 | -------------------------------------------------------------------------------- /Simulator/baseline_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | from tensorflow.contrib.layers import fully_connected 4 | 5 | 
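# NOTE: BaseLineModel uses the same interface as Simulator/simrnn_model.py --
# a 109-dim input (external input + state + action) and a 47-dim next-state
# output -- so baseline_main.py can train it as a drop-in comparison. The
# architecture is chosen via FLAGS.model: 'lstm', 'gru', 'rnn' and 'nas' build
# a stacked recurrent network over num_steps time steps, while 'dnn' stacks
# fully connected layers on a single time step.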
6 | class BaseLineModel(object): 7 | def __init__(self, 8 | FLAGS, 9 | training=True): 10 | """ Construct baseline model, including stacked LSTM, GRU and DNN """ 11 | self.num_units = FLAGS.num_units 12 | self.num_layers = FLAGS.num_layers 13 | self.input_size = FLAGS.input_size 14 | self.output_size = FLAGS.output_size 15 | 16 | self.batch_size = FLAGS.batch_size 17 | self.n_steps = FLAGS.num_steps 18 | self.l2_weight = FLAGS.l2_weight 19 | self.grad_clip = FLAGS.grad_clip 20 | 21 | # inputs.shape = (number of examples, number of input, dimension of each input). 22 | if FLAGS.model == 'dnn': 23 | self.inputs = tf.placeholder(tf.float32, [None, self.input_size], name="inputs") 24 | else: 25 | self.inputs = tf.placeholder(tf.float32, [None, self.n_steps, self.input_size], name="inputs") 26 | self.targets = tf.placeholder(tf.float32, [None, self.output_size], name="targets") 27 | self.learning_rate = tf.placeholder(tf.float32, None, name="learning_rate") 28 | self.keep_prob = tf.placeholder(tf.float32, None, name="keep_prob") 29 | 30 | if training and FLAGS.keep_prob: 31 | self.inputs = tf.nn.dropout(self.inputs, FLAGS.keep_prob) 32 | 33 | if FLAGS.model == 'dnn': 34 | hidden = fully_connected(self.inputs, self.num_units) 35 | for _ in range(self.num_layers - 1): 36 | hidden = fully_connected(hidden, self.num_units) 37 | if training and FLAGS.keep_prob < 1.0: 38 | hidden = rnn.DropoutWrapper(hidden, 39 | input_keep_prob=FLAGS.keep_prob, 40 | output_keep_prob=FLAGS.keep_prob) 41 | self.cell_outputs = hidden 42 | else: # choose different rnn cell 43 | if FLAGS.model == 'rnn': 44 | cell_fn = rnn.RNNCell 45 | elif FLAGS.model == 'gru': 46 | cell_fn = rnn.GRUCell 47 | elif FLAGS.model == 'lstm': 48 | cell_fn = rnn.LSTMCell 49 | elif FLAGS.model == 'nas': 50 | cell_fn = rnn.NASCell 51 | else: 52 | raise Exception("model type not supported: {}".format(FLAGS.model)) 53 | 54 | # warp multi layered rnn cell into one cell with dropout 55 | cells = [] 56 | for _ in range(self.num_layers): 57 | cell = cell_fn(self.num_units) 58 | if training and FLAGS.keep_prob < 1.0: 59 | cell = rnn.DropoutWrapper(cell, 60 | input_keep_prob=FLAGS.keep_prob, 61 | output_keep_prob=FLAGS.keep_prob) 62 | cells.append(cell) 63 | self.cell = rnn.MultiRNNCell(cells, state_is_tuple=True) 64 | 65 | self.cell_init_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) 66 | cell_outputs, cell_final_state = tf.nn.dynamic_rnn( 67 | self.cell, self.inputs, initial_state=self.cell_init_state, time_major=False, scope="dynamic_rnn") 68 | 69 | # outputs.get_shape() = (batch_size, num_steps, cell_size) 70 | self.cell_outputs = cell_outputs[:, -1, :] 71 | 72 | # pred = W * out + b 73 | ws_out = tf.Variable( 74 | tf.truncated_normal([self.num_units, self.output_size]), name="W_out") 75 | bs_out = tf.Variable( 76 | tf.constant(0.1, shape=[self.output_size]), name="bias_out") 77 | self.pred = tf.matmul(self.cell_outputs, ws_out) + bs_out 78 | 79 | 80 | # train loss 81 | self.tv = tf.trainable_variables() 82 | self.l2_loss = self.l2_weight * tf.reduce_sum( 83 | [tf.nn.l2_loss(v) for v in self.tv if not ("noreg" in v.name or "bias" in v.name)], name="l2_loss") 84 | self.mse = tf.reduce_mean(tf.square(self.pred - self.targets), name="loss_mse_train") 85 | self.loss = self.mse + self.l2_loss 86 | 87 | # gradients clip 88 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.tv), self.grad_clip) 89 | optimizer = tf.train.AdamOptimizer(self.learning_rate) 90 | self.train_opt = optimizer.apply_gradients(zip(grads, self.tv)) 91 | 
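        # The loss is MSE(pred, targets) plus FLAGS.l2_weight * L2 over trainable
        # variables (biases and names containing "noreg" excluded); gradients are
        # clipped by global norm (FLAGS.grad_clip, 5.0 by default) before the Adam
        # update, matching the training setup in Simulator/simrnn_model.py.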
92 | # summary 93 | self.loss_summ = tf.summary.scalar("loss_mse_train", self.loss) 94 | self.learning_rate_summ = tf.summary.scalar("learning_rate", self.learning_rate) 95 | # for var in tf.trainable_variables(): 96 | # tf.summary.histogram(var.name, var) 97 | self.merged_summ = tf.summary.merge_all() 98 | 99 | -------------------------------------------------------------------------------- /RL/train_primal_dual.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('../') 4 | from RL.primal_dual_ddpg import * 5 | from RL.env import * 6 | 7 | 8 | 9 | MAX_EPISODES = 30000 10 | MAX_EP_STEPS = 10 11 | # TEST = 10 12 | SIM_REAL_RATIO = 1 13 | 14 | 15 | 16 | class input_config(): 17 | batch_size = 32 18 | init_dual_lambda = 1 19 | state_dim = 58 20 | action_dim = 51 21 | clip_norm = 5. 22 | train_display_iter = 200 23 | model_save_path = './models/' 24 | # model_name = "sim_ddpg" 25 | # logdir = './logs/{}-{}-{}-{:.2f}/'.format( 26 | # model_name, MAX_EP_STEPS, SIM_REAL_RATIO, init_dual_lambda) 27 | # log_path = logdir + 'saved_models/' 28 | log_path = "logs/nonpre_nonexp_" + str(SIM_REAL_RATIO) + "_pdddpg_summary" 29 | save_iter = 500 30 | log_iter = 100 31 | 32 | 33 | def pre_train_actor_network(agent, epochs=3): 34 | replay_buffer = agent.replay_buffer 35 | 36 | for epoch in range(epochs): 37 | step = 0 38 | while step < 1000: 39 | minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size) 40 | step += 1 41 | state_batch, action_batch, _, _ = convert_to_tuple(minibatch) 42 | 43 | _, mse = agent.actor_network.pretrain(state=state_batch, label=action_batch) 44 | 45 | # display 46 | if epoch % 1 == 0: 47 | print('-----------------pre-train actor network-----------------') 48 | print('epoch = {} mse = {:.4f}'.format(epoch, mse)) 49 | 50 | 51 | def pre_train_reward_critic_network(agent, epochs=3): 52 | replay_buffer = agent.replay_buffer 53 | for train_times in range(epochs): 54 | step = 0 55 | while step < 1000: 56 | minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size) 57 | step += 1 58 | state_batch, action_batch, next_state_batch, _ = convert_to_tuple(minibatch) 59 | reward_batch = compute_reward(state_batch) 60 | 61 | y_batch = [] 62 | target_action = agent.actor_network.target_actions(next_state_batch) 63 | target_value = agent.reward_critic_network.target_reward(next_state_batch, target_action) 64 | 65 | for i in range(len(minibatch)): 66 | y_batch.append(reward_batch[i] + agent.gamma * target_value[i]) 67 | 68 | # update critic network 69 | reward_critic_loss = agent.reward_critic_network.pretrain(y_batch, state_batch, action_batch) 70 | 71 | # display 72 | if train_times % 1 == 0: 73 | print('-----------------pre-train reward critic network-----------------') 74 | print("reward_critic: loss:{:.3f}".format(reward_critic_loss)) 75 | 76 | 77 | def pre_train_cost_critic_network(agent, epochs=3): 78 | replay_buffer = agent.replay_buffer 79 | step = 0 80 | for train_times in range(epochs): 81 | step = 0 82 | while step < 1000: 83 | minibatch = replay_buffer.get_real_batch(batch_size=input_config.batch_size) 84 | step += 1 85 | state_batch, action_batch, next_state_batch, _ = convert_to_tuple(minibatch) 86 | cost_batch = compute_cost(state_batch) 87 | 88 | z_batch = [] 89 | target_action = agent.actor_network.target_actions(next_state_batch) 90 | target_value = agent.cost_critic_network.target_cost(next_state_batch, target_action) 91 | 92 | for i in range(len(minibatch)): 93 
| z_batch.append(cost_batch[i] + agent.gamma * target_value[i]) 94 | 95 | # update critic network 96 | cost_critic_loss = agent.cost_critic_network.pretrain(z_batch, state_batch, action_batch) 97 | 98 | # display 99 | if train_times % 1 == 0: 100 | print('-----------------pre-train cost critic network-----------------') 101 | print("reward_critic: loss:{:.3f}".format(cost_critic_loss)) 102 | 103 | 104 | def main(): 105 | # Set up summary writer 106 | summary_writer = tf.summary.FileWriter(input_config.log_path) 107 | 108 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 109 | config.gpu_options.allow_growth = True 110 | 111 | # build agent graph 112 | tf.reset_default_graph() 113 | agent_graph = tf.Graph() 114 | agent_sess = tf.Session(config=config, graph=agent_graph) 115 | with agent_graph.as_default(): 116 | agent = PrimalDualDDPG(sess=agent_sess, input_config=input_config, is_batch_norm=False, summ_writer=summary_writer) 117 | total_parameters = 0 118 | for variable in tf.trainable_variables(): 119 | # shape is an array of tf.Dimension 120 | shape = variable.get_shape() 121 | # print(shape) 122 | # print(len(shape)) 123 | variable_parameters = 1 124 | for dim in shape: 125 | # print(dim) 126 | variable_parameters *= dim.value 127 | # print(variable_parameters) 128 | total_parameters += variable_parameters 129 | print('total parameters: {}'.format(total_parameters)) 130 | 131 | # build environment graph 132 | env_graph = tf.Graph() 133 | env_sess = tf.Session(config=config, graph=env_graph) 134 | with env_graph.as_default(): 135 | env = SimulatorEnvironment(sess=env_sess) 136 | 137 | # pre_train 138 | # pre_train_actor_network(agent=agent, epochs=1) 139 | # pre_train_reward_critic_network(agent=agent, epochs=1) 140 | # pre_train_cost_critic_network(agent=agent, epochs=1) 141 | # agent.actor_network.update_target() 142 | # agent.reward_critic_network.update_target() 143 | # agent.cost_critic_network.update_target() 144 | 145 | for episode in range(MAX_EPISODES): 146 | dual_variable = input_config.init_dual_lambda 147 | ep_reward = 0 148 | ep_cost = 0 149 | state = env.reset() 150 | 151 | for step in range(MAX_EP_STEPS): 152 | # action = restrictive_action(agent.action(state), episode) 153 | action = agent.noise_action(state, episode) 154 | next_state, reward, cost, done = env.step(action) 155 | ep_reward += reward 156 | ep_cost += cost 157 | agent.perceive(state, action, reward, cost, next_state, done, mix_ratio=SIM_REAL_RATIO) 158 | dual_variable = agent.get_dual_lambda() 159 | state = next_state 160 | summary = tf.Summary() 161 | summary.value.add(tag='Steps_sum_Reward', simple_value=float(ep_reward/MAX_EP_STEPS)) 162 | summary.value.add(tag='Steps_sum_Cost', simple_value=float(ep_cost/MAX_EP_STEPS)) 163 | summary.value.add(tag='Dual_variable', simple_value=float(dual_variable)) 164 | summary_writer.add_summary(summary, episode) 165 | 166 | summary_writer.flush() 167 | 168 | print('Episode:{} | Reward: {:.2f} | Cost: {:.2f}'.format(episode, ep_reward/MAX_EP_STEPS, ep_cost/MAX_EP_STEPS)) 169 | 170 | if episode % 100 == 0 and episode >= 100: 171 | agent.save_model() 172 | 173 | print("-------------save model--------------------") 174 | agent.save_model() 175 | 176 | 177 | if __name__ == '__main__': 178 | main() -------------------------------------------------------------------------------- /Simulator/baseline_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pprint 4 | 
import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | from Simulator.baseline_model import BaseLineModel 7 | from Simulator.data_model import BoilerDataSet 8 | 9 | flags = tf.app.flags 10 | # Data and model checkpcheckpointsoints directories 11 | flags.DEFINE_integer("display_iter", 200, "display_iter") 12 | flags.DEFINE_integer("save_log_iter", 100, "save_log_iter") 13 | # Model params 14 | flags.DEFINE_string('model', 'lstm', 'Choose from lstm, gru, rnn, or dnn') 15 | flags.DEFINE_integer("input_size", 109, "Input size") # external_input + state + action 16 | flags.DEFINE_integer("output_size", 47, "Output size") # state size 17 | flags.DEFINE_integer("num_units", 128, "Num of hidden units") 18 | flags.DEFINE_integer("num_layers", 2, "Num of stacked layers") 19 | # Optimization 20 | flags.DEFINE_integer("num_steps", 5, "Num of steps") 21 | flags.DEFINE_integer("batch_size", 256, "The size of batch") 22 | flags.DEFINE_integer("max_epoch", 50, "Total training epoches") 23 | flags.DEFINE_float("grad_clip", 5., "Clip gradients at this value") 24 | flags.DEFINE_float("learning_rate", 0.001, "Initial learning rate at early stage. [0.001]") 25 | flags.DEFINE_float("learning_rate_decay", 0.95, "Decay rate of learning rate. [0.99]") 26 | flags.DEFINE_float("keep_prob", 1, "Keep probability of input data and dropout layer. [0.8]") 27 | flags.DEFINE_float("l2_weight", 0.0, "weight of l2 loss") 28 | 29 | 30 | FLAGS = flags.FLAGS 31 | 32 | 33 | pp = pprint.PrettyPrinter() 34 | 35 | if not os.path.exists("logs"): 36 | os.mkdir("logs") 37 | 38 | 39 | def show_all_variables(): 40 | model_vars = tf.trainable_variables() 41 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 42 | 43 | 44 | def main(_): 45 | np.random.seed(2019) 46 | 47 | pp.pprint(flags.FLAGS.__flags) 48 | 49 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 50 | run_config = tf.ConfigProto() 51 | run_config.gpu_options.allow_growth = True 52 | 53 | # read data 54 | if FLAGS.model == 'dnn': 55 | boiler_dataset = BoilerDataSet(num_steps=1) 56 | train_X = boiler_dataset.train_X.reshape([-1, FLAGS.input_size]) 57 | train_y = boiler_dataset.train_y.reshape([-1, FLAGS.output_size]) 58 | val_X = boiler_dataset.val_X.reshape([-1, FLAGS.input_size]) 59 | val_y = boiler_dataset.val_y.reshape([-1, FLAGS.output_size]) 60 | 61 | else: 62 | boiler_dataset = BoilerDataSet(num_steps=FLAGS.num_steps) 63 | train_X = boiler_dataset.train_X 64 | train_y = boiler_dataset.train_y 65 | val_X = boiler_dataset.val_X 66 | val_y = boiler_dataset.val_y 67 | # print dataset info 68 | num_train = len(train_X) 69 | num_valid = len(val_X) 70 | print('train samples: {0}'.format(num_train)) 71 | print('eval samples: {0}'.format(num_valid)) 72 | 73 | # model construction 74 | tf.reset_default_graph() 75 | baseline_model = BaseLineModel(FLAGS) 76 | 77 | # print trainable params 78 | for i in tf.trainable_variables(): 79 | print(i) 80 | # count the parameters in our model 81 | total_parameters = 0 82 | for variable in tf.trainable_variables(): 83 | # shape is an array of tf.Dimension 84 | shape = variable.get_shape() 85 | # print(shape) 86 | # print(len(shape)) 87 | variable_parameters = 1 88 | for dim in shape: 89 | # print(dim) 90 | variable_parameters *= dim.value 91 | # print(variable_parameters) 92 | total_parameters += variable_parameters 93 | print('total parameters: {}'.format(total_parameters)) 94 | 95 | # path for log saving 96 | model_name = "baseline_" + FLAGS.model 97 | logdir = './logs/{}-{}-{}-{}-{:.2f}-{:.4f}-{:.2f}-{:.5f}/'.format( 
98 | model_name, FLAGS.num_layers, FLAGS.num_units, FLAGS.num_steps, 99 | FLAGS.keep_prob, FLAGS.learning_rate, FLAGS.learning_rate_decay, FLAGS.l2_weight) 100 | model_dir = logdir + 'saved_models/' 101 | 102 | if not os.path.exists(logdir): 103 | os.mkdir(logdir) 104 | if not os.path.exists(model_dir): 105 | os.mkdir(model_dir) 106 | results_dir = logdir + 'results/' 107 | 108 | with tf.Session(config=run_config) as sess: 109 | summary_writer = tf.summary.FileWriter(logdir) 110 | 111 | sess.run(tf.global_variables_initializer()) 112 | saver = tf.train.Saver() 113 | 114 | iter = 0 115 | valid_losses = [np.inf] 116 | 117 | for i in range(FLAGS.max_epoch): 118 | print('----------epoch {}-----------'.format(i)) 119 | # learning_rate = FLAGS.learning_rate 120 | learning_rate = FLAGS.learning_rate * ( 121 | FLAGS.learning_rate_decay ** i 122 | ) 123 | 124 | for batch_X, batch_y in boiler_dataset.generate_one_epoch(train_X, train_y, FLAGS.batch_size): 125 | iter += 1 126 | train_data_feed = { 127 | baseline_model.learning_rate: learning_rate, 128 | baseline_model.keep_prob: FLAGS.keep_prob, 129 | baseline_model.inputs: batch_X, 130 | baseline_model.targets: batch_y, 131 | } 132 | train_loss, _, merged_summ = sess.run( 133 | [baseline_model.loss, baseline_model.train_opt, baseline_model.merged_summ], train_data_feed) 134 | if iter % FLAGS.save_log_iter == 0: 135 | summary_writer.add_summary(merged_summ, iter) 136 | if iter % FLAGS.display_iter == 0: 137 | valid_loss = 0 138 | for val_batch_X, val_batch_y in boiler_dataset.generate_one_epoch(val_X, val_y, FLAGS.batch_size): 139 | val_data_feed = { 140 | baseline_model.keep_prob: 1.0, 141 | baseline_model.inputs: val_batch_X, 142 | baseline_model.targets: val_batch_y, 143 | } 144 | batch_loss = sess.run(baseline_model.loss, val_data_feed) 145 | valid_loss += batch_loss 146 | num_batches = int(len(val_X)) // FLAGS.batch_size 147 | valid_loss /= num_batches 148 | valid_losses.append(valid_loss) 149 | valid_loss_sum = tf.Summary( 150 | value=[tf.Summary.Value(tag="valid_loss", simple_value=valid_loss)]) 151 | summary_writer.add_summary(valid_loss_sum, iter) 152 | 153 | if valid_loss < min(valid_losses[:-1]): 154 | print('iter {}\tvalid_loss = {:.6f}\tmodel saved!!'.format( 155 | iter, valid_loss)) 156 | saver.save(sess, model_dir + 157 | 'model_{}.ckpt'.format(iter)) 158 | saver.save(sess, model_dir + 'final_model.ckpt') 159 | else: 160 | print('iter {}\tvalid_loss = {:.6f}\t'.format( 161 | iter, valid_loss)) 162 | 163 | print('stop training !!!') 164 | 165 | 166 | if __name__ == '__main__': 167 | tf.app.run() -------------------------------------------------------------------------------- /Simulator/simrnn_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pprint 4 | import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | import sys 7 | sys.path.append('../') 8 | 9 | from Simulator.simrnn_model import RNNSimulatorModel 10 | from Simulator.data_model import BoilerDataSet 11 | 12 | flags = tf.app.flags 13 | # Data and model checkpcheckpointsoints directories 14 | flags.DEFINE_integer("display_iter", 200, "display_iter") 15 | flags.DEFINE_integer("save_log_iter", 100, "save_log_iter") 16 | # Model params 17 | flags.DEFINE_integer("input_size", 109, "Input size") # external_input + state + action 18 | flags.DEFINE_integer("output_size", 47, "Output size") # state size 19 | # Optimization 20 | flags.DEFINE_integer("num_steps", 10, "Num of steps") 21 | 
flags.DEFINE_integer("batch_size", 1, "The size of batch") 22 | flags.DEFINE_integer("max_epoch", 50, "Total training epoches") 23 | flags.DEFINE_float("grad_clip", 5., "Clip gradients at this value") 24 | flags.DEFINE_float("learning_rate", 0.001, "Initial learning rate at early stage. [0.001]") 25 | flags.DEFINE_float("learning_rate_decay", 0.95, "Decay rate of learning rate. [0.99]") 26 | flags.DEFINE_float("keep_prob", 1, "Keep probability of input data and dropout layer. [0.8]") 27 | flags.DEFINE_float("l2_weight", 0.0, "weight of l2 loss") 28 | 29 | FLAGS = flags.FLAGS 30 | 31 | 32 | class cell_config(object): 33 | """ Simulator Cell config """ 34 | # list, [coaler_num_units, burner_num_units, steamer_num_units] 35 | num_units = [128, 64, 64] 36 | 37 | # data is [external_input, state(coaler, burner, steamer), action(coaler, burner, steamer)] 38 | external_state_pos = 0 39 | external_state_size = 11 40 | coaler_state_pos = external_state_pos + external_state_size 41 | coaler_state_size = 25 42 | burner_state_pos = coaler_state_pos + coaler_state_size 43 | burner_state_size = 7 44 | steamer_state_pos = burner_state_pos + burner_state_size 45 | steamer_state_size = 15 46 | coaler_action_pos = steamer_state_pos + steamer_state_size 47 | coaler_action_size = 31 48 | burner_action_pos = coaler_action_pos + coaler_action_size 49 | burner_action_size = 15 50 | steamer_action_pos = burner_action_pos + burner_action_size 51 | steamer_action_size = 5 52 | 53 | 54 | pp = pprint.PrettyPrinter() 55 | 56 | if not os.path.exists("logs"): 57 | os.mkdir("logs") 58 | 59 | 60 | def show_all_variables(): 61 | model_vars = tf.trainable_variables() 62 | slim.model_analyzer.analyze_vars(model_vars, print_info=True) 63 | 64 | 65 | def main(_): 66 | np.random.seed(2019) 67 | 68 | pp.pprint(flags.FLAGS.__flags) 69 | 70 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 71 | run_config = tf.ConfigProto() 72 | run_config.gpu_options.allow_growth = True 73 | 74 | # read data 75 | boiler_dataset = BoilerDataSet(num_steps=FLAGS.num_steps) 76 | train_X = boiler_dataset.train_X 77 | train_y = boiler_dataset.train_y 78 | val_X = boiler_dataset.val_X 79 | val_y = boiler_dataset.val_y 80 | # print dataset info 81 | num_train = len(train_X) 82 | num_valid = len(val_X) 83 | print('train samples: {0}'.format(num_train)) 84 | print('eval samples: {0}'.format(num_valid)) 85 | 86 | # model construction 87 | tf.reset_default_graph() 88 | rnn_model = RNNSimulatorModel(cell_config(), FLAGS) 89 | 90 | # print trainable params 91 | for i in tf.trainable_variables(): 92 | print(i) 93 | # count the parameters in our model 94 | total_parameters = 0 95 | for variable in tf.trainable_variables(): 96 | # shape is an array of tf.Dimension 97 | shape = variable.get_shape() 98 | # print(shape) 99 | # print(len(shape)) 100 | variable_parameters = 1 101 | for dim in shape: 102 | # print(dim) 103 | variable_parameters *= dim.value 104 | # print(variable_parameters) 105 | total_parameters += variable_parameters 106 | print('total parameters: {}'.format(total_parameters)) 107 | 108 | # path for log saving 109 | model_name = "sim_rnn" 110 | logdir = './logs/{}-{}-{}-{}-{}-{:.2f}-{:.4f}-{:.2f}-{:.5f}/'.format( 111 | model_name, cell_config.num_units[0], cell_config.num_units[1], cell_config.num_units[2], 112 | FLAGS.num_steps, FLAGS.keep_prob, FLAGS.learning_rate, FLAGS.learning_rate_decay, FLAGS.l2_weight) 113 | model_dir = logdir + 'saved_models/' 114 | 115 | if not os.path.exists(logdir): 116 | os.mkdir(logdir) 117 | if not 
os.path.exists(model_dir): 118 | os.mkdir(model_dir) 119 | results_dir = logdir + 'results/' 120 | 121 | with tf.Session(config=run_config) as sess: 122 | summary_writer = tf.summary.FileWriter(logdir) 123 | 124 | sess.run(tf.global_variables_initializer()) 125 | saver = tf.train.Saver() 126 | 127 | iter = 0 128 | valid_losses = [np.inf] 129 | 130 | for i in range(FLAGS.max_epoch): 131 | print('----------epoch {}-----------'.format(i)) 132 | # learning_rate = FLAGS.learning_rate 133 | learning_rate = FLAGS.learning_rate * ( 134 | FLAGS.learning_rate_decay ** i 135 | ) 136 | 137 | for batch_X, batch_y in boiler_dataset.generate_one_epoch(train_X, train_y, FLAGS.batch_size): 138 | iter += 1 139 | train_data_feed = { 140 | rnn_model.learning_rate: learning_rate, 141 | rnn_model.keep_prob: FLAGS.keep_prob, 142 | rnn_model.inputs: batch_X, 143 | rnn_model.targets: batch_y, 144 | } 145 | train_loss, _, merged_summ = sess.run( 146 | [rnn_model.loss, rnn_model.train_opt, rnn_model.merged_summ], train_data_feed) 147 | if iter % FLAGS.save_log_iter == 0: 148 | summary_writer.add_summary(merged_summ, iter) 149 | if iter % FLAGS.display_iter == 0: 150 | valid_loss = 0 151 | for val_batch_X, val_batch_y in boiler_dataset.generate_one_epoch(val_X, val_y, FLAGS.batch_size): 152 | val_data_feed = { 153 | rnn_model.keep_prob: 1.0, 154 | rnn_model.inputs: val_batch_X, 155 | rnn_model.targets: val_batch_y, 156 | } 157 | batch_loss = sess.run(rnn_model.loss, val_data_feed) 158 | valid_loss += batch_loss 159 | num_batches = int(len(val_X)) // FLAGS.batch_size 160 | valid_loss /= num_batches 161 | valid_losses.append(valid_loss) 162 | valid_loss_sum = tf.Summary( 163 | value=[tf.Summary.Value(tag="valid_loss", simple_value=valid_loss)]) 164 | summary_writer.add_summary(valid_loss_sum, iter) 165 | 166 | if valid_loss < min(valid_losses[:-1]): 167 | print('iter {}\tvalid_loss = {:.6f}\tmodel saved!!'.format( 168 | iter, valid_loss)) 169 | saver.save(sess, model_dir + 170 | 'model_{}.ckpt'.format(iter)) 171 | saver.save(sess, model_dir + 'final_model.ckpt') 172 | else: 173 | print('iter {}\tvalid_loss = {:.6f}\t'.format( 174 | iter, valid_loss)) 175 | 176 | print('stop training !!!') 177 | 178 | 179 | if __name__ == '__main__': 180 | tf.app.run() -------------------------------------------------------------------------------- /RL/reward_critic_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | 5 | 6 | LAYER1_SIZE = 256 7 | LAYER2_SIZE = 256 8 | LEARNING_RATE = 0.0001 9 | TAU = 0.001 10 | L2 = 0.0001 11 | 12 | 13 | def weight_variable(shape): 14 | initial = tf.truncated_normal(shape, stddev=0.01) 15 | return tf.Variable(initial) 16 | 17 | 18 | def bias_variable(shape): 19 | initial = tf.constant(0.03, shape=shape) 20 | return tf.Variable(initial) 21 | 22 | 23 | class RewardCriticNetwork(object): 24 | def __init__(self, sess, input_config, summ_writer): 25 | self.time_step = 0 26 | self.sess = sess 27 | self.state_dim = input_config.state_dim 28 | self.action_dim = input_config.action_dim 29 | self.clip_norm = input_config.clip_norm 30 | self.step = 0 31 | self.log_iter = input_config.log_iter # logging interval in training phase 32 | self.log_path = input_config.log_path # logging interval in training phase 33 | 34 | self.train_writer = summ_writer 35 | 36 | # create reward network 37 | self.state_input, \ 38 | self.action_input, \ 39 | self.reward_value_output, \ 40 | self.net = 
self.create_reward_network(self.state_dim, self.action_dim) 41 | 42 | # create target reward network (the same structure with reward network) 43 | self.target_state_input, \ 44 | self.target_action_input, \ 45 | self.target_reward_value_output, \ 46 | self.target_update = self.create_target_reward_network(self.state_dim, self.action_dim, self.net) 47 | 48 | self.create_training_method() 49 | 50 | self.sess.run(tf.global_variables_initializer()) 51 | 52 | self.update_target() 53 | 54 | def create_training_method(self): 55 | # Define training optimizer 56 | self.y_input = tf.placeholder("float", [None, 1]) 57 | weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.net]) 58 | self.cost = tf.reduce_mean(tf.square(self.y_input - self.reward_value_output)) + weight_decay 59 | self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.cost) 60 | self.action_gradients = tf.gradients(self.reward_value_output, self.action_input) 61 | 62 | 63 | # def create_reward_network(self, state_dim, action_dim): 64 | # # the layer size could be changed 65 | # layer1_size = LAYER1_SIZE 66 | # layer2_size = LAYER2_SIZE 67 | # 68 | # state_input = tf.placeholder("float", [None, state_dim]) 69 | # action_input = tf.placeholder("float", [None, action_dim]) 70 | # 71 | # W1 = self.variable([state_dim, layer1_size], state_dim) 72 | # b1 = self.variable([layer1_size], state_dim) 73 | # W2 = self.variable([layer1_size, layer2_size], layer1_size + action_dim) 74 | # W2_action = self.variable([action_dim, layer2_size], layer1_size + action_dim) 75 | # b2 = self.variable([layer2_size], layer1_size + action_dim) 76 | # W3 = tf.Variable(tf.random_uniform([layer2_size, 1], -3e-3, 3e-3)) 77 | # b3 = tf.Variable(tf.random_uniform([1], -3e-3, 3e-3)) 78 | # 79 | # layer1 = tf.nn.relu(tf.matmul(state_input, W1) + b1) 80 | # layer2 = tf.nn.relu(tf.matmul(layer1, W2) + tf.matmul(action_input, W2_action) + b2) 81 | # q_value_output = tf.identity(tf.matmul(layer2, W3) + b3) 82 | # 83 | # return state_input, action_input, q_value_output, [W1, b1, W2, W2_action, b2, W3, b3] 84 | 85 | def create_reward_network(self, state_dim, action_dim): 86 | # the layer size could be changed 87 | layer1_size = LAYER1_SIZE 88 | layer2_size = LAYER2_SIZE 89 | 90 | state_input = tf.placeholder("float", [None, state_dim]) 91 | action_input = tf.placeholder("float", [None, action_dim]) 92 | 93 | # Input -> Hidden Layer 94 | w1 = weight_variable([state_dim, layer1_size]) 95 | b1 = bias_variable([layer1_size]) 96 | # Hidden Layer -> Hidden Layer + Action 97 | w2 = weight_variable([layer1_size, layer2_size]) 98 | w2a = weight_variable([action_dim, layer2_size]) 99 | b2 = bias_variable([layer2_size]) 100 | # Hidden Layer -> Output (Q) 101 | w3 = weight_variable([layer2_size, 1]) 102 | b3 = bias_variable([1]) 103 | 104 | # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 105 | h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 106 | # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 107 | # Action inserted here 108 | h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action_input, w2a) + b2) 109 | 110 | reward_value_output = tf.matmul(h2, w3) + b3 111 | 112 | return state_input, action_input, reward_value_output, [w1, b1, w2, w2a, b2, w3, b3] 113 | 114 | def create_target_reward_network(self, state_dim, action_dim, net): 115 | state_input = tf.placeholder("float", [None, state_dim]) 116 | action_input = tf.placeholder("float", [None, action_dim]) 117 | 118 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) 119 | target_update 
= ema.apply(net) 120 | target_net = [ema.average(x) for x in net] 121 | 122 | layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) 123 | layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + tf.matmul(action_input, target_net[3]) + target_net[4]) 124 | reward_value_output = tf.identity(tf.matmul(layer2, target_net[5]) + target_net[6]) 125 | 126 | return state_input, action_input, reward_value_output, target_update 127 | 128 | def update_target(self): 129 | self.sess.run(self.target_update) 130 | 131 | def train(self, y_batch, state_batch, action_batch): 132 | # r_loss_summ = tf.summary.scalar('reward_critic_loss', self.cost) 133 | # self.merged = tf.summary.merge([r_loss_summ]) 134 | 135 | train_feed_dict = { 136 | self.y_input: y_batch, 137 | self.state_input: state_batch, 138 | self.action_input: action_batch 139 | } 140 | _, reward_critic_loss, reward_action_grad_norm = \ 141 | self.sess.run([self.optimizer, self.cost, self.action_gradients], train_feed_dict) 142 | 143 | # if self.step % self.log_iter == 0: 144 | # self.train_writer.add_summary(merged_summ, global_step=self.step) 145 | 146 | self.step += 1 147 | 148 | return reward_critic_loss, reward_action_grad_norm 149 | 150 | def pretrain(self, y_batch, state_batch, action_batch): 151 | train_feed_dict = { 152 | self.y_input: y_batch, 153 | self.state_input: state_batch, 154 | self.action_input: action_batch 155 | } 156 | _, reward_critic_loss = self.sess.run([self.optimizer, self.cost], train_feed_dict) 157 | return reward_critic_loss 158 | 159 | def gradients(self, state_batch, action_batch): 160 | return self.sess.run(self.action_gradients, feed_dict={ 161 | self.state_input: state_batch, 162 | self.action_input: action_batch 163 | })[0] 164 | 165 | def target_reward(self, state_batch, action_batch): 166 | return self.sess.run(self.target_reward_value_output, feed_dict={ 167 | self.target_state_input: state_batch, 168 | self.target_action_input: action_batch 169 | }) 170 | 171 | def reward_value(self, state_batch, action_batch): 172 | return self.sess.run(self.reward_value_output, feed_dict={ 173 | self.state_input: state_batch, 174 | self.action_input: action_batch}) 175 | 176 | # f fan-in size 177 | def variable(self, shape, f): 178 | return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f))) 179 | 180 | ''' 181 | def load_network(self): 182 | self.saver = tf.train.Saver() 183 | checkpoint = tf.train.get_checkpoint_state("saved_reward_critic_networks") 184 | if checkpoint and checkpoint.model_checkpoint_path: 185 | self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 186 | print "Successfully loaded:", checkpoint.model_checkpoint_path 187 | else: 188 | print "Could not find old network weights" 189 | def save_network(self,time_step): 190 | print 'save reward-critic-network...',time_step 191 | self.saver.save(self.sess, 'saved_reward_critic_networks/' + 'reward-critic-network', global_step = time_step) 192 | ''' 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /RL/cost_critic_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | 5 | 6 | LAYER1_SIZE = 256 7 | LAYER2_SIZE = 256 8 | LEARNING_RATE = 0.0001 9 | TAU = 0.001 10 | L2 = 0.0001 11 | 12 | 13 | def weight_variable(shape): 14 | initial = tf.truncated_normal(shape, stddev=0.01) 15 | return tf.Variable(initial) 16 | 17 | 18 | def 
bias_variable(shape): 19 | initial = tf.constant(0.03, shape=shape) 20 | return tf.Variable(initial) 21 | 22 | 23 | class CostCriticNetwork(object): 24 | def __init__(self, sess, input_config, summ_writer): 25 | self.time_step = 0 26 | self.sess = sess 27 | self.state_dim = input_config.state_dim 28 | self.action_dim = input_config.action_dim 29 | self.clip_norm = input_config.clip_norm 30 | self.step = 0 31 | self.log_iter = input_config.log_iter # logging interval in training phase 32 | self.log_path = input_config.log_path # logging interval in training phase 33 | 34 | self.train_writer_cost = summ_writer 35 | 36 | 37 | # create cost network 38 | self.state_input, \ 39 | self.action_input, \ 40 | self.cost_value_output, \ 41 | self.cost_net = self.create_cost_network(self.state_dim, self.action_dim) 42 | 43 | # create target cost network (the same structure with cost network) 44 | self.target_state_input, \ 45 | self.target_action_input, \ 46 | self.target_cost_value_output, \ 47 | self.cost_target_update = self.create_target_cost_network(self.state_dim, self.action_dim, self.cost_net) 48 | 49 | self.create_training_method() 50 | 51 | self.sess.run(tf.global_variables_initializer()) 52 | 53 | self.update_target() 54 | 55 | 56 | def create_training_method(self): 57 | # Define training optimizer 58 | self.z_input = tf.placeholder("float", [None, 1]) 59 | weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.cost_net]) 60 | self.cost_cost = tf.reduce_mean(tf.square(self.z_input - self.cost_value_output)) + weight_decay 61 | self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.cost_cost) 62 | self.action_gradients_cost = tf.gradients(self.cost_value_output, self.action_input) 63 | 64 | 65 | 66 | # def create_cost_network(self, state_dim, action_dim): 67 | # # the layer size could be changed 68 | # layer1_size = LAYER1_SIZE 69 | # layer2_size = LAYER2_SIZE 70 | # 71 | # state_input = tf.placeholder("float", [None, state_dim]) 72 | # action_input = tf.placeholder("float", [None, action_dim]) 73 | # 74 | # W1 = self.variable([state_dim, layer1_size], state_dim) 75 | # b1 = self.variable([layer1_size], state_dim) 76 | # W2 = self.variable([layer1_size, layer2_size], layer1_size + action_dim) 77 | # W2_action = self.variable([action_dim, layer2_size], layer1_size + action_dim) 78 | # b2 = self.variable([layer2_size], layer1_size + action_dim) 79 | # W3 = tf.Variable(tf.random_uniform([layer2_size, 1], -3e-3, 3e-3)) 80 | # b3 = tf.Variable(tf.random_uniform([1], -3e-3, 3e-3)) 81 | # 82 | # layer1 = tf.nn.relu(tf.matmul(state_input, W1) + b1) 83 | # layer2 = tf.nn.relu(tf.matmul(layer1, W2) + tf.matmul(action_input, W2_action) + b2) 84 | # cost_value_output = tf.identity(tf.matmul(layer2, W3) + b3) 85 | # 86 | # return state_input, action_input, cost_value_output, [W1, b1, W2, W2_action, b2, W3, b3] 87 | 88 | def create_cost_network(self, state_dim, action_dim): 89 | # the layer size could be changed 90 | layer1_size = LAYER1_SIZE 91 | layer2_size = LAYER2_SIZE 92 | 93 | state_input = tf.placeholder("float", [None, state_dim]) 94 | action_input = tf.placeholder("float", [None, action_dim]) 95 | 96 | # Input -> Hidden Layer 97 | w1 = weight_variable([state_dim, layer1_size]) 98 | b1 = bias_variable([layer1_size]) 99 | # Hidden Layer -> Hidden Layer + Action 100 | w2 = weight_variable([layer1_size, layer2_size]) 101 | w2a = weight_variable([action_dim, layer2_size]) 102 | b2 = bias_variable([layer2_size]) 103 | # Hidden Layer -> Output (Q) 104 | w3 = 
weight_variable([layer2_size, 1]) 105 | b3 = bias_variable([1]) 106 | 107 | # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 108 | h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 109 | # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 110 | # Action inserted here 111 | h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action_input, w2a) + b2) 112 | 113 | cost_value_output = tf.matmul(h2, w3) + b3 114 | 115 | return state_input, action_input, cost_value_output, [w1, b1, w2, w2a, b2, w3, b3] 116 | 117 | def create_target_cost_network(self, state_dim, action_dim, net): 118 | state_input = tf.placeholder("float", [None, state_dim]) 119 | action_input = tf.placeholder("float", [None, action_dim]) 120 | 121 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) 122 | target_update = ema.apply(net) 123 | target_net = [ema.average(x) for x in net] 124 | 125 | layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) 126 | layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + tf.matmul(action_input, target_net[3]) + target_net[4]) 127 | cost_value_output = tf.identity(tf.matmul(layer2, target_net[5]) + target_net[6]) 128 | 129 | return state_input, action_input, cost_value_output, target_update 130 | 131 | def update_target(self): 132 | self.sess.run(self.cost_target_update) 133 | 134 | def train(self, z_batch, state_batch, action_batch): 135 | # c_loss_summ = tf.summary.scalar('cost_critic_loss', self.cost_cost) 136 | # self.merged_cost = tf.summary.merge([c_loss_summ]) 137 | 138 | train_feed_dict = { 139 | self.z_input: z_batch, 140 | self.state_input: state_batch, 141 | self.action_input: action_batch 142 | } 143 | _, cost_critic_loss, cost_action_grad_norm = \ 144 | self.sess.run([self.optimizer, self.cost_cost, self.action_gradients_cost], train_feed_dict) 145 | 146 | # if self.step % self.log_iter == 0: 147 | # self.train_writer_cost.add_summary(merged_summ_cost, global_step=self.step) 148 | 149 | self.step += 1 150 | 151 | return cost_critic_loss, cost_action_grad_norm 152 | 153 | def pretrain(self, z_batch, state_batch, action_batch): 154 | train_feed_dict = { 155 | self.z_input: z_batch, 156 | self.state_input: state_batch, 157 | self.action_input: action_batch 158 | } 159 | _, cost_critic_loss = self.sess.run([self.optimizer, self.cost_cost], train_feed_dict) 160 | return cost_critic_loss 161 | 162 | def gradients(self, state_batch, action_batch): 163 | return self.sess.run(self.action_gradients_cost, feed_dict={ 164 | self.state_input: state_batch, 165 | self.action_input: action_batch 166 | })[0] 167 | 168 | def target_cost(self, state_batch, action_batch): 169 | return self.sess.run(self.target_cost_value_output, feed_dict={ 170 | self.target_state_input: state_batch, 171 | self.target_action_input: action_batch 172 | }) 173 | 174 | def cost_value(self, state_batch, action_batch): 175 | return self.sess.run(self.cost_value_output, feed_dict={ 176 | self.state_input: state_batch, 177 | self.action_input: action_batch}) 178 | 179 | # f fan-in size 180 | def variable(self, shape, f): 181 | return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f))) 182 | 183 | ''' 184 | def load_network(self): 185 | self.saver = tf.train.Saver() 186 | checkpoint = tf.train.get_checkpoint_state("saved_cost_critic_networks") 187 | if checkpoint and checkpoint.model_checkpoint_path: 188 | self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 189 | print "Successfully loaded:", checkpoint.model_checkpoint_path 190 | else: 191 | print "Could 
not find old network weights" 192 | def save_network(self,time_step): 193 | print 'save cost-critic-network...',time_step 194 | self.saver.save(self.sess, 'saved_cost_critic_networks/' + 'cost-critic-network', global_step = time_step) 195 | ''' 196 | 197 | 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /RL/primal_dual_ddpg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | from RL.ou_noise import OUNoise 5 | from RL.reward_critic_network import RewardCriticNetwork 6 | from RL.cost_critic_network import CostCriticNetwork 7 | 8 | from RL.actor_network import ActorNetwork 9 | from RL.replay_buffer import ReplayBuffer 10 | from RL.util import * 11 | 12 | # EPSILON定义一个极小值 13 | EPSILON = 1e-5 14 | # Hyper Parameters: 15 | REPLAY_MEMORY_SIZE = 10000 16 | REPLAY_START_SIZE = 1000 17 | GAMMA = 0.9 18 | COST_EPSILON = 1 19 | DUAL_STEP_SIZE = 0.01 20 | is_grad_inverter = False 21 | 22 | 23 | class PrimalDualDDPG(object): 24 | """ Primal Dual Deep Deterministic Policy Gradient Algorithm""" 25 | 26 | def __init__(self, sess, input_config, is_batch_norm, summ_writer=None, load_model=False): 27 | self.state_dim = input_config.state_dim 28 | self.action_dim = input_config.action_dim 29 | self.dual_lambda = input_config.init_dual_lambda 30 | self.save_path = input_config.model_save_path 31 | self.train_display_iter = input_config.train_display_iter 32 | self.batch_size = input_config.batch_size 33 | self.gamma = GAMMA 34 | self.summay_writer = summ_writer 35 | 36 | self.sess = sess 37 | self.step = 0 38 | 39 | 40 | if is_batch_norm: 41 | self.rewward_critic_network = RewardCriticNetwork_bn(self.sess, self.state_dim, self.action_dim) 42 | self.cost_critic_network = CostCriticNetwork_bn(self.sess, self.state_dim, self.action_dim) 43 | self.actor_network = ActorNetwork_bn(self.sess, self.state_dim, self.action_dim) 44 | 45 | else: 46 | self.reward_critic_network = RewardCriticNetwork(self.sess, input_config, self.summay_writer) 47 | self.cost_critic_network = CostCriticNetwork(self.sess, input_config, self.summay_writer) 48 | self.actor_network = ActorNetwork(self.sess, input_config, load_model=False, summ_writer=self.summay_writer) 49 | 50 | # initialize replay buffer 51 | self.replay_buffer = ReplayBuffer(REPLAY_MEMORY_SIZE) 52 | 53 | # Initialize a random process the Ornstein-Uhlenbeck process for action exploration 54 | self.exploration_noise = OUNoise(self.action_dim) 55 | 56 | # for name in input_config.__dict__: 57 | # if isinstance(input_config.__dict__[name], int) or isinstance(input_config.__dict__[name], float): 58 | # self.log(f'parameter|input_config_{name}:{input_config.__dict__[name]}') 59 | 60 | # model saver 61 | self.saver = tf.train.Saver() 62 | if load_model: 63 | self.saver.restore(sess=self.sess, save_path=tf.train.latest_checkpoint(self.save_path)) 64 | 65 | 66 | # def __del__(self): 67 | # self.logfile.close() 68 | # 69 | # def log(self, *args): 70 | # self.logfile.write(*args) 71 | # self.logfile.write('\n') 72 | 73 | def train(self): 74 | # print "train step", self.time_step 75 | # Sample a random minibatch of N transitions from replay buffer 76 | minibatch = self.replay_buffer.get_batch(self.batch_size) 77 | state_batch = np.asarray([data[0] for data in minibatch]) 78 | action_batch = np.asarray([data[1] for data in minibatch]) 79 | reward_batch = np.asarray([data[2] for data in minibatch]) 80 | cost_batch = 
np.asarray([data[3] for data in minibatch]) 81 | next_state_batch = np.asarray([data[4] for data in minibatch]) 82 | done_batch = np.asarray([data[5] for data in minibatch]) 83 | 84 | # Calculate y_batch 85 | target_action_batch = self.actor_network.target_actions(next_state_batch) 86 | target_reward_value = self.reward_critic_network.target_reward(next_state_batch, target_action_batch) 87 | target_cost_value = self.cost_critic_network.target_cost(next_state_batch, target_action_batch) 88 | y_batch, z_batch = [], [] 89 | for i in range(len(minibatch)): 90 | if done_batch[i]: 91 | y_batch.append(reward_batch[i]) 92 | z_batch.append(cost_batch[i]) 93 | else: 94 | y_batch.append(reward_batch[i] + GAMMA * target_reward_value[i]) 95 | z_batch.append(cost_batch[i] + GAMMA * target_cost_value[i]) 96 | 97 | y_batch = np.resize(y_batch, [self.batch_size, 1]) 98 | z_batch = np.resize(z_batch, [self.batch_size, 1]) 99 | 100 | # Update reward critic by minimizing the loss L 101 | reward_critic_loss, reward_action_grad_norm = self.reward_critic_network.train(y_batch, state_batch, action_batch) 102 | # q_value = self.critic_network.get_q_value(state_limit_batch, action_batch) 103 | 104 | # Update cost critic by minimizing the loss L 105 | cost_critic_loss, cost_action_grad_norm = self.cost_critic_network.train(z_batch, state_batch, action_batch) 106 | 107 | # Update the actor policy using the sampled gradient 108 | if is_grad_inverter: 109 | action_batch_for_gradients = self.actor_network.actions(state_batch) 110 | action_batch_for_gradients = self.grad_inv.invert(action_batch_for_gradients, ) 111 | else: 112 | action_batch_for_gradients = self.actor_network.actions(state_batch) 113 | print('action_batch_for_gradients', action_batch_for_gradients) 114 | reward_gradient_batch = self.reward_critic_network.gradients(state_batch, action_batch_for_gradients) 115 | cost_gradient_batch = self.cost_critic_network.gradients(state_batch, action_batch_for_gradients) 116 | q_gradient_batch = reward_gradient_batch - self.dual_lambda * cost_gradient_batch 117 | self.actor_network.train(q_gradient_batch, state_batch) 118 | 119 | # Update the dual variable using the sample gradient 120 | cost_value_batch = self.cost_critic_network.cost_value(state_batch, action_batch_for_gradients) 121 | cost_limit_batch = np.array([[COST_EPSILON] for _ in range(self.batch_size)]) 122 | self.dual_gradients = np.mean(cost_value_batch - cost_limit_batch) 123 | self.dual_lambda += DUAL_STEP_SIZE * self.dual_gradients 124 | self.dual_lambda = np.max([EPSILON, self.dual_lambda]) # ensure dual >= 0 125 | 126 | if self.step % self.train_display_iter == 0: 127 | print("reward_critic: loss:{:.3f} action_grads_norm:{:.3f} " 128 | "| cost_critic: loss:{:.3f} action_grads_norm:{:.3f}" 129 | "| q_gradient:{:.3f}".format( 130 | reward_critic_loss, np.mean(reward_action_grad_norm), 131 | cost_critic_loss, np.mean(cost_action_grad_norm), np.mean(q_gradient_batch))) 132 | print("Dual lambda: {}".format(self.dual_lambda)) 133 | 134 | 135 | # Update the target networks 136 | self.reward_critic_network.update_target() 137 | self.cost_critic_network.update_target() 138 | self.actor_network.update_target() 139 | self.step += 1 140 | 141 | def noise_action(self, state, episode): 142 | # Select action a_t according to the current policy and exploration noise 143 | action = self.actor_network.action(state) 144 | if episode % 10 == 0: 145 | self.exploration_noise.update_weight() 146 | noise_action = action + self.exploration_noise.noise() 147 | noise_action = 
np.minimum(np.maximum(noise_action, 0), 1) # bound action to [0, 1] 148 | return noise_action 149 | 150 | def action(self, state): 151 | action = self.actor_network.action(state) 152 | return action 153 | 154 | def get_dual_lambda(self): 155 | return self.dual_lambda 156 | 157 | def perceive(self, state, action, reward, cost, next_state, done, mix_ratio): 158 | # Store transition (s_t,a_t,r_t,c_t,s_{t+1}) in replay buffer 159 | self.replay_buffer.add(state, action, reward, cost, next_state, done, mix_ratio) 160 | 161 | # Store transitions to replay start size then start training 162 | if self.replay_buffer.count() > REPLAY_START_SIZE: 163 | self.train() 164 | 165 | #if self.time_step % 10000 == 0: 166 | #self.actor_network.save_network(self.time_step) 167 | #self.critic_network.save_network(self.time_step) 168 | 169 | # Re-iniitialize the random process when an episode ends 170 | if done: 171 | self.exploration_noise.reset() 172 | 173 | def save_model(self): 174 | self.saver.save(sess=self.sess, save_path=self.save_path) #global_step=10,会自动生成名字-10 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /RL/actor_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import math 3 | 4 | 5 | # Hyper Parameters 6 | LAYER1_SIZE = 256 7 | LAYER2_SIZE = 256 8 | LAYER3_SIZE = 128 9 | LEARNING_RATE = 0.0001 10 | TAU = 0.001 11 | 12 | 13 | def weight_variable(shape, name): 14 | initial = tf.truncated_normal(shape, stddev=0.01) 15 | return tf.Variable(initial, name) 16 | 17 | 18 | def bias_variable(shape, name): 19 | initial = tf.constant(0.03, shape=shape) 20 | return tf.Variable(initial, name) 21 | 22 | 23 | class ActorNetwork(object): 24 | """ Map: state + limit_load -> action """ 25 | 26 | def __init__(self, sess, input_config, load_model, summ_writer): 27 | self.sess = sess 28 | self.state_dim = input_config.state_dim 29 | self.action_dim = input_config.action_dim 30 | self.save_iter = input_config.save_iter # interval of saving log 31 | self.save_path = input_config.model_save_path + "/actor" # interval of saving model 32 | self.log_iter = input_config.log_iter # logging interval in training phase 33 | self.log_path = input_config.log_path # log path 34 | self.clip_norm = input_config.clip_norm 35 | self.step = 0 36 | 37 | self.train_writer = summ_writer 38 | 39 | # create actor network 40 | self.state_input, self.action_output, self.net = self.create_network(self.state_dim, self.action_dim) 41 | # create target actor network 42 | self.target_state_input, self.target_action_output, self.target_update, self.target_net = self.create_target_network( 43 | self.state_dim, self.action_dim, self.net) 44 | self.create_training_method() 45 | 46 | self.saver = tf.train.Saver() 47 | # self.saver = tf.train.Saver(tf.global_variables(scope=scope)) 48 | if load_model: 49 | # restore actor network 50 | print('actor network restore weights') 51 | self.saver.restore(sess=self.sess, save_path=tf.train.latest_checkpoint(input_config.load_path)) 52 | else: 53 | self.sess.run(tf.global_variables_initializer()) 54 | 55 | self.update_target() 56 | 57 | 58 | def create_training_method(self): 59 | self.q_gradient_input = tf.placeholder("float", [None, self.action_dim]) 60 | self.unnormalized_actor_gradients = tf.gradients(self.action_output, self.net, -self.q_gradient_input) 61 | # self.actor_gradients = list(map(lambda x: 
tf.div(x, BATCH_SIZE), self.unnormalized_actor_gradients)) 62 | # gradients clip 63 | # self.actor_gradients, _ = tf.clip_by_global_norm(self.actor_gradients, clip_norm=self.clip_norm) 64 | 65 | # extra_ops = tf.get_collection('actor_parameters_extra_option') 66 | # apply_op = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients(zip(self.unnormalized_actor_gradients, self.net)) 67 | apply_op = tf.train.RMSPropOptimizer(LEARNING_RATE).apply_gradients(zip(self.unnormalized_actor_gradients, self.net)) 68 | 69 | # train_ops = [apply_op] + extra_ops 70 | # self.optimizer = tf.group(*apply_op) 71 | self.optimizer = apply_op 72 | 73 | diff = self.action_output - self.target_action_output 74 | self.mse = tf.reduce_mean(tf.square(diff)) 75 | pretrain_grad = tf.gradients(self.mse, self.net) 76 | self.pretrain_update = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients( 77 | zip(pretrain_grad, self.net)) 78 | 79 | 80 | 81 | def create_network(self, state_dim, action_dim): 82 | layer1_size = LAYER1_SIZE 83 | layer2_size = LAYER2_SIZE 84 | 85 | state_input = tf.placeholder("float", [None, state_dim]) 86 | 87 | w1 = self.variable([state_dim, layer1_size], state_dim) 88 | b1 = self.variable([layer1_size], state_dim) 89 | w2 = self.variable([layer1_size, layer2_size], layer1_size) 90 | b2 = self.variable([layer2_size], layer1_size) 91 | w3 = tf.Variable(tf.random_uniform([layer2_size, action_dim], -3e-3, 3e-3)) 92 | b3 = tf.Variable(tf.random_uniform([action_dim], -3e-3, 3e-3)) 93 | 94 | layer1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 95 | layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2) 96 | action_output = tf.sigmoid(tf.matmul(layer2, w3) + b3) 97 | out_summ = tf.summary.histogram('action_output', action_output) 98 | 99 | w1_summ = tf.summary.histogram('W1', values=w1) 100 | b1_summ = tf.summary.histogram('b1', values=b1) 101 | 102 | w2_summ = tf.summary.histogram('W2', values=w2) 103 | b2_summ = tf.summary.histogram('b2', values=b2) 104 | 105 | w3_summ = tf.summary.histogram('W3', values=w3) 106 | b3_summ = tf.summary.histogram('b3', values=b3) 107 | 108 | self.merged_summ = tf.summary.merge([out_summ, w1_summ, b1_summ, w2_summ, b2_summ, w3_summ, b3_summ]) 109 | # self.merged_summ = tf.summary.merge([out_summ]) 110 | 111 | return state_input, action_output, [w1, b1, w2, b2, w3, b3] 112 | 113 | # def create_network(self, state_dim, action_dim): 114 | # layer1_size = LAYER1_SIZE 115 | # layer2_size = LAYER2_SIZE 116 | # 117 | # state_input = tf.placeholder("float", [None, state_dim]) 118 | # 119 | # # Input -> Hidden Layer 120 | # w1 = weight_variable([self.state_dim, layer1_size], 'W1') 121 | # b1 = bias_variable([layer1_size], 'b1') 122 | # # Hidden Layer -> Hidden Layer 123 | # w2 = weight_variable([layer1_size, layer2_size], 'W2') 124 | # b2 = bias_variable([layer2_size], 'b2') 125 | # # Hidden Layer -> Output 126 | # w3 = weight_variable([layer2_size, self.action_dim], 'W3') 127 | # b3 = bias_variable([self.action_dim], 'b3') 128 | # 129 | # # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 130 | # h1 = tf.nn.relu(tf.matmul(state_input, w1) + b1) 131 | # # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid 132 | # h2 = tf.nn.relu(tf.matmul(h1, w2) + b2) 133 | # 134 | # # Run sigmoid on output to get 0 to 1 135 | # action_output = tf.nn.sigmoid(tf.matmul(h2, w3) + b3) 136 | # out_summ = tf.summary.histogram('action_output', action_output) 137 | # 138 | # w1_summ = tf.summary.histogram('W1', values=w1) 139 | # b1_summ = tf.summary.histogram('b1', values=b1) 140 | # 141 | # 
w2_summ = tf.summary.histogram('W2', values=w2) 142 | # b2_summ = tf.summary.histogram('b2', values=b2) 143 | # 144 | # w3_summ = tf.summary.histogram('W3', values=w3) 145 | # b3_summ = tf.summary.histogram('b3', values=b3) 146 | # 147 | # self.merged_summ = tf.summary.merge([out_summ, w1_summ, b1_summ, w2_summ, b2_summ, w3_summ, b3_summ]) 148 | # # self.merged_summ = tf.summary.merge([out_summ]) 149 | # 150 | # # scaled_out = tf.multiply(out, self.action_bound) # Scale output to -action_bound to action_bound 151 | # 152 | # return state_input, action_output, [w1, b1, w2, b2, w3, b3] 153 | 154 | 155 | def create_target_network(self, state_dim, action_dim, net): 156 | state_input = tf.placeholder("float", [None, state_dim]) 157 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) 158 | target_update = ema.apply(net) 159 | target_net = [ema.average(x) for x in net] 160 | 161 | layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) 162 | layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + target_net[3]) 163 | 164 | action_output = tf.tanh(tf.matmul(layer2, target_net[4]) + target_net[5]) 165 | 166 | return state_input, action_output, target_update, target_net 167 | 168 | def update_target(self): 169 | self.sess.run(self.target_update) 170 | 171 | def train(self, q_gradient_batch, state_batch): 172 | train_feed_dict = { 173 | self.q_gradient_input: q_gradient_batch, 174 | self.state_input: state_batch 175 | } 176 | summ, _ = self.sess.run([self.merged_summ, self.optimizer], feed_dict=train_feed_dict) 177 | # _ = self.sess.run([self.optimizer], feed_dict=train_feed_dict) 178 | 179 | # save actor network 180 | if self.step % self.save_iter == 0: 181 | self.saver.save(self.sess, save_path=self.save_path, global_step=self.step) 182 | 183 | if self.step % self.log_iter == 0: 184 | self.train_writer.add_summary(summ, global_step=self.step) 185 | 186 | self.step += 1 187 | 188 | def pretrain(self, state, label): 189 | # cost 190 | train_feed_dict = {self.state_input: state, self.target_action_output: label} 191 | _, net, mse = self.sess.run([self.pretrain_update, self.net, self.mse], feed_dict=train_feed_dict) 192 | # save actor network 193 | if self.step % self.save_iter == 0: 194 | self.saver.save(self.sess, save_path=self.save_path, global_step=self.step) 195 | 196 | self.step += 1 197 | return net, mse 198 | 199 | def actions(self, state_batch): 200 | return self.sess.run(self.action_output, feed_dict={ 201 | self.state_input: state_batch 202 | }) 203 | 204 | def action(self, state): 205 | return self.sess.run(self.action_output, feed_dict={ 206 | self.state_input: [state] 207 | })[0] 208 | 209 | def target_actions(self, state_batch): 210 | return self.sess.run(self.target_action_output, feed_dict={ 211 | self.target_state_input: state_batch 212 | }) 213 | 214 | # f fan-in size 215 | def variable(self, shape, f): 216 | return tf.Variable(tf.random_uniform(shape, -1 / math.sqrt(f), 1 / math.sqrt(f))) 217 | 218 | def save_network(self, episode): 219 | print('save actor-network...', episode) 220 | self.saver.save(self.sess, 'saved_actor_networks/' + 'actor-network', global_step=episode) 221 | 222 | ''' 223 | def load_network(self): 224 | self.saver = tf.train.Saver() 225 | checkpoint = tf.train.get_checkpoint_state("saved_actor_networks") 226 | if checkpoint and checkpoint.model_checkpoint_path: 227 | self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 228 | print "Successfully loaded:", checkpoint.model_checkpoint_path 229 | else: 230 | print "Could not find 
old network weights" 231 | 232 | ''' 233 | 234 | -------------------------------------------------------------------------------- /RL/util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | OUTER_START_POS = 0 5 | OUTER_SIZE = 11 6 | STATE_SIZE = 47 7 | ACTION_SIZE = 51 8 | STATE_START_POS = OUTER_START_POS + OUTER_SIZE 9 | ACTION_START_POS = STATE_START_POS + STATE_SIZE 10 | NEW_STATE_START_POS = ACTION_START_POS + ACTION_SIZE 11 | 12 | NOX_POS = 40 13 | STEAM_TEMP_POS = 48 14 | STEAM_PRES_POS = 49 15 | NEG_PRES_POS = 44 16 | LIM_LOAD_POS = 11 17 | LOAD_POS = 46 18 | EFFI_WEIGHT = 0.8 19 | 20 | # Read inv-normlization file 21 | inv_norm = pd.read_csv('/Users/xhr/PycharmProjects/Boiler/Simulator/data/反归一化_new.csv', index_col='name') 22 | inv_norm_min = inv_norm['min'].values # convert to ndarray 23 | inv_norm_max = inv_norm['max'].values # convert to ndarray 24 | 25 | 26 | def get_efficiency(state): 27 | if len(state.shape) == 1: 28 | # 主蒸汽流量 29 | h = state[47] * (inv_norm.loc['主蒸汽流量']['max'] - inv_norm.loc['主蒸汽流量']['min']) + inv_norm.loc['主蒸汽流量']['min'] 30 | # 排烟含氧量 31 | i = state[39] * (inv_norm.loc['排烟含氧量']['max'] - inv_norm.loc['排烟含氧量']['min']) + inv_norm.loc['排烟含氧量']['min'] 32 | # 引风机入口温度 33 | j = state[38] * (inv_norm.loc['引风机入口温度']['max'] - inv_norm.loc['引风机入口温度']['min']) + inv_norm.loc['引风机入口温度']['min'] 34 | # 送风机入口温度 35 | k = state[10] * (inv_norm.loc['送风机入口温度']['max'] - inv_norm.loc['送风机入口温度']['min']) + inv_norm.loc['送风机入口温度']['min'] 36 | # 低位发热量MJ/kg 37 | l = state[7] * (inv_norm.loc['低位发热量MJ/kg']['max'] - inv_norm.loc['低位发热量MJ/kg']['min']) + inv_norm.loc['低位发热量MJ/kg']['min'] 38 | # 收到基水份% 39 | m = state[4] * (inv_norm.loc['收到基水份%']['max'] - inv_norm.loc['收到基水份%']['min']) + inv_norm.loc['收到基水份%']['min'] 40 | # 收到基灰份% 41 | n = state[5] * (inv_norm.loc['收到基灰份%']['max'] - inv_norm.loc['收到基灰份%']['min']) + inv_norm.loc['收到基灰份%']['min'] 42 | # 飞灰% 43 | p = state[9] * (inv_norm.loc['飞灰%']['max'] - inv_norm.loc['飞灰%']['min']) + inv_norm.loc['飞灰%']['min'] 44 | # 渣% 45 | q = state[8] * (inv_norm.loc['渣%']['max'] - inv_norm.loc['渣%']['min']) + inv_norm.loc['渣%']['min'] 46 | # 分析基水份% 47 | analytical_base_moisture = state[1] * (inv_norm.loc['分析基水份%']['max'] - inv_norm.loc['分析基水份%']['min']) + inv_norm.loc['分析基水份%']['min'] 48 | # 分析基挥发分% 49 | analytical_base_volatile = state[3] * (inv_norm.loc['分析基挥发分%']['max'] - inv_norm.loc['分析基挥发分%']['min']) + inv_norm.loc['分析基挥发分%']['min'] 50 | else: 51 | # 主蒸汽流量 52 | h = state[:, 47] * (inv_norm.loc['主蒸汽流量']['max'] - inv_norm.loc['主蒸汽流量']['min']) + inv_norm.loc['主蒸汽流量']['min'] 53 | # 排烟含氧量 54 | i = state[:, 39] * (inv_norm.loc['排烟含氧量']['max'] - inv_norm.loc['排烟含氧量']['min']) + inv_norm.loc['排烟含氧量']['min'] 55 | # 引风机入口温度 56 | j = state[:, 38] * (inv_norm.loc['引风机入口温度']['max'] - inv_norm.loc['引风机入口温度']['min']) + inv_norm.loc['引风机入口温度']['min'] 57 | # 送风机入口温度 58 | k = state[:, 10] * (inv_norm.loc['送风机入口温度']['max'] - inv_norm.loc['送风机入口温度']['min']) + inv_norm.loc['送风机入口温度']['min'] 59 | # 低位发热量MJ/kg 60 | l = state[:, 7] * (inv_norm.loc['低位发热量MJ/kg']['max'] - inv_norm.loc['低位发热量MJ/kg']['min']) + inv_norm.loc['低位发热量MJ/kg']['min'] 61 | # 收到基水份% 62 | m = state[:, 4] * (inv_norm.loc['收到基水份%']['max'] - inv_norm.loc['收到基水份%']['min']) + inv_norm.loc['收到基水份%']['min'] 63 | # 收到基灰份% 64 | n = state[:, 5] * (inv_norm.loc['收到基灰份%']['max'] - inv_norm.loc['收到基灰份%']['min']) + inv_norm.loc['收到基灰份%']['min'] 65 | # 飞灰% 66 | p = state[:, 9] * (inv_norm.loc['飞灰%']['max'] - 
inv_norm.loc['飞灰%']['min']) + inv_norm.loc['飞灰%']['min'] 67 | # 渣% 68 | q = state[:, 8] * (inv_norm.loc['渣%']['max'] - inv_norm.loc['渣%']['min']) + inv_norm.loc['渣%']['min'] 69 | # 分析基水份% 70 | analytical_base_moisture = state[:, 1] * (inv_norm.loc['分析基水份%']['max'] - inv_norm.loc['分析基水份%']['min']) + inv_norm.loc['分析基水份%']['min'] 71 | # 分析基挥发分% 72 | analytical_base_volatile = state[:, 3] * (inv_norm.loc['分析基挥发分%']['max'] - inv_norm.loc['分析基挥发分%']['min']) + inv_norm.loc['分析基挥发分%']['min'] 73 | 74 | o = (100 - m) / (100 - analytical_base_moisture) * analytical_base_volatile 75 | l = l * 1000 76 | u = 10 * q / (100 - q) + 90 * p / (100 - p) 77 | v = 0.257 * (l - 3.3727 * n * u) / 1000 78 | w = 0.98 * v 79 | x = o * 100 / (100 - m - n) 80 | y = 2.1236 * x ** 0.2319 81 | z = y * (100 - m - n) / 100 82 | aa = 21 / (21 - i) 83 | ab = w + (aa - 1) * v 84 | ac = 1.24 * ((9 * z + m) / 100 + 1.293 * aa * v * 0.01) 85 | ad = 5.82 * 2141 ** (-0.38) 86 | s = ab * 1.38 * (j - k) 87 | t = ac * 1.51 * (j - k) 88 | 89 | c = (s + t) / l * 100 90 | # d = 126.36 * r * ab / l * 100 91 | e = 337.27 * n * u / l 92 | f = ad * (1095.4 / h) 93 | g = n * (10 * (800 - k) * 0.96 / (100 - q) + 90 * (j - k) * 0.82 / (100 - p)) / l 94 | 95 | effi = 100 - c - e - f - g 96 | norm_effi = (effi - inv_norm.loc['1号机组锅炉效率']['min']) / (inv_norm.loc['1号机组锅炉效率']['max'] - inv_norm.loc['1号机组锅炉效率']['min']) 97 | 98 | return norm_effi 99 | 100 | 101 | def get_emission(state): 102 | if len(state.shape) == 1: 103 | return state[NOX_POS] 104 | else: 105 | return state[:, NOX_POS] 106 | 107 | 108 | def get_steam_temp(state): 109 | if len(state.shape) == 1: 110 | return state[STEAM_TEMP_POS] 111 | else: 112 | return state[:, STEAM_TEMP_POS] 113 | 114 | 115 | def get_given_steam_pres(state, load): 116 | if len(state.shape) == 1: 117 | if load >= 560: 118 | given_steam_pres = 24.0 119 | else: 120 | given_steam_pres = 0.036072 * load + 3.89199 121 | else: 122 | given_steam_pres = np.ones([load.shape[0]]) * 24 123 | given_steam_pres[load < 560] = 0.036072 * load[load < 560] + 3.89199 124 | return given_steam_pres 125 | 126 | 127 | 128 | def get_steam_pres(state): 129 | if len(state.shape) == 1: 130 | return state[STEAM_PRES_POS] 131 | else: 132 | return state[:, STEAM_PRES_POS] 133 | 134 | 135 | def get_neg_pres(state): 136 | if len(state.shape) == 1: 137 | return state[NEG_PRES_POS] 138 | else: 139 | return state[:, NEG_PRES_POS] 140 | 141 | 142 | def get_lim_load(state): 143 | if len(state.shape) == 1: 144 | return state[LIM_LOAD_POS] 145 | else: 146 | return state[:, LIM_LOAD_POS] 147 | 148 | 149 | def get_load(state): 150 | if len(state.shape) == 1: 151 | return state[LOAD_POS] 152 | else: 153 | return state[:, LOAD_POS] 154 | 155 | 156 | def compute_reward(state): 157 | # coals = get_coals(action) 158 | efficiency = get_efficiency(state) 159 | emission = get_emission(state) 160 | # print('effi', EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission) 161 | reward = EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission 162 | if np.mean(reward) > 1: 163 | print(reward, efficiency, emission) 164 | return 10*(EFFI_WEIGHT * efficiency - (1-EFFI_WEIGHT) * emission) 165 | 166 | 167 | # def compute_cost(state): 168 | # lim_load = get_lim_load(state) * (inv_norm.loc['lim_load']['max'] - inv_norm.loc['lim_load']['min']) + inv_norm.loc['lim_load']['min'] 169 | # load = get_load(state) * (inv_norm.loc['#1机组锅炉负荷']['max'] - inv_norm.loc['#1机组锅炉负荷']['min']) + inv_norm.loc['#1机组锅炉负荷']['min'] 170 | # steam_temp = get_steam_temp(state) * 
(inv_norm.loc['锅炉主蒸汽温度']['max'] - inv_norm.loc['锅炉主蒸汽温度']['min']) + inv_norm.loc['锅炉主蒸汽温度']['min'] 171 | # given_steam_pres = get_given_steam_pres(load) 172 | # steam_pres = get_steam_pres(state) * (inv_norm.loc['主蒸汽压力']['max'] - inv_norm.loc['主蒸汽压力']['min']) + inv_norm.loc['主蒸汽压力']['min'] 173 | # # neg_pressure = get_neg_pres(state) * (inv_norm.loc['炉膛负压']['max'] - inv_norm.loc['炉膛负压']['min']) + inv_norm.loc['炉膛负压']['min'] 174 | # 175 | # # cost 1, 负荷:lim_load ~ limload+25 176 | # if len(state.shape) == 1: 177 | # if load - lim_load > 25 or load < lim_load: 178 | # cost_load = 1 179 | # else: 180 | # cost_load = 0 181 | # else: 182 | # cost_load = np.zeros([len(state), 1]) 183 | # cost_load[(load-lim_load > 25) | (load-lim_load < 0)] = 1 184 | # 185 | # # else: 186 | # # if diff < 0.01: 187 | # # return 0 188 | # # elif diff < 0.1: 189 | # # return 0.2 190 | # # elif diff < 0.5: 191 | # # return 0.5 192 | # # else: 193 | # # return 1 194 | # 195 | # # cost 2, 主蒸汽温度:569-10 ~ 569+5 196 | # if len(state.shape) == 1: 197 | # if steam_temp > 569+5 or steam_temp < 569-10: 198 | # cost_steam_temp = 1 199 | # else: 200 | # cost_steam_temp = 0 201 | # else: 202 | # cost_steam_temp = np.zeros([len(state), 1]) 203 | # cost_steam_temp[(steam_temp > 569+5) | (steam_temp < 569-10)] = 1 204 | # 205 | # # cost 3, 主蒸汽压力:given_pres-0.5 ~ given_pres+0.5 206 | # if len(state.shape) == 1: 207 | # if steam_pres > given_steam_pres+0.5 or steam_pres < given_steam_pres-0.5: 208 | # cost_steam_pres = 1 209 | # else: 210 | # cost_steam_pres = 0 211 | # else: 212 | # cost_steam_pres = np.zeros([len(state), 1]) 213 | # cost_steam_pres[(steam_pres > given_steam_pres+0.5) & (steam_pres < given_steam_pres-0.5)] = 1 214 | # 215 | # return 1/3*cost_load + 1/3*cost_steam_temp + 1/3*cost_steam_pres 216 | 217 | def compute_cost(state): 218 | lim_load = get_lim_load(state) * (inv_norm.loc['lim_load']['max'] - inv_norm.loc['lim_load']['min']) + inv_norm.loc['lim_load']['min'] 219 | load = get_load(state) * (inv_norm.loc['#1机组锅炉负荷']['max'] - inv_norm.loc['#1机组锅炉负荷']['min']) + inv_norm.loc['#1机组锅炉负荷']['min'] 220 | steam_temp = get_steam_temp(state) * (inv_norm.loc['锅炉主蒸汽温度']['max'] - inv_norm.loc['锅炉主蒸汽温度']['min']) + inv_norm.loc['锅炉主蒸汽温度']['min'] 221 | given_steam_pres = get_given_steam_pres(state, load) 222 | steam_pres = get_steam_pres(state) * (inv_norm.loc['主蒸汽压力']['max'] - inv_norm.loc['主蒸汽压力']['min']) + inv_norm.loc['主蒸汽压力']['min'] 223 | # neg_pressure = get_neg_pres(state) * (inv_norm.loc['炉膛负压']['max'] - inv_norm.loc['炉膛负压']['min']) + inv_norm.loc['炉膛负压']['min'] 224 | 225 | # cost 1, 负荷:lim_load ~ limload+25 226 | if len(state.shape) == 1: 227 | if load - lim_load > 25: 228 | cost_load = np.abs(load - lim_load - 25) / 10 229 | elif load < lim_load: 230 | cost_load = 1 231 | else: 232 | cost_load = 0 233 | else: 234 | cost_load = np.zeros([len(state)]) 235 | cost_load[load-lim_load > 25] = np.abs(load - lim_load - 25)[load-lim_load > 25] / 10 236 | cost_load[load-lim_load < 0] = 1 237 | 238 | 239 | # else: 240 | # if diff < 0.01: 241 | # return 0 242 | # elif diff < 0.1: 243 | # return 0.2 244 | # elif diff < 0.5: 245 | # return 0.5 246 | # else: 247 | # return 1 248 | 249 | # cost 2, 主蒸汽温度:569-10 ~ 569+5 250 | if len(state.shape) == 1: 251 | if steam_temp > 569+10: 252 | cost_steam_temp = np.abs(steam_temp - 569-10) / 10 253 | elif steam_temp < 569-10: 254 | cost_steam_temp = np.abs(steam_temp - 569+10) / 10 255 | else: 256 | cost_steam_temp = 0 257 | else: 258 | cost_steam_temp = np.zeros([len(state)]) 259 | 
cost_steam_temp[steam_temp > 569+10] = np.abs(steam_temp - 569-10)[steam_temp > 569+10] / 10 260 | cost_steam_temp[steam_temp < 569-10] = np.abs(steam_temp - 569+10)[steam_temp < 569-10] / 10 261 | 262 | # cost 3, 主蒸汽压力:given_pres-0.5 ~ given_pres+0.5 263 | if len(state.shape) == 1: 264 | if steam_pres > given_steam_pres+1: 265 | cost_steam_pres = np.abs(steam_pres - given_steam_pres-1) / 5 266 | elif steam_pres < given_steam_pres-1: 267 | cost_steam_pres = np.abs(steam_pres - given_steam_pres+1) / 5 268 | else: 269 | cost_steam_pres = 0 270 | else: 271 | cost_steam_pres = np.zeros([len(state)]) 272 | cost_steam_pres[steam_pres > given_steam_pres+1] = np.abs(steam_pres - given_steam_pres-1)[steam_pres > given_steam_pres+1] / 5 273 | cost_steam_pres[steam_pres < given_steam_pres-1] = np.abs(steam_pres - given_steam_pres+1)[steam_pres < given_steam_pres-1] / 5 274 | 275 | 276 | return 1/3*cost_load + 1/3*cost_steam_temp + 1/3*cost_steam_pres 277 | 278 | 279 | def compute_done(state): 280 | return False 281 | 282 | 283 | def convert_to_tuple(batch): 284 | outer = batch[:, OUTER_START_POS: OUTER_START_POS + OUTER_SIZE] 285 | state_with_outer = batch[:, OUTER_START_POS: STATE_START_POS + STATE_SIZE] 286 | action = batch[:, ACTION_START_POS: ACTION_START_POS + ACTION_SIZE] 287 | new_state = batch[:, NEW_STATE_START_POS: NEW_STATE_START_POS + STATE_SIZE] 288 | new_state_with_outer = np.concatenate([outer, new_state], axis=1) 289 | done = batch[:, -1] 290 | return (state_with_outer, action, new_state_with_outer, done) 291 | 292 | 293 | 294 | def restrictive_action(action, episode): 295 | action_histogram = np.array(pd.read_csv('../Simulator/data/action_histogram.csv', header=None)).astype('float') 296 | threshold = action_histogram[:, -1] 297 | noise_weight = 1 298 | 299 | if episode % 100 == 0 and episode > 0: 300 | noise_weight *= 0.99 301 | 302 | # print('value'+str(self.df[np.arange(len(x[:-1])).astype('int'), (x[:-1] * 20).astype('int')])) 303 | action_distri = np.array(action_histogram[np.arange(len(action)).astype('int'), (action * 20).astype('int')[:]] > threshold).astype('int') 304 | for i in range(100): 305 | if len(np.where(action_distri == 0)[0]) > 0: 306 | unsatisfied_index = np.where(action_distri == 0)[0] 307 | # print(f'unsatisfied index {unsatisfied_index}') 308 | # f'unsatisfied action {actions[unsatisfied_index]})') 309 | random_noise = np.random.normal(np.zeros(len(unsatisfied_index)), 310 | (0.1 + 0.01 * i) * np.ones(len(unsatisfied_index)), 311 | len(unsatisfied_index)) 312 | action[unsatisfied_index] += random_noise * noise_weight 313 | # print(f'fixed actions {actions[unsatisfied_index]}' 314 | else: 315 | # print(32 + np.where(action_distri[32:] == 0)[0]) 316 | # print('find action within '+str(i)+' times') 317 | break 318 | 319 | # if len(np.where(action_distri == 0)[0]) > 0: 320 | # unsatisfied_index = np.where(action_distri == 0)[0] 321 | # print(f'Break! 
dissatisfied actions is {len(np.where(action_distri == 0)[0])}, ' 322 | # f'index: {unsatisfied_index}, value: {action[unsatisfied_index]},') 323 | # break 324 | 325 | return action 326 | -------------------------------------------------------------------------------- /Simulator/simrnn_cell.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import collections 4 | import hashlib 5 | import numbers 6 | 7 | from tensorflow.python.eager import context 8 | from tensorflow.python.framework import constant_op 9 | from tensorflow.python.framework import ops 10 | from tensorflow.python.framework import dtypes 11 | from tensorflow.python.layers import base as base_layer 12 | from tensorflow.contrib.rnn import RNNCell 13 | from tensorflow.contrib.rnn import LSTMCell 14 | from tensorflow.contrib.rnn import GRUCell 15 | from tensorflow.python.ops import array_ops 16 | from tensorflow.python.ops import clip_ops 17 | from tensorflow.python.ops import init_ops 18 | from tensorflow.python.ops import math_ops 19 | from tensorflow.python.ops import nn_ops 20 | from tensorflow.python.ops import partitioned_variables 21 | from tensorflow.python.ops import random_ops 22 | from tensorflow.python.ops import tensor_array_ops 23 | from tensorflow.python.ops import variable_scope as vs 24 | from tensorflow.python.ops import variables as tf_variables 25 | from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple 26 | from tensorflow.python.util import nest 27 | from tensorflow.python.util.tf_export import tf_export 28 | from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors 29 | 30 | 31 | class _LayerRNNCell(RNNCell): 32 | """Subclass of RNNCells that act like proper `tf.Layer` objects. 33 | 34 | For backwards compatibility purposes, most `RNNCell` instances allow their 35 | `call` methods to instantiate variables via `tf.get_variable`. The underlying 36 | variable scope thus keeps track of any variables, and returning cached 37 | versions. This is atypical of `tf.layer` objects, which separate this 38 | part of layer building into a `build` method that is only called once. 39 | 40 | Here we provide a subclass for `RNNCell` objects that act exactly as 41 | `Layer` objects do. They must provide a `build` method and their 42 | `call` methods do not access Variables `tf.get_variable`. 43 | """ 44 | 45 | def __call__(self, inputs, state, scope=None, *args, **kwargs): 46 | """Run this RNN cell on inputs, starting from the given state. 47 | 48 | Args: 49 | inputs: `2-D` tensor with shape `[batch_size, input_size]`. 50 | state: if `self.state_size` is an integer, this should be a `2-D Tensor` 51 | with shape `[batch_size, self.state_size]`. Otherwise, if 52 | `self.state_size` is a tuple of integers, this should be a tuple 53 | with shapes `[batch_size, s] for s in self.state_size`. 54 | scope: optional cell scope. 55 | *args: Additional positional arguments. 56 | **kwargs: Additional keyword arguments. 57 | 58 | Returns: 59 | A pair containing: 60 | 61 | - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. 62 | - New state: Either a single `2-D` tensor, or a tuple of tensors matching 63 | the arity and shapes of `state`. 64 | """ 65 | # Bypass RNNCell's variable capturing semantics for LayerRNNCell. 66 | # Instead, it is up to subclasses to provide a proper build 67 | # method. See the class docstring for more details. 
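        # Delegating to tf.layers.Layer.__call__ means build() runs exactly once to
        # create the kernels/biases, and every later call reuses those variables
        # rather than fetching them through tf.get_variable.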
68 | return base_layer.Layer.__call__(self, inputs, state, scope=scope, 69 | *args, **kwargs) 70 | 71 | 72 | class SimulatorRNNCell(_LayerRNNCell): 73 | """ 74 | coaler RNN: (external_input_t, coaler_hidden_t-1 , coaler_action_t) --> (coaler_hidden_t, coaler_cell_t) 75 | burner RNN: (coaler_hidden_t, burner_hidden_t-1 , burner_action_t) --> (burner_hidden_t, burner_cell_t) 76 | steamer RNN: (burner_hidden_t, steamer_hidden_t-1 , steamer_action_t) --> (steamer_hidden_t, steamer_cell_t) 77 | 78 | loss: sum of three parts 79 | part1: coaler_hidden_t, coaler_state_t 80 | part2: burner_hidden_t, burner_state_t 81 | part3: steamer_hidden_t, steamer_state_t 82 | """ 83 | def __init__(self, cell_config, 84 | keep_prob, 85 | forget_bias=1.0, 86 | activation=None, 87 | reuse=None, 88 | name=None): 89 | """ 90 | Args: 91 | cell_config: simulator config 92 | num_units: list, [coaler_num_units, burner_num_units, steamer_num_units] 93 | """ 94 | super(SimulatorRNNCell, self).__init__(_reuse=reuse, name=name) 95 | # Inputs must be 2-dimensional. 96 | self.input_spec = base_layer.InputSpec(ndim=2) 97 | 98 | self._external_state_pos = cell_config.external_state_pos 99 | self._coaler_state_pos = cell_config.coaler_state_pos 100 | self._coaler_action_pos = cell_config.coaler_action_pos 101 | self._burner_state_pos = cell_config.burner_state_pos 102 | self._burner_action_pos = cell_config.burner_action_pos 103 | self._steamer_state_pos = cell_config.steamer_state_pos 104 | self._steamer_action_pos = cell_config.steamer_action_pos 105 | 106 | self._external_state_size = cell_config.external_state_size 107 | self._coaler_state_size = cell_config.coaler_state_size 108 | self._coaler_action_size = cell_config.coaler_action_size 109 | self._burner_state_size = cell_config.burner_state_size 110 | self._burner_action_size = cell_config.burner_action_size 111 | self._steamer_state_size = cell_config.steamer_state_size 112 | self._steamer_action_size = cell_config.steamer_action_size 113 | 114 | # num_units: list, [coaler_num_units, burner_num_units, steamer_num_units] 115 | _num_units = cell_config.num_units # TODO 116 | self._coaler_num_units = _num_units[0] 117 | self._burner_num_units = _num_units[1] 118 | self._steamer_num_units = _num_units[2] 119 | self._forget_bias = forget_bias 120 | self._activation = activation or math_ops.tanh 121 | self._input_keep_prob = self._output_keep_prob = keep_prob 122 | 123 | @property 124 | def state_size(self): 125 | c_tuple = tuple((self._coaler_num_units, self._burner_num_units, self._steamer_num_units)) 126 | h_tuple = tuple((self._coaler_num_units, self._burner_num_units, self._steamer_num_units)) 127 | return LSTMStateTuple(c_tuple, h_tuple) 128 | 129 | @property 130 | def output_size(self): 131 | return tuple((self._coaler_num_units, self._burner_num_units, self._steamer_num_units)) 132 | 133 | def get_coaler_inputs(self, inputs): 134 | # coaler inputs contains external_input, coaler_state and coaler_action 135 | # input: (batch_size, feature_nums) 136 | external_input = tf.slice(inputs, [0, self._external_state_pos], 137 | [-1, self._external_state_size]) 138 | 139 | coaler_state = tf.slice(inputs, [0, self._coaler_state_pos], 140 | [-1, self._coaler_state_size]) 141 | coaler_action = tf.slice(inputs, [0, self._coaler_action_pos], 142 | [-1, self._coaler_action_size]) 143 | return tf.concat([external_input, coaler_state, coaler_action], axis=1) 144 | 145 | def get_burner_inputs(self, inputs): 146 | # burner inputs contains burner_state and burner_action 147 | # input: 
(batch_size, feature_nums) 148 | burner_state = tf.slice(inputs, [0, self._burner_state_pos], 149 | [-1, self._burner_state_size]) 150 | burner_action = tf.slice(inputs, [0, self._burner_action_pos], 151 | [-1, self._burner_action_size]) 152 | return tf.concat([burner_state, burner_action], axis=1) 153 | 154 | def get_steamer_inputs(self, inputs): 155 | # steamer inputs contains steamer_state and steamer_action 156 | # input: (batch_size, feature_nums) 157 | steamer_state = tf.slice(inputs, [0, self._steamer_state_pos], 158 | [-1, self._steamer_state_size]) 159 | steamer_action = tf.slice(inputs, [0, self._steamer_action_pos], 160 | [-1, self._steamer_action_size]) 161 | return tf.concat([steamer_state, steamer_action], axis=1) 162 | 163 | def build(self, inputs_shape): 164 | # coaler 165 | external_input_depth = self._external_state_size 166 | coaler_input_depth = self._coaler_state_size + self._coaler_action_size 167 | self._coaler_kernel = self.add_variable( 168 | "coaler_kernel", 169 | shape=[external_input_depth + coaler_input_depth + self._coaler_num_units, 4 * self._coaler_num_units], 170 | initializer=orthogonal_lstm_initializer()) 171 | self._coaler_bias = self.add_variable( 172 | "coaler_bias", 173 | shape=[4 * self._coaler_num_units], 174 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 175 | # burner 176 | burner_input_depth = self._burner_state_size + self._burner_action_size 177 | self._burner_kernel = self.add_variable( 178 | "burner_kernel", 179 | shape=[burner_input_depth + self._burner_num_units + self._coaler_num_units, 4 * self._burner_num_units], 180 | initializer=orthogonal_lstm_initializer()) 181 | self._burner_bias = self.add_variable( 182 | "burner_bias", 183 | shape=[4 * self._burner_num_units], 184 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 185 | # steamer 186 | steamer_input_depth = self._steamer_state_size + self._steamer_action_size 187 | self._steamer_kernel = self.add_variable( 188 | "steamer_kernel", 189 | shape=[steamer_input_depth + self._steamer_num_units + self._burner_num_units, 4 * self._steamer_num_units], 190 | initializer=orthogonal_lstm_initializer()) 191 | self._steamer_bias = self.add_variable( 192 | "steamer_bias", 193 | shape=[4 * self._steamer_num_units], 194 | initializer=init_ops.zeros_initializer(dtype=self.dtype)) 195 | 196 | self.built = True 197 | 198 | def zero_state(self, batch_size, dtype): 199 | """Return zero-filled state tensor(s). 200 | 201 | Args: 202 | batch_size: int, float, or unit Tensor representing the batch size. 203 | dtype: the data type to use for the state. 204 | 205 | Returns: 206 | If `state_size` is an int or TensorShape, then the return value is a 207 | `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. 208 | 209 | If `state_size` is a nested list or tuple, then the return value is 210 | a nested list or tuple (of the same structure) of `2-D` tensors with 211 | the shapes `[batch_size, s]` for each s in `state_size`. 212 | """ 213 | # Try to use the last cached zero_state. This is done to avoid recreating 214 | # zeros, especially when eager execution is enabled. 
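        # In eager mode the cache is keyed on (state_size, batch_size, dtype); the stored
        # zero state is reused only when all three match the previous request, otherwise
        # fresh zero tensors are built below and the cached entry is replaced.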
215 | state_size = self.state_size 216 | is_eager = context.in_eager_mode() 217 | if is_eager and hasattr(self, "_last_zero_state"): 218 | (last_state_size, last_batch_size, last_dtype, 219 | last_output) = getattr(self, "_last_zero_state") 220 | if (last_batch_size == batch_size and 221 | last_dtype == dtype and 222 | last_state_size == state_size): 223 | return last_output 224 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 225 | output = _zero_state_tensors(state_size, batch_size, dtype) 226 | if is_eager: 227 | self._last_zero_state = (state_size, batch_size, dtype, output) 228 | return output 229 | 230 | def call(self, inputs, state): 231 | # inputs: (external_input, coaler_input, burner_input, steamer_input) 232 | # state: (c, h) is a 3-D tensor 233 | # c: (c_coaler, c_burner, c_steamer) 234 | # h: (h_coaler, h_burner, h_steamer) 235 | # self._state_is_tuple is True for simplicity 236 | def _should_dropout(p): 237 | return (not isinstance(p, float)) or p < 1 238 | 239 | # input dropout 240 | if _should_dropout(self._input_keep_prob): 241 | inputs = nn_ops.dropout(inputs, keep_prob=self._input_keep_prob) 242 | 243 | coaler_inputs = self.get_coaler_inputs(inputs) 244 | burner_inputs = self.get_burner_inputs(inputs) 245 | steamer_inputs = self.get_steamer_inputs(inputs) 246 | 247 | sigmoid = math_ops.sigmoid 248 | one = constant_op.constant(1, dtype=dtypes.int32) 249 | 250 | c, h = state 251 | coaler_h, burner_h, steamer_h = h 252 | coaler_c, burner_c, steamer_c = c 253 | 254 | # coal mill model 255 | with tf.variable_scope('coaler'): 256 | # inputs = self.batch_normalization(inputs, 'coal_mill_bn') 257 | coaler_gate_inputs = math_ops.matmul( 258 | array_ops.concat([coaler_inputs, coaler_h], 1), self._coaler_kernel) 259 | coaler_gate_inputs = nn_ops.bias_add(coaler_gate_inputs, self._coaler_bias) 260 | 261 | coaler_i, coaler_j, coaler_f, coaler_o = array_ops.split( 262 | value=coaler_gate_inputs, num_or_size_splits=4, axis=one) 263 | 264 | coaler_forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=coaler_f.dtype) 265 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 266 | # performance improvement. So using those at the cost of readability. 267 | add = math_ops.add 268 | multiply = math_ops.multiply 269 | coaler_new_c = add(multiply(coaler_c, sigmoid(add(coaler_f, coaler_forget_bias_tensor))), 270 | multiply(sigmoid(coaler_i), self._activation(coaler_j))) 271 | coaler_new_h = multiply(self._activation(coaler_new_c), sigmoid(coaler_o)) 272 | 273 | with tf.variable_scope('burner'): 274 | # inputs = self.batch_normalization(inputs, 'coal_mill_bn') 275 | # only dropout coaler output 276 | if _should_dropout(self._output_keep_prob): 277 | coaler_h = nn_ops.dropout(coaler_h, keep_prob=self._output_keep_prob) 278 | 279 | burner_gate_inputs = math_ops.matmul( 280 | array_ops.concat([burner_inputs, burner_h, coaler_h], 1), self._burner_kernel) 281 | burner_gate_inputs = nn_ops.bias_add(burner_gate_inputs, self._burner_bias) 282 | 283 | burner_i, burner_j, burner_f, burner_o = array_ops.split( 284 | value=burner_gate_inputs, num_or_size_splits=4, axis=one) 285 | 286 | burner_forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=burner_f.dtype) 287 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 288 | # performance improvement. So using those at the cost of readability. 
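            # The assignments below implement the standard LSTM state update for the burner:
            #   new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
            #   new_h = tanh(new_c) * sigmoid(o)
            # where the burner gates already mix burner_inputs, burner_h and the
            # (dropout-masked) coaler hidden state computed above.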
289 | add = math_ops.add 290 | multiply = math_ops.multiply 291 | burner_new_c = add(multiply(burner_c, sigmoid(add(burner_f, burner_forget_bias_tensor))), 292 | multiply(sigmoid(burner_i), self._activation(burner_j))) 293 | burner_new_h = multiply(self._activation(burner_new_c), sigmoid(burner_o)) 294 | 295 | with tf.variable_scope('steamer'): 296 | # inputs = self.batch_normalization(inputs, 'coal_mill_bn') 297 | # only dropout burner output 298 | if _should_dropout(self._output_keep_prob): 299 | burner_h = nn_ops.dropout(burner_h, keep_prob=self._output_keep_prob) 300 | 301 | steamer_gate_inputs = math_ops.matmul( 302 | array_ops.concat([steamer_inputs, steamer_h, burner_h], 1), self._steamer_kernel) 303 | steamer_gate_inputs = nn_ops.bias_add(steamer_gate_inputs, self._steamer_bias) 304 | 305 | steamer_i, steamer_j, steamer_f, steamer_o = array_ops.split( 306 | value=steamer_gate_inputs, num_or_size_splits=4, axis=one) 307 | 308 | steamer_forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=steamer_f.dtype) 309 | # Note that using `add` and `multiply` instead of `+` and `*` gives a 310 | # performance improvement. So using those at the cost of readability. 311 | add = math_ops.add 312 | multiply = math_ops.multiply 313 | steamer_new_c = add(multiply(steamer_c, sigmoid(add(steamer_f, steamer_forget_bias_tensor))), 314 | multiply(sigmoid(steamer_i), self._activation(steamer_j))) 315 | steamer_new_h = multiply(self._activation(steamer_new_c), sigmoid(steamer_o)) 316 | 317 | new_c = tuple((coaler_new_c, burner_new_c, steamer_new_c)) 318 | new_h = tuple((coaler_new_h, burner_new_h, steamer_new_h)) 319 | # concat_h = array_ops.concat([coaler_new_h, burner_new_h, steamer_new_h], axis=1) 320 | new_state = LSTMStateTuple(new_c, new_h) 321 | return new_h, new_state 322 | 323 | 324 | def orthogonal_lstm_initializer(): 325 | def orthogonal(shape, dtype=tf.float32, partition_info=None): 326 | # taken from https://github.com/cooijmanstim/recurrent-batch-normalization 327 | # taken from https://gist.github.com/kastnerkyle/f7464d98fe8ca14f2a1a 328 | """ benanne lasagne ortho init (faster than qr approach)""" 329 | flat_shape = (shape[0], np.prod(shape[1:])) 330 | a = np.random.normal(0.0, 1.0, flat_shape) 331 | u, _, v = np.linalg.svd(a, full_matrices=False) 332 | q = u if u.shape == flat_shape else v # pick the one with the correct shape 333 | q = q.reshape(shape) 334 | return tf.constant(q[:shape[0], :shape[1]], dtype) 335 | return orthogonal 336 | 337 | 338 | --------------------------------------------------------------------------------
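The cell above is written against the TensorFlow 1.x RNN API, so it can be driven by `tf.nn.dynamic_rnn` once a configuration object carrying the slice positions/sizes and `num_units` is supplied. The sketch below is illustrative only: the `cell_config` values, the feature dimension, and the `from Simulator.simrnn_cell import SimulatorRNNCell` import path are assumptions made for the example (the simulator's real configuration is defined elsewhere in the repository and will differ).

```
# Hypothetical wiring of SimulatorRNNCell into tf.nn.dynamic_rnn (illustration only).
import tensorflow as tf
from types import SimpleNamespace
from Simulator.simrnn_cell import SimulatorRNNCell

# All positions/sizes below are made up for the example; they only need to be
# mutually consistent so the slices taken in get_*_inputs stay inside the input.
cell_config = SimpleNamespace(
    external_state_pos=0,  external_state_size=11,
    coaler_state_pos=11,   coaler_state_size=16,
    coaler_action_pos=27,  coaler_action_size=10,
    burner_state_pos=37,   burner_state_size=20,
    burner_action_pos=57,  burner_action_size=25,
    steamer_state_pos=82,  steamer_state_size=11,
    steamer_action_pos=93, steamer_action_size=16,
    num_units=[64, 64, 64],   # [coaler, burner, steamer] hidden sizes
)

num_steps, feature_dim = 10, 109          # feature_dim covers the last slice (93 + 16)
inputs = tf.placeholder(tf.float32, [None, num_steps, feature_dim])

cell = SimulatorRNNCell(cell_config, keep_prob=1.0)
# outputs is a (coaler_h, burner_h, steamer_h) tuple, each of shape [batch, time, units];
# final_state is the LSTMStateTuple of per-module (c, h) tuples defined by state_size.
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
```

Because the coaler, burner and steamer hidden states are returned separately, the training code can supervise each module against its own block of observed state variables, which is the three-part loss sketched in the SimulatorRNNCell docstring.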