├── DDPG-forcasting
│   ├── __pycache__
│   │   ├── DDPG_agent.cpython-37.pyc
│   │   └── DataPreprocessing.cpython-37.pyc
│   ├── DataPreprocessing.py
│   ├── main.py
│   └── DDPG_agent.py
├── RDPG-forecasting
│   ├── __pycache__
│   │   ├── RDPG_agent.cpython-37.pyc
│   │   └── DataPreprocessing.cpython-37.pyc
│   ├── DataPreprocessing.py
│   ├── main.py
│   └── RDPG_agent.py
├── README.md
├── DQN
│   ├── test.py
│   └── DQN_agent.py
├── Q_learning
│   ├── Teasure_hunt_env.py
│   ├── test.py
│   └── Q_learning_brain.py
└── A3C-forecasting
    ├── A3C_agent.py
    └── worker.py

/DDPG-forcasting/__pycache__/DDPG_agent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/DDPG-forcasting/__pycache__/DDPG_agent.cpython-37.pyc
--------------------------------------------------------------------------------
/RDPG-forecasting/__pycache__/RDPG_agent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/RDPG-forecasting/__pycache__/RDPG_agent.cpython-37.pyc
--------------------------------------------------------------------------------
/DDPG-forcasting/__pycache__/DataPreprocessing.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/DDPG-forcasting/__pycache__/DataPreprocessing.cpython-37.pyc
--------------------------------------------------------------------------------
/RDPG-forecasting/__pycache__/DataPreprocessing.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/RDPG-forecasting/__pycache__/DataPreprocessing.cpython-37.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Time-series-forecasting-via-deep-reinforcement-learning
 2 | Time series forecasting via deep reinforcement learning.
 3 | 
 4 | Three deep reinforcement learning algorithms are deployed for time series forecasting: Asynchronous Advantage Actor-Critic (A3C), Deep Deterministic Policy Gradient (DDPG) and Recurrent Deterministic Policy Gradient (RDPG).
 5 | 
 6 | 
 7 | More details can be found in: https://doi.org/10.1016/j.enbuild.2019.109675
 8 | 
--------------------------------------------------------------------------------
/DQN/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu May 21 22:54:17 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | To test DQN using the CartPole-v0 environment from the [gym] package.
 8 | """
 9 | import gym
10 | from DQN_agent import Nature_DQN
11 | 
12 | env = gym.make('CartPole-v0')
13 | env = env.unwrapped   # Lift some restrictions (e.g. the built-in step limit)
14 | 
15 | N_FEATURES = 4
16 | N_ACTIONS = 2
17 | MAX_EPISODES = 1000
18 | MAX_STEPS = 500
19 | dqn = Nature_DQN(N_FEATURES,N_ACTIONS,16,0.003)
20 | 
21 | for episode in range(MAX_EPISODES):
22 |     s = env.reset()
23 |     for step in range(MAX_STEPS):
24 |         a = dqn.epsilon_choose_a(s)
25 |         s_,r,done,info = env.step(a)
26 |         done = 0 if done else 1   # 0 -> terminal, 1 -> non-terminal (convention used by DQN_agent)
27 |         dqn.store_transition(s,a,r,s_,done)
28 |         dqn.learn()
29 |         if (done == 0) or (step == MAX_STEPS - 1):
30 |             print('Episode %d:%d'%(episode,step+1))
31 |             break
32 |         s = s_
33 | 
34 | dqn.plot_loss()
35 | 
36 | 
--------------------------------------------------------------------------------
/Q_learning/Teasure_hunt_env.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue May 19 14:04:48 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | The environment of a one-dimensional treasure hunt game, in which both the state space
 8 | and the action space are discrete.
 9 | """
10 | 
11 | class One_D_teasure_env():
12 |     def __init__(self):
13 |         self.state_space = [0,1,2,3,4,5]
14 |         self.action_space = ['left','right']
15 | 
16 |     def reset(self):
17 |         '''
18 |         Reset the environment to the initial state.
19 |         '''
20 |         state = 0
21 |         return state
22 | 
23 | 
24 |     def step(self,state,action):
25 |         '''
26 |         Environment dynamics: given state s and action a, return next_state and reward.
27 |         '''
28 |         if action == 'right':
29 |             if state == 4:
30 |                 reward = 1
31 |                 next_state = 'terminal'
32 |             else:
33 |                 reward = 0
34 |                 next_state = state + 1
35 |         elif action == 'left':
36 |             reward = 0
37 |             if state == 0:
38 |                 next_state = state
39 |             else:
40 |                 next_state = state - 1
41 |         return next_state,reward
42 | 
43 | 
44 | 
45 | 
--------------------------------------------------------------------------------
/DDPG-forcasting/DataPreprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 25 11:00:32 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | This module loads and preprocesses the time series data.
 8 | """
 9 | 
10 | import numpy as np
11 | 
12 | 
13 | def build_s_a(sequence,n,m):
14 |     '''
15 |     Args:
16 |         sequence: Time series data
17 |         n: The number of historical points that make up the current state
18 |         m: The number of prediction steps in advance (forecasting horizon)
19 |     Return:
20 |         state_mat: A matrix containing the state at each time step
21 |         best_action: The optimal action (true future value) for each state
22 |     '''
23 |     n_rows = len(sequence)-n-m+1
24 |     state_mat = np.zeros((n_rows,n))
25 |     best_action = np.zeros(n_rows)
26 |     for i in range(n_rows):
27 |         state_mat[i] = sequence[i:(i+n)]
28 |         best_action[i] = sequence[i+n+m-1]
29 |     return state_mat,best_action
30 | 
31 | 
32 | 
33 | def normalization(traindata,testdata):
34 |     from sklearn.preprocessing import MinMaxScaler
35 |     scaler = MinMaxScaler()
36 |     scaler.fit(traindata)
37 |     traindata_scaled = scaler.transform(traindata)
38 |     testdata_scaled = scaler.transform(testdata)
39 | 
40 |     return traindata_scaled,testdata_scaled
41 | 
--------------------------------------------------------------------------------
/RDPG-forecasting/DataPreprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 25 11:00:32 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | This module loads and preprocesses the time series data.
8 | """ 9 | 10 | import numpy as np 11 | 12 | 13 | def build_s_a(sequence,n,m): 14 | ''' 15 | Args: 16 | sequence: Time series data 17 | n: The number of historical data denoting the current state 18 | m: The number of prediction steps in advance 19 | Return: 20 | state_mat: A matrix contains all states at each time step 21 | best_action: The optimal action based on each state 22 | ''' 23 | n_rows = len(sequence)-n-m+1 24 | state_mat = np.zeros((n_rows,n)) 25 | best_action = np.zeros(n_rows) 26 | for i in range(n_rows): 27 | state_mat[i] = sequence[i:(i+n)] 28 | best_action[i] = sequence[i+n+m-1] 29 | return state_mat,best_action 30 | 31 | 32 | 33 | def normalization(traindata,testdata): 34 | from sklearn.preprocessing import MinMaxScaler 35 | scaler = MinMaxScaler() 36 | scaler.fit(traindata) 37 | traindata_scaled = scaler.transform(traindata) 38 | testdata_scaled = scaler.transform(testdata) 39 | 40 | return traindata_scaled,testdata_scaled 41 | -------------------------------------------------------------------------------- /Q_learning/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 19 21:38:00 2020 4 | 5 | @author: ChefLiutao 6 | 7 | To test the effectiveness of Q-learning in 1D teasure hunt game. 8 | """ 9 | 10 | from Q_learning_brain import Tabular_q_learning 11 | from Teasure_hunt_env import One_D_teasure_env 12 | 13 | ###################################################### 14 | 15 | env = One_D_teasure_env() 16 | states = env.state_space 17 | actions = env.action_space 18 | 19 | LEARNING_RATE = 0.1 20 | EPSILON = 0.9 21 | GAMMA = 0.9 22 | MAX_EPISODES = 20 23 | MAX_STEPS = 100 24 | 25 | q_learning = Tabular_q_learning(states,actions,LEARNING_RATE,EPSILON,GAMMA) 26 | 27 | ###################################################### 28 | 29 | def rl_learn_loop(): 30 | for episode in range(MAX_EPISODES): 31 | s = env.reset() 32 | for step in range(MAX_STEPS): 33 | a = q_learning.epsilon_choose_action(s) 34 | s_,r = env.step(s,a) 35 | q_learning.learn(s,a,r,s_) 36 | 37 | if s_ == 'terminal': 38 | print('Episode %d - total steps - %d'%(episode,step+1)) 39 | break 40 | 41 | s = s_ 42 | 43 | 44 | if __name__ == '__main__': 45 | rl_learn_loop() 46 | print(q_learning.q_table) -------------------------------------------------------------------------------- /Q_learning/Q_learning_brain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 18 20:53:22 2020 4 | 5 | @author: ChefLiutao 6 | 7 | Brain of Lookup Q-learning, i.e. the brain of Q-learning RL agent. 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | 12 | class Tabular_q_learning(): 13 | def __init__(self,states,actions,learning_rate,epsilon = 0.9,gamma = 0.9): 14 | ''' 15 | Args: 16 | states: A python list of all possible states in state space. 17 | actions: A python list of all possible actions in action space. 
18 |             learning_rate: Update step size of the Q-table
19 |             epsilon: A probability that controls the trade-off between exploration and exploitation
20 |             gamma: Reward discount rate
21 |         '''
22 |         self.state_space = states
23 |         self.action_space = actions
24 |         self.n_states = len(self.state_space)
25 |         self.n_actions = len(self.action_space)
26 |         self.gamma = gamma
27 |         self.epsilon = epsilon
28 |         self.alpha = learning_rate
29 |         self.q_table = self.build_q_table()
30 | 
31 | 
32 |     def build_q_table(self):
33 |         q_table = pd.DataFrame(np.zeros([self.n_states,self.n_actions]),
34 |                                index = self.state_space,
35 |                                columns = self.action_space)
36 |         return q_table
37 | 
38 | 
39 |     def epsilon_choose_action(self,state):
40 |         state_action = self.q_table.loc[state]
41 |         if (np.random.uniform() > self.epsilon) or ((state_action == 0).all()):
42 |             action = np.random.choice(state_action.index)
43 |         else:
44 |             action = state_action.idxmax()
45 |         return action
46 | 
47 | 
48 |     def learn(self,state,action,reward,next_state):
49 |         q_current = self.q_table.loc[state,action]
50 |         q_target = reward if (next_state == 'terminal') else (
51 |                 reward + self.gamma*self.q_table.loc[next_state].max())
52 | 
53 |         self.q_table.loc[state,action] += self.alpha*(q_target - q_current)
54 | 
--------------------------------------------------------------------------------
/DDPG-forcasting/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri May 29 23:23:52 2020
 4 | 
 5 | @author: ChefLiutao
 6 | """
 7 | import pandas as pd
 8 | import numpy as np
 9 | from DataPreprocessing import normalization
10 | from DataPreprocessing import build_s_a
11 | from DDPG_agent import DDPG
12 | import matplotlib.pyplot as plt
13 | import os
14 | 
15 | ##################### hyper parameters ####################
16 | N_FEATURES = 6
17 | A_LOW = 0
18 | A_HIGH = 1
19 | LR_A = 0.001
20 | LR_C = 0.003
21 | N_ACTOR_HIDDEN = 30
22 | N_CRITIC_HIDDEN = 30
23 | MAX_EPISODES = 300
24 | MAX_STEPS = 1000
25 | 
26 | GAMMA = 0.9              # Reward discount factor
27 | TAU = 0.1                # Soft-update factor for the target networks
28 | MEMORY_CAPACITY = 100000 # Size of the experience replay buffer
29 | BATCH_SIZE = 128         # Mini-batch size for gradient descent
30 | #############################################################
31 | 
32 | #Load data
33 | data_dir = os.path.join(os.getcwd(),'V6.csv')
34 | data = pd.read_csv(data_dir,encoding = 'gbk')
35 | data = data.iloc[:,0]
36 | 
37 | #Build state matrix and best action
38 | state,action = build_s_a(data,N_FEATURES,1)
39 | 
40 | #Data split
41 | SPLIT_RATE = 0.75
42 | split_index = round(len(state)*SPLIT_RATE)
43 | train_s,train_a = state[:split_index],action[:split_index]
44 | test_s,test_a = state[split_index:],action[split_index:]
45 | 
46 | #Normalization
47 | train_s_scaled,test_s_scaled = normalization(train_s,test_s)
48 | A,B = train_a.max(),train_a.min()
49 | train_a_scaled,test_a_scaled = (train_a-B)/(A-B),(test_a-B)/(A-B)
50 | 
51 | # Training
52 | ddpg = DDPG(N_FEATURES,A_LOW,A_HIGH,LR_A,LR_C,N_ACTOR_HIDDEN,N_CRITIC_HIDDEN)
53 | for episode in range(MAX_EPISODES):
54 |     index = np.random.choice(len(train_s_scaled) - 1)  # leave room for the next state s_
55 |     s = train_s_scaled[index]
56 |     ep_reward = 0
57 | 
58 |     for step in range(MAX_STEPS):
59 |         a = ddpg.choose_action(s)
60 |         r = -abs(a-train_a_scaled[index])   # reward = negative absolute forecast error
61 |         ep_reward += r
62 |         index += 1
63 |         s_ = train_s_scaled[index]
64 | 
65 |         ddpg.store_transition(s,a,r,s_)
66 |         ddpg.learn()
67 | 
68 |         if (index == len(train_s_scaled)-1) or (step == MAX_STEPS-1):
69 |             print('Episode %d : %.2f'%(episode,ep_reward))
70 |             break
71 | 
72 |         s = s_
73 | 
74 | # Testing
75 | pred = []
76 | for i in range(len(test_s_scaled)):
77 |     state = test_s_scaled[i]
78 |     action = ddpg.choose_action(state)
79 |     pred.append(action)
80 | 
81 | pred = [pred[i][0] for i in range(len(test_s_scaled))]
82 | pred = pd.Series(pred)
83 | pred = pred*(A-B)+B    # rescale predictions back to the original units
84 | actual = pd.Series(test_a)
85 | 
86 | plt.scatter(pred,test_a,marker = '.')
87 | plt.xlabel('Predicted Value')
88 | plt.ylabel('Actual Value')
89 | plt.show()
90 | 
--------------------------------------------------------------------------------
/RDPG-forecasting/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri May 29 23:23:52 2020
 4 | 
 5 | @author: ChefLiutao
 6 | """
 7 | import pandas as pd
 8 | import numpy as np
 9 | from DataPreprocessing import normalization
10 | from DataPreprocessing import build_s_a
11 | from RDPG_agent import RDPG
12 | import matplotlib.pyplot as plt
13 | import os
14 | 
15 | 
16 | ##################### hyper parameters ####################
17 | N_FEATURES = 6
18 | A_LOW = 0
19 | A_HIGH = 1
20 | LR_A = 0.001
21 | LR_C = 0.003
22 | N_ACTOR_HIDDEN = 30
23 | N_CRITIC_HIDDEN = 30
24 | MAX_EPISODES = 100
25 | MAX_STEPS = 1000
26 | 
27 | GAMMA = 0.9              # Reward discount factor
28 | TAU = 0.1                # Soft-update factor for the target networks
29 | MEMORY_CAPACITY = 100000 # Size of the experience replay buffer
30 | BATCH_SIZE = 128         # Mini-batch size for gradient descent
31 | #############################################################
32 | 
33 | #Load data
34 | data_dir = os.path.join(os.getcwd(),'V6.csv')  # path of the time series data
35 | data = pd.read_csv(data_dir,encoding = 'gbk')
36 | data = data.iloc[:,0]
37 | 
38 | #Build state matrix and best action
39 | state,action = build_s_a(data,N_FEATURES,1)
40 | 
41 | #Data split
42 | SPLIT_RATE = 0.75
43 | split_index = round(len(state)*SPLIT_RATE)
44 | train_s,train_a = state[:split_index],action[:split_index]
45 | test_s,test_a = state[split_index:],action[split_index:]
46 | 
47 | #Normalization
48 | train_s_scaled,test_s_scaled = normalization(train_s,test_s)
49 | A,B = train_a.max(),train_a.min()
50 | train_a_scaled,test_a_scaled = (train_a-B)/(A-B),(test_a-B)/(A-B)
51 | 
52 | # Training
53 | rdpg = RDPG(N_FEATURES,A_LOW,A_HIGH,LR_A,LR_C,N_ACTOR_HIDDEN,N_CRITIC_HIDDEN)
54 | for episode in range(MAX_EPISODES):
55 |     index = np.random.choice(len(train_s_scaled) - 1)  # leave room for the next state s_
56 |     s = train_s_scaled[index]
57 |     ep_reward = 0
58 | 
59 |     for step in range(MAX_STEPS):
60 |         a = rdpg.choose_action(s)
61 |         r = -abs(a-train_a_scaled[index])   # reward = negative absolute forecast error
62 |         ep_reward += r
63 |         index += 1
64 |         s_ = train_s_scaled[index]
65 | 
66 |         rdpg.store_transition(s,a,r,s_)
67 |         rdpg.learn()
68 | 
69 |         if (index == len(train_s_scaled)-1) or (step == MAX_STEPS-1):
70 |             print('Episode %d : %.2f'%(episode,ep_reward))
71 |             break
72 | 
73 |         s = s_
74 | 
75 | # Testing
76 | pred = []
77 | for i in range(len(test_s_scaled)):
78 |     state = test_s_scaled[i]
79 |     action = rdpg.choose_action(state)
80 |     pred.append(action)
81 | 
82 | pred = [pred[i][0] for i in range(len(test_s_scaled))]
83 | pred = pd.Series(pred)
84 | pred = pred*(A-B)+B    # rescale predictions back to the original units
85 | actual = pd.Series(test_a)
86 | 
87 | plt.scatter(pred,test_a,marker = '.')
88 | plt.xlabel('Predicted Value')
89 | plt.ylabel('Actual Value')
90 | plt.show()
91 | 
--------------------------------------------------------------------------------
/A3C-forecasting/A3C_agent.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Jun 8 23:01:56 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | The agent of the DRL algorithm Asynchronous Advantage Actor-Critic (A3C).
8 | """ 9 | import multiprocessing 10 | import threading 11 | import tensorflow as tf 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import pandas as pd 15 | import time 16 | 17 | #------------------- HyperParameters -------------------- 18 | 19 | 20 | class ACNet(): 21 | ''' 22 | 建立网络,可以被调用生成一个Global Net,也能被调用生成一个worker Net 23 | ''' 24 | def __init__(self, scope, N_S,N_A,A_BOUND,globalAC=None): 25 | 26 | self.N_S = N_S 27 | self.N_A = N_A 28 | self.A_BOUND = A_BOUND 29 | 30 | if scope == 'GLOBAL_NET': # get global network 31 | with tf.variable_scope(scope): 32 | self.s = tf.placeholder(tf.float32, [None, self.N_S], 'S') 33 | self.a_params, self.c_params = self._build_net(scope)[-2:] 34 | else: # local net, calculate losses 35 | with tf.variable_scope(scope): 36 | self.s = tf.placeholder(tf.float32, [None, self.N_S], 'S') 37 | self.a_his = tf.placeholder(tf.float32, [None, self.N_A], 'A') 38 | self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') 39 | 40 | mu, sigma, self.v, self.a_params, self.c_params = self._build_net(scope) 41 | 42 | td = tf.subtract(self.v_target, self.v, name='TD_error') 43 | with tf.name_scope('c_loss'): 44 | self.c_loss = tf.reduce_mean(tf.square(td)) 45 | 46 | with tf.name_scope('wrap_a_out'): 47 | mu, sigma = mu * A_BOUND[1], sigma + 1e-4 48 | 49 | normal_dist = tf.distributions.Normal(mu, sigma) 50 | 51 | with tf.name_scope('a_loss'): 52 | log_prob = normal_dist.log_prob(self.a_his) 53 | exp_v = log_prob * tf.stop_gradient(td) 54 | entropy = normal_dist.entropy() # encourage exploration 55 | self.exp_v = ENTROPY_BETA * entropy + exp_v 56 | self.a_loss = tf.reduce_mean(-self.exp_v) 57 | 58 | with tf.name_scope('choose_a'): # use local params to choose action 59 | self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=[0, 1]), A_BOUND[0], A_BOUND[1]) 60 | with tf.name_scope('local_grad'): 61 | self.a_grads = tf.gradients(self.a_loss, self.a_params) 62 | self.c_grads = tf.gradients(self.c_loss, self.c_params) 63 | 64 | with tf.name_scope('sync'): 65 | with tf.name_scope('pull'): 66 | self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] 67 | self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] 68 | with tf.name_scope('push'): 69 | self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) 70 | self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) 71 | 72 | def _build_net(self, scope): 73 | w_init = tf.random_normal_initializer(0., .1) 74 | with tf.variable_scope('actor'): 75 | l_a = tf.layers.dense(self.s, 64, tf.nn.relu6, kernel_initializer=w_init, name='la') 76 | mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') 77 | sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') 78 | with tf.variable_scope('critic'): 79 | l_c = tf.layers.dense(self.s, 64, tf.nn.relu6, kernel_initializer=w_init, name='lc') 80 | v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value 81 | a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') 82 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') 83 | return mu, sigma, v, a_params, c_params 84 | 85 | def update_global(self, feed_dict): # run by a local 86 | SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net 87 | 88 | def pull_global(self): # run by a local 89 | 
SESS.run([self.pull_a_params_op, self.pull_c_params_op]) 90 | 91 | def choose_action(self, s): # run by a local 92 | s = s[np.newaxis, :] 93 | return SESS.run(self.A, {self.s: s}) 94 | -------------------------------------------------------------------------------- /A3C-forecasting/worker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jun 8 23:20:16 2020 4 | 5 | @author: ChefLiutao 6 | """ 7 | 8 | class Worker(): 9 | def __init__(self, name, globalAC): 10 | self.name = name 11 | self.AC = ACNet(name, globalAC) 12 | 13 | def work(self): 14 | global GLOBAL_RUNNING_R, GLOBAL_EP 15 | total_step = 1 16 | buffer_s, buffer_a, buffer_r = [], [], [] 17 | while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: 18 | state_index = np.random.choice(range(len(train_state_mat))) 19 | s = train_state_mat[state_index] 20 | ep_r = 0 21 | for ep_t in range(MAX_EP_STEP): 22 | # if self.name == 'W_0': 23 | # self.env.render() 24 | a = self.AC.choose_action(s) 25 | 26 | s_ = train_state_mat[state_index+1] 27 | r = -abs(train_best_action[state_index]-a) 28 | if ep_t == MAX_EP_STEP -1 or state_index == len(train_state_mat)-2: 29 | done = True 30 | else: 31 | done = False 32 | 33 | ep_r += r 34 | buffer_s.append(s) 35 | buffer_a.append(a) 36 | buffer_r.append(r) 37 | # buffer_r.append((r+8)/8) # normalize 38 | 39 | if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net 40 | if done: 41 | v_s_ = 0 # terminal 42 | else: 43 | v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] 44 | buffer_v_target = [] 45 | for r in buffer_r[::-1]: # reverse buffer r 46 | v_s_ = r + GAMMA * v_s_ 47 | buffer_v_target.append(v_s_) 48 | buffer_v_target.reverse() 49 | 50 | buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) 51 | feed_dict = { 52 | self.AC.s: buffer_s, 53 | self.AC.a_his: buffer_a, 54 | self.AC.v_target: buffer_v_target, 55 | } 56 | self.AC.update_global(feed_dict) 57 | buffer_s, buffer_a, buffer_r = [], [], [] 58 | self.AC.pull_global() #把中央大脑最新的参数拉下来 59 | 60 | s = s_ 61 | state_index += 1 62 | total_step += 1 63 | 64 | #完成一个episode,查看训练效果 65 | # if done: 66 | # self.AC.pull_global() 67 | 68 | #训练集 69 | # pred = [] 70 | # for i_pred in range(len(train_state_mat)): 71 | # state = train_state_mat[i_pred] 72 | # action = self.AC.choose_action(state) 73 | # pred.append(action) 74 | # pred = [pred[i][0] for i in range(len(train_state_mat))] 75 | # pred = pd.Series(pred) 76 | # pred = pred*(B-A)+A 77 | # actual = train_best_action*(B-A)+A #反归一化 78 | # MAE = np.mean(abs(pred-actual)) #MAE 79 | # RMSE = (np.sum((pred-actual)**2)/len(pred))**0.5 80 | # print('MAE-1: %.2f' %MAE,'RMSE-1: %.2f' %RMSE) 81 | # mae_episode.append(MAE) 82 | # rmse_episode.append(RMSE) 83 | 84 | #测试集 85 | # pred = [] 86 | # for i_pred in range(len(test_state_mat)): 87 | # state = test_state_mat[i_pred] 88 | # action = self.AC.choose_action(state) 89 | # pred.append(action) 90 | # pred = [pred[i][0] for i in range(len(test_state_mat))] 91 | # pred = pd.Series(pred) 92 | # pred = pred*(B-A)+A 93 | # actual = test_best_action*(B-A)+A #反归一化 94 | # MAE = np.mean(abs(pred-actual)) #MAE 95 | # RMSE = (np.sum((pred-actual)**2)/len(pred))**0.5 96 | # print('MAE: %.2f' %MAE,'RMSE: %.2f' %RMSE) 97 | 98 | 99 | #完成一个episode 100 | if done: 101 | if len(GLOBAL_RUNNING_R) == 0: # record running episode reward 102 | GLOBAL_RUNNING_R.append(ep_r) 103 | else: 104 | 
GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) 105 | print( 106 | self.name, 107 | "Ep:", GLOBAL_EP, 108 | "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], 109 | ) 110 | GLOBAL_EP += 1 111 | break -------------------------------------------------------------------------------- /DQN/DQN_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 20 15:05:29 2020 4 | 5 | @author: ChefLiutao 6 | """ 7 | import tensorflow as tf 8 | import numpy as np 9 | from collections import deque 10 | import random 11 | 12 | class Nature_DQN(): 13 | def __init__( 14 | self, 15 | n_features, 16 | n_actions, 17 | n_hidden, 18 | learning_rate, 19 | epsilon = 0.9, 20 | gamma = 0.9, 21 | memory_size = 1000, 22 | batch_size = 128, 23 | epsilon_increment = 0.0001): 24 | 25 | self.n_features = n_features #dimension of state 26 | # self.actions = actions #all possible actions 27 | self.n_actions = n_actions #dimension of action space 28 | self.n_hidden = n_hidden #hidden neurons of Q network 29 | self.lr = learning_rate #for Current Q network update 30 | self.epsilon = epsilon #e-greed 31 | self.gamma = gamma #reward discount rate 32 | self.memory_size = memory_size 33 | self.memory = deque(maxlen = memory_size) 34 | self.batch_size = batch_size 35 | self.epsilon_increment = epsilon_increment 36 | self.optimizer = tf.train.AdamOptimizer(self.lr) 37 | self.loss_history = [] 38 | self.learn_step_counter = 0 39 | 40 | self.s = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 41 | self.a = tf.placeholder(tf.int32,shape = [None,]) 42 | self.s_ = tf.placeholder(tf.float32,shape = [None,self.n_features]) 43 | self.r = tf.placeholder(tf.float32,shape = [None,]) 44 | self.done = tf.placeholder(tf.float32,shape = [None,]) #denote if the s_ is terminal:0→terminal,1→non-terminal 45 | 46 | self.q_current = self.build_current_net() # self.s → self.q_current; self.s_ → self.q_next 47 | self.q_next = self.build_target_net() 48 | 49 | self.current_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 50 | scope = 'current_net') 51 | self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 52 | scope = 'target_net') 53 | 54 | self.params_replace = [tf.assign(t,e) for t,e in zip(self.target_params,self.current_params)] 55 | 56 | self.q_sa = tf.gather_nd(self.q_current,indices = tf.stack( 57 | [tf.range(tf.shape(self.a)[0],dtype = tf.int32),self.a], 58 | axis = 1)) #[tf.shape(self.a)[0],] 59 | 60 | self.q_s_a_ = tf.reduce_max(self.q_next,axis = 1) 61 | 62 | with tf.variable_scope('loss'): 63 | self.q_target = (self.r + self.gamma*self.q_s_a_)*self.done 64 | self.loss = tf.reduce_mean(tf.square(self.q_target - self.q_sa)) 65 | self.train_op = self.optimizer.minimize(self.loss) 66 | 67 | self.sess = tf.Session() 68 | self.sess.run(tf.global_variables_initializer()) 69 | 70 | def build_current_net(self): 71 | ''' 72 | Build Two Q neworks:current Q network and target Q network. 73 | Target Q network is respnsible for calculating Q(s_). 
74 | Current Q network is responsible to estimate Q(s) 75 | ''' 76 | # Current Q Network ------------------------------------------ 77 | # self.s = tf.placeholder(shape = [None,self.n_features]) 78 | # self.q_target = tf.placeholder(shape = [None,len(self.actions)]) 79 | 80 | with tf.variable_scope('current_net'): 81 | w_init = tf.random_normal_initializer(0,0.1) 82 | b_init = tf.constant_initializer(0.1) 83 | w1 = tf.get_variable(name = 'w1', 84 | shape = [self.n_features,self.n_hidden], 85 | initializer = w_init, 86 | trainable = True) 87 | b1 = tf.get_variable(name = 'b1', 88 | shape = [self.n_hidden], 89 | initializer = b_init, 90 | trainable = True) 91 | 92 | # tf.add_to_collection('current_params',w1) 93 | # tf.add_to_collection('current_params',b1) 94 | 95 | hidden = tf.nn.relu(tf.matmul(self.s,w1) + b1) 96 | 97 | w2 = tf.get_variable(name = 'w2', 98 | shape = [self.n_hidden,self.n_actions], 99 | initializer = w_init, 100 | trainable = True) 101 | b2 = tf.get_variable(name = 'b2', 102 | shape = [self.n_actions], 103 | initializer = b_init, 104 | trainable = True) 105 | # tf.add_to_collection('current_params',w2) 106 | # tf.add_to_collection('current_params',b2) 107 | 108 | self.q_current = tf.matmul(hidden,w2) + b2 109 | return self.q_current 110 | 111 | def build_target_net(self): 112 | w_init = tf.random_normal_initializer(0,0.1) 113 | b_init = tf.constant_initializer(0.1) 114 | # Target Q Network ------------------------------------------------ 115 | # self.s_ = tf.placeholder(shape = [None,self.n_features]) 116 | with tf.variable_scope('target_net'): 117 | w1 = tf.get_variable(name = 'w1', 118 | shape = [self.n_features,self.n_hidden], 119 | initializer = w_init, 120 | trainable = False) 121 | b1 = tf.get_variable(name = 'b1', 122 | shape = [self.n_hidden], 123 | initializer = b_init, 124 | trainable = False) 125 | # tf.add_to_collection('target_params',w1) 126 | # tf.add_to_collection('target_params',b1) 127 | hidden = tf.nn.relu(tf.matmul(self.s_,w1) + b1) 128 | 129 | w2 = tf.get_variable(name = 'w2', 130 | shape = [self.n_hidden,self.n_actions], 131 | initializer = w_init, 132 | trainable = False) 133 | b2 = tf.get_variable(name = 'b2', 134 | shape = [self.n_actions], 135 | initializer = b_init, 136 | trainable = False) 137 | # tf.add_to_collection('target_params',w2) 138 | # tf.add_to_collection('target_params',b2) 139 | 140 | self.q_next = tf.matmul(hidden,w2) + b2 141 | return self.q_next 142 | 143 | 144 | def epsilon_choose_a(self,state): 145 | state = np.reshape(state,[-1,self.n_features]) 146 | if np.random.uniform() < self.epsilon: 147 | state_action = self.sess.run(self.q_current, 148 | feed_dict = {self.s:state}) 149 | action = np.argmax(state_action) 150 | else: 151 | action = np.random.choice(np.arange(self.n_actions)) 152 | return action 153 | 154 | 155 | def store_transition(self,state,action,reward,next_state,is_done): 156 | state,next_state = state[np.newaxis,:],next_state[np.newaxis,:] 157 | action,reward,is_done = np.array(action),np.array(reward),np.array(is_done) 158 | action = action.reshape([1,1]) 159 | reward = reward.reshape([1,1]) 160 | is_done = is_done.reshape([1,1]) 161 | 162 | transition = np.concatenate((state,action,reward,next_state,is_done),axis = 1) 163 | self.memory.append(transition[0,:]) 164 | 165 | 166 | def learn(self): 167 | if len(self.memory) == self.memory_size: 168 | if self.learn_step_counter % 500 == 0: 169 | self.sess.run(self.params_replace) 170 | self.learn_step_counter += 1 171 | 172 | batch = 
np.array(random.sample(self.memory,self.batch_size)) 173 | batch_s = batch[:,:self.n_features] 174 | batch_a = batch[:,self.n_features:(self.n_features + 1)][:,0] 175 | batch_r = batch[:,(self.n_features + 1):(self.n_features + 2)][:,0] 176 | batch_s_ = batch[:,(self.n_features + 2):(self.n_features*2 + 2)] 177 | batch_done = batch[:,-1] 178 | train_op,loss = self.sess.run((self.train_op,self.loss), 179 | feed_dict = {self.s:batch_s,self.a:batch_a, 180 | self.s_:batch_s_,self.r:batch_r, 181 | self.done:batch_done}) 182 | self.loss_history.append(loss) 183 | 184 | self.epsilon = self.epsilon + self.epsilon_increment if ( 185 | self.epsilon + self.epsilon_increment) < 1. else 1. 186 | 187 | 188 | def plot_loss(self): 189 | import matplotlib.pyplot as plt 190 | plt.plot(range(len(self.loss_history)),self.loss_history,'-') 191 | -------------------------------------------------------------------------------- /RDPG-forecasting/RDPG_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 30 20:52:21 2020 4 | 5 | @author: ChefLiutao 6 | 7 | The agent of RL algorithm Recurrent Detrministic Policy Gradient. 8 | 9 | The Actor NNs are deployed as three-layer Fully-Connected NN. 10 | 11 | The Critic NNs are deployed as RNN. 12 | """ 13 | import tensorflow as tf 14 | import numpy as np 15 | from collections import deque 16 | import random 17 | 18 | class RDPG(): 19 | def __init__(self, 20 | n_features, 21 | # n_actions, 22 | a_low, 23 | a_high, 24 | learning_rate_actor, 25 | learning_rate_critic, 26 | n_actor_hidden, 27 | n_critic_hidden, 28 | gamma = 0.9, 29 | noise_varience = 3, 30 | soft_replace = 0.1, 31 | memory_size = 1000, 32 | batch_size = 128): 33 | self.n_features = n_features #dimension of states 34 | # self.n_actions = n_actions 35 | self.a_low = a_low #The low bound of action sapce 36 | self.a_high = a_high #The high bound of action space 37 | self.lr_a = learning_rate_actor #Learning rate of Actor NN 38 | self.lr_c = learning_rate_critic #Learning rate of Critic NN 39 | self.n_actor_hidden = n_actor_hidden #Number of hidden layer neurons in Actor 40 | self.n_critic_cells = n_critic_hidden #Number of hidden layer neurons in Critic 41 | self.gamma = gamma #Reward discount rate 42 | self.noise_var = noise_varience #Variance of output action distribution 43 | self.soft_replace = soft_replace #Update speed of target networks 44 | self.memory_size = memory_size #Size of experience replay buffer 45 | self.memory = deque(maxlen = self.memory_size) #Experience replay buffer 46 | self.batch_size = batch_size 47 | 48 | self.s = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 49 | self.s_ = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 50 | self.r = tf.placeholder(dtype = tf.float32,shape = [None,]) 51 | self.done = tf.placeholder(dtype = tf.float32,shape = [None,]) # 0 if s_ == terminal else 1 52 | 53 | self.a = self.build_Actor1() 54 | self.a_ = self.build_Actor2() 55 | self.q_sa = self.build_Critic1() #shape:[None,] 56 | self.q_s_a_ = self.build_Critic2() #shape:[None,] 57 | 58 | self.curr_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 59 | scope = 'Actor/Current') 60 | self.targ_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 61 | scope = 'Actor/Target') 62 | self.curr_c_params= tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 63 | scope = 'Critic/Current') 64 | self.targ_c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 65 | scope 
= 'Critic/Target') 66 | 67 | # Soft replace of Targets NN parameters 68 | self.replace_a_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 69 | for (t,e) in zip(self.targ_a_params,self.curr_a_params)] 70 | self.replace_c_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 71 | for (t,e) in zip(self.targ_c_params,self.curr_c_params)] 72 | 73 | self.td_error = self.r + self.gamma*self.q_s_a_ - self.q_sa 74 | self.critic_loss = tf.reduce_mean(tf.square(self.td_error)) 75 | self.actor_loss = -tf.reduce_mean(self.q_sa) 76 | 77 | self.actor_train_op = tf.train.AdamOptimizer(self.lr_a).minimize(self.actor_loss, 78 | var_list = self.curr_a_params) 79 | self.critic_train_op = tf.train.AdamOptimizer(self.lr_c).minimize(self.critic_loss, 80 | var_list = self.curr_c_params) 81 | 82 | self.learn_step_counter = 0 83 | self.sess = tf.Session() 84 | self.sess.run(tf.global_variables_initializer()) 85 | 86 | 87 | def build_Actor1(self): 88 | ''' 89 | Building Current Actor network. 90 | ''' 91 | with tf.variable_scope('Actor/Current'): 92 | w_init = tf.random_normal_initializer(0,0.1) 93 | b_init = tf.constant_initializer(0.1) 94 | w1 = tf.get_variable(name = 'w1',shape = [self.n_features,self.n_actor_hidden], 95 | dtype = tf.float32,initializer = w_init, 96 | trainable = True) 97 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 98 | dtype = tf.float32,initializer = b_init, 99 | trainable = True) 100 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 101 | dtype = tf.float32,initializer = w_init, 102 | trainable = True) 103 | b2 = tf.get_variable('b2',shape = [1,], 104 | dtype = tf.float32,initializer = b_init, 105 | trainable = True) 106 | hidden = tf.nn.relu(tf.matmul(self.s,w1) + b1) 107 | a = tf.matmul(hidden,w2) + b2 108 | return a[:,0] 109 | # return np.clip(np.random.normal(a,self.noise_var),self.a_low,self.a_high) 110 | 111 | def build_Actor2(self): 112 | ''' 113 | Building Target Actor network. 114 | ''' 115 | with tf.variable_scope('Actor/Target'): 116 | w_init = tf.random_normal_initializer(0,0.1) 117 | b_init = tf.constant_initializer(0.1) 118 | w1 = tf.get_variable('w1',shape = [self.n_features,self.n_actor_hidden], 119 | dtype = tf.float32,initializer = w_init, 120 | trainable = False) 121 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 122 | dtype = tf.float32,initializer = b_init, 123 | trainable = False) 124 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 125 | dtype = tf.float32,initializer = w_init, 126 | trainable = False) 127 | b2 = tf.get_variable('b2',shape = [1,], 128 | dtype = tf.float32,initializer = b_init, 129 | trainable = False) 130 | hidden = tf.nn.relu(tf.matmul(self.s_,w1) + b1) 131 | a_ = tf.matmul(hidden,w2) + b2 132 | return a_[:,0] 133 | 134 | def build_Critic1(self): 135 | ''' 136 | Building Current Critic network. 
137 | ''' 138 | with tf.variable_scope('Critic/Current'): 139 | w_init = tf.random_normal_initializer(0,0.1) 140 | b_init = tf.constant_initializer(0.1) 141 | 142 | rnn_cell = tf.contrib.rnn.BasicRNNCell(self.n_critic_cells) 143 | self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float64) 144 | s = tf.cast(tf.expand_dims(self.s,axis = 1),tf.float64) 145 | 146 | outputs, self.final_state = tf.nn.dynamic_rnn( 147 | cell = rnn_cell, inputs = s, 148 | initial_state = self.init_state, time_major = True) 149 | cell_out = tf.cast(tf.reshape(outputs, [-1, self.n_critic_cells]),tf.float32) 150 | 151 | a_out = tf.layers.dense(self.a[:,np.newaxis],self.n_critic_cells,trainable = True) 152 | q_sa = tf.layers.dense(cell_out + a_out,1,tf.nn.relu, 153 | kernel_initializer = w_init, 154 | bias_initializer = b_init,trainable = True) 155 | 156 | return q_sa[:,0] 157 | 158 | 159 | def build_Critic2(self): 160 | ''' 161 | Building Target Critic network. 162 | ''' 163 | with tf.variable_scope('Critic/Target'): 164 | w_init = tf.random_normal_initializer(0,0.1) 165 | b_init = tf.constant_initializer(0.1) 166 | 167 | rnn_cell = tf.contrib.rnn.BasicRNNCell(self.n_critic_cells) 168 | self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float64) 169 | s_ = tf.cast(tf.expand_dims(self.s_,axis = 1),tf.float64) 170 | 171 | outputs, self.final_state = tf.nn.dynamic_rnn( 172 | cell = rnn_cell, inputs = s_, 173 | initial_state = self.init_state, time_major = True) 174 | cell_out = tf.cast(tf.reshape(outputs, [-1, self.n_critic_cells]),tf.float32) 175 | 176 | a_out = tf.layers.dense(self.a_[:,np.newaxis],self.n_critic_cells,trainable = False) 177 | q_s_a_ = tf.layers.dense(cell_out + a_out,1,tf.nn.relu, 178 | kernel_initializer = w_init, 179 | bias_initializer = b_init,trainable = False) 180 | 181 | return q_s_a_[:,0] 182 | 183 | def choose_action(self,state): 184 | state = np.reshape(state,[-1,self.n_features]) 185 | action = self.sess.run(self.a,feed_dict = {self.s:state}) 186 | return action 187 | 188 | def store_transition(self,state,action,reward,next_state): 189 | state,next_state = state[np.newaxis,:],next_state[np.newaxis,:] 190 | action,reward = np.array(action),np.array(reward) 191 | action = np.reshape(action,[1,-1]) 192 | reward = np.reshape(reward,[1,-1]) 193 | # is_done = np.reshape(is_done,[1,-1]) 194 | 195 | transition = np.concatenate((state,action,reward,next_state),axis = 1) 196 | self.memory.append(transition[0,:]) 197 | 198 | def learn(self): 199 | if len(self.memory) == self.memory_size: 200 | if self.learn_step_counter % 200 == 0: 201 | self.sess.run((self.replace_a_params,self.replace_c_params)) 202 | 203 | self.noise_var *= 0.999 204 | 205 | batch = np.array(random.sample(self.memory,self.batch_size)) 206 | batch_s = batch[:,:self.n_features] 207 | batch_a = batch[:,self.n_features:(self.n_features + 1)][:,0] 208 | batch_r = batch[:,(self.n_features + 1):(self.n_features + 2)][:,0] 209 | batch_s_ = batch[:,(self.n_features + 2):(self.n_features*2 + 2)] 210 | 211 | self.sess.run(self.actor_train_op,feed_dict = {self.s:batch_s}) 212 | self.sess.run(self.critic_train_op,feed_dict = {self.s:batch_s, 213 | self.a:batch_a, 214 | self.s_:batch_s_, 215 | self.r:batch_r}) 216 | 217 | if __name__ == '__main__': 218 | rdpg = RDPG(5,0,1,0.03,0.01,30,30) -------------------------------------------------------------------------------- /DDPG-forcasting/DDPG_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created 
on Sun May 24 18:32:39 2020 4 | 5 | @author: ChefLiutao 6 | 7 | The agent of RL algorithm Deep Detrministic Policy Gradient. 8 | 9 | Both the Actor and Critic neuron networks adopt three-layer Fully-Connected NN. 10 | """ 11 | import tensorflow as tf 12 | import numpy as np 13 | from collections import deque 14 | import random 15 | 16 | class DDPG(): 17 | def __init__(self, 18 | n_features, 19 | # n_actions, 20 | a_low, 21 | a_high, 22 | learning_rate_actor, 23 | learning_rate_critic, 24 | n_actor_hidden, 25 | n_critic_hidden, 26 | gamma = 0.9, 27 | noise_varience = 3, 28 | soft_replace = 0.1, 29 | memory_size = 1000, 30 | batch_size = 128): 31 | self.n_features = n_features #dimension of states 32 | # self.n_actions = n_actions 33 | self.a_low = a_low #The low bound of action sapce 34 | self.a_high = a_high #The high bound of action space 35 | self.lr_a = learning_rate_actor #Learning rate of Actor NN 36 | self.lr_c = learning_rate_critic #Learning rate of Critic NN 37 | self.n_actor_hidden = n_actor_hidden #Number of hidden layer neurons in Actor 38 | self.n_critic_hidden = n_critic_hidden #Number of hidden layer neurons in Critic 39 | self.gamma = gamma #Reward discount rate 40 | self.noise_var = noise_varience #Variance of output action distribution 41 | self.soft_replace = soft_replace #Update speed of target networks 42 | self.memory_size = memory_size #Size of experience replay buffer 43 | self.memory = deque(maxlen = self.memory_size) #Experience replay buffer 44 | self.batch_size = batch_size 45 | 46 | self.s = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 47 | self.s_ = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 48 | self.r = tf.placeholder(dtype = tf.float32,shape = [None,]) 49 | self.done = tf.placeholder(dtype = tf.float32,shape = [None,]) # 0 if s_ == terminal else 1 50 | 51 | self.a = self.build_Actor1() 52 | self.a_ = self.build_Actor2() 53 | self.q_sa = self.build_Critic1() #shape:[None,] 54 | self.q_s_a_ = self.build_Critic2() #shape:[None,] 55 | 56 | self.curr_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 57 | scope = 'Actor/Current') 58 | self.targ_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 59 | scope = 'Actor/Target') 60 | self.curr_c_params= tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 61 | scope = 'Critic/Current') 62 | self.targ_c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 63 | scope = 'Critic/Target') 64 | 65 | # Soft replace of Targets NN parameters 66 | self.replace_a_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 67 | for (t,e) in zip(self.targ_a_params,self.curr_a_params)] 68 | self.replace_c_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 69 | for (t,e) in zip(self.targ_c_params,self.curr_c_params)] 70 | 71 | self.td_error = self.r + self.gamma*self.q_s_a_ - self.q_sa 72 | self.critic_loss = tf.reduce_mean(tf.square(self.td_error)) 73 | self.actor_loss = -tf.reduce_mean(self.q_sa) 74 | 75 | self.actor_train_op = tf.train.AdamOptimizer(self.lr_a).minimize(self.actor_loss, 76 | var_list = self.curr_a_params) 77 | self.critic_train_op = tf.train.AdamOptimizer(self.lr_c).minimize(self.critic_loss, 78 | var_list = self.curr_c_params) 79 | 80 | self.learn_step_counter = 0 81 | self.sess = tf.Session() 82 | self.sess.run(tf.global_variables_initializer()) 83 | 84 | 85 | def build_Actor1(self): 86 | ''' 87 | Building Current Actor network. 
88 | ''' 89 | with tf.variable_scope('Actor/Current'): 90 | w_init = tf.random_normal_initializer(0,0.1) 91 | b_init = tf.constant_initializer(0.1) 92 | w1 = tf.get_variable(name = 'w1',shape = [self.n_features,self.n_actor_hidden], 93 | dtype = tf.float32,initializer = w_init, 94 | trainable = True) 95 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 96 | dtype = tf.float32,initializer = b_init, 97 | trainable = True) 98 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 99 | dtype = tf.float32,initializer = w_init, 100 | trainable = True) 101 | b2 = tf.get_variable('b2',shape = [1,], 102 | dtype = tf.float32,initializer = b_init, 103 | trainable = True) 104 | hidden = tf.nn.relu(tf.matmul(self.s,w1) + b1) 105 | a = tf.matmul(hidden,w2) + b2 106 | return a[:,0] 107 | # return np.clip(np.random.normal(a,self.noise_var),self.a_low,self.a_high) 108 | 109 | def build_Actor2(self): 110 | ''' 111 | Building Target Actor network. 112 | ''' 113 | with tf.variable_scope('Actor/Target'): 114 | w_init = tf.random_normal_initializer(0,0.1) 115 | b_init = tf.constant_initializer(0.1) 116 | w1 = tf.get_variable('w1',shape = [self.n_features,self.n_actor_hidden], 117 | dtype = tf.float32,initializer = w_init, 118 | trainable = False) 119 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 120 | dtype = tf.float32,initializer = b_init, 121 | trainable = False) 122 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 123 | dtype = tf.float32,initializer = w_init, 124 | trainable = False) 125 | b2 = tf.get_variable('b2',shape = [1,], 126 | dtype = tf.float32,initializer = b_init, 127 | trainable = False) 128 | hidden = tf.nn.relu(tf.matmul(self.s_,w1) + b1) 129 | a_ = tf.matmul(hidden,w2) + b2 130 | return a_[:,0] 131 | 132 | def build_Critic1(self): 133 | ''' 134 | Building Current Critic network. 135 | ''' 136 | with tf.variable_scope('Critic/Current'): 137 | w_init = tf.random_normal_initializer(0,0.1) 138 | b_init = tf.constant_initializer(0.1) 139 | w1_s = tf.get_variable('w1_s',shape = [self.n_features,self.n_critic_hidden], 140 | dtype = tf.float32,initializer = w_init, 141 | trainable = True) 142 | w1_a = tf.get_variable('w1_a',shape = [1,self.n_critic_hidden], 143 | dtype = tf.float32,initializer = w_init, 144 | trainable = True) 145 | b1 = tf.get_variable('b1',shape = [self.n_critic_hidden,], 146 | dtype = tf.float32,initializer = b_init, 147 | trainable = True) 148 | w2 = tf.get_variable('w2',shape = [self.n_critic_hidden,1], 149 | dtype = tf.float32,initializer = w_init, 150 | trainable = True) 151 | b2 = tf.get_variable('b2',shape = [1,],dtype = tf.float32, 152 | initializer = b_init,trainable = True) 153 | hidden = tf.nn.relu(tf.matmul(self.s,w1_s) + tf.matmul(self.a[:,np.newaxis],w1_a) + b1) 154 | q_sa = tf.matmul(hidden,w2) + b2 155 | return q_sa[:,0] 156 | 157 | def build_Critic2(self): 158 | ''' 159 | Building Target Critic network. 
160 | ''' 161 | with tf.variable_scope('Critic/Target'): 162 | w_init = tf.random_normal_initializer(0,0.1) 163 | b_init = tf.constant_initializer(0.1) 164 | w1_s = tf.get_variable('w1_s',shape = [self.n_features,self.n_critic_hidden], 165 | dtype = tf.float32,initializer = w_init, 166 | trainable = False) 167 | w1_a = tf.get_variable('w1_a',shape = [1,self.n_critic_hidden], 168 | dtype = tf.float32,initializer = w_init, 169 | trainable = False) 170 | b1 = tf.get_variable('b1',shape = [self.n_critic_hidden,], 171 | dtype = tf.float32,initializer = b_init, 172 | trainable = False) 173 | w2 = tf.get_variable('w2',shape = [self.n_critic_hidden,1], 174 | dtype = tf.float32,initializer = w_init, 175 | trainable = False) 176 | b2 = tf.get_variable('b2',shape = [1,],dtype = tf.float32, 177 | initializer = b_init,trainable = True) 178 | hidden = tf.nn.relu(tf.matmul(self.s_,w1_s) + tf.matmul(self.a_[:,np.newaxis],w1_a) + b1) 179 | q_s_a_ = tf.matmul(hidden,w2) + b2 180 | return q_s_a_[:,0] 181 | 182 | def choose_action(self,state): 183 | state = np.reshape(state,[-1,self.n_features]) 184 | action = self.sess.run(self.a,feed_dict = {self.s:state}) 185 | return action 186 | 187 | def store_transition(self,state,action,reward,next_state): 188 | state,next_state = state[np.newaxis,:],next_state[np.newaxis,:] 189 | action,reward = np.array(action),np.array(reward) 190 | action = np.reshape(action,[1,-1]) 191 | reward = np.reshape(reward,[1,-1]) 192 | # is_done = np.reshape(is_done,[1,-1]) 193 | 194 | transition = np.concatenate((state,action,reward,next_state),axis = 1) 195 | self.memory.append(transition[0,:]) 196 | 197 | def learn(self): 198 | if len(self.memory) == self.memory_size: 199 | if self.learn_step_counter % 200 == 0: 200 | self.sess.run((self.replace_a_params,self.replace_c_params)) 201 | 202 | self.noise_var *= 0.999 203 | 204 | batch = np.array(random.sample(self.memory,self.batch_size)) 205 | batch_s = batch[:,:self.n_features] 206 | batch_a = batch[:,self.n_features:(self.n_features + 1)][:,0] 207 | batch_r = batch[:,(self.n_features + 1):(self.n_features + 2)][:,0] 208 | batch_s_ = batch[:,(self.n_features + 2):(self.n_features*2 + 2)] 209 | 210 | self.sess.run(self.actor_train_op,feed_dict = {self.s:batch_s}) 211 | self.sess.run(self.critic_train_op,feed_dict = {self.s:batch_s, 212 | self.a:batch_a, 213 | self.s_:batch_s_, 214 | self.r:batch_r}) 215 | if __name__ == '__main__': 216 | ddpg = DDPG(5,0,1,0.03,0.01,30,30) --------------------------------------------------------------------------------
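A quick way to see what the shared preprocessing step produces is to run `build_s_a` and `normalization` (defined identically in both `DataPreprocessing.py` files) on a toy sequence. The script below is a minimal sketch, not part of the repository: the toy series, the window size `n = 3` and the horizon `m = 1` are illustrative assumptions, and it assumes it is run from inside `DDPG-forcasting/` or `RDPG-forecasting/` so that `DataPreprocessing` can be imported.

```python
# Minimal sketch (illustrative, not part of the repository).
# Assumes DataPreprocessing.py is importable from the current directory.
import numpy as np
from DataPreprocessing import build_s_a, normalization

series = np.arange(10, dtype=float)   # toy time series: 0, 1, ..., 9 (assumed example data)
n, m = 3, 1                           # 3 historical points per state, 1-step-ahead target

state_mat, best_action = build_s_a(series, n, m)
# Each row of state_mat holds n consecutive points; best_action is the value m steps later.
print(state_mat.shape, best_action.shape)   # (7, 3) (7,)
print(state_mat[0], best_action[0])         # [0. 1. 2.] 3.0

# Split and scale the way main.py does: the state matrix goes through MinMaxScaler
# (fit on the training part only), while the target/action is scaled manually.
split = round(len(state_mat) * 0.75)
train_s, test_s = state_mat[:split], state_mat[split:]
train_a, test_a = best_action[:split], best_action[split:]

train_s_scaled, test_s_scaled = normalization(train_s, test_s)
A, B = train_a.max(), train_a.min()
train_a_scaled, test_a_scaled = (train_a - B) / (A - B), (test_a - B) / (A - B)
print(train_s_scaled.min(), train_s_scaled.max())   # 0.0 1.0 on the training window
```

Scaling the targets into [0, 1] this way is what keeps the `A_LOW = 0` / `A_HIGH = 1` action bounds used in the `main.py` scripts consistent with the values the actor is asked to predict.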