├── DDPG-forcasting
│   ├── __pycache__
│   │   ├── DDPG_agent.cpython-37.pyc
│   │   └── DataPreprocessing.cpython-37.pyc
│   ├── DataPreprocessing.py
│   ├── main.py
│   └── DDPG_agent.py
├── RDPG-forecasting
│   ├── __pycache__
│   │   ├── RDPG_agent.cpython-37.pyc
│   │   └── DataPreprocessing.cpython-37.pyc
│   ├── DataPreprocessing.py
│   ├── main.py
│   └── RDPG_agent.py
├── README.md
├── DQN
│   ├── test.py
│   └── DQN_agent.py
├── Q_learning
│   ├── Teasure_hunt_env.py
│   ├── test.py
│   └── Q_learning_brain.py
└── A3C-forecasting
    ├── A3C_agent.py
    └── worker.py

/DDPG-forcasting/__pycache__/DDPG_agent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/DDPG-forcasting/__pycache__/DDPG_agent.cpython-37.pyc
--------------------------------------------------------------------------------
/RDPG-forecasting/__pycache__/RDPG_agent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/RDPG-forecasting/__pycache__/RDPG_agent.cpython-37.pyc
--------------------------------------------------------------------------------
/DDPG-forcasting/__pycache__/DataPreprocessing.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/DDPG-forcasting/__pycache__/DataPreprocessing.cpython-37.pyc
--------------------------------------------------------------------------------
/RDPG-forecasting/__pycache__/DataPreprocessing.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChefLiutao/Time-series-forecasting-via-deep-reinforcement-learning/HEAD/RDPG-forecasting/__pycache__/DataPreprocessing.cpython-37.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Time-series-forecasting-via-deep-reinforcement-learning
 2 | Time series forecasting via deep reinforcement learning.
 3 | 
 4 | Three deep reinforcement learning algorithms are deployed for time series forecasting: Asynchronous Advantage Actor-Critic (A3C), Deep Deterministic Policy Gradient (DDPG) and Recurrent Deterministic Policy Gradient (RDPG).
 5 | 
 6 | 
 7 | More details can be found in: https://doi.org/10.1016/j.enbuild.2019.109675
 8 | 
--------------------------------------------------------------------------------
/DQN/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu May 21 22:54:17 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | To test DQN using the CartPole-v0 environment from the [gym] package.
 8 | """
 9 | import gym
10 | from DQN_agent import Nature_DQN
11 | 
12 | env = gym.make('CartPole-v0')
13 | env = env.unwrapped   # Lift some restrictions (e.g. the built-in step limit)
14 | 
15 | N_FEATURES = 4
16 | N_ACTIONS = 2
17 | MAX_EPISODES = 1000
18 | MAX_STEPS = 500
19 | dqn = Nature_DQN(N_FEATURES,N_ACTIONS,16,0.003)
20 | 
21 | for episode in range(MAX_EPISODES):
22 |     s = env.reset()
23 |     for step in range(MAX_STEPS):
24 |         a = dqn.epsilon_choose_a(s)
25 |         s_,r,done,info = env.step(a)
26 |         done = 0 if done else 1   # 0 -> terminal, 1 -> non-terminal (convention used by DQN_agent)
27 |         dqn.store_transition(s,a,r,s_,done)
28 |         dqn.learn()
29 |         if (done == 0) or (step == MAX_STEPS - 1):
30 |             print('Episode %d:%d'%(episode,step+1))
31 |             break
32 |         s = s_
33 | 
34 | dqn.plot_loss()
35 | 
36 | 
--------------------------------------------------------------------------------
/Q_learning/Teasure_hunt_env.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue May 19 14:04:48 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | The environment of a one-dimensional treasure hunt game, in which both the state space
 8 | and the action space are discrete.
 9 | """
10 | 
11 | class One_D_teasure_env():
12 |     def __init__(self):
13 |         self.state_space = [0,1,2,3,4,5]
14 |         self.action_space = ['left','right']
15 | 
16 |     def reset(self):
17 |         '''
18 |         Reset the environment to the initial state.
19 |         '''
20 |         state = 0
21 |         return state
22 | 
23 | 
24 |     def step(self,state,action):
25 |         '''
26 |         Environment dynamics: given state s and action a, return next_state and reward.
27 |         '''
28 |         if action == 'right':
29 |             if state == 4:
30 |                 reward = 1
31 |                 next_state = 'terminal'
32 |             else:
33 |                 reward = 0
34 |                 next_state = state + 1
35 |         elif action == 'left':
36 |             reward = 0
37 |             if state == 0:
38 |                 next_state = state
39 |             else:
40 |                 next_state = state - 1
41 |         return next_state,reward
42 | 
43 | 
44 | 
45 | 
--------------------------------------------------------------------------------
/DDPG-forcasting/DataPreprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 25 11:00:32 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | This module loads and preprocesses the time series data.
 8 | """
 9 | 
10 | import numpy as np
11 | 
12 | 
13 | def build_s_a(sequence,n,m):
14 |     '''
15 |     Args:
16 |         sequence: Time series data
17 |         n: The number of historical points that make up the current state
18 |         m: The number of prediction steps in advance (forecasting horizon)
19 |     Return:
20 |         state_mat: A matrix containing the state at each time step
21 |         best_action: The optimal action (true future value) for each state
22 |     '''
23 |     n_rows = len(sequence)-n-m+1
24 |     state_mat = np.zeros((n_rows,n))
25 |     best_action = np.zeros(n_rows)
26 |     for i in range(n_rows):
27 |         state_mat[i] = sequence[i:(i+n)]
28 |         best_action[i] = sequence[i+n+m-1]
29 |     return state_mat,best_action
30 | 
31 | 
32 | 
33 | def normalization(traindata,testdata):
34 |     from sklearn.preprocessing import MinMaxScaler
35 |     scaler = MinMaxScaler()
36 |     scaler.fit(traindata)
37 |     traindata_scaled = scaler.transform(traindata)
38 |     testdata_scaled = scaler.transform(testdata)
39 | 
40 |     return traindata_scaled,testdata_scaled
41 | 
--------------------------------------------------------------------------------
/RDPG-forecasting/DataPreprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 25 11:00:32 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | This module loads and preprocesses the time series data.
8 | """ 9 | 10 | import numpy as np 11 | 12 | 13 | def build_s_a(sequence,n,m): 14 | ''' 15 | Args: 16 | sequence: Time series data 17 | n: The number of historical data denoting the current state 18 | m: The number of prediction steps in advance 19 | Return: 20 | state_mat: A matrix contains all states at each time step 21 | best_action: The optimal action based on each state 22 | ''' 23 | n_rows = len(sequence)-n-m+1 24 | state_mat = np.zeros((n_rows,n)) 25 | best_action = np.zeros(n_rows) 26 | for i in range(n_rows): 27 | state_mat[i] = sequence[i:(i+n)] 28 | best_action[i] = sequence[i+n+m-1] 29 | return state_mat,best_action 30 | 31 | 32 | 33 | def normalization(traindata,testdata): 34 | from sklearn.preprocessing import MinMaxScaler 35 | scaler = MinMaxScaler() 36 | scaler.fit(traindata) 37 | traindata_scaled = scaler.transform(traindata) 38 | testdata_scaled = scaler.transform(testdata) 39 | 40 | return traindata_scaled,testdata_scaled 41 | -------------------------------------------------------------------------------- /Q_learning/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 19 21:38:00 2020 4 | 5 | @author: ChefLiutao 6 | 7 | To test the effectiveness of Q-learning in 1D teasure hunt game. 8 | """ 9 | 10 | from Q_learning_brain import Tabular_q_learning 11 | from Teasure_hunt_env import One_D_teasure_env 12 | 13 | ###################################################### 14 | 15 | env = One_D_teasure_env() 16 | states = env.state_space 17 | actions = env.action_space 18 | 19 | LEARNING_RATE = 0.1 20 | EPSILON = 0.9 21 | GAMMA = 0.9 22 | MAX_EPISODES = 20 23 | MAX_STEPS = 100 24 | 25 | q_learning = Tabular_q_learning(states,actions,LEARNING_RATE,EPSILON,GAMMA) 26 | 27 | ###################################################### 28 | 29 | def rl_learn_loop(): 30 | for episode in range(MAX_EPISODES): 31 | s = env.reset() 32 | for step in range(MAX_STEPS): 33 | a = q_learning.epsilon_choose_action(s) 34 | s_,r = env.step(s,a) 35 | q_learning.learn(s,a,r,s_) 36 | 37 | if s_ == 'terminal': 38 | print('Episode %d - total steps - %d'%(episode,step+1)) 39 | break 40 | 41 | s = s_ 42 | 43 | 44 | if __name__ == '__main__': 45 | rl_learn_loop() 46 | print(q_learning.q_table) -------------------------------------------------------------------------------- /Q_learning/Q_learning_brain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 18 20:53:22 2020 4 | 5 | @author: ChefLiutao 6 | 7 | Brain of Lookup Q-learning, i.e. the brain of Q-learning RL agent. 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | 12 | class Tabular_q_learning(): 13 | def __init__(self,states,actions,learning_rate,epsilon = 0.9,gamma = 0.9): 14 | ''' 15 | Args: 16 | states: A python list of all possible states in state space. 17 | actions: A python list of all possible actions in action space. 
18 |             learning_rate: Update step size of the Q-table
19 |             epsilon: A probability that controls the trade-off between exploration and exploitation
20 |             gamma: Reward discount rate
21 |         '''
22 |         self.state_space = states
23 |         self.action_space = actions
24 |         self.n_states = len(self.state_space)
25 |         self.n_actions = len(self.action_space)
26 |         self.gamma = gamma
27 |         self.epsilon = epsilon
28 |         self.alpha = learning_rate
29 |         self.q_table = self.build_q_table()
30 | 
31 | 
32 |     def build_q_table(self):
33 |         q_table = pd.DataFrame(np.zeros([self.n_states,self.n_actions]),
34 |                                index = self.state_space,
35 |                                columns = self.action_space)
36 |         return q_table
37 | 
38 | 
39 |     def epsilon_choose_action(self,state):
40 |         state_action = self.q_table.loc[state]
41 |         if (np.random.uniform() > self.epsilon) or ((state_action == 0).all()):
42 |             action = np.random.choice(state_action.index)
43 |         else:
44 |             action = state_action.idxmax()
45 |         return action
46 | 
47 | 
48 |     def learn(self,state,action,reward,next_state):
49 |         q_current = self.q_table.loc[state,action]
50 |         q_target = reward if (next_state == 'terminal') else (
51 |                 reward + self.gamma*self.q_table.loc[next_state].max())
52 | 
53 |         self.q_table.loc[state,action] += self.alpha*(q_target - q_current)
54 | 
--------------------------------------------------------------------------------
/DDPG-forcasting/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri May 29 23:23:52 2020
 4 | 
 5 | @author: ChefLiutao
 6 | """
 7 | import pandas as pd
 8 | import numpy as np
 9 | from DataPreprocessing import normalization
10 | from DataPreprocessing import build_s_a
11 | from DDPG_agent import DDPG
12 | import matplotlib.pyplot as plt
13 | import os
14 | 
15 | ##################### hyper parameters ####################
16 | N_FEATURES = 6
17 | A_LOW = 0
18 | A_HIGH = 1
19 | LR_A = 0.001
20 | LR_C = 0.003
21 | N_ACTOR_HIDDEN = 30
22 | N_CRITIC_HIDDEN = 30
23 | MAX_EPISODES = 300
24 | MAX_STEPS = 1000
25 | 
26 | GAMMA = 0.9              # Reward discount factor
27 | TAU = 0.1                # Soft-update factor for the target networks
28 | MEMORY_CAPACITY = 100000 # Size of the experience replay buffer
29 | BATCH_SIZE = 128         # Mini-batch size for gradient descent
30 | #############################################################
31 | 
32 | #Load data
33 | data_dir = os.path.join(os.getcwd(),'V6.csv')
34 | data = pd.read_csv(data_dir,encoding = 'gbk')
35 | data = data.iloc[:,0]
36 | 
37 | #Build state matrix and best action
38 | state,action = build_s_a(data,N_FEATURES,1)
39 | 
40 | #Data split
41 | SPLIT_RATE = 0.75
42 | split_index = round(len(state)*SPLIT_RATE)
43 | train_s,train_a = state[:split_index],action[:split_index]
44 | test_s,test_a = state[split_index:],action[split_index:]
45 | 
46 | #Normalization
47 | train_s_scaled,test_s_scaled = normalization(train_s,test_s)
48 | A,B = train_a.max(),train_a.min()
49 | train_a_scaled,test_a_scaled = (train_a-B)/(A-B),(test_a-B)/(A-B)
50 | 
51 | # Training
52 | ddpg = DDPG(N_FEATURES,A_LOW,A_HIGH,LR_A,LR_C,N_ACTOR_HIDDEN,N_CRITIC_HIDDEN)
53 | for episode in range(MAX_EPISODES):
54 |     index = np.random.choice(len(train_s_scaled) - 1)  # leave room for the next state s_
55 |     s = train_s_scaled[index]
56 |     ep_reward = 0
57 | 
58 |     for step in range(MAX_STEPS):
59 |         a = ddpg.choose_action(s)
60 |         r = -abs(a-train_a_scaled[index])   # reward = negative absolute forecast error
61 |         ep_reward += r
62 |         index += 1
63 |         s_ = train_s_scaled[index]
64 | 
65 |         ddpg.store_transition(s,a,r,s_)
66 |         ddpg.learn()
67 | 
68 |         if (index == len(train_s_scaled)-1) or (step == MAX_STEPS-1):
69 |             print('Episode %d : %.2f'%(episode,ep_reward))
70 |             break
71 | 
72 |         s = s_
73 | 
74 | # Testing
75 | pred = []
76 | for i in range(len(test_s_scaled)):
77 |     state = test_s_scaled[i]
78 |     action = ddpg.choose_action(state)
79 |     pred.append(action)
80 | 
81 | pred = [pred[i][0] for i in range(len(test_s_scaled))]
82 | pred = pd.Series(pred)
83 | pred = pred*(A-B)+B    # rescale predictions back to the original units
84 | actual = pd.Series(test_a)
85 | 
86 | plt.scatter(pred,test_a,marker = '.')
87 | plt.xlabel('Predicted Value')
88 | plt.ylabel('Actual Value')
89 | plt.show()
90 | 
--------------------------------------------------------------------------------
/RDPG-forecasting/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Fri May 29 23:23:52 2020
 4 | 
 5 | @author: ChefLiutao
 6 | """
 7 | import pandas as pd
 8 | import numpy as np
 9 | from DataPreprocessing import normalization
10 | from DataPreprocessing import build_s_a
11 | from RDPG_agent import RDPG
12 | import matplotlib.pyplot as plt
13 | import os
14 | 
15 | 
16 | ##################### hyper parameters ####################
17 | N_FEATURES = 6
18 | A_LOW = 0
19 | A_HIGH = 1
20 | LR_A = 0.001
21 | LR_C = 0.003
22 | N_ACTOR_HIDDEN = 30
23 | N_CRITIC_HIDDEN = 30
24 | MAX_EPISODES = 100
25 | MAX_STEPS = 1000
26 | 
27 | GAMMA = 0.9              # Reward discount factor
28 | TAU = 0.1                # Soft-update factor for the target networks
29 | MEMORY_CAPACITY = 100000 # Size of the experience replay buffer
30 | BATCH_SIZE = 128         # Mini-batch size for gradient descent
31 | #############################################################
32 | 
33 | #Load data
34 | data_dir = os.path.join(os.getcwd(),'V6.csv')  # path of the time series data
35 | data = pd.read_csv(data_dir,encoding = 'gbk')
36 | data = data.iloc[:,0]
37 | 
38 | #Build state matrix and best action
39 | state,action = build_s_a(data,N_FEATURES,1)
40 | 
41 | #Data split
42 | SPLIT_RATE = 0.75
43 | split_index = round(len(state)*SPLIT_RATE)
44 | train_s,train_a = state[:split_index],action[:split_index]
45 | test_s,test_a = state[split_index:],action[split_index:]
46 | 
47 | #Normalization
48 | train_s_scaled,test_s_scaled = normalization(train_s,test_s)
49 | A,B = train_a.max(),train_a.min()
50 | train_a_scaled,test_a_scaled = (train_a-B)/(A-B),(test_a-B)/(A-B)
51 | 
52 | # Training
53 | rdpg = RDPG(N_FEATURES,A_LOW,A_HIGH,LR_A,LR_C,N_ACTOR_HIDDEN,N_CRITIC_HIDDEN)
54 | for episode in range(MAX_EPISODES):
55 |     index = np.random.choice(len(train_s_scaled) - 1)  # leave room for the next state s_
56 |     s = train_s_scaled[index]
57 |     ep_reward = 0
58 | 
59 |     for step in range(MAX_STEPS):
60 |         a = rdpg.choose_action(s)
61 |         r = -abs(a-train_a_scaled[index])   # reward = negative absolute forecast error
62 |         ep_reward += r
63 |         index += 1
64 |         s_ = train_s_scaled[index]
65 | 
66 |         rdpg.store_transition(s,a,r,s_)
67 |         rdpg.learn()
68 | 
69 |         if (index == len(train_s_scaled)-1) or (step == MAX_STEPS-1):
70 |             print('Episode %d : %.2f'%(episode,ep_reward))
71 |             break
72 | 
73 |         s = s_
74 | 
75 | # Testing
76 | pred = []
77 | for i in range(len(test_s_scaled)):
78 |     state = test_s_scaled[i]
79 |     action = rdpg.choose_action(state)
80 |     pred.append(action)
81 | 
82 | pred = [pred[i][0] for i in range(len(test_s_scaled))]
83 | pred = pd.Series(pred)
84 | pred = pred*(A-B)+B    # rescale predictions back to the original units
85 | actual = pd.Series(test_a)
86 | 
87 | plt.scatter(pred,test_a,marker = '.')
88 | plt.xlabel('Predicted Value')
89 | plt.ylabel('Actual Value')
90 | plt.show()
91 | 
--------------------------------------------------------------------------------
/A3C-forecasting/A3C_agent.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon Jun 8 23:01:56 2020
 4 | 
 5 | @author: ChefLiutao
 6 | 
 7 | The agent of the DRL algorithm Asynchronous Advantage Actor-Critic (A3C).
8 | """ 9 | import multiprocessing 10 | import threading 11 | import tensorflow as tf 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import pandas as pd 15 | import time 16 | 17 | #------------------- HyperParameters -------------------- 18 | 19 | 20 | class ACNet(): 21 | ''' 22 | 建立网络,可以被调用生成一个Global Net,也能被调用生成一个worker Net 23 | ''' 24 | def __init__(self, scope, N_S,N_A,A_BOUND,globalAC=None): 25 | 26 | self.N_S = N_S 27 | self.N_A = N_A 28 | self.A_BOUND = A_BOUND 29 | 30 | if scope == 'GLOBAL_NET': # get global network 31 | with tf.variable_scope(scope): 32 | self.s = tf.placeholder(tf.float32, [None, self.N_S], 'S') 33 | self.a_params, self.c_params = self._build_net(scope)[-2:] 34 | else: # local net, calculate losses 35 | with tf.variable_scope(scope): 36 | self.s = tf.placeholder(tf.float32, [None, self.N_S], 'S') 37 | self.a_his = tf.placeholder(tf.float32, [None, self.N_A], 'A') 38 | self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') 39 | 40 | mu, sigma, self.v, self.a_params, self.c_params = self._build_net(scope) 41 | 42 | td = tf.subtract(self.v_target, self.v, name='TD_error') 43 | with tf.name_scope('c_loss'): 44 | self.c_loss = tf.reduce_mean(tf.square(td)) 45 | 46 | with tf.name_scope('wrap_a_out'): 47 | mu, sigma = mu * A_BOUND[1], sigma + 1e-4 48 | 49 | normal_dist = tf.distributions.Normal(mu, sigma) 50 | 51 | with tf.name_scope('a_loss'): 52 | log_prob = normal_dist.log_prob(self.a_his) 53 | exp_v = log_prob * tf.stop_gradient(td) 54 | entropy = normal_dist.entropy() # encourage exploration 55 | self.exp_v = ENTROPY_BETA * entropy + exp_v 56 | self.a_loss = tf.reduce_mean(-self.exp_v) 57 | 58 | with tf.name_scope('choose_a'): # use local params to choose action 59 | self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=[0, 1]), A_BOUND[0], A_BOUND[1]) 60 | with tf.name_scope('local_grad'): 61 | self.a_grads = tf.gradients(self.a_loss, self.a_params) 62 | self.c_grads = tf.gradients(self.c_loss, self.c_params) 63 | 64 | with tf.name_scope('sync'): 65 | with tf.name_scope('pull'): 66 | self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] 67 | self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] 68 | with tf.name_scope('push'): 69 | self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) 70 | self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) 71 | 72 | def _build_net(self, scope): 73 | w_init = tf.random_normal_initializer(0., .1) 74 | with tf.variable_scope('actor'): 75 | l_a = tf.layers.dense(self.s, 64, tf.nn.relu6, kernel_initializer=w_init, name='la') 76 | mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu') 77 | sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') 78 | with tf.variable_scope('critic'): 79 | l_c = tf.layers.dense(self.s, 64, tf.nn.relu6, kernel_initializer=w_init, name='lc') 80 | v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value 81 | a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') 82 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') 83 | return mu, sigma, v, a_params, c_params 84 | 85 | def update_global(self, feed_dict): # run by a local 86 | SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net 87 | 88 | def pull_global(self): # run by a local 89 | 
SESS.run([self.pull_a_params_op, self.pull_c_params_op]) 90 | 91 | def choose_action(self, s): # run by a local 92 | s = s[np.newaxis, :] 93 | return SESS.run(self.A, {self.s: s}) 94 | -------------------------------------------------------------------------------- /A3C-forecasting/worker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jun 8 23:20:16 2020 4 | 5 | @author: ChefLiutao 6 | """ 7 | 8 | class Worker(): 9 | def __init__(self, name, globalAC): 10 | self.name = name 11 | self.AC = ACNet(name, globalAC) 12 | 13 | def work(self): 14 | global GLOBAL_RUNNING_R, GLOBAL_EP 15 | total_step = 1 16 | buffer_s, buffer_a, buffer_r = [], [], [] 17 | while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: 18 | state_index = np.random.choice(range(len(train_state_mat))) 19 | s = train_state_mat[state_index] 20 | ep_r = 0 21 | for ep_t in range(MAX_EP_STEP): 22 | # if self.name == 'W_0': 23 | # self.env.render() 24 | a = self.AC.choose_action(s) 25 | 26 | s_ = train_state_mat[state_index+1] 27 | r = -abs(train_best_action[state_index]-a) 28 | if ep_t == MAX_EP_STEP -1 or state_index == len(train_state_mat)-2: 29 | done = True 30 | else: 31 | done = False 32 | 33 | ep_r += r 34 | buffer_s.append(s) 35 | buffer_a.append(a) 36 | buffer_r.append(r) 37 | # buffer_r.append((r+8)/8) # normalize 38 | 39 | if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net 40 | if done: 41 | v_s_ = 0 # terminal 42 | else: 43 | v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] 44 | buffer_v_target = [] 45 | for r in buffer_r[::-1]: # reverse buffer r 46 | v_s_ = r + GAMMA * v_s_ 47 | buffer_v_target.append(v_s_) 48 | buffer_v_target.reverse() 49 | 50 | buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) 51 | feed_dict = { 52 | self.AC.s: buffer_s, 53 | self.AC.a_his: buffer_a, 54 | self.AC.v_target: buffer_v_target, 55 | } 56 | self.AC.update_global(feed_dict) 57 | buffer_s, buffer_a, buffer_r = [], [], [] 58 | self.AC.pull_global() #把中央大脑最新的参数拉下来 59 | 60 | s = s_ 61 | state_index += 1 62 | total_step += 1 63 | 64 | #完成一个episode,查看训练效果 65 | # if done: 66 | # self.AC.pull_global() 67 | 68 | #训练集 69 | # pred = [] 70 | # for i_pred in range(len(train_state_mat)): 71 | # state = train_state_mat[i_pred] 72 | # action = self.AC.choose_action(state) 73 | # pred.append(action) 74 | # pred = [pred[i][0] for i in range(len(train_state_mat))] 75 | # pred = pd.Series(pred) 76 | # pred = pred*(B-A)+A 77 | # actual = train_best_action*(B-A)+A #反归一化 78 | # MAE = np.mean(abs(pred-actual)) #MAE 79 | # RMSE = (np.sum((pred-actual)**2)/len(pred))**0.5 80 | # print('MAE-1: %.2f' %MAE,'RMSE-1: %.2f' %RMSE) 81 | # mae_episode.append(MAE) 82 | # rmse_episode.append(RMSE) 83 | 84 | #测试集 85 | # pred = [] 86 | # for i_pred in range(len(test_state_mat)): 87 | # state = test_state_mat[i_pred] 88 | # action = self.AC.choose_action(state) 89 | # pred.append(action) 90 | # pred = [pred[i][0] for i in range(len(test_state_mat))] 91 | # pred = pd.Series(pred) 92 | # pred = pred*(B-A)+A 93 | # actual = test_best_action*(B-A)+A #反归一化 94 | # MAE = np.mean(abs(pred-actual)) #MAE 95 | # RMSE = (np.sum((pred-actual)**2)/len(pred))**0.5 96 | # print('MAE: %.2f' %MAE,'RMSE: %.2f' %RMSE) 97 | 98 | 99 | #完成一个episode 100 | if done: 101 | if len(GLOBAL_RUNNING_R) == 0: # record running episode reward 102 | GLOBAL_RUNNING_R.append(ep_r) 103 | else: 104 | 
GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r) 105 | print( 106 | self.name, 107 | "Ep:", GLOBAL_EP, 108 | "| Ep_r: %i" % GLOBAL_RUNNING_R[-1], 109 | ) 110 | GLOBAL_EP += 1 111 | break -------------------------------------------------------------------------------- /DQN/DQN_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 20 15:05:29 2020 4 | 5 | @author: ChefLiutao 6 | """ 7 | import tensorflow as tf 8 | import numpy as np 9 | from collections import deque 10 | import random 11 | 12 | class Nature_DQN(): 13 | def __init__( 14 | self, 15 | n_features, 16 | n_actions, 17 | n_hidden, 18 | learning_rate, 19 | epsilon = 0.9, 20 | gamma = 0.9, 21 | memory_size = 1000, 22 | batch_size = 128, 23 | epsilon_increment = 0.0001): 24 | 25 | self.n_features = n_features #dimension of state 26 | # self.actions = actions #all possible actions 27 | self.n_actions = n_actions #dimension of action space 28 | self.n_hidden = n_hidden #hidden neurons of Q network 29 | self.lr = learning_rate #for Current Q network update 30 | self.epsilon = epsilon #e-greed 31 | self.gamma = gamma #reward discount rate 32 | self.memory_size = memory_size 33 | self.memory = deque(maxlen = memory_size) 34 | self.batch_size = batch_size 35 | self.epsilon_increment = epsilon_increment 36 | self.optimizer = tf.train.AdamOptimizer(self.lr) 37 | self.loss_history = [] 38 | self.learn_step_counter = 0 39 | 40 | self.s = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 41 | self.a = tf.placeholder(tf.int32,shape = [None,]) 42 | self.s_ = tf.placeholder(tf.float32,shape = [None,self.n_features]) 43 | self.r = tf.placeholder(tf.float32,shape = [None,]) 44 | self.done = tf.placeholder(tf.float32,shape = [None,]) #denote if the s_ is terminal:0→terminal,1→non-terminal 45 | 46 | self.q_current = self.build_current_net() # self.s → self.q_current; self.s_ → self.q_next 47 | self.q_next = self.build_target_net() 48 | 49 | self.current_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 50 | scope = 'current_net') 51 | self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 52 | scope = 'target_net') 53 | 54 | self.params_replace = [tf.assign(t,e) for t,e in zip(self.target_params,self.current_params)] 55 | 56 | self.q_sa = tf.gather_nd(self.q_current,indices = tf.stack( 57 | [tf.range(tf.shape(self.a)[0],dtype = tf.int32),self.a], 58 | axis = 1)) #[tf.shape(self.a)[0],] 59 | 60 | self.q_s_a_ = tf.reduce_max(self.q_next,axis = 1) 61 | 62 | with tf.variable_scope('loss'): 63 | self.q_target = (self.r + self.gamma*self.q_s_a_)*self.done 64 | self.loss = tf.reduce_mean(tf.square(self.q_target - self.q_sa)) 65 | self.train_op = self.optimizer.minimize(self.loss) 66 | 67 | self.sess = tf.Session() 68 | self.sess.run(tf.global_variables_initializer()) 69 | 70 | def build_current_net(self): 71 | ''' 72 | Build Two Q neworks:current Q network and target Q network. 73 | Target Q network is respnsible for calculating Q(s_). 
74 | Current Q network is responsible to estimate Q(s) 75 | ''' 76 | # Current Q Network ------------------------------------------ 77 | # self.s = tf.placeholder(shape = [None,self.n_features]) 78 | # self.q_target = tf.placeholder(shape = [None,len(self.actions)]) 79 | 80 | with tf.variable_scope('current_net'): 81 | w_init = tf.random_normal_initializer(0,0.1) 82 | b_init = tf.constant_initializer(0.1) 83 | w1 = tf.get_variable(name = 'w1', 84 | shape = [self.n_features,self.n_hidden], 85 | initializer = w_init, 86 | trainable = True) 87 | b1 = tf.get_variable(name = 'b1', 88 | shape = [self.n_hidden], 89 | initializer = b_init, 90 | trainable = True) 91 | 92 | # tf.add_to_collection('current_params',w1) 93 | # tf.add_to_collection('current_params',b1) 94 | 95 | hidden = tf.nn.relu(tf.matmul(self.s,w1) + b1) 96 | 97 | w2 = tf.get_variable(name = 'w2', 98 | shape = [self.n_hidden,self.n_actions], 99 | initializer = w_init, 100 | trainable = True) 101 | b2 = tf.get_variable(name = 'b2', 102 | shape = [self.n_actions], 103 | initializer = b_init, 104 | trainable = True) 105 | # tf.add_to_collection('current_params',w2) 106 | # tf.add_to_collection('current_params',b2) 107 | 108 | self.q_current = tf.matmul(hidden,w2) + b2 109 | return self.q_current 110 | 111 | def build_target_net(self): 112 | w_init = tf.random_normal_initializer(0,0.1) 113 | b_init = tf.constant_initializer(0.1) 114 | # Target Q Network ------------------------------------------------ 115 | # self.s_ = tf.placeholder(shape = [None,self.n_features]) 116 | with tf.variable_scope('target_net'): 117 | w1 = tf.get_variable(name = 'w1', 118 | shape = [self.n_features,self.n_hidden], 119 | initializer = w_init, 120 | trainable = False) 121 | b1 = tf.get_variable(name = 'b1', 122 | shape = [self.n_hidden], 123 | initializer = b_init, 124 | trainable = False) 125 | # tf.add_to_collection('target_params',w1) 126 | # tf.add_to_collection('target_params',b1) 127 | hidden = tf.nn.relu(tf.matmul(self.s_,w1) + b1) 128 | 129 | w2 = tf.get_variable(name = 'w2', 130 | shape = [self.n_hidden,self.n_actions], 131 | initializer = w_init, 132 | trainable = False) 133 | b2 = tf.get_variable(name = 'b2', 134 | shape = [self.n_actions], 135 | initializer = b_init, 136 | trainable = False) 137 | # tf.add_to_collection('target_params',w2) 138 | # tf.add_to_collection('target_params',b2) 139 | 140 | self.q_next = tf.matmul(hidden,w2) + b2 141 | return self.q_next 142 | 143 | 144 | def epsilon_choose_a(self,state): 145 | state = np.reshape(state,[-1,self.n_features]) 146 | if np.random.uniform() < self.epsilon: 147 | state_action = self.sess.run(self.q_current, 148 | feed_dict = {self.s:state}) 149 | action = np.argmax(state_action) 150 | else: 151 | action = np.random.choice(np.arange(self.n_actions)) 152 | return action 153 | 154 | 155 | def store_transition(self,state,action,reward,next_state,is_done): 156 | state,next_state = state[np.newaxis,:],next_state[np.newaxis,:] 157 | action,reward,is_done = np.array(action),np.array(reward),np.array(is_done) 158 | action = action.reshape([1,1]) 159 | reward = reward.reshape([1,1]) 160 | is_done = is_done.reshape([1,1]) 161 | 162 | transition = np.concatenate((state,action,reward,next_state,is_done),axis = 1) 163 | self.memory.append(transition[0,:]) 164 | 165 | 166 | def learn(self): 167 | if len(self.memory) == self.memory_size: 168 | if self.learn_step_counter % 500 == 0: 169 | self.sess.run(self.params_replace) 170 | self.learn_step_counter += 1 171 | 172 | batch = 
np.array(random.sample(self.memory,self.batch_size)) 173 | batch_s = batch[:,:self.n_features] 174 | batch_a = batch[:,self.n_features:(self.n_features + 1)][:,0] 175 | batch_r = batch[:,(self.n_features + 1):(self.n_features + 2)][:,0] 176 | batch_s_ = batch[:,(self.n_features + 2):(self.n_features*2 + 2)] 177 | batch_done = batch[:,-1] 178 | train_op,loss = self.sess.run((self.train_op,self.loss), 179 | feed_dict = {self.s:batch_s,self.a:batch_a, 180 | self.s_:batch_s_,self.r:batch_r, 181 | self.done:batch_done}) 182 | self.loss_history.append(loss) 183 | 184 | self.epsilon = self.epsilon + self.epsilon_increment if ( 185 | self.epsilon + self.epsilon_increment) < 1. else 1. 186 | 187 | 188 | def plot_loss(self): 189 | import matplotlib.pyplot as plt 190 | plt.plot(range(len(self.loss_history)),self.loss_history,'-') 191 | -------------------------------------------------------------------------------- /RDPG-forecasting/RDPG_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 30 20:52:21 2020 4 | 5 | @author: ChefLiutao 6 | 7 | The agent of RL algorithm Recurrent Detrministic Policy Gradient. 8 | 9 | The Actor NNs are deployed as three-layer Fully-Connected NN. 10 | 11 | The Critic NNs are deployed as RNN. 12 | """ 13 | import tensorflow as tf 14 | import numpy as np 15 | from collections import deque 16 | import random 17 | 18 | class RDPG(): 19 | def __init__(self, 20 | n_features, 21 | # n_actions, 22 | a_low, 23 | a_high, 24 | learning_rate_actor, 25 | learning_rate_critic, 26 | n_actor_hidden, 27 | n_critic_hidden, 28 | gamma = 0.9, 29 | noise_varience = 3, 30 | soft_replace = 0.1, 31 | memory_size = 1000, 32 | batch_size = 128): 33 | self.n_features = n_features #dimension of states 34 | # self.n_actions = n_actions 35 | self.a_low = a_low #The low bound of action sapce 36 | self.a_high = a_high #The high bound of action space 37 | self.lr_a = learning_rate_actor #Learning rate of Actor NN 38 | self.lr_c = learning_rate_critic #Learning rate of Critic NN 39 | self.n_actor_hidden = n_actor_hidden #Number of hidden layer neurons in Actor 40 | self.n_critic_cells = n_critic_hidden #Number of hidden layer neurons in Critic 41 | self.gamma = gamma #Reward discount rate 42 | self.noise_var = noise_varience #Variance of output action distribution 43 | self.soft_replace = soft_replace #Update speed of target networks 44 | self.memory_size = memory_size #Size of experience replay buffer 45 | self.memory = deque(maxlen = self.memory_size) #Experience replay buffer 46 | self.batch_size = batch_size 47 | 48 | self.s = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 49 | self.s_ = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 50 | self.r = tf.placeholder(dtype = tf.float32,shape = [None,]) 51 | self.done = tf.placeholder(dtype = tf.float32,shape = [None,]) # 0 if s_ == terminal else 1 52 | 53 | self.a = self.build_Actor1() 54 | self.a_ = self.build_Actor2() 55 | self.q_sa = self.build_Critic1() #shape:[None,] 56 | self.q_s_a_ = self.build_Critic2() #shape:[None,] 57 | 58 | self.curr_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 59 | scope = 'Actor/Current') 60 | self.targ_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 61 | scope = 'Actor/Target') 62 | self.curr_c_params= tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 63 | scope = 'Critic/Current') 64 | self.targ_c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 65 | scope 
= 'Critic/Target') 66 | 67 | # Soft replace of Targets NN parameters 68 | self.replace_a_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 69 | for (t,e) in zip(self.targ_a_params,self.curr_a_params)] 70 | self.replace_c_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 71 | for (t,e) in zip(self.targ_c_params,self.curr_c_params)] 72 | 73 | self.td_error = self.r + self.gamma*self.q_s_a_ - self.q_sa 74 | self.critic_loss = tf.reduce_mean(tf.square(self.td_error)) 75 | self.actor_loss = -tf.reduce_mean(self.q_sa) 76 | 77 | self.actor_train_op = tf.train.AdamOptimizer(self.lr_a).minimize(self.actor_loss, 78 | var_list = self.curr_a_params) 79 | self.critic_train_op = tf.train.AdamOptimizer(self.lr_c).minimize(self.critic_loss, 80 | var_list = self.curr_c_params) 81 | 82 | self.learn_step_counter = 0 83 | self.sess = tf.Session() 84 | self.sess.run(tf.global_variables_initializer()) 85 | 86 | 87 | def build_Actor1(self): 88 | ''' 89 | Building Current Actor network. 90 | ''' 91 | with tf.variable_scope('Actor/Current'): 92 | w_init = tf.random_normal_initializer(0,0.1) 93 | b_init = tf.constant_initializer(0.1) 94 | w1 = tf.get_variable(name = 'w1',shape = [self.n_features,self.n_actor_hidden], 95 | dtype = tf.float32,initializer = w_init, 96 | trainable = True) 97 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 98 | dtype = tf.float32,initializer = b_init, 99 | trainable = True) 100 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 101 | dtype = tf.float32,initializer = w_init, 102 | trainable = True) 103 | b2 = tf.get_variable('b2',shape = [1,], 104 | dtype = tf.float32,initializer = b_init, 105 | trainable = True) 106 | hidden = tf.nn.relu(tf.matmul(self.s,w1) + b1) 107 | a = tf.matmul(hidden,w2) + b2 108 | return a[:,0] 109 | # return np.clip(np.random.normal(a,self.noise_var),self.a_low,self.a_high) 110 | 111 | def build_Actor2(self): 112 | ''' 113 | Building Target Actor network. 114 | ''' 115 | with tf.variable_scope('Actor/Target'): 116 | w_init = tf.random_normal_initializer(0,0.1) 117 | b_init = tf.constant_initializer(0.1) 118 | w1 = tf.get_variable('w1',shape = [self.n_features,self.n_actor_hidden], 119 | dtype = tf.float32,initializer = w_init, 120 | trainable = False) 121 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 122 | dtype = tf.float32,initializer = b_init, 123 | trainable = False) 124 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 125 | dtype = tf.float32,initializer = w_init, 126 | trainable = False) 127 | b2 = tf.get_variable('b2',shape = [1,], 128 | dtype = tf.float32,initializer = b_init, 129 | trainable = False) 130 | hidden = tf.nn.relu(tf.matmul(self.s_,w1) + b1) 131 | a_ = tf.matmul(hidden,w2) + b2 132 | return a_[:,0] 133 | 134 | def build_Critic1(self): 135 | ''' 136 | Building Current Critic network. 
137 | ''' 138 | with tf.variable_scope('Critic/Current'): 139 | w_init = tf.random_normal_initializer(0,0.1) 140 | b_init = tf.constant_initializer(0.1) 141 | 142 | rnn_cell = tf.contrib.rnn.BasicRNNCell(self.n_critic_cells) 143 | self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float64) 144 | s = tf.cast(tf.expand_dims(self.s,axis = 1),tf.float64) 145 | 146 | outputs, self.final_state = tf.nn.dynamic_rnn( 147 | cell = rnn_cell, inputs = s, 148 | initial_state = self.init_state, time_major = True) 149 | cell_out = tf.cast(tf.reshape(outputs, [-1, self.n_critic_cells]),tf.float32) 150 | 151 | a_out = tf.layers.dense(self.a[:,np.newaxis],self.n_critic_cells,trainable = True) 152 | q_sa = tf.layers.dense(cell_out + a_out,1,tf.nn.relu, 153 | kernel_initializer = w_init, 154 | bias_initializer = b_init,trainable = True) 155 | 156 | return q_sa[:,0] 157 | 158 | 159 | def build_Critic2(self): 160 | ''' 161 | Building Target Critic network. 162 | ''' 163 | with tf.variable_scope('Critic/Target'): 164 | w_init = tf.random_normal_initializer(0,0.1) 165 | b_init = tf.constant_initializer(0.1) 166 | 167 | rnn_cell = tf.contrib.rnn.BasicRNNCell(self.n_critic_cells) 168 | self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float64) 169 | s_ = tf.cast(tf.expand_dims(self.s_,axis = 1),tf.float64) 170 | 171 | outputs, self.final_state = tf.nn.dynamic_rnn( 172 | cell = rnn_cell, inputs = s_, 173 | initial_state = self.init_state, time_major = True) 174 | cell_out = tf.cast(tf.reshape(outputs, [-1, self.n_critic_cells]),tf.float32) 175 | 176 | a_out = tf.layers.dense(self.a_[:,np.newaxis],self.n_critic_cells,trainable = False) 177 | q_s_a_ = tf.layers.dense(cell_out + a_out,1,tf.nn.relu, 178 | kernel_initializer = w_init, 179 | bias_initializer = b_init,trainable = False) 180 | 181 | return q_s_a_[:,0] 182 | 183 | def choose_action(self,state): 184 | state = np.reshape(state,[-1,self.n_features]) 185 | action = self.sess.run(self.a,feed_dict = {self.s:state}) 186 | return action 187 | 188 | def store_transition(self,state,action,reward,next_state): 189 | state,next_state = state[np.newaxis,:],next_state[np.newaxis,:] 190 | action,reward = np.array(action),np.array(reward) 191 | action = np.reshape(action,[1,-1]) 192 | reward = np.reshape(reward,[1,-1]) 193 | # is_done = np.reshape(is_done,[1,-1]) 194 | 195 | transition = np.concatenate((state,action,reward,next_state),axis = 1) 196 | self.memory.append(transition[0,:]) 197 | 198 | def learn(self): 199 | if len(self.memory) == self.memory_size: 200 | if self.learn_step_counter % 200 == 0: 201 | self.sess.run((self.replace_a_params,self.replace_c_params)) 202 | 203 | self.noise_var *= 0.999 204 | 205 | batch = np.array(random.sample(self.memory,self.batch_size)) 206 | batch_s = batch[:,:self.n_features] 207 | batch_a = batch[:,self.n_features:(self.n_features + 1)][:,0] 208 | batch_r = batch[:,(self.n_features + 1):(self.n_features + 2)][:,0] 209 | batch_s_ = batch[:,(self.n_features + 2):(self.n_features*2 + 2)] 210 | 211 | self.sess.run(self.actor_train_op,feed_dict = {self.s:batch_s}) 212 | self.sess.run(self.critic_train_op,feed_dict = {self.s:batch_s, 213 | self.a:batch_a, 214 | self.s_:batch_s_, 215 | self.r:batch_r}) 216 | 217 | if __name__ == '__main__': 218 | rdpg = RDPG(5,0,1,0.03,0.01,30,30) -------------------------------------------------------------------------------- /DDPG-forcasting/DDPG_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created 
on Sun May 24 18:32:39 2020 4 | 5 | @author: ChefLiutao 6 | 7 | The agent of RL algorithm Deep Detrministic Policy Gradient. 8 | 9 | Both the Actor and Critic neuron networks adopt three-layer Fully-Connected NN. 10 | """ 11 | import tensorflow as tf 12 | import numpy as np 13 | from collections import deque 14 | import random 15 | 16 | class DDPG(): 17 | def __init__(self, 18 | n_features, 19 | # n_actions, 20 | a_low, 21 | a_high, 22 | learning_rate_actor, 23 | learning_rate_critic, 24 | n_actor_hidden, 25 | n_critic_hidden, 26 | gamma = 0.9, 27 | noise_varience = 3, 28 | soft_replace = 0.1, 29 | memory_size = 1000, 30 | batch_size = 128): 31 | self.n_features = n_features #dimension of states 32 | # self.n_actions = n_actions 33 | self.a_low = a_low #The low bound of action sapce 34 | self.a_high = a_high #The high bound of action space 35 | self.lr_a = learning_rate_actor #Learning rate of Actor NN 36 | self.lr_c = learning_rate_critic #Learning rate of Critic NN 37 | self.n_actor_hidden = n_actor_hidden #Number of hidden layer neurons in Actor 38 | self.n_critic_hidden = n_critic_hidden #Number of hidden layer neurons in Critic 39 | self.gamma = gamma #Reward discount rate 40 | self.noise_var = noise_varience #Variance of output action distribution 41 | self.soft_replace = soft_replace #Update speed of target networks 42 | self.memory_size = memory_size #Size of experience replay buffer 43 | self.memory = deque(maxlen = self.memory_size) #Experience replay buffer 44 | self.batch_size = batch_size 45 | 46 | self.s = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 47 | self.s_ = tf.placeholder(dtype = tf.float32,shape = [None,self.n_features]) 48 | self.r = tf.placeholder(dtype = tf.float32,shape = [None,]) 49 | self.done = tf.placeholder(dtype = tf.float32,shape = [None,]) # 0 if s_ == terminal else 1 50 | 51 | self.a = self.build_Actor1() 52 | self.a_ = self.build_Actor2() 53 | self.q_sa = self.build_Critic1() #shape:[None,] 54 | self.q_s_a_ = self.build_Critic2() #shape:[None,] 55 | 56 | self.curr_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 57 | scope = 'Actor/Current') 58 | self.targ_a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 59 | scope = 'Actor/Target') 60 | self.curr_c_params= tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 61 | scope = 'Critic/Current') 62 | self.targ_c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 63 | scope = 'Critic/Target') 64 | 65 | # Soft replace of Targets NN parameters 66 | self.replace_a_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 67 | for (t,e) in zip(self.targ_a_params,self.curr_a_params)] 68 | self.replace_c_params = [tf.assign(t,(1-self.soft_replace)*t + self.soft_replace*e) \ 69 | for (t,e) in zip(self.targ_c_params,self.curr_c_params)] 70 | 71 | self.td_error = self.r + self.gamma*self.q_s_a_ - self.q_sa 72 | self.critic_loss = tf.reduce_mean(tf.square(self.td_error)) 73 | self.actor_loss = -tf.reduce_mean(self.q_sa) 74 | 75 | self.actor_train_op = tf.train.AdamOptimizer(self.lr_a).minimize(self.actor_loss, 76 | var_list = self.curr_a_params) 77 | self.critic_train_op = tf.train.AdamOptimizer(self.lr_c).minimize(self.critic_loss, 78 | var_list = self.curr_c_params) 79 | 80 | self.learn_step_counter = 0 81 | self.sess = tf.Session() 82 | self.sess.run(tf.global_variables_initializer()) 83 | 84 | 85 | def build_Actor1(self): 86 | ''' 87 | Building Current Actor network. 
88 | ''' 89 | with tf.variable_scope('Actor/Current'): 90 | w_init = tf.random_normal_initializer(0,0.1) 91 | b_init = tf.constant_initializer(0.1) 92 | w1 = tf.get_variable(name = 'w1',shape = [self.n_features,self.n_actor_hidden], 93 | dtype = tf.float32,initializer = w_init, 94 | trainable = True) 95 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 96 | dtype = tf.float32,initializer = b_init, 97 | trainable = True) 98 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 99 | dtype = tf.float32,initializer = w_init, 100 | trainable = True) 101 | b2 = tf.get_variable('b2',shape = [1,], 102 | dtype = tf.float32,initializer = b_init, 103 | trainable = True) 104 | hidden = tf.nn.relu(tf.matmul(self.s,w1) + b1) 105 | a = tf.matmul(hidden,w2) + b2 106 | return a[:,0] 107 | # return np.clip(np.random.normal(a,self.noise_var),self.a_low,self.a_high) 108 | 109 | def build_Actor2(self): 110 | ''' 111 | Building Target Actor network. 112 | ''' 113 | with tf.variable_scope('Actor/Target'): 114 | w_init = tf.random_normal_initializer(0,0.1) 115 | b_init = tf.constant_initializer(0.1) 116 | w1 = tf.get_variable('w1',shape = [self.n_features,self.n_actor_hidden], 117 | dtype = tf.float32,initializer = w_init, 118 | trainable = False) 119 | b1 = tf.get_variable('b1',shape = [self.n_actor_hidden,], 120 | dtype = tf.float32,initializer = b_init, 121 | trainable = False) 122 | w2 = tf.get_variable('w2',shape = [self.n_actor_hidden,1], 123 | dtype = tf.float32,initializer = w_init, 124 | trainable = False) 125 | b2 = tf.get_variable('b2',shape = [1,], 126 | dtype = tf.float32,initializer = b_init, 127 | trainable = False) 128 | hidden = tf.nn.relu(tf.matmul(self.s_,w1) + b1) 129 | a_ = tf.matmul(hidden,w2) + b2 130 | return a_[:,0] 131 | 132 | def build_Critic1(self): 133 | ''' 134 | Building Current Critic network. 135 | ''' 136 | with tf.variable_scope('Critic/Current'): 137 | w_init = tf.random_normal_initializer(0,0.1) 138 | b_init = tf.constant_initializer(0.1) 139 | w1_s = tf.get_variable('w1_s',shape = [self.n_features,self.n_critic_hidden], 140 | dtype = tf.float32,initializer = w_init, 141 | trainable = True) 142 | w1_a = tf.get_variable('w1_a',shape = [1,self.n_critic_hidden], 143 | dtype = tf.float32,initializer = w_init, 144 | trainable = True) 145 | b1 = tf.get_variable('b1',shape = [self.n_critic_hidden,], 146 | dtype = tf.float32,initializer = b_init, 147 | trainable = True) 148 | w2 = tf.get_variable('w2',shape = [self.n_critic_hidden,1], 149 | dtype = tf.float32,initializer = w_init, 150 | trainable = True) 151 | b2 = tf.get_variable('b2',shape = [1,],dtype = tf.float32, 152 | initializer = b_init,trainable = True) 153 | hidden = tf.nn.relu(tf.matmul(self.s,w1_s) + tf.matmul(self.a[:,np.newaxis],w1_a) + b1) 154 | q_sa = tf.matmul(hidden,w2) + b2 155 | return q_sa[:,0] 156 | 157 | def build_Critic2(self): 158 | ''' 159 | Building Target Critic network. 
160 | ''' 161 | with tf.variable_scope('Critic/Target'): 162 | w_init = tf.random_normal_initializer(0,0.1) 163 | b_init = tf.constant_initializer(0.1) 164 | w1_s = tf.get_variable('w1_s',shape = [self.n_features,self.n_critic_hidden], 165 | dtype = tf.float32,initializer = w_init, 166 | trainable = False) 167 | w1_a = tf.get_variable('w1_a',shape = [1,self.n_critic_hidden], 168 | dtype = tf.float32,initializer = w_init, 169 | trainable = False) 170 | b1 = tf.get_variable('b1',shape = [self.n_critic_hidden,], 171 | dtype = tf.float32,initializer = b_init, 172 | trainable = False) 173 | w2 = tf.get_variable('w2',shape = [self.n_critic_hidden,1], 174 | dtype = tf.float32,initializer = w_init, 175 | trainable = False) 176 | b2 = tf.get_variable('b2',shape = [1,],dtype = tf.float32, 177 | initializer = b_init,trainable = True) 178 | hidden = tf.nn.relu(tf.matmul(self.s_,w1_s) + tf.matmul(self.a_[:,np.newaxis],w1_a) + b1) 179 | q_s_a_ = tf.matmul(hidden,w2) + b2 180 | return q_s_a_[:,0] 181 | 182 | def choose_action(self,state): 183 | state = np.reshape(state,[-1,self.n_features]) 184 | action = self.sess.run(self.a,feed_dict = {self.s:state}) 185 | return action 186 | 187 | def store_transition(self,state,action,reward,next_state): 188 | state,next_state = state[np.newaxis,:],next_state[np.newaxis,:] 189 | action,reward = np.array(action),np.array(reward) 190 | action = np.reshape(action,[1,-1]) 191 | reward = np.reshape(reward,[1,-1]) 192 | # is_done = np.reshape(is_done,[1,-1]) 193 | 194 | transition = np.concatenate((state,action,reward,next_state),axis = 1) 195 | self.memory.append(transition[0,:]) 196 | 197 | def learn(self): 198 | if len(self.memory) == self.memory_size: 199 | if self.learn_step_counter % 200 == 0: 200 | self.sess.run((self.replace_a_params,self.replace_c_params)) 201 | 202 | self.noise_var *= 0.999 203 | 204 | batch = np.array(random.sample(self.memory,self.batch_size)) 205 | batch_s = batch[:,:self.n_features] 206 | batch_a = batch[:,self.n_features:(self.n_features + 1)][:,0] 207 | batch_r = batch[:,(self.n_features + 1):(self.n_features + 2)][:,0] 208 | batch_s_ = batch[:,(self.n_features + 2):(self.n_features*2 + 2)] 209 | 210 | self.sess.run(self.actor_train_op,feed_dict = {self.s:batch_s}) 211 | self.sess.run(self.critic_train_op,feed_dict = {self.s:batch_s, 212 | self.a:batch_a, 213 | self.s_:batch_s_, 214 | self.r:batch_r}) 215 | if __name__ == '__main__': 216 | ddpg = DDPG(5,0,1,0.03,0.01,30,30) --------------------------------------------------------------------------------
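A quick way to see what the shared preprocessing step produces is to run `build_s_a` and `normalization` (defined identically in both `DataPreprocessing.py` files) on a toy sequence. The script below is a minimal sketch, not part of the repository: the toy series, the window size `n = 3` and the horizon `m = 1` are illustrative assumptions, and it assumes it is run from inside `DDPG-forcasting/` or `RDPG-forecasting/` so that `DataPreprocessing` can be imported.

```python
# Minimal sketch (illustrative, not part of the repository).
# Assumes DataPreprocessing.py is importable from the current directory.
import numpy as np
from DataPreprocessing import build_s_a, normalization

series = np.arange(10, dtype=float)   # toy time series: 0, 1, ..., 9 (assumed example data)
n, m = 3, 1                           # 3 historical points per state, 1-step-ahead target

state_mat, best_action = build_s_a(series, n, m)
# Each row of state_mat holds n consecutive points; best_action is the value m steps later.
print(state_mat.shape, best_action.shape)   # (7, 3) (7,)
print(state_mat[0], best_action[0])         # [0. 1. 2.] 3.0

# Split and scale the way main.py does: the state matrix goes through MinMaxScaler
# (fit on the training part only), while the target/action is scaled manually.
split = round(len(state_mat) * 0.75)
train_s, test_s = state_mat[:split], state_mat[split:]
train_a, test_a = best_action[:split], best_action[split:]

train_s_scaled, test_s_scaled = normalization(train_s, test_s)
A, B = train_a.max(), train_a.min()
train_a_scaled, test_a_scaled = (train_a - B) / (A - B), (test_a - B) / (A - B)
print(train_s_scaled.min(), train_s_scaled.max())   # 0.0 1.0 on the training window
```

Scaling the targets into [0, 1] this way is what keeps the `A_LOW = 0` / `A_HIGH = 1` action bounds used in the `main.py` scripts consistent with the values the actor is asked to predict.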