├── README.md
├── paper
│   ├── 07967695.pdf
│   ├── 08417385.pdf
│   ├── Three_ADP.pdf
│   ├── action_network.pdf
│   └── raceface.pdf
└── project
    ├── cart-pole_example.py
    ├── model.py
    └── model.py.bak

/README.md:
--------------------------------------------------------------------------------
# Adaptive-Dynamic-programming
Adaptive dynamic programming based on the papers:
- [A three-network architecture for on-line learning](https://www.sciencedirect.com/science/article/pii/S0925231211004760)
- [Learning Without External Reward](https://ieeexplore.ieee.org/document/8417385)

Install:
```
python 3.7, tensorflow (1.x), gym
```
Run:
```
python ./project/model.py
```

--------------------------------------------------------------------------------
/paper/07967695.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yushinliu/Adaptive-Dynamic-programming/b64ce818148973ebf89f492172e7ed64c69c37f8/paper/07967695.pdf
--------------------------------------------------------------------------------
/paper/08417385.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yushinliu/Adaptive-Dynamic-programming/b64ce818148973ebf89f492172e7ed64c69c37f8/paper/08417385.pdf
--------------------------------------------------------------------------------
/paper/Three_ADP.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yushinliu/Adaptive-Dynamic-programming/b64ce818148973ebf89f492172e7ed64c69c37f8/paper/Three_ADP.pdf
--------------------------------------------------------------------------------
/paper/action_network.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yushinliu/Adaptive-Dynamic-programming/b64ce818148973ebf89f492172e7ed64c69c37f8/paper/action_network.pdf
--------------------------------------------------------------------------------
/paper/raceface.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yushinliu/Adaptive-Dynamic-programming/b64ce818148973ebf89f492172e7ed64c69c37f8/paper/raceface.pdf
--------------------------------------------------------------------------------
/project/cart-pole_example.py:
--------------------------------------------------------------------------------
import gym
env = gym.make('CartPole-v0')
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
--------------------------------------------------------------------------------
/project/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import os
import gym

#hyper-parameters definition
GAMMA=1
U_C=0
MAX_RUN=100
N_g,N_c,N_a=2,2,2
e_g,e_c,e_a=-1,0,0

"""
Goal network
"""
class goal_network(object):
    def __init__(self,sess,w_initializer=tf.contrib.layers.xavier_initializer(),b_initializer=tf.zeros_initializer()):
        self.sess=sess
        self.w_initializer=w_initializer
        self.b_initializer=b_initializer
        self.learning_rate=0.1

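        # The goal network is the third network of the three-network ADP
        # architecture: it maps the current state and action to an internal
        # reinforcement signal s in (0, 1) that the critic uses in place of an
        # external reward. `observ` and `action_input` refer to the global
        # placeholders created under the "Input" scope in the __main__ block.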
        with tf.variable_scope("goal_net"):
            """
            weights_1= tf.get_variable("gn_01_w",[5,5],initializer=self.w_initializer) #dimensions : [input layer, hidden_layer]
            bias_1 = tf.get_variable("gn_01_b",[5],initializer=self.b_initializer) #dimensions : [hidden_layer]
            tensor=tf.add(tf.matmul(self.g_input,weights_1),bias_1)
            tensor=tf.nn.relu(tensor) #dont know, assume it is relu
            weights_2= tf.get_variable("gn_02_w",[5,1],initializer=self.w_initializer) #dimensions : [hidden_layer,output_layer]
            bias_2 = tf.get_variable("gn_02_b",[1],initializer=self.b_initializer) #dimensions : [output_layer]
            tensor=tf.add(tf.matmul(tensor,weights_2),bias_2)
            self.s_now=tf.nn.sigmoid(tensor)
            """
            self.g_input = tf.concat([observ, action_input], axis=1)
            hidden = tf.layers.dense(self.g_input, 5, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer, name="l1", activation=tf.nn.relu)
            self.s = tf.layers.dense(hidden, 1, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer, name="l2", activation=tf.nn.sigmoid)
        print("goal network init finish")

    def cal_loss(self,J_now,J_last,reward,gamma=GAMMA):
        loss=np.mean(0.5*(gamma*J_now-(J_last-reward))**2)
        return loss

    def update_gradient(self,pass_gradients):
        self.goal_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='goal_net')
        self.goal_grads = tf.gradients(ys=self.s, xs=self.goal_params, grad_ys=pass_gradients)
        opt = tf.train.AdamOptimizer(self.learning_rate) # (- learning rate) for ascent policy
        self.train_op = opt.apply_gradients(zip(self.goal_grads, self.goal_params))

    def train(self,action,observation):
        _,signal=self.sess.run([self.train_op,self.s],feed_dict={observ:observation,action_input:action})
        return signal

    def test(self):
        #for var in tf.trainable_variables():
        #    print(var.name)
        g_var= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='goal_net')
        var=self.sess.run(g_var)
        #print(var)

"""
Critic network
"""
class critic_network(object):
    def __init__(self,sess,s_now,a_now,w_initializer=tf.contrib.layers.xavier_initializer(),b_initializer=tf.zeros_initializer(),gamma=GAMMA):
        self.sess=sess
        self.J_last=tf.placeholder(tf.float32,[1,1],name="J_last")
        self.s_now=s_now
        self.a_now=a_now
        self.w_initializer=w_initializer
        self.b_initializer=b_initializer
        self.learning_rate=0.1
        with tf.variable_scope("critic_net"):
            self.g_input=tf.concat([observ,a_now],axis=1)
            self.g_input=tf.concat([self.g_input,s_now],axis=1)
            """
            weights_1= tf.get_variable("gn_01_w",[5,5],initializer=self.w_initializer) #dimensions : [input layer, hidden_layer]
            bias_1 = tf.get_variable("gn_01_b",[5],initializer=self.b_initializer) #dimensions : [hidden_layer]
            tensor=tf.add(tf.matmul(self.g_input,weights_1),bias_1)
            tensor=tf.nn.relu(tensor) #dont know, assume it is relu
            weights_2= tf.get_variable("gn_02_w",[5,1],initializer=self.w_initializer) #dimensions : [hidden_layer,output_layer]
            bias_2 = tf.get_variable("gn_02_b",[1],initializer=self.b_initializer) #dimensions : [output_layer]
            tensor=tf.add(tf.matmul(tensor,weights_2),bias_2)
            self.s_now=tf.nn.sigmoid(tensor)
            """
            hidden = tf.layers.dense(self.g_input, 5, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer, name="l1", activation=tf.nn.relu)
            self.J_now = tf.layers.dense(hidden, 1, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer, name="l2", activation=tf.nn.sigmoid)
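        # The critic approximates the cost-to-go J(t) from (state, action, s(t)).
        # The loss below is the squared TD-style error
        #     0.5 * (gamma*J(t) - (J(t-1) - s(t)))**2,
        # i.e. it pushes the estimates toward J(t-1) ~ s(t) + gamma*J(t), with the
        # goal network's signal s(t) playing the role of the (internal) reward.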
        self.loss=tf.reduce_mean(0.5*tf.squared_difference(gamma*self.J_now,(self.J_last-s_now)))
        self.critic_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic_net')
        self.c2a_grads = tf.gradients(ys=self.J_now, xs=a_now)
        self.c2g_grads = tf.gradients(ys=self.J_now, xs=s_now)
        opt = tf.train.AdamOptimizer(self.learning_rate) # (- learning rate) for ascent policy
        self.train_op = opt.minimize(self.loss)
        print("critic network init finish")

    def train(self,J_last,action,signal,observation):
        _,value,Loss=self.sess.run([self.train_op,self.J_now,self.loss],feed_dict={observ:observation,action_input:action,self.J_last:J_last})
        return value,Loss

    def test(self):
        #for var in tf.trainable_variables():
        #    print(var.name)
        c_var= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic_net')
        var=self.sess.run(c_var)
        #print(var)

"""
Action network
"""
class action_network(object):
    def __init__(self,sess,w_initializer=tf.contrib.layers.xavier_initializer(),b_initializer=tf.zeros_initializer()):
        self.sess=sess
        self.w_initializer=w_initializer
        self.b_initializer=b_initializer
        self.learning_rate=0.1
        with tf.variable_scope("action_net"):
            """
            weights_1= tf.get_variable("gn_01_w",[5,5],initializer=self.w_initializer) #dimensions : [input layer, hidden_layer]
            bias_1 = tf.get_variable("gn_01_b",[5],initializer=self.b_initializer) #dimensions : [hidden_layer]
            tensor=tf.add(tf.matmul(self.g_input,weights_1),bias_1)
            tensor=tf.nn.relu(tensor) #dont know, assume it is relu
            weights_2= tf.get_variable("gn_02_w",[5,1],initializer=self.w_initializer) #dimensions : [hidden_layer,output_layer]
            bias_2 = tf.get_variable("gn_02_b",[1],initializer=self.b_initializer) #dimensions : [output_layer]
            tensor=tf.add(tf.matmul(tensor,weights_2),bias_2)
            self.s_now=tf.nn.sigmoid(tensor)
            """
            hidden = tf.layers.dense(observ, 5, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer, name="l1", activation=tf.nn.relu)
            self.a = tf.layers.dense(hidden, 1, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer, name="l2", activation=tf.nn.sigmoid)
        print("action network init finish")

    def cal_loss(self,J_now,U_c=U_C):
        loss=np.mean(0.5*(J_now-U_c)**2)
        return loss

    def update_gradient(self,pass_gradients):
        self.action_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='action_net')
        self.action_grads = tf.gradients(ys=self.a, xs=self.action_params, grad_ys=pass_gradients)
        opt = tf.train.AdamOptimizer(self.learning_rate) # (- learning rate) for ascent policy
        self.train_op = opt.apply_gradients(zip(self.action_grads, self.action_params))

    def train(self,observation,action):
        _,action=self.sess.run([self.train_op,self.a],feed_dict={observ:observation,action_input:action})
        return action

    def test(self):
        #for var in tf.trainable_variables():
        #    print(var.name)
        a_var= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='action_net')
        var=self.sess.run(a_var)
        #print(var)


if __name__ == "__main__":
    #initialize the openai env
    env = gym.make('CartPole-v0')
    env.reset()
    random_episodes = 0
    reward_sum = 0
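
    # Warm-up: run one episode with random actions so that `observation` holds
    # a valid CartPole state before the networks are built and training starts.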
    while random_episodes < 1:
        env.render()
        observation, reward, done, info = env.step(np.random.randint(0,2))
        reward_sum += reward
        if done:
            random_episodes += 1
            print("Reward for this episode was:",reward_sum)
            reward_sum = 0
            observation = env.reset()


    action_lst,signal_lst,value_lst=[],[],[0,0]
    #set up input tensors
    with tf.variable_scope("Input"):
        observ=tf.placeholder(tf.float32,[1,4],name="observ")
        action_input=tf.placeholder(tf.float32,[1,1],name="action_input")
    #set up agent tensor graph
    sess=tf.Session()
    goal_net=goal_network(sess)
    action_net=action_network(sess)
    critic_net=critic_network(sess,s_now=goal_net.s,a_now=action_net.a)
    goal_net.update_gradient(critic_net.c2g_grads)
    action_net.update_gradient(critic_net.c2a_grads)
    sess.run(tf.global_variables_initializer())

    #start training
    action = np.array(env.action_space.sample()).reshape(1,1)
    epoch=0
    done=0
    while epoch < MAX_RUN:
        cyc_g, cyc_c, cyc_a = 0, 0, 0
        loss_goal, loss_critic, loss_action = 1, 1, 1
        observation=observation.reshape(1, 4)
        action = np.array(action).reshape(1, 1)
        #tuning the goal network
        while cyc_g < N_g and loss_goal > e_g:
            signal=goal_net.train(action,observation)
            loss_goal=goal_net.cal_loss(value_lst[-1],value_lst[-2],int(done))
            cyc_g+=1
            #print("goal ",loss_goal)
        # signal_lst.append(signal)
        #tuning the critic network
        value_tmp_lst=[]
        value_tmp_lst.append(value_lst[-1])
        while cyc_c < N_c and loss_critic > e_c:
            value_last=np.array(value_tmp_lst[-1]).reshape(1,1)
            value_now,loss_critic=critic_net.train(value_last,action,signal,observation)
            value_tmp_lst.append(value_now)
            cyc_c+=1
            #print("critic ",loss_critic)
        value_lst.append(value_now)
        #tuning the action network
        while cyc_a < N_a and loss_action > e_a:
            action=action_net.train(observation,action)
            loss_action=action_net.cal_loss(value_lst[-1])
            cyc_a+=1
            #print("action: ",loss_action)
        action=int(np.round(action)[0][0])
        env.render()
        observation, reward, done, info = env.step(action)
        if done:
            observation = env.reset()
        epoch+=1
        print("Epoch :",epoch," action :",action," observation :",observation," done :",done)
--------------------------------------------------------------------------------
/project/model.py.bak:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import os
import gym

#hyper-parameters definition
MAX_RUN=5
N_g,N_c,N_g=5,5,5

"""
Goal network
"""
class goal_network(object):
    def __init__(self,sess,w_initializer=tf.contrib.layers.xavier_initializer(),b_initializer=tf.zeros_initializer()):
        self.sess=sess
        self.w_initializer=w_initializer
        self.b_initializer=b_initializer
        self.a_last=tf.placeholder(tf.float32,[1,1],name="input_x")
        self.learning_rate=0.1
        # implement goal network
        self.g_input=tf.concat([observ,self.a_last],axis=0)
        with tf.variable_scope("goal_net"):
            """
            weights_1= tf.get_variable("gn_01_w",[5,5],initializer=self.w_initializer) #dimensions : [input layer, hidden_layer]
            bias_1 = tf.get_variable("gn_01_b",[5],initializer=self.b_initializer) #dimensions : [hidden_layer]
            tensor=tf.add(tf.matmul(self.g_input,weights_1),bias_1)
            tensor=tf.nn.relu(tensor) #dont know, assume it is relu
            weights_2= tf.get_variable("gn_02_w",[5,1],initializer=self.w_initializer) #dimensions : [hidden_layer,output_layer]
            bias_2 = tf.get_variable("gn_02_b",[1],initializer=self.b_initializer) #dimensions : [output_layer]
            tensor=tf.add(tf.matmul(tensor,weights_2),bias_2)
            self.s_now=tf.nn.sigmoid(tensor)
            """
            hidden= tf.layers.dense(self.g_input, 5, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer,name="l1",activation=tf.nn.relu)
            self.s=tf.layers.dense(hidden, 1, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer,name="l2",activation=tf.nn.sigmoid)
        print("goal network init finish")
        #self.a_now=tf.nn.sigmoid(tensor)
        #self.loss_goal=0.5*tf.squared_difference(self.gamma*self.J_now,(self.J_last-self.reward))
        #self.gradient_goal=tf.gradients(self.loss_goal,[weights_1,bias_1,weights_2,bias_2])
        #self.gradient_action=tf.gradients(self.loss_action,[weights_1,bias_1,weights_2,bias_2])

    def cal_loss(self,J_now,J_last,reward):
        loss=tf.reduced_mean(0.5*tf.squared_difference(self.gamma*self.J_now,(J_last-reward)))
        return self.sess.run(loss)

    def update_gradient(self,pass_gradients):
        self.goal_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='goal_net')
        self.goal_grads = tf.gradients(ys=self.u, xs=self.action_params, grad_ys= pass_gradients)
        opt = tf.train.AdamOptimizer(self.learning_rate) # (- learning rate) for ascent policy
        self.train_op = opt.apply_gradients(zip(self.goal_grads, self.goal_params))

    def train(self,action,observation):
        input=np.concatenate((x,action),axis=0)
        _,signal=self.sess.run([self.train_op,self.s],feed_dict={observ:observation,self.a_last:action})
        return signal

    def test(self):
        #for var in tf.trainable_variables():
        #    print(var.name)
        g_var= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='goal_net')
        var=self.sess.run(g_var)
        #print(var)

"""
Critic network
"""
class critic_network(object):
    def __init__(self,sess,s_now,a_now,w_initializer=tf.contrib.layers.xavier_initializer(),b_initializer=tf.zeros_initializer(),gamma=1):
        self.sess=sess
        self.J_last=tf.placeholder(tf.float32,[1,1],name="V_last")
        self.s_now=s_now
        self.a_now=a_now
        self.w_initializer=w_initializer
        self.b_initializer=b_initializer
        self.learning_rate=0.1
        # implement goal network

        with tf.variable_scope("critic_net"):
            self.g_input=tf.concat([observ,a_now],axis=0)
            self.g_input=tf.concat([self.g_input,s_now],axis=0)
            """
            weights_1= tf.get_variable("gn_01_w",[5,5],initializer=self.w_initializer) #dimensions : [input layer, hidden_layer]
            bias_1 = tf.get_variable("gn_01_b",[5],initializer=self.b_initializer) #dimensions : [hidden_layer]
            tensor=tf.add(tf.matmul(self.g_input,weights_1),bias_1)
            tensor=tf.nn.relu(tensor) #dont know, assume it is relu
            weights_2= tf.get_variable("gn_02_w",[5,1],initializer=self.w_initializer) #dimensions : [hidden_layer,output_layer]
            bias_2 = tf.get_variable("gn_02_b",[1],initializer=self.b_initializer) #dimensions : [output_layer]
            tensor=tf.add(tf.matmul(tensor,weights_2),bias_2)
            self.s_now=tf.nn.sigmoid(tensor)
            """
            hidden= tf.layers.dense(self.g_input, 5, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer,name="l1",activation=tf.nn.relu)
            self.J_now=tf.layers.dense(hidden, 1, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer,name="l2",activation=tf.nn.sigmoid)
        self.loss=tf.reduced_mean(0.5*tf.squared_difference(gamma*self.J_now,(self.J_last-s_now)))
        self.critic_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic_net')
        self.c2a_grads = tf.gradients(ys=self.J_now, xs=a_now)
        self.c2g_grads = tf.gradients(ys=self.J_now, xs=s_now)
        print("critic network init finish")
        #self.a_now=tf.nn.sigmoid(tensor)
        #self.loss_goal=0.5*tf.squared_difference(self.gamma*self.J_now,(self.J_last-self.reward))
        #self.gradient_goal=tf.gradients(self.loss_goal,[weights_1,bias_1,weights_2,bias_2])
        #self.gradient_action=tf.gradients(self.loss_action,[weights_1,bias_1,weights_2,bias_2])

    def train(self,J_last,action,signal,observation):
        opt = tf.train.AdamOptimizer(self.learning_rate) # (- learning rate) for ascent policy
        self.train_op = opt.minimize(self.loss)
        _,J_now=self.sess.run([self.train_op,J_now],feed_dict={observ:observation,self.a_now:action,self.s_now:signal,self.J_last:J_last})
        return J_now

    def test(self):
        #for var in tf.trainable_variables():
        #    print(var.name)
        c_var= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic_net')
        var=self.sess.run(c_var)
        #print(var)

"""
Action network
"""
class action_network(object):
    def __init__(self,sess,w_initializer=tf.contrib.layers.xavier_initializer(),b_initializer=tf.zeros_initializer(),U_c=0,gamma=1):
        self.sess=sess
        self.w_initializer=w_initializer
        self.b_initializer=b_initializer
        self.x=tf.placeholder(tf.float32,[1,4],name="input_x")
        self.learning_rate=0.1
        # implement goal network
        with tf.variable_scope("action_net"):
            """
            weights_1= tf.get_variable("gn_01_w",[5,5],initializer=self.w_initializer) #dimensions : [input layer, hidden_layer]
            bias_1 = tf.get_variable("gn_01_b",[5],initializer=self.b_initializer) #dimensions : [hidden_layer]
            tensor=tf.add(tf.matmul(self.g_input,weights_1),bias_1)
            tensor=tf.nn.relu(tensor) #dont know, assume it is relu
            weights_2= tf.get_variable("gn_02_w",[5,1],initializer=self.w_initializer) #dimensions : [hidden_layer,output_layer]
            bias_2 = tf.get_variable("gn_02_b",[1],initializer=self.b_initializer) #dimensions : [output_layer]
            tensor=tf.add(tf.matmul(tensor,weights_2),bias_2)
            self.s_now=tf.nn.sigmoid(tensor)
            """
            hidden= tf.layers.dense(observ, 5, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer,name="l1",activation=tf.nn.relu)
            self.a=tf.layers.dense(hidden, 1, kernel_initializer=self.w_initializer, bias_initializer=self.b_initializer,name="l2",activation=tf.nn.sigmoid)
        print("goal network init finish")
        #self.a_now=tf.nn.sigmoid(tensor)
        #self.loss_goal=0.5*tf.squared_difference(self.gamma*self.J_now,(self.J_last-self.reward))
        #self.gradient_goal=tf.gradients(self.loss_goal,[weights_1,bias_1,weights_2,bias_2])
        #self.gradient_action=tf.gradients(self.loss_action,[weights_1,bias_1,weights_2,bias_2])

    def cal_loss(self,J_now,J_last,reward):
        loss=tf.reduced_mean(0.5*tf.squared_difference(self.gamma*self.J_now,(J_last-reward)))
        return self.sess.run(loss)

    def update_gradient(self,pass_gradients):
        self.action_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='action_net')
        self.action_grads = tf.gradients(ys=self.a, xs=self.action_params, grad_ys= pass_gradients)
        opt = tf.train.AdamOptimizer(self.learning_rate) # (- learning rate) for ascent policy
        self.train_op = opt.apply_gradients(zip(self.action_grads, self.action_params))

    def train(self,observation):
        self.train_op = opt.apply_gradients(zip(self.action_grads, self.action_params))
        _,action=self.sess.run([self.train_op,self.a],feed_dict={observ:observation})
        return action

    def test(self):
        #for var in tf.trainable_variables():
        #    print(var.name)
        a_var= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='action_network')
        var=self.sess.run(a_var)
        #print(var)


if __name__ == "__main__":
    env = gym.make('CartPole-v0')

    env.reset();
    random_episodes = 0
    reward_sum = 0
    while random_episodes < 1:
        env.render()
        observation, reward, done, action = env.step(np.random.randint(0,2))
        reward_sum += reward
        if done:
            random_episodes += 1
            print("Reward for this episode was:",reward_sum)
            reward_sum = 0
            env.reset()


    action_lst,signal_lst,value_lst=[],[],[]
    cyc_g,cyc_c,cyc_a=0,0,0
    value_lst.append(0)
    #
    with tf.variable_scope("Input"):
        observ=tf.placeholder(tf.float32,[4,1],name="observ")
    sess=tf.Session()
    goal_net=goal_network(sess)
    action_net=action_network(sess)
    critic_net=critic_network(sess,s_now=goal_net.s,a_now=action_net.a)
    goal_net.update_gradients(critic_net.c2g_grads)
    action_net.update_gradients(critic_net.c2a_grads)
    sess.run(tf.initialize_all_variables())
    observation=observation.reshape(4,1)
    for epoch in range(MAX_RUN):
        while cyc_g < N_g:
            signal=goal_net.train(action,observation)
            cyc_g+=1
        # signal_lst.append(signal)
        while cyc_c < N_c:
            value_now=critic_net.train(value_lst[-1],action,signal,observation)
            cyc_c+=1
        while cyc_a < N_a:
            action=action_net.train(observation)
            cyc_a+=1
        print("the next action :" ,action)
        observation, reward, done, info = env.step(action)
        print("observation :", observation)
        print("reward :", reward)
        print("info :", info)
--------------------------------------------------------------------------------