├── Alpha-Zero
│   ├── 21P
│   │   ├── game.py
│   │   ├── mcts.py
│   │   ├── network.py
│   │   ├── player.py
│   │   └── train.py
│   ├── Ani-Chess
│   │   ├── game.py
│   │   ├── mcts.py
│   │   ├── network.py
│   │   ├── player.py
│   │   └── train.py
│   └── FiveAI
│       ├── game.py
│       ├── mcts_alphaZero.py
│       ├── policy_value_net_tensorflow.py
│       └── train.py
├── README.md
├── assets
│   └── 学习线路.png
├── double_DQN & dueling_DQN.py
├── theading_demo.py
├── tutorial_A3C.py
├── tutorial_AC.py
├── tutorial_DDPG.py
├── tutorial_DPPO.py
├── tutorial_DQN.py
├── tutorial_DQN_variants.py
├── tutorial_PG.py
├── tutorial_PPO.py
├── tutorial_Qlearning.py
└── tutorial_TD3.py

/Alpha-Zero/21P/game.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import copy
4 | from player import MCTSPlayer, Player, Human_Player, Pure_MCTS_Player
5 | 
6 | def one_hot(x):
7 |     arr = np.zeros(12)
8 |     if x != -1:
9 |         arr[x] = 1
10 |     return arr
11 | 
12 | def one_hot_to_fig(arr):
13 |     for i in range(arr.shape[0]):
14 |         if arr[i] == 1:
15 |             break
16 |     return i
17 | 
18 | def count_one(arr,x):
19 |     z = 0
20 |     for a in arr:
21 |         if a == x:
22 |             z+=1
23 |     return z
24 | 
25 | def softmax(x):
26 |     probs = np.exp(x - np.max(x))
27 |     probs /= np.sum(probs)
28 |     return probs
29 | 
30 | def get_avail_act(arr):
31 |     avail_act_list = []
32 |     for i in range(len(arr)):
33 |         if arr[i]==1:
34 |             avail_act_list.append(i)
35 |     return avail_act_list
36 | 
37 | class Blackjack():
38 |     def __init__(self,game_state,policy_value_net):
39 |         self.state = game_state
40 |         self.figures = self.init_figures()
41 |         self.buffer_value = []
42 |         self.policy_value_net = policy_value_net
43 |         self.p1 = MCTSPlayer(self.state,'p1',self.policy_value_net.policy_value_fn,n_playout=100,is_selfplay=1) # used for training (self-play)
44 |         self.p2 = MCTSPlayer(self.state,'p2',self.policy_value_net.policy_value_fn,n_playout=1000,is_selfplay=0) # used for actual play
45 |         self.human = Human_Player('human')
46 |         self.random_player = Player('random')
47 |         self.pure_tree_playre = Pure_MCTS_Player(self.state,'pure_tree',self.policy_value_net.policy_value_fn,n_playout=1000,is_selfplay=0)
48 | 
49 |     # Initialize the number pool and the available-number mask
50 |     def init_figures(self):
51 |         figures = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
52 |         for _ in range(2):
53 |             figures.append(random.randint(1, 10))
54 |         self.figures = np.array(figures)
55 |         self.availabel_figures = np.ones(12)
56 |         self.state.update_current_state(self.figures,self.availabel_figures,p1_num=False,p2_num=False,p1_choi=-1,p2_choi=-1)
57 | 
58 |     # Initialize each player's starting number
59 |     def init_player_figures(self):
60 |         a = random.randint(15, 21)
61 |         b = random.randint(19, 27)
62 |         # The two numbers must not sum to more than 21*2
63 |         if a + b >= 21 * 2:
64 |             a = a - (a + b - 21 * 2) / 2
65 |             b = b - (a + b - 21 * 2) / 2
66 |             a = int(a) - 1
67 |             b = int(b) - 1
68 |         # The two numbers must sum to an odd number
69 |         if (a + b) % 2 == 0:
70 |             b = b - 1
71 |         # Randomly assign them to P1 and P2
72 |         if random.random() >= .5:
73 |             self.p1_num = a
74 |             self.p2_num = b
75 |         else:
76 |             self.p1_num = b
77 |             self.p2_num = a
78 |         self.state.current_state[2]=self.p1_num
79 |         self.state.current_state[3]=self.p2_num
80 |         return self.p1_num, self.p2_num
81 | 
82 | 
83 |     def who_first(self):
84 |         if self.p1.num >= self.p2.num:
85 |             return self.p1,self.p2
86 |         else:
87 |             return self.p2,self.p1
88 | 
89 | 
90 |     def get_winner(self):
91 |         if count_one(self.state.current_state[1],1)<=2:
92 |             if self.state.current_state[2][0]<=21 and self.state.current_state[3][0]<=21:
93 |                 if self.state.current_state[2][0]>=self.state.current_state[3][0]:
94 |                     winner = 0
95 |                 else:
96 |                     winner = 1
97 |             elif self.state.current_state[3][0]>21:
98 |                 winner = 0
99 |             else:
100 |                 winner = 1
101 |         return winner
102 | 
103 |     # Start a game (against the pure-tree player)
104 |     def start_game(self):
105 |         print('=========START GAME==========')
106 |         self.init_figures() # initialize the number pool
107 |         self.init_player_figures() # initialize both players' numbers
108 |         self.state.save_current_state()
109 | 
110 |         #print(self.state.current_state)
111 |         for i in range(5):
112 |             #=====print the state=====
113 |             print('********ROUND %i*********'%(i+1))
114 |             print(self.state.current_state[0])
115 |             print(self.state.current_state[1])
116 |             print(self.state.current_state[2])
117 |             print(self.state.current_state[3])
118 | 
119 |             if self.state.current_state[2][0]>self.state.current_state[3][0]: # if row 3 is greater than row 4, the pure-tree player moves first
120 |                 act, num = self.pure_tree_playre.get_action(self.state.current_state) # pure-tree player chooses
121 |                 #act, num = self.random_player.get_action(self.state.current_state) # random player chooses
122 |                 self.state.do_move(act)
123 |                 print('PTreePlayer Select No.%i fig: %i '%(act,self.state.current_state[0][act-1]))
124 | 
125 |                 act_2nd, num_2nd = self.p2.get_action(self.state.current_state) # MCTS player chooses
126 |                 self.state.do_move(act_2nd)
127 |                 print('MCTSPlayer Select No.%i fig: %i '%(act_2nd,self.state.current_state[0][act_2nd-1]))
128 |             else:
129 |                 act_2nd, num_2nd = self.p2.get_action(self.state.current_state) # MCTS player chooses
130 |                 self.state.do_move(act_2nd)
131 |                 print('MCTSPlayer Select No.%i fig: %i '%(act_2nd,self.state.current_state[0][act_2nd-1]))
132 |                 act, num = self.pure_tree_playre.get_action(self.state.current_state) # pure-tree player chooses
133 |                 #act, num = self.random_player.get_action(self.state.current_state)
134 |                 self.state.do_move(act)
135 |                 print('PTreePlayer Select No.%i fig: %i '%(act,self.state.current_state[0][act-1]))
136 | 
137 |             if count_one(self.state.current_state[1],1)<=2: # check whether the game is over
138 |                 if self.state.current_state[2][0]<=21 and self.state.current_state[3][0]<=21: # if neither exceeds 21, the larger total wins
139 |                     if self.state.current_state[2][0]>=self.state.current_state[3][0]:
140 |                         winner = 0 # pure tree
141 |                     else:
142 |                         winner = 1 # MCTS
143 |                 elif self.state.current_state[3][0]>21:
144 |                     winner = 0 # pure tree
145 |                 else:
146 |                     winner = 1 # MCTS
147 | 
148 |         return winner
149 | 
150 |     # Start a game (against a human player)
151 |     def start_game_human(self):
152 |         print('=========START GAME==========')
153 |         self.init_figures()
154 |         self.init_player_figures()
155 |         self.state.save_current_state()
156 | 
157 |         #print(self.state.current_state)
158 |         for i in range(5):
159 |             print('********ROUND %i*********'%(i+1))
160 |             num_list = []
161 |             for i in range(12):
162 |                 if self.state.current_state[1][i]==1:
163 |                     num_list.append(int(self.state.current_state[0][i]))
164 |                 else:
165 |                     num_list.append(0)
166 |             print('数字列表: : ', num_list)
167 |             print('行动列表: : ', list(range(1,13)))
168 |             print('your number: ', self.state.current_state[2][0])
169 |             print('oppo number: ', self.state.current_state[3][0])
170 |             if self.state.current_state[2][0]>self.state.current_state[3][0]: # p1 moves first; p1 is the random player or the human
171 |                 act, num = self.human.get_action(self.state.current_state)
172 |                 self.state.do_move(act)
173 |                 print('你的选择:[%i] 数字: [%i] '%(act+1,self.state.current_state[0][act]))
174 | 
175 |                 act_2nd, num_2nd = self.p2.get_action(self.state.current_state)
176 |                 self.state.do_move(act_2nd)
177 |                 print('对手选择:[%i] 数字: [%i] '%(act_2nd+1,self.state.current_state[0][act_2nd]))
178 |             else:
179 |                 act_2nd, num_2nd = self.p2.get_action(self.state.current_state)
180 |                 self.state.do_move(act_2nd)
181 |                 print('对手选择:[%i] 数字: [%i] '%(act_2nd+1,self.state.current_state[0][act_2nd]))
182 |                 act, num = 
self.human.get_action(self.state.current_state) 183 | self.state.do_move(act) 184 | print('你的选择:[%i] 数字: [%i] '%(act+1,self.state.current_state[0][act])) 185 | 186 | if count_one(self.state.current_state[1],1)<=2: 187 | if self.state.current_state[2][0]<=21 and self.state.current_state[3][0]<=21: 188 | if self.state.current_state[2][0]>=self.state.current_state[3][0]: 189 | winner = 0 190 | else: 191 | winner = 1 192 | elif self.state.current_state[3][0]>21: 193 | winner = 0 194 | else: 195 | winner = 1 196 | 197 | return winner 198 | 199 | def start_self_play(self): 200 | 201 | states, mcts_probs, current_players, buffer_value = [], [], [],[] 202 | run_down_list = [] 203 | 204 | self.init_figures() # 初始化数字公共数字 205 | self.init_player_figures() # 初始化玩家自己的数字 206 | self.state.save_current_state() # 保存到current_state 207 | 208 | #=====start a selfplay game======= 209 | for _ in range(5): # 进行5轮游戏 210 | #通过state判断谁先手。 211 | if self.state.current_state[2][0] > self.state.current_state[3][0]: 212 | #run_down_list主要记录哪个player先手 213 | run_down_list.append(0) 214 | run_down_list.append(1) 215 | else: 216 | run_down_list.append(1) 217 | run_down_list.append(0) 218 | 219 | #【敲黑板】选择1个动作。这个动作的选择,是根据MCTS模拟获得的。 220 | act1, act1_porbs = self.p1.get_action(self.state.current_state) 221 | self.state.do_move(act1) # 执行动作,并进入下一个state 222 | states.append((copy.copy(self.state.current_state)).reshape(-1,6,12,1).astype('float32')) #加入到states保存,等会拿来训练网络 223 | mcts_probs.append(np.array(act1_porbs).astype('float32')) #把act1_porbs保存,等会拿来训练网络 224 | 225 | #print('======change player========') 226 | act2, act2_porbs = self.p1.get_action(self.state.current_state) 227 | self.state.do_move(act2) 228 | states.append((copy.copy(self.state.current_state)).reshape(-1,6,12,1).astype('float32')) 229 | mcts_probs.append(np.array(act2_porbs).astype('float32')) 230 | 231 | # 经过5轮之后,计算winner 232 | winner = self.get_winner() 233 | if winner == 0: 234 | print('winner: p1') 235 | else: 236 | print('winner: p2') 237 | 238 | # 根据胜负,放入到最后 239 | for p in run_down_list: 240 | if p != winner: 241 | #if p == winner: 242 | buffer_value.append(np.ones(12).astype('float32')) 243 | else: 244 | buffer_value.append((np.ones(12) * (-1)).astype('float32')) 245 | 246 | self.p1.reset_player() 247 | self.p2.reset_player() 248 | 249 | #把state,动作概率,结果返回。 250 | return zip(states, mcts_probs, buffer_value) 251 | 252 | class Game_State(): 253 | def __init__(self): 254 | self.current_state = np.zeros((6, 12)) 255 | self.state_buffer = [] 256 | 257 | #save current state 258 | def save_current_state(self): 259 | self.state_buffer.append(np.copy(self.current_state)) 260 | #print(self.current_state) 261 | 262 | #ini current state 263 | def update_current_state(self,figures,availabel_figures,p1_num,p2_num,p1_choi,p2_choi): 264 | self.current_state[0] = figures #可选数字 265 | self.current_state[1] = availabel_figures # 266 | self.current_state[2] = p1_num * np.ones(12) 267 | self.current_state[3] = p2_num * np.ones(12) 268 | self.current_state[4] = one_hot(-1) 269 | self.current_state[5] = one_hot(-1) 270 | 271 | def do_move(self,act): 272 | #=======先手======= 273 | if count_one(self.current_state[1],1)%2 == 0: 274 | self.current_state[1][act] = 0 275 | #比较两个num谁大,看看谁先选 276 | if self.current_state[2][0]>self.current_state[3][0]: #p1先手 277 | self.current_state[4] = one_hot(act) 278 | else: 279 | self.current_state[5] = one_hot(act) 280 | #self.save_current_state() 281 | 282 | #=======后手======= 283 | else: 284 | self.current_state[1][act] = 0 285 | if 
self.current_state[2][0]>self.current_state[3][0]: #p1先手 286 | #print('++++++++act :',act) 287 | #print('++++++++state:',self.current_state) 288 | self.current_state[5] = one_hot(act) 289 | else: 290 | self.current_state[4] = one_hot(act) 291 | self.cal_state() 292 | self.save_current_state() 293 | 294 | def cal_state(self): 295 | p1_choise = self.current_state[0][np.argwhere(self.current_state[4]==1)[0]] 296 | p2_choise = self.current_state[0][np.argwhere(self.current_state[5]==1)[0]] 297 | p1_num = self.current_state[2][0] 298 | p2_num = self.current_state[3][0] 299 | ab = abs(p1_choise-p2_choise) 300 | if p1_choise>=p2_choise: 301 | p1_num += ab 302 | p2_num -= ab 303 | else: 304 | p1_num -= ab 305 | p2_num += ab 306 | self.current_state[2] = p1_num * np.ones(12) 307 | self.current_state[3] = p2_num * np.ones(12) 308 | self.current_state[4] = np.zeros(12) 309 | self.current_state[5] = np.zeros(12) 310 | 311 | def get_curr_player(self): 312 | #看谁的数字大,大的先 313 | if count_one(self.current_state[4],1)>=1: 314 | return 1 315 | elif count_one(self.current_state[5],1)>=1: 316 | return 0 317 | else: 318 | if self.current_state[2][0]>self.current_state[3][0]: 319 | return 0 320 | else: 321 | return 1 -------------------------------------------------------------------------------- /Alpha-Zero/21P/mcts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | def count_one(arr,x): 5 | z = 0 6 | for a in arr: 7 | if a == x: 8 | z+=1 9 | return z 10 | 11 | def softmax(x): 12 | probs = np.exp(x - np.max(x)) 13 | probs /= np.sum(probs) 14 | return probs 15 | 16 | def get_avail_act(arr): 17 | avail_act_list = [] 18 | for i in range(len(arr)): 19 | if arr[i]==1: 20 | avail_act_list.append(i) 21 | return avail_act_list 22 | 23 | class TreeNode(object): 24 | def __init__(self, parent, prior_p): 25 | self._parent = parent #父节点 26 | self._children = {} # 子节点,是一个字典:字典的key是动作,item是子节点。子节点包括了描述这个动作的概率,Q等 27 | self._n_visits = 0 # 记录这个节点被访问次数 28 | self._Q = 0 #这个节点的价值 29 | self._u = 0 #用于计算UCB上限。在select的时候,用的是Q+U的最大值。 30 | self._P = prior_p #动作对应的概率 31 | #self.curr_player = curr_player #要不要传入一个s,或者player比较好? 32 | 33 | def expand(self, action_priors): 34 | for action, prob in action_priors: 35 | if action not in self._children: 36 | self._children[action] = TreeNode(self, prob) 37 | 38 | def select(self, c_puct): 39 | return max(self._children.items(),key=lambda act_node: act_node[1].get_value(c_puct)) 40 | 41 | def update(self, leaf_value): 42 | self._n_visits += 1 43 | self._Q += 1.0*(leaf_value - self._Q) / self._n_visits 44 | 45 | # 从祖先一直下来,按着pointer刚更新 46 | def update_recursive(self, leaf_value_buffer, pointer): 47 | if self._parent: 48 | self._parent.update_recursive(leaf_value_buffer, pointer) 49 | self.update(leaf_value_buffer[pointer]) 50 | pointer +=1 51 | 52 | def get_value(self, c_puct): 53 | self._u = (c_puct * self._P * 54 | np.sqrt(self._parent._n_visits) / (1 + self._n_visits)) 55 | return self._Q + self._u 56 | 57 | def is_leaf(self): 58 | """Check if leaf node (i.e. 
no nodes below this have been expanded).""" 59 | return self._children == {} 60 | 61 | def is_root(self): 62 | return self._parent is None 63 | 64 | class MCTS(object): 65 | """An implementation of Monte Carlo Tree Search.""" 66 | 67 | def __init__(self, policy_value_fn, c_puct=5, n_playout=500): 68 | self._root = TreeNode(None, 1.0) #初始化根节点 69 | self._policy = policy_value_fn #用于生成子节点action-prob对 70 | self._c_puct = c_puct #一个常数,好像没啥用 71 | self._n_playout = n_playout #模拟多少次走一步 72 | 73 | 74 | def _playout(self, state): 75 | ''' 76 | 主要功能就是创建一颗MCTS。 77 | ''' 78 | #进行一次模拟_root就代表传入state 79 | node = self._root #把这个node变成当前根节点 80 | run_down = [] #存player的顺序 81 | 82 | #Select--expand---updata 83 | while(1): 84 | if node.is_leaf(): #如果已经是叶子节点,就不需要 85 | break 86 | 87 | action, node = node.select(self._c_puct) # select:选择节点:选择最大分数的 88 | curr_player = state.get_curr_player() # 记录当前玩家 89 | run_down.append(copy.copy(curr_player)) 90 | state.do_move(action) # 进行下一步,直到已经到叶子节点。 91 | 92 | action_probs, leaf_value = self._policy(state) # 【敲黑板】用网络预估动作的概率和叶子的价值。 93 | 94 | 95 | if count_one(state.current_state[1],1)>2: # 判断游戏是否应该结束了 96 | curr_player = state.get_curr_player() 97 | node.expand(action_probs) # 如果没有,就扩展叶子节点。 98 | 99 | # ======update 100 | # leaf_value_buffer 是用于反向更新 101 | pointer = 0 102 | leaf_value_buffer = [] 103 | if len(run_down)>0: # 当只有根节点的时候。 104 | for player in run_down: 105 | if player == curr_player: 106 | leaf_value_buffer.append(leaf_value) 107 | else: 108 | leaf_value_buffer.append(-leaf_value) 109 | # 向上更新祖先节点。 110 | node.update_recursive(leaf_value_buffer,pointer) 111 | 112 | # 如果游戏结束了,就算出winner 113 | else: 114 | if state.current_state[2][0]<=21 and state.current_state[3][0]<=21: 115 | if state.current_state[2][0]>=state.current_state[3][0]: 116 | winner = 0 117 | else: 118 | winner = 1 119 | elif state.current_state[3][0]>21: 120 | winner = 0 121 | else: 122 | winner = 1 123 | 124 | #====update==== 125 | pointer = 0 126 | leaf_value_buffer = [] 127 | for player in run_down: 128 | if player == winner: 129 | leaf_value_buffer.append(1) 130 | else: 131 | leaf_value_buffer.append(-1) 132 | #向上更新祖先节点。 133 | node.update_recursive(leaf_value_buffer,pointer) 134 | 135 | 136 | 137 | #===============_puretree========================================================= 138 | def _puretree_expand_fn(self,state): 139 | legal_position = get_avail_act(state.current_state[1]) 140 | act_porbs = zip(legal_position,(np.ones(12)/12)) 141 | return act_porbs, 1 142 | 143 | #进行一次模拟_root就代表传入state(不用network,纯用树playout) 144 | def _puretree_playout(self, state): 145 | node = self._root #把这个node变成当前根节点 146 | run_down = [] #存player的顺序 147 | #Select--expand---updata 148 | while(1): 149 | 150 | if node.is_leaf(): 151 | break 152 | action, node = node.select(self._c_puct) 153 | curr_player = state.get_curr_player() 154 | run_down.append(copy.copy(curr_player)) 155 | state.do_move(action) 156 | 157 | #action_probs, leaf_value = self._policy(state) 158 | action_probs,leaf_value = self._puretree_expand_fn(state) 159 | if count_one(state.current_state[1],1)>2: 160 | curr_player = state.get_curr_player() 161 | node.expand(action_probs) 162 | # ======update 163 | # leaf_value_buffer 是用于反向更新 164 | pointer = 0 165 | leaf_value_buffer = [] 166 | if len(run_down)>0: # 当只有根节点的时候。 167 | for player in run_down: 168 | if player == curr_player: 169 | leaf_value_buffer.append(leaf_value) 170 | else: 171 | leaf_value_buffer.append(-leaf_value) 172 | # 向上更新祖先节点。 173 | node.update_recursive(leaf_value_buffer,pointer) 174 | 175 | 
else: 176 | if state.current_state[2][0]<=21 and state.current_state[3][0]<=21: 177 | if state.current_state[2][0]>=state.current_state[3][0]: 178 | winner = 0 179 | else: 180 | winner = 1 181 | elif state.current_state[3][0]>21: 182 | winner = 0 183 | else: 184 | winner = 1 185 | 186 | pointer = 0 187 | leaf_value_buffer = [] 188 | for player in run_down: 189 | if player == winner: 190 | leaf_value_buffer.append(1) 191 | else: 192 | leaf_value_buffer.append(-1) 193 | #向上更新祖先节点。 194 | node.update_recursive(leaf_value_buffer,pointer) 195 | 196 | def get_move_probs(self, state, temp=1e-3, is_pure_tree=0): 197 | #每一步进行_n_playout模拟 198 | #每次都把现在的state复制出来。进行模拟,一直到游戏结束 199 | #把得到的leaf_value更新到每个状态。同时更新被**访问次数**。 200 | #最后我们会得到一颗模拟出来各种结果的树,我们需要的就是这个树。 201 | for n in range(self._n_playout): 202 | state_copy = copy.deepcopy(state) #复制当前的state出来, 203 | if is_pure_tree: #如果是纯树,就用纯树playout 204 | self._puretree_playout(state_copy) 205 | else: #用MCTS的playout,建立一颗MCTS 206 | self._playout(state_copy) 207 | 208 | # 【敲黑板】这里是通过节点的访问次数,返回的动作和动作概率 209 | # _root._children.items()访问根节点的_children,就是访问当前状态下,各个动作和对应的节点。 210 | # 取出节点和被访问次数 211 | # 然后一轮运算后,根据访问次数,获得act 和对应的act_probs 212 | act_visits = [(act, node._n_visits) 213 | for act, node in self._root._children.items()] 214 | acts, visits = zip(*act_visits) 215 | act_probs = softmax(1.0/temp * np.log(np.array(visits) + 1e-10)) 216 | return acts, act_probs 217 | 218 | def update_with_move(self, last_move): 219 | #下棋后,检查这move是否在这个树的子节点中。如果在就把根节点移动到这个节点。 220 | #否则新建一个节点。 221 | #这棵树会一直维护,直到一次游戏结束。 222 | if last_move in self._root._children: 223 | self._root = self._root._children[last_move] 224 | self._root._parent = None 225 | else: 226 | self._root = TreeNode(None, 1.0) 227 | 228 | 229 | def __str__(self): 230 | return "MCTS" 231 | 232 | 233 | -------------------------------------------------------------------------------- /Alpha-Zero/21P/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorlayer as tl 3 | import os 4 | import numpy as np 5 | 6 | def get_avail_act(arr): 7 | avail_act_list = [] 8 | for i in range(len(arr)): 9 | if arr[i]==1: 10 | avail_act_list.append(i) 11 | return avail_act_list 12 | 13 | 14 | class PolicyValueNet(): 15 | def __init__(self): 16 | self.lr =1e-5 17 | self.opt = tf.optimizers.Adam(self.lr) 18 | 19 | self.model = self.get_model() 20 | self.model.train() 21 | self.old_model = self.get_model() 22 | self.old_model.eval() 23 | self.model_save_path = './model/blackjack.hdf5' 24 | 25 | def update_param(self): 26 | ''' 27 | 赋值给oldmodel 28 | ''' 29 | for i, old in zip(self.model.trainable_weights, self.old_model.trainable_weights): 30 | old.assign(i) 31 | 32 | def get_model(self): 33 | 34 | #=====init W===== 35 | w_init = tf.random_normal_initializer(stddev=0.1) 36 | 37 | #=====Input===== 38 | inn = tl.layers.Input([None,6,12,1]) 39 | 40 | #=====Conv====== 41 | conv1 = tl.layers.Conv2d(n_filter=32,filter_size=(6,3),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(inn) 42 | conv2 = tl.layers.Conv2d(n_filter=64,filter_size=(6,3),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv1) 43 | conv3 = tl.layers.Conv2d(n_filter=128,filter_size=(6,3),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv2) 44 | #====Action Network===== 45 | action_conv = tl.layers.Conv2d(n_filter=4,filter_size=(6,1),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv3) 46 | action_reshape = 
tl.layers.Reshape([-1,4*6*12])(action_conv) 47 | action_fc = tl.layers.Dense(n_units=12,act=tf.nn.log_softmax,b_init=None,)(action_reshape) 48 | #=====Value Network===== 49 | value_conv = tl.layers.Conv2d(n_filter=2,filter_size=(6,1),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv3) 50 | value_reshape = tl.layers.Reshape([-1,2*6*12])(value_conv) 51 | value_fc1 = tl.layers.Dense(n_units=2*12,act=tl.activation.leaky_relu,W_init=w_init,b_init=None,)(value_reshape) 52 | value_fc2 = tl.layers.Dense(n_units=1,act=tf.nn.tanh)(value_fc1) 53 | 54 | return tl.models.Model(inputs=inn, outputs=[action_fc,value_fc2]) 55 | 56 | def policy_value_fn(self, state): 57 | legal_positions = get_avail_act(state.current_state[1]) 58 | #print('legal_positions',legal_positions) 59 | current_state = state 60 | act_probs, value ,_= self.policy_value( 61 | state.current_state.reshape(-1, 6,12,1).astype('float32') 62 | ) 63 | 64 | #print("++++++++++expand+++++++++++") 65 | #print("legal_positions",legal_positions) 66 | #print("act_probs ",act_probs.flatten()[legal_positions]) 67 | #print("+++++++++++++++++++++++++++") 68 | 69 | act_probs = zip(legal_positions, act_probs.flatten()[legal_positions]) 70 | 71 | return act_probs, value[0][0] 72 | 73 | def policy_value(self,state): 74 | log_act_probs,value = self.model(state) 75 | act_probs = np.exp(log_act_probs) 76 | return act_probs, value , log_act_probs 77 | 78 | def policy_value_old(self,state): 79 | log_act_probs,value = self.old_model(state) 80 | act_probs = np.exp(log_act_probs) 81 | return act_probs, value 82 | 83 | def train_step(self, state_batch, mcts_probs, winner_batch, lr): 84 | ''' 85 | 开始训练 86 | ''' 87 | with tf.GradientTape() as tape: 88 | act_probs, value , log_act_probs= self.policy_value(state_batch) 89 | 90 | # value_loss 用 mse 就可以了 91 | self.value_loss = tf.losses.mean_squared_error(winner_batch,value) 92 | self.policy_loss = tf.negative(tf.reduce_mean(tf.reduce_sum(tf.multiply(mcts_probs, log_act_probs), 1))) 93 | 94 | #print('=====================================') 95 | #print('mcts_probs') 96 | #print(mcts_probs[0]) 97 | #print('log_act_probs') 98 | #print(log_act_probs[0]) 99 | #print('=====================================') 100 | 101 | #L2权重正则化,防止过拟合 102 | l2_penalty_beta = 1e-4 103 | vars = self.model.trainable_weights 104 | l2_penalty = l2_penalty_beta * tf.add_n([tf.nn.l2_loss(v) for v in vars if 'bias' not in v.name.lower()]) 105 | 106 | self.loss = self.value_loss + self.policy_loss + l2_penalty 107 | 108 | #自动求导,常规动作 109 | grads = tape.gradient(self.loss,self.model.trainable_weights) 110 | self.opt.apply_gradients(zip(grads,self.model.trainable_weights)) 111 | loss = tf.reduce_mean(self.loss) 112 | value_loss = tf.reduce_mean(self.value_loss) 113 | policy_loss = tf.reduce_mean(self.policy_loss) 114 | l2_penalty = tf.reduce_mean(l2_penalty) 115 | 116 | return loss,value_loss,policy_loss,l2_penalty 117 | 118 | def save_ckpt(self): 119 | """ 120 | save trained weights 121 | :return: None 122 | """ 123 | if not os.path.exists('model'): 124 | os.makedirs('model') 125 | tl.files.save_weights_to_hdf5(self.model_save_path, self.model) 126 | 127 | 128 | def load_ckpt(self): 129 | """ 130 | load trained weights 131 | :return: None 132 | """ 133 | tl.files.load_hdf5_to_weights_in_order(self.model_save_path, self.model) -------------------------------------------------------------------------------- /Alpha-Zero/21P/player.py: -------------------------------------------------------------------------------- 1 | import random 
2 | import numpy as np 3 | from mcts import MCTS 4 | 5 | class Player(): 6 | #def __init__(self, name, policy_value_function, c_puct=5, n_playout=2000, is_selfplay=0): 7 | def __init__(self, name,): 8 | self.name = name 9 | #self.mcts = MCTS(policy_value_function, c_puct, n_playout) 10 | #self._is_selfplay = is_selfplay 11 | self.last_action = -1 12 | self.num = 0 13 | 14 | def get_action(self,state): 15 | while True: 16 | act = random.randint(0, 11) 17 | if state[1][act] == 1: 18 | break 19 | num = state[0][act] 20 | print('%s 选择第:%i个数字:%i'%(self.name,act+1,num)) 21 | return act, num 22 | 23 | class Human_Player(): 24 | def __init__(self, name,): 25 | self.name = name 26 | self.last_action = -1 27 | self.num = 0 28 | 29 | def get_action(self,state): 30 | 31 | while True: 32 | act = int(input('choose action'))-1 33 | if act>12: 34 | print('your choise is over 12') 35 | if state[1][act] != 0: 36 | break 37 | else: 38 | print('your choise has been choosen') 39 | num = state[0][act] 40 | return act, num 41 | 42 | class MCTSPlayer(object): 43 | def __init__(self,state,name,policy_value_function, c_puct=0.1, n_playout=500, is_selfplay=1): 44 | self.mcts = MCTS(policy_value_function, c_puct, n_playout) 45 | self._is_selfplay = is_selfplay 46 | self.name = name 47 | self.last_action = -1 48 | self.num = 0 49 | self.state = state 50 | 51 | def set_player_ind(self, p): 52 | self.player = p 53 | 54 | def reset_player(self): 55 | self.mcts.update_with_move(-1) #把-1传进去,就重置了整个树了。 56 | 57 | def get_action(self, state, temp=1e-3, return_prob=1): 58 | 59 | sensible_moves = np.argwhere(state[1]==1) # 获得合法的选择 60 | move_probs = np.zeros(12) 61 | if len(sensible_moves) > 2: # 判断一下游戏是否应该结束了 62 | 63 | # 【敲黑板】进行n_playout模拟,生成一棵MCTS,返回根节点的acts, probs 64 | acts, probs = self.mcts.get_move_probs(self.state, temp, is_pure_tree=0) 65 | move_probs[list(acts)] = probs 66 | 67 | #======================================= 68 | #如果是selfplay模式,就要加0.25噪音。然后sample出一个move,执行。 69 | #如果不是selfplay模式,就不加噪音,但会重置整棵树。 70 | if self._is_selfplay: 71 | move = np.random.choice( 72 | acts, 73 | p=0.75*probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))) 74 | ) 75 | 76 | self.mcts.update_with_move(move) 77 | else: 78 | move = np.argmax(move_probs) # 最大概率的动作 79 | self.mcts.update_with_move(-1) 80 | # ======================================= 81 | if return_prob: 82 | return move, move_probs 83 | else: 84 | return move 85 | else: 86 | print("WARNING: the board is full") 87 | 88 | def __str__(self): 89 | return "MCTS {}".format(self.player) 90 | 91 | 92 | class Pure_MCTS_Player(object): 93 | def __init__(self,state,name,policy_value_function, c_puct=5, n_playout=1000, is_selfplay=0): 94 | self.mcts = MCTS(policy_value_function, c_puct, n_playout) 95 | self._is_selfplay = is_selfplay 96 | self.name = name 97 | self.last_action = -1 98 | self.num = 0 99 | self.state = state 100 | 101 | def set_player_ind(self, p): 102 | self.player = p 103 | 104 | def reset_player(self): 105 | self.mcts.update_with_move(-1) #把-1传进去,就重置了整个树了。 106 | 107 | def get_action(self, state, temp=1e-3, return_prob=1): 108 | sensible_moves = np.argwhere(state[1]==1) #可以的选择 109 | move_probs = np.zeros(12) 110 | if len(sensible_moves) > 2: 111 | #======================================= 112 | # 进行n_playout模拟,生成一棵MCTS,返回根节点的acts, probs 113 | acts, probs = self.mcts.get_move_probs(self.state, temp, is_pure_tree=1) 114 | move_probs[list(acts)] = probs 115 | 116 | #======================================= 117 | #如果是selfplay模式,就要加0.25噪音。然后sample出一个move,执行。 118 | 
#如果不是selfplay模式,就不加噪音,但会重置整棵树。 119 | if self._is_selfplay: 120 | move = np.random.choice( 121 | acts, 122 | p=0.75*probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))) 123 | ) 124 | self.mcts.update_with_move(move) 125 | else: 126 | move = np.random.choice(acts, p=probs) 127 | self.mcts.update_with_move(-1) 128 | # ======================================= 129 | if return_prob: 130 | return move, move_probs 131 | else: 132 | return move 133 | else: 134 | print("WARNING: the board is full") 135 | 136 | def __str__(self): 137 | return "MCTS {}".format(self.player) -------------------------------------------------------------------------------- /Alpha-Zero/21P/train.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, deque 2 | import ipywidgets as widgets # 控件库 3 | from IPython.display import display # 显示控件的方法 4 | import numpy as np 5 | import random 6 | from game import Game_State 7 | from network import PolicyValueNet 8 | from game import Blackjack 9 | 10 | class TrainPipeline(): 11 | 12 | def __init__(self,init_model=None): 13 | self.state = Game_State() 14 | self.policy_value_net = PolicyValueNet() 15 | self.game = Blackjack(self.state,self.policy_value_net) 16 | 17 | self.game_batch_num = 30 #相当于更新次数 18 | self.play_batch_size = 1 #跑多少次去获取batch 19 | self.batch_size = 8 20 | self.buffer_size = 512 21 | self.epochs = 32 #更新多少次 22 | self.data_buffer = deque(maxlen=self.buffer_size) 23 | self.learn_rate = 2e-3 24 | self.lr_multiplier = 1 25 | self.kl_targ = 0.02 26 | 27 | #好像最重要的是start_selfplay,其他保存的好像都可以的了。 28 | def collect_selfplay_data(self,n_games=1): 29 | ''' 30 | 收集selfplay的数据 31 | ''' 32 | for _ in range(n_games): #n_games selfplay的次数 33 | play_data = self.game.start_self_play() #开始selfplay,并返回数据给play_data 34 | play_data = list(play_data)[:] 35 | self.episode_len = len(play_data) 36 | self.data_buffer.extend(play_data) #把selfplay加入大data_buffer 37 | 38 | def run(self,ep): 39 | #self.policy_value_net.load_ckpt() #加载原来的参数继续训练 40 | for i in range(ep): #训练次数 41 | self.collect_selfplay_data(self.play_batch_size) 42 | 43 | print("batch i:{}, episode_len:{}".format(i+1, self.episode_len)) 44 | if len(self.data_buffer)>self.batch_size: 45 | loss= self.policy_update() 46 | print('============================No%i update network SUCCESS==========================================='%(i)) 47 | self.policy_value_net.save_ckpt() 48 | 49 | def policy_update(self): 50 | """update the policy-value net""" 51 | #========解压数据============ 52 | mini_batch = random.sample(self.data_buffer, self.batch_size) 53 | state_batch = [data[0] for data in mini_batch] # state 54 | mcts_probs_batch = [data[1] for data in mini_batch] # probs 55 | winner_batch = [data[2] for data in mini_batch] # winner 56 | 57 | #=========================== 58 | #这里好像做了important sampling,直接计算KL_diverges大小,超过一定就早停 59 | self.policy_value_net.update_param() 60 | old_probs, old_v = self.policy_value_net.policy_value_old(state_batch) 61 | 62 | #进行epochs次训练 63 | for _ in range(self.epochs): 64 | # 开始训练 65 | loss,value_loss,policy_loss,l2_penalty= self.policy_value_net.train_step( 66 | state_batch, 67 | mcts_probs_batch, 68 | winner_batch, 69 | self.learn_rate*self.lr_multiplier) 70 | print('total_loss: %f , value_loss: %f , policy_loss: %f , l2_penalty: %f'%(loss,value_loss,policy_loss,l2_penalty)) 71 | new_probs, new_v,_ = self.policy_value_net.policy_value(state_batch) 72 | kl = np.mean(np.sum(old_probs * ( 73 | np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), 74 
| axis=1) 75 | ) 76 | if kl > self.kl_targ * 4: # early stopping if D_KL diverges badly 77 | break 78 | # adaptively adjust the learning rate 79 | # 根据上次更新的KL_diverges大小,动态调整学习率 80 | if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1: 81 | self.lr_multiplier /= 1.5 82 | elif kl < self.kl_targ / 2 and self.lr_multiplier < 10: 83 | self.lr_multiplier *= 1.5 84 | 85 | explained_var_old = (1 - 86 | np.var(np.array(winner_batch) - old_v) / 87 | np.var(np.array(winner_batch))) 88 | explained_var_new = (1 - 89 | np.var(np.array(winner_batch) - new_v) / 90 | np.var(np.array(winner_batch))) 91 | 92 | print(("kl:{:.5f}," 93 | "lr_multiplier:{:.3f}," 94 | "loss:{}," 95 | "explained_var_old:{:.3f}," 96 | "explained_var_new:{:.3f}" 97 | ).format(kl, 98 | self.lr_multiplier, 99 | loss, 100 | explained_var_old, 101 | explained_var_new)) 102 | return loss 103 | 104 | def play_game(self,playtime=1): 105 | self.policy_value_net.load_ckpt() #读取神经网络参数 106 | pure_tree_playre_win_count = 0 107 | tree_player_win_count = 0 108 | 109 | for _ in range(playtime): 110 | winner = self.game.start_game() 111 | if winner == 0: 112 | print('pure_tree_playre_win!!!') 113 | pure_tree_playre_win_count += 1 114 | else: 115 | print('MCTS_tree_player_win!!!') 116 | tree_player_win_count += 1 117 | 118 | print('===========Result============') 119 | print('pure_tree_playre_win: %i '%(pure_tree_playre_win_count)) 120 | print('MCTS_tree_player_win: %i '%(tree_player_win_count)) 121 | return tree_player_win_count 122 | 123 | 124 | def playgame_with_human(self,playtime=1): 125 | ''' 126 | 和真人玩 127 | ''' 128 | self.policy_value_net.load_ckpt() #加载参数 129 | human_win_count = 0 130 | msts_player_win_count = 0 131 | for i in range(playtime): 132 | winner = self.game.start_game_human() 133 | if winner == 0: 134 | print('Human_playre_win!!!') 135 | human_win_count += 1 136 | else: 137 | print('MCTS_tree_player_win!!!') 138 | msts_player_win_count += 1 139 | print('===========Result============') 140 | print('Human_playre_win: %i '%(human_win_count)) 141 | print('MCTS_player_win: %i '%(msts_player_win_count)) 142 | 143 | if __name__=="__main__": 144 | trainpipeline = TrainPipeline() 145 | #trainpipeline.run(5000) 146 | trainpipeline.play_game(10) 147 | 148 | -------------------------------------------------------------------------------- /Alpha-Zero/Ani-Chess/game.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import copy 4 | from player import Player 5 | from network import PolicyValueNet 6 | from mcts import MCTS_Player 7 | 8 | 9 | class Chess(): 10 | def __init__(self, group, cid, is_show=0): 11 | self.group = group 12 | self.cid = cid #chessid 根据id换算名字,但不分阵营 13 | self.is_show = is_show 14 | self.name = self.id_to_name() 15 | 16 | def id_to_name(self): 17 | name = ['象','狮','虎','豹','狼','狗','猫','鼠'] 18 | return name[self.cid] 19 | 20 | 21 | 22 | class Board(): 23 | 24 | def __init__(self,mcts_player): 25 | self.width = 4 26 | self.height = 4 27 | self.state = np.zeros([self.height*self.width,2*8+1]) 28 | self.max_step = 200 #最大步数 29 | #self.step_count = 0 #计数目前走了多少步 30 | self.mcts_player = mcts_player 31 | #self.p2 = p2 32 | self.board = self.init_board() #board是一个list,记载所有的棋子,不维护一个4*4的array了。 33 | self.dead_chess_list = [] 34 | 35 | # 判断是否已经结束,返回winner 36 | def get_winner(self): 37 | ''' 38 | -1:平局 39 | 0:蓝胜 40 | 1:红胜 41 | ''' 42 | group_list = [] 43 | # board中的棋子小于等于1个。(1个的时候是None) 44 | # 平局 45 | if len(set(self.board))<=1: 46 | #print('没有棋子了,平局') 47 | 
return 1,-1 48 | 49 | #超出步数 50 | #平局 51 | elif self.step_count>=self.max_step: 52 | #print('超出步数,平局') 53 | return 1,-1 54 | 55 | #逐个看每个剩余棋子的group 56 | else: 57 | for i in range(len(self.board)): 58 | if self.board[i]==None: 59 | continue 60 | else: 61 | group_list.append(self.board[i].group) 62 | #print('group_list:',group_list) 63 | if len(set(group_list))<=1: 64 | if group_list[0]==0: #如果剩余是蓝,就是蓝胜,否则:红胜 65 | #print('剩余棋子,蓝胜') 66 | return 1, 0 67 | else: 68 | #print('剩余棋子,红胜') 69 | return 1, 1 70 | else: 71 | #print('尚未结束') 72 | return 0,-1 #没结束 73 | 74 | def init_board(self): 75 | board_list = [] 76 | for i in range(2): 77 | for j in range(8): 78 | chess = Chess(group=i,cid=j,is_show=0) 79 | board_list.append(chess) 80 | random.shuffle(board_list) 81 | self.current_player = 0 82 | return board_list 83 | 84 | def reset_board(self): 85 | self.board = self.init_board() 86 | self.step_count = 0 87 | 88 | #https://www.cnblogs.com/daofaziran/p/9015284.html 89 | #python输出带颜色字体详解 90 | def show_board(self): 91 | 92 | for i in range(self.height*self.width): 93 | # 如果为空 94 | if self.board[i] == None: 95 | print('\033[1;37;47m \033[0m',end='') 96 | # 是否已经揭开 97 | elif self.board[i].is_show == 0: 98 | print('\033[1;37;40m? \033[0m',end='') 99 | # 判断阵营 100 | elif self.board[i].group == 0: 101 | print('\033[1;34;47m%s\033[0m'%(self.board[i].name),end='') 102 | else: 103 | print('\033[1;31;47m%s\033[0m'%(self.board[i].name),end='') 104 | 105 | if (i+1)%4 ==0: 106 | print('') 107 | 108 | 109 | #从board转换为当前state 110 | def board_to_state(self,board): 111 | ''' 112 | 根据棋子来的,每一层代表一个棋子: 113 | 全-1:未知 114 | 全0:死了 115 | 已知:对应点标记位置 116 | ''' 117 | 118 | state = np.ones([(2*8+1),self.height,self.width]) * (-1) #见设置全部未知,然后根据board里面的恢复 119 | 120 | # 先搜索棋盘上的棋子 121 | for i in range(len(board)): 122 | # 那个位置是空的 123 | if board[i] == None: 124 | continue 125 | else: 126 | index = board[i].group*8 + board[i].cid #找出该棋子应该在第几channel标记 127 | state[index] = np.zeros([self.width,self.height]) 128 | 129 | if board[i].is_show == 1: 130 | h = i // self.width 131 | w = i % self.width 132 | #print('index',index) 133 | state[index][h,w] = 1 134 | 135 | #搜索死了的棋子 136 | for j in range(len(self.dead_chess_list)): 137 | index = self.dead_chess_list[j].group*8 + self.dead_chess_list[j].cid 138 | state[index] = np.zeros([self.width,self.height]) 139 | 140 | #如果是p1,则用0表示,否则用1 141 | if self.current_player == 0 : 142 | state[-1] *= 0 143 | else: 144 | state[-1] *= -1 145 | state = np.transpose(state,(1,2,0)) 146 | return state 147 | 148 | 149 | #判断两个棋子大小 150 | def compare_chess(self,chess_a, chess_b): 151 | ''' 152 | 前者胜:1 153 | 平:0 154 | 前者败:-1 155 | ''' 156 | #print(chess_a) 157 | a = chess_a.cid 158 | b = chess_b.cid 159 | if a==7 and b==0: 160 | return 1 161 | elif b==7 and a==0: 162 | return -1 163 | elif a==b: 164 | return 0 165 | elif a15: 245 | return -1 246 | else: 247 | return target_pos 248 | elif direct==3: 249 | if pos%4 == 0: #最左的一列 250 | return -1 251 | else: 252 | return pos - 1 253 | 254 | elif direct==4: 255 | if pos%4 == 3: #最右的一列 256 | return -1 257 | else: 258 | return pos + 1 259 | else: 260 | print('非法方向') 261 | 262 | 263 | #从state获取可以移动空间,可移动为1,否则0 264 | def get_legal_action(self,board): 265 | action_space = np.zeros([self.width*self.height,5]) 266 | for i in range(len(board)): 267 | #如果这个位置没棋子,直接跳过 268 | if board[i]==None: 269 | continue 270 | 271 | # 如果未翻,则可以翻 272 | if not board[i].is_show: 273 | action_space[i][0]=1 274 | continue 275 | 276 | #检查阵营是否一致,如果不是自己棋,直接set0,跳出 277 | elif board[i].group != 
self.current_player: 278 | action_space[i] = np.zeros_like(action_space[i]) 279 | continue 280 | 281 | #如果是自己棋 282 | for j in range(1,5): 283 | if self.get_target_pos(i,j) != -1: 284 | target_pos = self.get_target_pos(i,j) 285 | # 如果是空位,能移动 286 | if self.board[target_pos] == None : 287 | action_space[i][j] = 1 288 | # 如果是自己的棋和未知的棋,就不能动 289 | elif self.board[target_pos].is_show == 0 or self.board[target_pos].group == self.current_player: 290 | action_space[i][j] = 0 291 | # 是敌方的棋 292 | elif self.board[target_pos].group != self.current_player: 293 | action_space[i][j] = 1 294 | else: 295 | action_space[i][j] = 0 296 | 297 | return action_space 298 | 299 | 300 | def change_player(self): 301 | if self.current_player == 0: 302 | self.current_player = 1 303 | else: 304 | self.current_player = 0 305 | 306 | 307 | 308 | class Game(): 309 | def __init__(self,policy_value_net): 310 | 311 | self.policy_value_net = policy_value_net 312 | #self.p1 = Player(group=0,name = 'p1') #蓝 313 | #self.p2 = Player(group=1,name = 'p2') #红 314 | self.mcts_player = MCTS_Player(group=0,name = 'p1',policy_value_function = self.policy_value_net.policy_value_fn) #蓝 315 | #self.p2 = MCTS_Player(group=1,name = 'p2',policy_value_function = self.policy_value_net.policy_value_fn) #红 316 | self.game_board = Board(self.mcts_player) 317 | self.state_buffer = [] 318 | 319 | 320 | 321 | def init_player_group(self,pos): 322 | chess = self.game_board.board[pos] 323 | if chess.group != 0: 324 | self.game_board.mcts_player.group = 1 325 | 326 | ''' 327 | def start_game(self,is_show_board): 328 | #board = self.game_board.reset_board() 329 | self.game_board.show_board() 330 | print('=======================') 331 | for i in range(self.game_board.max_step): 332 | state = self.game_board.board_to_state(self.game_board.board) 333 | self.state_buffer.append(copy.deepcopy(state)) 334 | pos,dire= self.game_board.current_player.get_action() #action是一个元组,(pos,移动) 335 | #print('%s , action: (%i, %i)'%(self.current_player.name,pos,dire)) 336 | 337 | self.game_board.move(pos,dire) 338 | if is_show_board: 339 | self.game_board.show_board() 340 | 341 | #第一轮改变一下阵营,p1总是先手,但翻到什么棋,就是什么阵营 342 | if i == 0 : 343 | self.init_player_group(pos) 344 | 345 | # 判断是否结束 346 | if self.game_board.get_winner()==1: 347 | break 348 | else: 349 | self.game_board.change_player() 350 | print('==========step %i=========='%i) 351 | 352 | has_winner , winner = self.game_board.get_winner() 353 | 354 | if has_winner==0: 355 | print('draw') 356 | else: 357 | if winner == 0: 358 | print('winner is p%i'%(winner+1)) 359 | else: 360 | print('winner is p%i'%(winner+1)) 361 | ''' 362 | 363 | 364 | def move_to_act(self,move): 365 | pos,dire = move 366 | action = np.zeros((16,5)) 367 | action[pos][dire] = 1 368 | return action 369 | 370 | 371 | def start_self_play(self,is_show_board=1): 372 | 373 | #定义buffer 374 | state_buff, action_buff= [], [] 375 | player_group_list = [] #记录走棋的顺序 376 | self.game_board.reset_board() 377 | self.mcts_player.reset_player() 378 | 379 | #开始游戏: 380 | for i in range(self.game_board.max_step): 381 | print('==========step %i=========='%i) 382 | #self.game_board.show_board() 383 | 384 | #从game_board转为state,并保存。 385 | state = self.game_board.board_to_state(self.game_board.board) 386 | state_buff.append(copy.deepcopy(state)) #记录state 387 | 388 | #从game_board计算action,并move 389 | player_group_list.append(copy.deepcopy(self.game_board.current_player)) #记录下棋顺序 390 | move = self.game_board.mcts_player.get_action(self.game_board) #action是一个元组,(pos,移动) 391 | 
self.game_board.move(move,change_player=True) 392 | 393 | self.game_board.step_count += 1 394 | #print('setpcountprint:',self.game_board.step_count) 395 | 396 | #move转化为action的方式,并保存 397 | action = self.move_to_act(move) 398 | action_buff.append(action) #记录action 399 | 400 | #是否要show过程 401 | if is_show_board: 402 | self.game_board.show_board() 403 | 404 | #第一轮改变一下阵营,p1总是先手,但翻到什么棋,就是什么阵营 405 | if i == 0 : 406 | pos,_ = move 407 | self.init_player_group(pos) 408 | 409 | 410 | 411 | # 判断是否结束 412 | end, winner = self.game_board.get_winner() 413 | #如果结束, 414 | if end: 415 | winner_z = np.zeros(len(player_group_list)) 416 | if winner != -1: #非平局 417 | winner_z[np.array(player_group_list)==winner] = 1.0 418 | winner_z[np.array(player_group_list)!=winner] = -1.0 419 | self.mcts_player.reset_player() 420 | #self.p2.reset_player() 421 | print('winner: ',winner) 422 | #print(winner_z) 423 | #print(player_group_list) 424 | break 425 | #else: 426 | #self.game_board.change_player() 427 | 428 | 429 | #如果超过步数,则当做打平,并结束 430 | if i >=self.game_board.max_step: 431 | winner_z = np.zeros(len(player_group_list)) 432 | self.mcts_player.reset_player() 433 | print('winner: draw') 434 | #print(winner_z) 435 | 436 | 437 | return zip(state_buff,action_buff,winner_z) -------------------------------------------------------------------------------- /Alpha-Zero/Ani-Chess/mcts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | def softmax(x): 5 | probs = np.exp(x - np.max(x)) 6 | probs /= np.sum(probs) 7 | return probs 8 | 9 | class TreeNode(): 10 | def __init__(self ,parent,prior_p): 11 | self._parent = parent 12 | self._children = {} 13 | self._n_visit = 0 14 | self._q = 0 15 | self._u = 0 16 | self._p = prior_p 17 | 18 | def print_children(self): 19 | for child in self._children: 20 | print(child) 21 | 22 | def expand(self,action_probs,game_board): 23 | legal_action = game_board.get_legal_action(game_board.board) 24 | #print('EXPAND-player_name:',game_board.current_player.name) 25 | #print('EXPAND-player_group:',game_board.current_player.group) 26 | 27 | #这里需要过滤一下,不能走的位置呀 28 | #print('expand check') 29 | #print(legal_action) 30 | for h in range(action_probs.shape[1]): 31 | for w in range(action_probs.shape[2]): 32 | if legal_action[h][w]==0: 33 | continue 34 | else: 35 | action = (h,w) 36 | #print(action) 37 | if action not in self._children: 38 | self._children[action] = TreeNode(self,action_probs[0][h][w]) 39 | #self.print_children() 40 | 41 | def get_value(self, c_puct): 42 | self._u = (c_puct * self._p * np.sqrt(self._parent._n_visit)/(1 + self._n_visit)) 43 | return self._q + self._u 44 | 45 | def select(self,c_puct): 46 | ''' 47 | 获取value最大的node 48 | ''' 49 | #print('======_children.items======') 50 | #print(self._children.items()) 51 | return max(self._children.items(),key=lambda act_node: act_node[1].get_value(c_puct)) 52 | 53 | def update(self, leaf_value): 54 | self._n_visit += 1 55 | self._q = 1.0 * (leaf_value - self._q)/self._n_visit 56 | 57 | def update_recursive(self,leaf_value): 58 | if self._parent: 59 | self._parent.update_recursive(-leaf_value) 60 | self.update(leaf_value) 61 | 62 | def is_leaf(self): 63 | return self._children == {} 64 | 65 | def is_root(self): 66 | return self._parent is None 67 | 68 | 69 | class MCTS(object): 70 | def __init__(self, policy_value_fn, c_puct=5, n_playout=10): 71 | self._root = TreeNode(None,1.0) 72 | self._policy = policy_value_fn 73 | self._c_puct = c_puct 74 | self._n_playout = 
n_playout 75 | 76 | def _playout(self, game_board): 77 | node = self._root 78 | step_count = copy.deepcopy(game_board.step_count) 79 | #Select 80 | while(1): 81 | if node.is_leaf(): 82 | break 83 | #node.print_children() 84 | #game_board.show_board() 85 | action, node = node.select(self._c_puct) 86 | #print('SELECT-player_name:',game_board.current_player.name) 87 | #print('SELECT-player_group:',game_board.current_player.group) 88 | #print('SELECT-action',action) 89 | game_board.move(action,change_player=True) 90 | step_count += 1 91 | #game_board.show_board() 92 | #Expand 93 | action_probs, leaf_value = self._policy(game_board) #在这个function里面再转state 94 | end, winner = game_board.get_winner() 95 | if step_count >= game_board.max_step: 96 | end, winner = 1, -1 97 | if not end : 98 | node.expand(action_probs,game_board) #这里需要改一下,把game_board传进去,过滤掉不能用的动作 99 | #print(node) 100 | else: 101 | ''' 102 | -1:平局 103 | 0:蓝胜 104 | 1:红胜 105 | ''' 106 | if winner == -1:#平局 107 | leaf_value = 0 108 | elif game_board.current_player == winner: 109 | leaf_value = 1.0 110 | else: 111 | leaf_value = -1.0 112 | #update 113 | #print(node) 114 | node.update_recursive(-leaf_value) 115 | 116 | def get_move_probs(self, game_board,group,temp=1e-3, is_pure_tree=0): 117 | ''' 118 | 开始用mcts计算action 119 | ''' 120 | for i in range(self._n_playout): 121 | print('No.%i play out'%(i)) 122 | game_board_copy = copy.deepcopy(game_board) 123 | self._playout(game_board_copy) 124 | action_visit = [(act, node._n_visit) for act, node in self._root._children.items()] 125 | acts , visit = zip(*action_visit) 126 | act_porbs = softmax(1.0 / temp * np.log(np.array(visit) + 1e-10)) 127 | return acts, act_porbs 128 | 129 | def update_with_move(self, last_move): 130 | if last_move in self._root._children: 131 | self._root = self._root._children[last_move] 132 | self._root._parent = None 133 | else: 134 | self._root = TreeNode(None,1.0) 135 | 136 | class MCTS_Player(object): 137 | def __init__(self, group, name,policy_value_function, c_puct=5, n_playout=10, is_selfplay=1): 138 | self.group = group 139 | self.name = name 140 | self.mcts = MCTS(policy_value_function,c_puct,n_playout) 141 | self._is_selfplay = is_selfplay 142 | 143 | def reset_player(self): 144 | self.mcts.update_with_move(-1) 145 | 146 | def set_player_ind(self, p): 147 | self.player = p 148 | 149 | def get_action(self,game_board): 150 | 151 | acts, act_porbs = self.mcts.get_move_probs(game_board,self.group, temp=1e-3) 152 | act_list = [] 153 | 154 | for a in acts: 155 | act_list.append(zip(a)) 156 | if self._is_selfplay: 157 | move = list(zip(*(np.random.choice(act_list,p=0.75*act_porbs + 0.25*np.random.dirichlet(0.3*np.ones(len(act_porbs)))))))[0] 158 | print('move====',move) 159 | self.mcts.update_with_move(move) 160 | else: 161 | move = list(zip(*np.random.choice(act_list, p=act_porbs)))[0] 162 | self.mcts.update_with_move(-1) 163 | return move 164 | 165 | 166 | -------------------------------------------------------------------------------- /Alpha-Zero/Ani-Chess/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorlayer as tl 3 | import numpy as np 4 | #from game import Game 5 | import os 6 | 7 | class PolicyValueNet(): 8 | def __init__(self): 9 | self.model = self.get_model() 10 | self.model.train() 11 | self.old_model = self.get_model() 12 | self.old_model.eval() 13 | self.learning_rate = 1e-5 14 | self.opt = tf.optimizers.Adam(self.learning_rate) 15 | 16 | def get_model(self): 17 | 
#=====init W===== 18 | w_init = tf.random_normal_initializer(stddev=0.1) 19 | #=====Input===== 20 | inn = tl.layers.Input([None,4,4,17]) 21 | #transpose = tf.transpose(inn,[0,2,3,1]) 22 | #=====Conv====== 23 | conv1 = tl.layers.Conv2d(n_filter=32,filter_size=(4,4),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(inn) 24 | conv2 = tl.layers.Conv2d(n_filter=64,filter_size=(3,3),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv1) 25 | conv3 = tl.layers.Conv2d(n_filter=128,filter_size=(2,2),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv2) 26 | #====Action Network===== 27 | action_conv = tl.layers.Conv2d(n_filter=4,filter_size=(1,1),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv3) 28 | action_reshape = tl.layers.Reshape([-1,4*16])(action_conv) 29 | action_fc = tl.layers.Dense(n_units=16*5,act=tf.nn.log_softmax,b_init=None)(action_reshape) 30 | action_output = tl.layers.Reshape([-1,16,5])(action_fc) 31 | #=====Value Network===== 32 | value_conv = tl.layers.Conv2d(n_filter=2,filter_size=(1,1),act=tl.activation.leaky_relu,W_init=w_init,b_init=None,padding='SAME')(conv3) 33 | value_reshape = tl.layers.Reshape([-1,2*16])(value_conv) 34 | value_fc1 = tl.layers.Dense(n_units=2*16,act=tl.activation.leaky_relu,W_init=w_init,b_init=None,)(value_reshape) 35 | value_fc2 = tl.layers.Dense(n_units=1,act=tf.nn.tanh)(value_fc1) 36 | 37 | return tl.models.Model(inputs=inn, outputs=[action_output,value_fc2]) 38 | 39 | def policy_value(self,state): 40 | ''' 41 | 通过model计算动作概率,和value值 42 | ''' 43 | state = np.reshape(state,[-1,4,4,17]).astype('float32') 44 | log_act_porbs, value = self.model(state) 45 | act_porbs = np.exp(log_act_porbs) 46 | return act_porbs, value 47 | 48 | def policy_value_fn(self,game_board): 49 | #game_board先转为state才能放入运算,但怎么需要player呢? 
50 | legal_action = game_board.get_legal_action(game_board.board) 51 | 52 | state = game_board.board_to_state(game_board.board) 53 | act_porbs, value = self.policy_value(np.array(state).astype('float32')) 54 | act_porbs = legal_action * act_porbs #这是一个带有概率的数组 55 | return act_porbs, value 56 | 57 | def policy_value_old(self,state): 58 | state = np.reshape(state,[-1,4,4,17]).astype('float32') 59 | log_act_porbs, value = self.model(state) 60 | act_porbs = np.exp(log_act_porbs) 61 | return act_porbs, value 62 | 63 | def train_step(self, state_batch, mcts_probs, winner_batch, lr): 64 | with tf.GradientTape() as tape: 65 | act_porbs, value = self.policy_value(state_batch) 66 | 67 | #policy 和 value loss 68 | self.value_loss = tf.losses.mean_squared_error(winner_batch, value) 69 | print('mcts_probs',mcts_probs) 70 | print('act_porbs',act_porbs) 71 | self.policy_loss = tf.negative(tf.reduce_mean(tf.reduce_sum(tf.multiply(mcts_probs,act_porbs),1))) 72 | 73 | #L2 penalty 74 | l2_penalty_beta = 1e-4 75 | var = self.model.trainable_weights 76 | l2_penalty = l2_penalty_beta * tf.add_n([tf.nn.l2_loss(v) for v in var if 'bias' not in v.name.lower()]) 77 | 78 | #total loss 79 | self.loss = self.value_loss + self.policy_loss + l2_penalty 80 | 81 | grads = tape.gradient(self.loss, self.model.trainable_weights) 82 | self.opt.apply_gradients(zip(grads, self.model.trainable_weights)) 83 | 84 | total_loss = tf.reduce_mean(self.loss) 85 | value_loss = tf.reduce_mean(self.value_loss) 86 | policy_loss = tf.reduce_mean(self.policy_loss) 87 | l2_penalty = tf.reduce_mean(l2_penalty) 88 | 89 | return total_loss,value_loss,policy_loss,l2_penalty 90 | 91 | def update_parm(self): 92 | ''' 93 | 更新旧模型参数 94 | ''' 95 | for i, old in zip(self.model.trainable_weights, self.old_model.trainable_weights): 96 | old.assign(i) 97 | 98 | 99 | def save_ckpt(self): 100 | """ 101 | save trained weights 102 | :return: None 103 | """ 104 | if not os.path.exists('model'): 105 | os.makedirs('model') 106 | tl.files.save_weights_to_hdf5('model/blackjack.hdf5', self.model) 107 | 108 | 109 | def load_ckpt(self): 110 | """ 111 | load trained weights 112 | :return: None 113 | """ 114 | tl.files.load_hdf5_to_weights_in_order('model/blackjack.hdf5', self.model) 115 | 116 | -------------------------------------------------------------------------------- /Alpha-Zero/Ani-Chess/player.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from network import PolicyValueNet 4 | 5 | class Player(): 6 | def __init__(self,group,name): 7 | self.group = group 8 | self.name = name 9 | 10 | def get_action(self,board): 11 | #ramdom 12 | #state = board.board_to_state(board.board) 13 | legal_action = board.get_legal_action(board.board,self.group) 14 | act_probs = np.random.rand(16,5)#生成随机 15 | act_probs = act_probs * legal_action 16 | action = np.where(act_probs==np.max(act_probs)) 17 | return action[0][0],action[1][0] -------------------------------------------------------------------------------- /Alpha-Zero/Ani-Chess/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict, deque 4 | from game import Game, Board 5 | from network import PolicyValueNet 6 | import time 7 | 8 | def list_mean(list): 9 | sum = 0.0 10 | for i in range(len(list)): 11 | sum += list[i] 12 | return sum/len(list) 13 | 14 | 15 | 16 | class TrainPipeline(): 17 | def __init__(self): 18 | 
self.policy_value_net = PolicyValueNet() 19 | self.game = Game(self.policy_value_net) 20 | 21 | self.game_batch_num = 10 #1000 22 | self.play_batch_size = 1 23 | self.batch_size = 64 #每次train用的数据块大小 24 | self.buffer_size = 512 25 | self.update_epochs = 16 #更新次数 26 | 27 | self.data_buffer = deque(maxlen=self.buffer_size) 28 | self.learning_rate = 2e-3 29 | self.lr_multiplier = 1 30 | self.kl_targ = 0.02 31 | 32 | def collect_selfplay_data(self, n_games=1): 33 | for _ in range(n_games): 34 | play_data = self.game.start_self_play() 35 | play_data = list(play_data)[:] 36 | self.episode_len = len(play_data) 37 | self.data_buffer.extend(play_data) 38 | 39 | def run(self): 40 | for i in range(self.game_batch_num): 41 | t0 = time.time() 42 | self.collect_selfplay_data() 43 | print("======batch :{}, episode_len:{}, time:{}======".format(i+1, self.episode_len, time.time()-t0)) 44 | 45 | if len(self.data_buffer)>self.batch_size: 46 | loss = self.policy_update() 47 | self.policy_value_net.save_ckpt() 48 | 49 | def policy_update(self): 50 | 51 | mini_batch = random.sample(self.data_buffer, self.batch_size) 52 | 53 | #print('minibatch_len:',len(mini_batch)) 54 | state_batch = np.array([data[0] for data in mini_batch]).astype('float32') 55 | mcts_probs_batch = np.array([data[1] for data in mini_batch]).astype('float32') 56 | winner_batch = np.array([data[2] for data in mini_batch]).astype('float32') 57 | 58 | total_loss_list, value_loss_list, policy_loss_list, l2_penalty_list = [],[],[],[] 59 | 60 | ''' 61 | for i in range(len(mini_batch)): 62 | state_batch = mini_batch[i][0] 63 | mcts_probs_batch = mini_batch[i][1] 64 | winner_batch = mini_batch[i][2] 65 | ''' 66 | #更新old_network 67 | self.policy_value_net.update_parm() 68 | old_probs, old_v = self.policy_value_net.policy_value_old(state_batch) 69 | 70 | #print('state_batch:',state_batch) 71 | #print('mcts_probs_batch:',mcts_probs_batch) 72 | print('winner_batch:',winner_batch) 73 | 74 | for _ in range(self.update_epochs): 75 | loss= self.policy_value_net.train_step( 76 | state_batch, 77 | mcts_probs_batch, 78 | winner_batch, 79 | self.learning_rate*self.lr_multiplier) 80 | 81 | total_loss, value_loss, policy_loss, l2_penalty = loss 82 | 83 | total_loss_list.append(total_loss) 84 | value_loss_list.append(value_loss) 85 | policy_loss_list.append(policy_loss) 86 | l2_penalty_list.append(l2_penalty) 87 | 88 | print('total_loss: %f,value_loss: %f,policy_loss: %f,l2_penalty: %f'%(total_loss, value_loss, policy_loss, l2_penalty)) 89 | 90 | new_probs, new_v = self.policy_value_net.policy_value(state_batch) 91 | kl = np.mean(np.sum(old_probs * ( 92 | np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), 93 | axis=1) 94 | ) 95 | if kl > self.kl_targ * 4: # early stopping if D_KL diverges badly 96 | break 97 | # adaptively adjust the learning rate 98 | # 根据上次更新的KL_diverges大小,动态调整学习率 99 | if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1: 100 | self.lr_multiplier /= 1.5 101 | elif kl < self.kl_targ / 2 and self.lr_multiplier < 10: 102 | self.lr_multiplier *= 1.5 103 | 104 | explained_var_old = (1 - 105 | np.var(np.array(winner_batch) - old_v) / 106 | np.var(np.array(winner_batch))) 107 | explained_var_new = (1 - 108 | np.var(np.array(winner_batch) - new_v) / 109 | np.var(np.array(winner_batch))) 110 | print(("kl:{:.5f}," 111 | "lr_multiplier:{:.3f}," 112 | "loss:{}," 113 | "explained_var_old:{:.3f}," 114 | "explained_var_new:{:.3f}" 115 | ).format(kl, 116 | self.lr_multiplier, 117 | total_loss, 118 | explained_var_old, 119 | explained_var_new)) 120 | 
total_loss_mean = list_mean(total_loss_list) 121 | value_loss_mean = list_mean(value_loss_list) 122 | policy_loss_mean = list_mean(policy_loss_list) 123 | l2_penalty_mean = list_mean(l2_penalty_list) 124 | 125 | return total_loss_mean, value_loss_mean, policy_loss_mean, l2_penalty_mean 126 | 127 | 128 | if __name__ == "__main__": 129 | 130 | trainpipeline = TrainPipeline() 131 | trainpipeline.run() -------------------------------------------------------------------------------- /Alpha-Zero/FiveAI/game.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Junxiao Song 4 | """ 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | 9 | 10 | class Board(object): 11 | """board for the game""" 12 | 13 | def __init__(self, **kwargs): 14 | '''注意这里的参数方式可以这样''' 15 | self.width = int(kwargs.get('width', 8)) 16 | self.height = int(kwargs.get('height', 8)) 17 | # board states stored as a dict, 18 | # key: move as location on the board:棋盘位置 19 | # value: player as pieces type:谁下的棋 20 | # 更新方式:self.states[move] = self.current_player 21 | self.states = {} 22 | # need how many pieces in a row to win 23 | self.n_in_row = int(kwargs.get('n_in_row', 5)) 24 | self.players = [1, 2] # player1 and player2 25 | 26 | # 初始化棋盘 27 | def init_board(self, start_player=0): 28 | 29 | # 判断生成棋盘是否大于胜利条件。否则抛异常。 30 | if self.width < self.n_in_row or self.height < self.n_in_row: 31 | raise Exception('board width and height can not be ' 32 | 'less than {}'.format(self.n_in_row)) 33 | 34 | # current_player:谁在下棋,初始化的时候为P1 35 | self.current_player = self.players[start_player] # start player 36 | 37 | # keep available moves in a list 38 | # availables:一个list,记录当前可以走棋的位置。某些位置被其他字占据,就不能下了。 39 | self.availables = list(range(self.width * self.height)) 40 | # 清空状态 41 | self.states = {} 42 | # 重置不是最后一步 43 | self.last_move = -1 44 | 45 | # 把move变成棋盘对应位置。 46 | # 这里的move相当于action。 47 | def move_to_location(self, move): 48 | """ 49 | 3*3 board's moves like: 50 | 6 7 8 51 | 3 4 5 52 | 0 1 2 53 | and move 5's location is (1,2) 54 | """ 55 | h = move // self.width 56 | w = move % self.width 57 | return [h, w] 58 | 59 | # 把棋盘中对应位置转换为move 60 | def location_to_move(self, location): 61 | # location是一个二维数组,所以先判断下。如果不是,就返回-1 62 | if len(location) != 2: 63 | return -1 64 | h = location[0] 65 | w = location[1] 66 | move = h * self.width + w 67 | # 如果超出了,就代表不合法,返回-1 68 | if move not in range(self.width * self.height): 69 | return -1 70 | return move 71 | 72 | #state的形式:self.states[move] = self.current_player 73 | #=============棋盘状态描述================================ 74 | #===1. 棋盘状态用4层,和棋盘大小相同的array进行描述。 75 | #===No.1 用1.0表示当前这个玩家已经下的子 76 | #===No.2 用1.0表示另外一个玩家已经下的子 77 | #===No.3 用1.0表示,当前状态最后一个下的子的位置 78 | #===No.4 P1玩家所有标1,P2玩家所有标0。 79 | #===2. Alphazero:用17层。其中16层表示之前两位玩家各8步。 80 | #======================================================== 81 | def current_state(self): 82 | """return the board state from the perspective of the current player. 
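        A worked example (3*3 board, illustrative): after P1 plays move 4 and
        P2 plays move 5 it is P1's turn again, so plane 0 marks move 4 (the
        current player's own stones), plane 1 marks move 5 (the opponent's
        stones), plane 2 marks move 5 (the last move), and plane 3 is all 1.0
        because an even number of moves has been played. The row order of every
        plane is reversed just before returning.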
83 | state shape: 4*width*height 84 | """ 85 | 86 | square_state = np.zeros((4, self.width, self.height)) 87 | if self.states: 88 | #============注意这里的操作=============== 89 | moves, players = np.array(list(zip(*self.states.items()))) 90 | move_curr = moves[players == self.current_player] 91 | move_oppo = moves[players != self.current_player] 92 | #======================================== 93 | # 第1层: 94 | square_state[0][move_curr // self.width, 95 | move_curr % self.height] = 1.0 96 | # 第2层: 97 | square_state[1][move_oppo // self.width, 98 | move_oppo % self.height] = 1.0 99 | # 第3层: 100 | # indicate the last move location 101 | square_state[2][self.last_move // self.width, 102 | self.last_move % self.height] = 1.0 103 | # 第4层: 104 | if len(self.states) % 2 == 0: 105 | square_state[3][:, :] = 1.0 # indicate the colour to play 106 | return square_state[:, ::-1, :] 107 | 108 | # 执行走棋操作 109 | def do_move(self, move): 110 | # 当前玩家下子 111 | # 1.改变棋盘状态 112 | # 2.可行的位置去掉这个位置 113 | # 3.转换下一个下子玩家 114 | # 4.记录最后一个下子的位置 115 | self.states[move] = self.current_player 116 | self.availables.remove(move) 117 | self.current_player = ( 118 | self.players[0] if self.current_player == self.players[1] 119 | else self.players[1] 120 | ) 121 | self.last_move = move 122 | 123 | ''' 124 | 判断这个状态是否结束了。 125 | 是:返回True,胜利玩家 126 | 否:返回False,-1 127 | ''' 128 | def has_a_winner(self): 129 | width = self.width 130 | height = self.height 131 | states = self.states 132 | n = self.n_in_row 133 | 134 | moved = list(set(range(width * height)) - set(self.availables)) 135 | if len(moved) < self.n_in_row *2-1: 136 | return False, -1 137 | 138 | for m in moved: 139 | h = m // width 140 | w = m % width 141 | player = states[m] 142 | 143 | if (w in range(width - n + 1) and 144 | len(set(states.get(i, -1) for i in range(m, m + n))) == 1): 145 | return True, player 146 | 147 | if (h in range(height - n + 1) and 148 | len(set(states.get(i, -1) for i in range(m, m + n * width, width))) == 1): 149 | return True, player 150 | 151 | if (w in range(width - n + 1) and h in range(height - n + 1) and 152 | len(set(states.get(i, -1) for i in range(m, m + n * (width + 1), width + 1))) == 1): 153 | return True, player 154 | 155 | if (w in range(n - 1, width) and h in range(height - n + 1) and 156 | len(set(states.get(i, -1) for i in range(m, m + n * (width - 1), width - 1))) == 1): 157 | return True, player 158 | 159 | return False, -1 160 | 161 | ''' 162 | 先判断是否有玩家胜利 163 | 有:True, winner 164 | 否则检查是否还有可以下子的地方 165 | 如果没有:True, -1(-1表示平局) 166 | ''' 167 | def game_end(self): 168 | """Check whether the game is ended or not""" 169 | win, winner = self.has_a_winner() 170 | if win: 171 | return True, winner 172 | elif not len(self.availables): 173 | return True, -1 174 | return False, -1 175 | 176 | def get_current_player(self): 177 | return self.current_player 178 | 179 | 180 | class Game(object): 181 | """game server""" 182 | 183 | def __init__(self, board, **kwargs): 184 | self.board = board 185 | 186 | # 根据当前状态,画棋盘 187 | def graphic(self, board, player1, player2): 188 | """Draw the board and show game info""" 189 | width = board.width 190 | height = board.height 191 | 192 | print("Player", player1, "with X".rjust(3)) 193 | print("Player", player2, "with O".rjust(3)) 194 | print() 195 | for x in range(width): 196 | print("{0:8}".format(x), end='') 197 | print('\r\n') 198 | for i in range(height - 1, -1, -1): 199 | print("{0:4d}".format(i), end='') 200 | for j in range(width): 201 | loc = i * width + j 202 | p = board.states.get(loc, -1) 203 | if p == 
player1: 204 | print('X'.center(8), end='') 205 | elif p == player2: 206 | print('O'.center(8), end='') 207 | else: 208 | print('_'.center(8), end='') 209 | print('\r\n\r\n') 210 | 211 | # 开始一场游戏 212 | def start_play(self, player1, player2, start_player=0, is_shown=1): 213 | """start a game between two players""" 214 | if start_player not in (0, 1): 215 | raise Exception('start_player should be either 0 (player1 first) ' 216 | 'or 1 (player2 first)') 217 | self.board.init_board(start_player) #初始化棋盘 218 | 219 | # 定义board.players:P1=0 P2=1 220 | # 定义MCTSPlayer;player1和player2 221 | p1, p2 = self.board.players 222 | player1.set_player_ind(p1) 223 | player2.set_player_ind(p2) 224 | players = {p1: player1, p2: player2} 225 | 226 | if is_shown: 227 | self.graphic(self.board, player1.player, player2.player) 228 | 229 | #==================开始游戏=============================== 230 | while True: 231 | current_player = self.board.get_current_player() 232 | player_in_turn = players[current_player] #注意:players[]不再只是一个数字,是MCTSPlayer 233 | move = player_in_turn.get_action(self.board) #该MCTSPlayer会把当前board放入getaction。产出动作move 234 | self.board.do_move(move) #走棋,交换人,直到结束。 235 | if is_shown: 236 | self.graphic(self.board, player1.player, player2.player) 237 | end, winner = self.board.game_end() 238 | if end: 239 | if is_shown: 240 | if winner != -1: 241 | print("Game end. Winner is", players[winner]) 242 | else: 243 | print("Game end. Tie") 244 | return winner 245 | 246 | #self_play!!!通过selfplay获取数据 247 | def start_self_play(self, player, is_shown=0, temp=1e-3): 248 | """ start a self-play game using a MCTS player, reuse the search tree, 249 | and store the self-play data: (state, mcts_probs, z) for training 250 | """ 251 | self.board.init_board() 252 | p1, p2 = self.board.players 253 | states, mcts_probs, current_players = [], [], [] 254 | 255 | while True: 256 | # ======通过get_action,把当前state放入,预测move和概率的列表 257 | move, move_probs = player.get_action(self.board, 258 | temp=temp, 259 | return_prob=1) 260 | 261 | # ======存储s,move_probs,player 262 | states.append(self.board.current_state()) 263 | mcts_probs.append(move_probs) 264 | current_players.append(self.board.current_player) 265 | 266 | # ======执行动作 267 | self.board.do_move(move) 268 | if is_shown: 269 | self.graphic(self.board, p1, p2) 270 | end, winner = self.board.game_end() 271 | if end: 272 | # winner from the perspective of the current player of each state 273 | winners_z = np.zeros(len(current_players)) 274 | #如果不是平局:winners_z把winner的步数设置1,否则-1 275 | if winner != -1: 276 | winners_z[np.array(current_players) == winner] = 1.0 277 | winners_z[np.array(current_players) != winner] = -1.0 278 | # reset MCTS root node 279 | #每一次跑完游戏,都会重新建立一棵树 280 | player.reset_player() 281 | 282 | #show文字 283 | if is_shown: 284 | if winner != -1: 285 | print("Game end. Winner is player:", winner) 286 | else: 287 | print("Game end. 
Tie") 288 | return winner, zip(states, mcts_probs, winners_z) 289 | # 其实都只返回1个 290 | # winner:胜利玩家,1个值(P1 or P2) 291 | # 假设这个是有N步,那么每一步都产生这样的data 292 | # states:大小4xHxW,记录state,用于放到network,产生v-preds(一个值) 和 p-preds(每个位置胜率) 293 | # mcts_probs:HxW,通过建立mcts树,用UCB跑多次,根据访问次数的胜率预计。会作为p-preds更新标准 294 | # winners_z: 单一值。这个值是通过最后结果反过来填的,如果在这个s下,下棋的人获胜,那么+1,否则-1.平0。用于更新v-preds 295 | -------------------------------------------------------------------------------- /Alpha-Zero/FiveAI/mcts_alphaZero.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Monte Carlo Tree Search in AlphaGo Zero style, which uses a policy-value 4 | network to guide the tree search and evaluate the leaf nodes 5 | 6 | @author: Junxiao Song 7 | """ 8 | 9 | import numpy as np 10 | import copy 11 | 12 | 13 | def softmax(x): 14 | probs = np.exp(x - np.max(x)) 15 | probs /= np.sum(probs) 16 | return probs 17 | 18 | #定义节点 19 | class TreeNode(object): 20 | """A node in the MCTS tree. 21 | 22 | Each node keeps track of its own value Q, prior probability P, and 23 | its visit-count-adjusted prior score u. 24 | """ 25 | 26 | def __init__(self, parent, prior_p): 27 | self._parent = parent #父节点 28 | self._children = {} # 子节点,是一个字典:字典的key是动作,item是子节点。子节点包括了描述这个动作的概率,Q等 29 | self._n_visits = 0 # 记录这个节点被访问次数 30 | self._Q = 0 #这个节点的价值 31 | self._u = 0 #用于计算UCB上限。在select的时候,用的是Q+U的最大值。 32 | self._P = prior_p #动作对应的概率 33 | 34 | def expand(self, action_priors): 35 | """Expand tree by creating new children. 36 | action_priors: a list of tuples of actions and their prior probability 37 | according to the policy function. 38 | 展开:当一个节点是叶子节点的时候,需要被展开。 39 | 输入action_priors:包含action和对应的概率 40 | 判断这个动作是否在_children的字典中。如果不在,增加这个动作,并增加对应的节点,把概率写在节点中。 41 | """ 42 | for action, prob in action_priors: 43 | if action not in self._children: 44 | self._children[action] = TreeNode(self, prob) 45 | 46 | def select(self, c_puct): 47 | """Select action among children that gives maximum action value Q 48 | plus bonus u(P). 49 | Return: A tuple of (action, next_node) 50 | 选择:选择UCB最大的值:UCB = Q(s,a) + U(s,a) 51 | """ 52 | return max(self._children.items(), 53 | key=lambda act_node: act_node[1].get_value(c_puct)) 54 | 55 | def update(self, leaf_value): 56 | """Update node values from leaf evaluation. 57 | leaf_value: the value of subtree evaluation from the current player's 58 | perspective. 59 | 做一次模拟,把返回的leaf_value去修改Q 60 | 1._n_visits增加 61 | 2.leaf_value和原来的Q,用_n_visits平均一下。1.0是学习率 62 | """ 63 | # Count visit. 64 | self._n_visits += 1 65 | # Update Q, a running average of values for all visits. 66 | self._Q += 1.0*(leaf_value - self._Q) / self._n_visits 67 | 68 | def update_recursive(self, leaf_value): 69 | """Like a call to update(), but applied recursively for all ancestors. 70 | 用leaf_value反向更新祖先节点。 71 | 因为整棵树,是双方轮流下子的。所以对于一个state update是正的,那么这个state前后updata的数值就是负的。 72 | """ 73 | # If it is not root, this node's parent should be updated first. 74 | if self._parent: 75 | self._parent.update_recursive(-leaf_value) 76 | self.update(leaf_value) 77 | 78 | def get_value(self, c_puct): 79 | """Calculate and return the value for this node. 80 | It is a combination of leaf evaluations Q, and this node's prior 81 | adjusted for its visit count, u. 82 | c_puct: a number in (0, inf) controlling the relative impact of 83 | value Q, and prior probability P, on this node's score. 
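        Concretely, u = c_puct * P(s,a) * sqrt(N(parent)) / (1 + N(s,a)).
        With illustrative numbers c_puct=5, P=0.2, 100 parent visits and 9
        visits of this node: u = 5 * 0.2 * 10 / 10 = 1.0, so this node's score
        is Q + 1.0.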
84 | UCB = Q(s,a) + U(s,a) 85 | """ 86 | self._u = (c_puct * self._P * 87 | np.sqrt(self._parent._n_visits) / (1 + self._n_visits)) 88 | return self._Q + self._u 89 | 90 | def is_leaf(self): 91 | """Check if leaf node (i.e. no nodes below this have been expanded).""" 92 | return self._children == {} 93 | 94 | def is_root(self): 95 | return self._parent is None 96 | 97 | 98 | class MCTS(object): 99 | """An implementation of Monte Carlo Tree Search.""" 100 | 101 | def __init__(self, policy_value_fn, c_puct=5, n_playout=10000): 102 | """ 103 | policy_value_fn: a function that takes in a board state and outputs 104 | a list of (action, probability) tuples and also a score in [-1, 1] 105 | (i.e. the expected value of the end game score from the current 106 | player's perspective) for the current player. 107 | c_puct: a number in (0, inf) that controls how quickly exploration 108 | converges to the maximum-value policy. A higher value means 109 | relying on the prior more. 110 | """ 111 | self._root = TreeNode(None, 1.0) #初始化根节点 112 | self._policy = policy_value_fn #用于生成子节点action-prob对 113 | self._c_puct = c_puct #一个常数,好像没啥用 114 | self._n_playout = n_playout #模拟多少次走一步 115 | 116 | #进行一次模拟_root就代表传入state 117 | def _playout(self, state): 118 | """Run a single playout from the root to the leaf, getting a value at 119 | the leaf and propagating it back through its parents. 120 | State is modified in-place, so a copy must be provided. 121 | """ 122 | node = self._root 123 | while(1): 124 | # Greedily select next move. 125 | #找出UCB最大的动作,并执行。 126 | action, node = node.select(self._c_puct) 127 | state.do_move(action) 128 | #直到去到叶子节点 129 | if node.is_leaf(): 130 | break 131 | 132 | # Evaluate the leaf using a network which outputs a list of 133 | # (action, probability) tuples p and also a score v in [-1, 1] 134 | # for the current player. 135 | # 我们评估这个叶子节点的Q,和他的action-probs 136 | # 如果还没有结束,那么就扩展这棵树。action-probs放进子节点。 137 | action_probs, leaf_value = self._policy(state) 138 | # Check for end of game. 139 | end, winner = state.game_end() 140 | if not end: 141 | node.expand(action_probs) 142 | 143 | #如果结束了。 144 | # 如果平局,就设置成leaf_value = 0 145 | # 否则: 如果胜利者是当前的,那么leaf_value = 1, 否则leaf_value = -1 146 | else: 147 | # for end state,return the "true" leaf_value 148 | if winner == -1: # tie 149 | leaf_value = 0.0 150 | else: 151 | leaf_value = ( 152 | 1.0 if winner == state.get_current_player() else -1.0 153 | ) 154 | # 向上更新祖先节点。 155 | # Update value and visit count of nodes in this traversal. 156 | node.update_recursive(-leaf_value) 157 | 158 | 159 | def get_move_probs(self, state, temp=1e-3): 160 | """Run all playouts sequentially and return the available actions and 161 | their corresponding probabilities. 
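        The returned probabilities come from the root's visit counts,
        pi(a) proportional to N(a)^(1/temp), computed below as
        softmax(log(N)/temp). With illustrative counts (10, 30, 60) and temp=1
        this gives (0.1, 0.3, 0.6); with the default temp=1e-3 almost all of
        the probability mass lands on the most-visited move.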
162 | state: the current game state 163 | temp: temperature parameter in (0, 1] controls the level of exploration 164 | """ 165 | #每一步进行_n_playout次模拟 166 | #每次都把现在的state复制出来。进行模拟,一直到游戏结束 167 | #把得到的leaf_value更新到每个状态。同时更新被访问次数。 168 | #最后我们会得到一颗模拟出来各种结果的树,我们需要的就是这个树。 169 | for n in range(self._n_playout): 170 | #关于copy.deepcopy(state) 171 | #https://blog.csdn.net/u010712012/article/details/79754132 172 | state_copy = copy.deepcopy(state) 173 | self._playout(state_copy) 174 | 175 | # calc the move probabilities based on visit counts at the root node 176 | # _root._children.items()访问根节点的_children,就是访问当前状态下,各个动作和对应的节点。 177 | # 取出节点和被访问次数 178 | # 然后一轮运算后,根据访问次数,获得act 和对应的act_probs 179 | act_visits = [(act, node._n_visits) 180 | for act, node in self._root._children.items()] 181 | acts, visits = zip(*act_visits) 182 | act_probs = softmax(1.0/temp * np.log(np.array(visits) + 1e-10)) 183 | 184 | return acts, act_probs 185 | 186 | def update_with_move(self, last_move): 187 | """Step forward in the tree, keeping everything we already know 188 | about the subtree. 189 | """ 190 | #下棋后,检查这move是否在这个树的子节点中。如果在就把根节点移动到这个节点。 191 | #否则新建一个节点。 192 | #这棵树会一直维护,直到一次游戏结束。 193 | if last_move in self._root._children: 194 | self._root = self._root._children[last_move] 195 | self._root._parent = None 196 | # 输入-1,重置整棵树 197 | else: 198 | self._root = TreeNode(None, 1.0) 199 | 200 | def __str__(self): 201 | return "MCTS" 202 | 203 | 204 | class MCTSPlayer(object): 205 | """AI player based on MCTS""" 206 | 207 | def __init__(self, policy_value_function, 208 | c_puct=5, n_playout=2000, is_selfplay=0): 209 | self.mcts = MCTS(policy_value_function, c_puct, n_playout) 210 | self._is_selfplay = is_selfplay 211 | 212 | def set_player_ind(self, p): 213 | self.player = p 214 | 215 | def reset_player(self): 216 | self.mcts.update_with_move(-1) #把-1传进去,就重置了整个树了。 217 | 218 | def get_action(self, board, temp=1e-3, return_prob=0): 219 | #============================================================================ 220 | #进行一次游戏,每一步都会get一次action的。 221 | #1.首先获取合法动作位置 222 | #2.基于当前状态,进行_n_playout次模拟。生成树,并返回acts, probs(注意:这个prob是根据树的访问次数来的,不是通过network来的) 223 | #3.如果是selfplay模式,那么加噪音然后sample。然后挪动树的根节点。(树是保留的) 224 | # 如果不是selfplay模式,那么不加噪音。 重置整棵树 225 | # ============================================================================ 226 | sensible_moves = board.availables 227 | # the pi vector returned by MCTS as in the alphaGo Zero paper 228 | move_probs = np.zeros(board.width*board.height) 229 | 230 | if len(sensible_moves) > 0: 231 | # 进行n_playout模拟,生成一棵MCTS,返回根节点的acts, probs 232 | acts, probs = self.mcts.get_move_probs(board, temp) 233 | move_probs[list(acts)] = probs 234 | 235 | #======================================= 236 | #如果是selfplay模式,就要加0.25噪音。然后sample出一个move,执行。 237 | #如果不是selfplay模式,就不加噪音,但会重置整棵树。 238 | if self._is_selfplay: 239 | # add Dirichlet Noise for exploration (needed for 240 | # self-play training) 241 | move = np.random.choice( 242 | acts, 243 | p=0.75*probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))) 244 | ) 245 | # update the root node and reuse the search tree 246 | self.mcts.update_with_move(move) #把树的根节点和当前状态对应。 247 | else: 248 | # with the default temp=1e-3, it is almost equivalent 249 | # to choosing the move with the highest prob 250 | move = np.random.choice(acts, p=probs) 251 | # reset the root node 252 | self.mcts.update_with_move(-1) 253 | #location = board.move_to_location(move) 254 | #print("AI move: %d,%d\n" % (location[0], location[1])) 255 | # ======================================= 256 | if 
return_prob: 257 | return move, move_probs 258 | else: 259 | return move 260 | else: 261 | print("WARNING: the board is full") 262 | 263 | def __str__(self): 264 | return "MCTS {}".format(self.player) 265 | -------------------------------------------------------------------------------- /Alpha-Zero/FiveAI/policy_value_net_tensorflow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An implementation of the policyValueNet in Tensorflow 4 | Tested in Tensorflow 1.4 and 1.5 5 | 6 | @author: Xiang Zhong 7 | """ 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | 13 | class PolicyValueNet(): 14 | def __init__(self, board_width, board_height, model_file=None): 15 | self.board_width = board_width 16 | self.board_height = board_height 17 | 18 | # Define the tensorflow neural network 19 | # 1. Input: 20 | self.input_states = tf.placeholder( 21 | tf.float32, shape=[None, 4, board_height, board_width]) 22 | self.input_state = tf.transpose(self.input_states, [0, 2, 3, 1]) 23 | 24 | #========================================================================= 25 | # 2. Common Networks Layers 26 | self.conv1 = tf.layers.conv2d(inputs=self.input_state, 27 | filters=32, kernel_size=[3, 3], 28 | padding="same", data_format="channels_last", 29 | activation=tf.nn.relu) 30 | self.conv2 = tf.layers.conv2d(inputs=self.conv1, filters=64, 31 | kernel_size=[3, 3], padding="same", 32 | data_format="channels_last", 33 | activation=tf.nn.relu) 34 | self.conv3 = tf.layers.conv2d(inputs=self.conv2, filters=128, 35 | kernel_size=[3, 3], padding="same", 36 | data_format="channels_last", 37 | activation=tf.nn.relu) 38 | 39 | # ========================================================================= 40 | # 3-1 Action Networks 41 | self.action_conv = tf.layers.conv2d(inputs=self.conv3, filters=4, 42 | kernel_size=[1, 1], padding="same", 43 | data_format="channels_last", 44 | activation=tf.nn.relu) 45 | # Flatten the tensor 46 | self.action_conv_flat = tf.reshape( 47 | self.action_conv, [-1, 4 * board_height * board_width]) 48 | # 3-2 Full connected layer, the output is the log probability of moves 49 | # on each slot on the board 50 | self.action_fc = tf.layers.dense(inputs=self.action_conv_flat, 51 | units=board_height * board_width, 52 | activation=tf.nn.log_softmax) 53 | 54 | # ========================================================================= 55 | # 4 Evaluation Networks 56 | self.evaluation_conv = tf.layers.conv2d(inputs=self.conv3, filters=2, 57 | kernel_size=[1, 1], 58 | padding="same", 59 | data_format="channels_last", 60 | activation=tf.nn.relu) 61 | self.evaluation_conv_flat = tf.reshape( 62 | self.evaluation_conv, [-1, 2 * board_height * board_width]) 63 | self.evaluation_fc1 = tf.layers.dense(inputs=self.evaluation_conv_flat, 64 | units=64, activation=tf.nn.relu) 65 | # output the score of evaluation on current state 66 | self.evaluation_fc2 = tf.layers.dense(inputs=self.evaluation_fc1, 67 | units=1, activation=tf.nn.tanh) 68 | 69 | 70 | # ========================================================================= 71 | # Define the Loss function 72 | # 1. Label: the array containing if the game wins or not for each state 73 | self.labels = tf.placeholder(tf.float32, shape=[None, 1]) 74 | # 2. Predictions: the array containing the evaluation score of each state 75 | # which is self.evaluation_fc2 76 | # =======3-1. 
Value Loss function======= 77 | self.value_loss = tf.losses.mean_squared_error(self.labels, 78 | self.evaluation_fc2) 79 | 80 | # =======3-2. Policy Loss function======= 81 | self.mcts_probs = tf.placeholder( 82 | tf.float32, shape=[None, board_height * board_width]) 83 | self.policy_loss = tf.negative(tf.reduce_mean( 84 | tf.reduce_sum(tf.multiply(self.mcts_probs, self.action_fc), 1))) 85 | 86 | # =======3-3. L2 penalty (regularization)======= 87 | l2_penalty_beta = 1e-4 88 | vars = tf.trainable_variables() 89 | l2_penalty = l2_penalty_beta * tf.add_n( 90 | [tf.nn.l2_loss(v) for v in vars if 'bias' not in v.name.lower()]) 91 | # 3-4 Add up to be the Loss function 92 | self.loss = self.value_loss + self.policy_loss + l2_penalty 93 | 94 | # =======Define the optimizer we use for training======= 95 | self.learning_rate = tf.placeholder(tf.float32) 96 | self.optimizer = tf.train.AdamOptimizer( 97 | learning_rate=self.learning_rate).minimize(self.loss) 98 | 99 | # Make a session 100 | self.session = tf.Session() 101 | 102 | # =======calc policy entropy, for monitoring only======= 103 | # 这里有点不懂,为啥要算这样一个entropy呢? 104 | self.entropy = tf.negative(tf.reduce_mean( 105 | tf.reduce_sum(tf.exp(self.action_fc) * self.action_fc, 1))) 106 | 107 | # Initialize variables 108 | init = tf.global_variables_initializer() 109 | self.session.run(init) 110 | 111 | # For saving and restoring 112 | self.saver = tf.train.Saver() 113 | if model_file is not None: 114 | self.restore_model(model_file) 115 | 116 | 117 | #输入state的batch,注意这里是state,不是broad 118 | #输出每个动作的胜率,和这个state的价值 119 | def policy_value(self, state_batch): 120 | """ 121 | input: a batch of states 122 | output: a batch of action probabilities and state values 123 | """ 124 | log_act_probs, value = self.session.run( 125 | [self.action_fc, self.evaluation_fc2], 126 | feed_dict={self.input_states: state_batch} 127 | ) 128 | act_probs = np.exp(log_act_probs) 129 | return act_probs, value 130 | 131 | #输入board, 132 | #返回 133 | # 1.合法位置和对应的概率 134 | # 2.状态价值 135 | def policy_value_fn(self, board): 136 | """ 137 | input: board 138 | output: a list of (action, probability) tuples for each available 139 | action and the score of the board state 140 | """ 141 | legal_positions = board.availables 142 | #ascontiguousarray函数将一个内存不连续存储的数组转换为内存连续存储的数组,使得运行速度更快。 143 | current_state = np.ascontiguousarray(board.current_state().reshape( 144 | -1, 4, self.board_width, self.board_height)) 145 | act_probs, value = self.policy_value(current_state) 146 | act_probs = zip(legal_positions, act_probs[0][legal_positions]) 147 | return act_probs, value 148 | 149 | 150 | #winner_batch不知道有什么用呢? 151 | #self.labels: winner_batch,就是用胜利者的V作为标签,提升价值? 
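    # winner_batch holds z from each finished self-play game: +1 if the player
    # to move in that state eventually won, -1 if it lost, 0 for a tie. It is
    # the regression target (self.labels) for evaluation_fc2, i.e. the value
    # loss above. The entropy is computed only to monitor how deterministic the
    # policy has become. A minimal sketch with made-up log-probabilities
    # (action_fc outputs log-softmax values):
    #   log_p = np.log(np.array([[0.5, 0.3, 0.2]]))
    #   entropy = -np.mean(np.sum(np.exp(log_p) * log_p, 1))   # about 1.03 nats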
152 | def train_step(self, state_batch, mcts_probs, winner_batch, lr): 153 | """perform a training step""" 154 | winner_batch = np.reshape(winner_batch, (-1, 1)) 155 | loss, entropy, _ = self.session.run( 156 | [self.loss, self.entropy, self.optimizer], 157 | feed_dict={self.input_states: state_batch, 158 | self.mcts_probs: mcts_probs, 159 | self.labels: winner_batch, 160 | self.learning_rate: lr}) 161 | return loss, entropy 162 | 163 | def save_model(self, model_path): 164 | self.saver.save(self.session, model_path) 165 | 166 | def restore_model(self, model_path): 167 | self.saver.restore(self.session, model_path) 168 | -------------------------------------------------------------------------------- /Alpha-Zero/FiveAI/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An implementation of the training pipeline of AlphaZero for Gomoku 4 | 5 | @author: Junxiao Song 6 | """ 7 | 8 | from __future__ import print_function 9 | import random 10 | import numpy as np 11 | from collections import defaultdict, deque 12 | from game import Board, Game 13 | from mcts_pure import MCTSPlayer as MCTS_Pure 14 | from mcts_alphaZero import MCTSPlayer 15 | from policy_value_net import PolicyValueNet # Theano and Lasagne 16 | # from policy_value_net_pytorch import PolicyValueNet # Pytorch 17 | # from policy_value_net_tensorflow import PolicyValueNet # Tensorflow 18 | # from policy_value_net_keras import PolicyValueNet # Keras 19 | 20 | 21 | class TrainPipeline(): 22 | def __init__(self, init_model=None): 23 | # params of the board and the game 24 | self.board_width = 6 #棋盘宽度 25 | self.board_height = 6 #棋盘高度 26 | self.n_in_row = 4 #胜利条件:多少个棋连成一线算是胜利 27 | 28 | # 实例化一个board,定义棋盘宽高和胜利条件 29 | self.board = Board(width=self.board_width, 30 | height=self.board_height, 31 | n_in_row=self.n_in_row) 32 | self.game = Game(self.board) 33 | 34 | # training params 35 | self.learn_rate = 2e-3 36 | self.lr_multiplier = 1.0 # adaptively adjust the learning rate based on KL 37 | self.temp = 1.0 # the temperature param 38 | self.n_playout = 400 # num of simulations for each move 39 | self.c_puct = 5 40 | self.buffer_size = 10000 41 | self.batch_size = 512 # mini-batch size for training 42 | self.data_buffer = deque(maxlen=self.buffer_size) 43 | self.play_batch_size = 1 44 | self.epochs = 5 # num of train_steps for each update 45 | self.kl_targ = 0.02 46 | self.check_freq = 50 47 | self.game_batch_num = 1500 48 | self.best_win_ratio = 0.0 49 | # num of simulations used for the pure mcts, which is used as 50 | # the opponent to evaluate the trained policy 51 | self.pure_mcts_playout_num = 1000 52 | 53 | #初始化network和树,network是一直保存的,树的话不知道什么时候重置。 54 | if init_model: 55 | # start training from an initial policy-value net 56 | self.policy_value_net = PolicyValueNet(self.board_width, 57 | self.board_height, 58 | model_file=init_model) 59 | else: 60 | # start training from a new policy-value net 61 | self.policy_value_net = PolicyValueNet(self.board_width, 62 | self.board_height) 63 | self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, 64 | c_puct=self.c_puct, 65 | n_playout=self.n_playout, 66 | is_selfplay=1) 67 | 68 | #作用是扩充data,因为五子棋是上下左右相同的。 69 | def get_equi_data(self, play_data): 70 | """augment the data set by rotation and flipping 71 | ##play_data: [(state, mcts_prob, winner_z), ..., ...] 
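        A Gomoku sample is equivalent under the 8 symmetries of the square
        (4 rotations, each with an optional flip), so every self-play step can
        be turned into 8 training samples; the winner label z is unchanged by
        the symmetry. The essential point in the code below is that the state
        planes and the flattened mcts_prob are transformed consistently, e.g.
        for one counterclockwise rotation (H, W standing for board_height,
        board_width):

            equi_state = np.array([np.rot90(plane, 1) for plane in state])
            equi_prob = np.rot90(np.flipud(prob.reshape(H, W)), 1)
            sample = (equi_state, np.flipud(equi_prob).flatten(), winner)

        The flipud pair appears to compensate for the row reversal that
        current_state() applies to the planes.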
72 | """ 73 | extend_data = [] 74 | for state, mcts_porb, winner in play_data: 75 | for i in [1, 2, 3, 4]: 76 | # rotate counterclockwise 77 | # np.rot90:矩阵旋转90度 78 | # np.flipud:矩阵反转 79 | equi_state = np.array([np.rot90(s, i) for s in state]) 80 | equi_mcts_prob = np.rot90(np.flipud( 81 | mcts_porb.reshape(self.board_height, self.board_width)), i) 82 | extend_data.append((equi_state, 83 | np.flipud(equi_mcts_prob).flatten(), 84 | winner)) 85 | # flip horizontally 86 | equi_state = np.array([np.fliplr(s) for s in equi_state]) 87 | equi_mcts_prob = np.fliplr(equi_mcts_prob) 88 | extend_data.append((equi_state, 89 | np.flipud(equi_mcts_prob).flatten(), 90 | winner)) 91 | return extend_data 92 | 93 | #搜集selfplay的data 94 | def collect_selfplay_data(self, n_games=1): 95 | """collect self-play data for training""" 96 | #进行n_games游戏 97 | for i in range(n_games): 98 | winner, play_data = self.game.start_self_play(self.mcts_player, 99 | temp=self.temp) 100 | play_data = list(play_data)[:] 101 | self.episode_len = len(play_data) #对弈步数 102 | # augment the data 103 | play_data = self.get_equi_data(play_data) 104 | self.data_buffer.extend(play_data) 105 | 106 | def policy_update(self): 107 | """update the policy-value net""" 108 | #======解压数据============ 109 | mini_batch = random.sample(self.data_buffer, self.batch_size) 110 | state_batch = [data[0] for data in mini_batch] 111 | mcts_probs_batch = [data[1] for data in mini_batch] 112 | winner_batch = [data[2] for data in mini_batch] 113 | #========================= 114 | #这里好像做了important sampling,直接计算KL_diverges大小,超过一定就早停 115 | old_probs, old_v = self.policy_value_net.policy_value(state_batch) 116 | #进行epochs次训练 117 | for i in range(self.epochs): 118 | loss, entropy = self.policy_value_net.train_step( 119 | state_batch, 120 | mcts_probs_batch, 121 | winner_batch, 122 | self.learn_rate*self.lr_multiplier) 123 | new_probs, new_v = self.policy_value_net.policy_value(state_batch) 124 | kl = np.mean(np.sum(old_probs * ( 125 | np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), 126 | axis=1) 127 | ) 128 | if kl > self.kl_targ * 4: # early stopping if D_KL diverges badly 129 | break 130 | # adaptively adjust the learning rate 131 | # 根据上次更新的KL_diverges大小,动态调整学习率 132 | if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1: 133 | self.lr_multiplier /= 1.5 134 | elif kl < self.kl_targ / 2 and self.lr_multiplier < 10: 135 | self.lr_multiplier *= 1.5 136 | 137 | explained_var_old = (1 - 138 | np.var(np.array(winner_batch) - old_v.flatten()) / 139 | np.var(np.array(winner_batch))) 140 | explained_var_new = (1 - 141 | np.var(np.array(winner_batch) - new_v.flatten()) / 142 | np.var(np.array(winner_batch))) 143 | print(("kl:{:.5f}," 144 | "lr_multiplier:{:.3f}," 145 | "loss:{}," 146 | "entropy:{}," 147 | "explained_var_old:{:.3f}," 148 | "explained_var_new:{:.3f}" 149 | ).format(kl, 150 | self.lr_multiplier, 151 | loss, 152 | entropy, 153 | explained_var_old, 154 | explained_var_new)) 155 | return loss, entropy 156 | 157 | #用纯MCTS玩,和AlphaZERO玩,看看哪个更厉害 158 | def policy_evaluate(self, n_games=10): 159 | """ 160 | Evaluate the trained policy by playing against the pure MCTS player 161 | Note: this is only for monitoring the progress of training 162 | """ 163 | current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, 164 | c_puct=self.c_puct, 165 | n_playout=self.n_playout) 166 | pure_mcts_player = MCTS_Pure(c_puct=5, 167 | n_playout=self.pure_mcts_playout_num) 168 | win_cnt = defaultdict(int) 169 | for i in range(n_games): 170 | winner = 
self.game.start_play(current_mcts_player, 171 | pure_mcts_player, 172 | start_player=i % 2, 173 | is_shown=0) 174 | win_cnt[winner] += 1 175 | win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games 176 | print("num_playouts:{}, win: {}, lose: {}, tie:{}".format( 177 | self.pure_mcts_playout_num, 178 | win_cnt[1], win_cnt[2], win_cnt[-1])) 179 | return win_ratio 180 | 181 | #training pipeline 182 | def run(self): 183 | """run the training pipeline""" 184 | try: 185 | for i in range(self.game_batch_num): 186 | #搜集data,搜集play_batch_size次,每次玩n_game次。 187 | #每次game都会新建一棵树,每一步就是树的一个节点。 188 | #每一步都会进行_n_playout次模拟 189 | self.collect_selfplay_data(self.play_batch_size) 190 | print("batch i:{}, episode_len:{}".format( 191 | i+1, self.episode_len)) 192 | # data足够,update.可以用上important sampling,updata,n次。 193 | # update玩,进行新的搜集时,就会清空原来数据。 194 | if len(self.data_buffer) > self.batch_size: 195 | loss, entropy = self.policy_update() 196 | # check the performance of the current model, 197 | # and save the model params 198 | if (i+1) % self.check_freq == 0: 199 | print("current self-play batch: {}".format(i+1)) 200 | win_ratio = self.policy_evaluate() 201 | self.policy_value_net.save_model('./current_policy.model') 202 | if win_ratio > self.best_win_ratio: 203 | print("New best policy!!!!!!!!") 204 | self.best_win_ratio = win_ratio 205 | # update the best_policy 206 | self.policy_value_net.save_model('./best_policy.model') 207 | if (self.best_win_ratio == 1.0 and 208 | self.pure_mcts_playout_num < 5000): 209 | self.pure_mcts_playout_num += 1000 210 | self.best_win_ratio = 0.0 211 | except KeyboardInterrupt: 212 | print('\n\rquit') 213 | 214 | 215 | if __name__ == '__main__': 216 | training_pipeline = TrainPipeline() 217 | training_pipeline.run() 218 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### 白话强化学习 2 | 3 | 愿成为一把梯子,助你跨过无数的坑。——初心 4 | 5 | 当前,人工智能可以说是大热的学科。强化学习这个分支更是备受瞩目。至于强化学习什么征服Atari,围棋,德扑之类这些就不吹了。你应该知道自己为什么要学reinforcment learning。 6 | 7 | 但在入门的时候,却感觉非常困难! 8 | 9 | 以个人爬坑的经验,主要有三点: 10 | 11 | 1. 人工智能背靠数学。但绝大部分人,在高考之后,数学能力就直线下降。一生都被数学的恐惧所支配。这无疑令想入门人工智能的人望而却步。学习了大半年贝叶斯和博弈论,但还是进入不了人工智能的大门。其实在人工智能领域,很多时候更讲求“直觉”。人工智能科学家的算力远不如计算机,但为什么能设计出这些优秀的模型呢?除了数学,主要还是直觉。我希望这系列文章,能先帮助大家“绕”过数学,进入人工智能的大门。当我们进入大门,能够用代码实现一些算法,再深究算法中的数学,那时候必定更容易理解,更有动力学习其数学原理。 12 | 13 | 2. 代码能力。这里先给大家吃颗定心丸。在强化学习试验,代码一般比较简单的。很多人工智能的代码已经集成成工具给大家直接调用。在这个系列文章,每种强化学习的算法我会用tensorflow2.0的官方示例加上我自己的代码详细讲解,并提供我自己的注释版本。让你一边学习,一边提高代码能力。 14 | 15 | 3. 学习线路。强化学习作为人工智能的一个分支,网上已经有很多学习资料。但翻开资料,每位老师说的方式和侧重点都不太一样。我希望在这系列文章中,给大家指出一条线路。让大家能够快速入门。大家将会在这系列文章学到:TD、MC、Qlearning、DQN、DQN的变种、PG、AC、A3C、SAC、PPO、DPPO等系列算法的实现。这系列文章将会少用数学,多做代码解释。希望能帮助大家快速掌握强化学习知识。快速入门。 16 | 17 | 希望我的努力能够帮助大家较少入门的负担。 18 | 19 | 综述: 20 | 21 | 欢迎大家入坑Reinforcment Learning 22 | https://zhuanlan.zhihu.com/p/111869532 23 | 24 | ![学习线路.png](./assets/学习线路.png) 25 | 26 | 目录 27 | 第一部分——概念: 28 | 29 | 怎样正确理解马尔科夫链? 30 | https://zhuanlan.zhihu.com/p/109217883 31 | 32 | 如何理解强化学习中的Q值和V值? 33 | https://zhuanlan.zhihu.com/p/109498587 34 | 35 | 如何用蒙地卡罗方法(Monte-Carlo)估算V值? 36 | https://zhuanlan.zhihu.com/p/109755443 37 | 38 | [番外]蒙地卡罗MC的更新公式怎么来的? 39 | https://zhuanlan.zhihu.com/p/110118392 40 | 41 | 如何用时序差分TD估算状态V值? 42 | https://zhuanlan.zhihu.com/p/110132710 43 | 44 | 第二部分——核心算法: 45 | 46 | [番外]如何从师生关系理解环境与智能体的互动? 47 | https://zhuanlan.zhihu.com/p/110155777 48 | 49 | [理论篇]怎样直观理解Qlearning算法? 
50 | https://zhuanlan.zhihu.com/p/110338833 51 | 52 | 手把手教你实现Qlearning算法[实战篇] 53 | https://zhuanlan.zhihu.com/p/110410276 54 | 55 | 一篇文章带你了解深度神经网络 56 | https://zhuanlan.zhihu.com/p/110531783 57 | 58 | 三维可视化助你直观理解DQN算法[DQN理论篇] 59 | https://zhuanlan.zhihu.com/p/110620815 60 | 61 | 用可视化直观理解DQN[DQN实战篇] 62 | https://zhuanlan.zhihu.com/p/110657606 63 | 64 | Double DQN原理是什么,怎样实现?(附代码) 65 | https://zhuanlan.zhihu.com/p/110769361 66 | 67 | [番外篇]DuelingDQN为何那么强?(附代码) 68 | https://zhuanlan.zhihu.com/p/110807201 69 | 70 | 如何理解策略梯度(Policy Gradient)算法?[附代码] 71 | https://zhuanlan.zhihu.com/p/110881517 72 | 73 | 理解Actor-Critic的关键是什么? 74 | https://zhuanlan.zhihu.com/p/110998399 75 | 76 | 小段文讲清argparse模块基本用法[小番外] 77 | https://zhuanlan.zhihu.com/p/111010774 78 | 79 | 如何直观理解PPO算法?[理论篇] 80 | https://zhuanlan.zhihu.com/p/111049450 81 | 82 | 如何直观理解PPO算法[实战篇] 83 | https://zhuanlan.zhihu.com/p/111068310 84 | 85 | 一文带你理清DDPG算法 86 | https://zhuanlan.zhihu.com/p/111257402 87 | 88 | 什么是TD3算法? 89 | https://zhuanlan.zhihu.com/p/111334500 90 | 91 | AC:看我的影分身之术[A3C] 92 | https://zhuanlan.zhihu.com/p/111336330 93 | 94 | PPO:看我的影分身之术[DPPO] 95 | https://zhuanlan.zhihu.com/p/111346592 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /assets/学习线路.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louisnino/RLcode/10296e8536e5f661a05be2d3995363e8cf36194c/assets/学习线路.png -------------------------------------------------------------------------------- /double_DQN & dueling_DQN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorlayer as tl 3 | from collections import deque 4 | import numpy as np 5 | import gym 6 | import random 7 | 8 | 9 | class Double_DQN(): 10 | def __init__(self): 11 | self.env = gym.make('CartPole-v0') #定义环境 12 | self.input_dim = self.env.observation_space.shape[0] #定义网络的输入形状,这里就是输入S 13 | 14 | #建立两个网络 15 | self.Q_network = self.get_model() #建立一个Q网络 16 | self.Q_network.train() #在tensorlayer要指定这个网络用于训练。 17 | self.target_Q_network = self.get_model() #创建一个target_Q网络 18 | self.target_Q_network.eval() #这个网络指定为不用于更新。 19 | 20 | ## epsilon-greedy相关参数 21 | self.epsilon = 1.0 #epsilon大小,随机数大于epsilon,则进行开发;否则,进行探索。 22 | self.epsilon_decay = 0.995 #减少率:epsilon会随着迭代而更新,每次会乘以0.995 23 | self.epsilon_min = 0.01 #小于最小epsilon就不再减少了。 24 | 25 | #其余超参数 26 | self.memory = deque(maxlen=2000) #队列,最大值是2000 27 | self.batch = 128 28 | self.gamma = 0.95 #折扣率 29 | self.learning_rate = 1e-3 #学习率 30 | self.opt = tf.optimizers.Adam(self.learning_rate) #优化器 31 | self.is_rend = False #默认不渲染,当达到一定次数后,开始渲染。 32 | ''' 33 | def get_model(self): 34 | #创建网络 35 | # 输入:S 36 | # 输出:所有动作的Q值 37 | self.input = tl.layers.Input(shape=[None,self.input_dim]) 38 | self.h1 = tl.layers.Dense(32, tf.nn.relu, W_init=tf.initializers.GlorotUniform())(self.input) 39 | self.h2 = tl.layers.Dense(16, tf.nn.relu, W_init=tf.initializers.GlorotUniform())(self.h1) 40 | self.output = tl.layers.Dense(2,act=None, W_init=tf.initializers.GlorotUniform())(self.h2) 41 | return tl.models.Model(inputs=self.input,outputs=self.output) 42 | 43 | ''' 44 | 45 | # dueling DQN只改了网络架构。 46 | def get_model(self): 47 | #第一部分 48 | input = tl.layers.Input(shape=[None,self.input_dim]) 49 | h1 = tl.layers.Dense(16, tf.nn.relu, W_init=tf.initializers.GlorotUniform())(input) 50 | h2 = tl.layers.Dense(16, tf.nn.relu, 
W_init=tf.initializers.GlorotUniform())(h1) 51 | #第二部分 52 | svalue = tl.layers.Dense(2,)(h2) 53 | #第三部分 54 | avalue = tl.layers.Dense(2,)(h2) #计算avalue 55 | mean = tl.layers.Lambda(lambda x: tf.reduce_mean(x,axis=1,keepdims=True))(avalue) #用Lambda层,计算avg(a) 56 | advantage = tl.layers.ElementwiseLambda(lambda x,y: x-y)([avalue,mean]) #a - avg(a) 57 | 58 | output = tl.layers.ElementwiseLambda(lambda x,y: x+y)([svalue,avalue]) 59 | return tl.models.Model(inputs=input,outputs=output) 60 | 61 | 62 | def update_epsilon(self): 63 | ''' 64 | 用于更新epsilon 65 | 除非已经epsilon_min还小,否则比每次都乘以减少率epsilon_decay。 66 | ''' 67 | if self.epsilon >= self.epsilon_min: 68 | self.epsilon *= self.epsilon_decay 69 | 70 | def update_target_Q(self): 71 | ''' 72 | Q网络学习完之后,需要把参数赋值到target_Q网络 73 | ''' 74 | for i , target in zip(self.Q_network.trainable_weights, self.target_Q_network.trainable_weights): 75 | target.assign(i) 76 | 77 | def remember(self, s, a, s_, r, done): 78 | ''' 79 | 把数据放入到队列中保存。 80 | ''' 81 | data = (s, a, s_, r, done) 82 | self.memory.append(data) 83 | 84 | def process_data(self): 85 | 86 | # 从队列中,随机取出一个batch大小的数据。 87 | data = random.sample(self.memory, self.batch) 88 | s = np.array([d[0] for d in data]) 89 | a = [d[1] for d in data] 90 | s_ = np.array([d[2] for d in data]) 91 | r = [d[3] for d in data] 92 | done = [d[4] for d in data] 93 | 94 | # 原始DQN的target 95 | ''' 96 | target_Q = np.max(self.target_Q_network(np.array(s_,dtype='float32'))) #计算下一状态最大的Q值 97 | target = target_Q * self.gamma + r 98 | ''' 99 | # [敲黑板] 100 | # 计算Double的target 101 | y = self.Q_network(np.array(s,dtype='float32')) 102 | y = y.numpy() 103 | Q1 = self.target_Q_network(np.array(s_,dtype='float32')) 104 | Q2 = self.Q_network(np.array(s_,dtype='float32')) 105 | next_action = np.argmax(Q2,axis=1) 106 | 107 | for i ,(_,a,_,r,done) in enumerate(data): 108 | if done: 109 | target = r 110 | else: 111 | #[敲黑板] 112 | # next_action是从Q_network计算出来的最大Q值的动作 113 | # 但输出的,是target_Q_network中的next_action的Q值。 114 | # 可以理解为:一个网络提议案,另外一个网络进行执行 115 | target = r + self.gamma * Q1[i][next_action[i]] 116 | target = np.array(target,dtype='float32') 117 | 118 | # y 就是更新目标。 119 | y[i][a] = target 120 | return s, y 121 | 122 | def update_Q_network(self): 123 | ''' 124 | 更新Q_network,最小化target和Q的距离 125 | ''' 126 | s,y = self.process_data() 127 | with tf.GradientTape() as tape: 128 | Q = self.Q_network(np.array(s,dtype='float32')) 129 | loss = tl.cost.mean_squared_error(Q,y) # 最小化target和Q的距离 130 | grads = tape.gradient(loss, self.Q_network.trainable_weights) 131 | self.opt.apply_gradients(zip(grads,self.Q_network.trainable_weights)) 132 | return loss 133 | 134 | def get_action(self,s): 135 | ''' 136 | 用epsilon-greedy的方式求动作。 137 | ''' 138 | # 先随机一个数,如果比epsilon大,那么,就输出最大Q值的动作。 139 | if np.random.rand()>=self.epsilon: 140 | q = self.Q_network(np.array(s,dtype='float32').reshape([-1,4])) 141 | a = np.argmax(q) 142 | return a 143 | # 否则,随机一个动作输出。 144 | else: 145 | a = random.randint(0, 1) 146 | return a 147 | 148 | ## 开始训练 149 | def train(self,episode): 150 | step = 0 151 | rend = 0 152 | for ep in range(episode): 153 | 154 | s = self.env.reset() #重置初始状态s 155 | total_reward = 0 156 | total_loss = [] 157 | loss = 0 158 | 159 | while True: 160 | if self.is_rend:self.env.render() 161 | 162 | # 进行游戏 163 | a = self.get_action(s) 164 | s_,r,done,_ = self.env.step(a) 165 | total_reward += r 166 | step += 1 167 | 168 | #保存s, a, s_, r, done 169 | self.remember(s, a, s_, r, done) 170 | s = s_ 171 | 172 | #如果数据足够,那么就开始更新 173 | if len(self.memory)>self.batch: 174 | loss = 
self.update_Q_network() 175 | total_loss.append(loss) 176 | if (step+1)%5 == 0: 177 | self.update_epsilon() 178 | self.update_target_Q() 179 | 180 | #如果到最终状态,就打印一下成绩如何 181 | if done: 182 | print('EP:%i, total_rewards:%f, epsilon:%f, loss:%f'%(ep,total_reward,self.epsilon,np.mean(loss))) 183 | break 184 | 185 | # 如果有5个ep成绩大于200,就开始渲染游戏。 186 | if total_reward>=200: 187 | rend += 1 188 | if rend == 5: 189 | self.is_rend = True 190 | 191 | # 开始运行游戏 192 | if __name__=='__main__': 193 | ddqn = Double_DQN() 194 | ddqn.train(200) 195 | -------------------------------------------------------------------------------- /theading_demo.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import threading 4 | import queue 5 | 6 | N_WORKER = 4 #worker的数量 7 | QUEUE = queue.Queue() #队列,用于储存数据 8 | EP_MAX = 10 #执行EP 9 | EP_LEN = 200 #每个EP的最大步数 10 | MIN_BATCH_SIZE = 10 #每个batch的大小 11 | 12 | 13 | class Worker(): 14 | #工人对象的id。该程序只是模拟,所以在填入数据的时候,会直接把wid放入队列表示该工人产生的数据。 15 | def __init__(self,wid): 16 | self.wid = wid #工人id 17 | 18 | def work(self): 19 | global GLOBAL_EP, GLOBAL_UPDATE_COUNTER 20 | 21 | #判断是否所有线程都应该停止了。 22 | while not COORD.should_stop(): 23 | 24 | for _ in range(EP_LEN): #开始新的EP 25 | 26 | #if not ROLLING_EVENT.is_set(): #如果有其他worker线程已经被阻塞,那么其他线程也需要在这等待。 27 | ROLLING_EVENT.wait() 28 | 29 | QUEUE.put(self.wid) 30 | ''' 31 | 这里做了简化,直接把worker的id当做和环境互动产生的数据放入队列中。 32 | 实际上,这里会用buffer记录智能体和环境互动产生的数据。当数据大于MIN_BATCH_SIZE才开始整理数据。 33 | ''' 34 | GLOBAL_UPDATE_COUNTER += 1 #GLOBAL_UPDATE_COUNTER+1:表示有智能体走了一步了 35 | 36 | 37 | if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: #更新大于 38 | ''' 39 | 这里可以插入整理数据部分 40 | ''' 41 | ROLLING_EVENT.clear() 42 | UPDATE_EVENT.set() 43 | 44 | if GLOBAL_EP >= EP_MAX: #更新10次 45 | COORD.request_stop() 46 | break 47 | 48 | 49 | 50 | 51 | class PPO(object): 52 | 53 | def update(self): 54 | global GLOBAL_UPDATE_COUNTER 55 | 56 | #判断是否所有线程都应该停止了。 57 | while not COORD.should_stop(): 58 | if GLOBAL_EP <= EP_MAX: 59 | UPDATE_EVENT.wait() 60 | 61 | ''' 62 | 这里用输出表示更新 63 | ''' 64 | print("====update====") 65 | print("GLOBAL_EP",GLOBAL_EP) 66 | print("GLOBAL_UPDATE_COUNTER:",GLOBAL_UPDATE_COUNTER) 67 | print("update_old_pi") 68 | print("Queuesize:",QUEUE.qsize()) 69 | print([QUEUE.get() for _ in range(QUEUE.qsize())]) 70 | print("update Critic") 71 | print("update Actor") 72 | print("=====END======") 73 | 74 | GLOBAL_UPDATE_COUNTER = 0 75 | 76 | UPDATE_EVENT.clear() 77 | ROLLING_EVENT.set() 78 | 79 | if __name__ == "__main__": 80 | #创建worker对象 81 | #做法1: 82 | workers = [] 83 | for i in range(N_WORKER): 84 | worker = Worker(i) 85 | workers.append(worker) 86 | #做法2: 87 | #workers = [Worker(wid=i) for i in range(N_WORKER)] 88 | 89 | #创建PPO对象 90 | GLOBAL_PPO = PPO() 91 | 92 | #新建两个event:UPDATE_EVENT,ROLLING_EVENT 93 | #把UPDATE_EVENT的信号设置为阻塞 94 | #把ROLLING_EVENT的信号设置为就绪 95 | UPDATE_EVENT,ROLLING_EVENT = threading.Event(), threading.Event() 96 | UPDATE_EVENT.clear() 97 | ROLLING_EVENT.set() 98 | 99 | #定义两个全局变量 100 | #GLOBAL_UPDATE_COUNTER:每次更新+1 101 | #GLOBAL_STEP: 102 | GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0 103 | threads = [] 104 | 105 | #创建协调器 106 | COORD = tf.train.Coordinator() 107 | 108 | #开启rolling线程 109 | for worker in workers: #三个rolling线程 110 | t = threading.Thread(target=worker.work) #线程的功能就是执行work函数 111 | t.start() 112 | threads.append(t) 113 | 114 | #开启update线程 115 | threads.append(threading.Thread(target=GLOBAL_PPO.update,)) #update线程执行PPO的update函数 116 | threads[-1].start() #启动最后加入的线程,就是update线程 117 | 118 | #加入协调器 119 
| COORD.join(threads) -------------------------------------------------------------------------------- /tutorial_A3C.py: -------------------------------------------------------------------------------- 1 | """ 2 | Asynchronous Advantage Actor Critic (A3C) with Continuous Action Space. 3 | 4 | Actor Critic History 5 | ---------------------- 6 | A3C > DDPG (for continuous action space) > AC 7 | 8 | Advantage 9 | ---------- 10 | Train faster and more stable than AC. 11 | 12 | Disadvantage 13 | ------------- 14 | Have bias. 15 | 16 | Reference 17 | ---------- 18 | Original Paper: https://arxiv.org/pdf/1602.01783.pdf 19 | MorvanZhou's tutorial: https://morvanzhou.github.io/tutorials/ 20 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/experiments/Solve_BipedalWalker/A3C.py 21 | 22 | Environment 23 | ----------- 24 | BipedalWalker-v2 : https://gym.openai.com/envs/BipedalWalker-v2 25 | 26 | Reward is given for moving forward, total 300+ points up to the far end. 27 | If the robot falls, it gets -100. Applying motor torque costs a small amount of 28 | points, more optimal agent will get better score. State consists of hull angle 29 | speed, angular velocity, horizontal speed, vertical speed, position of joints 30 | and joints angular speed, legs contact with ground, and 10 lidar rangefinder 31 | measurements. There's no coordinates in the state vector. 32 | 33 | Prerequisites 34 | -------------- 35 | tensorflow 2.0.0a0 36 | tensorflow-probability 0.6.0 37 | tensorlayer 2.0.0 38 | && 39 | pip install box2d box2d-kengz --user 40 | 41 | To run 42 | ------ 43 | python tutorial_A3C.py --train/test 44 | 45 | """ 46 | 47 | import argparse 48 | import multiprocessing 49 | import threading 50 | import time 51 | 52 | import gym 53 | import numpy as np 54 | import tensorflow as tf 55 | import tensorflow_probability as tfp 56 | 57 | import tensorlayer as tl 58 | from tensorlayer.layers import DenseLayer, InputLayer 59 | 60 | tfd = tfp.distributions 61 | 62 | tl.logging.set_verbosity(tl.logging.DEBUG) 63 | 64 | np.random.seed(2) 65 | tf.random.set_seed(2) # reproducible 66 | 67 | # add arguments in command --train/test 68 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 69 | parser.add_argument('--train', dest='train', action='store_true', default=True) 70 | parser.add_argument('--test', dest='test', action='store_true', default=False) 71 | args = parser.parse_args() 72 | 73 | ##################### hyper parameters #################### 74 | 75 | GAME = 'BipedalWalker-v2' # BipedalWalkerHardcore-v2 BipedalWalker-v2 LunarLanderContinuous-v2 76 | LOG_DIR = './log' # the log file 77 | N_WORKERS = multiprocessing.cpu_count() # number of workers accroding to number of cores in cpu 78 | print("n_workers:",N_WORKERS) 79 | # N_WORKERS = 2 # manually set number of workers 80 | MAX_GLOBAL_EP = 800 # number of training episodes 81 | GLOBAL_NET_SCOPE = 'Global_Net' 82 | UPDATE_GLOBAL_ITER = 10 # update global policy after several episodes 83 | GAMMA = 0.99 # reward discount factor 84 | ENTROPY_BETA = 0.005 # factor for entropy boosted exploration 85 | LR_A = 0.00005 # learning rate for actor 86 | LR_C = 0.0001 # learning rate for critic 87 | GLOBAL_RUNNING_R = [] 88 | GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP 89 | 90 | ################### Asynchronous Advantage Actor Critic (A3C) #################################### 91 | 92 | 93 | class ACNet(object): 94 | 95 | def __init__(self, 
scope, globalAC=None): 96 | self.scope = scope 97 | self.save_path = './model' 98 | 99 | w_init = tf.keras.initializers.glorot_normal(seed=None) # initializer, glorot=xavier 100 | 101 | #输入state,输出action分布mu和sigma 102 | def get_actor(input_shape): # policy network 103 | with tf.name_scope(self.scope): 104 | ni = tl.layers.Input(input_shape, name='in') 105 | nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='la')(ni) 106 | nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='la2')(nn) 107 | mu = tl.layers.Dense(n_units=N_A, act=tf.nn.tanh, W_init=w_init, name='mu')(nn) 108 | sigma = tl.layers.Dense(n_units=N_A, act=tf.nn.softplus, W_init=w_init, name='sigma')(nn) 109 | return tl.models.Model(inputs=ni, outputs=[mu, sigma], name=scope + '/Actor') 110 | 111 | self.actor = get_actor([None, N_S]) 112 | self.actor.train() # train mode for Dropout, BatchNorm 113 | 114 | #输入state,输出V值 115 | def get_critic(input_shape): # we use Value-function here, but not Q-function. 116 | with tf.name_scope(self.scope): 117 | ni = tl.layers.Input(input_shape, name='in') 118 | nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='lc')(ni) 119 | nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='lc2')(nn) 120 | v = tl.layers.Dense(n_units=1, W_init=w_init, name='v')(nn) 121 | return tl.models.Model(inputs=ni, outputs=v, name=scope + '/Critic') 122 | 123 | self.critic = get_critic([None, N_S]) 124 | self.critic.train() # train mode for Dropout, BatchNorm 125 | 126 | #更新网络 127 | @tf.function # convert numpy functions to tf.Operations in the TFgraph, return tensor 128 | def update_global( 129 | self, buffer_s, buffer_a, buffer_v_target, globalAC 130 | ): # refer to the global Actor-Crtic network for updating it with samples 131 | 132 | 133 | ''' update the global critic ''' 134 | with tf.GradientTape() as tape: 135 | self.v = self.critic(buffer_s) #V(s) 136 | self.v_target = buffer_v_target #V(s')*gamma + r 137 | td = tf.subtract(self.v_target, self.v, name='TD_error') #td = V(s')*gamma + r - V(s') 138 | self.c_loss = tf.reduce_mean(tf.square(td)) 139 | self.c_grads = tape.gradient(self.c_loss, self.critic.trainable_weights) # 注意!求梯度,实在本地求的,但更新的是global的 140 | OPT_C.apply_gradients(zip(self.c_grads, globalAC.critic.trainable_weights)) # local grads applies to global net 141 | # del tape # Drop the reference to the tape 142 | 143 | ''' update the global actor ''' 144 | with tf.GradientTape() as tape: 145 | self.mu, self.sigma = self.actor(buffer_s) #actor输出mu和sigma 146 | self.test = self.sigma[0] #这里只是为了测试用 147 | self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 #mu需要映射到行动空间的范围 148 | normal_dist = tfd.Normal(self.mu, self.sigma) #根据mu和sigma创建正态分布 149 | 150 | self.a_his = buffer_a # 求action在分布下的概率。float32 151 | log_prob = normal_dist.log_prob(self.a_his) 152 | 153 | exp_v = log_prob * td ##带权重更新 td is from the critic part, no gradients for it。 154 | 155 | #求最大熵 156 | entropy = normal_dist.entropy() # encourage exploration 157 | 158 | self.exp_v = ENTROPY_BETA * entropy + exp_v 159 | self.a_loss = tf.reduce_mean(-self.exp_v) 160 | self.a_grads = tape.gradient(self.a_loss, self.actor.trainable_weights) 161 | OPT_A.apply_gradients(zip(self.a_grads, globalAC.actor.trainable_weights)) # local grads applies to global net 162 | return self.test # for test purpose 163 | 164 | @tf.function 165 | def pull_global(self, globalAC): # run by a local, pull weights from the global nets 166 | # 把全局网络的参数赋值给本地网络 167 | for l_p, g_p in 
zip(self.actor.trainable_weights, globalAC.actor.trainable_weights): 168 | l_p.assign(g_p) 169 | for l_p, g_p in zip(self.critic.trainable_weights, globalAC.critic.trainable_weights): 170 | l_p.assign(g_p) 171 | 172 | #选择动作,输入s,输出 173 | def choose_action(self, s): # run by a local 174 | s = s[np.newaxis, :] 175 | self.mu, self.sigma = self.actor(s) 176 | 177 | with tf.name_scope('wrap_a_out'): 178 | self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 # sigma增大了少许 179 | normal_dist = tfd.Normal(self.mu, self.sigma) # 构建正态分布 180 | self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) 181 | return self.A.numpy()[0] 182 | 183 | def save_ckpt(self): # save trained weights 184 | tl.files.save_npz(self.actor.trainable_weights, name='model_actor.npz') 185 | tl.files.save_npz(self.critic.trainable_weights, name='model_critic.npz') 186 | 187 | def load_ckpt(self): # load trained weights 188 | tl.files.load_and_assign_npz(name='model_actor.npz', network=self.actor) 189 | tl.files.load_and_assign_npz(name='model_critic.npz', network=self.critic) 190 | 191 | 192 | class Worker(object): 193 | 194 | def __init__(self, name, globalAC): 195 | self.env = gym.make(GAME) #创建环境,每个worker都要创建一个环境,是独立的。 196 | self.name = name #worker的名字 197 | self.AC = ACNet(name, globalAC) #AC算法 198 | 199 | # def work(self): 200 | def work(self, globalAC): 201 | global GLOBAL_RUNNING_R, GLOBAL_EP 202 | total_step = 1 203 | buffer_s, buffer_a, buffer_r = [], [], [] 204 | while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: # MAX_GLOBAL_EP最大训练EP 205 | s = self.env.reset() #重置环境 206 | ep_r = 0 #统计ep的总reward 207 | while True: 208 | # visualize Worker_0 during training 209 | if self.name == 'Worker_0' and total_step % 30 == 0: #worker_0,每30步渲染一次 210 | self.env.render() 211 | s = s.astype('float32') # double to float 212 | a = self.AC.choose_action(s) # 选择动作 213 | s_, r, done, _info = self.env.step(a) # 和环境互动 214 | 215 | s_ = s_.astype('float32') # double to float 216 | # set robot falls reward to -2 instead of -100 217 | if r == -100: r = -2 # 把reward-100的时候,改为-2 218 | 219 | ep_r += r # 保存数据 220 | buffer_s.append(s) 221 | buffer_a.append(a) 222 | buffer_r.append(r) 223 | 224 | # TD(n)的架构。 225 | if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net 226 | 227 | #计算最后一步的V(s') 228 | if done: 229 | v_s_ = 0 # terminal 230 | else: 231 | v_s_ = self.AC.critic(s_[np.newaxis, :])[0, 0] # reduce dim from 2 to 0 232 | 233 | buffer_v_target = [] 234 | 235 | #计算每个state的V(s') 236 | for r in buffer_r[::-1]: # reverse buffer r 237 | v_s_ = r + GAMMA * v_s_ 238 | buffer_v_target.append(v_s_) 239 | 240 | buffer_v_target.reverse() 241 | 242 | buffer_s, buffer_a, buffer_v_target = ( 243 | np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) 244 | ) 245 | 246 | # 更新全局网络的参数 247 | # update gradients on global network 248 | self.AC.update_global(buffer_s, buffer_a, buffer_v_target.astype('float32'), globalAC) 249 | buffer_s, buffer_a, buffer_r = [], [], [] 250 | 251 | # update local network from global network 252 | self.AC.pull_global(globalAC) 253 | 254 | s = s_ 255 | total_step += 1 256 | if done: 257 | if len(GLOBAL_RUNNING_R) == 0: # record running episode reward 258 | GLOBAL_RUNNING_R.append(ep_r) 259 | else: # moving average 260 | GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) 261 | # print( 262 | # self.name, 263 | # "Episode: ", 264 | # GLOBAL_EP, 265 | # # "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move 266 | # '| 
reward: %.1f' % ep_r, 267 | # "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1], 268 | # # '| sigma:', test, # debug 269 | # # 'WIN ' * 5 if self.env.unwrapped.hull.position[0] >= 88 else '', 270 | # ) 271 | print('{}, Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ 272 | .format(self.name, GLOBAL_EP, MAX_GLOBAL_EP, ep_r, time.time()-t0 )) 273 | GLOBAL_EP += 1 274 | break 275 | 276 | 277 | if __name__ == "__main__": 278 | 279 | env = gym.make(GAME) 280 | 281 | N_S = env.observation_space.shape[0] #状态空间 282 | N_A = env.action_space.shape[0] #动作空间 283 | 284 | A_BOUND = [env.action_space.low, env.action_space.high] #动作范围 285 | A_BOUND[0] = A_BOUND[0].reshape(1, N_A) #动作范围形状修改 286 | A_BOUND[1] = A_BOUND[1].reshape(1, N_A) 287 | # print(A_BOUND) 288 | if args.train: 289 | # ============================= TRAINING =============================== 290 | t0 = time.time() #计算时间 291 | with tf.device("/cpu:0"): #以下部分,都在CPU0完成 292 | 293 | OPT_A = tf.optimizers.RMSprop(LR_A, name='RMSPropA') #创建Actor的优化器 294 | OPT_C = tf.optimizers.RMSprop(LR_C, name='RMSPropC') #创建Critic的优化器 295 | 296 | GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params 创建全局网络GLOBAL_AC 297 | workers = [] # workers列表 298 | 299 | # 创建worker 300 | # Create worker 301 | for i in range(N_WORKERS): 302 | i_name = 'Worker_%i' % i # worker name 303 | workers.append(Worker(i_name, GLOBAL_AC)) # 创建worker,并放在workers列表中,方便统一管理 304 | 305 | COORD = tf.train.Coordinator() #创建tensorflow中协调器 306 | 307 | # start TF threading 308 | worker_threads = [] 309 | for worker in workers: #执行每一个worker 310 | # t = threading.Thread(target=worker.work) 311 | job = lambda: worker.work(GLOBAL_AC) #worker要执行的工作。 312 | t = threading.Thread(target=job) #创建一个线程,执行工作 313 | t.start() #开始线程,并执行 314 | worker_threads.append(t) #把线程加入worker_threads中。 315 | COORD.join(worker_threads) #线程由COORD统一管理即可 316 | 317 | #====画图==== 318 | import matplotlib.pyplot as plt 319 | plt.plot(GLOBAL_RUNNING_R) 320 | plt.xlabel('episode') 321 | plt.ylabel('global running reward') 322 | plt.savefig('a3c.png') 323 | plt.show() 324 | 325 | GLOBAL_AC.save_ckpt() 326 | 327 | if args.test: 328 | # ============================= EVALUATION ============================= 329 | # env = gym.make(GAME) 330 | # GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) 331 | GLOBAL_AC.load_ckpt() 332 | while True: 333 | s = env.reset() 334 | rall = 0 335 | while True: 336 | env.render() 337 | s = s.astype('float32') # double to float 338 | a = GLOBAL_AC.choose_action(s) 339 | s, r, d, _ = env.step(a) 340 | rall += r 341 | if d: 342 | print("reward", rall) 343 | break 344 | -------------------------------------------------------------------------------- /tutorial_AC.py: -------------------------------------------------------------------------------- 1 | """ 2 | Actor-Critic 3 | ------------- 4 | It uses TD-error as the Advantage. 5 | 6 | Actor Critic History 7 | ---------------------- 8 | A3C > DDPG > AC 9 | 10 | Advantage 11 | ---------- 12 | AC converge faster than Policy Gradient. 13 | 14 | Disadvantage (IMPORTANT) 15 | ------------------------ 16 | The Policy is oscillated (difficult to converge), DDPG can solve 17 | this problem using advantage of DQN. 
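
How the update works (summary of the code below)
------------------------------------------------
The critic's one-step TD error is used directly as the advantage for the actor:

    td_error   = r + LAMBDA * V(s_new) - V(s)      # computed in Critic.learn()
    actor_loss = -log pi(a|s) * td_error           # weighted cross-entropy in Actor.learn()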
18 | 19 | Reference 20 | ---------- 21 | paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf 22 | View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ 23 | 24 | Environment 25 | ------------ 26 | CartPole-v0: https://gym.openai.com/envs/CartPole-v0 27 | 28 | A pole is attached by an un-actuated joint to a cart, which moves along a 29 | frictionless track. The system is controlled by applying a force of +1 or -1 30 | to the cart. The pendulum starts upright, and the goal is to prevent it from 31 | falling over. 32 | 33 | A reward of +1 is provided for every timestep that the pole remains upright. 34 | The episode ends when the pole is more than 15 degrees from vertical, or the 35 | cart moves more than 2.4 units from the center. 36 | 37 | 38 | Prerequisites 39 | -------------- 40 | tensorflow >=2.0.0a0 41 | tensorlayer >=2.0.0 42 | 43 | To run 44 | ------ 45 | python tutorial_AC.py --train/test 46 | 47 | """ 48 | import argparse 49 | import time 50 | 51 | import gym 52 | import numpy as np 53 | import tensorflow as tf 54 | 55 | import tensorlayer as tl 56 | 57 | tl.logging.set_verbosity(tl.logging.DEBUG) 58 | 59 | np.random.seed(2) 60 | tf.random.set_seed(2) # reproducible 61 | 62 | # add arguments in command --train/test 63 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 64 | parser.add_argument('--train', dest='train', action='store_true', default=True) 65 | parser.add_argument('--test', dest='test', action='store_true', default=False) 66 | print(parser.parse_args()) 67 | args = parser.parse_args() 68 | 69 | ##################### hyper parameters #################### 70 | 71 | OUTPUT_GRAPH = False 72 | MAX_EPISODE = 3000 # number of overall episodes for training 73 | DISPLAY_REWARD_THRESHOLD = 100 # renders environment if running reward is greater then this threshold 74 | MAX_EP_STEPS = 1000 # maximum time step in one episode 75 | RENDER = False # rendering wastes time 76 | LAMBDA = 0.9 # reward discount in TD error 77 | LR_A = 0.001 # learning rate for actor 78 | LR_C = 0.01 # learning rate for critic 79 | 80 | ############################### Actor-Critic #################################### 81 | 82 | 83 | class Actor(object): 84 | 85 | def __init__(self, n_features, n_actions, lr=0.001): 86 | 87 | # 创建Actor网络 88 | def get_model(inputs_shape): 89 | ni = tl.layers.Input(inputs_shape, name='state') 90 | nn = tl.layers.Dense( 91 | n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden' 92 | )(ni) 93 | nn = tl.layers.Dense( 94 | n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2' 95 | )(nn) 96 | nn = tl.layers.Dense(n_units=n_actions, name='actions')(nn) 97 | return tl.models.Model(inputs=ni, outputs=nn, name="Actor") 98 | 99 | self.model = get_model([None, n_features]) 100 | self.model.train() 101 | self.optimizer = tf.optimizers.Adam(lr) 102 | 103 | # Actor学习 104 | def learn(self, s, a, td): 105 | with tf.GradientTape() as tape: 106 | _logits = self.model(np.array([s])) 107 | ## 带权重更新。 108 | _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) 109 | grad = tape.gradient(_exp_v, self.model.trainable_weights) 110 | self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) 111 | return _exp_v 112 | 113 | # 按照分布随机动作。 114 | def choose_action(self, s): 115 | _logits = self.model(np.array([s])) 116 | _probs = tf.nn.softmax(_logits).numpy() 117 | return 
tl.rein.choice_action_by_probs(_probs.ravel()) # sample according to probability distribution 118 | 119 | # 贪婪算法。 120 | def choose_action_greedy(self, s): 121 | _logits = self.model(np.array([s])) # logits: probability distribution of actions 122 | _probs = tf.nn.softmax(_logits).numpy() 123 | return np.argmax(_probs.ravel()) 124 | 125 | def save_ckpt(self): # save trained weights 126 | tl.files.save_npz(self.model.trainable_weights, name='model_actor.npz') 127 | 128 | def load_ckpt(self): # load trained weights 129 | tl.files.load_and_assign_npz(name='model_actor.npz', network=self.model) 130 | 131 | 132 | class Critic(object): 133 | 134 | def __init__(self, n_features, lr=0.01): 135 | 136 | # 创建Critic网络。 137 | def get_model(inputs_shape): 138 | ni = tl.layers.Input(inputs_shape, name='state') 139 | nn = tl.layers.Dense( 140 | n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden' 141 | )(ni) 142 | nn = tl.layers.Dense( 143 | n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2' 144 | )(nn) 145 | nn = tl.layers.Dense(n_units=1, act=None, name='value')(nn) 146 | return tl.models.Model(inputs=ni, outputs=nn, name="Critic") 147 | 148 | self.model = get_model([1, n_features]) 149 | self.model.train() 150 | 151 | self.optimizer = tf.optimizers.Adam(lr) 152 | 153 | # Critic学习 154 | def learn(self, s, r, s_): 155 | v_ = self.model(np.array([s_])) 156 | with tf.GradientTape() as tape: 157 | v = self.model(np.array([s])) 158 | ## [敲黑板]计算TD-error 159 | ## TD_error = r + lambd * V(newS) - V(S) 160 | td_error = r + LAMBDA * v_ - v 161 | loss = tf.square(td_error) 162 | grad = tape.gradient(loss, self.model.trainable_weights) 163 | self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) 164 | return td_error 165 | 166 | def save_ckpt(self): # save trained weights 167 | tl.files.save_npz(self.model.trainable_weights, name='model_critic.npz') 168 | 169 | def load_ckpt(self): # load trained weights 170 | tl.files.load_and_assign_npz(name='model_critic.npz', network=self.model) 171 | 172 | 173 | if __name__ == '__main__': 174 | ''' 175 | choose environment 176 | 1. Openai gym: 177 | env = gym.make() 178 | 2. 
DeepMind Control Suite: 179 | env = dm_control2gym.make() 180 | ''' 181 | env = gym.make('CartPole-v1') 182 | # dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(), 183 | # depth=False, scene_option=None) 184 | # env = dm_control2gym.make(domain_name="cartpole", task_name="balance") 185 | env.seed(2) # reproducible 186 | # env = env.unwrapped 187 | N_F = env.observation_space.shape[0] 188 | # N_A = env.action_space.shape[0] 189 | N_A = env.action_space.n 190 | 191 | print("observation dimension: %d" % N_F) # 4 192 | print("observation high: %s" % env.observation_space.high) # [ 2.4 , inf , 0.41887902 , inf] 193 | print("observation low : %s" % env.observation_space.low) # [-2.4 , -inf , -0.41887902 , -inf] 194 | print("num of actions: %d" % N_A) # 2 : left or right 195 | 196 | actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) 197 | # we need a good teacher, so the teacher should learn faster than the actor 198 | critic = Critic(n_features=N_F, lr=LR_C) 199 | 200 | ## 训练部分 201 | if args.train: 202 | t0 = time.time() 203 | for i_episode in range(MAX_EPISODE): 204 | # episode_time = time.time() 205 | s = env.reset().astype(np.float32) 206 | t = 0 # number of step in this episode 207 | all_r = [] # rewards of all steps 208 | while True: 209 | 210 | if RENDER: env.render() 211 | 212 | a = actor.choose_action(s) 213 | 214 | s_new, r, done, info = env.step(a) 215 | s_new = s_new.astype(np.float32) 216 | # [敲黑板],我们希望在濒死状态,可以减去一个大reward。让智能体学习如何力挽狂澜。 217 | if done: r = -20 218 | # these may helpful in some tasks 219 | # if abs(s_new[0]) >= env.observation_space.high[0]: 220 | # # cart moves more than 2.4 units from the center 221 | # r = -20 222 | # reward for the distance between cart to the center 223 | # r -= abs(s_new[0]) * .1 224 | 225 | all_r.append(r) 226 | 227 | # Critic学习,并计算出td-error 228 | td_error = critic.learn( 229 | s, r, s_new 230 | ) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] 231 | 232 | # actor学习 233 | try: 234 | for _ in range(1): 235 | actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] 236 | except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() 237 | actor.save_ckpt() 238 | critic.save_ckpt() 239 | # logging 240 | 241 | s = s_new 242 | t += 1 243 | 244 | if done or t >= MAX_EP_STEPS: 245 | ep_rs_sum = sum(all_r) 246 | 247 | if 'running_reward' not in globals(): 248 | running_reward = ep_rs_sum 249 | else: 250 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 251 | # start rending if running_reward greater than a threshold 252 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 253 | # print("Episode: %d reward: %f running_reward %f took: %.5f" % \ 254 | # (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) 255 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ 256 | .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) 257 | 258 | # Early Stopping for quick check 259 | if t >= MAX_EP_STEPS: 260 | print("Early Stopping") 261 | s = env.reset().astype(np.float32) 262 | rall = 0 263 | while True: 264 | env.render() 265 | # a = actor.choose_action(s) 266 | a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task 267 | s_new, r, done, info = env.step(a) 268 | s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) 269 | rall += r 270 | s = s_new 271 | if done: 272 | 
print("reward", rall) 273 | s = env.reset().astype(np.float32) 274 | rall = 0 275 | break 276 | actor.save_ckpt() 277 | critic.save_ckpt() 278 | 279 | ## 测试部分 280 | if args.test: 281 | actor.load_ckpt() 282 | critic.load_ckpt() 283 | t0 = time.time() 284 | 285 | for i_episode in range(MAX_EPISODE): 286 | episode_time = time.time() 287 | s = env.reset().astype(np.float32) 288 | t = 0 # number of step in this episode 289 | all_r = [] # rewards of all steps 290 | while True: 291 | if RENDER: env.render() 292 | a = actor.choose_action(s) 293 | s_new, r, done, info = env.step(a) 294 | s_new = s_new.astype(np.float32) 295 | if done: r = -20 296 | # these may helpful in some tasks 297 | # if abs(s_new[0]) >= env.observation_space.high[0]: 298 | # # cart moves more than 2.4 units from the center 299 | # r = -20 300 | # reward for the distance between cart to the center 301 | # r -= abs(s_new[0]) * .1 302 | 303 | all_r.append(r) 304 | s = s_new 305 | t += 1 306 | 307 | if done or t >= MAX_EP_STEPS: 308 | ep_rs_sum = sum(all_r) 309 | 310 | if 'running_reward' not in globals(): 311 | running_reward = ep_rs_sum 312 | else: 313 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 314 | # start rending if running_reward greater than a threshold 315 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 316 | # print("Episode: %d reward: %f running_reward %f took: %.5f" % \ 317 | # (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) 318 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ 319 | .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) 320 | 321 | # Early Stopping for quick check 322 | if t >= MAX_EP_STEPS: 323 | print("Early Stopping") 324 | s = env.reset().astype(np.float32) 325 | rall = 0 326 | while True: 327 | env.render() 328 | # a = actor.choose_action(s) 329 | a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task 330 | s_new, r, done, info = env.step(a) 331 | s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) 332 | rall += r 333 | s = s_new 334 | if done: 335 | print("reward", rall) 336 | s = env.reset().astype(np.float32) 337 | rall = 0 338 | break 339 | -------------------------------------------------------------------------------- /tutorial_DDPG.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Deterministic Policy Gradient (DDPG) 3 | ----------------------------------------- 4 | An algorithm concurrently learns a Q-function and a policy. 5 | It uses off-policy data and the Bellman equation to learn the Q-function, 6 | and uses the Q-function to learn the policy. 7 | 8 | Reference 9 | --------- 10 | Deterministic Policy Gradient Algorithms, Silver et al. 2014 11 | Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 
2016 12 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ 13 | 14 | Environment 15 | ----------- 16 | Openai Gym Pendulum-v0, continual action space 17 | 18 | Prerequisites 19 | ------------- 20 | tensorflow >=2.0.0a0 21 | tensorflow-probability 0.6.0 22 | tensorlayer >=2.0.0 23 | 24 | To run 25 | ------ 26 | python tutorial_DDPG.py --train/test 27 | 28 | """ 29 | 30 | import argparse 31 | import os 32 | import time 33 | 34 | import gym 35 | import matplotlib.pyplot as plt 36 | import numpy as np 37 | import tensorflow as tf 38 | 39 | import tensorlayer as tl 40 | 41 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 42 | parser.add_argument('--train', dest='train', action='store_true', default=True) 43 | parser.add_argument('--test', dest='test', action='store_false') 44 | args = parser.parse_args() 45 | 46 | ##################### hyper parameters #################### 47 | 48 | ENV_NAME = 'Pendulum-v0' # environment name 49 | RANDOMSEED = 1 # random seed 50 | 51 | LR_A = 0.001 # learning rate for actor 52 | LR_C = 0.002 # learning rate for critic 53 | GAMMA = 0.9 # reward discount 54 | TAU = 0.01 # soft replacement 55 | MEMORY_CAPACITY = 10000 # size of replay buffer 56 | BATCH_SIZE = 32 # update batchsize 57 | 58 | MAX_EPISODES = 200 # total number of episodes for training 59 | MAX_EP_STEPS = 200 # total number of steps for each episode 60 | TEST_PER_EPISODES = 10 # test the model per episodes 61 | VAR = 3 # control exploration 62 | 63 | ############################### DDPG #################################### 64 | 65 | class DDPG(object): 66 | """ 67 | DDPG class 68 | """ 69 | def __init__(self, a_dim, s_dim, a_bound): 70 | # memory用于储存跑的数据的数组: 71 | # 保存个数MEMORY_CAPACITY,s_dim * 2 + a_dim + 1:分别是两个state,一个action,和一个reward 72 | self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) 73 | self.pointer = 0 74 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound 75 | 76 | W_init = tf.random_normal_initializer(mean=0, stddev=0.3) 77 | b_init = tf.constant_initializer(0.1) 78 | 79 | # 建立actor网络,输入s,输出a 80 | def get_actor(input_state_shape, name=''): 81 | """ 82 | Build actor network 83 | :param input_state_shape: state 84 | :param name: name 85 | :return: act 86 | """ 87 | inputs = tl.layers.Input(input_state_shape, name='A_input') 88 | x = tl.layers.Dense(n_units=30, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(inputs) 89 | x = tl.layers.Dense(n_units=a_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(x) 90 | x = tl.layers.Lambda(lambda x: np.array(a_bound) * x)(x) #注意这里,先用tanh把范围限定在[-1,1]之间,再进行映射 91 | return tl.models.Model(inputs=inputs, outputs=x, name='Actor' + name) 92 | 93 | #建立Critic网络,输入s,a。输出Q值 94 | def get_critic(input_state_shape, input_action_shape, name=''): 95 | """ 96 | Build critic network 97 | :param input_state_shape: state 98 | :param input_action_shape: act 99 | :param name: name 100 | :return: Q value Q(s,a) 101 | """ 102 | s = tl.layers.Input(input_state_shape, name='C_s_input') 103 | a = tl.layers.Input(input_action_shape, name='C_a_input') 104 | x = tl.layers.Concat(1)([s, a]) 105 | x = tl.layers.Dense(n_units=60, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l1')(x) 106 | x = tl.layers.Dense(n_units=1, W_init=W_init, b_init=b_init, name='C_out')(x) 107 | return tl.models.Model(inputs=[s, a], outputs=x, name='Critic' + name) 108 | 109 | self.actor = get_actor([None, s_dim]) 110 | self.critic = get_critic([None, s_dim], [None, a_dim]) 
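        # Summary of what follows: besides these two trainable networks, frozen
        # "target" copies actor_target / critic_target are built below and are
        # refreshed only through the ExponentialMovingAverage in ema_update(),
        # roughly  theta_target <- (1 - TAU) * theta_target + TAU * theta,
        # which keeps the TD target slowly moving and stabilises learning.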
111 | self.actor.train() 112 | self.critic.train() 113 | 114 | #更新参数,只用于首次赋值,之后就没用了 115 | def copy_para(from_model, to_model): 116 | """ 117 | Copy parameters for soft updating 118 | :param from_model: latest model 119 | :param to_model: target model 120 | :return: None 121 | """ 122 | for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): 123 | j.assign(i) 124 | 125 | #建立actor_target网络,并和actor参数一致,不能训练 126 | self.actor_target = get_actor([None, s_dim], name='_target') 127 | copy_para(self.actor, self.actor_target) 128 | self.actor_target.eval() 129 | 130 | #建立critic_target网络,并和actor参数一致,不能训练 131 | self.critic_target = get_critic([None, s_dim], [None, a_dim], name='_target') 132 | copy_para(self.critic, self.critic_target) 133 | self.critic_target.eval() 134 | 135 | self.R = tl.layers.Input([None, 1], tf.float32, 'r') 136 | 137 | #建立ema,滑动平均值 138 | self.ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement 139 | 140 | self.actor_opt = tf.optimizers.Adam(LR_A) 141 | self.critic_opt = tf.optimizers.Adam(LR_C) 142 | 143 | 144 | def ema_update(self): 145 | """ 146 | 滑动平均更新 147 | """ 148 | # 其实和之前的硬更新类似,不过在更新赋值之前,用一个ema.average。 149 | paras = self.actor.trainable_weights + self.critic.trainable_weights #获取要更新的参数包括actor和critic的 150 | self.ema.apply(paras) #主要是建立影子参数 151 | for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): 152 | i.assign(self.ema.average(j)) # 用滑动平均赋值 153 | 154 | # 选择动作,把s带进入,输出a 155 | def choose_action(self, s): 156 | """ 157 | Choose action 158 | :param s: state 159 | :return: act 160 | """ 161 | return self.actor(np.array([s], dtype=np.float32))[0] 162 | 163 | def learn(self): 164 | """ 165 | Update parameters 166 | :return: None 167 | """ 168 | indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) #随机BATCH_SIZE个随机数 169 | bt = self.memory[indices, :] #根据indices,选取数据bt,相当于随机 170 | bs = bt[:, :self.s_dim] #从bt获得数据s 171 | ba = bt[:, self.s_dim:self.s_dim + self.a_dim] #从bt获得数据a 172 | br = bt[:, -self.s_dim - 1:-self.s_dim] #从bt获得数据r 173 | bs_ = bt[:, -self.s_dim:] #从bt获得数据s' 174 | 175 | # Critic: 176 | # Critic更新和DQN很像,不过target不是argmax了,是用critic_target计算出来的。 177 | # br + GAMMA * q_ 178 | with tf.GradientTape() as tape: 179 | a_ = self.actor_target(bs_) 180 | q_ = self.critic_target([bs_, a_]) 181 | y = br + GAMMA * q_ 182 | q = self.critic([bs, ba]) 183 | td_error = tf.losses.mean_squared_error(y, q) 184 | c_grads = tape.gradient(td_error, self.critic.trainable_weights) 185 | self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights)) 186 | 187 | # Actor: 188 | # Actor的目标就是获取最多Q值的。 189 | with tf.GradientTape() as tape: 190 | a = self.actor(bs) 191 | q = self.critic([bs, a]) 192 | a_loss = -tf.reduce_mean(q) # 【敲黑板】:注意这里用负号,是梯度上升!也就是离目标会越来越远的,就是越来越大。 193 | a_grads = tape.gradient(a_loss, self.actor.trainable_weights) 194 | self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) 195 | 196 | self.ema_update() 197 | 198 | 199 | # 保存s,a,r,s_ 200 | def store_transition(self, s, a, r, s_): 201 | """ 202 | Store data in data buffer 203 | :param s: state 204 | :param a: act 205 | :param r: reward 206 | :param s_: next state 207 | :return: None 208 | """ 209 | # 整理s,s_,方便直接输入网络计算 210 | s = s.astype(np.float32) 211 | s_ = s_.astype(np.float32) 212 | 213 | #把s, a, [r], s_横向堆叠 214 | transition = np.hstack((s, a, [r], s_)) 215 | 216 | #pointer是记录了曾经有多少数据进来。 217 | #index是记录当前最新进来的数据位置。 218 | #所以是一个循环,当MEMORY_CAPACITY满了以后,index就重新在最底开始了 219 | index = self.pointer % 
MEMORY_CAPACITY # replace the old memory with new memory 220 | #把transition,也就是s, a, [r], s_存进去。 221 | self.memory[index, :] = transition 222 | self.pointer += 1 223 | 224 | def save_ckpt(self): 225 | """ 226 | save trained weights 227 | :return: None 228 | """ 229 | if not os.path.exists('model'): 230 | os.makedirs('model') 231 | 232 | tl.files.save_weights_to_hdf5('model/ddpg_actor.hdf5', self.actor) 233 | tl.files.save_weights_to_hdf5('model/ddpg_actor_target.hdf5', self.actor_target) 234 | tl.files.save_weights_to_hdf5('model/ddpg_critic.hdf5', self.critic) 235 | tl.files.save_weights_to_hdf5('model/ddpg_critic_target.hdf5', self.critic_target) 236 | 237 | def load_ckpt(self): 238 | """ 239 | load trained weights 240 | :return: None 241 | """ 242 | tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor.hdf5', self.actor) 243 | tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor_target.hdf5', self.actor_target) 244 | tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic.hdf5', self.critic) 245 | tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic_target.hdf5', self.critic_target) 246 | 247 | 248 | if __name__ == '__main__': 249 | 250 | #初始化环境 251 | env = gym.make(ENV_NAME) 252 | env = env.unwrapped 253 | 254 | # reproducible,设置随机种子,为了能够重现 255 | env.seed(RANDOMSEED) 256 | np.random.seed(RANDOMSEED) 257 | tf.random.set_seed(RANDOMSEED) 258 | 259 | #定义状态空间,动作空间,动作幅度范围 260 | s_dim = env.observation_space.shape[0] 261 | a_dim = env.action_space.shape[0] 262 | a_bound = env.action_space.high 263 | 264 | print('s_dim',s_dim) 265 | print('a_dim',a_dim) 266 | 267 | #用DDPG算法 268 | ddpg = DDPG(a_dim, s_dim, a_bound) 269 | 270 | #训练部分: 271 | if args.train: # train 272 | 273 | reward_buffer = [] #用于记录每个EP的reward,统计变化 274 | t0 = time.time() #统计时间 275 | for i in range(MAX_EPISODES): 276 | t1 = time.time() 277 | s = env.reset() 278 | ep_reward = 0 #记录当前EP的reward 279 | for j in range(MAX_EP_STEPS): 280 | # Add exploration noise 281 | a = ddpg.choose_action(s) #这里很简单,直接用actor估算出a动作 282 | 283 | # 为了能保持开发,这里用了另外一种方式增加探索。 284 | # 因此需要需要以a为均值,VAR为标准差,建立正态分布,再从正态分布采样出a 285 | # 因为a是均值,所以a的概率是最大的。但a相对其他概率由多大,是靠VAR调整。这里我们其实可以增加更新VAR,动态调整a的确定性 286 | # 然后进行裁剪 287 | a = np.clip(np.random.normal(a, VAR), -2, 2) 288 | # 与环境进行互动 289 | s_, r, done, info = env.step(a) 290 | 291 | # 保存s,a,r,s_ 292 | ddpg.store_transition(s, a, r / 10, s_) 293 | 294 | # 第一次数据满了,就可以开始学习 295 | if ddpg.pointer > MEMORY_CAPACITY: 296 | ddpg.learn() 297 | 298 | #输出数据记录 299 | s = s_ 300 | ep_reward += r #记录当前EP的总reward 301 | if j == MAX_EP_STEPS - 1: 302 | print( 303 | '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 304 | i, MAX_EPISODES, ep_reward, 305 | time.time() - t1 306 | ), end='' 307 | ) 308 | plt.show() 309 | # test 310 | if i and not i % TEST_PER_EPISODES: 311 | t1 = time.time() 312 | s = env.reset() 313 | ep_reward = 0 314 | for j in range(MAX_EP_STEPS): 315 | 316 | a = ddpg.choose_action(s) # 注意,在测试的时候,我们就不需要用正态分布了,直接一个a就可以了。 317 | s_, r, done, info = env.step(a) 318 | 319 | s = s_ 320 | ep_reward += r 321 | if j == MAX_EP_STEPS - 1: 322 | print( 323 | '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 324 | i, MAX_EPISODES, ep_reward, 325 | time.time() - t1 326 | ) 327 | ) 328 | 329 | reward_buffer.append(ep_reward) 330 | 331 | if reward_buffer: 332 | plt.ion() 333 | plt.cla() 334 | plt.title('DDPG') 335 | plt.plot(np.array(range(len(reward_buffer))) * TEST_PER_EPISODES, reward_buffer) # plot the episode vt 336 | plt.xlabel('episode steps') 337 | 
plt.ylabel('normalized state-action value') 338 | plt.ylim(-2000, 0) 339 | plt.show() 340 | plt.pause(0.1) 341 | plt.ioff() 342 | plt.show() 343 | print('\nRunning time: ', time.time() - t0) 344 | ddpg.save_ckpt() 345 | 346 | # test 347 | ddpg.load_ckpt() 348 | while True: 349 | s = env.reset() 350 | for i in range(MAX_EP_STEPS): 351 | env.render() 352 | s, r, done, info = env.step(ddpg.choose_action(s)) 353 | if done: 354 | break 355 | -------------------------------------------------------------------------------- /tutorial_DQN.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Q-Network Q(a, s) 3 | ----------------------- 4 | TD Learning, Off-Policy, e-Greedy Exploration (GLIE). 5 | 6 | Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) 7 | delta_w = R + lambda * Q(newS, newA) 8 | 9 | See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. 10 | 11 | Reference 12 | ---------- 13 | original paper: https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf 14 | EN: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw 15 | CN: https://zhuanlan.zhihu.com/p/25710327 16 | 17 | Note: Policy Network has been proved to be better than Q-Learning, see tutorial_atari_pong.py 18 | 19 | Environment 20 | ----------- 21 | # The FrozenLake v0 environment 22 | https://gym.openai.com/envs/FrozenLake-v0 23 | The agent controls the movement of a character in a grid world. Some tiles of 24 | the grid are walkable, and others lead to the agent falling into the water. 25 | Additionally, the movement direction of the agent is uncertain and only partially 26 | depends on the chosen direction. The agent is rewarded for finding a walkable 27 | path to a goal tile. 28 | SFFF (S: starting point, safe) 29 | FHFH (F: frozen surface, safe) 30 | FFFH (H: hole, fall to your doom) 31 | HFFG (G: goal, where the frisbee is located) 32 | The episode ends when you reach the goal or fall in a hole. You receive a reward 33 | of 1 if you reach the goal, and zero otherwise. 
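
Update target (summary of the code below)
------------------------------------------
Only the Q-value of the action actually taken is updated, towards the one-step
bootstrap target

    targetQ[0, a] = r + lambd * max_a' Q(s1, a')

and the network is then fitted to targetQ with a mean-squared-error loss.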
34 | 35 | Prerequisites 36 | -------------- 37 | tensorflow>=2.0.0a0 38 | tensorlayer>=2.0.0 39 | 40 | To run 41 | ------- 42 | python tutorial_DQN.py --train/test 43 | 44 | 45 | """ 46 | import argparse 47 | import time 48 | import gym 49 | import numpy as np 50 | import tensorflow as tf 51 | import tensorlayer as tl 52 | 53 | # add arguments in command --train/test 54 | # 关于argparase的应用,可以看看我这篇知乎专栏: 55 | # 小段文讲清argparse模块基本用法[小番外] 56 | # https://zhuanlan.zhihu.com/p/111010774 57 | # 注意:原代码默认为test,我改为了train。 58 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 59 | parser.add_argument('--train', dest='train', action='store_true', default=True) 60 | parser.add_argument('--test', dest='test', action='store_true', default=False) 61 | args = parser.parse_args() 62 | 63 | tl.logging.set_verbosity(tl.logging.DEBUG) 64 | 65 | ##################### hyper parameters #################### 66 | lambd = .99 # 折扣率(decay factor) 67 | e = 0.1 # epsilon-greedy算法参数,越大随机性越大,越倾向于探索行为。 68 | num_episodes = 10000 # 迭代次数 69 | render = False # 是否渲染游戏 70 | running_reward = None 71 | 72 | ##################### DQN ########################## 73 | 74 | ## 把分类的数字表示,变成onehot表示。 75 | # 例如有4类,那么第三类变为:[0,0,1,0]的表示。 76 | def to_one_hot(i, n_classes=None): 77 | a = np.zeros(n_classes, 'uint8') # 这里先按照分类数量构建一个全0向量 78 | a[i] = 1 # 然后点亮需要onehot的位数。 79 | return a 80 | 81 | 82 | ## Define Q-network q(a,s) that ouput the rewards of 4 actions by given state, i.e. Action-Value Function. 83 | # encoding for state: 4x4 grid can be represented by one-hot vector with 16 integers. 84 | def get_model(inputs_shape): 85 | ''' 86 | 定义Q网络模型: 87 | 1. 注意输入的shape和输出的shape 88 | 2. W_init和b_init是模型在初始化的时候,控制初始化参数的随机。该代码中用正态分布,均值0,方差0.01的方式初始化参数。 89 | ''' 90 | ni = tl.layers.Input(inputs_shape, name='observation') 91 | nn = tl.layers.Dense(4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s')(ni) 92 | return tl.models.Model(inputs=ni, outputs=nn, name="Q-Network") 93 | 94 | 95 | def save_ckpt(model): # save trained weights 96 | ''' 97 | 保存参数 98 | ''' 99 | tl.files.save_npz(model.trainable_weights, name='dqn_model.npz') 100 | 101 | 102 | def load_ckpt(model): # load trained weights 103 | ''' 104 | 加载参数 105 | ''' 106 | tl.files.load_and_assign_npz(name='dqn_model.npz', network=model) 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | qnetwork = get_model([None, 16]) #定义inputshape[None,16]。16是state数量 112 | qnetwork.train() #调用tensorlayer的时候,需要标注这个模型是否可以训练。(再次吐槽tenorlayers...) 
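    # FrozenLake states are the integers 0..15, so to_one_hot(s, 16) encodes state s
    # as a length-16 vector with a single 1 (e.g. to_one_hot(3, 16) sets index 3 to 1);
    # that is why the input shape of the Q-network above is [None, 16].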
113 | train_weights = qnetwork.trainable_weights #模型的参数 114 | 115 | optimizer = tf.optimizers.SGD(learning_rate=0.1) #定义优化器 116 | env = gym.make('FrozenLake-v0') #定义环境 117 | 118 | # ======开始训练======= 119 | if args.train: 120 | t0 = time.time() 121 | for i in range(num_episodes): 122 | ## 重置环境初始状态 123 | s = env.reset() 124 | rAll = 0 125 | for j in range(99): # 最多探索99步。因为环境状态比较少,99步一般也够探索到最终状态了。 126 | if render: env.render() 127 | 128 | ## 把state放入network,计算Q值。 129 | ## 注意,这里先把state进行onehote处理,这里注意解释下什么是onehot 130 | ## 输出,这个状态下,所有动作的Q值,也就是说,是一个[None,4]大小的矩阵 131 | allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() 132 | 133 | # 在矩阵中找最大的Q值的动作 134 | a = np.argmax(allQ, 1) 135 | 136 | # e-Greedy:如果小于epsilon,就智能体随机探索。否则,就用最大Q值的动作。 137 | if np.random.rand(1) < e: 138 | a[0] = env.action_space.sample() 139 | 140 | # 输入到环境,获得下一步的state,reward,done 141 | s1, r, d, _ = env.step(a[0]) 142 | 143 | # 把new-state 放入,预测下一个state的**所有动作**的Q值。 144 | Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy() 145 | 146 | ##=======计算target======= 147 | ## 构建更新target: 148 | # Q'(s,a) <- Q(s,a) + alpha(r + lambd * maxQ(s',a') - Q(s, a)) 149 | maxQ1 = np.max(Q1) # 下一个状态中最大Q值. 150 | targetQ = allQ # 用allQ(现在状态的Q值)构建更新的target。因为只有被选择那个动作才会被更新到。 151 | targetQ[0, a[0]] = r + lambd * maxQ1 152 | 153 | ## 利用自动求导 进行更新。 154 | with tf.GradientTape() as tape: 155 | _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)) #把s放入到Q网络,计算_qvalues。 156 | #_qvalues和targetQ的差距就是loss。这里衡量的尺子是mse 157 | _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False) 158 | # 同梯度带求导对网络参数求导 159 | grad = tape.gradient(_loss, train_weights) 160 | # 应用梯度到网络参数求导 161 | optimizer.apply_gradients(zip(grad, train_weights)) 162 | 163 | # 累计reward,并且把s更新为newstate 164 | rAll += r 165 | s = s1 166 | 167 | #更新epsilon,让epsilon随着迭代次数增加而减少。 168 | #目的就是智能体越来越少进行“探索” 169 | if d ==True: 170 | e = 1. / ((i / 50) + 10) 171 | break 172 | 173 | ## 这里的running_reward用于记载每一次更新的总和。为了能够更加看清变化,所以大部分是前面的。只有一部分是后面的。 174 | running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 175 | # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ 176 | # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) 177 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ 178 | .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) 179 | save_ckpt(qnetwork) # save model 180 | 181 | 182 | ##============这部分是正式游戏了======== 183 | # 这部分就不讲解了,和训练一样。只是少了epsilon-greedy。 184 | if args.test: 185 | t0 = time.time() 186 | load_ckpt(qnetwork) # load model 187 | for i in range(num_episodes): 188 | ## Reset environment and get first new observation 189 | episode_time = time.time() 190 | s = env.reset() # observation is state, integer 0 ~ 15 191 | rAll = 0 192 | for j in range(99): # step index, maximum step is 99 193 | if render: env.render() 194 | 195 | ## Choose an action by greedily (with e chance of random action) from the Q-network 196 | allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() 197 | a = np.argmax(allQ, 1) # no epsilon, only greedy for testing 198 | 199 | ## Get new state and reward from environment 200 | s1, r, d, _ = env.step(a[0]) 201 | rAll += r 202 | s = s1 203 | ## Reduce chance of random action if an episode is done. 204 | if d ==True: 205 | #e = 1. 
/ ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration 206 | break 207 | 208 | ## Note that, the rewards here with random action 209 | running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 210 | # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ 211 | # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) 212 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ 213 | .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) 214 | -------------------------------------------------------------------------------- /tutorial_DQN_variants.py: -------------------------------------------------------------------------------- 1 | """ 2 | DQN and its variants 3 | ------------------------ 4 | We implement Double DQN, Dueling DQN and Noisy DQN here. 5 | 6 | The max operator in standard DQN uses the same values both to select and to 7 | evaluate an action by 8 | Q(s_t, a_t) = R_{t+1} + \gamma * max_{a}Q_{tar}(s_{t+1}, a). 9 | Double DQN propose to use following evaluation to address overestimation problem 10 | of max operator: 11 | Q(s_t, a_t) = R_{t+1} + \gamma * Q_{tar}(s_{t+1}, max_{a}Q(s_{t+1}, a)). 12 | 13 | Dueling DQN uses dueling architecture where the value of state and the advantage 14 | of each action is estimated separately. 15 | 16 | Noisy DQN propose to explore by adding parameter noises. 17 | 18 | 19 | Reference: 20 | ------------------------ 21 | 1. Double DQN 22 | Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double 23 | q-learning[C]//Thirtieth AAAI Conference on Artificial Intelligence. 2016. 24 | 2. Dueling DQN 25 | Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep 26 | reinforcement learning[J]. arXiv preprint arXiv:1511.06581, 2015. 27 | 3. Noisy DQN 28 | Plappert M, Houthooft R, Dhariwal P, et al. Parameter space noise for 29 | exploration[J]. arXiv preprint arXiv:1706.01905, 2017. 
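
Dueling and noisy details (summary of the networks below):
------------------------
The dueling heads are combined as

    Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')

where the `svalue` output plays the role of V(s) and the `qvalue` output the
role of the advantage. For the noisy variant, Gaussian noise of scale
`noise_scale` is temporarily added to the weights of the fully-connected heads
when an action is chosen, and the scale is adapted so that the KL divergence
between the perturbed and clean policies roughly matches the current
epsilon-greedy exploration level.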
30 | 31 | 32 | Environment: 33 | ------------------------ 34 | Cartpole and Pong in OpenAI Gym 35 | 36 | 37 | Requirements: 38 | ------------------------ 39 | tensorflow>=2.0.0a0 40 | tensorlayer>=2.0.0 41 | 42 | 43 | To run: 44 | ------------------------ 45 | python tutorial_DQN_variantes.py --mode=train 46 | python tutorial_DQN_variantes.py --mode=test --save_path=dqn_variants/8000.npz 47 | """ 48 | import argparse 49 | import os 50 | import random 51 | import time 52 | 53 | import numpy as np 54 | import tensorflow as tf 55 | 56 | import tensorlayer as tl 57 | from tutorial_wrappers import build_env 58 | 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument('--mode', help='train or test', default='train') 61 | parser.add_argument( 62 | '--save_path', default='dqn_variants', help='folder to save if mode == train else model path,' 63 | 'qnet will be saved once target net update' 64 | ) 65 | parser.add_argument('--seed', help='random seed', type=int, default=0) 66 | parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') 67 | parser.add_argument('--noisy_scale', type=float, default=1e-2) 68 | parser.add_argument('--disable_double', action='store_false', default=True) 69 | parser.add_argument('--disable_dueling', action='store_false', default=True) 70 | args = parser.parse_args() 71 | 72 | if args.mode == 'train': 73 | os.makedirs(args.save_path, exist_ok=True) 74 | random.seed(args.seed) 75 | np.random.seed(args.seed) 76 | tf.random.set_seed(args.seed) # reproducible 77 | env_id = args.env_id 78 | env = build_env(env_id, seed=args.seed) 79 | noise_scale = args.noisy_scale 80 | double = not args.disable_double 81 | dueling = not args.disable_dueling 82 | 83 | # #################### hyper parameters #################### 84 | if env_id == 'CartPole-v0': 85 | qnet_type = 'MLP' 86 | number_timesteps = 10000 # total number of time steps to train on 87 | explore_timesteps = 100 88 | # epsilon-greedy schedule, final exploit prob is 0.99 89 | epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) 90 | lr = 5e-3 # learning rate 91 | buffer_size = 1000 # replay buffer size 92 | target_q_update_freq = 50 # how frequency target q net update 93 | ob_scale = 1.0 # scale observations 94 | clipnorm = None 95 | else: 96 | # reward will increase obviously after 1e5 time steps 97 | qnet_type = 'CNN' 98 | number_timesteps = int(1e6) # total number of time steps to train on 99 | explore_timesteps = 1e5 100 | # epsilon-greedy schedule, final exploit prob is 0.99 101 | epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) 102 | lr = 1e-4 # learning rate 103 | buffer_size = 10000 # replay buffer size 104 | target_q_update_freq = 200 # how frequency target q net update 105 | ob_scale = 1.0 / 255 # scale observations 106 | clipnorm = 10 107 | 108 | in_dim = env.observation_space.shape 109 | out_dim = env.action_space.n 110 | reward_gamma = 0.99 # reward discount 111 | batch_size = 32 # batch size for sampling from replay buffer 112 | warm_start = buffer_size / 10 # sample times befor learning 113 | noise_update_freq = 50 # how frequency param noise net update 114 | 115 | 116 | # ############################## Network #################################### 117 | class MLP(tl.models.Model): 118 | 119 | def __init__(self, name): 120 | super(MLP, self).__init__(name=name) 121 | self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) 122 | self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', 
W_init=tf.initializers.GlorotUniform()) 123 | self.svalue = tl.layers.Dense(1, in_channels=64, name='s', W_init=tf.initializers.GlorotUniform()) 124 | self.noise_scale = 0 125 | 126 | def forward(self, ni): 127 | feature = self.h1(ni) 128 | 129 | # apply noise to all linear layer 130 | if self.noise_scale != 0: 131 | noises = [] 132 | for layer in [self.qvalue, self.svalue]: 133 | for var in layer.trainable_weights: 134 | noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) 135 | noises.append(noise) 136 | var.assign_add(noise) 137 | 138 | qvalue = self.qvalue(feature) 139 | svalue = self.svalue(feature) 140 | 141 | if self.noise_scale != 0: 142 | idx = 0 143 | for layer in [self.qvalue, self.svalue]: 144 | for var in layer.trainable_weights: 145 | var.assign_sub(noises[idx]) 146 | idx += 1 147 | 148 | if dueling: 149 | # dueling network 150 | return svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) 151 | else: 152 | return qvalue 153 | 154 | 155 | class CNN(tl.models.Model): 156 | 157 | def __init__(self, name): 158 | super(CNN, self).__init__(name=name) 159 | h, w, in_channels = in_dim 160 | dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) 161 | self.conv1 = tl.layers.Conv2d( 162 | 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', 163 | W_init=tf.initializers.GlorotUniform() 164 | ) 165 | self.conv2 = tl.layers.Conv2d( 166 | 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', 167 | W_init=tf.initializers.GlorotUniform() 168 | ) 169 | self.conv3 = tl.layers.Conv2d( 170 | 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', 171 | W_init=tf.initializers.GlorotUniform() 172 | ) 173 | self.flatten = tl.layers.Flatten(name='flatten') 174 | self.preq = tl.layers.Dense( 175 | 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() 176 | ) 177 | self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) 178 | self.pres = tl.layers.Dense( 179 | 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_s', W_init=tf.initializers.GlorotUniform() 180 | ) 181 | self.svalue = tl.layers.Dense(1, in_channels=256, name='state', W_init=tf.initializers.GlorotUniform()) 182 | self.noise_scale = 0 183 | 184 | def forward(self, ni): 185 | feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) 186 | 187 | # apply noise to all linear layer 188 | if self.noise_scale != 0: 189 | noises = [] 190 | for layer in [self.preq, self.qvalue, self.pres, self.svalue]: 191 | for var in layer.trainable_weights: 192 | noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) 193 | noises.append(noise) 194 | var.assign_add(noise) 195 | 196 | qvalue = self.qvalue(self.preq(feature)) 197 | svalue = self.svalue(self.pres(feature)) 198 | 199 | if self.noise_scale != 0: 200 | idx = 0 201 | for layer in [self.preq, self.qvalue, self.pres, self.svalue]: 202 | for var in layer.trainable_weights: 203 | var.assign_sub(noises[idx]) 204 | idx += 1 205 | 206 | if dueling: 207 | # dueling network 208 | return svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) 209 | else: 210 | return qvalue 211 | 212 | 213 | # ############################## Replay #################################### 214 | class ReplayBuffer(object): 215 | 216 | def __init__(self, size): 217 | self._storage = [] #保存的容器 218 | self._maxsize = size #容器最大的size 219 | self._next_idx = 0 #指针,表示当前新增位置 220 | 221 | #查询这个容器的大小 222 | def __len__(self): 223 | return 
len(self._storage) 224 | 225 | #把信息放入buffer 226 | def add(self, *args): 227 | #如果当前指针大于容器目前大小,那么扩展容器,append数据 228 | if self._next_idx >= len(self._storage): 229 | self._storage.append(args) 230 | #如果不是,直接写进去就可以了。 231 | else: 232 | self._storage[self._next_idx] = args 233 | #这是一个循环指针 234 | self._next_idx = (self._next_idx + 1) % self._maxsize 235 | 236 | #对 237 | def _encode_sample(self, idxes): 238 | b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] 239 | for i in idxes: 240 | o, a, r, o_, d = self._storage[i] 241 | b_o.append(o) 242 | b_a.append(a) 243 | b_r.append(r) 244 | b_o_.append(o_) 245 | b_d.append(d) 246 | return ( 247 | np.stack(b_o).astype('float32') * ob_scale, 248 | np.stack(b_a).astype('int32'), 249 | np.stack(b_r).astype('float32'), 250 | np.stack(b_o_).astype('float32') * ob_scale, 251 | np.stack(b_d).astype('float32'), 252 | ) 253 | 254 | #抽取数据 255 | def sample(self, batch_size): 256 | indexes = range(len(self._storage)) 257 | idxes = [random.choice(indexes) for _ in range(batch_size)] 258 | return self._encode_sample(idxes) 259 | 260 | 261 | # ############################# Functions ################################### 262 | def huber_loss(x): 263 | """Loss function for value""" 264 | return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) 265 | 266 | 267 | def sync(net, net_tar): 268 | """Copy q network to target q network""" 269 | for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): 270 | var_tar.assign(var) 271 | 272 | 273 | def log_softmax(x, dim): 274 | temp = x - np.max(x, dim, keepdims=True) 275 | return temp - np.log(np.exp(temp).sum(dim, keepdims=True)) 276 | 277 | 278 | def softmax(x, dim): 279 | temp = np.exp(x - np.max(x, dim, keepdims=True)) 280 | return temp / temp.sum(dim, keepdims=True) 281 | 282 | 283 | # ############################### DQN ##################################### 284 | class DQN(object): 285 | 286 | def __init__(self): 287 | model = MLP if qnet_type == 'MLP' else CNN 288 | self.qnet = model('q') 289 | if args.mode == 'train': 290 | self.qnet.train() 291 | self.targetqnet = model('targetq') 292 | self.targetqnet.infer() 293 | sync(self.qnet, self.targetqnet) 294 | else: 295 | self.qnet.infer() 296 | tl.files.load_and_assign_npz(name=args.save_path, network=self.qnet) 297 | self.niter = 0 298 | if clipnorm is not None: 299 | self.optimizer = tf.optimizers.Adam(learning_rate=lr, clipnorm=clipnorm) 300 | else: 301 | self.optimizer = tf.optimizers.Adam(learning_rate=lr) 302 | self.noise_scale = noise_scale 303 | 304 | def get_action(self, obv): 305 | eps = epsilon(self.niter) 306 | if args.mode == 'train': 307 | if random.random() < eps: 308 | return int(random.random() * out_dim) 309 | obv = np.expand_dims(obv, 0).astype('float32') * ob_scale 310 | if self.niter < explore_timesteps: 311 | self.qnet.noise_scale = self.noise_scale 312 | q_ptb = self._qvalues_func(obv).numpy() 313 | self.qnet.noise_scale = 0 314 | if i % noise_update_freq == 0: 315 | q = self._qvalues_func(obv).numpy() 316 | kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) 317 | kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() 318 | kl_explore = -np.log(1 - eps + eps / out_dim) 319 | if kl_ptb < kl_explore: 320 | self.noise_scale *= 1.01 321 | else: 322 | self.noise_scale /= 1.01 323 | return q_ptb.argmax(1)[0] 324 | else: 325 | return self._qvalues_func(obv).numpy().argmax(1)[0] 326 | else: 327 | obv = np.expand_dims(obv, 0).astype('float32') * ob_scale 328 | return self._qvalues_func(obv).numpy().argmax(1)[0] 329 | 330 | @tf.function 331 | def 
_qvalues_func(self, obv): 332 | return self.qnet(obv) 333 | 334 | def train(self, b_o, b_a, b_r, b_o_, b_d): 335 | self._train_func(b_o, b_a, b_r, b_o_, b_d) 336 | 337 | self.niter += 1 338 | if self.niter % target_q_update_freq == 0: 339 | sync(self.qnet, self.targetqnet) 340 | path = os.path.join(args.save_path, '{}.npz'.format(self.niter)) 341 | tl.files.save_npz(self.qnet.trainable_weights, name=path) 342 | 343 | @tf.function 344 | def _train_func(self, b_o, b_a, b_r, b_o_, b_d): 345 | with tf.GradientTape() as tape: 346 | td_errors = self._tderror_func(b_o, b_a, b_r, b_o_, b_d) 347 | loss = tf.reduce_mean(huber_loss(td_errors)) 348 | 349 | grad = tape.gradient(loss, self.qnet.trainable_weights) 350 | self.optimizer.apply_gradients(zip(grad, self.qnet.trainable_weights)) 351 | 352 | return td_errors 353 | 354 | @tf.function 355 | def _tderror_func(self, b_o, b_a, b_r, b_o_, b_d): 356 | if double: 357 | b_a_ = tf.one_hot(tf.argmax(self.qnet(b_o_), 1), out_dim) 358 | b_q_ = (1 - b_d) * tf.reduce_sum(self.targetqnet(b_o_) * b_a_, 1) 359 | else: 360 | b_q_ = (1 - b_d) * tf.reduce_max(self.targetqnet(b_o_), 1) 361 | 362 | b_q = tf.reduce_sum(self.qnet(b_o) * tf.one_hot(b_a, out_dim), 1) 363 | return b_q - (b_r + reward_gamma * b_q_) 364 | 365 | 366 | # ############################# Trainer ################################### 367 | if __name__ == '__main__': 368 | dqn = DQN() 369 | if args.mode == 'train': 370 | buffer = ReplayBuffer(buffer_size) 371 | 372 | o = env.reset() 373 | nepisode = 0 374 | t = time.time() 375 | for i in range(1, number_timesteps + 1): 376 | 377 | a = dqn.get_action(o) 378 | 379 | # execute action and feed to replay buffer 380 | # note that `_` tail in var name means next 381 | o_, r, done, info = env.step(a) 382 | buffer.add(o, a, r, o_, done) 383 | 384 | if i >= warm_start: 385 | transitions = buffer.sample(batch_size) 386 | dqn.train(*transitions) 387 | 388 | if done: 389 | o = env.reset() 390 | else: 391 | o = o_ 392 | 393 | # episode in info is real (unwrapped) message 394 | if info.get('episode'): 395 | nepisode += 1 396 | reward, length = info['episode']['r'], info['episode']['l'] 397 | fps = int(length / (time.time() - t)) 398 | print( 399 | 'Time steps so far: {}, episode so far: {}, ' 400 | 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) 401 | ) 402 | t = time.time() 403 | else: 404 | nepisode = 0 405 | o = env.reset() 406 | for i in range(1, number_timesteps + 1): 407 | a = dqn.get_action(o) 408 | 409 | # execute action 410 | # note that `_` tail in var name means next 411 | o_, r, done, info = env.step(a) 412 | 413 | if done: 414 | o = env.reset() 415 | else: 416 | o = o_ 417 | 418 | # episode in info is real (unwrapped) message 419 | if info.get('episode'): 420 | nepisode += 1 421 | reward, length = info['episode']['r'], info['episode']['l'] 422 | print( 423 | 'Time steps so far: {}, episode so far: {}, ' 424 | 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) 425 | ) 426 | -------------------------------------------------------------------------------- /tutorial_PG.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vanilla Policy Gradient(VPG or REINFORCE) 3 | ----------------------------------------- 4 | The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. 5 | It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. 
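The update implemented below is the classic Monte-Carlo (REINFORCE) form: a whole episode is collected,
the discounted return G_t of every step is computed and normalised, and the policy is trained with
loss = mean( -log pi(a_t|s_t) * G_t ).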
6 | Here is an example on discrete action space game CartPole-v0. 7 | To apply it on continuous action space, you need to change the last softmax layer and the choose_action function. 8 | 9 | Reference 10 | --------- 11 | Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998. 12 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ 13 | 14 | Environment 15 | ----------- 16 | Openai Gym CartPole-v0, discrete action space 17 | 18 | Prerequisites 19 | -------------- 20 | tensorflow >=2.0.0a0 21 | tensorflow-probability 0.6.0 22 | tensorlayer >=2.0.0 23 | 24 | To run 25 | ------ 26 | python tutorial_PG.py --train/test 27 | 28 | """ 29 | import argparse 30 | import os 31 | import time 32 | 33 | import gym 34 | import matplotlib.pyplot as plt 35 | import numpy as np 36 | import tensorflow as tf 37 | 38 | import tensorlayer as tl 39 | 40 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 41 | parser.add_argument('--train', dest='train', action='store_true', default=True) 42 | parser.add_argument('--test', dest='train', action='store_false') 43 | args = parser.parse_args() 44 | 45 | ##################### hyper parameters #################### 46 | 47 | ENV_NAME = 'CartPole-v0' # 定义环境 48 | RANDOMSEED = 1 # 设置随机种子。建议大家都设置,这样试验可以重现。 49 | 50 | DISPLAY_REWARD_THRESHOLD = 400 # 如果奖励超过DISPLAY_REWARD_THRESHOLD,就开始渲染 51 | RENDER = False # 开始的时候,不渲染游戏。 52 | num_episodes = 2 # 游戏迭代次数 53 | 54 | ############################### PG #################################### 55 | 56 | 57 | class PolicyGradient: 58 | """ 59 | PG class 60 | """ 61 | 62 | def __init__(self, n_features, n_actions, learning_rate=0.01, reward_decay=0.95): 63 | # 定义相关参数 64 | self.n_actions = n_actions #动作 65 | self.n_features = n_features #环境特征数量 66 | self.lr = learning_rate #学习率 67 | self.gamma = reward_decay #折扣 68 | 69 | #用于保存每个ep的数据。 70 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] 71 | 72 | def get_model(inputs_shape): 73 | """ 74 | 创建一个神经网络 75 | 输入: state 76 | 输出: act 77 | """ 78 | with tf.name_scope('inputs'): 79 | self.tf_obs = tl.layers.Input(inputs_shape, tf.float32, name="observations") 80 | #self.tf_acts = tl.layers.Input([None,], tf.int32, name="actions_num") 81 | #self.tf_vt = tl.layers.Input([None,], tf.float32, name="actions_value") 82 | # fc1 83 | layer = tl.layers.Dense( 84 | n_units=30, act=tf.nn.tanh, W_init=tf.random_normal_initializer(mean=0, stddev=0.3), 85 | b_init=tf.constant_initializer(0.1), name='fc1' 86 | )(self.tf_obs) 87 | # fc2 88 | all_act = tl.layers.Dense( 89 | n_units=self.n_actions, act=None, W_init=tf.random_normal_initializer(mean=0, stddev=0.3), 90 | b_init=tf.constant_initializer(0.1), name='all_act' 91 | )(layer) 92 | return tl.models.Model(inputs=self.tf_obs, outputs=all_act, name='PG model') 93 | 94 | self.model = get_model([None, n_features]) 95 | self.model.train() 96 | self.optimizer = tf.optimizers.Adam(self.lr) 97 | 98 | def choose_action(self, s): 99 | """ 100 | 用神经网络输出的**策略pi**,选择动作。 101 | 输入: state 102 | 输出: act 103 | """ 104 | _logits = self.model(np.array([s], np.float32)) 105 | _probs = tf.nn.softmax(_logits).numpy() 106 | return tl.rein.choice_action_by_probs(_probs.ravel()) #根据策略PI选择动作。 107 | 108 | def choose_action_greedy(self, s): 109 | """ 110 | 贪心算法:直接用概率最大的动作 111 | 输入: state 112 | 输出: act 113 | """ 114 | _probs = tf.nn.softmax(self.model(np.array([s], np.float32))).numpy() 115 | return np.argmax(_probs.ravel()) 116 | 117 | def store_transition(self, s, a, r): 118 | """ 119 | 保存数据到buffer中 120 | 
""" 121 | self.ep_obs.append(np.array([s], np.float32)) 122 | self.ep_as.append(a) 123 | self.ep_rs.append(r) 124 | 125 | def learn(self): 126 | """ 127 | 通过带权重更新方法更新神经网络 128 | """ 129 | # _discount_and_norm_rewards中存储的就是这一ep中,每个状态的G值。 130 | discounted_ep_rs_norm = self._discount_and_norm_rewards() 131 | 132 | with tf.GradientTape() as tape: 133 | 134 | # 把s放入神经网络,就算_logits 135 | _logits = self.model(np.vstack(self.ep_obs)) 136 | 137 | # 敲黑板 138 | ## _logits和真正的动作的差距 139 | # 差距也可以这样算,和sparse_softmax_cross_entropy_with_logits等价的: 140 | # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1) 141 | neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=_logits, labels=np.array(self.ep_as)) 142 | 143 | # 在原来的差距乘以G值,也就是以G值作为更新 144 | loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) 145 | 146 | grad = tape.gradient(loss, self.model.trainable_weights) 147 | self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) 148 | 149 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data 150 | return discounted_ep_rs_norm 151 | 152 | def _discount_and_norm_rewards(self): 153 | """ 154 | 通过回溯计算G值 155 | """ 156 | # 先创建一个数组,大小和ep_rs一样。ep_rs记录的是每个状态的收获r。 157 | discounted_ep_rs = np.zeros_like(self.ep_rs) 158 | running_add = 0 159 | # 从ep_rs的最后往前,逐个计算G 160 | for t in reversed(range(0, len(self.ep_rs))): 161 | running_add = running_add * self.gamma + self.ep_rs[t] 162 | discounted_ep_rs[t] = running_add 163 | 164 | # 归一化G值。 165 | # 我们希望G值有正有负,这样比较容易学习。 166 | discounted_ep_rs -= np.mean(discounted_ep_rs) 167 | discounted_ep_rs /= np.std(discounted_ep_rs) 168 | return discounted_ep_rs 169 | 170 | def save_ckpt(self): 171 | """ 172 | save trained weights 173 | :return: None 174 | """ 175 | if not os.path.exists('model'): 176 | os.makedirs('model') 177 | tl.files.save_weights_to_hdf5('model/pg_policy.hdf5', self.model) 178 | 179 | def load_ckpt(self): 180 | """ 181 | load trained weights 182 | :return: None 183 | """ 184 | tl.files.load_hdf5_to_weights_in_order('model/pg_policy.hdf5', self.model) 185 | 186 | 187 | if __name__ == '__main__': 188 | 189 | # reproducible 190 | np.random.seed(RANDOMSEED) 191 | tf.random.set_seed(RANDOMSEED) 192 | 193 | tl.logging.set_verbosity(tl.logging.DEBUG) 194 | 195 | env = gym.make(ENV_NAME) 196 | env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance 197 | env = env.unwrapped 198 | 199 | print(env.action_space) 200 | print(env.observation_space) 201 | print(env.observation_space.high) 202 | print(env.observation_space.low) 203 | 204 | RL = PolicyGradient( 205 | n_actions=env.action_space.n, 206 | n_features=env.observation_space.shape[0], 207 | learning_rate=0.02, 208 | reward_decay=0.99, 209 | # output_graph=True, 210 | ) 211 | 212 | if args.train: 213 | reward_buffer = [] 214 | 215 | #=====开始更新训练===== 216 | for i_episode in range(num_episodes): 217 | 218 | episode_time = time.time() 219 | observation = env.reset() 220 | 221 | while True: 222 | if RENDER: 223 | env.render() 224 | 225 | # 注意:这里没有用贪婪算法,而是根据pi随机动作,以保证一定的探索性。 226 | action = RL.choose_action(observation) 227 | 228 | observation_, reward, done, info = env.step(action) 229 | 230 | # 保存数据 231 | RL.store_transition(observation, action, reward) 232 | 233 | # PG用的是MC,如果到了最终状态 234 | if done: 235 | ep_rs_sum = sum(RL.ep_rs) 236 | 237 | if 'running_reward' not in globals(): 238 | running_reward = ep_rs_sum 239 | else: 240 | running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 241 | 242 | 
170 |     def save_ckpt(self):
171 |         """
172 |         save trained weights
173 |         :return: None
174 |         """
175 |         if not os.path.exists('model'):
176 |             os.makedirs('model')
177 |         tl.files.save_weights_to_hdf5('model/pg_policy.hdf5', self.model)
178 | 
179 |     def load_ckpt(self):
180 |         """
181 |         load trained weights
182 |         :return: None
183 |         """
184 |         tl.files.load_hdf5_to_weights_in_order('model/pg_policy.hdf5', self.model)
185 | 
186 | 
187 | if __name__ == '__main__':
188 | 
189 |     # reproducible
190 |     np.random.seed(RANDOMSEED)
191 |     tf.random.set_seed(RANDOMSEED)
192 | 
193 |     tl.logging.set_verbosity(tl.logging.DEBUG)
194 | 
195 |     env = gym.make(ENV_NAME)
196 |     env.seed(RANDOMSEED)  # reproducible, general Policy gradient has high variance
197 |     env = env.unwrapped
198 | 
199 |     print(env.action_space)
200 |     print(env.observation_space)
201 |     print(env.observation_space.high)
202 |     print(env.observation_space.low)
203 | 
204 |     RL = PolicyGradient(
205 |         n_actions=env.action_space.n,
206 |         n_features=env.observation_space.shape[0],
207 |         learning_rate=0.02,
208 |         reward_decay=0.99,
209 |         # output_graph=True,
210 |     )
211 | 
212 |     if args.train:
213 |         reward_buffer = []
214 | 
215 |         #===== start training =====
216 |         for i_episode in range(num_episodes):
217 | 
218 |             episode_time = time.time()
219 |             observation = env.reset()
220 | 
221 |             while True:
222 |                 if RENDER:
223 |                     env.render()
224 | 
225 |                 # Note: we do not act greedily here; actions are sampled from pi to keep some exploration.
226 |                 action = RL.choose_action(observation)
227 | 
228 |                 observation_, reward, done, info = env.step(action)
229 | 
230 |                 # store the transition
231 |                 RL.store_transition(observation, action, reward)
232 | 
233 |                 # PG is a Monte-Carlo method: learn only once the episode has finished
234 |                 if done:
235 |                     ep_rs_sum = sum(RL.ep_rs)
236 | 
237 |                     if 'running_reward' not in globals():
238 |                         running_reward = ep_rs_sum
239 |                     else:
240 |                         running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
241 | 
242 |                     # start rendering once running_reward exceeds DISPLAY_REWARD_THRESHOLD
243 |                     if running_reward > DISPLAY_REWARD_THRESHOLD:
244 |                         RENDER = True
245 | 
246 |                     # print("episode:", i_episode, "  reward:", int(running_reward))
247 | 
248 |                     print(
249 |                         "Episode [%d/%d] \tsum reward: %d  \trunning reward: %f \ttook: %.5fs " %
250 |                         (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)
251 |                     )
252 |                     reward_buffer.append(running_reward)
253 | 
254 |                     # learn from this episode
255 |                     vt = RL.learn()
256 | 
257 |                     # plot
258 |                     plt.ion()
259 |                     plt.cla()
260 |                     plt.title('PG')
261 |                     plt.plot(reward_buffer, )
262 |                     plt.xlabel('episode steps')
263 |                     plt.ylabel('normalized state-action value')
264 |                     plt.show()
265 |                     plt.pause(0.1)
266 | 
267 |                     break
268 | 
269 |                 # move on to the next step
270 |                 observation = observation_
271 |         RL.save_ckpt()
272 |         plt.ioff()
273 |         plt.show()
274 | 
275 |     # =====test=====
276 |     RL.load_ckpt()
277 |     observation = env.reset()
278 |     while True:
279 |         env.render()
280 |         action = RL.choose_action(observation)  # try choose_action_greedy here instead and compare the behaviour
281 |         observation, reward, done, info = env.step(action)
282 |         if done:
283 |             observation = env.reset()
284 | 
--------------------------------------------------------------------------------
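The tutorial_PG.py docstring notes that a continuous action space needs a different output head and choose_action. The sketch below is illustrative only and is not part of the repo: it assumes a Pendulum-style action bound of 2 and reuses the same tensorlayer / tensorflow-probability APIs that tutorial_PPO.py (next file) uses; names such as build_gaussian_policy are made up.

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl


def build_gaussian_policy(state_dim, action_dim, action_bound=2.0):
    # State in, (mu, sigma) out -- the same pattern as the PPO actor in tutorial_PPO.py.
    s = tl.layers.Input([None, state_dim], tf.float32, 'pg_state')
    h = tl.layers.Dense(30, tf.nn.tanh)(s)
    mu = tl.layers.Lambda(lambda x: x * action_bound)(tl.layers.Dense(action_dim, tf.nn.tanh)(h))
    sigma = tl.layers.Dense(action_dim, tf.nn.softplus)(h)
    model = tl.models.Model(s, [mu, sigma])
    model.train()
    return model


def choose_action_continuous(model, s, action_bound=2.0):
    # Sample a ~ N(mu, sigma) instead of taking a softmax over discrete logits.
    mu, sigma = model(np.array([s], np.float32))
    dist = tfp.distributions.Normal(mu, sigma)
    a = tf.squeeze(dist.sample(), axis=0)
    return np.clip(a.numpy(), -action_bound, action_bound)

In learn(), the cross-entropy term would then be replaced by -dist.log_prob(action), weighted by the same normalized G values.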
/tutorial_PPO.py:
--------------------------------------------------------------------------------
1 | """
2 | Proximal Policy Optimization (PPO)
3 | ----------------------------
4 | A simple version of Proximal Policy Optimization (PPO) using single thread.
5 | PPO is a family of first-order methods that use a few other tricks to keep new policies close to old.
6 | PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO.
7 | 
8 | Reference
9 | ---------
10 | Proximal Policy Optimization Algorithms, Schulman et al. 2017
11 | High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 2016
12 | Emergence of Locomotion Behaviours in Rich Environments, Heess et al. 2017
13 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials
14 | 
15 | Environment
16 | -----------
17 | Openai Gym Pendulum-v0, continuous action space
18 | 
19 | Prerequisites
20 | --------------
21 | tensorflow >=2.0.0a0
22 | tensorflow-probability 0.6.0
23 | tensorlayer >=2.0.0
24 | 
25 | To run
26 | ------
27 | python tutorial_PPO.py --train/test
28 | 
29 | """
30 | import argparse
31 | import os
32 | import time
33 | 
34 | import gym
35 | import matplotlib.pyplot as plt
36 | import numpy as np
37 | import tensorflow as tf
38 | import tensorflow_probability as tfp
39 | 
40 | import tensorlayer as tl
41 | 
42 | parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
43 | parser.add_argument('--train', dest='train', action='store_true', default=True)
44 | parser.add_argument('--test', dest='train', action='store_false')
45 | args = parser.parse_args()
46 | 
47 | ##################### hyper parameters ####################
48 | 
49 | ENV_NAME = 'Pendulum-v0'  # environment name
50 | RANDOMSEED = 1  # random seed
51 | 
52 | EP_MAX = 1000  # total number of episodes for training
53 | EP_LEN = 200  # total number of steps for each episode
54 | GAMMA = 0.9  # reward discount
55 | A_LR = 0.0001  # learning rate for actor
56 | C_LR = 0.0002  # learning rate for critic
57 | BATCH = 32  # update batchsize
58 | A_UPDATE_STEPS = 10  # actor update steps
59 | C_UPDATE_STEPS = 10  # critic update steps
60 | S_DIM, A_DIM = 3, 1  # state dimension, action dimension
61 | EPS = 1e-8  # epsilon
62 | 
63 | # Note: these are the parameters of PPO1 (KL penalty) and PPO2 (clipping).
64 | METHOD = [
65 |     dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty (PPO1)
66 |     dict(name='clip', epsilon=0.2),  # Clipped surrogate objective (PPO2), found to be better
67 | ][1]  # choose the method for optimization
68 | 
69 | ############################### PPO ####################################
70 | 
71 | 
72 | class PPO(object):
73 |     '''
74 |     PPO class
75 |     '''
76 | 
77 |     def __init__(self):
78 | 
79 |         # Build the critic network:
80 |         # input is the state, output is the V value.
81 |         tfs = tl.layers.Input([None, S_DIM], tf.float32, 'state')
82 |         l1 = tl.layers.Dense(100, tf.nn.relu)(tfs)
83 |         v = tl.layers.Dense(1)(l1)
84 |         self.critic = tl.models.Model(tfs, v)
85 |         self.critic.train()
86 | 
87 |         # Build the actor networks:
88 |         # there are two of them, actor and actor_old; actor_old only keeps a copy of the behaviour policy.
89 |         # The input is the state, the output is the mu and sigma describing the action distribution.
90 |         self.actor = self._build_anet('pi', trainable=True)
91 |         self.actor_old = self._build_anet('oldpi', trainable=False)
92 |         self.actor_opt = tf.optimizers.Adam(A_LR)
93 |         self.critic_opt = tf.optimizers.Adam(C_LR)
94 | 
95 |     def a_train(self, tfs, tfa, tfadv):
96 |         '''
97 |         Update the policy network
98 |         '''
99 |         # Inputs are s, a and the advantage (td-error); this is similar to AC.
100 |         tfs = np.array(tfs, np.float32)  # state
101 |         tfa = np.array(tfa, np.float32)  # action
102 |         tfadv = np.array(tfadv, np.float32)  # advantage (td-error)
103 | 
104 | 
105 |         with tf.GradientTape() as tape:
106 | 
107 |             # [Key point] this is the core of PPO!
108 |             # We build two normal distributions, pi and oldpi, from the two different networks.
109 |             mu, sigma = self.actor(tfs)
110 |             pi = tfp.distributions.Normal(mu, sigma)
111 | 
112 |             mu_old, sigma_old = self.actor_old(tfs)
113 |             oldpi = tfp.distributions.Normal(mu_old, sigma_old)
114 | 
115 |             # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
116 |             # Ratio of the probabilities of the same action a under the new and the old distribution.
117 |             # Dividing by (oldpi.prob(tfa) + EPS) is exactly importance sampling.
118 |             # We could update with pi.prob(tfa) directly, but dividing by (oldpi.prob(tfa) + EPS) lets us update several times from the same batch.
119 |             # In AC or PG the update target is 1 or 0: we shrink the gap between the action probability and 1 or 0.
120 |             # PPO can instead be seen as starting from oldpi.prob(tfa) and moving away from it (up or down).
121 |             ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS)
122 |             # this plays the same role as the weighted update in PG
123 |             surr = ratio * tfadv
124 | 
125 |             # But we must not let the two distributions drift too far apart (a toy numeric check of the clipped objective appears right after this file).
126 |             # PPO1
127 |             if METHOD['name'] == 'kl_pen':
128 |                 tflam = METHOD['lam']
129 |                 kl = tfp.distributions.kl_divergence(oldpi, pi)
130 |                 kl_mean = tf.reduce_mean(kl)
131 |                 aloss = -(tf.reduce_mean(surr - tflam * kl))
132 |             # PPO2:
133 |             # very direct: just clip the ratio.
134 |             else:  # clipping method, found to be better
135 |                 aloss = -tf.reduce_mean(
136 |                     tf.minimum(ratio * tfadv,  #surr
137 |                                tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)
138 |                 )
139 |         a_grad = tape.gradient(aloss, self.actor.trainable_weights)
140 | 
141 |         self.actor_opt.apply_gradients(zip(a_grad, self.actor.trainable_weights))
142 | 
143 |         if METHOD['name'] == 'kl_pen':
144 |             return kl_mean
145 | 
146 |     def update_old_pi(self):
147 |         '''
148 |         Copy the current actor parameters into actor_old.
149 |         '''
150 |         for p, oldp in zip(self.actor.trainable_weights, self.actor_old.trainable_weights):
151 |             oldp.assign(p)
152 | 
153 |     def c_train(self, tfdc_r, s):
154 |         '''
155 |         Update the critic network
156 |         '''
157 |         tfdc_r = np.array(tfdc_r, dtype=np.float32)  # tfdc_r is like G in PG, computed by backing up rewards; PPO just bootstraps it with the critic (TD).
158 | 
159 |         with tf.GradientTape() as tape:
160 |             v = self.critic(s)
161 |             advantage = tfdc_r - v  # this is the td-error / advantage we talked about
162 |             closs = tf.reduce_mean(tf.square(advantage))
163 | 
164 |         grad = tape.gradient(closs, self.critic.trainable_weights)
165 |         self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
166 | 
167 |     def cal_adv(self, tfs, tfdc_r):
168 |         '''
169 |         Compute the advantage, i.e. the td-error
170 |         '''
171 |         tfdc_r = np.array(tfdc_r, dtype=np.float32)
172 |         advantage = tfdc_r - self.critic(tfs)  # advantage = discounted return - V(s)
173 |         return advantage.numpy()
174 | 
175 |     def update(self, s, a, r):
176 |         '''
177 |         Update parameter with the constraint of KL divergent
178 |         :param s: state
179 |         :param a: act
180 |         :param r: reward
181 |         :return: None
182 |         '''
183 |         s, a, r = s.astype(np.float32), a.astype(np.float32), r.astype(np.float32)
184 | 
185 |         self.update_old_pi()
186 |         adv = self.cal_adv(s, r)
187 |         # adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful
188 | 
189 |         # update actor
190 |         #### PPO1 is more involved:
191 |         # adaptive KL penalty: the coefficient lam is adjusted on the fly
192 |         if METHOD['name'] == 'kl_pen':
193 |             for _ in range(A_UPDATE_STEPS):
194 |                 kl = self.a_train(s, a, adv)
195 |                 if kl > 4 * METHOD['kl_target']:  # this is in Google's paper
196 |                     break
197 |             if kl < METHOD['kl_target'] / 1.5:  # adaptive lambda, this is in OpenAI's paper
198 |                 METHOD['lam'] /= 2
199 |             elif kl > METHOD['kl_target'] * 1.5:
200 |                 METHOD['lam'] *= 2
201 |             METHOD['lam'] = np.clip(
202 |                 METHOD['lam'], 1e-4, 10
203 |             )  # sometimes explode, this clipping is MorvanZhou's solution
204 | 
205 |         #### PPO2 is simpler: just call a_train repeatedly:
206 |         # clipping method, found to be better (OpenAI's paper)
207 |         else:
208 |             for _ in range(A_UPDATE_STEPS):
209 |                 self.a_train(s, a, adv)
210 | 
211 |         # update critic
212 |         for _ in range(C_UPDATE_STEPS):
213 |             self.c_train(r, s)
214 | 
215 |     def _build_anet(self, name, trainable):
216 |         '''
217 |         Build policy network
218 |         :param name: name
219 |         :param trainable: trainable flag
220 |         :return: policy network
221 |         '''
222 |         # Continuous action problem: output mu and sigma.
223 |         tfs = tl.layers.Input([None, S_DIM], tf.float32, name + '_state')
224 |         l1 = tl.layers.Dense(100, tf.nn.relu, name=name + '_l1')(tfs)
225 | 
226 |         a = tl.layers.Dense(A_DIM, tf.nn.tanh, name=name + '_a')(l1)
227 |         mu = tl.layers.Lambda(lambda x: x * 2, name=name + '_lambda')(a)  # scale tanh output to the Pendulum action range [-2, 2]
228 | 
229 |         sigma = tl.layers.Dense(A_DIM, tf.nn.softplus, name=name + '_sigma')(l1)
230 | 
231 |         model = tl.models.Model(tfs, [mu, sigma], name)
232 | 
233 |         if trainable:
234 |             model.train()
235 |         else:
236 |             model.eval()
237 |         return model
238 | 
239 |     def choose_action(self, s):
240 |         '''
241 |         Choose action
242 |         :param s: state
243 |         :return: clipped act
244 |         '''
245 |         s = s[np.newaxis, :].astype(np.float32)
246 |         mu, sigma = self.actor(s)  # the actor outputs mu and sigma of the distribution
247 |         pi = tfp.distributions.Normal(mu, sigma)  # build a normal distribution from mu and sigma
248 |         a = tf.squeeze(pi.sample(1), axis=0)[0]  # sample an action from the distribution
249 |         return np.clip(a, -2, 2)  # finally, clip the sampled action
250 | 
251 |     def get_v(self, s):
252 |         '''
253 |         Compute the V value of state s.
254 |         '''
255 |         s = s.astype(np.float32)
256 |         if s.ndim < 2: s = s[np.newaxis, :]  # match the shape the network expects
257 |         return self.critic(s)[0, 0]
258 | 
259 |     def save_ckpt(self):
260 |         """
261 |         save trained weights
262 |         :return: None
263 |         """
264 |         if not os.path.exists('model'):
265 |             os.makedirs('model')
266 |         tl.files.save_weights_to_hdf5('model/ppo_actor.hdf5', self.actor)
267 |         tl.files.save_weights_to_hdf5('model/ppo_actor_old.hdf5', self.actor_old)
268 |         tl.files.save_weights_to_hdf5('model/ppo_critic.hdf5', self.critic)
269 | 
270 |     def load_ckpt(self):
271 |         """
272 |         load trained weights
273 |         :return: None
274 |         """
275 |         tl.files.load_hdf5_to_weights_in_order('model/ppo_actor.hdf5', self.actor)
276 |         tl.files.load_hdf5_to_weights_in_order('model/ppo_actor_old.hdf5', self.actor_old)
277 |         tl.files.load_hdf5_to_weights_in_order('model/ppo_critic.hdf5', self.critic)
278 | 
279 | 
280 | if __name__ == '__main__':
281 | 
282 |     env = gym.make(ENV_NAME).unwrapped
283 | 
284 |     # reproducible
285 |     env.seed(RANDOMSEED)
286 |     np.random.seed(RANDOMSEED)
287 |     tf.random.set_seed(RANDOMSEED)
288 | 
289 |     ppo = PPO()
290 | 
291 |     if args.train:
292 |         all_ep_r = []
293 | 
294 |         # training loop:
295 |         for ep in range(EP_MAX):
296 |             s = env.reset()
297 |             buffer_s, buffer_a, buffer_r = [], [], []
298 |             ep_r = 0
299 |             t0 = time.time()
300 |             for t in range(EP_LEN):  # in one episode
301 |                 # env.render()
302 |                 a = ppo.choose_action(s)
303 |                 s_, r, done, _ = env.step(a)
304 |                 buffer_s.append(s)
305 |                 buffer_a.append(a)
306 |                 buffer_r.append((r + 8) / 8)  # rescale the reward (Pendulum rewards are roughly in [-16, 0], so this maps them to about [-1, 1]); sometimes quite useful -- reward design is subjective
307 |                 s = s_
308 |                 ep_r += r
309 | 
310 |                 # N-step update: update once every BATCH steps (or at the end of the episode)
311 |                 if (t + 1) % BATCH == 0 or t == EP_LEN - 1:
312 |                     v_s_ = ppo.get_v(s_)  # V value of the last state of the n-step segment
313 | 
314 |                     # back up the rewards, just like in PG
315 |                     discounted_r = []
316 |                     for r in buffer_r[::-1]:
317 |                         v_s_ = r + GAMMA * v_s_
318 |                         discounted_r.append(v_s_)
319 |                     discounted_r.reverse()
320 | 
321 |                     # so br is not the per-step reward but the discounted return (the critic's target) computed by the backup
322 |                     bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
323 |                     buffer_s, buffer_a, buffer_r = [], [], []
324 |                     ppo.update(bs, ba, br)
325 | 
326 | 
327 |             if ep == 0:
328 |                 all_ep_r.append(ep_r)
329 |             else:
330 |                 all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
331 |             print(
332 |                 'Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
333 |                     ep, EP_MAX, ep_r,
334 |                     time.time() - t0
335 |                 )
336 |             )
337 | 
338 |             # plot
339 |             plt.ion()
340 |             plt.cla()
341 |             plt.title('PPO')
342 |             plt.plot(np.arange(len(all_ep_r)), all_ep_r)
343 |             plt.ylim(-2000, 0)
344 |             plt.xlabel('Episode')
345 |             plt.ylabel('Moving averaged episode reward')
346 |             plt.show()
347 |             plt.pause(0.1)
348 |         ppo.save_ckpt()
349 |         plt.ioff()
350 |         plt.show()
351 | 
352 |     # test
353 |     ppo.load_ckpt()
354 |     while True:
355 |         s = env.reset()
356 |         for i in range(EP_LEN):
357 |             env.render()
358 |             s, r, done, _ = env.step(ppo.choose_action(s))
359 |             if done:
360 |                 break
361 | 
--------------------------------------------------------------------------------
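Before moving on to tutorial_Qlearning.py, here is a tiny, self-contained numeric check of the PPO2 clipped surrogate that a_train builds above. It is not part of the repo and the numbers are made up; it only shows how clipping removes the incentive to push the probability ratio outside [1 - epsilon, 1 + epsilon].

import numpy as np


def clipped_surrogate(ratio, adv, epsilon=0.2):
    # Per-sample objective term: min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv);
    # a_train takes the negative mean of this as the actor loss.
    return np.minimum(ratio * adv, np.clip(ratio, 1. - epsilon, 1. + epsilon) * adv)


ratios = np.array([0.7, 1.0, 1.5])  # pi(a|s) / pi_old(a|s)

print(clipped_surrogate(ratios, 1.0))
# -> [0.7  1.   1.2]: with a positive advantage the bonus is capped at (1 + eps) * adv,
#    so there is no gradient pushing the new policy more than 20% above the old one.

print(clipped_surrogate(ratios, -1.0))
# -> [-0.8 -1.  -1.5]: with a negative advantage the minimum keeps the more pessimistic
#    term, so a ratio that has already drifted above 1 + eps is still pushed back down.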
/tutorial_Qlearning.py:
--------------------------------------------------------------------------------
1 | """Q-Table learning algorithm.
2 | 
3 | Non deep learning - TD Learning, Off-Policy, e-Greedy Exploration
4 | 
5 | Q(S, A) <- Q(S, A) + alpha * (R + lambda * max(Q(newS, :)) - Q(S, A))
6 | 
7 | See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
8 | 
9 | For Q-Network, see tutorial_frozenlake_q_network.py
10 | 
11 | EN: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw
12 | CN: https://zhuanlan.zhihu.com/p/25710327
13 | 
14 | tensorflow==2.0.0
15 | tensorlayer==2.0.0
16 | 
17 | """
18 | 
19 | import time
20 | import gym
21 | import numpy as np
22 | 
23 | # FrozenLake-v0 is a 4*4 grid; each tile is either the start tile, the goal, frozen, or a hole (danger).
24 | # The goal is for the agent to learn to walk from the start tile to the goal tile without stepping onto a danger tile.
25 | # The agent can move up, down, left or right, but a gust of wind may blow it onto another tile, so transitions are stochastic.
26 | env = gym.make('FrozenLake-v0')
27 | 
28 | # whether to render and show the game screen.
29 | render = False
30 | running_reward = None
31 | 
32 | ##================= Implement Q-Table learning algorithm =====================##
33 | 
34 | ## Build the Q table, initialized to all zeros. Shape: [state space, action space]
35 | Q = np.zeros([env.observation_space.n, env.action_space.n])
36 | 
37 | ## hyper parameters of the update
38 | ## Q[s, a] = Q[s, a] + lr * (r + lambd * np.max(Q[s1, :]) - Q[s, a])
39 | lr = .85  # learning rate (alpha) of Q-learning; with value function approximation we could ignore it
40 | lambd = .99  # discount factor (decay factor)
41 | num_episodes = 10000  # number of episodes, i.e. play the game 10000 times
42 | rList = []  # total reward of every episode, so we can see whether the agent improves; rewards for each episode
43 | 
44 | 
45 | ##================= start playing =====================##
46 | for i in range(num_episodes):
47 | 
48 |     ## reset the environment
49 |     episode_time = time.time()  # record the wall-clock time so we can compare how fast the algorithm runs.
50 |     s = env.reset()  # reset the initial state.
51 |     rAll = 0  # total reward of this episode, initialized to 0
52 | 
53 |     ## run the Q-learning algorithm
54 |     for j in range(99):
55 |         if render: env.render()  # render the environment if requested.
56 | 
57 |         ## [Key point]
58 |         ## Look up the Q values of the current state S and add decaying noise to them.
59 |         ## Then take the action with the largest Q + noise (a sketch of a plain epsilon-greedy alternative follows at the end of this file).
60 |         a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
61 | 
62 |         ## Interact with the environment: pass the action to env.step(), which returns the next state s1, the reward, done and info
63 |         s1, r, d, _ = env.step(a)
64 | 
65 |         ## update the Q table
66 |         Q[s, a] = Q[s, a] + lr * (r + lambd * np.max(Q[s1, :]) - Q[s, a])
67 | 
68 |         rAll += r  # accumulate the reward of this episode.
69 |         s = s1  # move to the next state and continue.
70 |         if d == True:  # if the terminal state has been reached, break out of the loop (and start the next episode).
71 |             break
72 | 
73 |     ##================= episode finished, print the result =====================##
74 |     # append the total reward of this episode to rList; printing it shows how well the algorithm does.
75 |     rList.append(rAll)
76 |     # add rAll to running_reward with a weight of 0.01. (The original code used r here; rAll seems more appropriate.)
77 |     running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
78 |     print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \
79 |         (i, num_episodes, rAll, running_reward, time.time() - episode_time))
80 | 
81 | # finally, print the Q table and see what it looks like.
82 | print("Final Q-Table Values:\n %s" % Q)
83 | 
--------------------------------------------------------------------------------
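The tutorial_Qlearning.py header describes the exploration as e-greedy, while the loop above actually uses an argmax over Q values perturbed by decaying Gaussian noise. For comparison, a drop-in epsilon-greedy version for the same Q table could look like the sketch below; it is an illustration only, and the epsilon schedule is an assumption, not something taken from the repo.

import numpy as np


def choose_action_epsilon_greedy(Q, s, n_actions, epsilon=0.1):
    # With probability epsilon explore uniformly at random, otherwise exploit the current Q estimates.
    if np.random.rand() < epsilon:
        return np.random.randint(n_actions)
    return int(np.argmax(Q[s, :]))


# Possible usage inside the episode loop, replacing the noisy argmax (the decaying epsilon is an assumption):
#     a = choose_action_epsilon_greedy(Q, s, env.action_space.n, epsilon=1.0 / (i + 1))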