├── README.md ├── WoLF-PHC Grid Game,2agents,same goal, print policy.py ├── WoLF-PHC, MAS, diffrt goals, normalized Q value.py ├── WoLF-PHC, MAS, diffrt goals, cumulative reward.py ├── WoLF-PHC with Mobility Simulation,diffrt goals.py ├── WoLF-PHC with Mobility Simulation,same goal.py └── WoLF-PHC, MAS, diffrt goals.py /README.md: -------------------------------------------------------------------------------- 1 | # Realize The Automation of UAV Cluster by Reinforcement Learning Approach 2 | This project implements and tunes the WoLF-PHC (Win or Learn Fast, Policy Hill Climbing) reinforcement learning algorithm to increase the degree of automation of a UAV group: each UAV acts as an independent learning agent, which makes the system fully decentralized, and cooperation within the group is strengthened when performing a task. 3 | The project uses matplotlib to visualize the result data. 4 | -------------------------------------------------------------------------------- /WoLF-PHC Grid Game,2agents,same goal, print policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | 6 | 7 | def find_state_from_position(position): 8 | x = position[0] 9 | y = position[1] 10 | if x != 100 and y != 100: 11 | state_num = (y//1) * 100 + ((x//1) + 1) 12 | elif x == 100 and y != 100: 13 | state_num = (y//1) * 100 + x 14 | elif y == 100 and x != 100: 15 | state_num = (y-1)*100 + ((x//1) + 1) 16 | else: 17 | state_num = 10000 18 | return int(state_num) 19 | 20 | 21 | def calculate_reward(current_state, goal_state): 22 | if current_state == goal_state: 23 | r = 20 24 | else: 25 | r = 0 26 | return r 27 | 28 | 29 | def find_max_q_at_state_s(s, matrix): 30 | q_list_at_state_s = matrix[s-1] 31 | max_q_value = np.max(q_list_at_state_s) 32 | return max_q_value 33 | 34 | 35 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 36 | left_side = 0 37 | right_side = 0 38 | for action in range(4): 39 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 40 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 41 | if left_side > right_side: 42 | learning_rate = win_learning_rate 43 | else: 44 | learning_rate = lose_learning_rate 45 | return learning_rate 46 | 47 | 48 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 49 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 50 | return np.min(compare_list) 51 | 52 | 53 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 54 | q_values_list = matrix_q[state-1] 55 | arg_max_a = np.argmax(q_values_list) 56 | if action != arg_max_a: 57 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 58 | else: 59 | result = 0 60 | for i in range(4): 61 | if i != arg_max_a: 62 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 63 | return result 64 | 65 | 66 | def wolf_phc_learning(position, goal_state): 67 | EastProb_Array = [] 68 | WestProb_Array = [] 69 | NorthProb_Array = [] 70 | currentState = find_state_from_position(position) 71 | # print 'currentState is: ', currentState 72 | Q_matrix = np.zeros((10000, 4)) 73 | pi_matrix = np.zeros((10000, 4)) 74 | pi_mean_matrix = np.zeros((10000, 4)) 75 | C_matrix = np.zeros((10000, 1), dtype=np.int) 76 | # Initialize pi-matrix 77 | pi_matrix.fill(0) 78 | for j in range(0, 10000): 79 | if j == 0: 80 | 
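# Note on this pi initialization: action indices follow calculate_nextState_table = (100, -100, -1, 1),
# i.e. action 0 = North (state + 100), 1 = South (state - 100), 2 = West (state - 1), 3 = East (state + 1)
# on the 100 x 100 grid of states 1..10000. Corner cells split probability 0.5/0.5 over their two
# in-grid moves, edge cells use 1/3 over their three in-grid moves, and interior cells use 0.25 for
# all four, so no initial probability mass is placed on moves that would leave the grid.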
pi_matrix[j, 0] = 0.5 81 | pi_matrix[j, 3] = 0.5 82 | elif j == 99: 83 | pi_matrix[j, 0] = 0.5 84 | pi_matrix[j, 2] = 0.5 85 | elif j == 9900: 86 | pi_matrix[j, 1] = 0.5 87 | pi_matrix[j, 3] = 0.5 88 | elif j == 9999: 89 | pi_matrix[j, 1] = 0.5 90 | pi_matrix[j, 2] = 0.5 91 | elif 1 <= j <= 98: 92 | pi_matrix[j, 0] = 1.0 / 3.0 93 | pi_matrix[j, 2] = 1.0 / 3.0 94 | pi_matrix[j, 3] = 1.0 / 3.0 95 | elif 9901 <= j <= 9998: 96 | pi_matrix[j, 1] = 1.0 / 3.0 97 | pi_matrix[j, 2] = 1.0 / 3.0 98 | pi_matrix[j, 3] = 1.0 / 3.0 99 | elif j % 100 == 0: 100 | pi_matrix[j, 0] = 1.0 / 3.0 101 | pi_matrix[j, 1] = 1.0 / 3.0 102 | pi_matrix[j, 3] = 1.0 / 3.0 103 | elif (j+1) % 100 == 0: 104 | pi_matrix[j, 0] = 1.0 / 3.0 105 | pi_matrix[j, 1] = 1.0 / 3.0 106 | pi_matrix[j, 2] = 1.0 / 3.0 107 | else: 108 | pi_matrix[j] = 0.25 109 | 110 | # print 'Q_matrix =', Q_matrix 111 | # print 'pi_matrix =', pi_matrix 112 | # print 'C_matrix = ', C_matrix 113 | step = 0 114 | initialState = currentState 115 | while step != 100000: 116 | step += 1 117 | # print 'currentState= ', currentState 118 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 119 | actionProbability = pi_matrix[currentState - 1] 120 | # print 'actionProbability = ', actionProbability 121 | EastProb_Array.append(pi_matrix[initialState-1][3]) 122 | WestProb_Array.append(pi_matrix[initialState-1][2]) 123 | NorthProb_Array.append(pi_matrix[initialState-1][0]) 124 | currentAction = np.random.choice(4, 1, p=actionProbability) 125 | # print 'currentAction = ', currentAction 126 | # b) Observing reward and next state s' 127 | nextState = currentState + calculate_nextState_table[currentAction[0]] 128 | # print 'nextSate = ', nextState 129 | reward = calculate_reward(currentState, goal_state) 130 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 131 | # print 'max_Q = ', max_Q 132 | Q_matrix[currentState-1, currentAction[0]] = \ 133 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 134 | reward + gama * max_Q) 135 | # c) update estimate of average policy, pi_mean_matrix 136 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 137 | C_matrix[currentState-1] += 1 138 | for each_action in range(4): 139 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 140 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 141 | ) 142 | # d) Step pi closer to the optimal policy w.r.t Q 143 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 144 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 145 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 146 | sum_probability = np.sum(pi_matrix[currentState-1]) 147 | pi_matrix[currentState-1] /= sum_probability 148 | currentState = nextState 149 | print 'reached goal!' 
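# Recap of the per-step WoLF-PHC update performed in the loop above:
#   Q(s,a)  <-  (1 - alpha) * Q(s,a) + alpha * (reward + gama * max_a' Q(s',a'))
#   pi_mean is updated twice: as a running mean of pi over all steps, then with the
#           per-state increment (1 / C(s)) * (pi(s,a) - pi_mean(s,a)) for the visited state s
#   delta   =  delta_W if sum_a pi(s,a) * Q(s,a) > sum_a pi_mean(s,a) * Q(s,a), else delta_L
#           ("win" uses the smaller step delta_W = 0.4, "lose" the larger step delta_L = 0.8)
#   pi(s,a) only the probability of the action just taken is adjusted: it grows by the summed
#           caps min(pi(s,i), delta / 3) over the non-greedy actions i when it is the greedy
#           (argmax-Q) action, and shrinks by min(pi(s,a), delta / 3) otherwise; the row
#           pi(s, .) is then re-normalized to sum to 1.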
150 | print 'total steps = ', step 151 | # print 'xArray= ', xArray 152 | # print 'yArray= ', yArray 153 | return Q_matrix, pi_matrix, pi_mean_matrix, EastProb_Array, WestProb_Array, NorthProb_Array 154 | 155 | 156 | # --------------------- All agents have the same goal ------------------------ # 157 | 158 | Agent1_position = [25.0, 0] 159 | Agent2_position = [75.0, 0] 160 | 161 | 162 | calculate_nextState_table = (100, -100, -1, 1) 163 | alpha = 0.5 164 | gama = 0.4 165 | delta_L = 0.8 166 | delta_W = 0.4 167 | 168 | 169 | Agent1_goal = 5550 170 | Agent2_goal = 5550 171 | 172 | 173 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 174 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 175 | 176 | x1 = Agent1_trainingResult[3] 177 | y1 = Agent1_trainingResult[5] 178 | x2 = Agent2_trainingResult[4] 179 | y2 = Agent2_trainingResult[5] 180 | 181 | 182 | x1 = np.asarray(Agent1_trainingResult[3]) 183 | y1 = np.asarray(Agent1_trainingResult[5]) 184 | x2 = np.asarray(Agent2_trainingResult[4]) 185 | y2 = np.asarray(Agent2_trainingResult[5]) 186 | 187 | 188 | fig, axes = plt.subplots(1, 2, figsize=(10, 4)) 189 | axes[0].plot(x1, y1) 190 | axes[0].set_title("Agent1 Policy Trajectory") 191 | axes[0].set_xlabel("Pr(East)") 192 | axes[0].set_ylabel("Pr(North)") 193 | axes[0].set_xlim([0, 1]) 194 | axes[0].set_ylim([0, 1]) 195 | axes[1].plot(x2, y2) 196 | axes[1].set_title("Agent2 Policy Trajectory") 197 | axes[1].set_xlabel("Pr(West)") 198 | axes[1].set_ylabel("Pr(North)") 199 | axes[1].set_xlim([0, 1]) 200 | axes[1].set_ylim([0, 1]) 201 | plt.show() 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /WoLF-PHC, MAS, diffrt goals, normalized Q value.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def find_state_from_position(position): 7 | x = position[0] 8 | y = position[1] 9 | if x != 100 and y != 100: 10 | state_num = (y//1) * 100 + ((x//1) + 1) 11 | elif x == 100 and y != 100: 12 | state_num = (y//1) * 100 + x 13 | elif y == 100 and x != 100: 14 | state_num = (y-1)*100 + ((x//1) + 1) 15 | else: 16 | state_num = 10000 17 | return int(state_num) 18 | 19 | 20 | def calculate_reward(previous_state, current_state, goal_state): 21 | if current_state == goal_state: 22 | r = 20 23 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100) \ 24 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 25 | r = 5 26 | else: 27 | r = -5 28 | return r 29 | 30 | 31 | def find_max_q_at_state_s(s, matrix): 32 | q_list_at_state_s = matrix[s-1] 33 | max_q_value = np.max(q_list_at_state_s) 34 | return max_q_value 35 | 36 | 37 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 38 | left_side = 0 39 | right_side = 0 40 | for action in range(4): 41 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 42 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 43 | if left_side > right_side: 44 | learning_rate = win_learning_rate 45 | else: 46 | learning_rate = lose_learning_rate 47 | return learning_rate 48 | 49 | 50 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 51 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 52 | return np.min(compare_list) 53 | 54 | 55 | def 
calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 56 | q_values_list = matrix_q[state-1] 57 | arg_max_a = np.argmax(q_values_list) 58 | if action != arg_max_a: 59 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 60 | else: 61 | result = 0 62 | for i in range(4): 63 | if i != arg_max_a: 64 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 65 | return result 66 | 67 | 68 | def wolf_phc_learning(position, goal_state): 69 | normalized_Q = [] 70 | currentState = find_state_from_position(position) 71 | Q_matrix = np.zeros((10000, 4)) 72 | pi_matrix = np.zeros((10000, 4)) 73 | pi_mean_matrix = np.zeros((10000, 4)) 74 | C_matrix = np.zeros((10000, 1), dtype=np.int) 75 | # Initialize pi-matrix 76 | pi_matrix.fill(0) 77 | for j in range(0, 10000): 78 | if j == 0: 79 | pi_matrix[j, 0] = 0.5 80 | pi_matrix[j, 3] = 0.5 81 | elif j == 99: 82 | pi_matrix[j, 0] = 0.5 83 | pi_matrix[j, 2] = 0.5 84 | elif j == 9900: 85 | pi_matrix[j, 1] = 0.5 86 | pi_matrix[j, 3] = 0.5 87 | elif j == 9999: 88 | pi_matrix[j, 1] = 0.5 89 | pi_matrix[j, 2] = 0.5 90 | elif 1 <= j <= 98: 91 | pi_matrix[j, 0] = 1.0 / 3.0 92 | pi_matrix[j, 2] = 1.0 / 3.0 93 | pi_matrix[j, 3] = 1.0 / 3.0 94 | elif 9901 <= j <= 9998: 95 | pi_matrix[j, 1] = 1.0 / 3.0 96 | pi_matrix[j, 2] = 1.0 / 3.0 97 | pi_matrix[j, 3] = 1.0 / 3.0 98 | elif j % 100 == 0: 99 | pi_matrix[j, 0] = 1.0 / 3.0 100 | pi_matrix[j, 1] = 1.0 / 3.0 101 | pi_matrix[j, 3] = 1.0 / 3.0 102 | elif (j+1) % 100 == 0: 103 | pi_matrix[j, 0] = 1.0 / 3.0 104 | pi_matrix[j, 1] = 1.0 / 3.0 105 | pi_matrix[j, 2] = 1.0 / 3.0 106 | else: 107 | pi_matrix[j] = 0.25 108 | 109 | step = 0 110 | normalized_Q.append(math.sqrt(np.sum((Q_matrix ** 2)))) 111 | while step != 1000: 112 | step += 1 113 | # print 'currentState= ', currentState 114 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 115 | actionProbability = pi_matrix[currentState - 1] 116 | # print 'actionProbability = ', actionProbability 117 | currentAction = np.random.choice(4, 1, p=actionProbability) 118 | # print 'currentAction = ', currentAction 119 | # b) Observing reward and next state s' 120 | nextState = currentState + calculate_nextState_table[currentAction[0]] 121 | # print 'nextSate = ', nextState 122 | reward = calculate_reward(currentState, nextState, goal_state) 123 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 124 | # print 'max_Q = ', max_Q 125 | Q_matrix[currentState-1, currentAction[0]] = \ 126 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 127 | reward + gama * max_Q) 128 | # c) update estimate of average policy, pi_mean_matrix 129 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 130 | C_matrix[currentState-1] += 1 131 | for each_action in range(4): 132 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 133 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 134 | ) 135 | # d) Step pi closer to the optimal policy w.r.t Q 136 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 137 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 138 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 139 | sum_probability = np.sum(pi_matrix[currentState-1]) 140 | pi_matrix[currentState-1] /= sum_probability 141 | currentState = nextState 142 | # caculate normalized Q value 143 | Q_value = 
math.sqrt(np.sum((Q_matrix ** 2))) 144 | normalized_Q.append(Q_value) 145 | print 'reached goal!' 146 | print 'total steps = ', step 147 | return Q_matrix, pi_matrix, pi_mean_matrix, normalized_Q 148 | 149 | 150 | # --------------------- Each agent has different goal ------------------------ # 151 | 152 | Agent1_position = [55.0, 46.0] 153 | Agent2_position = [28.0, 70.0] 154 | Agent3_position = [24.0, 50.0] 155 | Agent4_position = [77.0, 45.0] 156 | Agent5_position = [20.8, 28.8] 157 | 158 | calculate_nextState_table = (100, -100, -1, 1) 159 | alpha = 0.5 160 | gama = 0.4 161 | delta_L = 0.8 162 | delta_W = 0.4 163 | 164 | Agent1_goal = 3228 165 | Agent2_goal = 6799 166 | Agent3_goal = 4923 167 | Agent4_goal = 5054 168 | Agent5_goal = 8179 169 | 170 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 171 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 172 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 173 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 174 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 175 | 176 | Agent1_normalized_Q = np.asarray(Agent1_trainingResult[3]) 177 | Agent2_normalized_Q = np.asarray(Agent2_trainingResult[3]) 178 | Agent3_normalized_Q = np.asarray(Agent3_trainingResult[3]) 179 | Agent4_normalized_Q = np.asarray(Agent4_trainingResult[3]) 180 | Agent5_normalized_Q = np.asarray(Agent5_trainingResult[3]) 181 | 182 | x = np.arange(1001) 183 | plt.plot(x, Agent1_normalized_Q, label="Agent1") 184 | plt.plot(x, Agent2_normalized_Q, label="Agent2") 185 | plt.plot(x, Agent3_normalized_Q, label="Agent3") 186 | plt.plot(x, Agent4_normalized_Q, label="Agent4") 187 | plt.plot(x, Agent5_normalized_Q, label="Agent5") 188 | plt.legend() 189 | plt.title("WoLF-PHC MAS, normalized Q value") 190 | plt.show() 191 | 192 | -------------------------------------------------------------------------------- /WoLF-PHC, MAS, diffrt goals, cumulative reward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def find_state_from_position(position): 7 | x = position[0] 8 | y = position[1] 9 | if x != 100 and y != 100: 10 | state_num = (y//1) * 100 + ((x//1) + 1) 11 | elif x == 100 and y != 100: 12 | state_num = (y//1) * 100 + x 13 | elif y == 100 and x != 100: 14 | state_num = (y-1)*100 + ((x//1) + 1) 15 | else: 16 | state_num = 10000 17 | return int(state_num) 18 | 19 | 20 | def calculate_reward(previous_state, current_state, goal_state): 21 | if current_state == goal_state: 22 | r = 20 23 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100) \ 24 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 25 | r = 5 26 | else: 27 | r = -5 28 | return r 29 | 30 | 31 | def find_max_q_at_state_s(s, matrix): 32 | q_list_at_state_s = matrix[s-1] 33 | max_q_value = np.max(q_list_at_state_s) 34 | return max_q_value 35 | 36 | 37 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 38 | left_side = 0 39 | right_side = 0 40 | for action in range(4): 41 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 42 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 43 | if left_side > right_side: 44 | learning_rate = win_learning_rate 45 | else: 46 | learning_rate = lose_learning_rate 47 | return 
learning_rate 48 | 49 | 50 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 51 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 52 | return np.min(compare_list) 53 | 54 | 55 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 56 | q_values_list = matrix_q[state-1] 57 | arg_max_a = np.argmax(q_values_list) 58 | if action != arg_max_a: 59 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 60 | else: 61 | result = 0 62 | for i in range(4): 63 | if i != arg_max_a: 64 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 65 | return result 66 | 67 | 68 | def wolf_phc_learning(position, goal_state): 69 | cumulRwrdList = [] 70 | currentState = find_state_from_position(position) 71 | Q_matrix = np.zeros((10000, 4)) 72 | pi_matrix = np.zeros((10000, 4)) 73 | pi_mean_matrix = np.zeros((10000, 4)) 74 | C_matrix = np.zeros((10000, 1), dtype=np.int) 75 | # Initialize pi-matrix 76 | pi_matrix.fill(0) 77 | for j in range(0, 10000): 78 | if j == 0: 79 | pi_matrix[j, 0] = 0.5 80 | pi_matrix[j, 3] = 0.5 81 | elif j == 99: 82 | pi_matrix[j, 0] = 0.5 83 | pi_matrix[j, 2] = 0.5 84 | elif j == 9900: 85 | pi_matrix[j, 1] = 0.5 86 | pi_matrix[j, 3] = 0.5 87 | elif j == 9999: 88 | pi_matrix[j, 1] = 0.5 89 | pi_matrix[j, 2] = 0.5 90 | elif 1 <= j <= 98: 91 | pi_matrix[j, 0] = 1.0 / 3.0 92 | pi_matrix[j, 2] = 1.0 / 3.0 93 | pi_matrix[j, 3] = 1.0 / 3.0 94 | elif 9901 <= j <= 9998: 95 | pi_matrix[j, 1] = 1.0 / 3.0 96 | pi_matrix[j, 2] = 1.0 / 3.0 97 | pi_matrix[j, 3] = 1.0 / 3.0 98 | elif j % 100 == 0: 99 | pi_matrix[j, 0] = 1.0 / 3.0 100 | pi_matrix[j, 1] = 1.0 / 3.0 101 | pi_matrix[j, 3] = 1.0 / 3.0 102 | elif (j+1) % 100 == 0: 103 | pi_matrix[j, 0] = 1.0 / 3.0 104 | pi_matrix[j, 1] = 1.0 / 3.0 105 | pi_matrix[j, 2] = 1.0 / 3.0 106 | else: 107 | pi_matrix[j] = 0.25 108 | 109 | step = 0 110 | cumulativeReward = 0 111 | cumulRwrdList.append(cumulativeReward) 112 | while step != 1000: 113 | step += 1 114 | # print 'currentState= ', currentState 115 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 116 | actionProbability = pi_matrix[currentState - 1] 117 | # print 'actionProbability = ', actionProbability 118 | currentAction = np.random.choice(4, 1, p=actionProbability) 119 | # print 'currentAction = ', currentAction 120 | # b) Observing reward and next state s' 121 | nextState = currentState + calculate_nextState_table[currentAction[0]] 122 | # print 'nextSate = ', nextState 123 | reward = calculate_reward(currentState, nextState, goal_state) 124 | cumulativeReward += reward 125 | cumulRwrdList.append(cumulativeReward) 126 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 127 | # print 'max_Q = ', max_Q 128 | Q_matrix[currentState-1, currentAction[0]] = \ 129 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 130 | reward + gama * max_Q) 131 | # c) update estimate of average policy, pi_mean_matrix 132 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 133 | C_matrix[currentState-1] += 1 134 | for each_action in range(4): 135 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 136 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 137 | ) 138 | # d) Step pi closer to the optimal policy w.r.t Q 139 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 140 | triangle_delta_sa = 
calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 141 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 142 | sum_probability = np.sum(pi_matrix[currentState-1]) 143 | pi_matrix[currentState-1] /= sum_probability 144 | currentState = nextState 145 | # caculate normalized Q value 146 | Q_value = math.sqrt(np.sum((Q_matrix ** 2))) 147 | print 'reached goal!' 148 | print 'total steps = ', step 149 | return Q_matrix, pi_matrix, pi_mean_matrix, cumulRwrdList 150 | 151 | 152 | # --------------------- Each agent has different goal ------------------------ # 153 | 154 | Agent1_position = [55.0, 46.0] 155 | Agent2_position = [28.0, 70.0] 156 | Agent3_position = [24.0, 50.0] 157 | Agent4_position = [77.0, 45.0] 158 | Agent5_position = [20.8, 28.8] 159 | 160 | calculate_nextState_table = (100, -100, -1, 1) 161 | alpha = 0.5 162 | gama = 0.4 163 | delta_L = 0.8 164 | delta_W = 0.4 165 | 166 | Agent1_goal = 3228 167 | Agent2_goal = 6799 168 | Agent3_goal = 4923 169 | Agent4_goal = 5054 170 | Agent5_goal = 8179 171 | 172 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 173 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 174 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 175 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 176 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 177 | 178 | Agent1_cumulRwrd = np.asarray(Agent1_trainingResult[3]) 179 | Agent2_cumulRwrd = np.asarray(Agent2_trainingResult[3]) 180 | Agent3_cumulRwrd = np.asarray(Agent3_trainingResult[3]) 181 | Agent4_cumulRwrd = np.asarray(Agent4_trainingResult[3]) 182 | Agent5_cumulRwrd = np.asarray(Agent5_trainingResult[3]) 183 | 184 | x = np.arange(1001) 185 | plt.plot(x, Agent1_cumulRwrd, label="Agent1") 186 | plt.plot(x, Agent2_cumulRwrd, label="Agent2") 187 | plt.plot(x, Agent3_cumulRwrd, label="Agent3") 188 | plt.plot(x, Agent4_cumulRwrd, label="Agent4") 189 | plt.plot(x, Agent5_cumulRwrd, label="Agent5") 190 | plt.legend() 191 | plt.title("WoLF-PHC MAS, cumulative reward") 192 | plt.show() 193 | 194 | -------------------------------------------------------------------------------- /WoLF-PHC with Mobility Simulation,diffrt goals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def find_state_from_position(position): 6 | x = position[0] 7 | y = position[1] 8 | if x != 100 and y != 100: 9 | state_num = (y//1) * 100 + ((x//1) + 1) 10 | elif x == 100 and y != 100: 11 | state_num = (y//1) * 100 + x 12 | elif y == 100 and x != 100: 13 | state_num = (y-1)*100 + ((x//1) + 1) 14 | else: 15 | state_num = 10000 16 | return int(state_num) 17 | 18 | 19 | def calculate_reward(current_state, goal_state): 20 | if current_state == goal_state: 21 | r = 20 22 | else: 23 | r = 0 24 | return r 25 | 26 | 27 | def find_max_q_at_state_s(s, matrix): 28 | q_list_at_state_s = matrix[s-1] 29 | max_q_value = np.max(q_list_at_state_s) 30 | return max_q_value 31 | 32 | 33 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 34 | left_side = 0 35 | right_side = 0 36 | for action in range(4): 37 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 38 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 39 | if left_side > right_side: 40 | learning_rate = win_learning_rate 41 | else: 42 | learning_rate 
= lose_learning_rate 43 | return learning_rate 44 | 45 | 46 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 47 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 48 | return np.min(compare_list) 49 | 50 | 51 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 52 | q_values_list = matrix_q[state-1] 53 | arg_max_a = np.argmax(q_values_list) 54 | if action != arg_max_a: 55 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 56 | else: 57 | result = 0 58 | for i in range(4): 59 | if i != arg_max_a: 60 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 61 | return result 62 | 63 | 64 | def wolf_phc_learning(position, goal_state): 65 | xArray = [] 66 | yArray = [] 67 | currentState = find_state_from_position(position) 68 | print 'currentState = ',currentState 69 | # print 'currentState is: ', currentState 70 | Q_matrix = np.zeros((10000, 4)) 71 | pi_matrix = np.zeros((10000, 4)) 72 | pi_mean_matrix = np.zeros((10000, 4)) 73 | C_matrix = np.zeros((10000, 1), dtype=np.int) 74 | # Initialize pi-matrix 75 | pi_matrix.fill(0) 76 | for j in range(0, 10000): 77 | if j == 0: 78 | pi_matrix[j, 0] = 0.5 79 | pi_matrix[j, 3] = 0.5 80 | elif j == 99: 81 | pi_matrix[j, 0] = 0.5 82 | pi_matrix[j, 2] = 0.5 83 | elif j == 9900: 84 | pi_matrix[j, 1] = 0.5 85 | pi_matrix[j, 3] = 0.5 86 | elif j == 9999: 87 | pi_matrix[j, 1] = 0.5 88 | pi_matrix[j, 2] = 0.5 89 | elif 1 <= j <= 98: 90 | pi_matrix[j, 0] = 1.0 / 3.0 91 | pi_matrix[j, 2] = 1.0 / 3.0 92 | pi_matrix[j, 3] = 1.0 / 3.0 93 | elif 9901 <= j <= 9998: 94 | pi_matrix[j, 1] = 1.0 / 3.0 95 | pi_matrix[j, 2] = 1.0 / 3.0 96 | pi_matrix[j, 3] = 1.0 / 3.0 97 | elif j % 100 == 0: 98 | pi_matrix[j, 0] = 1.0 / 3.0 99 | pi_matrix[j, 1] = 1.0 / 3.0 100 | pi_matrix[j, 3] = 1.0 / 3.0 101 | elif (j+1) % 100 == 0: 102 | pi_matrix[j, 0] = 1.0 / 3.0 103 | pi_matrix[j, 1] = 1.0 / 3.0 104 | pi_matrix[j, 2] = 1.0 / 3.0 105 | else: 106 | pi_matrix[j] = 0.25 107 | 108 | # print 'Q_matrix =', Q_matrix 109 | # print 'pi_matrix =', pi_matrix 110 | # print 'C_matrix = ', C_matrix 111 | step = 0 112 | while step != 10000: 113 | step += 1 114 | # print 'currentState= ', currentState 115 | yArray.append(currentState/100) 116 | xArray.append(currentState % 100) 117 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 118 | actionProbability = pi_matrix[currentState - 1] 119 | # print 'actionProbability = ', actionProbability 120 | currentAction = np.random.choice(4, 1, p=actionProbability) 121 | # print 'currentAction = ', currentAction 122 | # b) Observing reward and next state s' 123 | nextState = currentState + calculate_nextState_table[currentAction[0]] 124 | # print 'nextSate = ', nextState 125 | reward = calculate_reward(nextState, goal_state) 126 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 127 | # print 'max_Q = ', max_Q 128 | Q_matrix[currentState-1, currentAction[0]] = \ 129 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 130 | reward + gama * max_Q) 131 | # c) update estimate of average policy, pi_mean_matrix 132 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 133 | C_matrix[currentState-1] += 1 134 | for each_action in range(4): 135 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 136 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 137 | ) 138 | # d) Step pi closer to the optimal policy 
w.r.t Q 139 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 140 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 141 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 142 | sum_probability = np.sum(pi_matrix[currentState-1]) 143 | pi_matrix[currentState-1] /= sum_probability 144 | currentState = nextState 145 | print 'reached goal!' 146 | print 'total steps = ', step 147 | # print 'xArray= ', xArray 148 | # print 'yArray= ', yArray 149 | return Q_matrix, pi_matrix, pi_mean_matrix, xArray, yArray 150 | 151 | 152 | # --------------------- Each agent has different goal ------------------------ # 153 | 154 | Agent1_position = [55.0, 46.0] 155 | Agent2_position = [28.0, 70.0] 156 | Agent3_position = [24.0, 50.0] 157 | Agent4_position = [77.0, 45.0] 158 | 159 | calculate_nextState_table = (100, -100, -1, 1) 160 | alpha = 0.5 161 | gama = 0.4 162 | delta_L = 0.8 163 | delta_W = 0.4 164 | 165 | Agent1_goal = 3228 166 | Agent2_goal = 8799 167 | Agent3_goal = 4923 168 | Agent4_goal = 1003 169 | 170 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 171 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 172 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 173 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 174 | 175 | Agent1_xArray = Agent1_trainingResult[3] 176 | Agent1_yArray = Agent1_trainingResult[4] 177 | print "Agent1 start with (", Agent1_xArray[0], ",", Agent1_yArray[0], ")" 178 | print "Agent1 end with (", 28, ",", 32, ")" 179 | 180 | # Agent2_xArray = Agent2_trainingResult[3] 181 | # Agent2_yArray = Agent2_trainingResult[4] 182 | # print "Agent2 start with (", Agent2_xArray[0], ",", Agent2_yArray[0], ")" 183 | # print "Agent2 end with (", 99, ",", 87, ")" 184 | # 185 | # Agent3_xArray = Agent3_trainingResult[3] 186 | # Agent3_yArray = Agent3_trainingResult[4] 187 | # print "Agent3 start with (", Agent3_xArray[0], ",", Agent3_yArray[0], ")" 188 | # print "Agent3 end with (", 23, ",", 49, ")" 189 | # 190 | # Agent4_xArray = Agent4_trainingResult[3] 191 | # Agent4_yArray = Agent4_trainingResult[4] 192 | # print "Agent4 start with (", Agent4_xArray[0], ",", Agent4_yArray[0], ")" 193 | # print "Agent4 end with (", 3, ",", 10, ")" 194 | 195 | 196 | plt.plot(Agent1_xArray, Agent1_yArray, label="Agent1") 197 | # plt.plot(Agent2_xArray, Agent2_yArray, label="Agent2") 198 | # plt.plot(Agent3_xArray, Agent3_yArray, label="Agent3") 199 | # plt.plot(Agent4_xArray, Agent4_yArray, label="Agent4") 200 | plt.axis([0, 100, 0, 100]) 201 | plt.legend() 202 | plt.title("WoLF-PHC learning track with Multiagent System") 203 | plt.show() 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /WoLF-PHC with Mobility Simulation,same goal.py: -------------------------------------------------------------------------------- 1 | from pymobility.models.mobility import random_waypoint 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def find_state_from_position(position): 7 | x = position[0] 8 | y = position[1] 9 | if x != 100 and y != 100: 10 | state_num = (y//1) * 100 + ((x//1) + 1) 11 | elif x == 100 and y != 100: 12 | state_num = (y//1) * 100 + x 13 | elif y == 100 and x != 100: 14 | state_num = (y-1)*100 + ((x//1) + 1) 15 | else: 16 | state_num = 10000 17 | return int(state_num) 18 | 19 | 20 | def calculate_reward(previous_state, 
current_state, goal_state): 21 | if current_state == goal_state: 22 | r = 20 23 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100)\ 24 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 25 | r = 5 26 | else: 27 | r = -5 28 | return r 29 | 30 | 31 | def find_max_q_at_state_s(s, matrix): 32 | q_list_at_state_s = matrix[s-1] 33 | max_q_value = np.max(q_list_at_state_s) 34 | return max_q_value 35 | 36 | 37 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 38 | left_side = 0 39 | right_side = 0 40 | for action in range(4): 41 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 42 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 43 | if left_side > right_side: 44 | learning_rate = win_learning_rate 45 | else: 46 | learning_rate = lose_learning_rate 47 | return learning_rate 48 | 49 | 50 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 51 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 52 | return np.min(compare_list) 53 | 54 | 55 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 56 | q_values_list = matrix_q[state-1] 57 | arg_max_a = np.argmax(q_values_list) 58 | if action != arg_max_a: 59 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 60 | else: 61 | result = 0 62 | for i in range(4): 63 | if i != arg_max_a: 64 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 65 | return result 66 | 67 | 68 | def wolf_phc_learning(position, goal_state): 69 | xArray = [] 70 | yArray = [] 71 | currentState = find_state_from_position(position) 72 | # print 'currentState is: ', currentState 73 | Q_matrix = np.zeros((10000, 4)) 74 | pi_matrix = np.zeros((10000, 4)) 75 | pi_mean_matrix = np.zeros((10000, 4)) 76 | C_matrix = np.zeros((10000, 1), dtype=np.int) 77 | # Initialize pi-matrix 78 | pi_matrix.fill(0) 79 | for j in range(0, 10000): 80 | if j == 0: 81 | pi_matrix[j, 0] = 0.5 82 | pi_matrix[j, 3] = 0.5 83 | elif j == 99: 84 | pi_matrix[j, 0] = 0.5 85 | pi_matrix[j, 2] = 0.5 86 | elif j == 9900: 87 | pi_matrix[j, 1] = 0.5 88 | pi_matrix[j, 3] = 0.5 89 | elif j == 9999: 90 | pi_matrix[j, 1] = 0.5 91 | pi_matrix[j, 2] = 0.5 92 | elif 1 <= j <= 98: 93 | pi_matrix[j, 0] = 1.0 / 3.0 94 | pi_matrix[j, 2] = 1.0 / 3.0 95 | pi_matrix[j, 3] = 1.0 / 3.0 96 | elif 9901 <= j <= 9998: 97 | pi_matrix[j, 1] = 1.0 / 3.0 98 | pi_matrix[j, 2] = 1.0 / 3.0 99 | pi_matrix[j, 3] = 1.0 / 3.0 100 | elif j % 100 == 0: 101 | pi_matrix[j, 0] = 1.0 / 3.0 102 | pi_matrix[j, 1] = 1.0 / 3.0 103 | pi_matrix[j, 3] = 1.0 / 3.0 104 | elif (j+1) % 100 == 0: 105 | pi_matrix[j, 0] = 1.0 / 3.0 106 | pi_matrix[j, 1] = 1.0 / 3.0 107 | pi_matrix[j, 2] = 1.0 / 3.0 108 | else: 109 | pi_matrix[j] = 0.25 110 | 111 | # print 'Q_matrix =', Q_matrix 112 | # print 'pi_matrix =', pi_matrix 113 | # print 'C_matrix = ', C_matrix 114 | step = 0 115 | while currentState != goal_state: 116 | step += 1 117 | # print 'currentState= ', currentState 118 | yArray.append(currentState/100) 119 | xArray.append(currentState % 100) 120 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 121 | actionProbability = pi_matrix[currentState - 1] 122 | # print 'actionProbability = ', actionProbability 123 | currentAction = np.random.choice(4, 1, p=actionProbability) 124 | # print 'currentAction = ', currentAction 125 | # b) 
Observing reward and next state s' 126 | nextState = currentState + calculate_nextState_table[currentAction[0]] 127 | # print 'nextSate = ', nextState 128 | reward = calculate_reward(currentState, nextState, goal_state) 129 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 130 | # print 'max_Q = ', max_Q 131 | Q_matrix[currentState-1, currentAction[0]] = \ 132 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 133 | reward + gama * max_Q) 134 | # c) update estimate of average policy, pi_mean_matrix 135 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 136 | C_matrix[currentState-1] += 1 137 | for each_action in range(4): 138 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 139 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 140 | ) 141 | # d) Step pi closer to the optimal policy w.r.t Q 142 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 143 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 144 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 145 | sum_probability = np.sum(pi_matrix[currentState-1]) 146 | pi_matrix[currentState-1] /= sum_probability 147 | currentState = nextState 148 | print 'reached goal!' 149 | print 'total steps = ', step 150 | # print 'xArray= ', xArray 151 | # print 'yArray= ', yArray 152 | return Q_matrix, pi_matrix, pi_mean_matrix, xArray, yArray 153 | 154 | 155 | # --------------------- All agents have the same goal ------------------------ # 156 | 157 | 158 | rw = random_waypoint(5, dimensions=(100, 100), velocity=(0.1, 1.0), wt_max=1.0) 159 | positions = next(rw) 160 | # print 'positions= ',positions 161 | Agent1_position = positions[0] 162 | print 'Agent1_position= ',Agent1_position 163 | Agent2_position = positions[1] 164 | Agent3_position = positions[2] 165 | Agent4_position = positions[3] 166 | Agent5_position = positions[4] 167 | 168 | calculate_nextState_table = (100, -100, -1, 1) 169 | alpha = 0.5 170 | gama = 0.4 171 | delta_L = 0.8 172 | delta_W = 0.4 173 | 174 | Agent1_goal = 3228 175 | Agent2_goal = 3228 176 | Agent3_goal = 3228 177 | Agent4_goal = 3228 178 | Agent5_goal = 3228 179 | 180 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 181 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 182 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 183 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 184 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 185 | 186 | Agent1_xArray = Agent1_trainingResult[3] 187 | Agent1_yArray = Agent1_trainingResult[4] 188 | print "Agent1 start with (", Agent1_xArray[0], ",", Agent1_yArray[0], ")" 189 | print "Agent1 end with (", 28, ",", 32, ")" 190 | 191 | Agent2_xArray = Agent2_trainingResult[3] 192 | Agent2_yArray = Agent2_trainingResult[4] 193 | print "Agent2 start with (", Agent2_xArray[0], ",", Agent2_yArray[0], ")" 194 | print "Agent2 end with (", 99, ",", 87, ")" 195 | 196 | Agent3_xArray = Agent3_trainingResult[3] 197 | Agent3_yArray = Agent3_trainingResult[4] 198 | print "Agent3 start with (", Agent3_xArray[0], ",", Agent3_yArray[0], ")" 199 | print "Agent3 end with (", 23, ",", 49, ")" 200 | 201 | Agent4_xArray = Agent4_trainingResult[3] 202 | Agent4_yArray = Agent4_trainingResult[4] 203 | print "Agent4 start with (", Agent4_xArray[0], ",", Agent4_yArray[0], ")" 204 
| print "Agent4 end with (", 3, ",", 10, ")" 205 | 206 | Agent5_xArray = Agent5_trainingResult[3] 207 | Agent5_yArray = Agent5_trainingResult[4] 208 | print "Agent5 start with (", Agent5_xArray[0], ",", Agent5_yArray[0], ")" 209 | print "Agent5 end with (", 27, ",", 63, ")" 210 | 211 | plt.plot(Agent1_xArray, Agent1_yArray, label="Agent1") 212 | plt.plot(Agent2_xArray, Agent2_yArray, label="Agent2") 213 | plt.plot(Agent3_xArray, Agent3_yArray, label="Agent3") 214 | plt.plot(Agent4_xArray, Agent4_yArray, label="Agent4") 215 | plt.plot(Agent5_xArray, Agent5_yArray, label="Agent5") 216 | plt.axis([0, 100, 0, 100]) 217 | plt.legend() 218 | plt.title("WoLF-PHC learning track with Multiagent System") 219 | plt.show() 220 | 221 | # Agent1_pi_matrix = wolf_phc_learning(Agent1_position, Agent1_goal)[1] 222 | # Agent2_pi_matrix = wolf_phc_learning(Agent2_position, Agent2_goal)[1] 223 | # Agent3_pi_matrix = wolf_phc_learning(Agent3_position, Agent3_goal)[1] 224 | # Agent4_pi_matrix = wolf_phc_learning(Agent4_position, Agent4_goal)[1] 225 | # Agent5_pi_matrix = wolf_phc_learning(Agent5_position, Agent5_goal)[1] 226 | 227 | # print 'Agent1_pi_matrix = ', Agent1_pi_matrix 228 | # print 'Agent2_pi_matrix = ', Agent2_pi_matrix 229 | # print 'Agent3_pi_matrix = ', Agent3_pi_matrix 230 | # print 'Agent4_pi_matrix = ', Agent4_pi_matrix 231 | # print 'Agent5_pi_matrix = ', Agent5_pi_matrix 232 | 233 | 234 | -------------------------------------------------------------------------------- /WoLF-PHC, MAS, diffrt goals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def find_state_from_position(position): 6 | x = position[0] 7 | y = position[1] 8 | if x != 100 and y != 100: 9 | state_num = (y//1) * 100 + ((x//1) + 1) 10 | elif x == 100 and y != 100: 11 | state_num = (y//1) * 100 + x 12 | elif y == 100 and x != 100: 13 | state_num = (y-1)*100 + ((x//1) + 1) 14 | else: 15 | state_num = 10000 16 | return int(state_num) 17 | 18 | 19 | def calculate_reward(previous_state, current_state, goal_state): 20 | if current_state == goal_state: 21 | r = 20 22 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100)\ 23 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 24 | r = 5 25 | else: 26 | r = -5 27 | return r 28 | 29 | 30 | def find_max_q_at_state_s(s, matrix): 31 | q_list_at_state_s = matrix[s-1] 32 | max_q_value = np.max(q_list_at_state_s) 33 | return max_q_value 34 | 35 | 36 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 37 | left_side = 0 38 | right_side = 0 39 | for action in range(4): 40 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 41 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 42 | if left_side > right_side: 43 | learning_rate = win_learning_rate 44 | else: 45 | learning_rate = lose_learning_rate 46 | return learning_rate 47 | 48 | 49 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 50 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 51 | return np.min(compare_list) 52 | 53 | 54 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 55 | q_values_list = matrix_q[state-1] 56 | arg_max_a = np.argmax(q_values_list) 57 | if action != arg_max_a: 58 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 59 | else: 60 | 
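# The chosen action is the greedy one (argmax_a Q(s,a)): its probability gain equals the
# total mass removed from the non-greedy actions, each contribution capped at
# min(pi(s,i), learning_rate / (4 - 1)).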
result = 0 61 | for i in range(4): 62 | if i != arg_max_a: 63 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 64 | return result 65 | 66 | 67 | def wolf_phc_learning(position, goal_state): 68 | xArray = [] 69 | yArray = [] 70 | timestamp = {} 71 | currentState = find_state_from_position(position) 72 | print "initial state = ", currentState 73 | # print 'currentState is: ', currentState 74 | Q_matrix = np.zeros((10000, 4)) 75 | pi_matrix = np.zeros((10000, 4)) 76 | pi_mean_matrix = np.zeros((10000, 4)) 77 | C_matrix = np.zeros((10000, 1), dtype=np.int) 78 | # Initialize pi-matrix 79 | pi_matrix.fill(0) 80 | for j in range(0, 10000): 81 | if j == 0: 82 | pi_matrix[j, 0] = 0.5 83 | pi_matrix[j, 3] = 0.5 84 | elif j == 99: 85 | pi_matrix[j, 0] = 0.5 86 | pi_matrix[j, 2] = 0.5 87 | elif j == 9900: 88 | pi_matrix[j, 1] = 0.5 89 | pi_matrix[j, 3] = 0.5 90 | elif j == 9999: 91 | pi_matrix[j, 1] = 0.5 92 | pi_matrix[j, 2] = 0.5 93 | elif 1 <= j <= 98: 94 | pi_matrix[j, 0] = 1.0 / 3.0 95 | pi_matrix[j, 2] = 1.0 / 3.0 96 | pi_matrix[j, 3] = 1.0 / 3.0 97 | elif 9901 <= j <= 9998: 98 | pi_matrix[j, 1] = 1.0 / 3.0 99 | pi_matrix[j, 2] = 1.0 / 3.0 100 | pi_matrix[j, 3] = 1.0 / 3.0 101 | elif j % 100 == 0: 102 | pi_matrix[j, 0] = 1.0 / 3.0 103 | pi_matrix[j, 1] = 1.0 / 3.0 104 | pi_matrix[j, 3] = 1.0 / 3.0 105 | elif (j+1) % 100 == 0: 106 | pi_matrix[j, 0] = 1.0 / 3.0 107 | pi_matrix[j, 1] = 1.0 / 3.0 108 | pi_matrix[j, 2] = 1.0 / 3.0 109 | else: 110 | pi_matrix[j] = 0.25 111 | 112 | # print 'Q_matrix =', Q_matrix 113 | # print 'pi_matrix =', pi_matrix 114 | # print 'C_matrix = ', C_matrix 115 | step = 0 116 | while step != 10000: 117 | step += 1 118 | # print 'currentState= ', currentState 119 | yArray.append(currentState/100) 120 | xArray.append(currentState % 100) 121 | timestamp[step]= currentState 122 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 123 | actionProbability = pi_matrix[currentState - 1] 124 | # print 'actionProbability = ', actionProbability 125 | currentAction = np.random.choice(4, 1, p=actionProbability) 126 | # print 'currentAction = ', currentAction 127 | # b) Observing reward and next state s' 128 | nextState = currentState + calculate_nextState_table[currentAction[0]] 129 | # print 'nextSate = ', nextState 130 | reward = calculate_reward(currentState, nextState, goal_state) 131 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 132 | # print 'max_Q = ', max_Q 133 | Q_matrix[currentState-1, currentAction[0]] = \ 134 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 135 | reward + gama * max_Q) 136 | # c) update estimate of average policy, pi_mean_matrix 137 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 138 | C_matrix[currentState-1] += 1 139 | for each_action in range(4): 140 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 141 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 142 | ) 143 | # d) Step pi closer to the optimal policy w.r.t Q 144 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 145 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 146 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 147 | sum_probability = np.sum(pi_matrix[currentState-1]) 148 | pi_matrix[currentState-1] /= sum_probability 149 | currentState = nextState 150 | print 'reached goal!' 
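# 'timestamp' maps step number -> the state occupied at that step. After all five agents
# are trained, the code at the bottom of this file merges the per-agent timestamp dicts
# and prints a warning for each step at which two or more agents occupy the same grid
# cell, i.e. a potential conflict between the UAV trajectories.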
151 | print 'total steps = ', step 152 | # print 'xArray= ', xArray 153 | # print 'yArray= ', yArray 154 | return Q_matrix, pi_matrix, pi_mean_matrix, xArray, yArray, timestamp 155 | 156 | 157 | # --------------------- Each agent has different goal ------------------------ # 158 | 159 | Agent1_position = [55.0, 46.0] 160 | Agent2_position = [28.0, 70.0] 161 | Agent3_position = [24.0, 50.0] 162 | Agent4_position = [77.0, 45.0] 163 | Agent5_position = [20.8, 28.8] 164 | 165 | calculate_nextState_table = (100, -100, -1, 1) 166 | alpha = 0.5 167 | gama = 0.4 168 | delta_L = 0.8 169 | delta_W = 0.4 170 | 171 | Agent1_goal = 3228 172 | Agent2_goal = 6799 173 | Agent3_goal = 4923 174 | Agent4_goal = 5054 175 | Agent5_goal = 8179 176 | 177 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 178 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 179 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 180 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 181 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 182 | 183 | Agent1_xArray = Agent1_trainingResult[3] 184 | Agent1_yArray = Agent1_trainingResult[4] 185 | # print "Agent1 start with (", Agent1_xArray[0], ",", Agent1_yArray[0], ")" 186 | # print "Agent1 end with (", 28, ",", 32, ")" 187 | 188 | Agent2_xArray = Agent2_trainingResult[3] 189 | Agent2_yArray = Agent2_trainingResult[4] 190 | # print "Agent2 start with (", Agent2_xArray[0], ",", Agent2_yArray[0], ")" 191 | # print "Agent2 end with (", 99, ",", 87, ")" 192 | 193 | Agent3_xArray = Agent3_trainingResult[3] 194 | Agent3_yArray = Agent3_trainingResult[4] 195 | # print "Agent3 start with (", Agent3_xArray[0], ",", Agent3_yArray[0], ")" 196 | # print "Agent3 end with (", 23, ",", 49, ")" 197 | 198 | Agent4_xArray = Agent4_trainingResult[3] 199 | Agent4_yArray = Agent4_trainingResult[4] 200 | # print "Agent4 start with (", Agent4_xArray[0], ",", Agent4_yArray[0], ")" 201 | # print "Agent4 end with (", 54, ",", 54, ")" 202 | 203 | Agent5_xArray = Agent5_trainingResult[3] 204 | Agent5_yArray = Agent5_trainingResult[4] 205 | # print "Agent5 start with (", Agent5_xArray[0], ",", Agent5_yArray[0], ")" 206 | # print "Agent5 end with (", 79, ",", 81, ")" 207 | 208 | Agent1_timestamp = Agent1_trainingResult[5] 209 | Agent2_timestamp = Agent2_trainingResult[5] 210 | Agent3_timestamp = Agent3_trainingResult[5] 211 | Agent4_timestamp = Agent4_trainingResult[5] 212 | Agent5_timestamp = Agent5_trainingResult[5] 213 | 214 | 215 | plt.plot(Agent1_xArray, Agent1_yArray, label="Agent1") 216 | plt.plot(Agent2_xArray, Agent2_yArray, label="Agent2") 217 | plt.plot(Agent3_xArray, Agent3_yArray, label="Agent3") 218 | plt.plot(Agent4_xArray, Agent4_yArray, label="Agent4") 219 | plt.plot(Agent5_xArray, Agent5_yArray, label="Agent5") 220 | plt.axis([0, 100, 0, 100]) 221 | plt.legend() 222 | plt.title("WoLF-PHC learning track with Multiagent System") 223 | plt.show() 224 | 225 | total_timestamp = {} 226 | for key in Agent1_timestamp: 227 | total_timestamp[key] = [] 228 | total_timestamp[key].append(Agent1_timestamp[key]) 229 | total_timestamp[key].append(Agent2_timestamp[key]) 230 | total_timestamp[key].append(Agent3_timestamp[key]) 231 | total_timestamp[key].append(Agent4_timestamp[key]) 232 | total_timestamp[key].append(Agent5_timestamp[key]) 233 | 234 | for key in total_timestamp: 235 | states_at_step = total_timestamp[key] 236 | if len(states_at_step) != len(set(states_at_step)): 237 | print 
"conflicts exist in MAS trajectories at step = ", key 238 | 239 | print "End of the Program !" 240 | --------------------------------------------------------------------------------