├── README.md ├── WoLF-PHC Grid Game,2agents,same goal, print policy.py ├── WoLF-PHC, MAS, diffrt goals, normalized Q value.py ├── WoLF-PHC, MAS, diffrt goals, cumulative reward.py ├── WoLF-PHC with Mobility Simulation,diffrt goals.py ├── WoLF-PHC with Mobility Simulation,same goal.py └── WoLF-PHC, MAS, diffrt goals.py /README.md: -------------------------------------------------------------------------------- 1 | # Realize The Automation of UAV Cluster by Reinforcement Learning Approach 2 | This project implements and tunes the WoLF-PHC (Win or Learn Fast, Policy Hill Climbing) reinforcement learning algorithm to increase the degree of automation of a UAV group: each UAV acts as an independent learning agent, which makes the system fully decentralized, and cooperation within the group is strengthened when performing a task. 3 | The project uses matplotlib to visualize the result data. 4 | -------------------------------------------------------------------------------- /WoLF-PHC Grid Game,2agents,same goal, print policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | 6 | 7 | def find_state_from_position(position): 8 | x = position[0] 9 | y = position[1] 10 | if x != 100 and y != 100: 11 | state_num = (y//1) * 100 + ((x//1) + 1) 12 | elif x == 100 and y != 100: 13 | state_num = (y//1) * 100 + x 14 | elif y == 100 and x != 100: 15 | state_num = (y-1)*100 + ((x//1) + 1) 16 | else: 17 | state_num = 10000 18 | return int(state_num) 19 | 20 | 21 | def calculate_reward(current_state, goal_state): 22 | if current_state == goal_state: 23 | r = 20 24 | else: 25 | r = 0 26 | return r 27 | 28 | 29 | def find_max_q_at_state_s(s, matrix): 30 | q_list_at_state_s = matrix[s-1] 31 | max_q_value = np.max(q_list_at_state_s) 32 | return max_q_value 33 | 34 | 35 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 36 | left_side = 0 37 | right_side = 0 38 | for action in range(4): 39 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 40 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 41 | if left_side > right_side: 42 | learning_rate = win_learning_rate 43 | else: 44 | learning_rate = lose_learning_rate 45 | return learning_rate 46 | 47 | 48 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 49 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 50 | return np.min(compare_list) 51 | 52 | 53 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 54 | q_values_list = matrix_q[state-1] 55 | arg_max_a = np.argmax(q_values_list) 56 | if action != arg_max_a: 57 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 58 | else: 59 | result = 0 60 | for i in range(4): 61 | if i != arg_max_a: 62 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 63 | return result 64 | 65 | 66 | def wolf_phc_learning(position, goal_state): 67 | EastProb_Array = [] 68 | WestProb_Array = [] 69 | NorthProb_Array = [] 70 | currentState = find_state_from_position(position) 71 | # print 'currentState is: ', currentState 72 | Q_matrix = np.zeros((10000, 4)) 73 | pi_matrix = np.zeros((10000, 4)) 74 | pi_mean_matrix = np.zeros((10000, 4)) 75 | C_matrix = np.zeros((10000, 1), dtype=np.int) 76 | # Initialize pi-matrix 77 | pi_matrix.fill(0) 78 | for j in range(0, 10000): 79 | if j == 0: 80 | 
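# Note on this pi initialization: action indices follow calculate_nextState_table = (100, -100, -1, 1),
# i.e. action 0 = North (state + 100), 1 = South (state - 100), 2 = West (state - 1), 3 = East (state + 1)
# on the 100 x 100 grid of states 1..10000. Corner cells split probability 0.5/0.5 over their two
# in-grid moves, edge cells use 1/3 over their three in-grid moves, and interior cells use 0.25 for
# all four, so no initial probability mass is placed on moves that would leave the grid.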
pi_matrix[j, 0] = 0.5 81 | pi_matrix[j, 3] = 0.5 82 | elif j == 99: 83 | pi_matrix[j, 0] = 0.5 84 | pi_matrix[j, 2] = 0.5 85 | elif j == 9900: 86 | pi_matrix[j, 1] = 0.5 87 | pi_matrix[j, 3] = 0.5 88 | elif j == 9999: 89 | pi_matrix[j, 1] = 0.5 90 | pi_matrix[j, 2] = 0.5 91 | elif 1 <= j <= 98: 92 | pi_matrix[j, 0] = 1.0 / 3.0 93 | pi_matrix[j, 2] = 1.0 / 3.0 94 | pi_matrix[j, 3] = 1.0 / 3.0 95 | elif 9901 <= j <= 9998: 96 | pi_matrix[j, 1] = 1.0 / 3.0 97 | pi_matrix[j, 2] = 1.0 / 3.0 98 | pi_matrix[j, 3] = 1.0 / 3.0 99 | elif j % 100 == 0: 100 | pi_matrix[j, 0] = 1.0 / 3.0 101 | pi_matrix[j, 1] = 1.0 / 3.0 102 | pi_matrix[j, 3] = 1.0 / 3.0 103 | elif (j+1) % 100 == 0: 104 | pi_matrix[j, 0] = 1.0 / 3.0 105 | pi_matrix[j, 1] = 1.0 / 3.0 106 | pi_matrix[j, 2] = 1.0 / 3.0 107 | else: 108 | pi_matrix[j] = 0.25 109 | 110 | # print 'Q_matrix =', Q_matrix 111 | # print 'pi_matrix =', pi_matrix 112 | # print 'C_matrix = ', C_matrix 113 | step = 0 114 | initialState = currentState 115 | while step != 100000: 116 | step += 1 117 | # print 'currentState= ', currentState 118 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 119 | actionProbability = pi_matrix[currentState - 1] 120 | # print 'actionProbability = ', actionProbability 121 | EastProb_Array.append(pi_matrix[initialState-1][3]) 122 | WestProb_Array.append(pi_matrix[initialState-1][2]) 123 | NorthProb_Array.append(pi_matrix[initialState-1][0]) 124 | currentAction = np.random.choice(4, 1, p=actionProbability) 125 | # print 'currentAction = ', currentAction 126 | # b) Observing reward and next state s' 127 | nextState = currentState + calculate_nextState_table[currentAction[0]] 128 | # print 'nextSate = ', nextState 129 | reward = calculate_reward(currentState, goal_state) 130 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 131 | # print 'max_Q = ', max_Q 132 | Q_matrix[currentState-1, currentAction[0]] = \ 133 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 134 | reward + gama * max_Q) 135 | # c) update estimate of average policy, pi_mean_matrix 136 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 137 | C_matrix[currentState-1] += 1 138 | for each_action in range(4): 139 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 140 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 141 | ) 142 | # d) Step pi closer to the optimal policy w.r.t Q 143 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 144 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 145 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 146 | sum_probability = np.sum(pi_matrix[currentState-1]) 147 | pi_matrix[currentState-1] /= sum_probability 148 | currentState = nextState 149 | print 'reached goal!' 
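# Recap of the per-step WoLF-PHC update performed in the loop above:
#   Q(s,a)  <-  (1 - alpha) * Q(s,a) + alpha * (reward + gama * max_a' Q(s',a'))
#   pi_mean is updated twice: as a running mean of pi over all steps, then with the
#           per-state increment (1 / C(s)) * (pi(s,a) - pi_mean(s,a)) for the visited state s
#   delta   =  delta_W if sum_a pi(s,a) * Q(s,a) > sum_a pi_mean(s,a) * Q(s,a), else delta_L
#           ("win" uses the smaller step delta_W = 0.4, "lose" the larger step delta_L = 0.8)
#   pi(s,a) only the probability of the action just taken is adjusted: it grows by the summed
#           caps min(pi(s,i), delta / 3) over the non-greedy actions i when it is the greedy
#           (argmax-Q) action, and shrinks by min(pi(s,a), delta / 3) otherwise; the row
#           pi(s, .) is then re-normalized to sum to 1.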
150 | print 'total steps = ', step 151 | # print 'xArray= ', xArray 152 | # print 'yArray= ', yArray 153 | return Q_matrix, pi_matrix, pi_mean_matrix, EastProb_Array, WestProb_Array, NorthProb_Array 154 | 155 | 156 | # --------------------- All agents have the same goal ------------------------ # 157 | 158 | Agent1_position = [25.0, 0] 159 | Agent2_position = [75.0, 0] 160 | 161 | 162 | calculate_nextState_table = (100, -100, -1, 1) 163 | alpha = 0.5 164 | gama = 0.4 165 | delta_L = 0.8 166 | delta_W = 0.4 167 | 168 | 169 | Agent1_goal = 5550 170 | Agent2_goal = 5550 171 | 172 | 173 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 174 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 175 | 176 | x1 = Agent1_trainingResult[3] 177 | y1 = Agent1_trainingResult[5] 178 | x2 = Agent2_trainingResult[4] 179 | y2 = Agent2_trainingResult[5] 180 | 181 | 182 | x1 = np.asarray(Agent1_trainingResult[3]) 183 | y1 = np.asarray(Agent1_trainingResult[5]) 184 | x2 = np.asarray(Agent2_trainingResult[4]) 185 | y2 = np.asarray(Agent2_trainingResult[5]) 186 | 187 | 188 | fig, axes = plt.subplots(1, 2, figsize=(10, 4)) 189 | axes[0].plot(x1, y1) 190 | axes[0].set_title("Agent1 Policy Trajectory") 191 | axes[0].set_xlabel("Pr(East)") 192 | axes[0].set_ylabel("Pr(North)") 193 | axes[0].set_xlim([0, 1]) 194 | axes[0].set_ylim([0, 1]) 195 | axes[1].plot(x2, y2) 196 | axes[1].set_title("Agent2 Policy Trajectory") 197 | axes[1].set_xlabel("Pr(West)") 198 | axes[1].set_ylabel("Pr(North)") 199 | axes[1].set_xlim([0, 1]) 200 | axes[1].set_ylim([0, 1]) 201 | plt.show() 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /WoLF-PHC, MAS, diffrt goals, normalized Q value.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def find_state_from_position(position): 7 | x = position[0] 8 | y = position[1] 9 | if x != 100 and y != 100: 10 | state_num = (y//1) * 100 + ((x//1) + 1) 11 | elif x == 100 and y != 100: 12 | state_num = (y//1) * 100 + x 13 | elif y == 100 and x != 100: 14 | state_num = (y-1)*100 + ((x//1) + 1) 15 | else: 16 | state_num = 10000 17 | return int(state_num) 18 | 19 | 20 | def calculate_reward(previous_state, current_state, goal_state): 21 | if current_state == goal_state: 22 | r = 20 23 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100) \ 24 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 25 | r = 5 26 | else: 27 | r = -5 28 | return r 29 | 30 | 31 | def find_max_q_at_state_s(s, matrix): 32 | q_list_at_state_s = matrix[s-1] 33 | max_q_value = np.max(q_list_at_state_s) 34 | return max_q_value 35 | 36 | 37 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 38 | left_side = 0 39 | right_side = 0 40 | for action in range(4): 41 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 42 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 43 | if left_side > right_side: 44 | learning_rate = win_learning_rate 45 | else: 46 | learning_rate = lose_learning_rate 47 | return learning_rate 48 | 49 | 50 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 51 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 52 | return np.min(compare_list) 53 | 54 | 55 | def 
calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 56 | q_values_list = matrix_q[state-1] 57 | arg_max_a = np.argmax(q_values_list) 58 | if action != arg_max_a: 59 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 60 | else: 61 | result = 0 62 | for i in range(4): 63 | if i != arg_max_a: 64 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 65 | return result 66 | 67 | 68 | def wolf_phc_learning(position, goal_state): 69 | normalized_Q = [] 70 | currentState = find_state_from_position(position) 71 | Q_matrix = np.zeros((10000, 4)) 72 | pi_matrix = np.zeros((10000, 4)) 73 | pi_mean_matrix = np.zeros((10000, 4)) 74 | C_matrix = np.zeros((10000, 1), dtype=np.int) 75 | # Initialize pi-matrix 76 | pi_matrix.fill(0) 77 | for j in range(0, 10000): 78 | if j == 0: 79 | pi_matrix[j, 0] = 0.5 80 | pi_matrix[j, 3] = 0.5 81 | elif j == 99: 82 | pi_matrix[j, 0] = 0.5 83 | pi_matrix[j, 2] = 0.5 84 | elif j == 9900: 85 | pi_matrix[j, 1] = 0.5 86 | pi_matrix[j, 3] = 0.5 87 | elif j == 9999: 88 | pi_matrix[j, 1] = 0.5 89 | pi_matrix[j, 2] = 0.5 90 | elif 1 <= j <= 98: 91 | pi_matrix[j, 0] = 1.0 / 3.0 92 | pi_matrix[j, 2] = 1.0 / 3.0 93 | pi_matrix[j, 3] = 1.0 / 3.0 94 | elif 9901 <= j <= 9998: 95 | pi_matrix[j, 1] = 1.0 / 3.0 96 | pi_matrix[j, 2] = 1.0 / 3.0 97 | pi_matrix[j, 3] = 1.0 / 3.0 98 | elif j % 100 == 0: 99 | pi_matrix[j, 0] = 1.0 / 3.0 100 | pi_matrix[j, 1] = 1.0 / 3.0 101 | pi_matrix[j, 3] = 1.0 / 3.0 102 | elif (j+1) % 100 == 0: 103 | pi_matrix[j, 0] = 1.0 / 3.0 104 | pi_matrix[j, 1] = 1.0 / 3.0 105 | pi_matrix[j, 2] = 1.0 / 3.0 106 | else: 107 | pi_matrix[j] = 0.25 108 | 109 | step = 0 110 | normalized_Q.append(math.sqrt(np.sum((Q_matrix ** 2)))) 111 | while step != 1000: 112 | step += 1 113 | # print 'currentState= ', currentState 114 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 115 | actionProbability = pi_matrix[currentState - 1] 116 | # print 'actionProbability = ', actionProbability 117 | currentAction = np.random.choice(4, 1, p=actionProbability) 118 | # print 'currentAction = ', currentAction 119 | # b) Observing reward and next state s' 120 | nextState = currentState + calculate_nextState_table[currentAction[0]] 121 | # print 'nextSate = ', nextState 122 | reward = calculate_reward(currentState, nextState, goal_state) 123 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 124 | # print 'max_Q = ', max_Q 125 | Q_matrix[currentState-1, currentAction[0]] = \ 126 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 127 | reward + gama * max_Q) 128 | # c) update estimate of average policy, pi_mean_matrix 129 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 130 | C_matrix[currentState-1] += 1 131 | for each_action in range(4): 132 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 133 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 134 | ) 135 | # d) Step pi closer to the optimal policy w.r.t Q 136 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 137 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 138 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 139 | sum_probability = np.sum(pi_matrix[currentState-1]) 140 | pi_matrix[currentState-1] /= sum_probability 141 | currentState = nextState 142 | # caculate normalized Q value 143 | Q_value = 
math.sqrt(np.sum((Q_matrix ** 2))) 144 | normalized_Q.append(Q_value) 145 | print 'reached goal!' 146 | print 'total steps = ', step 147 | return Q_matrix, pi_matrix, pi_mean_matrix, normalized_Q 148 | 149 | 150 | # --------------------- Each agent has different goal ------------------------ # 151 | 152 | Agent1_position = [55.0, 46.0] 153 | Agent2_position = [28.0, 70.0] 154 | Agent3_position = [24.0, 50.0] 155 | Agent4_position = [77.0, 45.0] 156 | Agent5_position = [20.8, 28.8] 157 | 158 | calculate_nextState_table = (100, -100, -1, 1) 159 | alpha = 0.5 160 | gama = 0.4 161 | delta_L = 0.8 162 | delta_W = 0.4 163 | 164 | Agent1_goal = 3228 165 | Agent2_goal = 6799 166 | Agent3_goal = 4923 167 | Agent4_goal = 5054 168 | Agent5_goal = 8179 169 | 170 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 171 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 172 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 173 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 174 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 175 | 176 | Agent1_normalized_Q = np.asarray(Agent1_trainingResult[3]) 177 | Agent2_normalized_Q = np.asarray(Agent2_trainingResult[3]) 178 | Agent3_normalized_Q = np.asarray(Agent3_trainingResult[3]) 179 | Agent4_normalized_Q = np.asarray(Agent4_trainingResult[3]) 180 | Agent5_normalized_Q = np.asarray(Agent5_trainingResult[3]) 181 | 182 | x = np.arange(1001) 183 | plt.plot(x, Agent1_normalized_Q, label="Agent1") 184 | plt.plot(x, Agent2_normalized_Q, label="Agent2") 185 | plt.plot(x, Agent3_normalized_Q, label="Agent3") 186 | plt.plot(x, Agent4_normalized_Q, label="Agent4") 187 | plt.plot(x, Agent5_normalized_Q, label="Agent5") 188 | plt.legend() 189 | plt.title("WoLF-PHC MAS, normalized Q value") 190 | plt.show() 191 | 192 | -------------------------------------------------------------------------------- /WoLF-PHC, MAS, diffrt goals, cumulative reward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def find_state_from_position(position): 7 | x = position[0] 8 | y = position[1] 9 | if x != 100 and y != 100: 10 | state_num = (y//1) * 100 + ((x//1) + 1) 11 | elif x == 100 and y != 100: 12 | state_num = (y//1) * 100 + x 13 | elif y == 100 and x != 100: 14 | state_num = (y-1)*100 + ((x//1) + 1) 15 | else: 16 | state_num = 10000 17 | return int(state_num) 18 | 19 | 20 | def calculate_reward(previous_state, current_state, goal_state): 21 | if current_state == goal_state: 22 | r = 20 23 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100) \ 24 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 25 | r = 5 26 | else: 27 | r = -5 28 | return r 29 | 30 | 31 | def find_max_q_at_state_s(s, matrix): 32 | q_list_at_state_s = matrix[s-1] 33 | max_q_value = np.max(q_list_at_state_s) 34 | return max_q_value 35 | 36 | 37 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 38 | left_side = 0 39 | right_side = 0 40 | for action in range(4): 41 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 42 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 43 | if left_side > right_side: 44 | learning_rate = win_learning_rate 45 | else: 46 | learning_rate = lose_learning_rate 47 | return 
learning_rate 48 | 49 | 50 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 51 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 52 | return np.min(compare_list) 53 | 54 | 55 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 56 | q_values_list = matrix_q[state-1] 57 | arg_max_a = np.argmax(q_values_list) 58 | if action != arg_max_a: 59 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 60 | else: 61 | result = 0 62 | for i in range(4): 63 | if i != arg_max_a: 64 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 65 | return result 66 | 67 | 68 | def wolf_phc_learning(position, goal_state): 69 | cumulRwrdList = [] 70 | currentState = find_state_from_position(position) 71 | Q_matrix = np.zeros((10000, 4)) 72 | pi_matrix = np.zeros((10000, 4)) 73 | pi_mean_matrix = np.zeros((10000, 4)) 74 | C_matrix = np.zeros((10000, 1), dtype=np.int) 75 | # Initialize pi-matrix 76 | pi_matrix.fill(0) 77 | for j in range(0, 10000): 78 | if j == 0: 79 | pi_matrix[j, 0] = 0.5 80 | pi_matrix[j, 3] = 0.5 81 | elif j == 99: 82 | pi_matrix[j, 0] = 0.5 83 | pi_matrix[j, 2] = 0.5 84 | elif j == 9900: 85 | pi_matrix[j, 1] = 0.5 86 | pi_matrix[j, 3] = 0.5 87 | elif j == 9999: 88 | pi_matrix[j, 1] = 0.5 89 | pi_matrix[j, 2] = 0.5 90 | elif 1 <= j <= 98: 91 | pi_matrix[j, 0] = 1.0 / 3.0 92 | pi_matrix[j, 2] = 1.0 / 3.0 93 | pi_matrix[j, 3] = 1.0 / 3.0 94 | elif 9901 <= j <= 9998: 95 | pi_matrix[j, 1] = 1.0 / 3.0 96 | pi_matrix[j, 2] = 1.0 / 3.0 97 | pi_matrix[j, 3] = 1.0 / 3.0 98 | elif j % 100 == 0: 99 | pi_matrix[j, 0] = 1.0 / 3.0 100 | pi_matrix[j, 1] = 1.0 / 3.0 101 | pi_matrix[j, 3] = 1.0 / 3.0 102 | elif (j+1) % 100 == 0: 103 | pi_matrix[j, 0] = 1.0 / 3.0 104 | pi_matrix[j, 1] = 1.0 / 3.0 105 | pi_matrix[j, 2] = 1.0 / 3.0 106 | else: 107 | pi_matrix[j] = 0.25 108 | 109 | step = 0 110 | cumulativeReward = 0 111 | cumulRwrdList.append(cumulativeReward) 112 | while step != 1000: 113 | step += 1 114 | # print 'currentState= ', currentState 115 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 116 | actionProbability = pi_matrix[currentState - 1] 117 | # print 'actionProbability = ', actionProbability 118 | currentAction = np.random.choice(4, 1, p=actionProbability) 119 | # print 'currentAction = ', currentAction 120 | # b) Observing reward and next state s' 121 | nextState = currentState + calculate_nextState_table[currentAction[0]] 122 | # print 'nextSate = ', nextState 123 | reward = calculate_reward(currentState, nextState, goal_state) 124 | cumulativeReward += reward 125 | cumulRwrdList.append(cumulativeReward) 126 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 127 | # print 'max_Q = ', max_Q 128 | Q_matrix[currentState-1, currentAction[0]] = \ 129 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 130 | reward + gama * max_Q) 131 | # c) update estimate of average policy, pi_mean_matrix 132 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 133 | C_matrix[currentState-1] += 1 134 | for each_action in range(4): 135 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 136 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 137 | ) 138 | # d) Step pi closer to the optimal policy w.r.t Q 139 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 140 | triangle_delta_sa = 
calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 141 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 142 | sum_probability = np.sum(pi_matrix[currentState-1]) 143 | pi_matrix[currentState-1] /= sum_probability 144 | currentState = nextState 145 | # caculate normalized Q value 146 | Q_value = math.sqrt(np.sum((Q_matrix ** 2))) 147 | print 'reached goal!' 148 | print 'total steps = ', step 149 | return Q_matrix, pi_matrix, pi_mean_matrix, cumulRwrdList 150 | 151 | 152 | # --------------------- Each agent has different goal ------------------------ # 153 | 154 | Agent1_position = [55.0, 46.0] 155 | Agent2_position = [28.0, 70.0] 156 | Agent3_position = [24.0, 50.0] 157 | Agent4_position = [77.0, 45.0] 158 | Agent5_position = [20.8, 28.8] 159 | 160 | calculate_nextState_table = (100, -100, -1, 1) 161 | alpha = 0.5 162 | gama = 0.4 163 | delta_L = 0.8 164 | delta_W = 0.4 165 | 166 | Agent1_goal = 3228 167 | Agent2_goal = 6799 168 | Agent3_goal = 4923 169 | Agent4_goal = 5054 170 | Agent5_goal = 8179 171 | 172 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 173 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 174 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 175 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 176 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 177 | 178 | Agent1_cumulRwrd = np.asarray(Agent1_trainingResult[3]) 179 | Agent2_cumulRwrd = np.asarray(Agent2_trainingResult[3]) 180 | Agent3_cumulRwrd = np.asarray(Agent3_trainingResult[3]) 181 | Agent4_cumulRwrd = np.asarray(Agent4_trainingResult[3]) 182 | Agent5_cumulRwrd = np.asarray(Agent5_trainingResult[3]) 183 | 184 | x = np.arange(1001) 185 | plt.plot(x, Agent1_cumulRwrd, label="Agent1") 186 | plt.plot(x, Agent2_cumulRwrd, label="Agent2") 187 | plt.plot(x, Agent3_cumulRwrd, label="Agent3") 188 | plt.plot(x, Agent4_cumulRwrd, label="Agent4") 189 | plt.plot(x, Agent5_cumulRwrd, label="Agent5") 190 | plt.legend() 191 | plt.title("WoLF-PHC MAS, cumulative reward") 192 | plt.show() 193 | 194 | -------------------------------------------------------------------------------- /WoLF-PHC with Mobility Simulation,diffrt goals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def find_state_from_position(position): 6 | x = position[0] 7 | y = position[1] 8 | if x != 100 and y != 100: 9 | state_num = (y//1) * 100 + ((x//1) + 1) 10 | elif x == 100 and y != 100: 11 | state_num = (y//1) * 100 + x 12 | elif y == 100 and x != 100: 13 | state_num = (y-1)*100 + ((x//1) + 1) 14 | else: 15 | state_num = 10000 16 | return int(state_num) 17 | 18 | 19 | def calculate_reward(current_state, goal_state): 20 | if current_state == goal_state: 21 | r = 20 22 | else: 23 | r = 0 24 | return r 25 | 26 | 27 | def find_max_q_at_state_s(s, matrix): 28 | q_list_at_state_s = matrix[s-1] 29 | max_q_value = np.max(q_list_at_state_s) 30 | return max_q_value 31 | 32 | 33 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 34 | left_side = 0 35 | right_side = 0 36 | for action in range(4): 37 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 38 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 39 | if left_side > right_side: 40 | learning_rate = win_learning_rate 41 | else: 42 | learning_rate 
= lose_learning_rate 43 | return learning_rate 44 | 45 | 46 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 47 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 48 | return np.min(compare_list) 49 | 50 | 51 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 52 | q_values_list = matrix_q[state-1] 53 | arg_max_a = np.argmax(q_values_list) 54 | if action != arg_max_a: 55 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 56 | else: 57 | result = 0 58 | for i in range(4): 59 | if i != arg_max_a: 60 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 61 | return result 62 | 63 | 64 | def wolf_phc_learning(position, goal_state): 65 | xArray = [] 66 | yArray = [] 67 | currentState = find_state_from_position(position) 68 | print 'currentState = ',currentState 69 | # print 'currentState is: ', currentState 70 | Q_matrix = np.zeros((10000, 4)) 71 | pi_matrix = np.zeros((10000, 4)) 72 | pi_mean_matrix = np.zeros((10000, 4)) 73 | C_matrix = np.zeros((10000, 1), dtype=np.int) 74 | # Initialize pi-matrix 75 | pi_matrix.fill(0) 76 | for j in range(0, 10000): 77 | if j == 0: 78 | pi_matrix[j, 0] = 0.5 79 | pi_matrix[j, 3] = 0.5 80 | elif j == 99: 81 | pi_matrix[j, 0] = 0.5 82 | pi_matrix[j, 2] = 0.5 83 | elif j == 9900: 84 | pi_matrix[j, 1] = 0.5 85 | pi_matrix[j, 3] = 0.5 86 | elif j == 9999: 87 | pi_matrix[j, 1] = 0.5 88 | pi_matrix[j, 2] = 0.5 89 | elif 1 <= j <= 98: 90 | pi_matrix[j, 0] = 1.0 / 3.0 91 | pi_matrix[j, 2] = 1.0 / 3.0 92 | pi_matrix[j, 3] = 1.0 / 3.0 93 | elif 9901 <= j <= 9998: 94 | pi_matrix[j, 1] = 1.0 / 3.0 95 | pi_matrix[j, 2] = 1.0 / 3.0 96 | pi_matrix[j, 3] = 1.0 / 3.0 97 | elif j % 100 == 0: 98 | pi_matrix[j, 0] = 1.0 / 3.0 99 | pi_matrix[j, 1] = 1.0 / 3.0 100 | pi_matrix[j, 3] = 1.0 / 3.0 101 | elif (j+1) % 100 == 0: 102 | pi_matrix[j, 0] = 1.0 / 3.0 103 | pi_matrix[j, 1] = 1.0 / 3.0 104 | pi_matrix[j, 2] = 1.0 / 3.0 105 | else: 106 | pi_matrix[j] = 0.25 107 | 108 | # print 'Q_matrix =', Q_matrix 109 | # print 'pi_matrix =', pi_matrix 110 | # print 'C_matrix = ', C_matrix 111 | step = 0 112 | while step != 10000: 113 | step += 1 114 | # print 'currentState= ', currentState 115 | yArray.append(currentState/100) 116 | xArray.append(currentState % 100) 117 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 118 | actionProbability = pi_matrix[currentState - 1] 119 | # print 'actionProbability = ', actionProbability 120 | currentAction = np.random.choice(4, 1, p=actionProbability) 121 | # print 'currentAction = ', currentAction 122 | # b) Observing reward and next state s' 123 | nextState = currentState + calculate_nextState_table[currentAction[0]] 124 | # print 'nextSate = ', nextState 125 | reward = calculate_reward(nextState, goal_state) 126 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 127 | # print 'max_Q = ', max_Q 128 | Q_matrix[currentState-1, currentAction[0]] = \ 129 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 130 | reward + gama * max_Q) 131 | # c) update estimate of average policy, pi_mean_matrix 132 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 133 | C_matrix[currentState-1] += 1 134 | for each_action in range(4): 135 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 136 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 137 | ) 138 | # d) Step pi closer to the optimal policy 
w.r.t Q 139 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 140 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 141 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 142 | sum_probability = np.sum(pi_matrix[currentState-1]) 143 | pi_matrix[currentState-1] /= sum_probability 144 | currentState = nextState 145 | print 'reached goal!' 146 | print 'total steps = ', step 147 | # print 'xArray= ', xArray 148 | # print 'yArray= ', yArray 149 | return Q_matrix, pi_matrix, pi_mean_matrix, xArray, yArray 150 | 151 | 152 | # --------------------- Each agent has different goal ------------------------ # 153 | 154 | Agent1_position = [55.0, 46.0] 155 | Agent2_position = [28.0, 70.0] 156 | Agent3_position = [24.0, 50.0] 157 | Agent4_position = [77.0, 45.0] 158 | 159 | calculate_nextState_table = (100, -100, -1, 1) 160 | alpha = 0.5 161 | gama = 0.4 162 | delta_L = 0.8 163 | delta_W = 0.4 164 | 165 | Agent1_goal = 3228 166 | Agent2_goal = 8799 167 | Agent3_goal = 4923 168 | Agent4_goal = 1003 169 | 170 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 171 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 172 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 173 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 174 | 175 | Agent1_xArray = Agent1_trainingResult[3] 176 | Agent1_yArray = Agent1_trainingResult[4] 177 | print "Agent1 start with (", Agent1_xArray[0], ",", Agent1_yArray[0], ")" 178 | print "Agent1 end with (", 28, ",", 32, ")" 179 | 180 | # Agent2_xArray = Agent2_trainingResult[3] 181 | # Agent2_yArray = Agent2_trainingResult[4] 182 | # print "Agent2 start with (", Agent2_xArray[0], ",", Agent2_yArray[0], ")" 183 | # print "Agent2 end with (", 99, ",", 87, ")" 184 | # 185 | # Agent3_xArray = Agent3_trainingResult[3] 186 | # Agent3_yArray = Agent3_trainingResult[4] 187 | # print "Agent3 start with (", Agent3_xArray[0], ",", Agent3_yArray[0], ")" 188 | # print "Agent3 end with (", 23, ",", 49, ")" 189 | # 190 | # Agent4_xArray = Agent4_trainingResult[3] 191 | # Agent4_yArray = Agent4_trainingResult[4] 192 | # print "Agent4 start with (", Agent4_xArray[0], ",", Agent4_yArray[0], ")" 193 | # print "Agent4 end with (", 3, ",", 10, ")" 194 | 195 | 196 | plt.plot(Agent1_xArray, Agent1_yArray, label="Agent1") 197 | # plt.plot(Agent2_xArray, Agent2_yArray, label="Agent2") 198 | # plt.plot(Agent3_xArray, Agent3_yArray, label="Agent3") 199 | # plt.plot(Agent4_xArray, Agent4_yArray, label="Agent4") 200 | plt.axis([0, 100, 0, 100]) 201 | plt.legend() 202 | plt.title("WoLF-PHC learning track with Multiagent System") 203 | plt.show() 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /WoLF-PHC with Mobility Simulation,same goal.py: -------------------------------------------------------------------------------- 1 | from pymobility.models.mobility import random_waypoint 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def find_state_from_position(position): 7 | x = position[0] 8 | y = position[1] 9 | if x != 100 and y != 100: 10 | state_num = (y//1) * 100 + ((x//1) + 1) 11 | elif x == 100 and y != 100: 12 | state_num = (y//1) * 100 + x 13 | elif y == 100 and x != 100: 14 | state_num = (y-1)*100 + ((x//1) + 1) 15 | else: 16 | state_num = 10000 17 | return int(state_num) 18 | 19 | 20 | def calculate_reward(previous_state, 
current_state, goal_state): 21 | if current_state == goal_state: 22 | r = 20 23 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100)\ 24 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 25 | r = 5 26 | else: 27 | r = -5 28 | return r 29 | 30 | 31 | def find_max_q_at_state_s(s, matrix): 32 | q_list_at_state_s = matrix[s-1] 33 | max_q_value = np.max(q_list_at_state_s) 34 | return max_q_value 35 | 36 | 37 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 38 | left_side = 0 39 | right_side = 0 40 | for action in range(4): 41 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 42 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 43 | if left_side > right_side: 44 | learning_rate = win_learning_rate 45 | else: 46 | learning_rate = lose_learning_rate 47 | return learning_rate 48 | 49 | 50 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 51 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 52 | return np.min(compare_list) 53 | 54 | 55 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 56 | q_values_list = matrix_q[state-1] 57 | arg_max_a = np.argmax(q_values_list) 58 | if action != arg_max_a: 59 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 60 | else: 61 | result = 0 62 | for i in range(4): 63 | if i != arg_max_a: 64 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 65 | return result 66 | 67 | 68 | def wolf_phc_learning(position, goal_state): 69 | xArray = [] 70 | yArray = [] 71 | currentState = find_state_from_position(position) 72 | # print 'currentState is: ', currentState 73 | Q_matrix = np.zeros((10000, 4)) 74 | pi_matrix = np.zeros((10000, 4)) 75 | pi_mean_matrix = np.zeros((10000, 4)) 76 | C_matrix = np.zeros((10000, 1), dtype=np.int) 77 | # Initialize pi-matrix 78 | pi_matrix.fill(0) 79 | for j in range(0, 10000): 80 | if j == 0: 81 | pi_matrix[j, 0] = 0.5 82 | pi_matrix[j, 3] = 0.5 83 | elif j == 99: 84 | pi_matrix[j, 0] = 0.5 85 | pi_matrix[j, 2] = 0.5 86 | elif j == 9900: 87 | pi_matrix[j, 1] = 0.5 88 | pi_matrix[j, 3] = 0.5 89 | elif j == 9999: 90 | pi_matrix[j, 1] = 0.5 91 | pi_matrix[j, 2] = 0.5 92 | elif 1 <= j <= 98: 93 | pi_matrix[j, 0] = 1.0 / 3.0 94 | pi_matrix[j, 2] = 1.0 / 3.0 95 | pi_matrix[j, 3] = 1.0 / 3.0 96 | elif 9901 <= j <= 9998: 97 | pi_matrix[j, 1] = 1.0 / 3.0 98 | pi_matrix[j, 2] = 1.0 / 3.0 99 | pi_matrix[j, 3] = 1.0 / 3.0 100 | elif j % 100 == 0: 101 | pi_matrix[j, 0] = 1.0 / 3.0 102 | pi_matrix[j, 1] = 1.0 / 3.0 103 | pi_matrix[j, 3] = 1.0 / 3.0 104 | elif (j+1) % 100 == 0: 105 | pi_matrix[j, 0] = 1.0 / 3.0 106 | pi_matrix[j, 1] = 1.0 / 3.0 107 | pi_matrix[j, 2] = 1.0 / 3.0 108 | else: 109 | pi_matrix[j] = 0.25 110 | 111 | # print 'Q_matrix =', Q_matrix 112 | # print 'pi_matrix =', pi_matrix 113 | # print 'C_matrix = ', C_matrix 114 | step = 0 115 | while currentState != goal_state: 116 | step += 1 117 | # print 'currentState= ', currentState 118 | yArray.append(currentState/100) 119 | xArray.append(currentState % 100) 120 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 121 | actionProbability = pi_matrix[currentState - 1] 122 | # print 'actionProbability = ', actionProbability 123 | currentAction = np.random.choice(4, 1, p=actionProbability) 124 | # print 'currentAction = ', currentAction 125 | # b) 
Observing reward and next state s' 126 | nextState = currentState + calculate_nextState_table[currentAction[0]] 127 | # print 'nextSate = ', nextState 128 | reward = calculate_reward(currentState, nextState, goal_state) 129 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 130 | # print 'max_Q = ', max_Q 131 | Q_matrix[currentState-1, currentAction[0]] = \ 132 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 133 | reward + gama * max_Q) 134 | # c) update estimate of average policy, pi_mean_matrix 135 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 136 | C_matrix[currentState-1] += 1 137 | for each_action in range(4): 138 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 139 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 140 | ) 141 | # d) Step pi closer to the optimal policy w.r.t Q 142 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 143 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 144 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 145 | sum_probability = np.sum(pi_matrix[currentState-1]) 146 | pi_matrix[currentState-1] /= sum_probability 147 | currentState = nextState 148 | print 'reached goal!' 149 | print 'total steps = ', step 150 | # print 'xArray= ', xArray 151 | # print 'yArray= ', yArray 152 | return Q_matrix, pi_matrix, pi_mean_matrix, xArray, yArray 153 | 154 | 155 | # --------------------- All agents have the same goal ------------------------ # 156 | 157 | 158 | rw = random_waypoint(5, dimensions=(100, 100), velocity=(0.1, 1.0), wt_max=1.0) 159 | positions = next(rw) 160 | # print 'positions= ',positions 161 | Agent1_position = positions[0] 162 | print 'Agent1_position= ',Agent1_position 163 | Agent2_position = positions[1] 164 | Agent3_position = positions[2] 165 | Agent4_position = positions[3] 166 | Agent5_position = positions[4] 167 | 168 | calculate_nextState_table = (100, -100, -1, 1) 169 | alpha = 0.5 170 | gama = 0.4 171 | delta_L = 0.8 172 | delta_W = 0.4 173 | 174 | Agent1_goal = 3228 175 | Agent2_goal = 3228 176 | Agent3_goal = 3228 177 | Agent4_goal = 3228 178 | Agent5_goal = 3228 179 | 180 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 181 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 182 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 183 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 184 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 185 | 186 | Agent1_xArray = Agent1_trainingResult[3] 187 | Agent1_yArray = Agent1_trainingResult[4] 188 | print "Agent1 start with (", Agent1_xArray[0], ",", Agent1_yArray[0], ")" 189 | print "Agent1 end with (", 28, ",", 32, ")" 190 | 191 | Agent2_xArray = Agent2_trainingResult[3] 192 | Agent2_yArray = Agent2_trainingResult[4] 193 | print "Agent2 start with (", Agent2_xArray[0], ",", Agent2_yArray[0], ")" 194 | print "Agent2 end with (", 99, ",", 87, ")" 195 | 196 | Agent3_xArray = Agent3_trainingResult[3] 197 | Agent3_yArray = Agent3_trainingResult[4] 198 | print "Agent3 start with (", Agent3_xArray[0], ",", Agent3_yArray[0], ")" 199 | print "Agent3 end with (", 23, ",", 49, ")" 200 | 201 | Agent4_xArray = Agent4_trainingResult[3] 202 | Agent4_yArray = Agent4_trainingResult[4] 203 | print "Agent4 start with (", Agent4_xArray[0], ",", Agent4_yArray[0], ")" 204 
| print "Agent4 end with (", 3, ",", 10, ")" 205 | 206 | Agent5_xArray = Agent5_trainingResult[3] 207 | Agent5_yArray = Agent5_trainingResult[4] 208 | print "Agent5 start with (", Agent5_xArray[0], ",", Agent5_yArray[0], ")" 209 | print "Agent5 end with (", 27, ",", 63, ")" 210 | 211 | plt.plot(Agent1_xArray, Agent1_yArray, label="Agent1") 212 | plt.plot(Agent2_xArray, Agent2_yArray, label="Agent2") 213 | plt.plot(Agent3_xArray, Agent3_yArray, label="Agent3") 214 | plt.plot(Agent4_xArray, Agent4_yArray, label="Agent4") 215 | plt.plot(Agent5_xArray, Agent5_yArray, label="Agent5") 216 | plt.axis([0, 100, 0, 100]) 217 | plt.legend() 218 | plt.title("WoLF-PHC learning track with Multiagent System") 219 | plt.show() 220 | 221 | # Agent1_pi_matrix = wolf_phc_learning(Agent1_position, Agent1_goal)[1] 222 | # Agent2_pi_matrix = wolf_phc_learning(Agent2_position, Agent2_goal)[1] 223 | # Agent3_pi_matrix = wolf_phc_learning(Agent3_position, Agent3_goal)[1] 224 | # Agent4_pi_matrix = wolf_phc_learning(Agent4_position, Agent4_goal)[1] 225 | # Agent5_pi_matrix = wolf_phc_learning(Agent5_position, Agent5_goal)[1] 226 | 227 | # print 'Agent1_pi_matrix = ', Agent1_pi_matrix 228 | # print 'Agent2_pi_matrix = ', Agent2_pi_matrix 229 | # print 'Agent3_pi_matrix = ', Agent3_pi_matrix 230 | # print 'Agent4_pi_matrix = ', Agent4_pi_matrix 231 | # print 'Agent5_pi_matrix = ', Agent5_pi_matrix 232 | 233 | 234 | -------------------------------------------------------------------------------- /WoLF-PHC, MAS, diffrt goals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def find_state_from_position(position): 6 | x = position[0] 7 | y = position[1] 8 | if x != 100 and y != 100: 9 | state_num = (y//1) * 100 + ((x//1) + 1) 10 | elif x == 100 and y != 100: 11 | state_num = (y//1) * 100 + x 12 | elif y == 100 and x != 100: 13 | state_num = (y-1)*100 + ((x//1) + 1) 14 | else: 15 | state_num = 10000 16 | return int(state_num) 17 | 18 | 19 | def calculate_reward(previous_state, current_state, goal_state): 20 | if current_state == goal_state: 21 | r = 20 22 | elif abs(current_state % 100 - goal_state % 100) < abs(previous_state % 100 - goal_state % 100)\ 23 | or abs(current_state/100 - goal_state/100) < abs(previous_state/100 - goal_state/100): 24 | r = 5 25 | else: 26 | r = -5 27 | return r 28 | 29 | 30 | def find_max_q_at_state_s(s, matrix): 31 | q_list_at_state_s = matrix[s-1] 32 | max_q_value = np.max(q_list_at_state_s) 33 | return max_q_value 34 | 35 | 36 | def calculate_delta(matrix_pi, matrix_pi_mean, matrix_q, state, win_learning_rate, lose_learning_rate): 37 | left_side = 0 38 | right_side = 0 39 | for action in range(4): 40 | left_side += matrix_pi[state-1, action] * matrix_q[state-1, action] 41 | right_side += matrix_pi_mean[state-1, action] * matrix_q[state-1, action] 42 | if left_side > right_side: 43 | learning_rate = win_learning_rate 44 | else: 45 | learning_rate = lose_learning_rate 46 | return learning_rate 47 | 48 | 49 | def __calculate_delta_sa(matrix_pi, state, action, learning_rate): 50 | compare_list = np.array([ matrix_pi[state-1, action], learning_rate / (4-1)]) 51 | return np.min(compare_list) 52 | 53 | 54 | def calculate_triangle_delta_sa(state, action, matrix_pi, matrix_q, learning_rate): 55 | q_values_list = matrix_q[state-1] 56 | arg_max_a = np.argmax(q_values_list) 57 | if action != arg_max_a: 58 | result = 0 - __calculate_delta_sa(matrix_pi, state, action, learning_rate) 59 | else: 60 | 
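# The chosen action is the greedy one (argmax_a Q(s,a)): its probability gain equals the
# total mass removed from the non-greedy actions, each contribution capped at
# min(pi(s,i), learning_rate / (4 - 1)).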
result = 0 61 | for i in range(4): 62 | if i != arg_max_a: 63 | result += __calculate_delta_sa(matrix_pi, state, i, learning_rate) 64 | return result 65 | 66 | 67 | def wolf_phc_learning(position, goal_state): 68 | xArray = [] 69 | yArray = [] 70 | timestamp = {} 71 | currentState = find_state_from_position(position) 72 | print "initial state = ", currentState 73 | # print 'currentState is: ', currentState 74 | Q_matrix = np.zeros((10000, 4)) 75 | pi_matrix = np.zeros((10000, 4)) 76 | pi_mean_matrix = np.zeros((10000, 4)) 77 | C_matrix = np.zeros((10000, 1), dtype=np.int) 78 | # Initialize pi-matrix 79 | pi_matrix.fill(0) 80 | for j in range(0, 10000): 81 | if j == 0: 82 | pi_matrix[j, 0] = 0.5 83 | pi_matrix[j, 3] = 0.5 84 | elif j == 99: 85 | pi_matrix[j, 0] = 0.5 86 | pi_matrix[j, 2] = 0.5 87 | elif j == 9900: 88 | pi_matrix[j, 1] = 0.5 89 | pi_matrix[j, 3] = 0.5 90 | elif j == 9999: 91 | pi_matrix[j, 1] = 0.5 92 | pi_matrix[j, 2] = 0.5 93 | elif 1 <= j <= 98: 94 | pi_matrix[j, 0] = 1.0 / 3.0 95 | pi_matrix[j, 2] = 1.0 / 3.0 96 | pi_matrix[j, 3] = 1.0 / 3.0 97 | elif 9901 <= j <= 9998: 98 | pi_matrix[j, 1] = 1.0 / 3.0 99 | pi_matrix[j, 2] = 1.0 / 3.0 100 | pi_matrix[j, 3] = 1.0 / 3.0 101 | elif j % 100 == 0: 102 | pi_matrix[j, 0] = 1.0 / 3.0 103 | pi_matrix[j, 1] = 1.0 / 3.0 104 | pi_matrix[j, 3] = 1.0 / 3.0 105 | elif (j+1) % 100 == 0: 106 | pi_matrix[j, 0] = 1.0 / 3.0 107 | pi_matrix[j, 1] = 1.0 / 3.0 108 | pi_matrix[j, 2] = 1.0 / 3.0 109 | else: 110 | pi_matrix[j] = 0.25 111 | 112 | # print 'Q_matrix =', Q_matrix 113 | # print 'pi_matrix =', pi_matrix 114 | # print 'C_matrix = ', C_matrix 115 | step = 0 116 | while step != 10000: 117 | step += 1 118 | # print 'currentState= ', currentState 119 | yArray.append(currentState/100) 120 | xArray.append(currentState % 100) 121 | timestamp[step]= currentState 122 | # a) From state s select action a according to mixed strategy Pi with suitable exploration 123 | actionProbability = pi_matrix[currentState - 1] 124 | # print 'actionProbability = ', actionProbability 125 | currentAction = np.random.choice(4, 1, p=actionProbability) 126 | # print 'currentAction = ', currentAction 127 | # b) Observing reward and next state s' 128 | nextState = currentState + calculate_nextState_table[currentAction[0]] 129 | # print 'nextSate = ', nextState 130 | reward = calculate_reward(currentState, nextState, goal_state) 131 | max_Q = find_max_q_at_state_s(nextState, Q_matrix) 132 | # print 'max_Q = ', max_Q 133 | Q_matrix[currentState-1, currentAction[0]] = \ 134 | (1 - alpha) * Q_matrix[currentState-1, currentAction[0]] + alpha * ( 135 | reward + gama * max_Q) 136 | # c) update estimate of average policy, pi_mean_matrix 137 | pi_mean_matrix = (pi_mean_matrix * (step - 1) + pi_matrix) / step 138 | C_matrix[currentState-1] += 1 139 | for each_action in range(4): 140 | pi_mean_matrix[currentState-1, each_action] += (1 / C_matrix[currentState-1]) * ( 141 | pi_matrix[currentState-1, each_action] - pi_mean_matrix[currentState-1, each_action] 142 | ) 143 | # d) Step pi closer to the optimal policy w.r.t Q 144 | delta = calculate_delta(pi_matrix, pi_mean_matrix, Q_matrix, currentState, delta_W, delta_L) 145 | triangle_delta_sa = calculate_triangle_delta_sa(currentState, currentAction, pi_matrix, Q_matrix, delta) 146 | pi_matrix[currentState-1, currentAction] += triangle_delta_sa 147 | sum_probability = np.sum(pi_matrix[currentState-1]) 148 | pi_matrix[currentState-1] /= sum_probability 149 | currentState = nextState 150 | print 'reached goal!' 
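# 'timestamp' maps step number -> the state occupied at that step. After all five agents
# are trained, the code at the bottom of this file merges the per-agent timestamp dicts
# and prints a warning for each step at which two or more agents occupy the same grid
# cell, i.e. a potential conflict between the UAV trajectories.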
151 | print 'total steps = ', step 152 | # print 'xArray= ', xArray 153 | # print 'yArray= ', yArray 154 | return Q_matrix, pi_matrix, pi_mean_matrix, xArray, yArray, timestamp 155 | 156 | 157 | # --------------------- Each agent has different goal ------------------------ # 158 | 159 | Agent1_position = [55.0, 46.0] 160 | Agent2_position = [28.0, 70.0] 161 | Agent3_position = [24.0, 50.0] 162 | Agent4_position = [77.0, 45.0] 163 | Agent5_position = [20.8, 28.8] 164 | 165 | calculate_nextState_table = (100, -100, -1, 1) 166 | alpha = 0.5 167 | gama = 0.4 168 | delta_L = 0.8 169 | delta_W = 0.4 170 | 171 | Agent1_goal = 3228 172 | Agent2_goal = 6799 173 | Agent3_goal = 4923 174 | Agent4_goal = 5054 175 | Agent5_goal = 8179 176 | 177 | Agent1_trainingResult = wolf_phc_learning(Agent1_position, Agent1_goal) 178 | Agent2_trainingResult = wolf_phc_learning(Agent2_position, Agent2_goal) 179 | Agent3_trainingResult = wolf_phc_learning(Agent3_position, Agent3_goal) 180 | Agent4_trainingResult = wolf_phc_learning(Agent4_position, Agent4_goal) 181 | Agent5_trainingResult = wolf_phc_learning(Agent5_position, Agent5_goal) 182 | 183 | Agent1_xArray = Agent1_trainingResult[3] 184 | Agent1_yArray = Agent1_trainingResult[4] 185 | # print "Agent1 start with (", Agent1_xArray[0], ",", Agent1_yArray[0], ")" 186 | # print "Agent1 end with (", 28, ",", 32, ")" 187 | 188 | Agent2_xArray = Agent2_trainingResult[3] 189 | Agent2_yArray = Agent2_trainingResult[4] 190 | # print "Agent2 start with (", Agent2_xArray[0], ",", Agent2_yArray[0], ")" 191 | # print "Agent2 end with (", 99, ",", 87, ")" 192 | 193 | Agent3_xArray = Agent3_trainingResult[3] 194 | Agent3_yArray = Agent3_trainingResult[4] 195 | # print "Agent3 start with (", Agent3_xArray[0], ",", Agent3_yArray[0], ")" 196 | # print "Agent3 end with (", 23, ",", 49, ")" 197 | 198 | Agent4_xArray = Agent4_trainingResult[3] 199 | Agent4_yArray = Agent4_trainingResult[4] 200 | # print "Agent4 start with (", Agent4_xArray[0], ",", Agent4_yArray[0], ")" 201 | # print "Agent4 end with (", 54, ",", 54, ")" 202 | 203 | Agent5_xArray = Agent5_trainingResult[3] 204 | Agent5_yArray = Agent5_trainingResult[4] 205 | # print "Agent5 start with (", Agent5_xArray[0], ",", Agent5_yArray[0], ")" 206 | # print "Agent5 end with (", 79, ",", 81, ")" 207 | 208 | Agent1_timestamp = Agent1_trainingResult[5] 209 | Agent2_timestamp = Agent2_trainingResult[5] 210 | Agent3_timestamp = Agent3_trainingResult[5] 211 | Agent4_timestamp = Agent4_trainingResult[5] 212 | Agent5_timestamp = Agent5_trainingResult[5] 213 | 214 | 215 | plt.plot(Agent1_xArray, Agent1_yArray, label="Agent1") 216 | plt.plot(Agent2_xArray, Agent2_yArray, label="Agent2") 217 | plt.plot(Agent3_xArray, Agent3_yArray, label="Agent3") 218 | plt.plot(Agent4_xArray, Agent4_yArray, label="Agent4") 219 | plt.plot(Agent5_xArray, Agent5_yArray, label="Agent5") 220 | plt.axis([0, 100, 0, 100]) 221 | plt.legend() 222 | plt.title("WoLF-PHC learning track with Multiagent System") 223 | plt.show() 224 | 225 | total_timestamp = {} 226 | for key in Agent1_timestamp: 227 | total_timestamp[key] = [] 228 | total_timestamp[key].append(Agent1_timestamp[key]) 229 | total_timestamp[key].append(Agent2_timestamp[key]) 230 | total_timestamp[key].append(Agent3_timestamp[key]) 231 | total_timestamp[key].append(Agent4_timestamp[key]) 232 | total_timestamp[key].append(Agent5_timestamp[key]) 233 | 234 | for key in total_timestamp: 235 | states_at_step = total_timestamp[key] 236 | if len(states_at_step) != len(set(states_at_step)): 237 | print 
"conflicts exist in MAS trajectories at step = ", key 238 | 239 | print "End of the Program !" 240 | --------------------------------------------------------------------------------