├── CSE585_Term_Project.pdf
├── requirements.txt
├── README.md
└── run.py


/CSE585_Term_Project.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/merveenoyan/MARL-grid/main/CSE585_Term_Project.pdf


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.20.3
2 | opencv-python==4.5.2.52
3 | Pillow==8.2.0
4 | pygame==2.0.1
5 | argparse
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | To run the project, run:
 2 | - create python environment of your choice with version 3.7
 3 | - pip install -r requirements.txt
 4 | - python run.py
 5 | 
 6 | Couple of arguments you can pass:
 7 | - "--runner": Location of runner as a list (str(List)) 
 8 | - "--chaser_2": Location of chaser 2 as a list (str(List))
 9 | - "--chaser_1": Location of chaser_1 as a list (str(List))
10 | - "--blocks": List of location of blocks
11 | - "--SIZE_X": Horizontal size (int)
12 | - --SIZE_Y": Vertical size (int)
13 | - "--exploitation_steps": Exploitation steps (int)
14 | - "--exploration_steps: Exploration steps (int)
15 | - "--episodes": Episodes (int)
16 | - "--show_ep": Show every N episodes (int)
17 | - "--learning_rate": Learning rate (float)
18 | - "--gamma": "Discount factor for future rewards" (float)
19 |  
20 | To-do:
21 | - Add direction to the state space (DONE)
22 | - Take user parameters through GUI (DONE)
23 | - Write environment as a separate class
24 | - Take second best action when agents try to go over blocks or get out of the board (currently they get stuck)
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
  1 | import pygame
  2 | import numpy as np
  3 | import pickle
  4 | import time
  5 | from PIL import Image
  6 | import cv2
  7 | import pickle
  8 | import math
  9 | import time
 10 | import argparse
 11 | 
 12 | 
 13 | 
 14 | 
 15 | """ Single Agent class, environment is defined inside it,
 16 |     it is used to define runner and the chasers. 
 17 |     Each agent has an x and y coordinates, used to calculate Q-values later on.
 18 |     Block configurations can either be given during instantiation,
 19 |     if not, blocks will have default values. They cannot take the initial positions of agents """
 20 | 
 21 | 
 22 | #size of the board, for this project, it's hard-coded
 23 | #SIZE_X = 8
 24 | #SIZE_Y = 8
 25 | class Agent:
 26 |   
 27 |   def __init__(self, SIZE_X, SIZE_Y, x, y, blocks=None):
 28 |     ## defining empty cells and blocks
 29 |     self.x = x
 30 |     self.y = y
 31 |     self.SIZE_X = SIZE_X
 32 |     self.SIZE_Y = SIZE_Y
 33 |     env = np.zeros(shape = [self.SIZE_X,self.SIZE_Y])
 34 |     if blocks == None:
 35 |       blocks = [[0,6],[3,5],[5,2]]
 36 |     
 37 |     
 38 | 
 39 |   def dist_x(self, other):
 40 |     return self.x-other.x
 41 | 
 42 |   def dist_y(self, other):
 43 |     return self.y-other.y
 44 | 
 45 |   #manhattan distance for penalties
 46 |   def dist(self, other):
 47 |     return(abs(self.x-other.x)+abs(self.y-other.y))
 48 | 
 49 |   #action  
 50 |   def action(self, choice):
 51 |       if choice == 0: #right
 52 |         self.move(x=1,y=0)
 53 |       elif choice == 1: #up
 54 |         self.move(x=0, y=1)
 55 |       elif choice == 2: #left
 56 |         self.move(x=-1, y=0)
 57 |       elif choice == 3: #down
 58 |         self.move(x=0, y=-1)
 59 |   
 60 |   def move(self, x=False, y=False):
 61 |     
 62 |     if not x:
 63 |       self.x += np.random.randint(-1, 2)
 64 |     else:
 65 |       self.x += x
 66 |     if not y:
 67 |       self.y += np.random.randint(-1,2)
 68 |     else:
 69 |       self.y += y
 70 |     
 71 |     #move
 72 |     
 73 |     new_x = self.x + x
 74 |     new_y = self.y + y
 75 | 
 76 |     #in case if they get out of table
 77 |     if self.x<0:
 78 |       self.x=0
 79 |     if self.x>=self.SIZE_X:
 80 |       self.x = self.SIZE_X-1
 81 |     if self.y<0:
 82 |       self.y=0
 83 |     if self.y>=self.SIZE_Y:
 84 |       self.y = self.SIZE_Y-1
 85 |     for i in blocks:
 86 |       if [self.x, self.y] == i:
 87 |         action = np.random.randint(0,4)
 88 |         self.action(action)
 89 |     #update positions
 90 | 
 91 | 
 92 | 
 93 | #Q-table
 94 | def Q_table(SIZE_X, SIZE_Y):
 95 | 
 96 |     q_table = {}
 97 |   
 98 |     for a in range(0, SIZE_X): #x coordinate of agent
 99 |         for b in range(0, SIZE_Y): #y coordinate of agent
100 |             for c in range(-SIZE_X+1, SIZE_X): #distance between agents
101 |               for d in range(-SIZE_Y+1, SIZE_Y): #distance between agents
102 |                 q_table[((a,b,c,d))]= [np.random.uniform(-4, 0) for i in range(5)] 
103 |     
104 |     print(f"q-table is {q_table.keys()}")
105 |     return q_table
106 | 
107 | 
108 | 
109 | if __name__=="__main__":
110 | 
111 |   
112 | 
113 |   show = False
114 |   parser = argparse.ArgumentParser()
115 |   parser.add_argument("--runner", type=str, nargs = '?', default = "[0,0]", help = "Location of runner as a list")
116 |   parser.add_argument("--chaser_2", type=str, nargs = '?', default = "[6,6]", help = "Location of chaser_2 as a list")
117 |   parser.add_argument("--chaser_1", type=str, nargs = '?', default = "[6,5]", help = "Location of chaser_1 as a list")
118 |   parser.add_argument("--blocks", type=str, nargs = '?', default = "[[0,6],[3,5],[5,2]]", help = "List of location of blocks")
119 |   parser.add_argument("--SIZE_X", type=int, nargs = '?', default = 8, help = "Horizontal size")
120 |   parser.add_argument("--SIZE_Y", type=int, nargs = '?', default = 8, help = "Vertical size")
121 |   parser.add_argument("--exploitation_steps", type=int, nargs = '?', default = 150, help = "Exploitation steps")
122 |   parser.add_argument("--exploration_steps", type=int, nargs = '?', default = 150, help = "Exploration steps")
123 |   parser.add_argument("--episodes", type=int, nargs = '?', default = 100, help = "Episodes")
124 |   parser.add_argument("--show_ep", type=int, nargs = '?', default = 10, help = "Show every N episodes")
125 |   parser.add_argument("--learning_rate", type=float, nargs = '?', default = 0.1, help = "Learning rate")
126 |   parser.add_argument("--gamma", type=float, nargs = '?', default = 0.1, help = "Discount factor for future rewards")
127 |   args = parser.parse_args()
128 | 
129 |   exploration_steps = args.exploration_steps
130 |   exploitation_steps = args.exploitation_steps
131 |   show_ep = args.show_ep
132 |   episodes = args.episodes
133 |   learning_rate = args.learning_rate
134 |   gamma = args.gamma
135 |   SIZE_X = args.SIZE_X
136 |   SIZE_Y = args.SIZE_Y
137 |   runner_loc = eval(args.runner)
138 |   chaser_1_loc = eval(args.chaser_1)
139 |   chaser_2_loc = eval(args.chaser_2)
140 |   blocks = eval(args.blocks)
141 | 
142 |   rounds = exploration_steps + exploitation_steps
143 | 
144 |   
145 |   
146 |   # RGB color coding
147 |   d = {"runner_color":(0, 255, 0), "chaser1_color":(255,180, 20), "chaser2_color":(255,20,147), "block_color":(255, 255, 208)}
148 | 
149 | 
150 |   
151 |   chasers_win = 0
152 |   for eps in range(episodes):
153 |     
154 |     if(eps%show_ep==0):
155 |       show = True
156 | 
157 |     # initialize agents back to their original positions by the 
158 |     # beginning of every game
159 |     chaser1 = Agent(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y, x = chaser_1_loc[0], y = chaser_1_loc[1])
160 |     chaser2 = Agent(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y, x = chaser_2_loc[0], y = chaser_2_loc[1])
161 |     runner= Agent(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y, x = runner_loc[0], y = runner_loc[1])
162 |     
163 |     # initialize Q_tables before training
164 |     q_table_c1 = Q_table(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y)
165 |     q_table_c2 = Q_table(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y)
166 |     q_table_r = Q_table(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y)
167 |     
168 |     for i in range(rounds):
169 |       
170 |       # states are (x, y, distance to the other agent)
171 |       
172 |       dstate_r = (runner.x, runner.y, min(runner.dist_x(chaser1),runner.dist_x(chaser2)), min(runner.dist_y(chaser1),runner.dist_y(chaser2)))
173 | 
174 |       dstate_c1 = (chaser1.x, chaser1.y, chaser1.dist_x(runner), chaser1.dist_y(runner))
175 | 
176 |       dstate_c2 = (chaser2.x, chaser2.y, chaser2.dist_x(runner), chaser2.dist_y(runner))
177 |       
178 |       #first action is a random one
179 |       
180 |       if i<exploration_steps: #for first ten turns explore
181 | 
182 |         action_c1 = np.random.randint(0,4)
183 |         action_c2 = np.random.randint(0,4)
184 |         action_r = np.random.randint(0,4)
185 | 
186 |       else: #for the rest of the episodes take the action that has
187 |           #the highest q-value for that state
188 | 
189 |         action_c1 = np.argmax(dstate_c1)
190 |         action_c2 = np.argmax(dstate_c2)
191 |         action_r = np.argmax(dstate_r)
192 | 
193 | 
194 |       #defining rewards
195 |       chaser_reward_1 = 1
196 |       chaser_reward_2 = 2
197 |       catch_reward = 3
198 |       runner_reward = 2
199 |       
200 | 
201 |       #taking actions 
202 | 
203 |       runner.action(action_r)
204 |       chaser1.action(action_c1)
205 |       chaser2.action(action_c2)
206 |       
207 |       #rewards of each agent
208 |       reward_1 = 0
209 |       reward_2 = 0
210 |       reward_r = 0
211 | 
212 |       #reward conditions
213 |       #chasers get reward when they get close to runner
214 | 
215 |       if runner.dist_x(chaser1)==2 or runner.dist_x(chaser2)==2:
216 |         reward_1 = chaser_reward_1
217 | 
218 |       elif runner.dist(chaser1)==1 or runner.dist(chaser2)==1:
219 |         reward_2 = chaser_reward_2
220 |       #runner gets reward when it is far from chasers
221 |       elif runner.dist(chaser1)>4 or runner.dist(chaser2)>4:
222 |         reward_r = runner_reward
223 | 
224 |       #both of the chasers get reward 
225 |       elif (runner.x==chaser1.x and runner.y==chaser1.y):
226 |         print("Game is over, runner is caught")
227 |         reward_1 = catch_reward
228 |         reward_r = -catch_reward
229 |         chasers_win += 1
230 |         break
231 |       elif(runner.x==chaser2.x and runner.y==chaser2.y):
232 |         print("Game is over, runner is caught")
233 |         reward_2 = catch_reward
234 |         reward_r = -catch_reward
235 |         chasers_win += 1
236 |         break
237 |       
238 |       #state updates  
239 |       new_dstate_c2 = ( chaser2.x, chaser2.y, chaser2.dist_x(runner), chaser2.dist_y(runner) )
240 |       new_dstate_c1 = ( chaser1.x, chaser1.y, chaser1.dist_x(runner), chaser1.dist_y(runner) )
241 |       new_dstate_r = ( runner.x, runner.y, min(runner.dist_x(chaser1), runner.dist_x(chaser2)), min(runner.dist_y(chaser1), runner.dist_y(chaser2)))
242 | 
243 |       # calculating cumulated future reward
244 |       future_qval_c1 = np.max(q_table_c1[new_dstate_c1])
245 | 
246 |       future_qval_c2 = np.max(q_table_c2[new_dstate_c2])
247 | 
248 |       future_qval_r = np.max(q_table_r[new_dstate_r])
249 | 
250 |       #retrieve q-values for each action
251 |       current_qval_c1 = q_table_c1[dstate_c1][action_c1]
252 |       current_qval_c2 = q_table_c2[dstate_c2][action_c2]
253 |       current_qval_r = q_table_r[dstate_r][action_r]
254 | 
255 |       #calculate q-values
256 |       new_qval_c1 = (1 - learning_rate) * current_qval_c1 + learning_rate * (reward_1 + gamma * future_qval_c1)
257 |       new_qval_c2 = (1 - learning_rate) * current_qval_c2 + learning_rate * (reward_2 + gamma * future_qval_c2)
258 |       new_qval_r = (1 - learning_rate) * current_qval_r + learning_rate * (reward_r + gamma * future_qval_r)
259 | 
260 | 
261 |       #update q-table
262 |       
263 |       q_table_c1[dstate_c1][action_c1] = new_qval_c1
264 |       q_table_c2[dstate_c2][action_c2] = new_qval_c2
265 |       q_table_r[dstate_r][action_r] = new_qval_r
266 |       
267 |       #interface
268 |     
269 |       if(show):
270 |         env = np.zeros((args.SIZE_X, args.SIZE_Y, 3), dtype=np.uint8)
271 |         env[runner.x][runner.y] = d["runner_color"]
272 |         env[chaser1.x][chaser1.y] = d["chaser1_color"]
273 |         env[chaser2.x][chaser2.y] = d["chaser2_color"]
274 | 
275 |         for i in blocks:
276 |           env[i[0]][i[1]] = d["block_color"]
277 |           
278 |         image = Image.fromarray(env, 'RGB')
279 |         image = image.resize((1300, 800), resample=Image.NEAREST)
280 | 
281 |         
282 |         cv2.imshow("ENV", np.array(image))
283 | 
284 | 
285 |         # if the runner is caught
286 |         if reward_1 == catch_reward or reward_2 == catch_reward:
287 |           if cv2.waitKey(50000) and 0xFF == ord('q'):
288 |             break
289 |         else:
290 |           if cv2.waitKey(1) & 0xFF == ord('q'):
291 |             break
292 | 
293 |   print(f"Chasers win: {chasers_win}")
294 |             


--------------------------------------------------------------------------------