├── CSE585_Term_Project.pdf ├── requirements.txt ├── README.md └── run.py /CSE585_Term_Project.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/merveenoyan/MARL-grid/main/CSE585_Term_Project.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.20.3 2 | opencv-python==4.5.2.52 3 | Pillow==8.2.0 4 | pygame==2.0.1 5 | argparse 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | To run the project, run: 2 | - create python environment of your choice with version 3.7 3 | - pip install -r requirements.txt 4 | - python run.py 5 | 6 | Couple of arguments you can pass: 7 | - "--runner": Location of runner as a list (str(List)) 8 | - "--chaser_2": Location of chaser 2 as a list (str(List)) 9 | - "--chaser_1": Location of chaser_1 as a list (str(List)) 10 | - "--blocks": List of location of blocks 11 | - "--SIZE_X": Horizontal size (int) 12 | - --SIZE_Y": Vertical size (int) 13 | - "--exploitation_steps": Exploitation steps (int) 14 | - "--exploration_steps: Exploration steps (int) 15 | - "--episodes": Episodes (int) 16 | - "--show_ep": Show every N episodes (int) 17 | - "--learning_rate": Learning rate (float) 18 | - "--gamma": "Discount factor for future rewards" (float) 19 | 20 | To-do: 21 | - Add direction to the state space (DONE) 22 | - Take user parameters through GUI (DONE) 23 | - Write environment as a separate class 24 | - Take second best action when agents try to go over blocks or get out of the board (currently they get stuck) 25 | 26 | 27 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import numpy as np 3 | import pickle 4 | import time 5 | from PIL import Image 6 | import cv2 7 | import pickle 8 | import math 9 | import time 10 | import argparse 11 | 12 | 13 | 14 | 15 | """ Single Agent class, environment is defined inside it, 16 | it is used to define runner and the chasers. 17 | Each agent has an x and y coordinates, used to calculate Q-values later on. 18 | Block configurations can either be given during instantiation, 19 | if not, blocks will have default values. They cannot take the initial positions of agents """ 20 | 21 | 22 | #size of the board, for this project, it's hard-coded 23 | #SIZE_X = 8 24 | #SIZE_Y = 8 25 | class Agent: 26 | 27 | def __init__(self, SIZE_X, SIZE_Y, x, y, blocks=None): 28 | ## defining empty cells and blocks 29 | self.x = x 30 | self.y = y 31 | self.SIZE_X = SIZE_X 32 | self.SIZE_Y = SIZE_Y 33 | env = np.zeros(shape = [self.SIZE_X,self.SIZE_Y]) 34 | if blocks == None: 35 | blocks = [[0,6],[3,5],[5,2]] 36 | 37 | 38 | 39 | def dist_x(self, other): 40 | return self.x-other.x 41 | 42 | def dist_y(self, other): 43 | return self.y-other.y 44 | 45 | #manhattan distance for penalties 46 | def dist(self, other): 47 | return(abs(self.x-other.x)+abs(self.y-other.y)) 48 | 49 | #action 50 | def action(self, choice): 51 | if choice == 0: #right 52 | self.move(x=1,y=0) 53 | elif choice == 1: #up 54 | self.move(x=0, y=1) 55 | elif choice == 2: #left 56 | self.move(x=-1, y=0) 57 | elif choice == 3: #down 58 | self.move(x=0, y=-1) 59 | 60 | def move(self, x=False, y=False): 61 | 62 | if not x: 63 | self.x += np.random.randint(-1, 2) 64 | else: 65 | self.x += x 66 | if not y: 67 | self.y += np.random.randint(-1,2) 68 | else: 69 | self.y += y 70 | 71 | #move 72 | 73 | new_x = self.x + x 74 | new_y = self.y + y 75 | 76 | #in case if they get out of table 77 | if self.x<0: 78 | self.x=0 79 | if self.x>=self.SIZE_X: 80 | self.x = self.SIZE_X-1 81 | if self.y<0: 82 | self.y=0 83 | if self.y>=self.SIZE_Y: 84 | self.y = self.SIZE_Y-1 85 | for i in blocks: 86 | if [self.x, self.y] == i: 87 | action = np.random.randint(0,4) 88 | self.action(action) 89 | #update positions 90 | 91 | 92 | 93 | #Q-table 94 | def Q_table(SIZE_X, SIZE_Y): 95 | 96 | q_table = {} 97 | 98 | for a in range(0, SIZE_X): #x coordinate of agent 99 | for b in range(0, SIZE_Y): #y coordinate of agent 100 | for c in range(-SIZE_X+1, SIZE_X): #distance between agents 101 | for d in range(-SIZE_Y+1, SIZE_Y): #distance between agents 102 | q_table[((a,b,c,d))]= [np.random.uniform(-4, 0) for i in range(5)] 103 | 104 | print(f"q-table is {q_table.keys()}") 105 | return q_table 106 | 107 | 108 | 109 | if __name__=="__main__": 110 | 111 | 112 | 113 | show = False 114 | parser = argparse.ArgumentParser() 115 | parser.add_argument("--runner", type=str, nargs = '?', default = "[0,0]", help = "Location of runner as a list") 116 | parser.add_argument("--chaser_2", type=str, nargs = '?', default = "[6,6]", help = "Location of chaser_2 as a list") 117 | parser.add_argument("--chaser_1", type=str, nargs = '?', default = "[6,5]", help = "Location of chaser_1 as a list") 118 | parser.add_argument("--blocks", type=str, nargs = '?', default = "[[0,6],[3,5],[5,2]]", help = "List of location of blocks") 119 | parser.add_argument("--SIZE_X", type=int, nargs = '?', default = 8, help = "Horizontal size") 120 | parser.add_argument("--SIZE_Y", type=int, nargs = '?', default = 8, help = "Vertical size") 121 | parser.add_argument("--exploitation_steps", type=int, nargs = '?', default = 150, help = "Exploitation steps") 122 | parser.add_argument("--exploration_steps", type=int, nargs = '?', default = 150, help = "Exploration steps") 123 | parser.add_argument("--episodes", type=int, nargs = '?', default = 100, help = "Episodes") 124 | parser.add_argument("--show_ep", type=int, nargs = '?', default = 10, help = "Show every N episodes") 125 | parser.add_argument("--learning_rate", type=float, nargs = '?', default = 0.1, help = "Learning rate") 126 | parser.add_argument("--gamma", type=float, nargs = '?', default = 0.1, help = "Discount factor for future rewards") 127 | args = parser.parse_args() 128 | 129 | exploration_steps = args.exploration_steps 130 | exploitation_steps = args.exploitation_steps 131 | show_ep = args.show_ep 132 | episodes = args.episodes 133 | learning_rate = args.learning_rate 134 | gamma = args.gamma 135 | SIZE_X = args.SIZE_X 136 | SIZE_Y = args.SIZE_Y 137 | runner_loc = eval(args.runner) 138 | chaser_1_loc = eval(args.chaser_1) 139 | chaser_2_loc = eval(args.chaser_2) 140 | blocks = eval(args.blocks) 141 | 142 | rounds = exploration_steps + exploitation_steps 143 | 144 | 145 | 146 | # RGB color coding 147 | d = {"runner_color":(0, 255, 0), "chaser1_color":(255,180, 20), "chaser2_color":(255,20,147), "block_color":(255, 255, 208)} 148 | 149 | 150 | 151 | chasers_win = 0 152 | for eps in range(episodes): 153 | 154 | if(eps%show_ep==0): 155 | show = True 156 | 157 | # initialize agents back to their original positions by the 158 | # beginning of every game 159 | chaser1 = Agent(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y, x = chaser_1_loc[0], y = chaser_1_loc[1]) 160 | chaser2 = Agent(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y, x = chaser_2_loc[0], y = chaser_2_loc[1]) 161 | runner= Agent(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y, x = runner_loc[0], y = runner_loc[1]) 162 | 163 | # initialize Q_tables before training 164 | q_table_c1 = Q_table(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y) 165 | q_table_c2 = Q_table(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y) 166 | q_table_r = Q_table(SIZE_X=SIZE_X, SIZE_Y=SIZE_Y) 167 | 168 | for i in range(rounds): 169 | 170 | # states are (x, y, distance to the other agent) 171 | 172 | dstate_r = (runner.x, runner.y, min(runner.dist_x(chaser1),runner.dist_x(chaser2)), min(runner.dist_y(chaser1),runner.dist_y(chaser2))) 173 | 174 | dstate_c1 = (chaser1.x, chaser1.y, chaser1.dist_x(runner), chaser1.dist_y(runner)) 175 | 176 | dstate_c2 = (chaser2.x, chaser2.y, chaser2.dist_x(runner), chaser2.dist_y(runner)) 177 | 178 | #first action is a random one 179 | 180 | if i4 or runner.dist(chaser2)>4: 222 | reward_r = runner_reward 223 | 224 | #both of the chasers get reward 225 | elif (runner.x==chaser1.x and runner.y==chaser1.y): 226 | print("Game is over, runner is caught") 227 | reward_1 = catch_reward 228 | reward_r = -catch_reward 229 | chasers_win += 1 230 | break 231 | elif(runner.x==chaser2.x and runner.y==chaser2.y): 232 | print("Game is over, runner is caught") 233 | reward_2 = catch_reward 234 | reward_r = -catch_reward 235 | chasers_win += 1 236 | break 237 | 238 | #state updates 239 | new_dstate_c2 = ( chaser2.x, chaser2.y, chaser2.dist_x(runner), chaser2.dist_y(runner) ) 240 | new_dstate_c1 = ( chaser1.x, chaser1.y, chaser1.dist_x(runner), chaser1.dist_y(runner) ) 241 | new_dstate_r = ( runner.x, runner.y, min(runner.dist_x(chaser1), runner.dist_x(chaser2)), min(runner.dist_y(chaser1), runner.dist_y(chaser2))) 242 | 243 | # calculating cumulated future reward 244 | future_qval_c1 = np.max(q_table_c1[new_dstate_c1]) 245 | 246 | future_qval_c2 = np.max(q_table_c2[new_dstate_c2]) 247 | 248 | future_qval_r = np.max(q_table_r[new_dstate_r]) 249 | 250 | #retrieve q-values for each action 251 | current_qval_c1 = q_table_c1[dstate_c1][action_c1] 252 | current_qval_c2 = q_table_c2[dstate_c2][action_c2] 253 | current_qval_r = q_table_r[dstate_r][action_r] 254 | 255 | #calculate q-values 256 | new_qval_c1 = (1 - learning_rate) * current_qval_c1 + learning_rate * (reward_1 + gamma * future_qval_c1) 257 | new_qval_c2 = (1 - learning_rate) * current_qval_c2 + learning_rate * (reward_2 + gamma * future_qval_c2) 258 | new_qval_r = (1 - learning_rate) * current_qval_r + learning_rate * (reward_r + gamma * future_qval_r) 259 | 260 | 261 | #update q-table 262 | 263 | q_table_c1[dstate_c1][action_c1] = new_qval_c1 264 | q_table_c2[dstate_c2][action_c2] = new_qval_c2 265 | q_table_r[dstate_r][action_r] = new_qval_r 266 | 267 | #interface 268 | 269 | if(show): 270 | env = np.zeros((args.SIZE_X, args.SIZE_Y, 3), dtype=np.uint8) 271 | env[runner.x][runner.y] = d["runner_color"] 272 | env[chaser1.x][chaser1.y] = d["chaser1_color"] 273 | env[chaser2.x][chaser2.y] = d["chaser2_color"] 274 | 275 | for i in blocks: 276 | env[i[0]][i[1]] = d["block_color"] 277 | 278 | image = Image.fromarray(env, 'RGB') 279 | image = image.resize((1300, 800), resample=Image.NEAREST) 280 | 281 | 282 | cv2.imshow("ENV", np.array(image)) 283 | 284 | 285 | # if the runner is caught 286 | if reward_1 == catch_reward or reward_2 == catch_reward: 287 | if cv2.waitKey(50000) and 0xFF == ord('q'): 288 | break 289 | else: 290 | if cv2.waitKey(1) & 0xFF == ord('q'): 291 | break 292 | 293 | print(f"Chasers win: {chasers_win}") 294 | --------------------------------------------------------------------------------