├── agent_memory.py
├── preprocess_frame.py
├── debug.py
├── main.py
├── environment.py
└── the_agent.py

/agent_memory.py:
--------------------------------------------------------------------------------
from collections import deque

class Memory():
    """Replay memory: parallel deques of frames, actions, rewards, and done flags."""
    def __init__(self, max_len):
        self.max_len = max_len
        self.frames = deque(maxlen=max_len)
        self.actions = deque(maxlen=max_len)
        self.rewards = deque(maxlen=max_len)
        self.done_flags = deque(maxlen=max_len)

    def add_experience(self, next_frame, next_frames_reward, next_action, next_frame_terminal):
        self.frames.append(next_frame)
        self.actions.append(next_action)
        self.rewards.append(next_frames_reward)
        self.done_flags.append(next_frame_terminal)
--------------------------------------------------------------------------------
/preprocess_frame.py:
--------------------------------------------------------------------------------
"""
import cv2
import numpy as np


def resize_frame(frame):
    frame = frame[28:, 5:-4]
    frame = np.average(frame, axis=2)
    frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_NEAREST)
    frame = np.array(frame, dtype=np.uint8)
    return frame
"""

import cv2
import numpy as np


def resize_frame(frame):
    """Crop the raw Atari frame, convert it to grayscale, and resize it to 84x84 uint8."""
    frame = frame[30:-12, 5:-4]
    frame = np.average(frame, axis=2)
    frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_NEAREST)
    frame = np.array(frame, dtype=np.uint8)
    return frame
--------------------------------------------------------------------------------
/debug.py:
--------------------------------------------------------------------------------
import the_agent
import environment
import matplotlib.pyplot as plt
import matplotlib.animation as animation

name = 'PongDeterministic-v4'

# learn_rate is a required argument of Agent.__init__; .00025 matches the value used in main.py.
agent = the_agent.Agent(possible_actions=[0, 2, 3], starting_mem_len=50, max_mem_len=750000,
                        starting_epsilon=.5, learn_rate=.00025, debug=True)
env = environment.make_env(name, agent)

environment.play_episode(name, env, agent, debug=True)
env.close()

# Start at index 3 so every animated state has four valid frames, and stop at the last valid index.
for i in range(3, len(agent.memory.frames)):
    fig = plt.figure(figsize=(7, 7))
    state = [agent.memory.frames[i-3], agent.memory.frames[i-2], agent.memory.frames[i-1], agent.memory.frames[i]]
    for ind in range(4):
        state[ind] = [plt.imshow(state[ind], animated=True)]
    ani = animation.ArtistAnimation(fig, state, interval=750, blit=True, repeat_delay=250)

    plt.text(0, 0, 'Step: ' + str(i) + ' Reward: ' + str(agent.memory.rewards[i]) +
             '\nAction: ' + str(agent.memory.actions[i]) + ' Done: ' + str(agent.memory.done_flags[i]) + '\n',
             fontsize=14, fontweight='bold', bbox=dict(facecolor='white', alpha=0.5))
    plt.show()
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import the_agent
import environment
import matplotlib.pyplot as plt
import time
from collections import deque
import numpy as np

name = 'PongDeterministic-v4'

agent = the_agent.Agent(possible_actions=[0, 2, 3], starting_mem_len=50000, max_mem_len=750000,
                        starting_epsilon=1, learn_rate=.00025)
env = environment.make_env(name, agent)

last_100_avg = [-21]
scores = deque(maxlen=100)
max_score = -21

""" If testing:
agent.model.load_weights('recent_weights.hdf5')
agent.model_target.load_weights('recent_weights.hdf5')
agent.epsilon = 0.0
"""

env.reset()

for i in range(1000000):
    timesteps = agent.total_timesteps
    timee = time.time()
    score = environment.play_episode(name, env, agent, debug=False)  # set debug to True for rendering
    scores.append(score)
    if score > max_score:
        max_score = score

    print('\nEpisode: ' + str(i))
    print('Steps: ' + str(agent.total_timesteps - timesteps))
    print('Duration: ' + str(time.time() - timee))
    print('Score: ' + str(score))
    print('Max Score: ' + str(max_score))
    print('Epsilon: ' + str(agent.epsilon))

    if i % 100 == 0 and i != 0:
        last_100_avg.append(sum(scores)/len(scores))
        plt.plot(np.arange(0, i+1, 100), last_100_avg)
        plt.show()
--------------------------------------------------------------------------------
/environment.py:
--------------------------------------------------------------------------------
import gym
import preprocess_frame as ppf
import numpy as np


def initialize_new_game(name, env, agent):
    """We don't want an agent's past game influencing its new game, so we add some dummy data to initialize."""

    env.reset()
    starting_frame = ppf.resize_frame(env.step(0)[0])

    dummy_action = 0
    dummy_reward = 0
    dummy_done = False
    for i in range(3):
        agent.memory.add_experience(starting_frame, dummy_reward, dummy_action, dummy_done)

def make_env(name, agent):
    env = gym.make(name)
    return env

def take_step(name, env, agent, score, debug):

    #1 and 2: Update timesteps and save weights
    agent.total_timesteps += 1
    if agent.total_timesteps % 50000 == 0:
        agent.model.save_weights('recent_weights.hdf5')
        print('\nWeights saved!')

    #3: Take action
    next_frame, next_frames_reward, next_frame_terminal, info = env.step(agent.memory.actions[-1])

    #4: Get next state
    next_frame = ppf.resize_frame(next_frame)
    new_state = [agent.memory.frames[-3], agent.memory.frames[-2], agent.memory.frames[-1], next_frame]
    new_state = np.moveaxis(new_state, 0, 2)/255  # Keras expects [batch_size, rows, columns, channels]
    new_state = np.expand_dims(new_state, 0)  # add the batch dimension

    #5: Get next action, using next state
    next_action = agent.get_action(new_state)

    #6: If game is over, return the score
    if next_frame_terminal:
        agent.memory.add_experience(next_frame, next_frames_reward, next_action, next_frame_terminal)
        return (score + next_frames_reward), True

    #7: Now we add the next experience to memory
    agent.memory.add_experience(next_frame, next_frames_reward, next_action, next_frame_terminal)

    #8: If we are trying to debug this then render
    if debug:
        env.render()

    #9: If the memory threshold is satisfied, make the agent learn from memory
    if len(agent.memory.frames) > agent.starting_mem_len:
        agent.learn(debug)

    return (score + next_frames_reward), False

def play_episode(name, env, agent, debug=False):
    initialize_new_game(name, env, agent)
    done = False
    score = 0
    while True:
        score, done = take_step(name, env, agent, score, debug)
        if done:
            break
    return score
--------------------------------------------------------------------------------
/the_agent.py:
--------------------------------------------------------------------------------
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from agent_memory import Memory
import numpy as np
import random


class Agent():
    def __init__(self, possible_actions, starting_mem_len, max_mem_len, starting_epsilon, learn_rate, starting_lives=5, debug=False):
        self.memory = Memory(max_mem_len)
        self.possible_actions = possible_actions
        self.epsilon = starting_epsilon
        self.epsilon_decay = .9/100000
        self.epsilon_min = .05
        self.gamma = .95
        self.learn_rate = learn_rate
        self.model = self._build_model()
        self.model_target = clone_model(self.model)
        self.total_timesteps = 0
        self.lives = starting_lives  # this parameter does not apply to Pong
        self.starting_mem_len = starting_mem_len
        self.learns = 0


    def _build_model(self):
        model = Sequential()
        model.add(Input((84, 84, 4)))
        model.add(Conv2D(filters=32, kernel_size=(8, 8), strides=4, data_format="channels_last", activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters=64, kernel_size=(4, 4), strides=2, data_format="channels_last", activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters=64, kernel_size=(3, 3), strides=1, data_format="channels_last", activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Flatten())
        model.add(Dense(512, activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Dense(len(self.possible_actions), activation='linear'))
        optimizer = Adam(self.learn_rate)
        model.compile(optimizer, loss=tf.keras.losses.Huber())
        model.summary()
        print('\nAgent Initialized\n')
        return model

    def get_action(self, state):
        """Explore"""
        if np.random.rand() < self.epsilon:
            return random.sample(self.possible_actions, 1)[0]

        """Do Best Action"""
        a_index = np.argmax(self.model.predict(state))
        return self.possible_actions[a_index]

    def _index_valid(self, index):
        if self.memory.done_flags[index-3] or self.memory.done_flags[index-2] or self.memory.done_flags[index-1] or self.memory.done_flags[index]:
            return False
        else:
            return True

    def learn(self, debug=False):
        """We want output[action_taken] to be R_(t+1) + gamma * Qmax_(t+1), or just R_(t+1) if the next frame is terminal."""
        """So, if action 1 was taken, the target should be [output[0], R_(t+1) + gamma * Qmax_(t+1), output[2]]."""

        """First we need 32 random valid indices"""
        states = []
        next_states = []
        actions_taken = []
        next_rewards = []
        next_done_flags = []

        while len(states) < 32:
            index = np.random.randint(4, len(self.memory.frames) - 1)
            if self._index_valid(index):
                state = [self.memory.frames[index-3], self.memory.frames[index-2], self.memory.frames[index-1], self.memory.frames[index]]
                state = np.moveaxis(state, 0, 2)/255
                next_state = [self.memory.frames[index-2], self.memory.frames[index-1], self.memory.frames[index], self.memory.frames[index+1]]
                next_state = np.moveaxis(next_state, 0, 2)/255

                states.append(state)
                next_states.append(next_state)
                actions_taken.append(self.memory.actions[index])
                next_rewards.append(self.memory.rewards[index+1])
                next_done_flags.append(self.memory.done_flags[index+1])

        """Now we get the outputs from our model and from the target model; we need both to build the targets for the loss"""
        labels = self.model.predict(np.array(states))
        next_state_values = self.model_target.predict(np.array(next_states))

        """Now we define our labels, or what the output should have been:
           we want output[action_taken] to be R_(t+1) + gamma * Qmax_(t+1)"""
        for i in range(32):
            action = self.possible_actions.index(actions_taken[i])
            labels[i][action] = next_rewards[i] + (not next_done_flags[i]) * self.gamma * max(next_state_values[i])

        """Train our model using the states and targets generated"""
        self.model.fit(np.array(states), labels, batch_size=32, epochs=1, verbose=0)

        """Decrease epsilon and update how many times our agent has learned"""
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        self.learns += 1

        """Every 10,000 learn steps, copy our model weights to our target model"""
        if self.learns % 10000 == 0:
            self.model_target.set_weights(self.model.get_weights())
            print('\nTarget model updated')
--------------------------------------------------------------------------------
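
A minimal evaluation sketch, assembled from the "If testing" block in main.py and the helpers above. It assumes a recent_weights.hdf5 file already exists (environment.take_step saves one every 50,000 timesteps during training); epsilon is set to 0 so the agent acts greedily, and the large starting_mem_len keeps learn() from being triggered during the rollout.

import the_agent
import environment

name = 'PongDeterministic-v4'

# Same constructor arguments as main.py; the behaviour is determined by the loaded weights.
agent = the_agent.Agent(possible_actions=[0, 2, 3], starting_mem_len=50000, max_mem_len=750000,
                        starting_epsilon=1, learn_rate=.00025)
env = environment.make_env(name, agent)

# Load the most recently saved weights into both networks and disable exploration.
agent.model.load_weights('recent_weights.hdf5')
agent.model_target.load_weights('recent_weights.hdf5')
agent.epsilon = 0.0

score = environment.play_episode(name, env, agent, debug=True)  # debug=True renders the game
print('Evaluation score:', score)
env.close()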