├── agent_memory.py
├── preprocess_frame.py
├── debug.py
├── main.py
├── environment.py
└── the_agent.py

/agent_memory.py:
--------------------------------------------------------------------------------
from collections import deque

class Memory():
    """Replay memory: parallel deques of frames, actions, rewards, and done flags."""
    def __init__(self, max_len):
        self.max_len = max_len
        self.frames = deque(maxlen=max_len)
        self.actions = deque(maxlen=max_len)
        self.rewards = deque(maxlen=max_len)
        self.done_flags = deque(maxlen=max_len)

    def add_experience(self, next_frame, next_frames_reward, next_action, next_frame_terminal):
        self.frames.append(next_frame)
        self.actions.append(next_action)
        self.rewards.append(next_frames_reward)
        self.done_flags.append(next_frame_terminal)
--------------------------------------------------------------------------------
/preprocess_frame.py:
--------------------------------------------------------------------------------
"""
import cv2
import numpy as np


def resize_frame(frame):
    frame = frame[28:, 5:-4]
    frame = np.average(frame, axis=2)
    frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_NEAREST)
    frame = np.array(frame, dtype=np.uint8)
    return frame
"""

import cv2
import numpy as np


def resize_frame(frame):
    """Crop the raw Atari frame, convert it to grayscale, and resize it to 84x84 uint8."""
    frame = frame[30:-12, 5:-4]
    frame = np.average(frame, axis=2)
    frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_NEAREST)
    frame = np.array(frame, dtype=np.uint8)
    return frame
--------------------------------------------------------------------------------
/debug.py:
--------------------------------------------------------------------------------
import the_agent
import environment
import matplotlib.pyplot as plt
import matplotlib.animation as animation

name = 'PongDeterministic-v4'

# learn_rate is a required argument of Agent.__init__; .00025 matches the value used in main.py.
agent = the_agent.Agent(possible_actions=[0, 2, 3], starting_mem_len=50, max_mem_len=750000,
                        starting_epsilon=.5, learn_rate=.00025, debug=True)
env = environment.make_env(name, agent)

environment.play_episode(name, env, agent, debug=True)
env.close()

# Start at index 3 so every animated state has four valid frames, and stop at the last valid index.
for i in range(3, len(agent.memory.frames)):
    fig = plt.figure(figsize=(7, 7))
    state = [agent.memory.frames[i-3], agent.memory.frames[i-2], agent.memory.frames[i-1], agent.memory.frames[i]]
    for ind in range(4):
        state[ind] = [plt.imshow(state[ind], animated=True)]
    ani = animation.ArtistAnimation(fig, state, interval=750, blit=True, repeat_delay=250)

    plt.text(0, 0, 'Step: ' + str(i) + ' Reward: ' + str(agent.memory.rewards[i]) +
             '\nAction: ' + str(agent.memory.actions[i]) + ' Done: ' + str(agent.memory.done_flags[i]) + '\n',
             fontsize=14, fontweight='bold', bbox=dict(facecolor='white', alpha=0.5))
    plt.show()
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import the_agent
import environment
import matplotlib.pyplot as plt
import time
from collections import deque
import numpy as np

name = 'PongDeterministic-v4'

agent = the_agent.Agent(possible_actions=[0, 2, 3], starting_mem_len=50000, max_mem_len=750000,
                        starting_epsilon=1, learn_rate=.00025)
env = environment.make_env(name, agent)

last_100_avg = [-21]
scores = deque(maxlen=100)
max_score = -21

""" If testing:
agent.model.load_weights('recent_weights.hdf5')
agent.model_target.load_weights('recent_weights.hdf5')
agent.epsilon = 0.0
"""

env.reset()

for i in range(1000000):
    timesteps = agent.total_timesteps
    timee = time.time()
    score = environment.play_episode(name, env, agent, debug=False)  # set debug to True for rendering
    scores.append(score)
    if score > max_score:
        max_score = score

    print('\nEpisode: ' + str(i))
    print('Steps: ' + str(agent.total_timesteps - timesteps))
    print('Duration: ' + str(time.time() - timee))
    print('Score: ' + str(score))
    print('Max Score: ' + str(max_score))
    print('Epsilon: ' + str(agent.epsilon))

    if i % 100 == 0 and i != 0:
        last_100_avg.append(sum(scores)/len(scores))
        plt.plot(np.arange(0, i+1, 100), last_100_avg)
        plt.show()
--------------------------------------------------------------------------------
/environment.py:
--------------------------------------------------------------------------------
import gym
import preprocess_frame as ppf
import numpy as np


def initialize_new_game(name, env, agent):
    """We don't want an agent's past game influencing its new game, so we add some dummy data to initialize."""

    env.reset()
    starting_frame = ppf.resize_frame(env.step(0)[0])

    dummy_action = 0
    dummy_reward = 0
    dummy_done = False
    for i in range(3):
        agent.memory.add_experience(starting_frame, dummy_reward, dummy_action, dummy_done)

def make_env(name, agent):
    env = gym.make(name)
    return env

def take_step(name, env, agent, score, debug):

    #1 and 2: Update timesteps and save weights
    agent.total_timesteps += 1
    if agent.total_timesteps % 50000 == 0:
        agent.model.save_weights('recent_weights.hdf5')
        print('\nWeights saved!')

    #3: Take action
    next_frame, next_frames_reward, next_frame_terminal, info = env.step(agent.memory.actions[-1])

    #4: Get next state
    next_frame = ppf.resize_frame(next_frame)
    new_state = [agent.memory.frames[-3], agent.memory.frames[-2], agent.memory.frames[-1], next_frame]
    new_state = np.moveaxis(new_state, 0, 2)/255  # Keras expects [batch_size, rows, columns, channels]
    new_state = np.expand_dims(new_state, 0)  # add the batch dimension

    #5: Get next action, using next state
    next_action = agent.get_action(new_state)

    #6: If game is over, return the score
    if next_frame_terminal:
        agent.memory.add_experience(next_frame, next_frames_reward, next_action, next_frame_terminal)
        return (score + next_frames_reward), True

    #7: Now we add the next experience to memory
    agent.memory.add_experience(next_frame, next_frames_reward, next_action, next_frame_terminal)

    #8: If we are trying to debug this then render
    if debug:
        env.render()

    #9: If the memory threshold is satisfied, make the agent learn from memory
    if len(agent.memory.frames) > agent.starting_mem_len:
        agent.learn(debug)

    return (score + next_frames_reward), False

def play_episode(name, env, agent, debug=False):
    initialize_new_game(name, env, agent)
    done = False
    score = 0
    while True:
        score, done = take_step(name, env, agent, score, debug)
        if done:
            break
    return score
--------------------------------------------------------------------------------
/the_agent.py:
--------------------------------------------------------------------------------
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from agent_memory import Memory
import numpy as np
import random


class Agent():
    def __init__(self, possible_actions, starting_mem_len, max_mem_len, starting_epsilon, learn_rate, starting_lives=5, debug=False):
        self.memory = Memory(max_mem_len)
        self.possible_actions = possible_actions
        self.epsilon = starting_epsilon
        self.epsilon_decay = .9/100000
        self.epsilon_min = .05
        self.gamma = .95
        self.learn_rate = learn_rate
        self.model = self._build_model()
        self.model_target = clone_model(self.model)
        self.total_timesteps = 0
        self.lives = starting_lives  # this parameter does not apply to Pong
        self.starting_mem_len = starting_mem_len
        self.learns = 0


    def _build_model(self):
        model = Sequential()
        model.add(Input((84, 84, 4)))
        model.add(Conv2D(filters=32, kernel_size=(8, 8), strides=4, data_format="channels_last", activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters=64, kernel_size=(4, 4), strides=2, data_format="channels_last", activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Conv2D(filters=64, kernel_size=(3, 3), strides=1, data_format="channels_last", activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Flatten())
        model.add(Dense(512, activation='relu', kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2)))
        model.add(Dense(len(self.possible_actions), activation='linear'))
        optimizer = Adam(self.learn_rate)
        model.compile(optimizer, loss=tf.keras.losses.Huber())
        model.summary()
        print('\nAgent Initialized\n')
        return model

    def get_action(self, state):
        """Explore"""
        if np.random.rand() < self.epsilon:
            return random.sample(self.possible_actions, 1)[0]

        """Do Best Action"""
        a_index = np.argmax(self.model.predict(state))
        return self.possible_actions[a_index]

    def _index_valid(self, index):
        if self.memory.done_flags[index-3] or self.memory.done_flags[index-2] or self.memory.done_flags[index-1] or self.memory.done_flags[index]:
            return False
        else:
            return True

    def learn(self, debug=False):
        """We want output[action_taken] to be R_(t+1) + gamma * Qmax_(t+1), or just R_(t+1) if the next frame is terminal."""
        """So, if action 1 was taken, the target should be [output[0], R_(t+1) + gamma * Qmax_(t+1), output[2]]."""

        """First we need 32 random valid indices"""
        states = []
        next_states = []
        actions_taken = []
        next_rewards = []
        next_done_flags = []

        while len(states) < 32:
            index = np.random.randint(4, len(self.memory.frames) - 1)
            if self._index_valid(index):
                state = [self.memory.frames[index-3], self.memory.frames[index-2], self.memory.frames[index-1], self.memory.frames[index]]
                state = np.moveaxis(state, 0, 2)/255
                next_state = [self.memory.frames[index-2], self.memory.frames[index-1], self.memory.frames[index], self.memory.frames[index+1]]
                next_state = np.moveaxis(next_state, 0, 2)/255

                states.append(state)
                next_states.append(next_state)
                actions_taken.append(self.memory.actions[index])
                next_rewards.append(self.memory.rewards[index+1])
                next_done_flags.append(self.memory.done_flags[index+1])

        """Now we get the outputs from our model and from the target model; we need both to build the targets for the loss"""
        labels = self.model.predict(np.array(states))
        next_state_values = self.model_target.predict(np.array(next_states))

        """Now we define our labels, or what the output should have been:
           we want output[action_taken] to be R_(t+1) + gamma * Qmax_(t+1)"""
        for i in range(32):
            action = self.possible_actions.index(actions_taken[i])
            labels[i][action] = next_rewards[i] + (not next_done_flags[i]) * self.gamma * max(next_state_values[i])

        """Train our model using the states and targets generated"""
        self.model.fit(np.array(states), labels, batch_size=32, epochs=1, verbose=0)

        """Decrease epsilon and update how many times our agent has learned"""
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        self.learns += 1

        """Every 10,000 learn steps, copy our model weights to our target model"""
        if self.learns % 10000 == 0:
            self.model_target.set_weights(self.model.get_weights())
            print('\nTarget model updated')
--------------------------------------------------------------------------------
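
A minimal evaluation sketch, assembled from the "If testing" block in main.py and the helpers above. It assumes a recent_weights.hdf5 file already exists (environment.take_step saves one every 50,000 timesteps during training); epsilon is set to 0 so the agent acts greedily, and the large starting_mem_len keeps learn() from being triggered during the rollout.

import the_agent
import environment

name = 'PongDeterministic-v4'

# Same constructor arguments as main.py; the behaviour is determined by the loaded weights.
agent = the_agent.Agent(possible_actions=[0, 2, 3], starting_mem_len=50000, max_mem_len=750000,
                        starting_epsilon=1, learn_rate=.00025)
env = environment.make_env(name, agent)

# Load the most recently saved weights into both networks and disable exploration.
agent.model.load_weights('recent_weights.hdf5')
agent.model_target.load_weights('recent_weights.hdf5')
agent.epsilon = 0.0

score = environment.play_episode(name, env, agent, debug=True)  # debug=True renders the game
print('Evaluation score:', score)
env.close()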