├── 01_CartPole-reinforcement-learning ├── Cartpole_DQN.py ├── IMAGES │ ├── CartPole_test.gif │ ├── image.png │ ├── math.PNG │ ├── testing_model.PNG │ └── training_model.PNG ├── cartpole-dqn.h5 └── cartpole_random.py ├── 02_CartPole-reinforcement-learning_DDQN ├── Cartpole_DDQN.py ├── Cartpole_DDQN_TF2.py └── IMAGES │ ├── DDQN_CartPole-v1.png │ ├── DDQN_CartPole-v1_soft.png │ └── DQN_CartPole-v1.png ├── 03_CartPole-reinforcement-learning_Dueling_DDQN ├── Cartpole_Double_DDQN.py ├── Cartpole_Double_DDQN_TF2.py └── IMAGES │ ├── DDQN_CartPole-v1.png │ ├── DDQN_CartPole-v1_Dueling.png │ └── DQN_CartPole-v1_Dueling.png ├── 04_CartPole-reinforcement-learning_e_greedy_D3QN ├── Cartpole_e_greedy_D3QN.py ├── Cartpole_e_greedy_D3QN_TF2.py └── IMAGES │ └── DDQN_CartPole-v1_Dueling_Greedy.png ├── 05_CartPole-reinforcement-learning_PER_D3QN ├── Cartpole_PER_D3QN.py ├── Cartpole_PER_D3QN_TF2.py ├── IMAGES │ ├── DDQN_CartPole-v1_Dueling.png │ ├── DDQN_CartPole-v1_Dueling_PER.png │ ├── Replay_buffer.png │ └── SumTree.png └── PER.py ├── 06_CartPole-reinforcement-learning_PER_D3QN_CNN ├── Cartpole_PER_D3QN_CNN.py ├── Cartpole_PER_D3QN_CNN_TF2.py ├── PER.py └── random_game.py ├── 07_Pong-reinforcement-learning_DQN_CNN ├── IMAGES │ ├── DDQN_Pong-v0_CNN.png │ ├── DDQN_Pong-v0_Dueling_CNN.png │ ├── DDQN_Pong-v0_Dueling_PER_CNN.png │ └── DQN_Pong-v0_CNN.png ├── Models │ ├── Pong-v0_DDQN_CNN.h5 │ ├── Pong-v0_DDQN_Dueling_CNN.h5 │ ├── Pong-v0_DDQN_Dueling_PER_CNN.h5 │ └── Pong-v0_DQN_CNN.h5 ├── PER.py ├── Pong-v0_DQN_CNN.py └── Pong-v0_DQN_CNN_TF2.py ├── 08_Pong-v0_Policy_gradient ├── IMAGES │ ├── Pong-v0_PG_2.5e-05.png │ └── PongDeterministic-v4_PG_0.0001.png ├── Pong-v0_PG.py └── Pong-v0_PG_TF2.py ├── 09_Pong-v0_A2C ├── IMAGES │ ├── Pong-v0_A2C_2.5e-05.png │ └── PongDeterministic-v4_A2C_2.5e-05.png ├── Pong-v0_A2C.py └── Pong-v0_A2C_TF2.py ├── 10_Pong-v0_A3C ├── Pong-v0_A3C.py ├── Pong-v0_A3C_TF2.py └── PongDeterministic-v4_A3C_2.5e-05.png ├── 11_Pong-v0_PPO ├── Models │ └── Pong-v0_APPO_0.0001_Actor_CNN.h5 ├── Pong-v0_APPO_0.0001_CNN.png ├── Pong-v0_APPO_0.0001_RMSprop.png ├── Pong-v0_PPO.py ├── Pong-v0_PPO_TF2.py ├── Pong-v0_PPO_gif.py ├── PongDeterministic-v4_APPO_0.0001.png ├── gameplay.gif └── gameplay_CNN.gif ├── BipedalWalker-v3_PPO ├── BipedalWalker-v3_PPO.py ├── BipedalWalker-v3_PPO_Actor.h5 ├── BipedalWalker-v3_PPO_Critic.h5 ├── BipedalWalker-v3_training.png └── gameplay.gif ├── LICENSE.md ├── LunarLander-v2_PPO ├── LunarLander-v2.png ├── LunarLander-v2_PPO.py ├── LunarLander-v2_PPO_Actor.h5 ├── LunarLander-v2_PPO_Critic.h5 └── gameplay.gif ├── README.md └── requirements.txt /01_CartPole-reinforcement-learning/Cartpole_DQN.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 6 | import random 7 | import gym 8 | import numpy as np 9 | from collections import deque 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense 12 | from keras.optimizers import Adam, RMSprop 13 | 14 | 15 | def OurModel(input_shape, action_space): 16 | X_input = Input(input_shape) 17 | 18 | # 'Dense' is the basic form of a neural network layer 19 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 20 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X_input) 21 | 22 | # Hidden layer with 256 nodes 23 | X = Dense(256, activation="relu", 
kernel_initializer='he_uniform')(X) 24 | 25 | # Hidden layer with 64 nodes 26 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 27 | 28 | # Output Layer with # of actions: 2 nodes (left, right) 29 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 30 | 31 | model = Model(inputs = X_input, outputs = X, name='CartPole DQN model') 32 | model.compile(loss="mse", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 33 | 34 | model.summary() 35 | return model 36 | 37 | class DQNAgent: 38 | def __init__(self): 39 | self.env = gym.make('CartPole-v1') 40 | # by default, CartPole-v1 has max episode steps = 500 41 | self.state_size = self.env.observation_space.shape[0] 42 | self.action_size = self.env.action_space.n 43 | self.EPISODES = 1000 44 | self.memory = deque(maxlen=2000) 45 | 46 | self.gamma = 0.95 # discount rate 47 | self.epsilon = 1.0 # exploration rate 48 | self.epsilon_min = 0.001 49 | self.epsilon_decay = 0.999 50 | self.batch_size = 64 51 | self.train_start = 1000 52 | 53 | # create main model 54 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size) 55 | 56 | def remember(self, state, action, reward, next_state, done): 57 | self.memory.append((state, action, reward, next_state, done)) 58 | if len(self.memory) > self.train_start: 59 | if self.epsilon > self.epsilon_min: 60 | self.epsilon *= self.epsilon_decay 61 | 62 | def act(self, state): 63 | if np.random.random() <= self.epsilon: 64 | return random.randrange(self.action_size) 65 | else: 66 | return np.argmax(self.model.predict(state)) 67 | 68 | def replay(self): 69 | if len(self.memory) < self.train_start: 70 | return 71 | # Randomly sample minibatch from the memory 72 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size)) 73 | 74 | state = np.zeros((self.batch_size, self.state_size)) 75 | next_state = np.zeros((self.batch_size, self.state_size)) 76 | action, reward, done = [], [], [] 77 | 78 | # do this before prediction 79 | # for speedup, this could be done on the tensor level 80 | # but easier to understand using a loop 81 | for i in range(self.batch_size): 82 | state[i] = minibatch[i][0] 83 | action.append(minibatch[i][1]) 84 | reward.append(minibatch[i][2]) 85 | next_state[i] = minibatch[i][3] 86 | done.append(minibatch[i][4]) 87 | 88 | # do batch prediction to save speed 89 | target = self.model.predict(state) 90 | target_next = self.model.predict(next_state) 91 | 92 | for i in range(self.batch_size): 93 | # correction on the Q value for the action used 94 | if done[i]: 95 | target[i][action[i]] = reward[i] 96 | else: 97 | # Standard - DQN 98 | # DQN chooses the max Q value among next actions 99 | # selection and evaluation of action is on the target Q Network 100 | # Q_max = max_a' Q_target(s', a') 101 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 102 | 103 | # Train the Neural Network with batches 104 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 105 | 106 | 107 | def load(self, name): 108 | self.model = load_model(name) 109 | 110 | def save(self, name): 111 | self.model.save(name) 112 | 113 | def run(self): 114 | for e in range(self.EPISODES): 115 | state = self.env.reset() 116 | state = np.reshape(state, [1, self.state_size]) 117 | done = False 118 | i = 0 119 | while not done: 120 | self.env.render() 121 | action = self.act(state) 122 | next_state, reward, done, _ = self.env.step(action) 123 | next_state = np.reshape(next_state, 
[1, self.state_size]) 124 | if not done or i == self.env._max_episode_steps-1: 125 | reward = reward 126 | else: 127 | reward = -100 128 | self.remember(state, action, reward, next_state, done) 129 | state = next_state 130 | i += 1 131 | if done: 132 | print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon)) 133 | if i == 500: 134 | print("Saving trained model as cartpole-dqn.h5") 135 | self.save("cartpole-dqn.h5") 136 | return 137 | self.replay() 138 | 139 | def test(self): 140 | self.load("cartpole-dqn.h5") 141 | for e in range(self.EPISODES): 142 | state = self.env.reset() 143 | state = np.reshape(state, [1, self.state_size]) 144 | done = False 145 | i = 0 146 | while not done: 147 | self.env.render() 148 | action = np.argmax(self.model.predict(state)) 149 | next_state, reward, done, _ = self.env.step(action) 150 | state = np.reshape(next_state, [1, self.state_size]) 151 | i += 1 152 | if done: 153 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 154 | break 155 | 156 | if __name__ == "__main__": 157 | agent = DQNAgent() 158 | #agent.run() 159 | agent.test() 160 | -------------------------------------------------------------------------------- /01_CartPole-reinforcement-learning/IMAGES/CartPole_test.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/CartPole_test.gif -------------------------------------------------------------------------------- /01_CartPole-reinforcement-learning/IMAGES/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/image.png -------------------------------------------------------------------------------- /01_CartPole-reinforcement-learning/IMAGES/math.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/math.PNG -------------------------------------------------------------------------------- /01_CartPole-reinforcement-learning/IMAGES/testing_model.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/testing_model.PNG -------------------------------------------------------------------------------- /01_CartPole-reinforcement-learning/IMAGES/training_model.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/training_model.PNG -------------------------------------------------------------------------------- /01_CartPole-reinforcement-learning/cartpole-dqn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/cartpole-dqn.h5 -------------------------------------------------------------------------------- 
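The per-sample loop in replay() in Cartpole_DQN.py above notes that the target construction "could be done on the tensor level"; the sketch below shows that same Bellman update vectorized with NumPy. It is illustrative only and not part of the repository: build_dqn_targets is a hypothetical helper whose arguments are assumed to be shaped like the state/next_state arrays and action/reward/done lists built inside replay().

import numpy as np

def build_dqn_targets(model, gamma, state, action, reward, next_state, done):
    # Q(s, .) from the online network; only the entries for the taken actions are overwritten
    target = model.predict(state)
    # Q(s', .) used for the max_a' term of the Bellman target
    target_next = model.predict(next_state)
    action = np.asarray(action)
    not_done = 1.0 - np.asarray(done, dtype=np.float32)
    # y = r                              if the transition ended the episode
    # y = r + gamma * max_a' Q(s', a')   otherwise
    target[np.arange(len(action)), action] = (
        np.asarray(reward) + gamma * np.max(target_next, axis=1) * not_done
    )
    return target
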
/01_CartPole-reinforcement-learning/cartpole_random.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | 4 | env = gym.make("CartPole-v1") 5 | 6 | def Random_games(): 7 | # Each of this episode is its own game. 8 | for episode in range(10): 9 | env.reset() 10 | # this is each frame, up to 500...but we wont make it that far with random. 11 | for t in range(500): 12 | # This will display the environment 13 | # Only display if you really want to see it. 14 | # Takes much longer to display it. 15 | env.render() 16 | 17 | # This will just create a sample action in any environment. 18 | # In this environment, the action can be 0 or 1, which is left or right 19 | action = env.action_space.sample() 20 | 21 | # this executes the environment with an action, 22 | # and returns the observation of the environment, 23 | # the reward, if the env is over, and other info. 24 | next_state, reward, done, info = env.step(action) 25 | 26 | # lets print everything in one line: 27 | print(t, next_state, reward, done, info, action) 28 | if done: 29 | break 30 | 31 | Random_games() -------------------------------------------------------------------------------- /02_CartPole-reinforcement-learning_DDQN/Cartpole_DDQN.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense 12 | from keras.optimizers import Adam, RMSprop 13 | 14 | 15 | def OurModel(input_shape, action_space): 16 | X_input = Input(input_shape) 17 | X = X_input 18 | 19 | # 'Dense' is the basic form of a neural network layer 20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 22 | 23 | # Hidden layer with 256 nodes 24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 25 | 26 | # Hidden layer with 64 nodes 27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 28 | 29 | # Output Layer with # of actions: 2 nodes (left, right) 30 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 31 | 32 | model = Model(inputs = X_input, outputs = X, name='CartPole DDQN model') 33 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 34 | 35 | model.summary() 36 | return model 37 | 38 | class DQNAgent: 39 | def __init__(self, env_name): 40 | self.env_name = env_name 41 | self.env = gym.make(env_name) 42 | self.env.seed(0) 43 | # by default, CartPole-v1 has max episode steps = 500 44 | self.env._max_episode_steps = 4000 45 | self.state_size = self.env.observation_space.shape[0] 46 | self.action_size = self.env.action_space.n 47 | 48 | self.EPISODES = 1000 49 | self.memory = deque(maxlen=2000) 50 | 51 | self.gamma = 0.95 # discount rate 52 | self.epsilon = 1.0 # exploration rate 53 | self.epsilon_min = 0.01 54 | self.epsilon_decay = 0.999 55 | self.batch_size = 32 56 | self.train_start = 1000 57 | 58 | # defining model parameters 59 | self.ddqn = True 60 | self.Soft_Update = False 61 | 62 | self.TAU = 0.1 # target network soft update hyperparameter 63 | 64 | self.Save_Path = 'Models' 65 | self.scores, self.episodes, 
self.average = [], [], [] 66 | 67 | if self.ddqn: 68 | print("----------Double DQN--------") 69 | self.Model_name = os.path.join(self.Save_Path,"DDQN_"+self.env_name+".h5") 70 | else: 71 | print("-------------DQN------------") 72 | self.Model_name = os.path.join(self.Save_Path,"DQN_"+self.env_name+".h5") 73 | 74 | # create main model 75 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size) 76 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size) 77 | 78 | # after some time interval update the target model to be same with model 79 | def update_target_model(self): 80 | if not self.Soft_Update and self.ddqn: 81 | self.target_model.set_weights(self.model.get_weights()) 82 | return 83 | if self.Soft_Update and self.ddqn: 84 | q_model_theta = self.model.get_weights() 85 | target_model_theta = self.target_model.get_weights() 86 | counter = 0 87 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 88 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 89 | target_model_theta[counter] = target_weight 90 | counter += 1 91 | self.target_model.set_weights(target_model_theta) 92 | 93 | def remember(self, state, action, reward, next_state, done): 94 | self.memory.append((state, action, reward, next_state, done)) 95 | if len(self.memory) > self.train_start: 96 | if self.epsilon > self.epsilon_min: 97 | self.epsilon *= self.epsilon_decay 98 | 99 | def act(self, state): 100 | if np.random.random() <= self.epsilon: 101 | return random.randrange(self.action_size) 102 | else: 103 | return np.argmax(self.model.predict(state)) 104 | 105 | def replay(self): 106 | if len(self.memory) < self.train_start: 107 | return 108 | # Randomly sample minibatch from the memory 109 | minibatch = random.sample(self.memory, min(self.batch_size, self.batch_size)) 110 | 111 | state = np.zeros((self.batch_size, self.state_size)) 112 | next_state = np.zeros((self.batch_size, self.state_size)) 113 | action, reward, done = [], [], [] 114 | 115 | # do this before prediction 116 | # for speedup, this could be done on the tensor level 117 | # but easier to understand using a loop 118 | for i in range(self.batch_size): 119 | state[i] = minibatch[i][0] 120 | action.append(minibatch[i][1]) 121 | reward.append(minibatch[i][2]) 122 | next_state[i] = minibatch[i][3] 123 | done.append(minibatch[i][4]) 124 | 125 | # do batch prediction to save speed 126 | target = self.model.predict(state) 127 | target_next = self.model.predict(next_state) 128 | target_val = self.target_model.predict(next_state) 129 | 130 | for i in range(len(minibatch)): 131 | # correction on the Q value for the action used 132 | if done[i]: 133 | target[i][action[i]] = reward[i] 134 | else: 135 | if self.ddqn: # Double - DQN 136 | # current Q Network selects the action 137 | # a'_max = argmax_a' Q(s', a') 138 | a = np.argmax(target_next[i]) 139 | # target Q Network evaluates the action 140 | # Q_max = Q_target(s', a'_max) 141 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 142 | else: # Standard - DQN 143 | # DQN chooses the max Q value among next actions 144 | # selection and evaluation of action is on the target Q Network 145 | # Q_max = max_a' Q_target(s', a') 146 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 147 | 148 | # Train the Neural Network with batches 149 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 150 | 151 | 152 | def load(self, name): 153 | self.model = load_model(name) 
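    # Worked illustration (hypothetical numbers, not produced by this code) of the
    # Double-DQN correction built in replay() above: if target_next[i] = [0.9, 1.4],
    # the online network selects a = argmax = 1, and it is the target network's
    # estimate for that action, e.g. target_val[i][1] = 1.1, that enters the update:
    #     y = reward[i] + gamma * 1.1
    # Decoupling selection (online network) from evaluation (target network) is what
    # reduces the overestimation bias of taking a single network's max directly.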
154 | 155 | def save(self, name): 156 | self.model.save(name) 157 | 158 | pylab.figure(figsize=(18, 9)) 159 | def PlotModel(self, score, episode): 160 | self.scores.append(score) 161 | self.episodes.append(episode) 162 | self.average.append(sum(self.scores) / len(self.scores)) 163 | pylab.plot(self.episodes, self.average, 'r') 164 | pylab.plot(self.episodes, self.scores, 'b') 165 | pylab.ylabel('Score', fontsize=18) 166 | pylab.xlabel('Steps', fontsize=18) 167 | dqn = 'DQN_' 168 | softupdate = '' 169 | if self.ddqn: 170 | dqn = 'DDQN_' 171 | if self.Soft_Update: 172 | softupdate = '_soft' 173 | try: 174 | pylab.savefig(dqn+self.env_name+softupdate+".png") 175 | except OSError: 176 | pass 177 | 178 | return str(self.average[-1])[:5] 179 | 180 | def run(self): 181 | for e in range(self.EPISODES): 182 | state = self.env.reset() 183 | state = np.reshape(state, [1, self.state_size]) 184 | done = False 185 | i = 0 186 | while not done: 187 | #self.env.render() 188 | action = self.act(state) 189 | next_state, reward, done, _ = self.env.step(action) 190 | next_state = np.reshape(next_state, [1, self.state_size]) 191 | if not done or i == self.env._max_episode_steps-1: 192 | reward = reward 193 | else: 194 | reward = -100 195 | self.remember(state, action, reward, next_state, done) 196 | state = next_state 197 | i += 1 198 | if done: 199 | # every step update target model 200 | self.update_target_model() 201 | 202 | # every episode, plot the result 203 | average = self.PlotModel(i, e) 204 | 205 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average)) 206 | if i == self.env._max_episode_steps: 207 | print("Saving trained model as cartpole-ddqn.h5") 208 | #self.save("cartpole-ddqn.h5") 209 | break 210 | self.replay() 211 | 212 | def test(self): 213 | self.load("cartpole-ddqn.h5") 214 | for e in range(self.EPISODES): 215 | state = self.env.reset() 216 | state = np.reshape(state, [1, self.state_size]) 217 | done = False 218 | i = 0 219 | while not done: 220 | self.env.render() 221 | action = np.argmax(self.model.predict(state)) 222 | next_state, reward, done, _ = self.env.step(action) 223 | state = np.reshape(next_state, [1, self.state_size]) 224 | i += 1 225 | if done: 226 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 227 | break 228 | 229 | if __name__ == "__main__": 230 | env_name = 'CartPole-v1' 231 | agent = DQNAgent(env_name) 232 | agent.run() 233 | #agent.test() 234 | -------------------------------------------------------------------------------- /02_CartPole-reinforcement-learning_DDQN/Cartpole_DDQN_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | import tensorflow as tf 11 | from tensorflow.keras.models import Model, load_model 12 | from tensorflow.keras.layers import Input, Dense 13 | from tensorflow.keras.optimizers import Adam, RMSprop 14 | 15 | 16 | def OurModel(input_shape, action_space): 17 | X_input = Input(input_shape) 18 | X = X_input 19 | 20 | # 'Dense' is the basic form of a neural network layer 21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 23 | 24 | # Hidden layer with 256 nodes 25 | X = Dense(256, activation="relu", 
kernel_initializer='he_uniform')(X) 26 | 27 | # Hidden layer with 64 nodes 28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 29 | 30 | # Output Layer with # of actions: 2 nodes (left, right) 31 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 32 | 33 | model = Model(inputs = X_input, outputs = X) 34 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 35 | 36 | model.summary() 37 | return model 38 | 39 | class DQNAgent: 40 | def __init__(self, env_name): 41 | self.env_name = env_name 42 | self.env = gym.make(env_name) 43 | self.env.seed(0) 44 | # by default, CartPole-v1 has max episode steps = 500 45 | self.env._max_episode_steps = 4000 46 | self.state_size = self.env.observation_space.shape[0] 47 | self.action_size = self.env.action_space.n 48 | 49 | self.EPISODES = 1000 50 | self.memory = deque(maxlen=2000) 51 | 52 | self.gamma = 0.95 # discount rate 53 | self.epsilon = 1.0 # exploration rate 54 | self.epsilon_min = 0.01 55 | self.epsilon_decay = 0.999 56 | self.batch_size = 32 57 | self.train_start = 1000 58 | 59 | # defining model parameters 60 | self.ddqn = True 61 | self.Soft_Update = False 62 | 63 | self.TAU = 0.1 # target network soft update hyperparameter 64 | 65 | self.Save_Path = 'Models' 66 | self.scores, self.episodes, self.average = [], [], [] 67 | 68 | if self.ddqn: 69 | print("----------Double DQN--------") 70 | self.Model_name = os.path.join(self.Save_Path,"DDQN_"+self.env_name+".h5") 71 | else: 72 | print("-------------DQN------------") 73 | self.Model_name = os.path.join(self.Save_Path,"DQN_"+self.env_name+".h5") 74 | 75 | # create main model 76 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size) 77 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size) 78 | 79 | # after some time interval update the target model to be same with model 80 | def update_target_model(self): 81 | if not self.Soft_Update and self.ddqn: 82 | self.target_model.set_weights(self.model.get_weights()) 83 | return 84 | if self.Soft_Update and self.ddqn: 85 | q_model_theta = self.model.get_weights() 86 | target_model_theta = self.target_model.get_weights() 87 | counter = 0 88 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 89 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 90 | target_model_theta[counter] = target_weight 91 | counter += 1 92 | self.target_model.set_weights(target_model_theta) 93 | 94 | def remember(self, state, action, reward, next_state, done): 95 | self.memory.append((state, action, reward, next_state, done)) 96 | if len(self.memory) > self.train_start: 97 | if self.epsilon > self.epsilon_min: 98 | self.epsilon *= self.epsilon_decay 99 | 100 | def act(self, state): 101 | if np.random.random() <= self.epsilon: 102 | return random.randrange(self.action_size) 103 | else: 104 | return np.argmax(self.model.predict(state)) 105 | 106 | def replay(self): 107 | if len(self.memory) < self.train_start: 108 | return 109 | # Randomly sample minibatch from the memory 110 | minibatch = random.sample(self.memory, min(self.batch_size, self.batch_size)) 111 | 112 | state = np.zeros((self.batch_size, self.state_size)) 113 | next_state = np.zeros((self.batch_size, self.state_size)) 114 | action, reward, done = [], [], [] 115 | 116 | # do this before prediction 117 | # for speedup, this could be done on the tensor level 118 | # but easier to understand using 
a loop 119 | for i in range(self.batch_size): 120 | state[i] = minibatch[i][0] 121 | action.append(minibatch[i][1]) 122 | reward.append(minibatch[i][2]) 123 | next_state[i] = minibatch[i][3] 124 | done.append(minibatch[i][4]) 125 | 126 | # do batch prediction to save speed 127 | target = self.model.predict(state) 128 | target_next = self.model.predict(next_state) 129 | target_val = self.target_model.predict(next_state) 130 | 131 | for i in range(len(minibatch)): 132 | # correction on the Q value for the action used 133 | if done[i]: 134 | target[i][action[i]] = reward[i] 135 | else: 136 | if self.ddqn: # Double - DQN 137 | # current Q Network selects the action 138 | # a'_max = argmax_a' Q(s', a') 139 | a = np.argmax(target_next[i]) 140 | # target Q Network evaluates the action 141 | # Q_max = Q_target(s', a'_max) 142 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 143 | else: # Standard - DQN 144 | # DQN chooses the max Q value among next actions 145 | # selection and evaluation of action is on the target Q Network 146 | # Q_max = max_a' Q_target(s', a') 147 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 148 | 149 | # Train the Neural Network with batches 150 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 151 | 152 | 153 | def load(self, name): 154 | self.model = load_model(name) 155 | 156 | def save(self, name): 157 | self.model.save(name) 158 | 159 | pylab.figure(figsize=(18, 9)) 160 | def PlotModel(self, score, episode): 161 | self.scores.append(score) 162 | self.episodes.append(episode) 163 | self.average.append(sum(self.scores) / len(self.scores)) 164 | pylab.plot(self.episodes, self.average, 'r') 165 | pylab.plot(self.episodes, self.scores, 'b') 166 | pylab.ylabel('Score', fontsize=18) 167 | pylab.xlabel('Steps', fontsize=18) 168 | dqn = 'DQN_' 169 | softupdate = '' 170 | if self.ddqn: 171 | dqn = 'DDQN_' 172 | if self.Soft_Update: 173 | softupdate = '_soft' 174 | try: 175 | pylab.savefig(dqn+self.env_name+softupdate+".png") 176 | except OSError: 177 | pass 178 | 179 | return str(self.average[-1])[:5] 180 | 181 | def run(self): 182 | for e in range(self.EPISODES): 183 | state = self.env.reset() 184 | state = np.reshape(state, [1, self.state_size]) 185 | done = False 186 | i = 0 187 | while not done: 188 | #self.env.render() 189 | action = self.act(state) 190 | next_state, reward, done, _ = self.env.step(action) 191 | next_state = np.reshape(next_state, [1, self.state_size]) 192 | if not done or i == self.env._max_episode_steps-1: 193 | reward = reward 194 | else: 195 | reward = -100 196 | self.remember(state, action, reward, next_state, done) 197 | state = next_state 198 | i += 1 199 | if done: 200 | # every step update target model 201 | self.update_target_model() 202 | 203 | # every episode, plot the result 204 | average = self.PlotModel(i, e) 205 | 206 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average)) 207 | if i == self.env._max_episode_steps: 208 | print("Saving trained model as cartpole-ddqn.h5") 209 | #self.save("cartpole-ddqn.h5") 210 | break 211 | self.replay() 212 | 213 | def test(self): 214 | self.load("cartpole-ddqn.h5") 215 | for e in range(self.EPISODES): 216 | state = self.env.reset() 217 | state = np.reshape(state, [1, self.state_size]) 218 | done = False 219 | i = 0 220 | while not done: 221 | self.env.render() 222 | action = np.argmax(self.model.predict(state)) 223 | next_state, reward, done, _ = self.env.step(action) 224 | state = 
np.reshape(next_state, [1, self.state_size]) 225 | i += 1 226 | if done: 227 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 228 | break 229 | 230 | if __name__ == "__main__": 231 | env_name = 'CartPole-v1' 232 | agent = DQNAgent(env_name) 233 | agent.run() 234 | #agent.test() 235 | -------------------------------------------------------------------------------- /02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1.png -------------------------------------------------------------------------------- /02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1_soft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1_soft.png -------------------------------------------------------------------------------- /02_CartPole-reinforcement-learning_DDQN/IMAGES/DQN_CartPole-v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/02_CartPole-reinforcement-learning_DDQN/IMAGES/DQN_CartPole-v1.png -------------------------------------------------------------------------------- /03_CartPole-reinforcement-learning_Dueling_DDQN/Cartpole_Double_DDQN.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense, Lambda, Add 12 | from keras.optimizers import Adam, RMSprop 13 | from keras import backend as K 14 | 15 | def OurModel(input_shape, action_space, dueling): 16 | X_input = Input(input_shape) 17 | X = X_input 18 | 19 | # 'Dense' is the basic form of a neural network layer 20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 22 | 23 | # Hidden layer with 256 nodes 24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 25 | 26 | # Hidden layer with 64 nodes 27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 28 | 29 | if dueling: 30 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 31 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 32 | 33 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 34 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 35 | 36 | X = Add()([state_value, action_advantage]) 37 | else: 38 | # Output Layer with # of actions: 2 nodes (left, right) 39 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 40 | 41 | model = Model(inputs = X_input, outputs = X, name='CartPole Dueling DDQN model') 42 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, 
rho=0.95, epsilon=0.01), metrics=["accuracy"]) 43 | 44 | model.summary() 45 | return model 46 | 47 | class DQNAgent: 48 | def __init__(self, env_name): 49 | self.env_name = env_name 50 | self.env = gym.make(env_name) 51 | self.env.seed(0) 52 | # by default, CartPole-v1 has max episode steps = 500 53 | self.env._max_episode_steps = 4000 54 | self.state_size = self.env.observation_space.shape[0] 55 | self.action_size = self.env.action_space.n 56 | 57 | self.EPISODES = 1000 58 | self.memory = deque(maxlen=2000) 59 | 60 | self.gamma = 0.95 # discount rate 61 | self.epsilon = 1.0 # exploration rate 62 | self.epsilon_min = 0.01 # minimum exploration probability 63 | self.epsilon_decay = 0.999 # exponential decay rate for exploration prob 64 | self.batch_size = 32 65 | self.train_start = 1000 66 | 67 | # defining model parameters 68 | self.ddqn = True # use doudle deep q network 69 | self.Soft_Update = False # use soft parameter update 70 | self.dueling = True # use dealing netowrk 71 | 72 | self.TAU = 0.1 # target network soft update hyperparameter 73 | 74 | self.Save_Path = 'Models' 75 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 76 | self.scores, self.episodes, self.average = [], [], [] 77 | 78 | if self.ddqn: 79 | print("----------Double DQN--------") 80 | self.Model_name = os.path.join(self.Save_Path,"Dueling DDQN_"+self.env_name+".h5") 81 | else: 82 | print("-------------DQN------------") 83 | self.Model_name = os.path.join(self.Save_Path,"Dueling DQN_"+self.env_name+".h5") 84 | 85 | # create main model and target model 86 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 87 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 88 | 89 | # after some time interval update the target model to be same with model 90 | def update_target_model(self): 91 | if not self.Soft_Update and self.ddqn: 92 | self.target_model.set_weights(self.model.get_weights()) 93 | return 94 | if self.Soft_Update and self.ddqn: 95 | q_model_theta = self.model.get_weights() 96 | target_model_theta = self.target_model.get_weights() 97 | counter = 0 98 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 99 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 100 | target_model_theta[counter] = target_weight 101 | counter += 1 102 | self.target_model.set_weights(target_model_theta) 103 | 104 | def remember(self, state, action, reward, next_state, done): 105 | self.memory.append((state, action, reward, next_state, done)) 106 | if len(self.memory) > self.train_start: 107 | if self.epsilon > self.epsilon_min: 108 | self.epsilon *= self.epsilon_decay 109 | 110 | def act(self, state): 111 | if np.random.random() <= self.epsilon: 112 | return random.randrange(self.action_size) 113 | else: 114 | return np.argmax(self.model.predict(state)) 115 | 116 | def replay(self): 117 | if len(self.memory) < self.train_start: 118 | return 119 | # Randomly sample minibatch from the memory 120 | minibatch = random.sample(self.memory, self.batch_size) 121 | 122 | state = np.zeros((self.batch_size, self.state_size)) 123 | next_state = np.zeros((self.batch_size, self.state_size)) 124 | action, reward, done = [], [], [] 125 | 126 | # do this before prediction 127 | # for speedup, this could be done on the tensor level 128 | # but easier to understand using a loop 129 | for i in range(self.batch_size): 130 | state[i] = minibatch[i][0] 131 | 
action.append(minibatch[i][1]) 132 | reward.append(minibatch[i][2]) 133 | next_state[i] = minibatch[i][3] 134 | done.append(minibatch[i][4]) 135 | 136 | # do batch prediction to save speed 137 | # predict Q-values for starting state using the main network 138 | target = self.model.predict(state) 139 | # predict best action in ending state using the main network 140 | target_next = self.model.predict(next_state) 141 | # predict Q-values for ending state using the target network 142 | target_val = self.target_model.predict(next_state) 143 | 144 | for i in range(len(minibatch)): 145 | # correction on the Q value for the action used 146 | if done[i]: 147 | target[i][action[i]] = reward[i] 148 | else: 149 | if self.ddqn: # Double - DQN 150 | # current Q Network selects the action 151 | # a'_max = argmax_a' Q(s', a') 152 | a = np.argmax(target_next[i]) 153 | # target Q Network evaluates the action 154 | # Q_max = Q_target(s', a'_max) 155 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 156 | else: # Standard - DQN 157 | # DQN chooses the max Q value among next actions 158 | # selection and evaluation of action is on the target Q Network 159 | # Q_max = max_a' Q_target(s', a') 160 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 161 | 162 | # Train the Neural Network with batches 163 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 164 | 165 | def load(self, name): 166 | self.model = load_model(name) 167 | 168 | def save(self, name): 169 | self.model.save(name) 170 | 171 | pylab.figure(figsize=(18, 9)) 172 | def PlotModel(self, score, episode): 173 | self.scores.append(score) 174 | self.episodes.append(episode) 175 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 176 | pylab.plot(self.episodes, self.average, 'r') 177 | pylab.plot(self.episodes, self.scores, 'b') 178 | pylab.ylabel('Score', fontsize=18) 179 | pylab.xlabel('Steps', fontsize=18) 180 | dqn = 'DQN_' 181 | softupdate = '' 182 | dueling = '' 183 | if self.ddqn: dqn = 'DDQN_' 184 | if self.Soft_Update: softupdate = '_soft' 185 | if self.dueling: dueling = '_Dueling' 186 | try: 187 | pylab.savefig(dqn+self.env_name+softupdate+dueling+".png") 188 | except OSError: 189 | pass 190 | 191 | return str(self.average[-1])[:5] 192 | 193 | def run(self): 194 | for e in range(self.EPISODES): 195 | state = self.env.reset() 196 | state = np.reshape(state, [1, self.state_size]) 197 | done = False 198 | i = 0 199 | while not done: 200 | #self.env.render() 201 | action = self.act(state) 202 | next_state, reward, done, _ = self.env.step(action) 203 | next_state = np.reshape(next_state, [1, self.state_size]) 204 | if not done or i == self.env._max_episode_steps-1: 205 | reward = reward 206 | else: 207 | reward = -100 208 | self.remember(state, action, reward, next_state, done) 209 | state = next_state 210 | i += 1 211 | if done: 212 | # every step update target model 213 | self.update_target_model() 214 | 215 | # every episode, plot the result 216 | average = self.PlotModel(i, e) 217 | 218 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average)) 219 | if i == self.env._max_episode_steps: 220 | print("Saving trained model as", self.Model_name) 221 | #self.save(self.Model_name) 222 | break 223 | self.replay() 224 | 225 | def test(self): 226 | self.load(self.Model_name) 227 | for e in range(self.EPISODES): 228 | state = self.env.reset() 229 | state = np.reshape(state, [1, self.state_size]) 230 | done = False 231 | i 
= 0 232 | while not done: 233 | self.env.render() 234 | action = np.argmax(self.model.predict(state)) 235 | next_state, reward, done, _ = self.env.step(action) 236 | state = np.reshape(next_state, [1, self.state_size]) 237 | i += 1 238 | if done: 239 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 240 | break 241 | 242 | if __name__ == "__main__": 243 | env_name = 'CartPole-v1' 244 | agent = DQNAgent(env_name) 245 | agent.run() 246 | #agent.test() 247 | -------------------------------------------------------------------------------- /03_CartPole-reinforcement-learning_Dueling_DDQN/Cartpole_Double_DDQN_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | import tensorflow as tf 11 | from tensorflow.keras.models import Model, load_model 12 | from tensorflow.keras.layers import Input, Dense, Lambda, Add 13 | from tensorflow.keras.optimizers import Adam, RMSprop 14 | from tensorflow.keras import backend as K 15 | 16 | def OurModel(input_shape, action_space, dueling): 17 | X_input = Input(input_shape) 18 | X = X_input 19 | 20 | # 'Dense' is the basic form of a neural network layer 21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 23 | 24 | # Hidden layer with 256 nodes 25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 26 | 27 | # Hidden layer with 64 nodes 28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 29 | 30 | if dueling: 31 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 32 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 33 | 34 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 35 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 36 | 37 | X = Add()([state_value, action_advantage]) 38 | else: 39 | # Output Layer with # of actions: 2 nodes (left, right) 40 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 41 | 42 | model = Model(inputs = X_input, outputs = X) 43 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 44 | 45 | model.summary() 46 | return model 47 | 48 | class DQNAgent: 49 | def __init__(self, env_name): 50 | self.env_name = env_name 51 | self.env = gym.make(env_name) 52 | self.env.seed(0) 53 | # by default, CartPole-v1 has max episode steps = 500 54 | self.env._max_episode_steps = 4000 55 | self.state_size = self.env.observation_space.shape[0] 56 | self.action_size = self.env.action_space.n 57 | 58 | self.EPISODES = 1000 59 | self.memory = deque(maxlen=2000) 60 | 61 | self.gamma = 0.95 # discount rate 62 | self.epsilon = 1.0 # exploration rate 63 | self.epsilon_min = 0.01 # minimum exploration probability 64 | self.epsilon_decay = 0.999 # exponential decay rate for exploration prob 65 | self.batch_size = 32 66 | self.train_start = 1000 67 | 68 | # defining model parameters 69 | self.ddqn = True # use doudle deep q network 70 | self.Soft_Update = False # use soft parameter update 71 | self.dueling = True # use dealing netowrk 72 | 73 | self.TAU = 0.1 # target network soft update 
hyperparameter 74 | 75 | self.Save_Path = 'Models' 76 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 77 | self.scores, self.episodes, self.average = [], [], [] 78 | 79 | if self.ddqn: 80 | print("----------Double DQN--------") 81 | self.Model_name = os.path.join(self.Save_Path,"Dueling DDQN_"+self.env_name+".h5") 82 | else: 83 | print("-------------DQN------------") 84 | self.Model_name = os.path.join(self.Save_Path,"Dueling DQN_"+self.env_name+".h5") 85 | 86 | # create main model and target model 87 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 88 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 89 | 90 | # after some time interval update the target model to be same with model 91 | def update_target_model(self): 92 | if not self.Soft_Update and self.ddqn: 93 | self.target_model.set_weights(self.model.get_weights()) 94 | return 95 | if self.Soft_Update and self.ddqn: 96 | q_model_theta = self.model.get_weights() 97 | target_model_theta = self.target_model.get_weights() 98 | counter = 0 99 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 100 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 101 | target_model_theta[counter] = target_weight 102 | counter += 1 103 | self.target_model.set_weights(target_model_theta) 104 | 105 | def remember(self, state, action, reward, next_state, done): 106 | self.memory.append((state, action, reward, next_state, done)) 107 | if len(self.memory) > self.train_start: 108 | if self.epsilon > self.epsilon_min: 109 | self.epsilon *= self.epsilon_decay 110 | 111 | def act(self, state): 112 | if np.random.random() <= self.epsilon: 113 | return random.randrange(self.action_size) 114 | else: 115 | return np.argmax(self.model.predict(state)) 116 | 117 | def replay(self): 118 | if len(self.memory) < self.train_start: 119 | return 120 | # Randomly sample minibatch from the memory 121 | minibatch = random.sample(self.memory, self.batch_size) 122 | 123 | state = np.zeros((self.batch_size, self.state_size)) 124 | next_state = np.zeros((self.batch_size, self.state_size)) 125 | action, reward, done = [], [], [] 126 | 127 | # do this before prediction 128 | # for speedup, this could be done on the tensor level 129 | # but easier to understand using a loop 130 | for i in range(self.batch_size): 131 | state[i] = minibatch[i][0] 132 | action.append(minibatch[i][1]) 133 | reward.append(minibatch[i][2]) 134 | next_state[i] = minibatch[i][3] 135 | done.append(minibatch[i][4]) 136 | 137 | # do batch prediction to save speed 138 | # predict Q-values for starting state using the main network 139 | target = self.model.predict(state) 140 | # predict best action in ending state using the main network 141 | target_next = self.model.predict(next_state) 142 | # predict Q-values for ending state using the target network 143 | target_val = self.target_model.predict(next_state) 144 | 145 | for i in range(len(minibatch)): 146 | # correction on the Q value for the action used 147 | if done[i]: 148 | target[i][action[i]] = reward[i] 149 | else: 150 | if self.ddqn: # Double - DQN 151 | # current Q Network selects the action 152 | # a'_max = argmax_a' Q(s', a') 153 | a = np.argmax(target_next[i]) 154 | # target Q Network evaluates the action 155 | # Q_max = Q_target(s', a'_max) 156 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 157 | else: # Standard - DQN 158 | # DQN chooses 
the max Q value among next actions 159 | # selection and evaluation of action is on the target Q Network 160 | # Q_max = max_a' Q_target(s', a') 161 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 162 | 163 | # Train the Neural Network with batches 164 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 165 | 166 | def load(self, name): 167 | self.model = load_model(name) 168 | 169 | def save(self, name): 170 | self.model.save(name) 171 | 172 | pylab.figure(figsize=(18, 9)) 173 | def PlotModel(self, score, episode): 174 | self.scores.append(score) 175 | self.episodes.append(episode) 176 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 177 | pylab.plot(self.episodes, self.average, 'r') 178 | pylab.plot(self.episodes, self.scores, 'b') 179 | pylab.ylabel('Score', fontsize=18) 180 | pylab.xlabel('Steps', fontsize=18) 181 | dqn = 'DQN_' 182 | softupdate = '' 183 | dueling = '' 184 | if self.ddqn: dqn = 'DDQN_' 185 | if self.Soft_Update: softupdate = '_soft' 186 | if self.dueling: dueling = '_Dueling' 187 | try: 188 | pylab.savefig(dqn+self.env_name+softupdate+dueling+".png") 189 | except OSError: 190 | pass 191 | 192 | return str(self.average[-1])[:5] 193 | 194 | def run(self): 195 | for e in range(self.EPISODES): 196 | state = self.env.reset() 197 | state = np.reshape(state, [1, self.state_size]) 198 | done = False 199 | i = 0 200 | while not done: 201 | #self.env.render() 202 | action = self.act(state) 203 | next_state, reward, done, _ = self.env.step(action) 204 | next_state = np.reshape(next_state, [1, self.state_size]) 205 | if not done or i == self.env._max_episode_steps-1: 206 | reward = reward 207 | else: 208 | reward = -100 209 | self.remember(state, action, reward, next_state, done) 210 | state = next_state 211 | i += 1 212 | if done: 213 | # every step update target model 214 | self.update_target_model() 215 | 216 | # every episode, plot the result 217 | average = self.PlotModel(i, e) 218 | 219 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average)) 220 | if i == self.env._max_episode_steps: 221 | print("Saving trained model as", self.Model_name) 222 | #self.save(self.Model_name) 223 | break 224 | self.replay() 225 | 226 | def test(self): 227 | self.load(self.Model_name) 228 | for e in range(self.EPISODES): 229 | state = self.env.reset() 230 | state = np.reshape(state, [1, self.state_size]) 231 | done = False 232 | i = 0 233 | while not done: 234 | self.env.render() 235 | action = np.argmax(self.model.predict(state)) 236 | next_state, reward, done, _ = self.env.step(action) 237 | state = np.reshape(next_state, [1, self.state_size]) 238 | i += 1 239 | if done: 240 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 241 | break 242 | 243 | if __name__ == "__main__": 244 | env_name = 'CartPole-v1' 245 | agent = DQNAgent(env_name) 246 | agent.run() 247 | #agent.test() 248 | -------------------------------------------------------------------------------- /03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1.png -------------------------------------------------------------------------------- 
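The dueling head built in OurModel() above combines a scalar state value V(s) with mean-centred advantages A(s, a) through the two Lambda layers and Add(). A small NumPy illustration of that aggregation (values are made up, not repository code):

import numpy as np

V = np.array([[1.7]])                          # state-value head,  shape (batch, 1)
A = np.array([[0.4, -0.4]])                    # advantage head,    shape (batch, n_actions)
Q = V + (A - A.mean(axis=1, keepdims=True))    # Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a))
print(Q)                                       # approximately [[2.1 1.3]]
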
/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1_Dueling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1_Dueling.png -------------------------------------------------------------------------------- /03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DQN_CartPole-v1_Dueling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DQN_CartPole-v1_Dueling.png -------------------------------------------------------------------------------- /04_CartPole-reinforcement-learning_e_greedy_D3QN/Cartpole_e_greedy_D3QN.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense, Lambda, Add 12 | from keras.optimizers import Adam, RMSprop 13 | from keras import backend as K 14 | 15 | def OurModel(input_shape, action_space, dueling): 16 | X_input = Input(input_shape) 17 | X = X_input 18 | 19 | # 'Dense' is the basic form of a neural network layer 20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 22 | 23 | # Hidden layer with 256 nodes 24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 25 | 26 | # Hidden layer with 64 nodes 27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 28 | 29 | if dueling: 30 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 31 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 32 | 33 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 34 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 35 | 36 | X = Add()([state_value, action_advantage]) 37 | else: 38 | # Output Layer with # of actions: 2 nodes (left, right) 39 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 40 | 41 | model = Model(inputs = X_input, outputs = X, name='CartPole Dueling DDQN model') 42 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 43 | 44 | model.summary() 45 | return model 46 | 47 | class DQNAgent: 48 | def __init__(self, env_name): 49 | self.env_name = env_name 50 | self.env = gym.make(env_name) 51 | self.env.seed(0) 52 | # by default, CartPole-v1 has max episode steps = 500 53 | self.env._max_episode_steps = 4000 54 | self.state_size = self.env.observation_space.shape[0] 55 | self.action_size = self.env.action_space.n 56 | 57 | self.EPISODES = 1000 58 | self.memory = deque(maxlen=2000) 59 | self.gamma = 0.95 # discount rate 60 | 61 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy 62 | self.epsilon = 1.0 # exploration probability at start 63 | self.epsilon_min = 0.01 # minimum exploration 
probability 64 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob 65 | 66 | self.batch_size = 32 67 | 68 | # defining model parameters 69 | self.ddqn = True # use double deep q network 70 | self.Soft_Update = False # use soft parameter update 71 | self.dueling = True # use dealing network 72 | self.epsilon_greedy = True # use epsilon greedy strategy 73 | 74 | self.TAU = 0.1 # target network soft update hyperparameter 75 | 76 | self.Save_Path = 'Models' 77 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 78 | self.scores, self.episodes, self.average = [], [], [] 79 | 80 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5") 81 | 82 | # create main model and target model 83 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 84 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 85 | 86 | # after some time interval update the target model to be same with model 87 | def update_target_model(self): 88 | if not self.Soft_Update and self.ddqn: 89 | self.target_model.set_weights(self.model.get_weights()) 90 | return 91 | if self.Soft_Update and self.ddqn: 92 | q_model_theta = self.model.get_weights() 93 | target_model_theta = self.target_model.get_weights() 94 | counter = 0 95 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 96 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 97 | target_model_theta[counter] = target_weight 98 | counter += 1 99 | self.target_model.set_weights(target_model_theta) 100 | 101 | def remember(self, state, action, reward, next_state, done): 102 | experience = state, action, reward, next_state, done 103 | self.memory.append((experience)) 104 | 105 | def act(self, state, decay_step): 106 | # EPSILON GREEDY STRATEGY 107 | if self.epsilon_greedy: 108 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning 109 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step) 110 | # OLD EPSILON STRATEGY 111 | else: 112 | if self.epsilon > self.epsilon_min: 113 | self.epsilon *= (1-self.epsilon_decay) 114 | explore_probability = self.epsilon 115 | 116 | if explore_probability > np.random.rand(): 117 | # Make a random action (exploration) 118 | return random.randrange(self.action_size), explore_probability 119 | else: 120 | # Get action from Q-network (exploitation) 121 | # Estimate the Qs values state 122 | # Take the biggest Q value (= the best action) 123 | return np.argmax(self.model.predict(state)), explore_probability 124 | 125 | def replay(self): 126 | if len(self.memory) < self.batch_size: 127 | return 128 | # Randomly sample minibatch from the memory 129 | minibatch = random.sample(self.memory, self.batch_size) 130 | 131 | state = np.zeros((self.batch_size, self.state_size)) 132 | next_state = np.zeros((self.batch_size, self.state_size)) 133 | action, reward, done = [], [], [] 134 | 135 | # do this before prediction 136 | # for speedup, this could be done on the tensor level 137 | # but easier to understand using a loop 138 | for i in range(self.batch_size): 139 | state[i] = minibatch[i][0] 140 | action.append(minibatch[i][1]) 141 | reward.append(minibatch[i][2]) 142 | next_state[i] = minibatch[i][3] 143 | done.append(minibatch[i][4]) 144 | 145 | # do batch prediction to save speed 146 | # predict Q-values for starting state using the main network 
147 | target = self.model.predict(state) 148 | # predict best action in ending state using the main network 149 | target_next = self.model.predict(next_state) 150 | # predict Q-values for ending state using the target network 151 | target_val = self.target_model.predict(next_state) 152 | 153 | for i in range(len(minibatch)): 154 | # correction on the Q value for the action used 155 | if done[i]: 156 | target[i][action[i]] = reward[i] 157 | else: 158 | if self.ddqn: # Double - DQN 159 | # current Q Network selects the action 160 | # a'_max = argmax_a' Q(s', a') 161 | a = np.argmax(target_next[i]) 162 | # target Q Network evaluates the action 163 | # Q_max = Q_target(s', a'_max) 164 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 165 | else: # Standard - DQN 166 | # DQN chooses the max Q value among next actions 167 | # selection and evaluation of action is on the target Q Network 168 | # Q_max = max_a' Q_target(s', a') 169 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 170 | 171 | # Train the Neural Network with batches 172 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 173 | 174 | def load(self, name): 175 | self.model = load_model(name) 176 | 177 | def save(self, name): 178 | self.model.save(name) 179 | 180 | pylab.figure(figsize=(18, 9)) 181 | def PlotModel(self, score, episode): 182 | self.scores.append(score) 183 | self.episodes.append(episode) 184 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 185 | pylab.plot(self.episodes, self.average, 'r') 186 | pylab.plot(self.episodes, self.scores, 'b') 187 | pylab.ylabel('Score', fontsize=18) 188 | pylab.xlabel('Steps', fontsize=18) 189 | dqn = 'DQN_' 190 | softupdate = '' 191 | dueling = '' 192 | greedy = '' 193 | if self.ddqn: dqn = 'DDQN_' 194 | if self.Soft_Update: softupdate = '_soft' 195 | if self.dueling: dueling = '_Dueling' 196 | if self.epsilon_greedy: greedy = '_Greedy' 197 | try: 198 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+".png") 199 | except OSError: 200 | pass 201 | 202 | return str(self.average[-1])[:5] 203 | 204 | def run(self): 205 | decay_step = 0 206 | for e in range(self.EPISODES): 207 | state = self.env.reset() 208 | state = np.reshape(state, [1, self.state_size]) 209 | done = False 210 | i = 0 211 | while not done: 212 | #self.env.render() 213 | decay_step += 1 214 | action, explore_probability = self.act(state, decay_step) 215 | next_state, reward, done, _ = self.env.step(action) 216 | next_state = np.reshape(next_state, [1, self.state_size]) 217 | if not done or i == self.env._max_episode_steps-1: 218 | reward = reward 219 | else: 220 | reward = -100 221 | self.remember(state, action, reward, next_state, done) 222 | state = next_state 223 | i += 1 224 | if done: 225 | # every step update target model 226 | self.update_target_model() 227 | 228 | # every episode, plot the result 229 | average = self.PlotModel(i, e) 230 | 231 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average)) 232 | if i == self.env._max_episode_steps: 233 | print("Saving trained model to", self.Model_name) 234 | self.save(self.Model_name) 235 | break 236 | 237 | self.replay() 238 | 239 | def test(self): 240 | self.load(self.Model_name) 241 | for e in range(self.EPISODES): 242 | state = self.env.reset() 243 | state = np.reshape(state, [1, self.state_size]) 244 | done = False 245 | i = 0 246 | while not done: 247 | self.env.render() 248 | action = 
np.argmax(self.model.predict(state)) 249 | next_state, reward, done, _ = self.env.step(action) 250 | state = np.reshape(next_state, [1, self.state_size]) 251 | i += 1 252 | if done: 253 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 254 | break 255 | 256 | if __name__ == "__main__": 257 | env_name = 'CartPole-v1' 258 | agent = DQNAgent(env_name) 259 | agent.run() 260 | #agent.test() 261 | -------------------------------------------------------------------------------- /04_CartPole-reinforcement-learning_e_greedy_D3QN/Cartpole_e_greedy_D3QN_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from tensorflow.keras.models import Model, load_model 11 | from tensorflow.keras.layers import Input, Dense, Lambda, Add 12 | from tensorflow.keras.optimizers import Adam, RMSprop 13 | from tensorflow.keras import backend as K 14 | 15 | def OurModel(input_shape, action_space, dueling): 16 | X_input = Input(input_shape) 17 | X = X_input 18 | 19 | # 'Dense' is the basic form of a neural network layer 20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 22 | 23 | # Hidden layer with 256 nodes 24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 25 | 26 | # Hidden layer with 64 nodes 27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 28 | 29 | if dueling: 30 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 31 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 32 | 33 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 34 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 35 | 36 | X = Add()([state_value, action_advantage]) 37 | else: 38 | # Output Layer with # of actions: 2 nodes (left, right) 39 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 40 | 41 | model = Model(inputs = X_input, outputs = X) 42 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 43 | 44 | model.summary() 45 | return model 46 | 47 | class DQNAgent: 48 | def __init__(self, env_name): 49 | self.env_name = env_name 50 | self.env = gym.make(env_name) 51 | self.env.seed(0) 52 | # by default, CartPole-v1 has max episode steps = 500 53 | self.env._max_episode_steps = 4000 54 | self.state_size = self.env.observation_space.shape[0] 55 | self.action_size = self.env.action_space.n 56 | 57 | self.EPISODES = 1000 58 | self.memory = deque(maxlen=2000) 59 | self.gamma = 0.95 # discount rate 60 | 61 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy 62 | self.epsilon = 1.0 # exploration probability at start 63 | self.epsilon_min = 0.01 # minimum exploration probability 64 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob 65 | 66 | self.batch_size = 32 67 | 68 | # defining model parameters 69 | self.ddqn = True # use double deep q network 70 | self.Soft_Update = False # use soft parameter update 71 | self.dueling = True # use dealing network 72 | self.epsilon_greedy = True # use epsilon greedy strategy 73 | 74 | self.TAU = 
0.1 # target network soft update hyperparameter 75 | 76 | self.Save_Path = 'Models' 77 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 78 | self.scores, self.episodes, self.average = [], [], [] 79 | 80 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5") 81 | 82 | # create main model and target model 83 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 84 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 85 | 86 | # after some time interval update the target model to be same with model 87 | def update_target_model(self): 88 | if not self.Soft_Update and self.ddqn: 89 | self.target_model.set_weights(self.model.get_weights()) 90 | return 91 | if self.Soft_Update and self.ddqn: 92 | q_model_theta = self.model.get_weights() 93 | target_model_theta = self.target_model.get_weights() 94 | counter = 0 95 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 96 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 97 | target_model_theta[counter] = target_weight 98 | counter += 1 99 | self.target_model.set_weights(target_model_theta) 100 | 101 | def remember(self, state, action, reward, next_state, done): 102 | experience = state, action, reward, next_state, done 103 | self.memory.append((experience)) 104 | 105 | def act(self, state, decay_step): 106 | # EPSILON GREEDY STRATEGY 107 | if self.epsilon_greedy: 108 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning 109 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step) 110 | # OLD EPSILON STRATEGY 111 | else: 112 | if self.epsilon > self.epsilon_min: 113 | self.epsilon *= (1-self.epsilon_decay) 114 | explore_probability = self.epsilon 115 | 116 | if explore_probability > np.random.rand(): 117 | # Make a random action (exploration) 118 | return random.randrange(self.action_size), explore_probability 119 | else: 120 | # Get action from Q-network (exploitation) 121 | # Estimate the Qs values state 122 | # Take the biggest Q value (= the best action) 123 | return np.argmax(self.model.predict(state)), explore_probability 124 | 125 | def replay(self): 126 | if len(self.memory) < self.batch_size: 127 | return 128 | # Randomly sample minibatch from the memory 129 | minibatch = random.sample(self.memory, self.batch_size) 130 | 131 | state = np.zeros((self.batch_size, self.state_size)) 132 | next_state = np.zeros((self.batch_size, self.state_size)) 133 | action, reward, done = [], [], [] 134 | 135 | # do this before prediction 136 | # for speedup, this could be done on the tensor level 137 | # but easier to understand using a loop 138 | for i in range(self.batch_size): 139 | state[i] = minibatch[i][0] 140 | action.append(minibatch[i][1]) 141 | reward.append(minibatch[i][2]) 142 | next_state[i] = minibatch[i][3] 143 | done.append(minibatch[i][4]) 144 | 145 | # do batch prediction to save speed 146 | # predict Q-values for starting state using the main network 147 | target = self.model.predict(state) 148 | # predict best action in ending state using the main network 149 | target_next = self.model.predict(next_state) 150 | # predict Q-values for ending state using the target network 151 | target_val = self.target_model.predict(next_state) 152 | 153 | for i in range(len(minibatch)): 154 | # correction on the Q value for the action used 155 | if done[i]: 
156 | target[i][action[i]] = reward[i] 157 | else: 158 | if self.ddqn: # Double - DQN 159 | # current Q Network selects the action 160 | # a'_max = argmax_a' Q(s', a') 161 | a = np.argmax(target_next[i]) 162 | # target Q Network evaluates the action 163 | # Q_max = Q_target(s', a'_max) 164 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 165 | else: # Standard - DQN 166 | # DQN chooses the max Q value among next actions 167 | # selection and evaluation of action is on the target Q Network 168 | # Q_max = max_a' Q_target(s', a') 169 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 170 | 171 | # Train the Neural Network with batches 172 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 173 | 174 | def load(self, name): 175 | self.model = load_model(name) 176 | 177 | def save(self, name): 178 | self.model.save(name) 179 | 180 | pylab.figure(figsize=(18, 9)) 181 | def PlotModel(self, score, episode): 182 | self.scores.append(score) 183 | self.episodes.append(episode) 184 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 185 | pylab.plot(self.episodes, self.average, 'r') 186 | pylab.plot(self.episodes, self.scores, 'b') 187 | pylab.ylabel('Score', fontsize=18) 188 | pylab.xlabel('Steps', fontsize=18) 189 | dqn = 'DQN_' 190 | softupdate = '' 191 | dueling = '' 192 | greedy = '' 193 | if self.ddqn: dqn = 'DDQN_' 194 | if self.Soft_Update: softupdate = '_soft' 195 | if self.dueling: dueling = '_Dueling' 196 | if self.epsilon_greedy: greedy = '_Greedy' 197 | try: 198 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+".png") 199 | except OSError: 200 | pass 201 | 202 | return str(self.average[-1])[:5] 203 | 204 | def run(self): 205 | decay_step = 0 206 | for e in range(self.EPISODES): 207 | state = self.env.reset() 208 | state = np.reshape(state, [1, self.state_size]) 209 | done = False 210 | i = 0 211 | while not done: 212 | #self.env.render() 213 | decay_step += 1 214 | action, explore_probability = self.act(state, decay_step) 215 | next_state, reward, done, _ = self.env.step(action) 216 | next_state = np.reshape(next_state, [1, self.state_size]) 217 | if not done or i == self.env._max_episode_steps-1: 218 | reward = reward 219 | else: 220 | reward = -100 221 | self.remember(state, action, reward, next_state, done) 222 | state = next_state 223 | i += 1 224 | if done: 225 | # every step update target model 226 | self.update_target_model() 227 | 228 | # every episode, plot the result 229 | average = self.PlotModel(i, e) 230 | 231 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average)) 232 | if i == self.env._max_episode_steps: 233 | print("Saving trained model to", self.Model_name) 234 | self.save(self.Model_name) 235 | break 236 | 237 | self.replay() 238 | 239 | def test(self): 240 | self.load(self.Model_name) 241 | for e in range(self.EPISODES): 242 | state = self.env.reset() 243 | state = np.reshape(state, [1, self.state_size]) 244 | done = False 245 | i = 0 246 | while not done: 247 | self.env.render() 248 | action = np.argmax(self.model.predict(state)) 249 | next_state, reward, done, _ = self.env.step(action) 250 | state = np.reshape(next_state, [1, self.state_size]) 251 | i += 1 252 | if done: 253 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 254 | break 255 | 256 | if __name__ == "__main__": 257 | env_name = 'CartPole-v1' 258 | agent = DQNAgent(env_name) 259 | agent.run() 260 | #agent.test() 261 | 
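# ---------------------------------------------------------------------------
# Illustrative sketch added for this write-up (not one of the repository files):
# it only evaluates the exponential epsilon-greedy schedule used in act() above,
# with the same hyperparameter values as the agent, to show how quickly
# exploration decays as decay_step (one increment per environment step) grows.
import numpy as np

epsilon = 1.0           # exploration probability at start
epsilon_min = 0.01      # minimum exploration probability
epsilon_decay = 0.0005  # exponential decay rate for exploration prob

for decay_step in (0, 1000, 5000, 10000, 20000):
    explore_probability = epsilon_min + (epsilon - epsilon_min) * np.exp(-epsilon_decay * decay_step)
    print(f"decay_step={decay_step:>6}  explore_probability={explore_probability:.3f}")

# Exploration falls from 1.000 at step 0 to about 0.61 after 1,000 steps,
# 0.09 after 5,000 steps, and is essentially at epsilon_min after ~20,000 steps.
# ---------------------------------------------------------------------------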
-------------------------------------------------------------------------------- /04_CartPole-reinforcement-learning_e_greedy_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_Greedy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/04_CartPole-reinforcement-learning_e_greedy_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_Greedy.png -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/Cartpole_PER_D3QN.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense, Lambda, Add 12 | from keras.optimizers import Adam, RMSprop 13 | from keras import backend as K 14 | from PER import * 15 | 16 | def OurModel(input_shape, action_space, dueling): 17 | X_input = Input(input_shape) 18 | X = X_input 19 | 20 | # 'Dense' is the basic form of a neural network layer 21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 23 | 24 | # Hidden layer with 256 nodes 25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 26 | 27 | # Hidden layer with 64 nodes 28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 29 | 30 | if dueling: 31 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 32 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 33 | 34 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 35 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 36 | 37 | X = Add()([state_value, action_advantage]) 38 | else: 39 | # Output Layer with # of actions: 2 nodes (left, right) 40 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 41 | 42 | model = Model(inputs = X_input, outputs = X, name='CartPole D3QN model') 43 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 44 | 45 | model.summary() 46 | return model 47 | 48 | class DQNAgent: 49 | def __init__(self, env_name): 50 | self.env_name = env_name 51 | self.env = gym.make(env_name) 52 | self.env.seed(0) 53 | # by default, CartPole-v1 has max episode steps = 500 54 | self.env._max_episode_steps = 4000 55 | self.state_size = self.env.observation_space.shape[0] 56 | self.action_size = self.env.action_space.n 57 | 58 | self.EPISODES = 1000 59 | memory_size = 10000 60 | self.MEMORY = Memory(memory_size) 61 | self.memory = deque(maxlen=2000) 62 | self.gamma = 0.95 # discount rate 63 | 64 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy 65 | self.epsilon = 1.0 # exploration probability at start 66 | self.epsilon_min = 0.01 # minimum exploration probability 67 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob 68 | 69 | self.batch_size = 32 70 | 71 | # defining model parameters 72 | self.ddqn = True # use doudle deep q network 73 | self.Soft_Update = False # use soft parameter update 74 | 
self.dueling = True # use dealing netowrk 75 | self.epsilot_greedy = False # use epsilon greedy strategy 76 | self.USE_PER = True 77 | 78 | self.TAU = 0.1 # target network soft update hyperparameter 79 | 80 | self.Save_Path = 'Models' 81 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 82 | self.scores, self.episodes, self.average = [], [], [] 83 | 84 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5") 85 | 86 | # create main model and target model 87 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 88 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 89 | 90 | # after some time interval update the target model to be same with model 91 | def update_target_model(self): 92 | if not self.Soft_Update and self.ddqn: 93 | self.target_model.set_weights(self.model.get_weights()) 94 | return 95 | if self.Soft_Update and self.ddqn: 96 | q_model_theta = self.model.get_weights() 97 | target_model_theta = self.target_model.get_weights() 98 | counter = 0 99 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 100 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 101 | target_model_theta[counter] = target_weight 102 | counter += 1 103 | self.target_model.set_weights(target_model_theta) 104 | 105 | def remember(self, state, action, reward, next_state, done): 106 | experience = state, action, reward, next_state, done 107 | if self.USE_PER: 108 | self.MEMORY.store(experience) 109 | else: 110 | self.memory.append((experience)) 111 | 112 | def act(self, state, decay_step): 113 | # EPSILON GREEDY STRATEGY 114 | if self.epsilot_greedy: 115 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning 116 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step) 117 | # OLD EPSILON STRATEGY 118 | else: 119 | if self.epsilon > self.epsilon_min: 120 | self.epsilon *= (1-self.epsilon_decay) 121 | explore_probability = self.epsilon 122 | 123 | if explore_probability > np.random.rand(): 124 | # Make a random action (exploration) 125 | return random.randrange(self.action_size), explore_probability 126 | else: 127 | # Get action from Q-network (exploitation) 128 | # Estimate the Qs values state 129 | # Take the biggest Q value (= the best action) 130 | return np.argmax(self.model.predict(state)), explore_probability 131 | 132 | def replay(self): 133 | if self.USE_PER: 134 | tree_idx, minibatch = self.MEMORY.sample(self.batch_size) 135 | else: 136 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size)) 137 | 138 | state = np.zeros((self.batch_size, self.state_size)) 139 | next_state = np.zeros((self.batch_size, self.state_size)) 140 | action, reward, done = [], [], [] 141 | 142 | # do this before prediction 143 | # for speedup, this could be done on the tensor level 144 | # but easier to understand using a loop 145 | for i in range(self.batch_size): 146 | state[i] = minibatch[i][0] 147 | action.append(minibatch[i][1]) 148 | reward.append(minibatch[i][2]) 149 | next_state[i] = minibatch[i][3] 150 | done.append(minibatch[i][4]) 151 | 152 | # do batch prediction to save speed 153 | # predict Q-values for starting state using the main network 154 | target = self.model.predict(state) 155 | target_old = np.array(target) 156 | # predict best action in ending state using the main network 157 | 
target_next = self.model.predict(next_state) 158 | # predict Q-values for ending state using the target network 159 | target_val = self.target_model.predict(next_state) 160 | 161 | for i in range(len(minibatch)): 162 | # correction on the Q value for the action used 163 | if done[i]: 164 | target[i][action[i]] = reward[i] 165 | else: 166 | if self.ddqn: # Double - DQN 167 | # current Q Network selects the action 168 | # a'_max = argmax_a' Q(s', a') 169 | a = np.argmax(target_next[i]) 170 | # target Q Network evaluates the action 171 | # Q_max = Q_target(s', a'_max) 172 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 173 | else: # Standard - DQN 174 | # DQN chooses the max Q value among next actions 175 | # selection and evaluation of action is on the target Q Network 176 | # Q_max = max_a' Q_target(s', a') 177 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 178 | 179 | if self.USE_PER: 180 | indices = np.arange(self.batch_size, dtype=np.int32) 181 | absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)]) 182 | # Update priority 183 | self.MEMORY.batch_update(tree_idx, absolute_errors) 184 | 185 | # Train the Neural Network with batches 186 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 187 | 188 | def load(self, name): 189 | self.model = load_model(name) 190 | 191 | def save(self, name): 192 | self.model.save(name) 193 | 194 | pylab.figure(figsize=(18, 9)) 195 | def PlotModel(self, score, episode): 196 | self.scores.append(score) 197 | self.episodes.append(episode) 198 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 199 | pylab.plot(self.episodes, self.average, 'r') 200 | pylab.plot(self.episodes, self.scores, 'b') 201 | pylab.ylabel('Score', fontsize=18) 202 | pylab.xlabel('Steps', fontsize=18) 203 | dqn = 'DQN_' 204 | softupdate = '' 205 | dueling = '' 206 | greedy = '' 207 | PER = '' 208 | if self.ddqn: dqn = 'DDQN_' 209 | if self.Soft_Update: softupdate = '_soft' 210 | if self.dueling: dueling = '_Dueling' 211 | if self.epsilot_greedy: greedy = '_Greedy' 212 | if self.USE_PER: PER = '_PER' 213 | try: 214 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+".png") 215 | except OSError: 216 | pass 217 | 218 | return str(self.average[-1])[:5] 219 | 220 | def run(self): 221 | decay_step = 0 222 | for e in range(self.EPISODES): 223 | state = self.env.reset() 224 | state = np.reshape(state, [1, self.state_size]) 225 | done = False 226 | i = 0 227 | while not done: 228 | #self.env.render() 229 | decay_step += 1 230 | action, explore_probability = self.act(state, decay_step) 231 | next_state, reward, done, _ = self.env.step(action) 232 | next_state = np.reshape(next_state, [1, self.state_size]) 233 | if not done or i == self.env._max_episode_steps-1: 234 | reward = reward 235 | else: 236 | reward = -100 237 | self.remember(state, action, reward, next_state, done) 238 | state = next_state 239 | i += 1 240 | if done: 241 | # every step update target model 242 | self.update_target_model() 243 | 244 | # every episode, plot the result 245 | average = self.PlotModel(i, e) 246 | 247 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average)) 248 | if i == self.env._max_episode_steps: 249 | print("Saving trained model to", self.Model_name) 250 | #self.save(self.Model_name) 251 | break 252 | self.replay() 253 | self.env.close() 254 | 255 | def test(self): 256 | self.load(self.Model_name) 257 | for 
e in range(self.EPISODES): 258 | state = self.env.reset() 259 | state = np.reshape(state, [1, self.state_size]) 260 | done = False 261 | i = 0 262 | while not done: 263 | self.env.render() 264 | action = np.argmax(self.model.predict(state)) 265 | next_state, reward, done, _ = self.env.step(action) 266 | state = np.reshape(next_state, [1, self.state_size]) 267 | i += 1 268 | if done: 269 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 270 | break 271 | 272 | if __name__ == "__main__": 273 | env_name = 'CartPole-v1' 274 | agent = DQNAgent(env_name) 275 | agent.run() 276 | #agent.test() 277 | -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/Cartpole_PER_D3QN_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from tensorflow.keras.models import Model, load_model 11 | from tensorflow.keras.layers import Input, Dense, Lambda, Add 12 | from tensorflow.keras.optimizers import Adam, RMSprop 13 | from tensorflow.keras import backend as K 14 | from PER import * 15 | 16 | def OurModel(input_shape, action_space, dueling): 17 | X_input = Input(input_shape) 18 | X = X_input 19 | 20 | # 'Dense' is the basic form of a neural network layer 21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X) 23 | 24 | # Hidden layer with 256 nodes 25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X) 26 | 27 | # Hidden layer with 64 nodes 28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 29 | 30 | if dueling: 31 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 32 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 33 | 34 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 35 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 36 | 37 | X = Add()([state_value, action_advantage]) 38 | else: 39 | # Output Layer with # of actions: 2 nodes (left, right) 40 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 41 | 42 | model = Model(inputs = X_input, outputs = X) 43 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 44 | 45 | model.summary() 46 | return model 47 | 48 | class DQNAgent: 49 | def __init__(self, env_name): 50 | self.env_name = env_name 51 | self.env = gym.make(env_name) 52 | self.env.seed(0) 53 | # by default, CartPole-v1 has max episode steps = 500 54 | self.env._max_episode_steps = 4000 55 | self.state_size = self.env.observation_space.shape[0] 56 | self.action_size = self.env.action_space.n 57 | 58 | self.EPISODES = 1000 59 | memory_size = 10000 60 | self.MEMORY = Memory(memory_size) 61 | self.memory = deque(maxlen=2000) 62 | self.gamma = 0.95 # discount rate 63 | 64 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy 65 | self.epsilon = 1.0 # exploration probability at start 66 | self.epsilon_min = 0.01 # minimum exploration probability 67 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob 68 | 69 | self.batch_size 
= 32 70 | 71 | # defining model parameters 72 | self.ddqn = True # use doudle deep q network 73 | self.Soft_Update = False # use soft parameter update 74 | self.dueling = True # use dealing netowrk 75 | self.epsilot_greedy = False # use epsilon greedy strategy 76 | self.USE_PER = True 77 | 78 | self.TAU = 0.1 # target network soft update hyperparameter 79 | 80 | self.Save_Path = 'Models' 81 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 82 | self.scores, self.episodes, self.average = [], [], [] 83 | 84 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5") 85 | 86 | # create main model and target model 87 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 88 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling) 89 | 90 | # after some time interval update the target model to be same with model 91 | def update_target_model(self): 92 | if not self.Soft_Update and self.ddqn: 93 | self.target_model.set_weights(self.model.get_weights()) 94 | return 95 | if self.Soft_Update and self.ddqn: 96 | q_model_theta = self.model.get_weights() 97 | target_model_theta = self.target_model.get_weights() 98 | counter = 0 99 | for q_weight, target_weight in zip(q_model_theta, target_model_theta): 100 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 101 | target_model_theta[counter] = target_weight 102 | counter += 1 103 | self.target_model.set_weights(target_model_theta) 104 | 105 | def remember(self, state, action, reward, next_state, done): 106 | experience = state, action, reward, next_state, done 107 | if self.USE_PER: 108 | self.MEMORY.store(experience) 109 | else: 110 | self.memory.append((experience)) 111 | 112 | def act(self, state, decay_step): 113 | # EPSILON GREEDY STRATEGY 114 | if self.epsilot_greedy: 115 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning 116 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step) 117 | # OLD EPSILON STRATEGY 118 | else: 119 | if self.epsilon > self.epsilon_min: 120 | self.epsilon *= (1-self.epsilon_decay) 121 | explore_probability = self.epsilon 122 | 123 | if explore_probability > np.random.rand(): 124 | # Make a random action (exploration) 125 | return random.randrange(self.action_size), explore_probability 126 | else: 127 | # Get action from Q-network (exploitation) 128 | # Estimate the Qs values state 129 | # Take the biggest Q value (= the best action) 130 | return np.argmax(self.model.predict(state)), explore_probability 131 | 132 | def replay(self): 133 | if self.USE_PER: 134 | tree_idx, minibatch = self.MEMORY.sample(self.batch_size) 135 | else: 136 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size)) 137 | 138 | state = np.zeros((self.batch_size, self.state_size)) 139 | next_state = np.zeros((self.batch_size, self.state_size)) 140 | action, reward, done = [], [], [] 141 | 142 | # do this before prediction 143 | # for speedup, this could be done on the tensor level 144 | # but easier to understand using a loop 145 | for i in range(self.batch_size): 146 | state[i] = minibatch[i][0] 147 | action.append(minibatch[i][1]) 148 | reward.append(minibatch[i][2]) 149 | next_state[i] = minibatch[i][3] 150 | done.append(minibatch[i][4]) 151 | 152 | # do batch prediction to save speed 153 | # predict Q-values for starting state using the main network 154 
| target = self.model.predict(state) 155 | target_old = np.array(target) 156 | # predict best action in ending state using the main network 157 | target_next = self.model.predict(next_state) 158 | # predict Q-values for ending state using the target network 159 | target_val = self.target_model.predict(next_state) 160 | 161 | for i in range(len(minibatch)): 162 | # correction on the Q value for the action used 163 | if done[i]: 164 | target[i][action[i]] = reward[i] 165 | else: 166 | if self.ddqn: # Double - DQN 167 | # current Q Network selects the action 168 | # a'_max = argmax_a' Q(s', a') 169 | a = np.argmax(target_next[i]) 170 | # target Q Network evaluates the action 171 | # Q_max = Q_target(s', a'_max) 172 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 173 | else: # Standard - DQN 174 | # DQN chooses the max Q value among next actions 175 | # selection and evaluation of action is on the target Q Network 176 | # Q_max = max_a' Q_target(s', a') 177 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 178 | 179 | if self.USE_PER: 180 | indices = np.arange(self.batch_size, dtype=np.int32) 181 | absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)]) 182 | # Update priority 183 | self.MEMORY.batch_update(tree_idx, absolute_errors) 184 | 185 | # Train the Neural Network with batches 186 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 187 | 188 | def load(self, name): 189 | self.model = load_model(name) 190 | 191 | def save(self, name): 192 | self.model.save(name) 193 | 194 | pylab.figure(figsize=(18, 9)) 195 | def PlotModel(self, score, episode): 196 | self.scores.append(score) 197 | self.episodes.append(episode) 198 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 199 | pylab.plot(self.episodes, self.average, 'r') 200 | pylab.plot(self.episodes, self.scores, 'b') 201 | pylab.ylabel('Score', fontsize=18) 202 | pylab.xlabel('Steps', fontsize=18) 203 | dqn = 'DQN_' 204 | softupdate = '' 205 | dueling = '' 206 | greedy = '' 207 | PER = '' 208 | if self.ddqn: dqn = 'DDQN_' 209 | if self.Soft_Update: softupdate = '_soft' 210 | if self.dueling: dueling = '_Dueling' 211 | if self.epsilot_greedy: greedy = '_Greedy' 212 | if self.USE_PER: PER = '_PER' 213 | try: 214 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+".png") 215 | except OSError: 216 | pass 217 | 218 | return str(self.average[-1])[:5] 219 | 220 | def run(self): 221 | decay_step = 0 222 | for e in range(self.EPISODES): 223 | state = self.env.reset() 224 | state = np.reshape(state, [1, self.state_size]) 225 | done = False 226 | i = 0 227 | while not done: 228 | #self.env.render() 229 | decay_step += 1 230 | action, explore_probability = self.act(state, decay_step) 231 | next_state, reward, done, _ = self.env.step(action) 232 | next_state = np.reshape(next_state, [1, self.state_size]) 233 | if not done or i == self.env._max_episode_steps-1: 234 | reward = reward 235 | else: 236 | reward = -100 237 | self.remember(state, action, reward, next_state, done) 238 | state = next_state 239 | i += 1 240 | if done: 241 | # every step update target model 242 | self.update_target_model() 243 | 244 | # every episode, plot the result 245 | average = self.PlotModel(i, e) 246 | 247 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average)) 248 | if i == self.env._max_episode_steps: 249 | print("Saving trained model to", self.Model_name) 250 | 
#self.save(self.Model_name) 251 | break 252 | self.replay() 253 | self.env.close() 254 | 255 | def test(self): 256 | self.load(self.Model_name) 257 | for e in range(self.EPISODES): 258 | state = self.env.reset() 259 | state = np.reshape(state, [1, self.state_size]) 260 | done = False 261 | i = 0 262 | while not done: 263 | self.env.render() 264 | action = np.argmax(self.model.predict(state)) 265 | next_state, reward, done, _ = self.env.step(action) 266 | state = np.reshape(next_state, [1, self.state_size]) 267 | i += 1 268 | if done: 269 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 270 | break 271 | 272 | if __name__ == "__main__": 273 | env_name = 'CartPole-v1' 274 | agent = DQNAgent(env_name) 275 | agent.run() 276 | #agent.test() 277 | -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling.png -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_PER.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_PER.png -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/Replay_buffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/Replay_buffer.png -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/SumTree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/SumTree.png -------------------------------------------------------------------------------- /05_CartPole-reinforcement-learning_PER_D3QN/PER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SumTree(object): 4 | data_pointer = 0 5 | 6 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0 7 | def __init__(self, capacity): 8 | # Number of leaf nodes (final nodes) that contains experiences 9 | self.capacity = capacity 10 | 11 | # Generate the tree with all nodes values = 0 12 | # To understand this calculation (2 * capacity - 1) look at the schema below 13 | # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node) 14 | # Parent nodes = capacity - 1 15 | # Leaf nodes = capacity 16 | self.tree = np.zeros(2 * capacity - 1) 17 | 18 | # Contains the experiences (so the size of data is capacity) 19 | self.data = np.zeros(capacity, dtype=object) 20 | 21 | 22 | # Here we define function that will add our priority score in the 
sumtree leaf and add the experience in data: 23 | def add(self, priority, data): 24 | # Look at what index we want to put the experience 25 | tree_index = self.data_pointer + self.capacity - 1 26 | 27 | # Update data frame 28 | self.data[self.data_pointer] = data 29 | 30 | # Update the leaf 31 | self.update (tree_index, priority) 32 | 33 | # Add 1 to data_pointer 34 | self.data_pointer += 1 35 | 36 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite) 37 | self.data_pointer = 0 38 | 39 | # Update the leaf priority score and propagate the change through tree 40 | def update(self, tree_index, priority): 41 | # Change = new priority score - former priority score 42 | change = priority - self.tree[tree_index] 43 | self.tree[tree_index] = priority 44 | 45 | # then propagate the change through tree 46 | # this method is faster than the recursive loop in the reference code 47 | while tree_index != 0: 48 | tree_index = (tree_index - 1) // 2 49 | self.tree[tree_index] += change 50 | 51 | # Here build a function to get a leaf from our tree. So we'll build a function to get the leaf_index, priority value of that leaf and experience associated with that leaf index: 52 | def get_leaf(self, v): 53 | parent_index = 0 54 | 55 | # the while loop is faster than the method in the reference code 56 | while True: 57 | left_child_index = 2 * parent_index + 1 58 | right_child_index = left_child_index + 1 59 | 60 | # If we reach bottom, end the search 61 | if left_child_index >= len(self.tree): 62 | leaf_index = parent_index 63 | break 64 | else: # downward search, always search for a higher priority node 65 | if v <= self.tree[left_child_index]: 66 | parent_index = left_child_index 67 | else: 68 | v -= self.tree[left_child_index] 69 | parent_index = right_child_index 70 | 71 | data_index = leaf_index - self.capacity + 1 72 | 73 | return leaf_index, self.tree[leaf_index], self.data[data_index] 74 | 75 | @property 76 | def total_priority(self): 77 | return self.tree[0] # Returns the root node 78 | 79 | # Now we finished constructing our SumTree object, next we'll build a memory object. 80 | class Memory(object): # stored as ( state, action, reward, next_state ) in SumTree 81 | PER_e = 0.01 # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken 82 | PER_a = 0.6 # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly 83 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1 84 | 85 | PER_b_increment_per_sampling = 0.001 86 | 87 | absolute_error_upper = 1. # clipped abs error 88 | 89 | def __init__(self, capacity): 90 | # Making the tree 91 | self.tree = SumTree(capacity) 92 | 93 | # Next, we define a function to store a new experience in our tree. 94 | # Each new experience will have a score of max_prority (it will be then improved when we use this exp to train our DDQN). 
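    # (Illustrative note added for this write-up, not part of the original PER.py:
    #  giving every new experience the current maximum leaf priority makes it very
    #  likely to be replayed soon, before batch_update() replaces that optimistic
    #  priority with one based on its real TD error, p = (min(|error| + PER_e, 1)) ** PER_a.
    #  For example, with PER_e = 0.01 and PER_a = 0.6, an absolute error of 1.0 is
    #  clipped and keeps priority 1.0, while an error of 0.1 maps to
    #  (0.1 + 0.01) ** 0.6 ≈ 0.266, so larger errors are sampled more often
    #  but never exclusively.)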
95 | def store(self, experience): 96 | # Find the max priority 97 | max_priority = np.max(self.tree.tree[-self.tree.capacity:]) 98 | 99 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected 100 | # So we use a minimum priority 101 | if max_priority == 0: 102 | max_priority = self.absolute_error_upper 103 | 104 | self.tree.add(max_priority, experience) # set the max priority for new priority 105 | 106 | # Now we create sample function, which will be used to pick batch from our tree memory, which will be used to train our model. 107 | # - First, we sample a minibatch of n size, the range [0, priority_total] into priority ranges. 108 | # - Then a value is uniformly sampled from each range. 109 | # - Then we search in the sumtree, for the experience where priority score correspond to sample values are retrieved from. 110 | def sample(self, n): 111 | # Create a minibatch array that will contains the minibatch 112 | minibatch = [] 113 | 114 | b_idx = np.empty((n,), dtype=np.int32) 115 | 116 | # Calculate the priority segment 117 | # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges 118 | priority_segment = self.tree.total_priority / n # priority segment 119 | 120 | for i in range(n): 121 | # A value is uniformly sample from each range 122 | a, b = priority_segment * i, priority_segment * (i + 1) 123 | value = np.random.uniform(a, b) 124 | 125 | # Experience that correspond to each value is retrieved 126 | index, priority, data = self.tree.get_leaf(value) 127 | 128 | b_idx[i]= index 129 | 130 | minibatch.append([data[0],data[1],data[2],data[3],data[4]]) 131 | 132 | return b_idx, minibatch 133 | 134 | # Update the priorities on the tree 135 | def batch_update(self, tree_idx, abs_errors): 136 | abs_errors += self.PER_e # convert to abs and avoid 0 137 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper) 138 | ps = np.power(clipped_errors, self.PER_a) 139 | 140 | for ti, p in zip(tree_idx, ps): 141 | self.tree.update(ti, p) 142 | -------------------------------------------------------------------------------- /06_CartPole-reinforcement-learning_PER_D3QN_CNN/Cartpole_PER_D3QN_CNN.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from collections import deque 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 12 | from keras.optimizers import Adam, RMSprop 13 | from keras import backend as K 14 | from PER import * 15 | import cv2 16 | 17 | def OurModel(input_shape, action_space, dueling): 18 | X_input = Input(input_shape) 19 | X = X_input 20 | 21 | X = Conv2D(64, 5, strides=(3, 3),padding="valid", input_shape=input_shape, activation="relu", data_format="channels_first")(X) 22 | X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="relu", data_format="channels_first")(X) 23 | X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="relu", data_format="channels_first")(X) 24 | X = Flatten()(X) 25 | # 'Dense' is the basic form of a neural network layer 26 | # Input Layer of state size(4) and Hidden Layer with 512 nodes 27 | X = Dense(512, activation="relu", kernel_initializer='he_uniform')(X) 28 | 29 | # Hidden layer with 256 nodes 30 | X = Dense(256, activation="relu", 
kernel_initializer='he_uniform')(X) 31 | 32 | # Hidden layer with 64 nodes 33 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X) 34 | 35 | if dueling: 36 | state_value = Dense(1, kernel_initializer='he_uniform')(X) 37 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value) 38 | 39 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X) 40 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage) 41 | 42 | X = Add()([state_value, action_advantage]) 43 | else: 44 | # Output Layer with # of actions: 2 nodes (left, right) 45 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X) 46 | 47 | model = Model(inputs = X_input, outputs = X, name='CartPole PER D3QN CNN model') 48 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"]) 49 | 50 | model.summary() 51 | return model 52 | 53 | class DQNAgent: 54 | def __init__(self, env_name): 55 | self.env_name = env_name 56 | self.env = gym.make(env_name) 57 | self.env.seed(0) 58 | # by default, CartPole-v1 has max episode steps = 500 59 | # we can use this to experiment beyond 500 60 | self.env._max_episode_steps = 4000 61 | self.state_size = self.env.observation_space.shape[0] 62 | self.action_size = self.env.action_space.n 63 | self.EPISODES = 1000 64 | 65 | # Instantiate memory 66 | memory_size = 10000 67 | self.MEMORY = Memory(memory_size) 68 | self.memory = deque(maxlen=2000) 69 | 70 | self.gamma = 0.95 # discount rate 71 | 72 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy 73 | self.epsilon = 1.0 # exploration probability at start 74 | self.epsilon_min = 0.01 # minimum exploration probability 75 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob 76 | 77 | self.batch_size = 32 78 | 79 | # defining model parameters 80 | self.ddqn = True # use doudle deep q network 81 | self.Soft_Update = False # use soft parameter update 82 | self.dueling = True # use dealing netowrk 83 | self.epsilon_greedy = False # use epsilon greedy strategy 84 | self.USE_PER = True # use priority experienced replay 85 | 86 | self.TAU = 0.1 # target network soft update hyperparameter 87 | 88 | self.Save_Path = 'Models' 89 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 90 | self.scores, self.episodes, self.average = [], [], [] 91 | 92 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_PER_D3QN_CNN.h5") 93 | 94 | self.ROWS = 160 95 | self.COLS = 240 96 | self.REM_STEP = 4 97 | 98 | self.image_memory = np.zeros((self.REM_STEP, self.ROWS, self.COLS)) 99 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 100 | 101 | # create main model and target model 102 | self.model = OurModel(input_shape=self.state_size, action_space = self.action_size, dueling = self.dueling) 103 | self.target_model = OurModel(input_shape=self.state_size, action_space = self.action_size, dueling = self.dueling) 104 | 105 | # after some time interval update the target model to be same with model 106 | def update_target_model(self): 107 | if not self.Soft_Update and self.ddqn: 108 | self.target_model.set_weights(self.model.get_weights()) 109 | return 110 | if self.Soft_Update and self.ddqn: 111 | q_model_theta = self.model.get_weights() 112 | target_model_theta = self.target_model.get_weights() 113 | counter = 0 114 | for q_weight, target_weight in zip(q_model_theta, 
target_model_theta): 115 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU 116 | target_model_theta[counter] = target_weight 117 | counter += 1 118 | self.target_model.set_weights(target_model_theta) 119 | 120 | def remember(self, state, action, reward, next_state, done): 121 | experience = state, action, reward, next_state, done 122 | if self.USE_PER: 123 | self.MEMORY.store(experience) 124 | else: 125 | self.memory.append((experience)) 126 | 127 | def act(self, state, decay_step): 128 | # EPSILON GREEDY STRATEGY 129 | if self.epsilon_greedy: 130 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning 131 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step) 132 | # OLD EPSILON STRATEGY 133 | else: 134 | if self.epsilon > self.epsilon_min: 135 | self.epsilon *= (1-self.epsilon_decay) 136 | explore_probability = self.epsilon 137 | 138 | if explore_probability > np.random.rand(): 139 | # Make a random action (exploration) 140 | return random.randrange(self.action_size), explore_probability 141 | else: 142 | # Get action from Q-network (exploitation) 143 | # Estimate the Qs values state 144 | # Take the biggest Q value (= the best action) 145 | return np.argmax(self.model.predict(state)), explore_probability 146 | 147 | def replay(self): 148 | if self.USE_PER: 149 | # Sample minibatch from the PER memory 150 | tree_idx, minibatch = self.MEMORY.sample(self.batch_size) 151 | else: 152 | # Randomly sample minibatch from the deque memory 153 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size)) 154 | 155 | state = np.zeros((self.batch_size,) + self.state_size) 156 | next_state = np.zeros((self.batch_size,) + self.state_size) 157 | action, reward, done = [], [], [] 158 | 159 | # do this before prediction 160 | # for speedup, this could be done on the tensor level 161 | # but easier to understand using a loop 162 | for i in range(len(minibatch)): 163 | state[i] = minibatch[i][0] 164 | action.append(minibatch[i][1]) 165 | reward.append(minibatch[i][2]) 166 | next_state[i] = minibatch[i][3] 167 | done.append(minibatch[i][4]) 168 | 169 | # do batch prediction to save speed 170 | # predict Q-values for starting state using the main network 171 | target = self.model.predict(state) 172 | target_old = np.array(target) 173 | # predict best action in ending state using the main network 174 | target_next = self.model.predict(next_state) 175 | # predict Q-values for ending state using the target network 176 | target_val = self.target_model.predict(next_state) 177 | 178 | for i in range(len(minibatch)): 179 | # correction on the Q value for the action used 180 | if done[i]: 181 | target[i][action[i]] = reward[i] 182 | else: 183 | # the key point of Double DQN 184 | # selection of action is from model 185 | # update is from target model 186 | if self.ddqn: # Double - DQN 187 | # current Q Network selects the action 188 | # a'_max = argmax_a' Q(s', a') 189 | a = np.argmax(target_next[i]) 190 | # target Q Network evaluates the action 191 | # Q_max = Q_target(s', a'_max) 192 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a]) 193 | else: # Standard - DQN 194 | # DQN chooses the max Q value among next actions 195 | # selection and evaluation of action is on the target Q Network 196 | # Q_max = max_a' Q_target(s', a') 197 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i])) 198 | 199 | if self.USE_PER: 200 | indices = 
np.arange(self.batch_size, dtype=np.int32) 201 | absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)]) 202 | # Update priority 203 | self.MEMORY.batch_update(tree_idx, absolute_errors) 204 | 205 | # Train the Neural Network with batches 206 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0) 207 | 208 | def load(self, name): 209 | self.model = load_model(name) 210 | 211 | def save(self, name): 212 | self.model.save(name) 213 | 214 | pylab.figure(figsize=(18, 9)) 215 | def PlotModel(self, score, episode): 216 | self.scores.append(score) 217 | self.episodes.append(episode) 218 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 219 | pylab.plot(self.episodes, self.average, 'r') 220 | pylab.plot(self.episodes, self.scores, 'b') 221 | pylab.ylabel('Score', fontsize=18) 222 | pylab.xlabel('Steps', fontsize=18) 223 | dqn = 'DQN_' 224 | softupdate = '' 225 | dueling = '' 226 | greedy = '' 227 | PER = '' 228 | if self.ddqn: dqn = 'DDQN_' 229 | if self.Soft_Update: softupdate = '_soft' 230 | if self.dueling: dueling = '_Dueling' 231 | if self.epsilon_greedy: greedy = '_Greedy' 232 | if self.USE_PER: PER = '_PER' 233 | try: 234 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+"_CNN.png") 235 | except OSError: 236 | pass 237 | 238 | return str(self.average[-1])[:5] 239 | 240 | def imshow(self, image, rem_step=0): 241 | cv2.imshow("cartpole"+str(rem_step), image[rem_step,...]) 242 | if cv2.waitKey(25) & 0xFF == ord("q"): 243 | cv2.destroyAllWindows() 244 | return 245 | 246 | def GetImage(self): 247 | img = self.env.render(mode='rgb_array') 248 | 249 | img_rgb = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) 250 | img_rgb_resized = cv2.resize(img_rgb, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 251 | img_rgb_resized[img_rgb_resized < 255] = 0 252 | img_rgb_resized = img_rgb_resized / 255 253 | 254 | self.image_memory = np.roll(self.image_memory, 1, axis = 0) 255 | self.image_memory[0,:,:] = img_rgb_resized 256 | 257 | #self.imshow(self.image_memory,0) 258 | 259 | return np.expand_dims(self.image_memory, axis=0) 260 | 261 | def reset(self): 262 | self.env.reset() 263 | for i in range(self.REM_STEP): 264 | state = self.GetImage() 265 | return state 266 | 267 | def step(self,action): 268 | next_state, reward, done, info = self.env.step(action) 269 | next_state = self.GetImage() 270 | return next_state, reward, done, info 271 | 272 | def run(self): 273 | decay_step = 0 274 | for e in range(self.EPISODES): 275 | state = self.reset() 276 | done = False 277 | i = 0 278 | while not done: 279 | decay_step += 1 280 | action, explore_probability = self.act(state, decay_step) 281 | next_state, reward, done, _ = self.step(action) 282 | if not done or i == self.env._max_episode_steps-1: 283 | reward = reward 284 | else: 285 | reward = -100 286 | self.remember(state, action, reward, next_state, done) 287 | state = next_state 288 | i += 1 289 | if done: 290 | # every REM_STEP update target model 291 | if e % self.REM_STEP == 0: 292 | self.update_target_model() 293 | 294 | # every episode, plot the result 295 | average = self.PlotModel(i, e) 296 | 297 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average)) 298 | if i == self.env._max_episode_steps: 299 | print("Saving trained model to", self.Model_name) 300 | #self.save(self.Model_name) 301 | break 302 | self.replay() 303 | self.env.close() 304 | 305 | def test(self): 306 | self.load(self.Model_name) 307 | for 
e in range(self.EPISODES): 308 | state = self.reset() 309 | done = False 310 | i = 0 311 | while not done: 312 | action = np.argmax(self.model.predict(state)) 313 | next_state, reward, done, _ = env.step(action) 314 | i += 1 315 | if done: 316 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i)) 317 | break 318 | 319 | if __name__ == "__main__": 320 | env_name = 'CartPole-v1' 321 | agent = DQNAgent(env_name) 322 | agent.run() 323 | #agent.test() 324 | -------------------------------------------------------------------------------- /06_CartPole-reinforcement-learning_PER_D3QN_CNN/PER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SumTree(object): 4 | data_pointer = 0 5 | 6 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0 7 | def __init__(self, capacity): 8 | # Number of leaf nodes (final nodes) that contains experiences 9 | self.capacity = capacity 10 | 11 | # Generate the tree with all nodes values = 0 12 | # To understand this calculation (2 * capacity - 1) look at the schema below 13 | # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node) 14 | # Parent nodes = capacity - 1 15 | # Leaf nodes = capacity 16 | self.tree = np.zeros(2 * capacity - 1) 17 | 18 | # Contains the experiences (so the size of data is capacity) 19 | self.data = np.zeros(capacity, dtype=object) 20 | 21 | 22 | # Here we define function that will add our priority score in the sumtree leaf and add the experience in data: 23 | def add(self, priority, data): 24 | # Look at what index we want to put the experience 25 | tree_index = self.data_pointer + self.capacity - 1 26 | 27 | # Update data frame 28 | self.data[self.data_pointer] = data 29 | 30 | # Update the leaf 31 | self.update (tree_index, priority) 32 | 33 | # Add 1 to data_pointer 34 | self.data_pointer += 1 35 | 36 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite) 37 | self.data_pointer = 0 38 | 39 | # Update the leaf priority score and propagate the change through tree 40 | def update(self, tree_index, priority): 41 | # Change = new priority score - former priority score 42 | change = priority - self.tree[tree_index] 43 | self.tree[tree_index] = priority 44 | 45 | # then propagate the change through tree 46 | # this method is faster than the recursive loop in the reference code 47 | while tree_index != 0: 48 | tree_index = (tree_index - 1) // 2 49 | self.tree[tree_index] += change 50 | 51 | # Here build a function to get a leaf from our tree. 
So we'll build a function to get the leaf_index, priority value of that leaf and experience associated with that leaf index: 52 | def get_leaf(self, v): 53 | parent_index = 0 54 | 55 | # the while loop is faster than the method in the reference code 56 | while True: 57 | left_child_index = 2 * parent_index + 1 58 | right_child_index = left_child_index + 1 59 | 60 | # If we reach bottom, end the search 61 | if left_child_index >= len(self.tree): 62 | leaf_index = parent_index 63 | break 64 | else: # downward search, always search for a higher priority node 65 | if v <= self.tree[left_child_index]: 66 | parent_index = left_child_index 67 | else: 68 | v -= self.tree[left_child_index] 69 | parent_index = right_child_index 70 | 71 | data_index = leaf_index - self.capacity + 1 72 | 73 | return leaf_index, self.tree[leaf_index], self.data[data_index] 74 | 75 | @property 76 | def total_priority(self): 77 | return self.tree[0] # Returns the root node 78 | 79 | # Now we finished constructing our SumTree object, next we'll build a memory object. 80 | class Memory(object): # stored as ( state, action, reward, next_state ) in SumTree 81 | PER_e = 0.01 # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken 82 | PER_a = 0.6 # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly 83 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1 84 | 85 | PER_b_increment_per_sampling = 0.001 86 | 87 | absolute_error_upper = 1. # clipped abs error 88 | 89 | def __init__(self, capacity): 90 | # Making the tree 91 | self.tree = SumTree(capacity) 92 | 93 | # Next, we define a function to store a new experience in our tree. 94 | # Each new experience will have a score of max_prority (it will be then improved when we use this exp to train our DDQN). 95 | def store(self, experience): 96 | # Find the max priority 97 | max_priority = np.max(self.tree.tree[-self.tree.capacity:]) 98 | 99 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected 100 | # So we use a minimum priority 101 | if max_priority == 0: 102 | max_priority = self.absolute_error_upper 103 | 104 | self.tree.add(max_priority, experience) # set the max priority for new priority 105 | 106 | # Now we create sample function, which will be used to pick batch from our tree memory, which will be used to train our model. 107 | # - First, we sample a minibatch of n size, the range [0, priority_total] into priority ranges. 108 | # - Then a value is uniformly sampled from each range. 109 | # - Then we search in the sumtree, for the experience where priority score correspond to sample values are retrieved from. 
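    # (Illustrative note added for this write-up, not part of the original PER.py:
    #  if total_priority is 12 and n = 4, the segments are [0, 3), [3, 6), [6, 9)
    #  and [9, 12); one value is drawn uniformly from each segment, and get_leaf()
    #  walks down the tree to the leaf whose cumulative priority range contains it.
    #  A leaf holding priority 6 therefore owns half of the total range and is
    #  roughly three times as likely to be picked as a leaf holding priority 2.)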
110 |     def sample(self, n):
111 |         # Create an array that will contain the minibatch
112 |         minibatch = []
113 | 
114 |         b_idx = np.empty((n,), dtype=np.int32)
115 | 
116 |         # Calculate the priority segment
117 |         # Here, as explained in the paper, we divide the range [0, ptotal] into n segments
118 |         priority_segment = self.tree.total_priority / n  # priority segment
119 | 
120 |         for i in range(n):
121 |             # A value is uniformly sampled from each segment
122 |             a, b = priority_segment * i, priority_segment * (i + 1)
123 |             value = np.random.uniform(a, b)
124 | 
125 |             # The experience that corresponds to each value is retrieved
126 |             index, priority, data = self.tree.get_leaf(value)
127 | 
128 |             b_idx[i] = index
129 | 
130 |             minibatch.append([data[0], data[1], data[2], data[3], data[4]])
131 | 
132 |         return b_idx, minibatch
133 | 
134 |     # Update the priorities on the tree
135 |     def batch_update(self, tree_idx, abs_errors):
136 |         abs_errors += self.PER_e  # add PER_e so no priority is exactly 0
137 |         clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
138 |         ps = np.power(clipped_errors, self.PER_a)
139 | 
140 |         for ti, p in zip(tree_idx, ps):
141 |             self.tree.update(ti, p)
142 | 
--------------------------------------------------------------------------------
/06_CartPole-reinforcement-learning_PER_D3QN_CNN/random_game.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | import cv2
5 | 
6 | class DQN_CNN_Agent:
7 |     def __init__(self, env_name):
8 |         self.env_name = env_name
9 |         self.env = gym.make(env_name)
10 |         self.ROWS = 160
11 |         self.COLS = 240
12 |         self.REM_STEP = 4
13 | 
14 |         self.EPISODES = 10
15 | 
16 |         self.image_memory = np.zeros((self.REM_STEP, self.ROWS, self.COLS))
17 | 
18 |     def imshow(self, image, rem_step=0):
19 |         cv2.imshow(self.env_name + str(rem_step), image[rem_step, ...])
20 |         if cv2.waitKey(25) & 0xFF == ord("q"):
21 |             cv2.destroyAllWindows()
22 |             return
23 | 
24 |     def GetImage(self):
25 |         img = self.env.render(mode='rgb_array')
26 | 
27 |         img_rgb = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
28 |         img_rgb_resized = cv2.resize(img_rgb, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
29 |         img_rgb_resized[img_rgb_resized < 255] = 0
30 |         img_rgb_resized = img_rgb_resized / 255
31 | 
32 |         self.image_memory = np.roll(self.image_memory, 1, axis=0)
33 |         self.image_memory[0, :, :] = img_rgb_resized
34 | 
35 |         self.imshow(self.image_memory, 0)
36 | 
37 |         return np.expand_dims(self.image_memory, axis=0)
38 | 
39 |     def reset(self):
40 |         self.env.reset()
41 |         for i in range(self.REM_STEP):
42 |             state = self.GetImage()
43 |         return state
44 | 
45 |     def step(self, action):
46 |         next_state, reward, done, info = self.env.step(action)
47 |         next_state = self.GetImage()
48 |         return next_state, reward, done, info
49 | 
50 |     def run(self):
51 |         # Each of these episodes is its own game.
52 |         for episode in range(self.EPISODES):
53 |             self.reset()
54 |             # this loops over frames, up to 500... but we won't get that far with random actions.
55 |             for t in range(500):
56 |                 # This will just sample a random action in any environment.
57 |                 # In this environment, the action can be 0 or 1, which is left or right
58 |                 action = self.env.action_space.sample()
59 | 
60 |                 # this executes the environment with an action,
61 |                 # and returns the observation of the environment,
62 |                 # the reward, whether the env is over, and other info.
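                # self.step() above forwards the action to the wrapped env and returns
                # the stacked, rendered grayscale frames from image_memory as the
                # observation instead of the raw 4-value CartPole state; note that with
                # gym >= 0.26 / gymnasium, env.step() would instead return the 5-tuple
                # (obs, reward, terminated, truncated, info)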
63 | next_state, reward, done, info = self.step(action) 64 | 65 | # lets print everything in one line: 66 | #print(t, next_state, reward, done, info, action) 67 | if done: 68 | break 69 | 70 | if __name__ == "__main__": 71 | env_name = 'CartPole-v1' 72 | agent = DQN_CNN_Agent(env_name) 73 | agent.run() 74 | -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_CNN.png -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_CNN.png -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_PER_CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_PER_CNN.png -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DQN_Pong-v0_CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DQN_Pong-v0_CNN.png -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_CNN.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_CNN.h5 -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_CNN.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_CNN.h5 -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_PER_CNN.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_PER_CNN.h5 -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DQN_CNN.h5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DQN_CNN.h5 -------------------------------------------------------------------------------- /07_Pong-reinforcement-learning_DQN_CNN/PER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SumTree(object): 4 | data_pointer = 0 5 | 6 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0 7 | def __init__(self, capacity): 8 | # Number of leaf nodes (final nodes) that contains experiences 9 | self.capacity = capacity 10 | 11 | # Generate the tree with all nodes values = 0 12 | # To understand this calculation (2 * capacity - 1) look at the schema below 13 | # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node) 14 | # Parent nodes = capacity - 1 15 | # Leaf nodes = capacity 16 | self.tree = np.zeros(2 * capacity - 1) 17 | 18 | # Contains the experiences (so the size of data is capacity) 19 | self.data = np.zeros(capacity, dtype=object) 20 | 21 | 22 | # Here we define function that will add our priority score in the sumtree leaf and add the experience in data: 23 | def add(self, priority, data): 24 | # Look at what index we want to put the experience 25 | tree_index = self.data_pointer + self.capacity - 1 26 | 27 | # Update data frame 28 | self.data[self.data_pointer] = data 29 | 30 | # Update the leaf 31 | self.update (tree_index, priority) 32 | 33 | # Add 1 to data_pointer 34 | self.data_pointer += 1 35 | 36 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite) 37 | self.data_pointer = 0 38 | 39 | # Update the leaf priority score and propagate the change through tree 40 | def update(self, tree_index, priority): 41 | # Change = new priority score - former priority score 42 | change = priority - self.tree[tree_index] 43 | self.tree[tree_index] = priority 44 | 45 | # then propagate the change through tree 46 | # this method is faster than the recursive loop in the reference code 47 | while tree_index != 0: 48 | tree_index = (tree_index - 1) // 2 49 | self.tree[tree_index] += change 50 | 51 | # Here build a function to get a leaf from our tree. So we'll build a function to get the leaf_index, priority value of that leaf and experience associated with that leaf index: 52 | def get_leaf(self, v): 53 | parent_index = 0 54 | 55 | # the while loop is faster than the method in the reference code 56 | while True: 57 | left_child_index = 2 * parent_index + 1 58 | right_child_index = left_child_index + 1 59 | 60 | # If we reach bottom, end the search 61 | if left_child_index >= len(self.tree): 62 | leaf_index = parent_index 63 | break 64 | else: # downward search, always search for a higher priority node 65 | if v <= self.tree[left_child_index]: 66 | parent_index = left_child_index 67 | else: 68 | v -= self.tree[left_child_index] 69 | parent_index = right_child_index 70 | 71 | data_index = leaf_index - self.capacity + 1 72 | 73 | return leaf_index, self.tree[leaf_index], self.data[data_index] 74 | 75 | @property 76 | def total_priority(self): 77 | return self.tree[0] # Returns the root node 78 | 79 | # Now we finished constructing our SumTree object, next we'll build a memory object. 
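# --- Added illustration (not part of the original file) ---
# For reference, proportional PER uses:
#   priority      p_i = (|TD error_i| + PER_e) ** PER_a
#   sample prob.  P(i) = p_i / sum_k p_k        (the SumTree root stores sum_k p_k)
#   IS weight     w_i = (N * P(i)) ** (-PER_b)  (PER_b would be annealed towards 1)
# Note that Memory.sample() below returns only the tree indices and the minibatch;
# it does not compute importance-sampling weights, although PER_b is defined.
# A tiny hypothetical demo of the SumTree defined above; the function name and the
# numbers are assumptions added purely for illustration:
def _sumtree_demo():
    demo_tree = SumTree(capacity=4)
    for priority, experience in zip([3.0, 1.0, 4.0, 2.0], ["a", "b", "c", "d"]):
        demo_tree.add(priority, experience)
    # The root node now holds the total priority: 3 + 1 + 4 + 2 = 10
    assert demo_tree.total_priority == 10.0
    # A sampled value of 2.5 falls inside the first leaf's range [0, 3), so
    # get_leaf() walks left twice and returns that leaf and its stored experience
    leaf_index, priority, experience = demo_tree.get_leaf(2.5)
    return leaf_index, priority, experience  # (3, 3.0, 'a') for capacity 4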
80 | class Memory(object): # stored as ( state, action, reward, next_state ) in SumTree 81 | PER_e = 0.01 # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken 82 | PER_a = 0.6 # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly 83 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1 84 | 85 | PER_b_increment_per_sampling = 0.001 86 | 87 | absolute_error_upper = 1. # clipped abs error 88 | 89 | def __init__(self, capacity): 90 | # Making the tree 91 | self.tree = SumTree(capacity) 92 | 93 | # Next, we define a function to store a new experience in our tree. 94 | # Each new experience will have a score of max_prority (it will be then improved when we use this exp to train our DDQN). 95 | def store(self, experience): 96 | # Find the max priority 97 | max_priority = np.max(self.tree.tree[-self.tree.capacity:]) 98 | 99 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected 100 | # So we use a minimum priority 101 | if max_priority == 0: 102 | max_priority = self.absolute_error_upper 103 | 104 | self.tree.add(max_priority, experience) # set the max priority for new priority 105 | 106 | # Now we create sample function, which will be used to pick batch from our tree memory, which will be used to train our model. 107 | # - First, we sample a minibatch of n size, the range [0, priority_total] into priority ranges. 108 | # - Then a value is uniformly sampled from each range. 109 | # - Then we search in the sumtree, for the experience where priority score correspond to sample values are retrieved from. 110 | def sample(self, n): 111 | # Create a minibatch array that will contains the minibatch 112 | minibatch = [] 113 | 114 | b_idx = np.empty((n,), dtype=np.int32) 115 | 116 | # Calculate the priority segment 117 | # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges 118 | priority_segment = self.tree.total_priority / n # priority segment 119 | 120 | for i in range(n): 121 | # A value is uniformly sample from each range 122 | a, b = priority_segment * i, priority_segment * (i + 1) 123 | value = np.random.uniform(a, b) 124 | 125 | # Experience that correspond to each value is retrieved 126 | index, priority, data = self.tree.get_leaf(value) 127 | 128 | b_idx[i]= index 129 | 130 | minibatch.append([data[0],data[1],data[2],data[3],data[4]]) 131 | 132 | return b_idx, minibatch 133 | 134 | # Update the priorities on the tree 135 | def batch_update(self, tree_idx, abs_errors): 136 | abs_errors += self.PER_e # convert to abs and avoid 0 137 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper) 138 | ps = np.power(clipped_errors, self.PER_a) 139 | 140 | for ti, p in zip(tree_idx, ps): 141 | self.tree.update(ti, p) 142 | -------------------------------------------------------------------------------- /08_Pong-v0_Policy_gradient/IMAGES/Pong-v0_PG_2.5e-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/08_Pong-v0_Policy_gradient/IMAGES/Pong-v0_PG_2.5e-05.png -------------------------------------------------------------------------------- /08_Pong-v0_Policy_gradient/IMAGES/PongDeterministic-v4_PG_0.0001.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/08_Pong-v0_Policy_gradient/IMAGES/PongDeterministic-v4_PG_0.0001.png -------------------------------------------------------------------------------- /08_Pong-v0_Policy_gradient/Pong-v0_PG.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from keras.models import Model, load_model 10 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 11 | from keras.optimizers import Adam, RMSprop 12 | from keras import backend as K 13 | import cv2 14 | 15 | def OurModel(input_shape, action_space, lr): 16 | X_input = Input(input_shape) 17 | 18 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input) 19 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X) 20 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X) 21 | X = Flatten(input_shape=input_shape)(X_input) 22 | 23 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X) 24 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X) 25 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X) 26 | 27 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X) 28 | 29 | Actor = Model(inputs = X_input, outputs = action) 30 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr)) 31 | 32 | return Actor 33 | 34 | class PGAgent: 35 | # Policy Gradient Main Optimization Algorithm 36 | def __init__(self, env_name): 37 | # Initialization 38 | # Environment and PG parameters 39 | self.env_name = env_name 40 | self.env = gym.make(env_name) 41 | self.action_size = self.env.action_space.n 42 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong 43 | self.lr = 0.000025 44 | 45 | self.ROWS = 80 46 | self.COLS = 80 47 | self.REM_STEP = 4 48 | 49 | # Instantiate games and plot memory 50 | self.states, self.actions, self.rewards = [], [], [] 51 | self.scores, self.episodes, self.average = [], [], [] 52 | 53 | self.Save_Path = 'Models' 54 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 55 | self.image_memory = np.zeros(self.state_size) 56 | 57 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 58 | self.path = '{}_PG_{}'.format(self.env_name, self.lr) 59 | self.Model_name = os.path.join(self.Save_Path, self.path) 60 | 61 | # Create Actor network model 62 | self.Actor = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr) 63 | 64 | def remember(self, state, action, reward): 65 | # store episode actions to memory 66 | self.states.append(state) 67 | action_onehot = np.zeros([self.action_size]) 68 | action_onehot[action] = 1 69 | self.actions.append(action_onehot) 70 | self.rewards.append(reward) 71 | 72 | def act(self, state): 73 | # Use the network to predict the next action to take, using the model 74 | prediction = self.Actor.predict(state)[0] 75 | action = np.random.choice(self.action_size, p=prediction) 76 | return action 77 | 78 | def discount_rewards(self, reward): 79 | # Compute the gamma-discounted rewards over an episode 80 | gamma = 0.99 # discount rate 81 | running_add = 0 82 | 
discounted_r = np.zeros_like(reward) 83 | for i in reversed(range(0,len(reward))): 84 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!) 85 | running_add = 0 86 | running_add = running_add * gamma + reward[i] 87 | discounted_r[i] = running_add 88 | 89 | discounted_r -= np.mean(discounted_r) # normalizing the result 90 | discounted_r /= np.std(discounted_r) # divide by standard deviation 91 | return discounted_r 92 | 93 | def replay(self): 94 | # reshape memory to appropriate shape for training 95 | states = np.vstack(self.states) 96 | actions = np.vstack(self.actions) 97 | 98 | # Compute discounted rewards 99 | discounted_r = self.discount_rewards(self.rewards) 100 | 101 | # training PG network 102 | self.Actor.fit(states, actions, sample_weight=discounted_r, epochs=1, verbose=0) 103 | # reset training memory 104 | self.states, self.actions, self.rewards = [], [], [] 105 | 106 | def load(self, Actor_name): 107 | self.Actor = load_model(Actor_name, compile=False) 108 | 109 | def save(self): 110 | self.Actor.save(self.Model_name + '.h5') 111 | 112 | pylab.figure(figsize=(18, 9)) 113 | def PlotModel(self, score, episode): 114 | self.scores.append(score) 115 | self.episodes.append(episode) 116 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 117 | if str(episode)[-2:] == "00":# much faster than episode % 100 118 | pylab.plot(self.episodes, self.scores, 'b') 119 | pylab.plot(self.episodes, self.average, 'r') 120 | pylab.ylabel('Score', fontsize=18) 121 | pylab.xlabel('Steps', fontsize=18) 122 | try: 123 | pylab.savefig(self.path+".png") 124 | except OSError: 125 | pass 126 | 127 | return self.average[-1] 128 | 129 | def imshow(self, image, rem_step=0): 130 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...]) 131 | if cv2.waitKey(25) & 0xFF == ord("q"): 132 | cv2.destroyAllWindows() 133 | return 134 | 135 | def GetImage(self, frame): 136 | # croping frame to 80x80 size 137 | frame_cropped = frame[35:195:2, ::2,:] 138 | if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS: 139 | # OpenCV resize function 140 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 141 | 142 | # converting to RGB (numpy way) 143 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2] 144 | 145 | # convert everything to black and white (agent will train faster) 146 | frame_rgb[frame_rgb < 100] = 0 147 | frame_rgb[frame_rgb >= 100] = 255 148 | # converting to RGB (OpenCV way) 149 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY) 150 | 151 | # dividing by 255 we expresses value to 0-1 representation 152 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0 153 | 154 | # push our data by 1 frame, similar as deq() function work 155 | self.image_memory = np.roll(self.image_memory, 1, axis = 0) 156 | 157 | # inserting new frame to free space 158 | self.image_memory[0,:,:] = new_frame 159 | 160 | # show image frame 161 | #self.imshow(self.image_memory,0) 162 | #self.imshow(self.image_memory,1) 163 | #self.imshow(self.image_memory,2) 164 | #self.imshow(self.image_memory,3) 165 | return np.expand_dims(self.image_memory, axis=0) 166 | 167 | def reset(self): 168 | frame = self.env.reset() 169 | for i in range(self.REM_STEP): 170 | state = self.GetImage(frame) 171 | return state 172 | 173 | def step(self,action): 174 | next_state, reward, done, info = self.env.step(action) 175 | next_state = self.GetImage(next_state) 176 | return 
next_state, reward, done, info 177 | 178 | def run(self): 179 | for e in range(self.EPISODES): 180 | state = self.reset() 181 | done, score, SAVING = False, 0, '' 182 | while not done: 183 | #self.env.render() 184 | # Actor picks an action 185 | action = self.act(state) 186 | # Retrieve new state, reward, and whether the state is terminal 187 | next_state, reward, done, _ = self.step(action) 188 | # Memorize (state, action, reward) for training 189 | self.remember(state, action, reward) 190 | # Update current state 191 | state = next_state 192 | score += reward 193 | if done: 194 | average = self.PlotModel(score, e) 195 | # saving best models 196 | if average >= self.max_average: 197 | self.max_average = average 198 | self.save() 199 | SAVING = "SAVING" 200 | else: 201 | SAVING = "" 202 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING)) 203 | 204 | self.replay() 205 | 206 | # close environemnt when finish training 207 | self.env.close() 208 | 209 | def test(self, Model_name): 210 | self.load(Model_name) 211 | for e in range(100): 212 | state = self.reset() 213 | done = False 214 | score = 0 215 | while not done: 216 | self.env.render() 217 | action = np.argmax(self.Actor.predict(state)) 218 | state, reward, done, _ = self.step(action) 219 | score += reward 220 | if done: 221 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score)) 222 | break 223 | self.env.close() 224 | 225 | if __name__ == "__main__": 226 | #env_name = 'Pong-v0' 227 | env_name = 'PongDeterministic-v4' 228 | agent = PGAgent(env_name) 229 | agent.run() 230 | #agent.test('Models/PongDeterministic-v4_PG_2.5e-05.h5') 231 | #agent.test('Models/Pong-v0_PG_2.5e-05.h5') 232 | -------------------------------------------------------------------------------- /08_Pong-v0_Policy_gradient/Pong-v0_PG_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | import random 6 | import gym 7 | import pylab 8 | import numpy as np 9 | from tensorflow.keras.models import Model, load_model 10 | from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 11 | from tensorflow.keras.optimizers import Adam, RMSprop 12 | from tensorflow.keras import backend as K 13 | import cv2 14 | 15 | def OurModel(input_shape, action_space, lr): 16 | X_input = Input(input_shape) 17 | 18 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input) 19 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X) 20 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X) 21 | X = Flatten(input_shape=input_shape)(X_input) 22 | 23 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X) 24 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X) 25 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X) 26 | 27 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X) 28 | 29 | Actor = Model(inputs = X_input, outputs = action) 30 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr)) 31 | 32 | return Actor 33 | 34 | class PGAgent: 35 | # Policy Gradient Main Optimization Algorithm 36 | def __init__(self, env_name): 37 | # Initialization 38 | # Environment and PG parameters 39 | self.env_name = 
env_name 40 | self.env = gym.make(env_name) 41 | self.action_size = self.env.action_space.n 42 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong 43 | self.lr = 0.000025 44 | 45 | self.ROWS = 80 46 | self.COLS = 80 47 | self.REM_STEP = 4 48 | 49 | # Instantiate games and plot memory 50 | self.states, self.actions, self.rewards = [], [], [] 51 | self.scores, self.episodes, self.average = [], [], [] 52 | 53 | self.Save_Path = 'Models' 54 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 55 | self.image_memory = np.zeros(self.state_size) 56 | 57 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 58 | self.path = '{}_PG_{}'.format(self.env_name, self.lr) 59 | self.Model_name = os.path.join(self.Save_Path, self.path) 60 | 61 | # Create Actor network model 62 | self.Actor = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr) 63 | 64 | def remember(self, state, action, reward): 65 | # store episode actions to memory 66 | self.states.append(state) 67 | action_onehot = np.zeros([self.action_size]) 68 | action_onehot[action] = 1 69 | self.actions.append(action_onehot) 70 | self.rewards.append(reward) 71 | 72 | def act(self, state): 73 | # Use the network to predict the next action to take, using the model 74 | prediction = self.Actor.predict(state)[0] 75 | action = np.random.choice(self.action_size, p=prediction) 76 | return action 77 | 78 | def discount_rewards(self, reward): 79 | # Compute the gamma-discounted rewards over an episode 80 | gamma = 0.99 # discount rate 81 | running_add = 0 82 | discounted_r = np.zeros_like(reward) 83 | for i in reversed(range(0,len(reward))): 84 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!) 85 | running_add = 0 86 | running_add = running_add * gamma + reward[i] 87 | discounted_r[i] = running_add 88 | 89 | discounted_r -= np.mean(discounted_r) # normalizing the result 90 | discounted_r /= np.std(discounted_r) # divide by standard deviation 91 | return discounted_r 92 | 93 | def replay(self): 94 | # reshape memory to appropriate shape for training 95 | states = np.vstack(self.states) 96 | actions = np.vstack(self.actions) 97 | 98 | # Compute discounted rewards 99 | discounted_r = self.discount_rewards(self.rewards) 100 | 101 | # training PG network 102 | self.Actor.fit(states, actions, sample_weight=discounted_r, epochs=1, verbose=0) 103 | # reset training memory 104 | self.states, self.actions, self.rewards = [], [], [] 105 | 106 | def load(self, Actor_name): 107 | self.Actor = load_model(Actor_name, compile=False) 108 | 109 | def save(self): 110 | self.Actor.save(self.Model_name + '.h5') 111 | 112 | pylab.figure(figsize=(18, 9)) 113 | def PlotModel(self, score, episode): 114 | self.scores.append(score) 115 | self.episodes.append(episode) 116 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 117 | if str(episode)[-2:] == "00":# much faster than episode % 100 118 | pylab.plot(self.episodes, self.scores, 'b') 119 | pylab.plot(self.episodes, self.average, 'r') 120 | pylab.ylabel('Score', fontsize=18) 121 | pylab.xlabel('Steps', fontsize=18) 122 | try: 123 | pylab.savefig(self.path+".png") 124 | except OSError: 125 | pass 126 | 127 | return self.average[-1] 128 | 129 | def imshow(self, image, rem_step=0): 130 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...]) 131 | if cv2.waitKey(25) & 0xFF == ord("q"): 132 | cv2.destroyAllWindows() 133 | return 134 | 135 | def GetImage(self, frame): 136 | # croping frame to 80x80 size 137 
| frame_cropped = frame[35:195:2, ::2,:] 138 | if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS: 139 | # OpenCV resize function 140 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 141 | 142 | # converting to RGB (numpy way) 143 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2] 144 | 145 | # convert everything to black and white (agent will train faster) 146 | frame_rgb[frame_rgb < 100] = 0 147 | frame_rgb[frame_rgb >= 100] = 255 148 | # converting to RGB (OpenCV way) 149 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY) 150 | 151 | # dividing by 255 we expresses value to 0-1 representation 152 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0 153 | 154 | # push our data by 1 frame, similar as deq() function work 155 | self.image_memory = np.roll(self.image_memory, 1, axis = 0) 156 | 157 | # inserting new frame to free space 158 | self.image_memory[0,:,:] = new_frame 159 | 160 | # show image frame 161 | #self.imshow(self.image_memory,0) 162 | #self.imshow(self.image_memory,1) 163 | #self.imshow(self.image_memory,2) 164 | #self.imshow(self.image_memory,3) 165 | return np.expand_dims(self.image_memory, axis=0) 166 | 167 | def reset(self): 168 | frame = self.env.reset() 169 | for i in range(self.REM_STEP): 170 | state = self.GetImage(frame) 171 | return state 172 | 173 | def step(self,action): 174 | next_state, reward, done, info = self.env.step(action) 175 | next_state = self.GetImage(next_state) 176 | return next_state, reward, done, info 177 | 178 | def run(self): 179 | for e in range(self.EPISODES): 180 | state = self.reset() 181 | done, score, SAVING = False, 0, '' 182 | while not done: 183 | #self.env.render() 184 | # Actor picks an action 185 | action = self.act(state) 186 | # Retrieve new state, reward, and whether the state is terminal 187 | next_state, reward, done, _ = self.step(action) 188 | # Memorize (state, action, reward) for training 189 | self.remember(state, action, reward) 190 | # Update current state 191 | state = next_state 192 | score += reward 193 | if done: 194 | average = self.PlotModel(score, e) 195 | # saving best models 196 | if average >= self.max_average: 197 | self.max_average = average 198 | self.save() 199 | SAVING = "SAVING" 200 | else: 201 | SAVING = "" 202 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING)) 203 | 204 | self.replay() 205 | 206 | # close environemnt when finish training 207 | self.env.close() 208 | 209 | def test(self, Model_name): 210 | self.load(Model_name) 211 | for e in range(100): 212 | state = self.reset() 213 | done = False 214 | score = 0 215 | while not done: 216 | self.env.render() 217 | action = np.argmax(self.Actor.predict(state)) 218 | state, reward, done, _ = self.step(action) 219 | score += reward 220 | if done: 221 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score)) 222 | break 223 | self.env.close() 224 | 225 | if __name__ == "__main__": 226 | #env_name = 'Pong-v0' 227 | env_name = 'PongDeterministic-v4' 228 | agent = PGAgent(env_name) 229 | agent.run() 230 | #agent.test('Models/PongDeterministic-v4_PG_2.5e-05.h5') 231 | #agent.test('Models/Pong-v0_PG_2.5e-05.h5') 232 | -------------------------------------------------------------------------------- /09_Pong-v0_A2C/IMAGES/Pong-v0_A2C_2.5e-05.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/09_Pong-v0_A2C/IMAGES/Pong-v0_A2C_2.5e-05.png -------------------------------------------------------------------------------- /09_Pong-v0_A2C/IMAGES/PongDeterministic-v4_A2C_2.5e-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/09_Pong-v0_A2C/IMAGES/PongDeterministic-v4_A2C_2.5e-05.png -------------------------------------------------------------------------------- /09_Pong-v0_A2C/Pong-v0_A2C.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | #os.environ['CUDA_VISIBLE_DEVICES'] = '2' 6 | import random 7 | import gym 8 | import pylab 9 | import numpy as np 10 | from keras.models import Model, load_model 11 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 12 | from keras.optimizers import Adam, RMSprop 13 | from keras import backend as K 14 | import cv2 15 | 16 | def OurModel(input_shape, action_space, lr): 17 | X_input = Input(input_shape) 18 | 19 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input) 20 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X) 21 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X) 22 | X = Flatten(input_shape=input_shape)(X_input) 23 | 24 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X) 25 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X) 26 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X) 27 | 28 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X) 29 | value = Dense(1, kernel_initializer='he_uniform')(X) 30 | 31 | Actor = Model(inputs = X_input, outputs = action) 32 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr)) 33 | 34 | Critic = Model(inputs = X_input, outputs = value) 35 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr)) 36 | 37 | return Actor, Critic 38 | 39 | class A2CAgent: 40 | # Actor-Critic Main Optimization Algorithm 41 | def __init__(self, env_name): 42 | # Initialization 43 | # Environment and PPO parameters 44 | self.env_name = env_name 45 | self.env = gym.make(env_name) 46 | self.action_size = self.env.action_space.n 47 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong 48 | self.lr = 0.000025 49 | 50 | self.ROWS = 80 51 | self.COLS = 80 52 | self.REM_STEP = 4 53 | 54 | # Instantiate games and plot memory 55 | self.states, self.actions, self.rewards = [], [], [] 56 | self.scores, self.episodes, self.average = [], [], [] 57 | 58 | self.Save_Path = 'Models' 59 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 60 | self.image_memory = np.zeros(self.state_size) 61 | 62 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 63 | self.path = '{}_A2C_{}'.format(self.env_name, self.lr) 64 | self.Model_name = os.path.join(self.Save_Path, self.path) 65 | 66 | # Create Actor-Critic network model 67 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr) 68 | 69 | 70 | def remember(self, state, action, reward): 71 | # store 
episode actions to memory 72 | self.states.append(state) 73 | action_onehot = np.zeros([self.action_size]) 74 | action_onehot[action] = 1 75 | self.actions.append(action_onehot) 76 | self.rewards.append(reward) 77 | 78 | 79 | def act(self, state): 80 | # Use the network to predict the next action to take, using the model 81 | prediction = self.Actor.predict(state)[0] 82 | action = np.random.choice(self.action_size, p=prediction) 83 | return action 84 | 85 | def discount_rewards(self, reward): 86 | # Compute the gamma-discounted rewards over an episode 87 | gamma = 0.99 # discount rate 88 | running_add = 0 89 | discounted_r = np.zeros_like(reward) 90 | for i in reversed(range(0,len(reward))): 91 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!) 92 | running_add = 0 93 | running_add = running_add * gamma + reward[i] 94 | discounted_r[i] = running_add 95 | 96 | discounted_r -= np.mean(discounted_r) # normalizing the result 97 | discounted_r /= np.std(discounted_r) # divide by standard deviation 98 | return discounted_r 99 | 100 | 101 | def replay(self): 102 | # reshape memory to appropriate shape for training 103 | states = np.vstack(self.states) 104 | actions = np.vstack(self.actions) 105 | 106 | # Compute discounted rewards 107 | discounted_r = self.discount_rewards(self.rewards) 108 | 109 | # Get Critic network predictions 110 | values = self.Critic.predict(states)[:, 0] 111 | # Compute advantages 112 | advantages = discounted_r - values 113 | # training Actor and Critic networks 114 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0) 115 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0) 116 | # reset training memory 117 | self.states, self.actions, self.rewards = [], [], [] 118 | 119 | def load(self, Actor_name, Critic_name): 120 | self.Actor = load_model(Actor_name, compile=False) 121 | #self.Critic = load_model(Critic_name, compile=False) 122 | 123 | def save(self): 124 | self.Actor.save(self.Model_name + '_Actor.h5') 125 | #self.Critic.save(self.Model_name + '_Critic.h5') 126 | 127 | pylab.figure(figsize=(18, 9)) 128 | def PlotModel(self, score, episode): 129 | self.scores.append(score) 130 | self.episodes.append(episode) 131 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 132 | if str(episode)[-2:] == "00":# much faster than episode % 100 133 | pylab.plot(self.episodes, self.scores, 'b') 134 | pylab.plot(self.episodes, self.average, 'r') 135 | pylab.ylabel('Score', fontsize=18) 136 | pylab.xlabel('Steps', fontsize=18) 137 | try: 138 | pylab.savefig(self.path+".png") 139 | except OSError: 140 | pass 141 | 142 | return self.average[-1] 143 | 144 | def imshow(self, image, rem_step=0): 145 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...]) 146 | if cv2.waitKey(25) & 0xFF == ord("q"): 147 | cv2.destroyAllWindows() 148 | return 149 | 150 | def GetImage(self, frame): 151 | # croping frame to 80x80 size 152 | frame_cropped = frame[35:195:2, ::2,:] 153 | if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS: 154 | # OpenCV resize function 155 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 156 | 157 | # converting to RGB (numpy way) 158 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2] 159 | 160 | # convert everything to black and white (agent will train faster) 161 | frame_rgb[frame_rgb < 100] = 0 162 | frame_rgb[frame_rgb >= 100] = 255 163 | # converting to RGB 
(OpenCV way) 164 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY) 165 | 166 | # dividing by 255 we expresses value to 0-1 representation 167 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0 168 | 169 | # push our data by 1 frame, similar as deq() function work 170 | self.image_memory = np.roll(self.image_memory, 1, axis = 0) 171 | 172 | # inserting new frame to free space 173 | self.image_memory[0,:,:] = new_frame 174 | 175 | # show image frame 176 | #self.imshow(self.image_memory,0) 177 | #self.imshow(self.image_memory,1) 178 | #self.imshow(self.image_memory,2) 179 | #self.imshow(self.image_memory,3) 180 | 181 | return np.expand_dims(self.image_memory, axis=0) 182 | 183 | def reset(self): 184 | frame = self.env.reset() 185 | for i in range(self.REM_STEP): 186 | state = self.GetImage(frame) 187 | return state 188 | 189 | def step(self, action): 190 | next_state, reward, done, info = self.env.step(action) 191 | next_state = self.GetImage(next_state) 192 | return next_state, reward, done, info 193 | 194 | def run(self): 195 | for e in range(self.EPISODES): 196 | state = self.reset() 197 | done, score, SAVING = False, 0, '' 198 | while not done: 199 | #self.env.render() 200 | # Actor picks an action 201 | action = self.act(state) 202 | # Retrieve new state, reward, and whether the state is terminal 203 | next_state, reward, done, _ = self.step(action) 204 | # Memorize (state, action, reward) for training 205 | self.remember(state, action, reward) 206 | # Update current state 207 | state = next_state 208 | score += reward 209 | if done: 210 | average = self.PlotModel(score, e) 211 | # saving best models 212 | if average >= self.max_average: 213 | self.max_average = average 214 | self.save() 215 | SAVING = "SAVING" 216 | else: 217 | SAVING = "" 218 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING)) 219 | 220 | self.replay() 221 | # close environemnt when finish training 222 | self.env.close() 223 | 224 | def test(self, Actor_name, Critic_name): 225 | self.load(Actor_name, Critic_name) 226 | for e in range(100): 227 | state = self.reset() 228 | done = False 229 | score = 0 230 | while not done: 231 | action = np.argmax(self.Actor.predict(state)) 232 | state, reward, done, _ = self.step(action) 233 | score += reward 234 | if done: 235 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score)) 236 | break 237 | self.env.close() 238 | 239 | if __name__ == "__main__": 240 | #env_name = 'PongDeterministic-v4' 241 | env_name = 'Pong-v0' 242 | agent = A2CAgent(env_name) 243 | agent.run() 244 | #agent.test('Pong-v0_A2C_2.5e-05_Actor.h5', '') 245 | #agent.test('PongDeterministic-v4_A2C_1e-05_Actor.h5', '') 246 | -------------------------------------------------------------------------------- /09_Pong-v0_A2C/Pong-v0_A2C_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | import random 7 | import gym 8 | import pylab 9 | import numpy as np 10 | from tensorflow.keras.models import Model, load_model 11 | from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 12 | from tensorflow.keras.optimizers import Adam, RMSprop 13 | from tensorflow.keras import backend as K 14 | import cv2 15 | 16 | def OurModel(input_shape, action_space, lr): 17 | X_input = Input(input_shape) 18 | 19 | #X = Conv2D(32, 8, strides=(4, 
4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input) 20 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X) 21 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X) 22 | X = Flatten(input_shape=input_shape)(X_input) 23 | 24 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X) 25 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X) 26 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X) 27 | 28 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X) 29 | value = Dense(1, kernel_initializer='he_uniform')(X) 30 | 31 | Actor = Model(inputs = X_input, outputs = action) 32 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr)) 33 | 34 | Critic = Model(inputs = X_input, outputs = value) 35 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr)) 36 | 37 | return Actor, Critic 38 | 39 | class A2CAgent: 40 | # Actor-Critic Main Optimization Algorithm 41 | def __init__(self, env_name): 42 | # Initialization 43 | # Environment and PPO parameters 44 | self.env_name = env_name 45 | self.env = gym.make(env_name) 46 | self.action_size = self.env.action_space.n 47 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong 48 | self.lr = 0.000025 49 | 50 | self.ROWS = 80 51 | self.COLS = 80 52 | self.REM_STEP = 4 53 | 54 | # Instantiate games and plot memory 55 | self.states, self.actions, self.rewards = [], [], [] 56 | self.scores, self.episodes, self.average = [], [], [] 57 | 58 | self.Save_Path = 'Models' 59 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 60 | self.image_memory = np.zeros(self.state_size) 61 | 62 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 63 | self.path = '{}_A2C_{}'.format(self.env_name, self.lr) 64 | self.Model_name = os.path.join(self.Save_Path, self.path) 65 | 66 | # Create Actor-Critic network model 67 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr) 68 | 69 | 70 | def remember(self, state, action, reward): 71 | # store episode actions to memory 72 | self.states.append(state) 73 | action_onehot = np.zeros([self.action_size]) 74 | action_onehot[action] = 1 75 | self.actions.append(action_onehot) 76 | self.rewards.append(reward) 77 | 78 | 79 | def act(self, state): 80 | # Use the network to predict the next action to take, using the model 81 | prediction = self.Actor.predict(state)[0] 82 | action = np.random.choice(self.action_size, p=prediction) 83 | return action 84 | 85 | def discount_rewards(self, reward): 86 | # Compute the gamma-discounted rewards over an episode 87 | gamma = 0.99 # discount rate 88 | running_add = 0 89 | discounted_r = np.zeros_like(reward) 90 | for i in reversed(range(0,len(reward))): 91 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!) 
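                # in Pong the reward is +1 or -1 only when a point is scored and 0 in
                # between, so each non-zero reward marks the end of a rally and the
                # discounted running sum restarts there; for example, with gamma = 0.99
                # the rewards [0, 0, 1] become [0.9801, 0.99, 1.0] before the
                # mean/std normalization below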
92 | running_add = 0 93 | running_add = running_add * gamma + reward[i] 94 | discounted_r[i] = running_add 95 | 96 | discounted_r -= np.mean(discounted_r) # normalizing the result 97 | discounted_r /= np.std(discounted_r) # divide by standard deviation 98 | return discounted_r 99 | 100 | 101 | def replay(self): 102 | # reshape memory to appropriate shape for training 103 | states = np.vstack(self.states) 104 | actions = np.vstack(self.actions) 105 | 106 | # Compute discounted rewards 107 | discounted_r = self.discount_rewards(self.rewards) 108 | 109 | # Get Critic network predictions 110 | values = self.Critic.predict(states)[:, 0] 111 | # Compute advantages 112 | advantages = discounted_r - values 113 | # training Actor and Critic networks 114 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0) 115 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0) 116 | # reset training memory 117 | self.states, self.actions, self.rewards = [], [], [] 118 | 119 | def load(self, Actor_name, Critic_name): 120 | self.Actor = load_model(Actor_name, compile=False) 121 | #self.Critic = load_model(Critic_name, compile=False) 122 | 123 | def save(self): 124 | self.Actor.save(self.Model_name + '_Actor.h5') 125 | #self.Critic.save(self.Model_name + '_Critic.h5') 126 | 127 | pylab.figure(figsize=(18, 9)) 128 | def PlotModel(self, score, episode): 129 | self.scores.append(score) 130 | self.episodes.append(episode) 131 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 132 | if str(episode)[-2:] == "00":# much faster than episode % 100 133 | pylab.plot(self.episodes, self.scores, 'b') 134 | pylab.plot(self.episodes, self.average, 'r') 135 | pylab.ylabel('Score', fontsize=18) 136 | pylab.xlabel('Steps', fontsize=18) 137 | try: 138 | pylab.savefig(self.path+".png") 139 | except OSError: 140 | pass 141 | 142 | return self.average[-1] 143 | 144 | def imshow(self, image, rem_step=0): 145 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...]) 146 | if cv2.waitKey(25) & 0xFF == ord("q"): 147 | cv2.destroyAllWindows() 148 | return 149 | 150 | def GetImage(self, frame): 151 | # croping frame to 80x80 size 152 | frame_cropped = frame[35:195:2, ::2,:] 153 | if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS: 154 | # OpenCV resize function 155 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 156 | 157 | # converting to RGB (numpy way) 158 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2] 159 | 160 | # convert everything to black and white (agent will train faster) 161 | frame_rgb[frame_rgb < 100] = 0 162 | frame_rgb[frame_rgb >= 100] = 255 163 | # converting to RGB (OpenCV way) 164 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY) 165 | 166 | # dividing by 255 we expresses value to 0-1 representation 167 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0 168 | 169 | # push our data by 1 frame, similar as deq() function work 170 | self.image_memory = np.roll(self.image_memory, 1, axis = 0) 171 | 172 | # inserting new frame to free space 173 | self.image_memory[0,:,:] = new_frame 174 | 175 | # show image frame 176 | #self.imshow(self.image_memory,0) 177 | #self.imshow(self.image_memory,1) 178 | #self.imshow(self.image_memory,2) 179 | #self.imshow(self.image_memory,3) 180 | 181 | return np.expand_dims(self.image_memory, axis=0) 182 | 183 | def reset(self): 184 | frame = self.env.reset() 185 | for i in range(self.REM_STEP): 
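            # the same first frame is pushed REM_STEP times, so the 4-frame stack in
            # image_memory starts the episode filled with copies of it (overwriting
            # whatever was left from the previous episode)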
186 | state = self.GetImage(frame) 187 | return state 188 | 189 | def step(self, action): 190 | next_state, reward, done, info = self.env.step(action) 191 | next_state = self.GetImage(next_state) 192 | return next_state, reward, done, info 193 | 194 | def run(self): 195 | for e in range(self.EPISODES): 196 | state = self.reset() 197 | done, score, SAVING = False, 0, '' 198 | while not done: 199 | #self.env.render() 200 | # Actor picks an action 201 | action = self.act(state) 202 | # Retrieve new state, reward, and whether the state is terminal 203 | next_state, reward, done, _ = self.step(action) 204 | # Memorize (state, action, reward) for training 205 | self.remember(state, action, reward) 206 | # Update current state 207 | state = next_state 208 | score += reward 209 | if done: 210 | average = self.PlotModel(score, e) 211 | # saving best models 212 | if average >= self.max_average: 213 | self.max_average = average 214 | self.save() 215 | SAVING = "SAVING" 216 | else: 217 | SAVING = "" 218 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING)) 219 | 220 | self.replay() 221 | # close environemnt when finish training 222 | self.env.close() 223 | 224 | def test(self, Actor_name, Critic_name): 225 | self.load(Actor_name, Critic_name) 226 | for e in range(100): 227 | state = self.reset() 228 | done = False 229 | score = 0 230 | while not done: 231 | action = np.argmax(self.Actor.predict(state)) 232 | state, reward, done, _ = self.step(action) 233 | score += reward 234 | if done: 235 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score)) 236 | break 237 | self.env.close() 238 | 239 | if __name__ == "__main__": 240 | #env_name = 'PongDeterministic-v4' 241 | env_name = 'Pong-v0' 242 | agent = A2CAgent(env_name) 243 | agent.run() 244 | #agent.test('Pong-v0_A2C_2.5e-05_Actor.h5', '') 245 | #agent.test('PongDeterministic-v4_A2C_1e-05_Actor.h5', '') 246 | -------------------------------------------------------------------------------- /10_Pong-v0_A3C/Pong-v0_A3C.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4 3 | 4 | import os 5 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 6 | os.environ['CUDA_VISIBLE_DEVICES'] = '1' 7 | import random 8 | import gym 9 | import pylab 10 | import numpy as np 11 | from keras.models import Model, load_model 12 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 13 | from keras.optimizers import Adam, RMSprop 14 | from keras import backend as K 15 | import cv2 16 | # import needed for threading 17 | import tensorflow as tf 18 | from keras.backend.tensorflow_backend import set_session 19 | import threading 20 | from threading import Thread, Lock 21 | import time 22 | 23 | # configure Keras and TensorFlow sessions and graph 24 | config = tf.ConfigProto() 25 | config.gpu_options.allow_growth = True 26 | sess = tf.Session(config=config) 27 | set_session(sess) 28 | K.set_session(sess) 29 | graph = tf.get_default_graph() 30 | 31 | def OurModel(input_shape, action_space, lr): 32 | X_input = Input(input_shape) 33 | 34 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input) 35 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X) 36 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X) 37 | X = 
Flatten(input_shape=input_shape)(X_input) 38 | 39 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X) 40 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X) 41 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X) 42 | 43 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X) 44 | value = Dense(1, kernel_initializer='he_uniform')(X) 45 | 46 | Actor = Model(inputs = X_input, outputs = action) 47 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr)) 48 | 49 | Critic = Model(inputs = X_input, outputs = value) 50 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr)) 51 | 52 | return Actor, Critic 53 | 54 | class A3CAgent: 55 | # Actor-Critic Main Optimization Algorithm 56 | def __init__(self, env_name): 57 | # Initialization 58 | # Environment and PPO parameters 59 | self.env_name = env_name 60 | self.env = gym.make(env_name) 61 | self.action_size = self.env.action_space.n 62 | self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0 # specific for pong 63 | self.lock = Lock() 64 | self.lr = 0.000025 65 | 66 | self.ROWS = 80 67 | self.COLS = 80 68 | self.REM_STEP = 4 69 | 70 | # Instantiate plot memory 71 | self.scores, self.episodes, self.average = [], [], [] 72 | 73 | self.Save_Path = 'Models' 74 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 75 | 76 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 77 | self.path = '{}_A3C_{}'.format(self.env_name, self.lr) 78 | self.Model_name = os.path.join(self.Save_Path, self.path) 79 | 80 | # Create Actor-Critic network model 81 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr) 82 | 83 | # make predict function to work while multithreading 84 | self.Actor._make_predict_function() 85 | self.Critic._make_predict_function() 86 | 87 | global graph 88 | graph = tf.get_default_graph() 89 | 90 | def act(self, state): 91 | # Use the network to predict the next action to take, using the model 92 | prediction = self.Actor.predict(state)[0] 93 | action = np.random.choice(self.action_size, p=prediction) 94 | return action 95 | 96 | def discount_rewards(self, reward): 97 | # Compute the gamma-discounted rewards over an episode 98 | gamma = 0.99 # discount rate 99 | running_add = 0 100 | discounted_r = np.zeros_like(reward) 101 | for i in reversed(range(0,len(reward))): 102 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!) 
103 | running_add = 0 104 | running_add = running_add * gamma + reward[i] 105 | discounted_r[i] = running_add 106 | 107 | discounted_r -= np.mean(discounted_r) # normalizing the result 108 | discounted_r /= np.std(discounted_r) # divide by standard deviation 109 | return discounted_r 110 | 111 | def replay(self, states, actions, rewards): 112 | # reshape memory to appropriate shape for training 113 | states = np.vstack(states) 114 | actions = np.vstack(actions) 115 | 116 | # Compute discounted rewards 117 | discounted_r = self.discount_rewards(rewards) 118 | 119 | # Get Critic network predictions 120 | value = self.Critic.predict(states)[:, 0] 121 | # Compute advantages 122 | advantages = discounted_r - value 123 | # training Actor and Critic networks 124 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0) 125 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0) 126 | 127 | def load(self, Actor_name, Critic_name): 128 | self.Actor = load_model(Actor_name, compile=False) 129 | #self.Critic = load_model(Critic_name, compile=False) 130 | 131 | def save(self): 132 | self.Actor.save(self.Model_name + '_Actor.h5') 133 | #self.Critic.save(self.Model_name + '_Critic.h5') 134 | 135 | pylab.figure(figsize=(18, 9)) 136 | def PlotModel(self, score, episode): 137 | self.scores.append(score) 138 | self.episodes.append(episode) 139 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 140 | if str(episode)[-2:] == "00":# much faster than episode % 100 141 | pylab.plot(self.episodes, self.scores, 'b') 142 | pylab.plot(self.episodes, self.average, 'r') 143 | pylab.ylabel('Score', fontsize=18) 144 | pylab.xlabel('Steps', fontsize=18) 145 | try: 146 | pylab.savefig(self.path+".png") 147 | except OSError: 148 | pass 149 | 150 | return self.average[-1] 151 | 152 | def imshow(self, image, rem_step=0): 153 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...]) 154 | if cv2.waitKey(25) & 0xFF == ord("q"): 155 | cv2.destroyAllWindows() 156 | return 157 | 158 | def GetImage(self, frame, image_memory): 159 | if image_memory.shape == (1,*self.state_size): 160 | image_memory = np.squeeze(image_memory) 161 | 162 | # croping frame to 80x80 size 163 | frame_cropped = frame[35:195:2, ::2,:] 164 | if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS: 165 | # OpenCV resize function 166 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 167 | 168 | # converting to RGB (numpy way) 169 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2] 170 | 171 | # convert everything to black and white (agent will train faster) 172 | frame_rgb[frame_rgb < 100] = 0 173 | frame_rgb[frame_rgb >= 100] = 255 174 | # converting to RGB (OpenCV way) 175 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY) 176 | 177 | # dividing by 255 we expresses value to 0-1 representation 178 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0 179 | 180 | # push our data by 1 frame, similar as deq() function work 181 | image_memory = np.roll(image_memory, 1, axis = 0) 182 | 183 | # inserting new frame to free space 184 | image_memory[0,:,:] = new_frame 185 | 186 | # show image frame 187 | #self.imshow(image_memory,0) 188 | #self.imshow(image_memory,1) 189 | #self.imshow(image_memory,2) 190 | #self.imshow(image_memory,3) 191 | 192 | return np.expand_dims(image_memory, axis=0) 193 | 194 | def reset(self, env): 195 | image_memory = np.zeros(self.state_size) 196 | frame = 
env.reset() 197 | for i in range(self.REM_STEP): 198 | state = image_memory = self.GetImage(frame, image_memory) # feed the stack back so every REM_STEP slot starts with the first frame 199 | return state 200 | 201 | def step(self, action, env, image_memory): 202 | next_state, reward, done, info = env.step(action) 203 | next_state = self.GetImage(next_state, image_memory) 204 | return next_state, reward, done, info 205 | 206 | def run(self): 207 | for e in range(self.EPISODES): 208 | state = self.reset(self.env) 209 | done, score, SAVING = False, 0, '' 210 | # Initialize or reset the episode memory 211 | states, actions, rewards = [], [], [] 212 | while not done: 213 | #self.env.render() 214 | # Actor picks an action 215 | action = self.act(state) 216 | # Retrieve new state, reward, and whether the state is terminal 217 | next_state, reward, done, _ = self.step(action, self.env, state) 218 | # Memorize (state, action, reward) for training 219 | states.append(state) 220 | action_onehot = np.zeros([self.action_size]) 221 | action_onehot[action] = 1 222 | actions.append(action_onehot) 223 | rewards.append(reward) 224 | # Update current state 225 | state = next_state 226 | score += reward 227 | if done: 228 | average = self.PlotModel(score, e) 229 | # saving best models 230 | if average >= self.max_average: 231 | self.max_average = average 232 | self.save() 233 | SAVING = "SAVING" 234 | else: 235 | SAVING = "" 236 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING)) 237 | 238 | self.replay(states, actions, rewards) 239 | # close the environment when training is finished 240 | self.env.close() 241 | 242 | def train(self, n_threads): 243 | self.env.close() 244 | # Instantiate one environment per thread 245 | envs = [gym.make(self.env_name) for i in range(n_threads)] 246 | 247 | # Create threads 248 | threads = [threading.Thread( 249 | target=self.train_threading, 250 | daemon=True, 251 | args=(self, 252 | envs[i], 253 | i)) for i in range(n_threads)] 254 | 255 | for t in threads: 256 | time.sleep(2) 257 | t.start() 258 | 259 | for t in threads: 260 | time.sleep(10) 261 | t.join() 262 | 263 | def train_threading(self, agent, env, thread): 264 | global graph 265 | with graph.as_default(): 266 | while self.episode < self.EPISODES: 267 | # Reset episode 268 | score, done, SAVING = 0, False, '' 269 | state = self.reset(env) 270 | # Initialize or reset the episode memory 271 | states, actions, rewards = [], [], [] 272 | while not done: 273 | action = agent.act(state) 274 | next_state, reward, done, _ = self.step(action, env, state) 275 | 276 | states.append(state) 277 | action_onehot = np.zeros([self.action_size]) 278 | action_onehot[action] = 1 279 | actions.append(action_onehot) 280 | rewards.append(reward) 281 | 282 | score += reward 283 | state = next_state 284 | 285 | self.lock.acquire() 286 | self.replay(states, actions, rewards) 287 | self.lock.release() 288 | 289 | # Update episode count 290 | with self.lock: 291 | average = self.PlotModel(score, self.episode) 292 | # saving best models 293 | if average >= self.max_average: 294 | self.max_average = average 295 | self.save() 296 | SAVING = "SAVING" 297 | else: 298 | SAVING = "" 299 | print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING)) 300 | if(self.episode < self.EPISODES): 301 | self.episode += 1 302 | env.close() 303 | 304 | def test(self, Actor_name, Critic_name): 305 | self.load(Actor_name, Critic_name) 306 | for e in range(100): 307 | state = self.reset(self.env) 308 | done = False 309 | score = 0 310 | while
not done: 311 | self.env.render() 312 | action = np.argmax(self.Actor.predict(state)) 313 | state, reward, done, _ = self.step(action, self.env, state) 314 | score += reward 315 | if done: 316 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score)) 317 | break 318 | self.env.close() 319 | 320 | if __name__ == "__main__": 321 | #env_name = 'PongDeterministic-v4' 322 | env_name = 'Pong-v0' 323 | agent = A3CAgent(env_name) 324 | #agent.run() # use as A2C 325 | #agent.train(n_threads=5) # use as A3C 326 | agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '') 327 | -------------------------------------------------------------------------------- /10_Pong-v0_A3C/Pong-v0_A3C_TF2.py: -------------------------------------------------------------------------------- 1 | # Tutorial by www.pylessons.com 2 | # Tutorial written for - Tensorflow 2.3.1 3 | 4 | import os 5 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 6 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 7 | import random 8 | import gym 9 | import pylab 10 | import numpy as np 11 | import tensorflow as tf 12 | from tensorflow.keras.models import Model, load_model 13 | from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten 14 | from tensorflow.keras.optimizers import Adam, RMSprop 15 | from tensorflow.keras import backend as K 16 | import cv2 17 | import threading 18 | from threading import Thread, Lock 19 | import time 20 | 21 | gpus = tf.config.experimental.list_physical_devices('GPU') 22 | if len(gpus) > 0: 23 | print(f'GPUs {gpus}') 24 | try: tf.config.experimental.set_memory_growth(gpus[0], True) 25 | except RuntimeError: pass 26 | 27 | def OurModel(input_shape, action_space, lr): 28 | X_input = Input(input_shape) 29 | 30 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input) 31 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X) 32 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X) 33 | X = Flatten(input_shape=input_shape)(X_input) 34 | 35 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X) 36 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X) 37 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X) 38 | 39 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X) 40 | value = Dense(1, kernel_initializer='he_uniform')(X) 41 | 42 | Actor = Model(inputs = X_input, outputs = action) 43 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr)) 44 | 45 | Critic = Model(inputs = X_input, outputs = value) 46 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr)) 47 | 48 | return Actor, Critic 49 | 50 | class A3CAgent: 51 | # Actor-Critic Main Optimization Algorithm 52 | def __init__(self, env_name): 53 | # Initialization 54 | # Environment and PPO parameters 55 | self.env_name = env_name 56 | self.env = gym.make(env_name) 57 | self.action_size = self.env.action_space.n 58 | self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0 # specific for pong 59 | self.lock = Lock() 60 | self.lr = 0.000025 61 | 62 | self.ROWS = 80 63 | self.COLS = 80 64 | self.REM_STEP = 4 65 | 66 | # Instantiate plot memory 67 | self.scores, self.episodes, self.average = [], [], [] 68 | 69 | self.Save_Path = 'Models' 70 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS) 71 | 72 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path) 
73 | self.path = '{}_A3C_{}'.format(self.env_name, self.lr) 74 | self.Model_name = os.path.join(self.Save_Path, self.path) 75 | 76 | # Create Actor-Critic network model 77 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr) 78 | 79 | def act(self, state): 80 | # Use the network to predict the next action to take, using the model 81 | prediction = self.Actor.predict(state)[0] 82 | action = np.random.choice(self.action_size, p=prediction) 83 | return action 84 | 85 | def discount_rewards(self, reward): 86 | # Compute the gamma-discounted rewards over an episode 87 | gamma = 0.99 # discount rate 88 | running_add = 0 89 | discounted_r = np.zeros_like(reward) 90 | for i in reversed(range(0,len(reward))): 91 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!) 92 | running_add = 0 93 | running_add = running_add * gamma + reward[i] 94 | discounted_r[i] = running_add 95 | 96 | discounted_r -= np.mean(discounted_r) # normalizing the result 97 | discounted_r /= np.std(discounted_r) # divide by standard deviation 98 | return discounted_r 99 | 100 | def replay(self, states, actions, rewards): 101 | # reshape memory to appropriate shape for training 102 | states = np.vstack(states) 103 | actions = np.vstack(actions) 104 | 105 | # Compute discounted rewards 106 | discounted_r = self.discount_rewards(rewards) 107 | 108 | # Get Critic network predictions 109 | value = self.Critic.predict(states)[:, 0] 110 | # Compute advantages 111 | advantages = discounted_r - value 112 | # training Actor and Critic networks 113 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0) 114 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0) 115 | 116 | def load(self, Actor_name, Critic_name): 117 | self.Actor = load_model(Actor_name, compile=False) 118 | #self.Critic = load_model(Critic_name, compile=False) 119 | 120 | def save(self): 121 | self.Actor.save(self.Model_name + '_Actor.h5') 122 | #self.Critic.save(self.Model_name + '_Critic.h5') 123 | 124 | pylab.figure(figsize=(18, 9)) 125 | def PlotModel(self, score, episode): 126 | self.scores.append(score) 127 | self.episodes.append(episode) 128 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:])) 129 | if str(episode)[-2:] == "00":# update the plot every 100 episodes 130 | pylab.plot(self.episodes, self.scores, 'b') 131 | pylab.plot(self.episodes, self.average, 'r') 132 | pylab.ylabel('Score', fontsize=18) 133 | pylab.xlabel('Episodes', fontsize=18) 134 | try: 135 | pylab.savefig(self.path+".png") 136 | except OSError: 137 | pass 138 | 139 | return self.average[-1] 140 | 141 | def imshow(self, image, rem_step=0): 142 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...]) 143 | if cv2.waitKey(25) & 0xFF == ord("q"): 144 | cv2.destroyAllWindows() 145 | return 146 | 147 | def GetImage(self, frame, image_memory): 148 | if image_memory.shape == (1,*self.state_size): 149 | image_memory = np.squeeze(image_memory) 150 | 151 | # crop the frame to 80x80 152 | frame_cropped = frame[35:195:2, ::2,:] 153 | if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS: 154 | # OpenCV resize function 155 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC) 156 | 157 | # convert to grayscale using luminance weights (NumPy way) 158 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2] 159 | 160 | # convert everything to black and white (agent will train faster) 161 |
frame_rgb[frame_rgb < 100] = 0 162 | frame_rgb[frame_rgb >= 100] = 255 163 | # grayscale conversion (OpenCV way, alternative) 164 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY) 165 | 166 | # divide by 255 to scale pixel values to the 0-1 range 167 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0 168 | 169 | # shift the stored frames by one position, similar to how a deque works 170 | image_memory = np.roll(image_memory, 1, axis = 0) 171 | 172 | # insert the new frame into the freed slot 173 | image_memory[0,:,:] = new_frame 174 | 175 | # show image frame 176 | #self.imshow(image_memory,0) 177 | #self.imshow(image_memory,1) 178 | #self.imshow(image_memory,2) 179 | #self.imshow(image_memory,3) 180 | 181 | return np.expand_dims(image_memory, axis=0) 182 | 183 | def reset(self, env): 184 | image_memory = np.zeros(self.state_size) 185 | frame = env.reset() 186 | for i in range(self.REM_STEP): 187 | state = image_memory = self.GetImage(frame, image_memory) # feed the stack back so every REM_STEP slot starts with the first frame 188 | return state 189 | 190 | def step(self, action, env, image_memory): 191 | next_state, reward, done, info = env.step(action) 192 | next_state = self.GetImage(next_state, image_memory) 193 | return next_state, reward, done, info 194 | 195 | def run(self): 196 | for e in range(self.EPISODES): 197 | state = self.reset(self.env) 198 | done, score, SAVING = False, 0, '' 199 | # Initialize or reset the episode memory 200 | states, actions, rewards = [], [], [] 201 | while not done: 202 | #self.env.render() 203 | # Actor picks an action 204 | action = self.act(state) 205 | # Retrieve new state, reward, and whether the state is terminal 206 | next_state, reward, done, _ = self.step(action, self.env, state) 207 | # Memorize (state, action, reward) for training 208 | states.append(state) 209 | action_onehot = np.zeros([self.action_size]) 210 | action_onehot[action] = 1 211 | actions.append(action_onehot) 212 | rewards.append(reward) 213 | # Update current state 214 | state = next_state 215 | score += reward 216 | if done: 217 | average = self.PlotModel(score, e) 218 | # saving best models 219 | if average >= self.max_average: 220 | self.max_average = average 221 | self.save() 222 | SAVING = "SAVING" 223 | else: 224 | SAVING = "" 225 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING)) 226 | 227 | self.replay(states, actions, rewards) 228 | # close the environment when training is finished 229 | self.env.close() 230 | 231 | def train(self, n_threads): 232 | self.env.close() 233 | # Instantiate one environment per thread 234 | envs = [gym.make(self.env_name) for i in range(n_threads)] 235 | 236 | # Create threads 237 | threads = [threading.Thread( 238 | target=self.train_threading, 239 | daemon=True, 240 | args=(self, 241 | envs[i], 242 | i)) for i in range(n_threads)] 243 | 244 | for t in threads: 245 | time.sleep(2) 246 | t.start() 247 | 248 | for t in threads: 249 | time.sleep(10) 250 | t.join() 251 | 252 | def train_threading(self, agent, env, thread): 253 | while self.episode < self.EPISODES: 254 | # Reset episode 255 | score, done, SAVING = 0, False, '' 256 | state = self.reset(env) 257 | # Initialize or reset the episode memory 258 | states, actions, rewards = [], [], [] 259 | while not done: 260 | action = agent.act(state) 261 | next_state, reward, done, _ = self.step(action, env, state) 262 | 263 | states.append(state) 264 | action_onehot = np.zeros([self.action_size]) 265 | action_onehot[action] = 1 266 | actions.append(action_onehot) 267 | rewards.append(reward) 268 | 269 | score += reward 270 | state = next_state 271 | 272
| self.lock.acquire() 273 | self.replay(states, actions, rewards) 274 | self.lock.release() 275 | 276 | # Update episode count 277 | with self.lock: 278 | average = self.PlotModel(score, self.episode) 279 | # saving best models 280 | if average >= self.max_average: 281 | self.max_average = average 282 | self.save() 283 | SAVING = "SAVING" 284 | else: 285 | SAVING = "" 286 | print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING)) 287 | if(self.episode < self.EPISODES): 288 | self.episode += 1 289 | env.close() 290 | 291 | def test(self, Actor_name, Critic_name): 292 | self.load(Actor_name, Critic_name) 293 | for e in range(100): 294 | state = self.reset(self.env) 295 | done = False 296 | score = 0 297 | while not done: 298 | self.env.render() 299 | action = np.argmax(self.Actor.predict(state)) 300 | state, reward, done, _ = self.step(action, self.env, state) 301 | score += reward 302 | if done: 303 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score)) 304 | break 305 | self.env.close() 306 | 307 | if __name__ == "__main__": 308 | env_name = 'PongDeterministic-v4' 309 | #env_name = 'Pong-v0' 310 | agent = A3CAgent(env_name) 311 | #agent.run() # use as A2C 312 | agent.train(n_threads=5) # use as A3C 313 | #agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '') 314 | -------------------------------------------------------------------------------- /10_Pong-v0_A3C/PongDeterministic-v4_A3C_2.5e-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/10_Pong-v0_A3C/PongDeterministic-v4_A3C_2.5e-05.png -------------------------------------------------------------------------------- /11_Pong-v0_PPO/Models/Pong-v0_APPO_0.0001_Actor_CNN.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/Models/Pong-v0_APPO_0.0001_Actor_CNN.h5 -------------------------------------------------------------------------------- /11_Pong-v0_PPO/Pong-v0_APPO_0.0001_CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/Pong-v0_APPO_0.0001_CNN.png -------------------------------------------------------------------------------- /11_Pong-v0_PPO/Pong-v0_APPO_0.0001_RMSprop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/Pong-v0_APPO_0.0001_RMSprop.png -------------------------------------------------------------------------------- /11_Pong-v0_PPO/PongDeterministic-v4_APPO_0.0001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/PongDeterministic-v4_APPO_0.0001.png -------------------------------------------------------------------------------- /11_Pong-v0_PPO/gameplay.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/gameplay.gif -------------------------------------------------------------------------------- /11_Pong-v0_PPO/gameplay_CNN.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/gameplay_CNN.gif -------------------------------------------------------------------------------- /BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Actor.h5 -------------------------------------------------------------------------------- /BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Critic.h5 -------------------------------------------------------------------------------- /BipedalWalker-v3_PPO/BipedalWalker-v3_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/BipedalWalker-v3_training.png -------------------------------------------------------------------------------- /BipedalWalker-v3_PPO/gameplay.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/gameplay.gif -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Rokas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LunarLander-v2_PPO/LunarLander-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/LunarLander-v2.png -------------------------------------------------------------------------------- /LunarLander-v2_PPO/LunarLander-v2_PPO_Actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/LunarLander-v2_PPO_Actor.h5 -------------------------------------------------------------------------------- /LunarLander-v2_PPO/LunarLander-v2_PPO_Critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/LunarLander-v2_PPO_Critic.h5 -------------------------------------------------------------------------------- /LunarLander-v2_PPO/gameplay.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/gameplay.gif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning Tutorials: 2 | 3 | *2020-10-07 added support for Tensorflow 2.3.1* 4 | 5 | PPO and PPO_CNN agents playing Pong-v0 game:
6 | ![PPO agent](11_Pong-v0_PPO/gameplay.gif) 7 | ![PPO CNN agent](11_Pong-v0_PPO/gameplay_CNN.gif) 8 | 9 | *2020-10-10 added LunarLander-v2_PPO Continuous code for Tensorflow 2.3.1*: 10 | ![LunarLander-v2_PPO](https://github.com/pythonlessons/Reinforcement_Learning/blob/master/LunarLander-v2_PPO/gameplay.gif) 11 | 12 | *2020-10-23 added BipedalWalker-v3_PPO code for Tensorflow 2.3.1*: 13 | ![BipedalWalker-v3_PPO_PPO](https://github.com/pythonlessons/Reinforcement_Learning/blob/master/BipedalWalker-v3_PPO/gameplay.gif) 14 | 15 | 1. [Deep Q Learning tutorial (DQN)](https://pylessons.com/CartPole-reinforcement-learning/) 16 | 17 | 2. [Double Deep Q Learning tutorial (DDQN)](https://pylessons.com/CartPole-DDQN/) 18 | 19 | 3. [Dueling Double Deep Q Learning tutorial (D3QN)](https://pylessons.com/CartPole-DDDQN/) 20 | 21 | 4. [Epsilon Greedy Dueling Double Deep Q Learning tutorial (D3QN)](https://pylessons.com/Epsilon-Greedy-DQN/) 22 | 23 | 5. [Prioritized Experience Replay (PER) D3QN tutorial](https://pylessons.com/CartPole-PER/) 24 | 25 | 6. [D3QN PER with Convolutional Neural Networks tutorial](https://pylessons.com/CartPole-PER-CNN/) 26 | 27 | 7. [A.I. learns to play Pong with DQN](https://pylessons.com/DQN-PONG/) 28 | 29 | 8. [Introduction to RL Policy Gradient (PG or REINFORCE)](https://pylessons.com/Beyond-DQN/) 30 | 31 | 9. [Introduction to RL Advantage Actor Critic algorithm (A2C)](https://pylessons.com/A2C-reinforcement-learning/) 32 | 33 | 10. [Introduction to RL Asynchronous Advantage Actor Critic algorithm (A3C)](https://pylessons.com/A3C-reinforcement-learning/) 34 | 35 | 11. [Introduction to RL Proximal Policy Optimization algorithm (PPO)](https://pylessons.com/PPO-reinforcement-learning/) 36 | 37 | 12. [Let’s code from scratch a discrete Reinforcement Learning rocket landing agent! (PPO)](https://pylessons.com/LunarLander-v2-PPO/) 38 | 39 | 13. [Continuous Proximal Policy Optimization Tutorial with OpenAI gym environment! (PPO)](https://pylessons.com/BipedalWalker-v3-PPO/) 40 |

41 | PPO Pong-v0 Learning curve: 42 | 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tensorflow==2.3.1 4 | tensorflow-gpu==2.3.1 5 | opencv-python 6 | matplotlib 7 | tensorboardx 8 | pandas 9 | gym[all] 10 | box2d-py 11 | --------------------------------------------------------------------------------