├── 01_CartPole-reinforcement-learning
│   ├── Cartpole_DQN.py
│   ├── IMAGES
│   │   ├── CartPole_test.gif
│   │   ├── image.png
│   │   ├── math.PNG
│   │   ├── testing_model.PNG
│   │   └── training_model.PNG
│   ├── cartpole-dqn.h5
│   └── cartpole_random.py
├── 02_CartPole-reinforcement-learning_DDQN
│   ├── Cartpole_DDQN.py
│   ├── Cartpole_DDQN_TF2.py
│   └── IMAGES
│       ├── DDQN_CartPole-v1.png
│       ├── DDQN_CartPole-v1_soft.png
│       └── DQN_CartPole-v1.png
├── 03_CartPole-reinforcement-learning_Dueling_DDQN
│   ├── Cartpole_Double_DDQN.py
│   ├── Cartpole_Double_DDQN_TF2.py
│   └── IMAGES
│       ├── DDQN_CartPole-v1.png
│       ├── DDQN_CartPole-v1_Dueling.png
│       └── DQN_CartPole-v1_Dueling.png
├── 04_CartPole-reinforcement-learning_e_greedy_D3QN
│   ├── Cartpole_e_greedy_D3QN.py
│   ├── Cartpole_e_greedy_D3QN_TF2.py
│   └── IMAGES
│       └── DDQN_CartPole-v1_Dueling_Greedy.png
├── 05_CartPole-reinforcement-learning_PER_D3QN
│   ├── Cartpole_PER_D3QN.py
│   ├── Cartpole_PER_D3QN_TF2.py
│   ├── IMAGES
│   │   ├── DDQN_CartPole-v1_Dueling.png
│   │   ├── DDQN_CartPole-v1_Dueling_PER.png
│   │   ├── Replay_buffer.png
│   │   └── SumTree.png
│   └── PER.py
├── 06_CartPole-reinforcement-learning_PER_D3QN_CNN
│   ├── Cartpole_PER_D3QN_CNN.py
│   ├── Cartpole_PER_D3QN_CNN_TF2.py
│   ├── PER.py
│   └── random_game.py
├── 07_Pong-reinforcement-learning_DQN_CNN
│   ├── IMAGES
│   │   ├── DDQN_Pong-v0_CNN.png
│   │   ├── DDQN_Pong-v0_Dueling_CNN.png
│   │   ├── DDQN_Pong-v0_Dueling_PER_CNN.png
│   │   └── DQN_Pong-v0_CNN.png
│   ├── Models
│   │   ├── Pong-v0_DDQN_CNN.h5
│   │   ├── Pong-v0_DDQN_Dueling_CNN.h5
│   │   ├── Pong-v0_DDQN_Dueling_PER_CNN.h5
│   │   └── Pong-v0_DQN_CNN.h5
│   ├── PER.py
│   ├── Pong-v0_DQN_CNN.py
│   └── Pong-v0_DQN_CNN_TF2.py
├── 08_Pong-v0_Policy_gradient
│   ├── IMAGES
│   │   ├── Pong-v0_PG_2.5e-05.png
│   │   └── PongDeterministic-v4_PG_0.0001.png
│   ├── Pong-v0_PG.py
│   └── Pong-v0_PG_TF2.py
├── 09_Pong-v0_A2C
│   ├── IMAGES
│   │   ├── Pong-v0_A2C_2.5e-05.png
│   │   └── PongDeterministic-v4_A2C_2.5e-05.png
│   ├── Pong-v0_A2C.py
│   └── Pong-v0_A2C_TF2.py
├── 10_Pong-v0_A3C
│   ├── Pong-v0_A3C.py
│   ├── Pong-v0_A3C_TF2.py
│   └── PongDeterministic-v4_A3C_2.5e-05.png
├── 11_Pong-v0_PPO
│   ├── Models
│   │   └── Pong-v0_APPO_0.0001_Actor_CNN.h5
│   ├── Pong-v0_APPO_0.0001_CNN.png
│   ├── Pong-v0_APPO_0.0001_RMSprop.png
│   ├── Pong-v0_PPO.py
│   ├── Pong-v0_PPO_TF2.py
│   ├── Pong-v0_PPO_gif.py
│   ├── PongDeterministic-v4_APPO_0.0001.png
│   ├── gameplay.gif
│   └── gameplay_CNN.gif
├── BipedalWalker-v3_PPO
│   ├── BipedalWalker-v3_PPO.py
│   ├── BipedalWalker-v3_PPO_Actor.h5
│   ├── BipedalWalker-v3_PPO_Critic.h5
│   ├── BipedalWalker-v3_training.png
│   └── gameplay.gif
├── LICENSE.md
├── LunarLander-v2_PPO
│   ├── LunarLander-v2.png
│   ├── LunarLander-v2_PPO.py
│   ├── LunarLander-v2_PPO_Actor.h5
│   ├── LunarLander-v2_PPO_Critic.h5
│   └── gameplay.gif
├── README.md
└── requirements.txt
/01_CartPole-reinforcement-learning/Cartpole_DQN.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
6 | import random
7 | import gym
8 | import numpy as np
9 | from collections import deque
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense
12 | from keras.optimizers import Adam, RMSprop
13 |
14 |
15 | def OurModel(input_shape, action_space):
16 | X_input = Input(input_shape)
17 |
18 | # 'Dense' is the basic form of a neural network layer
19 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
20 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X_input)
21 |
22 | # Hidden layer with 256 nodes
23 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
24 |
25 | # Hidden layer with 64 nodes
26 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
27 |
28 | # Output Layer with # of actions: 2 nodes (left, right)
29 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
30 |
31 | model = Model(inputs = X_input, outputs = X, name='CartPole DQN model')
32 | model.compile(loss="mse", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
33 |
34 | model.summary()
35 | return model
36 |
37 | class DQNAgent:
38 | def __init__(self):
39 | self.env = gym.make('CartPole-v1')
40 | # by default, CartPole-v1 has max episode steps = 500
41 | self.state_size = self.env.observation_space.shape[0]
42 | self.action_size = self.env.action_space.n
43 | self.EPISODES = 1000
44 | self.memory = deque(maxlen=2000)
45 |
46 | self.gamma = 0.95 # discount rate
47 | self.epsilon = 1.0 # exploration rate
48 | self.epsilon_min = 0.001
49 | self.epsilon_decay = 0.999
50 | self.batch_size = 64
51 | self.train_start = 1000
52 |
53 | # create main model
54 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)
55 |
56 | def remember(self, state, action, reward, next_state, done):
57 | self.memory.append((state, action, reward, next_state, done))
58 | if len(self.memory) > self.train_start:
59 | if self.epsilon > self.epsilon_min:
60 | self.epsilon *= self.epsilon_decay
61 |
62 | def act(self, state):
63 | if np.random.random() <= self.epsilon:
64 | return random.randrange(self.action_size)
65 | else:
66 | return np.argmax(self.model.predict(state))
67 |
68 | def replay(self):
69 | if len(self.memory) < self.train_start:
70 | return
71 | # Randomly sample minibatch from the memory
72 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
73 |
74 | state = np.zeros((self.batch_size, self.state_size))
75 | next_state = np.zeros((self.batch_size, self.state_size))
76 | action, reward, done = [], [], []
77 |
78 | # do this before prediction
79 | # for speedup, this could be done on the tensor level
80 | # but easier to understand using a loop
81 | for i in range(self.batch_size):
82 | state[i] = minibatch[i][0]
83 | action.append(minibatch[i][1])
84 | reward.append(minibatch[i][2])
85 | next_state[i] = minibatch[i][3]
86 | done.append(minibatch[i][4])
87 |
88 | # do batch prediction to save speed
89 | target = self.model.predict(state)
90 | target_next = self.model.predict(next_state)
91 |
92 | for i in range(self.batch_size):
93 | # correction on the Q value for the action used
94 | if done[i]:
95 | target[i][action[i]] = reward[i]
96 | else:
97 | # Standard - DQN
98 | # DQN chooses the max Q value among next actions
99 |                 # selection and evaluation of the action both use the same (main) Q network here
100 |                 # Q_max = max_a' Q(s', a')
101 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
102 |
103 | # Train the Neural Network with batches
104 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
105 |
106 |
107 | def load(self, name):
108 | self.model = load_model(name)
109 |
110 | def save(self, name):
111 | self.model.save(name)
112 |
113 | def run(self):
114 | for e in range(self.EPISODES):
115 | state = self.env.reset()
116 | state = np.reshape(state, [1, self.state_size])
117 | done = False
118 | i = 0
119 | while not done:
120 | self.env.render()
121 | action = self.act(state)
122 | next_state, reward, done, _ = self.env.step(action)
123 | next_state = np.reshape(next_state, [1, self.state_size])
124 | if not done or i == self.env._max_episode_steps-1:
125 | reward = reward
126 | else:
127 | reward = -100
128 | self.remember(state, action, reward, next_state, done)
129 | state = next_state
130 | i += 1
131 | if done:
132 | print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon))
133 | if i == 500:
134 | print("Saving trained model as cartpole-dqn.h5")
135 | self.save("cartpole-dqn.h5")
136 | return
137 | self.replay()
138 |
139 | def test(self):
140 | self.load("cartpole-dqn.h5")
141 | for e in range(self.EPISODES):
142 | state = self.env.reset()
143 | state = np.reshape(state, [1, self.state_size])
144 | done = False
145 | i = 0
146 | while not done:
147 | self.env.render()
148 | action = np.argmax(self.model.predict(state))
149 | next_state, reward, done, _ = self.env.step(action)
150 | state = np.reshape(next_state, [1, self.state_size])
151 | i += 1
152 | if done:
153 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
154 | break
155 |
156 | if __name__ == "__main__":
157 | agent = DQNAgent()
158 | #agent.run()
159 | agent.test()
160 |
--------------------------------------------------------------------------------
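
A note on the replay() method above: its comment points out that the per-sample
target update "could be done on the tensor level". A minimal sketch of that
vectorized form, assuming the same state/action/reward/next_state/done arrays
already built from the minibatch (build_dqn_targets is an illustrative helper
name, not part of the repository):

import numpy as np

def build_dqn_targets(model, state, action, reward, next_state, done, gamma=0.95):
    # illustrative helper, not part of the repository
    # target[i, action[i]] = r_i                                      if done_i
    #                      = r_i + gamma * max_a Q(next_state_i, a)   otherwise
    target = model.predict(state)           # Q(s, .) from the main network
    q_next = model.predict(next_state)      # Q(s', .) from the same network (plain DQN)
    action = np.asarray(action)
    reward = np.asarray(reward, dtype=np.float32)
    not_done = 1.0 - np.asarray(done, dtype=np.float32)
    rows = np.arange(len(action))
    target[rows, action] = reward + gamma * np.max(q_next, axis=1) * not_done
    return target

For this plain-DQN case the result should match the index loop, just computed in
one shot per batch.
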
/01_CartPole-reinforcement-learning/IMAGES/CartPole_test.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/CartPole_test.gif
--------------------------------------------------------------------------------
/01_CartPole-reinforcement-learning/IMAGES/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/image.png
--------------------------------------------------------------------------------
/01_CartPole-reinforcement-learning/IMAGES/math.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/math.PNG
--------------------------------------------------------------------------------
/01_CartPole-reinforcement-learning/IMAGES/testing_model.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/testing_model.PNG
--------------------------------------------------------------------------------
/01_CartPole-reinforcement-learning/IMAGES/training_model.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/IMAGES/training_model.PNG
--------------------------------------------------------------------------------
/01_CartPole-reinforcement-learning/cartpole-dqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/01_CartPole-reinforcement-learning/cartpole-dqn.h5
--------------------------------------------------------------------------------
/01_CartPole-reinforcement-learning/cartpole_random.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 |
4 | env = gym.make("CartPole-v1")
5 |
6 | def Random_games():
7 |     # Each of these episodes is its own game.
8 | for episode in range(10):
9 | env.reset()
10 |         # this is each frame, up to 500... but we won't make it that far with random actions.
11 | for t in range(500):
12 | # This will display the environment
13 | # Only display if you really want to see it.
14 | # Takes much longer to display it.
15 | env.render()
16 |
17 | # This will just create a sample action in any environment.
18 | # In this environment, the action can be 0 or 1, which is left or right
19 | action = env.action_space.sample()
20 |
21 | # this executes the environment with an action,
22 | # and returns the observation of the environment,
23 | # the reward, if the env is over, and other info.
24 | next_state, reward, done, info = env.step(action)
25 |
26 |             # let's print everything in one line:
27 | print(t, next_state, reward, done, info, action)
28 | if done:
29 | break
30 |
31 | Random_games()
--------------------------------------------------------------------------------
/02_CartPole-reinforcement-learning_DDQN/Cartpole_DDQN.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense
12 | from keras.optimizers import Adam, RMSprop
13 |
14 |
15 | def OurModel(input_shape, action_space):
16 | X_input = Input(input_shape)
17 | X = X_input
18 |
19 | # 'Dense' is the basic form of a neural network layer
20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
22 |
23 | # Hidden layer with 256 nodes
24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
25 |
26 | # Hidden layer with 64 nodes
27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
28 |
29 | # Output Layer with # of actions: 2 nodes (left, right)
30 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
31 |
32 | model = Model(inputs = X_input, outputs = X, name='CartPole DDQN model')
33 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
34 |
35 | model.summary()
36 | return model
37 |
38 | class DQNAgent:
39 | def __init__(self, env_name):
40 | self.env_name = env_name
41 | self.env = gym.make(env_name)
42 | self.env.seed(0)
43 | # by default, CartPole-v1 has max episode steps = 500
44 | self.env._max_episode_steps = 4000
45 | self.state_size = self.env.observation_space.shape[0]
46 | self.action_size = self.env.action_space.n
47 |
48 | self.EPISODES = 1000
49 | self.memory = deque(maxlen=2000)
50 |
51 | self.gamma = 0.95 # discount rate
52 | self.epsilon = 1.0 # exploration rate
53 | self.epsilon_min = 0.01
54 | self.epsilon_decay = 0.999
55 | self.batch_size = 32
56 | self.train_start = 1000
57 |
58 | # defining model parameters
59 | self.ddqn = True
60 | self.Soft_Update = False
61 |
62 | self.TAU = 0.1 # target network soft update hyperparameter
63 |
64 | self.Save_Path = 'Models'
65 | self.scores, self.episodes, self.average = [], [], []
66 |
67 | if self.ddqn:
68 | print("----------Double DQN--------")
69 | self.Model_name = os.path.join(self.Save_Path,"DDQN_"+self.env_name+".h5")
70 | else:
71 | print("-------------DQN------------")
72 | self.Model_name = os.path.join(self.Save_Path,"DQN_"+self.env_name+".h5")
73 |
74 | # create main model
75 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)
76 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)
77 |
78 | # after some time interval update the target model to be same with model
79 | def update_target_model(self):
80 | if not self.Soft_Update and self.ddqn:
81 | self.target_model.set_weights(self.model.get_weights())
82 | return
83 | if self.Soft_Update and self.ddqn:
84 | q_model_theta = self.model.get_weights()
85 | target_model_theta = self.target_model.get_weights()
86 | counter = 0
87 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
88 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
89 | target_model_theta[counter] = target_weight
90 | counter += 1
91 | self.target_model.set_weights(target_model_theta)
92 |
93 | def remember(self, state, action, reward, next_state, done):
94 | self.memory.append((state, action, reward, next_state, done))
95 | if len(self.memory) > self.train_start:
96 | if self.epsilon > self.epsilon_min:
97 | self.epsilon *= self.epsilon_decay
98 |
99 | def act(self, state):
100 | if np.random.random() <= self.epsilon:
101 | return random.randrange(self.action_size)
102 | else:
103 | return np.argmax(self.model.predict(state))
104 |
105 | def replay(self):
106 | if len(self.memory) < self.train_start:
107 | return
108 | # Randomly sample minibatch from the memory
109 |         minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
110 |
111 | state = np.zeros((self.batch_size, self.state_size))
112 | next_state = np.zeros((self.batch_size, self.state_size))
113 | action, reward, done = [], [], []
114 |
115 | # do this before prediction
116 | # for speedup, this could be done on the tensor level
117 | # but easier to understand using a loop
118 | for i in range(self.batch_size):
119 | state[i] = minibatch[i][0]
120 | action.append(minibatch[i][1])
121 | reward.append(minibatch[i][2])
122 | next_state[i] = minibatch[i][3]
123 | done.append(minibatch[i][4])
124 |
125 | # do batch prediction to save speed
126 | target = self.model.predict(state)
127 | target_next = self.model.predict(next_state)
128 | target_val = self.target_model.predict(next_state)
129 |
130 | for i in range(len(minibatch)):
131 | # correction on the Q value for the action used
132 | if done[i]:
133 | target[i][action[i]] = reward[i]
134 | else:
135 | if self.ddqn: # Double - DQN
136 | # current Q Network selects the action
137 | # a'_max = argmax_a' Q(s', a')
138 | a = np.argmax(target_next[i])
139 | # target Q Network evaluates the action
140 | # Q_max = Q_target(s', a'_max)
141 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
142 | else: # Standard - DQN
143 | # DQN chooses the max Q value among next actions
144 | # selection and evaluation of action is on the target Q Network
145 | # Q_max = max_a' Q_target(s', a')
146 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
147 |
148 | # Train the Neural Network with batches
149 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
150 |
151 |
152 | def load(self, name):
153 | self.model = load_model(name)
154 |
155 | def save(self, name):
156 | self.model.save(name)
157 |
158 | pylab.figure(figsize=(18, 9))
159 | def PlotModel(self, score, episode):
160 | self.scores.append(score)
161 | self.episodes.append(episode)
162 | self.average.append(sum(self.scores) / len(self.scores))
163 | pylab.plot(self.episodes, self.average, 'r')
164 | pylab.plot(self.episodes, self.scores, 'b')
165 | pylab.ylabel('Score', fontsize=18)
166 |         pylab.xlabel('Episode', fontsize=18)
167 | dqn = 'DQN_'
168 | softupdate = ''
169 | if self.ddqn:
170 | dqn = 'DDQN_'
171 | if self.Soft_Update:
172 | softupdate = '_soft'
173 | try:
174 | pylab.savefig(dqn+self.env_name+softupdate+".png")
175 | except OSError:
176 | pass
177 |
178 | return str(self.average[-1])[:5]
179 |
180 | def run(self):
181 | for e in range(self.EPISODES):
182 | state = self.env.reset()
183 | state = np.reshape(state, [1, self.state_size])
184 | done = False
185 | i = 0
186 | while not done:
187 | #self.env.render()
188 | action = self.act(state)
189 | next_state, reward, done, _ = self.env.step(action)
190 | next_state = np.reshape(next_state, [1, self.state_size])
191 | if not done or i == self.env._max_episode_steps-1:
192 | reward = reward
193 | else:
194 | reward = -100
195 | self.remember(state, action, reward, next_state, done)
196 | state = next_state
197 | i += 1
198 | if done:
199 | # every step update target model
200 | self.update_target_model()
201 |
202 | # every episode, plot the result
203 | average = self.PlotModel(i, e)
204 |
205 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average))
206 | if i == self.env._max_episode_steps:
207 | print("Saving trained model as cartpole-ddqn.h5")
208 | #self.save("cartpole-ddqn.h5")
209 | break
210 | self.replay()
211 |
212 | def test(self):
213 | self.load("cartpole-ddqn.h5")
214 | for e in range(self.EPISODES):
215 | state = self.env.reset()
216 | state = np.reshape(state, [1, self.state_size])
217 | done = False
218 | i = 0
219 | while not done:
220 | self.env.render()
221 | action = np.argmax(self.model.predict(state))
222 | next_state, reward, done, _ = self.env.step(action)
223 | state = np.reshape(next_state, [1, self.state_size])
224 | i += 1
225 | if done:
226 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
227 | break
228 |
229 | if __name__ == "__main__":
230 | env_name = 'CartPole-v1'
231 | agent = DQNAgent(env_name)
232 | agent.run()
233 | #agent.test()
234 |
--------------------------------------------------------------------------------
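
The Double DQN branch above separates action selection (online network) from
action evaluation (target network). A minimal vectorized sketch of the same
target rule, reusing the target/target_next/target_val arrays that replay()
already computes (double_dqn_targets is an illustrative name, not part of the
file):

import numpy as np

def double_dqn_targets(target, target_next, target_val, action, reward, done, gamma=0.95):
    # illustrative sketch, not part of the repository
    action = np.asarray(action)
    reward = np.asarray(reward, dtype=np.float32)
    not_done = 1.0 - np.asarray(done, dtype=np.float32)
    best_next = np.argmax(target_next, axis=1)    # a'_max chosen by the online network
    rows = np.arange(len(action))
    q_eval = target_val[rows, best_next]          # Q_target(s', a'_max) from the target network
    target[rows, action] = reward + gamma * q_eval * not_done
    return target

Keeping selection and evaluation on different networks is what reduces the
overestimation bias of plain DQN's max over a single network's estimates.
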
/02_CartPole-reinforcement-learning_DDQN/Cartpole_DDQN_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | import tensorflow as tf
11 | from tensorflow.keras.models import Model, load_model
12 | from tensorflow.keras.layers import Input, Dense
13 | from tensorflow.keras.optimizers import Adam, RMSprop
14 |
15 |
16 | def OurModel(input_shape, action_space):
17 | X_input = Input(input_shape)
18 | X = X_input
19 |
20 | # 'Dense' is the basic form of a neural network layer
21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
23 |
24 | # Hidden layer with 256 nodes
25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
26 |
27 | # Hidden layer with 64 nodes
28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
29 |
30 | # Output Layer with # of actions: 2 nodes (left, right)
31 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
32 |
33 | model = Model(inputs = X_input, outputs = X)
34 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
35 |
36 | model.summary()
37 | return model
38 |
39 | class DQNAgent:
40 | def __init__(self, env_name):
41 | self.env_name = env_name
42 | self.env = gym.make(env_name)
43 | self.env.seed(0)
44 | # by default, CartPole-v1 has max episode steps = 500
45 | self.env._max_episode_steps = 4000
46 | self.state_size = self.env.observation_space.shape[0]
47 | self.action_size = self.env.action_space.n
48 |
49 | self.EPISODES = 1000
50 | self.memory = deque(maxlen=2000)
51 |
52 | self.gamma = 0.95 # discount rate
53 | self.epsilon = 1.0 # exploration rate
54 | self.epsilon_min = 0.01
55 | self.epsilon_decay = 0.999
56 | self.batch_size = 32
57 | self.train_start = 1000
58 |
59 | # defining model parameters
60 | self.ddqn = True
61 | self.Soft_Update = False
62 |
63 | self.TAU = 0.1 # target network soft update hyperparameter
64 |
65 | self.Save_Path = 'Models'
66 | self.scores, self.episodes, self.average = [], [], []
67 |
68 | if self.ddqn:
69 | print("----------Double DQN--------")
70 | self.Model_name = os.path.join(self.Save_Path,"DDQN_"+self.env_name+".h5")
71 | else:
72 | print("-------------DQN------------")
73 | self.Model_name = os.path.join(self.Save_Path,"DQN_"+self.env_name+".h5")
74 |
75 | # create main model
76 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)
77 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)
78 |
79 | # after some time interval update the target model to be same with model
80 | def update_target_model(self):
81 | if not self.Soft_Update and self.ddqn:
82 | self.target_model.set_weights(self.model.get_weights())
83 | return
84 | if self.Soft_Update and self.ddqn:
85 | q_model_theta = self.model.get_weights()
86 | target_model_theta = self.target_model.get_weights()
87 | counter = 0
88 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
89 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
90 | target_model_theta[counter] = target_weight
91 | counter += 1
92 | self.target_model.set_weights(target_model_theta)
93 |
94 | def remember(self, state, action, reward, next_state, done):
95 | self.memory.append((state, action, reward, next_state, done))
96 | if len(self.memory) > self.train_start:
97 | if self.epsilon > self.epsilon_min:
98 | self.epsilon *= self.epsilon_decay
99 |
100 | def act(self, state):
101 | if np.random.random() <= self.epsilon:
102 | return random.randrange(self.action_size)
103 | else:
104 | return np.argmax(self.model.predict(state))
105 |
106 | def replay(self):
107 | if len(self.memory) < self.train_start:
108 | return
109 | # Randomly sample minibatch from the memory
110 |         minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
111 |
112 | state = np.zeros((self.batch_size, self.state_size))
113 | next_state = np.zeros((self.batch_size, self.state_size))
114 | action, reward, done = [], [], []
115 |
116 | # do this before prediction
117 | # for speedup, this could be done on the tensor level
118 | # but easier to understand using a loop
119 | for i in range(self.batch_size):
120 | state[i] = minibatch[i][0]
121 | action.append(minibatch[i][1])
122 | reward.append(minibatch[i][2])
123 | next_state[i] = minibatch[i][3]
124 | done.append(minibatch[i][4])
125 |
126 | # do batch prediction to save speed
127 | target = self.model.predict(state)
128 | target_next = self.model.predict(next_state)
129 | target_val = self.target_model.predict(next_state)
130 |
131 | for i in range(len(minibatch)):
132 | # correction on the Q value for the action used
133 | if done[i]:
134 | target[i][action[i]] = reward[i]
135 | else:
136 | if self.ddqn: # Double - DQN
137 | # current Q Network selects the action
138 | # a'_max = argmax_a' Q(s', a')
139 | a = np.argmax(target_next[i])
140 | # target Q Network evaluates the action
141 | # Q_max = Q_target(s', a'_max)
142 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
143 | else: # Standard - DQN
144 | # DQN chooses the max Q value among next actions
145 | # selection and evaluation of action is on the target Q Network
146 | # Q_max = max_a' Q_target(s', a')
147 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
148 |
149 | # Train the Neural Network with batches
150 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
151 |
152 |
153 | def load(self, name):
154 | self.model = load_model(name)
155 |
156 | def save(self, name):
157 | self.model.save(name)
158 |
159 | pylab.figure(figsize=(18, 9))
160 | def PlotModel(self, score, episode):
161 | self.scores.append(score)
162 | self.episodes.append(episode)
163 | self.average.append(sum(self.scores) / len(self.scores))
164 | pylab.plot(self.episodes, self.average, 'r')
165 | pylab.plot(self.episodes, self.scores, 'b')
166 | pylab.ylabel('Score', fontsize=18)
167 |         pylab.xlabel('Episode', fontsize=18)
168 | dqn = 'DQN_'
169 | softupdate = ''
170 | if self.ddqn:
171 | dqn = 'DDQN_'
172 | if self.Soft_Update:
173 | softupdate = '_soft'
174 | try:
175 | pylab.savefig(dqn+self.env_name+softupdate+".png")
176 | except OSError:
177 | pass
178 |
179 | return str(self.average[-1])[:5]
180 |
181 | def run(self):
182 | for e in range(self.EPISODES):
183 | state = self.env.reset()
184 | state = np.reshape(state, [1, self.state_size])
185 | done = False
186 | i = 0
187 | while not done:
188 | #self.env.render()
189 | action = self.act(state)
190 | next_state, reward, done, _ = self.env.step(action)
191 | next_state = np.reshape(next_state, [1, self.state_size])
192 | if not done or i == self.env._max_episode_steps-1:
193 | reward = reward
194 | else:
195 | reward = -100
196 | self.remember(state, action, reward, next_state, done)
197 | state = next_state
198 | i += 1
199 | if done:
200 |                     # at the end of every episode, update the target model
201 | self.update_target_model()
202 |
203 | # every episode, plot the result
204 | average = self.PlotModel(i, e)
205 |
206 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average))
207 | if i == self.env._max_episode_steps:
208 | print("Saving trained model as cartpole-ddqn.h5")
209 | #self.save("cartpole-ddqn.h5")
210 | break
211 | self.replay()
212 |
213 | def test(self):
214 | self.load("cartpole-ddqn.h5")
215 | for e in range(self.EPISODES):
216 | state = self.env.reset()
217 | state = np.reshape(state, [1, self.state_size])
218 | done = False
219 | i = 0
220 | while not done:
221 | self.env.render()
222 | action = np.argmax(self.model.predict(state))
223 | next_state, reward, done, _ = self.env.step(action)
224 | state = np.reshape(next_state, [1, self.state_size])
225 | i += 1
226 | if done:
227 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
228 | break
229 |
230 | if __name__ == "__main__":
231 | env_name = 'CartPole-v1'
232 | agent = DQNAgent(env_name)
233 | agent.run()
234 | #agent.test()
235 |
--------------------------------------------------------------------------------
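
The soft-update branch of update_target_model() in the two DDQN scripts above
implements the Polyak rule theta_target <- TAU * theta + (1 - TAU) * theta_target
with an explicit counter. An equivalent, more compact sketch (soft_update is an
illustrative standalone function, not part of the file):

def soft_update(model, target_model, tau=0.1):
    # illustrative sketch: Polyak averaging over every weight tensor of the two Keras models
    new_weights = [
        tau * w + (1.0 - tau) * tw
        for w, tw in zip(model.get_weights(), target_model.get_weights())
    ]
    target_model.set_weights(new_weights)

With Soft_Update = True the target network drifts toward the online network a
little each episode instead of being copied outright.
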
/02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1.png
--------------------------------------------------------------------------------
/02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1_soft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/02_CartPole-reinforcement-learning_DDQN/IMAGES/DDQN_CartPole-v1_soft.png
--------------------------------------------------------------------------------
/02_CartPole-reinforcement-learning_DDQN/IMAGES/DQN_CartPole-v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/02_CartPole-reinforcement-learning_DDQN/IMAGES/DQN_CartPole-v1.png
--------------------------------------------------------------------------------
/03_CartPole-reinforcement-learning_Dueling_DDQN/Cartpole_Double_DDQN.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense, Lambda, Add
12 | from keras.optimizers import Adam, RMSprop
13 | from keras import backend as K
14 |
15 | def OurModel(input_shape, action_space, dueling):
16 | X_input = Input(input_shape)
17 | X = X_input
18 |
19 | # 'Dense' is the basic form of a neural network layer
20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
22 |
23 | # Hidden layer with 256 nodes
24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
25 |
26 | # Hidden layer with 64 nodes
27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
28 |
29 | if dueling:
30 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
31 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
32 |
33 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
34 |         action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], axis=1, keepdims=True), output_shape=(action_space,))(action_advantage)
35 |
36 | X = Add()([state_value, action_advantage])
37 | else:
38 | # Output Layer with # of actions: 2 nodes (left, right)
39 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
40 |
41 | model = Model(inputs = X_input, outputs = X, name='CartPole Dueling DDQN model')
42 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
43 |
44 | model.summary()
45 | return model
46 |
47 | class DQNAgent:
48 | def __init__(self, env_name):
49 | self.env_name = env_name
50 | self.env = gym.make(env_name)
51 | self.env.seed(0)
52 | # by default, CartPole-v1 has max episode steps = 500
53 | self.env._max_episode_steps = 4000
54 | self.state_size = self.env.observation_space.shape[0]
55 | self.action_size = self.env.action_space.n
56 |
57 | self.EPISODES = 1000
58 | self.memory = deque(maxlen=2000)
59 |
60 | self.gamma = 0.95 # discount rate
61 | self.epsilon = 1.0 # exploration rate
62 | self.epsilon_min = 0.01 # minimum exploration probability
63 | self.epsilon_decay = 0.999 # exponential decay rate for exploration prob
64 | self.batch_size = 32
65 | self.train_start = 1000
66 |
67 | # defining model parameters
68 |         self.ddqn = True # use double deep q network
69 | self.Soft_Update = False # use soft parameter update
70 |         self.dueling = True # use dueling network
71 |
72 | self.TAU = 0.1 # target network soft update hyperparameter
73 |
74 | self.Save_Path = 'Models'
75 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
76 | self.scores, self.episodes, self.average = [], [], []
77 |
78 | if self.ddqn:
79 | print("----------Double DQN--------")
80 | self.Model_name = os.path.join(self.Save_Path,"Dueling DDQN_"+self.env_name+".h5")
81 | else:
82 | print("-------------DQN------------")
83 | self.Model_name = os.path.join(self.Save_Path,"Dueling DQN_"+self.env_name+".h5")
84 |
85 | # create main model and target model
86 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
87 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
88 |
89 | # after some time interval update the target model to be same with model
90 | def update_target_model(self):
91 | if not self.Soft_Update and self.ddqn:
92 | self.target_model.set_weights(self.model.get_weights())
93 | return
94 | if self.Soft_Update and self.ddqn:
95 | q_model_theta = self.model.get_weights()
96 | target_model_theta = self.target_model.get_weights()
97 | counter = 0
98 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
99 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
100 | target_model_theta[counter] = target_weight
101 | counter += 1
102 | self.target_model.set_weights(target_model_theta)
103 |
104 | def remember(self, state, action, reward, next_state, done):
105 | self.memory.append((state, action, reward, next_state, done))
106 | if len(self.memory) > self.train_start:
107 | if self.epsilon > self.epsilon_min:
108 | self.epsilon *= self.epsilon_decay
109 |
110 | def act(self, state):
111 | if np.random.random() <= self.epsilon:
112 | return random.randrange(self.action_size)
113 | else:
114 | return np.argmax(self.model.predict(state))
115 |
116 | def replay(self):
117 | if len(self.memory) < self.train_start:
118 | return
119 | # Randomly sample minibatch from the memory
120 | minibatch = random.sample(self.memory, self.batch_size)
121 |
122 | state = np.zeros((self.batch_size, self.state_size))
123 | next_state = np.zeros((self.batch_size, self.state_size))
124 | action, reward, done = [], [], []
125 |
126 | # do this before prediction
127 | # for speedup, this could be done on the tensor level
128 | # but easier to understand using a loop
129 | for i in range(self.batch_size):
130 | state[i] = minibatch[i][0]
131 | action.append(minibatch[i][1])
132 | reward.append(minibatch[i][2])
133 | next_state[i] = minibatch[i][3]
134 | done.append(minibatch[i][4])
135 |
136 | # do batch prediction to save speed
137 | # predict Q-values for starting state using the main network
138 | target = self.model.predict(state)
139 | # predict best action in ending state using the main network
140 | target_next = self.model.predict(next_state)
141 | # predict Q-values for ending state using the target network
142 | target_val = self.target_model.predict(next_state)
143 |
144 | for i in range(len(minibatch)):
145 | # correction on the Q value for the action used
146 | if done[i]:
147 | target[i][action[i]] = reward[i]
148 | else:
149 | if self.ddqn: # Double - DQN
150 | # current Q Network selects the action
151 | # a'_max = argmax_a' Q(s', a')
152 | a = np.argmax(target_next[i])
153 | # target Q Network evaluates the action
154 | # Q_max = Q_target(s', a'_max)
155 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
156 | else: # Standard - DQN
157 | # DQN chooses the max Q value among next actions
158 | # selection and evaluation of action is on the target Q Network
159 | # Q_max = max_a' Q_target(s', a')
160 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
161 |
162 | # Train the Neural Network with batches
163 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
164 |
165 | def load(self, name):
166 | self.model = load_model(name)
167 |
168 | def save(self, name):
169 | self.model.save(name)
170 |
171 | pylab.figure(figsize=(18, 9))
172 | def PlotModel(self, score, episode):
173 | self.scores.append(score)
174 | self.episodes.append(episode)
175 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
176 | pylab.plot(self.episodes, self.average, 'r')
177 | pylab.plot(self.episodes, self.scores, 'b')
178 | pylab.ylabel('Score', fontsize=18)
179 |         pylab.xlabel('Episode', fontsize=18)
180 | dqn = 'DQN_'
181 | softupdate = ''
182 | dueling = ''
183 | if self.ddqn: dqn = 'DDQN_'
184 | if self.Soft_Update: softupdate = '_soft'
185 | if self.dueling: dueling = '_Dueling'
186 | try:
187 | pylab.savefig(dqn+self.env_name+softupdate+dueling+".png")
188 | except OSError:
189 | pass
190 |
191 | return str(self.average[-1])[:5]
192 |
193 | def run(self):
194 | for e in range(self.EPISODES):
195 | state = self.env.reset()
196 | state = np.reshape(state, [1, self.state_size])
197 | done = False
198 | i = 0
199 | while not done:
200 | #self.env.render()
201 | action = self.act(state)
202 | next_state, reward, done, _ = self.env.step(action)
203 | next_state = np.reshape(next_state, [1, self.state_size])
204 | if not done or i == self.env._max_episode_steps-1:
205 | reward = reward
206 | else:
207 | reward = -100
208 | self.remember(state, action, reward, next_state, done)
209 | state = next_state
210 | i += 1
211 | if done:
212 |                     # at the end of every episode, update the target model
213 | self.update_target_model()
214 |
215 | # every episode, plot the result
216 | average = self.PlotModel(i, e)
217 |
218 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average))
219 | if i == self.env._max_episode_steps:
220 | print("Saving trained model as", self.Model_name)
221 | #self.save(self.Model_name)
222 | break
223 | self.replay()
224 |
225 | def test(self):
226 | self.load(self.Model_name)
227 | for e in range(self.EPISODES):
228 | state = self.env.reset()
229 | state = np.reshape(state, [1, self.state_size])
230 | done = False
231 | i = 0
232 | while not done:
233 | self.env.render()
234 | action = np.argmax(self.model.predict(state))
235 | next_state, reward, done, _ = self.env.step(action)
236 | state = np.reshape(next_state, [1, self.state_size])
237 | i += 1
238 | if done:
239 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
240 | break
241 |
242 | if __name__ == "__main__":
243 | env_name = 'CartPole-v1'
244 | agent = DQNAgent(env_name)
245 | agent.run()
246 | #agent.test()
247 |
--------------------------------------------------------------------------------
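
The dueling head above splits the model into a scalar state-value stream V(s)
and an advantage stream A(s, a), then recombines them as
Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'), with the mean taken over the action
axis for each sample. A small numpy sketch of that recombination on made-up
numbers (purely illustrative, not taken from the repository):

import numpy as np

# made-up numbers: a batch of two states, three actions
state_value = np.array([[1.0], [0.5]])            # V(s), shape (batch, 1)
action_advantage = np.array([[4.0, 1.0, 1.0],
                             [3.0, 0.0, 0.0]])    # A(s, a), shape (batch, actions)

# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
q_values = state_value + action_advantage - action_advantage.mean(axis=1, keepdims=True)
print(q_values)   # [[ 3.   0.   0. ]
                  #  [ 2.5 -0.5 -0.5]]

Subtracting the per-sample mean pins down the otherwise unidentifiable split
between V and A; the argmax over Q is unchanged, so greedy action selection is
unaffected.
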
/03_CartPole-reinforcement-learning_Dueling_DDQN/Cartpole_Double_DDQN_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | import tensorflow as tf
11 | from tensorflow.keras.models import Model, load_model
12 | from tensorflow.keras.layers import Input, Dense, Lambda, Add
13 | from tensorflow.keras.optimizers import Adam, RMSprop
14 | from tensorflow.keras import backend as K
15 |
16 | def OurModel(input_shape, action_space, dueling):
17 | X_input = Input(input_shape)
18 | X = X_input
19 |
20 | # 'Dense' is the basic form of a neural network layer
21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
23 |
24 | # Hidden layer with 256 nodes
25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
26 |
27 | # Hidden layer with 64 nodes
28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
29 |
30 | if dueling:
31 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
32 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
33 |
34 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
35 |         action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], axis=1, keepdims=True), output_shape=(action_space,))(action_advantage)
36 |
37 | X = Add()([state_value, action_advantage])
38 | else:
39 | # Output Layer with # of actions: 2 nodes (left, right)
40 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
41 |
42 | model = Model(inputs = X_input, outputs = X)
43 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
44 |
45 | model.summary()
46 | return model
47 |
48 | class DQNAgent:
49 | def __init__(self, env_name):
50 | self.env_name = env_name
51 | self.env = gym.make(env_name)
52 | self.env.seed(0)
53 | # by default, CartPole-v1 has max episode steps = 500
54 | self.env._max_episode_steps = 4000
55 | self.state_size = self.env.observation_space.shape[0]
56 | self.action_size = self.env.action_space.n
57 |
58 | self.EPISODES = 1000
59 | self.memory = deque(maxlen=2000)
60 |
61 | self.gamma = 0.95 # discount rate
62 | self.epsilon = 1.0 # exploration rate
63 | self.epsilon_min = 0.01 # minimum exploration probability
64 | self.epsilon_decay = 0.999 # exponential decay rate for exploration prob
65 | self.batch_size = 32
66 | self.train_start = 1000
67 |
68 | # defining model parameters
69 |         self.ddqn = True # use double deep q network
70 | self.Soft_Update = False # use soft parameter update
71 |         self.dueling = True # use dueling network
72 |
73 | self.TAU = 0.1 # target network soft update hyperparameter
74 |
75 | self.Save_Path = 'Models'
76 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
77 | self.scores, self.episodes, self.average = [], [], []
78 |
79 | if self.ddqn:
80 | print("----------Double DQN--------")
81 | self.Model_name = os.path.join(self.Save_Path,"Dueling DDQN_"+self.env_name+".h5")
82 | else:
83 | print("-------------DQN------------")
84 | self.Model_name = os.path.join(self.Save_Path,"Dueling DQN_"+self.env_name+".h5")
85 |
86 | # create main model and target model
87 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
88 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
89 |
90 | # after some time interval update the target model to be same with model
91 | def update_target_model(self):
92 | if not self.Soft_Update and self.ddqn:
93 | self.target_model.set_weights(self.model.get_weights())
94 | return
95 | if self.Soft_Update and self.ddqn:
96 | q_model_theta = self.model.get_weights()
97 | target_model_theta = self.target_model.get_weights()
98 | counter = 0
99 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
100 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
101 | target_model_theta[counter] = target_weight
102 | counter += 1
103 | self.target_model.set_weights(target_model_theta)
104 |
105 | def remember(self, state, action, reward, next_state, done):
106 | self.memory.append((state, action, reward, next_state, done))
107 | if len(self.memory) > self.train_start:
108 | if self.epsilon > self.epsilon_min:
109 | self.epsilon *= self.epsilon_decay
110 |
111 | def act(self, state):
112 | if np.random.random() <= self.epsilon:
113 | return random.randrange(self.action_size)
114 | else:
115 | return np.argmax(self.model.predict(state))
116 |
117 | def replay(self):
118 | if len(self.memory) < self.train_start:
119 | return
120 | # Randomly sample minibatch from the memory
121 | minibatch = random.sample(self.memory, self.batch_size)
122 |
123 | state = np.zeros((self.batch_size, self.state_size))
124 | next_state = np.zeros((self.batch_size, self.state_size))
125 | action, reward, done = [], [], []
126 |
127 | # do this before prediction
128 | # for speedup, this could be done on the tensor level
129 | # but easier to understand using a loop
130 | for i in range(self.batch_size):
131 | state[i] = minibatch[i][0]
132 | action.append(minibatch[i][1])
133 | reward.append(minibatch[i][2])
134 | next_state[i] = minibatch[i][3]
135 | done.append(minibatch[i][4])
136 |
137 | # do batch prediction to save speed
138 | # predict Q-values for starting state using the main network
139 | target = self.model.predict(state)
140 | # predict best action in ending state using the main network
141 | target_next = self.model.predict(next_state)
142 | # predict Q-values for ending state using the target network
143 | target_val = self.target_model.predict(next_state)
144 |
145 | for i in range(len(minibatch)):
146 | # correction on the Q value for the action used
147 | if done[i]:
148 | target[i][action[i]] = reward[i]
149 | else:
150 | if self.ddqn: # Double - DQN
151 | # current Q Network selects the action
152 | # a'_max = argmax_a' Q(s', a')
153 | a = np.argmax(target_next[i])
154 | # target Q Network evaluates the action
155 | # Q_max = Q_target(s', a'_max)
156 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
157 | else: # Standard - DQN
158 | # DQN chooses the max Q value among next actions
159 | # selection and evaluation of action is on the target Q Network
160 | # Q_max = max_a' Q_target(s', a')
161 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
162 |
163 | # Train the Neural Network with batches
164 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
165 |
166 | def load(self, name):
167 | self.model = load_model(name)
168 |
169 | def save(self, name):
170 | self.model.save(name)
171 |
172 | pylab.figure(figsize=(18, 9))
173 | def PlotModel(self, score, episode):
174 | self.scores.append(score)
175 | self.episodes.append(episode)
176 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
177 | pylab.plot(self.episodes, self.average, 'r')
178 | pylab.plot(self.episodes, self.scores, 'b')
179 | pylab.ylabel('Score', fontsize=18)
180 |         pylab.xlabel('Episode', fontsize=18)
181 | dqn = 'DQN_'
182 | softupdate = ''
183 | dueling = ''
184 | if self.ddqn: dqn = 'DDQN_'
185 | if self.Soft_Update: softupdate = '_soft'
186 | if self.dueling: dueling = '_Dueling'
187 | try:
188 | pylab.savefig(dqn+self.env_name+softupdate+dueling+".png")
189 | except OSError:
190 | pass
191 |
192 | return str(self.average[-1])[:5]
193 |
194 | def run(self):
195 | for e in range(self.EPISODES):
196 | state = self.env.reset()
197 | state = np.reshape(state, [1, self.state_size])
198 | done = False
199 | i = 0
200 | while not done:
201 | #self.env.render()
202 | action = self.act(state)
203 | next_state, reward, done, _ = self.env.step(action)
204 | next_state = np.reshape(next_state, [1, self.state_size])
205 | if not done or i == self.env._max_episode_steps-1:
206 | reward = reward
207 | else:
208 | reward = -100
209 | self.remember(state, action, reward, next_state, done)
210 | state = next_state
211 | i += 1
212 | if done:
213 |                     # at the end of every episode, update the target model
214 | self.update_target_model()
215 |
216 | # every episode, plot the result
217 | average = self.PlotModel(i, e)
218 |
219 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average))
220 | if i == self.env._max_episode_steps:
221 | print("Saving trained model as", self.Model_name)
222 | #self.save(self.Model_name)
223 | break
224 | self.replay()
225 |
226 | def test(self):
227 | self.load(self.Model_name)
228 | for e in range(self.EPISODES):
229 | state = self.env.reset()
230 | state = np.reshape(state, [1, self.state_size])
231 | done = False
232 | i = 0
233 | while not done:
234 | self.env.render()
235 | action = np.argmax(self.model.predict(state))
236 | next_state, reward, done, _ = self.env.step(action)
237 | state = np.reshape(next_state, [1, self.state_size])
238 | i += 1
239 | if done:
240 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
241 | break
242 |
243 | if __name__ == "__main__":
244 | env_name = 'CartPole-v1'
245 | agent = DQNAgent(env_name)
246 | agent.run()
247 | #agent.test()
248 |
--------------------------------------------------------------------------------
/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1.png
--------------------------------------------------------------------------------
/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1_Dueling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DDQN_CartPole-v1_Dueling.png
--------------------------------------------------------------------------------
/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DQN_CartPole-v1_Dueling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/03_CartPole-reinforcement-learning_Dueling_DDQN/IMAGES/DQN_CartPole-v1_Dueling.png
--------------------------------------------------------------------------------
/04_CartPole-reinforcement-learning_e_greedy_D3QN/Cartpole_e_greedy_D3QN.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense, Lambda, Add
12 | from keras.optimizers import Adam, RMSprop
13 | from keras import backend as K
14 |
15 | def OurModel(input_shape, action_space, dueling):
16 | X_input = Input(input_shape)
17 | X = X_input
18 |
19 | # 'Dense' is the basic form of a neural network layer
20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
22 |
23 | # Hidden layer with 256 nodes
24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
25 |
26 | # Hidden layer with 64 nodes
27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
28 |
29 | if dueling:
30 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
31 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
32 |
33 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
34 |         action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], axis=1, keepdims=True), output_shape=(action_space,))(action_advantage)
35 |
36 | X = Add()([state_value, action_advantage])
37 | else:
38 | # Output Layer with # of actions: 2 nodes (left, right)
39 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
40 |
41 | model = Model(inputs = X_input, outputs = X, name='CartPole Dueling DDQN model')
42 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
43 |
44 | model.summary()
45 | return model
46 |
47 | class DQNAgent:
48 | def __init__(self, env_name):
49 | self.env_name = env_name
50 | self.env = gym.make(env_name)
51 | self.env.seed(0)
52 | # by default, CartPole-v1 has max episode steps = 500
53 | self.env._max_episode_steps = 4000
54 | self.state_size = self.env.observation_space.shape[0]
55 | self.action_size = self.env.action_space.n
56 |
57 | self.EPISODES = 1000
58 | self.memory = deque(maxlen=2000)
59 | self.gamma = 0.95 # discount rate
60 |
61 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
62 | self.epsilon = 1.0 # exploration probability at start
63 | self.epsilon_min = 0.01 # minimum exploration probability
64 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob
65 |
66 | self.batch_size = 32
67 |
68 | # defining model parameters
69 | self.ddqn = True # use double deep q network
70 | self.Soft_Update = False # use soft parameter update
71 |         self.dueling = True # use dueling network
72 | self.epsilon_greedy = True # use epsilon greedy strategy
73 |
74 | self.TAU = 0.1 # target network soft update hyperparameter
75 |
76 | self.Save_Path = 'Models'
77 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
78 | self.scores, self.episodes, self.average = [], [], []
79 |
80 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5")
81 |
82 | # create main model and target model
83 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
84 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
85 |
86 | # after some time interval update the target model to be same with model
87 | def update_target_model(self):
88 | if not self.Soft_Update and self.ddqn:
89 | self.target_model.set_weights(self.model.get_weights())
90 | return
91 | if self.Soft_Update and self.ddqn:
92 | q_model_theta = self.model.get_weights()
93 | target_model_theta = self.target_model.get_weights()
94 | counter = 0
95 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
96 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
97 | target_model_theta[counter] = target_weight
98 | counter += 1
99 | self.target_model.set_weights(target_model_theta)
100 |
101 | def remember(self, state, action, reward, next_state, done):
102 | experience = state, action, reward, next_state, done
103 | self.memory.append((experience))
104 |
105 | def act(self, state, decay_step):
106 | # EPSILON GREEDY STRATEGY
107 | if self.epsilon_greedy:
108 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
109 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
110 | # OLD EPSILON STRATEGY
111 | else:
112 | if self.epsilon > self.epsilon_min:
113 | self.epsilon *= (1-self.epsilon_decay)
114 | explore_probability = self.epsilon
115 |
116 | if explore_probability > np.random.rand():
117 | # Make a random action (exploration)
118 | return random.randrange(self.action_size), explore_probability
119 | else:
120 | # Get action from Q-network (exploitation)
121 | # Estimate the Qs values state
122 | # Take the biggest Q value (= the best action)
123 | return np.argmax(self.model.predict(state)), explore_probability
124 |
125 | def replay(self):
126 | if len(self.memory) < self.batch_size:
127 | return
128 | # Randomly sample minibatch from the memory
129 | minibatch = random.sample(self.memory, self.batch_size)
130 |
131 | state = np.zeros((self.batch_size, self.state_size))
132 | next_state = np.zeros((self.batch_size, self.state_size))
133 | action, reward, done = [], [], []
134 |
135 | # do this before prediction
136 | # for speedup, this could be done on the tensor level
137 | # but easier to understand using a loop
138 | for i in range(self.batch_size):
139 | state[i] = minibatch[i][0]
140 | action.append(minibatch[i][1])
141 | reward.append(minibatch[i][2])
142 | next_state[i] = minibatch[i][3]
143 | done.append(minibatch[i][4])
144 |
145 | # do batch prediction to save speed
146 | # predict Q-values for starting state using the main network
147 | target = self.model.predict(state)
148 | # predict best action in ending state using the main network
149 | target_next = self.model.predict(next_state)
150 | # predict Q-values for ending state using the target network
151 | target_val = self.target_model.predict(next_state)
152 |
153 | for i in range(len(minibatch)):
154 | # correction on the Q value for the action used
155 | if done[i]:
156 | target[i][action[i]] = reward[i]
157 | else:
158 | if self.ddqn: # Double - DQN
159 | # current Q Network selects the action
160 | # a'_max = argmax_a' Q(s', a')
161 | a = np.argmax(target_next[i])
162 | # target Q Network evaluates the action
163 | # Q_max = Q_target(s', a'_max)
164 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
165 | else: # Standard - DQN
166 | # DQN chooses the max Q value among next actions
167 | # selection and evaluation of action is on the target Q Network
168 | # Q_max = max_a' Q_target(s', a')
169 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
170 |
171 | # Train the Neural Network with batches
172 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
173 |
174 | def load(self, name):
175 | self.model = load_model(name)
176 |
177 | def save(self, name):
178 | self.model.save(name)
179 |
180 | pylab.figure(figsize=(18, 9))
181 | def PlotModel(self, score, episode):
182 | self.scores.append(score)
183 | self.episodes.append(episode)
184 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
185 | pylab.plot(self.episodes, self.average, 'r')
186 | pylab.plot(self.episodes, self.scores, 'b')
187 | pylab.ylabel('Score', fontsize=18)
188 | pylab.xlabel('Steps', fontsize=18)
189 | dqn = 'DQN_'
190 | softupdate = ''
191 | dueling = ''
192 | greedy = ''
193 | if self.ddqn: dqn = 'DDQN_'
194 | if self.Soft_Update: softupdate = '_soft'
195 | if self.dueling: dueling = '_Dueling'
196 | if self.epsilon_greedy: greedy = '_Greedy'
197 | try:
198 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+".png")
199 | except OSError:
200 | pass
201 |
202 | return str(self.average[-1])[:5]
203 |
204 | def run(self):
205 | decay_step = 0
206 | for e in range(self.EPISODES):
207 | state = self.env.reset()
208 | state = np.reshape(state, [1, self.state_size])
209 | done = False
210 | i = 0
211 | while not done:
212 | #self.env.render()
213 | decay_step += 1
214 | action, explore_probability = self.act(state, decay_step)
215 | next_state, reward, done, _ = self.env.step(action)
216 | next_state = np.reshape(next_state, [1, self.state_size])
217 | if not done or i == self.env._max_episode_steps-1:
218 | reward = reward
219 | else:
220 | reward = -100
221 | self.remember(state, action, reward, next_state, done)
222 | state = next_state
223 | i += 1
224 | if done:
225 |                     # every episode, update the target model
226 | self.update_target_model()
227 |
228 | # every episode, plot the result
229 | average = self.PlotModel(i, e)
230 |
231 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
232 | if i == self.env._max_episode_steps:
233 | print("Saving trained model to", self.Model_name)
234 | self.save(self.Model_name)
235 | break
236 |
237 | self.replay()
238 |
239 | def test(self):
240 | self.load(self.Model_name)
241 | for e in range(self.EPISODES):
242 | state = self.env.reset()
243 | state = np.reshape(state, [1, self.state_size])
244 | done = False
245 | i = 0
246 | while not done:
247 | self.env.render()
248 | action = np.argmax(self.model.predict(state))
249 | next_state, reward, done, _ = self.env.step(action)
250 | state = np.reshape(next_state, [1, self.state_size])
251 | i += 1
252 | if done:
253 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
254 | break
255 |
256 | if __name__ == "__main__":
257 | env_name = 'CartPole-v1'
258 | agent = DQNAgent(env_name)
259 | agent.run()
260 | #agent.test()
261 |
--------------------------------------------------------------------------------
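
A minimal, standalone sketch of the exponential epsilon decay used in act() above, with the same hyperparameters as the agent (epsilon = 1.0, epsilon_min = 0.01, epsilon_decay = 0.0005); the decay_step values below are purely illustrative:

import numpy as np

# explore_probability = epsilon_min + (epsilon - epsilon_min) * exp(-epsilon_decay * decay_step)
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.0005

for decay_step in (0, 1000, 5000, 10000, 20000):
    explore_probability = epsilon_min + (epsilon - epsilon_min) * np.exp(-epsilon_decay * decay_step)
    print("decay_step =", decay_step, "-> explore_probability =", round(explore_probability, 3))
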
/04_CartPole-reinforcement-learning_e_greedy_D3QN/Cartpole_e_greedy_D3QN_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from tensorflow.keras.models import Model, load_model
11 | from tensorflow.keras.layers import Input, Dense, Lambda, Add
12 | from tensorflow.keras.optimizers import Adam, RMSprop
13 | from tensorflow.keras import backend as K
14 |
15 | def OurModel(input_shape, action_space, dueling):
16 | X_input = Input(input_shape)
17 | X = X_input
18 |
19 | # 'Dense' is the basic form of a neural network layer
20 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
21 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
22 |
23 | # Hidden layer with 256 nodes
24 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
25 |
26 | # Hidden layer with 64 nodes
27 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
28 |
29 | if dueling:
30 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
31 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
32 |
33 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
34 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)
35 |
36 | X = Add()([state_value, action_advantage])
37 | else:
38 | # Output Layer with # of actions: 2 nodes (left, right)
39 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
40 |
41 | model = Model(inputs = X_input, outputs = X)
42 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
43 |
44 | model.summary()
45 | return model
46 |
47 | class DQNAgent:
48 | def __init__(self, env_name):
49 | self.env_name = env_name
50 | self.env = gym.make(env_name)
51 | self.env.seed(0)
52 | # by default, CartPole-v1 has max episode steps = 500
53 | self.env._max_episode_steps = 4000
54 | self.state_size = self.env.observation_space.shape[0]
55 | self.action_size = self.env.action_space.n
56 |
57 | self.EPISODES = 1000
58 | self.memory = deque(maxlen=2000)
59 | self.gamma = 0.95 # discount rate
60 |
61 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
62 | self.epsilon = 1.0 # exploration probability at start
63 | self.epsilon_min = 0.01 # minimum exploration probability
64 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob
65 |
66 | self.batch_size = 32
67 |
68 | # defining model parameters
69 | self.ddqn = True # use double deep q network
70 | self.Soft_Update = False # use soft parameter update
71 |         self.dueling = True # use dueling network
72 | self.epsilon_greedy = True # use epsilon greedy strategy
73 |
74 | self.TAU = 0.1 # target network soft update hyperparameter
75 |
76 | self.Save_Path = 'Models'
77 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
78 | self.scores, self.episodes, self.average = [], [], []
79 |
80 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5")
81 |
82 | # create main model and target model
83 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
84 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
85 |
86 |     # after some time interval, update the target model to match the main model
87 | def update_target_model(self):
88 | if not self.Soft_Update and self.ddqn:
89 | self.target_model.set_weights(self.model.get_weights())
90 | return
91 | if self.Soft_Update and self.ddqn:
92 | q_model_theta = self.model.get_weights()
93 | target_model_theta = self.target_model.get_weights()
94 | counter = 0
95 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
96 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
97 | target_model_theta[counter] = target_weight
98 | counter += 1
99 | self.target_model.set_weights(target_model_theta)
100 |
101 | def remember(self, state, action, reward, next_state, done):
102 | experience = state, action, reward, next_state, done
103 | self.memory.append((experience))
104 |
105 | def act(self, state, decay_step):
106 | # EPSILON GREEDY STRATEGY
107 | if self.epsilon_greedy:
108 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
109 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
110 | # OLD EPSILON STRATEGY
111 | else:
112 | if self.epsilon > self.epsilon_min:
113 | self.epsilon *= (1-self.epsilon_decay)
114 | explore_probability = self.epsilon
115 |
116 | if explore_probability > np.random.rand():
117 | # Make a random action (exploration)
118 | return random.randrange(self.action_size), explore_probability
119 | else:
120 | # Get action from Q-network (exploitation)
121 |             # Estimate the Q values for this state
122 | # Take the biggest Q value (= the best action)
123 | return np.argmax(self.model.predict(state)), explore_probability
124 |
125 | def replay(self):
126 | if len(self.memory) < self.batch_size:
127 | return
128 | # Randomly sample minibatch from the memory
129 | minibatch = random.sample(self.memory, self.batch_size)
130 |
131 | state = np.zeros((self.batch_size, self.state_size))
132 | next_state = np.zeros((self.batch_size, self.state_size))
133 | action, reward, done = [], [], []
134 |
135 | # do this before prediction
136 | # for speedup, this could be done on the tensor level
137 | # but easier to understand using a loop
138 | for i in range(self.batch_size):
139 | state[i] = minibatch[i][0]
140 | action.append(minibatch[i][1])
141 | reward.append(minibatch[i][2])
142 | next_state[i] = minibatch[i][3]
143 | done.append(minibatch[i][4])
144 |
145 | # do batch prediction to save speed
146 | # predict Q-values for starting state using the main network
147 | target = self.model.predict(state)
148 | # predict best action in ending state using the main network
149 | target_next = self.model.predict(next_state)
150 | # predict Q-values for ending state using the target network
151 | target_val = self.target_model.predict(next_state)
152 |
153 | for i in range(len(minibatch)):
154 | # correction on the Q value for the action used
155 | if done[i]:
156 | target[i][action[i]] = reward[i]
157 | else:
158 | if self.ddqn: # Double - DQN
159 | # current Q Network selects the action
160 | # a'_max = argmax_a' Q(s', a')
161 | a = np.argmax(target_next[i])
162 | # target Q Network evaluates the action
163 | # Q_max = Q_target(s', a'_max)
164 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
165 | else: # Standard - DQN
166 | # DQN chooses the max Q value among next actions
167 | # selection and evaluation of action is on the target Q Network
168 | # Q_max = max_a' Q_target(s', a')
169 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
170 |
171 | # Train the Neural Network with batches
172 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
173 |
174 | def load(self, name):
175 | self.model = load_model(name)
176 |
177 | def save(self, name):
178 | self.model.save(name)
179 |
180 | pylab.figure(figsize=(18, 9))
181 | def PlotModel(self, score, episode):
182 | self.scores.append(score)
183 | self.episodes.append(episode)
184 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
185 | pylab.plot(self.episodes, self.average, 'r')
186 | pylab.plot(self.episodes, self.scores, 'b')
187 | pylab.ylabel('Score', fontsize=18)
188 | pylab.xlabel('Steps', fontsize=18)
189 | dqn = 'DQN_'
190 | softupdate = ''
191 | dueling = ''
192 | greedy = ''
193 | if self.ddqn: dqn = 'DDQN_'
194 | if self.Soft_Update: softupdate = '_soft'
195 | if self.dueling: dueling = '_Dueling'
196 | if self.epsilon_greedy: greedy = '_Greedy'
197 | try:
198 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+".png")
199 | except OSError:
200 | pass
201 |
202 | return str(self.average[-1])[:5]
203 |
204 | def run(self):
205 | decay_step = 0
206 | for e in range(self.EPISODES):
207 | state = self.env.reset()
208 | state = np.reshape(state, [1, self.state_size])
209 | done = False
210 | i = 0
211 | while not done:
212 | #self.env.render()
213 | decay_step += 1
214 | action, explore_probability = self.act(state, decay_step)
215 | next_state, reward, done, _ = self.env.step(action)
216 | next_state = np.reshape(next_state, [1, self.state_size])
217 | if not done or i == self.env._max_episode_steps-1:
218 | reward = reward
219 | else:
220 | reward = -100
221 | self.remember(state, action, reward, next_state, done)
222 | state = next_state
223 | i += 1
224 | if done:
225 |                     # every episode, update the target model
226 | self.update_target_model()
227 |
228 | # every episode, plot the result
229 | average = self.PlotModel(i, e)
230 |
231 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
232 | if i == self.env._max_episode_steps:
233 | print("Saving trained model to", self.Model_name)
234 | self.save(self.Model_name)
235 | break
236 |
237 | self.replay()
238 |
239 | def test(self):
240 | self.load(self.Model_name)
241 | for e in range(self.EPISODES):
242 | state = self.env.reset()
243 | state = np.reshape(state, [1, self.state_size])
244 | done = False
245 | i = 0
246 | while not done:
247 | self.env.render()
248 | action = np.argmax(self.model.predict(state))
249 | next_state, reward, done, _ = self.env.step(action)
250 | state = np.reshape(next_state, [1, self.state_size])
251 | i += 1
252 | if done:
253 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
254 | break
255 |
256 | if __name__ == "__main__":
257 | env_name = 'CartPole-v1'
258 | agent = DQNAgent(env_name)
259 | agent.run()
260 | #agent.test()
261 |
--------------------------------------------------------------------------------
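
The dueling head in the model above combines one state value with mean-centred action advantages. A small NumPy sketch of that aggregation, using made-up V and A numbers rather than outputs of the trained network:

import numpy as np

state_value = 1.5                          # V(s): one scalar per state (made-up value)
action_advantage = np.array([0.2, -0.2])   # A(s, a): one entry per action (made-up values)

# Same aggregation as the Lambda/Add layers above: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
q_values = state_value + (action_advantage - action_advantage.mean())
print(q_values)   # [1.7 1.3]; subtracting the mean advantage keeps V and A identifiable
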
/04_CartPole-reinforcement-learning_e_greedy_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_Greedy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/04_CartPole-reinforcement-learning_e_greedy_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_Greedy.png
--------------------------------------------------------------------------------
/05_CartPole-reinforcement-learning_PER_D3QN/Cartpole_PER_D3QN.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense, Lambda, Add
12 | from keras.optimizers import Adam, RMSprop
13 | from keras import backend as K
14 | from PER import *
15 |
16 | def OurModel(input_shape, action_space, dueling):
17 | X_input = Input(input_shape)
18 | X = X_input
19 |
20 | # 'Dense' is the basic form of a neural network layer
21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
23 |
24 | # Hidden layer with 256 nodes
25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
26 |
27 | # Hidden layer with 64 nodes
28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
29 |
30 | if dueling:
31 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
32 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
33 |
34 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
35 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)
36 |
37 | X = Add()([state_value, action_advantage])
38 | else:
39 | # Output Layer with # of actions: 2 nodes (left, right)
40 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
41 |
42 | model = Model(inputs = X_input, outputs = X, name='CartPole D3QN model')
43 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
44 |
45 | model.summary()
46 | return model
47 |
48 | class DQNAgent:
49 | def __init__(self, env_name):
50 | self.env_name = env_name
51 | self.env = gym.make(env_name)
52 | self.env.seed(0)
53 | # by default, CartPole-v1 has max episode steps = 500
54 | self.env._max_episode_steps = 4000
55 | self.state_size = self.env.observation_space.shape[0]
56 | self.action_size = self.env.action_space.n
57 |
58 | self.EPISODES = 1000
59 | memory_size = 10000
60 | self.MEMORY = Memory(memory_size)
61 | self.memory = deque(maxlen=2000)
62 | self.gamma = 0.95 # discount rate
63 |
64 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
65 | self.epsilon = 1.0 # exploration probability at start
66 | self.epsilon_min = 0.01 # minimum exploration probability
67 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob
68 |
69 | self.batch_size = 32
70 |
71 | # defining model parameters
72 |         self.ddqn = True # use double deep q network
73 |         self.Soft_Update = False # use soft parameter update
74 |         self.dueling = True # use dueling network
75 |         self.epsilon_greedy = False # use epsilon greedy strategy
76 | self.USE_PER = True
77 |
78 | self.TAU = 0.1 # target network soft update hyperparameter
79 |
80 | self.Save_Path = 'Models'
81 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
82 | self.scores, self.episodes, self.average = [], [], []
83 |
84 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5")
85 |
86 | # create main model and target model
87 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
88 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
89 |
90 |     # after some time interval, update the target model to match the main model
91 | def update_target_model(self):
92 | if not self.Soft_Update and self.ddqn:
93 | self.target_model.set_weights(self.model.get_weights())
94 | return
95 | if self.Soft_Update and self.ddqn:
96 | q_model_theta = self.model.get_weights()
97 | target_model_theta = self.target_model.get_weights()
98 | counter = 0
99 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
100 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
101 | target_model_theta[counter] = target_weight
102 | counter += 1
103 | self.target_model.set_weights(target_model_theta)
104 |
105 | def remember(self, state, action, reward, next_state, done):
106 | experience = state, action, reward, next_state, done
107 | if self.USE_PER:
108 | self.MEMORY.store(experience)
109 | else:
110 | self.memory.append((experience))
111 |
112 | def act(self, state, decay_step):
113 | # EPSILON GREEDY STRATEGY
114 |         if self.epsilon_greedy:
115 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
116 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
117 | # OLD EPSILON STRATEGY
118 | else:
119 | if self.epsilon > self.epsilon_min:
120 | self.epsilon *= (1-self.epsilon_decay)
121 | explore_probability = self.epsilon
122 |
123 | if explore_probability > np.random.rand():
124 | # Make a random action (exploration)
125 | return random.randrange(self.action_size), explore_probability
126 | else:
127 | # Get action from Q-network (exploitation)
128 |             # Estimate the Q values for this state
129 | # Take the biggest Q value (= the best action)
130 | return np.argmax(self.model.predict(state)), explore_probability
131 |
132 | def replay(self):
133 | if self.USE_PER:
134 | tree_idx, minibatch = self.MEMORY.sample(self.batch_size)
135 | else:
136 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
137 |
138 | state = np.zeros((self.batch_size, self.state_size))
139 | next_state = np.zeros((self.batch_size, self.state_size))
140 | action, reward, done = [], [], []
141 |
142 | # do this before prediction
143 | # for speedup, this could be done on the tensor level
144 | # but easier to understand using a loop
145 |         for i in range(len(minibatch)):  # minibatch can be smaller than batch_size when PER is disabled
146 | state[i] = minibatch[i][0]
147 | action.append(minibatch[i][1])
148 | reward.append(minibatch[i][2])
149 | next_state[i] = minibatch[i][3]
150 | done.append(minibatch[i][4])
151 |
152 | # do batch prediction to save speed
153 | # predict Q-values for starting state using the main network
154 | target = self.model.predict(state)
155 | target_old = np.array(target)
156 | # predict best action in ending state using the main network
157 | target_next = self.model.predict(next_state)
158 | # predict Q-values for ending state using the target network
159 | target_val = self.target_model.predict(next_state)
160 |
161 | for i in range(len(minibatch)):
162 | # correction on the Q value for the action used
163 | if done[i]:
164 | target[i][action[i]] = reward[i]
165 | else:
166 | if self.ddqn: # Double - DQN
167 | # current Q Network selects the action
168 | # a'_max = argmax_a' Q(s', a')
169 | a = np.argmax(target_next[i])
170 | # target Q Network evaluates the action
171 | # Q_max = Q_target(s', a'_max)
172 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
173 | else: # Standard - DQN
174 | # DQN chooses the max Q value among next actions
175 | # selection and evaluation of action is on the target Q Network
176 | # Q_max = max_a' Q_target(s', a')
177 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
178 |
179 | if self.USE_PER:
180 | indices = np.arange(self.batch_size, dtype=np.int32)
181 | absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)])
182 | # Update priority
183 | self.MEMORY.batch_update(tree_idx, absolute_errors)
184 |
185 | # Train the Neural Network with batches
186 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
187 |
188 | def load(self, name):
189 | self.model = load_model(name)
190 |
191 | def save(self, name):
192 | self.model.save(name)
193 |
194 | pylab.figure(figsize=(18, 9))
195 | def PlotModel(self, score, episode):
196 | self.scores.append(score)
197 | self.episodes.append(episode)
198 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
199 | pylab.plot(self.episodes, self.average, 'r')
200 | pylab.plot(self.episodes, self.scores, 'b')
201 | pylab.ylabel('Score', fontsize=18)
202 | pylab.xlabel('Steps', fontsize=18)
203 | dqn = 'DQN_'
204 | softupdate = ''
205 | dueling = ''
206 | greedy = ''
207 | PER = ''
208 | if self.ddqn: dqn = 'DDQN_'
209 | if self.Soft_Update: softupdate = '_soft'
210 | if self.dueling: dueling = '_Dueling'
211 |         if self.epsilon_greedy: greedy = '_Greedy'
212 | if self.USE_PER: PER = '_PER'
213 | try:
214 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+".png")
215 | except OSError:
216 | pass
217 |
218 | return str(self.average[-1])[:5]
219 |
220 | def run(self):
221 | decay_step = 0
222 | for e in range(self.EPISODES):
223 | state = self.env.reset()
224 | state = np.reshape(state, [1, self.state_size])
225 | done = False
226 | i = 0
227 | while not done:
228 | #self.env.render()
229 | decay_step += 1
230 | action, explore_probability = self.act(state, decay_step)
231 | next_state, reward, done, _ = self.env.step(action)
232 | next_state = np.reshape(next_state, [1, self.state_size])
233 | if not done or i == self.env._max_episode_steps-1:
234 | reward = reward
235 | else:
236 | reward = -100
237 | self.remember(state, action, reward, next_state, done)
238 | state = next_state
239 | i += 1
240 | if done:
241 |                     # every episode, update the target model
242 | self.update_target_model()
243 |
244 | # every episode, plot the result
245 | average = self.PlotModel(i, e)
246 |
247 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
248 | if i == self.env._max_episode_steps:
249 | print("Saving trained model to", self.Model_name)
250 | #self.save(self.Model_name)
251 | break
252 | self.replay()
253 | self.env.close()
254 |
255 | def test(self):
256 | self.load(self.Model_name)
257 | for e in range(self.EPISODES):
258 | state = self.env.reset()
259 | state = np.reshape(state, [1, self.state_size])
260 | done = False
261 | i = 0
262 | while not done:
263 | self.env.render()
264 | action = np.argmax(self.model.predict(state))
265 | next_state, reward, done, _ = self.env.step(action)
266 | state = np.reshape(next_state, [1, self.state_size])
267 | i += 1
268 | if done:
269 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
270 | break
271 |
272 | if __name__ == "__main__":
273 | env_name = 'CartPole-v1'
274 | agent = DQNAgent(env_name)
275 | agent.run()
276 | #agent.test()
277 |
--------------------------------------------------------------------------------
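
For the target computation inside replay() above, a worked example of the Double DQN target next to the standard DQN target for one non-terminal transition, using made-up Q-values:

import numpy as np

gamma, reward = 0.95, 1.0
q_online_next = np.array([1.2, 0.8])   # main network's Q(s', .)   (made-up values)
q_target_next = np.array([0.9, 1.1])   # target network's Q(s', .) (made-up values)

# Standard DQN: the target network both selects and evaluates the next action
dqn_target = reward + gamma * np.amax(q_target_next)    # 1 + 0.95 * 1.1 = 2.045

# Double DQN: the main network selects the action, the target network evaluates it
a = np.argmax(q_online_next)                            # action 0
ddqn_target = reward + gamma * q_target_next[a]         # 1 + 0.95 * 0.9 = 1.855

print(dqn_target, ddqn_target)
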
/05_CartPole-reinforcement-learning_PER_D3QN/Cartpole_PER_D3QN_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from tensorflow.keras.models import Model, load_model
11 | from tensorflow.keras.layers import Input, Dense, Lambda, Add
12 | from tensorflow.keras.optimizers import Adam, RMSprop
13 | from tensorflow.keras import backend as K
14 | from PER import *
15 |
16 | def OurModel(input_shape, action_space, dueling):
17 | X_input = Input(input_shape)
18 | X = X_input
19 |
20 | # 'Dense' is the basic form of a neural network layer
21 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
22 | X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)
23 |
24 | # Hidden layer with 256 nodes
25 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
26 |
27 | # Hidden layer with 64 nodes
28 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
29 |
30 | if dueling:
31 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
32 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
33 |
34 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
35 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)
36 |
37 | X = Add()([state_value, action_advantage])
38 | else:
39 | # Output Layer with # of actions: 2 nodes (left, right)
40 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
41 |
42 | model = Model(inputs = X_input, outputs = X)
43 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
44 |
45 | model.summary()
46 | return model
47 |
48 | class DQNAgent:
49 | def __init__(self, env_name):
50 | self.env_name = env_name
51 | self.env = gym.make(env_name)
52 | self.env.seed(0)
53 | # by default, CartPole-v1 has max episode steps = 500
54 | self.env._max_episode_steps = 4000
55 | self.state_size = self.env.observation_space.shape[0]
56 | self.action_size = self.env.action_space.n
57 |
58 | self.EPISODES = 1000
59 | memory_size = 10000
60 | self.MEMORY = Memory(memory_size)
61 | self.memory = deque(maxlen=2000)
62 | self.gamma = 0.95 # discount rate
63 |
64 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
65 | self.epsilon = 1.0 # exploration probability at start
66 | self.epsilon_min = 0.01 # minimum exploration probability
67 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob
68 |
69 | self.batch_size = 32
70 |
71 | # defining model parameters
72 |         self.ddqn = True # use double deep q network
73 |         self.Soft_Update = False # use soft parameter update
74 |         self.dueling = True # use dueling network
75 |         self.epsilon_greedy = False # use epsilon greedy strategy
76 | self.USE_PER = True
77 |
78 | self.TAU = 0.1 # target network soft update hyperparameter
79 |
80 | self.Save_Path = 'Models'
81 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
82 | self.scores, self.episodes, self.average = [], [], []
83 |
84 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5")
85 |
86 | # create main model and target model
87 | self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
88 | self.target_model = OurModel(input_shape=(self.state_size,), action_space = self.action_size, dueling = self.dueling)
89 |
90 |     # after some time interval, update the target model to match the main model
91 | def update_target_model(self):
92 | if not self.Soft_Update and self.ddqn:
93 | self.target_model.set_weights(self.model.get_weights())
94 | return
95 | if self.Soft_Update and self.ddqn:
96 | q_model_theta = self.model.get_weights()
97 | target_model_theta = self.target_model.get_weights()
98 | counter = 0
99 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
100 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
101 | target_model_theta[counter] = target_weight
102 | counter += 1
103 | self.target_model.set_weights(target_model_theta)
104 |
105 | def remember(self, state, action, reward, next_state, done):
106 | experience = state, action, reward, next_state, done
107 | if self.USE_PER:
108 | self.MEMORY.store(experience)
109 | else:
110 | self.memory.append((experience))
111 |
112 | def act(self, state, decay_step):
113 | # EPSILON GREEDY STRATEGY
114 |         if self.epsilon_greedy:
115 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
116 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
117 | # OLD EPSILON STRATEGY
118 | else:
119 | if self.epsilon > self.epsilon_min:
120 | self.epsilon *= (1-self.epsilon_decay)
121 | explore_probability = self.epsilon
122 |
123 | if explore_probability > np.random.rand():
124 | # Make a random action (exploration)
125 | return random.randrange(self.action_size), explore_probability
126 | else:
127 | # Get action from Q-network (exploitation)
128 |             # Estimate the Q values for this state
129 | # Take the biggest Q value (= the best action)
130 | return np.argmax(self.model.predict(state)), explore_probability
131 |
132 | def replay(self):
133 | if self.USE_PER:
134 | tree_idx, minibatch = self.MEMORY.sample(self.batch_size)
135 | else:
136 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
137 |
138 | state = np.zeros((self.batch_size, self.state_size))
139 | next_state = np.zeros((self.batch_size, self.state_size))
140 | action, reward, done = [], [], []
141 |
142 | # do this before prediction
143 | # for speedup, this could be done on the tensor level
144 | # but easier to understand using a loop
145 |         for i in range(len(minibatch)):  # minibatch can be smaller than batch_size when PER is disabled
146 | state[i] = minibatch[i][0]
147 | action.append(minibatch[i][1])
148 | reward.append(minibatch[i][2])
149 | next_state[i] = minibatch[i][3]
150 | done.append(minibatch[i][4])
151 |
152 | # do batch prediction to save speed
153 | # predict Q-values for starting state using the main network
154 | target = self.model.predict(state)
155 | target_old = np.array(target)
156 | # predict best action in ending state using the main network
157 | target_next = self.model.predict(next_state)
158 | # predict Q-values for ending state using the target network
159 | target_val = self.target_model.predict(next_state)
160 |
161 | for i in range(len(minibatch)):
162 | # correction on the Q value for the action used
163 | if done[i]:
164 | target[i][action[i]] = reward[i]
165 | else:
166 | if self.ddqn: # Double - DQN
167 | # current Q Network selects the action
168 | # a'_max = argmax_a' Q(s', a')
169 | a = np.argmax(target_next[i])
170 | # target Q Network evaluates the action
171 | # Q_max = Q_target(s', a'_max)
172 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
173 | else: # Standard - DQN
174 | # DQN chooses the max Q value among next actions
175 | # selection and evaluation of action is on the target Q Network
176 | # Q_max = max_a' Q_target(s', a')
177 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
178 |
179 | if self.USE_PER:
180 | indices = np.arange(self.batch_size, dtype=np.int32)
181 | absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)])
182 | # Update priority
183 | self.MEMORY.batch_update(tree_idx, absolute_errors)
184 |
185 | # Train the Neural Network with batches
186 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
187 |
188 | def load(self, name):
189 | self.model = load_model(name)
190 |
191 | def save(self, name):
192 | self.model.save(name)
193 |
194 | pylab.figure(figsize=(18, 9))
195 | def PlotModel(self, score, episode):
196 | self.scores.append(score)
197 | self.episodes.append(episode)
198 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
199 | pylab.plot(self.episodes, self.average, 'r')
200 | pylab.plot(self.episodes, self.scores, 'b')
201 | pylab.ylabel('Score', fontsize=18)
202 | pylab.xlabel('Steps', fontsize=18)
203 | dqn = 'DQN_'
204 | softupdate = ''
205 | dueling = ''
206 | greedy = ''
207 | PER = ''
208 | if self.ddqn: dqn = 'DDQN_'
209 | if self.Soft_Update: softupdate = '_soft'
210 | if self.dueling: dueling = '_Dueling'
211 |         if self.epsilon_greedy: greedy = '_Greedy'
212 | if self.USE_PER: PER = '_PER'
213 | try:
214 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+".png")
215 | except OSError:
216 | pass
217 |
218 | return str(self.average[-1])[:5]
219 |
220 | def run(self):
221 | decay_step = 0
222 | for e in range(self.EPISODES):
223 | state = self.env.reset()
224 | state = np.reshape(state, [1, self.state_size])
225 | done = False
226 | i = 0
227 | while not done:
228 | #self.env.render()
229 | decay_step += 1
230 | action, explore_probability = self.act(state, decay_step)
231 | next_state, reward, done, _ = self.env.step(action)
232 | next_state = np.reshape(next_state, [1, self.state_size])
233 | if not done or i == self.env._max_episode_steps-1:
234 | reward = reward
235 | else:
236 | reward = -100
237 | self.remember(state, action, reward, next_state, done)
238 | state = next_state
239 | i += 1
240 | if done:
241 |                     # every episode, update the target model
242 | self.update_target_model()
243 |
244 | # every episode, plot the result
245 | average = self.PlotModel(i, e)
246 |
247 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
248 | if i == self.env._max_episode_steps:
249 | print("Saving trained model to", self.Model_name)
250 | #self.save(self.Model_name)
251 | break
252 | self.replay()
253 | self.env.close()
254 |
255 | def test(self):
256 | self.load(self.Model_name)
257 | for e in range(self.EPISODES):
258 | state = self.env.reset()
259 | state = np.reshape(state, [1, self.state_size])
260 | done = False
261 | i = 0
262 | while not done:
263 | self.env.render()
264 | action = np.argmax(self.model.predict(state))
265 | next_state, reward, done, _ = self.env.step(action)
266 | state = np.reshape(next_state, [1, self.state_size])
267 | i += 1
268 | if done:
269 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
270 | break
271 |
272 | if __name__ == "__main__":
273 | env_name = 'CartPole-v1'
274 | agent = DQNAgent(env_name)
275 | agent.run()
276 | #agent.test()
277 |
--------------------------------------------------------------------------------
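
A short sketch of how the absolute TD errors computed in replay() become sampling priorities in Memory.batch_update, using the same constants as PER.py (PER_e = 0.01, PER_a = 0.6, absolute_error_upper = 1.0); the error values themselves are made up:

import numpy as np

PER_e, PER_a, absolute_error_upper = 0.01, 0.6, 1.0
abs_errors = np.array([0.0, 0.05, 0.5, 3.0])          # made-up |target_old - target| values

clipped_errors = np.minimum(abs_errors + PER_e, absolute_error_upper)
priorities = np.power(clipped_errors, PER_a)          # what batch_update writes into the SumTree
probabilities = priorities / priorities.sum()         # each sample's share of the total priority

print(priorities)      # even a zero error keeps a small priority thanks to PER_e
print(probabilities)
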
/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling.png
--------------------------------------------------------------------------------
/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_PER.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/DDQN_CartPole-v1_Dueling_PER.png
--------------------------------------------------------------------------------
/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/Replay_buffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/Replay_buffer.png
--------------------------------------------------------------------------------
/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/SumTree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/05_CartPole-reinforcement-learning_PER_D3QN/IMAGES/SumTree.png
--------------------------------------------------------------------------------
/05_CartPole-reinforcement-learning_PER_D3QN/PER.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class SumTree(object):
4 | data_pointer = 0
5 |
6 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0
7 | def __init__(self, capacity):
8 | # Number of leaf nodes (final nodes) that contains experiences
9 | self.capacity = capacity
10 |
11 | # Generate the tree with all nodes values = 0
12 | # To understand this calculation (2 * capacity - 1) look at the schema below
13 | # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node)
14 | # Parent nodes = capacity - 1
15 | # Leaf nodes = capacity
16 | self.tree = np.zeros(2 * capacity - 1)
17 |
18 | # Contains the experiences (so the size of data is capacity)
19 | self.data = np.zeros(capacity, dtype=object)
20 |
21 |
22 |     # Here we define a function that will add our priority score to the sumtree leaf and add the experience to data:
23 | def add(self, priority, data):
24 | # Look at what index we want to put the experience
25 | tree_index = self.data_pointer + self.capacity - 1
26 |
27 | # Update data frame
28 | self.data[self.data_pointer] = data
29 |
30 | # Update the leaf
31 | self.update (tree_index, priority)
32 |
33 | # Add 1 to data_pointer
34 | self.data_pointer += 1
35 |
36 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite)
37 | self.data_pointer = 0
38 |
39 | # Update the leaf priority score and propagate the change through tree
40 | def update(self, tree_index, priority):
41 | # Change = new priority score - former priority score
42 | change = priority - self.tree[tree_index]
43 | self.tree[tree_index] = priority
44 |
45 | # then propagate the change through tree
46 | # this method is faster than the recursive loop in the reference code
47 | while tree_index != 0:
48 | tree_index = (tree_index - 1) // 2
49 | self.tree[tree_index] += change
50 |
51 |     # Here we build a function to get a leaf from our tree: it returns the leaf_index, the priority value of that leaf, and the experience associated with that leaf:
52 | def get_leaf(self, v):
53 | parent_index = 0
54 |
55 | # the while loop is faster than the method in the reference code
56 | while True:
57 | left_child_index = 2 * parent_index + 1
58 | right_child_index = left_child_index + 1
59 |
60 | # If we reach bottom, end the search
61 | if left_child_index >= len(self.tree):
62 | leaf_index = parent_index
63 | break
64 | else: # downward search, always search for a higher priority node
65 | if v <= self.tree[left_child_index]:
66 | parent_index = left_child_index
67 | else:
68 | v -= self.tree[left_child_index]
69 | parent_index = right_child_index
70 |
71 | data_index = leaf_index - self.capacity + 1
72 |
73 | return leaf_index, self.tree[leaf_index], self.data[data_index]
74 |
75 | @property
76 | def total_priority(self):
77 | return self.tree[0] # Returns the root node
78 |
79 | # Now that we have finished constructing our SumTree object, we'll build a Memory object.
80 | class Memory(object): # stored as ( state, action, reward, next_state ) in SumTree
81 |     PER_e = 0.01  # Hyperparameter used to avoid experiences having a probability of 0 of being sampled
82 |     PER_a = 0.6  # Hyperparameter used to trade off between sampling only high-priority experiences and sampling uniformly
83 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1
84 |
85 | PER_b_increment_per_sampling = 0.001
86 |
87 | absolute_error_upper = 1. # clipped abs error
88 |
89 | def __init__(self, capacity):
90 | # Making the tree
91 | self.tree = SumTree(capacity)
92 |
93 | # Next, we define a function to store a new experience in our tree.
94 |     # Each new experience will have a score of max_priority (it will then be refined when we use this experience to train our DDQN).
95 | def store(self, experience):
96 | # Find the max priority
97 | max_priority = np.max(self.tree.tree[-self.tree.capacity:])
98 |
99 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected
100 | # So we use a minimum priority
101 | if max_priority == 0:
102 | max_priority = self.absolute_error_upper
103 |
104 | self.tree.add(max_priority, experience) # set the max priority for new priority
105 |
106 |     # Now we create the sample function, which picks the batch from our tree memory that will be used to train our model.
107 |     # - First, we divide the range [0, priority_total] into n priority segments.
108 |     # - Then a value is uniformly sampled from each segment.
109 |     # - Then we search the sumtree for the experience whose priority corresponds to each sampled value.
110 | def sample(self, n):
111 |         # Create an array that will contain the minibatch
112 | minibatch = []
113 |
114 | b_idx = np.empty((n,), dtype=np.int32)
115 |
116 | # Calculate the priority segment
117 |         # Here, as explained in the paper, we divide the range [0, p_total] into n equal ranges
118 | priority_segment = self.tree.total_priority / n # priority segment
119 |
120 | for i in range(n):
121 |             # A value is uniformly sampled from each range
122 | a, b = priority_segment * i, priority_segment * (i + 1)
123 | value = np.random.uniform(a, b)
124 |
125 |             # The experience that corresponds to each value is retrieved
126 | index, priority, data = self.tree.get_leaf(value)
127 |
128 | b_idx[i]= index
129 |
130 | minibatch.append([data[0],data[1],data[2],data[3],data[4]])
131 |
132 | return b_idx, minibatch
133 |
134 | # Update the priorities on the tree
135 | def batch_update(self, tree_idx, abs_errors):
136 |         abs_errors += self.PER_e  # add a small constant so no priority is exactly 0
137 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
138 | ps = np.power(clipped_errors, self.PER_a)
139 |
140 | for ti, p in zip(tree_idx, ps):
141 | self.tree.update(ti, p)
142 |
--------------------------------------------------------------------------------
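
A minimal usage example of the SumTree above (run it in the same folder as PER.py); the priorities and the stored payloads are made up:

from PER import SumTree

tree = SumTree(capacity=4)
for priority, experience in zip([1.0, 2.0, 3.0, 4.0], ["a", "b", "c", "d"]):
    tree.add(priority, experience)

print(tree.total_priority)   # 10.0, the root holds the sum of all leaf priorities
print(tree.get_leaf(2.5))    # (4, 2.0, 'b'): 2.5 falls in the second leaf's segment [1.0, 3.0)
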
/06_CartPole-reinforcement-learning_PER_D3QN_CNN/Cartpole_PER_D3QN_CNN.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from collections import deque
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
12 | from keras.optimizers import Adam, RMSprop
13 | from keras import backend as K
14 | from PER import *
15 | import cv2
16 |
17 | def OurModel(input_shape, action_space, dueling):
18 | X_input = Input(input_shape)
19 | X = X_input
20 |
21 | X = Conv2D(64, 5, strides=(3, 3),padding="valid", input_shape=input_shape, activation="relu", data_format="channels_first")(X)
22 | X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="relu", data_format="channels_first")(X)
23 | X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="relu", data_format="channels_first")(X)
24 | X = Flatten()(X)
25 | # 'Dense' is the basic form of a neural network layer
26 | # Input Layer of state size(4) and Hidden Layer with 512 nodes
27 | X = Dense(512, activation="relu", kernel_initializer='he_uniform')(X)
28 |
29 | # Hidden layer with 256 nodes
30 | X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
31 |
32 | # Hidden layer with 64 nodes
33 | X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
34 |
35 | if dueling:
36 | state_value = Dense(1, kernel_initializer='he_uniform')(X)
37 | state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)
38 |
39 | action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
40 | action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)
41 |
42 | X = Add()([state_value, action_advantage])
43 | else:
44 | # Output Layer with # of actions: 2 nodes (left, right)
45 | X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
46 |
47 | model = Model(inputs = X_input, outputs = X, name='CartPole PER D3QN CNN model')
48 | model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
49 |
50 | model.summary()
51 | return model
52 |
53 | class DQNAgent:
54 | def __init__(self, env_name):
55 | self.env_name = env_name
56 | self.env = gym.make(env_name)
57 | self.env.seed(0)
58 | # by default, CartPole-v1 has max episode steps = 500
59 | # we can use this to experiment beyond 500
60 | self.env._max_episode_steps = 4000
61 | self.state_size = self.env.observation_space.shape[0]
62 | self.action_size = self.env.action_space.n
63 | self.EPISODES = 1000
64 |
65 | # Instantiate memory
66 | memory_size = 10000
67 | self.MEMORY = Memory(memory_size)
68 | self.memory = deque(maxlen=2000)
69 |
70 | self.gamma = 0.95 # discount rate
71 |
72 | # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
73 | self.epsilon = 1.0 # exploration probability at start
74 | self.epsilon_min = 0.01 # minimum exploration probability
75 | self.epsilon_decay = 0.0005 # exponential decay rate for exploration prob
76 |
77 | self.batch_size = 32
78 |
79 | # defining model parameters
80 |         self.ddqn = True # use double deep q network
81 |         self.Soft_Update = False # use soft parameter update
82 |         self.dueling = True # use dueling network
83 | self.epsilon_greedy = False # use epsilon greedy strategy
84 | self.USE_PER = True # use priority experienced replay
85 |
86 | self.TAU = 0.1 # target network soft update hyperparameter
87 |
88 | self.Save_Path = 'Models'
89 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
90 | self.scores, self.episodes, self.average = [], [], []
91 |
92 | self.Model_name = os.path.join(self.Save_Path, self.env_name+"_PER_D3QN_CNN.h5")
93 |
94 | self.ROWS = 160
95 | self.COLS = 240
96 | self.REM_STEP = 4
97 |
98 | self.image_memory = np.zeros((self.REM_STEP, self.ROWS, self.COLS))
99 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
100 |
101 | # create main model and target model
102 | self.model = OurModel(input_shape=self.state_size, action_space = self.action_size, dueling = self.dueling)
103 | self.target_model = OurModel(input_shape=self.state_size, action_space = self.action_size, dueling = self.dueling)
104 |
105 |     # after some time interval, update the target model to match the main model
106 | def update_target_model(self):
107 | if not self.Soft_Update and self.ddqn:
108 | self.target_model.set_weights(self.model.get_weights())
109 | return
110 | if self.Soft_Update and self.ddqn:
111 | q_model_theta = self.model.get_weights()
112 | target_model_theta = self.target_model.get_weights()
113 | counter = 0
114 | for q_weight, target_weight in zip(q_model_theta, target_model_theta):
115 | target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
116 | target_model_theta[counter] = target_weight
117 | counter += 1
118 | self.target_model.set_weights(target_model_theta)
119 |
120 | def remember(self, state, action, reward, next_state, done):
121 | experience = state, action, reward, next_state, done
122 | if self.USE_PER:
123 | self.MEMORY.store(experience)
124 | else:
125 | self.memory.append((experience))
126 |
127 | def act(self, state, decay_step):
128 | # EPSILON GREEDY STRATEGY
129 | if self.epsilon_greedy:
130 | # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
131 | explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
132 | # OLD EPSILON STRATEGY
133 | else:
134 | if self.epsilon > self.epsilon_min:
135 | self.epsilon *= (1-self.epsilon_decay)
136 | explore_probability = self.epsilon
137 |
138 | if explore_probability > np.random.rand():
139 | # Make a random action (exploration)
140 | return random.randrange(self.action_size), explore_probability
141 | else:
142 | # Get action from Q-network (exploitation)
143 |             # Estimate the Q values for this state
144 | # Take the biggest Q value (= the best action)
145 | return np.argmax(self.model.predict(state)), explore_probability
146 |
147 | def replay(self):
148 | if self.USE_PER:
149 | # Sample minibatch from the PER memory
150 | tree_idx, minibatch = self.MEMORY.sample(self.batch_size)
151 | else:
152 | # Randomly sample minibatch from the deque memory
153 | minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
154 |
155 | state = np.zeros((self.batch_size,) + self.state_size)
156 | next_state = np.zeros((self.batch_size,) + self.state_size)
157 | action, reward, done = [], [], []
158 |
159 | # do this before prediction
160 | # for speedup, this could be done on the tensor level
161 | # but easier to understand using a loop
162 | for i in range(len(minibatch)):
163 | state[i] = minibatch[i][0]
164 | action.append(minibatch[i][1])
165 | reward.append(minibatch[i][2])
166 | next_state[i] = minibatch[i][3]
167 | done.append(minibatch[i][4])
168 |
169 | # do batch prediction to save speed
170 | # predict Q-values for starting state using the main network
171 | target = self.model.predict(state)
172 | target_old = np.array(target)
173 | # predict best action in ending state using the main network
174 | target_next = self.model.predict(next_state)
175 | # predict Q-values for ending state using the target network
176 | target_val = self.target_model.predict(next_state)
177 |
178 | for i in range(len(minibatch)):
179 | # correction on the Q value for the action used
180 | if done[i]:
181 | target[i][action[i]] = reward[i]
182 | else:
183 | # the key point of Double DQN
184 | # selection of action is from model
185 | # update is from target model
186 | if self.ddqn: # Double - DQN
187 | # current Q Network selects the action
188 | # a'_max = argmax_a' Q(s', a')
189 | a = np.argmax(target_next[i])
190 | # target Q Network evaluates the action
191 | # Q_max = Q_target(s', a'_max)
192 | target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
193 | else: # Standard - DQN
194 | # DQN chooses the max Q value among next actions
195 | # selection and evaluation of action is on the target Q Network
196 | # Q_max = max_a' Q_target(s', a')
197 | target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
198 |
199 | if self.USE_PER:
200 | indices = np.arange(self.batch_size, dtype=np.int32)
201 | absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)])
202 | # Update priority
203 | self.MEMORY.batch_update(tree_idx, absolute_errors)
204 |
205 | # Train the Neural Network with batches
206 | self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
207 |
208 | def load(self, name):
209 | self.model = load_model(name)
210 |
211 | def save(self, name):
212 | self.model.save(name)
213 |
214 | pylab.figure(figsize=(18, 9))
215 | def PlotModel(self, score, episode):
216 | self.scores.append(score)
217 | self.episodes.append(episode)
218 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
219 | pylab.plot(self.episodes, self.average, 'r')
220 | pylab.plot(self.episodes, self.scores, 'b')
221 | pylab.ylabel('Score', fontsize=18)
222 | pylab.xlabel('Steps', fontsize=18)
223 | dqn = 'DQN_'
224 | softupdate = ''
225 | dueling = ''
226 | greedy = ''
227 | PER = ''
228 | if self.ddqn: dqn = 'DDQN_'
229 | if self.Soft_Update: softupdate = '_soft'
230 | if self.dueling: dueling = '_Dueling'
231 | if self.epsilon_greedy: greedy = '_Greedy'
232 | if self.USE_PER: PER = '_PER'
233 | try:
234 | pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+"_CNN.png")
235 | except OSError:
236 | pass
237 |
238 | return str(self.average[-1])[:5]
239 |
240 | def imshow(self, image, rem_step=0):
241 | cv2.imshow("cartpole"+str(rem_step), image[rem_step,...])
242 | if cv2.waitKey(25) & 0xFF == ord("q"):
243 | cv2.destroyAllWindows()
244 | return
245 |
246 | def GetImage(self):
247 | img = self.env.render(mode='rgb_array')
248 |
249 | img_rgb = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
250 | img_rgb_resized = cv2.resize(img_rgb, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
251 | img_rgb_resized[img_rgb_resized < 255] = 0
252 | img_rgb_resized = img_rgb_resized / 255
253 |
254 | self.image_memory = np.roll(self.image_memory, 1, axis = 0)
255 | self.image_memory[0,:,:] = img_rgb_resized
256 |
257 | #self.imshow(self.image_memory,0)
258 |
259 | return np.expand_dims(self.image_memory, axis=0)
260 |
261 | def reset(self):
262 | self.env.reset()
263 | for i in range(self.REM_STEP):
264 | state = self.GetImage()
265 | return state
266 |
267 | def step(self,action):
268 | next_state, reward, done, info = self.env.step(action)
269 | next_state = self.GetImage()
270 | return next_state, reward, done, info
271 |
272 | def run(self):
273 | decay_step = 0
274 | for e in range(self.EPISODES):
275 | state = self.reset()
276 | done = False
277 | i = 0
278 | while not done:
279 | decay_step += 1
280 | action, explore_probability = self.act(state, decay_step)
281 | next_state, reward, done, _ = self.step(action)
282 | if not done or i == self.env._max_episode_steps-1:
283 | reward = reward
284 | else:
285 | reward = -100
286 | self.remember(state, action, reward, next_state, done)
287 | state = next_state
288 | i += 1
289 | if done:
290 |                     # every REM_STEP episodes, update the target model
291 | if e % self.REM_STEP == 0:
292 | self.update_target_model()
293 |
294 | # every episode, plot the result
295 | average = self.PlotModel(i, e)
296 |
297 | print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
298 | if i == self.env._max_episode_steps:
299 | print("Saving trained model to", self.Model_name)
300 | #self.save(self.Model_name)
301 | break
302 | self.replay()
303 | self.env.close()
304 |
305 | def test(self):
306 | self.load(self.Model_name)
307 | for e in range(self.EPISODES):
308 | state = self.reset()
309 | done = False
310 | i = 0
311 | while not done:
312 | action = np.argmax(self.model.predict(state))
313 |                 state, reward, done, _ = self.step(action)  # use the wrapped step() so the frame stack is updated
314 | i += 1
315 | if done:
316 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
317 | break
318 |
319 | if __name__ == "__main__":
320 | env_name = 'CartPole-v1'
321 | agent = DQNAgent(env_name)
322 | agent.run()
323 | #agent.test()
324 |
--------------------------------------------------------------------------------
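
A standalone sketch of the frame stacking performed by GetImage() above, with the same REM_STEP/ROWS/COLS values but synthetic frames instead of rendered screenshots, so it runs without gym or cv2:

import numpy as np

REM_STEP, ROWS, COLS = 4, 160, 240
image_memory = np.zeros((REM_STEP, ROWS, COLS))

def push_frame(image_memory, frame):
    # Newest frame goes to index 0 and the oldest frame is rolled out, as in GetImage()
    image_memory = np.roll(image_memory, 1, axis=0)
    image_memory[0, :, :] = frame
    return image_memory

for t in range(3):                              # three synthetic "screenshots"
    image_memory = push_frame(image_memory, np.full((ROWS, COLS), t))

print(image_memory[:, 0, 0])                    # [2. 1. 0. 0.] -> most recent frame first
state = np.expand_dims(image_memory, axis=0)    # add the batch dimension: shape (1, 4, 160, 240)
print(state.shape)
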
/06_CartPole-reinforcement-learning_PER_D3QN_CNN/PER.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class SumTree(object):
4 | data_pointer = 0
5 |
6 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0
7 | def __init__(self, capacity):
8 | # Number of leaf nodes (final nodes) that contains experiences
9 | self.capacity = capacity
10 |
11 | # Generate the tree with all nodes values = 0
12 |         # To understand this calculation (2 * capacity - 1), look at the breakdown below
13 |         # Remember this is a binary tree (each node has at most 2 children), so total nodes = 2x the number of leaves (capacity) - 1 (root node)
14 | # Parent nodes = capacity - 1
15 | # Leaf nodes = capacity
16 | self.tree = np.zeros(2 * capacity - 1)
17 |
18 | # Contains the experiences (so the size of data is capacity)
19 | self.data = np.zeros(capacity, dtype=object)
20 |
21 |
22 |     # Here we define a function that adds our priority score to the SumTree leaf and stores the experience in data:
23 | def add(self, priority, data):
24 | # Look at what index we want to put the experience
25 | tree_index = self.data_pointer + self.capacity - 1
26 |
27 | # Update data frame
28 | self.data[self.data_pointer] = data
29 |
30 | # Update the leaf
31 |         self.update(tree_index, priority)
32 |
33 | # Add 1 to data_pointer
34 | self.data_pointer += 1
35 |
36 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite)
37 | self.data_pointer = 0
38 |
39 | # Update the leaf priority score and propagate the change through tree
40 | def update(self, tree_index, priority):
41 | # Change = new priority score - former priority score
42 | change = priority - self.tree[tree_index]
43 | self.tree[tree_index] = priority
44 |
45 | # then propagate the change through tree
46 | # this method is faster than the recursive loop in the reference code
47 | while tree_index != 0:
48 | tree_index = (tree_index - 1) // 2
49 | self.tree[tree_index] += change
50 |
51 |     # Here we build a function to get a leaf from our tree: it returns the leaf_index, the priority value of that leaf and the experience associated with that leaf index:
52 | def get_leaf(self, v):
53 | parent_index = 0
54 |
55 | # the while loop is faster than the method in the reference code
56 | while True:
57 | left_child_index = 2 * parent_index + 1
58 | right_child_index = left_child_index + 1
59 |
60 | # If we reach bottom, end the search
61 | if left_child_index >= len(self.tree):
62 | leaf_index = parent_index
63 | break
64 | else: # downward search, always search for a higher priority node
65 | if v <= self.tree[left_child_index]:
66 | parent_index = left_child_index
67 | else:
68 | v -= self.tree[left_child_index]
69 | parent_index = right_child_index
70 |
71 | data_index = leaf_index - self.capacity + 1
72 |
73 | return leaf_index, self.tree[leaf_index], self.data[data_index]
74 |
75 | @property
76 | def total_priority(self):
77 | return self.tree[0] # Returns the root node
78 |
79 | # Now we finished constructing our SumTree object, next we'll build a memory object.
80 | class Memory(object): # stored as ( state, action, reward, next_state, done ) in SumTree
81 |     PER_e = 0.01 # Hyperparameter that we use so that no experience has a 0 probability of being taken
82 |     PER_a = 0.6 # Hyperparameter that we use to trade off between taking only experiences with high priority and sampling uniformly
83 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1
84 |
85 | PER_b_increment_per_sampling = 0.001
86 |
87 | absolute_error_upper = 1. # clipped abs error
88 |
89 | def __init__(self, capacity):
90 | # Making the tree
91 | self.tree = SumTree(capacity)
92 |
93 | # Next, we define a function to store a new experience in our tree.
94 |     # Each new experience will have a score of max_priority (it will then be improved when we use this exp to train our DDQN).
95 | def store(self, experience):
96 | # Find the max priority
97 | max_priority = np.max(self.tree.tree[-self.tree.capacity:])
98 |
99 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected
100 | # So we use a minimum priority
101 | if max_priority == 0:
102 | max_priority = self.absolute_error_upper
103 |
104 | self.tree.add(max_priority, experience) # set the max priority for new priority
105 |
106 |     # Now we create the sample function, which is used to pick a batch from our tree memory to train our model.
107 |     # - First, to sample a minibatch of size n, we divide the range [0, priority_total] into n priority ranges.
108 |     # - Then a value is uniformly sampled from each range.
109 |     # - Then we search the SumTree for the experience whose priority segment contains each sampled value.
110 | def sample(self, n):
111 |         # Create a list that will contain the minibatch
112 | minibatch = []
113 |
114 | b_idx = np.empty((n,), dtype=np.int32)
115 |
116 | # Calculate the priority segment
117 | # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges
118 | priority_segment = self.tree.total_priority / n # priority segment
119 |
120 | for i in range(n):
121 |             # A value is uniformly sampled from each range
122 | a, b = priority_segment * i, priority_segment * (i + 1)
123 | value = np.random.uniform(a, b)
124 |
125 |             # The experience that corresponds to each value is retrieved
126 | index, priority, data = self.tree.get_leaf(value)
127 |
128 |             b_idx[i] = index
129 |
130 | minibatch.append([data[0],data[1],data[2],data[3],data[4]])
131 |
132 | return b_idx, minibatch
133 |
134 | # Update the priorities on the tree
135 | def batch_update(self, tree_idx, abs_errors):
136 |         abs_errors += self.PER_e # add a small constant so no priority is exactly 0
137 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
138 | ps = np.power(clipped_errors, self.PER_a)
139 |
140 | for ti, p in zip(tree_idx, ps):
141 | self.tree.update(ti, p)
142 |
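A short usage sketch of the SumTree/Memory pair defined above, assuming this file is importable as PER and that experiences are stored as (state, action, reward, next_state, done) tuples; the dummy states, the capacity of 8 and the hand-picked TD errors are illustrative only.

import numpy as np
from PER import Memory

memory = Memory(capacity=8)
for i in range(8):
    # new experiences receive the current max priority (1.0 while the tree is empty)
    memory.store((np.zeros(4), i % 2, 1.0, np.zeros(4), False))

# proportional sampling: the total priority is split into 4 segments, one value drawn per segment
tree_idx, minibatch = memory.sample(4)
# after training, the sampled leaves are re-prioritized with the new absolute TD errors
memory.batch_update(tree_idx, np.array([0.5, 0.1, 0.9, 0.2]))
print(tree_idx, memory.tree.total_priority)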
--------------------------------------------------------------------------------
/06_CartPole-reinforcement-learning_PER_D3QN_CNN/random_game.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | import cv2
5 |
6 | class DQN_CNN_Agent:
7 | def __init__(self, env_name):
8 | self.env_name = env_name
9 | self.env = gym.make(env_name)
10 | self.ROWS = 160
11 | self.COLS = 240
12 | self.REM_STEP = 4
13 |
14 | self.EPISODES = 10
15 |
16 | self.image_memory = np.zeros((self.REM_STEP, self.ROWS, self.COLS))
17 |
18 | def imshow(self, image, rem_step=0):
19 |         cv2.imshow(self.env_name+str(rem_step), image[rem_step,...])
20 | if cv2.waitKey(25) & 0xFF == ord("q"):
21 | cv2.destroyAllWindows()
22 | return
23 |
24 | def GetImage(self):
25 | img = self.env.render(mode='rgb_array')
26 |
27 | img_rgb = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
28 | img_rgb_resized = cv2.resize(img_rgb, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
29 | img_rgb_resized[img_rgb_resized < 255] = 0
30 | img_rgb_resized = img_rgb_resized / 255
31 |
32 | self.image_memory = np.roll(self.image_memory, 1, axis = 0)
33 | self.image_memory[0,:,:] = img_rgb_resized
34 |
35 | self.imshow(self.image_memory,0)
36 |
37 | return np.expand_dims(self.image_memory, axis=0)
38 |
39 | def reset(self):
40 | self.env.reset()
41 | for i in range(self.REM_STEP):
42 | state = self.GetImage()
43 | return state
44 |
45 | def step(self,action):
46 | next_state, reward, done, info = self.env.step(action)
47 | next_state = self.GetImage()
48 | return next_state, reward, done, info
49 |
50 | def run(self):
51 |         # Each of these episodes is its own game.
52 | for episode in range(self.EPISODES):
53 | self.reset()
54 |             # this loops over frames, up to 500... but we won't make it that far with random actions.
55 | for t in range(500):
56 | # This will just create a sample action in any environment.
57 | # In this environment, the action can be 0 or 1, which is left or right
58 | action = self.env.action_space.sample()
59 |
60 | # this executes the environment with an action,
61 | # and returns the observation of the environment,
62 | # the reward, if the env is over, and other info.
63 | next_state, reward, done, info = self.step(action)
64 |
65 |                 # let's print everything in one line:
66 | #print(t, next_state, reward, done, info, action)
67 | if done:
68 | break
69 |
70 | if __name__ == "__main__":
71 | env_name = 'CartPole-v1'
72 | agent = DQN_CNN_Agent(env_name)
73 | agent.run()
74 |
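For comparison, the same random baseline can be run on the raw CartPole observations, without the image pipeline above. A sketch, assuming the classic gym API where step returns four values, as in the rest of this repository:

import gym

env = gym.make('CartPole-v1')
scores = []
for episode in range(10):
    env.reset()
    done, steps = False, 0
    while not done:
        action = env.action_space.sample()      # uniformly random action: 0 (left) or 1 (right)
        _, reward, done, _ = env.step(action)   # old gym API: 4 return values
        steps += 1
    scores.append(steps)
env.close()
print("average random score:", sum(scores) / len(scores))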
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_CNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_CNN.png
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_CNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_CNN.png
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_PER_CNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DDQN_Pong-v0_Dueling_PER_CNN.png
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DQN_Pong-v0_CNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/IMAGES/DQN_Pong-v0_CNN.png
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_CNN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_CNN.h5
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_CNN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_CNN.h5
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_PER_CNN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DDQN_Dueling_PER_CNN.h5
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DQN_CNN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/07_Pong-reinforcement-learning_DQN_CNN/Models/Pong-v0_DQN_CNN.h5
--------------------------------------------------------------------------------
/07_Pong-reinforcement-learning_DQN_CNN/PER.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class SumTree(object):
4 | data_pointer = 0
5 |
6 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0
7 | def __init__(self, capacity):
8 | # Number of leaf nodes (final nodes) that contains experiences
9 | self.capacity = capacity
10 |
11 | # Generate the tree with all nodes values = 0
12 |         # To understand this calculation (2 * capacity - 1), look at the breakdown below
13 |         # Remember this is a binary tree (each node has at most 2 children), so total nodes = 2x the number of leaves (capacity) - 1 (root node)
14 | # Parent nodes = capacity - 1
15 | # Leaf nodes = capacity
16 | self.tree = np.zeros(2 * capacity - 1)
17 |
18 | # Contains the experiences (so the size of data is capacity)
19 | self.data = np.zeros(capacity, dtype=object)
20 |
21 |
22 |     # Here we define a function that adds our priority score to the SumTree leaf and stores the experience in data:
23 | def add(self, priority, data):
24 | # Look at what index we want to put the experience
25 | tree_index = self.data_pointer + self.capacity - 1
26 |
27 | # Update data frame
28 | self.data[self.data_pointer] = data
29 |
30 | # Update the leaf
31 |         self.update(tree_index, priority)
32 |
33 | # Add 1 to data_pointer
34 | self.data_pointer += 1
35 |
36 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite)
37 | self.data_pointer = 0
38 |
39 | # Update the leaf priority score and propagate the change through tree
40 | def update(self, tree_index, priority):
41 | # Change = new priority score - former priority score
42 | change = priority - self.tree[tree_index]
43 | self.tree[tree_index] = priority
44 |
45 | # then propagate the change through tree
46 | # this method is faster than the recursive loop in the reference code
47 | while tree_index != 0:
48 | tree_index = (tree_index - 1) // 2
49 | self.tree[tree_index] += change
50 |
51 |     # Here we build a function to get a leaf from our tree: it returns the leaf_index, the priority value of that leaf and the experience associated with that leaf index:
52 | def get_leaf(self, v):
53 | parent_index = 0
54 |
55 | # the while loop is faster than the method in the reference code
56 | while True:
57 | left_child_index = 2 * parent_index + 1
58 | right_child_index = left_child_index + 1
59 |
60 | # If we reach bottom, end the search
61 | if left_child_index >= len(self.tree):
62 | leaf_index = parent_index
63 | break
64 | else: # downward search, always search for a higher priority node
65 | if v <= self.tree[left_child_index]:
66 | parent_index = left_child_index
67 | else:
68 | v -= self.tree[left_child_index]
69 | parent_index = right_child_index
70 |
71 | data_index = leaf_index - self.capacity + 1
72 |
73 | return leaf_index, self.tree[leaf_index], self.data[data_index]
74 |
75 | @property
76 | def total_priority(self):
77 | return self.tree[0] # Returns the root node
78 |
79 | # Now we finished constructing our SumTree object, next we'll build a memory object.
80 | class Memory(object): # stored as ( state, action, reward, next_state, done ) in SumTree
81 |     PER_e = 0.01 # Hyperparameter that we use so that no experience has a 0 probability of being taken
82 |     PER_a = 0.6 # Hyperparameter that we use to trade off between taking only experiences with high priority and sampling uniformly
83 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1
84 |
85 | PER_b_increment_per_sampling = 0.001
86 |
87 | absolute_error_upper = 1. # clipped abs error
88 |
89 | def __init__(self, capacity):
90 | # Making the tree
91 | self.tree = SumTree(capacity)
92 |
93 | # Next, we define a function to store a new experience in our tree.
94 |     # Each new experience will have a score of max_priority (it will then be improved when we use this exp to train our DDQN).
95 | def store(self, experience):
96 | # Find the max priority
97 | max_priority = np.max(self.tree.tree[-self.tree.capacity:])
98 |
99 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected
100 | # So we use a minimum priority
101 | if max_priority == 0:
102 | max_priority = self.absolute_error_upper
103 |
104 | self.tree.add(max_priority, experience) # set the max priority for new priority
105 |
106 |     # Now we create the sample function, which is used to pick a batch from our tree memory to train our model.
107 |     # - First, to sample a minibatch of size n, we divide the range [0, priority_total] into n priority ranges.
108 |     # - Then a value is uniformly sampled from each range.
109 |     # - Then we search the SumTree for the experience whose priority segment contains each sampled value.
110 | def sample(self, n):
111 |         # Create a list that will contain the minibatch
112 | minibatch = []
113 |
114 | b_idx = np.empty((n,), dtype=np.int32)
115 |
116 | # Calculate the priority segment
117 | # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges
118 | priority_segment = self.tree.total_priority / n # priority segment
119 |
120 | for i in range(n):
121 |             # A value is uniformly sampled from each range
122 | a, b = priority_segment * i, priority_segment * (i + 1)
123 | value = np.random.uniform(a, b)
124 |
125 |             # The experience that corresponds to each value is retrieved
126 | index, priority, data = self.tree.get_leaf(value)
127 |
128 |             b_idx[i] = index
129 |
130 | minibatch.append([data[0],data[1],data[2],data[3],data[4]])
131 |
132 | return b_idx, minibatch
133 |
134 | # Update the priorities on the tree
135 | def batch_update(self, tree_idx, abs_errors):
136 |         abs_errors += self.PER_e # add a small constant so no priority is exactly 0
137 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
138 | ps = np.power(clipped_errors, self.PER_a)
139 |
140 | for ti, p in zip(tree_idx, ps):
141 | self.tree.update(ti, p)
142 |
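The Memory class above defines PER_b and PER_b_increment_per_sampling but never computes importance-sampling weights. For reference, a sketch of how those weights are usually derived in prioritized experience replay; this is the standard formulation w_i = (N * P(i))^(-beta) normalized by the maximum weight, stated here as an assumption rather than code from this repository, and it approximates N by the tree capacity.

import numpy as np

def is_weights(tree, sampled_priorities, PER_b):
    # P(i): sampling probability of each picked leaf under proportional prioritization
    probs = np.asarray(sampled_priorities, dtype=np.float64) / tree.total_priority
    n = tree.capacity  # approximation: capacity instead of the number of currently stored entries
    # w_i = (N * P(i))^(-beta), normalized by the largest weight for stability
    weights = np.power(n * probs, -PER_b)
    return weights / weights.max()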
--------------------------------------------------------------------------------
/08_Pong-v0_Policy_gradient/IMAGES/Pong-v0_PG_2.5e-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/08_Pong-v0_Policy_gradient/IMAGES/Pong-v0_PG_2.5e-05.png
--------------------------------------------------------------------------------
/08_Pong-v0_Policy_gradient/IMAGES/PongDeterministic-v4_PG_0.0001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/08_Pong-v0_Policy_gradient/IMAGES/PongDeterministic-v4_PG_0.0001.png
--------------------------------------------------------------------------------
/08_Pong-v0_Policy_gradient/Pong-v0_PG.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from keras.models import Model, load_model
10 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
11 | from keras.optimizers import Adam, RMSprop
12 | from keras import backend as K
13 | import cv2
14 |
15 | def OurModel(input_shape, action_space, lr):
16 | X_input = Input(input_shape)
17 |
18 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
19 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
20 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
21 | X = Flatten(input_shape=input_shape)(X_input)
22 |
23 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
24 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
25 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)
26 |
27 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
28 |
29 | Actor = Model(inputs = X_input, outputs = action)
30 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
31 |
32 | return Actor
33 |
34 | class PGAgent:
35 | # Policy Gradient Main Optimization Algorithm
36 | def __init__(self, env_name):
37 | # Initialization
38 | # Environment and PG parameters
39 | self.env_name = env_name
40 | self.env = gym.make(env_name)
41 | self.action_size = self.env.action_space.n
42 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong
43 | self.lr = 0.000025
44 |
45 | self.ROWS = 80
46 | self.COLS = 80
47 | self.REM_STEP = 4
48 |
49 | # Instantiate games and plot memory
50 | self.states, self.actions, self.rewards = [], [], []
51 | self.scores, self.episodes, self.average = [], [], []
52 |
53 | self.Save_Path = 'Models'
54 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
55 | self.image_memory = np.zeros(self.state_size)
56 |
57 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
58 | self.path = '{}_PG_{}'.format(self.env_name, self.lr)
59 | self.Model_name = os.path.join(self.Save_Path, self.path)
60 |
61 | # Create Actor network model
62 | self.Actor = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)
63 |
64 | def remember(self, state, action, reward):
65 | # store episode actions to memory
66 | self.states.append(state)
67 | action_onehot = np.zeros([self.action_size])
68 | action_onehot[action] = 1
69 | self.actions.append(action_onehot)
70 | self.rewards.append(reward)
71 |
72 | def act(self, state):
73 | # Use the network to predict the next action to take, using the model
74 | prediction = self.Actor.predict(state)[0]
75 | action = np.random.choice(self.action_size, p=prediction)
76 | return action
77 |
78 | def discount_rewards(self, reward):
79 | # Compute the gamma-discounted rewards over an episode
80 | gamma = 0.99 # discount rate
81 | running_add = 0
82 | discounted_r = np.zeros_like(reward)
83 | for i in reversed(range(0,len(reward))):
84 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
85 | running_add = 0
86 | running_add = running_add * gamma + reward[i]
87 | discounted_r[i] = running_add
88 |
89 | discounted_r -= np.mean(discounted_r) # normalizing the result
90 | discounted_r /= np.std(discounted_r) # divide by standard deviation
91 | return discounted_r
92 |
93 | def replay(self):
94 | # reshape memory to appropriate shape for training
95 | states = np.vstack(self.states)
96 | actions = np.vstack(self.actions)
97 |
98 | # Compute discounted rewards
99 | discounted_r = self.discount_rewards(self.rewards)
100 |
101 | # training PG network
102 | self.Actor.fit(states, actions, sample_weight=discounted_r, epochs=1, verbose=0)
103 | # reset training memory
104 | self.states, self.actions, self.rewards = [], [], []
105 |
106 | def load(self, Actor_name):
107 | self.Actor = load_model(Actor_name, compile=False)
108 |
109 | def save(self):
110 | self.Actor.save(self.Model_name + '.h5')
111 |
112 | pylab.figure(figsize=(18, 9))
113 | def PlotModel(self, score, episode):
114 | self.scores.append(score)
115 | self.episodes.append(episode)
116 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
117 |         if str(episode)[-2:] == "00": # update the plot every 100 episodes
118 | pylab.plot(self.episodes, self.scores, 'b')
119 | pylab.plot(self.episodes, self.average, 'r')
120 | pylab.ylabel('Score', fontsize=18)
121 |             pylab.xlabel('Episodes', fontsize=18)
122 | try:
123 | pylab.savefig(self.path+".png")
124 | except OSError:
125 | pass
126 |
127 | return self.average[-1]
128 |
129 | def imshow(self, image, rem_step=0):
130 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
131 | if cv2.waitKey(25) & 0xFF == ord("q"):
132 | cv2.destroyAllWindows()
133 | return
134 |
135 | def GetImage(self, frame):
136 |         # cropping the frame to 80x80 size
137 |         frame_cropped = frame[35:195:2, ::2,:]
138 |         if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
139 | # OpenCV resize function
140 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
141 |
142 |         # converting to grayscale with luminance weights (numpy way)
143 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
144 |
145 | # convert everything to black and white (agent will train faster)
146 | frame_rgb[frame_rgb < 100] = 0
147 | frame_rgb[frame_rgb >= 100] = 255
148 |         # converting to grayscale (OpenCV way)
149 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)
150 |
151 |         # dividing by 255 expresses the values in a 0-1 representation
152 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0
153 |
154 |         # push our data back by 1 frame, similar to how a deque works
155 | self.image_memory = np.roll(self.image_memory, 1, axis = 0)
156 |
157 | # inserting new frame to free space
158 | self.image_memory[0,:,:] = new_frame
159 |
160 | # show image frame
161 | #self.imshow(self.image_memory,0)
162 | #self.imshow(self.image_memory,1)
163 | #self.imshow(self.image_memory,2)
164 | #self.imshow(self.image_memory,3)
165 | return np.expand_dims(self.image_memory, axis=0)
166 |
167 | def reset(self):
168 | frame = self.env.reset()
169 | for i in range(self.REM_STEP):
170 | state = self.GetImage(frame)
171 | return state
172 |
173 | def step(self,action):
174 | next_state, reward, done, info = self.env.step(action)
175 | next_state = self.GetImage(next_state)
176 | return next_state, reward, done, info
177 |
178 | def run(self):
179 | for e in range(self.EPISODES):
180 | state = self.reset()
181 | done, score, SAVING = False, 0, ''
182 | while not done:
183 | #self.env.render()
184 | # Actor picks an action
185 | action = self.act(state)
186 | # Retrieve new state, reward, and whether the state is terminal
187 | next_state, reward, done, _ = self.step(action)
188 | # Memorize (state, action, reward) for training
189 | self.remember(state, action, reward)
190 | # Update current state
191 | state = next_state
192 | score += reward
193 | if done:
194 | average = self.PlotModel(score, e)
195 | # saving best models
196 | if average >= self.max_average:
197 | self.max_average = average
198 | self.save()
199 | SAVING = "SAVING"
200 | else:
201 | SAVING = ""
202 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
203 |
204 | self.replay()
205 |
206 |         # close the environment when training is finished
207 | self.env.close()
208 |
209 | def test(self, Model_name):
210 | self.load(Model_name)
211 | for e in range(100):
212 | state = self.reset()
213 | done = False
214 | score = 0
215 | while not done:
216 | self.env.render()
217 | action = np.argmax(self.Actor.predict(state))
218 | state, reward, done, _ = self.step(action)
219 | score += reward
220 | if done:
221 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
222 | break
223 | self.env.close()
224 |
225 | if __name__ == "__main__":
226 | #env_name = 'Pong-v0'
227 | env_name = 'PongDeterministic-v4'
228 | agent = PGAgent(env_name)
229 | agent.run()
230 | #agent.test('Models/PongDeterministic-v4_PG_2.5e-05.h5')
231 | #agent.test('Models/Pong-v0_PG_2.5e-05.h5')
232 |
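A tiny worked example of the discount_rewards logic above, run outside the class. The reward sequence is made up, and the reset of running_add on every non-zero reward is Pong-specific: each scored point ends a rally, so credit should not flow across rallies.

import numpy as np

def discount_rewards(reward, gamma=0.99):
    running_add = 0
    discounted_r = np.zeros_like(reward, dtype=np.float64)
    for i in reversed(range(len(reward))):
        if reward[i] != 0:          # a point was scored, so a new rally starts here
            running_add = 0
        running_add = running_add * gamma + reward[i]
        discounted_r[i] = running_add
    discounted_r -= np.mean(discounted_r)   # standardize before using as sample weights
    discounted_r /= np.std(discounted_r)
    return discounted_r

print(discount_rewards(np.array([0.0, 0.0, -1.0, 0.0, 0.0, 1.0])))
# frames closer to a scored point receive credit roughly gamma^k times that point's reward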
--------------------------------------------------------------------------------
/08_Pong-v0_Policy_gradient/Pong-v0_PG_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | import random
6 | import gym
7 | import pylab
8 | import numpy as np
9 | from tensorflow.keras.models import Model, load_model
10 | from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
11 | from tensorflow.keras.optimizers import Adam, RMSprop
12 | from tensorflow.keras import backend as K
13 | import cv2
14 |
15 | def OurModel(input_shape, action_space, lr):
16 | X_input = Input(input_shape)
17 |
18 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
19 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
20 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
21 | X = Flatten(input_shape=input_shape)(X_input)
22 |
23 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
24 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
25 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)
26 |
27 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
28 |
29 | Actor = Model(inputs = X_input, outputs = action)
30 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
31 |
32 | return Actor
33 |
34 | class PGAgent:
35 | # Policy Gradient Main Optimization Algorithm
36 | def __init__(self, env_name):
37 | # Initialization
38 | # Environment and PG parameters
39 | self.env_name = env_name
40 | self.env = gym.make(env_name)
41 | self.action_size = self.env.action_space.n
42 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong
43 | self.lr = 0.000025
44 |
45 | self.ROWS = 80
46 | self.COLS = 80
47 | self.REM_STEP = 4
48 |
49 | # Instantiate games and plot memory
50 | self.states, self.actions, self.rewards = [], [], []
51 | self.scores, self.episodes, self.average = [], [], []
52 |
53 | self.Save_Path = 'Models'
54 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
55 | self.image_memory = np.zeros(self.state_size)
56 |
57 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
58 | self.path = '{}_PG_{}'.format(self.env_name, self.lr)
59 | self.Model_name = os.path.join(self.Save_Path, self.path)
60 |
61 | # Create Actor network model
62 | self.Actor = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)
63 |
64 | def remember(self, state, action, reward):
65 | # store episode actions to memory
66 | self.states.append(state)
67 | action_onehot = np.zeros([self.action_size])
68 | action_onehot[action] = 1
69 | self.actions.append(action_onehot)
70 | self.rewards.append(reward)
71 |
72 | def act(self, state):
73 | # Use the network to predict the next action to take, using the model
74 | prediction = self.Actor.predict(state)[0]
75 | action = np.random.choice(self.action_size, p=prediction)
76 | return action
77 |
78 | def discount_rewards(self, reward):
79 | # Compute the gamma-discounted rewards over an episode
80 | gamma = 0.99 # discount rate
81 | running_add = 0
82 | discounted_r = np.zeros_like(reward)
83 | for i in reversed(range(0,len(reward))):
84 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
85 | running_add = 0
86 | running_add = running_add * gamma + reward[i]
87 | discounted_r[i] = running_add
88 |
89 | discounted_r -= np.mean(discounted_r) # normalizing the result
90 | discounted_r /= np.std(discounted_r) # divide by standard deviation
91 | return discounted_r
92 |
93 | def replay(self):
94 | # reshape memory to appropriate shape for training
95 | states = np.vstack(self.states)
96 | actions = np.vstack(self.actions)
97 |
98 | # Compute discounted rewards
99 | discounted_r = self.discount_rewards(self.rewards)
100 |
101 | # training PG network
102 | self.Actor.fit(states, actions, sample_weight=discounted_r, epochs=1, verbose=0)
103 | # reset training memory
104 | self.states, self.actions, self.rewards = [], [], []
105 |
106 | def load(self, Actor_name):
107 | self.Actor = load_model(Actor_name, compile=False)
108 |
109 | def save(self):
110 | self.Actor.save(self.Model_name + '.h5')
111 |
112 | pylab.figure(figsize=(18, 9))
113 | def PlotModel(self, score, episode):
114 | self.scores.append(score)
115 | self.episodes.append(episode)
116 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
117 |         if str(episode)[-2:] == "00": # update the plot every 100 episodes
118 | pylab.plot(self.episodes, self.scores, 'b')
119 | pylab.plot(self.episodes, self.average, 'r')
120 | pylab.ylabel('Score', fontsize=18)
121 |             pylab.xlabel('Episodes', fontsize=18)
122 | try:
123 | pylab.savefig(self.path+".png")
124 | except OSError:
125 | pass
126 |
127 | return self.average[-1]
128 |
129 | def imshow(self, image, rem_step=0):
130 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
131 | if cv2.waitKey(25) & 0xFF == ord("q"):
132 | cv2.destroyAllWindows()
133 | return
134 |
135 | def GetImage(self, frame):
136 |         # cropping the frame to 80x80 size
137 |         frame_cropped = frame[35:195:2, ::2,:]
138 |         if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
139 | # OpenCV resize function
140 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
141 |
142 |         # converting to grayscale with luminance weights (numpy way)
143 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
144 |
145 | # convert everything to black and white (agent will train faster)
146 | frame_rgb[frame_rgb < 100] = 0
147 | frame_rgb[frame_rgb >= 100] = 255
148 |         # converting to grayscale (OpenCV way)
149 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)
150 |
151 |         # dividing by 255 expresses the values in a 0-1 representation
152 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0
153 |
154 |         # push our data back by 1 frame, similar to how a deque works
155 | self.image_memory = np.roll(self.image_memory, 1, axis = 0)
156 |
157 | # inserting new frame to free space
158 | self.image_memory[0,:,:] = new_frame
159 |
160 | # show image frame
161 | #self.imshow(self.image_memory,0)
162 | #self.imshow(self.image_memory,1)
163 | #self.imshow(self.image_memory,2)
164 | #self.imshow(self.image_memory,3)
165 | return np.expand_dims(self.image_memory, axis=0)
166 |
167 | def reset(self):
168 | frame = self.env.reset()
169 | for i in range(self.REM_STEP):
170 | state = self.GetImage(frame)
171 | return state
172 |
173 | def step(self,action):
174 | next_state, reward, done, info = self.env.step(action)
175 | next_state = self.GetImage(next_state)
176 | return next_state, reward, done, info
177 |
178 | def run(self):
179 | for e in range(self.EPISODES):
180 | state = self.reset()
181 | done, score, SAVING = False, 0, ''
182 | while not done:
183 | #self.env.render()
184 | # Actor picks an action
185 | action = self.act(state)
186 | # Retrieve new state, reward, and whether the state is terminal
187 | next_state, reward, done, _ = self.step(action)
188 | # Memorize (state, action, reward) for training
189 | self.remember(state, action, reward)
190 | # Update current state
191 | state = next_state
192 | score += reward
193 | if done:
194 | average = self.PlotModel(score, e)
195 | # saving best models
196 | if average >= self.max_average:
197 | self.max_average = average
198 | self.save()
199 | SAVING = "SAVING"
200 | else:
201 | SAVING = ""
202 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
203 |
204 | self.replay()
205 |
206 |         # close the environment when training is finished
207 | self.env.close()
208 |
209 | def test(self, Model_name):
210 | self.load(Model_name)
211 | for e in range(100):
212 | state = self.reset()
213 | done = False
214 | score = 0
215 | while not done:
216 | self.env.render()
217 | action = np.argmax(self.Actor.predict(state))
218 | state, reward, done, _ = self.step(action)
219 | score += reward
220 | if done:
221 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
222 | break
223 | self.env.close()
224 |
225 | if __name__ == "__main__":
226 | #env_name = 'Pong-v0'
227 | env_name = 'PongDeterministic-v4'
228 | agent = PGAgent(env_name)
229 | agent.run()
230 | #agent.test('Models/PongDeterministic-v4_PG_2.5e-05.h5')
231 | #agent.test('Models/Pong-v0_PG_2.5e-05.h5')
232 |
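One way to read Actor.fit(states, actions, sample_weight=discounted_r) above: the per-sample loss is approximately -discounted_r * log pi(a|s), i.e. the REINFORCE objective. Below is a sketch of the same update written explicitly with tf.GradientTape; it is an illustrative equivalent under that reading, not how this repository actually trains the model.

import tensorflow as tf

def pg_update(actor, optimizer, states, actions_onehot, discounted_r):
    # cast inputs so float64 numpy arrays do not clash with the model's float32 weights
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions_onehot = tf.convert_to_tensor(actions_onehot, dtype=tf.float32)
    discounted_r = tf.convert_to_tensor(discounted_r, dtype=tf.float32)
    with tf.GradientTape() as tape:
        probs = actor(states, training=True)                            # softmax policy output
        log_probs = tf.reduce_sum(actions_onehot * tf.math.log(probs + 1e-10), axis=1)
        loss = -tf.reduce_mean(discounted_r * log_probs)                 # REINFORCE objective
    grads = tape.gradient(loss, actor.trainable_variables)
    optimizer.apply_gradients(zip(grads, actor.trainable_variables))
    return loss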
--------------------------------------------------------------------------------
/09_Pong-v0_A2C/IMAGES/Pong-v0_A2C_2.5e-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/09_Pong-v0_A2C/IMAGES/Pong-v0_A2C_2.5e-05.png
--------------------------------------------------------------------------------
/09_Pong-v0_A2C/IMAGES/PongDeterministic-v4_A2C_2.5e-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/09_Pong-v0_A2C/IMAGES/PongDeterministic-v4_A2C_2.5e-05.png
--------------------------------------------------------------------------------
/09_Pong-v0_A2C/Pong-v0_A2C.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | #os.environ['CUDA_VISIBLE_DEVICES'] = '2'
6 | import random
7 | import gym
8 | import pylab
9 | import numpy as np
10 | from keras.models import Model, load_model
11 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
12 | from keras.optimizers import Adam, RMSprop
13 | from keras import backend as K
14 | import cv2
15 |
16 | def OurModel(input_shape, action_space, lr):
17 | X_input = Input(input_shape)
18 |
19 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
20 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
21 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
22 | X = Flatten(input_shape=input_shape)(X_input)
23 |
24 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
25 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
26 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)
27 |
28 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
29 | value = Dense(1, kernel_initializer='he_uniform')(X)
30 |
31 | Actor = Model(inputs = X_input, outputs = action)
32 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
33 |
34 | Critic = Model(inputs = X_input, outputs = value)
35 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr))
36 |
37 | return Actor, Critic
38 |
39 | class A2CAgent:
40 | # Actor-Critic Main Optimization Algorithm
41 | def __init__(self, env_name):
42 | # Initialization
43 |         # Environment and A2C parameters
44 | self.env_name = env_name
45 | self.env = gym.make(env_name)
46 | self.action_size = self.env.action_space.n
47 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong
48 | self.lr = 0.000025
49 |
50 | self.ROWS = 80
51 | self.COLS = 80
52 | self.REM_STEP = 4
53 |
54 | # Instantiate games and plot memory
55 | self.states, self.actions, self.rewards = [], [], []
56 | self.scores, self.episodes, self.average = [], [], []
57 |
58 | self.Save_Path = 'Models'
59 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
60 | self.image_memory = np.zeros(self.state_size)
61 |
62 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
63 | self.path = '{}_A2C_{}'.format(self.env_name, self.lr)
64 | self.Model_name = os.path.join(self.Save_Path, self.path)
65 |
66 | # Create Actor-Critic network model
67 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)
68 |
69 |
70 | def remember(self, state, action, reward):
71 | # store episode actions to memory
72 | self.states.append(state)
73 | action_onehot = np.zeros([self.action_size])
74 | action_onehot[action] = 1
75 | self.actions.append(action_onehot)
76 | self.rewards.append(reward)
77 |
78 |
79 | def act(self, state):
80 | # Use the network to predict the next action to take, using the model
81 | prediction = self.Actor.predict(state)[0]
82 | action = np.random.choice(self.action_size, p=prediction)
83 | return action
84 |
85 | def discount_rewards(self, reward):
86 | # Compute the gamma-discounted rewards over an episode
87 | gamma = 0.99 # discount rate
88 | running_add = 0
89 | discounted_r = np.zeros_like(reward)
90 | for i in reversed(range(0,len(reward))):
91 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
92 | running_add = 0
93 | running_add = running_add * gamma + reward[i]
94 | discounted_r[i] = running_add
95 |
96 | discounted_r -= np.mean(discounted_r) # normalizing the result
97 | discounted_r /= np.std(discounted_r) # divide by standard deviation
98 | return discounted_r
99 |
100 |
101 | def replay(self):
102 | # reshape memory to appropriate shape for training
103 | states = np.vstack(self.states)
104 | actions = np.vstack(self.actions)
105 |
106 | # Compute discounted rewards
107 | discounted_r = self.discount_rewards(self.rewards)
108 |
109 | # Get Critic network predictions
110 | values = self.Critic.predict(states)[:, 0]
111 | # Compute advantages
112 | advantages = discounted_r - values
113 | # training Actor and Critic networks
114 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
115 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0)
116 | # reset training memory
117 | self.states, self.actions, self.rewards = [], [], []
118 |
119 | def load(self, Actor_name, Critic_name):
120 | self.Actor = load_model(Actor_name, compile=False)
121 | #self.Critic = load_model(Critic_name, compile=False)
122 |
123 | def save(self):
124 | self.Actor.save(self.Model_name + '_Actor.h5')
125 | #self.Critic.save(self.Model_name + '_Critic.h5')
126 |
127 | pylab.figure(figsize=(18, 9))
128 | def PlotModel(self, score, episode):
129 | self.scores.append(score)
130 | self.episodes.append(episode)
131 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
132 |         if str(episode)[-2:] == "00": # update the plot every 100 episodes
133 | pylab.plot(self.episodes, self.scores, 'b')
134 | pylab.plot(self.episodes, self.average, 'r')
135 | pylab.ylabel('Score', fontsize=18)
136 |             pylab.xlabel('Episodes', fontsize=18)
137 | try:
138 | pylab.savefig(self.path+".png")
139 | except OSError:
140 | pass
141 |
142 | return self.average[-1]
143 |
144 | def imshow(self, image, rem_step=0):
145 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
146 | if cv2.waitKey(25) & 0xFF == ord("q"):
147 | cv2.destroyAllWindows()
148 | return
149 |
150 | def GetImage(self, frame):
151 |         # cropping the frame to 80x80 size
152 |         frame_cropped = frame[35:195:2, ::2,:]
153 |         if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
154 | # OpenCV resize function
155 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
156 |
157 |         # converting to grayscale with luminance weights (numpy way)
158 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
159 |
160 | # convert everything to black and white (agent will train faster)
161 | frame_rgb[frame_rgb < 100] = 0
162 | frame_rgb[frame_rgb >= 100] = 255
163 |         # converting to grayscale (OpenCV way)
164 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)
165 |
166 |         # dividing by 255 expresses the values in a 0-1 representation
167 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0
168 |
169 |         # push our data back by 1 frame, similar to how a deque works
170 | self.image_memory = np.roll(self.image_memory, 1, axis = 0)
171 |
172 | # inserting new frame to free space
173 | self.image_memory[0,:,:] = new_frame
174 |
175 | # show image frame
176 | #self.imshow(self.image_memory,0)
177 | #self.imshow(self.image_memory,1)
178 | #self.imshow(self.image_memory,2)
179 | #self.imshow(self.image_memory,3)
180 |
181 | return np.expand_dims(self.image_memory, axis=0)
182 |
183 | def reset(self):
184 | frame = self.env.reset()
185 | for i in range(self.REM_STEP):
186 | state = self.GetImage(frame)
187 | return state
188 |
189 | def step(self, action):
190 | next_state, reward, done, info = self.env.step(action)
191 | next_state = self.GetImage(next_state)
192 | return next_state, reward, done, info
193 |
194 | def run(self):
195 | for e in range(self.EPISODES):
196 | state = self.reset()
197 | done, score, SAVING = False, 0, ''
198 | while not done:
199 | #self.env.render()
200 | # Actor picks an action
201 | action = self.act(state)
202 | # Retrieve new state, reward, and whether the state is terminal
203 | next_state, reward, done, _ = self.step(action)
204 | # Memorize (state, action, reward) for training
205 | self.remember(state, action, reward)
206 | # Update current state
207 | state = next_state
208 | score += reward
209 | if done:
210 | average = self.PlotModel(score, e)
211 | # saving best models
212 | if average >= self.max_average:
213 | self.max_average = average
214 | self.save()
215 | SAVING = "SAVING"
216 | else:
217 | SAVING = ""
218 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
219 |
220 | self.replay()
221 |         # close the environment when training is finished
222 | self.env.close()
223 |
224 | def test(self, Actor_name, Critic_name):
225 | self.load(Actor_name, Critic_name)
226 | for e in range(100):
227 | state = self.reset()
228 | done = False
229 | score = 0
230 | while not done:
231 | action = np.argmax(self.Actor.predict(state))
232 | state, reward, done, _ = self.step(action)
233 | score += reward
234 | if done:
235 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
236 | break
237 | self.env.close()
238 |
239 | if __name__ == "__main__":
240 | #env_name = 'PongDeterministic-v4'
241 | env_name = 'Pong-v0'
242 | agent = A2CAgent(env_name)
243 | agent.run()
244 | #agent.test('Pong-v0_A2C_2.5e-05_Actor.h5', '')
245 | #agent.test('PongDeterministic-v4_A2C_1e-05_Actor.h5', '')
246 |
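A small numeric illustration of the advantage used in replay() above: the advantage is the discounted return minus the critic's value estimate, and it replaces the raw return as the actor's sample weight. The numbers below are made up; only the arithmetic mirrors the code.

import numpy as np

discounted_r = np.array([ 1.5,  0.9, -0.3])   # standardized discounted returns G_t (made up)
values       = np.array([ 1.0,  1.0,  0.2])   # critic predictions V(s_t) (made up)
advantages   = discounted_r - values          # positive => the action did better than expected
print(advantages)                             # [ 0.5 -0.1 -0.5]

# The actor is then fit with sample_weight=advantages on the taken actions, while the critic
# is regressed toward discounted_r with an MSE loss, as in replay() above.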
--------------------------------------------------------------------------------
/09_Pong-v0_A2C/Pong-v0_A2C_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
6 | import random
7 | import gym
8 | import pylab
9 | import numpy as np
10 | from tensorflow.keras.models import Model, load_model
11 | from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
12 | from tensorflow.keras.optimizers import Adam, RMSprop
13 | from tensorflow.keras import backend as K
14 | import cv2
15 |
16 | def OurModel(input_shape, action_space, lr):
17 | X_input = Input(input_shape)
18 |
19 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
20 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
21 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
22 | X = Flatten(input_shape=input_shape)(X_input)
23 |
24 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
25 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
26 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)
27 |
28 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
29 | value = Dense(1, kernel_initializer='he_uniform')(X)
30 |
31 | Actor = Model(inputs = X_input, outputs = action)
32 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
33 |
34 | Critic = Model(inputs = X_input, outputs = value)
35 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr))
36 |
37 | return Actor, Critic
38 |
39 | class A2CAgent:
40 | # Actor-Critic Main Optimization Algorithm
41 | def __init__(self, env_name):
42 | # Initialization
43 |         # Environment and A2C parameters
44 | self.env_name = env_name
45 | self.env = gym.make(env_name)
46 | self.action_size = self.env.action_space.n
47 | self.EPISODES, self.max_average = 10000, -21.0 # specific for pong
48 | self.lr = 0.000025
49 |
50 | self.ROWS = 80
51 | self.COLS = 80
52 | self.REM_STEP = 4
53 |
54 | # Instantiate games and plot memory
55 | self.states, self.actions, self.rewards = [], [], []
56 | self.scores, self.episodes, self.average = [], [], []
57 |
58 | self.Save_Path = 'Models'
59 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
60 | self.image_memory = np.zeros(self.state_size)
61 |
62 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
63 | self.path = '{}_A2C_{}'.format(self.env_name, self.lr)
64 | self.Model_name = os.path.join(self.Save_Path, self.path)
65 |
66 | # Create Actor-Critic network model
67 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)
68 |
69 |
70 | def remember(self, state, action, reward):
71 | # store episode actions to memory
72 | self.states.append(state)
73 | action_onehot = np.zeros([self.action_size])
74 | action_onehot[action] = 1
75 | self.actions.append(action_onehot)
76 | self.rewards.append(reward)
77 |
78 |
79 | def act(self, state):
80 | # Use the network to predict the next action to take, using the model
81 | prediction = self.Actor.predict(state)[0]
82 | action = np.random.choice(self.action_size, p=prediction)
83 | return action
84 |
85 | def discount_rewards(self, reward):
86 | # Compute the gamma-discounted rewards over an episode
87 | gamma = 0.99 # discount rate
88 | running_add = 0
89 | discounted_r = np.zeros_like(reward)
90 | for i in reversed(range(0,len(reward))):
91 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
92 | running_add = 0
93 | running_add = running_add * gamma + reward[i]
94 | discounted_r[i] = running_add
95 |
96 | discounted_r -= np.mean(discounted_r) # normalizing the result
97 | discounted_r /= np.std(discounted_r) # divide by standard deviation
98 | return discounted_r
99 |
100 |
101 | def replay(self):
102 | # reshape memory to appropriate shape for training
103 | states = np.vstack(self.states)
104 | actions = np.vstack(self.actions)
105 |
106 | # Compute discounted rewards
107 | discounted_r = self.discount_rewards(self.rewards)
108 |
109 | # Get Critic network predictions
110 | values = self.Critic.predict(states)[:, 0]
111 | # Compute advantages
112 | advantages = discounted_r - values
113 | # training Actor and Critic networks
114 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
115 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0)
116 | # reset training memory
117 | self.states, self.actions, self.rewards = [], [], []
118 |
119 | def load(self, Actor_name, Critic_name):
120 | self.Actor = load_model(Actor_name, compile=False)
121 | #self.Critic = load_model(Critic_name, compile=False)
122 |
123 | def save(self):
124 | self.Actor.save(self.Model_name + '_Actor.h5')
125 | #self.Critic.save(self.Model_name + '_Critic.h5')
126 |
127 | pylab.figure(figsize=(18, 9))
128 | def PlotModel(self, score, episode):
129 | self.scores.append(score)
130 | self.episodes.append(episode)
131 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
132 |         if str(episode)[-2:] == "00": # update the plot every 100 episodes
133 | pylab.plot(self.episodes, self.scores, 'b')
134 | pylab.plot(self.episodes, self.average, 'r')
135 | pylab.ylabel('Score', fontsize=18)
136 |             pylab.xlabel('Episodes', fontsize=18)
137 | try:
138 | pylab.savefig(self.path+".png")
139 | except OSError:
140 | pass
141 |
142 | return self.average[-1]
143 |
144 | def imshow(self, image, rem_step=0):
145 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
146 | if cv2.waitKey(25) & 0xFF == ord("q"):
147 | cv2.destroyAllWindows()
148 | return
149 |
150 | def GetImage(self, frame):
151 |         # cropping the frame to 80x80 size
152 |         frame_cropped = frame[35:195:2, ::2,:]
153 |         if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
154 | # OpenCV resize function
155 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
156 |
157 |         # converting to grayscale with luminance weights (numpy way)
158 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
159 |
160 | # convert everything to black and white (agent will train faster)
161 | frame_rgb[frame_rgb < 100] = 0
162 | frame_rgb[frame_rgb >= 100] = 255
163 |         # converting to grayscale (OpenCV way)
164 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)
165 |
166 |         # dividing by 255 expresses the values in a 0-1 representation
167 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0
168 |
169 |         # push our data back by 1 frame, similar to how a deque works
170 | self.image_memory = np.roll(self.image_memory, 1, axis = 0)
171 |
172 | # inserting new frame to free space
173 | self.image_memory[0,:,:] = new_frame
174 |
175 | # show image frame
176 | #self.imshow(self.image_memory,0)
177 | #self.imshow(self.image_memory,1)
178 | #self.imshow(self.image_memory,2)
179 | #self.imshow(self.image_memory,3)
180 |
181 | return np.expand_dims(self.image_memory, axis=0)
182 |
183 | def reset(self):
184 | frame = self.env.reset()
185 | for i in range(self.REM_STEP):
186 | state = self.GetImage(frame)
187 | return state
188 |
189 | def step(self, action):
190 | next_state, reward, done, info = self.env.step(action)
191 | next_state = self.GetImage(next_state)
192 | return next_state, reward, done, info
193 |
194 | def run(self):
195 | for e in range(self.EPISODES):
196 | state = self.reset()
197 | done, score, SAVING = False, 0, ''
198 | while not done:
199 | #self.env.render()
200 | # Actor picks an action
201 | action = self.act(state)
202 | # Retrieve new state, reward, and whether the state is terminal
203 | next_state, reward, done, _ = self.step(action)
204 | # Memorize (state, action, reward) for training
205 | self.remember(state, action, reward)
206 | # Update current state
207 | state = next_state
208 | score += reward
209 | if done:
210 | average = self.PlotModel(score, e)
211 | # saving best models
212 | if average >= self.max_average:
213 | self.max_average = average
214 | self.save()
215 | SAVING = "SAVING"
216 | else:
217 | SAVING = ""
218 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
219 |
220 | self.replay()
221 |         # close the environment when training is finished
222 | self.env.close()
223 |
224 | def test(self, Actor_name, Critic_name):
225 | self.load(Actor_name, Critic_name)
226 | for e in range(100):
227 | state = self.reset()
228 | done = False
229 | score = 0
230 | while not done:
231 | action = np.argmax(self.Actor.predict(state))
232 | state, reward, done, _ = self.step(action)
233 | score += reward
234 | if done:
235 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
236 | break
237 | self.env.close()
238 |
239 | if __name__ == "__main__":
240 | #env_name = 'PongDeterministic-v4'
241 | env_name = 'Pong-v0'
242 | agent = A2CAgent(env_name)
243 | agent.run()
244 | #agent.test('Pong-v0_A2C_2.5e-05_Actor.h5', '')
245 | #agent.test('PongDeterministic-v4_A2C_1e-05_Actor.h5', '')
246 |
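A standalone sketch of the Pong frame preprocessing performed in GetImage above: crop to the play area, downsample by 2, convert to grayscale with luminance weights, binarize, and scale to [0, 1]. The random input array merely stands in for a 210x160x3 Atari RGB frame.

import numpy as np

def preprocess(frame):
    # crop the 210x160x3 Atari frame to the play area and downsample by 2 -> 80x80x3
    cropped = frame[35:195:2, ::2, :]
    # luminance-weighted grayscale conversion
    gray = 0.299 * cropped[:, :, 0] + 0.587 * cropped[:, :, 1] + 0.114 * cropped[:, :, 2]
    # binarize: background goes to 0, paddles/ball to 255, then scale to [0, 1]
    gray[gray < 100] = 0
    gray[gray >= 100] = 255
    return gray.astype(np.float32) / 255.0

frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)  # stand-in for an RGB frame
print(preprocess(frame).shape)  # (80, 80)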
--------------------------------------------------------------------------------
/10_Pong-v0_A3C/Pong-v0_A3C.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 1.15, Keras 2.2.4
3 |
4 | import os
5 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
6 | os.environ['CUDA_VISIBLE_DEVICES'] = '1'
7 | import random
8 | import gym
9 | import pylab
10 | import numpy as np
11 | from keras.models import Model, load_model
12 | from keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
13 | from keras.optimizers import Adam, RMSprop
14 | from keras import backend as K
15 | import cv2
16 | # imports needed for threading
17 | import tensorflow as tf
18 | from keras.backend.tensorflow_backend import set_session
19 | import threading
20 | from threading import Thread, Lock
21 | import time
22 |
23 | # configure Keras and TensorFlow sessions and graph
24 | config = tf.ConfigProto()
25 | config.gpu_options.allow_growth = True
26 | sess = tf.Session(config=config)
27 | set_session(sess)
28 | K.set_session(sess)
29 | graph = tf.get_default_graph()
30 |
31 | def OurModel(input_shape, action_space, lr):
32 | X_input = Input(input_shape)
33 |
34 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
35 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
36 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
37 | X = Flatten(input_shape=input_shape)(X_input)
38 |
39 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
40 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
41 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)
42 |
43 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
44 | value = Dense(1, kernel_initializer='he_uniform')(X)
45 |
46 | Actor = Model(inputs = X_input, outputs = action)
47 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
48 |
49 | Critic = Model(inputs = X_input, outputs = value)
50 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr))
51 |
52 | return Actor, Critic
53 |
54 | class A3CAgent:
55 | # Actor-Critic Main Optimization Algorithm
56 | def __init__(self, env_name):
57 | # Initialization
58 |         # Environment and A3C parameters
59 | self.env_name = env_name
60 | self.env = gym.make(env_name)
61 | self.action_size = self.env.action_space.n
62 | self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0 # specific for pong
63 | self.lock = Lock()
64 | self.lr = 0.000025
65 |
66 | self.ROWS = 80
67 | self.COLS = 80
68 | self.REM_STEP = 4
69 |
70 | # Instantiate plot memory
71 | self.scores, self.episodes, self.average = [], [], []
72 |
73 | self.Save_Path = 'Models'
74 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
75 |
76 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
77 | self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
78 | self.Model_name = os.path.join(self.Save_Path, self.path)
79 |
80 | # Create Actor-Critic network model
81 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)
82 |
83 |         # build the predict functions in advance so they can be called from multiple threads
84 | self.Actor._make_predict_function()
85 | self.Critic._make_predict_function()
86 |
87 | global graph
88 | graph = tf.get_default_graph()
89 |
90 | def act(self, state):
91 |         # Use the Actor network to predict the next action to take
92 | prediction = self.Actor.predict(state)[0]
93 | action = np.random.choice(self.action_size, p=prediction)
94 | return action
95 |
96 | def discount_rewards(self, reward):
97 | # Compute the gamma-discounted rewards over an episode
98 | gamma = 0.99 # discount rate
99 | running_add = 0
100 | discounted_r = np.zeros_like(reward)
101 | for i in reversed(range(0,len(reward))):
102 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
103 | running_add = 0
104 | running_add = running_add * gamma + reward[i]
105 | discounted_r[i] = running_add
106 |
107 | discounted_r -= np.mean(discounted_r) # normalizing the result
108 | discounted_r /= np.std(discounted_r) # divide by standard deviation
109 | return discounted_r
110 |
111 | def replay(self, states, actions, rewards):
112 | # reshape memory to appropriate shape for training
113 | states = np.vstack(states)
114 | actions = np.vstack(actions)
115 |
116 | # Compute discounted rewards
117 | discounted_r = self.discount_rewards(rewards)
118 |
119 | # Get Critic network predictions
120 | value = self.Critic.predict(states)[:, 0]
121 | # Compute advantages
122 | advantages = discounted_r - value
123 | # training Actor and Critic networks
124 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
125 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0)
126 |
127 | def load(self, Actor_name, Critic_name):
128 | self.Actor = load_model(Actor_name, compile=False)
129 | #self.Critic = load_model(Critic_name, compile=False)
130 |
131 | def save(self):
132 | self.Actor.save(self.Model_name + '_Actor.h5')
133 | #self.Critic.save(self.Model_name + '_Critic.h5')
134 |
135 | pylab.figure(figsize=(18, 9))
136 | def PlotModel(self, score, episode):
137 | self.scores.append(score)
138 | self.episodes.append(episode)
139 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
140 |         if str(episode)[-2:] == "00":  # update the plot every 100 episodes
141 | pylab.plot(self.episodes, self.scores, 'b')
142 | pylab.plot(self.episodes, self.average, 'r')
143 | pylab.ylabel('Score', fontsize=18)
144 |             pylab.xlabel('Episode', fontsize=18)
145 | try:
146 | pylab.savefig(self.path+".png")
147 | except OSError:
148 | pass
149 |
150 | return self.average[-1]
151 |
152 | def imshow(self, image, rem_step=0):
153 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
154 | if cv2.waitKey(25) & 0xFF == ord("q"):
155 | cv2.destroyAllWindows()
156 | return
157 |
158 | def GetImage(self, frame, image_memory):
159 | if image_memory.shape == (1,*self.state_size):
160 | image_memory = np.squeeze(image_memory)
161 |
162 |         # cropping the frame to 80x80
163 | frame_cropped = frame[35:195:2, ::2,:]
164 |         if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
165 | # OpenCV resize function
166 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
167 |
168 |         # converting to grayscale with luminance weights (numpy way)
169 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
170 |
171 |         # threshold everything to black and white (the agent trains faster)
172 | frame_rgb[frame_rgb < 100] = 0
173 | frame_rgb[frame_rgb >= 100] = 255
174 |         # converting to grayscale (OpenCV way)
175 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)
176 |
177 |         # dividing by 255 scales the values to the 0-1 range
178 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0
179 |
180 |         # shift our data by 1 frame, similar to how a deque works
181 | image_memory = np.roll(image_memory, 1, axis = 0)
182 |
183 |         # inserting the new frame into the freed slot
184 | image_memory[0,:,:] = new_frame
185 |
186 | # show image frame
187 | #self.imshow(image_memory,0)
188 | #self.imshow(image_memory,1)
189 | #self.imshow(image_memory,2)
190 | #self.imshow(image_memory,3)
191 |
192 | return np.expand_dims(image_memory, axis=0)
193 |
194 | def reset(self, env):
195 | image_memory = np.zeros(self.state_size)
196 | frame = env.reset()
197 | for i in range(self.REM_STEP):
198 | state = self.GetImage(frame, image_memory)
199 | return state
200 |
201 | def step(self, action, env, image_memory):
202 | next_state, reward, done, info = env.step(action)
203 | next_state = self.GetImage(next_state, image_memory)
204 | return next_state, reward, done, info
205 |
206 | def run(self):
207 | for e in range(self.EPISODES):
208 | state = self.reset(self.env)
209 | done, score, SAVING = False, 0, ''
210 | # Instantiate or reset games memory
211 | states, actions, rewards = [], [], []
212 | while not done:
213 | #self.env.render()
214 | # Actor picks an action
215 | action = self.act(state)
216 | # Retrieve new state, reward, and whether the state is terminal
217 | next_state, reward, done, _ = self.step(action, self.env, state)
218 | # Memorize (state, action, reward) for training
219 | states.append(state)
220 | action_onehot = np.zeros([self.action_size])
221 | action_onehot[action] = 1
222 | actions.append(action_onehot)
223 | rewards.append(reward)
224 | # Update current state
225 | state = next_state
226 | score += reward
227 | if done:
228 | average = self.PlotModel(score, e)
229 | # saving best models
230 | if average >= self.max_average:
231 | self.max_average = average
232 | self.save()
233 | SAVING = "SAVING"
234 | else:
235 | SAVING = ""
236 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
237 |
238 | self.replay(states, actions, rewards)
239 |             # close the environment when training finishes
240 | self.env.close()
241 |
242 | def train(self, n_threads):
243 | self.env.close()
244 | # Instantiate one environment per thread
245 | envs = [gym.make(self.env_name) for i in range(n_threads)]
246 |
247 | # Create threads
248 | threads = [threading.Thread(
249 | target=self.train_threading,
250 | daemon=True,
251 | args=(self,
252 | envs[i],
253 | i)) for i in range(n_threads)]
254 |
255 | for t in threads:
256 | time.sleep(2)
257 | t.start()
258 |
259 | for t in threads:
260 | time.sleep(10)
261 | t.join()
262 |
263 | def train_threading(self, agent, env, thread):
264 | global graph
265 | with graph.as_default():
266 | while self.episode < self.EPISODES:
267 | # Reset episode
268 | score, done, SAVING = 0, False, ''
269 | state = self.reset(env)
270 | # Instantiate or reset games memory
271 | states, actions, rewards = [], [], []
272 | while not done:
273 | action = agent.act(state)
274 | next_state, reward, done, _ = self.step(action, env, state)
275 |
276 | states.append(state)
277 | action_onehot = np.zeros([self.action_size])
278 | action_onehot[action] = 1
279 | actions.append(action_onehot)
280 | rewards.append(reward)
281 |
282 | score += reward
283 | state = next_state
284 |
285 | self.lock.acquire()
286 | self.replay(states, actions, rewards)
287 | self.lock.release()
288 |
289 | # Update episode count
290 | with self.lock:
291 | average = self.PlotModel(score, self.episode)
292 | # saving best models
293 | if average >= self.max_average:
294 | self.max_average = average
295 | self.save()
296 | SAVING = "SAVING"
297 | else:
298 | SAVING = ""
299 | print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
300 | if(self.episode < self.EPISODES):
301 | self.episode += 1
302 | env.close()
303 |
304 | def test(self, Actor_name, Critic_name):
305 | self.load(Actor_name, Critic_name)
306 | for e in range(100):
307 | state = self.reset(self.env)
308 | done = False
309 | score = 0
310 | while not done:
311 | self.env.render()
312 | action = np.argmax(self.Actor.predict(state))
313 | state, reward, done, _ = self.step(action, self.env, state)
314 | score += reward
315 | if done:
316 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
317 | break
318 | self.env.close()
319 |
320 | if __name__ == "__main__":
321 | #env_name = 'PongDeterministic-v4'
322 | env_name = 'Pong-v0'
323 | agent = A3CAgent(env_name)
324 | #agent.run() # use as A2C
325 | #agent.train(n_threads=5) # use as A3C
326 | agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '')
327 |
--------------------------------------------------------------------------------
/10_Pong-v0_A3C/Pong-v0_A3C_TF2.py:
--------------------------------------------------------------------------------
1 | # Tutorial by www.pylessons.com
2 | # Tutorial written for - Tensorflow 2.3.1
3 |
4 | import os
5 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
6 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
7 | import random
8 | import gym
9 | import pylab
10 | import numpy as np
11 | import tensorflow as tf
12 | from tensorflow.keras.models import Model, load_model
13 | from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
14 | from tensorflow.keras.optimizers import Adam, RMSprop
15 | from tensorflow.keras import backend as K
16 | import cv2
17 | import threading
18 | from threading import Thread, Lock
19 | import time
20 |
21 | gpus = tf.config.experimental.list_physical_devices('GPU')
22 | if len(gpus) > 0:
23 | print(f'GPUs {gpus}')
24 | try: tf.config.experimental.set_memory_growth(gpus[0], True)
25 | except RuntimeError: pass
26 |
27 | def OurModel(input_shape, action_space, lr):
28 | X_input = Input(input_shape)
29 |
30 | #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
31 | #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
32 | #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
33 | X = Flatten(input_shape=input_shape)(X_input)
34 |
35 | X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
36 | #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
37 | #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)
38 |
39 | action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
40 | value = Dense(1, kernel_initializer='he_uniform')(X)
41 |
42 | Actor = Model(inputs = X_input, outputs = action)
43 | Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))
44 |
45 | Critic = Model(inputs = X_input, outputs = value)
46 | Critic.compile(loss='mse', optimizer=RMSprop(lr=lr))
47 |
48 | return Actor, Critic
49 |
50 | class A3CAgent:
51 | # Actor-Critic Main Optimization Algorithm
52 | def __init__(self, env_name):
53 | # Initialization
54 |         # Environment and A3C parameters
55 | self.env_name = env_name
56 | self.env = gym.make(env_name)
57 | self.action_size = self.env.action_space.n
58 | self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0 # specific for pong
59 | self.lock = Lock()
60 | self.lr = 0.000025
61 |
62 | self.ROWS = 80
63 | self.COLS = 80
64 | self.REM_STEP = 4
65 |
66 | # Instantiate plot memory
67 | self.scores, self.episodes, self.average = [], [], []
68 |
69 | self.Save_Path = 'Models'
70 | self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
71 |
72 | if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
73 | self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
74 | self.Model_name = os.path.join(self.Save_Path, self.path)
75 |
76 | # Create Actor-Critic network model
77 | self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)
78 |
79 | def act(self, state):
80 | # Use the network to predict the next action to take, using the model
81 | prediction = self.Actor.predict(state)[0]
82 | action = np.random.choice(self.action_size, p=prediction)
83 | return action
84 |
85 | def discount_rewards(self, reward):
86 | # Compute the gamma-discounted rewards over an episode
87 | gamma = 0.99 # discount rate
88 | running_add = 0
89 | discounted_r = np.zeros_like(reward)
90 | for i in reversed(range(0,len(reward))):
91 | if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
92 | running_add = 0
93 | running_add = running_add * gamma + reward[i]
94 | discounted_r[i] = running_add
95 |
96 | discounted_r -= np.mean(discounted_r) # normalizing the result
97 | discounted_r /= np.std(discounted_r) # divide by standard deviation
98 | return discounted_r
99 |
100 | def replay(self, states, actions, rewards):
101 | # reshape memory to appropriate shape for training
102 | states = np.vstack(states)
103 | actions = np.vstack(actions)
104 |
105 | # Compute discounted rewards
106 | discounted_r = self.discount_rewards(rewards)
107 |
108 | # Get Critic network predictions
109 | value = self.Critic.predict(states)[:, 0]
110 | # Compute advantages
111 | advantages = discounted_r - value
112 | # training Actor and Critic networks
113 | self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
114 | self.Critic.fit(states, discounted_r, epochs=1, verbose=0)
115 |
116 | def load(self, Actor_name, Critic_name):
117 | self.Actor = load_model(Actor_name, compile=False)
118 | #self.Critic = load_model(Critic_name, compile=False)
119 |
120 | def save(self):
121 | self.Actor.save(self.Model_name + '_Actor.h5')
122 | #self.Critic.save(self.Model_name + '_Critic.h5')
123 |
124 | pylab.figure(figsize=(18, 9))
125 | def PlotModel(self, score, episode):
126 | self.scores.append(score)
127 | self.episodes.append(episode)
128 | self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
129 |         if str(episode)[-2:] == "00":  # update the plot every 100 episodes
130 | pylab.plot(self.episodes, self.scores, 'b')
131 | pylab.plot(self.episodes, self.average, 'r')
132 | pylab.ylabel('Score', fontsize=18)
133 |             pylab.xlabel('Episode', fontsize=18)
134 | try:
135 | pylab.savefig(self.path+".png")
136 | except OSError:
137 | pass
138 |
139 | return self.average[-1]
140 |
141 | def imshow(self, image, rem_step=0):
142 | cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
143 | if cv2.waitKey(25) & 0xFF == ord("q"):
144 | cv2.destroyAllWindows()
145 | return
146 |
147 | def GetImage(self, frame, image_memory):
148 | if image_memory.shape == (1,*self.state_size):
149 | image_memory = np.squeeze(image_memory)
150 |
151 |         # cropping the frame to 80x80
152 | frame_cropped = frame[35:195:2, ::2,:]
153 |         if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
154 | # OpenCV resize function
155 | frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)
156 |
157 |         # converting to grayscale with luminance weights (numpy way)
158 | frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
159 |
160 |         # threshold everything to black and white (the agent trains faster)
161 | frame_rgb[frame_rgb < 100] = 0
162 | frame_rgb[frame_rgb >= 100] = 255
163 |         # converting to grayscale (OpenCV way)
164 | #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)
165 |
166 |         # dividing by 255 scales the values to the 0-1 range
167 | new_frame = np.array(frame_rgb).astype(np.float32) / 255.0
168 |
169 |         # shift our data by 1 frame, similar to how a deque works
170 | image_memory = np.roll(image_memory, 1, axis = 0)
171 |
172 |         # inserting the new frame into the freed slot
173 | image_memory[0,:,:] = new_frame
174 |
175 | # show image frame
176 | #self.imshow(image_memory,0)
177 | #self.imshow(image_memory,1)
178 | #self.imshow(image_memory,2)
179 | #self.imshow(image_memory,3)
180 |
181 | return np.expand_dims(image_memory, axis=0)
182 |
183 | def reset(self, env):
184 | image_memory = np.zeros(self.state_size)
185 | frame = env.reset()
186 | for i in range(self.REM_STEP):
187 | state = self.GetImage(frame, image_memory)
188 | return state
189 |
190 | def step(self, action, env, image_memory):
191 | next_state, reward, done, info = env.step(action)
192 | next_state = self.GetImage(next_state, image_memory)
193 | return next_state, reward, done, info
194 |
195 | def run(self):
196 | for e in range(self.EPISODES):
197 | state = self.reset(self.env)
198 | done, score, SAVING = False, 0, ''
199 | # Instantiate or reset games memory
200 | states, actions, rewards = [], [], []
201 | while not done:
202 | #self.env.render()
203 | # Actor picks an action
204 | action = self.act(state)
205 | # Retrieve new state, reward, and whether the state is terminal
206 | next_state, reward, done, _ = self.step(action, self.env, state)
207 | # Memorize (state, action, reward) for training
208 | states.append(state)
209 | action_onehot = np.zeros([self.action_size])
210 | action_onehot[action] = 1
211 | actions.append(action_onehot)
212 | rewards.append(reward)
213 | # Update current state
214 | state = next_state
215 | score += reward
216 | if done:
217 | average = self.PlotModel(score, e)
218 | # saving best models
219 | if average >= self.max_average:
220 | self.max_average = average
221 | self.save()
222 | SAVING = "SAVING"
223 | else:
224 | SAVING = ""
225 | print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
226 |
227 | self.replay(states, actions, rewards)
228 |             # close the environment when training finishes
229 | self.env.close()
230 |
231 | def train(self, n_threads):
232 | self.env.close()
233 | # Instantiate one environment per thread
234 | envs = [gym.make(self.env_name) for i in range(n_threads)]
235 |
236 | # Create threads
237 | threads = [threading.Thread(
238 | target=self.train_threading,
239 | daemon=True,
240 | args=(self,
241 | envs[i],
242 | i)) for i in range(n_threads)]
243 |
244 | for t in threads:
245 | time.sleep(2)
246 | t.start()
247 |
248 | for t in threads:
249 | time.sleep(10)
250 | t.join()
251 |
252 | def train_threading(self, agent, env, thread):
253 | while self.episode < self.EPISODES:
254 | # Reset episode
255 | score, done, SAVING = 0, False, ''
256 | state = self.reset(env)
257 | # Instantiate or reset games memory
258 | states, actions, rewards = [], [], []
259 | while not done:
260 | action = agent.act(state)
261 | next_state, reward, done, _ = self.step(action, env, state)
262 |
263 | states.append(state)
264 | action_onehot = np.zeros([self.action_size])
265 | action_onehot[action] = 1
266 | actions.append(action_onehot)
267 | rewards.append(reward)
268 |
269 | score += reward
270 | state = next_state
271 |
272 | self.lock.acquire()
273 | self.replay(states, actions, rewards)
274 | self.lock.release()
275 |
276 | # Update episode count
277 | with self.lock:
278 | average = self.PlotModel(score, self.episode)
279 | # saving best models
280 | if average >= self.max_average:
281 | self.max_average = average
282 | self.save()
283 | SAVING = "SAVING"
284 | else:
285 | SAVING = ""
286 | print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
287 | if(self.episode < self.EPISODES):
288 | self.episode += 1
289 | env.close()
290 |
291 | def test(self, Actor_name, Critic_name):
292 | self.load(Actor_name, Critic_name)
293 | for e in range(100):
294 | state = self.reset(self.env)
295 | done = False
296 | score = 0
297 | while not done:
298 | self.env.render()
299 | action = np.argmax(self.Actor.predict(state))
300 | state, reward, done, _ = self.step(action, self.env, state)
301 | score += reward
302 | if done:
303 | print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
304 | break
305 | self.env.close()
306 |
307 | if __name__ == "__main__":
308 | env_name = 'PongDeterministic-v4'
309 | #env_name = 'Pong-v0'
310 | agent = A3CAgent(env_name)
311 | #agent.run() # use as A2C
312 | agent.train(n_threads=5) # use as A3C
313 | #agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '')
314 |
--------------------------------------------------------------------------------
/10_Pong-v0_A3C/PongDeterministic-v4_A3C_2.5e-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/10_Pong-v0_A3C/PongDeterministic-v4_A3C_2.5e-05.png
--------------------------------------------------------------------------------
/11_Pong-v0_PPO/Models/Pong-v0_APPO_0.0001_Actor_CNN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/Models/Pong-v0_APPO_0.0001_Actor_CNN.h5
--------------------------------------------------------------------------------
/11_Pong-v0_PPO/Pong-v0_APPO_0.0001_CNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/Pong-v0_APPO_0.0001_CNN.png
--------------------------------------------------------------------------------
/11_Pong-v0_PPO/Pong-v0_APPO_0.0001_RMSprop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/Pong-v0_APPO_0.0001_RMSprop.png
--------------------------------------------------------------------------------
/11_Pong-v0_PPO/PongDeterministic-v4_APPO_0.0001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/PongDeterministic-v4_APPO_0.0001.png
--------------------------------------------------------------------------------
/11_Pong-v0_PPO/gameplay.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/gameplay.gif
--------------------------------------------------------------------------------
/11_Pong-v0_PPO/gameplay_CNN.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/11_Pong-v0_PPO/gameplay_CNN.gif
--------------------------------------------------------------------------------
/BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Actor.h5
--------------------------------------------------------------------------------
/BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/BipedalWalker-v3_PPO_Critic.h5
--------------------------------------------------------------------------------
/BipedalWalker-v3_PPO/BipedalWalker-v3_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/BipedalWalker-v3_training.png
--------------------------------------------------------------------------------
/BipedalWalker-v3_PPO/gameplay.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/BipedalWalker-v3_PPO/gameplay.gif
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Rokas
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/LunarLander-v2_PPO/LunarLander-v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/LunarLander-v2.png
--------------------------------------------------------------------------------
/LunarLander-v2_PPO/LunarLander-v2_PPO_Actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/LunarLander-v2_PPO_Actor.h5
--------------------------------------------------------------------------------
/LunarLander-v2_PPO/LunarLander-v2_PPO_Critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/LunarLander-v2_PPO_Critic.h5
--------------------------------------------------------------------------------
/LunarLander-v2_PPO/gameplay.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonlessons/Reinforcement_Learning/c9717f523fb9bd4bb8ccb5b34bd6ee6c76ea21b6/LunarLander-v2_PPO/gameplay.gif
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Tutorials:
2 |
3 | *2020-10-07 added support for Tensorflow 2.3.1*
4 |
5 | PPO and PPO_CNN agents playing Pong-v0 game:
6 | 
7 | 
8 |
9 | *2020-10-10 added LunarLander-v2_PPO Continuous code for Tensorflow 2.3.1*:
10 | 
11 |
12 | *2020-10-23 added BipedalWalker-v3_PPO code for Tensorflow 2.3.1*:
13 | 
14 |
15 | 1. [Deep Q Learning tutorial (DQN)](https://pylessons.com/CartPole-reinforcement-learning/)
16 |
17 | 2. [Double Deep Q Learning tutorial (DDQN)](https://pylessons.com/CartPole-DDQN/)
18 |
19 | 3. [Dueling Double Deep Q Learning tutorial (D3QN)](https://pylessons.com/CartPole-DDDQN/)
20 |
21 | 4. [Epsilon Greedy Dueling Double Deep Q Learning tutorial (D3QN)](https://pylessons.com/Epsilon-Greedy-DQN/)
22 |
23 | 5. [Prioritized Experience Replay (PER) D3QN tutorial](https://pylessons.com/CartPole-PER/)
24 |
25 | 6. [D3QN PER with Convolutional Neural Networks tutorial](https://pylessons.com/CartPole-PER-CNN/)
26 |
27 | 7. [A.I. learns to play Pong with DQN](https://pylessons.com/DQN-PONG/)
28 |
29 | 8. [Introduction to RL Policy Gradient (PG or REINFORCE)](https://pylessons.com/Beyond-DQN/)
30 |
31 | 9. [Introduction to the RL Advantage Actor-Critic algorithm (A2C)](https://pylessons.com/A2C-reinforcement-learning/)
32 |
33 | 10. [Introduction to the RL Asynchronous Advantage Actor-Critic algorithm (A3C)](https://pylessons.com/A3C-reinforcement-learning/)
34 |
35 | 11. [Introduction to the RL Proximal Policy Optimization algorithm (PPO)](https://pylessons.com/PPO-reinforcement-learning/)
36 |
37 | 12. [Let’s code from scratch a discrete Reinforcement Learning rocket landing agent! (PPO)](https://pylessons.com/LunarLander-v2-PPO/)
38 |
39 | 13. [Continuous Proximal Policy Optimization Tutorial with OpenAI gym environment! (PPO)](https://pylessons.com/BipedalWalker-v3-PPO/)
40 |
41 | PPO Pong-v0 Learning curve:
42 |
43 |
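44 | The training scripts are run directly; for example, the `__main__` block of `10_Pong-v0_A3C/Pong-v0_A3C_TF2.py` chooses between single-threaded training (`run`), asynchronous multi-threaded training (`train`) and evaluation (`test`). Below is a minimal usage sketch mirroring that entry point (install the dependencies first with `pip install -r requirements.txt`):
45 |
46 | ```python
47 | # sketch of the A3C script's entry point (Pong-v0_A3C_TF2.py)
48 | env_name = 'PongDeterministic-v4'
49 | agent = A3CAgent(env_name)
50 | # agent.run()               # single-threaded training, behaves like A2C
51 | agent.train(n_threads=5)    # asynchronous A3C training across 5 worker environments
52 | # agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '')
53 | ```
54 |
55 | Comment or uncomment the call you need before running; the A2C script follows the same pattern.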
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | tensorflow==2.3.1
4 | tensorflow-gpu==2.3.1
5 | opencv-python
6 | matplotlib
7 | tensorboardx
8 | pandas
9 | gym[all]
10 | box2d-py
11 |
--------------------------------------------------------------------------------