├── _config.yml ├── Misc ├── Graph.png ├── Loss.jpg ├── Plot.png ├── Q-NN.jpg ├── Initial.gif ├── NextGen.gif ├── Q-table.jpg ├── Target.jpg ├── Double Q.png ├── Estimation.jpg └── Q-learning.jpg ├── LICENSE ├── README.md └── Code source └── Lunar_Lander_v2.py /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /Misc/Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Graph.png -------------------------------------------------------------------------------- /Misc/Loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Loss.jpg -------------------------------------------------------------------------------- /Misc/Plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Plot.png -------------------------------------------------------------------------------- /Misc/Q-NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Q-NN.jpg -------------------------------------------------------------------------------- /Misc/Initial.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Initial.gif -------------------------------------------------------------------------------- /Misc/NextGen.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/NextGen.gif -------------------------------------------------------------------------------- /Misc/Q-table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Q-table.jpg -------------------------------------------------------------------------------- /Misc/Target.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Target.jpg -------------------------------------------------------------------------------- /Misc/Double Q.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Double Q.png -------------------------------------------------------------------------------- /Misc/Estimation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Estimation.jpg -------------------------------------------------------------------------------- /Misc/Q-learning.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks/HEAD/Misc/Q-learning.jpg 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nhu Nhat Anh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Lunar-Lander-Double-Deep-Q-Networks 2 | An AI agent that uses Double Deep Q-Learning to learn by itself how to land a Lunar Lander in the OpenAI Gym environment 3 | # AI-Lunar-Lander-v2-Keras (TF Backend) 4 | A reinforcement learning agent that uses a Deep Q-Network to play Lunar Lander 5 | 6 | 7 | Algorithm Details and Hyperparameters: 8 | =============== 9 | * Implementation: Keras (TF backend) 10 | * Algorithm: Double Deep Q-Network (a policy network and a target network) 11 | * Each neural network has the same structure: 2 fully connected hidden layers, each with 128 nodes. 12 | * Optimization algorithm: Adaptive Moment Estimation (Adam) 13 | * Learning rate: **α = 0.0001** 14 | * Discount factor: **γ = 0.99** 15 | * Minimum exploration rate: **ε = 0.1** 16 | * Replay memory size: **10^6** 17 | * Mini batch size: **2^6** 18 |
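For reference, here is a minimal sketch of one of the two Q-networks configured with the hyperparameters listed above. It mirrors the model built in `Code source/Lunar_Lander_v2.py`; the standalone imports and the `build_q_network` helper name are illustrative only.

```python
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

state_size, action_size = 8, 4   # LunarLander-v2: 8 state variables, 4 discrete actions

def build_q_network():
    # Two fully connected hidden layers of 128 units, one linear output per action
    model = Sequential([
        Dense(128, input_dim=state_size, activation="relu"),
        Dense(128, activation="relu"),
        Dense(action_size, activation="linear"),
    ])
    model.compile(loss="mse", optimizer=Adam(learning_rate=0.0001))
    return model
```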
19 | **Complete evolution (training process): https://www.youtube.com/watch?v=XopVALk2xb4&t=286s** 20 |

21 | 22 | Description of the problem 23 | =============== 24 | 25 | * The agent has to learn how to land the Lunar Lander on the Moon's surface safely, quickly, and accurately. 26 | * If the agent just lets the lander fall freely, the landing is dangerous and it receives a very negative reward from the environment. 27 | * If the agent does not land quickly enough (within about 20 seconds), it fails its objective and receives a negative reward from the environment. 28 | * If the agent lands safely but in the wrong position, it is given either a small negative or a small positive reward, depending on how far the lander is from the landing zone. 29 | * If the agent lands in the landing zone quickly and safely, it succeeds and is awarded a very positive reward. 30 | 31 | Double Deep Q-Networks (DDQN): 32 | =============== 33 | * Since the state space is continuous (effectively infinite), the traditional Q-table method does not work on this problem, so we combine Q-learning with a neural network for value approximation. The action space, however, remains discrete. 34 | 35 | **Q-learning:**
36 | `Q(s,a) ← Q(s,a) + α [ r + γ max_a' Q(s',a') − Q(s,a) ]`

37 | 38 | The equation above is based on the Bellman equation. You can work through a small example MDP to see intuitively why the Q-learning update converges to the optimal values, and therefore to the optimal policy. 39 | 40 | * For Deep Q-learning, we use a neural network to approximate the Q-values at each time step, and then update the network so that the estimate Q(s,a) approaches its target (a short sketch follows this list):
41 | * *(figures: the network's Q-value estimate, its TD target, and the mean-squared-error loss between them)*
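As a concrete sketch of the update described above: the policy network's estimate Q(s,a) is regressed (with MSE loss) toward the one-step TD target r + γ max_a' Q_target(s',a'), and the target is just r for terminal transitions. The same computation appears inside `learn()` in `Code source/Lunar_Lander_v2.py`; the helper below simply isolates it, and its name and signature are mine.

```python
import numpy as np

def td_targets(rewards, next_states, dones, q_target_net, gamma=0.99):
    """One-step TD targets: r + gamma * max_a' Q_target(s', a'), or just r at terminal states."""
    q_next = q_target_net.predict(next_states)           # shape: (batch, n_actions)
    max_q_next = np.max(q_next, axis=1)                   # greedy value of the next state
    return rewards + gamma * max_q_next * (1.0 - dones)   # dones in {0, 1} masks out terminal states
```

With a Q-table this target would simply be blended into the entry Q[s, a]; with a network, it becomes the regression label for the output unit of the action that was taken.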
44 | 45 | 46 | 47 | **Difference between Q-learning and DQN:**


48 |
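In short: tabular Q-learning stores one value per discrete (state, action) pair, whereas DQN replaces the table with a parameterized function Q(s; θ) that maps a continuous state to the Q-values of all actions at once. A toy illustration (the random linear "network" below only stands in for the Keras model):

```python
import numpy as np

rng = np.random.default_rng(0)

# Tabular Q-learning: feasible only when the states can be enumerated.
n_states, n_actions = 16, 4
q_table = np.zeros((n_states, n_actions))   # lookup: q_table[s, a]

# DQN: a function of a continuous state vector, e.g. the lander's position, velocity, angle, ...
weights = rng.normal(size=(8, n_actions))   # stand-in for the trained network parameters
state = rng.normal(size=8)
q_values = state @ weights                  # Q(s, a) for every action at once
best_action = int(np.argmax(q_values))
```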

49 | 50 |

51 | 52 | * Purpose of using a Double Deep Q-Network: 53 | * To stabilize the target Q-values and help the training converge. 54 | * Reference: https://arxiv.org/abs/1509.06461 55 | 56 |
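For reference, a minimal sketch of the Double DQN target described in the paper linked above: the policy (online) network selects the next action, while the target network evaluates it, which reduces the over-estimation bias of taking a single max. Note that `learn()` in `Code source/Lunar_Lander_v2.py` takes the max over the target network directly (the standard DQN target with a target network); the decoupled version below follows the paper, and the function name and signature are mine.

```python
import numpy as np

def double_dqn_targets(rewards, next_states, dones, q_policy_net, q_target_net, gamma=0.99):
    """Double DQN target: action selection by the policy net, evaluation by the target net."""
    next_actions = np.argmax(q_policy_net.predict(next_states), axis=1)   # a* = argmax_a Q_policy(s', a)
    q_next = q_target_net.predict(next_states)                            # Q_target(s', .)
    chosen = q_next[np.arange(len(next_actions)), next_actions]           # Q_target(s', a*)
    return rewards + gamma * chosen * (1.0 - dones)
```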

57 | 58 |
Tabular Q-learning is proven to converge to the optimal policy; with deep-network function approximation there is no such formal guarantee, but empirically a Deep Q-Network with a target network learns a near-optimal policy for this problem in a reasonable amount of time. 59 | 60 | 61 | Training Result: 62 | =============== 63 |

64 | **Before training:**

65 | 66 | 67 | **After 800 games:**

68 | 69 | 70 |

71 | **Learning curve:**

72 |
73 | 74 | * The blue curve shows the reward the agent earned in each episode. 75 | * The red curve shows, for each episode on the x-axis, the average reward over that episode and the 99 preceding ones. In other words, it is the average reward of the 100 most recent episodes. 76 | * The blue curve is much noisier because exploration is kept at ε = 0.1 throughout training and because the value approximation is still poor during the first episodes. 77 | * Averaging the 100 most recent rewards, however, produces a much smoother curve. 78 | * From the red curve we can conclude that the agent has learned a good policy for the Lunar Lander problem according to the OpenAI Gym criterion: an average reward of at least 200 over 100 consecutive episodes. 79 | 80 |
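The red curve can be recomputed offline from the per-episode rewards; a small sketch follows (the training script builds the same running average online with a `deque(maxlen=100)`), with the 200-point threshold being the standard Gym "solved" criterion mentioned above:

```python
import numpy as np

def moving_average(rewards, window=100):
    """Average of the last `window` episode rewards at each episode (the red curve)."""
    rewards = np.asarray(rewards, dtype=float)
    return np.array([rewards[max(0, i - window + 1): i + 1].mean() for i in range(len(rewards))])

def is_solved(episode_rewards, threshold=200.0):
    # The task counts as solved once the 100-episode average reaches the threshold.
    return bool(np.any(moving_average(episode_rewards) >= threshold))
```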

81 | -------------------------------------------------------------------------------- /Code source/Lunar_Lander_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | 5 | import tensorflow.compat.v1 as tf 6 | from tensorflow.keras import Model, Sequential 7 | from tensorflow.keras.layers import Dense, Embedding, Reshape 8 | from tensorflow.keras.optimizers import Adam 9 | 10 | import matplotlib.pyplot as plt 11 | import random 12 | from collections import deque 13 | import time 14 | tf.disable_v2_behavior() # testing on tensorflow 1 15 | 16 | class Agent: 17 | def __init__(self, env, optimizer, batch_size): 18 | # general info 19 | self.state_size = env.observation_space.shape[0] # number of factors in the state; e.g: velocity, position, etc 20 | self.action_size = env.action_space.n 21 | self.optimizer = optimizer 22 | self.batch_size = batch_size 23 | 24 | # allow large replay exp space 25 | self.replay_exp = deque(maxlen=1000000) 26 | 27 | self.gamma = 0.99 28 | self.epsilon = 1.0 # initialize with high exploration, which will decay later 29 | 30 | # Build Policy Network 31 | self.brain_policy = Sequential() 32 | self.brain_policy.add(Dense(128, input_dim = self.state_size, activation = "relu")) 33 | self.brain_policy.add(Dense(128 , activation = "relu")) 34 | self.brain_policy.add(Dense(self.action_size, activation = "linear")) 35 | self.brain_policy.compile(loss = "mse", optimizer = self.optimizer) 36 | 37 | 38 | # Build Target Network 39 | self.brain_target = Sequential() 40 | self.brain_target.add(Dense(128, input_dim = self.state_size, activation = "relu")) 41 | self.brain_target.add(Dense(128 , activation = "relu")) 42 | self.brain_target.add(Dense(self.action_size, activation = "linear")) 43 | self.brain_target.compile(loss = "mse", optimizer = self.optimizer) 44 | 45 | 46 | self.update_brain_target() 47 | 48 | # add new experience to the replay exp 49 | def memorize_exp(self, state, action, reward, next_state, done): 50 | self.replay_exp.append((state, action, reward, next_state, done)) 51 | 52 | """ 53 | # agent's brain 54 | def build_model(self): 55 | # a NN with 2 fully connected hidden layers 56 | model = Sequential() 57 | model.add(Dense(128, input_dim = self.state_size, activation = "relu")) 58 | model.add(Dense(128 , activation = "relu")) 59 | model.add(Dense(self.action_size, activation = "linear")) 60 | model.compile(loss = "mse", optimizer = self.optimizer) 61 | 62 | return model 63 | """ 64 | 65 | def update_brain_target(self): 66 | return self.brain_target.set_weights(self.brain_policy.get_weights()) 67 | 68 | def choose_action(self, state): 69 | if np.random.uniform(0.0, 1.0) < self.epsilon: # exploration 70 | action = np.random.choice(self.action_size) 71 | else: 72 | state = np.reshape(state, [1, state_size]) 73 | qhat = self.brain_policy.predict(state) # output Q(s,a) for all a of current state 74 | action = np.argmax(qhat[0]) # because the output is m * n, so we need to consider the dimension [0] 75 | 76 | return action 77 | 78 | # update params in NN 79 | def learn(self): 80 | """ 81 | sample = random.choices(self.replay_exp, k = min(len(self.replay_exp), self.batch_size)) 82 | 83 | 84 | states, actions, rewards, next_states, dones = map(list, zip(sample)) 85 | 86 | # add exp to replay exp 87 | qhats_next = self.brain_target(next_states) 88 | 89 | # set all value actions of terminal state to 0 90 | qhats_next[dones] = np.zeros((self.action_size)) 91 | 92 | q_targets = rewards + 
self.gamma * np.max(qhats_next, axis=1) # update greedily 93 | 94 | self.brain.update_nn(self.sess, states, actions, q_targets) 95 | 96 | """ 97 | 98 | # take a mini-batch from replay experience 99 | cur_batch_size = min(len(self.replay_exp), self.batch_size) 100 | mini_batch = random.sample(self.replay_exp, cur_batch_size) 101 | 102 | # batch data 103 | sample_states = np.ndarray(shape = (cur_batch_size, self.state_size)) # replace 128 with cur_batch_size 104 | sample_actions = np.ndarray(shape = (cur_batch_size, 1)) 105 | sample_rewards = np.ndarray(shape = (cur_batch_size, 1)) 106 | sample_next_states = np.ndarray(shape = (cur_batch_size, self.state_size)) 107 | sample_dones = np.ndarray(shape = (cur_batch_size, 1)) 108 | 109 | temp=0 110 | for exp in mini_batch: 111 | sample_states[temp] = exp[0] 112 | sample_actions[temp] = exp[1] 113 | sample_rewards[temp] = exp[2] 114 | sample_next_states[temp] = exp[3] 115 | sample_dones[temp] = exp[4] 116 | temp += 1 117 | 118 | 119 | sample_qhat_next = self.brain_target.predict(sample_next_states) 120 | 121 | # set all Q values terminal states to 0 122 | sample_qhat_next = sample_qhat_next * (np.ones(shape = sample_dones.shape) - sample_dones) 123 | # choose max action for each state 124 | sample_qhat_next = np.max(sample_qhat_next, axis=1) 125 | 126 | sample_qhat = self.brain_policy.predict(sample_states) 127 | 128 | for i in range(cur_batch_size): 129 | a = sample_actions[i,0] 130 | sample_qhat[i,int(a)] = sample_rewards[i] + self.gamma * sample_qhat_next[i] 131 | 132 | q_target = sample_qhat 133 | 134 | self.brain_policy.fit(sample_states, q_target, epochs = 1, verbose = 0) 135 | 136 | 137 | 138 | """ 139 | 140 | for state, action, reward, next_state, done in mini_batch: 141 | target_Q_s_a = 0 # new target for Q(s,a) 142 | state = np.reshape(state, [1, state_size]) 143 | next_state = np.reshape(next_state, [1, state_size]) 144 | 145 | # if it is not the terminal state 146 | if not done: 147 | qhat_next = self.brain_target.predict(next_state) # estimate Q(s',a') 148 | target_Q_s_a = reward + self.gamma * np.amax(qhat_next[0]) # because the output is m * n, so we need to consider the dimension [0] 149 | else: 150 | target_Q_s_a = reward 151 | 152 | target_output = self.brain_policy.predict(state) # we will replace target of Q(s,a) for specific a later 153 | target_output[0][action] = target_Q_s_a # new target for state s and action a 154 | 155 | self.brain_policy.fit(state, target_output, epochs = 1, verbose = 0) 156 | 157 | """ 158 | 159 | 160 | 161 | 162 | env = gym.make("LunarLander-v2") 163 | optimizer = Adam(learning_rate = 0.0001) 164 | 165 | agent = Agent(env, optimizer, batch_size = 64) 166 | state_size = env.observation_space.shape[0] 167 | 168 | #state = env.reset() 169 | 170 | #print(state.shape) 171 | 172 | # load model 173 | #agent.brain_policy.set_weights(tf.keras.models.load_model('C:/Users/nhunh/.spyder-py3/Model1.h5').get_weights()) 174 | 175 | timestep=0 176 | rewards = [] 177 | aver_reward = [] 178 | aver = deque(maxlen=100) 179 | 180 | 181 | for episode in range(1000): 182 | state = env.reset() 183 | total_reward = 0 184 | done = False 185 | 186 | while not done: 187 | action = agent.choose_action(state) 188 | next_state, reward, done, info = env.step(action) 189 | 190 | env.render() 191 | 192 | total_reward += reward 193 | 194 | agent.memorize_exp(state, action, reward, next_state, done) 195 | agent.learn() 196 | 197 | state = next_state 198 | timestep += 1 199 | 200 | 201 | aver.append(total_reward) 202 | 
aver_reward.append(np.mean(aver)) 203 | 204 | rewards.append(total_reward) 205 | 206 | # update model_target after each episode 207 | agent.update_brain_target() 208 | 209 | agent.epsilon = max(0.1, 0.995 * agent.epsilon) # decaying exploration 210 | print("Episode ", episode, total_reward) 211 | 212 | """ 213 | if episode % 50 == 0: 214 | agent.brain_policy.save("C:/Users/nhunh/.spyder-py3/Newest_update.h5") 215 | """ 216 | 217 | plt.title("Learning Curve") 218 | plt.xlabel("Episode") 219 | plt.ylabel("Reward") 220 | plt.plot(rewards) 221 | 222 | plt.xlabel("Episode") 223 | plt.ylabel("Reward") 224 | plt.plot(aver_reward, 'r') 225 | 226 | agent.brain_policy.save('C:/Users/nhunh/.spyder-py3/Model1.h5') 227 | 228 | 229 | --------------------------------------------------------------------------------
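A possible follow-up once training has finished: reload the saved policy network and run one greedy episode (no exploration). This sketch assumes the `Model1.h5` file saved by the script above (adjust the path) and the classic 4-tuple `env.step` API of the Gym version the script targets.

```python
import numpy as np
import gym
from tensorflow.keras.models import load_model

env = gym.make("LunarLander-v2")
policy = load_model("Model1.h5")   # path to the model saved at the end of training

state = env.reset()
done, total_reward = False, 0.0
while not done:
    q_values = policy.predict(state.reshape(1, -1))                  # Q(s, a) for all 4 actions
    state, reward, done, _ = env.step(int(np.argmax(q_values[0])))   # act greedily
    total_reward += reward
    env.render()

env.close()
print("Evaluation reward:", total_reward)
```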