├── requirements.txt
├── assets
│   └── animation.gif
├── save
│   ├── cartpole-ddqn.h5
│   └── cartpole-dqn.h5
├── README.md
├── LICENSE
├── .gitignore
├── dqn.py
├── dqn_batch.py
└── ddqn.py

/requirements.txt:
--------------------------------------------------------------------------------
numpy
keras
gym
tensorflow
--------------------------------------------------------------------------------
/assets/animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keon/deep-q-learning/HEAD/assets/animation.gif
--------------------------------------------------------------------------------
/save/cartpole-ddqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keon/deep-q-learning/HEAD/save/cartpole-ddqn.h5
--------------------------------------------------------------------------------
/save/cartpole-dqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keon/deep-q-learning/HEAD/save/cartpole-dqn.h5
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deep-q-learning

Introduction to Making a Simple Game AI with Deep Reinforcement Learning

![animation](./assets/animation.gif)

Minimal and simple Deep Q-Learning implementation in Keras and Gym. Under 100 lines of code!

The explanation for the `dqn.py` code is covered in the blog article
[https://keon.io/deep-q-learning/](https://keon.io/deep-q-learning/)

I made minor tweaks to this repository, such as the `load` and `save` functions, for convenience.

I also made the `memory` a deque instead of just a list, in order to limit the maximum number
of elements held in memory (see the short snippet at the end of this README).

Training can be unstable for `dqn.py`. This problem is mitigated in `ddqn.py`.
I'll cover `ddqn` in the next article.
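For reference, here is a minimal standalone sketch (not code from the scripts) of how the bounded replay memory behaves:

```python
from collections import deque

# Bounded replay memory: once `maxlen` items are stored, appending
# another one silently discards the oldest entry.
memory = deque(maxlen=2000)
for t in range(2500):
    memory.append(t)  # stand-in for a (state, action, reward, next_state, done) tuple

print(len(memory))   # 2000
print(memory[0])     # 500 -- the 500 oldest entries were dropped
```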
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Keon Kim

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

.vscode/
--------------------------------------------------------------------------------
/dqn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
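            # Q-learning target: the immediate reward plus the discounted
            # value of the best action the current network predicts for the
            # next state (used only when the episode is not over).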
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")
--------------------------------------------------------------------------------
/dqn_batch.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # Collecting states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
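        # Unlike dqn.py, which calls fit() once per sampled transition, the
        # whole minibatch is trained with a single fit() call, so the model
        # is updated (and a loss reported) once per replay() call.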
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                loss = agent.replay(batch_size)
                # Logging training loss every 10 timesteps
                if time % 10 == 0:
                    print("episode: {}/{}, time: {}, loss: {:.4f}"
                          .format(e, EPISODES, time, loss))
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")
--------------------------------------------------------------------------------
/ddqn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf

EPISODES = 5000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _huber_loss(self, y_true, y_pred, clip_delta=1.0):
        """Huber loss for Q-learning.

        References: https://en.wikipedia.org/wiki/Huber_loss
                    https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss
        """
        error = y_true - y_pred
        cond = K.abs(error) <= clip_delta

        # Quadratic inside the clipping region, linear outside it.
        squared_loss = 0.5 * K.square(error)
        linear_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta)

        return K.mean(tf.where(cond, squared_loss, linear_loss))

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=self._huber_loss,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        # copy weights from model to target_model
        self.target_model.set_weights(self.model.get_weights())

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
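                # The bootstrap value comes from the separate target network,
                # which is only synced with the online network at the end of
                # each episode; this is what stabilizes training relative to
                # dqn.py. The commented-out lines show the full Double DQN
                # update: the online network selects the next action, the
                # target network evaluates it.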
                # a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(t)
                # target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-ddqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # reward = reward if not done else -10
            # Shaped reward: higher when the cart stays near the center (r1)
            # and the pole stays near upright (r2), replacing the
            # environment's flat +1 per step.
            x, x_dot, theta, theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2

            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                agent.update_target_model()
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-ddqn.h5")
--------------------------------------------------------------------------------