├── requirements.txt
├── assets
│   └── animation.gif
├── save
│   ├── cartpole-ddqn.h5
│   └── cartpole-dqn.h5
├── README.md
├── LICENSE
├── .gitignore
├── dqn.py
├── dqn_batch.py
└── ddqn.py

/requirements.txt:
--------------------------------------------------------------------------------
numpy
keras
gym
tensorflow
--------------------------------------------------------------------------------
/assets/animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keon/deep-q-learning/HEAD/assets/animation.gif
--------------------------------------------------------------------------------
/save/cartpole-ddqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keon/deep-q-learning/HEAD/save/cartpole-ddqn.h5
--------------------------------------------------------------------------------
/save/cartpole-dqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/keon/deep-q-learning/HEAD/save/cartpole-dqn.h5
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deep-q-learning

Introduction to Making a Simple Game AI with Deep Reinforcement Learning

![animation](./assets/animation.gif)

Minimal and simple Deep Q-Learning implementation in Keras and Gym. Under 100 lines of code!

The explanation for the `dqn.py` code is covered in the blog article
[https://keon.io/deep-q-learning/](https://keon.io/deep-q-learning/)

I made minor tweaks to this repository, such as the `load` and `save` functions, for convenience.

I also made the `memory` a deque instead of just a list, in order to limit the maximum number
of elements held in memory (see the short snippet at the end of this README).

Training can be unstable for `dqn.py`. This problem is mitigated in `ddqn.py`.
I'll cover `ddqn` in the next article.
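For reference, here is a minimal standalone sketch (not code from the scripts) of how the bounded replay memory behaves:

```python
from collections import deque

# Bounded replay memory: once `maxlen` items are stored, appending
# another one silently discards the oldest entry.
memory = deque(maxlen=2000)
for t in range(2500):
    memory.append(t)  # stand-in for a (state, action, reward, next_state, done) tuple

print(len(memory))   # 2000
print(memory[0])     # 500 -- the 500 oldest entries were dropped
```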
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Keon Kim

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

.vscode/
--------------------------------------------------------------------------------
/dqn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
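            # Q-learning target: the immediate reward plus the discounted
            # value of the best action the current network predicts for the
            # next state (used only when the episode is not over).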
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")
--------------------------------------------------------------------------------
/dqn_batch.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # Collecting states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
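        # Unlike dqn.py, which calls fit() once per sampled transition, the
        # whole minibatch is trained with a single fit() call, so the model
        # is updated (and a loss reported) once per replay() call.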
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                loss = agent.replay(batch_size)
                # Logging training loss every 10 timesteps
                if time % 10 == 0:
                    print("episode: {}/{}, time: {}, loss: {:.4f}"
                          .format(e, EPISODES, time, loss))
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")
--------------------------------------------------------------------------------
/ddqn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf

EPISODES = 5000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _huber_loss(self, y_true, y_pred, clip_delta=1.0):
        """Huber loss for Q-learning.

        References: https://en.wikipedia.org/wiki/Huber_loss
                    https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss
        """
        error = y_true - y_pred
        cond = K.abs(error) <= clip_delta

        # Quadratic inside the clipping region, linear outside it.
        squared_loss = 0.5 * K.square(error)
        linear_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta)

        return K.mean(tf.where(cond, squared_loss, linear_loss))

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=self._huber_loss,
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        # copy weights from model to target_model
        self.target_model.set_weights(self.model.get_weights())

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
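                # The bootstrap value comes from the separate target network,
                # which is only synced with the online network at the end of
                # each episode; this is what stabilizes training relative to
                # dqn.py. The commented-out lines show the full Double DQN
                # update: the online network selects the next action, the
                # target network evaluates it.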
                # a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(t)
                # target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-ddqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # reward = reward if not done else -10
            # Shaped reward: higher when the cart stays near the center (r1)
            # and the pole stays near upright (r2), replacing the
            # environment's flat +1 per step.
            x, x_dot, theta, theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2

            next_state = np.reshape(next_state, [1, state_size])
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                agent.update_target_model()
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-ddqn.h5")
--------------------------------------------------------------------------------