├── DQN-chainer-gym
│   ├── network
│   │   └── readme
│   ├── experiment_gym.py
│   ├── dqn_agent.py
│   └── dqn_agent_cpu.py
├── LICENSE
├── README.md
├── experiment_ale.py
├── readme.txt
├── dqn_agent_nips.py
└── dqn_agent_nature.py

/DQN-chainer-gym/network/readme:
--------------------------------------------------------------------------------
1 | Learned network is saved here.
2 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2015 Naoto Yoshida
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DQN-chainer
2 | 
3 | This software is a Python implementation of Deep Q-Networks (DQN) for playing ATARI games with the Chainer package.
4 | 
5 | I followed the implementations described in:
6 | * V. Mnih *et al*., "Playing Atari with deep reinforcement learning"
7 | 
8 | http://arxiv.org/pdf/1312.5602.pdf
9 | * V. Mnih *et al.*, "Human-level control through deep reinforcement learning"
10 | 
11 | http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html
12 | 
13 | For a Japanese introduction to DQN and a historical review, please check:
14 | 
15 | http://qiita.com/Ugo-Nama/items/08c6a5f6a571335972d5
16 | 
17 | # Requirement
18 | My implementation depends on RL-glue, the Arcade Learning Environment, and Chainer. To run the software, you need the following software/packages.
19 | 
20 | * Python 2.7+
21 | * Numpy
22 | * Scipy
23 | * Pillow (PIL)
24 | * Chainer (1.3.0): https://github.com/pfnet/chainer
25 | * RL-glue core: https://sites.google.com/a/rl-community.org/rl-glue/Home/rl-glue
26 | * RL-glue Python codec: https://sites.google.com/a/rl-community.org/rl-glue/Home/Extensions/python-codec
27 | * Arcade Learning Environment (version ALE 0.4.4): http://www.arcadelearningenvironment.org/
28 | 
29 | This software was tested on Ubuntu 14.04 LTS.
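Before launching an experiment, it may help to confirm that these packages are importable from the same Python 2.7 interpreter you will use. A minimal sketch (the module names follow the imports used in this repository; nothing here is required by the original code):

```python
# Quick dependency check (illustrative only).
import numpy
import scipy
import PIL               # Pillow
import chainer
import rlglue.RLGlue     # RL-glue Python codec

print "numpy  :", numpy.__version__
print "scipy  :", scipy.__version__
print "chainer:", chainer.__version__
```

If any import fails, install the missing package before starting rl_glue, the agent, and the experiment script.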
30 | 31 | # How to run 32 | Please check readme.txt 33 | 34 | -------------------------------------------------------------------------------- /experiment_ale.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple RL glue experiment setup 4 | """ 5 | 6 | import numpy as np 7 | import rlglue.RLGlue as RLGlue 8 | 9 | max_learningEpisode = 1000 10 | 11 | whichEpisode = 0 12 | learningEpisode = 0 13 | 14 | 15 | def runEpisode(is_learning_episode): 16 | global whichEpisode, learningEpisode 17 | 18 | RLGlue.RL_episode(0) 19 | totalSteps = RLGlue.RL_num_steps() 20 | totalReward = RLGlue.RL_return() 21 | 22 | whichEpisode += 1 23 | 24 | if is_learning_episode: 25 | learningEpisode += 1 26 | print "Episode " + str(learningEpisode) + "\t " + str(totalSteps) + " steps \t" + str(totalReward) + " total reward\t " 27 | else: 28 | print "Evaluation ::\t " + str(totalSteps) + " steps \t" + str(totalReward) + " total reward\t " 29 | 30 | 31 | # Main Program starts here 32 | print "\n\nDQN-ALE Experiment starting up!" 33 | RLGlue.RL_init() 34 | 35 | while learningEpisode < max_learningEpisode: 36 | # Evaluate model every 10 episodes 37 | if np.mod(whichEpisode, 10) == 0: 38 | print "Freeze learning for Evaluation" 39 | RLGlue.RL_agent_message("freeze learning") 40 | runEpisode(is_learning_episode=False) 41 | else: 42 | print "DQN is Learning" 43 | RLGlue.RL_agent_message("unfreeze learning") 44 | runEpisode(is_learning_episode=True) 45 | 46 | # Save model every 100 learning episodes 47 | if np.mod(learningEpisode, 100) == 0 and learningEpisode != 0: 48 | print "SAVE CURRENT MODEL" 49 | RLGlue.RL_agent_message("save model") 50 | 51 | RLGlue.RL_cleanup() 52 | 53 | print "Experiment COMPLETED @ Episode ", whichEpisode 54 | -------------------------------------------------------------------------------- /DQN-chainer-gym/experiment_gym.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple gym experiment setup 4 | """ 5 | 6 | 7 | import gym 8 | import dqn_agent as ag # for GPU experiment 9 | #import dqn_agent_cpu as ag # for CPU experiment 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import time 13 | 14 | # Generate an environment 15 | env = gym.make('Pong-v0') 16 | 17 | # Generate an agent 18 | agent = ag.DQN_Agent(env) 19 | 20 | eval_interval = 5 21 | num_episode = 10**5 22 | total_score = [] 23 | eval_steps = [] 24 | for i_episode in range(num_episode): 25 | observation = env.reset() 26 | terminal = False 27 | total_score_ = 0 28 | reward = 0.0 # initial reward is assumed to be zero 29 | step_in_episode = 0 30 | 31 | if np.mod(i_episode, eval_interval) == 0: 32 | # Learnin OFF evaluation 33 | agent.policyFrozen = True 34 | else: 35 | # Learning ON 36 | agent.policyFrozen = False 37 | 38 | while True: 39 | print(str(i_episode) + "-th episode") 40 | env.render() # Render the game 41 | 42 | if step_in_episode == 0: 43 | observation, reward, terminal, info = env.step(agent.start(observation)) # take an action 44 | else: 45 | observation, reward, terminal, info = env.step(agent.act(observation, reward)) # take an action 46 | 47 | total_score_ += reward 48 | step_in_episode += 1 49 | 50 | if terminal is True: 51 | agent.end(reward) 52 | break 53 | 54 | if np.mod(i_episode, eval_interval) == 0: 55 | total_score.append(total_score_) 56 | eval_steps.append(i_episode) 57 | print("REWARD@" + str(i_episode) + "-th episode : " + str(total_score_)) 58 
| 59 | plt.clf()
60 | plt.plot(eval_steps, total_score)
61 | plt.legend(["Total Score"])
62 | plt.savefig("result_plot.png")
63 | plt.draw()
64 | plt.pause(0.001)
65 | 
66 | # Save the current agent parameters
67 | agent.save()
68 | 
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
1 | ------------------------------------
2 | Requirement
3 | ------------------------------------
4 | This software is based on the following packages/software. Please install them before running the code.
5 | 
6 | * Python 2.7+
7 | * Numpy
8 | * Scipy
9 | * Chainer: https://github.com/pfnet/chainer
10 | * RL-glue core: https://sites.google.com/a/rl-community.org/rl-glue/Home/rl-glue
11 | * RL-glue Python codec: https://sites.google.com/a/rl-community.org/rl-glue/Home/Extensions/python-codec
12 | * Arcade Learning Environment (version ALE 0.4.4): http://www.arcadelearningenvironment.org/
13 | * NVIDIA GPU (This code was tested on a GeForce GTX 660 with Ubuntu 14.04 LTS)
14 | 
15 | Also, you may need binary ROMs of the ATARI games.
16 | I recommend running the examples in the RL-glue Python codec and in ALE before testing DQN.
17 | 
18 | ------------------------------------
19 | How to run
20 | ------------------------------------
21 | To run DQN, we just follow the standard RL-glue experiment setup.
22 | Concretely, we need to start the following processes:
23 | 
24 | * rl_glue
25 | * RLGlueAgent (dqn_agent_*.py)
26 | * RLGlueExperiment (experiment_ale.py)
27 | * ale (ALE 0.4.4)
28 | (So, you may need four terminal windows!)
29 | 
30 | The actual procedure will be:
31 | (first window: rlglue)
32 | rl_glue
33 | (second window: RLGlueAgent)
34 | python dqn_agent_nature.py
35 | (third window: RLGlueExperiment)
36 | python experiment_ale.py
37 | (fourth window: ALE)
38 | ./ale -game_controller rlglue -use_starting_actions true -random_seed time -display_screen true -frame_skip 4 path_to_roms/pong.bin
39 | 
40 | In the above example, we assume that the ROM binary ("Pong" in this case)
41 | is in the path_to_roms directory.
42 | 
43 | ------------------------------------
44 | Playing other games
45 | ------------------------------------
46 | The default setting of the code is for playing "Pong".
47 | To run other games, you need to modify a line in the "agent_init" function of the "dqn_agent" class.
48 | 
49 | To make DQN play "Breakout", we may set it as
50 | 
51 | (before modification) self.DQN = DQN_class()
52 | ( after modification) self.DQN = DQN_class(enable_controller=[0, 1, 3, 4])
53 | 
54 | "enable_controller" is the list of available actions of the agent.
55 | The minimal set of actions required for each game ROM is described
56 | in ale_0_4/src/games/supported/name_of_game.cpp,
57 | 
58 | and you can check the corresponding integer values in Section 8.1 of the ALE technical manual:
59 | 
60 | Technical Manual (you have the same manual in your ALE directory!): https://github.com/mgbellemare/Arcade-Learning-Environment/tree/master/doc/manual
61 | 
62 | ------------------------------------
63 | Modification of hyper-parameters
64 | ------------------------------------
65 | 
66 | If your machine does not have enough memory to run the full-size DQN,
67 | try setting the "data_size" variable to a much smaller value, like 2*10**4.
68 | This setting may reduce the final performance, but it still works well, at least in the "Pong" domain.
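For example, in the hyper-parameter block at the top of "DQN_class" in dqn_agent_nature.py
(the "DQN" class in the gym agents has the same attribute):

(before modification) data_size = 10**5
( after modification) data_size = 2*10**4

As a rough estimate, each stored transition keeps two stacks of four 84x84 uint8 frames
(2 x 4 x 84 x 84 = about 56 KB), so the replay memory needs several GB at data_size = 10**5
and roughly 1 GB at 2*10**4.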
69 | 70 | ------------------------------------ 71 | Copyright (c) 2015 Naoto Yoshida All Right Reserved. 72 | -------------------------------------------------------------------------------- /dqn_agent_nips.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer and rlglue 4 | Copyright (c) 2015 Naoto Yoshida All Right Reserved. 5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, FunctionSet, Variable, optimizers 14 | import chainer.functions as F 15 | 16 | from rlglue.agent.Agent import Agent 17 | from rlglue.agent import AgentLoader as AgentLoader 18 | from rlglue.types import Action 19 | 20 | 21 | class DQN_class: 22 | # Hyper-Parameters 23 | gamma = 0.99 # Discount factor 24 | initial_exploration = 10**4 # Initial exploratoin. original: 5x10^4 25 | replay_size = 32 # Replay (batch) size 26 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 27 | data_size = 10**5 # Data size of history. original: 10^6 28 | 29 | def __init__(self, enable_controller=[0, 3, 4]): 30 | self.num_of_actions = len(enable_controller) 31 | self.enable_controller = enable_controller # Default setting : "Pong" 32 | 33 | print "Initializing DQN..." 34 | # Initialization for Chainer 1.1.0 or older. 35 | # print "CUDA init" 36 | # cuda.init() 37 | 38 | print "Model Building" 39 | self.model = FunctionSet( 40 | l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)), 41 | l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)), 42 | l3=F.Linear(2592, 256), 43 | q_value=F.Linear(256, self.num_of_actions, 44 | initialW=np.zeros((self.num_of_actions, 256), 45 | dtype=np.float32)) 46 | ).to_gpu() 47 | 48 | print "Initizlizing Optimizer" 49 | self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2) 50 | self.optimizer.setup(self.model.collect_parameters()) 51 | 52 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 53 | self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 54 | np.zeros(self.data_size, dtype=np.uint8), 55 | np.zeros((self.data_size, 1), dtype=np.int8), 56 | np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 57 | np.zeros((self.data_size, 1), dtype=np.bool)] 58 | 59 | def forward(self, state, action, Reward, state_dash, episode_end): 60 | num_of_batch = state.shape[0] 61 | s = Variable(state) 62 | s_dash = Variable(state_dash) 63 | 64 | Q = self.Q_func(s) # Get Q-value 65 | 66 | # Generate Target Signals 67 | max_Q_dash_ = self.Q_func(s_dash) 68 | tmp = list(map(np.max, max_Q_dash_.data.get())) 69 | max_Q_dash = np.asanyarray(tmp, dtype=np.float32) 70 | target = np.asanyarray(Q.data.get(), dtype=np.float32) 71 | 72 | for i in xrange(num_of_batch): 73 | if not episode_end[i][0]: 74 | tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i] 75 | else: 76 | tmp_ = np.sign(Reward[i]) 77 | target[i, self.action_to_index(action[i])] = tmp_ 78 | 79 | loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q) 80 | return loss, Q 81 | 82 | def stockExperience(self, time, 83 | state, action, reward, state_dash, 84 | episode_end_flag): 85 | data_index = time % self.data_size 86 | 87 | if episode_end_flag is True: 88 | self.D[0][data_index] = state 89 | self.D[1][data_index] = action 90 | self.D[2][data_index] = reward 91 | else: 92 | self.D[0][data_index] = state 93 | self.D[1][data_index] = action 94 | self.D[2][data_index] = reward 95 | 
self.D[3][data_index] = state_dash 96 | self.D[4][data_index] = episode_end_flag 97 | 98 | def experienceReplay(self, time): 99 | 100 | if self.initial_exploration < time: 101 | # Pick up replay_size number of samples from the Data 102 | if time < self.data_size: # during the first sweep of the History Data 103 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 104 | else: 105 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 106 | 107 | s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 108 | a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8) 109 | r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32) 110 | s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 111 | episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool) 112 | for i in xrange(self.replay_size): 113 | s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32) 114 | a_replay[i] = self.D[1][replay_index[i]] 115 | r_replay[i] = self.D[2][replay_index[i]] 116 | s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32) 117 | episode_end_replay[i] = self.D[4][replay_index[i]] 118 | 119 | s_replay = cuda.to_gpu(s_replay) 120 | s_dash_replay = cuda.to_gpu(s_dash_replay) 121 | 122 | # Gradient-based update 123 | self.optimizer.zero_grads() 124 | loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 125 | loss.backward() 126 | self.optimizer.update() 127 | 128 | def Q_func(self, state): 129 | h1 = F.relu(self.model.l1(state / 254.0)) # scale inputs in [0.0, 1.0] 130 | h2 = F.relu(self.model.l2(h1)) 131 | h3 = F.relu(self.model.l3(h2)) 132 | Q = self.model.q_value(h3) 133 | return Q 134 | 135 | def e_greedy(self, state, epsilon): 136 | s = Variable(state) 137 | Q = self.Q_func(s) 138 | Q = Q.data 139 | 140 | if np.random.rand() < epsilon: 141 | index_action = np.random.randint(0, self.num_of_actions) 142 | print "RANDOM" 143 | else: 144 | index_action = np.argmax(Q.get()) 145 | print "GREEDY" 146 | 147 | return self.index_to_action(index_action), Q 148 | 149 | def index_to_action(self, index_of_action): 150 | return self.enable_controller[index_of_action] 151 | 152 | def action_to_index(self, action): 153 | return self.enable_controller.index(action) 154 | 155 | 156 | class dqn_agent(Agent): # RL-glue Process 157 | lastAction = Action() 158 | policyFrozen = False 159 | 160 | def agent_init(self, taskSpec): 161 | # Some initializations for rlglue 162 | self.lastAction = Action() 163 | 164 | self.time = 0 165 | self.epsilon = 1.0 # Initial exploratoin rate 166 | 167 | # Pick a DQN from DQN_class 168 | self.DQN = DQN_class() # Default is for "Pong". 
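    # Note (illustrative, following readme.txt): to play a game other than "Pong",
    # pass its minimal action set to DQN_class above, e.g. for "Breakout":
    #
    #     self.DQN = DQN_class(enable_controller=[0, 1, 3, 4])
    #
    # The integer code of each action is listed in Section 8.1 of the ALE technical manual.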
169 | 170 | def agent_start(self, observation): 171 | 172 | # Get intensity from current observation array 173 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 174 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 175 | 176 | # Initialize State 177 | self.state = np.zeros((4, 84, 84), dtype=np.uint8) 178 | self.state[0] = obs_array 179 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 180 | 181 | # Generate an Action e-greedy 182 | returnAction = Action() 183 | action, Q_now = self.DQN.e_greedy(state_, self.epsilon) 184 | returnAction.intArray = [action] 185 | 186 | # Update for next step 187 | self.lastAction = copy.deepcopy(returnAction) 188 | self.last_state = self.state.copy() 189 | self.last_observation = obs_array 190 | 191 | return returnAction 192 | 193 | def agent_step(self, reward, observation): 194 | 195 | # Preproces 196 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 197 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 198 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 199 | 200 | # Compose State : 4-step sequential observation 201 | self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8) 202 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 203 | 204 | # Exploration decays along the time sequence 205 | if self.policyFrozen is False: # Learning ON/OFF 206 | if self.DQN.initial_exploration < self.time: 207 | self.epsilon -= 1.0/10**6 208 | if self.epsilon < 0.1: 209 | self.epsilon = 0.1 210 | eps = self.epsilon 211 | else: # Initial Exploation Phase 212 | print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration) 213 | eps = 1.0 214 | else: # Evaluation 215 | print "Policy is Frozen" 216 | eps = 0.05 217 | 218 | # Generate an Action from e-greedy action selection 219 | returnAction = Action() 220 | action, Q_now = self.DQN.e_greedy(state_, eps) 221 | returnAction.intArray = [action] 222 | 223 | # Learning Phase 224 | if self.policyFrozen is False: # Learning ON/OFF 225 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False) 226 | self.DQN.experienceReplay(self.time) 227 | 228 | # Simple text based visualization 229 | print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get())) 230 | 231 | # Updates for next step 232 | self.last_observation = obs_array 233 | 234 | # Update for next step 235 | if self.policyFrozen is False: 236 | self.lastAction = copy.deepcopy(returnAction) 237 | self.last_state = self.state.copy() 238 | self.time += 1 239 | 240 | return returnAction 241 | 242 | def agent_end(self, reward): # Episode Terminated 243 | 244 | # Learning Phase 245 | if self.policyFrozen is False: # Learning ON/OFF 246 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.last_state, True) 247 | self.DQN.experienceReplay(self.time) 248 | 249 | # Simple text based visualization 250 | print ' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon) 251 | 252 | # Time count 253 | if not self.policyFrozen: 254 | self.time += 1 255 | 256 | def agent_cleanup(self): 257 | pass 
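    # The messages handled by agent_message below are sent by experiment_ale.py
    # through RL-glue, e.g.:
    #
    #     RLGlue.RL_agent_message("freeze learning")    # evaluation episode (no updates)
    #     RLGlue.RL_agent_message("unfreeze learning")  # learning episode
    #     RLGlue.RL_agent_message("save model")         # pickle the current network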
258 | 259 | def agent_message(self, inMessage): 260 | if inMessage.startswith("freeze learning"): 261 | self.policyFrozen = True 262 | return "message understood, policy frozen" 263 | 264 | if inMessage.startswith("unfreeze learning"): 265 | self.policyFrozen = False 266 | return "message understood, policy unfrozen" 267 | 268 | if inMessage.startswith("save model"): 269 | with open('dqn_model.dat', 'w') as f: 270 | pickle.dump(self.DQN.model, f) 271 | return "message understood, model saved" 272 | 273 | if __name__ == "__main__": 274 | AgentLoader.loadAgent(dqn_agent()) 275 | -------------------------------------------------------------------------------- /DQN-chainer-gym/dqn_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer for gym environment 4 | Copyright (c) 2016 Naoto Yoshida All Right Reserved. 5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, Function, Variable, optimizers, serializers 14 | from chainer import Chain 15 | import chainer.functions as F 16 | import chainer.links as L 17 | 18 | class ActionValue(Chain): 19 | def __init__(self, n_history, n_act): 20 | super(ActionValue, self).__init__( 21 | l1=F.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)), 22 | l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)), 23 | l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)), 24 | l4=F.Linear(3136, 512, wscale=np.sqrt(2)), 25 | q_value=F.Linear(512, n_act, 26 | initialW=np.zeros((n_act, 512), 27 | dtype=np.float32)) 28 | ) 29 | 30 | def q_function(self, state): 31 | h1 = F.relu(self.l1(state/255.)) 32 | h2 = F.relu(self.l2(h1)) 33 | h3 = F.relu(self.l3(h2)) 34 | h4 = F.relu(self.l4(h3)) 35 | return self.q_value(h4) 36 | 37 | 38 | class DQN: 39 | # Hyper-Parameters 40 | gamma = 0.99 # Discount factor 41 | initial_exploration = 10**4 # Initial exploratoin. original: 5x10^4 42 | replay_size = 32 # Replay (batch) size 43 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 44 | data_size = 10**5 # Data size of history. 
original: 10^6 45 | img_size = 84 # 84x84 image input (fixed) 46 | 47 | def __init__(self, n_history, n_act): 48 | print("Initializing DQN...") 49 | self.step = 0 # number of steps that DQN is updated 50 | self.n_act = n_act 51 | self.n_history = n_history # Number of obervations used to construct the single state 52 | 53 | print("Model Building") 54 | self.model = ActionValue(n_history, n_act).to_gpu() 55 | self.model_target = copy.deepcopy(self.model) 56 | 57 | print("Initizlizing Optimizer") 58 | self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.01) 59 | self.optimizer.setup(self.model) 60 | 61 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 62 | hs = self.n_history 63 | ims = self.img_size 64 | self.replay_buffer = [np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 65 | np.zeros(self.data_size, dtype=np.uint8), 66 | np.zeros((self.data_size, 1), dtype=np.float32), 67 | np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 68 | np.zeros((self.data_size, 1), dtype=np.bool)] 69 | 70 | def get_loss(self, state, action, reward, state_prime, episode_end): 71 | s = Variable(cuda.to_gpu(state)) 72 | s_dash = Variable(cuda.to_gpu(state_prime)) 73 | 74 | q = self.model.q_function(s) # Get Q-value 75 | 76 | # Generate Target Signals 77 | tmp = self.model_target.q_function(s_dash) # Q(s',*) 78 | tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) 79 | max_q_prime = np.asanyarray(tmp, dtype=np.float32) 80 | target = np.asanyarray(copy.deepcopy(q.data.get()), dtype=np.float32) 81 | 82 | for i in range(self.replay_size): 83 | if episode_end[i][0] is True: 84 | tmp_ = np.sign(reward[i]) 85 | else: 86 | # The sign of reward is used as the reward of DQN! 87 | tmp_ = np.sign(reward[i]) + self.gamma * max_q_prime[i] 88 | 89 | target[i, action[i]] = tmp_ 90 | 91 | # TD-error clipping 92 | td = Variable(cuda.to_gpu(target)) - q # TD error 93 | td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division 94 | td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) 95 | 96 | zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.n_act), dtype=np.float32))) 97 | loss = F.mean_squared_error(td_clip, zero_val) 98 | return loss, q 99 | 100 | def stock_experience(self, time, 101 | state, action, reward, state_prime, 102 | episode_end_flag): 103 | data_index = time % self.data_size 104 | 105 | if episode_end_flag is True: 106 | self.replay_buffer[0][data_index] = state 107 | self.replay_buffer[1][data_index] = action 108 | self.replay_buffer[2][data_index] = reward 109 | else: 110 | self.replay_buffer[0][data_index] = state 111 | self.replay_buffer[1][data_index] = action 112 | self.replay_buffer[2][data_index] = reward 113 | self.replay_buffer[3][data_index] = state_prime 114 | self.replay_buffer[4][data_index] = episode_end_flag 115 | 116 | def experience_replay(self, time): 117 | 118 | if self.initial_exploration < time: 119 | # Pick up replay_size number of samples from the Data 120 | if time < self.data_size: # during the first sweep of the History Data 121 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 122 | else: 123 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 124 | 125 | hs = self.n_history 126 | ims = self.img_size 127 | rs = self.replay_size 128 | 129 | s_replay = np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 130 | a_replay = np.ndarray(shape=(rs, 1), dtype=np.int8) 131 | r_replay = np.ndarray(shape=(rs, 1), dtype=np.float32) 132 | s_dash_replay = 
np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 133 | episode_end_replay = np.ndarray(shape=(rs, 1), dtype=np.bool) 134 | for i in range(self.replay_size): 135 | s_replay[i] = np.asarray(self.replay_buffer[0][replay_index[i]], dtype=np.float32) 136 | a_replay[i] = self.replay_buffer[1][replay_index[i]] 137 | r_replay[i] = self.replay_buffer[2][replay_index[i]] 138 | s_dash_replay[i] = np.array(self.replay_buffer[3][replay_index[i]], dtype=np.float32) 139 | episode_end_replay[i] = self.replay_buffer[4][replay_index[i]] 140 | 141 | # Gradient-based update 142 | self.optimizer.zero_grads() 143 | loss, _ = self.get_loss(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 144 | loss.backward() 145 | self.optimizer.update() 146 | 147 | 148 | def action_sample_e_greedy(self, state, epsilon): 149 | s = Variable(cuda.to_gpu(state)) 150 | q = self.model.q_function(s) 151 | q = q.data.get()[0] 152 | 153 | if np.random.rand() < epsilon: 154 | action = np.random.randint(0, self.n_act) 155 | print("RANDOM : " + str(action)) 156 | else: 157 | a = np.argmax(q) 158 | print("GREEDY : " + str(a)) 159 | action = np.asarray(a, dtype=np.int8) 160 | print(q) 161 | return action, q 162 | 163 | def target_model_update(self, soft_update): 164 | if soft_update is True: 165 | tau = self.target_update_rate 166 | 167 | # Target preference Update 168 | model_params = dict(self.model.namedparams()) 169 | model_target_params = dict(self.model_target.namedparams()) 170 | for name in model_target_params: 171 | model_target_params[name].data = tau*model_params[name].data\ 172 | + (1 - tau)*model_target_params[name].data 173 | else: 174 | if np.mod(self.step, self.target_model_update_freq) == 0: 175 | self.model_target = copy.deepcopy(self.model) 176 | 177 | def learn(self, state, action, reward, state_prime, terminal): 178 | self.stock_experience(self.step, 179 | state, action, reward, state_prime, 180 | terminal) 181 | 182 | self.experience_replay(self.step) 183 | self.target_model_update(soft_update=False) 184 | 185 | self.step += 1 186 | 187 | 188 | class DQN_Agent: # RL-glue Process 189 | policyFrozen = False 190 | 191 | def __init__(self, env): 192 | 193 | self.epsilon = 1.0 # Initial exploratoin rate 194 | 195 | # Pick a DQN from DQN_class 196 | self.dqn = DQN(n_history=4, n_act=env.action_space.n) 197 | 198 | def start(self, observation): 199 | 200 | self.reset_state(observation) 201 | state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32) 202 | 203 | # Generate an Action e-greedy 204 | action, Q_now = self.dqn.action_sample_e_greedy(state_, self.epsilon) 205 | 206 | # Update for next step 207 | self.last_action = action 208 | self.last_state = copy.deepcopy(self.state) 209 | 210 | return action 211 | 212 | def act(self, observation, reward): 213 | 214 | self.set_state(observation) 215 | state_ = np.asanyarray(self.state.reshape(1, self.dqn.n_history, 84, 84), dtype=np.float32) 216 | 217 | # Exploration decays along the time sequence 218 | if self.policyFrozen is False: # Learning ON/OFF 219 | if self.dqn.initial_exploration < self.dqn.step: 220 | self.epsilon -= 1.0/10**6 221 | if self.epsilon < 0.1: 222 | self.epsilon = 0.1 223 | eps = self.epsilon 224 | else: # Initial Exploation Phase 225 | print("Initial Exploration : %d/%d steps" % (self.dqn.step, self.dqn.initial_exploration)) 226 | eps = 1.0 227 | else: # Evaluation 228 | print("Policy is Frozen") 229 | eps = 0.05 230 | 231 | # Generate an Action by e-greedy action selection 232 | action, Q_now = 
self.dqn.action_sample_e_greedy(state_, eps) 233 | 234 | # Learning Phase 235 | if self.policyFrozen is False: # Learning ON/OFF 236 | self.dqn.learn(self.last_state, self.last_action, reward, self.state, False) 237 | self.last_action = copy.deepcopy(action) 238 | self.last_state = self.state.copy() 239 | 240 | # Simple text based visualization 241 | print(' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.dqn.step, action, np.sign(reward), eps, np.max(Q_now))) 242 | 243 | return action 244 | 245 | def end(self, reward): # Episode Terminated 246 | 247 | # Learning Phase 248 | if self.policyFrozen is False: # Learning ON/OFF 249 | self.dqn.learn(self.last_state, self.last_action, reward, self.last_state, True) 250 | 251 | # Simple text based visualization 252 | print(' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon)) 253 | 254 | 255 | def reset_state(self, observation): 256 | # Preprocess 257 | obs_array = self.scale_image(observation) 258 | # Updates for next step 259 | self.last_observation = obs_array 260 | 261 | # Initialize State 262 | self.state = np.zeros((self.dqn.n_history, 84, 84), dtype=np.uint8) 263 | self.state[0] = obs_array 264 | 265 | def set_state(self, observation): 266 | # Preproces 267 | obs_array = self.scale_image(observation) 268 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 269 | 270 | # Updates for the next step 271 | self.last_observation = obs_array 272 | 273 | # Compose State : 4-step sequential observation 274 | for i in range(self.dqn.n_history - 1): 275 | self.state[i] = self.state[i + 1].astype(np.uint8) 276 | self.state[self.dqn.n_history - 1] = obs_processed.astype(np.uint8) 277 | 278 | def scale_image(self, observation): 279 | img = self.rgb2gray(observation) # Convert RGB to Grayscale 280 | return (spm.imresize(img, (110, 84)))[110-84-8:110-8, :] # Scaling 281 | 282 | def rgb2gray(self, image): 283 | return np.dot(image[...,:3], [0.299, 0.587, 0.114]) 284 | 285 | def save(self): 286 | serializers.save_npz('network/model.model', self.dqn.model) 287 | serializers.save_npz('network/model_target.model', 288 | self.dqn.model_target) 289 | 290 | print("------------ Networks were SAVED ---------------") 291 | -------------------------------------------------------------------------------- /DQN-chainer-gym/dqn_agent_cpu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer for gym environment 4 | Copyright (c) 2016 Naoto Yoshida All Right Reserved. 
5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, Function, Variable, optimizers, serializers 14 | from chainer import Chain 15 | import chainer.functions as F 16 | import chainer.links as L 17 | 18 | import matplotlib.pyplot as plt 19 | 20 | class ActionValue(Chain): 21 | def __init__(self, n_history, n_act): 22 | super(ActionValue, self).__init__( 23 | l1=F.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)), 24 | l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)), 25 | l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)), 26 | l4=F.Linear(3136, 512),#, wscale=np.sqrt(2)), 27 | q_value=F.Linear(512, n_act, 28 | initialW=0.0*np.random.randn(n_act, 512).astype(np.float32)) 29 | ) 30 | 31 | def q_function(self, state): 32 | h1 = F.relu(self.l1(state/255.)) 33 | h2 = F.relu(self.l2(h1)) 34 | h3 = F.relu(self.l3(h2)) 35 | h4 = F.relu(self.l4(h3)) 36 | return self.q_value(h4) 37 | 38 | 39 | class DQN: 40 | # Hyper-Parameters 41 | gamma = 0.99 # Discount factor 42 | initial_exploration = 10**4 # Initial exploratoin. original: 5x10^4 43 | replay_size = 32 # Replay (batch) size 44 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 45 | data_size = 10**5 # Data size of history. original: 10^6 46 | img_size = 84 # 84x84 image input (fixed) 47 | 48 | def __init__(self, n_history, n_act): 49 | print("Initializing DQN...") 50 | self.step = 0 # number of steps that DQN is updated 51 | self.n_act = n_act 52 | self.n_history = n_history # Number of obervations used to construct the single state 53 | 54 | print("Model Building") 55 | self.model = ActionValue(n_history, n_act) 56 | self.model_target = copy.deepcopy(self.model) 57 | 58 | print("Initizlizing Optimizer") 59 | self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.01) 60 | self.optimizer.setup(self.model) 61 | 62 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 63 | hs = self.n_history 64 | ims = self.img_size 65 | self.replay_buffer = [np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 66 | np.zeros(self.data_size, dtype=np.uint8), 67 | np.zeros((self.data_size, 1), dtype=np.float32), 68 | np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 69 | np.zeros((self.data_size, 1), dtype=np.bool)] 70 | 71 | def get_loss(self, state, action, reward, state_prime, episode_end): 72 | s = Variable(state) 73 | s_dash = Variable(state_prime) 74 | 75 | q = self.model.q_function(s) # Get Q-value 76 | 77 | # Generate Target Signals 78 | tmp = self.model_target.q_function(s_dash) # Q(s',*) 79 | tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) 80 | max_q_prime = np.asanyarray(tmp, dtype=np.float32) 81 | target = np.asanyarray(copy.deepcopy(q.data), dtype=np.float32) 82 | 83 | for i in range(self.replay_size): 84 | if episode_end[i][0] is True: 85 | tmp_ = np.sign(reward[i]) 86 | else: 87 | # The sign of reward is used as the reward of DQN! 
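                # Non-terminal target: clipped reward plus the discounted value of the
                # best next action under the (periodically copied) target network,
                # i.e. sign(r) + gamma * max_a' Q_target(s', a').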
88 | tmp_ = np.sign(reward[i]) + self.gamma * max_q_prime[i] 89 | 90 | target[i, action[i]] = tmp_ 91 | #print(tmp_) 92 | 93 | #print(target) 94 | # TD-error clipping 95 | td = Variable(target) - q # TD error 96 | #print("TD ") 97 | #print(td.data) 98 | td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division 99 | td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) 100 | #print(np.round(td.data)) 101 | 102 | zero_val = Variable(np.zeros((self.replay_size, self.n_act), dtype=np.float32)) 103 | loss = F.mean_squared_error(td_clip, zero_val) 104 | return loss, q 105 | 106 | def stock_experience(self, time, 107 | state, action, reward, state_prime, 108 | episode_end_flag): 109 | data_index = time % self.data_size 110 | 111 | if episode_end_flag is True: 112 | self.replay_buffer[0][data_index] = state 113 | self.replay_buffer[1][data_index] = action 114 | self.replay_buffer[2][data_index] = reward 115 | else: 116 | self.replay_buffer[0][data_index] = state 117 | self.replay_buffer[1][data_index] = action 118 | self.replay_buffer[2][data_index] = reward 119 | self.replay_buffer[3][data_index] = state_prime 120 | self.replay_buffer[4][data_index] = episode_end_flag 121 | 122 | def experience_replay(self, time): 123 | 124 | if self.initial_exploration < time: 125 | # Pick up replay_size number of samples from the Data 126 | if time < self.data_size: # during the first sweep of the History Data 127 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 128 | else: 129 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 130 | 131 | hs = self.n_history 132 | ims = self.img_size 133 | rs = self.replay_size 134 | 135 | s_replay = np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 136 | a_replay = np.ndarray(shape=(rs, 1), dtype=np.int8) 137 | r_replay = np.ndarray(shape=(rs, 1), dtype=np.float32) 138 | s_dash_replay = np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 139 | episode_end_replay = np.ndarray(shape=(rs, 1), dtype=np.bool) 140 | for i in range(self.replay_size): 141 | s_replay[i] = np.asarray(self.replay_buffer[0][replay_index[i]], dtype=np.float32) 142 | a_replay[i] = self.replay_buffer[1][replay_index[i]] 143 | r_replay[i] = self.replay_buffer[2][replay_index[i]] 144 | s_dash_replay[i] = np.array(self.replay_buffer[3][replay_index[i]], dtype=np.float32) 145 | episode_end_replay[i] = self.replay_buffer[4][replay_index[i]] 146 | 147 | # Gradient-based update 148 | self.optimizer.zero_grads() 149 | loss, _ = self.get_loss(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 150 | loss.backward() 151 | self.optimizer.update() 152 | 153 | 154 | def action_sample_e_greedy(self, state, epsilon): 155 | s = Variable(state) 156 | q = self.model.q_function(s) 157 | q = q.data[0] 158 | 159 | if np.random.rand() < epsilon: 160 | action = np.random.randint(0, self.n_act) 161 | print("RANDOM : " + str(action)) 162 | else: 163 | a = np.argmax(q) 164 | print("GREEDY : " + str(a)) 165 | action = np.asarray(a, dtype=np.int8) 166 | print(q) 167 | return action, q 168 | 169 | def target_model_update(self, soft_update): 170 | if soft_update is True: 171 | tau = self.target_update_rate 172 | 173 | # Target preference Update 174 | model_params = dict(self.model.namedparams()) 175 | model_target_params = dict(self.model_target.namedparams()) 176 | for name in model_target_params: 177 | model_target_params[name].data = tau*model_params[name].data\ 178 | + (1 - tau)*model_target_params[name].data 179 | else: 180 | if 
np.mod(self.step, self.target_model_update_freq) == 0: 181 | self.model_target = copy.deepcopy(self.model) 182 | 183 | def learn(self, state, action, reward, state_prime, terminal): 184 | self.stock_experience(self.step, 185 | state, action, reward, state_prime, 186 | terminal) 187 | 188 | self.experience_replay(self.step) 189 | self.target_model_update(soft_update=False) 190 | 191 | self.step += 1 192 | 193 | 194 | class DQN_Agent: # RL-glue Process 195 | policyFrozen = False 196 | 197 | def __init__(self, env): 198 | 199 | self.epsilon = 1.0 # Initial exploratoin rate 200 | 201 | # Pick a DQN from DQN_class 202 | self.dqn = DQN(n_history=4, n_act=env.action_space.n) 203 | 204 | def start(self, observation): 205 | 206 | self.reset_state(observation) 207 | state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32) 208 | 209 | # Generate an Action e-greedy 210 | action, Q_now = self.dqn.action_sample_e_greedy(state_, self.epsilon) 211 | 212 | # Update for next step 213 | self.last_action = action 214 | self.last_state = copy.deepcopy(self.state) 215 | 216 | return action 217 | 218 | def act(self, observation, reward): 219 | 220 | self.set_state(observation) 221 | state_ = np.asanyarray(self.state.reshape(1, self.dqn.n_history, 84, 84), dtype=np.float32) 222 | 223 | # Exploration decays along the time sequence 224 | if self.policyFrozen is False: # Learning ON/OFF 225 | if self.dqn.initial_exploration < self.dqn.step: 226 | self.epsilon -= 1.0/10**6 227 | if self.epsilon < 0.1: 228 | self.epsilon = 0.1 229 | eps = self.epsilon 230 | else: # Initial Exploation Phase 231 | print("Initial Exploration : %d/%d steps" % (self.dqn.step, self.dqn.initial_exploration)) 232 | eps = 1.0 233 | else: # Evaluation 234 | print("Policy is Frozen") 235 | eps = 0.05 236 | 237 | # Generate an Action by e-greedy action selection 238 | action, Q_now = self.dqn.action_sample_e_greedy(state_, eps) 239 | 240 | # Learning Phase 241 | if self.policyFrozen is False: # Learning ON/OFF 242 | self.dqn.learn(self.last_state, self.last_action, reward, self.state, False) 243 | self.last_action = copy.deepcopy(action) 244 | self.last_state = self.state.copy() 245 | 246 | # Simple text based visualization 247 | print(' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.dqn.step, action, np.sign(reward), eps, np.max(Q_now))) 248 | 249 | return action 250 | 251 | def end(self, reward): # Episode Terminated 252 | 253 | # Learning Phase 254 | if self.policyFrozen is False: # Learning ON/OFF 255 | self.dqn.learn(self.last_state, self.last_action, reward, self.last_state, True) 256 | 257 | # Simple text based visualization 258 | print(' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon)) 259 | 260 | 261 | def reset_state(self, observation): 262 | # Preprocess 263 | obs_array = self.scale_image(observation) 264 | # Updates for next step 265 | self.last_observation = obs_array 266 | 267 | # Initialize State 268 | self.state = np.zeros((self.dqn.n_history, 84, 84), dtype=np.uint8) 269 | self.state[0] = obs_array 270 | 271 | def set_state(self, observation): 272 | # Preproces 273 | obs_array = self.scale_image(observation) 274 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 275 | 276 | """ 277 | print(obs_processed.max()) 278 | plt.imshow(obs_processed) 279 | plt.draw() 280 | plt.pause(0.0001) 281 | """ 282 | 283 | # Updates for the next step 284 | self.last_observation = obs_array 285 | 286 | # Compose State : 4-step sequential observation 
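        # Shift the frame history by one (the oldest frame is dropped) and append the
        # newest preprocessed observation, so self.state always holds the last
        # n_history (= 4) frames as a (4, 84, 84) uint8 array.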
287 | for i in range(self.dqn.n_history - 1): 288 | self.state[i] = self.state[i + 1].astype(np.uint8) 289 | self.state[self.dqn.n_history - 1] = obs_processed.astype(np.uint8) 290 | 291 | def scale_image(self, observation): 292 | img = self.rgb2gray(observation) # Convert RGB to Grayscale 293 | return (spm.imresize(img, (110, 84)))[110-84-8:110-8, :] # Scaling 294 | 295 | def rgb2gray(self, image): 296 | return np.dot(image[...,:3], [0.299, 0.587, 0.114]) 297 | 298 | def save(self): 299 | serializers.save_npz('network/model.model', self.dqn.model) 300 | serializers.save_npz('network/model_target.model', 301 | self.dqn.model_target) 302 | 303 | print("------------ Networks were SAVED ---------------") 304 | -------------------------------------------------------------------------------- /dqn_agent_nature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer and rlglue 4 | Copyright (c) 2015 Naoto Yoshida All Right Reserved. 5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, FunctionSet, Variable, optimizers 14 | import chainer.functions as F 15 | 16 | from rlglue.agent.Agent import Agent 17 | from rlglue.agent import AgentLoader as AgentLoader 18 | from rlglue.types import Action 19 | 20 | 21 | class DQN_class: 22 | # Hyper-Parameters 23 | gamma = 0.99 # Discount factor 24 | initial_exploration = 100#10**4 # Initial exploratoin. original: 5x10^4 25 | replay_size = 32 # Replay (batch) size 26 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 27 | data_size = 10**5 # Data size of history. original: 10^6 28 | 29 | def __init__(self, enable_controller=[0, 3, 4]): 30 | self.num_of_actions = len(enable_controller) 31 | self.enable_controller = enable_controller # Default setting : "Pong" 32 | 33 | print "Initializing DQN..." 34 | # Initialization of Chainer 1.1.0 or older. 
35 | # print "CUDA init" 36 | # cuda.init() 37 | 38 | print "Model Building" 39 | self.model = FunctionSet( 40 | l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)), 41 | l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)), 42 | l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)), 43 | l4=F.Linear(3136, 512, wscale=np.sqrt(2)), 44 | q_value=F.Linear(512, self.num_of_actions, 45 | initialW=np.zeros((self.num_of_actions, 512), 46 | dtype=np.float32)) 47 | ).to_gpu() 48 | 49 | self.model_target = copy.deepcopy(self.model) 50 | 51 | print "Initizlizing Optimizer" 52 | self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001) 53 | self.optimizer.setup(self.model.collect_parameters()) 54 | 55 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 56 | self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 57 | np.zeros(self.data_size, dtype=np.uint8), 58 | np.zeros((self.data_size, 1), dtype=np.int8), 59 | np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 60 | np.zeros((self.data_size, 1), dtype=np.bool)] 61 | 62 | def forward(self, state, action, Reward, state_dash, episode_end): 63 | num_of_batch = state.shape[0] 64 | s = Variable(state) 65 | s_dash = Variable(state_dash) 66 | 67 | Q = self.Q_func(s) # Get Q-value 68 | 69 | # Generate Target Signals 70 | tmp = self.Q_func_target(s_dash) # Q(s',*) 71 | tmp = list(map(np.max, tmp.data.get())) # max_a Q(s',a) 72 | max_Q_dash = np.asanyarray(tmp, dtype=np.float32) 73 | target = np.asanyarray(Q.data.get(), dtype=np.float32) 74 | 75 | for i in xrange(num_of_batch): 76 | if not episode_end[i][0]: 77 | tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i] 78 | else: 79 | tmp_ = np.sign(Reward[i]) 80 | 81 | action_index = self.action_to_index(action[i]) 82 | target[i, action_index] = tmp_ 83 | 84 | # TD-error clipping 85 | td = Variable(cuda.to_gpu(target)) - Q # TD error 86 | td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division 87 | td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) 88 | 89 | zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))) 90 | loss = F.mean_squared_error(td_clip, zero_val) 91 | return loss, Q 92 | 93 | def stockExperience(self, time, 94 | state, action, reward, state_dash, 95 | episode_end_flag): 96 | data_index = time % self.data_size 97 | 98 | if episode_end_flag is True: 99 | self.D[0][data_index] = state 100 | self.D[1][data_index] = action 101 | self.D[2][data_index] = reward 102 | else: 103 | self.D[0][data_index] = state 104 | self.D[1][data_index] = action 105 | self.D[2][data_index] = reward 106 | self.D[3][data_index] = state_dash 107 | self.D[4][data_index] = episode_end_flag 108 | 109 | def experienceReplay(self, time): 110 | 111 | if self.initial_exploration < time: 112 | # Pick up replay_size number of samples from the Data 113 | if time < self.data_size: # during the first sweep of the History Data 114 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 115 | else: 116 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 117 | 118 | s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 119 | a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8) 120 | r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32) 121 | s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 122 | 
episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool) 123 | for i in xrange(self.replay_size): 124 | s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32) 125 | a_replay[i] = self.D[1][replay_index[i]] 126 | r_replay[i] = self.D[2][replay_index[i]] 127 | s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32) 128 | episode_end_replay[i] = self.D[4][replay_index[i]] 129 | 130 | s_replay = cuda.to_gpu(s_replay) 131 | s_dash_replay = cuda.to_gpu(s_dash_replay) 132 | 133 | # Gradient-based update 134 | self.optimizer.zero_grads() 135 | loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 136 | loss.backward() 137 | self.optimizer.update() 138 | 139 | def Q_func(self, state): 140 | h1 = F.relu(self.model.l1(state / 255.0)) # scale inputs in [0.0 1.0] 141 | h2 = F.relu(self.model.l2(h1)) 142 | h3 = F.relu(self.model.l3(h2)) 143 | h4 = F.relu(self.model.l4(h3)) 144 | Q = self.model.q_value(h4) 145 | return Q 146 | 147 | def Q_func_target(self, state): 148 | h1 = F.relu(self.model_target.l1(state / 255.0)) # scale inputs in [0.0 1.0] 149 | h2 = F.relu(self.model_target.l2(h1)) 150 | h3 = F.relu(self.model_target.l3(h2)) 151 | h4 = F.relu(self.model_target.l4(h3)) 152 | Q = self.model_target.q_value(h4) 153 | return Q 154 | 155 | def e_greedy(self, state, epsilon): 156 | s = Variable(state) 157 | Q = self.Q_func(s) 158 | Q = Q.data 159 | 160 | if np.random.rand() < epsilon: 161 | index_action = np.random.randint(0, self.num_of_actions) 162 | print "RANDOM" 163 | else: 164 | index_action = np.argmax(Q.get()) 165 | print "GREEDY" 166 | return self.index_to_action(index_action), Q 167 | 168 | def target_model_update(self): 169 | self.model_target = copy.deepcopy(self.model) 170 | 171 | def index_to_action(self, index_of_action): 172 | return self.enable_controller[index_of_action] 173 | 174 | def action_to_index(self, action): 175 | return self.enable_controller.index(action) 176 | 177 | 178 | class dqn_agent(Agent): # RL-glue Process 179 | lastAction = Action() 180 | policyFrozen = False 181 | 182 | def agent_init(self, taskSpec): 183 | # Some initializations for rlglue 184 | self.lastAction = Action() 185 | 186 | self.time = 0 187 | self.epsilon = 1.0 # Initial exploratoin rate 188 | 189 | # Pick a DQN from DQN_class 190 | self.DQN = DQN_class() # default is for "Pong". 
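    # Exploration schedule used in agent_step: epsilon starts at 1.0, is held there
    # during the initial exploration phase (DQN.initial_exploration steps), and then
    # decays by 1.0/10**6 per learning step down to a floor of 0.1 -- i.e. about
    # 9*10**5 further steps until the final exploration rate is reached.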
191 | 192 | def agent_start(self, observation): 193 | 194 | # Preprocess 195 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 196 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 197 | 198 | # Initialize State 199 | self.state = np.zeros((4, 84, 84), dtype=np.uint8) 200 | self.state[0] = obs_array 201 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 202 | 203 | # Generate an Action e-greedy 204 | returnAction = Action() 205 | action, Q_now = self.DQN.e_greedy(state_, self.epsilon) 206 | returnAction.intArray = [action] 207 | 208 | # Update for next step 209 | self.lastAction = copy.deepcopy(returnAction) 210 | self.last_state = self.state.copy() 211 | self.last_observation = obs_array 212 | 213 | return returnAction 214 | 215 | def agent_step(self, reward, observation): 216 | 217 | # Preproces 218 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 219 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 220 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 221 | 222 | # Compose State : 4-step sequential observation 223 | self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8) 224 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 225 | 226 | # Exploration decays along the time sequence 227 | if self.policyFrozen is False: # Learning ON/OFF 228 | if self.DQN.initial_exploration < self.time: 229 | self.epsilon -= 1.0/10**6 230 | if self.epsilon < 0.1: 231 | self.epsilon = 0.1 232 | eps = self.epsilon 233 | else: # Initial Exploation Phase 234 | print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration) 235 | eps = 1.0 236 | else: # Evaluation 237 | print "Policy is Frozen" 238 | eps = 0.05 239 | 240 | # Generate an Action by e-greedy action selection 241 | returnAction = Action() 242 | action, Q_now = self.DQN.e_greedy(state_, eps) 243 | returnAction.intArray = [action] 244 | 245 | # Learning Phase 246 | if self.policyFrozen is False: # Learning ON/OFF 247 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False) 248 | self.DQN.experienceReplay(self.time) 249 | 250 | # Target model update 251 | if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0: 252 | print "########### MODEL UPDATED ######################" 253 | self.DQN.target_model_update() 254 | 255 | # Simple text based visualization 256 | print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get())) 257 | 258 | # Updates for next step 259 | self.last_observation = obs_array 260 | 261 | if self.policyFrozen is False: 262 | self.lastAction = copy.deepcopy(returnAction) 263 | self.last_state = self.state.copy() 264 | self.time += 1 265 | 266 | return returnAction 267 | 268 | def agent_end(self, reward): # Episode Terminated 269 | 270 | # Learning Phase 271 | if self.policyFrozen is False: # Learning ON/OFF 272 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.last_state, True) 273 | self.DQN.experienceReplay(self.time) 274 | 275 | # Target model update 276 | if 
self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0: 277 | print "########### MODEL UPDATED ######################" 278 | self.DQN.target_model_update() 279 | 280 | # Simple text based visualization 281 | print ' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon) 282 | 283 | # Time count 284 | if self.policyFrozen is False: 285 | self.time += 1 286 | 287 | def agent_cleanup(self): 288 | pass 289 | 290 | def agent_message(self, inMessage): 291 | if inMessage.startswith("freeze learning"): 292 | self.policyFrozen = True 293 | return "message understood, policy frozen" 294 | 295 | if inMessage.startswith("unfreeze learning"): 296 | self.policyFrozen = False 297 | return "message understood, policy unfrozen" 298 | 299 | if inMessage.startswith("save model"): 300 | with open('dqn_model.dat', 'w') as f: 301 | pickle.dump(self.DQN.model, f) 302 | return "message understood, model saved" 303 | 304 | if __name__ == "__main__": 305 | AgentLoader.loadAgent(dqn_agent()) 306 | --------------------------------------------------------------------------------
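A note on reusing the saved networks: the gym agents above store their parameters with chainer.serializers.save_npz under network/ (see DQN_Agent.save). A minimal re-loading sketch, assuming the CPU agent and the same Pong-v0 setup used in experiment_gym.py:

# Reload previously saved networks for evaluation (illustrative sketch).
import gym
from chainer import serializers
import dqn_agent_cpu as ag

env = gym.make('Pong-v0')
agent = ag.DQN_Agent(env)

# Restore the parameters written by agent.save() into network/.
serializers.load_npz('network/model.model', agent.dqn.model)
serializers.load_npz('network/model_target.model', agent.dqn.model_target)

# Evaluate with learning switched off (epsilon is fixed at 0.05 in act()).
agent.policyFrozen = True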