├── DQN-chainer-gym
│   ├── network
│   │   └── readme
│   ├── experiment_gym.py
│   ├── dqn_agent.py
│   └── dqn_agent_cpu.py
├── LICENSE
├── README.md
├── experiment_ale.py
├── readme.txt
├── dqn_agent_nips.py
└── dqn_agent_nature.py

/DQN-chainer-gym/network/readme:
--------------------------------------------------------------------------------
1 | Learned network is saved here.
2 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2015 Naoto Yoshida
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DQN-chainer
2 | 
3 | This software is a Python implementation of Deep Q-Networks (DQN) for playing ATARI games with the Chainer package.
4 | 
5 | I followed the implementations described in:
6 | * V. Mnih *et al*., "Playing Atari with deep reinforcement learning"
7 | 
8 | http://arxiv.org/pdf/1312.5602.pdf
9 | * V. Mnih *et al.*, "Human-level control through deep reinforcement learning"
10 | 
11 | http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html
12 | 
13 | For a Japanese introduction to DQN and a historical review, please check:
14 | 
15 | http://qiita.com/Ugo-Nama/items/08c6a5f6a571335972d5
16 | 
17 | # Requirement
18 | My implementation depends on RL-glue, the Arcade Learning Environment, and Chainer. To run the software, you need the following software/packages.
19 | 
20 | * Python 2.7+
21 | * Numpy
22 | * Scipy
23 | * Pillow (PIL)
24 | * Chainer (1.3.0): https://github.com/pfnet/chainer
25 | * RL-glue core: https://sites.google.com/a/rl-community.org/rl-glue/Home/rl-glue
26 | * RL-glue Python codec: https://sites.google.com/a/rl-community.org/rl-glue/Home/Extensions/python-codec
27 | * Arcade Learning Environment (version ALE 0.4.4): http://www.arcadelearningenvironment.org/
28 | 
29 | This software was tested on Ubuntu 14.04 LTS.
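Before launching an experiment, it may help to confirm that these packages are importable from the same Python 2.7 interpreter you will use. A minimal sketch (the module names follow the imports used in this repository; nothing here is required by the original code):

```python
# Quick dependency check (illustrative only).
import numpy
import scipy
import PIL               # Pillow
import chainer
import rlglue.RLGlue     # RL-glue Python codec

print "numpy  :", numpy.__version__
print "scipy  :", scipy.__version__
print "chainer:", chainer.__version__
```

If any import fails, install the missing package before starting rl_glue, the agent, and the experiment script.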
30 | 31 | # How to run 32 | Please check readme.txt 33 | 34 | -------------------------------------------------------------------------------- /experiment_ale.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple RL glue experiment setup 4 | """ 5 | 6 | import numpy as np 7 | import rlglue.RLGlue as RLGlue 8 | 9 | max_learningEpisode = 1000 10 | 11 | whichEpisode = 0 12 | learningEpisode = 0 13 | 14 | 15 | def runEpisode(is_learning_episode): 16 | global whichEpisode, learningEpisode 17 | 18 | RLGlue.RL_episode(0) 19 | totalSteps = RLGlue.RL_num_steps() 20 | totalReward = RLGlue.RL_return() 21 | 22 | whichEpisode += 1 23 | 24 | if is_learning_episode: 25 | learningEpisode += 1 26 | print "Episode " + str(learningEpisode) + "\t " + str(totalSteps) + " steps \t" + str(totalReward) + " total reward\t " 27 | else: 28 | print "Evaluation ::\t " + str(totalSteps) + " steps \t" + str(totalReward) + " total reward\t " 29 | 30 | 31 | # Main Program starts here 32 | print "\n\nDQN-ALE Experiment starting up!" 33 | RLGlue.RL_init() 34 | 35 | while learningEpisode < max_learningEpisode: 36 | # Evaluate model every 10 episodes 37 | if np.mod(whichEpisode, 10) == 0: 38 | print "Freeze learning for Evaluation" 39 | RLGlue.RL_agent_message("freeze learning") 40 | runEpisode(is_learning_episode=False) 41 | else: 42 | print "DQN is Learning" 43 | RLGlue.RL_agent_message("unfreeze learning") 44 | runEpisode(is_learning_episode=True) 45 | 46 | # Save model every 100 learning episodes 47 | if np.mod(learningEpisode, 100) == 0 and learningEpisode != 0: 48 | print "SAVE CURRENT MODEL" 49 | RLGlue.RL_agent_message("save model") 50 | 51 | RLGlue.RL_cleanup() 52 | 53 | print "Experiment COMPLETED @ Episode ", whichEpisode 54 | -------------------------------------------------------------------------------- /DQN-chainer-gym/experiment_gym.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple gym experiment setup 4 | """ 5 | 6 | 7 | import gym 8 | import dqn_agent as ag # for GPU experiment 9 | #import dqn_agent_cpu as ag # for CPU experiment 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import time 13 | 14 | # Generate an environment 15 | env = gym.make('Pong-v0') 16 | 17 | # Generate an agent 18 | agent = ag.DQN_Agent(env) 19 | 20 | eval_interval = 5 21 | num_episode = 10**5 22 | total_score = [] 23 | eval_steps = [] 24 | for i_episode in range(num_episode): 25 | observation = env.reset() 26 | terminal = False 27 | total_score_ = 0 28 | reward = 0.0 # initial reward is assumed to be zero 29 | step_in_episode = 0 30 | 31 | if np.mod(i_episode, eval_interval) == 0: 32 | # Learnin OFF evaluation 33 | agent.policyFrozen = True 34 | else: 35 | # Learning ON 36 | agent.policyFrozen = False 37 | 38 | while True: 39 | print(str(i_episode) + "-th episode") 40 | env.render() # Render the game 41 | 42 | if step_in_episode == 0: 43 | observation, reward, terminal, info = env.step(agent.start(observation)) # take an action 44 | else: 45 | observation, reward, terminal, info = env.step(agent.act(observation, reward)) # take an action 46 | 47 | total_score_ += reward 48 | step_in_episode += 1 49 | 50 | if terminal is True: 51 | agent.end(reward) 52 | break 53 | 54 | if np.mod(i_episode, eval_interval) == 0: 55 | total_score.append(total_score_) 56 | eval_steps.append(i_episode) 57 | print("REWARD@" + str(i_episode) + "-th episode : " + str(total_score_)) 58 
| 59 | plt.clf()
60 | plt.plot(eval_steps, total_score)
61 | plt.legend(["Total Score"])
62 | plt.savefig("result_plot.png")
63 | plt.draw()
64 | plt.pause(0.001)
65 | 
66 | # Save the current agent parameters
67 | agent.save()
68 | 
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
1 | ------------------------------------
2 | Requirement
3 | ------------------------------------
4 | This software is based on the following packages/software. Please install them before running the code.
5 | 
6 | * Python 2.7+
7 | * Numpy
8 | * Scipy
9 | * Chainer: https://github.com/pfnet/chainer
10 | * RL-glue core: https://sites.google.com/a/rl-community.org/rl-glue/Home/rl-glue
11 | * RL-glue Python codec: https://sites.google.com/a/rl-community.org/rl-glue/Home/Extensions/python-codec
12 | * Arcade Learning Environment (version ALE 0.4.4): http://www.arcadelearningenvironment.org/
13 | * NVIDIA GPU (This code was tested on a GeForce GTX 660 with Ubuntu 14.04 LTS)
14 | 
15 | Also, you may need binary ROMs of the ATARI games.
16 | I recommend running the examples in the RL-glue Python codec and in ALE before testing DQN.
17 | 
18 | ------------------------------------
19 | How to run
20 | ------------------------------------
21 | To run DQN, we just follow the standard RL-glue experiment setup.
22 | Concretely, we need to start the following processes:
23 | 
24 | * rl_glue
25 | * RLGlueAgent (dqn_agent_*.py)
26 | * RLGlueExperiment (experiment_ale.py)
27 | * ale (ALE 0.4.4)
28 | (So, you may need four terminal windows!)
29 | 
30 | The actual procedure will be:
31 | (first window: rlglue)
32 | rl_glue
33 | (second window: RLGlueAgent)
34 | python dqn_agent_nature.py
35 | (third window: RLGlueExperiment)
36 | python experiment_ale.py
37 | (fourth window: ALE)
38 | ./ale -game_controller rlglue -use_starting_actions true -random_seed time -display_screen true -frame_skip 4 path_to_roms/pong.bin
39 | 
40 | In the above example, we assume that the ROM binary ("Pong" in this case)
41 | is in the path_to_roms directory.
42 | 
43 | ------------------------------------
44 | Playing other games
45 | ------------------------------------
46 | The default setting of the code is for playing "Pong".
47 | To run other games, you need to modify a line in the "agent_init" function of the "dqn_agent" class.
48 | 
49 | To make DQN play "Breakout", we may set it as
50 | 
51 | (before modification) self.DQN = DQN_class()
52 | ( after modification) self.DQN = DQN_class(enable_controller=[0, 1, 3, 4])
53 | 
54 | "enable_controller" is the list of available actions of the agent.
55 | The minimal set of actions required for each game ROM is described
56 | in ale_0_4/src/games/supported/name_of_game.cpp,
57 | 
58 | and you can check the corresponding integer values in Section 8.1 of the ALE technical manual:
59 | 
60 | Technical Manual (you have the same manual in your ALE directory!): https://github.com/mgbellemare/Arcade-Learning-Environment/tree/master/doc/manual
61 | 
62 | ------------------------------------
63 | Modification of hyper-parameters
64 | ------------------------------------
65 | 
66 | If your machine does not have enough memory to run the full-size DQN,
67 | try setting the "data_size" variable to a much smaller value, like 2*10**4.
68 | This setting may reduce the final performance, but it still works well, at least in the "Pong" domain.
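For example, in the hyper-parameter block at the top of "DQN_class" in dqn_agent_nature.py
(the "DQN" class in the gym agents has the same attribute):

(before modification) data_size = 10**5
( after modification) data_size = 2*10**4

As a rough estimate, each stored transition keeps two stacks of four 84x84 uint8 frames
(2 x 4 x 84 x 84 = about 56 KB), so the replay memory needs several GB at data_size = 10**5
and roughly 1 GB at 2*10**4.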
69 | 70 | ------------------------------------ 71 | Copyright (c) 2015 Naoto Yoshida All Right Reserved. 72 | -------------------------------------------------------------------------------- /dqn_agent_nips.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer and rlglue 4 | Copyright (c) 2015 Naoto Yoshida All Right Reserved. 5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, FunctionSet, Variable, optimizers 14 | import chainer.functions as F 15 | 16 | from rlglue.agent.Agent import Agent 17 | from rlglue.agent import AgentLoader as AgentLoader 18 | from rlglue.types import Action 19 | 20 | 21 | class DQN_class: 22 | # Hyper-Parameters 23 | gamma = 0.99 # Discount factor 24 | initial_exploration = 10**4 # Initial exploratoin. original: 5x10^4 25 | replay_size = 32 # Replay (batch) size 26 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 27 | data_size = 10**5 # Data size of history. original: 10^6 28 | 29 | def __init__(self, enable_controller=[0, 3, 4]): 30 | self.num_of_actions = len(enable_controller) 31 | self.enable_controller = enable_controller # Default setting : "Pong" 32 | 33 | print "Initializing DQN..." 34 | # Initialization for Chainer 1.1.0 or older. 35 | # print "CUDA init" 36 | # cuda.init() 37 | 38 | print "Model Building" 39 | self.model = FunctionSet( 40 | l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)), 41 | l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)), 42 | l3=F.Linear(2592, 256), 43 | q_value=F.Linear(256, self.num_of_actions, 44 | initialW=np.zeros((self.num_of_actions, 256), 45 | dtype=np.float32)) 46 | ).to_gpu() 47 | 48 | print "Initizlizing Optimizer" 49 | self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2) 50 | self.optimizer.setup(self.model.collect_parameters()) 51 | 52 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 53 | self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 54 | np.zeros(self.data_size, dtype=np.uint8), 55 | np.zeros((self.data_size, 1), dtype=np.int8), 56 | np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 57 | np.zeros((self.data_size, 1), dtype=np.bool)] 58 | 59 | def forward(self, state, action, Reward, state_dash, episode_end): 60 | num_of_batch = state.shape[0] 61 | s = Variable(state) 62 | s_dash = Variable(state_dash) 63 | 64 | Q = self.Q_func(s) # Get Q-value 65 | 66 | # Generate Target Signals 67 | max_Q_dash_ = self.Q_func(s_dash) 68 | tmp = list(map(np.max, max_Q_dash_.data.get())) 69 | max_Q_dash = np.asanyarray(tmp, dtype=np.float32) 70 | target = np.asanyarray(Q.data.get(), dtype=np.float32) 71 | 72 | for i in xrange(num_of_batch): 73 | if not episode_end[i][0]: 74 | tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i] 75 | else: 76 | tmp_ = np.sign(Reward[i]) 77 | target[i, self.action_to_index(action[i])] = tmp_ 78 | 79 | loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q) 80 | return loss, Q 81 | 82 | def stockExperience(self, time, 83 | state, action, reward, state_dash, 84 | episode_end_flag): 85 | data_index = time % self.data_size 86 | 87 | if episode_end_flag is True: 88 | self.D[0][data_index] = state 89 | self.D[1][data_index] = action 90 | self.D[2][data_index] = reward 91 | else: 92 | self.D[0][data_index] = state 93 | self.D[1][data_index] = action 94 | self.D[2][data_index] = reward 95 | 
self.D[3][data_index] = state_dash 96 | self.D[4][data_index] = episode_end_flag 97 | 98 | def experienceReplay(self, time): 99 | 100 | if self.initial_exploration < time: 101 | # Pick up replay_size number of samples from the Data 102 | if time < self.data_size: # during the first sweep of the History Data 103 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 104 | else: 105 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 106 | 107 | s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 108 | a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8) 109 | r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32) 110 | s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 111 | episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool) 112 | for i in xrange(self.replay_size): 113 | s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32) 114 | a_replay[i] = self.D[1][replay_index[i]] 115 | r_replay[i] = self.D[2][replay_index[i]] 116 | s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32) 117 | episode_end_replay[i] = self.D[4][replay_index[i]] 118 | 119 | s_replay = cuda.to_gpu(s_replay) 120 | s_dash_replay = cuda.to_gpu(s_dash_replay) 121 | 122 | # Gradient-based update 123 | self.optimizer.zero_grads() 124 | loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 125 | loss.backward() 126 | self.optimizer.update() 127 | 128 | def Q_func(self, state): 129 | h1 = F.relu(self.model.l1(state / 254.0)) # scale inputs in [0.0, 1.0] 130 | h2 = F.relu(self.model.l2(h1)) 131 | h3 = F.relu(self.model.l3(h2)) 132 | Q = self.model.q_value(h3) 133 | return Q 134 | 135 | def e_greedy(self, state, epsilon): 136 | s = Variable(state) 137 | Q = self.Q_func(s) 138 | Q = Q.data 139 | 140 | if np.random.rand() < epsilon: 141 | index_action = np.random.randint(0, self.num_of_actions) 142 | print "RANDOM" 143 | else: 144 | index_action = np.argmax(Q.get()) 145 | print "GREEDY" 146 | 147 | return self.index_to_action(index_action), Q 148 | 149 | def index_to_action(self, index_of_action): 150 | return self.enable_controller[index_of_action] 151 | 152 | def action_to_index(self, action): 153 | return self.enable_controller.index(action) 154 | 155 | 156 | class dqn_agent(Agent): # RL-glue Process 157 | lastAction = Action() 158 | policyFrozen = False 159 | 160 | def agent_init(self, taskSpec): 161 | # Some initializations for rlglue 162 | self.lastAction = Action() 163 | 164 | self.time = 0 165 | self.epsilon = 1.0 # Initial exploratoin rate 166 | 167 | # Pick a DQN from DQN_class 168 | self.DQN = DQN_class() # Default is for "Pong". 
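    # Note (illustrative, following readme.txt): to play a game other than "Pong",
    # pass its minimal action set to DQN_class above, e.g. for "Breakout":
    #
    #     self.DQN = DQN_class(enable_controller=[0, 1, 3, 4])
    #
    # The integer code of each action is listed in Section 8.1 of the ALE technical manual.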
169 | 170 | def agent_start(self, observation): 171 | 172 | # Get intensity from current observation array 173 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 174 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 175 | 176 | # Initialize State 177 | self.state = np.zeros((4, 84, 84), dtype=np.uint8) 178 | self.state[0] = obs_array 179 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 180 | 181 | # Generate an Action e-greedy 182 | returnAction = Action() 183 | action, Q_now = self.DQN.e_greedy(state_, self.epsilon) 184 | returnAction.intArray = [action] 185 | 186 | # Update for next step 187 | self.lastAction = copy.deepcopy(returnAction) 188 | self.last_state = self.state.copy() 189 | self.last_observation = obs_array 190 | 191 | return returnAction 192 | 193 | def agent_step(self, reward, observation): 194 | 195 | # Preproces 196 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 197 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 198 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 199 | 200 | # Compose State : 4-step sequential observation 201 | self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8) 202 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 203 | 204 | # Exploration decays along the time sequence 205 | if self.policyFrozen is False: # Learning ON/OFF 206 | if self.DQN.initial_exploration < self.time: 207 | self.epsilon -= 1.0/10**6 208 | if self.epsilon < 0.1: 209 | self.epsilon = 0.1 210 | eps = self.epsilon 211 | else: # Initial Exploation Phase 212 | print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration) 213 | eps = 1.0 214 | else: # Evaluation 215 | print "Policy is Frozen" 216 | eps = 0.05 217 | 218 | # Generate an Action from e-greedy action selection 219 | returnAction = Action() 220 | action, Q_now = self.DQN.e_greedy(state_, eps) 221 | returnAction.intArray = [action] 222 | 223 | # Learning Phase 224 | if self.policyFrozen is False: # Learning ON/OFF 225 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False) 226 | self.DQN.experienceReplay(self.time) 227 | 228 | # Simple text based visualization 229 | print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get())) 230 | 231 | # Updates for next step 232 | self.last_observation = obs_array 233 | 234 | # Update for next step 235 | if self.policyFrozen is False: 236 | self.lastAction = copy.deepcopy(returnAction) 237 | self.last_state = self.state.copy() 238 | self.time += 1 239 | 240 | return returnAction 241 | 242 | def agent_end(self, reward): # Episode Terminated 243 | 244 | # Learning Phase 245 | if self.policyFrozen is False: # Learning ON/OFF 246 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.last_state, True) 247 | self.DQN.experienceReplay(self.time) 248 | 249 | # Simple text based visualization 250 | print ' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon) 251 | 252 | # Time count 253 | if not self.policyFrozen: 254 | self.time += 1 255 | 256 | def agent_cleanup(self): 257 | pass 
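    # The messages handled by agent_message below are sent by experiment_ale.py
    # through RL-glue, e.g.:
    #
    #     RLGlue.RL_agent_message("freeze learning")    # evaluation episode (no updates)
    #     RLGlue.RL_agent_message("unfreeze learning")  # learning episode
    #     RLGlue.RL_agent_message("save model")         # pickle the current network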
258 | 259 | def agent_message(self, inMessage): 260 | if inMessage.startswith("freeze learning"): 261 | self.policyFrozen = True 262 | return "message understood, policy frozen" 263 | 264 | if inMessage.startswith("unfreeze learning"): 265 | self.policyFrozen = False 266 | return "message understood, policy unfrozen" 267 | 268 | if inMessage.startswith("save model"): 269 | with open('dqn_model.dat', 'w') as f: 270 | pickle.dump(self.DQN.model, f) 271 | return "message understood, model saved" 272 | 273 | if __name__ == "__main__": 274 | AgentLoader.loadAgent(dqn_agent()) 275 | -------------------------------------------------------------------------------- /DQN-chainer-gym/dqn_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer for gym environment 4 | Copyright (c) 2016 Naoto Yoshida All Right Reserved. 5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, Function, Variable, optimizers, serializers 14 | from chainer import Chain 15 | import chainer.functions as F 16 | import chainer.links as L 17 | 18 | class ActionValue(Chain): 19 | def __init__(self, n_history, n_act): 20 | super(ActionValue, self).__init__( 21 | l1=F.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)), 22 | l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)), 23 | l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)), 24 | l4=F.Linear(3136, 512, wscale=np.sqrt(2)), 25 | q_value=F.Linear(512, n_act, 26 | initialW=np.zeros((n_act, 512), 27 | dtype=np.float32)) 28 | ) 29 | 30 | def q_function(self, state): 31 | h1 = F.relu(self.l1(state/255.)) 32 | h2 = F.relu(self.l2(h1)) 33 | h3 = F.relu(self.l3(h2)) 34 | h4 = F.relu(self.l4(h3)) 35 | return self.q_value(h4) 36 | 37 | 38 | class DQN: 39 | # Hyper-Parameters 40 | gamma = 0.99 # Discount factor 41 | initial_exploration = 10**4 # Initial exploratoin. original: 5x10^4 42 | replay_size = 32 # Replay (batch) size 43 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 44 | data_size = 10**5 # Data size of history. 
original: 10^6 45 | img_size = 84 # 84x84 image input (fixed) 46 | 47 | def __init__(self, n_history, n_act): 48 | print("Initializing DQN...") 49 | self.step = 0 # number of steps that DQN is updated 50 | self.n_act = n_act 51 | self.n_history = n_history # Number of obervations used to construct the single state 52 | 53 | print("Model Building") 54 | self.model = ActionValue(n_history, n_act).to_gpu() 55 | self.model_target = copy.deepcopy(self.model) 56 | 57 | print("Initizlizing Optimizer") 58 | self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.01) 59 | self.optimizer.setup(self.model) 60 | 61 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 62 | hs = self.n_history 63 | ims = self.img_size 64 | self.replay_buffer = [np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 65 | np.zeros(self.data_size, dtype=np.uint8), 66 | np.zeros((self.data_size, 1), dtype=np.float32), 67 | np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 68 | np.zeros((self.data_size, 1), dtype=np.bool)] 69 | 70 | def get_loss(self, state, action, reward, state_prime, episode_end): 71 | s = Variable(cuda.to_gpu(state)) 72 | s_dash = Variable(cuda.to_gpu(state_prime)) 73 | 74 | q = self.model.q_function(s) # Get Q-value 75 | 76 | # Generate Target Signals 77 | tmp = self.model_target.q_function(s_dash) # Q(s',*) 78 | tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) 79 | max_q_prime = np.asanyarray(tmp, dtype=np.float32) 80 | target = np.asanyarray(copy.deepcopy(q.data.get()), dtype=np.float32) 81 | 82 | for i in range(self.replay_size): 83 | if episode_end[i][0] is True: 84 | tmp_ = np.sign(reward[i]) 85 | else: 86 | # The sign of reward is used as the reward of DQN! 87 | tmp_ = np.sign(reward[i]) + self.gamma * max_q_prime[i] 88 | 89 | target[i, action[i]] = tmp_ 90 | 91 | # TD-error clipping 92 | td = Variable(cuda.to_gpu(target)) - q # TD error 93 | td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division 94 | td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) 95 | 96 | zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.n_act), dtype=np.float32))) 97 | loss = F.mean_squared_error(td_clip, zero_val) 98 | return loss, q 99 | 100 | def stock_experience(self, time, 101 | state, action, reward, state_prime, 102 | episode_end_flag): 103 | data_index = time % self.data_size 104 | 105 | if episode_end_flag is True: 106 | self.replay_buffer[0][data_index] = state 107 | self.replay_buffer[1][data_index] = action 108 | self.replay_buffer[2][data_index] = reward 109 | else: 110 | self.replay_buffer[0][data_index] = state 111 | self.replay_buffer[1][data_index] = action 112 | self.replay_buffer[2][data_index] = reward 113 | self.replay_buffer[3][data_index] = state_prime 114 | self.replay_buffer[4][data_index] = episode_end_flag 115 | 116 | def experience_replay(self, time): 117 | 118 | if self.initial_exploration < time: 119 | # Pick up replay_size number of samples from the Data 120 | if time < self.data_size: # during the first sweep of the History Data 121 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 122 | else: 123 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 124 | 125 | hs = self.n_history 126 | ims = self.img_size 127 | rs = self.replay_size 128 | 129 | s_replay = np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 130 | a_replay = np.ndarray(shape=(rs, 1), dtype=np.int8) 131 | r_replay = np.ndarray(shape=(rs, 1), dtype=np.float32) 132 | s_dash_replay = 
np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 133 | episode_end_replay = np.ndarray(shape=(rs, 1), dtype=np.bool) 134 | for i in range(self.replay_size): 135 | s_replay[i] = np.asarray(self.replay_buffer[0][replay_index[i]], dtype=np.float32) 136 | a_replay[i] = self.replay_buffer[1][replay_index[i]] 137 | r_replay[i] = self.replay_buffer[2][replay_index[i]] 138 | s_dash_replay[i] = np.array(self.replay_buffer[3][replay_index[i]], dtype=np.float32) 139 | episode_end_replay[i] = self.replay_buffer[4][replay_index[i]] 140 | 141 | # Gradient-based update 142 | self.optimizer.zero_grads() 143 | loss, _ = self.get_loss(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 144 | loss.backward() 145 | self.optimizer.update() 146 | 147 | 148 | def action_sample_e_greedy(self, state, epsilon): 149 | s = Variable(cuda.to_gpu(state)) 150 | q = self.model.q_function(s) 151 | q = q.data.get()[0] 152 | 153 | if np.random.rand() < epsilon: 154 | action = np.random.randint(0, self.n_act) 155 | print("RANDOM : " + str(action)) 156 | else: 157 | a = np.argmax(q) 158 | print("GREEDY : " + str(a)) 159 | action = np.asarray(a, dtype=np.int8) 160 | print(q) 161 | return action, q 162 | 163 | def target_model_update(self, soft_update): 164 | if soft_update is True: 165 | tau = self.target_update_rate 166 | 167 | # Target preference Update 168 | model_params = dict(self.model.namedparams()) 169 | model_target_params = dict(self.model_target.namedparams()) 170 | for name in model_target_params: 171 | model_target_params[name].data = tau*model_params[name].data\ 172 | + (1 - tau)*model_target_params[name].data 173 | else: 174 | if np.mod(self.step, self.target_model_update_freq) == 0: 175 | self.model_target = copy.deepcopy(self.model) 176 | 177 | def learn(self, state, action, reward, state_prime, terminal): 178 | self.stock_experience(self.step, 179 | state, action, reward, state_prime, 180 | terminal) 181 | 182 | self.experience_replay(self.step) 183 | self.target_model_update(soft_update=False) 184 | 185 | self.step += 1 186 | 187 | 188 | class DQN_Agent: # RL-glue Process 189 | policyFrozen = False 190 | 191 | def __init__(self, env): 192 | 193 | self.epsilon = 1.0 # Initial exploratoin rate 194 | 195 | # Pick a DQN from DQN_class 196 | self.dqn = DQN(n_history=4, n_act=env.action_space.n) 197 | 198 | def start(self, observation): 199 | 200 | self.reset_state(observation) 201 | state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32) 202 | 203 | # Generate an Action e-greedy 204 | action, Q_now = self.dqn.action_sample_e_greedy(state_, self.epsilon) 205 | 206 | # Update for next step 207 | self.last_action = action 208 | self.last_state = copy.deepcopy(self.state) 209 | 210 | return action 211 | 212 | def act(self, observation, reward): 213 | 214 | self.set_state(observation) 215 | state_ = np.asanyarray(self.state.reshape(1, self.dqn.n_history, 84, 84), dtype=np.float32) 216 | 217 | # Exploration decays along the time sequence 218 | if self.policyFrozen is False: # Learning ON/OFF 219 | if self.dqn.initial_exploration < self.dqn.step: 220 | self.epsilon -= 1.0/10**6 221 | if self.epsilon < 0.1: 222 | self.epsilon = 0.1 223 | eps = self.epsilon 224 | else: # Initial Exploation Phase 225 | print("Initial Exploration : %d/%d steps" % (self.dqn.step, self.dqn.initial_exploration)) 226 | eps = 1.0 227 | else: # Evaluation 228 | print("Policy is Frozen") 229 | eps = 0.05 230 | 231 | # Generate an Action by e-greedy action selection 232 | action, Q_now = 
self.dqn.action_sample_e_greedy(state_, eps) 233 | 234 | # Learning Phase 235 | if self.policyFrozen is False: # Learning ON/OFF 236 | self.dqn.learn(self.last_state, self.last_action, reward, self.state, False) 237 | self.last_action = copy.deepcopy(action) 238 | self.last_state = self.state.copy() 239 | 240 | # Simple text based visualization 241 | print(' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.dqn.step, action, np.sign(reward), eps, np.max(Q_now))) 242 | 243 | return action 244 | 245 | def end(self, reward): # Episode Terminated 246 | 247 | # Learning Phase 248 | if self.policyFrozen is False: # Learning ON/OFF 249 | self.dqn.learn(self.last_state, self.last_action, reward, self.last_state, True) 250 | 251 | # Simple text based visualization 252 | print(' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon)) 253 | 254 | 255 | def reset_state(self, observation): 256 | # Preprocess 257 | obs_array = self.scale_image(observation) 258 | # Updates for next step 259 | self.last_observation = obs_array 260 | 261 | # Initialize State 262 | self.state = np.zeros((self.dqn.n_history, 84, 84), dtype=np.uint8) 263 | self.state[0] = obs_array 264 | 265 | def set_state(self, observation): 266 | # Preproces 267 | obs_array = self.scale_image(observation) 268 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 269 | 270 | # Updates for the next step 271 | self.last_observation = obs_array 272 | 273 | # Compose State : 4-step sequential observation 274 | for i in range(self.dqn.n_history - 1): 275 | self.state[i] = self.state[i + 1].astype(np.uint8) 276 | self.state[self.dqn.n_history - 1] = obs_processed.astype(np.uint8) 277 | 278 | def scale_image(self, observation): 279 | img = self.rgb2gray(observation) # Convert RGB to Grayscale 280 | return (spm.imresize(img, (110, 84)))[110-84-8:110-8, :] # Scaling 281 | 282 | def rgb2gray(self, image): 283 | return np.dot(image[...,:3], [0.299, 0.587, 0.114]) 284 | 285 | def save(self): 286 | serializers.save_npz('network/model.model', self.dqn.model) 287 | serializers.save_npz('network/model_target.model', 288 | self.dqn.model_target) 289 | 290 | print("------------ Networks were SAVED ---------------") 291 | -------------------------------------------------------------------------------- /DQN-chainer-gym/dqn_agent_cpu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer for gym environment 4 | Copyright (c) 2016 Naoto Yoshida All Right Reserved. 
5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, Function, Variable, optimizers, serializers 14 | from chainer import Chain 15 | import chainer.functions as F 16 | import chainer.links as L 17 | 18 | import matplotlib.pyplot as plt 19 | 20 | class ActionValue(Chain): 21 | def __init__(self, n_history, n_act): 22 | super(ActionValue, self).__init__( 23 | l1=F.Convolution2D(n_history, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)), 24 | l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)), 25 | l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)), 26 | l4=F.Linear(3136, 512),#, wscale=np.sqrt(2)), 27 | q_value=F.Linear(512, n_act, 28 | initialW=0.0*np.random.randn(n_act, 512).astype(np.float32)) 29 | ) 30 | 31 | def q_function(self, state): 32 | h1 = F.relu(self.l1(state/255.)) 33 | h2 = F.relu(self.l2(h1)) 34 | h3 = F.relu(self.l3(h2)) 35 | h4 = F.relu(self.l4(h3)) 36 | return self.q_value(h4) 37 | 38 | 39 | class DQN: 40 | # Hyper-Parameters 41 | gamma = 0.99 # Discount factor 42 | initial_exploration = 10**4 # Initial exploratoin. original: 5x10^4 43 | replay_size = 32 # Replay (batch) size 44 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 45 | data_size = 10**5 # Data size of history. original: 10^6 46 | img_size = 84 # 84x84 image input (fixed) 47 | 48 | def __init__(self, n_history, n_act): 49 | print("Initializing DQN...") 50 | self.step = 0 # number of steps that DQN is updated 51 | self.n_act = n_act 52 | self.n_history = n_history # Number of obervations used to construct the single state 53 | 54 | print("Model Building") 55 | self.model = ActionValue(n_history, n_act) 56 | self.model_target = copy.deepcopy(self.model) 57 | 58 | print("Initizlizing Optimizer") 59 | self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.01) 60 | self.optimizer.setup(self.model) 61 | 62 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 63 | hs = self.n_history 64 | ims = self.img_size 65 | self.replay_buffer = [np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 66 | np.zeros(self.data_size, dtype=np.uint8), 67 | np.zeros((self.data_size, 1), dtype=np.float32), 68 | np.zeros((self.data_size, hs, ims, ims), dtype=np.uint8), 69 | np.zeros((self.data_size, 1), dtype=np.bool)] 70 | 71 | def get_loss(self, state, action, reward, state_prime, episode_end): 72 | s = Variable(state) 73 | s_dash = Variable(state_prime) 74 | 75 | q = self.model.q_function(s) # Get Q-value 76 | 77 | # Generate Target Signals 78 | tmp = self.model_target.q_function(s_dash) # Q(s',*) 79 | tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) 80 | max_q_prime = np.asanyarray(tmp, dtype=np.float32) 81 | target = np.asanyarray(copy.deepcopy(q.data), dtype=np.float32) 82 | 83 | for i in range(self.replay_size): 84 | if episode_end[i][0] is True: 85 | tmp_ = np.sign(reward[i]) 86 | else: 87 | # The sign of reward is used as the reward of DQN! 
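                # Non-terminal target: clipped reward plus the discounted value of the
                # best next action under the (periodically copied) target network,
                # i.e. sign(r) + gamma * max_a' Q_target(s', a').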
88 | tmp_ = np.sign(reward[i]) + self.gamma * max_q_prime[i] 89 | 90 | target[i, action[i]] = tmp_ 91 | #print(tmp_) 92 | 93 | #print(target) 94 | # TD-error clipping 95 | td = Variable(target) - q # TD error 96 | #print("TD ") 97 | #print(td.data) 98 | td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division 99 | td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) 100 | #print(np.round(td.data)) 101 | 102 | zero_val = Variable(np.zeros((self.replay_size, self.n_act), dtype=np.float32)) 103 | loss = F.mean_squared_error(td_clip, zero_val) 104 | return loss, q 105 | 106 | def stock_experience(self, time, 107 | state, action, reward, state_prime, 108 | episode_end_flag): 109 | data_index = time % self.data_size 110 | 111 | if episode_end_flag is True: 112 | self.replay_buffer[0][data_index] = state 113 | self.replay_buffer[1][data_index] = action 114 | self.replay_buffer[2][data_index] = reward 115 | else: 116 | self.replay_buffer[0][data_index] = state 117 | self.replay_buffer[1][data_index] = action 118 | self.replay_buffer[2][data_index] = reward 119 | self.replay_buffer[3][data_index] = state_prime 120 | self.replay_buffer[4][data_index] = episode_end_flag 121 | 122 | def experience_replay(self, time): 123 | 124 | if self.initial_exploration < time: 125 | # Pick up replay_size number of samples from the Data 126 | if time < self.data_size: # during the first sweep of the History Data 127 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 128 | else: 129 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 130 | 131 | hs = self.n_history 132 | ims = self.img_size 133 | rs = self.replay_size 134 | 135 | s_replay = np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 136 | a_replay = np.ndarray(shape=(rs, 1), dtype=np.int8) 137 | r_replay = np.ndarray(shape=(rs, 1), dtype=np.float32) 138 | s_dash_replay = np.ndarray(shape=(rs, hs, ims, ims), dtype=np.float32) 139 | episode_end_replay = np.ndarray(shape=(rs, 1), dtype=np.bool) 140 | for i in range(self.replay_size): 141 | s_replay[i] = np.asarray(self.replay_buffer[0][replay_index[i]], dtype=np.float32) 142 | a_replay[i] = self.replay_buffer[1][replay_index[i]] 143 | r_replay[i] = self.replay_buffer[2][replay_index[i]] 144 | s_dash_replay[i] = np.array(self.replay_buffer[3][replay_index[i]], dtype=np.float32) 145 | episode_end_replay[i] = self.replay_buffer[4][replay_index[i]] 146 | 147 | # Gradient-based update 148 | self.optimizer.zero_grads() 149 | loss, _ = self.get_loss(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 150 | loss.backward() 151 | self.optimizer.update() 152 | 153 | 154 | def action_sample_e_greedy(self, state, epsilon): 155 | s = Variable(state) 156 | q = self.model.q_function(s) 157 | q = q.data[0] 158 | 159 | if np.random.rand() < epsilon: 160 | action = np.random.randint(0, self.n_act) 161 | print("RANDOM : " + str(action)) 162 | else: 163 | a = np.argmax(q) 164 | print("GREEDY : " + str(a)) 165 | action = np.asarray(a, dtype=np.int8) 166 | print(q) 167 | return action, q 168 | 169 | def target_model_update(self, soft_update): 170 | if soft_update is True: 171 | tau = self.target_update_rate 172 | 173 | # Target preference Update 174 | model_params = dict(self.model.namedparams()) 175 | model_target_params = dict(self.model_target.namedparams()) 176 | for name in model_target_params: 177 | model_target_params[name].data = tau*model_params[name].data\ 178 | + (1 - tau)*model_target_params[name].data 179 | else: 180 | if 
np.mod(self.step, self.target_model_update_freq) == 0: 181 | self.model_target = copy.deepcopy(self.model) 182 | 183 | def learn(self, state, action, reward, state_prime, terminal): 184 | self.stock_experience(self.step, 185 | state, action, reward, state_prime, 186 | terminal) 187 | 188 | self.experience_replay(self.step) 189 | self.target_model_update(soft_update=False) 190 | 191 | self.step += 1 192 | 193 | 194 | class DQN_Agent: # RL-glue Process 195 | policyFrozen = False 196 | 197 | def __init__(self, env): 198 | 199 | self.epsilon = 1.0 # Initial exploratoin rate 200 | 201 | # Pick a DQN from DQN_class 202 | self.dqn = DQN(n_history=4, n_act=env.action_space.n) 203 | 204 | def start(self, observation): 205 | 206 | self.reset_state(observation) 207 | state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32) 208 | 209 | # Generate an Action e-greedy 210 | action, Q_now = self.dqn.action_sample_e_greedy(state_, self.epsilon) 211 | 212 | # Update for next step 213 | self.last_action = action 214 | self.last_state = copy.deepcopy(self.state) 215 | 216 | return action 217 | 218 | def act(self, observation, reward): 219 | 220 | self.set_state(observation) 221 | state_ = np.asanyarray(self.state.reshape(1, self.dqn.n_history, 84, 84), dtype=np.float32) 222 | 223 | # Exploration decays along the time sequence 224 | if self.policyFrozen is False: # Learning ON/OFF 225 | if self.dqn.initial_exploration < self.dqn.step: 226 | self.epsilon -= 1.0/10**6 227 | if self.epsilon < 0.1: 228 | self.epsilon = 0.1 229 | eps = self.epsilon 230 | else: # Initial Exploation Phase 231 | print("Initial Exploration : %d/%d steps" % (self.dqn.step, self.dqn.initial_exploration)) 232 | eps = 1.0 233 | else: # Evaluation 234 | print("Policy is Frozen") 235 | eps = 0.05 236 | 237 | # Generate an Action by e-greedy action selection 238 | action, Q_now = self.dqn.action_sample_e_greedy(state_, eps) 239 | 240 | # Learning Phase 241 | if self.policyFrozen is False: # Learning ON/OFF 242 | self.dqn.learn(self.last_state, self.last_action, reward, self.state, False) 243 | self.last_action = copy.deepcopy(action) 244 | self.last_state = self.state.copy() 245 | 246 | # Simple text based visualization 247 | print(' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.dqn.step, action, np.sign(reward), eps, np.max(Q_now))) 248 | 249 | return action 250 | 251 | def end(self, reward): # Episode Terminated 252 | 253 | # Learning Phase 254 | if self.policyFrozen is False: # Learning ON/OFF 255 | self.dqn.learn(self.last_state, self.last_action, reward, self.last_state, True) 256 | 257 | # Simple text based visualization 258 | print(' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon)) 259 | 260 | 261 | def reset_state(self, observation): 262 | # Preprocess 263 | obs_array = self.scale_image(observation) 264 | # Updates for next step 265 | self.last_observation = obs_array 266 | 267 | # Initialize State 268 | self.state = np.zeros((self.dqn.n_history, 84, 84), dtype=np.uint8) 269 | self.state[0] = obs_array 270 | 271 | def set_state(self, observation): 272 | # Preproces 273 | obs_array = self.scale_image(observation) 274 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 275 | 276 | """ 277 | print(obs_processed.max()) 278 | plt.imshow(obs_processed) 279 | plt.draw() 280 | plt.pause(0.0001) 281 | """ 282 | 283 | # Updates for the next step 284 | self.last_observation = obs_array 285 | 286 | # Compose State : 4-step sequential observation 
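        # Shift the frame history by one (the oldest frame is dropped) and append the
        # newest preprocessed observation, so self.state always holds the last
        # n_history (= 4) frames as a (4, 84, 84) uint8 array.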
287 | for i in range(self.dqn.n_history - 1): 288 | self.state[i] = self.state[i + 1].astype(np.uint8) 289 | self.state[self.dqn.n_history - 1] = obs_processed.astype(np.uint8) 290 | 291 | def scale_image(self, observation): 292 | img = self.rgb2gray(observation) # Convert RGB to Grayscale 293 | return (spm.imresize(img, (110, 84)))[110-84-8:110-8, :] # Scaling 294 | 295 | def rgb2gray(self, image): 296 | return np.dot(image[...,:3], [0.299, 0.587, 0.114]) 297 | 298 | def save(self): 299 | serializers.save_npz('network/model.model', self.dqn.model) 300 | serializers.save_npz('network/model_target.model', 301 | self.dqn.model_target) 302 | 303 | print("------------ Networks were SAVED ---------------") 304 | -------------------------------------------------------------------------------- /dqn_agent_nature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Deep Q-network implementation with chainer and rlglue 4 | Copyright (c) 2015 Naoto Yoshida All Right Reserved. 5 | """ 6 | 7 | import copy 8 | 9 | import pickle 10 | import numpy as np 11 | import scipy.misc as spm 12 | 13 | from chainer import cuda, FunctionSet, Variable, optimizers 14 | import chainer.functions as F 15 | 16 | from rlglue.agent.Agent import Agent 17 | from rlglue.agent import AgentLoader as AgentLoader 18 | from rlglue.types import Action 19 | 20 | 21 | class DQN_class: 22 | # Hyper-Parameters 23 | gamma = 0.99 # Discount factor 24 | initial_exploration = 100#10**4 # Initial exploratoin. original: 5x10^4 25 | replay_size = 32 # Replay (batch) size 26 | target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 27 | data_size = 10**5 # Data size of history. original: 10^6 28 | 29 | def __init__(self, enable_controller=[0, 3, 4]): 30 | self.num_of_actions = len(enable_controller) 31 | self.enable_controller = enable_controller # Default setting : "Pong" 32 | 33 | print "Initializing DQN..." 34 | # Initialization of Chainer 1.1.0 or older. 
35 | # print "CUDA init" 36 | # cuda.init() 37 | 38 | print "Model Building" 39 | self.model = FunctionSet( 40 | l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)), 41 | l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)), 42 | l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)), 43 | l4=F.Linear(3136, 512, wscale=np.sqrt(2)), 44 | q_value=F.Linear(512, self.num_of_actions, 45 | initialW=np.zeros((self.num_of_actions, 512), 46 | dtype=np.float32)) 47 | ).to_gpu() 48 | 49 | self.model_target = copy.deepcopy(self.model) 50 | 51 | print "Initizlizing Optimizer" 52 | self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001) 53 | self.optimizer.setup(self.model.collect_parameters()) 54 | 55 | # History Data : D=[s, a, r, s_dash, end_episode_flag] 56 | self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 57 | np.zeros(self.data_size, dtype=np.uint8), 58 | np.zeros((self.data_size, 1), dtype=np.int8), 59 | np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8), 60 | np.zeros((self.data_size, 1), dtype=np.bool)] 61 | 62 | def forward(self, state, action, Reward, state_dash, episode_end): 63 | num_of_batch = state.shape[0] 64 | s = Variable(state) 65 | s_dash = Variable(state_dash) 66 | 67 | Q = self.Q_func(s) # Get Q-value 68 | 69 | # Generate Target Signals 70 | tmp = self.Q_func_target(s_dash) # Q(s',*) 71 | tmp = list(map(np.max, tmp.data.get())) # max_a Q(s',a) 72 | max_Q_dash = np.asanyarray(tmp, dtype=np.float32) 73 | target = np.asanyarray(Q.data.get(), dtype=np.float32) 74 | 75 | for i in xrange(num_of_batch): 76 | if not episode_end[i][0]: 77 | tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i] 78 | else: 79 | tmp_ = np.sign(Reward[i]) 80 | 81 | action_index = self.action_to_index(action[i]) 82 | target[i, action_index] = tmp_ 83 | 84 | # TD-error clipping 85 | td = Variable(cuda.to_gpu(target)) - Q # TD error 86 | td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division 87 | td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) 88 | 89 | zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))) 90 | loss = F.mean_squared_error(td_clip, zero_val) 91 | return loss, Q 92 | 93 | def stockExperience(self, time, 94 | state, action, reward, state_dash, 95 | episode_end_flag): 96 | data_index = time % self.data_size 97 | 98 | if episode_end_flag is True: 99 | self.D[0][data_index] = state 100 | self.D[1][data_index] = action 101 | self.D[2][data_index] = reward 102 | else: 103 | self.D[0][data_index] = state 104 | self.D[1][data_index] = action 105 | self.D[2][data_index] = reward 106 | self.D[3][data_index] = state_dash 107 | self.D[4][data_index] = episode_end_flag 108 | 109 | def experienceReplay(self, time): 110 | 111 | if self.initial_exploration < time: 112 | # Pick up replay_size number of samples from the Data 113 | if time < self.data_size: # during the first sweep of the History Data 114 | replay_index = np.random.randint(0, time, (self.replay_size, 1)) 115 | else: 116 | replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) 117 | 118 | s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 119 | a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8) 120 | r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32) 121 | s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32) 122 | 
episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool) 123 | for i in xrange(self.replay_size): 124 | s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32) 125 | a_replay[i] = self.D[1][replay_index[i]] 126 | r_replay[i] = self.D[2][replay_index[i]] 127 | s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32) 128 | episode_end_replay[i] = self.D[4][replay_index[i]] 129 | 130 | s_replay = cuda.to_gpu(s_replay) 131 | s_dash_replay = cuda.to_gpu(s_dash_replay) 132 | 133 | # Gradient-based update 134 | self.optimizer.zero_grads() 135 | loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) 136 | loss.backward() 137 | self.optimizer.update() 138 | 139 | def Q_func(self, state): 140 | h1 = F.relu(self.model.l1(state / 255.0)) # scale inputs in [0.0 1.0] 141 | h2 = F.relu(self.model.l2(h1)) 142 | h3 = F.relu(self.model.l3(h2)) 143 | h4 = F.relu(self.model.l4(h3)) 144 | Q = self.model.q_value(h4) 145 | return Q 146 | 147 | def Q_func_target(self, state): 148 | h1 = F.relu(self.model_target.l1(state / 255.0)) # scale inputs in [0.0 1.0] 149 | h2 = F.relu(self.model_target.l2(h1)) 150 | h3 = F.relu(self.model_target.l3(h2)) 151 | h4 = F.relu(self.model_target.l4(h3)) 152 | Q = self.model_target.q_value(h4) 153 | return Q 154 | 155 | def e_greedy(self, state, epsilon): 156 | s = Variable(state) 157 | Q = self.Q_func(s) 158 | Q = Q.data 159 | 160 | if np.random.rand() < epsilon: 161 | index_action = np.random.randint(0, self.num_of_actions) 162 | print "RANDOM" 163 | else: 164 | index_action = np.argmax(Q.get()) 165 | print "GREEDY" 166 | return self.index_to_action(index_action), Q 167 | 168 | def target_model_update(self): 169 | self.model_target = copy.deepcopy(self.model) 170 | 171 | def index_to_action(self, index_of_action): 172 | return self.enable_controller[index_of_action] 173 | 174 | def action_to_index(self, action): 175 | return self.enable_controller.index(action) 176 | 177 | 178 | class dqn_agent(Agent): # RL-glue Process 179 | lastAction = Action() 180 | policyFrozen = False 181 | 182 | def agent_init(self, taskSpec): 183 | # Some initializations for rlglue 184 | self.lastAction = Action() 185 | 186 | self.time = 0 187 | self.epsilon = 1.0 # Initial exploratoin rate 188 | 189 | # Pick a DQN from DQN_class 190 | self.DQN = DQN_class() # default is for "Pong". 
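    # Exploration schedule used in agent_step: epsilon starts at 1.0, is held there
    # during the initial exploration phase (DQN.initial_exploration steps), and then
    # decays by 1.0/10**6 per learning step down to a floor of 0.1 -- i.e. about
    # 9*10**5 further steps until the final exploration rate is reached.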
191 | 192 | def agent_start(self, observation): 193 | 194 | # Preprocess 195 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 196 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 197 | 198 | # Initialize State 199 | self.state = np.zeros((4, 84, 84), dtype=np.uint8) 200 | self.state[0] = obs_array 201 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 202 | 203 | # Generate an Action e-greedy 204 | returnAction = Action() 205 | action, Q_now = self.DQN.e_greedy(state_, self.epsilon) 206 | returnAction.intArray = [action] 207 | 208 | # Update for next step 209 | self.lastAction = copy.deepcopy(returnAction) 210 | self.last_state = self.state.copy() 211 | self.last_observation = obs_array 212 | 213 | return returnAction 214 | 215 | def agent_step(self, reward, observation): 216 | 217 | # Preproces 218 | tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111) # Get Intensity from the observation 219 | obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :] # Scaling 220 | obs_processed = np.maximum(obs_array, self.last_observation) # Take maximum from two frames 221 | 222 | # Compose State : 4-step sequential observation 223 | self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8) 224 | state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)) 225 | 226 | # Exploration decays along the time sequence 227 | if self.policyFrozen is False: # Learning ON/OFF 228 | if self.DQN.initial_exploration < self.time: 229 | self.epsilon -= 1.0/10**6 230 | if self.epsilon < 0.1: 231 | self.epsilon = 0.1 232 | eps = self.epsilon 233 | else: # Initial Exploation Phase 234 | print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration) 235 | eps = 1.0 236 | else: # Evaluation 237 | print "Policy is Frozen" 238 | eps = 0.05 239 | 240 | # Generate an Action by e-greedy action selection 241 | returnAction = Action() 242 | action, Q_now = self.DQN.e_greedy(state_, eps) 243 | returnAction.intArray = [action] 244 | 245 | # Learning Phase 246 | if self.policyFrozen is False: # Learning ON/OFF 247 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False) 248 | self.DQN.experienceReplay(self.time) 249 | 250 | # Target model update 251 | if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0: 252 | print "########### MODEL UPDATED ######################" 253 | self.DQN.target_model_update() 254 | 255 | # Simple text based visualization 256 | print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get())) 257 | 258 | # Updates for next step 259 | self.last_observation = obs_array 260 | 261 | if self.policyFrozen is False: 262 | self.lastAction = copy.deepcopy(returnAction) 263 | self.last_state = self.state.copy() 264 | self.time += 1 265 | 266 | return returnAction 267 | 268 | def agent_end(self, reward): # Episode Terminated 269 | 270 | # Learning Phase 271 | if self.policyFrozen is False: # Learning ON/OFF 272 | self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.last_state, True) 273 | self.DQN.experienceReplay(self.time) 274 | 275 | # Target model update 276 | if 
self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0: 277 | print "########### MODEL UPDATED ######################" 278 | self.DQN.target_model_update() 279 | 280 | # Simple text based visualization 281 | print ' REWARD %.1f / EPSILON %.5f' % (np.sign(reward), self.epsilon) 282 | 283 | # Time count 284 | if self.policyFrozen is False: 285 | self.time += 1 286 | 287 | def agent_cleanup(self): 288 | pass 289 | 290 | def agent_message(self, inMessage): 291 | if inMessage.startswith("freeze learning"): 292 | self.policyFrozen = True 293 | return "message understood, policy frozen" 294 | 295 | if inMessage.startswith("unfreeze learning"): 296 | self.policyFrozen = False 297 | return "message understood, policy unfrozen" 298 | 299 | if inMessage.startswith("save model"): 300 | with open('dqn_model.dat', 'w') as f: 301 | pickle.dump(self.DQN.model, f) 302 | return "message understood, model saved" 303 | 304 | if __name__ == "__main__": 305 | AgentLoader.loadAgent(dqn_agent()) 306 | --------------------------------------------------------------------------------
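A note on reusing the saved networks: the gym agents above store their parameters with chainer.serializers.save_npz under network/ (see DQN_Agent.save). A minimal re-loading sketch, assuming the CPU agent and the same Pong-v0 setup used in experiment_gym.py:

# Reload previously saved networks for evaluation (illustrative sketch).
import gym
from chainer import serializers
import dqn_agent_cpu as ag

env = gym.make('Pong-v0')
agent = ag.DQN_Agent(env)

# Restore the parameters written by agent.save() into network/.
serializers.load_npz('network/model.model', agent.dqn.model)
serializers.load_npz('network/model_target.model', agent.dqn.model_target)

# Evaluate with learning switched off (epsilon is fixed at 0.05 in act()).
agent.policyFrozen = True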