├── LICENSE ├── env_vizdoom.py ├── env_lab.py ├── README.md ├── agent_dqn.py └── agent_a3c.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Dmitriy Anisimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /env_vizdoom.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import itertools as it 5 | 6 | from vizdoom import * 7 | 8 | class EnvVizDoom(object): 9 | def __init__(self, scenario_path): 10 | print("Initializing doom.") 11 | self.game = DoomGame() 12 | self.game.set_doom_scenario_path(scenario_path) 13 | self.game.set_doom_map("map01") 14 | #self.game.set_screen_format(ScreenFormat.GRAY8) 15 | self.game.set_screen_format(ScreenFormat.RGB24) 16 | #self.game.set_screen_resolution(ScreenResolution.RES_160X120) 17 | self.game.set_screen_resolution(ScreenResolution.RES_640X480) 18 | self.game.set_render_hud(True) # False 19 | self.game.set_render_crosshair(False) 20 | self.game.set_render_weapon(True) 21 | self.game.set_render_decals(False) 22 | self.game.set_render_particles(False) 23 | self.game.add_available_button(Button.MOVE_LEFT) 24 | self.game.add_available_button(Button.MOVE_RIGHT) 25 | self.game.add_available_button(Button.ATTACK) 26 | #self.game.add_available_game_variable(GameVariable.AMMO2) 27 | #self.game.add_available_game_variable(GameVariable.POSITION_X) 28 | #self.game.add_available_game_variable(GameVariable.POSITION_Y) 29 | self.game.set_episode_timeout(300) 30 | self.game.set_episode_start_time(14) # 10 20 31 | self.game.set_window_visible(False) 32 | self.game.set_sound_enabled(False) 33 | self.game.set_living_reward(-1) 34 | self.game.set_mode(Mode.PLAYER) 35 | self.game.init() 36 | print("Doom initialized.") 37 | 38 | n = self.game.get_available_buttons_size() 39 | self.actions = [list(a) for a in it.product([0, 1], repeat=n)] 40 | self.num_actions = len(self.actions) 41 | print(self.num_actions) 42 | 43 | def NumActions(self): 44 | return self.num_actions 45 | 46 | def Reset(self): 47 | self.game.new_episode() 48 | 49 | def Act(self, action, frame_repeat): 50 | action = self.MapActions(action) 51 | return self.game.make_action(self.actions[action], frame_repeat) 52 | 53 | def IsRunning(self): 54 | return (not self.game.is_episode_finished()) 
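# Note: Observation() below returns the raw screen buffer (RGB24 at the 640x480 resolution configured above); both agents downscale it to 40x40 in their own Preprocess() helpers.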
55 | 56 | def Observation(self): 57 | return self.game.get_state().screen_buffer 58 | 59 | def MapActions(self, action_raw): 60 | return action_raw 61 | -------------------------------------------------------------------------------- /env_lab.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import cv2 6 | 7 | import deepmind_lab 8 | 9 | class EnvLab(object): 10 | def __init__(self, width, height, fps, level): 11 | lab = deepmind_lab.Lab(level, []) 12 | 13 | self.env = deepmind_lab.Lab( 14 | level, ["RGB_INTERLACED"], 15 | config = { 16 | "fps": str(fps), 17 | "width": str(width), 18 | "height": str(height) 19 | }) 20 | 21 | self.env.reset() 22 | 23 | import pprint 24 | observation_spec = lab.observation_spec() 25 | print("Observation spec:") 26 | pprint.pprint(observation_spec) 27 | self.action_spec = self.env.action_spec() 28 | print("Action spec:") 29 | pprint.pprint(self.action_spec) 30 | 31 | self.indices = {a["name"]: i for i, a in enumerate(self.action_spec)} 32 | self.mins = np.array([a["min"] for a in self.action_spec]) 33 | self.maxs = np.array([a["max"] for a in self.action_spec]) 34 | self.num_actions = len(self.action_spec) 35 | print(self.num_actions) 36 | 37 | self.action = None 38 | 39 | def NumActions(self): 40 | return 3 #self.num_actions*2 41 | 42 | def Reset(self): 43 | self.env.reset() 44 | 45 | def Act(self, action, frame_repeat): 46 | action = self.MapActions(action) 47 | return self.env.step(action, num_steps=frame_repeat) 48 | 49 | def IsRunning(self): 50 | return self.env.is_running() 51 | 52 | def Observation(self): 53 | obs = self.env.observations() 54 | img = obs["RGB_INTERLACED"] 55 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 56 | return img 57 | 58 | def MapActions(self, action_raw): 59 | self.action = np.zeros([self.num_actions]) 60 | 61 | if (action_raw == 0): 62 | self.action[self.indices["LOOK_LEFT_RIGHT_PIXELS_PER_FRAME"]] = -25 63 | elif (action_raw == 1): 64 | self.action[self.indices["LOOK_LEFT_RIGHT_PIXELS_PER_FRAME"]] = 25 65 | 66 | """if (action_raw==2): 67 | self.action[self.indices["LOOK_DOWN_UP_PIXELS_PER_FRAME"]] = -25 68 | elif (action_raw==3): 69 | self.action[self.indices["LOOK_DOWN_UP_PIXELS_PER_FRAME"]] = 25 70 | 71 | if (action_raw==4): 72 | self.action[self.indices["STRAFE_LEFT_RIGHT"]] = -1 73 | elif (action_raw==5): 74 | self.action[self.indices["STRAFE_LEFT_RIGHT"]] = 1 75 | 76 | if (action_raw==6): 77 | self.action[self.indices["MOVE_BACK_FORWARD"]] = -1 78 | el""" 79 | if (action_raw == 2): # 7 80 | self.action[self.indices["MOVE_BACK_FORWARD"]] = 1 81 | 82 | # all binary actions need reset 83 | """if (action_raw==8): 84 | self.action[self.indices["FIRE"]] = 0 85 | elif (action_raw==9): 86 | self.action[self.indices["FIRE"]] = 1 87 | 88 | if (action_raw==10): 89 | self.action[self.indices["JUMP"]] = 0 90 | elif (action_raw==11): 91 | self.action[self.indices["JUMP"]] = 1 92 | 93 | if (action_raw==12): 94 | self.action[self.indices["CROUCH"]] = 0 95 | elif (action_raw==13): 96 | self.action[self.indices["CROUCH"]] = 1""" 97 | 98 | return np.clip(self.action, self.mins, self.maxs).astype(np.intc) 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Reinforcement learning in 3D 2 | 3 | Implemented DQN [3] and A3C [4] algorithm for ViZDoom [1] and DeepMind Lab 
[2] environments. 4 | 5 | 6 | A small network at a low screen resolution trains relatively fast on simple maps: 7 | * DQN on 1 GPU: ~5 minutes on ViZDoom map *simpler_basic*. 8 | * DQN on 1 GPU: ~5 hours on DeepMind Lab map *seekavoid_arena_01*. 9 | * A3C on 1 CPU, 3 threads: ~13 minutes on ViZDoom map *simpler_basic*. 10 | * A3C on 1 GPU, 3 workers: ~8 minutes on ViZDoom map *simpler_basic*. 11 | 12 | 13 | _**DQN, ViZDoom map simpler_basic**_ 14 | 15 | [![ViZDoom map simpler_basic](http://i.imgur.com/zInpPnW.png)](https://youtu.be/mgY-G8rl9O4) 16 | 17 | _**DQN, DeepMind Lab map seekavoid_arena_01**_ 18 | 19 | [![DeepMind Lab map seekavoid_arena_01](http://i.imgur.com/nDLoaNW.png)](https://youtu.be/G41s4FQPIX4) 20 | 21 | 22 | ### Dependencies 23 | 24 | * numpy 25 | * [opencv](https://github.com/opencv/opencv) 26 | * [tensorflow](https://github.com/tensorflow/tensorflow) 27 | * [ViZDoom](https://github.com/mwydmuch/ViZDoom) 28 | * [DeepMind Lab](https://github.com/deepmind/lab) 29 | 30 | ### How to run 31 | _**ViZDoom**_ 32 | * Install [ViZDoom](https://github.com/mwydmuch/ViZDoom) and the other dependencies 33 | * Set the path to it in the variable *vizdoom_path* 34 | * Set the variable *lab* to *False* 35 | * Set the path to rl_3d in *path_work_dir* 36 | * Run: 37 | * DQN: *./agent_dqn.py --gpu 0* 38 | * A3C: *./agent_a3c.py* 39 | 40 | _**DeepMind Lab**_ 41 | * Install [DeepMind Lab](https://github.com/deepmind/lab) and the other dependencies 42 | * Set the variable *lab* to *True* 43 | * Set the path to rl_3d in *path_work_dir* 44 | * For now the agents are built and run through DeepMind Lab's Bazel setup, so add these build rules to *lab_path*/BUILD (replace *path_work_dir* with your rl_3d path): 45 | ``` 46 | py_binary( 47 | name = "agent_dqn", 48 | srcs = ["*path_work_dir*/agent_dqn.py"], 49 | data = [":deepmind_lab.so"], 50 | main = "*path_work_dir*/agent_dqn.py", 51 | ) 52 | 53 | py_binary( 54 | name = "agent_a3c", 55 | srcs = ["*path_work_dir*/agent_a3c.py"], 56 | data = [":deepmind_lab.so"], 57 | main = "*path_work_dir*/agent_a3c.py", 58 | ) 59 | ``` 60 | * From *lab_path* run: 61 | * DQN: *bazel run :agent_dqn -- --gpu 0* 62 | * A3C: *bazel run :agent_a3c* 63 | 64 | ### Thanks 65 | A3C is a somewhat tricky algorithm and many implementations of it already exist, so I used the implementation by [Arthur Juliani](https://github.com/awjuliani/DeepRL-Agents/blob/master/A3C-Doom.ipynb) as a reference. 66 | 67 | ### References 68 | [1] Michał Kempka, Marek Wydmuch, Grzegorz Runc, Jakub Toczek, Wojciech Jaśkowski. ViZDoom: A Doom-based AI Research Platform for Visual Reinforcement Learning. arXiv:[1605.02097](https://arxiv.org/abs/1605.02097), 2016. 69 | 70 | [2] Charles Beattie, Joel Z. Leibo, Denis Teplyashin, Tom Ward, Marcus Wainwright, Heinrich Küttler, Andrew Lefrancq, Simon Green, Víctor Valdés, Amir Sadik, Julian Schrittwieser, Keith Anderson, Sarah York, Max Cant, Adam Cain, Adrian Bolton, Stephen Gaffney, Helen King, Demis Hassabis, Shane Legg, Stig Petersen. DeepMind Lab. arXiv:[1612.03801](https://arxiv.org/abs/1612.03801), 2016. 71 | 72 | [3] Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller. Playing Atari with Deep Reinforcement Learning. arXiv:[1312.5602](https://arxiv.org/abs/1312.5602), 2013. 73 | 74 | [4] Volodymyr Mnih, Adrià Puigdomènech Badia, Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David Silver, Koray Kavukcuoglu. Asynchronous Methods for Deep Reinforcement Learning. arXiv:[1602.01783](https://arxiv.org/abs/1602.01783), 2016.
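### Environment wrapper interface

Both agent scripts drive ViZDoom and DeepMind Lab through the same small wrapper interface (*NumActions*, *Reset*, *IsRunning*, *Observation*, *Act*) defined in *env_vizdoom.py* and *env_lab.py*. As a rough sketch, a random agent against the ViZDoom wrapper looks like this (the scenario path is only an example and should point at your ViZDoom checkout):
```
from __future__ import print_function
import random

from env_vizdoom import EnvVizDoom

env = EnvVizDoom("/path/to/ViZDoom/scenarios/simpler_basic.wad")  # example path
frame_repeat = 10

for episode in range(3):
    env.Reset()
    total_reward = 0
    while env.IsRunning():
        screen = env.Observation()  # raw RGB24 frame, what the agents feed to Preprocess()
        action = random.randint(0, env.NumActions() - 1)
        total_reward += env.Act(action, frame_repeat)
    print("Episode reward:", total_reward)
```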
75 | 76 | -------------------------------------------------------------------------------- /agent_dqn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import argparse 7 | import random 8 | import time 9 | import sys 10 | import os 11 | 12 | import numpy as np 13 | import cv2 14 | import tensorflow as tf 15 | 16 | def MakeDir(path): 17 | try: 18 | os.makedirs(path) 19 | except: 20 | pass 21 | 22 | lab = False 23 | load_model = False 24 | train = True 25 | test_display = True 26 | test_write_video = False 27 | path_work_dir = "~/rl_3d/" 28 | vizdoom_path = "~/ViZDoom/" 29 | vizdoom_scenario = vizdoom_path + "scenarios/simpler_basic.wad" 30 | 31 | # Lab parameters. 32 | if (lab): 33 | from env_lab import EnvLab 34 | 35 | learning_rate = 0.00025 # 0.001 36 | discount_factor = 0.99 37 | step_num = int(5e5) # int(1e6) 38 | replay_memory_size = int(1e6) 39 | replay_memory_batch_size = 64 40 | 41 | # Exploration rate. 42 | start_eps = 1.0 43 | end_eps = 0.1 44 | eps_decay_iter = 0.33 * step_num 45 | 46 | frame_repeat = 10 # 4 47 | channels = 3 48 | resolution = (40, 40) + (channels,) # Original: 240x320 49 | 50 | model_path = path_work_dir + "model_lab_dqn/" 51 | save_each = 0.01 * step_num 52 | step_load = 100 53 | 54 | # Vizdoom parameters. 55 | if (not lab): 56 | from env_vizdoom import EnvVizDoom 57 | 58 | learning_rate = 0.00025 59 | discount_factor = 0.99 60 | step_num = int(5e4) 61 | replay_memory_size = int(1e5) 62 | replay_memory_batch_size = 64 63 | 64 | frame_repeat = 10 65 | channels = 3 66 | resolution = (40, 40) + (channels,) # Original: 480x640 67 | 68 | start_eps = 1.0 69 | end_eps = 0.1 70 | eps_decay_iter = 0.33 * step_num 71 | 72 | model_path = path_work_dir + "model_vizdoom_dqn/" 73 | save_each = 0.01 * step_num 74 | step_load = 100 75 | 76 | MakeDir(model_path) 77 | model_name = model_path + "dqn" 78 | 79 | # Global variables. 
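# env is created in __main__ below: EnvLab when lab is True, EnvVizDoom otherwise.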
80 | env = None 81 | 82 | def PrintStat(elapsed_time, step, step_num, train_scores): 83 | steps_per_s = 1.0 * step / elapsed_time 84 | steps_per_m = 60.0 * step / elapsed_time 85 | steps_per_h = 3600.0 * step / elapsed_time 86 | steps_remain = step_num - step 87 | remain_h = int(steps_remain / steps_per_h) 88 | remain_m = int((steps_remain - remain_h * steps_per_h) / steps_per_m) 89 | remain_s = int((steps_remain - remain_h * steps_per_h - remain_m * steps_per_m) / steps_per_s) 90 | elapsed_h = int(elapsed_time / 3600) 91 | elapsed_m = int((elapsed_time - elapsed_h * 3600) / 60) 92 | elapsed_s = int((elapsed_time - elapsed_h * 3600 - elapsed_m * 60)) 93 | print("{}% | Steps: {}/{}, {:.2f}M step/h, {:02}:{:02}:{:02}/{:02}:{:02}:{:02}".format( 94 | 100.0 * step / step_num, step, step_num, steps_per_h / 1e6, 95 | elapsed_h, elapsed_m, elapsed_s, remain_h, remain_m, remain_s), file=sys.stderr) 96 | 97 | mean_train = 0 98 | std_train = 0 99 | min_train = 0 100 | max_train = 0 101 | if (len(train_scores) > 0): 102 | train_scores = np.array(train_scores) 103 | mean_train = train_scores.mean() 104 | std_train = train_scores.std() 105 | min_train = train_scores.min() 106 | max_train = train_scores.max() 107 | print("Episodes: {} Rewards: mean: {:.2f}, std: {:.2f}, min: {:.2f}, max: {:.2f}".format( 108 | len(train_scores), mean_train, std_train, min_train, max_train), file=sys.stderr) 109 | 110 | def Preprocess(img): 111 | #cv2.imshow("frame-train", img) 112 | #cv2.waitKey(20) 113 | if (channels == 1): 114 | img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 115 | img = cv2.resize(img, (resolution[1], resolution[0])) 116 | #cv2.imshow("frame-train", img) 117 | #cv2.waitKey(200) 118 | return np.reshape(img, resolution) 119 | 120 | class ReplayMemory(object): 121 | def __init__(self, capacity): 122 | 123 | self.s = np.zeros((capacity,) + resolution, dtype=np.uint8) 124 | self.a = np.zeros(capacity, dtype=np.int32) 125 | self.r = np.zeros(capacity, dtype=np.float32) 126 | self.isterminal = np.zeros(capacity, dtype=np.float32) 127 | 128 | self.capacity = capacity 129 | self.size = 0 130 | self.pos = 0 131 | 132 | def Add(self, s, action, isterminal, reward): 133 | 134 | self.s[self.pos, ...] = s 135 | self.a[self.pos] = action 136 | self.isterminal[self.pos] = isterminal 137 | self.r[self.pos] = reward 138 | 139 | self.pos = (self.pos + 1) % self.capacity 140 | self.size = min(self.size + 1, self.capacity) 141 | 142 | def Get(self, sample_size): 143 | 144 | idx = random.sample(xrange(0, self.size-2), sample_size) 145 | idx2 = [] 146 | for i in idx: 147 | idx2.append(i + 1) 148 | return self.s[idx], self.a[idx], self.s[idx2], self.isterminal[idx], self.r[idx] 149 | 150 | class Model(object): 151 | def __init__(self, session, actions_count): 152 | 153 | self.session = session 154 | 155 | # Create the input. 156 | self.s_ = tf.placeholder(shape=[None] + list(resolution), dtype=tf.float32) 157 | self.q_ = tf.placeholder(shape=[None, actions_count], dtype=tf.float32) 158 | 159 | # Create the network. 
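# Q-network: two 3x3 stride-2 conv layers, a 128-unit fully connected layer, and a linear output giving one Q-value per action.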
160 | conv1 = tf.contrib.layers.conv2d(self.s_, num_outputs=8, kernel_size=[3, 3], stride=[2, 2]) 161 | conv2 = tf.contrib.layers.conv2d(conv1, num_outputs=16, kernel_size=[3, 3], stride=[2, 2]) 162 | conv2_flat = tf.contrib.layers.flatten(conv2) 163 | fc1 = tf.contrib.layers.fully_connected(conv2_flat, num_outputs=128) 164 | 165 | self.q = tf.contrib.layers.fully_connected(fc1, num_outputs=actions_count, activation_fn=None) 166 | self.action = tf.argmax(self.q, 1) 167 | 168 | self.loss = tf.losses.mean_squared_error(self.q_, self.q) 169 | 170 | self.optimizer = tf.train.RMSPropOptimizer(learning_rate) 171 | self.train_step = self.optimizer.minimize(self.loss) 172 | 173 | def Learn(self, state, q): 174 | 175 | state = state.astype(np.float32) 176 | l, _ = self.session.run([self.loss, self.train_step], feed_dict={self.s_: state, self.q_: q}) 177 | return l 178 | 179 | def GetQ(self, state): 180 | 181 | state = state.astype(np.float32) 182 | return self.session.run(self.q, feed_dict={self.s_: state}) 183 | 184 | def GetAction(self, state): 185 | 186 | state = state.astype(np.float32) 187 | state = state.reshape([1] + list(resolution)) 188 | return self.session.run(self.action, feed_dict={self.s_: state})[0] 189 | 190 | class Agent(object): 191 | 192 | def __init__(self, num_actions): 193 | 194 | config = tf.ConfigProto() 195 | config.gpu_options.allow_growth = True 196 | config.log_device_placement = False 197 | config.allow_soft_placement = True 198 | 199 | self.session = tf.Session(config=config) 200 | 201 | self.model = Model(self.session, num_actions) 202 | self.memory = ReplayMemory(replay_memory_size) 203 | 204 | self.rewards = 0 205 | 206 | self.saver = tf.train.Saver(max_to_keep=1000) 207 | if (load_model): 208 | model_name_curr = model_name + "_{:04}".format(step_load) 209 | print("Loading model from: ", model_name_curr) 210 | self.saver.restore(self.session, model_name_curr) 211 | else: 212 | init = tf.global_variables_initializer() 213 | self.session.run(init) 214 | 215 | self.num_actions = num_actions 216 | 217 | def LearnFromMemory(self): 218 | 219 | if (self.memory.size > 2*replay_memory_batch_size): 220 | s1, a, s2, isterminal, r = self.memory.Get(replay_memory_batch_size) 221 | 222 | q = self.model.GetQ(s1) 223 | q2 = np.max(self.model.GetQ(s2), axis=1) 224 | q[np.arange(q.shape[0]), a] = r + (1 - isterminal) * discount_factor * q2 225 | self.model.Learn(s1, q) 226 | 227 | def GetAction(self, state): 228 | 229 | if (random.random() <= 0.05): 230 | a = random.randint(0, self.num_actions-1) 231 | else: 232 | a = self.model.GetAction(state) 233 | 234 | return a 235 | 236 | def Step(self, iteration): 237 | 238 | s = Preprocess(env.Observation()) 239 | 240 | # Epsilon-greedy. 
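# eps decays linearly from start_eps to end_eps over the first eps_decay_iter steps (33% of step_num, i.e. 16500 steps with the ViZDoom settings above) and then stays at end_eps.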
241 | if (iteration < eps_decay_iter): 242 | eps = start_eps - iteration / eps_decay_iter * (start_eps - end_eps) 243 | else: 244 | eps = end_eps 245 | 246 | if (random.random() <= eps): 247 | a = random.randint(0, self.num_actions-1) 248 | else: 249 | a = self.model.GetAction(s) 250 | 251 | reward = env.Act(a, frame_repeat) 252 | self.rewards += reward 253 | 254 | isterminal = not env.IsRunning() 255 | self.memory.Add(s, a, isterminal, reward) 256 | self.LearnFromMemory() 257 | 258 | def Train(self): 259 | 260 | print("Starting training.") 261 | start_time = time.time() 262 | train_scores = [] 263 | env.Reset() 264 | for step in xrange(1, step_num+1): 265 | self.Step(step) 266 | if (not env.IsRunning()): 267 | train_scores.append(self.rewards) 268 | self.rewards = 0 269 | env.Reset() 270 | 271 | if (step % save_each == 0): 272 | model_name_curr = model_name + "_{:04}".format(int(step / save_each)) 273 | print("\nSaving the network weigths to:", model_name_curr, file=sys.stderr) 274 | self.saver.save(self.session, model_name_curr) 275 | 276 | PrintStat(time.time() - start_time, step, step_num, train_scores) 277 | 278 | train_scores = [] 279 | 280 | env.Reset() 281 | 282 | def Test(agent): 283 | if (test_write_video): 284 | size = (640, 480) 285 | fps = 30.0 #/ frame_repeat 286 | fourcc = cv2.VideoWriter_fourcc(*'XVID') # cv2.cv.CV_FOURCC(*'XVID') 287 | out_video = cv2.VideoWriter(path_work_dir + "test.avi", fourcc, fps, size) 288 | 289 | reward_total = 0 290 | num_episodes = 30 291 | while (num_episodes != 0): 292 | if (not env.IsRunning()): 293 | env.Reset() 294 | print("Total reward: {}".format(reward_total)) 295 | reward_total = 0 296 | num_episodes -= 1 297 | 298 | state_raw = env.Observation() 299 | 300 | state = Preprocess(state_raw) 301 | action = agent.GetAction(state) 302 | 303 | for _ in xrange(frame_repeat): 304 | # Display. 
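# During testing the chosen action is repeated frame_repeat times one tick at a time, so every frame can be displayed and/or written to the output video.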
305 | if (test_display): 306 | cv2.imshow("frame-test", state_raw) 307 | cv2.waitKey(20) 308 | 309 | if (test_write_video): 310 | out_video.write(state_raw) 311 | 312 | reward = env.Act(action, 1) 313 | reward_total += reward 314 | 315 | if (not env.IsRunning()): 316 | break 317 | 318 | state_raw = env.Observation() 319 | 320 | if __name__ == '__main__': 321 | 322 | parser = argparse.ArgumentParser() 323 | parser.add_argument("--gpu", help="the GPU to use") 324 | args = parser.parse_args() 325 | 326 | if (args.gpu): 327 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 328 | 329 | if (lab): 330 | env = EnvLab(80, 80, 60, "seekavoid_arena_01") 331 | else: 332 | env = EnvVizDoom(vizdoom_scenario) 333 | 334 | agent = Agent(env.NumActions()) 335 | 336 | if (train): 337 | agent.Train() 338 | 339 | Test(agent) 340 | -------------------------------------------------------------------------------- /agent_a3c.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import cv2 7 | import tensorflow as tf 8 | import threading 9 | import sys 10 | import time 11 | import os 12 | 13 | def MakeDir(path): 14 | try: 15 | os.makedirs(path) 16 | except: 17 | pass 18 | 19 | lab = False 20 | load_model = False 21 | train = True 22 | test_display = True 23 | test_write_video = True 24 | path_work_dir = "~/rl_3d/" 25 | vizdoom_path = "~/ViZDoom/" 26 | vizdoom_scenario = vizdoom_path + "scenarios/simpler_basic.wad" 27 | 28 | if (lab): 29 | from env_lab import EnvLab 30 | 31 | model_path = path_work_dir + "model_lab_a3c/" 32 | else: 33 | from env_vizdoom import EnvVizDoom 34 | 35 | model_path = path_work_dir + "model_vizdoom_a3c/" 36 | 37 | learning_rate = 0.00025 38 | device = "/cpu:0" 39 | num_workers = 3 40 | t_max = 30 41 | frame_repeat = 10 # 4 42 | gamma = 0.99 43 | step_num = int(2.5e5) 44 | save_each = 0.01 * step_num 45 | step_load = 100 46 | entropy_beta = 0.01 47 | grad_norm_clip = 40.0 48 | 49 | global_scope_name = "global" 50 | step = 0 51 | train_scores = [] 52 | lock = threading.Lock() 53 | start_time = 0 54 | 55 | # Global. 
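# This global env instance provides NumActions() when building the networks and is used by Test(); each Worker creates its own environment for training.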
56 | env = None 57 | 58 | MakeDir(model_path) 59 | model_name = model_path + "a3c" 60 | 61 | def PrintStat(elapsed_time, step, step_num, train_scores): 62 | steps_per_s = 1.0 * step / elapsed_time 63 | steps_per_m = 60.0 * step / elapsed_time 64 | steps_per_h = 3600.0 * step / elapsed_time 65 | steps_remain = step_num - step 66 | remain_h = int(steps_remain / steps_per_h) 67 | remain_m = int((steps_remain - remain_h * steps_per_h) / steps_per_m) 68 | remain_s = int((steps_remain - remain_h * steps_per_h - remain_m * steps_per_m) / steps_per_s) 69 | elapsed_h = int(elapsed_time / 3600) 70 | elapsed_m = int((elapsed_time - elapsed_h * 3600) / 60) 71 | elapsed_s = int((elapsed_time - elapsed_h * 3600 - elapsed_m * 60)) 72 | print("{}% | Steps: {}/{}, {:.2f}M step/h, {:02}:{:02}:{:02}/{:02}:{:02}:{:02}".format( 73 | 100.0 * step / step_num, step, step_num, steps_per_h / 1e6, 74 | elapsed_h, elapsed_m, elapsed_s, remain_h, remain_m, remain_s), file=sys.stderr) 75 | 76 | mean_train = 0 77 | std_train = 0 78 | min_train = 0 79 | max_train = 0 80 | if (len(train_scores) > 0): 81 | train_scores = np.array(train_scores) 82 | mean_train = train_scores.mean() 83 | std_train = train_scores.std() 84 | min_train = train_scores.min() 85 | max_train = train_scores.max() 86 | print("Episodes: {} Rewards: mean: {:.2f}, std: {:.2f}, min: {:.2f}, max: {:.2f}".format( 87 | len(train_scores), mean_train, std_train, min_train, max_train), file=sys.stderr) 88 | 89 | channels = 3 90 | resolution = (40, 40, channels) 91 | 92 | def Preprocess(frame): 93 | if (channels == 1): 94 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 95 | frame = cv2.resize(frame, (resolution[1], resolution[0])) 96 | return np.reshape(frame, resolution) 97 | 98 | class ACNet(object): 99 | def __init__(self, num_actions, scope, trainer): 100 | with tf.variable_scope(scope): 101 | self.inputs = tf.placeholder(shape=[None] + list(resolution), dtype=tf.float32) 102 | 103 | conv1 = tf.contrib.layers.conv2d(self.inputs, num_outputs=16, kernel_size=[3, 3], stride=[2, 2]) 104 | conv2 = tf.contrib.layers.conv2d(conv1, num_outputs=32, kernel_size=[3, 3], stride=[2, 2]) 105 | conv2_flat = tf.contrib.layers.flatten(conv2) 106 | hidden = tf.contrib.layers.fully_connected(conv2_flat, 256) 107 | 108 | # Recurrent network for temporal dependencies 109 | 110 | # Introduce a "fake" batch dimension of 1 after flatten so that we can do LSTM over time dim 111 | rnn_in = tf.expand_dims(hidden, [0]) 112 | 113 | lstm_size = 256 114 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, state_is_tuple=True) 115 | step_size = tf.shape(self.inputs)[:1] 116 | 117 | c_init = np.zeros((1, lstm_cell.state_size.c), dtype=np.float32) 118 | h_init = np.zeros((1, lstm_cell.state_size.h), dtype=np.float32) 119 | self.state_init = [c_init, h_init] 120 | self.rnn_state = self.state_init 121 | 122 | c_in = tf.placeholder(shape=[1, lstm_cell.state_size.c], dtype=tf.float32) 123 | h_in = tf.placeholder(shape=[1, lstm_cell.state_size.h], dtype=tf.float32) 124 | self.state_in = (c_in, h_in) 125 | 126 | state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in) 127 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm_cell, rnn_in, initial_state=state_in, 128 | sequence_length=step_size, time_major=False) 129 | lstm_c, lstm_h = lstm_state 130 | rnn_out = tf.reshape(lstm_outputs, [-1, lstm_size]) 131 | self.state_out = (lstm_c[:1, :], lstm_h[:1, :]) 132 | 133 | # Output layers for policy and value estimations 134 | self.policy = tf.contrib.layers.fully_connected(rnn_out, num_actions, 
activation_fn=tf.nn.softmax, 135 | weights_initializer=self.normalized_columns_initializer(0.01), 136 | biases_initializer=None) 137 | self.value = tf.contrib.layers.fully_connected(rnn_out, 1, activation_fn=None, 138 | weights_initializer=self.normalized_columns_initializer(1.0), 139 | biases_initializer=None) 140 | 141 | # Only the worker network need ops for loss functions and gradient updating. 142 | if (scope != global_scope_name): 143 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32) 144 | actions_onehot = tf.one_hot(self.actions, num_actions, dtype=tf.float32) 145 | self.target_v = tf.placeholder(shape=[None], dtype=tf.float32) 146 | self.advantages = tf.placeholder(shape=[None], dtype=tf.float32) 147 | 148 | responsible_outputs = tf.reduce_sum(self.policy * actions_onehot, [1]) 149 | 150 | # Loss functions 151 | value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1]))) 152 | entropy = -tf.reduce_sum(self.policy * tf.log(self.policy)) 153 | policy_loss = -tf.reduce_sum(tf.log(responsible_outputs) * self.advantages) 154 | self.loss = 0.5 * value_loss + policy_loss - entropy * entropy_beta 155 | 156 | # Get gradients from local network using local losses 157 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 158 | self.gradients = tf.gradients(self.loss, local_vars) 159 | 160 | if (grad_norm_clip != None): 161 | grads, _ = tf.clip_by_global_norm(self.gradients, grad_norm_clip) 162 | else: 163 | grads = self.gradients 164 | 165 | # Apply local gradients to global network 166 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_scope_name) 167 | self.apply_grads = trainer.apply_gradients(zip(grads, global_vars)) 168 | 169 | # Used to initialize weights for policy and value output layers 170 | def normalized_columns_initializer(self, std = 1.0): 171 | def _initializer(shape, dtype=None, partition_info=None): 172 | out = np.random.randn(*shape).astype(np.float32) 173 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 174 | return tf.constant(out) 175 | 176 | return _initializer 177 | 178 | def Train(self, sess, discounted_rewards, states, actions, advantages): 179 | states = states / 255.0 180 | self.ResetLstm() 181 | feed_dict = {self.target_v : discounted_rewards, 182 | self.inputs : np.stack(states, axis=0), 183 | self.actions : actions, 184 | self.advantages : advantages, 185 | self.state_in[0] : self.rnn_state[0], 186 | self.state_in[1] : self.rnn_state[1]} 187 | _ = sess.run([self.apply_grads], feed_dict=feed_dict) 188 | 189 | def ResetLstm(self): 190 | self.rnn_state = self.state_init 191 | 192 | def GetAction(self, sess, state): 193 | state = state / 255.0 194 | a_dist, v, self.rnn_state = sess.run([self.policy, self.value, self.state_out], 195 | feed_dict={self.inputs: [state], 196 | self.state_in[0]: self.rnn_state[0], 197 | self.state_in[1]: self.rnn_state[1]}) 198 | a = np.random.choice(a_dist[0], p=a_dist[0]) 199 | a = np.argmax(a_dist == a) 200 | return a, v[0, 0] 201 | 202 | def GetValue(self, sess, state): 203 | state = state / 255.0 204 | v = sess.run([self.value], 205 | feed_dict={self.inputs: [state], 206 | self.state_in[0]: self.rnn_state[0], 207 | self.state_in[1]: self.rnn_state[1]}) 208 | return v[0][0, 0] 209 | 210 | class Worker(object): 211 | def __init__(self, number, num_actions, trainer, model_name): 212 | 213 | self.name = "worker_" + str(number) 214 | self.number = number 215 | self.model_name = model_name 216 | 217 | # Create the local copy of the network 
and the tensorflow op to copy global paramters to local network 218 | self.local_ac = ACNet(num_actions, self.name, trainer) 219 | self.update_target_graph = self.update_target(global_scope_name, self.name) 220 | 221 | if (lab): 222 | self.env = EnvLab(80, 80, 60, "seekavoid_arena_01") 223 | else: 224 | self.env = EnvVizDoom(vizdoom_scenario) 225 | 226 | # Copies one set of variables to another. 227 | # Used to set worker network parameters to those of global network. 228 | def update_target(self, from_scope, to_scope): 229 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 230 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 231 | 232 | op_holder = [] 233 | for from_var, to_var in zip(from_vars, to_vars): 234 | op_holder.append(to_var.assign(from_var)) 235 | return op_holder 236 | 237 | # Calculate discounted returns. 238 | def Discount(self, x, gamma): 239 | for idx in reversed(xrange(len(x) - 1)): 240 | x[idx] += x[idx + 1] * gamma 241 | return x 242 | 243 | def Start(self, session, saver, coord): 244 | worker_process = lambda: self.Process(session, saver, coord) 245 | thread = threading.Thread(target=worker_process) 246 | thread.start() 247 | 248 | global start_time 249 | start_time = time.time() 250 | return thread 251 | 252 | def Train(self, episode_buffer, sess, bootstrap_value): 253 | episode_buffer = np.array(episode_buffer) 254 | states = episode_buffer[:, 0] 255 | actions = episode_buffer[:, 1] 256 | rewards = episode_buffer[:, 2] 257 | values = episode_buffer[:, 3] 258 | 259 | # Here we take the rewards and values from the episode_buffer, and use them to 260 | # generate the advantage and discounted returns. 261 | # The advantage function uses "Generalized Advantage Estimation" 262 | rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 263 | discounted_rewards = self.Discount(rewards_plus, gamma)[:-1] 264 | 265 | value_plus = np.asarray(values.tolist() + [bootstrap_value]) 266 | advantages = rewards + gamma * value_plus[1:] - value_plus[:-1] 267 | advantages = self.Discount(advantages, gamma) 268 | 269 | # Update the global network using gradients from loss 270 | # Generate network statistics to periodically save 271 | self.local_ac.Train(sess, discounted_rewards, states, actions, advantages) 272 | 273 | def Process(self, sess, saver, coord): 274 | global step, train_scores, start_time, lock 275 | 276 | print("Starting worker " + str(self.number)) 277 | while (not coord.should_stop()): 278 | sess.run(self.update_target_graph) 279 | episode_buffer = [] 280 | episode_reward = 0 281 | 282 | self.env.Reset() 283 | s = self.env.Observation() 284 | s = Preprocess(s) 285 | self.local_ac.ResetLstm() 286 | 287 | while (self.env.IsRunning()): 288 | # Take an action using probabilities from policy network output. 
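# GetAction samples an action from the softmax policy and also returns the critic's value estimate; both are stored in episode_buffer and later used for the advantage computation in Train().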
289 | a, v = self.local_ac.GetAction(sess, s) 290 | r = self.env.Act(a, frame_repeat) 291 | finished = not self.env.IsRunning() 292 | if (not finished): 293 | s1 = self.env.Observation() 294 | s1 = Preprocess(s1) 295 | else: 296 | s1 = None 297 | 298 | episode_buffer.append([s, a, r, v]) 299 | 300 | episode_reward += r 301 | s = s1 302 | 303 | lock.acquire() 304 | 305 | step += 1 306 | 307 | if (step % save_each == 0): 308 | model_name_curr = self.model_name + "_{:04}".format(int(step / save_each)) 309 | print("\nSaving the network weigths to:", model_name_curr, file=sys.stderr) 310 | saver.save(sess, model_name_curr) 311 | 312 | PrintStat(time.time() - start_time, step, step_num, train_scores) 313 | 314 | train_scores = [] 315 | 316 | if (step == step_num): 317 | coord.request_stop() 318 | 319 | lock.release() 320 | 321 | # If the episode hasn't ended, but the experience buffer is full, then we 322 | # make an update step using that experience rollout. 323 | if (len(episode_buffer) == t_max or (finished and len(episode_buffer) > 0)): 324 | # Since we don't know what the true final return is, 325 | # we "bootstrap" from our current value estimation. 326 | if (not finished): 327 | v1 = self.local_ac.GetValue(sess, s) 328 | self.Train(episode_buffer, sess, v1) 329 | episode_buffer = [] 330 | sess.run(self.update_target_graph) 331 | else: 332 | self.Train(episode_buffer, sess, 0.0) 333 | 334 | lock.acquire() 335 | train_scores.append(episode_reward) 336 | lock.release() 337 | 338 | class Agent(object): 339 | def __init__(self): 340 | config = tf.ConfigProto() 341 | config.gpu_options.allow_growth = True 342 | config.log_device_placement = False 343 | config.allow_soft_placement = True 344 | 345 | self.session = tf.Session(config=config) 346 | 347 | with tf.device(device): 348 | # Global network 349 | self.global_net = ACNet(env.NumActions(), global_scope_name, None) 350 | 351 | if (train): 352 | trainer = tf.train.RMSPropOptimizer(learning_rate) 353 | 354 | workers = [] 355 | for i in xrange(num_workers): 356 | workers.append(Worker(i, env.NumActions(), trainer, model_name)) 357 | 358 | saver = tf.train.Saver(max_to_keep=100) 359 | if (load_model): 360 | model_name_curr = model_name + "_{:04}".format(step_load) 361 | print("Loading model from: ", model_name_curr) 362 | saver.restore(self.session, model_name_curr) 363 | else: 364 | self.session.run(tf.global_variables_initializer()) 365 | 366 | if (train): 367 | coord = tf.train.Coordinator() 368 | # Start the "work" process for each worker in a separate thread. 
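# All worker threads share the same TensorFlow session and global network; the module-level lock guards the shared step counter and train_scores list.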
369 | worker_threads = [] 370 | for worker in workers: 371 | thread = worker.Start(self.session, saver, coord) 372 | worker_threads.append(thread) 373 | coord.join(worker_threads) 374 | 375 | def Reset(self): 376 | self.global_net.ResetLstm() 377 | 378 | def Act(self, state): 379 | action, _ = self.global_net.GetAction(self.session, state) 380 | return action 381 | 382 | def Test(agent): 383 | if (test_write_video): 384 | size = (640, 480) 385 | fps = 30.0 386 | fourcc = cv2.VideoWriter_fourcc(*'XVID') # cv2.cv.CV_FOURCC(*'XVID') 387 | out_video = cv2.VideoWriter(path_work_dir + "test.avi", fourcc, fps, size) 388 | 389 | reward_total = 0 390 | num_episodes = 30 391 | while (num_episodes != 0): 392 | if (not env.IsRunning()): 393 | env.Reset() 394 | agent.Reset() 395 | print("Total reward: {}".format(reward_total)) 396 | reward_total = 0 397 | num_episodes -= 1 398 | 399 | state_raw = env.Observation() 400 | 401 | state = Preprocess(state_raw) 402 | action = agent.Act(state) 403 | 404 | for _ in xrange(frame_repeat): 405 | if (test_display): 406 | cv2.imshow("frame-test", state_raw) 407 | cv2.waitKey(20) 408 | 409 | if (test_write_video): 410 | out_video.write(state_raw) 411 | 412 | reward = env.Act(action, 1) 413 | reward_total += reward 414 | 415 | if (not env.IsRunning()): 416 | break 417 | 418 | state_raw = env.Observation() 419 | 420 | if __name__ == '__main__': 421 | 422 | if (lab): 423 | env = EnvLab(80, 80, 60, "seekavoid_arena_01") 424 | else: 425 | env = EnvVizDoom(vizdoom_scenario) 426 | 427 | agent = Agent() 428 | 429 | Test(agent) 430 | --------------------------------------------------------------------------------