├── LICENSE ├── env_vizdoom.py ├── env_lab.py ├── README.md ├── agent_dqn.py └── agent_a3c.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Dmitriy Anisimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /env_vizdoom.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import itertools as it 5 | 6 | from vizdoom import * 7 | 8 | class EnvVizDoom(object): 9 | def __init__(self, scenario_path): 10 | print("Initializing doom.") 11 | self.game = DoomGame() 12 | self.game.set_doom_scenario_path(scenario_path) 13 | self.game.set_doom_map("map01") 14 | #self.game.set_screen_format(ScreenFormat.GRAY8) 15 | self.game.set_screen_format(ScreenFormat.RGB24) 16 | #self.game.set_screen_resolution(ScreenResolution.RES_160X120) 17 | self.game.set_screen_resolution(ScreenResolution.RES_640X480) 18 | self.game.set_render_hud(True) # False 19 | self.game.set_render_crosshair(False) 20 | self.game.set_render_weapon(True) 21 | self.game.set_render_decals(False) 22 | self.game.set_render_particles(False) 23 | self.game.add_available_button(Button.MOVE_LEFT) 24 | self.game.add_available_button(Button.MOVE_RIGHT) 25 | self.game.add_available_button(Button.ATTACK) 26 | #self.game.add_available_game_variable(GameVariable.AMMO2) 27 | #self.game.add_available_game_variable(GameVariable.POSITION_X) 28 | #self.game.add_available_game_variable(GameVariable.POSITION_Y) 29 | self.game.set_episode_timeout(300) 30 | self.game.set_episode_start_time(14) # 10 20 31 | self.game.set_window_visible(False) 32 | self.game.set_sound_enabled(False) 33 | self.game.set_living_reward(-1) 34 | self.game.set_mode(Mode.PLAYER) 35 | self.game.init() 36 | print("Doom initialized.") 37 | 38 | n = self.game.get_available_buttons_size() 39 | self.actions = [list(a) for a in it.product([0, 1], repeat=n)] 40 | self.num_actions = len(self.actions) 41 | print(self.num_actions) 42 | 43 | def NumActions(self): 44 | return self.num_actions 45 | 46 | def Reset(self): 47 | self.game.new_episode() 48 | 49 | def Act(self, action, frame_repeat): 50 | action = self.MapActions(action) 51 | return self.game.make_action(self.actions[action], frame_repeat) 52 | 53 | def IsRunning(self): 54 | return (not self.game.is_episode_finished()) 
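# Note: Observation() below returns the raw screen buffer (RGB24 at the 640x480 resolution configured above); both agents downscale it to 40x40 in their own Preprocess() helpers.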
55 | 56 | def Observation(self): 57 | return self.game.get_state().screen_buffer 58 | 59 | def MapActions(self, action_raw): 60 | return action_raw 61 | -------------------------------------------------------------------------------- /env_lab.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import cv2 6 | 7 | import deepmind_lab 8 | 9 | class EnvLab(object): 10 | def __init__(self, width, height, fps, level): 11 | lab = deepmind_lab.Lab(level, []) 12 | 13 | self.env = deepmind_lab.Lab( 14 | level, ["RGB_INTERLACED"], 15 | config = { 16 | "fps": str(fps), 17 | "width": str(width), 18 | "height": str(height) 19 | }) 20 | 21 | self.env.reset() 22 | 23 | import pprint 24 | observation_spec = lab.observation_spec() 25 | print("Observation spec:") 26 | pprint.pprint(observation_spec) 27 | self.action_spec = self.env.action_spec() 28 | print("Action spec:") 29 | pprint.pprint(self.action_spec) 30 | 31 | self.indices = {a["name"]: i for i, a in enumerate(self.action_spec)} 32 | self.mins = np.array([a["min"] for a in self.action_spec]) 33 | self.maxs = np.array([a["max"] for a in self.action_spec]) 34 | self.num_actions = len(self.action_spec) 35 | print(self.num_actions) 36 | 37 | self.action = None 38 | 39 | def NumActions(self): 40 | return 3 #self.num_actions*2 41 | 42 | def Reset(self): 43 | self.env.reset() 44 | 45 | def Act(self, action, frame_repeat): 46 | action = self.MapActions(action) 47 | return self.env.step(action, num_steps=frame_repeat) 48 | 49 | def IsRunning(self): 50 | return self.env.is_running() 51 | 52 | def Observation(self): 53 | obs = self.env.observations() 54 | img = obs["RGB_INTERLACED"] 55 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 56 | return img 57 | 58 | def MapActions(self, action_raw): 59 | self.action = np.zeros([self.num_actions]) 60 | 61 | if (action_raw == 0): 62 | self.action[self.indices["LOOK_LEFT_RIGHT_PIXELS_PER_FRAME"]] = -25 63 | elif (action_raw == 1): 64 | self.action[self.indices["LOOK_LEFT_RIGHT_PIXELS_PER_FRAME"]] = 25 65 | 66 | """if (action_raw==2): 67 | self.action[self.indices["LOOK_DOWN_UP_PIXELS_PER_FRAME"]] = -25 68 | elif (action_raw==3): 69 | self.action[self.indices["LOOK_DOWN_UP_PIXELS_PER_FRAME"]] = 25 70 | 71 | if (action_raw==4): 72 | self.action[self.indices["STRAFE_LEFT_RIGHT"]] = -1 73 | elif (action_raw==5): 74 | self.action[self.indices["STRAFE_LEFT_RIGHT"]] = 1 75 | 76 | if (action_raw==6): 77 | self.action[self.indices["MOVE_BACK_FORWARD"]] = -1 78 | el""" 79 | if (action_raw == 2): # 7 80 | self.action[self.indices["MOVE_BACK_FORWARD"]] = 1 81 | 82 | # all binary actions need reset 83 | """if (action_raw==8): 84 | self.action[self.indices["FIRE"]] = 0 85 | elif (action_raw==9): 86 | self.action[self.indices["FIRE"]] = 1 87 | 88 | if (action_raw==10): 89 | self.action[self.indices["JUMP"]] = 0 90 | elif (action_raw==11): 91 | self.action[self.indices["JUMP"]] = 1 92 | 93 | if (action_raw==12): 94 | self.action[self.indices["CROUCH"]] = 0 95 | elif (action_raw==13): 96 | self.action[self.indices["CROUCH"]] = 1""" 97 | 98 | return np.clip(self.action, self.mins, self.maxs).astype(np.intc) 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Reinforcement learning in 3D 2 | 3 | Implemented DQN [3] and A3C [4] algorithm for ViZDoom [1] and DeepMind Lab 
[2] environments. 4 | 5 | 6 | A small network at a low screen resolution trains relatively fast on simple maps: 7 | * DQN on 1 GPU: ~5 minutes on ViZDoom map *simpler_basic*. 8 | * DQN on 1 GPU: ~5 hours on DeepMind Lab map *seekavoid_arena_01*. 9 | * A3C on 1 CPU, 3 threads: ~13 minutes on ViZDoom map *simpler_basic*. 10 | * A3C on 1 GPU, 3 workers: ~8 minutes on ViZDoom map *simpler_basic*. 11 | 12 | 13 | _**DQN, ViZDoom map simpler_basic**_ 14 | 15 | [![ViZDoom map simpler_basic](http://i.imgur.com/zInpPnW.png)](https://youtu.be/mgY-G8rl9O4) 16 | 17 | _**DQN, DeepMind Lab map seekavoid_arena_01**_ 18 | 19 | [![DeepMind Lab map seekavoid_arena_01](http://i.imgur.com/nDLoaNW.png)](https://youtu.be/G41s4FQPIX4) 20 | 21 | 22 | ### Dependencies 23 | 24 | * numpy 25 | * [opencv](https://github.com/opencv/opencv) 26 | * [tensorflow](https://github.com/tensorflow/tensorflow) 27 | * [ViZDoom](https://github.com/mwydmuch/ViZDoom) 28 | * [DeepMind Lab](https://github.com/deepmind/lab) 29 | 30 | ### How to run 31 | _**ViZDoom**_ 32 | * Install [ViZDoom](https://github.com/mwydmuch/ViZDoom) and the other dependencies 33 | * Set the path to it in the variable *vizdoom_path* 34 | * Set the variable *lab* to *False* 35 | * Set the path to rl_3d in *path_work_dir* 36 | * Run: 37 | * DQN: *./agent_dqn.py --gpu 0* 38 | * A3C: *./agent_a3c.py* 39 | 40 | _**DeepMind Lab**_ 41 | * Install [DeepMind Lab](https://github.com/deepmind/lab) and the other dependencies 42 | * Set the variable *lab* to *True* 43 | * Set the path to rl_3d in *path_work_dir* 44 | * For now the agents are built and run through DeepMind Lab's Bazel setup, so add these build rules to *lab_path*/BUILD (replace *path_work_dir* with your rl_3d path): 45 | ``` 46 | py_binary( 47 | name = "agent_dqn", 48 | srcs = ["*path_work_dir*/agent_dqn.py"], 49 | data = [":deepmind_lab.so"], 50 | main = "*path_work_dir*/agent_dqn.py", 51 | ) 52 | 53 | py_binary( 54 | name = "agent_a3c", 55 | srcs = ["*path_work_dir*/agent_a3c.py"], 56 | data = [":deepmind_lab.so"], 57 | main = "*path_work_dir*/agent_a3c.py", 58 | ) 59 | ``` 60 | * From *lab_path* run: 61 | * DQN: *bazel run :agent_dqn -- --gpu 0* 62 | * A3C: *bazel run :agent_a3c* 63 | 64 | ### Thanks 65 | A3C is a somewhat tricky algorithm and many implementations of it already exist, so I used the implementation by [Arthur Juliani](https://github.com/awjuliani/DeepRL-Agents/blob/master/A3C-Doom.ipynb) as a reference. 66 | 67 | ### References 68 | [1] Michał Kempka, Marek Wydmuch, Grzegorz Runc, Jakub Toczek, Wojciech Jaśkowski. ViZDoom: A Doom-based AI Research Platform for Visual Reinforcement Learning. arXiv:[1605.02097](https://arxiv.org/abs/1605.02097), 2016. 69 | 70 | [2] Charles Beattie, Joel Z. Leibo, Denis Teplyashin, Tom Ward, Marcus Wainwright, Heinrich Küttler, Andrew Lefrancq, Simon Green, Víctor Valdés, Amir Sadik, Julian Schrittwieser, Keith Anderson, Sarah York, Max Cant, Adam Cain, Adrian Bolton, Stephen Gaffney, Helen King, Demis Hassabis, Shane Legg, Stig Petersen. DeepMind Lab. arXiv:[1612.03801](https://arxiv.org/abs/1612.03801), 2016. 71 | 72 | [3] Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller. Playing Atari with Deep Reinforcement Learning. arXiv:[1312.5602](https://arxiv.org/abs/1312.5602), 2013. 73 | 74 | [4] Volodymyr Mnih, Adrià Puigdomènech Badia, Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David Silver, Koray Kavukcuoglu. Asynchronous Methods for Deep Reinforcement Learning. arXiv:[1602.01783](https://arxiv.org/abs/1602.01783), 2016.
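### Environment wrapper interface

Both agent scripts drive ViZDoom and DeepMind Lab through the same small wrapper interface (*NumActions*, *Reset*, *IsRunning*, *Observation*, *Act*) defined in *env_vizdoom.py* and *env_lab.py*. As a rough sketch, a random agent against the ViZDoom wrapper looks like this (the scenario path is only an example and should point at your ViZDoom checkout):
```
from __future__ import print_function
import random

from env_vizdoom import EnvVizDoom

env = EnvVizDoom("/path/to/ViZDoom/scenarios/simpler_basic.wad")  # example path
frame_repeat = 10

for episode in range(3):
    env.Reset()
    total_reward = 0
    while env.IsRunning():
        screen = env.Observation()  # raw RGB24 frame, what the agents feed to Preprocess()
        action = random.randint(0, env.NumActions() - 1)
        total_reward += env.Act(action, frame_repeat)
    print("Episode reward:", total_reward)
```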
75 | 76 | -------------------------------------------------------------------------------- /agent_dqn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import argparse 7 | import random 8 | import time 9 | import sys 10 | import os 11 | 12 | import numpy as np 13 | import cv2 14 | import tensorflow as tf 15 | 16 | def MakeDir(path): 17 | try: 18 | os.makedirs(path) 19 | except: 20 | pass 21 | 22 | lab = False 23 | load_model = False 24 | train = True 25 | test_display = True 26 | test_write_video = False 27 | path_work_dir = "~/rl_3d/" 28 | vizdoom_path = "~/ViZDoom/" 29 | vizdoom_scenario = vizdoom_path + "scenarios/simpler_basic.wad" 30 | 31 | # Lab parameters. 32 | if (lab): 33 | from env_lab import EnvLab 34 | 35 | learning_rate = 0.00025 # 0.001 36 | discount_factor = 0.99 37 | step_num = int(5e5) # int(1e6) 38 | replay_memory_size = int(1e6) 39 | replay_memory_batch_size = 64 40 | 41 | # Exploration rate. 42 | start_eps = 1.0 43 | end_eps = 0.1 44 | eps_decay_iter = 0.33 * step_num 45 | 46 | frame_repeat = 10 # 4 47 | channels = 3 48 | resolution = (40, 40) + (channels,) # Original: 240x320 49 | 50 | model_path = path_work_dir + "model_lab_dqn/" 51 | save_each = 0.01 * step_num 52 | step_load = 100 53 | 54 | # Vizdoom parameters. 55 | if (not lab): 56 | from env_vizdoom import EnvVizDoom 57 | 58 | learning_rate = 0.00025 59 | discount_factor = 0.99 60 | step_num = int(5e4) 61 | replay_memory_size = int(1e5) 62 | replay_memory_batch_size = 64 63 | 64 | frame_repeat = 10 65 | channels = 3 66 | resolution = (40, 40) + (channels,) # Original: 480x640 67 | 68 | start_eps = 1.0 69 | end_eps = 0.1 70 | eps_decay_iter = 0.33 * step_num 71 | 72 | model_path = path_work_dir + "model_vizdoom_dqn/" 73 | save_each = 0.01 * step_num 74 | step_load = 100 75 | 76 | MakeDir(model_path) 77 | model_name = model_path + "dqn" 78 | 79 | # Global variables. 
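# env is created in __main__ below: EnvLab when lab is True, EnvVizDoom otherwise.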
80 | env = None 81 | 82 | def PrintStat(elapsed_time, step, step_num, train_scores): 83 | steps_per_s = 1.0 * step / elapsed_time 84 | steps_per_m = 60.0 * step / elapsed_time 85 | steps_per_h = 3600.0 * step / elapsed_time 86 | steps_remain = step_num - step 87 | remain_h = int(steps_remain / steps_per_h) 88 | remain_m = int((steps_remain - remain_h * steps_per_h) / steps_per_m) 89 | remain_s = int((steps_remain - remain_h * steps_per_h - remain_m * steps_per_m) / steps_per_s) 90 | elapsed_h = int(elapsed_time / 3600) 91 | elapsed_m = int((elapsed_time - elapsed_h * 3600) / 60) 92 | elapsed_s = int((elapsed_time - elapsed_h * 3600 - elapsed_m * 60)) 93 | print("{}% | Steps: {}/{}, {:.2f}M step/h, {:02}:{:02}:{:02}/{:02}:{:02}:{:02}".format( 94 | 100.0 * step / step_num, step, step_num, steps_per_h / 1e6, 95 | elapsed_h, elapsed_m, elapsed_s, remain_h, remain_m, remain_s), file=sys.stderr) 96 | 97 | mean_train = 0 98 | std_train = 0 99 | min_train = 0 100 | max_train = 0 101 | if (len(train_scores) > 0): 102 | train_scores = np.array(train_scores) 103 | mean_train = train_scores.mean() 104 | std_train = train_scores.std() 105 | min_train = train_scores.min() 106 | max_train = train_scores.max() 107 | print("Episodes: {} Rewards: mean: {:.2f}, std: {:.2f}, min: {:.2f}, max: {:.2f}".format( 108 | len(train_scores), mean_train, std_train, min_train, max_train), file=sys.stderr) 109 | 110 | def Preprocess(img): 111 | #cv2.imshow("frame-train", img) 112 | #cv2.waitKey(20) 113 | if (channels == 1): 114 | img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 115 | img = cv2.resize(img, (resolution[1], resolution[0])) 116 | #cv2.imshow("frame-train", img) 117 | #cv2.waitKey(200) 118 | return np.reshape(img, resolution) 119 | 120 | class ReplayMemory(object): 121 | def __init__(self, capacity): 122 | 123 | self.s = np.zeros((capacity,) + resolution, dtype=np.uint8) 124 | self.a = np.zeros(capacity, dtype=np.int32) 125 | self.r = np.zeros(capacity, dtype=np.float32) 126 | self.isterminal = np.zeros(capacity, dtype=np.float32) 127 | 128 | self.capacity = capacity 129 | self.size = 0 130 | self.pos = 0 131 | 132 | def Add(self, s, action, isterminal, reward): 133 | 134 | self.s[self.pos, ...] = s 135 | self.a[self.pos] = action 136 | self.isterminal[self.pos] = isterminal 137 | self.r[self.pos] = reward 138 | 139 | self.pos = (self.pos + 1) % self.capacity 140 | self.size = min(self.size + 1, self.capacity) 141 | 142 | def Get(self, sample_size): 143 | 144 | idx = random.sample(xrange(0, self.size-2), sample_size) 145 | idx2 = [] 146 | for i in idx: 147 | idx2.append(i + 1) 148 | return self.s[idx], self.a[idx], self.s[idx2], self.isterminal[idx], self.r[idx] 149 | 150 | class Model(object): 151 | def __init__(self, session, actions_count): 152 | 153 | self.session = session 154 | 155 | # Create the input. 156 | self.s_ = tf.placeholder(shape=[None] + list(resolution), dtype=tf.float32) 157 | self.q_ = tf.placeholder(shape=[None, actions_count], dtype=tf.float32) 158 | 159 | # Create the network. 
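# Q-network: two 3x3 stride-2 conv layers, a 128-unit fully connected layer, and a linear output giving one Q-value per action.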
160 | conv1 = tf.contrib.layers.conv2d(self.s_, num_outputs=8, kernel_size=[3, 3], stride=[2, 2]) 161 | conv2 = tf.contrib.layers.conv2d(conv1, num_outputs=16, kernel_size=[3, 3], stride=[2, 2]) 162 | conv2_flat = tf.contrib.layers.flatten(conv2) 163 | fc1 = tf.contrib.layers.fully_connected(conv2_flat, num_outputs=128) 164 | 165 | self.q = tf.contrib.layers.fully_connected(fc1, num_outputs=actions_count, activation_fn=None) 166 | self.action = tf.argmax(self.q, 1) 167 | 168 | self.loss = tf.losses.mean_squared_error(self.q_, self.q) 169 | 170 | self.optimizer = tf.train.RMSPropOptimizer(learning_rate) 171 | self.train_step = self.optimizer.minimize(self.loss) 172 | 173 | def Learn(self, state, q): 174 | 175 | state = state.astype(np.float32) 176 | l, _ = self.session.run([self.loss, self.train_step], feed_dict={self.s_: state, self.q_: q}) 177 | return l 178 | 179 | def GetQ(self, state): 180 | 181 | state = state.astype(np.float32) 182 | return self.session.run(self.q, feed_dict={self.s_: state}) 183 | 184 | def GetAction(self, state): 185 | 186 | state = state.astype(np.float32) 187 | state = state.reshape([1] + list(resolution)) 188 | return self.session.run(self.action, feed_dict={self.s_: state})[0] 189 | 190 | class Agent(object): 191 | 192 | def __init__(self, num_actions): 193 | 194 | config = tf.ConfigProto() 195 | config.gpu_options.allow_growth = True 196 | config.log_device_placement = False 197 | config.allow_soft_placement = True 198 | 199 | self.session = tf.Session(config=config) 200 | 201 | self.model = Model(self.session, num_actions) 202 | self.memory = ReplayMemory(replay_memory_size) 203 | 204 | self.rewards = 0 205 | 206 | self.saver = tf.train.Saver(max_to_keep=1000) 207 | if (load_model): 208 | model_name_curr = model_name + "_{:04}".format(step_load) 209 | print("Loading model from: ", model_name_curr) 210 | self.saver.restore(self.session, model_name_curr) 211 | else: 212 | init = tf.global_variables_initializer() 213 | self.session.run(init) 214 | 215 | self.num_actions = num_actions 216 | 217 | def LearnFromMemory(self): 218 | 219 | if (self.memory.size > 2*replay_memory_batch_size): 220 | s1, a, s2, isterminal, r = self.memory.Get(replay_memory_batch_size) 221 | 222 | q = self.model.GetQ(s1) 223 | q2 = np.max(self.model.GetQ(s2), axis=1) 224 | q[np.arange(q.shape[0]), a] = r + (1 - isterminal) * discount_factor * q2 225 | self.model.Learn(s1, q) 226 | 227 | def GetAction(self, state): 228 | 229 | if (random.random() <= 0.05): 230 | a = random.randint(0, self.num_actions-1) 231 | else: 232 | a = self.model.GetAction(state) 233 | 234 | return a 235 | 236 | def Step(self, iteration): 237 | 238 | s = Preprocess(env.Observation()) 239 | 240 | # Epsilon-greedy. 
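# eps decays linearly from start_eps to end_eps over the first eps_decay_iter steps (33% of step_num, i.e. 16500 steps with the ViZDoom settings above) and then stays at end_eps.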
241 | if (iteration < eps_decay_iter): 242 | eps = start_eps - iteration / eps_decay_iter * (start_eps - end_eps) 243 | else: 244 | eps = end_eps 245 | 246 | if (random.random() <= eps): 247 | a = random.randint(0, self.num_actions-1) 248 | else: 249 | a = self.model.GetAction(s) 250 | 251 | reward = env.Act(a, frame_repeat) 252 | self.rewards += reward 253 | 254 | isterminal = not env.IsRunning() 255 | self.memory.Add(s, a, isterminal, reward) 256 | self.LearnFromMemory() 257 | 258 | def Train(self): 259 | 260 | print("Starting training.") 261 | start_time = time.time() 262 | train_scores = [] 263 | env.Reset() 264 | for step in xrange(1, step_num+1): 265 | self.Step(step) 266 | if (not env.IsRunning()): 267 | train_scores.append(self.rewards) 268 | self.rewards = 0 269 | env.Reset() 270 | 271 | if (step % save_each == 0): 272 | model_name_curr = model_name + "_{:04}".format(int(step / save_each)) 273 | print("\nSaving the network weigths to:", model_name_curr, file=sys.stderr) 274 | self.saver.save(self.session, model_name_curr) 275 | 276 | PrintStat(time.time() - start_time, step, step_num, train_scores) 277 | 278 | train_scores = [] 279 | 280 | env.Reset() 281 | 282 | def Test(agent): 283 | if (test_write_video): 284 | size = (640, 480) 285 | fps = 30.0 #/ frame_repeat 286 | fourcc = cv2.VideoWriter_fourcc(*'XVID') # cv2.cv.CV_FOURCC(*'XVID') 287 | out_video = cv2.VideoWriter(path_work_dir + "test.avi", fourcc, fps, size) 288 | 289 | reward_total = 0 290 | num_episodes = 30 291 | while (num_episodes != 0): 292 | if (not env.IsRunning()): 293 | env.Reset() 294 | print("Total reward: {}".format(reward_total)) 295 | reward_total = 0 296 | num_episodes -= 1 297 | 298 | state_raw = env.Observation() 299 | 300 | state = Preprocess(state_raw) 301 | action = agent.GetAction(state) 302 | 303 | for _ in xrange(frame_repeat): 304 | # Display. 
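# During testing the chosen action is repeated frame_repeat times one tick at a time, so every frame can be displayed and/or written to the output video.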
305 | if (test_display): 306 | cv2.imshow("frame-test", state_raw) 307 | cv2.waitKey(20) 308 | 309 | if (test_write_video): 310 | out_video.write(state_raw) 311 | 312 | reward = env.Act(action, 1) 313 | reward_total += reward 314 | 315 | if (not env.IsRunning()): 316 | break 317 | 318 | state_raw = env.Observation() 319 | 320 | if __name__ == '__main__': 321 | 322 | parser = argparse.ArgumentParser() 323 | parser.add_argument("--gpu", help="the GPU to use") 324 | args = parser.parse_args() 325 | 326 | if (args.gpu): 327 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 328 | 329 | if (lab): 330 | env = EnvLab(80, 80, 60, "seekavoid_arena_01") 331 | else: 332 | env = EnvVizDoom(vizdoom_scenario) 333 | 334 | agent = Agent(env.NumActions()) 335 | 336 | if (train): 337 | agent.Train() 338 | 339 | Test(agent) 340 | -------------------------------------------------------------------------------- /agent_a3c.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import cv2 7 | import tensorflow as tf 8 | import threading 9 | import sys 10 | import time 11 | import os 12 | 13 | def MakeDir(path): 14 | try: 15 | os.makedirs(path) 16 | except: 17 | pass 18 | 19 | lab = False 20 | load_model = False 21 | train = True 22 | test_display = True 23 | test_write_video = True 24 | path_work_dir = "~/rl_3d/" 25 | vizdoom_path = "~/ViZDoom/" 26 | vizdoom_scenario = vizdoom_path + "scenarios/simpler_basic.wad" 27 | 28 | if (lab): 29 | from env_lab import EnvLab 30 | 31 | model_path = path_work_dir + "model_lab_a3c/" 32 | else: 33 | from env_vizdoom import EnvVizDoom 34 | 35 | model_path = path_work_dir + "model_vizdoom_a3c/" 36 | 37 | learning_rate = 0.00025 38 | device = "/cpu:0" 39 | num_workers = 3 40 | t_max = 30 41 | frame_repeat = 10 # 4 42 | gamma = 0.99 43 | step_num = int(2.5e5) 44 | save_each = 0.01 * step_num 45 | step_load = 100 46 | entropy_beta = 0.01 47 | grad_norm_clip = 40.0 48 | 49 | global_scope_name = "global" 50 | step = 0 51 | train_scores = [] 52 | lock = threading.Lock() 53 | start_time = 0 54 | 55 | # Global. 
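# This global env instance provides NumActions() when building the networks and is used by Test(); each Worker creates its own environment for training.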
56 | env = None 57 | 58 | MakeDir(model_path) 59 | model_name = model_path + "a3c" 60 | 61 | def PrintStat(elapsed_time, step, step_num, train_scores): 62 | steps_per_s = 1.0 * step / elapsed_time 63 | steps_per_m = 60.0 * step / elapsed_time 64 | steps_per_h = 3600.0 * step / elapsed_time 65 | steps_remain = step_num - step 66 | remain_h = int(steps_remain / steps_per_h) 67 | remain_m = int((steps_remain - remain_h * steps_per_h) / steps_per_m) 68 | remain_s = int((steps_remain - remain_h * steps_per_h - remain_m * steps_per_m) / steps_per_s) 69 | elapsed_h = int(elapsed_time / 3600) 70 | elapsed_m = int((elapsed_time - elapsed_h * 3600) / 60) 71 | elapsed_s = int((elapsed_time - elapsed_h * 3600 - elapsed_m * 60)) 72 | print("{}% | Steps: {}/{}, {:.2f}M step/h, {:02}:{:02}:{:02}/{:02}:{:02}:{:02}".format( 73 | 100.0 * step / step_num, step, step_num, steps_per_h / 1e6, 74 | elapsed_h, elapsed_m, elapsed_s, remain_h, remain_m, remain_s), file=sys.stderr) 75 | 76 | mean_train = 0 77 | std_train = 0 78 | min_train = 0 79 | max_train = 0 80 | if (len(train_scores) > 0): 81 | train_scores = np.array(train_scores) 82 | mean_train = train_scores.mean() 83 | std_train = train_scores.std() 84 | min_train = train_scores.min() 85 | max_train = train_scores.max() 86 | print("Episodes: {} Rewards: mean: {:.2f}, std: {:.2f}, min: {:.2f}, max: {:.2f}".format( 87 | len(train_scores), mean_train, std_train, min_train, max_train), file=sys.stderr) 88 | 89 | channels = 3 90 | resolution = (40, 40, channels) 91 | 92 | def Preprocess(frame): 93 | if (channels == 1): 94 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 95 | frame = cv2.resize(frame, (resolution[1], resolution[0])) 96 | return np.reshape(frame, resolution) 97 | 98 | class ACNet(object): 99 | def __init__(self, num_actions, scope, trainer): 100 | with tf.variable_scope(scope): 101 | self.inputs = tf.placeholder(shape=[None] + list(resolution), dtype=tf.float32) 102 | 103 | conv1 = tf.contrib.layers.conv2d(self.inputs, num_outputs=16, kernel_size=[3, 3], stride=[2, 2]) 104 | conv2 = tf.contrib.layers.conv2d(conv1, num_outputs=32, kernel_size=[3, 3], stride=[2, 2]) 105 | conv2_flat = tf.contrib.layers.flatten(conv2) 106 | hidden = tf.contrib.layers.fully_connected(conv2_flat, 256) 107 | 108 | # Recurrent network for temporal dependencies 109 | 110 | # Introduce a "fake" batch dimension of 1 after flatten so that we can do LSTM over time dim 111 | rnn_in = tf.expand_dims(hidden, [0]) 112 | 113 | lstm_size = 256 114 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, state_is_tuple=True) 115 | step_size = tf.shape(self.inputs)[:1] 116 | 117 | c_init = np.zeros((1, lstm_cell.state_size.c), dtype=np.float32) 118 | h_init = np.zeros((1, lstm_cell.state_size.h), dtype=np.float32) 119 | self.state_init = [c_init, h_init] 120 | self.rnn_state = self.state_init 121 | 122 | c_in = tf.placeholder(shape=[1, lstm_cell.state_size.c], dtype=tf.float32) 123 | h_in = tf.placeholder(shape=[1, lstm_cell.state_size.h], dtype=tf.float32) 124 | self.state_in = (c_in, h_in) 125 | 126 | state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in) 127 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm_cell, rnn_in, initial_state=state_in, 128 | sequence_length=step_size, time_major=False) 129 | lstm_c, lstm_h = lstm_state 130 | rnn_out = tf.reshape(lstm_outputs, [-1, lstm_size]) 131 | self.state_out = (lstm_c[:1, :], lstm_h[:1, :]) 132 | 133 | # Output layers for policy and value estimations 134 | self.policy = tf.contrib.layers.fully_connected(rnn_out, num_actions, 
activation_fn=tf.nn.softmax, 135 | weights_initializer=self.normalized_columns_initializer(0.01), 136 | biases_initializer=None) 137 | self.value = tf.contrib.layers.fully_connected(rnn_out, 1, activation_fn=None, 138 | weights_initializer=self.normalized_columns_initializer(1.0), 139 | biases_initializer=None) 140 | 141 | # Only the worker network need ops for loss functions and gradient updating. 142 | if (scope != global_scope_name): 143 | self.actions = tf.placeholder(shape=[None], dtype=tf.int32) 144 | actions_onehot = tf.one_hot(self.actions, num_actions, dtype=tf.float32) 145 | self.target_v = tf.placeholder(shape=[None], dtype=tf.float32) 146 | self.advantages = tf.placeholder(shape=[None], dtype=tf.float32) 147 | 148 | responsible_outputs = tf.reduce_sum(self.policy * actions_onehot, [1]) 149 | 150 | # Loss functions 151 | value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1]))) 152 | entropy = -tf.reduce_sum(self.policy * tf.log(self.policy)) 153 | policy_loss = -tf.reduce_sum(tf.log(responsible_outputs) * self.advantages) 154 | self.loss = 0.5 * value_loss + policy_loss - entropy * entropy_beta 155 | 156 | # Get gradients from local network using local losses 157 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 158 | self.gradients = tf.gradients(self.loss, local_vars) 159 | 160 | if (grad_norm_clip != None): 161 | grads, _ = tf.clip_by_global_norm(self.gradients, grad_norm_clip) 162 | else: 163 | grads = self.gradients 164 | 165 | # Apply local gradients to global network 166 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_scope_name) 167 | self.apply_grads = trainer.apply_gradients(zip(grads, global_vars)) 168 | 169 | # Used to initialize weights for policy and value output layers 170 | def normalized_columns_initializer(self, std = 1.0): 171 | def _initializer(shape, dtype=None, partition_info=None): 172 | out = np.random.randn(*shape).astype(np.float32) 173 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 174 | return tf.constant(out) 175 | 176 | return _initializer 177 | 178 | def Train(self, sess, discounted_rewards, states, actions, advantages): 179 | states = states / 255.0 180 | self.ResetLstm() 181 | feed_dict = {self.target_v : discounted_rewards, 182 | self.inputs : np.stack(states, axis=0), 183 | self.actions : actions, 184 | self.advantages : advantages, 185 | self.state_in[0] : self.rnn_state[0], 186 | self.state_in[1] : self.rnn_state[1]} 187 | _ = sess.run([self.apply_grads], feed_dict=feed_dict) 188 | 189 | def ResetLstm(self): 190 | self.rnn_state = self.state_init 191 | 192 | def GetAction(self, sess, state): 193 | state = state / 255.0 194 | a_dist, v, self.rnn_state = sess.run([self.policy, self.value, self.state_out], 195 | feed_dict={self.inputs: [state], 196 | self.state_in[0]: self.rnn_state[0], 197 | self.state_in[1]: self.rnn_state[1]}) 198 | a = np.random.choice(a_dist[0], p=a_dist[0]) 199 | a = np.argmax(a_dist == a) 200 | return a, v[0, 0] 201 | 202 | def GetValue(self, sess, state): 203 | state = state / 255.0 204 | v = sess.run([self.value], 205 | feed_dict={self.inputs: [state], 206 | self.state_in[0]: self.rnn_state[0], 207 | self.state_in[1]: self.rnn_state[1]}) 208 | return v[0][0, 0] 209 | 210 | class Worker(object): 211 | def __init__(self, number, num_actions, trainer, model_name): 212 | 213 | self.name = "worker_" + str(number) 214 | self.number = number 215 | self.model_name = model_name 216 | 217 | # Create the local copy of the network 
and the tensorflow op to copy global paramters to local network 218 | self.local_ac = ACNet(num_actions, self.name, trainer) 219 | self.update_target_graph = self.update_target(global_scope_name, self.name) 220 | 221 | if (lab): 222 | self.env = EnvLab(80, 80, 60, "seekavoid_arena_01") 223 | else: 224 | self.env = EnvVizDoom(vizdoom_scenario) 225 | 226 | # Copies one set of variables to another. 227 | # Used to set worker network parameters to those of global network. 228 | def update_target(self, from_scope, to_scope): 229 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 230 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 231 | 232 | op_holder = [] 233 | for from_var, to_var in zip(from_vars, to_vars): 234 | op_holder.append(to_var.assign(from_var)) 235 | return op_holder 236 | 237 | # Calculate discounted returns. 238 | def Discount(self, x, gamma): 239 | for idx in reversed(xrange(len(x) - 1)): 240 | x[idx] += x[idx + 1] * gamma 241 | return x 242 | 243 | def Start(self, session, saver, coord): 244 | worker_process = lambda: self.Process(session, saver, coord) 245 | thread = threading.Thread(target=worker_process) 246 | thread.start() 247 | 248 | global start_time 249 | start_time = time.time() 250 | return thread 251 | 252 | def Train(self, episode_buffer, sess, bootstrap_value): 253 | episode_buffer = np.array(episode_buffer) 254 | states = episode_buffer[:, 0] 255 | actions = episode_buffer[:, 1] 256 | rewards = episode_buffer[:, 2] 257 | values = episode_buffer[:, 3] 258 | 259 | # Here we take the rewards and values from the episode_buffer, and use them to 260 | # generate the advantage and discounted returns. 261 | # The advantage function uses "Generalized Advantage Estimation" 262 | rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 263 | discounted_rewards = self.Discount(rewards_plus, gamma)[:-1] 264 | 265 | value_plus = np.asarray(values.tolist() + [bootstrap_value]) 266 | advantages = rewards + gamma * value_plus[1:] - value_plus[:-1] 267 | advantages = self.Discount(advantages, gamma) 268 | 269 | # Update the global network using gradients from loss 270 | # Generate network statistics to periodically save 271 | self.local_ac.Train(sess, discounted_rewards, states, actions, advantages) 272 | 273 | def Process(self, sess, saver, coord): 274 | global step, train_scores, start_time, lock 275 | 276 | print("Starting worker " + str(self.number)) 277 | while (not coord.should_stop()): 278 | sess.run(self.update_target_graph) 279 | episode_buffer = [] 280 | episode_reward = 0 281 | 282 | self.env.Reset() 283 | s = self.env.Observation() 284 | s = Preprocess(s) 285 | self.local_ac.ResetLstm() 286 | 287 | while (self.env.IsRunning()): 288 | # Take an action using probabilities from policy network output. 
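# GetAction samples an action from the softmax policy and also returns the critic's value estimate; both are stored in episode_buffer and later used for the advantage computation in Train().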
289 | a, v = self.local_ac.GetAction(sess, s) 290 | r = self.env.Act(a, frame_repeat) 291 | finished = not self.env.IsRunning() 292 | if (not finished): 293 | s1 = self.env.Observation() 294 | s1 = Preprocess(s1) 295 | else: 296 | s1 = None 297 | 298 | episode_buffer.append([s, a, r, v]) 299 | 300 | episode_reward += r 301 | s = s1 302 | 303 | lock.acquire() 304 | 305 | step += 1 306 | 307 | if (step % save_each == 0): 308 | model_name_curr = self.model_name + "_{:04}".format(int(step / save_each)) 309 | print("\nSaving the network weigths to:", model_name_curr, file=sys.stderr) 310 | saver.save(sess, model_name_curr) 311 | 312 | PrintStat(time.time() - start_time, step, step_num, train_scores) 313 | 314 | train_scores = [] 315 | 316 | if (step == step_num): 317 | coord.request_stop() 318 | 319 | lock.release() 320 | 321 | # If the episode hasn't ended, but the experience buffer is full, then we 322 | # make an update step using that experience rollout. 323 | if (len(episode_buffer) == t_max or (finished and len(episode_buffer) > 0)): 324 | # Since we don't know what the true final return is, 325 | # we "bootstrap" from our current value estimation. 326 | if (not finished): 327 | v1 = self.local_ac.GetValue(sess, s) 328 | self.Train(episode_buffer, sess, v1) 329 | episode_buffer = [] 330 | sess.run(self.update_target_graph) 331 | else: 332 | self.Train(episode_buffer, sess, 0.0) 333 | 334 | lock.acquire() 335 | train_scores.append(episode_reward) 336 | lock.release() 337 | 338 | class Agent(object): 339 | def __init__(self): 340 | config = tf.ConfigProto() 341 | config.gpu_options.allow_growth = True 342 | config.log_device_placement = False 343 | config.allow_soft_placement = True 344 | 345 | self.session = tf.Session(config=config) 346 | 347 | with tf.device(device): 348 | # Global network 349 | self.global_net = ACNet(env.NumActions(), global_scope_name, None) 350 | 351 | if (train): 352 | trainer = tf.train.RMSPropOptimizer(learning_rate) 353 | 354 | workers = [] 355 | for i in xrange(num_workers): 356 | workers.append(Worker(i, env.NumActions(), trainer, model_name)) 357 | 358 | saver = tf.train.Saver(max_to_keep=100) 359 | if (load_model): 360 | model_name_curr = model_name + "_{:04}".format(step_load) 361 | print("Loading model from: ", model_name_curr) 362 | saver.restore(self.session, model_name_curr) 363 | else: 364 | self.session.run(tf.global_variables_initializer()) 365 | 366 | if (train): 367 | coord = tf.train.Coordinator() 368 | # Start the "work" process for each worker in a separate thread. 
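# All worker threads share the same TensorFlow session and global network; the module-level lock guards the shared step counter and train_scores list.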
369 | worker_threads = [] 370 | for worker in workers: 371 | thread = worker.Start(self.session, saver, coord) 372 | worker_threads.append(thread) 373 | coord.join(worker_threads) 374 | 375 | def Reset(self): 376 | self.global_net.ResetLstm() 377 | 378 | def Act(self, state): 379 | action, _ = self.global_net.GetAction(self.session, state) 380 | return action 381 | 382 | def Test(agent): 383 | if (test_write_video): 384 | size = (640, 480) 385 | fps = 30.0 386 | fourcc = cv2.VideoWriter_fourcc(*'XVID') # cv2.cv.CV_FOURCC(*'XVID') 387 | out_video = cv2.VideoWriter(path_work_dir + "test.avi", fourcc, fps, size) 388 | 389 | reward_total = 0 390 | num_episodes = 30 391 | while (num_episodes != 0): 392 | if (not env.IsRunning()): 393 | env.Reset() 394 | agent.Reset() 395 | print("Total reward: {}".format(reward_total)) 396 | reward_total = 0 397 | num_episodes -= 1 398 | 399 | state_raw = env.Observation() 400 | 401 | state = Preprocess(state_raw) 402 | action = agent.Act(state) 403 | 404 | for _ in xrange(frame_repeat): 405 | if (test_display): 406 | cv2.imshow("frame-test", state_raw) 407 | cv2.waitKey(20) 408 | 409 | if (test_write_video): 410 | out_video.write(state_raw) 411 | 412 | reward = env.Act(action, 1) 413 | reward_total += reward 414 | 415 | if (not env.IsRunning()): 416 | break 417 | 418 | state_raw = env.Observation() 419 | 420 | if __name__ == '__main__': 421 | 422 | if (lab): 423 | env = EnvLab(80, 80, 60, "seekavoid_arena_01") 424 | else: 425 | env = EnvVizDoom(vizdoom_scenario) 426 | 427 | agent = Agent() 428 | 429 | Test(agent) 430 | --------------------------------------------------------------------------------