├── Config.py ├── Display.py ├── Environment.py ├── Experience.py ├── GA3C.py ├── GameManager.py ├── NetworkVP.py ├── ProcessAgent.py ├── ProcessStats.py ├── README.md ├── Server.py ├── ThreadDynamicAdjustment.py ├── ThreadPredictor.py ├── ThreadTrainer.py └── assets ├── DeepNav_final.pdf ├── nav_maze_static_01_score.png ├── nn.png └── stairway_to_melon_score.png /Config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | class Config: 28 | 29 | ######################################################################### 30 | # Number of stacked LSTM layers 31 | NUM_LSTMS = 2 32 | 33 | ######################################################################### 34 | # Game configuration 35 | 36 | #MAP = 'seekavoid_arena_01' 37 | MAP = 'stairway_to_melon' 38 | #MAP = 'nav_maze_static_01' 39 | #MAP = 'nav_maze_static_02' 40 | 41 | # Enable to see the trained agent in action 42 | PLAY_MODE = False 43 | # Enable to train 44 | TRAIN_MODELS = True 45 | # Load old models. Throws if the model doesn't exist 46 | LOAD_CHECKPOINT = False 47 | # If 0, the latest checkpoint is loaded 48 | LOAD_EPISODE = 0 49 | 50 | ######################################################################### 51 | # Number of agents, predictors, trainers and other system settings 52 | 53 | # If the dynamic configuration is on, these are the initial values. 
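# Note: any attribute of this class can be overridden from the command line,
# since GA3C.py parses arguments of the form Config=Value and casts the value
# to the attribute's existing type (to set a boolean to False, pass an empty
# value, e.g. TENSORBOARD=). A hypothetical invocation is sketched below; how
# extra arguments are forwarded through bazel depends on the local setup:
#   bazel run :GA3C-DeepNavigation_train --define headless=osmesa -- AGENTS=16 PREDICTORS=4 TRAINERS=4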
54 | # Number of Agents
55 | AGENTS = 8
56 | # Number of Predictors
57 | PREDICTORS = 2
58 | # Number of Trainers
59 | TRAINERS = 2
60 |
61 | # Device
62 | DEVICE = 'gpu:0'
63 |
64 | # Play mode display size
65 | DISPLAY_SIZE = (440, 400)
66 | # Movie recording
67 | RECORD = False
68 | VIDEO_DURATION = 60 # seconds
69 |
70 | # Enable the dynamic adjustment (+ waiting time to start it)
71 | DYNAMIC_SETTINGS = False
72 | DYNAMIC_SETTINGS_STEP_WAIT = 20
73 | DYNAMIC_SETTINGS_INITIAL_WAIT = 10
74 |
75 | #########################################################################
76 | # Algorithm parameters
77 |
78 | # Discount factor
79 | DISCOUNT = 0.99
80 |
81 | # Tmax (interval over which gradients are computed)
82 | TIME_MAX = 50
83 |
84 | # Maximum steps taken by agent in environment
85 | MAX_STEPS = 10 * 10**7
86 |
87 | # Reward Clipping
88 | REWARD_CLIPPING = False
89 | REWARD_MIN = -1
90 | REWARD_MAX = 1
91 |
92 | # Max size of the queue
93 | MAX_QUEUE_SIZE = 100
94 | PREDICTION_BATCH_SIZE = 128
95 |
96 | # Input of the DNN
97 | STACKED_FRAMES = 1
98 | IMAGE_WIDTH = 84
99 | IMAGE_HEIGHT = 84
100 | IMAGE_DEPTH = 3 # 3 for RGB, 4 for RGBD
101 |
102 | COMBINED_STATE_SIZE = 21240 # includes auxiliary inputs to NN (TODO: can be calculated inside the program using other params)
103 | VEL_DIM = 6 # velocity dimension
104 | DEPTH_PIXELS = 64 # number of depth pixels for auxiliary supervision
105 | DEPTH_QUANTIZATION = 8 # number of bins for depth
106 |
107 | # scaling factors for depth loss
108 | BETA1 = 1
109 | BETA2 = 1
110 |
111 | # Lab setting (frames per second)
112 | FPS = 60
113 |
114 | # Rotation for look-left, look-right actions [-512, 512]
115 | ROTATION = 20
116 |
117 | # Total number of episodes and annealing frequency
118 | EPISODES = 400000
119 | ANNEALING_EPISODE_COUNT = 400000
120 |
121 | # Entropy regularization hyper-parameter
122 | BETA_START = 0.001
123 | BETA_END = 0.001
124 |
125 | # Learning rate
126 | LEARNING_RATE_START = 0.0005
127 | LEARNING_RATE_END = 0.0005
128 |
129 | # RMSProp parameters
130 | RMSPROP_DECAY = 0.99
131 | RMSPROP_MOMENTUM = 0.0
132 | RMSPROP_EPSILON = 0.1
133 |
134 | # Dual RMSProp - we found that using a single RMSProp for the two cost functions works better and faster
135 | DUAL_RMSPROP = False
136 |
137 | # Gradient clipping
138 | USE_GRAD_CLIP = False
139 | GRAD_CLIP_NORM = 40.0
140 | # Epsilon (regularize policy lag in GA3C)
141 | LOG_EPSILON = 1e-6
142 | # Training min batch size - increasing the batch size increases the stability of the algorithm, but makes learning slower
143 | TRAINING_MIN_BATCH_SIZE = 0
144 |
145 | #########################################################################
146 | # Log and save
147 |
148 | # Enable TensorBoard
149 | TENSORBOARD = False
150 | # Update TensorBoard every X training steps
151 | TENSORBOARD_UPDATE_FREQUENCY = 1000
152 |
153 | # Enable to save models every SAVE_FREQUENCY episodes
154 | SAVE_MODELS = True
155 | # Save every SAVE_FREQUENCY episodes
156 | SAVE_FREQUENCY = 1000
157 |
158 | # Print stats every PRINT_STATS_FREQUENCY episodes
159 | PRINT_STATS_FREQUENCY = 1
160 | # The window to average stats
161 | STAT_ROLLING_MEAN_WINDOW = 1000
162 |
163 | # Results filename
164 | RESULTS_FILENAME = 'results.txt'
165 | # Network checkpoint name
166 | NETWORK_NAME = 'network'
167 |
168 | #########################################################################
169 | # More experimental parameters here
170 |
171 | # Minimum policy
172 | MIN_POLICY = 0.0
173 | # Use log_softmax() instead of log(softmax())
174
| USE_LOG_SOFTMAX = False 175 | -------------------------------------------------------------------------------- /Display.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | from Config import Config 3 | import numpy as np 4 | from collections import deque 5 | import cv2 6 | 7 | BLUE = (128, 128, 255) 8 | RED = (255, 192, 192) 9 | BLACK = (0, 0, 0) 10 | WHITE = (255, 255, 255) 11 | 12 | depth_dict = {k:v for k,v in zip(range(Config.DEPTH_QUANTIZATION), 13 | [0.05,0.175,0.3,0.425,0.55,0.675,0.8,1])} #bins 14 | 15 | class MovieWriter(object): 16 | def __init__(self, file_name, frame_size, fps): 17 | self.vout = cv2.VideoWriter() 18 | if not self.vout.open(file_name, 19 | cv2.VideoWriter_fourcc('M','J','P','G'), fps, frame_size, True): 20 | print("Create movie failed: {0}".format(file_name)) 21 | 22 | def add_frame(self, frame): 23 | self.vout.write(frame) 24 | 25 | def close(self): 26 | self.vout.release() 27 | self.vout = None 28 | 29 | def isOpen(self): 30 | return self.vout != None and self.vout.isOpened() 31 | 32 | class Display(object): 33 | def __init__(self): 34 | pygame.init() 35 | 36 | self.display_size = Config.DISPLAY_SIZE 37 | self.surface = pygame.display.set_mode(self.display_size, 0, 24) 38 | pygame.display.set_caption('NAV') 39 | self.font = pygame.font.SysFont(None, 20) 40 | self._values = deque(maxlen=100) 41 | if Config.RECORD: 42 | self.video_fps = 5 43 | self.frames = 0 44 | self.writer = MovieWriter('melonvideo.avi', self.display_size, self.video_fps) 45 | 46 | def draw_center_text(self, str, center_x, top): 47 | text = self.font.render(str, True, WHITE, BLACK) 48 | text_rect = text.get_rect() 49 | text_rect.centerx = center_x 50 | text_rect.top = top 51 | self.surface.blit(text, text_rect) 52 | 53 | def show_image(self, im): 54 | data = im.astype(np.uint8) 55 | image = pygame.image.frombuffer(data, (84,84), 'RGB') 56 | image = pygame.transform.scale(image, (128, 128)) 57 | self.surface.blit(image, (8, 8)) 58 | self.draw_center_text("input", 50, 150) 59 | 60 | def show_depth(self, dm): 61 | dm = dm * 255. 
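# dm holds one expected-depth value per depth pixel, looked up from depth_dict,
# so each entry lies in [0.05, 1]; scaling by 255 turns it into an 8-bit
# grayscale intensity before the 64 values are drawn as a 16x4 strip below.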
62 | data = dm.astype(np.uint8) 63 | color_img = cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) 64 | 65 | image = pygame.image.frombuffer(color_img, (16,4), 'RGB') 66 | image = pygame.transform.scale(image, (128, 32)) 67 | self.surface.blit(image, (200, 8)) 68 | self.draw_center_text("depth", 250, 50) 69 | 70 | def show_policy(self, pi): 71 | start_x = 10 72 | 73 | y = 200 74 | 75 | for i in range(len(pi)): 76 | width = pi[i] * 100 77 | pygame.draw.rect(self.surface, WHITE, (start_x, y, width, 10)) 78 | y += 20 79 | self.draw_center_text("Action Prob.", 50, y) 80 | 81 | def show_values(self): 82 | if len(self._values) == 0: 83 | return 84 | 85 | min_v = float("inf") 86 | max_v = float("-inf") 87 | 88 | for v in self._values: 89 | min_v = min(min_v, v) 90 | max_v = max(max_v, v) 91 | 92 | top = 150 93 | left = 150 94 | width = 100 95 | height = 100 96 | bottom = top + width 97 | right = left + height 98 | 99 | d = max_v - min_v 100 | last_r = 0.0 101 | for i,v in enumerate(self._values): 102 | r = (v - min_v) / d 103 | if i > 0: 104 | x0 = i-1 + left 105 | x1 = i + left 106 | y0 = bottom - last_r * height 107 | y1 = bottom - r * height 108 | pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1) 109 | last_r = r 110 | 111 | pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1) 112 | pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1) 113 | pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1) 114 | pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1) 115 | 116 | self.draw_center_text("V", left + width/2, bottom+10) 117 | 118 | def update(self, state, prediction, value, depth): 119 | im_size = Config.IMAGE_HEIGHT*Config.IMAGE_WIDTH*Config.IMAGE_DEPTH 120 | im = state[:im_size] * 255. 121 | im = np.reshape(im, (Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.IMAGE_DEPTH)) 122 | self._values.append(value) 123 | 124 | # create depth_map (4,16) from depth (64, 8) 125 | depth_map = [depth_dict[np.argmax(depth[p])] for p in 126 | range(depth.shape[0])] 127 | depth_map = np.array(depth_map) 128 | 129 | self.surface.fill(BLACK) 130 | self.show_image(im) 131 | self.show_policy(prediction) 132 | self.show_values() 133 | self.show_depth(depth_map) 134 | pygame.display.update() 135 | 136 | if Config.RECORD and self.writer.isOpen(): 137 | frame_str = self.surface.get_buffer().raw 138 | d = np.fromstring(frame_str, dtype=np.uint8) 139 | d = d.reshape((self.display_size[1], self.display_size[0], 3)) 140 | self.writer.add_frame(d) 141 | self.frames += 1 142 | if self.frames == Config.VIDEO_DURATION*self.video_fps: 143 | print("Movie writing complete.") 144 | self.writer.close() 145 | -------------------------------------------------------------------------------- /Environment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import sys 28 | if sys.version_info >= (3,0): 29 | from queue import Queue 30 | else: 31 | from Queue import Queue 32 | 33 | import numpy as np 34 | #import scipy.misc as misc 35 | 36 | from Config import Config 37 | from GameManager import GameManager 38 | 39 | class Environment: 40 | def __init__(self): 41 | self.game = GameManager(Config.MAP) 42 | self.nb_frames = Config.STACKED_FRAMES 43 | self.frame_q = Queue(maxsize=self.nb_frames) 44 | self.previous_state = None 45 | self.current_state = None 46 | self.total_reward = 0 47 | 48 | self.reset() 49 | 50 | def is_running(self): 51 | return self.game.is_running() 52 | 53 | @staticmethod 54 | def _rgb2gray(rgb): 55 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 56 | 57 | @staticmethod 58 | def _preprocess(image): 59 | #image = Environment._rgb2gray(image) 60 | #image = misc.imresize(image, [Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH], 'bilinear') 61 | image = image.astype(np.float32) / 255. 62 | return image 63 | 64 | def _get_current_state_no_stacking(self): 65 | if not self.frame_q.full(): 66 | return None # frame queue is not full yet. 67 | return np.array(list(self.frame_q.queue)[0]) 68 | 69 | def _get_current_state(self): 70 | if not self.frame_q.full(): 71 | return None # frame queue is not full yet. 72 | x_ = [np.array(i) for i in list(self.frame_q.queue)] 73 | x_ = np.concatenate(x_, axis=2) 74 | #x_ = np.array(self.frame_q.queue) 75 | #x_ = np.transpose(x_, [1, 2, 3, 0]) # move channels 76 | return x_ 77 | 78 | def _update_frame_q(self, frame): 79 | if self.frame_q.full(): 80 | self.frame_q.get() 81 | self.frame_q.put(frame) 82 | 83 | # the state is no longer just the image, but a concatenation of 84 | # image and auxiliary inputs. 
We can't use the same _preprocess() 85 | #image = Environment._preprocess(frame) 86 | #self.frame_q.put(image) 87 | 88 | def get_num_actions(self): 89 | return GameManager.get_num_actions() 90 | 91 | def reset(self): 92 | self.total_reward = 0 93 | self.frame_q.queue.clear() 94 | self.game.reset() 95 | self._update_frame_q(self.game.get_state()) 96 | self.previous_state = self.current_state = None 97 | 98 | def step(self, action): 99 | reward, is_running = self.game.step(action) 100 | self.total_reward += reward 101 | self.previous_state = self.current_state 102 | 103 | if is_running: 104 | observation = self.game.get_state() 105 | self._update_frame_q(observation) 106 | self.current_state = self._get_current_state_no_stacking() 107 | 108 | return reward, is_running 109 | -------------------------------------------------------------------------------- /Experience.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | class Experience: 28 | def __init__(self, state, action, prediction, reward): 29 | self.state = state 30 | self.action = action 31 | self.prediction = prediction 32 | self.reward = reward 33 | -------------------------------------------------------------------------------- /GA3C.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # check python version; warn if not Python3 28 | import os, sys 29 | import warnings 30 | if sys.version_info < (3,0): 31 | warnings.warn("Optimized for Python3. Performance may suffer under Python2.", Warning) 32 | 33 | from Config import Config 34 | from Server import Server 35 | 36 | # Suppress the output from C functions 37 | # source - http://stackoverflow.com/questions/5081657/how-do-i-prevent-a-c-shared-library-to-print-on-stdout-in-python 38 | def redirect_stdout(): 39 | sys.stdout.flush() # <--- important when redirecting to files 40 | newstdout = os.dup(1) 41 | devnull = os.open(os.devnull, os.O_WRONLY) 42 | os.dup2(devnull, 1) 43 | os.close(devnull) 44 | sys.stdout = os.fdopen(newstdout, 'w') 45 | 46 | def checks(): 47 | if Config.STACKED_FRAMES != 1: 48 | assert False, "Stacking of multiple frames not supported. See disentangle_obs() in NetworkVP.py" 49 | 50 | if Config.NUM_LSTMS != 2: 51 | assert False, "Architecture hard-wired for 2 stacked LSTM layers" 52 | 53 | # Parse arguments 54 | for i in range(1, len(sys.argv)): 55 | # Config arguments should be in format of Config=Value 56 | # For setting booleans to False use Config= 57 | x, y = sys.argv[i].split('=') 58 | setattr(Config, x, type(getattr(Config, x))(y)) 59 | 60 | # Adjust configs for Play mode 61 | if Config.PLAY_MODE: 62 | print("==Play mode on==") 63 | Config.AGENTS = 1 64 | Config.PREDICTORS = 1 65 | Config.TRAINERS = 1 66 | Config.DYNAMIC_SETTINGS = False 67 | 68 | Config.LOAD_CHECKPOINT = True 69 | Config.TRAIN_MODELS = False 70 | Config.SAVE_MODELS = False 71 | 72 | redirect_stdout() 73 | checks() 74 | print('+++ GA3C on %s +++'%Config.MAP) 75 | print('===Network===') 76 | print('LSTM layers:', Config.NUM_LSTMS) 77 | print("Reward clipping %s. Clipping affects policy!"%('ENABLED' if Config.REWARD_CLIPPING else 'DISABLED')) 78 | print('======') 79 | # Start main program 80 | Server().main() 81 | -------------------------------------------------------------------------------- /GameManager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import deepmind_lab 28 | import numpy as np 29 | from Config import Config 30 | import sys 31 | 32 | def _action(*entries): 33 | return np.array(entries, dtype=np.intc) 34 | 35 | class GameManager: 36 | 37 | ACTION_LIST = [ 38 | _action(-1*int(Config.ROTATION), 0, 0, 0, 0, 0, 0), # look_left 39 | _action( int(Config.ROTATION), 0, 0, 0, 0, 0, 0), # look_right 40 | #_action( 0, 10, 0, 0, 0, 0, 0), # look_up 41 | #_action( 0, -10, 0, 0, 0, 0, 0), # look_down 42 | #_action(-1*int(Config.ROTATION), 0, 0, 1, 0, 0, 0), 43 | #_action( int(Config.ROTATION), 0, 0, 1, 0, 0, 0), 44 | _action( 0, 0, -1, 0, 0, 0, 0), # strafe_left 45 | _action( 0, 0, 1, 0, 0, 0, 0), # strafe_right 46 | _action( 0, 0, 0, 1, 0, 0, 0), # forward 47 | _action( 0, 0, 0, -1, 0, 0, 0), # backward 48 | #_action( 0, 0, 0, 0, 1, 0, 0), # fire 49 | #_action( 0, 0, 0, 0, 0, 1, 0), # jump 50 | #_action( 0, 0, 0, 0, 0, 0, 1) # crouch 51 | ] 52 | 53 | def __init__(self, map_name): 54 | self.map_name = map_name 55 | self.obs_specs = ['RGBD_INTERLACED', 'VEL.TRANS', 'VEL.ROT'] 56 | 57 | self.lab = deepmind_lab.Lab(map_name, self.obs_specs, config={ 58 | 'fps': str(Config.FPS), 59 | 'width': str(Config.IMAGE_WIDTH), 60 | 'height': str(Config.IMAGE_HEIGHT) 61 | }) 62 | 63 | self.prev_action = 0 64 | self.prev_reward = 0 65 | self.reset() 66 | 67 | def reset(self): 68 | self.prev_action = 0 69 | self.prev_reward = 0 70 | if not self.lab.reset(): 71 | assert 'Error reseting lab environment' 72 | 73 | def is_running(self): 74 | return self.lab.is_running() 75 | 76 | def get_state(self): 77 | obs = self.lab.observations() # dict of Numpy arrays 78 | image = obs['RGBD_INTERLACED'] 79 | 80 | # create a low resolution (4x16) depth map from the 84x84 image 81 | depth_map = image[:,:,3] 82 | depth_map = depth_map[16:-16,:] # crop 83 | depth_map = depth_map[:,2:-2] # crop 84 | depth_map = depth_map[::13,::5] # subsample 85 | 86 | image = image[:,:,:3].astype(np.float32) / 255. 
#RGB 87 | 88 | # flatten array for later append 89 | image = image.flatten() 90 | depth_map = depth_map.flatten() 91 | 92 | # quantize depth (as per DeepMind paper) 93 | depth_map = np.power(depth_map/255., 10) 94 | depth_map = np.digitize(depth_map, 95 | [0,0.05,0.175,0.3,0.425,0.55,0.675,0.8,1.01]) # bins 96 | depth_map -= 1 97 | 98 | # velocity vectors 99 | vel_vec1 = obs['VEL.TRANS'] 100 | vel_vec2 = obs['VEL.ROT'] 101 | 102 | # combined state 103 | state = np.append(image, depth_map) 104 | state = np.append(state, vel_vec1) 105 | state = np.append(state, vel_vec2) 106 | state = np.append(state, self.prev_action) 107 | state = np.append(state, self.prev_reward) 108 | 109 | return state 110 | 111 | @staticmethod 112 | def get_num_actions(): 113 | return len(GameManager.ACTION_LIST) 114 | 115 | def step(self, action): 116 | if action == -1: #NO-OP 117 | reward = 0 118 | else: 119 | reward = self.lab.step(GameManager.ACTION_LIST[action], num_steps=4) 120 | self.prev_action = action 121 | self.prev_reward = reward 122 | 123 | return reward, self.is_running() 124 | -------------------------------------------------------------------------------- /NetworkVP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | import os 28 | import re 29 | import numpy as np 30 | import tensorflow as tf 31 | 32 | from Config import Config 33 | 34 | 35 | class NetworkVP: 36 | def __init__(self, device, model_name, num_actions): 37 | self.device = device 38 | self.model_name = model_name 39 | self.num_actions = num_actions 40 | 41 | self.img_width = Config.IMAGE_WIDTH 42 | self.img_height = Config.IMAGE_HEIGHT 43 | self.img_channels = Config.IMAGE_DEPTH * Config.STACKED_FRAMES 44 | 45 | self.learning_rate = Config.LEARNING_RATE_START 46 | self.beta = Config.BETA_START 47 | self.log_epsilon = Config.LOG_EPSILON 48 | 49 | self.graph = tf.Graph() 50 | with self.graph.as_default() as g: 51 | with tf.device(self.device): 52 | self._create_graph() 53 | 54 | self.sess = tf.Session( 55 | graph=self.graph, 56 | config=tf.ConfigProto( 57 | allow_soft_placement=True, 58 | log_device_placement=False, 59 | gpu_options=tf.GPUOptions(allow_growth=True))) 60 | self.sess.run(tf.global_variables_initializer()) 61 | 62 | if Config.TENSORBOARD: self._create_tensor_board() 63 | if Config.LOAD_CHECKPOINT or Config.SAVE_MODELS: 64 | vars = tf.global_variables() 65 | self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0) 66 | 67 | 68 | def _create_graph(self): 69 | self.x = tf.placeholder( 70 | tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='X') 71 | self.y_r = tf.placeholder(tf.float32, [None], name='Yr') 72 | self.p_rewards = tf.placeholder(tf.float32, [None, 1], name='p_rewards') 73 | self.aux_inp = tf.placeholder(tf.float32, shape=[None, self.num_actions+Config.VEL_DIM], name='aux_input') 74 | self.depth_labels = [tf.placeholder(tf.int32, shape=[None, Config.DEPTH_QUANTIZATION])]*Config.DEPTH_PIXELS 75 | 76 | self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[]) 77 | self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[]) 78 | 79 | self.global_step = tf.Variable(0, trainable=False, name='step') 80 | 81 | # As implemented in A3C paper 82 | self.n1 = self.conv2d_layer(self.x, 8, 16, 'conv11', strides=[1, 4, 4, 1]) 83 | self.n2 = self.conv2d_layer(self.n1, 4, 32, 'conv12', strides=[1, 2, 2, 1]) 84 | self.action_index = tf.placeholder(tf.float32, name='action_index', shape=[None, self.num_actions]) 85 | _input = self.n2 86 | 87 | flatten_input_shape = _input.get_shape() 88 | nb_elements = flatten_input_shape[1] * flatten_input_shape[2] * flatten_input_shape[3] 89 | 90 | self.flat = tf.reshape(_input, shape=[-1, nb_elements._value]) 91 | self.enc_out = self.dense_layer(self.flat, 256, 'dense1') # encoder output 92 | 93 | self.d1 = self.dense_layer(self.enc_out, 128, 'depth1') 94 | 95 | # input to first LSTM. 
Add previous step rewards 96 | self.aux1 = tf.concat((self.enc_out, self.p_rewards), axis=1) 97 | 98 | lstm_layers = Config.NUM_LSTMS 99 | self.seq_len = tf.placeholder(tf.int32, name='seq_len', shape=[]) # LSTM sequence length 100 | self.state_in = [] # LSTM input state 101 | self.state_out = [] # LSTM output state 102 | 103 | with tf.variable_scope('lstm1'): 104 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(64, state_is_tuple=True) 105 | c_in_1 = tf.placeholder(tf.float32, name='c_1', shape=[None, lstm_cell.state_size.c]) 106 | h_in_1 = tf.placeholder(tf.float32, name='h_1', shape=[None, lstm_cell.state_size.h]) 107 | self.state_in.append((c_in_1, h_in_1)) 108 | 109 | # using tf.stack here since tf doesn't like when integers and 110 | # placeholders are mixed together in the desired shape 111 | rnn_in = tf.reshape(self.aux1, tf.stack([-1, self.seq_len, self.aux1.shape[1]])) 112 | 113 | init_1 = tf.contrib.rnn.LSTMStateTuple(c_in_1, h_in_1) 114 | lstm_outputs_1, lstm_state_1 = tf.nn.dynamic_rnn(lstm_cell, rnn_in, 115 | initial_state=init_1, time_major=False) 116 | lstm_outputs_1 = tf.reshape(lstm_outputs_1, [-1, 64]) 117 | self.state_out.append(tuple(lstm_state_1)) 118 | 119 | # input to second LSTM. Add previous LSTM output, vel and prev action 120 | self.aux2 = tf.concat((self.enc_out, lstm_outputs_1), axis=1) 121 | self.aux2 = tf.concat((self.aux2, self.aux_inp), axis=1) 122 | 123 | with tf.variable_scope('lstm2'): 124 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True) 125 | c_in_2 = tf.placeholder(tf.float32, name='c_2', shape=[None, lstm_cell.state_size.c]) 126 | h_in_2 = tf.placeholder(tf.float32, name='h_2', shape=[None, lstm_cell.state_size.h]) 127 | self.state_in.append((c_in_2, h_in_2)) 128 | 129 | rnn_in = tf.reshape(self.aux2, tf.stack([-1, self.seq_len, self.aux2.shape[1]])) 130 | init_2 = tf.contrib.rnn.LSTMStateTuple(c_in_2, h_in_2) 131 | lstm_outputs_2, lstm_state_2 = tf.nn.dynamic_rnn(lstm_cell, rnn_in, 132 | initial_state=init_2, time_major=False) 133 | self.state_out.append(tuple(lstm_state_2)) 134 | 135 | self.rnn_out = tf.reshape(lstm_outputs_2, [-1, 256]) 136 | 137 | self.d2 = self.dense_layer(self.rnn_out, 128, 'depth2') 138 | self.logits_v = tf.squeeze(self.dense_layer(self.rnn_out, 1, 'logits_v', func=None), axis=[1]) 139 | self.logits_p = self.dense_layer(self.rnn_out, self.num_actions, 'logits_p', func=None) 140 | 141 | if Config.USE_LOG_SOFTMAX: 142 | self.softmax_p = tf.nn.softmax(self.logits_p) 143 | self.log_softmax_p = tf.nn.log_softmax(self.logits_p) 144 | self.log_selected_action_prob = tf.reduce_sum(self.log_softmax_p * self.action_index, axis=1) 145 | 146 | self.cost_p_1 = self.log_selected_action_prob * (self.y_r - tf.stop_gradient(self.logits_v)) 147 | self.cost_p_2 = -1 * self.var_beta * \ 148 | tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1) 149 | else: 150 | self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY) / (1.0 + Config.MIN_POLICY * self.num_actions) 151 | self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1) 152 | 153 | self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) \ 154 | * (self.y_r - tf.stop_gradient(self.logits_v)) 155 | self.cost_p_2 = -1 * self.var_beta * \ 156 | tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon)) * 157 | self.softmax_p, axis=1) 158 | 159 | # use a mask since we pad bactches of size < TIME_MAX 160 | mask = tf.reduce_max(self.action_index,axis=1) 161 | self.cost_v = 0.5 * 
tf.reduce_sum(tf.square(self.y_r - self.logits_v) * mask, axis=0) 162 | self.cost_p_1_agg = tf.reduce_sum(self.cost_p_1 * mask, axis=0) 163 | self.cost_p_2_agg = tf.reduce_sum(self.cost_p_2 * mask, axis=0) 164 | 165 | # depth logits 166 | self.d1_logits = [self.dense_layer(self.d1, Config.DEPTH_QUANTIZATION, 'logits_d1_%d'%i, func=None) 167 | for i in range(Config.DEPTH_PIXELS)] 168 | 169 | self.d2_logits = [self.dense_layer(self.d2, Config.DEPTH_QUANTIZATION, 'logits_d2_%d'%i, func=None) 170 | for i in range(Config.DEPTH_PIXELS)] 171 | 172 | self.d1_loss = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.d1_logits[i], 173 | labels=self.depth_labels[i])*mask, axis=0) for i in range(Config.DEPTH_PIXELS)] 174 | 175 | self.d2_loss = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.d2_logits[i], 176 | labels=self.depth_labels[i])*mask, axis=0) for i in range(Config.DEPTH_PIXELS)] 177 | 178 | # total depth loss 179 | self.d1_loss = tf.add_n(self.d1_loss) 180 | self.d2_loss = tf.add_n(self.d2_loss) 181 | #self.d1_loss = tf.reduce_mean(self.d1_loss) 182 | #self.d2_loss = tf.reduce_mean(self.d2_loss) 183 | 184 | self.cost_p = -(self.cost_p_1_agg + self.cost_p_2_agg) + Config.BETA1*self.d1_loss + Config.BETA2*self.d2_loss 185 | 186 | if Config.DUAL_RMSPROP: 187 | self.opt_p = tf.train.RMSPropOptimizer( 188 | learning_rate=self.var_learning_rate, 189 | decay=Config.RMSPROP_DECAY, 190 | momentum=Config.RMSPROP_MOMENTUM, 191 | epsilon=Config.RMSPROP_EPSILON) 192 | 193 | self.opt_v = tf.train.RMSPropOptimizer( 194 | learning_rate=self.var_learning_rate, 195 | decay=Config.RMSPROP_DECAY, 196 | momentum=Config.RMSPROP_MOMENTUM, 197 | epsilon=Config.RMSPROP_EPSILON) 198 | else: 199 | self.cost_all = self.cost_p + self.cost_v 200 | self.opt = tf.train.RMSPropOptimizer( 201 | learning_rate=self.var_learning_rate, 202 | decay=Config.RMSPROP_DECAY, 203 | momentum=Config.RMSPROP_MOMENTUM, 204 | epsilon=Config.RMSPROP_EPSILON) 205 | 206 | if Config.USE_GRAD_CLIP: 207 | if Config.DUAL_RMSPROP: 208 | self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v) 209 | self.opt_grad_v_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM),v) 210 | for g,v in self.opt_grad_v if not g is None] 211 | self.train_op_v = self.opt_v.apply_gradients(self.opt_grad_v_clipped) 212 | 213 | self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p) 214 | self.opt_grad_p_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM),v) 215 | for g,v in self.opt_grad_p if not g is None] 216 | self.train_op_p = self.opt_p.apply_gradients(self.opt_grad_p_clipped) 217 | self.train_op = [self.train_op_p, self.train_op_v] 218 | else: 219 | self.opt_grad = self.opt.compute_gradients(self.cost_all) 220 | self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM),v) for g,v in self.opt_grad] 221 | self.train_op = self.opt.apply_gradients(self.opt_grad_clipped) 222 | else: 223 | if Config.DUAL_RMSPROP: 224 | self.train_op_v = self.opt_p.minimize(self.cost_v, global_step=self.global_step) 225 | self.train_op_p = self.opt_v.minimize(self.cost_p, global_step=self.global_step) 226 | self.train_op = [self.train_op_p, self.train_op_v] 227 | else: 228 | self.train_op = self.opt.minimize(self.cost_all, global_step=self.global_step) 229 | 230 | 231 | def _create_tensor_board(self): 232 | summaries = tf.get_collection(tf.GraphKeys.SUMMARIES) 233 | summaries.append(tf.summary.scalar("Pcost_advantage", self.cost_p_1_agg)) 234 | summaries.append(tf.summary.scalar("Pcost_entropy", self.cost_p_2_agg)) 235 | 
summaries.append(tf.summary.scalar("Pcost", self.cost_p)) 236 | summaries.append(tf.summary.scalar("Vcost", self.cost_v)) 237 | summaries.append(tf.summary.scalar("D1_loss", self.d1_loss)) 238 | summaries.append(tf.summary.scalar("D2_loss", self.d2_loss)) 239 | summaries.append(tf.summary.scalar("LearningRate", self.var_learning_rate)) 240 | summaries.append(tf.summary.scalar("Beta", self.var_beta)) 241 | for var in tf.trainable_variables(): 242 | summaries.append(tf.summary.histogram("weights_%s" % var.name, var)) 243 | 244 | summaries.append(tf.summary.histogram("activation_n1", self.n1)) 245 | summaries.append(tf.summary.histogram("activation_n2", self.n2)) 246 | summaries.append(tf.summary.histogram("activation_enc", self.enc_out)) 247 | summaries.append(tf.summary.histogram("activation_v", self.logits_v)) 248 | summaries.append(tf.summary.histogram("activation_p", self.softmax_p)) 249 | 250 | self.summary_op = tf.summary.merge(summaries) 251 | self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name, self.sess.graph) 252 | 253 | def dense_layer(self, input, out_dim, name, func=tf.nn.relu): 254 | in_dim = input.get_shape().as_list()[-1] 255 | d = 1.0 / np.sqrt(in_dim) 256 | with tf.variable_scope(name): 257 | w_init = tf.random_uniform_initializer(-d, d) 258 | b_init = tf.random_uniform_initializer(-d, d) 259 | w = tf.get_variable('w', dtype=tf.float32, shape=[in_dim, out_dim], initializer=w_init) 260 | b = tf.get_variable('b', shape=[out_dim], initializer=b_init) 261 | 262 | output = tf.matmul(input, w) + b 263 | if func is not None: 264 | output = func(output) 265 | 266 | return output 267 | 268 | def conv2d_layer(self, input, filter_size, out_dim, name, strides, func=tf.nn.relu): 269 | in_dim = input.get_shape().as_list()[-1] 270 | d = 1.0 / np.sqrt(filter_size * filter_size * in_dim) 271 | with tf.variable_scope(name): 272 | w_init = tf.random_uniform_initializer(-d, d) 273 | b_init = tf.random_uniform_initializer(-d, d) 274 | w = tf.get_variable('w', 275 | shape=[filter_size, filter_size, in_dim, out_dim], 276 | dtype=tf.float32, 277 | initializer=w_init) 278 | b = tf.get_variable('b', shape=[out_dim], initializer=b_init) 279 | 280 | output = tf.nn.conv2d(input, w, strides=strides, padding='SAME') + b 281 | if func is not None: 282 | output = func(output) 283 | 284 | return output 285 | 286 | def __get_base_feed_dict(self): 287 | return {self.var_beta: self.beta, self.var_learning_rate: self.learning_rate} 288 | 289 | def get_global_step(self): 290 | step = self.sess.run(self.global_step) 291 | return step 292 | 293 | def predict_single(self, x): 294 | return self.predict_p(x[None, :])[0] 295 | 296 | def predict_v(self, x): 297 | prediction = self.sess.run(self.logits_v, feed_dict={self.x: x}) 298 | return prediction 299 | 300 | def predict_p(self, x): 301 | prediction = self.sess.run(self.softmax_p, feed_dict={self.x: x}) 302 | return prediction 303 | 304 | def predict_p_and_v_and_d(self, x, c_batch, h_batch): 305 | batch_size = x.shape[0] 306 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x) 307 | feed_dict={self.x: im, self.seq_len: 1, self.p_rewards: p_reward, 308 | self.aux_inp: np.concatenate((vel, p_action), axis=1)} 309 | 310 | # shape of c/h_batch: (batch_size, Config.NUM_LSTMS, 256) 311 | for i in range(Config.NUM_LSTMS): 312 | c = c_batch[:,i,:] if i == 1 else c_batch[:,i,:64] 313 | h = h_batch[:,i,:] if i == 1 else h_batch[:,i,:64] 314 | feed_dict.update({self.state_in[i]: (c, h)}) 315 | 316 | p, v, d, lstm_out = 
self.sess.run([self.softmax_p, self.logits_v, 317 | self.d1_logits, self.state_out], feed_dict=feed_dict) 318 | 319 | # reshape lstm_out(c/h) to: (batch_size, Config.NUM_LSTMS, 256) 320 | c = np.zeros((batch_size, Config.NUM_LSTMS, 256), 321 | dtype=np.float32) 322 | 323 | h = np.zeros((batch_size, Config.NUM_LSTMS, 256), 324 | dtype=np.float32) 325 | 326 | for i in range(Config.NUM_LSTMS): 327 | if i == 0: 328 | c[:,i,:64] = lstm_out[i][0] 329 | h[:,i,:64] = lstm_out[i][1] 330 | else: 331 | c[:,i,:] = lstm_out[i][0] 332 | h[:,i,:] = lstm_out[i][1] 333 | 334 | d = np.array(d).transpose(1, 0, 2) 335 | return p, v, d, c, h 336 | 337 | def disentangle_obs(self, states): 338 | """ 339 | The obervations x is a concatenation of image, depth_map, prev_actn, 340 | velocity vector, and prev_rewards. This function separate these 341 | """ 342 | 343 | batch_size = states.shape[0] 344 | im_size = Config.IMAGE_HEIGHT*Config.IMAGE_WIDTH*Config.IMAGE_DEPTH 345 | im = states[:, :im_size] 346 | im = np.reshape(im, (batch_size, Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.IMAGE_DEPTH)) 347 | states = states[:, im_size:] 348 | 349 | dm_size = Config.DEPTH_PIXELS 350 | dm_val = states[:, :dm_size].astype(int) 351 | states = states[:, dm_size:] 352 | 353 | depth_map = np.zeros((dm_size, batch_size, Config.DEPTH_QUANTIZATION)) 354 | for i in range(dm_size): 355 | depth_map[i, np.arange(batch_size), dm_val[:,i].astype(int)] = 1 # make one-hot 356 | 357 | vl_size = Config.VEL_DIM 358 | vel = states[:, :vl_size] 359 | states = states[:, vl_size:] 360 | 361 | assert states.shape[1] == 2, "Missed something ?!" 362 | p_action = np.zeros((batch_size, self.num_actions)) 363 | p_action[np.arange(batch_size), states[:,0].astype(int)] = 1 # make one-hot 364 | p_reward = states[:, 1] 365 | p_reward = np.reshape(p_reward, (batch_size, 1)) 366 | 367 | # return (batch_size, ...) 
arrays 368 | return im, depth_map, vel, p_action, p_reward 369 | 370 | def train(self, x, y_r, a, c, h, trainer_id): 371 | feed_dict = self.__get_base_feed_dict() 372 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x) 373 | feed_dict.update({self.x: im, self.y_r: y_r, self.action_index: a, 374 | self.seq_len: int(Config.TIME_MAX), self.p_rewards: p_reward, 375 | self.aux_inp: np.concatenate((vel, p_action), axis=1)}) 376 | 377 | # depth supervision 378 | feed_dict.update({self.depth_labels[i]:depth_map[i] for i in 379 | range(Config.DEPTH_PIXELS)}) 380 | 381 | for i in range(Config.NUM_LSTMS): 382 | cb = np.array(c[i]).reshape((-1, 256)) 383 | hb = np.array(h[i]).reshape((-1, 256)) 384 | if i == 0: 385 | cb = cb[:,:64] 386 | hb = hb[:,:64] 387 | 388 | feed_dict.update({self.state_in[i]: (cb, hb)}) 389 | 390 | self.sess.run(self.train_op, feed_dict=feed_dict) 391 | 392 | def log(self, x, y_r, a, c, h): 393 | feed_dict = self.__get_base_feed_dict() 394 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x) 395 | 396 | feed_dict.update({self.x: im, self.y_r: y_r, self.action_index: a, 397 | self.seq_len: int(Config.TIME_MAX), self.p_rewards: p_reward, 398 | self.aux_inp: np.concatenate((vel, p_action), axis=1)}) 399 | 400 | # depth supervision 401 | feed_dict.update({self.depth_labels[i]:depth_map[i] for i in 402 | range(Config.DEPTH_PIXELS)}) 403 | 404 | for i in range(Config.NUM_LSTMS): 405 | cb = np.array(c[i]).reshape((-1, 256)) 406 | hb = np.array(h[i]).reshape((-1, 256)) 407 | if i == 0: 408 | cb = cb[:,:64] 409 | hb = hb[:,:64] 410 | 411 | feed_dict.update({self.state_in[i]: (cb, hb)}) 412 | 413 | step, summary = self.sess.run([self.global_step, self.summary_op], feed_dict=feed_dict) 414 | self.log_writer.add_summary(summary, step) 415 | 416 | def _checkpoint_filename(self, episode): 417 | return 'checkpoints/%s_%08d' % (self.model_name, episode) 418 | 419 | def _get_episode_from_filename(self, filename): 420 | # TODO: hacky way of getting the episode. ideally episode should be stored as a TF variable 421 | return int(re.split('/|_|\.', filename)[2]) 422 | 423 | def save(self, episode): 424 | self.saver.save(self.sess, self._checkpoint_filename(episode)) 425 | 426 | def load(self): 427 | filename = tf.train.latest_checkpoint(os.path.dirname(self._checkpoint_filename(episode=0))) 428 | if Config.LOAD_EPISODE > 0: 429 | filename = self._checkpoint_filename(Config.LOAD_EPISODE) 430 | self.saver.restore(self.sess, filename) 431 | return self._get_episode_from_filename(filename) 432 | 433 | def get_variables_names(self): 434 | return [var.name for var in self.graph.get_collection('trainable_variables')] 435 | 436 | def get_variable_value(self, name): 437 | return self.sess.run(self.graph.get_tensor_by_name(name)) 438 | -------------------------------------------------------------------------------- /ProcessAgent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from datetime import datetime 28 | from multiprocessing import Process, Queue, Value 29 | 30 | import numpy as np 31 | import sys, time 32 | 33 | from Config import Config 34 | from Environment import Environment 35 | from Experience import Experience 36 | 37 | 38 | class ProcessAgent(Process): 39 | def __init__(self, id, prediction_q, training_q, episode_log_q, dm): 40 | super(ProcessAgent, self).__init__() 41 | 42 | self.id = id 43 | self.prediction_q = prediction_q 44 | self.training_q = training_q 45 | self.episode_log_q = episode_log_q 46 | 47 | self.env = Environment() 48 | self.num_actions = self.env.get_num_actions() 49 | self.actions = np.arange(self.num_actions) 50 | 51 | self.discount_factor = Config.DISCOUNT 52 | # one frame at a time 53 | self.wait_q = Queue(maxsize=1) 54 | self.exit_flag = Value('i', 0) 55 | self.display_manager = dm 56 | 57 | @staticmethod 58 | def _accumulate_rewards(experiences, discount_factor, value, is_running): 59 | if is_running: 60 | reward_sum = value # terminal reward 61 | for t in reversed(range(0, len(experiences)-1)): 62 | r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX) if Config.REWARD_CLIPPING else experiences[t].reward 63 | reward_sum = discount_factor * reward_sum + r 64 | experiences[t].reward = reward_sum 65 | return experiences[:-1] 66 | # if the episode has terminated, we take the full trajectory into 67 | # account, including the very last experience 68 | else: 69 | reward_sum = 0 70 | for t in reversed(range(0, len(experiences))): 71 | r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX) if Config.REWARD_CLIPPING else experiences[t].reward 72 | reward_sum = discount_factor * reward_sum + r 73 | experiences[t].reward = reward_sum 74 | return experiences 75 | 76 | def convert_data(self, experiences): 77 | x_ = np.array([exp.state for exp in experiences]) 78 | a_ = np.eye(self.num_actions)[np.array([exp.action for exp in experiences])].astype(np.float32) 79 | r_ = np.array([exp.reward for exp in experiences]) 80 | return x_, r_, a_ 81 | 82 | def predict(self, state, lstm_inputs): 83 | # put the state in the prediction q 84 | 85 | # lstm_inputs: [dict{stacklayer1}, dict{stacklayer2}, ...] 
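# Stacking the per-layer 'c'/'h' entries gives c_state and h_state arrays of
# shape (NUM_LSTMS, 256); on the predictor side, NetworkVP.predict_p_and_v_and_d
# slices out the first 64 units for the first (64-unit) LSTM layer and uses the
# full 256 for the second.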
86 | c_state = np.array([lstm['c'] for lstm in lstm_inputs]) if len(lstm_inputs) else None 87 | h_state = np.array([lstm['h'] for lstm in lstm_inputs]) if len(lstm_inputs) else None 88 | self.prediction_q.put((self.id, state, c_state, h_state)) 89 | # wait for the prediction to come back 90 | p, v, d, c_state, h_state = self.wait_q.get() 91 | 92 | if not len(lstm_inputs): 93 | return p, v, d, [] 94 | 95 | # convert return back to form: [dict{stack-layer1}, dict{stack-layer2}, ...] 96 | l = [{'c':c_state[i], 'h':h_state[i]} for i in range(c_state.shape[0])] 97 | return p, v, d, l 98 | 99 | def select_action(self, prediction): 100 | if Config.PLAY_MODE: 101 | action = np.argmax(prediction) 102 | else: 103 | action = np.random.choice(self.actions, p=prediction) 104 | return action 105 | 106 | def run_episode(self): 107 | self.env.reset() 108 | is_running = True 109 | experiences = [] 110 | 111 | time_count = 0 112 | reward_sum = 0.0 113 | 114 | # input states for prediction 115 | lstm_input_p = [{'c':np.zeros(256, dtype=np.float32), 116 | 'h':np.zeros(256, dtype=np.float32)}]*Config.NUM_LSTMS 117 | 118 | # input states for training 119 | lstm_input_t = [{'c':np.zeros(256, dtype=np.float32), 120 | 'h':np.zeros(256, dtype=np.float32)}]*Config.NUM_LSTMS 121 | 122 | while is_running: 123 | 124 | # very first few frames 125 | if self.env.current_state is None: 126 | _ , is_running = self.env.step(-1) # NOOP 127 | assert(is_running) 128 | continue 129 | 130 | prediction, value, depth, lstm_input_p = self.predict(self.env.current_state, lstm_input_p) 131 | 132 | if Config.PLAY_MODE: 133 | self.display_manager.update(self.env.current_state, prediction, value, depth) 134 | 135 | action = self.select_action(prediction) 136 | reward, is_running = self.env.step(action) 137 | 138 | reward_sum += reward 139 | exp = Experience(self.env.previous_state, action, prediction, reward) 140 | experiences.append(exp) 141 | 142 | if not is_running or time_count == int(Config.TIME_MAX): 143 | updated_exps = ProcessAgent._accumulate_rewards(experiences, self.discount_factor, value, is_running) 144 | x_, r_, a_ = self.convert_data(updated_exps) 145 | yield x_, r_, a_, lstm_input_t, reward_sum, time_count 146 | 147 | # lstm input state for next training step 148 | lstm_input_t = lstm_input_p 149 | 150 | # reset the tmax count 151 | time_count = 0 152 | # keep the last experience for the next batch 153 | experiences = [experiences[-1]] 154 | reward_sum = 0.0 155 | 156 | time_count += 1 157 | 158 | def run(self): 159 | # randomly sleep up to 1 second. helps agents boot smoothly. 160 | time.sleep(np.random.rand()) 161 | np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10)) 162 | total_steps = 0 163 | 164 | while total_steps == Config.MAX_STEPS or self.exit_flag.value == 0: 165 | total_reward = 0 166 | total_length = 0 167 | for x_, r_, a_, lstm_, reward_sum, steps in self.run_episode(): 168 | total_steps += steps 169 | total_reward += reward_sum 170 | total_length += len(r_) + 1 # +1 for last frame that we drop 171 | self.training_q.put((x_, r_, a_, lstm_)) 172 | self.episode_log_q.put((datetime.now(), total_reward, total_length, 173 | total_steps)) 174 | -------------------------------------------------------------------------------- /ProcessStats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import sys 28 | if sys.version_info >= (3,0): 29 | from queue import Queue as queueQueue 30 | else: 31 | from Queue import Queue as queueQueue 32 | 33 | from datetime import datetime 34 | from multiprocessing import Process, Queue, Value 35 | 36 | import numpy as np 37 | import time 38 | 39 | from Config import Config 40 | 41 | 42 | class ProcessStats(Process): 43 | def __init__(self): 44 | super(ProcessStats, self).__init__() 45 | self.episode_log_q = Queue(maxsize=100) 46 | self.episode_count = Value('i', 0) 47 | self.training_count = Value('i', 0) 48 | self.should_save_model = Value('i', 0) 49 | self.trainer_count = Value('i', 0) 50 | self.predictor_count = Value('i', 0) 51 | self.agent_count = Value('i', 0) 52 | self.total_frame_count = 0 53 | 54 | def FPS(self): 55 | # average FPS from the beginning of the training (not current FPS) 56 | return np.ceil(self.total_frame_count / (time.time() - self.start_time)) 57 | 58 | def TPS(self): 59 | # average TPS from the beginning of the training (not current TPS) 60 | return np.ceil(self.training_count.value / (time.time() - self.start_time)) 61 | 62 | def run(self): 63 | with open(Config.RESULTS_FILENAME, 'a') as results_logger: 64 | rolling_frame_count = 0 65 | rolling_reward = 0 66 | results_q = queueQueue(maxsize=Config.STAT_ROLLING_MEAN_WINDOW) 67 | 68 | self.start_time = time.time() 69 | first_time = datetime.now() 70 | while True: 71 | episode_time, reward, length, steps = self.episode_log_q.get() 72 | results_logger.write('%s, %d, %d\n' % (episode_time.strftime("%Y-%m-%d %H:%M:%S"), reward, length)) 73 | results_logger.flush() 74 | 75 | self.total_frame_count += length 76 | self.episode_count.value += 1 77 | 78 | rolling_frame_count += length 79 | rolling_reward += reward 80 | 81 | if results_q.full(): 82 | old_episode_time, old_reward, old_length = results_q.get() 83 | rolling_frame_count -= old_length 84 | rolling_reward -= old_reward 85 | first_time = old_episode_time 86 | 87 | 
results_q.put((episode_time, reward, length)) 88 | 89 | if self.episode_count.value % Config.SAVE_FREQUENCY == 0: 90 | self.should_save_model.value = 1 91 | 92 | if self.episode_count.value % Config.PRINT_STATS_FREQUENCY == 0: 93 | print( 94 | '[Time: %8d] ' 95 | '[Steps: %8d] ' 96 | '[Episode: %8d Score: %10.4f] ' 97 | '[RScore: %10.4f RPPS: %5d] ' 98 | '[PPS: %5d TPS: %5d] ' 99 | '[NT: %2d NP: %2d NA: %2d]' 100 | % (int(time.time()-self.start_time), 101 | steps, 102 | self.episode_count.value, reward, 103 | rolling_reward / results_q.qsize(), 104 | rolling_frame_count / (datetime.now() - first_time).total_seconds(), 105 | self.FPS(), self.TPS(), 106 | self.trainer_count.value, self.predictor_count.value, self.agent_count.value)) 107 | sys.stdout.flush() 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maze Navigation using Reinforcement Learning 2 | 3 | ## Description 4 | 5 | TensorFlow implementation of ideas in the DeepMind paper ["Learning to Navigate in Complex Environments"](https://arxiv.org/abs/1611.03673). The baseline architecture is GPU-based A3C from the paper ["Reinforcement Learning through Asynchronous Advantage Actor-Critic on a GPU"](https://openreview.net/forum?id=r1VGvBcxl). Full [report](./assets/DeepNav_final.pdf). 6 | 7 | ## Requirements 8 | 9 | * TensorFlow 1.0 10 | * DeepMind Lab 11 | * Python packages - numpy, cv2, pygame 12 | 13 | ## Getting Started 14 | 15 | Download and install DeepMind Lab 16 | ``` 17 | $ git clone https://github.com/deepmind/lab.git 18 | ``` 19 | Build it following the [build instructions](https://github.com/deepmind/lab/blob/master/docs/build.md) 20 | 21 | Clone this repo **inside** the lab directory 22 | ``` 23 | $ cd lab 24 | $ git clone https://github.com/tgangwani/GA3C-DeepNavigation.git 25 | ``` 26 | Add the following bazel rule at the end of the lab/BUILD file 27 | 28 | ``` 29 | py_binary( 30 | name = "GA3C-DeepNavigation_train", 31 | srcs = ["GA3C-DeepNavigation/GA3C.py"], 32 | data = [":deepmind_lab.so"], 33 | main = "GA3C-DeepNavigation/GA3C.py", 34 | ) 35 | ``` 36 | 37 | Then run the agent with the following bazel command 38 | ``` 39 | bazel run :GA3C-DeepNavigation_train --define headless=osmesa 40 | ``` 41 | Use ```PLAY_MODE=False``` in Config.py for training. 42 | Setting ```PLAY_MODE=True``` loads model parameters from a 43 | checkpoint and runs a single agent. A display is expected. 44 | 45 | ## Network 46 | The neural net architecture is the same as in the paper, **except for the 47 | loop-prediction loss**. 48 | 49 | 50 | 51 | ## Results 52 | 53 | ### 1. Nav_maze_static_01 54 | 55 | #### Live Agent 56 | [YouTube](https://www.youtube.com/watch?v=vyS0Z7wdHHs) 57 | 58 | #### Learning Curve 59 | 60 | 61 | ### 2. Stairway to melon 62 | 63 | #### Live Agent 64 | [YouTube](https://www.youtube.com/watch?v=0R5MGM7VPo4) 65 | 66 | #### Learning Curve 67 | 68 | 69 | 70 | ## Acknowledgement 71 | [Unreal code by miyosuda](https://github.com/miyosuda/unreal) 72 | -------------------------------------------------------------------------------- /Server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from multiprocessing import Queue 28 | 29 | import time 30 | 31 | from Config import Config 32 | from Environment import Environment 33 | from NetworkVP import NetworkVP 34 | from ProcessAgent import ProcessAgent 35 | from ProcessStats import ProcessStats 36 | from ThreadDynamicAdjustment import ThreadDynamicAdjustment 37 | from ThreadPredictor import ThreadPredictor 38 | from ThreadTrainer import ThreadTrainer 39 | from Display import Display 40 | 41 | class Server: 42 | def __init__(self): 43 | self.stats = ProcessStats() 44 | 45 | self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE) 46 | self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE) 47 | 48 | self.model = NetworkVP(Config.DEVICE, Config.NETWORK_NAME, Environment().get_num_actions()) 49 | if Config.LOAD_CHECKPOINT: 50 | self.stats.episode_count.value = self.model.load() 51 | 52 | self.training_step = 0 53 | self.frame_counter = 0 54 | 55 | self.agents = [] 56 | self.predictors = [] 57 | self.trainers = [] 58 | self.dynamic_adjustment = ThreadDynamicAdjustment(self) 59 | self.display_manager = Display() 60 | 61 | def add_agent(self): 62 | self.agents.append( 63 | ProcessAgent(len(self.agents), self.prediction_q, self.training_q, self.stats.episode_log_q, self.display_manager)) 64 | self.agents[-1].start() 65 | 66 | def remove_agent(self): 67 | self.agents[-1].exit_flag.value = True 68 | self.agents[-1].join() 69 | self.agents.pop() 70 | 71 | def add_predictor(self): 72 | self.predictors.append(ThreadPredictor(self, len(self.predictors))) 73 | self.predictors[-1].start() 74 | 75 | def remove_predictor(self): 76 | self.predictors[-1].exit_flag = True 77 | self.predictors[-1].join() 78 | self.predictors.pop() 79 | 80 | def add_trainer(self): 81 | self.trainers.append(ThreadTrainer(self, len(self.trainers))) 82 | self.trainers[-1].start() 83 | 84 | def remove_trainer(self): 85 | self.trainers[-1].exit_flag = True 86 | self.trainers[-1].join() 87 | self.trainers.pop() 88 | 89 | def 
train_model(self, x_, r_, a_, c_, h_, trainer_id): 90 | self.model.train(x_, r_, a_, c_, h_, trainer_id) 91 | self.training_step += 1 92 | self.frame_counter += x_.shape[0] 93 | 94 | self.stats.training_count.value += 1 95 | self.dynamic_adjustment.temporal_training_count += 1 96 | 97 | if Config.TENSORBOARD and self.stats.training_count.value % Config.TENSORBOARD_UPDATE_FREQUENCY == 0: 98 | self.model.log(x_, r_, a_, c_, h_) 99 | 100 | def save_model(self): 101 | self.model.save(self.stats.episode_count.value) 102 | 103 | def main(self): 104 | self.stats.start() 105 | self.dynamic_adjustment.start() 106 | 107 | if Config.PLAY_MODE: 108 | for trainer in self.trainers: 109 | trainer.enabled = False 110 | 111 | learning_rate_multiplier = ( 112 | Config.LEARNING_RATE_END - Config.LEARNING_RATE_START) / Config.ANNEALING_EPISODE_COUNT 113 | beta_multiplier = (Config.BETA_END - Config.BETA_START) / Config.ANNEALING_EPISODE_COUNT 114 | 115 | while self.stats.episode_count.value < Config.EPISODES: 116 | step = min(self.stats.episode_count.value, Config.ANNEALING_EPISODE_COUNT - 1) 117 | self.model.learning_rate = Config.LEARNING_RATE_START + learning_rate_multiplier * step 118 | self.model.beta = Config.BETA_START + beta_multiplier * step 119 | 120 | # Saving is async - even if we start saving at a given episode, we may save the model at a later episode 121 | if Config.SAVE_MODELS and self.stats.should_save_model.value > 0: 122 | self.save_model() 123 | self.stats.should_save_model.value = 0 124 | 125 | time.sleep(0.01) 126 | 127 | self.dynamic_adjustment.exit_flag = True 128 | while self.agents: 129 | self.remove_agent() 130 | while self.predictors: 131 | self.remove_predictor() 132 | while self.trainers: 133 | self.remove_trainer() 134 | -------------------------------------------------------------------------------- /ThreadDynamicAdjustment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from threading import Thread 28 | 29 | import numpy as np 30 | import time 31 | 32 | from Config import Config 33 | 34 | 35 | class ThreadDynamicAdjustment(Thread): 36 | def __init__(self, server): 37 | super(ThreadDynamicAdjustment, self).__init__() 38 | self.setDaemon(True) 39 | 40 | self.server = server 41 | self.enabled = Config.DYNAMIC_SETTINGS 42 | 43 | self.trainer_count = Config.TRAINERS 44 | self.predictor_count = Config.PREDICTORS 45 | self.agent_count = Config.AGENTS 46 | 47 | self.temporal_training_count = 0 48 | self.exit_flag = False 49 | 50 | def enable_disable_components(self): 51 | cur_len = len(self.server.trainers) 52 | if cur_len < self.trainer_count: 53 | for _ in np.arange(cur_len, self.trainer_count): 54 | self.server.add_trainer() 55 | elif cur_len > self.trainer_count: 56 | for _ in np.arange(self.trainer_count, cur_len): 57 | self.server.remove_trainer() 58 | 59 | cur_len = len(self.server.predictors) 60 | if cur_len < self.predictor_count: 61 | for _ in np.arange(cur_len, self.predictor_count): 62 | self.server.add_predictor() 63 | elif cur_len > self.predictor_count: 64 | for _ in np.arange(self.predictor_count, cur_len): 65 | self.server.remove_predictor() 66 | 67 | cur_len = len(self.server.agents) 68 | if cur_len < self.agent_count: 69 | for _ in np.arange(cur_len, self.agent_count): 70 | self.server.add_agent() 71 | elif cur_len > self.agent_count: 72 | for _ in np.arange(self.agent_count, cur_len): 73 | self.server.remove_agent() 74 | 75 | def random_walk(self): 76 | # 3 components, 1 for Trainers, 1 for Predictors and 1 for Agents 77 | # 3 outcomes for each, -1: add one, 0: no change, +1: remove one (counts never drop below 1) 78 | direction = np.random.randint(3, size=3) - 1 79 | self.trainer_count = max(1, self.trainer_count - direction[0]) 80 | self.predictor_count = max(1, self.predictor_count - direction[1]) 81 | self.agent_count = max(1, self.agent_count - direction[2]) 82 | 83 | def update_stats(self): 84 | self.server.stats.trainer_count.value = self.trainer_count 85 | self.server.stats.predictor_count.value = self.predictor_count 86 | self.server.stats.agent_count.value = self.agent_count 87 | 88 | def run(self): 89 | self.enable_disable_components() 90 | self.update_stats() 91 | 92 | if not self.enabled: 93 | return 94 | 95 | # Wait for initialization 96 | time.sleep(Config.DYNAMIC_SETTINGS_INITIAL_WAIT) 97 | 98 | while not self.exit_flag: 99 | old_trainer_count, old_predictor_count, old_agent_count = \ 100 | self.trainer_count, self.predictor_count, self.agent_count 101 | self.random_walk() 102 | 103 | # If no change, do nothing 104 | if self.trainer_count == old_trainer_count \ 105 | and self.predictor_count == old_predictor_count \ 106 | and self.agent_count == old_agent_count: 107 | continue 108 | 109 | old_count = self.temporal_training_count 110 | self.enable_disable_components() 111 | 112 | self.temporal_training_count = 0 113 | time.sleep(Config.DYNAMIC_SETTINGS_STEP_WAIT) 114 | 115 | cur_count =
self.temporal_training_count 116 | # if it didn't work, revert the changes 117 | if cur_count < old_count: 118 | self.trainer_count, self.predictor_count, self.agent_count = \ 119 | old_trainer_count, old_predictor_count, old_agent_count 120 | 121 | self.update_stats() 122 | -------------------------------------------------------------------------------- /ThreadPredictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
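# Summary comment (added for readability, based on the code below): ThreadPredictor
# drains the shared prediction queue, batching up to PREDICTION_BATCH_SIZE requests
# (combined state vectors plus per-layer LSTM c/h states), runs a single forward pass
# through the shared model, and routes each (policy, value, depth, c, h) result back
# to the wait_q of the agent that requested it.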
26 | 27 | from threading import Thread 28 | 29 | import numpy as np 30 | 31 | from Config import Config 32 | 33 | 34 | class ThreadPredictor(Thread): 35 | def __init__(self, server, id): 36 | super(ThreadPredictor, self).__init__() 37 | self.setDaemon(True) 38 | 39 | self.id = id 40 | self.server = server 41 | self.exit_flag = False 42 | 43 | def run(self): 44 | ids = np.zeros(Config.PREDICTION_BATCH_SIZE, dtype=np.uint16) 45 | states = np.zeros((Config.PREDICTION_BATCH_SIZE, 46 | Config.COMBINED_STATE_SIZE), dtype=np.float32) 47 | 48 | cs = np.zeros((Config.PREDICTION_BATCH_SIZE, Config.NUM_LSTMS, 256), 49 | dtype=np.float32) if Config.NUM_LSTMS else [None]*Config.PREDICTION_BATCH_SIZE 50 | 51 | hs = np.zeros((Config.PREDICTION_BATCH_SIZE, Config.NUM_LSTMS, 256), 52 | dtype=np.float32) if Config.NUM_LSTMS else [None]*Config.PREDICTION_BATCH_SIZE 53 | 54 | while not self.exit_flag: 55 | ids[0], states[0], cs[0], hs[0] = self.server.prediction_q.get() 56 | 57 | size = 1 58 | while size < Config.PREDICTION_BATCH_SIZE and not self.server.prediction_q.empty(): 59 | ids[size], states[size], cs[size], hs[size] = self.server.prediction_q.get() 60 | size += 1 61 | 62 | batch = states[:size] 63 | cb = cs[:size] 64 | hb = hs[:size] 65 | p, v, d, c, h = self.server.model.predict_p_and_v_and_d(batch, cb, hb) 66 | 67 | for i in range(size): 68 | if ids[i] < len(self.server.agents): 69 | if Config.NUM_LSTMS: 70 | assert c[i].shape == (Config.NUM_LSTMS, 256) 71 | assert h[i].shape == (Config.NUM_LSTMS, 256) 72 | self.server.agents[ids[i]].wait_q.put((p[i], v[i], d[i], c[i], h[i])) 73 | -------------------------------------------------------------------------------- /ThreadTrainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
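# Summary comment (added for readability, based on the code below): ThreadTrainer
# pulls (x, r, a, lstm-state) tuples from the shared training queue, zero-pads each
# trajectory to TIME_MAX when LSTMs are enabled so every sample has the same recurrence
# length, concatenates samples until the batch exceeds TRAINING_MIN_BATCH_SIZE frames,
# and then hands the batch to server.train_model().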
26 | 27 | from threading import Thread 28 | import numpy as np 29 | 30 | from Config import Config 31 | 32 | 33 | class ThreadTrainer(Thread): 34 | def __init__(self, server, id): 35 | super(ThreadTrainer, self).__init__() 36 | self.setDaemon(True) 37 | 38 | self.id = id 39 | self.server = server 40 | self.exit_flag = False 41 | 42 | @staticmethod 43 | def dynamic_pad(x, r, a): 44 | size = int(Config.TIME_MAX) # required size 45 | z = np.zeros((size-len(x),) + x.shape[1:]) 46 | x = np.append(x, z, axis=0) 47 | z = np.zeros((size-len(r),) + r.shape[1:]) 48 | r = np.append(r, z, axis=0) 49 | z = np.zeros((size-len(a),) + a.shape[1:]) 50 | a = np.append(a, z, axis=0) 51 | assert len(x) == size 52 | return x, r, a 53 | 54 | def run(self): 55 | while not self.exit_flag: 56 | batch_size = 0 57 | c__ = []; h__ = [] # lstm hidden states 58 | while batch_size <= Config.TRAINING_MIN_BATCH_SIZE: 59 | x_, r_, a_, lstm_ = self.server.training_q.get() 60 | 61 | # when using LSTMs, the recurrence is over the TIME_MAX length 62 | # trajectory from each agent. Use padding for trajectories of 63 | # length < TIME_MAX 64 | if Config.NUM_LSTMS and x_.shape[0] != int(Config.TIME_MAX): 65 | x_, r_, a_ = ThreadTrainer.dynamic_pad(x_, r_, a_) 66 | 67 | if batch_size == 0: 68 | x__ = x_; r__ = r_; a__ = a_ 69 | 70 | if len(lstm_): 71 | c__ = []; h__ = [] 72 | for i in range(Config.NUM_LSTMS): 73 | c__.append(lstm_[i]['c']) 74 | h__.append(lstm_[i]['h']) 75 | 76 | else: 77 | x__ = np.concatenate((x__, x_)) 78 | r__ = np.concatenate((r__, r_)) 79 | a__ = np.concatenate((a__, a_)) 80 | 81 | if len(lstm_): 82 | for i in range(Config.NUM_LSTMS): 83 | c__[i] = np.concatenate((c__[i], lstm_[i]['c'])) 84 | h__[i] = np.concatenate((h__[i], lstm_[i]['h'])) 85 | 86 | batch_size += x_.shape[0] 87 | 88 | if Config.TRAIN_MODELS: 89 | self.server.train_model(x__, r__, a__, c__, h__, self.id) 90 | -------------------------------------------------------------------------------- /assets/DeepNav_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/DeepNav_final.pdf -------------------------------------------------------------------------------- /assets/nav_maze_static_01_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/nav_maze_static_01_score.png -------------------------------------------------------------------------------- /assets/nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/nn.png -------------------------------------------------------------------------------- /assets/stairway_to_melon_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/stairway_to_melon_score.png --------------------------------------------------------------------------------