├── Config.py ├── Display.py ├── Environment.py ├── Experience.py ├── GA3C.py ├── GameManager.py ├── NetworkVP.py ├── ProcessAgent.py ├── ProcessStats.py ├── README.md ├── Server.py ├── ThreadDynamicAdjustment.py ├── ThreadPredictor.py ├── ThreadTrainer.py └── assets ├── DeepNav_final.pdf ├── nav_maze_static_01_score.png ├── nn.png └── stairway_to_melon_score.png /Config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | class Config: 28 | 29 | ######################################################################### 30 | # Number of stacked LSTM layers 31 | NUM_LSTMS = 2 32 | 33 | ######################################################################### 34 | # Game configuration 35 | 36 | #MAP = 'seekavoid_arena_01' 37 | MAP = 'stairway_to_melon' 38 | #MAP = 'nav_maze_static_01' 39 | #MAP = 'nav_maze_static_02' 40 | 41 | # Enable to see the trained agent in action 42 | PLAY_MODE = False 43 | # Enable to train 44 | TRAIN_MODELS = True 45 | # Load old models. Throws if the model doesn't exist 46 | LOAD_CHECKPOINT = False 47 | # If 0, the latest checkpoint is loaded 48 | LOAD_EPISODE = 0 49 | 50 | ######################################################################### 51 | # Number of agents, predictors, trainers and other system settings 52 | 53 | # If the dynamic configuration is on, these are the initial values. 
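# Note: any attribute of this class can be overridden from the command line,
# since GA3C.py parses arguments of the form Config=Value and casts the value
# to the attribute's existing type (to set a boolean to False, pass an empty
# value, e.g. TENSORBOARD=). A hypothetical invocation is sketched below; how
# extra arguments are forwarded through bazel depends on the local setup:
#   bazel run :GA3C-DeepNavigation_train --define headless=osmesa -- AGENTS=16 PREDICTORS=4 TRAINERS=4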
54 | # Number of Agents
55 | AGENTS = 8
56 | # Number of Predictors
57 | PREDICTORS = 2
58 | # Number of Trainers
59 | TRAINERS = 2
60 |
61 | # Device
62 | DEVICE = 'gpu:0'
63 |
64 | # Play mode display size
65 | DISPLAY_SIZE = (440, 400)
66 | # Movie recording
67 | RECORD = False
68 | VIDEO_DURATION = 60 # seconds
69 |
70 | # Enable the dynamic adjustment (+ waiting time to start it)
71 | DYNAMIC_SETTINGS = False
72 | DYNAMIC_SETTINGS_STEP_WAIT = 20
73 | DYNAMIC_SETTINGS_INITIAL_WAIT = 10
74 |
75 | #########################################################################
76 | # Algorithm parameters
77 |
78 | # Discount factor
79 | DISCOUNT = 0.99
80 |
81 | # Tmax (interval over which gradients are computed)
82 | TIME_MAX = 50
83 |
84 | # Maximum steps taken by agent in environment
85 | MAX_STEPS = 10 * 10**7
86 |
87 | # Reward Clipping
88 | REWARD_CLIPPING = False
89 | REWARD_MIN = -1
90 | REWARD_MAX = 1
91 |
92 | # Max size of the queue
93 | MAX_QUEUE_SIZE = 100
94 | PREDICTION_BATCH_SIZE = 128
95 |
96 | # Input of the DNN
97 | STACKED_FRAMES = 1
98 | IMAGE_WIDTH = 84
99 | IMAGE_HEIGHT = 84
100 | IMAGE_DEPTH = 3 # 3 for RGB, 4 for RGBD
101 |
102 | COMBINED_STATE_SIZE = 21240 # includes auxiliary inputs to NN (TODO: can be calculated inside the program using other params)
103 | VEL_DIM = 6 # velocity dimension
104 | DEPTH_PIXELS = 64 # number of depth pixels for auxiliary supervision
105 | DEPTH_QUANTIZATION = 8 # number of bins for depth
106 |
107 | # scaling factors for depth loss
108 | BETA1 = 1
109 | BETA2 = 1
110 |
111 | # Lab setting (frames per second)
112 | FPS = 60
113 |
114 | # Rotation for look-left, look-right actions [-512, 512]
115 | ROTATION = 20
116 |
117 | # Total number of episodes and annealing frequency
118 | EPISODES = 400000
119 | ANNEALING_EPISODE_COUNT = 400000
120 |
121 | # Entropy regularization hyper-parameter
122 | BETA_START = 0.001
123 | BETA_END = 0.001
124 |
125 | # Learning rate
126 | LEARNING_RATE_START = 0.0005
127 | LEARNING_RATE_END = 0.0005
128 |
129 | # RMSProp parameters
130 | RMSPROP_DECAY = 0.99
131 | RMSPROP_MOMENTUM = 0.0
132 | RMSPROP_EPSILON = 0.1
133 |
134 | # Dual RMSProp - we found that using a single RMSProp for the two cost functions works better and faster
135 | DUAL_RMSPROP = False
136 |
137 | # Gradient clipping
138 | USE_GRAD_CLIP = False
139 | GRAD_CLIP_NORM = 40.0
140 | # Epsilon (regularize policy lag in GA3C)
141 | LOG_EPSILON = 1e-6
142 | # Training min batch size - increasing the batch size increases the stability of the algorithm, but makes learning slower
143 | TRAINING_MIN_BATCH_SIZE = 0
144 |
145 | #########################################################################
146 | # Log and save
147 |
148 | # Enable TensorBoard
149 | TENSORBOARD = False
150 | # Update TensorBoard every X training steps
151 | TENSORBOARD_UPDATE_FREQUENCY = 1000
152 |
153 | # Enable to save models every SAVE_FREQUENCY episodes
154 | SAVE_MODELS = True
155 | # Save every SAVE_FREQUENCY episodes
156 | SAVE_FREQUENCY = 1000
157 |
158 | # Print stats every PRINT_STATS_FREQUENCY episodes
159 | PRINT_STATS_FREQUENCY = 1
160 | # The window to average stats
161 | STAT_ROLLING_MEAN_WINDOW = 1000
162 |
163 | # Results filename
164 | RESULTS_FILENAME = 'results.txt'
165 | # Network checkpoint name
166 | NETWORK_NAME = 'network'
167 |
168 | #########################################################################
169 | # More experimental parameters here
170 |
171 | # Minimum policy
172 | MIN_POLICY = 0.0
173 | # Use log_softmax() instead of log(softmax())
174
| USE_LOG_SOFTMAX = False 175 | -------------------------------------------------------------------------------- /Display.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | from Config import Config 3 | import numpy as np 4 | from collections import deque 5 | import cv2 6 | 7 | BLUE = (128, 128, 255) 8 | RED = (255, 192, 192) 9 | BLACK = (0, 0, 0) 10 | WHITE = (255, 255, 255) 11 | 12 | depth_dict = {k:v for k,v in zip(range(Config.DEPTH_QUANTIZATION), 13 | [0.05,0.175,0.3,0.425,0.55,0.675,0.8,1])} #bins 14 | 15 | class MovieWriter(object): 16 | def __init__(self, file_name, frame_size, fps): 17 | self.vout = cv2.VideoWriter() 18 | if not self.vout.open(file_name, 19 | cv2.VideoWriter_fourcc('M','J','P','G'), fps, frame_size, True): 20 | print("Create movie failed: {0}".format(file_name)) 21 | 22 | def add_frame(self, frame): 23 | self.vout.write(frame) 24 | 25 | def close(self): 26 | self.vout.release() 27 | self.vout = None 28 | 29 | def isOpen(self): 30 | return self.vout != None and self.vout.isOpened() 31 | 32 | class Display(object): 33 | def __init__(self): 34 | pygame.init() 35 | 36 | self.display_size = Config.DISPLAY_SIZE 37 | self.surface = pygame.display.set_mode(self.display_size, 0, 24) 38 | pygame.display.set_caption('NAV') 39 | self.font = pygame.font.SysFont(None, 20) 40 | self._values = deque(maxlen=100) 41 | if Config.RECORD: 42 | self.video_fps = 5 43 | self.frames = 0 44 | self.writer = MovieWriter('melonvideo.avi', self.display_size, self.video_fps) 45 | 46 | def draw_center_text(self, str, center_x, top): 47 | text = self.font.render(str, True, WHITE, BLACK) 48 | text_rect = text.get_rect() 49 | text_rect.centerx = center_x 50 | text_rect.top = top 51 | self.surface.blit(text, text_rect) 52 | 53 | def show_image(self, im): 54 | data = im.astype(np.uint8) 55 | image = pygame.image.frombuffer(data, (84,84), 'RGB') 56 | image = pygame.transform.scale(image, (128, 128)) 57 | self.surface.blit(image, (8, 8)) 58 | self.draw_center_text("input", 50, 150) 59 | 60 | def show_depth(self, dm): 61 | dm = dm * 255. 
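# dm holds one expected-depth value per depth pixel, looked up from depth_dict,
# so each entry lies in [0.05, 1]; scaling by 255 turns it into an 8-bit
# grayscale intensity before the 64 values are drawn as a 16x4 strip below.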
62 | data = dm.astype(np.uint8) 63 | color_img = cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) 64 | 65 | image = pygame.image.frombuffer(color_img, (16,4), 'RGB') 66 | image = pygame.transform.scale(image, (128, 32)) 67 | self.surface.blit(image, (200, 8)) 68 | self.draw_center_text("depth", 250, 50) 69 | 70 | def show_policy(self, pi): 71 | start_x = 10 72 | 73 | y = 200 74 | 75 | for i in range(len(pi)): 76 | width = pi[i] * 100 77 | pygame.draw.rect(self.surface, WHITE, (start_x, y, width, 10)) 78 | y += 20 79 | self.draw_center_text("Action Prob.", 50, y) 80 | 81 | def show_values(self): 82 | if len(self._values) == 0: 83 | return 84 | 85 | min_v = float("inf") 86 | max_v = float("-inf") 87 | 88 | for v in self._values: 89 | min_v = min(min_v, v) 90 | max_v = max(max_v, v) 91 | 92 | top = 150 93 | left = 150 94 | width = 100 95 | height = 100 96 | bottom = top + width 97 | right = left + height 98 | 99 | d = max_v - min_v 100 | last_r = 0.0 101 | for i,v in enumerate(self._values): 102 | r = (v - min_v) / d 103 | if i > 0: 104 | x0 = i-1 + left 105 | x1 = i + left 106 | y0 = bottom - last_r * height 107 | y1 = bottom - r * height 108 | pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1) 109 | last_r = r 110 | 111 | pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1) 112 | pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1) 113 | pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1) 114 | pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1) 115 | 116 | self.draw_center_text("V", left + width/2, bottom+10) 117 | 118 | def update(self, state, prediction, value, depth): 119 | im_size = Config.IMAGE_HEIGHT*Config.IMAGE_WIDTH*Config.IMAGE_DEPTH 120 | im = state[:im_size] * 255. 121 | im = np.reshape(im, (Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.IMAGE_DEPTH)) 122 | self._values.append(value) 123 | 124 | # create depth_map (4,16) from depth (64, 8) 125 | depth_map = [depth_dict[np.argmax(depth[p])] for p in 126 | range(depth.shape[0])] 127 | depth_map = np.array(depth_map) 128 | 129 | self.surface.fill(BLACK) 130 | self.show_image(im) 131 | self.show_policy(prediction) 132 | self.show_values() 133 | self.show_depth(depth_map) 134 | pygame.display.update() 135 | 136 | if Config.RECORD and self.writer.isOpen(): 137 | frame_str = self.surface.get_buffer().raw 138 | d = np.fromstring(frame_str, dtype=np.uint8) 139 | d = d.reshape((self.display_size[1], self.display_size[0], 3)) 140 | self.writer.add_frame(d) 141 | self.frames += 1 142 | if self.frames == Config.VIDEO_DURATION*self.video_fps: 143 | print("Movie writing complete.") 144 | self.writer.close() 145 | -------------------------------------------------------------------------------- /Environment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import sys 28 | if sys.version_info >= (3,0): 29 | from queue import Queue 30 | else: 31 | from Queue import Queue 32 | 33 | import numpy as np 34 | #import scipy.misc as misc 35 | 36 | from Config import Config 37 | from GameManager import GameManager 38 | 39 | class Environment: 40 | def __init__(self): 41 | self.game = GameManager(Config.MAP) 42 | self.nb_frames = Config.STACKED_FRAMES 43 | self.frame_q = Queue(maxsize=self.nb_frames) 44 | self.previous_state = None 45 | self.current_state = None 46 | self.total_reward = 0 47 | 48 | self.reset() 49 | 50 | def is_running(self): 51 | return self.game.is_running() 52 | 53 | @staticmethod 54 | def _rgb2gray(rgb): 55 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 56 | 57 | @staticmethod 58 | def _preprocess(image): 59 | #image = Environment._rgb2gray(image) 60 | #image = misc.imresize(image, [Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH], 'bilinear') 61 | image = image.astype(np.float32) / 255. 62 | return image 63 | 64 | def _get_current_state_no_stacking(self): 65 | if not self.frame_q.full(): 66 | return None # frame queue is not full yet. 67 | return np.array(list(self.frame_q.queue)[0]) 68 | 69 | def _get_current_state(self): 70 | if not self.frame_q.full(): 71 | return None # frame queue is not full yet. 72 | x_ = [np.array(i) for i in list(self.frame_q.queue)] 73 | x_ = np.concatenate(x_, axis=2) 74 | #x_ = np.array(self.frame_q.queue) 75 | #x_ = np.transpose(x_, [1, 2, 3, 0]) # move channels 76 | return x_ 77 | 78 | def _update_frame_q(self, frame): 79 | if self.frame_q.full(): 80 | self.frame_q.get() 81 | self.frame_q.put(frame) 82 | 83 | # the state is no longer just the image, but a concatenation of 84 | # image and auxiliary inputs. 
We can't use the same _preprocess() 85 | #image = Environment._preprocess(frame) 86 | #self.frame_q.put(image) 87 | 88 | def get_num_actions(self): 89 | return GameManager.get_num_actions() 90 | 91 | def reset(self): 92 | self.total_reward = 0 93 | self.frame_q.queue.clear() 94 | self.game.reset() 95 | self._update_frame_q(self.game.get_state()) 96 | self.previous_state = self.current_state = None 97 | 98 | def step(self, action): 99 | reward, is_running = self.game.step(action) 100 | self.total_reward += reward 101 | self.previous_state = self.current_state 102 | 103 | if is_running: 104 | observation = self.game.get_state() 105 | self._update_frame_q(observation) 106 | self.current_state = self._get_current_state_no_stacking() 107 | 108 | return reward, is_running 109 | -------------------------------------------------------------------------------- /Experience.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | class Experience: 28 | def __init__(self, state, action, prediction, reward): 29 | self.state = state 30 | self.action = action 31 | self.prediction = prediction 32 | self.reward = reward 33 | -------------------------------------------------------------------------------- /GA3C.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # check python version; warn if not Python3 28 | import os, sys 29 | import warnings 30 | if sys.version_info < (3,0): 31 | warnings.warn("Optimized for Python3. Performance may suffer under Python2.", Warning) 32 | 33 | from Config import Config 34 | from Server import Server 35 | 36 | # Suppress the output from C functions 37 | # source - http://stackoverflow.com/questions/5081657/how-do-i-prevent-a-c-shared-library-to-print-on-stdout-in-python 38 | def redirect_stdout(): 39 | sys.stdout.flush() # <--- important when redirecting to files 40 | newstdout = os.dup(1) 41 | devnull = os.open(os.devnull, os.O_WRONLY) 42 | os.dup2(devnull, 1) 43 | os.close(devnull) 44 | sys.stdout = os.fdopen(newstdout, 'w') 45 | 46 | def checks(): 47 | if Config.STACKED_FRAMES != 1: 48 | assert False, "Stacking of multiple frames not supported. See disentangle_obs() in NetworkVP.py" 49 | 50 | if Config.NUM_LSTMS != 2: 51 | assert False, "Architecture hard-wired for 2 stacked LSTM layers" 52 | 53 | # Parse arguments 54 | for i in range(1, len(sys.argv)): 55 | # Config arguments should be in format of Config=Value 56 | # For setting booleans to False use Config= 57 | x, y = sys.argv[i].split('=') 58 | setattr(Config, x, type(getattr(Config, x))(y)) 59 | 60 | # Adjust configs for Play mode 61 | if Config.PLAY_MODE: 62 | print("==Play mode on==") 63 | Config.AGENTS = 1 64 | Config.PREDICTORS = 1 65 | Config.TRAINERS = 1 66 | Config.DYNAMIC_SETTINGS = False 67 | 68 | Config.LOAD_CHECKPOINT = True 69 | Config.TRAIN_MODELS = False 70 | Config.SAVE_MODELS = False 71 | 72 | redirect_stdout() 73 | checks() 74 | print('+++ GA3C on %s +++'%Config.MAP) 75 | print('===Network===') 76 | print('LSTM layers:', Config.NUM_LSTMS) 77 | print("Reward clipping %s. Clipping affects policy!"%('ENABLED' if Config.REWARD_CLIPPING else 'DISABLED')) 78 | print('======') 79 | # Start main program 80 | Server().main() 81 | -------------------------------------------------------------------------------- /GameManager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import deepmind_lab 28 | import numpy as np 29 | from Config import Config 30 | import sys 31 | 32 | def _action(*entries): 33 | return np.array(entries, dtype=np.intc) 34 | 35 | class GameManager: 36 | 37 | ACTION_LIST = [ 38 | _action(-1*int(Config.ROTATION), 0, 0, 0, 0, 0, 0), # look_left 39 | _action( int(Config.ROTATION), 0, 0, 0, 0, 0, 0), # look_right 40 | #_action( 0, 10, 0, 0, 0, 0, 0), # look_up 41 | #_action( 0, -10, 0, 0, 0, 0, 0), # look_down 42 | #_action(-1*int(Config.ROTATION), 0, 0, 1, 0, 0, 0), 43 | #_action( int(Config.ROTATION), 0, 0, 1, 0, 0, 0), 44 | _action( 0, 0, -1, 0, 0, 0, 0), # strafe_left 45 | _action( 0, 0, 1, 0, 0, 0, 0), # strafe_right 46 | _action( 0, 0, 0, 1, 0, 0, 0), # forward 47 | _action( 0, 0, 0, -1, 0, 0, 0), # backward 48 | #_action( 0, 0, 0, 0, 1, 0, 0), # fire 49 | #_action( 0, 0, 0, 0, 0, 1, 0), # jump 50 | #_action( 0, 0, 0, 0, 0, 0, 1) # crouch 51 | ] 52 | 53 | def __init__(self, map_name): 54 | self.map_name = map_name 55 | self.obs_specs = ['RGBD_INTERLACED', 'VEL.TRANS', 'VEL.ROT'] 56 | 57 | self.lab = deepmind_lab.Lab(map_name, self.obs_specs, config={ 58 | 'fps': str(Config.FPS), 59 | 'width': str(Config.IMAGE_WIDTH), 60 | 'height': str(Config.IMAGE_HEIGHT) 61 | }) 62 | 63 | self.prev_action = 0 64 | self.prev_reward = 0 65 | self.reset() 66 | 67 | def reset(self): 68 | self.prev_action = 0 69 | self.prev_reward = 0 70 | if not self.lab.reset(): 71 | assert 'Error reseting lab environment' 72 | 73 | def is_running(self): 74 | return self.lab.is_running() 75 | 76 | def get_state(self): 77 | obs = self.lab.observations() # dict of Numpy arrays 78 | image = obs['RGBD_INTERLACED'] 79 | 80 | # create a low resolution (4x16) depth map from the 84x84 image 81 | depth_map = image[:,:,3] 82 | depth_map = depth_map[16:-16,:] # crop 83 | depth_map = depth_map[:,2:-2] # crop 84 | depth_map = depth_map[::13,::5] # subsample 85 | 86 | image = image[:,:,:3].astype(np.float32) / 255. 
#RGB 87 | 88 | # flatten array for later append 89 | image = image.flatten() 90 | depth_map = depth_map.flatten() 91 | 92 | # quantize depth (as per DeepMind paper) 93 | depth_map = np.power(depth_map/255., 10) 94 | depth_map = np.digitize(depth_map, 95 | [0,0.05,0.175,0.3,0.425,0.55,0.675,0.8,1.01]) # bins 96 | depth_map -= 1 97 | 98 | # velocity vectors 99 | vel_vec1 = obs['VEL.TRANS'] 100 | vel_vec2 = obs['VEL.ROT'] 101 | 102 | # combined state 103 | state = np.append(image, depth_map) 104 | state = np.append(state, vel_vec1) 105 | state = np.append(state, vel_vec2) 106 | state = np.append(state, self.prev_action) 107 | state = np.append(state, self.prev_reward) 108 | 109 | return state 110 | 111 | @staticmethod 112 | def get_num_actions(): 113 | return len(GameManager.ACTION_LIST) 114 | 115 | def step(self, action): 116 | if action == -1: #NO-OP 117 | reward = 0 118 | else: 119 | reward = self.lab.step(GameManager.ACTION_LIST[action], num_steps=4) 120 | self.prev_action = action 121 | self.prev_reward = reward 122 | 123 | return reward, self.is_running() 124 | -------------------------------------------------------------------------------- /NetworkVP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | import os 28 | import re 29 | import numpy as np 30 | import tensorflow as tf 31 | 32 | from Config import Config 33 | 34 | 35 | class NetworkVP: 36 | def __init__(self, device, model_name, num_actions): 37 | self.device = device 38 | self.model_name = model_name 39 | self.num_actions = num_actions 40 | 41 | self.img_width = Config.IMAGE_WIDTH 42 | self.img_height = Config.IMAGE_HEIGHT 43 | self.img_channels = Config.IMAGE_DEPTH * Config.STACKED_FRAMES 44 | 45 | self.learning_rate = Config.LEARNING_RATE_START 46 | self.beta = Config.BETA_START 47 | self.log_epsilon = Config.LOG_EPSILON 48 | 49 | self.graph = tf.Graph() 50 | with self.graph.as_default() as g: 51 | with tf.device(self.device): 52 | self._create_graph() 53 | 54 | self.sess = tf.Session( 55 | graph=self.graph, 56 | config=tf.ConfigProto( 57 | allow_soft_placement=True, 58 | log_device_placement=False, 59 | gpu_options=tf.GPUOptions(allow_growth=True))) 60 | self.sess.run(tf.global_variables_initializer()) 61 | 62 | if Config.TENSORBOARD: self._create_tensor_board() 63 | if Config.LOAD_CHECKPOINT or Config.SAVE_MODELS: 64 | vars = tf.global_variables() 65 | self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0) 66 | 67 | 68 | def _create_graph(self): 69 | self.x = tf.placeholder( 70 | tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='X') 71 | self.y_r = tf.placeholder(tf.float32, [None], name='Yr') 72 | self.p_rewards = tf.placeholder(tf.float32, [None, 1], name='p_rewards') 73 | self.aux_inp = tf.placeholder(tf.float32, shape=[None, self.num_actions+Config.VEL_DIM], name='aux_input') 74 | self.depth_labels = [tf.placeholder(tf.int32, shape=[None, Config.DEPTH_QUANTIZATION])]*Config.DEPTH_PIXELS 75 | 76 | self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[]) 77 | self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[]) 78 | 79 | self.global_step = tf.Variable(0, trainable=False, name='step') 80 | 81 | # As implemented in A3C paper 82 | self.n1 = self.conv2d_layer(self.x, 8, 16, 'conv11', strides=[1, 4, 4, 1]) 83 | self.n2 = self.conv2d_layer(self.n1, 4, 32, 'conv12', strides=[1, 2, 2, 1]) 84 | self.action_index = tf.placeholder(tf.float32, name='action_index', shape=[None, self.num_actions]) 85 | _input = self.n2 86 | 87 | flatten_input_shape = _input.get_shape() 88 | nb_elements = flatten_input_shape[1] * flatten_input_shape[2] * flatten_input_shape[3] 89 | 90 | self.flat = tf.reshape(_input, shape=[-1, nb_elements._value]) 91 | self.enc_out = self.dense_layer(self.flat, 256, 'dense1') # encoder output 92 | 93 | self.d1 = self.dense_layer(self.enc_out, 128, 'depth1') 94 | 95 | # input to first LSTM. 
Add previous step rewards 96 | self.aux1 = tf.concat((self.enc_out, self.p_rewards), axis=1) 97 | 98 | lstm_layers = Config.NUM_LSTMS 99 | self.seq_len = tf.placeholder(tf.int32, name='seq_len', shape=[]) # LSTM sequence length 100 | self.state_in = [] # LSTM input state 101 | self.state_out = [] # LSTM output state 102 | 103 | with tf.variable_scope('lstm1'): 104 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(64, state_is_tuple=True) 105 | c_in_1 = tf.placeholder(tf.float32, name='c_1', shape=[None, lstm_cell.state_size.c]) 106 | h_in_1 = tf.placeholder(tf.float32, name='h_1', shape=[None, lstm_cell.state_size.h]) 107 | self.state_in.append((c_in_1, h_in_1)) 108 | 109 | # using tf.stack here since tf doesn't like when integers and 110 | # placeholders are mixed together in the desired shape 111 | rnn_in = tf.reshape(self.aux1, tf.stack([-1, self.seq_len, self.aux1.shape[1]])) 112 | 113 | init_1 = tf.contrib.rnn.LSTMStateTuple(c_in_1, h_in_1) 114 | lstm_outputs_1, lstm_state_1 = tf.nn.dynamic_rnn(lstm_cell, rnn_in, 115 | initial_state=init_1, time_major=False) 116 | lstm_outputs_1 = tf.reshape(lstm_outputs_1, [-1, 64]) 117 | self.state_out.append(tuple(lstm_state_1)) 118 | 119 | # input to second LSTM. Add previous LSTM output, vel and prev action 120 | self.aux2 = tf.concat((self.enc_out, lstm_outputs_1), axis=1) 121 | self.aux2 = tf.concat((self.aux2, self.aux_inp), axis=1) 122 | 123 | with tf.variable_scope('lstm2'): 124 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True) 125 | c_in_2 = tf.placeholder(tf.float32, name='c_2', shape=[None, lstm_cell.state_size.c]) 126 | h_in_2 = tf.placeholder(tf.float32, name='h_2', shape=[None, lstm_cell.state_size.h]) 127 | self.state_in.append((c_in_2, h_in_2)) 128 | 129 | rnn_in = tf.reshape(self.aux2, tf.stack([-1, self.seq_len, self.aux2.shape[1]])) 130 | init_2 = tf.contrib.rnn.LSTMStateTuple(c_in_2, h_in_2) 131 | lstm_outputs_2, lstm_state_2 = tf.nn.dynamic_rnn(lstm_cell, rnn_in, 132 | initial_state=init_2, time_major=False) 133 | self.state_out.append(tuple(lstm_state_2)) 134 | 135 | self.rnn_out = tf.reshape(lstm_outputs_2, [-1, 256]) 136 | 137 | self.d2 = self.dense_layer(self.rnn_out, 128, 'depth2') 138 | self.logits_v = tf.squeeze(self.dense_layer(self.rnn_out, 1, 'logits_v', func=None), axis=[1]) 139 | self.logits_p = self.dense_layer(self.rnn_out, self.num_actions, 'logits_p', func=None) 140 | 141 | if Config.USE_LOG_SOFTMAX: 142 | self.softmax_p = tf.nn.softmax(self.logits_p) 143 | self.log_softmax_p = tf.nn.log_softmax(self.logits_p) 144 | self.log_selected_action_prob = tf.reduce_sum(self.log_softmax_p * self.action_index, axis=1) 145 | 146 | self.cost_p_1 = self.log_selected_action_prob * (self.y_r - tf.stop_gradient(self.logits_v)) 147 | self.cost_p_2 = -1 * self.var_beta * \ 148 | tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1) 149 | else: 150 | self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY) / (1.0 + Config.MIN_POLICY * self.num_actions) 151 | self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1) 152 | 153 | self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) \ 154 | * (self.y_r - tf.stop_gradient(self.logits_v)) 155 | self.cost_p_2 = -1 * self.var_beta * \ 156 | tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon)) * 157 | self.softmax_p, axis=1) 158 | 159 | # use a mask since we pad bactches of size < TIME_MAX 160 | mask = tf.reduce_max(self.action_index,axis=1) 161 | self.cost_v = 0.5 * 
tf.reduce_sum(tf.square(self.y_r - self.logits_v) * mask, axis=0) 162 | self.cost_p_1_agg = tf.reduce_sum(self.cost_p_1 * mask, axis=0) 163 | self.cost_p_2_agg = tf.reduce_sum(self.cost_p_2 * mask, axis=0) 164 | 165 | # depth logits 166 | self.d1_logits = [self.dense_layer(self.d1, Config.DEPTH_QUANTIZATION, 'logits_d1_%d'%i, func=None) 167 | for i in range(Config.DEPTH_PIXELS)] 168 | 169 | self.d2_logits = [self.dense_layer(self.d2, Config.DEPTH_QUANTIZATION, 'logits_d2_%d'%i, func=None) 170 | for i in range(Config.DEPTH_PIXELS)] 171 | 172 | self.d1_loss = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.d1_logits[i], 173 | labels=self.depth_labels[i])*mask, axis=0) for i in range(Config.DEPTH_PIXELS)] 174 | 175 | self.d2_loss = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.d2_logits[i], 176 | labels=self.depth_labels[i])*mask, axis=0) for i in range(Config.DEPTH_PIXELS)] 177 | 178 | # total depth loss 179 | self.d1_loss = tf.add_n(self.d1_loss) 180 | self.d2_loss = tf.add_n(self.d2_loss) 181 | #self.d1_loss = tf.reduce_mean(self.d1_loss) 182 | #self.d2_loss = tf.reduce_mean(self.d2_loss) 183 | 184 | self.cost_p = -(self.cost_p_1_agg + self.cost_p_2_agg) + Config.BETA1*self.d1_loss + Config.BETA2*self.d2_loss 185 | 186 | if Config.DUAL_RMSPROP: 187 | self.opt_p = tf.train.RMSPropOptimizer( 188 | learning_rate=self.var_learning_rate, 189 | decay=Config.RMSPROP_DECAY, 190 | momentum=Config.RMSPROP_MOMENTUM, 191 | epsilon=Config.RMSPROP_EPSILON) 192 | 193 | self.opt_v = tf.train.RMSPropOptimizer( 194 | learning_rate=self.var_learning_rate, 195 | decay=Config.RMSPROP_DECAY, 196 | momentum=Config.RMSPROP_MOMENTUM, 197 | epsilon=Config.RMSPROP_EPSILON) 198 | else: 199 | self.cost_all = self.cost_p + self.cost_v 200 | self.opt = tf.train.RMSPropOptimizer( 201 | learning_rate=self.var_learning_rate, 202 | decay=Config.RMSPROP_DECAY, 203 | momentum=Config.RMSPROP_MOMENTUM, 204 | epsilon=Config.RMSPROP_EPSILON) 205 | 206 | if Config.USE_GRAD_CLIP: 207 | if Config.DUAL_RMSPROP: 208 | self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v) 209 | self.opt_grad_v_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM),v) 210 | for g,v in self.opt_grad_v if not g is None] 211 | self.train_op_v = self.opt_v.apply_gradients(self.opt_grad_v_clipped) 212 | 213 | self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p) 214 | self.opt_grad_p_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM),v) 215 | for g,v in self.opt_grad_p if not g is None] 216 | self.train_op_p = self.opt_p.apply_gradients(self.opt_grad_p_clipped) 217 | self.train_op = [self.train_op_p, self.train_op_v] 218 | else: 219 | self.opt_grad = self.opt.compute_gradients(self.cost_all) 220 | self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM),v) for g,v in self.opt_grad] 221 | self.train_op = self.opt.apply_gradients(self.opt_grad_clipped) 222 | else: 223 | if Config.DUAL_RMSPROP: 224 | self.train_op_v = self.opt_p.minimize(self.cost_v, global_step=self.global_step) 225 | self.train_op_p = self.opt_v.minimize(self.cost_p, global_step=self.global_step) 226 | self.train_op = [self.train_op_p, self.train_op_v] 227 | else: 228 | self.train_op = self.opt.minimize(self.cost_all, global_step=self.global_step) 229 | 230 | 231 | def _create_tensor_board(self): 232 | summaries = tf.get_collection(tf.GraphKeys.SUMMARIES) 233 | summaries.append(tf.summary.scalar("Pcost_advantage", self.cost_p_1_agg)) 234 | summaries.append(tf.summary.scalar("Pcost_entropy", self.cost_p_2_agg)) 235 | 
summaries.append(tf.summary.scalar("Pcost", self.cost_p)) 236 | summaries.append(tf.summary.scalar("Vcost", self.cost_v)) 237 | summaries.append(tf.summary.scalar("D1_loss", self.d1_loss)) 238 | summaries.append(tf.summary.scalar("D2_loss", self.d2_loss)) 239 | summaries.append(tf.summary.scalar("LearningRate", self.var_learning_rate)) 240 | summaries.append(tf.summary.scalar("Beta", self.var_beta)) 241 | for var in tf.trainable_variables(): 242 | summaries.append(tf.summary.histogram("weights_%s" % var.name, var)) 243 | 244 | summaries.append(tf.summary.histogram("activation_n1", self.n1)) 245 | summaries.append(tf.summary.histogram("activation_n2", self.n2)) 246 | summaries.append(tf.summary.histogram("activation_enc", self.enc_out)) 247 | summaries.append(tf.summary.histogram("activation_v", self.logits_v)) 248 | summaries.append(tf.summary.histogram("activation_p", self.softmax_p)) 249 | 250 | self.summary_op = tf.summary.merge(summaries) 251 | self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name, self.sess.graph) 252 | 253 | def dense_layer(self, input, out_dim, name, func=tf.nn.relu): 254 | in_dim = input.get_shape().as_list()[-1] 255 | d = 1.0 / np.sqrt(in_dim) 256 | with tf.variable_scope(name): 257 | w_init = tf.random_uniform_initializer(-d, d) 258 | b_init = tf.random_uniform_initializer(-d, d) 259 | w = tf.get_variable('w', dtype=tf.float32, shape=[in_dim, out_dim], initializer=w_init) 260 | b = tf.get_variable('b', shape=[out_dim], initializer=b_init) 261 | 262 | output = tf.matmul(input, w) + b 263 | if func is not None: 264 | output = func(output) 265 | 266 | return output 267 | 268 | def conv2d_layer(self, input, filter_size, out_dim, name, strides, func=tf.nn.relu): 269 | in_dim = input.get_shape().as_list()[-1] 270 | d = 1.0 / np.sqrt(filter_size * filter_size * in_dim) 271 | with tf.variable_scope(name): 272 | w_init = tf.random_uniform_initializer(-d, d) 273 | b_init = tf.random_uniform_initializer(-d, d) 274 | w = tf.get_variable('w', 275 | shape=[filter_size, filter_size, in_dim, out_dim], 276 | dtype=tf.float32, 277 | initializer=w_init) 278 | b = tf.get_variable('b', shape=[out_dim], initializer=b_init) 279 | 280 | output = tf.nn.conv2d(input, w, strides=strides, padding='SAME') + b 281 | if func is not None: 282 | output = func(output) 283 | 284 | return output 285 | 286 | def __get_base_feed_dict(self): 287 | return {self.var_beta: self.beta, self.var_learning_rate: self.learning_rate} 288 | 289 | def get_global_step(self): 290 | step = self.sess.run(self.global_step) 291 | return step 292 | 293 | def predict_single(self, x): 294 | return self.predict_p(x[None, :])[0] 295 | 296 | def predict_v(self, x): 297 | prediction = self.sess.run(self.logits_v, feed_dict={self.x: x}) 298 | return prediction 299 | 300 | def predict_p(self, x): 301 | prediction = self.sess.run(self.softmax_p, feed_dict={self.x: x}) 302 | return prediction 303 | 304 | def predict_p_and_v_and_d(self, x, c_batch, h_batch): 305 | batch_size = x.shape[0] 306 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x) 307 | feed_dict={self.x: im, self.seq_len: 1, self.p_rewards: p_reward, 308 | self.aux_inp: np.concatenate((vel, p_action), axis=1)} 309 | 310 | # shape of c/h_batch: (batch_size, Config.NUM_LSTMS, 256) 311 | for i in range(Config.NUM_LSTMS): 312 | c = c_batch[:,i,:] if i == 1 else c_batch[:,i,:64] 313 | h = h_batch[:,i,:] if i == 1 else h_batch[:,i,:64] 314 | feed_dict.update({self.state_in[i]: (c, h)}) 315 | 316 | p, v, d, lstm_out = 
self.sess.run([self.softmax_p, self.logits_v, 317 | self.d1_logits, self.state_out], feed_dict=feed_dict) 318 | 319 | # reshape lstm_out(c/h) to: (batch_size, Config.NUM_LSTMS, 256) 320 | c = np.zeros((batch_size, Config.NUM_LSTMS, 256), 321 | dtype=np.float32) 322 | 323 | h = np.zeros((batch_size, Config.NUM_LSTMS, 256), 324 | dtype=np.float32) 325 | 326 | for i in range(Config.NUM_LSTMS): 327 | if i == 0: 328 | c[:,i,:64] = lstm_out[i][0] 329 | h[:,i,:64] = lstm_out[i][1] 330 | else: 331 | c[:,i,:] = lstm_out[i][0] 332 | h[:,i,:] = lstm_out[i][1] 333 | 334 | d = np.array(d).transpose(1, 0, 2) 335 | return p, v, d, c, h 336 | 337 | def disentangle_obs(self, states): 338 | """ 339 | The obervations x is a concatenation of image, depth_map, prev_actn, 340 | velocity vector, and prev_rewards. This function separate these 341 | """ 342 | 343 | batch_size = states.shape[0] 344 | im_size = Config.IMAGE_HEIGHT*Config.IMAGE_WIDTH*Config.IMAGE_DEPTH 345 | im = states[:, :im_size] 346 | im = np.reshape(im, (batch_size, Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.IMAGE_DEPTH)) 347 | states = states[:, im_size:] 348 | 349 | dm_size = Config.DEPTH_PIXELS 350 | dm_val = states[:, :dm_size].astype(int) 351 | states = states[:, dm_size:] 352 | 353 | depth_map = np.zeros((dm_size, batch_size, Config.DEPTH_QUANTIZATION)) 354 | for i in range(dm_size): 355 | depth_map[i, np.arange(batch_size), dm_val[:,i].astype(int)] = 1 # make one-hot 356 | 357 | vl_size = Config.VEL_DIM 358 | vel = states[:, :vl_size] 359 | states = states[:, vl_size:] 360 | 361 | assert states.shape[1] == 2, "Missed something ?!" 362 | p_action = np.zeros((batch_size, self.num_actions)) 363 | p_action[np.arange(batch_size), states[:,0].astype(int)] = 1 # make one-hot 364 | p_reward = states[:, 1] 365 | p_reward = np.reshape(p_reward, (batch_size, 1)) 366 | 367 | # return (batch_size, ...) 
arrays 368 | return im, depth_map, vel, p_action, p_reward 369 | 370 | def train(self, x, y_r, a, c, h, trainer_id): 371 | feed_dict = self.__get_base_feed_dict() 372 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x) 373 | feed_dict.update({self.x: im, self.y_r: y_r, self.action_index: a, 374 | self.seq_len: int(Config.TIME_MAX), self.p_rewards: p_reward, 375 | self.aux_inp: np.concatenate((vel, p_action), axis=1)}) 376 | 377 | # depth supervision 378 | feed_dict.update({self.depth_labels[i]:depth_map[i] for i in 379 | range(Config.DEPTH_PIXELS)}) 380 | 381 | for i in range(Config.NUM_LSTMS): 382 | cb = np.array(c[i]).reshape((-1, 256)) 383 | hb = np.array(h[i]).reshape((-1, 256)) 384 | if i == 0: 385 | cb = cb[:,:64] 386 | hb = hb[:,:64] 387 | 388 | feed_dict.update({self.state_in[i]: (cb, hb)}) 389 | 390 | self.sess.run(self.train_op, feed_dict=feed_dict) 391 | 392 | def log(self, x, y_r, a, c, h): 393 | feed_dict = self.__get_base_feed_dict() 394 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x) 395 | 396 | feed_dict.update({self.x: im, self.y_r: y_r, self.action_index: a, 397 | self.seq_len: int(Config.TIME_MAX), self.p_rewards: p_reward, 398 | self.aux_inp: np.concatenate((vel, p_action), axis=1)}) 399 | 400 | # depth supervision 401 | feed_dict.update({self.depth_labels[i]:depth_map[i] for i in 402 | range(Config.DEPTH_PIXELS)}) 403 | 404 | for i in range(Config.NUM_LSTMS): 405 | cb = np.array(c[i]).reshape((-1, 256)) 406 | hb = np.array(h[i]).reshape((-1, 256)) 407 | if i == 0: 408 | cb = cb[:,:64] 409 | hb = hb[:,:64] 410 | 411 | feed_dict.update({self.state_in[i]: (cb, hb)}) 412 | 413 | step, summary = self.sess.run([self.global_step, self.summary_op], feed_dict=feed_dict) 414 | self.log_writer.add_summary(summary, step) 415 | 416 | def _checkpoint_filename(self, episode): 417 | return 'checkpoints/%s_%08d' % (self.model_name, episode) 418 | 419 | def _get_episode_from_filename(self, filename): 420 | # TODO: hacky way of getting the episode. ideally episode should be stored as a TF variable 421 | return int(re.split('/|_|\.', filename)[2]) 422 | 423 | def save(self, episode): 424 | self.saver.save(self.sess, self._checkpoint_filename(episode)) 425 | 426 | def load(self): 427 | filename = tf.train.latest_checkpoint(os.path.dirname(self._checkpoint_filename(episode=0))) 428 | if Config.LOAD_EPISODE > 0: 429 | filename = self._checkpoint_filename(Config.LOAD_EPISODE) 430 | self.saver.restore(self.sess, filename) 431 | return self._get_episode_from_filename(filename) 432 | 433 | def get_variables_names(self): 434 | return [var.name for var in self.graph.get_collection('trainable_variables')] 435 | 436 | def get_variable_value(self, name): 437 | return self.sess.run(self.graph.get_tensor_by_name(name)) 438 | -------------------------------------------------------------------------------- /ProcessAgent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from datetime import datetime 28 | from multiprocessing import Process, Queue, Value 29 | 30 | import numpy as np 31 | import sys, time 32 | 33 | from Config import Config 34 | from Environment import Environment 35 | from Experience import Experience 36 | 37 | 38 | class ProcessAgent(Process): 39 | def __init__(self, id, prediction_q, training_q, episode_log_q, dm): 40 | super(ProcessAgent, self).__init__() 41 | 42 | self.id = id 43 | self.prediction_q = prediction_q 44 | self.training_q = training_q 45 | self.episode_log_q = episode_log_q 46 | 47 | self.env = Environment() 48 | self.num_actions = self.env.get_num_actions() 49 | self.actions = np.arange(self.num_actions) 50 | 51 | self.discount_factor = Config.DISCOUNT 52 | # one frame at a time 53 | self.wait_q = Queue(maxsize=1) 54 | self.exit_flag = Value('i', 0) 55 | self.display_manager = dm 56 | 57 | @staticmethod 58 | def _accumulate_rewards(experiences, discount_factor, value, is_running): 59 | if is_running: 60 | reward_sum = value # terminal reward 61 | for t in reversed(range(0, len(experiences)-1)): 62 | r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX) if Config.REWARD_CLIPPING else experiences[t].reward 63 | reward_sum = discount_factor * reward_sum + r 64 | experiences[t].reward = reward_sum 65 | return experiences[:-1] 66 | # if the episode has terminated, we take the full trajectory into 67 | # account, including the very last experience 68 | else: 69 | reward_sum = 0 70 | for t in reversed(range(0, len(experiences))): 71 | r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX) if Config.REWARD_CLIPPING else experiences[t].reward 72 | reward_sum = discount_factor * reward_sum + r 73 | experiences[t].reward = reward_sum 74 | return experiences 75 | 76 | def convert_data(self, experiences): 77 | x_ = np.array([exp.state for exp in experiences]) 78 | a_ = np.eye(self.num_actions)[np.array([exp.action for exp in experiences])].astype(np.float32) 79 | r_ = np.array([exp.reward for exp in experiences]) 80 | return x_, r_, a_ 81 | 82 | def predict(self, state, lstm_inputs): 83 | # put the state in the prediction q 84 | 85 | # lstm_inputs: [dict{stacklayer1}, dict{stacklayer2}, ...] 
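# Stacking the per-layer 'c'/'h' entries gives c_state and h_state arrays of
# shape (NUM_LSTMS, 256); on the predictor side, NetworkVP.predict_p_and_v_and_d
# slices out the first 64 units for the first (64-unit) LSTM layer and uses the
# full 256 for the second.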
86 | c_state = np.array([lstm['c'] for lstm in lstm_inputs]) if len(lstm_inputs) else None 87 | h_state = np.array([lstm['h'] for lstm in lstm_inputs]) if len(lstm_inputs) else None 88 | self.prediction_q.put((self.id, state, c_state, h_state)) 89 | # wait for the prediction to come back 90 | p, v, d, c_state, h_state = self.wait_q.get() 91 | 92 | if not len(lstm_inputs): 93 | return p, v, d, [] 94 | 95 | # convert return back to form: [dict{stack-layer1}, dict{stack-layer2}, ...] 96 | l = [{'c':c_state[i], 'h':h_state[i]} for i in range(c_state.shape[0])] 97 | return p, v, d, l 98 | 99 | def select_action(self, prediction): 100 | if Config.PLAY_MODE: 101 | action = np.argmax(prediction) 102 | else: 103 | action = np.random.choice(self.actions, p=prediction) 104 | return action 105 | 106 | def run_episode(self): 107 | self.env.reset() 108 | is_running = True 109 | experiences = [] 110 | 111 | time_count = 0 112 | reward_sum = 0.0 113 | 114 | # input states for prediction 115 | lstm_input_p = [{'c':np.zeros(256, dtype=np.float32), 116 | 'h':np.zeros(256, dtype=np.float32)}]*Config.NUM_LSTMS 117 | 118 | # input states for training 119 | lstm_input_t = [{'c':np.zeros(256, dtype=np.float32), 120 | 'h':np.zeros(256, dtype=np.float32)}]*Config.NUM_LSTMS 121 | 122 | while is_running: 123 | 124 | # very first few frames 125 | if self.env.current_state is None: 126 | _ , is_running = self.env.step(-1) # NOOP 127 | assert(is_running) 128 | continue 129 | 130 | prediction, value, depth, lstm_input_p = self.predict(self.env.current_state, lstm_input_p) 131 | 132 | if Config.PLAY_MODE: 133 | self.display_manager.update(self.env.current_state, prediction, value, depth) 134 | 135 | action = self.select_action(prediction) 136 | reward, is_running = self.env.step(action) 137 | 138 | reward_sum += reward 139 | exp = Experience(self.env.previous_state, action, prediction, reward) 140 | experiences.append(exp) 141 | 142 | if not is_running or time_count == int(Config.TIME_MAX): 143 | updated_exps = ProcessAgent._accumulate_rewards(experiences, self.discount_factor, value, is_running) 144 | x_, r_, a_ = self.convert_data(updated_exps) 145 | yield x_, r_, a_, lstm_input_t, reward_sum, time_count 146 | 147 | # lstm input state for next training step 148 | lstm_input_t = lstm_input_p 149 | 150 | # reset the tmax count 151 | time_count = 0 152 | # keep the last experience for the next batch 153 | experiences = [experiences[-1]] 154 | reward_sum = 0.0 155 | 156 | time_count += 1 157 | 158 | def run(self): 159 | # randomly sleep up to 1 second. helps agents boot smoothly. 160 | time.sleep(np.random.rand()) 161 | np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10)) 162 | total_steps = 0 163 | 164 | while total_steps == Config.MAX_STEPS or self.exit_flag.value == 0: 165 | total_reward = 0 166 | total_length = 0 167 | for x_, r_, a_, lstm_, reward_sum, steps in self.run_episode(): 168 | total_steps += steps 169 | total_reward += reward_sum 170 | total_length += len(r_) + 1 # +1 for last frame that we drop 171 | self.training_q.put((x_, r_, a_, lstm_)) 172 | self.episode_log_q.put((datetime.now(), total_reward, total_length, 173 | total_steps)) 174 | -------------------------------------------------------------------------------- /ProcessStats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import sys 28 | if sys.version_info >= (3,0): 29 | from queue import Queue as queueQueue 30 | else: 31 | from Queue import Queue as queueQueue 32 | 33 | from datetime import datetime 34 | from multiprocessing import Process, Queue, Value 35 | 36 | import numpy as np 37 | import time 38 | 39 | from Config import Config 40 | 41 | 42 | class ProcessStats(Process): 43 | def __init__(self): 44 | super(ProcessStats, self).__init__() 45 | self.episode_log_q = Queue(maxsize=100) 46 | self.episode_count = Value('i', 0) 47 | self.training_count = Value('i', 0) 48 | self.should_save_model = Value('i', 0) 49 | self.trainer_count = Value('i', 0) 50 | self.predictor_count = Value('i', 0) 51 | self.agent_count = Value('i', 0) 52 | self.total_frame_count = 0 53 | 54 | def FPS(self): 55 | # average FPS from the beginning of the training (not current FPS) 56 | return np.ceil(self.total_frame_count / (time.time() - self.start_time)) 57 | 58 | def TPS(self): 59 | # average TPS from the beginning of the training (not current TPS) 60 | return np.ceil(self.training_count.value / (time.time() - self.start_time)) 61 | 62 | def run(self): 63 | with open(Config.RESULTS_FILENAME, 'a') as results_logger: 64 | rolling_frame_count = 0 65 | rolling_reward = 0 66 | results_q = queueQueue(maxsize=Config.STAT_ROLLING_MEAN_WINDOW) 67 | 68 | self.start_time = time.time() 69 | first_time = datetime.now() 70 | while True: 71 | episode_time, reward, length, steps = self.episode_log_q.get() 72 | results_logger.write('%s, %d, %d\n' % (episode_time.strftime("%Y-%m-%d %H:%M:%S"), reward, length)) 73 | results_logger.flush() 74 | 75 | self.total_frame_count += length 76 | self.episode_count.value += 1 77 | 78 | rolling_frame_count += length 79 | rolling_reward += reward 80 | 81 | if results_q.full(): 82 | old_episode_time, old_reward, old_length = results_q.get() 83 | rolling_frame_count -= old_length 84 | rolling_reward -= old_reward 85 | first_time = old_episode_time 86 | 87 | 
results_q.put((episode_time, reward, length)) 88 | 89 | if self.episode_count.value % Config.SAVE_FREQUENCY == 0: 90 | self.should_save_model.value = 1 91 | 92 | if self.episode_count.value % Config.PRINT_STATS_FREQUENCY == 0: 93 | print( 94 | '[Time: %8d] ' 95 | '[Steps: %8d] ' 96 | '[Episode: %8d Score: %10.4f] ' 97 | '[RScore: %10.4f RPPS: %5d] ' 98 | '[PPS: %5d TPS: %5d] ' 99 | '[NT: %2d NP: %2d NA: %2d]' 100 | % (int(time.time()-self.start_time), 101 | steps, 102 | self.episode_count.value, reward, 103 | rolling_reward / results_q.qsize(), 104 | rolling_frame_count / (datetime.now() - first_time).total_seconds(), 105 | self.FPS(), self.TPS(), 106 | self.trainer_count.value, self.predictor_count.value, self.agent_count.value)) 107 | sys.stdout.flush() 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maze Navigation using Reinforcement Learning 2 | 3 | ## Description 4 | 5 | TensorFlow implementation of ideas in the DeepMind paper ["Learning to Navigate in Complex Environments"](https://arxiv.org/abs/1611.03673). The baseline architecture is GPU-based A3C from the paper ["Reinforcement Learning through Asynchronous Advantage Actor-Critic on a GPU"](https://openreview.net/forum?id=r1VGvBcxl). Full [report](./assets/DeepNav_final.pdf). 6 | 7 | ## Requirements 8 | 9 | * TensorFlow 1.0 10 | * DeepMind Lab 11 | * Python packages - numpy, cv2, pygame 12 | 13 | ## Getting Started 14 | 15 | Download and install DeepMind Lab 16 | ``` 17 | $ git clone https://github.com/deepmind/lab.git 18 | ``` 19 | Build it following the [build instructions](https://github.com/deepmind/lab/blob/master/docs/build.md) 20 | 21 | Clone this repo **inside** the lab directory 22 | ``` 23 | $ cd lab 24 | $ git clone https://github.com/tgangwani/GA3C-DeepNavigation.git 25 | ``` 26 | Add the following bazel rule at the end of the lab/BUILD file 27 | 28 | ``` 29 | py_binary( 30 | name = "GA3C-DeepNavigation_train", 31 | srcs = ["GA3C-DeepNavigation/GA3C.py"], 32 | data = [":deepmind_lab.so"], 33 | main = "GA3C-DeepNavigation/GA3C.py", 34 | ) 35 | ``` 36 | 37 | Then run the agent with the following bazel command 38 | ``` 39 | bazel run :GA3C-DeepNavigation_train --define headless=osmesa 40 | ``` 41 | Use ```PLAY_MODE=False``` in Config.py for training. 42 | Setting ```PLAY_MODE=True``` loads model parameters from a 43 | checkpoint and runs a single agent. A display is expected. 44 | 45 | ## Network 46 | The neural net architecture is the same as in the paper, **except for the 47 | loop-prediction loss**. 48 | 49 | 50 | 51 | ## Results 52 | 53 | ### 1. Nav_maze_static_01 54 | 55 | #### Live Agent 56 | [YouTube](https://www.youtube.com/watch?v=vyS0Z7wdHHs) 57 | 58 | #### Learning Curve 59 | 60 | 61 | ### 2. Stairway to melon 62 | 63 | #### Live Agent 64 | [YouTube](https://www.youtube.com/watch?v=0R5MGM7VPo4) 65 | 66 | #### Learning Curve 67 | 68 | 69 | 70 | ## Acknowledgement 71 | [Unreal code by miyosuda](https://github.com/miyosuda/unreal) 72 | -------------------------------------------------------------------------------- /Server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from multiprocessing import Queue 28 | 29 | import time 30 | 31 | from Config import Config 32 | from Environment import Environment 33 | from NetworkVP import NetworkVP 34 | from ProcessAgent import ProcessAgent 35 | from ProcessStats import ProcessStats 36 | from ThreadDynamicAdjustment import ThreadDynamicAdjustment 37 | from ThreadPredictor import ThreadPredictor 38 | from ThreadTrainer import ThreadTrainer 39 | from Display import Display 40 | 41 | class Server: 42 | def __init__(self): 43 | self.stats = ProcessStats() 44 | 45 | self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE) 46 | self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE) 47 | 48 | self.model = NetworkVP(Config.DEVICE, Config.NETWORK_NAME, Environment().get_num_actions()) 49 | if Config.LOAD_CHECKPOINT: 50 | self.stats.episode_count.value = self.model.load() 51 | 52 | self.training_step = 0 53 | self.frame_counter = 0 54 | 55 | self.agents = [] 56 | self.predictors = [] 57 | self.trainers = [] 58 | self.dynamic_adjustment = ThreadDynamicAdjustment(self) 59 | self.display_manager = Display() 60 | 61 | def add_agent(self): 62 | self.agents.append( 63 | ProcessAgent(len(self.agents), self.prediction_q, self.training_q, self.stats.episode_log_q, self.display_manager)) 64 | self.agents[-1].start() 65 | 66 | def remove_agent(self): 67 | self.agents[-1].exit_flag.value = True 68 | self.agents[-1].join() 69 | self.agents.pop() 70 | 71 | def add_predictor(self): 72 | self.predictors.append(ThreadPredictor(self, len(self.predictors))) 73 | self.predictors[-1].start() 74 | 75 | def remove_predictor(self): 76 | self.predictors[-1].exit_flag = True 77 | self.predictors[-1].join() 78 | self.predictors.pop() 79 | 80 | def add_trainer(self): 81 | self.trainers.append(ThreadTrainer(self, len(self.trainers))) 82 | self.trainers[-1].start() 83 | 84 | def remove_trainer(self): 85 | self.trainers[-1].exit_flag = True 86 | self.trainers[-1].join() 87 | self.trainers.pop() 88 | 89 | def 
train_model(self, x_, r_, a_, c_, h_, trainer_id): 90 | self.model.train(x_, r_, a_, c_, h_, trainer_id) 91 | self.training_step += 1 92 | self.frame_counter += x_.shape[0] 93 | 94 | self.stats.training_count.value += 1 95 | self.dynamic_adjustment.temporal_training_count += 1 96 | 97 | if Config.TENSORBOARD and self.stats.training_count.value % Config.TENSORBOARD_UPDATE_FREQUENCY == 0: 98 | self.model.log(x_, r_, a_, c_, h_) 99 | 100 | def save_model(self): 101 | self.model.save(self.stats.episode_count.value) 102 | 103 | def main(self): 104 | self.stats.start() 105 | self.dynamic_adjustment.start() 106 | 107 | if Config.PLAY_MODE: 108 | for trainer in self.trainers: 109 | trainer.enabled = False 110 | 111 | learning_rate_multiplier = ( 112 | Config.LEARNING_RATE_END - Config.LEARNING_RATE_START) / Config.ANNEALING_EPISODE_COUNT 113 | beta_multiplier = (Config.BETA_END - Config.BETA_START) / Config.ANNEALING_EPISODE_COUNT 114 | 115 | while self.stats.episode_count.value < Config.EPISODES: 116 | step = min(self.stats.episode_count.value, Config.ANNEALING_EPISODE_COUNT - 1) 117 | self.model.learning_rate = Config.LEARNING_RATE_START + learning_rate_multiplier * step 118 | self.model.beta = Config.BETA_START + beta_multiplier * step 119 | 120 | # Saving is async - even if we start saving at a given episode, we may save the model at a later episode 121 | if Config.SAVE_MODELS and self.stats.should_save_model.value > 0: 122 | self.save_model() 123 | self.stats.should_save_model.value = 0 124 | 125 | time.sleep(0.01) 126 | 127 | self.dynamic_adjustment.exit_flag = True 128 | while self.agents: 129 | self.remove_agent() 130 | while self.predictors: 131 | self.remove_predictor() 132 | while self.trainers: 133 | self.remove_trainer() 134 | -------------------------------------------------------------------------------- /ThreadDynamicAdjustment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from threading import Thread 28 | 29 | import numpy as np 30 | import time 31 | 32 | from Config import Config 33 | 34 | 35 | class ThreadDynamicAdjustment(Thread): 36 | def __init__(self, server): 37 | super(ThreadDynamicAdjustment, self).__init__() 38 | self.setDaemon(True) 39 | 40 | self.server = server 41 | self.enabled = Config.DYNAMIC_SETTINGS 42 | 43 | self.trainer_count = Config.TRAINERS 44 | self.predictor_count = Config.PREDICTORS 45 | self.agent_count = Config.AGENTS 46 | 47 | self.temporal_training_count = 0 48 | self.exit_flag = False 49 | 50 | def enable_disable_components(self): 51 | cur_len = len(self.server.trainers) 52 | if cur_len < self.trainer_count: 53 | for _ in np.arange(cur_len, self.trainer_count): 54 | self.server.add_trainer() 55 | elif cur_len > self.trainer_count: 56 | for _ in np.arange(self.trainer_count, cur_len): 57 | self.server.remove_trainer() 58 | 59 | cur_len = len(self.server.predictors) 60 | if cur_len < self.predictor_count: 61 | for _ in np.arange(cur_len, self.predictor_count): 62 | self.server.add_predictor() 63 | elif cur_len > self.predictor_count: 64 | for _ in np.arange(self.predictor_count, cur_len): 65 | self.server.remove_predictor() 66 | 67 | cur_len = len(self.server.agents) 68 | if cur_len < self.agent_count: 69 | for _ in np.arange(cur_len, self.agent_count): 70 | self.server.add_agent() 71 | elif cur_len > self.agent_count: 72 | for _ in np.arange(self.agent_count, cur_len): 73 | self.server.remove_agent() 74 | 75 | def random_walk(self): 76 | # 3 components, 1 for Trainers, 1 for Predictors and 1 for Agents 77 | # 3 outcomes for each, -1: add one, 0: no change, +1: remove one (counts never drop below 1) 78 | direction = np.random.randint(3, size=3) - 1 79 | self.trainer_count = max(1, self.trainer_count - direction[0]) 80 | self.predictor_count = max(1, self.predictor_count - direction[1]) 81 | self.agent_count = max(1, self.agent_count - direction[2]) 82 | 83 | def update_stats(self): 84 | self.server.stats.trainer_count.value = self.trainer_count 85 | self.server.stats.predictor_count.value = self.predictor_count 86 | self.server.stats.agent_count.value = self.agent_count 87 | 88 | def run(self): 89 | self.enable_disable_components() 90 | self.update_stats() 91 | 92 | if not self.enabled: 93 | return 94 | 95 | # Wait for initialization 96 | time.sleep(Config.DYNAMIC_SETTINGS_INITIAL_WAIT) 97 | 98 | while not self.exit_flag: 99 | old_trainer_count, old_predictor_count, old_agent_count = \ 100 | self.trainer_count, self.predictor_count, self.agent_count 101 | self.random_walk() 102 | 103 | # If no change, do nothing 104 | if self.trainer_count == old_trainer_count \ 105 | and self.predictor_count == old_predictor_count \ 106 | and self.agent_count == old_agent_count: 107 | continue 108 | 109 | old_count = self.temporal_training_count 110 | self.enable_disable_components() 111 | 112 | self.temporal_training_count = 0 113 | time.sleep(Config.DYNAMIC_SETTINGS_STEP_WAIT) 114 | 115 | cur_count =
self.temporal_training_count 116 | # if it didn't work, revert the changes 117 | if cur_count < old_count: 118 | self.trainer_count, self.predictor_count, self.agent_count = \ 119 | old_trainer_count, old_predictor_count, old_agent_count 120 | 121 | self.update_stats() 122 | -------------------------------------------------------------------------------- /ThreadPredictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
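# Summary comment (added for readability, based on the code below): ThreadPredictor
# drains the shared prediction queue, batching up to PREDICTION_BATCH_SIZE requests
# (combined state vectors plus per-layer LSTM c/h states), runs a single forward pass
# through the shared model, and routes each (policy, value, depth, c, h) result back
# to the wait_q of the agent that requested it.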
26 | 27 | from threading import Thread 28 | 29 | import numpy as np 30 | 31 | from Config import Config 32 | 33 | 34 | class ThreadPredictor(Thread): 35 | def __init__(self, server, id): 36 | super(ThreadPredictor, self).__init__() 37 | self.setDaemon(True) 38 | 39 | self.id = id 40 | self.server = server 41 | self.exit_flag = False 42 | 43 | def run(self): 44 | ids = np.zeros(Config.PREDICTION_BATCH_SIZE, dtype=np.uint16) 45 | states = np.zeros((Config.PREDICTION_BATCH_SIZE, 46 | Config.COMBINED_STATE_SIZE), dtype=np.float32) 47 | 48 | cs = np.zeros((Config.PREDICTION_BATCH_SIZE, Config.NUM_LSTMS, 256), 49 | dtype=np.float32) if Config.NUM_LSTMS else [None]*Config.PREDICTION_BATCH_SIZE 50 | 51 | hs = np.zeros((Config.PREDICTION_BATCH_SIZE, Config.NUM_LSTMS, 256), 52 | dtype=np.float32) if Config.NUM_LSTMS else [None]*Config.PREDICTION_BATCH_SIZE 53 | 54 | while not self.exit_flag: 55 | ids[0], states[0], cs[0], hs[0] = self.server.prediction_q.get() 56 | 57 | size = 1 58 | while size < Config.PREDICTION_BATCH_SIZE and not self.server.prediction_q.empty(): 59 | ids[size], states[size], cs[size], hs[size] = self.server.prediction_q.get() 60 | size += 1 61 | 62 | batch = states[:size] 63 | cb = cs[:size] 64 | hb = hs[:size] 65 | p, v, d, c, h = self.server.model.predict_p_and_v_and_d(batch, cb, hb) 66 | 67 | for i in range(size): 68 | if ids[i] < len(self.server.agents): 69 | if Config.NUM_LSTMS: 70 | assert c[i].shape == (Config.NUM_LSTMS, 256) 71 | assert h[i].shape == (Config.NUM_LSTMS, 256) 72 | self.server.agents[ids[i]].wait_q.put((p[i], v[i], d[i], c[i], h[i])) 73 | -------------------------------------------------------------------------------- /ThreadTrainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
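# Summary comment (added for readability, based on the code below): ThreadTrainer
# pulls (x, r, a, lstm-state) tuples from the shared training queue, zero-pads each
# trajectory to TIME_MAX when LSTMs are enabled so every sample has the same recurrence
# length, concatenates samples until the batch exceeds TRAINING_MIN_BATCH_SIZE frames,
# and then hands the batch to server.train_model().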
26 | 27 | from threading import Thread 28 | import numpy as np 29 | 30 | from Config import Config 31 | 32 | 33 | class ThreadTrainer(Thread): 34 | def __init__(self, server, id): 35 | super(ThreadTrainer, self).__init__() 36 | self.setDaemon(True) 37 | 38 | self.id = id 39 | self.server = server 40 | self.exit_flag = False 41 | 42 | @staticmethod 43 | def dynamic_pad(x, r, a): 44 | size = int(Config.TIME_MAX) # required size 45 | z = np.zeros((size-len(x),) + x.shape[1:]) 46 | x = np.append(x, z, axis=0) 47 | z = np.zeros((size-len(r),) + r.shape[1:]) 48 | r = np.append(r, z, axis=0) 49 | z = np.zeros((size-len(a),) + a.shape[1:]) 50 | a = np.append(a, z, axis=0) 51 | assert len(x) == size 52 | return x, r, a 53 | 54 | def run(self): 55 | while not self.exit_flag: 56 | batch_size = 0 57 | c__ = []; h__ = [] # lstm hidden states 58 | while batch_size <= Config.TRAINING_MIN_BATCH_SIZE: 59 | x_, r_, a_, lstm_ = self.server.training_q.get() 60 | 61 | # when using LSTMs, the recurrence is over the TIME_MAX length 62 | # trajectory from each agent. Use padding for trajectories of 63 | # length < TIME_MAX 64 | if Config.NUM_LSTMS and x_.shape[0] != int(Config.TIME_MAX): 65 | x_, r_, a_ = ThreadTrainer.dynamic_pad(x_, r_, a_) 66 | 67 | if batch_size == 0: 68 | x__ = x_; r__ = r_; a__ = a_ 69 | 70 | if len(lstm_): 71 | c__ = []; h__ = [] 72 | for i in range(Config.NUM_LSTMS): 73 | c__.append(lstm_[i]['c']) 74 | h__.append(lstm_[i]['h']) 75 | 76 | else: 77 | x__ = np.concatenate((x__, x_)) 78 | r__ = np.concatenate((r__, r_)) 79 | a__ = np.concatenate((a__, a_)) 80 | 81 | if len(lstm_): 82 | for i in range(Config.NUM_LSTMS): 83 | c__[i] = np.concatenate((c__[i], lstm_[i]['c'])) 84 | h__[i] = np.concatenate((h__[i], lstm_[i]['h'])) 85 | 86 | batch_size += x_.shape[0] 87 | 88 | if Config.TRAIN_MODELS: 89 | self.server.train_model(x__, r__, a__, c__, h__, self.id) 90 | -------------------------------------------------------------------------------- /assets/DeepNav_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/DeepNav_final.pdf -------------------------------------------------------------------------------- /assets/nav_maze_static_01_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/nav_maze_static_01_score.png -------------------------------------------------------------------------------- /assets/nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/nn.png -------------------------------------------------------------------------------- /assets/stairway_to_melon_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/stairway_to_melon_score.png --------------------------------------------------------------------------------