├── Config.py
├── Display.py
├── Environment.py
├── Experience.py
├── GA3C.py
├── GameManager.py
├── NetworkVP.py
├── ProcessAgent.py
├── ProcessStats.py
├── README.md
├── Server.py
├── ThreadDynamicAdjustment.py
├── ThreadPredictor.py
├── ThreadTrainer.py
└── assets
├── DeepNav_final.pdf
├── nav_maze_static_01_score.png
├── nn.png
└── stairway_to_melon_score.png
/Config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | class Config:
28 |
29 | #########################################################################
30 | # Number of stacked LSTM layers
31 | NUM_LSTMS = 2
32 |
33 | #########################################################################
34 | # Game configuration
35 |
36 | #MAP = 'seekavoid_arena_01'
37 | MAP = 'stairway_to_melon'
38 | #MAP = 'nav_maze_static_01'
39 | #MAP = 'nav_maze_static_02'
40 |
41 | # Enable to see the trained agent in action
42 | PLAY_MODE = False
43 | # Enable to train
44 | TRAIN_MODELS = True
45 | # Load old models. Throws if the model doesn't exist
46 | LOAD_CHECKPOINT = False
47 | # If 0, the latest checkpoint is loaded
48 | LOAD_EPISODE = 0
49 |
50 | #########################################################################
51 | # Number of agents, predictors, trainers and other system settings
52 |
53 | # If the dynamic configuration is on, these are the initial values.
54 | # Number of Agents
55 | AGENTS = 8
56 | # Number of Predictors
57 | PREDICTORS = 2
58 | # Number of Trainers
59 | TRAINERS = 2
60 |
61 | # Device
62 | DEVICE = 'gpu:0'
63 |
64 | # Play mode display size
65 | DISPLAY_SIZE = (440, 400)
66 | # Movie recording
67 | RECORD = False
68 | VIDEO_DURATION = 60 # seconds
69 |
70 | # Enable the dynamic adjustment (+ waiting time to start it)
71 | DYNAMIC_SETTINGS = False
72 | DYNAMIC_SETTINGS_STEP_WAIT = 20
73 | DYNAMIC_SETTINGS_INITIAL_WAIT = 10
74 |
75 | #########################################################################
76 | # Algorithm parameters
77 |
78 | # Discount factor
79 | DISCOUNT = 0.99
80 |
 81 |     # Tmax (interval over which gradients are computed)
82 | TIME_MAX = 50
83 |
84 | # Maximum steps taken by agent in environment
85 | MAX_STEPS = 10 * 10**7
86 |
87 | # Reward Clipping
88 | REWARD_CLIPPING = False
89 | REWARD_MIN = -1
90 | REWARD_MAX = 1
91 |
92 | # Max size of the queue
93 | MAX_QUEUE_SIZE = 100
94 | PREDICTION_BATCH_SIZE = 128
95 |
96 | # Input of the DNN
97 | STACKED_FRAMES = 1
98 | IMAGE_WIDTH = 84
99 | IMAGE_HEIGHT = 84
100 | IMAGE_DEPTH = 3 # 3 for RGB, 4 for RGBD
101 |
102 | COMBINED_STATE_SIZE = 21240 # includes auxiliary inputs to NN (TODO: can be calculated inside the program using other params)
103 | VEL_DIM = 6 # velocity dimension
104 | DEPTH_PIXELS = 64 # number of depth pixels for auxiliary supervision
105 | DEPTH_QUANTIZATION = 8 # number of bins for depth
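    |     # For reference, with the settings above the combined state works out to
    |     #   84*84*3 (RGB image) + 64 (depth map) + 6 (velocity) + 1 (previous action)
    |     #   + 1 (previous reward) = 21240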
106 |
107 | # scaling factors for depth loss
108 | BETA1 = 1
109 | BETA2 = 1
110 |
111 | # Lab setting (frames per second)
112 | FPS = 60
113 |
114 | # Rotation for look-left, look-right actions [-512, 512]
115 | ROTATION = 20
116 |
117 | # Total number of episodes and annealing frequency
118 | EPISODES = 400000
119 | ANNEALING_EPISODE_COUNT = 400000
120 |
121 |     # Entropy regularization hyper-parameter
122 | BETA_START = 0.001
123 | BETA_END = 0.001
124 |
125 | # Learning rate
126 | LEARNING_RATE_START = 0.0005
127 | LEARNING_RATE_END = 0.0005
128 |
129 | # RMSProp parameters
130 | RMSPROP_DECAY = 0.99
131 | RMSPROP_MOMENTUM = 0.0
132 | RMSPROP_EPSILON = 0.1
133 |
134 |     # Dual RMSProp - we found that using a single RMSProp optimizer for both cost functions works better and is faster
135 | DUAL_RMSPROP = False
136 |
137 | # Gradient clipping
138 | USE_GRAD_CLIP = False
139 | GRAD_CLIP_NORM = 40.0
140 | # Epsilon (regularize policy lag in GA3C)
141 | LOG_EPSILON = 1e-6
142 |     # Training min batch size - increasing the batch size increases the stability of the algorithm, but makes learning slower
143 | TRAINING_MIN_BATCH_SIZE = 0
144 |
145 | #########################################################################
146 | # Log and save
147 |
148 | # Enable TensorBoard
149 | TENSORBOARD = False
150 | # Update TensorBoard every X training steps
151 | TENSORBOARD_UPDATE_FREQUENCY = 1000
152 |
153 | # Enable to save models every SAVE_FREQUENCY episodes
154 | SAVE_MODELS = True
155 | # Save every SAVE_FREQUENCY episodes
156 | SAVE_FREQUENCY = 1000
157 |
158 | # Print stats every PRINT_STATS_FREQUENCY episodes
159 | PRINT_STATS_FREQUENCY = 1
160 | # The window to average stats
161 | STAT_ROLLING_MEAN_WINDOW = 1000
162 |
163 | # Results filename
164 | RESULTS_FILENAME = 'results.txt'
165 | # Network checkpoint name
166 | NETWORK_NAME = 'network'
167 |
168 | #########################################################################
169 | # More experimental parameters here
170 |
171 | # Minimum policy
172 | MIN_POLICY = 0.0
173 | # Use log_softmax() instead of log(softmax())
174 | USE_LOG_SOFTMAX = False
175 |
--------------------------------------------------------------------------------
/Display.py:
--------------------------------------------------------------------------------
1 | import pygame
2 | from Config import Config
3 | import numpy as np
4 | from collections import deque
5 | import cv2
6 |
7 | BLUE = (128, 128, 255)
8 | RED = (255, 192, 192)
9 | BLACK = (0, 0, 0)
10 | WHITE = (255, 255, 255)
11 |
12 | depth_dict = {k:v for k,v in zip(range(Config.DEPTH_QUANTIZATION),
13 | [0.05,0.175,0.3,0.425,0.55,0.675,0.8,1])} #bins
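    | # Maps each quantized depth bin index (0..DEPTH_QUANTIZATION-1) to a representative
    | # depth value in [0, 1]; used only to visualise the network's depth prediction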
14 |
15 | class MovieWriter(object):
16 | def __init__(self, file_name, frame_size, fps):
17 | self.vout = cv2.VideoWriter()
18 | if not self.vout.open(file_name,
19 | cv2.VideoWriter_fourcc('M','J','P','G'), fps, frame_size, True):
20 | print("Create movie failed: {0}".format(file_name))
21 |
22 | def add_frame(self, frame):
23 | self.vout.write(frame)
24 |
25 | def close(self):
26 | self.vout.release()
27 | self.vout = None
28 |
29 | def isOpen(self):
 30 |         return self.vout is not None and self.vout.isOpened()
31 |
32 | class Display(object):
33 | def __init__(self):
34 | pygame.init()
35 |
36 | self.display_size = Config.DISPLAY_SIZE
37 | self.surface = pygame.display.set_mode(self.display_size, 0, 24)
38 | pygame.display.set_caption('NAV')
39 | self.font = pygame.font.SysFont(None, 20)
40 | self._values = deque(maxlen=100)
41 | if Config.RECORD:
42 | self.video_fps = 5
43 | self.frames = 0
44 | self.writer = MovieWriter('melonvideo.avi', self.display_size, self.video_fps)
45 |
46 | def draw_center_text(self, str, center_x, top):
47 | text = self.font.render(str, True, WHITE, BLACK)
48 | text_rect = text.get_rect()
49 | text_rect.centerx = center_x
50 | text_rect.top = top
51 | self.surface.blit(text, text_rect)
52 |
53 | def show_image(self, im):
54 | data = im.astype(np.uint8)
55 | image = pygame.image.frombuffer(data, (84,84), 'RGB')
56 | image = pygame.transform.scale(image, (128, 128))
57 | self.surface.blit(image, (8, 8))
58 | self.draw_center_text("input", 50, 150)
59 |
60 | def show_depth(self, dm):
61 | dm = dm * 255.
62 | data = dm.astype(np.uint8)
63 | color_img = cv2.cvtColor(data, cv2.COLOR_GRAY2RGB)
64 |
65 | image = pygame.image.frombuffer(color_img, (16,4), 'RGB')
66 | image = pygame.transform.scale(image, (128, 32))
67 | self.surface.blit(image, (200, 8))
68 | self.draw_center_text("depth", 250, 50)
69 |
70 | def show_policy(self, pi):
71 | start_x = 10
72 |
73 | y = 200
74 |
75 | for i in range(len(pi)):
76 | width = pi[i] * 100
77 | pygame.draw.rect(self.surface, WHITE, (start_x, y, width, 10))
78 | y += 20
79 | self.draw_center_text("Action Prob.", 50, y)
80 |
81 | def show_values(self):
82 | if len(self._values) == 0:
83 | return
84 |
85 | min_v = float("inf")
86 | max_v = float("-inf")
87 |
88 | for v in self._values:
89 | min_v = min(min_v, v)
90 | max_v = max(max_v, v)
91 |
92 | top = 150
93 | left = 150
94 | width = 100
95 | height = 100
96 | bottom = top + width
97 | right = left + height
98 |
99 | d = max_v - min_v
100 | last_r = 0.0
101 | for i,v in enumerate(self._values):
102 | r = (v - min_v) / d
103 | if i > 0:
104 | x0 = i-1 + left
105 | x1 = i + left
106 | y0 = bottom - last_r * height
107 | y1 = bottom - r * height
108 | pygame.draw.line(self.surface, BLUE, (x0, y0), (x1, y1), 1)
109 | last_r = r
110 |
111 | pygame.draw.line(self.surface, WHITE, (left, top), (left, bottom), 1)
112 | pygame.draw.line(self.surface, WHITE, (right, top), (right, bottom), 1)
113 | pygame.draw.line(self.surface, WHITE, (left, top), (right, top), 1)
114 | pygame.draw.line(self.surface, WHITE, (left, bottom), (right, bottom), 1)
115 |
116 | self.draw_center_text("V", left + width/2, bottom+10)
117 |
118 | def update(self, state, prediction, value, depth):
119 | im_size = Config.IMAGE_HEIGHT*Config.IMAGE_WIDTH*Config.IMAGE_DEPTH
120 | im = state[:im_size] * 255.
121 | im = np.reshape(im, (Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.IMAGE_DEPTH))
122 | self._values.append(value)
123 |
124 | # create depth_map (4,16) from depth (64, 8)
125 | depth_map = [depth_dict[np.argmax(depth[p])] for p in
126 | range(depth.shape[0])]
127 | depth_map = np.array(depth_map)
128 |
129 | self.surface.fill(BLACK)
130 | self.show_image(im)
131 | self.show_policy(prediction)
132 | self.show_values()
133 | self.show_depth(depth_map)
134 | pygame.display.update()
135 |
136 | if Config.RECORD and self.writer.isOpen():
137 | frame_str = self.surface.get_buffer().raw
138 | d = np.fromstring(frame_str, dtype=np.uint8)
139 | d = d.reshape((self.display_size[1], self.display_size[0], 3))
140 | self.writer.add_frame(d)
141 | self.frames += 1
142 | if self.frames == Config.VIDEO_DURATION*self.video_fps:
143 | print("Movie writing complete.")
144 | self.writer.close()
145 |
--------------------------------------------------------------------------------
/Environment.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | import sys
28 | if sys.version_info >= (3,0):
29 | from queue import Queue
30 | else:
31 | from Queue import Queue
32 |
33 | import numpy as np
34 | #import scipy.misc as misc
35 |
36 | from Config import Config
37 | from GameManager import GameManager
38 |
39 | class Environment:
40 | def __init__(self):
41 | self.game = GameManager(Config.MAP)
42 | self.nb_frames = Config.STACKED_FRAMES
43 | self.frame_q = Queue(maxsize=self.nb_frames)
44 | self.previous_state = None
45 | self.current_state = None
46 | self.total_reward = 0
47 |
48 | self.reset()
49 |
50 | def is_running(self):
51 | return self.game.is_running()
52 |
53 | @staticmethod
54 | def _rgb2gray(rgb):
55 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
56 |
57 | @staticmethod
58 | def _preprocess(image):
59 | #image = Environment._rgb2gray(image)
60 | #image = misc.imresize(image, [Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH], 'bilinear')
61 | image = image.astype(np.float32) / 255.
62 | return image
63 |
64 | def _get_current_state_no_stacking(self):
65 | if not self.frame_q.full():
66 | return None # frame queue is not full yet.
67 | return np.array(list(self.frame_q.queue)[0])
68 |
69 | def _get_current_state(self):
70 | if not self.frame_q.full():
71 | return None # frame queue is not full yet.
72 | x_ = [np.array(i) for i in list(self.frame_q.queue)]
73 | x_ = np.concatenate(x_, axis=2)
74 | #x_ = np.array(self.frame_q.queue)
75 | #x_ = np.transpose(x_, [1, 2, 3, 0]) # move channels
76 | return x_
77 |
78 | def _update_frame_q(self, frame):
79 | if self.frame_q.full():
80 | self.frame_q.get()
81 | self.frame_q.put(frame)
82 |
83 | # the state is no longer just the image, but a concatenation of
84 | # image and auxiliary inputs. We can't use the same _preprocess()
85 | #image = Environment._preprocess(frame)
86 | #self.frame_q.put(image)
87 |
88 | def get_num_actions(self):
89 | return GameManager.get_num_actions()
90 |
91 | def reset(self):
92 | self.total_reward = 0
93 | self.frame_q.queue.clear()
94 | self.game.reset()
95 | self._update_frame_q(self.game.get_state())
96 | self.previous_state = self.current_state = None
97 |
98 | def step(self, action):
99 | reward, is_running = self.game.step(action)
100 | self.total_reward += reward
101 | self.previous_state = self.current_state
102 |
103 | if is_running:
104 | observation = self.game.get_state()
105 | self._update_frame_q(observation)
106 | self.current_state = self._get_current_state_no_stacking()
107 |
108 | return reward, is_running
109 |
--------------------------------------------------------------------------------
/Experience.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | class Experience:
28 | def __init__(self, state, action, prediction, reward):
29 | self.state = state
30 | self.action = action
31 | self.prediction = prediction
32 | self.reward = reward
33 |
--------------------------------------------------------------------------------
/GA3C.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | # check python version; warn if not Python3
28 | import os, sys
29 | import warnings
30 | if sys.version_info < (3,0):
31 | warnings.warn("Optimized for Python3. Performance may suffer under Python2.", Warning)
32 |
33 | from Config import Config
34 | from Server import Server
35 |
36 | # Suppress the output from C functions
37 | # source - http://stackoverflow.com/questions/5081657/how-do-i-prevent-a-c-shared-library-to-print-on-stdout-in-python
38 | def redirect_stdout():
39 | sys.stdout.flush() # <--- important when redirecting to files
40 | newstdout = os.dup(1)
41 | devnull = os.open(os.devnull, os.O_WRONLY)
42 | os.dup2(devnull, 1)
43 | os.close(devnull)
44 | sys.stdout = os.fdopen(newstdout, 'w')
45 |
46 | def checks():
47 | if Config.STACKED_FRAMES != 1:
48 | assert False, "Stacking of multiple frames not supported. See disentangle_obs() in NetworkVP.py"
49 |
50 | if Config.NUM_LSTMS != 2:
51 | assert False, "Architecture hard-wired for 2 stacked LSTM layers"
52 |
53 | # Parse arguments
54 | for i in range(1, len(sys.argv)):
55 | # Config arguments should be in format of Config=Value
56 | # For setting booleans to False use Config=
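    |     # e.g. passing AGENTS=16 MAP=nav_maze_static_01 TENSORBOARD= sets the first two
    |     #      values and turns TENSORBOARD off (an empty string converts to False)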
57 | x, y = sys.argv[i].split('=')
58 | setattr(Config, x, type(getattr(Config, x))(y))
59 |
60 | # Adjust configs for Play mode
61 | if Config.PLAY_MODE:
62 | print("==Play mode on==")
63 | Config.AGENTS = 1
64 | Config.PREDICTORS = 1
65 | Config.TRAINERS = 1
66 | Config.DYNAMIC_SETTINGS = False
67 |
68 | Config.LOAD_CHECKPOINT = True
69 | Config.TRAIN_MODELS = False
70 | Config.SAVE_MODELS = False
71 |
72 | redirect_stdout()
73 | checks()
74 | print('+++ GA3C on %s +++'%Config.MAP)
75 | print('===Network===')
76 | print('LSTM layers:', Config.NUM_LSTMS)
77 | print("Reward clipping %s. Clipping affects policy!"%('ENABLED' if Config.REWARD_CLIPPING else 'DISABLED'))
78 | print('======')
79 | # Start main program
80 | Server().main()
81 |
--------------------------------------------------------------------------------
/GameManager.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | import deepmind_lab
28 | import numpy as np
29 | from Config import Config
30 | import sys
31 |
32 | def _action(*entries):
33 | return np.array(entries, dtype=np.intc)
34 |
35 | class GameManager:
36 |
37 | ACTION_LIST = [
38 | _action(-1*int(Config.ROTATION), 0, 0, 0, 0, 0, 0), # look_left
39 | _action( int(Config.ROTATION), 0, 0, 0, 0, 0, 0), # look_right
40 | #_action( 0, 10, 0, 0, 0, 0, 0), # look_up
41 | #_action( 0, -10, 0, 0, 0, 0, 0), # look_down
42 | #_action(-1*int(Config.ROTATION), 0, 0, 1, 0, 0, 0),
43 | #_action( int(Config.ROTATION), 0, 0, 1, 0, 0, 0),
44 | _action( 0, 0, -1, 0, 0, 0, 0), # strafe_left
45 | _action( 0, 0, 1, 0, 0, 0, 0), # strafe_right
46 | _action( 0, 0, 0, 1, 0, 0, 0), # forward
47 | _action( 0, 0, 0, -1, 0, 0, 0), # backward
48 | #_action( 0, 0, 0, 0, 1, 0, 0), # fire
49 | #_action( 0, 0, 0, 0, 0, 1, 0), # jump
50 | #_action( 0, 0, 0, 0, 0, 0, 1) # crouch
51 | ]
52 |
53 | def __init__(self, map_name):
54 | self.map_name = map_name
55 | self.obs_specs = ['RGBD_INTERLACED', 'VEL.TRANS', 'VEL.ROT']
56 |
57 | self.lab = deepmind_lab.Lab(map_name, self.obs_specs, config={
58 | 'fps': str(Config.FPS),
59 | 'width': str(Config.IMAGE_WIDTH),
60 | 'height': str(Config.IMAGE_HEIGHT)
61 | })
62 |
63 | self.prev_action = 0
64 | self.prev_reward = 0
65 | self.reset()
66 |
67 | def reset(self):
68 | self.prev_action = 0
69 | self.prev_reward = 0
70 | if not self.lab.reset():
 71 |             assert 'Error resetting lab environment'
72 |
73 | def is_running(self):
74 | return self.lab.is_running()
75 |
76 | def get_state(self):
77 | obs = self.lab.observations() # dict of Numpy arrays
78 | image = obs['RGBD_INTERLACED']
79 |
80 | # create a low resolution (4x16) depth map from the 84x84 image
81 | depth_map = image[:,:,3]
82 | depth_map = depth_map[16:-16,:] # crop
83 | depth_map = depth_map[:,2:-2] # crop
84 | depth_map = depth_map[::13,::5] # subsample
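    |         # 84x84 -> crop to 52x80 -> take every 13th row and 5th column -> 4x16 = 64
    |         # values, matching Config.DEPTH_PIXELS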
85 |
86 | image = image[:,:,:3].astype(np.float32) / 255. #RGB
87 |
88 | # flatten array for later append
89 | image = image.flatten()
90 | depth_map = depth_map.flatten()
91 |
92 | # quantize depth (as per DeepMind paper)
93 | depth_map = np.power(depth_map/255., 10)
94 | depth_map = np.digitize(depth_map,
95 | [0,0.05,0.175,0.3,0.425,0.55,0.675,0.8,1.01]) # bins
96 | depth_map -= 1
97 |
98 | # velocity vectors
99 | vel_vec1 = obs['VEL.TRANS']
100 | vel_vec2 = obs['VEL.ROT']
101 |
102 | # combined state
103 | state = np.append(image, depth_map)
104 | state = np.append(state, vel_vec1)
105 | state = np.append(state, vel_vec2)
106 | state = np.append(state, self.prev_action)
107 | state = np.append(state, self.prev_reward)
108 |
109 | return state
110 |
111 | @staticmethod
112 | def get_num_actions():
113 | return len(GameManager.ACTION_LIST)
114 |
115 | def step(self, action):
116 | if action == -1: #NO-OP
117 | reward = 0
118 | else:
119 | reward = self.lab.step(GameManager.ACTION_LIST[action], num_steps=4)
120 | self.prev_action = action
121 | self.prev_reward = reward
122 |
123 | return reward, self.is_running()
124 |
--------------------------------------------------------------------------------
/NetworkVP.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | import os
28 | import re
29 | import numpy as np
30 | import tensorflow as tf
31 |
32 | from Config import Config
33 |
34 |
35 | class NetworkVP:
36 | def __init__(self, device, model_name, num_actions):
37 | self.device = device
38 | self.model_name = model_name
39 | self.num_actions = num_actions
40 |
41 | self.img_width = Config.IMAGE_WIDTH
42 | self.img_height = Config.IMAGE_HEIGHT
43 | self.img_channels = Config.IMAGE_DEPTH * Config.STACKED_FRAMES
44 |
45 | self.learning_rate = Config.LEARNING_RATE_START
46 | self.beta = Config.BETA_START
47 | self.log_epsilon = Config.LOG_EPSILON
48 |
49 | self.graph = tf.Graph()
50 | with self.graph.as_default() as g:
51 | with tf.device(self.device):
52 | self._create_graph()
53 |
54 | self.sess = tf.Session(
55 | graph=self.graph,
56 | config=tf.ConfigProto(
57 | allow_soft_placement=True,
58 | log_device_placement=False,
59 | gpu_options=tf.GPUOptions(allow_growth=True)))
60 | self.sess.run(tf.global_variables_initializer())
61 |
62 | if Config.TENSORBOARD: self._create_tensor_board()
63 | if Config.LOAD_CHECKPOINT or Config.SAVE_MODELS:
64 | vars = tf.global_variables()
65 | self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0)
66 |
67 |
68 | def _create_graph(self):
69 | self.x = tf.placeholder(
70 | tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='X')
71 | self.y_r = tf.placeholder(tf.float32, [None], name='Yr')
72 | self.p_rewards = tf.placeholder(tf.float32, [None, 1], name='p_rewards')
73 | self.aux_inp = tf.placeholder(tf.float32, shape=[None, self.num_actions+Config.VEL_DIM], name='aux_input')
 74 |         self.depth_labels = [tf.placeholder(tf.int32, shape=[None, Config.DEPTH_QUANTIZATION])
    |                              for _ in range(Config.DEPTH_PIXELS)]  # one distinct placeholder per depth pixel
75 |
76 | self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[])
77 | self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[])
78 |
79 | self.global_step = tf.Variable(0, trainable=False, name='step')
80 |
81 | # As implemented in A3C paper
82 | self.n1 = self.conv2d_layer(self.x, 8, 16, 'conv11', strides=[1, 4, 4, 1])
83 | self.n2 = self.conv2d_layer(self.n1, 4, 32, 'conv12', strides=[1, 2, 2, 1])
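    |             # Shape check with the default 84x84x3 input and 'SAME' padding:
    |             # conv11 (8x8, stride 4) -> 21x21x16, conv12 (4x4, stride 2) -> 11x11x32,
    |             # i.e. 3872 features after flattening, ahead of the 256-unit dense encoder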
84 | self.action_index = tf.placeholder(tf.float32, name='action_index', shape=[None, self.num_actions])
85 | _input = self.n2
86 |
87 | flatten_input_shape = _input.get_shape()
88 | nb_elements = flatten_input_shape[1] * flatten_input_shape[2] * flatten_input_shape[3]
89 |
90 | self.flat = tf.reshape(_input, shape=[-1, nb_elements._value])
91 | self.enc_out = self.dense_layer(self.flat, 256, 'dense1') # encoder output
92 |
93 | self.d1 = self.dense_layer(self.enc_out, 128, 'depth1')
94 |
95 | # input to first LSTM. Add previous step rewards
96 | self.aux1 = tf.concat((self.enc_out, self.p_rewards), axis=1)
97 |
98 | lstm_layers = Config.NUM_LSTMS
99 | self.seq_len = tf.placeholder(tf.int32, name='seq_len', shape=[]) # LSTM sequence length
100 | self.state_in = [] # LSTM input state
101 | self.state_out = [] # LSTM output state
102 |
103 | with tf.variable_scope('lstm1'):
104 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(64, state_is_tuple=True)
105 | c_in_1 = tf.placeholder(tf.float32, name='c_1', shape=[None, lstm_cell.state_size.c])
106 | h_in_1 = tf.placeholder(tf.float32, name='h_1', shape=[None, lstm_cell.state_size.h])
107 | self.state_in.append((c_in_1, h_in_1))
108 |
109 | # using tf.stack here since tf doesn't like when integers and
110 | # placeholders are mixed together in the desired shape
111 | rnn_in = tf.reshape(self.aux1, tf.stack([-1, self.seq_len, self.aux1.shape[1]]))
112 |
113 | init_1 = tf.contrib.rnn.LSTMStateTuple(c_in_1, h_in_1)
114 | lstm_outputs_1, lstm_state_1 = tf.nn.dynamic_rnn(lstm_cell, rnn_in,
115 | initial_state=init_1, time_major=False)
116 | lstm_outputs_1 = tf.reshape(lstm_outputs_1, [-1, 64])
117 | self.state_out.append(tuple(lstm_state_1))
118 |
119 | # input to second LSTM. Add previous LSTM output, vel and prev action
120 | self.aux2 = tf.concat((self.enc_out, lstm_outputs_1), axis=1)
121 | self.aux2 = tf.concat((self.aux2, self.aux_inp), axis=1)
122 |
123 | with tf.variable_scope('lstm2'):
124 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True)
125 | c_in_2 = tf.placeholder(tf.float32, name='c_2', shape=[None, lstm_cell.state_size.c])
126 | h_in_2 = tf.placeholder(tf.float32, name='h_2', shape=[None, lstm_cell.state_size.h])
127 | self.state_in.append((c_in_2, h_in_2))
128 |
129 | rnn_in = tf.reshape(self.aux2, tf.stack([-1, self.seq_len, self.aux2.shape[1]]))
130 | init_2 = tf.contrib.rnn.LSTMStateTuple(c_in_2, h_in_2)
131 | lstm_outputs_2, lstm_state_2 = tf.nn.dynamic_rnn(lstm_cell, rnn_in,
132 | initial_state=init_2, time_major=False)
133 | self.state_out.append(tuple(lstm_state_2))
134 |
135 | self.rnn_out = tf.reshape(lstm_outputs_2, [-1, 256])
136 |
137 | self.d2 = self.dense_layer(self.rnn_out, 128, 'depth2')
138 | self.logits_v = tf.squeeze(self.dense_layer(self.rnn_out, 1, 'logits_v', func=None), axis=[1])
139 | self.logits_p = self.dense_layer(self.rnn_out, self.num_actions, 'logits_p', func=None)
140 |
141 | if Config.USE_LOG_SOFTMAX:
142 | self.softmax_p = tf.nn.softmax(self.logits_p)
143 | self.log_softmax_p = tf.nn.log_softmax(self.logits_p)
144 | self.log_selected_action_prob = tf.reduce_sum(self.log_softmax_p * self.action_index, axis=1)
145 |
146 | self.cost_p_1 = self.log_selected_action_prob * (self.y_r - tf.stop_gradient(self.logits_v))
147 | self.cost_p_2 = -1 * self.var_beta * \
148 | tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1)
149 | else:
150 | self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY) / (1.0 + Config.MIN_POLICY * self.num_actions)
151 | self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1)
152 |
153 | self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) \
154 | * (self.y_r - tf.stop_gradient(self.logits_v))
155 | self.cost_p_2 = -1 * self.var_beta * \
156 | tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon)) *
157 | self.softmax_p, axis=1)
158 |
159 |             # use a mask since we pad batches of size < TIME_MAX
160 | mask = tf.reduce_max(self.action_index,axis=1)
161 | self.cost_v = 0.5 * tf.reduce_sum(tf.square(self.y_r - self.logits_v) * mask, axis=0)
162 | self.cost_p_1_agg = tf.reduce_sum(self.cost_p_1 * mask, axis=0)
163 | self.cost_p_2_agg = tf.reduce_sum(self.cost_p_2 * mask, axis=0)
164 |
165 | # depth logits
166 | self.d1_logits = [self.dense_layer(self.d1, Config.DEPTH_QUANTIZATION, 'logits_d1_%d'%i, func=None)
167 | for i in range(Config.DEPTH_PIXELS)]
168 |
169 | self.d2_logits = [self.dense_layer(self.d2, Config.DEPTH_QUANTIZATION, 'logits_d2_%d'%i, func=None)
170 | for i in range(Config.DEPTH_PIXELS)]
171 |
172 | self.d1_loss = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.d1_logits[i],
173 | labels=self.depth_labels[i])*mask, axis=0) for i in range(Config.DEPTH_PIXELS)]
174 |
175 | self.d2_loss = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.d2_logits[i],
176 | labels=self.depth_labels[i])*mask, axis=0) for i in range(Config.DEPTH_PIXELS)]
177 |
178 | # total depth loss
179 | self.d1_loss = tf.add_n(self.d1_loss)
180 | self.d2_loss = tf.add_n(self.d2_loss)
181 | #self.d1_loss = tf.reduce_mean(self.d1_loss)
182 | #self.d2_loss = tf.reduce_mean(self.d2_loss)
183 |
184 | self.cost_p = -(self.cost_p_1_agg + self.cost_p_2_agg) + Config.BETA1*self.d1_loss + Config.BETA2*self.d2_loss
185 |
186 | if Config.DUAL_RMSPROP:
187 | self.opt_p = tf.train.RMSPropOptimizer(
188 | learning_rate=self.var_learning_rate,
189 | decay=Config.RMSPROP_DECAY,
190 | momentum=Config.RMSPROP_MOMENTUM,
191 | epsilon=Config.RMSPROP_EPSILON)
192 |
193 | self.opt_v = tf.train.RMSPropOptimizer(
194 | learning_rate=self.var_learning_rate,
195 | decay=Config.RMSPROP_DECAY,
196 | momentum=Config.RMSPROP_MOMENTUM,
197 | epsilon=Config.RMSPROP_EPSILON)
198 | else:
199 | self.cost_all = self.cost_p + self.cost_v
200 | self.opt = tf.train.RMSPropOptimizer(
201 | learning_rate=self.var_learning_rate,
202 | decay=Config.RMSPROP_DECAY,
203 | momentum=Config.RMSPROP_MOMENTUM,
204 | epsilon=Config.RMSPROP_EPSILON)
205 |
206 | if Config.USE_GRAD_CLIP:
207 | if Config.DUAL_RMSPROP:
208 | self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v)
209 | self.opt_grad_v_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM),v)
210 | for g,v in self.opt_grad_v if not g is None]
211 | self.train_op_v = self.opt_v.apply_gradients(self.opt_grad_v_clipped)
212 |
213 | self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p)
214 | self.opt_grad_p_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM),v)
215 | for g,v in self.opt_grad_p if not g is None]
216 | self.train_op_p = self.opt_p.apply_gradients(self.opt_grad_p_clipped)
217 | self.train_op = [self.train_op_p, self.train_op_v]
218 | else:
219 | self.opt_grad = self.opt.compute_gradients(self.cost_all)
220 | self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM),v) for g,v in self.opt_grad]
221 | self.train_op = self.opt.apply_gradients(self.opt_grad_clipped)
222 | else:
223 | if Config.DUAL_RMSPROP:
224 |                 self.train_op_v = self.opt_v.minimize(self.cost_v, global_step=self.global_step)
225 |                 self.train_op_p = self.opt_p.minimize(self.cost_p, global_step=self.global_step)
226 | self.train_op = [self.train_op_p, self.train_op_v]
227 | else:
228 | self.train_op = self.opt.minimize(self.cost_all, global_step=self.global_step)
229 |
230 |
231 | def _create_tensor_board(self):
232 | summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
233 | summaries.append(tf.summary.scalar("Pcost_advantage", self.cost_p_1_agg))
234 | summaries.append(tf.summary.scalar("Pcost_entropy", self.cost_p_2_agg))
235 | summaries.append(tf.summary.scalar("Pcost", self.cost_p))
236 | summaries.append(tf.summary.scalar("Vcost", self.cost_v))
237 | summaries.append(tf.summary.scalar("D1_loss", self.d1_loss))
238 | summaries.append(tf.summary.scalar("D2_loss", self.d2_loss))
239 | summaries.append(tf.summary.scalar("LearningRate", self.var_learning_rate))
240 | summaries.append(tf.summary.scalar("Beta", self.var_beta))
241 | for var in tf.trainable_variables():
242 | summaries.append(tf.summary.histogram("weights_%s" % var.name, var))
243 |
244 | summaries.append(tf.summary.histogram("activation_n1", self.n1))
245 | summaries.append(tf.summary.histogram("activation_n2", self.n2))
246 | summaries.append(tf.summary.histogram("activation_enc", self.enc_out))
247 | summaries.append(tf.summary.histogram("activation_v", self.logits_v))
248 | summaries.append(tf.summary.histogram("activation_p", self.softmax_p))
249 |
250 | self.summary_op = tf.summary.merge(summaries)
251 | self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name, self.sess.graph)
252 |
253 | def dense_layer(self, input, out_dim, name, func=tf.nn.relu):
254 | in_dim = input.get_shape().as_list()[-1]
255 | d = 1.0 / np.sqrt(in_dim)
256 | with tf.variable_scope(name):
257 | w_init = tf.random_uniform_initializer(-d, d)
258 | b_init = tf.random_uniform_initializer(-d, d)
259 | w = tf.get_variable('w', dtype=tf.float32, shape=[in_dim, out_dim], initializer=w_init)
260 | b = tf.get_variable('b', shape=[out_dim], initializer=b_init)
261 |
262 | output = tf.matmul(input, w) + b
263 | if func is not None:
264 | output = func(output)
265 |
266 | return output
267 |
268 | def conv2d_layer(self, input, filter_size, out_dim, name, strides, func=tf.nn.relu):
269 | in_dim = input.get_shape().as_list()[-1]
270 | d = 1.0 / np.sqrt(filter_size * filter_size * in_dim)
271 | with tf.variable_scope(name):
272 | w_init = tf.random_uniform_initializer(-d, d)
273 | b_init = tf.random_uniform_initializer(-d, d)
274 | w = tf.get_variable('w',
275 | shape=[filter_size, filter_size, in_dim, out_dim],
276 | dtype=tf.float32,
277 | initializer=w_init)
278 | b = tf.get_variable('b', shape=[out_dim], initializer=b_init)
279 |
280 | output = tf.nn.conv2d(input, w, strides=strides, padding='SAME') + b
281 | if func is not None:
282 | output = func(output)
283 |
284 | return output
285 |
286 | def __get_base_feed_dict(self):
287 | return {self.var_beta: self.beta, self.var_learning_rate: self.learning_rate}
288 |
289 | def get_global_step(self):
290 | step = self.sess.run(self.global_step)
291 | return step
292 |
293 | def predict_single(self, x):
294 | return self.predict_p(x[None, :])[0]
295 |
296 | def predict_v(self, x):
297 | prediction = self.sess.run(self.logits_v, feed_dict={self.x: x})
298 | return prediction
299 |
300 | def predict_p(self, x):
301 | prediction = self.sess.run(self.softmax_p, feed_dict={self.x: x})
302 | return prediction
303 |
304 | def predict_p_and_v_and_d(self, x, c_batch, h_batch):
305 | batch_size = x.shape[0]
306 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x)
307 | feed_dict={self.x: im, self.seq_len: 1, self.p_rewards: p_reward,
308 | self.aux_inp: np.concatenate((vel, p_action), axis=1)}
309 |
310 | # shape of c/h_batch: (batch_size, Config.NUM_LSTMS, 256)
311 | for i in range(Config.NUM_LSTMS):
312 | c = c_batch[:,i,:] if i == 1 else c_batch[:,i,:64]
313 | h = h_batch[:,i,:] if i == 1 else h_batch[:,i,:64]
314 | feed_dict.update({self.state_in[i]: (c, h)})
315 |
316 | p, v, d, lstm_out = self.sess.run([self.softmax_p, self.logits_v,
317 | self.d1_logits, self.state_out], feed_dict=feed_dict)
318 |
319 | # reshape lstm_out(c/h) to: (batch_size, Config.NUM_LSTMS, 256)
320 | c = np.zeros((batch_size, Config.NUM_LSTMS, 256),
321 | dtype=np.float32)
322 |
323 | h = np.zeros((batch_size, Config.NUM_LSTMS, 256),
324 | dtype=np.float32)
325 |
326 | for i in range(Config.NUM_LSTMS):
327 | if i == 0:
328 | c[:,i,:64] = lstm_out[i][0]
329 | h[:,i,:64] = lstm_out[i][1]
330 | else:
331 | c[:,i,:] = lstm_out[i][0]
332 | h[:,i,:] = lstm_out[i][1]
333 |
334 | d = np.array(d).transpose(1, 0, 2)
335 | return p, v, d, c, h
336 |
337 | def disentangle_obs(self, states):
338 | """
339 |         Each observation in `states` is a flat concatenation of the image, depth map,
340 |         velocity vector, previous action, and previous reward. This function splits it back into those components.
341 | """
342 |
343 | batch_size = states.shape[0]
344 | im_size = Config.IMAGE_HEIGHT*Config.IMAGE_WIDTH*Config.IMAGE_DEPTH
345 | im = states[:, :im_size]
346 | im = np.reshape(im, (batch_size, Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.IMAGE_DEPTH))
347 | states = states[:, im_size:]
348 |
349 | dm_size = Config.DEPTH_PIXELS
350 | dm_val = states[:, :dm_size].astype(int)
351 | states = states[:, dm_size:]
352 |
353 | depth_map = np.zeros((dm_size, batch_size, Config.DEPTH_QUANTIZATION))
354 | for i in range(dm_size):
355 | depth_map[i, np.arange(batch_size), dm_val[:,i].astype(int)] = 1 # make one-hot
356 |
357 | vl_size = Config.VEL_DIM
358 | vel = states[:, :vl_size]
359 | states = states[:, vl_size:]
360 |
361 | assert states.shape[1] == 2, "Missed something ?!"
362 | p_action = np.zeros((batch_size, self.num_actions))
363 | p_action[np.arange(batch_size), states[:,0].astype(int)] = 1 # make one-hot
364 | p_reward = states[:, 1]
365 | p_reward = np.reshape(p_reward, (batch_size, 1))
366 |
367 | # return (batch_size, ...) arrays
368 | return im, depth_map, vel, p_action, p_reward
369 |
370 | def train(self, x, y_r, a, c, h, trainer_id):
371 | feed_dict = self.__get_base_feed_dict()
372 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x)
373 | feed_dict.update({self.x: im, self.y_r: y_r, self.action_index: a,
374 | self.seq_len: int(Config.TIME_MAX), self.p_rewards: p_reward,
375 | self.aux_inp: np.concatenate((vel, p_action), axis=1)})
376 |
377 | # depth supervision
378 | feed_dict.update({self.depth_labels[i]:depth_map[i] for i in
379 | range(Config.DEPTH_PIXELS)})
380 |
381 | for i in range(Config.NUM_LSTMS):
382 | cb = np.array(c[i]).reshape((-1, 256))
383 | hb = np.array(h[i]).reshape((-1, 256))
384 | if i == 0:
385 | cb = cb[:,:64]
386 | hb = hb[:,:64]
387 |
388 | feed_dict.update({self.state_in[i]: (cb, hb)})
389 |
390 | self.sess.run(self.train_op, feed_dict=feed_dict)
391 |
392 | def log(self, x, y_r, a, c, h):
393 | feed_dict = self.__get_base_feed_dict()
394 | im, depth_map, vel, p_action, p_reward = self.disentangle_obs(x)
395 |
396 | feed_dict.update({self.x: im, self.y_r: y_r, self.action_index: a,
397 | self.seq_len: int(Config.TIME_MAX), self.p_rewards: p_reward,
398 | self.aux_inp: np.concatenate((vel, p_action), axis=1)})
399 |
400 | # depth supervision
401 | feed_dict.update({self.depth_labels[i]:depth_map[i] for i in
402 | range(Config.DEPTH_PIXELS)})
403 |
404 | for i in range(Config.NUM_LSTMS):
405 | cb = np.array(c[i]).reshape((-1, 256))
406 | hb = np.array(h[i]).reshape((-1, 256))
407 | if i == 0:
408 | cb = cb[:,:64]
409 | hb = hb[:,:64]
410 |
411 | feed_dict.update({self.state_in[i]: (cb, hb)})
412 |
413 | step, summary = self.sess.run([self.global_step, self.summary_op], feed_dict=feed_dict)
414 | self.log_writer.add_summary(summary, step)
415 |
416 | def _checkpoint_filename(self, episode):
417 | return 'checkpoints/%s_%08d' % (self.model_name, episode)
418 |
419 | def _get_episode_from_filename(self, filename):
420 | # TODO: hacky way of getting the episode. ideally episode should be stored as a TF variable
421 | return int(re.split('/|_|\.', filename)[2])
422 |
423 | def save(self, episode):
424 | self.saver.save(self.sess, self._checkpoint_filename(episode))
425 |
426 | def load(self):
427 | filename = tf.train.latest_checkpoint(os.path.dirname(self._checkpoint_filename(episode=0)))
428 | if Config.LOAD_EPISODE > 0:
429 | filename = self._checkpoint_filename(Config.LOAD_EPISODE)
430 | self.saver.restore(self.sess, filename)
431 | return self._get_episode_from_filename(filename)
432 |
433 | def get_variables_names(self):
434 | return [var.name for var in self.graph.get_collection('trainable_variables')]
435 |
436 | def get_variable_value(self, name):
437 | return self.sess.run(self.graph.get_tensor_by_name(name))
438 |
--------------------------------------------------------------------------------
/ProcessAgent.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | from datetime import datetime
28 | from multiprocessing import Process, Queue, Value
29 |
30 | import numpy as np
31 | import sys, time
32 |
33 | from Config import Config
34 | from Environment import Environment
35 | from Experience import Experience
36 |
37 |
38 | class ProcessAgent(Process):
39 | def __init__(self, id, prediction_q, training_q, episode_log_q, dm):
40 | super(ProcessAgent, self).__init__()
41 |
42 | self.id = id
43 | self.prediction_q = prediction_q
44 | self.training_q = training_q
45 | self.episode_log_q = episode_log_q
46 |
47 | self.env = Environment()
48 | self.num_actions = self.env.get_num_actions()
49 | self.actions = np.arange(self.num_actions)
50 |
51 | self.discount_factor = Config.DISCOUNT
52 | # one frame at a time
53 | self.wait_q = Queue(maxsize=1)
54 | self.exit_flag = Value('i', 0)
55 | self.display_manager = dm
56 |
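    |     # A note on _accumulate_rewards below: it converts the per-step rewards stored in
    |     # `experiences` into discounted n-step returns, R_t = r_t + discount * R_{t+1}.
    |     # While the episode is still running, the critic's value estimate bootstraps the tail
    |     # and the last (incomplete) experience is dropped; once the episode terminates,
    |     # the whole trajectory is used.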
57 | @staticmethod
58 | def _accumulate_rewards(experiences, discount_factor, value, is_running):
59 | if is_running:
60 | reward_sum = value # terminal reward
61 | for t in reversed(range(0, len(experiences)-1)):
62 | r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX) if Config.REWARD_CLIPPING else experiences[t].reward
63 | reward_sum = discount_factor * reward_sum + r
64 | experiences[t].reward = reward_sum
65 | return experiences[:-1]
66 | # if the episode has terminated, we take the full trajectory into
67 | # account, including the very last experience
68 | else:
69 | reward_sum = 0
70 | for t in reversed(range(0, len(experiences))):
71 | r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX) if Config.REWARD_CLIPPING else experiences[t].reward
72 | reward_sum = discount_factor * reward_sum + r
73 | experiences[t].reward = reward_sum
74 | return experiences
75 |
76 | def convert_data(self, experiences):
77 | x_ = np.array([exp.state for exp in experiences])
78 | a_ = np.eye(self.num_actions)[np.array([exp.action for exp in experiences])].astype(np.float32)
79 | r_ = np.array([exp.reward for exp in experiences])
80 | return x_, r_, a_
81 |
82 | def predict(self, state, lstm_inputs):
83 | # put the state in the prediction q
84 |
85 | # lstm_inputs: [dict{stacklayer1}, dict{stacklayer2}, ...]
86 | c_state = np.array([lstm['c'] for lstm in lstm_inputs]) if len(lstm_inputs) else None
87 | h_state = np.array([lstm['h'] for lstm in lstm_inputs]) if len(lstm_inputs) else None
88 | self.prediction_q.put((self.id, state, c_state, h_state))
89 | # wait for the prediction to come back
90 | p, v, d, c_state, h_state = self.wait_q.get()
91 |
92 | if not len(lstm_inputs):
93 | return p, v, d, []
94 |
95 | # convert return back to form: [dict{stack-layer1}, dict{stack-layer2}, ...]
96 | l = [{'c':c_state[i], 'h':h_state[i]} for i in range(c_state.shape[0])]
97 | return p, v, d, l
98 |
99 | def select_action(self, prediction):
100 | if Config.PLAY_MODE:
101 | action = np.argmax(prediction)
102 | else:
103 | action = np.random.choice(self.actions, p=prediction)
104 | return action
105 |
106 | def run_episode(self):
107 | self.env.reset()
108 | is_running = True
109 | experiences = []
110 |
111 | time_count = 0
112 | reward_sum = 0.0
113 |
114 | # input states for prediction
115 | lstm_input_p = [{'c':np.zeros(256, dtype=np.float32),
116 | 'h':np.zeros(256, dtype=np.float32)}]*Config.NUM_LSTMS
117 |
118 | # input states for training
119 | lstm_input_t = [{'c':np.zeros(256, dtype=np.float32),
120 | 'h':np.zeros(256, dtype=np.float32)}]*Config.NUM_LSTMS
121 |
122 | while is_running:
123 |
124 | # very first few frames
125 | if self.env.current_state is None:
126 | _ , is_running = self.env.step(-1) # NOOP
127 | assert(is_running)
128 | continue
129 |
130 | prediction, value, depth, lstm_input_p = self.predict(self.env.current_state, lstm_input_p)
131 |
132 | if Config.PLAY_MODE:
133 | self.display_manager.update(self.env.current_state, prediction, value, depth)
134 |
135 | action = self.select_action(prediction)
136 | reward, is_running = self.env.step(action)
137 |
138 | reward_sum += reward
139 | exp = Experience(self.env.previous_state, action, prediction, reward)
140 | experiences.append(exp)
141 |
142 | if not is_running or time_count == int(Config.TIME_MAX):
143 | updated_exps = ProcessAgent._accumulate_rewards(experiences, self.discount_factor, value, is_running)
144 | x_, r_, a_ = self.convert_data(updated_exps)
145 | yield x_, r_, a_, lstm_input_t, reward_sum, time_count
146 |
147 | # lstm input state for next training step
148 | lstm_input_t = lstm_input_p
149 |
150 | # reset the tmax count
151 | time_count = 0
152 | # keep the last experience for the next batch
153 | experiences = [experiences[-1]]
154 | reward_sum = 0.0
155 |
156 | time_count += 1
157 |
158 | def run(self):
159 |         # Randomly sleep up to 1 second; this helps the agents start up smoothly.
160 | time.sleep(np.random.rand())
161 | np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))
162 | total_steps = 0
163 |
164 |         while total_steps < Config.MAX_STEPS and self.exit_flag.value == 0:
165 | total_reward = 0
166 | total_length = 0
167 | for x_, r_, a_, lstm_, reward_sum, steps in self.run_episode():
168 | total_steps += steps
169 | total_reward += reward_sum
170 | total_length += len(r_) + 1 # +1 for last frame that we drop
171 | self.training_q.put((x_, r_, a_, lstm_))
172 | self.episode_log_q.put((datetime.now(), total_reward, total_length,
173 | total_steps))
174 |
--------------------------------------------------------------------------------
/ProcessStats.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | import sys
28 | if sys.version_info >= (3,0):
29 | from queue import Queue as queueQueue
30 | else:
31 | from Queue import Queue as queueQueue
32 |
33 | from datetime import datetime
34 | from multiprocessing import Process, Queue, Value
35 |
36 | import numpy as np
37 | import time
38 |
39 | from Config import Config
40 |
41 |
42 | class ProcessStats(Process):
43 | def __init__(self):
44 | super(ProcessStats, self).__init__()
45 | self.episode_log_q = Queue(maxsize=100)
46 | self.episode_count = Value('i', 0)
47 | self.training_count = Value('i', 0)
48 | self.should_save_model = Value('i', 0)
49 | self.trainer_count = Value('i', 0)
50 | self.predictor_count = Value('i', 0)
51 | self.agent_count = Value('i', 0)
52 | self.total_frame_count = 0
53 |
54 | def FPS(self):
55 | # average FPS from the beginning of the training (not current FPS)
56 | return np.ceil(self.total_frame_count / (time.time() - self.start_time))
57 |
58 | def TPS(self):
59 | # average TPS from the beginning of the training (not current TPS)
60 | return np.ceil(self.training_count.value / (time.time() - self.start_time))
61 |
62 | def run(self):
63 | with open(Config.RESULTS_FILENAME, 'a') as results_logger:
64 | rolling_frame_count = 0
65 | rolling_reward = 0
66 | results_q = queueQueue(maxsize=Config.STAT_ROLLING_MEAN_WINDOW)
67 |
68 | self.start_time = time.time()
69 | first_time = datetime.now()
70 | while True:
71 | episode_time, reward, length, steps = self.episode_log_q.get()
72 | results_logger.write('%s, %d, %d\n' % (episode_time.strftime("%Y-%m-%d %H:%M:%S"), reward, length))
73 | results_logger.flush()
74 |
75 | self.total_frame_count += length
76 | self.episode_count.value += 1
77 |
78 | rolling_frame_count += length
79 | rolling_reward += reward
80 |
81 | if results_q.full():
82 | old_episode_time, old_reward, old_length = results_q.get()
83 | rolling_frame_count -= old_length
84 | rolling_reward -= old_reward
85 | first_time = old_episode_time
86 |
87 | results_q.put((episode_time, reward, length))
88 |
89 | if self.episode_count.value % Config.SAVE_FREQUENCY == 0:
90 | self.should_save_model.value = 1
91 |
92 | if self.episode_count.value % Config.PRINT_STATS_FREQUENCY == 0:
93 | print(
94 | '[Time: %8d] '
95 | '[Steps: %8d] '
96 | '[Episode: %8d Score: %10.4f] '
97 | '[RScore: %10.4f RPPS: %5d] '
98 | '[PPS: %5d TPS: %5d] '
99 | '[NT: %2d NP: %2d NA: %2d]'
100 | % (int(time.time()-self.start_time),
101 | steps,
102 | self.episode_count.value, reward,
103 | rolling_reward / results_q.qsize(),
104 | rolling_frame_count / (datetime.now() - first_time).total_seconds(),
105 | self.FPS(), self.TPS(),
106 | self.trainer_count.value, self.predictor_count.value, self.agent_count.value))
107 | sys.stdout.flush()
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Maze Navigation using Reinforcement Learning
2 |
3 | ## Description
4 |
5 | TensorFlow implementation of ideas in the DeepMind paper ["Learning to Navigate in Complex Environments"](https://arxiv.org/abs/1611.03673). The baseline architecture is GPU-based A3C from the paper ["Reinforcement Learning through Asynchronous Advantage Actor-Critic on a GPU"](https://openreview.net/forum?id=r1VGvBcxl). Full [report](./assets/DeepNav_final.pdf).
6 |
7 | ## Requirements
8 |
9 | * TensorFlow 1.0
10 | * DeepMind Lab
11 | * Python packages: numpy, cv2 (OpenCV), pygame
12 |
13 | ## Getting Started
14 |
15 | Download and install DeepMind Lab
16 | ```
17 | $ git clone https://github.com/deepmind/lab.git
18 | ```
19 | Build it following the [build instructions](https://github.com/deepmind/lab/blob/master/docs/build.md)
20 |
21 | Clone this repo **inside** the lab directory
22 | ```
23 | $ cd lab
24 | $ git clone https://github.com/tgangwani/GA3C-DeepNavigation.git
25 | ```
26 | Add the following bazel rule at the end of the lab/BUILD file
27 |
28 | ```
29 | py_binary(
30 | name = "GA3C-DeepNavigation_train",
31 | srcs = ["GA3C-DeepNavigation/GA3C.py"],
32 | data = [":deepmind_lab.so"],
33 | main = "GA3C-DeepNavigation/GA3C.py",
34 | )
35 | ```
36 |
37 | Then launch the agent with the following bazel command
38 | ```
39 | $ bazel run :GA3C-DeepNavigation_train --define headless=osmesa
40 | ```
41 | Use ```PLAY_MODE=False``` in Config.py for training. Setting ```PLAY_MODE=True```
42 | loads model parameters from a checkpoint and runs a single agent; a display is
43 | expected. (A configuration sketch is given at the end of this README.)
44 |
45 | ## Network
46 | The neural net architecture is the same as in the paper, **except for the
47 | loop-closure prediction loss**, which is not implemented here.
48 |
49 | ![Network architecture](./assets/nn.png)
50 |
51 | ## Results
52 |
53 | ### 1. Nav_maze_static_01
54 |
55 | #### Live Agent
56 | [YouTube](https://www.youtube.com/watch?v=vyS0Z7wdHHs)
57 |
58 | #### Learning Curve
59 | ![nav_maze_static_01 learning curve](./assets/nav_maze_static_01_score.png)
60 |
61 | ### 2. Stairway to melon
62 |
63 | #### Live Agent
64 | [YouTube](https://www.youtube.com/watch?v=0R5MGM7VPo4)
65 |
66 | ![stairway_to_melon learning curve](./assets/stairway_to_melon_score.png)
67 |
68 |
69 |
70 | ## Acknowledgement
71 | [Unreal code by miyosuda](https://github.com/miyosuda/unreal)
72 |
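73 | ## Configuration sketch
74 | 
75 | For reference, a minimal sketch of how the training/play toggles in Config.py
76 | might be set. The flag names (```PLAY_MODE```, ```TRAIN_MODELS```,
77 | ```LOAD_CHECKPOINT```) are the ones used elsewhere in this repo; the rest of
78 | the Config class is elided, and the values shown are only an example.
79 | 
80 | ```
81 | # Training from scratch
82 | PLAY_MODE = False        # run the multi-agent training server
83 | TRAIN_MODELS = True      # let the trainer threads apply updates
84 | LOAD_CHECKPOINT = False  # start from freshly initialized weights
85 | 
86 | # Watching a trained agent (a display is required)
87 | # PLAY_MODE = True       # single agent, no training
88 | # LOAD_CHECKPOINT = True # restore weights from a saved checkpoint
89 | ```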
--------------------------------------------------------------------------------
/Server.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | from multiprocessing import Queue
28 |
29 | import time
30 |
31 | from Config import Config
32 | from Environment import Environment
33 | from NetworkVP import NetworkVP
34 | from ProcessAgent import ProcessAgent
35 | from ProcessStats import ProcessStats
36 | from ThreadDynamicAdjustment import ThreadDynamicAdjustment
37 | from ThreadPredictor import ThreadPredictor
38 | from ThreadTrainer import ThreadTrainer
39 | from Display import Display
40 |
41 | class Server:
42 | def __init__(self):
43 | self.stats = ProcessStats()
44 |
45 | self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
46 | self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
47 |
48 | self.model = NetworkVP(Config.DEVICE, Config.NETWORK_NAME, Environment().get_num_actions())
49 | if Config.LOAD_CHECKPOINT:
50 | self.stats.episode_count.value = self.model.load()
51 |
52 | self.training_step = 0
53 | self.frame_counter = 0
54 |
55 | self.agents = []
56 | self.predictors = []
57 | self.trainers = []
58 | self.dynamic_adjustment = ThreadDynamicAdjustment(self)
59 | self.display_manager = Display()
60 |
61 | def add_agent(self):
62 | self.agents.append(
63 | ProcessAgent(len(self.agents), self.prediction_q, self.training_q, self.stats.episode_log_q, self.display_manager))
64 | self.agents[-1].start()
65 |
66 | def remove_agent(self):
67 | self.agents[-1].exit_flag.value = True
68 | self.agents[-1].join()
69 | self.agents.pop()
70 |
71 | def add_predictor(self):
72 | self.predictors.append(ThreadPredictor(self, len(self.predictors)))
73 | self.predictors[-1].start()
74 |
75 | def remove_predictor(self):
76 | self.predictors[-1].exit_flag = True
77 | self.predictors[-1].join()
78 | self.predictors.pop()
79 |
80 | def add_trainer(self):
81 | self.trainers.append(ThreadTrainer(self, len(self.trainers)))
82 | self.trainers[-1].start()
83 |
84 | def remove_trainer(self):
85 | self.trainers[-1].exit_flag = True
86 | self.trainers[-1].join()
87 | self.trainers.pop()
88 |
89 | def train_model(self, x_, r_, a_, c_, h_, trainer_id):
90 | self.model.train(x_, r_, a_, c_, h_, trainer_id)
91 | self.training_step += 1
92 | self.frame_counter += x_.shape[0]
93 |
94 | self.stats.training_count.value += 1
95 | self.dynamic_adjustment.temporal_training_count += 1
96 |
97 | if Config.TENSORBOARD and self.stats.training_count.value % Config.TENSORBOARD_UPDATE_FREQUENCY == 0:
98 | self.model.log(x_, r_, a_, c_, h_)
99 |
100 | def save_model(self):
101 | self.model.save(self.stats.episode_count.value)
102 |
103 | def main(self):
104 | self.stats.start()
105 | self.dynamic_adjustment.start()
106 |
107 | if Config.PLAY_MODE:
108 | for trainer in self.trainers:
109 | trainer.enabled = False
110 |
111 | learning_rate_multiplier = (
112 | Config.LEARNING_RATE_END - Config.LEARNING_RATE_START) / Config.ANNEALING_EPISODE_COUNT
113 | beta_multiplier = (Config.BETA_END - Config.BETA_START) / Config.ANNEALING_EPISODE_COUNT
114 |
115 | while self.stats.episode_count.value < Config.EPISODES:
116 | step = min(self.stats.episode_count.value, Config.ANNEALING_EPISODE_COUNT - 1)
117 | self.model.learning_rate = Config.LEARNING_RATE_START + learning_rate_multiplier * step
118 | self.model.beta = Config.BETA_START + beta_multiplier * step
119 |
120 | # Saving is async - even if we start saving at a given episode, we may save the model at a later episode
121 | if Config.SAVE_MODELS and self.stats.should_save_model.value > 0:
122 | self.save_model()
123 | self.stats.should_save_model.value = 0
124 |
125 | time.sleep(0.01)
126 |
127 | self.dynamic_adjustment.exit_flag = True
128 | while self.agents:
129 | self.remove_agent()
130 | while self.predictors:
131 | self.remove_predictor()
132 | while self.trainers:
133 | self.remove_trainer()
134 |
--------------------------------------------------------------------------------
/ThreadDynamicAdjustment.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | from threading import Thread
28 |
29 | import numpy as np
30 | import time
31 |
32 | from Config import Config
33 |
34 |
35 | class ThreadDynamicAdjustment(Thread):
36 | def __init__(self, server):
37 | super(ThreadDynamicAdjustment, self).__init__()
38 | self.setDaemon(True)
39 |
40 | self.server = server
41 | self.enabled = Config.DYNAMIC_SETTINGS
42 |
43 | self.trainer_count = Config.TRAINERS
44 | self.predictor_count = Config.PREDICTORS
45 | self.agent_count = Config.AGENTS
46 |
47 | self.temporal_training_count = 0
48 | self.exit_flag = False
49 |
50 | def enable_disable_components(self):
51 | cur_len = len(self.server.trainers)
52 | if cur_len < self.trainer_count:
53 | for _ in np.arange(cur_len, self.trainer_count):
54 | self.server.add_trainer()
55 | elif cur_len > self.trainer_count:
56 | for _ in np.arange(self.trainer_count, cur_len):
57 | self.server.remove_trainer()
58 |
59 | cur_len = len(self.server.predictors)
60 | if cur_len < self.predictor_count:
61 | for _ in np.arange(cur_len, self.predictor_count):
62 | self.server.add_predictor()
63 | elif cur_len > self.predictor_count:
64 | for _ in np.arange(self.predictor_count, cur_len):
65 | self.server.remove_predictor()
66 |
67 | cur_len = len(self.server.agents)
68 | if cur_len < self.agent_count:
69 | for _ in np.arange(cur_len, self.agent_count):
70 | self.server.add_agent()
71 | elif cur_len > self.agent_count:
72 | for _ in np.arange(self.agent_count, cur_len):
73 | self.server.remove_agent()
74 |
75 | def random_walk(self):
76 | # 3 directions, 1 for Trainers, 1 for Predictors and 1 for Agents
77 |         # 3 outcomes for each: -1 adds one, 0 leaves the count unchanged, +1 removes one (the counts below are decremented by the sampled direction)
78 | direction = np.random.randint(3, size=3) - 1
79 | self.trainer_count = max(1, self.trainer_count - direction[0])
80 | self.predictor_count = max(1, self.predictor_count - direction[1])
81 | self.agent_count = max(1, self.agent_count - direction[2])
82 |
83 | def update_stats(self):
84 | self.server.stats.trainer_count.value = self.trainer_count
85 | self.server.stats.predictor_count.value = self.predictor_count
86 | self.server.stats.agent_count.value = self.agent_count
87 |
88 | def run(self):
89 | self.enable_disable_components()
90 | self.update_stats()
91 |
92 | if not self.enabled:
93 | return
94 |
95 | # Wait for initialization
96 | time.sleep(Config.DYNAMIC_SETTINGS_INITIAL_WAIT)
97 |
98 | while not self.exit_flag:
99 | old_trainer_count, old_predictor_count, old_agent_count = \
100 | self.trainer_count, self.predictor_count, self.agent_count
101 | self.random_walk()
102 |
103 | # If no change, do nothing
104 | if self.trainer_count == old_trainer_count \
105 | and self.predictor_count == old_predictor_count \
106 | and self.agent_count == old_agent_count:
107 | continue
108 |
109 | old_count = self.temporal_training_count
110 | self.enable_disable_components()
111 |
112 | self.temporal_training_count = 0
113 | time.sleep(Config.DYNAMIC_SETTINGS_STEP_WAIT)
114 |
115 | cur_count = self.temporal_training_count
116 | # if it didn't work, revert the changes
117 | if cur_count < old_count:
118 | self.trainer_count, self.predictor_count, self.agent_count = \
119 | old_trainer_count, old_predictor_count, old_agent_count
120 |
121 | self.update_stats()
122 |
--------------------------------------------------------------------------------
/ThreadPredictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | from threading import Thread
28 |
29 | import numpy as np
30 |
31 | from Config import Config
32 |
33 |
34 | class ThreadPredictor(Thread):
35 | def __init__(self, server, id):
36 | super(ThreadPredictor, self).__init__()
37 | self.setDaemon(True)
38 |
39 | self.id = id
40 | self.server = server
41 | self.exit_flag = False
42 |
43 | def run(self):
44 | ids = np.zeros(Config.PREDICTION_BATCH_SIZE, dtype=np.uint16)
45 | states = np.zeros((Config.PREDICTION_BATCH_SIZE,
46 | Config.COMBINED_STATE_SIZE), dtype=np.float32)
47 |
48 | cs = np.zeros((Config.PREDICTION_BATCH_SIZE, Config.NUM_LSTMS, 256),
49 | dtype=np.float32) if Config.NUM_LSTMS else [None]*Config.PREDICTION_BATCH_SIZE
50 |
51 | hs = np.zeros((Config.PREDICTION_BATCH_SIZE, Config.NUM_LSTMS, 256),
52 | dtype=np.float32) if Config.NUM_LSTMS else [None]*Config.PREDICTION_BATCH_SIZE
53 |
54 | while not self.exit_flag:
55 | ids[0], states[0], cs[0], hs[0] = self.server.prediction_q.get()
56 |
57 | size = 1
58 | while size < Config.PREDICTION_BATCH_SIZE and not self.server.prediction_q.empty():
59 | ids[size], states[size], cs[size], hs[size] = self.server.prediction_q.get()
60 | size += 1
61 |
62 | batch = states[:size]
63 | cb = cs[:size]
64 | hb = hs[:size]
65 | p, v, d, c, h = self.server.model.predict_p_and_v_and_d(batch, cb, hb)
66 |
67 | for i in range(size):
68 | if ids[i] < len(self.server.agents):
69 | if Config.NUM_LSTMS:
70 | assert c[i].shape == (Config.NUM_LSTMS, 256)
71 | assert h[i].shape == (Config.NUM_LSTMS, 256)
72 | self.server.agents[ids[i]].wait_q.put((p[i], v[i], d[i], c[i], h[i]))
73 |
--------------------------------------------------------------------------------
/ThreadTrainer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Redistribution and use in source and binary forms, with or without
4 | # modification, are permitted provided that the following conditions
5 | # are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its
12 | # contributors may be used to endorse or promote products derived
13 | # from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | from threading import Thread
28 | import numpy as np
29 |
30 | from Config import Config
31 |
32 |
33 | class ThreadTrainer(Thread):
34 | def __init__(self, server, id):
35 | super(ThreadTrainer, self).__init__()
36 | self.setDaemon(True)
37 |
38 | self.id = id
39 | self.server = server
40 | self.exit_flag = False
41 |
42 | @staticmethod
43 | def dynamic_pad(x, r, a):
44 | size = int(Config.TIME_MAX) # required size
45 | z = np.zeros((size-len(x),) + x.shape[1:])
46 | x = np.append(x, z, axis=0)
47 | z = np.zeros((size-len(r),) + r.shape[1:])
48 | r = np.append(r, z, axis=0)
49 | z = np.zeros((size-len(a),) + a.shape[1:])
50 | a = np.append(a, z, axis=0)
51 | assert len(x) == size
52 | return x, r, a
53 |
54 | def run(self):
55 | while not self.exit_flag:
56 | batch_size = 0
57 | c__ = []; h__ = [] # lstm hidden states
58 | while batch_size <= Config.TRAINING_MIN_BATCH_SIZE:
59 | x_, r_, a_, lstm_ = self.server.training_q.get()
60 |
61 | # when using LSTMs, the recurrence is over the TIME_MAX length
62 | # trajectory from each agent. Use padding for trajectories of
63 | # length < TIME_MAX
64 | if Config.NUM_LSTMS and x_.shape[0] != int(Config.TIME_MAX):
65 | x_, r_, a_ = ThreadTrainer.dynamic_pad(x_, r_, a_)
66 |
67 | if batch_size == 0:
68 | x__ = x_; r__ = r_; a__ = a_
69 |
70 | if len(lstm_):
71 | c__ = []; h__ = []
72 | for i in range(Config.NUM_LSTMS):
73 | c__.append(lstm_[i]['c'])
74 | h__.append(lstm_[i]['h'])
75 |
76 | else:
77 | x__ = np.concatenate((x__, x_))
78 | r__ = np.concatenate((r__, r_))
79 | a__ = np.concatenate((a__, a_))
80 |
81 | if len(lstm_):
82 | for i in range(Config.NUM_LSTMS):
83 | c__[i] = np.concatenate((c__[i], lstm_[i]['c']))
84 | h__[i] = np.concatenate((h__[i], lstm_[i]['h']))
85 |
86 | batch_size += x_.shape[0]
87 |
88 | if Config.TRAIN_MODELS:
89 | self.server.train_model(x__, r__, a__, c__, h__, self.id)
90 |
--------------------------------------------------------------------------------
/assets/DeepNav_final.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/DeepNav_final.pdf
--------------------------------------------------------------------------------
/assets/nav_maze_static_01_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/nav_maze_static_01_score.png
--------------------------------------------------------------------------------
/assets/nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/nn.png
--------------------------------------------------------------------------------
/assets/stairway_to_melon_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgangwani/GA3C-DeepNavigation/32aae2f806479bff6aff4d61894313d26b2251a6/assets/stairway_to_melon_score.png
--------------------------------------------------------------------------------