├── Alpha_Zero
│   ├── MCTS_Basics.py
│   └── alpha_go_zero
│       ├── 6-6-4-pie-0.mypolicy
│       ├── 6-6-4-pie.policy
│       ├── ConnectN.py
│       ├── MCTS.py
│       ├── Play.py
│       ├── alphazero-TicTacToe-advanced.ipynb
│       ├── alphazero-TicTacToe.ipynb
│       └── playground.py
├── P1_Navigation
│   ├── Future_Improvements.md
│   ├── Navigation_Final.ipynb
│   ├── Readme.md
│   ├── ddqn_checkpoint.pth
│   ├── dqn_agent.py
│   ├── model.py
│   └── visual_pixels
│       ├── Navigation_Pixels.ipynb
│       ├── pixel_dqn_agent.py
│       └── pixel_model.py
├── P2_Continuous_Actions
│   ├── Continuous_Control_UdacityWorkspace.ipynb
│   ├── README.md
│   ├── Report.md
│   ├── checkpoint_actor.pth
│   └── checkpoint_critic.pth
├── P3_Collab_Compete
│   ├── Future_Improvements.md
│   ├── README.md
│   ├── Tennis_Udacity_Workspace.ipynb
│   ├── checkpoint_actor_local_0.pth
│   ├── checkpoint_actor_local_1.pth
│   ├── checkpoint_critic_local_0.pth
│   ├── checkpoint_critic_local_1.pth
│   └── workspace_utils.py
└── README.md

/Alpha_Zero/MCTS_Basics.py:
--------------------------------------------------------------------------------
1 | ################################# MCTS #######################################
2 | 
3 | # Version 1 (mcts.ai)
4 | 
5 | from math import *
6 | import random
7 | 
8 | # This is a very simple implementation of the UCT Monte Carlo Tree Search algorithm in Python 2.7.
9 | # The function UCT(rootstate, itermax, verbose = False) is towards the bottom of the code.
10 | # It aims to have the clearest and simplest possible code, and for the sake of clarity, the code
11 | # is orders of magnitude less efficient than it could be made, particularly by using a
12 | # state.GetRandomMove() or state.DoRandomRollout() function.
13 | #
14 | # Example GameState classes for Nim, OXO and Othello are included to give some idea of how you
15 | # can write your own GameState and use UCT in your 2-player game. Change the game to be played in
16 | # the UCTPlayGame() function at the bottom of the code.
17 | #
18 | # Written by Peter Cowling, Ed Powley, Daniel Whitehouse (University of York, UK) September 2012.
19 | #
20 | # Licence is granted to freely use and distribute for any sensible/legal purpose so long as this comment
21 | # remains in any distributed code.
22 | #
23 | # For more information about Monte Carlo Tree Search check out our web site at www.mcts.ai
24 | 
25 | 
26 | class OXOState(object):
27 |     """ A state of the game, i.e. the game board. These are the only functions which are
28 |         absolutely necessary to implement UCT in any 2-player complete information deterministic
29 |         zero-sum game, although they can be enhanced and made quicker, for example by using a
30 |         GetRandomMove() function to generate a random move during rollout.
31 |         By convention the players are numbered 1 and 2.
32 |     """
33 |     """ A state of the game, i.e. the game board.
34 |         Squares in the board are in this arrangement
35 |         012
36 |         345
37 |         678
38 |         where 0 = empty, 1 = player 1 (X), 2 = player 2 (O)
39 |     """
40 |     def __init__(self):
41 |         self.playerJustMoved = 2 # At the root pretend the player just moved is p2 - p1 has the first move
42 |         self.board = [0,0,0,0,0,0,0,0,0] # 0 = empty, 1 = player 1, 2 = player 2
43 | 
44 |     def Clone(self):
45 |         """ Create a deep clone of this game state.
46 |         """
47 |         st = OXOState()
48 |         st.playerJustMoved = self.playerJustMoved
49 |         st.board = self.board[:]
50 |         return st
51 | 
52 |     def DoMove(self, move):
53 |         """ Update a state by carrying out the given move.
54 |             Must update playerJustMoved.
55 |         """
56 |         assert move >= 0 and move <= 8 and move == int(move) and self.board[move] == 0
57 |         self.playerJustMoved = 3 - self.playerJustMoved
58 |         self.board[move] = self.playerJustMoved
59 | 
60 |     def GetMoves(self):
61 |         """ Get all possible moves from this state.
62 |         """
63 |         return [i for i in range(9) if self.board[i] == 0] #empty spots are initialized to 0 (which is printed as '.' by design, see __repr__ function)
64 | 
65 |     def GetResult(self, playerjm):
66 |         """ Get the game result from the viewpoint of playerjm.
67 |         """
68 |         for (x,y,z) in [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]:
69 |             if self.board[x] == self.board[y] == self.board[z]:
70 |                 if self.board[x] == playerjm:
71 |                     return 1.0
72 |                 elif self.board[x] == 3-playerjm:
73 |                     return 0.0
74 |         if self.GetMoves() == []: return 0.5 # draw
75 |         return None #if it comes here then the game is still undecided
76 | 
77 |     def __repr__(self):
78 |         s= ""
79 |         for i in range(9):
80 |             s += ".XO"[self.board[i]]
81 |             if i % 3 == 2: s += "\n"
82 |         return s
83 | 
84 | 
85 | class Node(object):
86 |     """ A node in the game tree. Note wins is always from the viewpoint of playerJustMoved.
87 |         Crashes if state not specified.
88 |     """
89 |     def __init__(self, move = None, parent = None, state = None):
90 |         self.move = move # the move that got us to this node - "None" for the root node
91 |         self.parentNode = parent # "None" for the root node
92 |         self.childNodes = [] #all the explored children
93 |         self.wins = 0
94 |         self.visits = 0
95 |         self.untriedMoves = state.GetMoves() # future child nodes
96 |         self.playerJustMoved = state.playerJustMoved # the only part of the state that the Node needs later
97 | 
98 |     def UCTSelectChild(self):
99 |         """ Use the UCB1 formula to select a child node. Often a constant UCTK is applied so we have
100 |         lambda c: c.wins/c.visits + UCTK * sqrt(2*log(self.visits)/c.visits) to vary the amount of
101 |         exploration versus exploitation.
102 |         """
103 |         s = sorted(self.childNodes, key = lambda c: c.wins/c.visits + sqrt(2*log(self.visits)/c.visits))[-1]
104 |         return s
105 | 
106 |     def AddChild(self, m, s):
107 |         """ Remove m from untriedMoves and add a new child node for this move.
108 |             Return the added child node
109 |         """
110 |         n = Node(move = m, parent = self, state = s)
111 |         self.untriedMoves.remove(m)
112 |         self.childNodes.append(n)
113 |         return n
114 | 
115 |     def Update(self, result):
116 |         """ Update this node - one additional visit and result additional wins. result must be from the viewpoint of playerJustMoved.
117 |         """
118 |         self.visits += 1
119 |         self.wins += result
120 | 
121 |     def __repr__(self):
122 |         return "[M:" + str(self.move) + " W/V:" + str(self.wins) + "/" + str(self.visits) + " U:" + str(self.untriedMoves) + "]"
123 | 
124 |     def TreeToString(self, indent):
125 |         s = self.IndentString(indent) + str(self)
126 |         for c in self.childNodes:
127 |             s += c.TreeToString(indent+1)
128 |         return s
129 | 
130 |     def IndentString(self,indent):
131 |         s = "\n"
132 |         for i in range (1,indent+1):
133 |             s += "| "
134 |         return s
135 | 
136 |     def ChildrenToString(self):
137 |         s = ""
138 |         for c in self.childNodes:
139 |             s += str(c) + "\n"
140 |         return s
141 | 
142 | def UCT(rootstate, itermax, verbose = False):
143 |     """ Conduct a UCT search for itermax iterations starting from rootstate.
144 |         Return the best move from the rootstate.
145 |         Assumes 2 alternating players (player 1 starts), with game results in the range [0.0, 1.0]."""
146 |     #Note: node.childNodes includes all the explored children. Unexplored children are not in it.
147 | 
148 |     if rootstate.GetResult(1) is not None: #it should never come here as the UCT function will not be called if it is
149 |         raise ValueError("Game Has Ended!")
150 | 
151 | 
152 |     rootnode = Node(state = rootstate)
153 |     #node is just a pointer to rootnode. So as node is changed, rootnode is changed too. (before node is assigned to a different object)
154 |     #this is actually how the search tree is built during each iteration.
155 | 
156 |     for i in range(itermax):
157 |         node = rootnode #node is just a pointer to rootnode. So as node is changed, rootnode is changed too. (before node is assigned to a different object)
158 |         state = rootstate.Clone() #state is updated in place when executing the state.DoMove() method below
159 | 
160 |         #print('at beginning', node, '\t', rootnode)
161 |         # Select
162 |         while node.untriedMoves == [] and node.childNodes != [] and state.GetResult(1) is None: # node is fully expanded and non-terminal and game has not yet ended
163 |             node = node.UCTSelectChild() #only updates node and not rootnode
164 |             state.DoMove(node.move) #this updates the variable state in place when executing the state.DoMove() method
165 |         #print('after select', node, '\t', rootnode)
166 | 
167 |         # Expand
168 |         if node.untriedMoves != [] and node.childNodes == [] and state.GetResult(1) is None: # if we can expand (i.e. state/node is non-terminal) and game has not yet ended
169 |             m = random.choice(node.untriedMoves) #randomly pick an unexplored child
170 |             state.DoMove(m) #this updates the variable state in place when executing the move
171 |             node = node.AddChild(m,state) # add child and descend tree. node.AddChild() updates node in place. It changes rootnode only if the 'select' step was not executed
172 |             #node.AddChild() also returns a different node which is assigned to variable 'node.' Thereafter node is no longer pointing to rootnode. They are different.
173 |             #print('Node children: \n'.format(node.ChildrenToString()))
174 |         #print('after expand', node, '\t', rootnode)
175 | 
176 |         # Rollout - this can often be made orders of magnitude quicker using a state.GetRandomMove() function
177 |         # Rollout starts from the selected/expanded node (as state is updated after state.DoMove() in select and expand sections)
178 |         while state.GetMoves() != [] and state.GetResult(1) is None: # while state is non-terminal and game has not yet ended
179 |             m = random.choice(state.GetMoves())
180 |             state.DoMove(m) #this updates the variable state in place when executing the move
181 |         #rollout does not change node or rootnode
182 |         #print('after rollout', node, '\t', rootnode)
183 | 
184 |         # Backpropagate
185 |         while node != None: # backpropagate from the expanded node and work back to the root node
186 |             node.Update(state.GetResult(node.playerJustMoved)) # state is terminal. Update node with result from POV of node.playerJustMoved
187 |             node = node.parentNode
188 |         #print('after backprop', node, '\t', rootnode)
189 |         #after backpropagation, node points to None
190 | 
191 |     # Output some information about the tree - can be omitted
192 |     if (verbose): print(rootnode.TreeToString(0))
193 |     else: print(rootnode.ChildrenToString())
194 | 
195 |     return sorted(rootnode.childNodes, key = lambda c: c.visits)[-1].move # return the move that was most visited
196 | 
197 | 
198 | 
199 | def UCTPlayGame():
200 |     """ Play a sample game between two UCT players where each player gets a different number
201 |         of UCT iterations (= simulations = tree nodes).
202 | """ 203 | state = OXOState() # uncomment to play OXO 204 | while (state.GetMoves() != [] and state.GetResult(1) is None): #doesn't matter for which player 1 or 2 205 | print(str(state)) 206 | if state.playerJustMoved == 1: 207 | print('Next Player is 2') 208 | m = UCT(rootstate = state, itermax = 1000, verbose = False) # play with values for itermax and verbose = True 209 | #m = random.choice(state.GetMoves()) 210 | else: 211 | print('Next Player is 1') 212 | m = UCT(rootstate = state, itermax = 100, verbose = False) 213 | #m = random.choice(state.GetMoves()) 214 | print("Best Move: " + str(m) + "\n") 215 | state.DoMove(m) 216 | if state.GetResult(state.playerJustMoved) == 1.0: 217 | print("Player " + str(state.playerJustMoved) + " wins!") 218 | elif state.GetResult(state.playerJustMoved) == 0.0: 219 | print("Player " + str(3 - state.playerJustMoved) + " wins!") 220 | else: print("Nobody wins!") 221 | 222 | 223 | 224 | if __name__ == "__main__": 225 | """ Play a single game to the end using UCT for both players. 226 | """ 227 | UCTPlayGame() 228 | 229 | 230 | # Version 2 231 | ''' 232 | # Pseudo Code 233 | def monte_carlo_tree_search(root): 234 | while resources_left(time, computational power): 235 | leaf = traverse(root) # leaf = unvisited node 236 | simulation_result = rollout(leaf) 237 | backpropagate(leaf, simulation_result) 238 | return best_child(root) 239 | 240 | def traverse(node): 241 | while fully_expanded(node): 242 | node = best_uct(node) 243 | return pick_univisted(node.children) or node # in case no children are present / node is terminal 244 | 245 | def rollout(node): 246 | while non_terminal(node): 247 | node = rollout_policy(node) 248 | return result(node) 249 | 250 | def rollout_policy(node): 251 | return pick_random(node.children) 252 | 253 | def backpropagate(node, result): 254 | if is_root(node) return 255 | node.stats = update_stats(node, result) 256 | backpropagate(node.parent) 257 | 258 | def best_child(node): 259 | pick child with highest number of visits 260 | ''' 261 | 262 | ''' 263 | import numpy as np 264 | from collections import defaultdict 265 | from games.tictactoe import * 266 | from games.common import TwoPlayersGameState 267 | 268 | class MonteCarloTreeSearchNode: 269 | 270 | def __init__(self, state: TwoPlayersGameState, parent = None): 271 | self.state = state 272 | self.parent = parent 273 | self.children = [] 274 | 275 | @property 276 | def untried_actions(self): 277 | raise NotImplemented() 278 | 279 | @property 280 | def q(self): 281 | raise NotImplemented() 282 | 283 | @property 284 | def n(self): 285 | raise NotImplemented() 286 | 287 | def expand(self): 288 | raise NotImplemented() 289 | 290 | def is_terminal_node(self): 291 | raise NotImplemented() 292 | 293 | def rollout(self): 294 | raise NotImplemented() 295 | 296 | def backpropagate(self, reward): 297 | raise NotImplemented() 298 | 299 | 300 | def is_fully_expanded(self): 301 | return len(self.untried_actions) == 0 302 | 303 | def best_child(self, c_param = 1.4): 304 | choices_weights = [ 305 | (c.q / (c.n)) + c_param * np.sqrt((2 * np.log(self.n) / (c.n))) 306 | for c in self.children 307 | ] 308 | return self.children[np.argmax(choices_weights)] 309 | 310 | def rollout_policy(self, possible_moves): 311 | return possible_moves[np.random.randint(len(possible_moves))] 312 | 313 | class TwoPlayersGameMonteCarloTreeSearchNode(MonteCarloTreeSearchNode): 314 | 315 | def __init__(self, state: TwoPlayersGameState, parent): 316 | super(TwoPlayersGameMonteCarloTreeSearchNode, 
self).__init__(state, parent) 317 | self._number_of_visits = 0. 318 | self._results = defaultdict(int) 319 | 320 | @property 321 | def untried_actions(self): 322 | if not hasattr(self, '_untried_actions'): 323 | self._untried_actions = self.state.get_legal_actions() 324 | return self._untried_actions 325 | 326 | @property 327 | def q(self): 328 | wins = self._results[self.parent.state.next_to_move] 329 | loses = self._results[-1 * self.parent.state.next_to_move] 330 | return wins - loses 331 | 332 | @property 333 | def n(self): 334 | return self._number_of_visits 335 | 336 | def expand(self): 337 | action = self.untried_actions.pop() 338 | next_state = self.state.move(action) 339 | child_node = TwoPlayersGameMonteCarloTreeSearchNode(next_state, parent = self) 340 | self.children.append(child_node) 341 | return child_node 342 | 343 | def is_terminal_node(self): 344 | return self.state.is_game_over() 345 | 346 | def rollout(self): 347 | current_rollout_state = self.state 348 | while not current_rollout_state.is_game_over(): 349 | possible_moves = current_rollout_state.get_legal_actions() 350 | action = self.rollout_policy(possible_moves) 351 | current_rollout_state = current_rollout_state.move(action) 352 | return current_rollout_state.game_result 353 | 354 | def backpropagate(self, result): 355 | self._number_of_visits += 1. 356 | self._results[result] += 1. 357 | if self.parent: 358 | self.parent.backpropagate(result) 359 | 360 | class MonteCarloTreeSearch: 361 | 362 | def __init__(self, node: MonteCarloTreeSearchNode): 363 | self.root = node 364 | 365 | 366 | def best_action(self, simulations_number): 367 | for _ in range(0, simulations_number): 368 | v = self.tree_policy() 369 | reward = v.rollout() 370 | v.backpropagate(reward) 371 | # exploitation only 372 | return self.root.best_child(c_param = 0.) 
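# A minimal usage sketch for the best_action method above (kept as comments, since this
# whole "Version 2" block is quoted out). It assumes `initial_state` is a hypothetical
# TwoPlayersGameState-style object providing get_legal_actions(), move(), is_game_over()
# and game_result, as referenced by the node class above:
#
#   root = TwoPlayersGameMonteCarloTreeSearchNode(state=initial_state, parent=None)
#   mcts = MonteCarloTreeSearch(root)
#   best_node = mcts.best_action(simulations_number=1000)  # child of root chosen by exploitation-only best_child
#   next_state = best_node.state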
373 | 374 | 375 | def tree_policy(self): 376 | current_node = self.root 377 | while not current_node.is_terminal_node(): 378 | if not current_node.is_fully_expanded(): 379 | return current_node.expand() 380 | else: 381 | current_node = current_node.best_child() 382 | return current_node 383 | 384 | 385 | ''' 386 | -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/6-6-4-pie-0.mypolicy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/Alpha_Zero/alpha_go_zero/6-6-4-pie-0.mypolicy -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/6-6-4-pie.policy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/Alpha_Zero/alpha_go_zero/6-6-4-pie.policy -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/ConnectN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import matplotlib.animation as animation 7 | from copy import copy 8 | 9 | 10 | # output the index of when v has a continuous string of i 11 | # get_runs([0,0,1,1,1,0,0],1) gives [2],[5],[3] 12 | def get_runs(v, i): 13 | bounded = np.hstack(([0], (v==i).astype(int), [0])) 14 | difs = np.diff(bounded) 15 | starts, = np.where(difs > 0) 16 | ends, = np.where(difs < 0) 17 | return starts, ends, ends-starts 18 | 19 | # see if vector contains N of certain number in a row 20 | def in_a_row(v, N, i): 21 | if len(v) < N: 22 | return False 23 | else: 24 | _, _, total = get_runs(v,i) 25 | return np.any(total >= N) 26 | 27 | 28 | 29 | def get_lines(matrix, loc): 30 | 31 | i,j=loc 32 | flat = matrix.reshape(-1,*matrix.shape[2:]) 33 | 34 | w = matrix.shape[0] 35 | h = matrix.shape[1] 36 | def flat_pos(pos): 37 | return pos[0]*h+pos[1] 38 | 39 | pos = flat_pos((i,j)) 40 | 41 | # index for flipping matrix across different axis 42 | ic = w-1-i 43 | jc = h-1-j 44 | 45 | # top left 46 | tl = (i-j,0) if i>j else (0, j-i) 47 | tl = flat_pos(tl) 48 | 49 | # bottom left 50 | bl = (w-1-(ic-j),0) if ic>j else (w-1, j-ic) 51 | bl = flat_pos(bl) 52 | 53 | # top right 54 | tr = (i-jc,h-1) if i>jc else (0, h-1-(jc-i)) 55 | tr = flat_pos(tr) 56 | 57 | # bottom right 58 | br = (w-1-(ic-jc),h-1) if ic>jc else (w-1, h-1-(jc-ic)) 59 | br = flat_pos(br) 60 | 61 | hor = matrix[:,j] 62 | ver = matrix[i,:] 63 | diag_right = np.concatenate([flat[tl:pos:h+1],flat[pos:br+1:h+1]]) 64 | diag_left = np.concatenate([flat[tr:pos:h-1],flat[pos:bl+1:h-1]]) 65 | 66 | return hor, ver, diag_right, diag_left 67 | 68 | 69 | 70 | 71 | 72 | 73 | class ConnectN: 74 | 75 | def __init__(self, size, N, pie_rule=False): 76 | self.size = size 77 | self.w, self.h = size 78 | self.N = N 79 | 80 | # make sure game is well defined 81 | if self.w<0 or self.h<0 or self.N<2 or \ 82 | (self.N > self.w and self.N > self.h): 83 | raise ValueError('Game cannot initialize with a {0:d}x{1:d} grid, and winning condition {2:d} in a row'.format(self.w, self.h, self.N)) 84 | 85 | 86 | self.score = None 87 | self.state=np.zeros(size, dtype=np.float) 88 | self.player=1 89 | self.last_move=None 90 | 
self.n_moves=0 91 | self.pie_rule=pie_rule 92 | self.switched_side=False 93 | 94 | # fast deepcopy 95 | def __copy__(self): 96 | cls = self.__class__ 97 | new_game = cls.__new__(cls) 98 | new_game.__dict__.update(self.__dict__) 99 | 100 | new_game.N = self.N 101 | new_game.pie_rule = self.pie_rule 102 | new_game.state = self.state.copy() 103 | new_game.switched_side = self.switched_side 104 | new_game.n_moves = self.n_moves 105 | new_game.last_move = self.last_move 106 | new_game.player = self.player 107 | new_game.score = self.score 108 | return new_game 109 | 110 | # check victory condition 111 | # fast version 112 | def get_score(self): 113 | 114 | # game cannot end beca 115 | if self.n_moves<2*self.N-1: 116 | return None 117 | 118 | i,j = self.last_move 119 | hor, ver, diag_right, diag_left = get_lines(self.state, (i,j)) 120 | 121 | # loop over each possibility 122 | for line in [ver, hor, diag_right, diag_left]: 123 | if in_a_row(line, self.N, self.player): 124 | return self.player 125 | 126 | # no more moves 127 | if np.all(self.state!=0): 128 | return 0 129 | 130 | return None 131 | 132 | # for rendering 133 | # output a list of location for the winning line 134 | def get_winning_loc(self): 135 | 136 | if self.n_moves<2*self.N-1: 137 | return [] 138 | 139 | 140 | loc = self.last_move 141 | hor, ver, diag_right, diag_left = get_lines(self.state, loc) 142 | ind = np.indices(self.state.shape) 143 | ind = np.moveaxis(ind, 0, -1) 144 | hor_ind, ver_ind, diag_right_ind, diag_left_ind = get_lines(ind, loc) 145 | # loop over each possibility 146 | 147 | pieces = [hor, ver, diag_right, diag_left] 148 | indices = [hor_ind, ver_ind, diag_right_ind, diag_left_ind] 149 | 150 | #winning_loc = np.full(self.state.shape, False, dtype=bool) 151 | 152 | for line, index in zip(pieces, indices): 153 | starts, ends, runs = get_runs(line, self.player) 154 | 155 | # get the start and end location 156 | winning = (runs >= self.N) 157 | print(winning) 158 | if not np.any(winning): 159 | continue 160 | 161 | starts_ind = starts[winning][0] 162 | ends_ind = ends[winning][0] 163 | indices = index[starts_ind:ends_ind] 164 | #winning_loc[indices[:,0], indices[:,1]] = True 165 | return indices 166 | 167 | return [] 168 | 169 | 170 | def move(self, loc): 171 | i,j=loc 172 | success = False 173 | if self.w>i>=0 and self.h>j>=0: 174 | if self.state[i,j]==0: 175 | 176 | # make a move 177 | self.state[i,j]=self.player 178 | 179 | # if pie rule is enabled 180 | if self.pie_rule: 181 | if self.n_moves==1: 182 | self.state[tuple(self.last_move)]=-self.player 183 | self.switched_side=False 184 | 185 | elif self.n_moves==0: 186 | # pie rule, make first move 0.5 187 | # this is to let the neural net know 188 | self.state[i,j]=self.player/2.0 189 | self.switched_side=False 190 | 191 | success = True 192 | 193 | # switching side 194 | elif self.pie_rule and self.state[i,j] == -self.player/2.0: 195 | 196 | # make a move 197 | self.state[i,j]=self.player 198 | self.switched_side=True 199 | 200 | success = True 201 | 202 | 203 | 204 | 205 | if success: 206 | self.n_moves += 1 207 | self.last_move = tuple((i,j)) 208 | self.score = self.get_score() 209 | 210 | # if game is not over, switch player 211 | if self.score is None: 212 | self.player *= -1 213 | 214 | return True 215 | 216 | return False 217 | 218 | 219 | def available_moves(self): 220 | indices = np.moveaxis(np.indices(self.state.shape), 0, -1) 221 | return indices[np.abs(self.state) != 1] 222 | 223 | def available_mask(self): 224 | return (np.abs(self.state) != 
1).astype(np.uint8) 225 | -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/MCTS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import matplotlib.animation as animation 7 | from copy import copy 8 | from math import * 9 | import random 10 | 11 | c=1.0 12 | 13 | # transformations 14 | t0= lambda x: x 15 | t1= lambda x: x[:,::-1].copy() 16 | t2= lambda x: x[::-1,:].copy() 17 | t3= lambda x: x[::-1,::-1].copy() 18 | t4= lambda x: x.T 19 | t5= lambda x: x[:,::-1].T.copy() 20 | t6= lambda x: x[::-1,:].T.copy() 21 | t7= lambda x: x[::-1,::-1].T.copy() 22 | 23 | tlist=[t0, t1,t2,t3,t4,t5,t6,t7] 24 | tlist_half=[t0,t1,t2,t3] 25 | 26 | def flip(x, dim): 27 | indices = [slice(None)] * x.dim() 28 | indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, 29 | dtype=torch.long, device=x.device) 30 | return x[tuple(indices)] 31 | 32 | 33 | t0inv= lambda x: x 34 | t1inv= lambda x: flip(x,1) 35 | t2inv= lambda x: flip(x,0) 36 | t3inv= lambda x: flip(flip(x,0),1) 37 | t4inv= lambda x: x.t() 38 | t5inv= lambda x: flip(x,0).t() 39 | t6inv= lambda x: flip(x,1).t() 40 | t7inv= lambda x: flip(flip(x,0),1).t() 41 | 42 | tinvlist = [t0inv, t1inv, t2inv, t3inv, t4inv, t5inv, t6inv, t7inv] 43 | tinvlist_half=[t0inv, t1inv, t2inv, t3inv] 44 | 45 | transformation_list = list(zip(tlist, tinvlist)) 46 | transformation_list_half = list(zip(tlist_half, tinvlist_half)) 47 | 48 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 49 | device ='cpu' 50 | 51 | def process_policy(policy, game): 52 | 53 | # for square board, add rotations as well 54 | if game.size[0]==game.size[1]: 55 | t, tinv = random.choice(transformation_list) 56 | 57 | # otherwise only add reflections 58 | else: 59 | t, tinv = random.choice(transformation_list_half) 60 | 61 | frame=torch.tensor(t(game.state*game.player), dtype=torch.float, device=device) 62 | input=frame.unsqueeze(0).unsqueeze(0) 63 | prob, v = policy(input) 64 | mask = torch.tensor(game.available_mask()) 65 | 66 | # we add a negative sign because when deciding next move, 67 | # the current player is the previous player making the move 68 | return game.available_moves(), tinv(prob)[mask].view(-1), v.squeeze().squeeze() 69 | 70 | class Node: 71 | def __init__(self, game, mother=None, prob=torch.tensor(0., dtype=torch.float)): 72 | self.game = game 73 | 74 | # child nodes 75 | self.child = {} 76 | # numbers for determining which actions to take next 77 | self.U = 0 78 | 79 | # V from neural net output 80 | # it's a torch.tensor object 81 | # has require_grad enabled 82 | self.prob = prob 83 | # the predicted expectation from neural net 84 | self.nn_v = torch.tensor(0., dtype=torch.float) 85 | 86 | # visit count 87 | self.N = 0 88 | 89 | # expected V from MCTS 90 | self.V = 0 91 | 92 | # keeps track of the guaranteed outcome 93 | # initialized to None 94 | # this is for speeding the tree-search up 95 | # but stopping exploration when the outcome is certain 96 | # and there is a known perfect play 97 | self.outcome = self.game.score 98 | 99 | 100 | # if game is won/loss/draw 101 | if self.game.score is not None: 102 | self.V = self.game.score*self.game.player 103 | self.U = 0 if self.game.score is 0 else self.V*float('inf') #this speeds up the training 104 | 105 | # link to previous node 106 | self.mother = mother 107 | 108 | def 
create_child(self, actions, probs): 109 | # create a dictionary of children 110 | games = [ copy(self.game) for a in actions ] 111 | 112 | for action, game in zip(actions, games): 113 | game.move(action) 114 | 115 | child = { tuple(a):Node(g, self, p) for a,g,p in zip(actions, games, probs) } 116 | self.child = child 117 | 118 | def explore(self, policy): 119 | #Utilizes the 4 steps of MCTS, but guided by the network policy and value function for much better search efficiency. 120 | #B'cse it doesn't do MC rollout/simulation, it is more like TDTS and not MCTS. 121 | #See the comments in the 'simulation' section below for details. 122 | 123 | if self.game.score is not None: 124 | raise ValueError("game has ended with score {0:d}".format(self.game.score)) 125 | 126 | current = self 127 | #any modifications made to current while it is pointing to self will also modify self i.e. rootnode 128 | #This is actually how the search tree is build over each iteration of explore 129 | 130 | # 1. Selection step 131 | # explore children of the node 132 | # to speed things up 133 | sel_depth = 0 134 | while current.child and current.outcome is None: 135 | 136 | sel_depth += 1 137 | 138 | child = current.child 139 | max_U = max(c.U for c in child.values()) 140 | #print("current max_U ", max_U) 141 | actions = [ a for a,c in child.items() if c.U == max_U ] 142 | if len(actions) == 0: 143 | print("error zero length ", max_U) 144 | print(current.game.state) 145 | 146 | action = random.choice(actions) 147 | 148 | #this helps speed up the training. Whenever we see a winning move, we don't have to explore the other moves. 149 | if max_U == -float("inf"): #for next player 150 | #current outcome=-current.game.player 151 | current.U = float("inf") #switch to current player with -ve 152 | current.V = 1.0 153 | break 154 | 155 | elif max_U == float("inf"): #for next player 156 | #current outcome=current.game.player 157 | current.U = -float("inf") #switch to current player with -ve 158 | current.V = -1.0 159 | break 160 | 161 | current = child[action] 162 | 163 | # 2. Expansion step 164 | # if node hasn't been expanded 165 | if not current.child and current.outcome is None: 166 | # policy outputs results from the perspective of the next player 167 | # thus extra - sign is needed for the current player's perspective. 168 | next_actions, probs, v = process_policy(policy, current.game) 169 | current.nn_v = -v 170 | current.create_child(next_actions, probs) #will create all the children at once and not just one child 171 | current.V = -float(v) 172 | 173 | 174 | current.N += 1 175 | 176 | #3. Simulation /roll out step: not doing as its very expensive to evaluate the policy network for this 177 | # you can think of it as just a single step roll out since expansion is like the first step in roll out. It's sort of like TD estimate rather than MC estimate of the game score 178 | # without full MC rollout, this is really like a single step TD rollout. It is really TDTS instead of MCTS. 179 | 180 | #4. Backpropagation step 181 | # now update U and back-prop 182 | while current.mother: 183 | mother = current.mother 184 | mother.N += 1 185 | # between mother and child, the player is switched, extra - sign 186 | #mother.V += (-current.V - mother.V)/mother.N #original version but I think it is wrong (this is like TD update) (running average) 187 | mother.V += (-current.V - (mother.N-1)*mother.V)/mother.N #my modified version (this is like TD update) (running average) 188 | # Note: nn_v is not backpropagated. 
It is only updated when the state is being expanded (step 2) using the policy network. 189 | 190 | #update U for all sibling nodes 191 | for sibling in mother.child.values(): 192 | if sibling.U is not float("inf") and sibling.U is not -float("inf"): 193 | sibling.U = sibling.V + c*float(sibling.prob)* sqrt(mother.N)/(1+sibling.N) 194 | 195 | current = current.mother 196 | 197 | #return sel_depth, debug_find_max_tree_depth(current) #for debug only (depth during selection vs tree depth at the end of explore) 198 | 199 | 200 | def next(self, temperature=1.0): 201 | 202 | if self.game.score is not None: 203 | raise ValueError('game has ended with score {0:d}'.format(self.game.score)) 204 | 205 | if not self.child: 206 | print(self.game.state) 207 | raise ValueError('no children found and game hasn\'t ended') 208 | 209 | child=self.child 210 | 211 | 212 | max_U = max(c.U for c in child.values()) 213 | 214 | if max_U == float("inf"): # if there are winning moves, just output those 215 | prob = torch.tensor([ 1.0 if c.U == float("inf") else 0 for c in child.values()], device=device) 216 | 217 | else: 218 | # divide things by maxN for numerical stability 219 | maxN = max(node.N for node in child.values())+1 220 | prob = torch.tensor([ (node.N/maxN)**(1/temperature) for node in child.values() ], device=device) 221 | 222 | # normalize the probability 223 | if torch.sum(prob) > 0: 224 | prob /= torch.sum(prob) 225 | 226 | # if sum is zero, just make things random 227 | else: 228 | prob = torch.tensor(1.0/len(child), device=device).repeat(len(child)) 229 | 230 | nn_prob = torch.stack([ node.prob for node in child.values() ]).to(device) 231 | 232 | nextstate = random.choices(list(child.values()), weights=prob)[0] 233 | 234 | # V was for the current player making a move 235 | # to convert to the next player we add - sign 236 | return nextstate, (-self.V, -self.nn_v, prob, nn_prob) 237 | 238 | def detach_mother(self): 239 | del self.mother 240 | self.mother = None 241 | 242 | 243 | def debug_find_max_tree_depth(current): 244 | children = current.child 245 | if children is not {}: 246 | max_len = -1 247 | for a_,c_ in children.items(): 248 | c_len = 1+debug_find_max_tree_depth(c_) 249 | if max_len < c_len: max_len = c_len 250 | return max_len 251 | else: 252 | max_len = 0 253 | return max_len -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/Play.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.animation import FuncAnimation 3 | 4 | import numpy as np 5 | import time 6 | 7 | from copy import copy 8 | 9 | class Play: 10 | 11 | def __init__(self, game, player1=None, player2=None, name='game'): 12 | self.original_game=game 13 | self.game=copy(game) 14 | self.player1=player1 15 | self.player2=player2 16 | self.player=self.game.player 17 | self.end=False 18 | self.play() 19 | 20 | def reset(self): 21 | self.game=copy(self.original_game) 22 | self.click_cid=None 23 | self.end=False 24 | 25 | def play(self, name='Game'): 26 | 27 | self.reset() 28 | 29 | if self.game.w * self.game.h <25: 30 | figsize=(self.game.w/1.6, self.game.h/1.6) 31 | 32 | else: 33 | figsize=(self.game.w/2.1, self.game.h/2.1) 34 | 35 | 36 | self.fig=plt.figure(name, figsize=figsize) 37 | if self.game.w * self.game.h <25: 38 | self.fig.subplots_adjust(.2,.2,1,1) 39 | else: 40 | self.fig.subplots_adjust(.1,.1,1,1) 41 | 42 | self.fig.show() 43 | w,h=self.game.size 44 | self.ax=self.fig.gca() 45 | 
self.ax.grid() 46 | # remove hovering coordinate tooltips 47 | self.ax.format_coord = lambda x, y: '' 48 | self.ax.set_xlim([-.5,w-.5]) 49 | self.ax.set_ylim([-.5,h-.5]) 50 | self.ax.set_xticks(np.arange(0, w, 1)) 51 | self.ax.set_yticks(np.arange(0, h, 1)) 52 | self.ax.set_aspect('equal') 53 | 54 | for loc in ['top', 'right', 'bottom', 'left']: 55 | self.ax.spines[loc].set_visible(False) 56 | 57 | 58 | # fully AI game 59 | if self.player1 is not None and self.player2 is not None: 60 | self.anim = FuncAnimation(self.fig, self.draw_move, frames=self.move_generator, interval=500, repeat=False) 61 | return 62 | 63 | # at least one human 64 | if self.player1 is not None: 65 | # first move from AI first 66 | succeed = False 67 | while not succeed: 68 | loc = self.player1(self.game) 69 | succeed = self.game.move(loc) 70 | 71 | self.draw_move(loc) 72 | 73 | self.click_cid=self.fig.canvas.mpl_connect('button_press_event', self.click) 74 | 75 | 76 | def move_generator(self): 77 | score = None 78 | # game not concluded yet 79 | while score is None: 80 | self.player = self.game.player 81 | if self.game.player == 1: 82 | loc = self.player1(self.game) 83 | else: 84 | loc = self.player2(self.game) 85 | 86 | success = self.game.move(loc) 87 | 88 | # see if game is done 89 | if success: 90 | score=self.game.score 91 | yield loc 92 | 93 | 94 | def draw_move(self, move=None): 95 | if self.end: 96 | return 97 | 98 | i,j=self.game.last_move if move is None else move 99 | c='salmon' if self.player==1 else 'lightskyblue' 100 | self.ax.scatter(i,j,s=500,marker='o',zorder=3, c=c) 101 | score = self.game.score 102 | self.draw_winner(score) 103 | self.fig.canvas.draw() 104 | 105 | 106 | def draw_winner(self, score): 107 | if score is None: 108 | return 109 | 110 | if score == -1 or score == 1: 111 | locs = self.game.get_winning_loc() 112 | c='darkred' if score==1 else 'darkblue' 113 | self.ax.scatter(locs[:,0],locs[:,1], s=300, marker='*',c=c,zorder=4) 114 | 115 | # try to disconnect if game is over 116 | if hasattr(self, 'click_cid'): 117 | self.fig.canvas.mpl_disconnect(self.click_cid) 118 | 119 | self.end=True 120 | 121 | 122 | def click(self,event): 123 | 124 | loc=(int(round(event.xdata)), int(round(event.ydata))) 125 | self.player = self.game.player 126 | succeed=self.game.move(loc) 127 | 128 | if succeed: 129 | self.draw_move() 130 | 131 | else: 132 | return 133 | 134 | if self.player1 is not None or self.player2 is not None: 135 | 136 | succeed = False 137 | self.player = self.game.player 138 | while not succeed: 139 | if self.game.player == 1: 140 | loc = self.player1(self.game) 141 | else: 142 | loc = self.player2(self.game) 143 | succeed = self.game.move(loc) 144 | 145 | self.draw_move() 146 | -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/playground.py: -------------------------------------------------------------------------------- 1 | 2 | dict1 = {'k1': 1, 'k2': 2} 3 | print(dict1) 4 | 5 | print(10*float('inf')) 6 | 7 | if not {}: print(1) 8 | else: print(0) 9 | 10 | import numpy as np 11 | 12 | a = np.array([1,2,3,4,5]) 13 | print(a[[False,True,True,False,False]]) 14 | 15 | a = [1,2,3,4,4,4] 16 | b = (i**2 for i in a) 17 | print(max(b)) 18 | 19 | import torch 20 | 21 | a = [torch.tensor(1),torch.tensor(2),torch.tensor(3)] 22 | print(torch.stack(a, dim=0)) 23 | print(float(torch.tensor(1))) 24 | 25 | -------------------------------------------------------------------------------- /P1_Navigation/Future_Improvements.md: 
-------------------------------------------------------------------------------- 1 | It would be very useful to check [Improvements in Deep Q Learning: Dueling Double DQN, Prioritized Experience Replay, and fixed Q-targets](https://medium.freecodecamp.org/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682) 2 | -------------------------------------------------------------------------------- /P1_Navigation/Navigation_Final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigation\n", 8 | "\n", 9 | "---\n", 10 | "\n", 11 | "You are welcome to use this coding environment to train your agent for the project. Follow the instructions below to get started!\n", 12 | "\n", 13 | "### 1. Start the Environment\n", 14 | "\n", 15 | "Run the next code cell to install a few packages. This line will take a few minutes to run!" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "#!pip -q install ./python #to run on udacity workspace" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "The environment is already saved in the Workspace and can be accessed at the file path provided below. Please run the next code cell without making any changes." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "INFO:unityagents:\n", 44 | "'Academy' started successfully!\n", 45 | "Unity Academy name: Academy\n", 46 | " Number of Brains: 1\n", 47 | " Number of External Brains : 1\n", 48 | " Lesson number : 0\n", 49 | " Reset Parameters :\n", 50 | "\t\t\n", 51 | "Unity brain name: BananaBrain\n", 52 | " Number of Visual Observations (per agent): 0\n", 53 | " Vector Observation space type: continuous\n", 54 | " Vector Observation space size (per agent): 37\n", 55 | " Number of stacked Vector Observation: 1\n", 56 | " Vector Action space type: discrete\n", 57 | " Vector Action space size (per agent): 4\n", 58 | " Vector Action descriptions: , , , \n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "from unityagents import UnityEnvironment\n", 64 | "import numpy as np\n", 65 | "import time\n", 66 | "from collections import deque\n", 67 | "import matplotlib.pyplot as plt\n", 68 | "import torch\n", 69 | "\n", 70 | "#env = UnityEnvironment(file_name=\"/data/Banana_Linux_NoVis/Banana.x86_64\") #to run on udacity workspace\n", 71 | "env = UnityEnvironment(file_name=\"./Banana_Linux/Banana.x86_64\") #to run locally" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# get the default brain\n", 88 | "brain_name = env.brain_names[0]\n", 89 | "brain = env.brains[brain_name]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### 2. 
Examine the State and Action Spaces\n", 97 | "\n", 98 | "Run the code cell below to print some information about the environment." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Number of agents: 1\n", 111 | "Number of actions: 4\n", 112 | "States look like: [ 1. 0. 0. 0. 0.84408134 0. 0.\n", 113 | " 1. 0. 0.0748472 0. 1. 0. 0.\n", 114 | " 0.25755 1. 0. 0. 0. 0.74177343\n", 115 | " 0. 1. 0. 0. 0.25854847 0. 0.\n", 116 | " 1. 0. 0.09355672 0. 1. 0. 0.\n", 117 | " 0.31969345 0. 0. ]\n", 118 | "States have length: 37\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# reset the environment\n", 124 | "env_info = env.reset(train_mode=True)[brain_name]\n", 125 | "\n", 126 | "# number of agents in the environment\n", 127 | "print('Number of agents:', len(env_info.agents))\n", 128 | "\n", 129 | "# number of actions\n", 130 | "action_size = brain.vector_action_space_size\n", 131 | "print('Number of actions:', action_size)\n", 132 | "\n", 133 | "# examine the state space \n", 134 | "state = env_info.vector_observations[0]\n", 135 | "print('States look like:', state)\n", 136 | "state_size = len(state)\n", 137 | "print('States have length:', state_size)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### 3. Take Random Actions in the Environment\n", 145 | "\n", 146 | "In the next code cell, you will learn how to use the Python API to control the agent and receive feedback from the environment.\n", 147 | "\n", 148 | "Note that **in this coding environment, you will not be able to watch the agent while it is training**, and you should set `train_mode=True` to restart the environment." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "Score: 0.0\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 166 | "state = env_info.vector_observations[0] # get the current state\n", 167 | "score = 0 # initialize the score\n", 168 | "while True:\n", 169 | " action = np.random.randint(action_size) # select an action\n", 170 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 171 | " next_state = env_info.vector_observations[0] # get the next state\n", 172 | " reward = env_info.rewards[0] # get the reward\n", 173 | " done = env_info.local_done[0] # see if episode has finished\n", 174 | " score += reward # update the score\n", 175 | " state = next_state # roll over the state to next time step\n", 176 | " if done: # exit loop if episode finished\n", 177 | " break\n", 178 | " \n", 179 | "print(\"Score: {}\".format(score))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "When finished, you can close the environment." 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### 4. It's Your Turn!\n", 194 | "\n", 195 | "Now it's your turn to train your own agent to solve the environment! 
A few **important notes**:\n", 196 | "- When training the environment, set `train_mode=True`, so that the line for resetting the environment looks like the following:\n", 197 | "```python\n", 198 | "env_info = env.reset(train_mode=True)[brain_name]\n", 199 | "```\n", 200 | "- To structure your work, you're welcome to work directly in this Jupyter notebook, or you might like to start over with a new file! You can see the list of files in the workspace by clicking on **_Jupyter_** in the top left corner of the notebook.\n", 201 | "- In this coding environment, you will not be able to watch the agent while it is training. However, **_after training the agent_**, you can download the saved model weights to watch the agent on your own machine! " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "Episode 100\tAverage Score: -0.03\n", 214 | "Episode 200\tAverage Score: 2.742\n", 215 | "Episode 300\tAverage Score: 6.36\n", 216 | "Episode 400\tAverage Score: 8.75\n", 217 | "Episode 500\tAverage Score: 12.05\n", 218 | "Episode 600\tAverage Score: 14.01\n", 219 | "Episode 700\tAverage Score: 15.20\n", 220 | "Episode 800\tAverage Score: 15.65\n", 221 | "Episode 900\tAverage Score: 16.08\n", 222 | "Episode 1000\tAverage Score: 15.60\n", 223 | "Episode 1100\tAverage Score: 15.92\n", 224 | "Episode 1200\tAverage Score: 16.57\n", 225 | "Episode 1300\tAverage Score: 16.78\n", 226 | "Episode 1400\tAverage Score: 16.74\n", 227 | "Episode 1500\tAverage Score: 16.81\n", 228 | "Episode 1600\tAverage Score: 16.73\n", 229 | "Episode 1700\tAverage Score: 17.11\n", 230 | "Episode 1800\tAverage Score: 17.14\n", 231 | "Episode 1900\tAverage Score: 15.93\n", 232 | "Episode 2000\tAverage Score: 16.42\n", 233 | "Training Time is 2462.7125222682953\n" 234 | ] 235 | }, 236 | { 237 | "data": { 238 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAEKCAYAAAAfGVI8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJztnXecFdXZx3/PFurSWZC+9KbSVhAFBBEBUbDECDGKLdYYzZuYoImoUSOx19hijyHmjUk0r4mIgiCK4II0kSYuvSxtWWD7nvePmbk7d+70O+3e+3w/H9h7z5yZ89wzM+c55znPeQ4JIcAwDMNkLllhC8AwDMOECysChmGYDIcVAcMwTIbDioBhGCbDYUXAMAyT4bAiYBiGyXBYETAMw2Q4rAgYhmEyHFYEDMMwGU5O2ALYoW3btqKgoCBsMRiGYVKKFStWHBBC5FvlSwlFUFBQgKKiorDFYBiGSSmIaJudfGwaYhiGyXBYETAMw2Q4rAgYhmEyHFYEDMMwGQ4rAoZhmAyHFQHDMEyGw4qAYRgmw2FFwCRFUfEhbNh7NGwxUoKqmjr8rWgHeHtYJmqkxIIyJrr84IWlAIDiOVNCliT6PLtgM55esAWNc7NxwaCOYYvDMDF4RMAwAVFyrBIAcLSiOmRJGCYeVgQMwzAZDisChmGYDMc3RUBEXYhoIRF9S0TfENFtcvq9RLSLiFbJ/87zSwaGYRjGGj8ni2sA/EIIsZKImgFYQUTz5WNPCCEe9bFshmEYxia+jQiEEHuEECvlz2UAvgXQya/yGCZVIJDp8Y++2YvKmtqkyth+8ATW7ix1de7X2w9j5+ETSZVvlyMnqvD5lgOuz/+q+BD2H62wlffj9ftQUV1fr3tLK1BUfMh12V6wblcpth08HqoMQEBzBERUAGAIgGVy0k+JaA0RvUpErQzOuZ6IioioqKSkJAgxGSZ0ln53ENe/tQKPfLgxqeuMeWQhLnh2iatzL/rjFxj1h4VJlW+Xma99hcv/tCyugXbCpS8sxeSnPrPMt2bnEVz3ZhHu+/f6WNqEJxbF3J/D4vxnluCsRz4NVQYgAEVARHkA3gVwuxDiKIDnAfQEMBjAHgCP6Z0nhHhJCFEohCjMz7fcYIdh0oIjJ6oAADsPl4csSTBs2lsGAKitc7/I7uDxKss8ZRU1ABDX+1bSGJ8VARHlQlICbwsh/gEAQoh9QohaIUQdgJcBDPdTBoZJRQR49bGXkGyNS0bhpDN+eg0RgFcAfCuEeFyV3kGV7SIA6/ySgWGY1IDMp02SJlsugKN76OOn19CZAK4AsJaIVslpdwGYQUSDAQgAxQBu8FEGhokMThohqwnldMPvBppkRVDHmkAX3xSBEGIJoPs0/8evMhkmXWDTkLdkyS0RKwJ9eGUxExiHj1dh3S53Lo1hsPS7g6iprQu0TL9NJGr2lJZjy/5jwRWoQ7lLbyEApi6uq3cciYvplJWljAiclXGiqgYrtvnvYnpAjkMVFqwImMC48I+f4/xn3Lk0Bs2KbYcx4+Uv8dj8TZ5fO8jG3oyRDy3AOY8vClsM1xi5uNbWCUx77nNc/dpXsbSs2ByBM03wy/9djUueX2p7rYJbJj6x2NfrW8GKgAmMbQeDWaTkBSVlUg8t6B5zplouvPzZivln1Y4jsbR605Cza63bJe21caIquQV+VthxgfUTVgQMo0NUeu2Mc/RuXZbLyWJFgaS7fmZFwDA6KI1J0D10VkDJo3fLyOWIgFyalFINVgQMowPFWuT0bgCigh8NrfqabucIMuUpYEXAMCZ42T45uVaad0B9Ra/uFEXgeGWxYhpK8/vBiiBNKauoxtaScF0Dzdh9pDw2IRsG1bV1+Ga3sSur1xaag8cqsbs0Pn5QXZ0wdafdvK8M5R5NUq7bVWrYCK7bVYoDxyqxPcTJfAFg7c5ST0cG6ivtL5O8ftyvI7B33v6yCizcsB/HK2uwYe9Rl2UFDyuCNOWHL36Jsx+LrmvgGXMW4LQHPw6t/EfmbcSUp5dgy/4y03xeNUvDHvgYn22OD7f8x0+34PxnluDr7YcT8lfW1GHCE4tx69yvky573a5SnP/MEjz1yWbd4+c/swSFD3yMMY8EE3FUjy+2HMQFzy7Ba58XJ30tvcV4V7yyXDrm8IZmOQxNMfzBT3D1619h4D3zMOnJz3x3O/UKVgRpyrd7Uqc3EgarZdfCkjJ9t70gJm3Xy/do95HExqKqRlrItmzrwaTKEEJgb6l0/Sgv5tt+SIoKunGvuWIOmmTnCI6mSIRTVgRMRkIxt0DzV9xPbxElnlC8uUKTlqRCqhOp4YmkWK28kNXsljm9m5T0HEFqTC6wImAyG4P31M/GkzQf4kUQ+nldkipuj4ri87LevfjpirJ2G/spRaqfFQGTmdS/4Ob4+R6buTQqKZRky6i+cpSVQlCiua0Dt/JFt8bjYUXAZCRW7WsQYaCzdM0O8coh2R5ynRApYRqqx19hgzcNpQasCBhd9pdVoEwVvREAig8cR51DP+yqmjrsOGTulvj9geA371Ze8MMnqrD94InEKKMeNAB6dahTBPaUVmBPqf7WlEdOVBu6kB6tqLZ0wVXLf+hEdWwrTCO0LqbbDh7HntJynKiqkeWpwiFVXBztvdtTWh4nb3lVLVbtOIIV2w5j39EKVNfqPw9Hy6V6OlZZE4vE6ea52H2kPG7/4xpNeXvkiXk9GfYdrcDxSv3JXcU0tONQ/bNi9FvizpOrs7KmFut3mztw7C2tiKu7ujoR2Mb2rAgYXYY/+AnGq9xPN+4tw9hHP8Xzi76Lpb23apflde55/xuMfnghSk/oN4iLN5Vg3KOf4l9fW1/LS5Qe/0//8jXGPLIQc/67QXNcIpmOoLYOE2SQtdEfPtyQsAG7ugG/6rXluuePeXihpQuu+jqrdxzB4N/NN81frVKIm/eV4axHPsXIhxbgB89Lm7wP/t18DL1fusYinXs38qEFcfLOfHU5Lnzuc1zy/BcY8ftPYs/DYU2QtRcXbwUA/Hv1bhQ+8DEWbtyPcY9+ivdX79aV0yg8+BlzFuCGt1bEvt/89kqMfrjeLbaqtg77jlbEpX28fh8AYMTvP8Elz38Rdz21++j+Mum8Bz74FgDwu3+vx+iHF+KgSQhpZe7jifmbcd7Tn5kGMTz9oU8w/eUvY9+fXrAZZz3yaSDrgVgRMIbsV/U2dx2Rej5FxfWx2dfutHZH/GxzCQCgtFxfESjugmG7Nn7xnb6bZrJ29f0mPXa1yeaIRlGqS132vX48fO05eiSzwc3OI/WjlPU67sibDO6dWt7lxfGyL9ooPQ/HDHreClbPhd7AVLlX6vI/kht5NQePxSuhb1Q99Q0a91X1PVI6M8ozrfy14yK6VHYDNhshAvVuzQDwpXyO4v7rJ6wIGFu4bQ9zs6VHrLpOvwfnh7eIHSznCAIQKCuAMlLVtl0f9M/+D7CbVascaw2eTe21jW6XmYzKofo5n2hO2LAiYBzh9EHOkWdEa2r1XxYlNYhGMWpkmfxkrzx86oRwNPGtLtbPO2
KtiKW/RtWgN9KptVln2mzVJvNeeutNnHh0KecpZZrdc2MhXJzjEFYEjK/kKCMCA5tuXUg9JavygmgEzRporzryTq/j917JdhWclXuv3mUcB5STMduONCaHkL6ZCqWDImfsOXfzZAUwqmNFwPhKbrY8IjB4SWMLaIM2DdnM56dpRe83e+2u6PQ6bsp1I6qlInZRD24Dyhk9m3FyIHGTGicOBVE30bEiyHAOHKu05RLq9kFWTEOHjutPmsZsp5r00vJqHKuswZETVSgpq4QQApU1tYbeR1q2Hzyh6wp45EQV9pSWJzTC2k3UtcfN6kkIEXPjPF5ZE3O1VNBGodx5uFwuI74QIUTMddIqWFlljb2opHqboh85URWLZaRFbcIza6z3Ha2IuWkqz4adHrmSw8qNVeGwKl9FdW1ssrVCc78qqmttjwgqNb+9oro2zg1Xmcg+VlkTK2f3kfLYxL82/Mf3JcYunrE5AlXaoeNVOFZZYxrttaSssl6mADpJOf4XwUSVPaXlGPnQAvz8nD647ZzevpSRkyX1Na55vUj3uPLuaucIBt33Udz3ey4YgPnr9+GL7w6ieM4U0zJXbDuES2R3R21eI/dJI591AYHdR8pxxpwF+MWEPrh1fGI9vfXlNsx+7xvM//kYTHhiMbII2PpQfbmTnox3DX1mwRac3KlFgrJ5ftF3ePjDjQCA3RaeIjNf1Xcp1TL+sUU4s1ebuLTBv5uPs/rk6+a/4++rbV13xO8/SUh7ZN5Gy/OUhnHK00tM76OihP759S7MueQUNMzJxvjHFmHXkXIUz5mCqc9+Hpf/wuc+xzs3jLQl+4//tCzu+3urdmPu8h2x7yffMw/Fc6bg5HvmxdJufnul4fWue7MIz/5oCM4/taNhHqXDU1lTG3O/NSPOLZhNQ4yfKG5pCzbut32O486JxQl2J9EWbyoxdPHUYrVwxw5q2/Aei3pavEkKL71VViZ2OqYrtx9OqJpPN5TYlu/LrfoupXp8viWx3hZt0i9L7W7p9F5/9M1eR/nN5gvUZSuLrHap3Fm3axZybdhbZnuxo3b053Rjej2xi4oTQ4kDiXMu2tFIVGBFwPiKVWPiy2SxB9eKt1GbNzCe2fVT3HHKjtdOnPeNSXb1LTSz4TstXw+nnjzKb1CfZuT1pp0sjupcASuCDMbJM+n2+bVqk5VeoZX7qJPyvWhP9a5hdF2v2m+/9IArl0UXOJ2sNcutFtnI9TihfJdeQ4pnm130lh0Y1bEiUf1cQTQ1gW+KgIi6ENFCIvqWiL4hotvk9NZENJ+INst/W/klA+M96vbai0683Tj0Tt5xT0MZB6IuJfzynMoOSBN42dtVjxBrbCz4Apw9I2qyPaj4LIM6Vjo6mTwiqAHwCyFEfwCnA7iFiAYAmAXgEyFEbwCfyN+ZNEXPb1ptG1YaWqu2yskCK08Wp6kuYVW0Vw24X4vqglqsZ+cWqfOYzhGoRLbrDeTWNJTjUlGqlZXhqmODv04IQnf4pgiEEHuEECvlz2UAvgXQCcA0AG/I2d4AcKFfMjDeYachrq6tM1w4ZsTxSmmiLq4HaOMaNXLwMC21dSJOhvIqyeW0prbO0uXyRFUNSk9IbquKe+XR8hpUy6YJpUFy8zvtYLfB1otGqqTV1YkE10q3E5RacYziA5VXS/WlNg1VVNfq5ldP1No1DVXW1MW5mxpFY62sdjbpq5CdnVjvRmUo8lRU18ZFYa2s1q9jodEAdtxm3Zq4kiEQ91EiKgAwBMAyAO2FEHsASVkQUbsgZGC8R6sbBs6eh2aNcrDi7gm2z3v9i+KE471+81/LspQ8f/nJCJzRs20s/SdvFmHBhnrvnv6zPwQAXDSkE/5pEeF0wOx5CWnr9xzFDDki5Go5yN4p985Do9xsrJp9LgDt6lP37DxsHtIYkHzQte6HK7cfxsV//AKvXlWIxZsO6NapF6jdKdXMXb4Dc5fvQPvmDWNp/e7+UDdvmSpAm2l9qbTQ+U8vQZVK8Sr3VMvZJpFenWJUBiCtzTjl3nmxDgIgPceXDO2sk1uo/gdu++sqy7Ive2mpE1E9wffJYiLKA/AugNuFELb9+ojoeiIqIqKikhL7bnWMc7wyHFTV1uGgJrywV1YJI1u9NqyvWgmosVICTqiorouL/KlefeoEbd0UmywwUtBbILZym+S6+NnmA5i7fLtDKYxxGg7B6SDJbP5FXXKVD6MvNW564NU6E9hrdh3RySnhxLT5lYErqp/4qgiIKBeSEnhbCPEPOXkfEXWQj3cAoPvmCiFeEkIUCiEK8/P1F78wYWBtF43LrZPHlZ3U4KQoRXMMYiJQb+JXXQfhTkY69Bqy6T7qN37WWb37qH9leIGfXkME4BUA3wohHlcdeh/ATPnzTADv+SUDkz4YvaxeeHwkS5Ai5GYZv7JCuI+34wVOGztTRRDgogo/66x+iiDamsDPOYIzAVwBYC0RKYaxuwDMAfA3IroWwHYAl/ooA2OCk+ff9ToCC68h++XrnxOUj3xU0JvYVKd42ag5VXBOyzbLH+R99bOJjo0IormgOIZvikAIsQTG5ufxfpXLRAuvestGvc0IDAhUIZP97/VZr9T2XQTjsh0WbqYIgryvXilPrzo9YcAri5nAcfVqqE5SR86MxByBixATRu6GRsQifeoVHys/+UYnmep0HPLaTI5ATUP+lxF1hUBRFxAACgsLRVGRfvRKRp+CWR/EPo/tm4/Xrx4ed7ymti7mgjm4S0v865Yzdc9VIkROe3YJVu8sxejebVFSVpmwt6ua/GYNUVJWidvG98bK7Yfx2eYDlvJ2atk4LqhYVCmeMyVWP7ee3QvPLNiCSQNPwocOA6655aozCgzdQ68c2Q1vLt3mWVlvXzcCl2sidQZFz/ym+M4kvHMq8eY1w3HH31dj31Hj/avNePu6ETizV1vrjDoQ0QohRKFVPh4RZACfbkx0v3Xqkqf40B+tqDFVAgBicdRf+/x729dPBSWg5cXFWwHYD4HgBW996V1Db0WYY610UQIA8NH6vbbjJekRRF+dFUGGQg7dQN0Q/bFmcoQxmrazUToTPaJ+a1gRMP4hImLD9wkl5ESQE7RRb1CYRISI/hwBKwLGV9JXDdQrALebprvBrD3x3GspnW9ewGTsgjIm2rhtNJy0DRF/9j0jzEVcTGqQzDMShFsyK4IMJYi2K8iecphERQ94LkdEfleqI5DcveHJYsZz9pSWo2DWB/j4230Jx/aWVsS5jgLA3f9ah153/cdVWeXVtSg+mD7eHwAS6geIzojg7WXeBZwDgB+F5Dqabvxl2XbDEN52COLpCiQMNRMd1spuoHpRKtftKk1IS9ZdcZuNiJqpTlQUAZOedG/T1PcyeESQYXCT5T0ZYgFjQqJrmya+l8GKIENRd2LZOSQ5wthRimG8hBVBhsJNl3ewaYhJdVgRZBhK7z/qC1xSCR4QMKkOK4IMw6zN4vbMHaxUmVSHvYZSnGVbD+Kyl77EsrvGo33zRgCkzcy1aN0e3bZdq3YY78uaqSgB+RgmVeERQYrzxtJiAECRasPrv9jwJ+c+L
MMwCqwI0hA7XkBszmCCZmjXlmGLwBjAiiANsRPwk9UAEzTpHIk21WFFwDBMIAS5IT3jDFYEaYid/V7ZMsQETZD7EDPOYEWQhjg1DfGQnQkEfswiCyuCiPCzuV/jd/9ebyvv3OXbMenJxXFp/1i5E+Mf+xSAPUWwWuUGWl5Vi6+KD+EnbxbZlpdhnMJ6ILrwOoKI8P7q3QCA2RcMsMx75z/WJqR9smG/6puzV27bweN4Yv4mR+cwjFN44BldeESQZrBbKBNVeI4gurAiSDPc6AFWHUwQ8IgguvimCIjoVSLaT0TrVGn3EtEuIlol/zvPr/IzlTohHL9wPIhgmMzGzxHB6wAm6aQ/IYQYLP9ztwciY4gAT8oxDOMM3xSBEGIxgEN+XZ/Rx9WIAIJHBQyTwYQxR/BTIlojm45ahVB+2rDzcOJ+wP9duxd//tLbTcwZxgt4jiC6BK0IngfQE8BgAHsAPGaUkYiuJ6IiIioqKSkJSr6U4o0vihPSbn9nlePrCMEvaaoxundbW/leumKY7Wu2zWvoVhxLrhvVHY1ysnWP5Wa7f/huGNMDk08+yfX5U07p4PrcdCJQRSCE2CeEqBVC1AF4GcBwk7wvCSEKhRCF+fn5wQmZQnhlzmGrUOrxmyn9LfMM794aQ7raH3S/fKV9paEwcWB7W/l+e77x+phLhnaOfb52VHdH5ffIb4pnfzTUVt7e7fIS0uwooRnDuziSKRUJVBEQkVr9XgRgnVFexhrPGnDWBGmL3+tKvFgboA5x4mdguiyXw95MCMHi28piIpoLYCyAtkS0E8A9AMYS0WBITU8xgBv8Kp+xj2BNkHLYaoBFauynrG5nnTa65EAV6V3aTnnprwZ8VARCiBk6ya/4VV4mkgkPKKOP3fbSiZIPS2dkxSkC5+fbPSdbZ7hh59QMGBDwyuJUJgU6e0yICAjfRwReNJJqk41jUxPZH0XomYbsVE8mhMZgRZAifLx+H+7/P3vRSZ1SXSvwxXcHfbk24w/2tiNNjdhTWUnMETjJ7nb+gUcETGS47s0ivLLk+7DFCJRfTeobtgih8dilg0yP222cOrVsbHisWaMc/HRcLydi+QKZmIaaNYy3Xj9xmXm9mJfjcrLYdYmpAyuCFCYFOnuu+ejnY3Dz2OQaqeI5U5KWo21eg6Sv4YZLhnU2PGbXb17AuPG79exeWHvvRPxyYr2yDavBMzO9LPvN+LjvFw0xrhfLcuRi3rpW32v9zsn9AAADOzbXnJf+qsC2IiCiUUR0tfw5n4icOfwyjAOi8+pFR5J47GxHatxT0Ds7rH6Fup3V2vG9tM9nWzToStkZ0O4nYEsRENE9AH4N4E45KRfAn/0SimEy8WW0i92RYBANuxf3iQw+27m+k9663XUEWuWTCc+i3RHBRQCmAjgOAEKI3QCa+SUUwzDm2NqXOkVMh8mtI3Bejrqh1xs1aUVgr6F6qoRUYwIAiKipfyIxDGOFLa8h0wtEp3FTN/4JjbCHYtofEfgnQ1Sxqwj+RkQvAmhJRD8B8DGkWEGMC8qranHnP9ag9ES1aT4hBB6ZtwErtulH8+YVwf5z4Fhl2CL4gldtmychJkyup/6uvzLYQTkWeWPvU8I8RfpjSxEIIR4F8HcA7wLoC2C2EOIZPwVLZ/5WtANzl+/AEx+bbxhfXSvw3MLvcMnzSwOSLLXo1qZJQtrUQR0xa3I/PDV9sOX5HVo0sp3XCrvRQL1C3Ytu31yKGtqnfR7evekMtGySCwB44ofxrpazzx+QVKTPIPjhafFeQeo2+c/XjgAgeffMPn8ALhjUEZNPlsKXFXYzD67XIDvLWhHIeiCLgLevG6ErQxA8edlgnNq5RaBlWioCIsomoo+FEPOFEHcIIX4phJgfhHDpimKXTIXFPlHmycviG/BTOrXA0zOG4MazemLa4E4AgAsGdTQ8/87z+sfyFugoFSf8aWYh/n7jyLi04jlTUDxnCn42vrfl+cMsGjIjWjXJRYMc6TV+avoQDOvWCqtmn4viOVPQIz8+2uY1o7rjpiRcct+75cy47+2aNfSmu6y6RtOGOXj1qkK9QzhFbhxvOKsnrhnVHc/MGILGDaTQ1n+/6Qx0aNEolrd4zhSsnn0uAGm9xKYHJzsS58xe9YpdrXQvld16FVdTAMjxKFLeqtkTUDxnCi4c0glPTR/iyTXtYqkIhBC1AE4QUbAqKo3h5t8btBOLTk1lXiriLKKkeo51DmQREKbmFDu4kVUvVo/fJOXDrznVqoqVw9oySTdPYpqXBF3TdoPOVQBYS0TzIXsOAYAQ4me+SJUhZMJCFT/R1p7eix5UDUuKwH1pTmMC6a3GtVV8EsoviMc1YaLW5JgWNz9N75SEctSNvjKNYOF5lCxBNw12FcEH8j/GQ6weIKsebqZblrQvi9P68LL+ssh6wZK5LA5GBKqsRO6ma92cped143d7lYxrqRdlAvqNvv8jgmA1gS1FIIR4g4gaAOgjJ20UQpi7vDBJk+kNvRXal8VpdakVbbKNDBEZuyfauJG1DocE6t8eWxGbZONhJaZfpqF4zyDtYi77ZboKYa36rNfb115Xr4r8eE8jOSIgorEA3oC0mQwB6EJEM4UQi/0TLTO4///Wo0mDxL1chRB46pPNpud+s7sUXxUf9ku0ULHzcmlfloY57kNneTLnaaQHbJybjGnIifDJtFl+TRHYndtxXbztldgxTWBYrjKX43a3M9PyQ+z42X1zHgNwrhDiLCHEGAATATzhn1jpjXLDDx6vwitLvsczC7Yk5Nl64Die//Q73fPz5IiM6aoEzLhlXE/DY7+a6CxaqdmLN21wR5zdr52jxk/dOPzWZE9hPS8ixeQwcWB73aireaoonL+aVO+xkqUyDRm1TT8/pw/ukOvmitO7YXCXlpgxInEfXrvhHPIa5mBMn3w8M8OdZ0uPtu7Wozpte5s1zMG4vvl44QppL+ZYj9+ih2+2oOzn5/RBYbdWuGRoZzTI9iZmZ9u8hhjbNx8tGufqlhkEdn9JrhBio/JFCLEJUrwhJgnMOgBmpoL+HaIT3aPfScayPHjRybauUTxnCu46r591RgB3TOyH7nJDon5Z+p3UDGf0SvTld/tC3XVef7x61Wlo1sj+Y56lepuuOqPAMN//TOiTsCG6cr9/cW5f3airt55dn9ZLtQl7brb1JPVt5/TGLXK46XbNG+Fft5yJds3qXS0Hd2lper6CUkoWAW9eMxwjerQxzf/wD07VTb/rvP5x4aXjTEO2JLFHVhbhtauHx7mCmqHIYTZHUNC2Kf5+0xlo0SQXr151miN5/nr96ZhySoeE9OHdW+H1q4cjS9XrCNqRxO5kcRERvQLgLfn75QBW+CNSBmGiCZy4E0YVP4bPQH3vOdnrx1WxgTnAySSuWh4re7pW0dc6/E3K85GTTSpZbQrqEt3JYjN5zeRxZc/3/nnSu6LdoHNePd51dXoyBItdRXATgFsA/AySjIsB/NEvoTIFM9uo2UudKkGwgvQo8asRdHLZ+L131Z4miXm1A766OkUR2CurRs6fm1W/YjbZkCNWdRiI
... (remainder of the base64-encoded PNG for the training-score plot, i.e. Score vs. Episode # produced by the cell below, elided) ...\n", 239 | "text/plain": [ 240 | "" 241 | ] 242 | }, 243 | "metadata": {}, 244 | "output_type": "display_data" 245 | } 246 | ], 247 | "source": [ 248 | "#Training the model\n", 249 | 
"from dqn_agent import Agent\n", 250 | "\n", 251 | "agent = Agent(state_size=37, action_size=4, seed=0)\n", 252 | "#print([p for p in agent.qnetwork_local.parameters()])\n", 253 | "\n", 254 | "# #debug\n", 255 | "# state = env.reset()\n", 256 | "# print(agent.qnetwork_local(state))\n", 257 | "# print(agent.qnetwork_local.forward(state))\n", 258 | "# #end debug\n", 259 | "\n", 260 | "filename_to_save = 'ddqn_checkpoint.pth'\n", 261 | "final_eps = 0.01\n", 262 | "def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=final_eps, eps_decay=0.995):\n", 263 | " \"\"\"Deep Q-Learning.\n", 264 | " \n", 265 | " Params\n", 266 | " ======\n", 267 | " n_episodes (int): maximum number of training episodes\n", 268 | " max_t (int): maximum number of timesteps per episode\n", 269 | " eps_start (float): starting value of epsilon, for epsilon-greedy action selection\n", 270 | " eps_end (float): minimum value of epsilon\n", 271 | " eps_decay (float): multiplicative factor (per episode) for decreasing epsilon\n", 272 | " \"\"\"\n", 273 | " scores = [] # list containing scores from each episode\n", 274 | " scores_window = deque(maxlen=100) # last 100 scores\n", 275 | " eps = eps_start # initialize epsilon\n", 276 | " for i_episode in range(1, n_episodes+1): \n", 277 | " env_info = env.reset(train_mode=True)[brain_name] # reset the environment\n", 278 | " state = env_info.vector_observations[0] # get the current state\n", 279 | " score = 0 # initialize the score\n", 280 | " for t in range(max_t): #this could also be while True instead\n", 281 | " action = agent.act(state, eps) # select an action\n", 282 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 283 | " next_state = env_info.vector_observations[0] # get the next state\n", 284 | " reward = env_info.rewards[0] # get the reward\n", 285 | " done = env_info.local_done[0] # see if episode has finished\n", 286 | " agent.step(state, action, reward, next_state, done) #do the learning\n", 287 | "\n", 288 | " score += reward # update the score\n", 289 | " state = next_state # roll over the state to next time step\n", 290 | " if done: # exit loop if episode finished\n", 291 | " break\n", 292 | " scores_window.append(score) # save most recent score\n", 293 | " scores.append(score) # save most recent score\n", 294 | " eps = max(eps_end, eps_decay*eps) # decrease epsilon\n", 295 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end=\"\")\n", 296 | " if i_episode % 100 == 0:\n", 297 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))\n", 298 | " if np.mean(scores_window)>=25.0:\n", 299 | " print('\\nEnvironment solved in {:d} episodes!\\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))\n", 300 | " torch.save(agent.qnetwork_local.state_dict(), filename_to_save)\n", 301 | " break\n", 302 | " torch.save(agent.qnetwork_local.state_dict(), filename_to_save) #for debug only\n", 303 | " return scores\n", 304 | "\n", 305 | "strt = time.time()\n", 306 | "scores = dqn()\n", 307 | "print('Training Time is {}'.format(time.time()-strt))\n", 308 | "\n", 309 | "# plot the scores\n", 310 | "fig = plt.figure()\n", 311 | "ax = fig.add_subplot(111)\n", 312 | "plt.plot(np.arange(len(scores)), scores)\n", 313 | "plt.ylabel('Score')\n", 314 | "plt.xlabel('Episode #')\n", 315 | "plt.show()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 6, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | }, 325 | { 326 
| "cell_type": "code", 327 | "execution_count": 4, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "Episode 1: 16.0\n", 335 | "Episode 2: 19.0\n", 336 | "Episode 3: 13.0\n", 337 | "All the scores[16.0, 19.0, 13.0]\n", 338 | "Mean Score: 16.0\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "#Testing the model\n", 344 | "from dqn_agent import Agent\n", 345 | "filename_to_load = './ddqn_checkpoint.pth'\n", 346 | "final_eps = 0.01\n", 347 | "\n", 348 | "agent = Agent(state_size=37, action_size=4, seed=0)\n", 349 | "# load the weights from file\n", 350 | "#agent.qnetwork_local.load_state_dict(torch.load(filename_to_load))\n", 351 | "\n", 352 | "#Since the model is trained on gpu, need to load all gpu tensors to cpu:\n", 353 | "agent.qnetwork_local.load_state_dict(torch.load(filename_to_load, map_location=lambda storage, loc: storage))\n", 354 | "\n", 355 | "\n", 356 | "#print([p for p in agent.qnetwork_local.parameters()])\n", 357 | "\n", 358 | "# #debug\n", 359 | "# state = env.reset()\n", 360 | "# print(agent.qnetwork_local(state))\n", 361 | "# print(agent.qnetwork_local.forward(state))\n", 362 | "# #end debug\n", 363 | "\n", 364 | "num_episodes = 100\n", 365 | "scores = []\n", 366 | "for i_episode in range(1,num_episodes+1):\n", 367 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 368 | " state = env_info.vector_observations[0] # get the current state\n", 369 | " score = 0 # initialize the score\n", 370 | " while True:\n", 371 | " action = agent.act(state, eps=final_eps) # select an action\n", 372 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 373 | " next_state = env_info.vector_observations[0] # get the next state\n", 374 | " reward = env_info.rewards[0] # get the reward\n", 375 | " done = env_info.local_done[0] # see if episode has finished\n", 376 | " #agent.step(state, action, reward, next_state, done) #do the learning\n", 377 | "\n", 378 | " score += reward # update the score\n", 379 | " state = next_state # roll over the state to next time step\n", 380 | " if done: # exit loop if episode finished\n", 381 | " print('Episode {}: {}'. 
format(i_episode, score))\n", 382 | " scores.append(score)\n", 383 | " break\n", 384 | "\n", 385 | "print('All the scores{}'.format(scores))\n", 386 | "print(\"Mean Score: {}\".format(np.mean(scores)))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 5, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "env.close()" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [] 411 | } 412 | ], 413 | "metadata": { 414 | "kernelspec": { 415 | "display_name": "Python 3", 416 | "language": "python", 417 | "name": "python3" 418 | }, 419 | "language_info": { 420 | "codemirror_mode": { 421 | "name": "ipython", 422 | "version": 3 423 | }, 424 | "file_extension": ".py", 425 | "mimetype": "text/x-python", 426 | "name": "python", 427 | "nbconvert_exporter": "python", 428 | "pygments_lexer": "ipython3", 429 | "version": "3.6.2" 430 | } 431 | }, 432 | "nbformat": 4, 433 | "nbformat_minor": 2 434 | } 435 | -------------------------------------------------------------------------------- /P1_Navigation/Readme.md: -------------------------------------------------------------------------------- 1 | **Project Report: https://medium.com/@amitp-ai/double-dqn-48562b5f31c1** 2 | 3 | 4 | [//]: # (Image References) 5 | 6 | [image1]: https://user-images.githubusercontent.com/10624937/42135619-d90f2f28-7d12-11e8-8823-82b970a54d7e.gif "Trained Agent" 7 | 8 | # Project 1: Navigation 9 | 10 | ### Introduction 11 | 12 | For this project, you will train an agent to navigate (and collect bananas!) in a large, square world. 13 | 14 | ![Trained Agent][image1] 15 | 16 | A reward of +1 is provided for collecting a yellow banana, and a reward of -1 is provided for collecting a blue banana. Thus, the goal of your agent is to collect as many yellow bananas as possible while avoiding blue bananas. 17 | 18 | The state space has 37 dimensions and contains the agent's velocity, along with ray-based perception of objects around agent's forward direction. Given this information, the agent has to learn how to best select actions. Four discrete actions are available, corresponding to: 19 | - **`0`** - move forward. 20 | - **`1`** - move backward. 21 | - **`2`** - turn left. 22 | - **`3`** - turn right. 23 | 24 | The task is episodic, and in order to solve the environment, your agent must get an average score of +13 over 100 consecutive episodes. 25 | 26 | ### Getting Started 27 | 28 | 1. Download the environment from one of the links below. You need only select the environment that matches your operating system: 29 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Linux.zip) 30 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana.app.zip) 31 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Windows_x86.zip) 32 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Windows_x86_64.zip) 33 | 34 | (_For Windows users_) Check out [this link](https://support.microsoft.com/en-us/help/827218/how-to-determine-whether-a-computer-is-running-a-32-bit-version-or-64) if you need help with determining if your computer is running a 32-bit version or 64-bit version of the Windows operating system. 
35 | 36 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Linux_NoVis.zip) to obtain the environment. 37 | 38 | 2. Place the file in the DRLND GitHub repository, in the `p1_navigation/` folder, and unzip (or decompress) the file. 39 | 40 | ### Instructions 41 | 42 | Follow the instructions in `Navigation_Final.ipynb` to get started with training your own agent! 43 | 44 | ### (Optional) Challenge: Learning from Pixels 45 | 46 | After you have successfully completed the project, if you're looking for an additional challenge, you have come to the right place! In the project, your agent learned from information such as its velocity, along with ray-based perception of objects around its forward direction. A more challenging task would be to learn directly from pixels! 47 | 48 | To solve this harder task, you'll need to download a new Unity environment. This environment is almost identical to the project environment, where the only difference is that the state is an 84 x 84 RGB image, corresponding to the agent's first-person view. (**Note**: Udacity students should not submit a project with this new environment.) 49 | 50 | You need only select the environment that matches your operating system: 51 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana_Linux.zip) 52 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana.app.zip) 53 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana_Windows_x86.zip) 54 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana_Windows_x86_64.zip) 55 | 56 | Then, place the file in the `p1_navigation/` folder in the DRLND GitHub repository, and unzip (or decompress) the file. Next, open `Navigation_Pixels.ipynb` and follow the instructions to learn how to use the Python API to control the agent. 57 | 58 | (_For AWS_) If you'd like to train the agent on AWS, you must follow the instructions to [set up X Server](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above. 
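### (Optional) Quick Environment Check

Before training, it can help to confirm that the Unity build and the `unityagents` Python API are working. The snippet below is only a rough sketch of a random-agent loop built from the same calls used in `Navigation_Final.ipynb`; the `file_name` path is an assumption (the Linux build unzipped into this folder), so point it at whichever build you downloaded above.

```python
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name="./Banana_Linux/Banana.x86_64")  # assumed path; change per OS
brain_name = env.brain_names[0]                     # default brain that controls the agent

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]             # 37-dimensional state vector
score = 0
while True:
    action = np.random.randint(4)                   # pick one of the 4 discrete actions at random
    env_info = env.step(action)[brain_name]         # send the action to the environment
    state = env_info.vector_observations[0]         # roll over to the next state
    score += env_info.rewards[0]                    # +1 for a yellow banana, -1 for a blue one
    if env_info.local_done[0]:                      # episode finished
        break
print('Random-agent score: {}'.format(score))
env.close()
```

A trained agent simply swaps the random choice for `agent.act(state, eps)` from `dqn_agent.py` after loading the saved weights in `ddqn_checkpoint.pth`.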
59 | -------------------------------------------------------------------------------- /P1_Navigation/ddqn_checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P1_Navigation/ddqn_checkpoint.pth -------------------------------------------------------------------------------- /P1_Navigation/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | 5 | from model import QNetwork 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | BUFFER_SIZE = int(1e5) # replay buffer size 12 | BATCH_SIZE = 64 # minibatch size 13 | GAMMA = 0.99 # discount factor 14 | TAU = 1e-3 # for soft update of target parameters 15 | LR = 5e-4 # learning rate 16 | UPDATE_EVERY = 4 # how often to update the network 17 | 18 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | 20 | class Agent(): 21 | """Interacts with and learns from the environment.""" 22 | 23 | def __init__(self, state_size, action_size, seed): 24 | """Initialize an Agent object. 25 | 26 | Params 27 | ====== 28 | state_size (int): dimension of each state 29 | action_size (int): dimension of each action 30 | seed (int): random seed 31 | """ 32 | self.state_size = state_size 33 | self.action_size = action_size 34 | self.seed = random.seed(seed) 35 | 36 | # Q-Network 37 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 38 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 39 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 40 | 41 | # Replay memory 42 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 43 | # Initialize time step (for updating every UPDATE_EVERY steps) 44 | self.t_step = 0 45 | 46 | def step(self, state, action, reward, next_state, done): 47 | # Save experience in replay memory 48 | self.memory.add(state, action, reward, next_state, done) 49 | 50 | # Learn every UPDATE_EVERY time steps. 51 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 52 | if self.t_step == 0: 53 | # If enough samples are available in memory, get random subset and learn 54 | if len(self.memory) > BATCH_SIZE: 55 | experiences = self.memory.sample() 56 | self.learn(experiences, GAMMA) 57 | 58 | def act(self, state, eps=0.): 59 | """Returns actions for given state as per current policy. 60 | 61 | Params 62 | ====== 63 | state (array_like): current state 64 | eps (float): epsilon, for epsilon-greedy action selection 65 | """ 66 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 67 | self.qnetwork_local.eval() 68 | with torch.no_grad(): 69 | action_values = self.qnetwork_local(state) #same as self.qnetwork_local.forward(state) 70 | self.qnetwork_local.train() 71 | 72 | # Epsilon-greedy action selection 73 | if random.random() > eps: 74 | return np.argmax(action_values.cpu().data.numpy()) 75 | else: 76 | return random.choice(np.arange(self.action_size)) 77 | 78 | def learn(self, experiences, gamma): 79 | """Update value parameters using given batch of experience tuples. 
80 | 81 | Params 82 | ====== 83 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 84 | gamma (float): discount factor 85 | """ 86 | states, actions, rewards, next_states, dones = experiences 87 | 88 | ## TODO: compute and minimize the loss 89 | #"*** YOUR CODE HERE ***" 90 | qs_local = self.qnetwork_local.forward(states) 91 | qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] 92 | qsa_local = qsa_local.reshape((BATCH_SIZE,1)) 93 | #print(qsa_local.shape) 94 | 95 | # # DQN Target 96 | # qs_target = self.qnetwork_target.forward(next_states) 97 | # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 98 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 99 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 100 | # TD_target = rewards + gamma * qsa_target 101 | # #print(qsa_target.shape, TD_target.shape, rewards.shape) 102 | 103 | # # Double DQN Target ver 1 104 | # qs_target = self.qnetwork_target.forward(next_states) 105 | # if random.random() > 0.5: 106 | # _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 107 | # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)] 108 | # else: 109 | # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 110 | # #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 111 | # ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 112 | 113 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 114 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 115 | # TD_target = rewards + gamma * qsa_target 116 | 117 | # Double DQN Target ver 2 (based upon double dqn paper) 118 | qs_target = self.qnetwork_target.forward(next_states) 119 | _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 120 | qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 121 | 122 | qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 123 | qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 124 | TD_target = rewards + gamma * qsa_target 125 | 126 | #print(qsa_target.shape, TD_target.shape, rewards.shape) 127 | 128 | # #Udacity's approach 129 | # # Get max predicted Q values (for next states) from target model 130 | # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) 131 | # # Compute Q targets for current states 132 | # TD_target = rewards + (gamma * Q_targets_next * (1 - dones)) 133 | # # Get expected Q values from local model 134 | # qsa_local = self.qnetwork_local(states).gather(1, actions) 135 | 136 | 137 | 138 | #diff = qsa_local - TD_target 139 | #loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar 140 | loss = F.mse_loss(qsa_local, TD_target) #much faster than the above loss function 141 | #print(loss) 142 | #minimize the loss 143 | self.optimizer.zero_grad() #clears the gradients 144 | loss.backward() 145 | self.optimizer.step() 146 | 147 | 148 | # ------------------- update target network ------------------- # 149 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 150 | 151 | def soft_update(self, local_model, 
target_model, tau): 152 | """Soft update model parameters. 153 | θ_target = τ*θ_local + (1 - τ)*θ_target 154 | 155 | Params 156 | ====== 157 | local_model (PyTorch model): weights will be copied from 158 | target_model (PyTorch model): weights will be copied to 159 | tau (float): interpolation parameter 160 | """ 161 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 162 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 163 | 164 | 165 | class ReplayBuffer: 166 | """Fixed-size buffer to store experience tuples.""" 167 | 168 | def __init__(self, action_size, buffer_size, batch_size, seed): 169 | """Initialize a ReplayBuffer object. 170 | 171 | Params 172 | ====== 173 | action_size (int): dimension of each action 174 | buffer_size (int): maximum size of buffer 175 | batch_size (int): size of each training batch 176 | seed (int): random seed 177 | """ 178 | self.action_size = action_size 179 | self.memory = deque(maxlen=buffer_size) 180 | self.batch_size = batch_size 181 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 182 | self.seed = random.seed(seed) 183 | 184 | def add(self, state, action, reward, next_state, done): 185 | """Add a new experience to memory.""" 186 | e = self.experience(state, action, reward, next_state, done) 187 | self.memory.append(e) 188 | 189 | def sample(self): 190 | """Randomly sample a batch of experiences from memory.""" 191 | experiences = random.sample(self.memory, k=self.batch_size) 192 | 193 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 194 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 195 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 196 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 197 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 198 | 199 | return (states, actions, rewards, next_states, dones) 200 | 201 | def __len__(self): 202 | """Return the current size of internal memory.""" 203 | return len(self.memory) -------------------------------------------------------------------------------- /P1_Navigation/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class QNetwork(nn.Module): 6 | """Actor (Policy) Model.""" 7 | 8 | def __init__(self, state_size, action_size, seed, hidden_layers = [64,64]): 9 | """Initialize parameters and build model. 
10 | Params 11 | ====== 12 | state_size (int): Dimension of each state 13 | action_size (int): Dimension of each action 14 | seed (int): Random seed 15 | """ 16 | super(QNetwork, self).__init__() 17 | self.seed = torch.manual_seed(seed) 18 | 19 | #"*** YOUR CODE HERE ***" 20 | self.fc1 = nn.Linear(state_size, hidden_layers[0]) 21 | self.relu1 = nn.ReLU() 22 | self.fc2 = nn.Linear(hidden_layers[0], hidden_layers[1]) 23 | self.relu2 = nn.ReLU() 24 | self.fc3 = nn.Linear(hidden_layers[1], action_size) 25 | 26 | def forward(self, state): 27 | """Build a network that maps state -> action values.""" 28 | state = self.fc1(state) 29 | state = self.relu1(state) 30 | state = self.fc2(state) 31 | state = self.relu2(state) 32 | state = self.fc3(state) 33 | return state 34 | 35 | -------------------------------------------------------------------------------- /P1_Navigation/visual_pixels/Navigation_Pixels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigation\n", 8 | "\n", 9 | "---\n", 10 | "\n", 11 | "Congratulations for completing the first project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893)! In this notebook, you will learn how to control an agent in a more challenging environment, where it can learn directly from raw pixels! **Note that this exercise is optional!**\n", 12 | "\n", 13 | "### 1. Start the Environment\n", 14 | "\n", 15 | "We begin by importing some necessary packages. If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/)." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from unityagents import UnityEnvironment\n", 25 | "import numpy as np\n", 26 | "import time\n", 27 | "from collections import deque\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import torch" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Next, we will start the environment! **_Before running the code cell below_**, change the `file_name` parameter to match the location of the Unity environment that you downloaded.\n", 37 | "\n", 38 | "- **Mac**: `\"path/to/VisualBanana.app\"`\n", 39 | "- **Windows** (x86): `\"path/to/VisualBanana_Windows_x86/Banana.exe\"`\n", 40 | "- **Windows** (x86_64): `\"path/to/VisualBanana_Windows_x86_64/Banana.exe\"`\n", 41 | "- **Linux** (x86): `\"path/to/VisualBanana_Linux/Banana.x86\"`\n", 42 | "- **Linux** (x86_64): `\"path/to/VisualBanana_Linux/Banana.x86_64\"`\n", 43 | "- **Linux** (x86, headless): `\"path/to/VisualBanana_Linux_NoVis/Banana.x86\"`\n", 44 | "- **Linux** (x86_64, headless): `\"path/to/VisualBanana_Linux_NoVis/Banana.x86_64\"`\n", 45 | "\n", 46 | "For instance, if you are using a Mac, then you downloaded `VisualBanana.app`. 
If this file is in the same folder as the notebook, then the line below should appear as follows:\n", 47 | "```\n", 48 | "env = UnityEnvironment(file_name=\"VisualBanana.app\")\n", 49 | "```" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "###env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", no_graphics=False)\n", 59 | "###env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", no_graphics=True)\n", 60 | "env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\") #suggested by Udacity" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# get the default brain\n", 77 | "brain_name = env.brain_names[0]\n", 78 | "brain = env.brains[brain_name]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### 2. Examine the State and Action Spaces\n", 93 | "\n", 94 | "The simulation contains a single agent that navigates a large environment. At each time step, it has four actions at its disposal:\n", 95 | "- `0` - walk forward \n", 96 | "- `1` - walk backward\n", 97 | "- `2` - turn left\n", 98 | "- `3` - turn right\n", 99 | "\n", 100 | "The environment state is an array of raw pixels with shape `(1, 84, 84, 3)`. *Note that this code differs from the notebook for the project, where we are grabbing **`visual_observations`** (the raw pixels) instead of **`vector_observations`**.* A reward of `+1` is provided for collecting a yellow banana, and a reward of `-1` is provided for collecting a blue banana. \n", 101 | "\n", 102 | "Run the code cell below to print some information about the environment." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# reset the environment\n", 112 | "env_info = env.reset(train_mode=True)[brain_name]\n", 113 | "\n", 114 | "# number of agents in the environment\n", 115 | "print('Number of agents:', len(env_info.agents))\n", 116 | "\n", 117 | "# number of actions\n", 118 | "action_size = brain.vector_action_space_size\n", 119 | "print('Number of actions:', action_size)\n", 120 | "\n", 121 | "# examine the state space\n", 122 | "state = env_info.visual_observations[0]\n", 123 | "\n", 124 | "print('States look like:')\n", 125 | "plt.imshow(np.squeeze(state, axis=0))\n", 126 | "plt.show()\n", 127 | "state_size = state.shape\n", 128 | "print('States have shape:', state.shape)\n", 129 | "#print(np.expand_dims(state, axis=4).shape) #this is unsqueeze\n", 130 | "state = state.reshape((-1,3,84,84))\n", 131 | "print('modified state is: ', state.shape)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### 3. 
Take Random Actions in the Environment\n", 139 | "\n", 140 | "In the next code cell, you will learn how to use the Python API to control the agent and receive feedback from the environment.\n", 141 | "\n", 142 | "Once this cell is executed, you will watch the agent's performance, if it selects an action (uniformly) at random with each time step. A window should pop up that allows you to observe the agent, as it moves through the environment. \n", 143 | "\n", 144 | "Of course, you'll have to change the code so that the agent is able to use its experience to gradually choose better actions when interacting with the environment!" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 154 | "state = env_info.visual_observations[0] # get the current state\n", 155 | "score = 0 # initialize the score\n", 156 | "while True:\n", 157 | " action = np.random.randint(action_size) # select an action\n", 158 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 159 | " next_state = env_info.visual_observations[0] # get the next state\n", 160 | " reward = env_info.rewards[0] # get the reward\n", 161 | " done = env_info.local_done[0] # see if episode has finished\n", 162 | " score += reward # update the score\n", 163 | " state = next_state # roll over the state to next time step\n", 164 | " #print(done)\n", 165 | " if done: # exit loop if episode finished\n", 166 | " break\n", 167 | " \n", 168 | "print(\"Score: {}\".format(score))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "When finished, you can close the environment." 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### 4. It's Your Turn!\n", 183 | "\n", 184 | "Now it's your turn to train your own agent to solve the environment! 
When training the environment, set `train_mode=True`, so that the line for resetting the environment looks like the following:\n", 185 | "```python\n", 186 | "env_info = env.reset(train_mode=True)[brain_name]\n", 187 | "```" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "scrolled": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "#Training the agent\n", 199 | "from unityagents import UnityEnvironment\n", 200 | "import numpy as np\n", 201 | "import time\n", 202 | "from collections import deque\n", 203 | "import matplotlib.pyplot as plt\n", 204 | "import torch\n", 205 | "import pickle\n", 206 | "from pixel_dqn_agent import Agent\n", 207 | "\n", 208 | "local_network_fn = './saved_agent/dqn_checkpoint_local.pth'\n", 209 | "target_network_fn = './saved_agent/dqn_checkpoint_target.pth'\n", 210 | "memory_buffer_fn = './saved_agent/memory_buffer'\n", 211 | "scores_fn = './saved_agent/scores.txt'\n", 212 | "#agent_fn = 'ddqn_checkpoint_agent.pth'\n", 213 | "def load_agent(agent): \n", 214 | " agent.qnetwork_local.load_state_dict(torch.load(local_network_fn, map_location=lambda storage, loc: storage))\n", 215 | " agent.qnetwork_target.load_state_dict(torch.load(target_network_fn, map_location=lambda storage, loc: storage))\n", 216 | " with open(scores_fn, \"rb\") as sf:\n", 217 | " agent.scores = pickle.load(sf)\n", 218 | " with open(memory_buffer_fn, \"rb\") as mf:\n", 219 | " agent.memory.memory = pickle.load(mf)\n", 220 | " #agent = Agent.load(agent_fn)\n", 221 | " return agent\n", 222 | "\n", 223 | "def save_agent(agent): \n", 224 | " torch.save(agent.qnetwork_local.state_dict(), local_network_fn)\n", 225 | " torch.save(agent.qnetwork_target.state_dict(), target_network_fn)\n", 226 | " with open(scores_fn, \"wb\") as sf:\n", 227 | " pickle.dump(agent.scores, sf, pickle.HIGHEST_PROTOCOL)\n", 228 | " with open(memory_buffer_fn, \"wb\") as mf:\n", 229 | " mem_to_save = deque(list(agent.memory.memory)[-50000:], maxlen=100000) #agent.memory.memory\n", 230 | " pickle.dump(mem_to_save, mf, pickle.HIGHEST_PROTOCOL)\n", 231 | " #agent.save(agent_fn)\n", 232 | " return None\n", 233 | "\n", 234 | "def train_agent():\n", 235 | " env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", seed=111)\n", 236 | " brain_name = env.brain_names[0]\n", 237 | " brain = env.brains[brain_name]\n", 238 | "\n", 239 | " final_eps = 0.01\n", 240 | " eps_start= 1.0 #0.01 #1.0\n", 241 | " agent = Agent(num_input_chnl=11, action_size=4, seed=0) #create a new agent\n", 242 | " #agent = load_agent(agent)\n", 243 | " \n", 244 | " def dqn(n_episodes=3000, max_t=1000, eps_start=eps_start, eps_end=final_eps, eps_decay=0.995):\n", 245 | " \"\"\"Deep Q-Learning.\n", 246 | "\n", 247 | " Params\n", 248 | " ======\n", 249 | " n_episodes (int): maximum number of training episodes\n", 250 | " max_t (int): maximum number of timesteps per episode\n", 251 | " eps_start (float): starting value of epsilon, for epsilon-greedy action selection\n", 252 | " eps_end (float): minimum value of epsilon\n", 253 | " eps_decay (float): multiplicative factor (per episode) for decreasing epsilon\n", 254 | " \"\"\"\n", 255 | " scores_window = deque(maxlen=100) # last 100 scores\n", 256 | " eps = eps_start # initialize epsilon\n", 257 | " for i_episode in range(1, n_episodes+1):\n", 258 | " env_info = env.reset(train_mode=True)[brain_name] # reset the environment\n", 259 | " state = env_info.visual_observations[0] # get the current state\n", 260 | " 
#print(type(state))\n", 261 | " state = state.reshape((-1,3,84,84))\n", 262 | " #state = np.expand_dims(state, axis=0)\n", 263 | " #print(state.shape)\n", 264 | " #state = torch.from_numpy(state)\n", 265 | " score = 0 # initialize the score\n", 266 | " for t in range(max_t): #this could also be while True instead\n", 267 | " aug_state = agent.augment_state(state) # augment the state\n", 268 | " action = agent.act(aug_state, eps) # select an action using e-greedy policy\n", 269 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 270 | " next_state = env_info.visual_observations[0] # get the next state\n", 271 | " next_state = next_state.reshape((-1,3,84,84))\n", 272 | " reward = env_info.rewards[0] # get the reward\n", 273 | " done = env_info.local_done[0] # see if episode has finished\n", 274 | " agent.step(state, action, reward, next_state, done, is_training=True) #add to experience buffer and do the learning\n", 275 | "\n", 276 | " score += reward # update the score\n", 277 | " state = next_state # roll over the state to next time step\n", 278 | " if done: # exit loop if episode finished\n", 279 | " break \n", 280 | "\n", 281 | " scores_window.append(score) # save most recent score\n", 282 | " agent.scores.append(score) # save most recent score\n", 283 | " eps = max(eps_end, eps_decay*eps) # decrease epsilon\n", 284 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end=\"\")\n", 285 | " if i_episode % 100 == 0:\n", 286 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))\n", 287 | " print(len(agent.memory.memory), agent.memory.memory.maxlen)\n", 288 | " if i_episode % 500 == 0: #save weights every 500 episodes\n", 289 | " save_agent(agent)\n", 290 | " if np.mean(scores_window)>=17.0:\n", 291 | " print('\\nEnvironment solved in {:d} episodes!\\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))\n", 292 | " save_agent(agent)\n", 293 | " break\n", 294 | " save_agent(agent) #save at the end\n", 295 | "\n", 296 | " return agent.scores\n", 297 | "\n", 298 | "\n", 299 | " strt = time.time()\n", 300 | " scores = dqn()\n", 301 | " print('\\nTraining Time is {}'.format(time.time()-strt))\n", 302 | " env.close()\n", 303 | "\n", 304 | "# # plot the scores\n", 305 | "# fig = plt.figure()\n", 306 | "# ax = fig.add_subplot(111)\n", 307 | "# plt.plot(np.arange(len(scores)), scores)\n", 308 | "# plt.ylabel('Score')\n", 309 | "# plt.xlabel('Episode #')\n", 310 | "# plt.show()\n", 311 | "\n", 312 | "\n", 313 | "#Train the agent\n", 314 | "print('Starting Agent Training:')\n", 315 | "train_agent()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 1, 328 | "metadata": { 329 | "scrolled": true 330 | }, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "Starting Agent Testing:\n" 337 | ] 338 | }, 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "INFO:unityagents:\n", 344 | "'Academy' started successfully!\n", 345 | "Unity Academy name: Academy\n", 346 | " Number of Brains: 1\n", 347 | " Number of External Brains : 1\n", 348 | " Lesson number : 0\n", 349 | " Reset Parameters :\n", 350 | "\t\t\n", 351 | "Unity brain name: BananaBrain\n", 352 | " Number of Visual Observations (per agent): 1\n", 353 | " Vector Observation space 
type: continuous\n", 354 | " Vector Observation space size (per agent): 0\n", 355 | " Number of stacked Vector Observation: 1\n", 356 | " Vector Action space type: discrete\n", 357 | " Vector Action space size (per agent): 4\n", 358 | " Vector Action descriptions: , , , \n" 359 | ] 360 | }, 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "Episode 1: 11.0\n", 366 | "Mean Score out of 1 episodes is 11.0\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "#Testing the agent\n", 372 | "from unityagents import UnityEnvironment\n", 373 | "import numpy as np\n", 374 | "import time\n", 375 | "from collections import deque\n", 376 | "import matplotlib.pyplot as plt\n", 377 | "import torch\n", 378 | "import pickle\n", 379 | "from pixel_dqn_agent import Agent\n", 380 | "\n", 381 | "local_network_fn = './saved_agent/dqn_checkpoint_local.pth'\n", 382 | "def test_agent(num_episodes=10):\n", 383 | " #env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", seed=111)\n", 384 | " env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\")\n", 385 | " brain_name = env.brain_names[0]\n", 386 | " brain = env.brains[brain_name]\n", 387 | "\n", 388 | " scores = []\n", 389 | " final_eps=0.01 \n", 390 | " agent = Agent(num_input_chnl=11, action_size=4, seed=0) #create a new agent\n", 391 | " agent.qnetwork_local.load_state_dict(torch.load(local_network_fn, map_location=lambda storage, loc: storage)) #load the weights\n", 392 | " \n", 393 | " for i_episode in range(1,num_episodes+1):\n", 394 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 395 | " state = env_info.visual_observations[0] # get the current state\n", 396 | " state = state.reshape((-1,3,84,84))\n", 397 | " score = 0 # initialize the score\n", 398 | " while True:\n", 399 | " aug_state = agent.augment_state(state) # augment the state\n", 400 | " action = agent.act(aug_state, final_eps) # select an action using e-greedy policy\n", 401 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 402 | " next_state = env_info.visual_observations[0] # get the next state\n", 403 | " next_state = next_state.reshape((-1,3,84,84))\n", 404 | " reward = env_info.rewards[0] # get the reward\n", 405 | " done = env_info.local_done[0] # see if episode has finished\n", 406 | " agent.step(state, action, reward, next_state, done, is_training=False) #only add to experience buffer and don't do learning\n", 407 | "\n", 408 | " score += reward # update the score\n", 409 | " state = next_state # roll over the state to next time step\n", 410 | " if done: # exit loop if episode finished\n", 411 | " scores.append(score)\n", 412 | " print('Episode {}: {}'.format(i_episode, score))\n", 413 | " break\n", 414 | " env.close()\n", 415 | " return np.mean(scores)\n", 416 | "\n", 417 | "print('Starting Agent Testing:')\n", 418 | "num_episodes=1 #100\n", 419 | "mean_score = test_agent(num_episodes)\n", 420 | "print(\"Mean Score out of {} episodes is {}\".format(num_episodes, mean_score))" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 3, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJztnXd8FVXax39PEgKhhkCAUEIQkCIoYARpKkUB0RV39xV1XbvYy77b4q6vdXVZdxXLqohr1wV7wSAKKNIUCb330CGhh5J+3j9m5mbuvTP3zsyddu99vp9PPpl75syc58ycOc8pz3kOCSHAMAzDMCleC8AwDMP4A1YIDMMwDABWCAzDMIwMKwSGYRgGACsEhmEYRoYVAsMwDAOAFQLDMAwjwwqBYRiGAcAKgWEYhpFJ81oAM7Rs2VLk5eV5LQbDMExcsXTp0oNCiOxo8eJKIeTl5aGoqMhrMRiGYeIKItphJB4PGTEMwzAAWCEwDMMwMqwQGIZhGACsEBiGYRgZVggMwzAMAFYIDMMwjAwrBIZhGAYAKwQmASivqsHHS3eDt4NlmNiIq4VpDKPFs7M2Ycq8bWiWUQ8X92zttTgME7dwD4GJe0rLKgAAZeVVHkvCMPENKwQmYeARI4aJDVYITNxDXgvAMAkCKwSGYRgGACsEhmEYRoYVApMw8BQCw8QGKwTGNNU1tbjg6e8xc81+r0WR8MEkwl8/W42CT1Z5LYZv5HCai5/9AZ8u2+21GDFTVVOLoU9/h2/W+uNbYoXAmObIqSrsPHwKD32+2mtRgvByYdr7i3di2pJdnqXvNzmcZnPJCfzvhyu9FiNmjpysxK7Dp/HQ52u8FgUAKwSGYeKMRFyR7pcssUJgLOOXQswwjD2wQmDiHvLDJALjGtwQcQ7HFQIRdSCi74loHRGtJaL75fAsIppFRJvl/82dloVJbLieYOKOQFvGH6XXjR5CNYDfCyF6AjgfwN1E1BNAAYA5QoiuAObIv5k4gLhBzniIP6rOxMRxhSCE2CeEWCYflwFYD6AdgCsAvC1HexvAOKdlYZxhz9HTOK7jWE7r3LHTVdh79HRQ2Ib9x22RpaZWYPOBsrDw8qoafLt2f2BCcvOBMtTUulO11NYKbNKQKVa0nqPXCCGweNsh7Dt22tAz3lJShqqa2ohxyqtqsK30hGEZjpysxIHj5YbjO0l5VQ22HzzptRiGcXUOgYjyAPQFsBhAayHEPvnUfgCafouJaAIRFRFRUWlpqStyMsZQPvXBE7/DmOfma8YZPPE7jJ40Lyhs5LM/YNDE7wK/C1ftw+jn5mPG6n2hlxtC3WP517cbcfGkedgaUoH0f3I2Jry7FK8v2I4N+4/j4knz8MKczZbSM8uU+dtwyaR5WLHrqK33vWRS8HP0A1+s2IvxU37CwL9/h4snzcNzszfpxt11+BRGPjsPf5+xIeI9f//RSgx/5gecrKg2JEPfJ2ZhwFNzTMntFPdPW45h/5qL8qoazfPK/Jdf5kVcUwhE1BjAJwAeEEIENQeF1GzTfCRCiClCiHwhRH52drYLkjJW2BOhpbr3WHBrTXFXrbBRbj3H3IoWwNIdRwAAJceD0zheLlUmq3Yfw76jkjx2V9B6rNotpbPniL2t+QMhefQD60N6est36j/jwycrAQBFOw5HvOeiLQcBABXVUk8insxOF2yWZK92qTcaK64oBCKqB0kZvC+E+FQOPkBEOfL5HAAlbsjC+BCPPnCeC3EAE6/SaNQU+UXVxpEiiFfcsDIiAK8DWC+EeFZ16ksAN8jHNwD4wmlZGHtwqh51y3xUuDwtmcz1mBGlGy0KhSiERHqcfmuUuLGF5mAAvwWwmohWyGF/ATARwIdEdAuAHQCuckEWJgEhnWMz17mB3z7+eCHw3BJJE/gUxxWCEGIB9L+9EU6nzyQPbrf8mXCceAOh+iARe1x+yRKvVGY8J9aPwWzL2+0KJRErMD2cmPBN5DkEv3UaWSEwuhw6UYHHp69DtY6duN0fv1Kxf7Z8N977aQeemrEetRrWGVU1tXhs+tqAlYqmbDpqRqCugiaVJtl/rBxPFq4ztDZh1+FTmPj1hqj5n75yb0QX4St3HcVr87YFhSl5O3QiNguiNxduR1FxuPXOmj3H8MrcrSivqsGfPl4ZZp5rhadnbkBeQSGOna7C9JXGTYeNlh/lNSmvRv1u1+w5Zugeby7cjqUqa6apP+/EQtl6ianDjTkEJk555Mu1+GrVPvTv1Byje+UEwsnmwfDQeuF3H9S5NR7ZozX6d8oKOj9j9T68ubAYx05X4dmr+tgiwx8/Xon5mw9iePfWGNi5RcS4d76/FGv2HMe4vm3RvU1T3Xj3Tl0OABh9VhvN81e8tBAAcNsFZwTCZq07gDcXFuPwyUo8f3Vfs9kI8Nj0dQCA4oljg8Ive3EBAKBXu6b4sGg3TlbW4KVr+1lOBwBenrsVADBp1ibsd2BBWGDISEOBXPbigrA8ahH6PB78dHXQb6/xiykt9xAYXaprZKuOkLLqZuHVGiZQgkJb8+qoehZLerIrq2WNzENUVWs/FztQ8qQ8e6dQ0jl+WnuFuRWcGtJRGiDK7X1Sd9qC3Y2rWGGFwETFy+9P6+MPrbS1Kv9IFbtyxuqnqHzDiVAx2VkhaT0PQ/ePEiclRf/+jL2wQmBM41SrRuuukSr2WKVQZ8PJykbJg5/agk5k17EeAhJ3UtlvsEJgoqJXkdn1eUYcprF2ytZrEhk7lZRZ7wxGowd6ZOZu7yuiDbP6JW+sEJioeDpkZDKukc6L1sdppdMT1+seHBFd47nacNeUwByCc3M3jAQrBMYybgyBRBomUNI3VZkH3c5aDkInOY1fZ08cOwgMY9mYnlMVtSJinPiH0yReRGeFkKD0fvQbXPzsD0FhI56Zi3Me+9a2NEIL+aRZm5BXUIjyqhrU1grkFRSaup9W5fTb13+2LN91/1msKYPWx6lVmY1+bh7OengmAMmvfV5BIfIKCm1RhJ8u220oXpe/zMAV/15gOZ1Ln5+PvIJCrN4dbK9/81tFAMJV4hNfrTP03k5XSs/j4S/WBMKmLdkVFu+HTaW4/MU6+a969UfkFRRiwFOzA2HRfRlJ/1fuOoq8gkIs2hq8fiCvoDDg5fb7jeZ8ZHZ6sBDjZPNfAFi87RDyCgqxfKd0v0MnKpBXUIjHp0vPZeaa6Osshj8zF30eD/7Ozn70W1z16o8ApP0a8goK8dj0tej3xCwAUvl79Mu1pr8Zu2GFkKCUlVdjc0nwoqOtpSdxzISZoV7rUe8DfvvHYgBSZVFjornoSMtSAAs0Fh6ph3m08qe2WNqwvwwnKyU/9naaZwLA5yv2GopXXSuwcrexxVdarNsnuaNevP2QofivL9huKJ5Sjt75cUfUuKtVi8d+3i4tDjPjulvpkf24TcqDVqWvLDL7du0Bw/cFpLKndoM+d5O058qirVJailv3NxZKz6Vwtf5CQ4VtpSdx9FR4eVHyrtzzzYXFQeffWlQMr2GFwOgSS0XthUWNEMY8pkbKl1vzAn
6xONKzGIs2CWrHUJPZ8hVpHYJzw1UU8juxYYXA2IZXk31WKye31iFo2+dbTNwiVitfPdwUPzQtN4tZ6HuyI22td88rlRnf43alZXR9QyzfjnRtpInq6DJYfy6kceRP/FE9SdSZnfpJqsSEFQKjS7SK165GjdXbRFIgRu7ppdcAv7ksCCX6kJF98ke7VUBJG3JdYa/SSCEeMmKYmHDNdNKCRrKzZx5LizXaI7K7Nax3Pz05opl4eqvP3OspOJFPbVcr/oC9nSYQ20pP4OCJyjDvoFaJ9jGEWiypK+jQAr6k+DDmrC9Bv9zMoPDqmlp8tmwPAMld9E2D87TTOlWFRVsPYkzvHM3zgDGTw5lr9yM1VcrYNyqLlN1HTke+kNSH0o+fth1CyfEKNG9UL3CusroWT81Yj6v7dwiEhVqcVNfUYs6GcFlnrN6HHYdOaSY/Y/U+DOnaEmXl1dheehJDuraMLG8I01fuw4QLOodnS+cdK+s/9h49jbd/LEb/vCxsPFCGiqpajOjRCsdPV5tKf+Wuo7qutoUAPlm6G1U1taiorsX8zaV44Zq+aJiehpMV1dh4oAwAsO9YeSB+KAu3HETHFg1RWhZuvaR2e22W0Ofz5cq9eG58H6SkSCdW7DqKWiGw58hp1AqBUSGebRUrNYVlO49g8bZwecrK657nzkOnkNuioWWZY4EVQgIx/Blp3YHdLn3taFX/z+QfNcPfWlQccJm8YX8Zrn1tsWa8e6ctx7xNpVjw52GBsLphC+n/rHUHMCiK62oAKFwVbkuumAIaQUn2qRkbws69MGcz3lpUHGRC+HPIvgR/K1wfdt2WkhO46/1lgd+nQyqSu95fhot7tsaiLQdxsrLG9DtevecY3lpozKQUAGau2Y9xfdth1HPzUFZejVd/qNu34fk5m02lDdS5+g5GKlgrdh0NMv0EgP/7fC2eueocPPR53ToHxexUi5+LD4c9Z4VfvaJd9oyg1ZqfumQnfjOgIwAErWEAgOsHdox4v1++vChqmhf883vP3HLzkBHjKaUhG8FsLdFuRe4+LLWcy6u0N+vxCyVl0fcD2H0kvBdwqjK4xa01ZLPnyOmwFqcZ9mva/mt3EZTen7rl6ib7j0sKeo9Gz81NgxytHtThE/obM+0/Zv9+EG7CCoGJilnTOwLZ/tHWyeCX0VbzRDJX9YnVoW9QWuZacx9uWhuZnULwua1AVFghMFHxQ2Vl1n+QkzLH+TcfhO5q9ETKZAxoWVP54HNwDFYIjKeEjtFG+9i0zrteeTmQYGi+NBcv2Z6qf/FLb8rsqzayjsXPsEJgomL0oxBBx/Z+tXX76hqrEPzcwjXyZNyq9Lx8TFbz6OpKZRfT8gOsEJioeLmnsoLZ1apeDXNZruRCLtR2vGc/flWcflm45xc53ILNThOc4c/MxcOX9cRF3Vq5mu7ny/dEjdPrkW9woiLYikVL2Yx89oegcOUb/Wz5HvzinLZBcRUvlde9Xme+Gsl9dKi74X3HTuO8J2dj8nX9dK9ZGWIiGStXvrwwzLpq7sbSMNkUz6UA8Pj0dYFjoy6TJ/+wNSzsm7UHkFdQiAV/Hob2zets33ccOoX+T84Oi+8WgR6hxrmPlxpzHf787M24Z3gXw2ne8tYSfCevZfnnNxux79hp/KSxZkBZN/H7j1ZGvF+P/5tpOO1QTlfWICM9FQBw39TlyGvZCP978ZmW72cU7iEkONtKT+LhL9a6k5jq6/3zJ6ujRg9VBnpsKTkRMIMMHTL662ero7aczbiP/nTZHpSWVeDGN5cEhRsdG7bSoFy+8yiOmzTvfMPEmgIjhCrw/y7eiRKNRV5uUTeHYL2rN2n2JlPu3udsKAkqW+/9tBNbdMyg9ZSB+v2frrJuIry5pCxw/OXKvXjBwtoPK7BCSAIsewO1uB2lE2aBVncpc5tI8vl58CF0aMTrkRK7kndieDNS+bbruXk1Oc0KIQmwWrT0viUv6uQUnTkEn+sHTfzi6jgSblRHhibXHZfCn3ilkFkhMDFht1mg3qWBhUoeWd+w62V3satH6MhmfBF7gfbU5KFeVt3CcYVARG8QUQkRrVGFPUpEe4hohfx3qdNyJDNWLSUMDRlZurN59GQRwuaN4h3MUTxZrHgta/w8KWdI8aip7kaybwEYrRE+SQjRR/6b4YIcSYvbH5cTVap6HUJwWja7iZZvF1YhcgfBVezSR3EwOqdJws4hCCHmAbDuf5YxRGW1vtM3o9YiQggs3XE4zEFbTchXVav6ffhknaMvJfTAcesOvvQ+4L2y07DSE+UoUrkzPnyyMkiGWFHMVtXWKVtKyrArmntsma80PKmq2XX4lC8d9P207RCOqJ6jUQuwWDimsRG9wtbSkwBi18OhTgPX7DmGnTouxo1yvNy45ZJVSssqsGjrwTD5ncbLdQj3ENH1AIoA/F4IcUQrEhFNADABAHJzc10UL7548FN9M0+jH/cHS3ahQL5P8cSxAZO7F+ZsDrL3f/n7Onv28zRs1cc8P99Qela4+a2ioN9VNQIzVu93LD0AGPnsPMNxo5kaDn36+1jFcYT5mw+i7xOzXE3z1neKdM9tP3hS2scgxib+hf+cG/T7she116Ro7aOgx5sLi/VP2tSwV9bR9Mhpas8NDeLVpPIrADoD6ANgH4Bn9CIKIaYIIfKFEPnZ2dluyRd3LNhSGvM91AufgLoNSUJtsZfv1NTdDGMrepsFOUGZC61+K6wP+SadxhOFIIQ4IISoEULUAngNQH8v5GCC8cqyQQ1b8zAKRG4aLdhT9r3/gmLDE4VAROp9EK8EsEYvLmMMOybPUlOMFWe9pOLBvt5L+PmYhx+Zuzg+h0BEUwFcBKAlEe0G8AiAi4ioD6S6pRjA7U7LwUTHqEJgmESDS76E4wpBCHGNRvDrTqebbNjR4/XBiBG3CJkAbppe+qHs+wFeqcwESOWvwlFY15kn3uaUvF7QFyvs/jqOeWXuVgzrno3crIY4oNpA/devLAqL+9q8bWFhNbUCT8/cgE4tG6FxgzS8PLfOnPTJwnVBG6wfL6/C87M3Y+zZOVhlwnsoU8cnBt02M3Ws2eOOlc0ymyznpq/ca8t9vIIVQhzzj5kbMGnWJtw9LNjne9GO8ML95Iz1YWHzNpXiVQ1FAQCvzQ92rzxp1ia8ubAYry+w1+1yMvHtugNeixBXuNnY/t0Hkfc2SBZ4yChOUSxWKmtqUV1rbeVrTa3x7nh1TfS48dW5ZxgmFFYIcUqiTr4maLYYJi5ghRCn1CaqRmAYxjNYIcQpijogYhtqhmHsgRVCnGJHB8GXFnLc8WEYz2CFEKco9tl+rNMZxg7i3aY/HmGz0zgg/2+z0aJROr753QWBsG4PzQQA1Arghe+2mL5nXkEhcrMaGo5v5Ns8VRnZ9bMRKmv8t1cA4w33TV3utQhJByuEOODgiQocPGHcX7tRdh427l6Y57AZJvHhISOGYZg4wA1vuawQGEPwcC7DeIuZhaRWYYXAMAwTB4Tube4ErBAYhmHiAIseakzBCoFhGCYOsOqzzAysEHzGrHUHcPRUpea5o6cqM
[... base64-encoded PNG image data elided (score vs. episode plot rendered by the cell below) ...]\n", 431 | "text/plain": [ 432 | "
" 433 | ] 434 | }, 435 | "metadata": {}, 436 | "output_type": "display_data" 437 | } 438 | ], 439 | "source": [ 440 | "#Plot the learning behavior\n", 441 | "import pickle\n", 442 | "import matplotlib.pyplot as plt\n", 443 | "\n", 444 | "with open('./saved_agent/scores.txt', 'rb') as f:\n", 445 | " scores = pickle.load(f)\n", 446 | "\n", 447 | "plt.plot(scores)\n", 448 | "plt.ylabel('Score')\n", 449 | "plt.xlabel('Episode #')\n", 450 | "plt.show()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.6.2" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /P1_Navigation/visual_pixels/pixel_dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | 5 | from pixel_model import QNetwork 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | BUFFER_SIZE = int(1e5) # replay buffer size 12 | BATCH_SIZE = 32 # minibatch size 13 | GAMMA = 0.99 # discount factor 14 | TAU = 1e-3 # for soft update of target parameters 15 | LR = 5e-4 # learning rate 16 | UPDATE_EVERY = 4 # how often to update the network 17 | REGULARIZATION = 1e-4 # regularization parameter 18 | 19 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 20 | 21 | class Agent(): 22 | """Interacts with and learns from the environment.""" 23 | 24 | def __init__(self, num_input_chnl, action_size, seed): 25 | """Initialize an Agent object. 26 | 27 | Params 28 | ====== 29 | num_input_chnl (int): number of input channels 30 | action_size (int): dimension of each action 31 | seed (int): random seed 32 | """ 33 | self.num_input_chnl = num_input_chnl 34 | self.action_size = action_size 35 | self.seed = seed 36 | random.seed(seed) #returns None 37 | 38 | # Q-Network 39 | self.qnetwork_local = QNetwork(num_input_chnl, action_size, seed).to(device) 40 | self.qnetwork_target = QNetwork(num_input_chnl, action_size, seed).to(device) 41 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR, weight_decay=REGULARIZATION) 42 | 43 | # Replay memory 44 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 45 | # Initialize time step (for updating every UPDATE_EVERY steps) 46 | self.t_step = 0 47 | # additional parameters (not used for now) 48 | self.episode = 0 49 | self.scores = [] 50 | 51 | def step(self, state, action, reward, next_state, done, is_training=True): 52 | # Save experience in replay memory 53 | self.memory.add(state, action, reward, next_state, done) 54 | 55 | # Learn every UPDATE_EVERY time steps. 
56 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 57 | if self.t_step == 0: 58 | # If enough samples are available in memory and in training mode, then get random subset and learn 59 | if len(self.memory) > BATCH_SIZE and is_training == True: 60 | experiences = self.memory.sample_augmented_experience() #self.memory.sample_old() 61 | self.learn(experiences, GAMMA) 62 | 63 | 64 | def augment_state(self, state): 65 | # Augment the state to include previous observations and actions 66 | input_image_shape = self.memory.input_image_shape 67 | if len(self.memory) >= 2: 68 | prev_idx = len(self.memory)-1 69 | prev_prev_idx = prev_idx-1 70 | prev_e = self.memory.memory[prev_idx] 71 | prev_prev_e = self.memory.memory[prev_prev_idx] 72 | 73 | #e.state and e.next_state is in Nx3xHxW format (augment state in the C dimension) 74 | prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*prev_e.action 75 | prev_prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*prev_prev_e.action 76 | aug_state = np.concatenate((prev_prev_e.state, prev_prev_e_a, prev_e.state, prev_e_a, state), axis=1) 77 | else: 78 | #e.state and e.next_state is in Nx3xHxW format (augment state in the C dimension) 79 | initial_action = 0 80 | prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*initial_action 81 | prev_prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*initial_action 82 | aug_state = np.concatenate((state, prev_prev_e_a, state, prev_e_a, state), axis=1) 83 | 84 | return aug_state 85 | 86 | 87 | def act(self, state, eps=0.): 88 | """Returns actions for given state as per current policy. 89 | 90 | Params 91 | ====== 92 | state (array_like): current state 93 | eps (float): epsilon, for epsilon-greedy action selection 94 | """ 95 | #state = torch.from_numpy(state).float().unsqueeze(0).to(device) 96 | state = torch.from_numpy(state).float().to(device) 97 | #print(state.shape) 98 | 99 | self.qnetwork_local.eval() 100 | with torch.no_grad(): 101 | action_values = self.qnetwork_local(state) #same as self.qnetwork_local.forward(state) 102 | self.qnetwork_local.train() 103 | 104 | # Epsilon-greedy action selection 105 | if random.random() > eps: 106 | return np.argmax(action_values.cpu().data.numpy()) 107 | else: 108 | return random.choice(np.arange(self.action_size)) 109 | 110 | def learn(self, experiences, gamma): 111 | """Update value parameters using given batch of experience tuples. 
112 | 113 | Params 114 | ====== 115 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 116 | gamma (float): discount factor 117 | """ 118 | states, actions, rewards, next_states, dones = experiences 119 | #print(states.shape, next_states.shape) 120 | #print(torch.sum(next_states[0,:,:,:]==states[1,:,:,:])) 121 | 122 | ## TODO: compute and minimize the loss 123 | #"*** YOUR CODE HERE ***" 124 | qs_local = self.qnetwork_local.forward(states) 125 | qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] 126 | qsa_local = qsa_local.reshape((BATCH_SIZE,1)) 127 | #print(qsa_local.shape) 128 | 129 | # DQN Target 130 | qs_target = self.qnetwork_target.forward(next_states) 131 | qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 132 | qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 133 | qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 134 | TD_target = rewards + gamma * qsa_target 135 | #print(qsa_target.shape, TD_target.shape, rewards.shape) 136 | 137 | # # Double DQN Target ver 1 138 | # qs_target = self.qnetwork_target.forward(next_states) 139 | # if random.random() > 0.5: 140 | # _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 141 | # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)] 142 | # else: 143 | # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 144 | # #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 145 | # ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 146 | 147 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 148 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 149 | # TD_target = rewards + gamma * qsa_target 150 | 151 | # # Double DQN Target ver 2 (based upon double dqn paper. Use this version, it's better.) 
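# # (In this version the local network selects the greedy action via argmax and the target network evaluates that action's value, decoupling action selection from evaluation to reduce Q-value overestimation.)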
152 | # qs_target = self.qnetwork_target.forward(next_states) 153 | # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 154 | # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 155 | 156 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 157 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 158 | # TD_target = rewards + gamma * qsa_target 159 | # #print(qsa_target.shape, TD_target.shape, rewards.shape) 160 | 161 | # #Udacity's approach 162 | # # Get max predicted Q values (for next states) from target model 163 | # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) 164 | # # Compute Q targets for current states 165 | # TD_target = rewards + (gamma * Q_targets_next * (1 - dones)) 166 | # # Get expected Q values from local model 167 | # qsa_local = self.qnetwork_local(states).gather(1, actions) 168 | 169 | 170 | 171 | #diff = qsa_local - TD_target 172 | #loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar 173 | loss = F.mse_loss(qsa_local, TD_target) #much faster than the above loss function 174 | #print(loss) 175 | #minimize the loss 176 | self.optimizer.zero_grad() #clears the gradients 177 | loss.backward() 178 | self.optimizer.step() 179 | 180 | 181 | # ------------------- update target network ------------------- # 182 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 183 | 184 | def soft_update(self, local_model, target_model, tau): 185 | """Soft update model parameters. 186 | θ_target = τ*θ_local + (1 - τ)*θ_target 187 | 188 | Params 189 | ====== 190 | local_model (PyTorch model): weights will be copied from 191 | target_model (PyTorch model): weights will be copied to 192 | tau (float): interpolation parameter 193 | """ 194 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 195 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 196 | 197 | @classmethod 198 | def load(cls, path): 199 | checkpoint = torch.load(path, map_location=lambda storage, loc: storage) #helps loading model when testing on local machine with no gpu 200 | my_agent = cls(checkpoint['num_input_chnl'], checkpoint['action_size'], checkpoint['seed']) 201 | my_agent.qnetwork_local.load_state_dict(checkpoint['local_state_dict']) 202 | my_agent.qnetwork_target.load_state_dict(checkpoint['target_state_dict']) 203 | my_agent.memory.memory = checkpoint['memory'] 204 | my_agent.episode = checkpoint['episode'] 205 | my_agent.scores = checkpoint['scores'] 206 | return my_agent 207 | 208 | 209 | def save(self, path): 210 | checkpoint = { 211 | 'num_input_chnl': self.num_input_chnl, 212 | 'action_size': self.action_size, 213 | 'seed': self.seed, 214 | 'local_state_dict': self.qnetwork_local.state_dict(), 215 | 'target_state_dict': self.qnetwork_target.state_dict(), 216 | 'memory': self.memory.memory, 217 | 'episode': self.episode, 218 | 'scores': self.scores 219 | } 220 | torch.save(checkpoint, path) 221 | 222 | Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 223 | class ReplayBuffer: 224 | """Fixed-size buffer to store experience tuples.""" 225 | 226 | def __init__(self, action_size, buffer_size, batch_size, seed): 227 | """Initialize a ReplayBuffer object. 
228 | 229 | Params 230 | ====== 231 | action_size (int): dimension of each action 232 | buffer_size (int): maximum size of buffer 233 | batch_size (int): size of each training batch 234 | seed (int): random seed 235 | """ 236 | self.action_size = action_size 237 | self.memory = deque(maxlen=buffer_size) 238 | self.batch_size = batch_size 239 | ###self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) #doesn't like tuple to be defined inside class when using pickle 240 | self.experience = Experience 241 | self.seed = seed 242 | random.seed(seed) #returns None 243 | self.input_image_shape = (84,84) 244 | 245 | def add(self, state, action, reward, next_state, done): 246 | """Add a new experience to memory.""" 247 | e = self.experience(state, action, reward, next_state, done) 248 | self.memory.append(e) 249 | 250 | def sample_old(self): 251 | """Randomly sample a batch of experiences from memory.""" 252 | experiences = random.sample(self.memory, k=self.batch_size) 253 | 254 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 255 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 256 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 257 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 258 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 259 | 260 | return (states, actions, rewards, next_states, dones) 261 | 262 | def sample_augmented_experience(self): 263 | """Randomly sample a batch of experiences from memory.""" 264 | #Note: the experiences are store in the memory in chronoogical order 265 | 266 | #experiences = list(self.memory)[0:self.batch_size] #get experiences in order 267 | 268 | aug_states = [] #augment state 269 | actions = [] 270 | rewards = [] 271 | aug_next_states = [] #augment next state 272 | dones = [] 273 | while len(aug_states) < self.batch_size: 274 | idx = random.sample(range(len(self.memory)), k=1)[0] 275 | #idx = 3+len(aug_states) #take experiences in order and in agent.step make sure 'len(self.memory) > BATCH_SIZE+5' 276 | e = self.memory[idx] 277 | if e is None or (idx-2) < 0 or (idx+1) >= len(self.memory): 278 | continue 279 | else: 280 | prev_e = self.memory[idx-1] 281 | prev_prev_e = self.memory[idx-2] 282 | next_e = self.memory[idx+1] 283 | 284 | #e.state and e.next_state is in Nx3xHxW format (augment state in the C dimension) 285 | prev_e_a = np.ones((1,1,self.input_image_shape[0],self.input_image_shape[1]))*prev_e.action 286 | prev_prev_e_a = np.ones((1,1,self.input_image_shape[0],self.input_image_shape[1]))*prev_prev_e.action 287 | aug_state = np.concatenate((prev_prev_e.state, prev_prev_e_a, prev_e.state, prev_e_a, e.state), axis=1) 288 | aug_states.append(aug_state) 289 | actions.append(e.action) 290 | rewards.append(e.reward) 291 | e_a = np.ones((1,1,self.input_image_shape[0],self.input_image_shape[1]))*e.action 292 | aug_next_state = np.concatenate((prev_e.state, prev_e_a, e.state, e_a, next_e.state), axis=1) 293 | aug_next_states.append(aug_next_state) 294 | dones.append(e.done) 295 | 296 | #augment state is of shape Nx11x84x84 297 | states = torch.from_numpy(np.vstack([s for s in aug_states])).float().to(device) 298 | actions = torch.from_numpy(np.vstack([a for a in actions])).long().to(device) 299 | 
rewards = torch.from_numpy(np.vstack([r for r in rewards])).float().to(device) 300 | next_states = torch.from_numpy(np.vstack([ns for ns in aug_next_states])).float().to(device) 301 | dones = torch.from_numpy(np.vstack([d for d in dones]).astype(np.uint8)).float().to(device) 302 | 303 | return (states, actions, rewards, next_states, dones) 304 | 305 | 306 | def __len__(self): 307 | """Return the current size of internal memory.""" 308 | return len(self.memory) -------------------------------------------------------------------------------- /P1_Navigation/visual_pixels/pixel_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class QNetwork(nn.Module): 6 | """Actor (Policy) Model.""" 7 | 8 | def __init__(self, num_input_chnl, action_size, seed, num_filters = [16,32], fc_layers=[64,64]): 9 | """Initialize parameters and build model. 10 | Params 11 | ====== 12 | num_input_chnl (int): Number of input channels 13 | action_size (int): Dimension of each action 14 | seed (int): Random seed 15 | """ 16 | super(QNetwork, self).__init__() 17 | self.seed = torch.manual_seed(seed) 18 | 19 | self.conv1 = nn.Conv2d(num_input_chnl, num_filters[0], kernel_size=(3,3), stride=1, padding=(1,1)) 20 | self.conv1bnorm = nn.BatchNorm2d(num_filters[0]) 21 | self.conv1relu = nn.ReLU() 22 | self.conv1maxp = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2)) 23 | #self.conv2d_1 = [self.conv1, self.bnorm1, self.relu1, self.maxp1] 24 | 25 | self.conv2 = nn.Conv2d(num_filters[0], num_filters[1], kernel_size=(3,3), stride=1, padding=(1,1)) 26 | self.conv2bnorm = nn.BatchNorm2d(num_filters[1]) 27 | self.conv2relu = nn.ReLU() 28 | self.conv2maxp = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2)) 29 | 30 | self.fc1 = nn.Linear(num_filters[1]*21*21, fc_layers[0]) 31 | self.fc1bnorm = nn.BatchNorm1d(fc_layers[0]) 32 | self.fc1relu = nn.ReLU() 33 | 34 | self.fc2 = nn.Linear(fc_layers[0], fc_layers[1]) 35 | self.fc2bnorm = nn.BatchNorm1d(fc_layers[1]) 36 | self.fc2relu = nn.ReLU() 37 | 38 | self.fc3 = nn.Linear(fc_layers[1], action_size) 39 | 40 | def forward(self, state): 41 | """Build a network that maps state -> action values.""" 42 | 43 | # for conv_1 in self.conv2d_1: 44 | # state = conv_1(state) 45 | 46 | state = self.conv1(state) 47 | state = self.conv1bnorm(state) 48 | state = self.conv1relu(state) 49 | state = self.conv1maxp(state) 50 | 51 | state = self.conv2(state) 52 | state = self.conv2bnorm(state) 53 | state = self.conv2relu(state) 54 | state = self.conv2maxp(state) 55 | 56 | #print(state.shape) #state is of shape Nx32x21x21 57 | state = state.reshape((-1,32*21*21)) #reshape the output of conv2 before feeding into fc1 layer 58 | 59 | state = self.fc1(state) 60 | state = self.fc1bnorm(state) 61 | state = self.fc1relu(state) 62 | 63 | state = self.fc2(state) 64 | state = self.fc2bnorm(state) 65 | state = self.fc2relu(state) 66 | 67 | state = self.fc3(state) 68 | 69 | return state 70 | 71 | ''' 72 | Note: when training, do model_name.train() to properly update batchnorm variables. 73 | And during inference, do model_name.eval() to us the batch norm statistics from training time. 74 | The dqn_agent's act method already handles this. 75 | 76 | To speed up inference turn off gradients like this: 77 | with torch.no_grad(): 78 | action = model.forward(state) 79 | 80 | ''' 81 | 82 | # If it doesn't work, maybe remove batchnorm. 
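# Hypothetical usage sketch (not part of the original module): a quick shape check for QNetwork,
# assuming the 84x84 frames and the 11-channel augmented state (3 RGB frames + 2 constant action
# planes) built by pixel_dqn_agent.py, and a 4-action environment. Two 2x2 max-pools reduce
# 84 -> 42 -> 21, which is why fc1 expects 32*21*21 = 14112 inputs.
if __name__ == "__main__":
    net = QNetwork(num_input_chnl=11, action_size=4, seed=0)
    net.eval()                           # use running batch-norm statistics (see note above)
    dummy = torch.randn(4, 11, 84, 84)   # (N, C, H, W) batch of augmented pixel observations
    with torch.no_grad():
        q_values = net(dummy)
    print(q_values.shape)                # expected: torch.Size([4, 4])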
83 | 84 | -------------------------------------------------------------------------------- /P2_Continuous_Actions/README.md: -------------------------------------------------------------------------------- 1 | [//]: # (Image References) 2 | 3 | [image1]: https://user-images.githubusercontent.com/10624937/43851024-320ba930-9aff-11e8-8493-ee547c6af349.gif "Trained Agent" 4 | [image2]: https://user-images.githubusercontent.com/10624937/43851646-d899bf20-9b00-11e8-858c-29b5c2c94ccc.png "Crawler" 5 | 6 | Project Report: https://medium.com/@amitp-ai/policy-gradients-1edbbbc8de6b 7 | 8 | # Project 2: Continuous Control 9 | 10 | ### Introduction 11 | 12 | For this project, you will work with the [Reacher](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#reacher) environment. 13 | 14 | ![Trained Agent][image1] 15 | 16 | In this environment, a double-jointed arm can move to target locations. A reward of +0.1 is provided for each step that the agent's hand is in the goal location. Thus, the goal of your agent is to maintain its position at the target location for as many time steps as possible. 17 | 18 | The observation space consists of 33 variables corresponding to position, rotation, velocity, and angular velocities of the arm. Each action is a vector with four numbers, corresponding to torque applicable to two joints. Every entry in the action vector should be a number between -1 and 1. 19 | 20 | ### Distributed Training 21 | 22 | For this project, we will provide you with two separate versions of the Unity environment: 23 | - The first version contains a single agent. 24 | - The second version contains 20 identical agents, each with its own copy of the environment. 25 | 26 | The second version is useful for algorithms like [PPO](https://arxiv.org/pdf/1707.06347.pdf), [A3C](https://arxiv.org/pdf/1602.01783.pdf), and [D4PG](https://openreview.net/pdf?id=SyZipzbCb) that use multiple (non-interacting, parallel) copies of the same agent to distribute the task of gathering experience. 27 | 28 | ### Solving the Environment 29 | 30 | Note that your project submission need only solve one of the two versions of the environment. 31 | 32 | #### Option 1: Solve the First Version 33 | 34 | The task is episodic, and in order to solve the environment, your agent must get an average score of +30 over 100 consecutive episodes. 35 | 36 | #### Option 2: Solve the Second Version 37 | 38 | The barrier for solving the second version of the environment is slightly different, to take into account the presence of many agents. In particular, your agents must get an average score of +30 (over 100 consecutive episodes, and over all agents). Specifically, 39 | - After each episode, we add up the rewards that each agent received (without discounting), to get a score for each agent. This yields 20 (potentially different) scores. We then take the average of these 20 scores. 40 | - This yields an **average score** for each episode (where the average is over all 20 agents). 41 | 42 | The environment is considered solved, when the average (over 100 episodes) of those average scores is at least +30. 43 | 44 | ### Getting Started 45 | 46 | 1. Download the environment from one of the links below. 
You need only select the environment that matches your operating system: 47 | 48 | - **_Version 1: One (1) Agent_** 49 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Linux.zip) 50 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher.app.zip) 51 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Windows_x86.zip) 52 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Windows_x86_64.zip) 53 | 54 | - **_Version 2: Twenty (20) Agents_** 55 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Linux.zip) 56 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher.app.zip) 57 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Windows_x86.zip) 58 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Windows_x86_64.zip) 59 | 60 | (_For Windows users_) Check out [this link](https://support.microsoft.com/en-us/help/827218/how-to-determine-whether-a-computer-is-running-a-32-bit-version-or-64) if you need help with determining if your computer is running a 32-bit version or 64-bit version of the Windows operating system. 61 | 62 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Linux_NoVis.zip) (version 1) or [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Linux_NoVis.zip) (version 2) to obtain the "headless" version of the environment. You will **not** be able to watch the agent without enabling a virtual screen, but you will be able to train the agent. (_To watch the agent, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 63 | 64 | 2. Place the file in the DRLND GitHub repository, in the `p2_continuous-control/` folder, and unzip (or decompress) the file. 65 | 66 | ### Instructions 67 | 68 | Follow the instructions in `Continuous_Control.ipynb` to get started with training your own agent! 69 | 70 | ### (Optional) Challenge: Crawler Environment 71 | 72 | After you have successfully completed the project, you might like to solve the more difficult **Crawler** environment. 73 | 74 | ![Crawler][image2] 75 | 76 | In this continuous control environment, the goal is to teach a creature with four legs to walk forward without falling. 77 | 78 | You can read more about this environment in the ML-Agents GitHub [here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#crawler). To solve this harder task, you'll need to download a new Unity environment. (**Note**: Udacity students should not submit a project with this new environment.) 
79 | 80 | You need only select the environment that matches your operating system: 81 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Linux.zip) 82 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler.app.zip) 83 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Windows_x86.zip) 84 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Windows_x86_64.zip) 85 | 86 | Then, place the file in the `p2_continuous-control/` folder in the DRLND GitHub repository, and unzip (or decompress) the file. Next, open `Crawler.ipynb` and follow the instructions to learn how to use the Python API to control the agent. 87 | 88 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Linux_NoVis.zip) to obtain the "headless" version of the environment. You will **not** be able to watch the agent without enabling a virtual screen, but you will be able to train the agent. (_To watch the agent, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 89 | 90 | -------------------------------------------------------------------------------- /P2_Continuous_Actions/Report.md: -------------------------------------------------------------------------------- 1 | [image1]: https://user-images.githubusercontent.com/23042512/48657451-cc597f00-e9e5-11e8-8332-bf97ee7da5f8.gif "Trained Agent Perf" 2 | [image2]: https://user-images.githubusercontent.com/23042512/48657452-cc597f00-e9e5-11e8-8776-37a144f24702.png "Trained Agent Scores" 3 | 4 | ## Introduction 5 | ![Trained Agent][image1] 6 | 7 | 8 | This report outlines my implementation for Udacity's Deep Reinforcement Learning Nanodegree's second project on the Reacher environment. In this project, the goal is to train an acrobat arm that has two joints so that it tracks a balloon. As the balloon moves, the two joints at adjusted to track the balloon. So this is a classical robotics problem, and using model-free reinforcement learning, the agent will learn the optimal policy. In particular, the method used is the deep deterministic policy gradient method (DDPG). 9 | 10 | Value based reinforcement learning algorithms such as DQN have shown great performance in many domains. However, they are still limited to discrete action space environments and for deterministic policies (as they are essentially based upon a deterministic greedy policy as epsilon only selects a uniform-random action). Moreover, with value based methods, we first compute the value-function for each state, and use that to determine the best policy. This is an indirect way of finding the optimal policy. 11 | 12 | Using a policy based method, on the other hand, we directly find the policy that yields the most rewards. Policy gradient is one of the more efficient policy based learning algorithms where we directly compute the gradient of the expected reward with respect to the policy parameters. In addition to being direct, it works well with continuous actions as well as stochastic policies. 
Other policy based methods include stochastic optimization methods such as random shooting, cross entropy method, etc. 13 | 14 | Before delving into my implementation for this project, I have included below some basics on policy gradient methods. In particular, this article will walk the reader from the basic objective of Reinforcement Learning (RL) to some of the advanced policy-gradient algorithms, such as Reinforce, Actor-Critic, Advantage Actor-Critic, Deterministic Policy Gradient, and Deep Deterministic Policy Gradient. For thorough understanding, it is assumed the reader is well versed in Probability & Statistics, Linear Algebra, Vector Calculus, and basic Reinforcement Learning terminologies. 15 | 16 | --------------------------------------------------------------- 17 | ## Reinforce Algorithm 18 | Please note, in the below analyses, the discount factor γ is assumed to be 1 for simplicity. But all the analyses can be easily extended to cases where γ is not 1. 19 | 20 | The basic objective in all of Reinforcement Learning (RL) is to maximize the expected total utility Uθ, which is defined as follows [1]: 21 | 22 | 23 | 24 | After doing some Math, it can be shown that Uθ is equal to the expected value of Q(s0,a0). And if the initial state distribution is uniform, then it means the goal in RL is to find a policy which maximizes the q-values of all possible states. 25 | 26 | 27 | 28 | Using the definition of expectation, the above equation 1a can be re-written as: 29 | 30 | 31 | 32 | Using Policy gradient method, we can maximize Uθ by first computing its gradient with respect to θ, which can readily be derived to be: 33 | 34 | 35 | 36 | One approach to improving the expected total reward is to randomly add noise to the current θ and if it results in better total reward, then we keep it, otherwise we ignore it, and we keep repeating this process. This method is called the random shooting method. There are other more sophisticated methods in the same vein such as the Cross Entropy Method. All these methods fall under the domain of stochastic optimization algorithms. However, while these methods are very simple to implement, they are not efficient and don't scale well with high dimensional space. A more efficient approach is to change θ in the direction of the gradient using Stochastic Gradient Ascent as follows: 37 | 38 | 39 | 40 | A basic policy gradient algorithm making use of the above gradient is known as the Reinforce algorithm, and here is how it works: 41 | 42 | ***A Basic Reinforce Algorithm:*** 43 | 44 | Start with a random vector θ and repeat the following 3 steps until convergence: 45 | 46 | 1. Use the policy Pθ(at|st) to collect m trajectories {τ1, τ2, ..., τm}, where each trajectory is as defined above. 47 | 2. Use these trajectories to compute the Monte-Carlo estimator of the gradient as follows: 48 | 49 | 50 | 51 | Note that the reason why the above estimator is valid is because the trajectories are generated by following the policy being learned, i.e. Pθ(τ) -- i.e. it is an on-policy algorithm. Another way to say it is that we sample each of the trajectories in {τ1, τ2, ..., τm} from the probability distribution Pθ(τ). 52 | 53 | 3. 
Update the weights/parameters of the policy network using the above estimator of the gradient: 54 | 55 | 56 | 57 | The intuition behind the reinforce algorithm is that if the total reward is positive, then all the actions taken in that trajectory are reinforced whereas if the total reward is negative, then all the actions taken in the trajectory are inhibited. Moreover, to be computationally efficient, typically m is set to 1. 58 | 59 | While better than stochastic optimization methods, the Reinforce algorithm suffers from a few drawbacks: 60 | 1. The gradient estimator is pretty noisy, especially for the case m=1, because a single trajectory maynot be representative of the policy. 61 | 2. There is no clear credit assignment. A trajectory may contain many good and bad actions, and whether those actions are reinforced or not depend only on the total reward achieved starting from the initial state. 62 | 3. It is very sensitive to the absolute value of the rewards. For example, adding a fixed constant to all the rewards can drastically change the behavior of the algorithm. Such a trivial transformation should have no effect on the optimal policy. 63 | 64 | By the definition of the gradient, ∇θUθ points in the direction of the maximum change in Uθ. However, at a fundamental level, the above drawbacks of Reinforce algorithm are due to the fact that the Monte-Carlo estimator of ∇θUθ (i.e. ĝ) has high variance. If we can reduce its variance, then our estimate of gradient (ĝ) will be closer to the true gradient ∇θUθ. 65 | 66 | While the Monte-Carlo estimator of the gradient (ĝ) is unbiased, it exhibits high variance. As discussed below, there are a few ways of reducing variance without introducing bias: 1) using causality and 2) using a baseline. 67 | 68 | ## Actor-Critic Algorithm 69 | 70 | One way to reduce variance is by taking advantage of causality: ĝ updates all the actions in a trajectory based upon total rewards and not the rewards to go. That is to say, future actions affect past rewards, which is not possible in our causal Universe. So we can make the gradient estimator more realistic by using rewards to go as shown in the below equation. 71 | 72 | 73 | 74 | Note that using the rewards to go instead of the total rewards still results in an unbiased estimator of ∇θUθ because causality is handled in the expectation in Equation 3 using Pθ(τ). Moreover, doing so reduces variance because the rewards to go expression has fewer terms (and thus lower uncertainty) than the total rewards expression. 75 | 76 | An important aside to note is that the rewards to go is really an estimate of the the q-value of (st, at). This is because the q-value is defined as follows: 77 | 78 | 79 | 80 | 81 | And so, if the trajectory τ is sampled from Pθ(τ), then the single-sample Monte-Carlo estimate of QPθ(st, at) is just: 82 | 83 | 84 | 85 | As shown above, instead of using the Monte-Carlo estimator of the rewards to go as in Equation 7, we can use the Q-value estimator of the rewards to go. As a result, Equation 7 can be re-written as: 86 | 87 | 88 | 89 | If QhatPθ(st, at) is modeled using a neural network (parameterized by w), then we get: 90 | 91 | 92 | 93 | Note that because the state-action space can be very high dimensional, it quickly runs into Bellman's curse of dimensionality; and thus, in most practical situations with complex state-transition dynamics, QhatPθ(st, at) is modeled using a neural network based function approximator. 
94 | 95 | Then Equation 10 can be re-written as: 96 | 97 | 98 | 99 | Whereby, Pθ(at | st) is the actor network that is parameterized by θ and QhatPθ(st, at, w) is the critic network that is parameterized by w. This is essentially what is known as the actor-critic algorithm. 100 | 101 | For any visited state-action pair (s,a), the actor network is updated using Equation 6 (utilizing ĝ from Equation 12), and the critic network is typically updated using Temporal-Difference learning (due to its lower variance than Monte-Carlo learning) using the following update equation: 102 | 103 | 104 | 105 | Whereby the weight vector w is updated to reduce the loss L(w), which is defined as: 106 | 107 | 108 | 109 | and using Q-learning (so that the critic is based of off an off-policy algorithm): 110 | 111 | 112 | 113 | and so 114 | 115 | 116 | 117 | whereby 118 | 119 | 120 | 121 | This is the basics of the actor-critic algorithm. While there are many variants of it, as we will see below, this is the basic core of it. 122 | 123 | ## Advantage Actor-Critic Algorithm 124 | 125 | In addition to using the rewards to go (due to causality), another approach to minimizing the variance of ĝ is by subtracting out a baseline b that is not dependent on θ or action a -- and this combined term is known as the Advantage function. It can be mathematically proved that such a transformation is not only unbiased, but it reduces variance. An intuitive explanation for why it reduces variance is because the term multiplying ∇θlog(Pθ(a|s)) has smaller magnitude, which essentially reduces the variance of the overall expression. 126 | 127 | 128 | 129 | There are many choices for the baseline b, and in theory, the optimal value of b can also be computed. However, in the interest of simplicity and to be intuitive, a commonly used baseline is the q-value averaged over all the actions, i.e. the state-value. 130 | 131 | 132 | 133 | The Advantage function is then written as follows: 134 | 135 | 136 | 137 | The basic idea with using this advantage function is that actions with higher q-value than the average (i.e. state-value) are reinforced where as other actions are inhibited. This makes a lot more intuitive sense than the gradient equation used in the original Reinforce algorithm. And so it's not totally surprising that Mathematically it results in lower variance. Moreover, now the gradient is no longer dependent on the absolute value of the rewards. 138 | 139 | One problem with the above Equation is that, in practice, it is very difficult to compute the above expectation -- especially for continuous actions or high dimensional action space. Hence, the state-value function is modeled with a separate neural network that is parameterized by wv as follows: 140 | 141 | 142 | 143 | The advantage function now becomes: 144 | 145 | 146 | 147 | The issue with this advantage function is that it requires two separate neural networks. With some clever re-ordering, we can re-write the Advantage function using a single neural network. However, inorder to do so, let us first re-visit the above analysis. 
Basically, the ideal Advantage function we would like to have is: 148 | 149 | 150 | 151 | As defined in Equation 8 above, state-action value can be further simplified interms of the state-value function as: 152 | 153 | 154 | 155 | The single-sample Monte-Carlo estimate of QPθ(st, at) as defined in the Equation above is: 156 | 157 | 158 | 159 | And so now we just need to represent the state-value function using a neural network parameterized by wv as follows: 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | And thus the Advantage function can now be represented using a single neural network parameterized with wv. Note with the above equation for the Advantage function, it is really just the one-step TD error (i.e. TD(0) error). Additionally, it is also possible to represent it using TD(λ) error. 168 | 169 | The gradient equation for Advantage Actor Critic is now going to be: 170 | 171 | 172 | 173 | And this is going to be a much better estimator of the expected gradient (Equation 3), i.e. with lower variance and still be unbiased, even for m=1. As a result, the algorithm will learn much faster. 174 | 175 | wv is updated as follows: 176 | 177 | 178 | 179 | whereby using one-step TD learning (i.e. TD(0)): 180 | 181 | 182 | 183 | Using the gradient estimator from Equation 29, the weight update from Equation 30, and the remaining steps from the basic Reinforce algorithm results in what is known as the Advantage Actor-Critic algorithm. 184 | 185 | To briefly summarize the above discussion, the main downside of the Reinforce algorithm is that the gradient estimator is based upon the Monte-Carlo estimator of the expected total reward from the initial state-action pair -- which while has low bias, it has high variance. By using causality and subtracting out a baseline from the Monte-Carlo estimator, we can reduce the variance. The variance is further reduced by using TD estimator of the expected total reward to go instead of Monte-Carlo estimator. 186 | 187 | ## Deterministic Policy Gradient (DPG) Algorithm 188 | 189 | For stochastic policies in continuous environments, the actor outputs the mean and variance of a Gaussian distribution. And an action is sampled from this Gaussian distribution. For deterministic actions, while this approach still works as the network will learn to have very low variance, it involves complexity and computational burden that unnecessarily slows down the learning algorithm. To address these short comings, for deterministic actions, we can use what is known as the deterministic policy gradient. 190 | 191 | In stochastic case, the policy gradient integrates over both state and action spaces, whereas in the deterministic case it only integrates over the state space. As a result, computing the deterministic policy gradient can potentially require fewer samples. But in order to fully explore the state space, the basic idea is to choose actions according to a stochastic behavior policy and learn about a deterministic target policy (i.e. needs to be an off-policy algorithm). 192 | 193 | DPG is essentially a deterministic version of Actor-Critic algorithm. For a basic DPG algorithm, we have two neural networks, one network (parameterized by θ) is estimating the optimal target policy and the second network (parameterized by w) is estimating the action-value function corresponding to the target policy. The below equations formalize this. 
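For reference, the standard form of this actor-critic pair, as given in the DPG/DDPG papers cited at the end of this report, can be written as follows (with μθ the deterministic actor and Q̂(s, a; w) the critic):

```latex
% Deterministic policy gradient: the direction used to update the actor parameters \theta
\nabla_\theta J(\theta) \;\approx\; \mathbb{E}_{s_t}\left[ \nabla_a \hat{Q}(s_t, a; w)\big|_{a=\mu_\theta(s_t)} \; \nabla_\theta \mu_\theta(s_t) \right]

% TD error used to train the critic, with the deterministic policy supplying the next action
\delta_t \;=\; r_t + \gamma\, \hat{Q}\big(s_{t+1}, \mu_\theta(s_{t+1}); w\big) \;-\; \hat{Q}(s_t, a_t; w)
```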
194 | 195 | As mentioned above, because the target policy is deterministic, the actor may not explore the state space well enough to find the optimal policy. To address this, we use a behavior policy (b(st)) that is different from the target policy: it is simply the target policy with some additional noise. For simplicity, we will use a Normal distribution as our noise source. Note, however, that the choice of noise process is effectively a hyperparameter, and in the implementation for the Reacher environment below, a different noise process is used. 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | **Deterministic Policy Gradient Update:**
206 | 1. Actor network is updated as follows: 207 | 208 | 209 | 210 | which by chain rule, it becomes: 211 | 212 | 213 | 214 | 2. The critic network is updated as follows: 215 | 216 | The TD error is given by: 217 | 218 | 219 | 220 | and the weight update is: 221 | 222 | 223 | 224 | To reiterate, in order to properly balance exploration-exploitation tradeoff, while the target policy μ is deterministic, the behavior policy is stochastic. So this is an off-policy version of the DPG algorithm. While stochastic off-policy actor-critic algorithms typically use importance sampling for both the actor and the critic, because the deterministic policy gradient removes expectation over the actions, and given the state transition dynamics are same for both the target and behavior policies as they operate in the same environment, importance sampling ratio is not needed. So we can avoid having to use importance sampling in the actor, and with same reasoning, we avoid using importance sampling in the critic [2]. For those who are wondering, similar reasoning applies as to why we don't use importance sampling with Q-learning. 225 | 226 | ## Deep Deterministic Policy Gradient (DDPG) Algorithm 227 | DDPG is basically DPG with a few training changes adopted from the DQN architecture. 228 | 229 | One challenge when using neural networks for reinforcement learning is that most optimization algorithms assume the samples are independently and identically distributed. Obviously this assumption doesn't hold true because the samples are generated by exploring sequentially in an environment. Because DDPG is an off policy algorithm, we can use the replay buffer (a finite sized cache) as in DQN to address this issue. At each timestep the actor and critic are updated by sampling a minibatch uniformly from the buffer [2]. 230 | 231 | For the critic, since the network being updated is also used in calculating the target, this can potentially lead to training instabilities for highly nonlinear function approximators like neural networks. One solution to address this is using a separate target network, as with DQN [2]. Given the target values are determined using both the critic and actor networks, we create a copy of both of these networks and soft update their weights to the respective learned networks. [Please refer to my github code for details.](https://github.com/gtg162y/DRLND/blob/master/P2_Continuous_Actions/Continuous_Control_UdacityWorkspace.ipynb) 232 | 233 | --------------------------------------------------------------- 234 | ## DDPG Implementation for Reacher Environment 235 | 236 | Having now seen some of the commonly used policy gradient algorithms, we can now get to my implementation for the Udacity's Reacher project. In this environment, a double-jointed arm (acrobot) can move to target locations (i.e. where the balloons are). A reward of +0.1 is provided for each step that the agent's hand is in the goal location. Thus, the goal of the agent is to maintain its position at the target location for as many time steps as possible. As the balloon moves, the two joints at adjusted to track the balloon. So this is a classical robotics project, and using model-free reinforcement learning, the agent will learn the optimal policy. In particular, the method used is the deep deterministic policy gradient method (DDPG). The observation space consists of 33 variables corresponding to position, rotation, velocity, and angular velocities of the arm. 
Each action is a vector with four numbers, corresponding to the torque applicable to its two joints. Every entry in the action vector is a number between -1 and 1. 237 | 238 | The Reacher environment used here contains 20 identical agents, each with its own copy of the environment. In order to be considered solved, the agents must get an average score of +30 (over 100 consecutive episodes, and over all 20 agents). In particular, after each episode, we add up the rewards that each agent received (without discounting) to get a score for each agent. This yields 20 (potentially different) scores. We then take the average of these 20 scores. 239 | This yields an average score for each episode (where the average is over all 20 agents). 240 | The environment is considered solved when the average (over 100 episodes) of those average scores is at least +30. 241 | 242 | The DDPG algorithm uses 4 separate neural networks: one to learn the policy, one to learn the action-value function, one for the target action-value function, and one to produce the target action used in the target action-value network. As discussed earlier, we use a separate target network instead of the local network so as to prevent instabilities in learning when the TD target depends on the local network. The weights of the target network are updated very slowly, moving about 0.1% towards the local network at every time step. Slowly changing the target network does slow down learning, but it helps with the learning algorithm's stability [2]. 243 | 244 | To speed up the learning process, since all 20 agents are gathering experience at the same time, the input data stream is quite large. To take advantage of this, I perform the network parameter update 4 times at each iteration. The reason this works without an importance sampling ratio is that both the actor's target policy and the critic's target value are deterministic. Additionally, the critic network's gradients are clipped to 1, which prevents the critic network from changing too fast. Moreover, I initialized the target networks with the same (random) weights as the networks being learned. Finally, instead of a Normal distribution, an Ornstein-Uhlenbeck noise process is used to generate the behavior policy for better exploration. All of these together allowed the agent to achieve the learning objective in just over 100 episodes. [Please refer to my github code for details.](https://github.com/gtg162y/DRLND/blob/master/P2_Continuous_Actions/Continuous_Control_UdacityWorkspace.ipynb) 245 | 246 | Below is the learning performance of the algorithm averaged over all 20 agents. 247 | 248 | ![Scores][image2] 249 | 250 | In terms of hyperparameters, the learning rate for both the actor and critic networks was 1e-4, and no regularization was used as the networks were fairly small. I gradually decayed the exploration probability to get the optimal policy. The Ornstein-Uhlenbeck noise process used a mean of 0 and a sigma of 0.2. The actor network was built using a three-layer neural network (with 256 neurons in the first layer, 128 neurons in the second layer, and 4 neurons in the final output layer). The critic network was also built using three layers and a similar number of neurons as the actor network, except that the actions were concatenated to the output of the first layer and the final layer had a single output neuron. For faster learning, the ELU non-linearity was used for both networks.
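Below is a minimal sketch of the actor and critic architectures just described. It is only an illustration: the layer sizes follow the description above, but details such as the tanh squashing on the actor output (to keep each action entry in [-1, 1]) and the exact layer names are assumptions rather than a copy of the notebook code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Maps a 33-dimensional state to a 4-dimensional action in [-1, 1]."""
    def __init__(self, state_size=33, action_size=4):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, state):
        x = F.elu(self.fc1(state))
        x = F.elu(self.fc2(x))
        return torch.tanh(self.fc3(x))  # assumed: squash to the [-1, 1] action range

class Critic(nn.Module):
    """Maps a (state, action) pair to a scalar Q-value."""
    def __init__(self, state_size=33, action_size=4):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256 + action_size, 128)  # actions are concatenated after the first layer
        self.fc3 = nn.Linear(128, 1)

    def forward(self, state, action):
        x = F.elu(self.fc1(state))
        x = F.elu(self.fc2(torch.cat([x, action], dim=1)))
        return self.fc3(x)
```

For example, `Critic()(torch.randn(2, 33), torch.randn(2, 4))` returns a `(2, 1)` tensor of Q-value estimates for a batch of two state-action pairs.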
251 | 252 | In terms of methods to further improve the agent's performance, a couple of things on my to-do list are: 1) train the agent using Prioritized Experience Replay, and 2) use the Proximal Policy Optimization (PPO) algorithm. 253 | 254 | **References:**
255 | 1. UC Berkeley CS294 Lectures (http://rail.eecs.berkeley.edu/deeprlcourse/)
256 | 2. DDPG paper (https://arxiv.org/pdf/1509.02971.pdf) 257 | 258 | 259 | [comment]: # (Equations generated using: https://stackoverflow.com/questions/11256433/how-to-show-math-equations-in-general-githubs-markdownnot-githubs-blog, 260 | https://www.codecogs.com/latex/eqneditor.php, 261 | http://mathurl.com/) 262 | -------------------------------------------------------------------------------- /P2_Continuous_Actions/checkpoint_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P2_Continuous_Actions/checkpoint_actor.pth -------------------------------------------------------------------------------- /P2_Continuous_Actions/checkpoint_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P2_Continuous_Actions/checkpoint_critic.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/Future_Improvements.md: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------- 2 | 3 | ## Ideas for Future Improvement: 4 | 1. Use parameter space noise rather than noise on action. https://vimeo.com/252185862 https://github.com/jvmancuso/ParamNoise 5 | 2. We can use prioritised experience buffer. https://github.com/Damcy/prioritized-experience-replay 6 | 3. Different replay buffer for actor/critic 7 | 4. Try adding dropouts in critic network 8 | 5. Turn off OU noise and use random noise 9 | 6. You should also try implementing some other algorithms like A3C and PPO. Following are some useful posts. 10 | [Asynchronous Actor-Critic Agents (A3C)](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2) 11 | 12 | [Trust Region Policy Optimization (TRPO) and Proximal Policy Optimization (PPO)](https://medium.com/@sanketgujar95/trust-region-policy-optimization-trpo-and-proximal-policy-optimization-ppo-e6e7075f39ed) 13 | -------------------------------------------------------------------------------- /P3_Collab_Compete/README.md: -------------------------------------------------------------------------------- 1 | [//]: # (Image References) 2 | 3 | [image1]: https://user-images.githubusercontent.com/10624937/42135623-e770e354-7d12-11e8-998d-29fc74429ca2.gif "Trained Agent" 4 | [image2]: https://user-images.githubusercontent.com/10624937/42135622-e55fb586-7d12-11e8-8a54-3c31da15a90a.gif "Soccer" 5 | 6 | Project Report: https://medium.com/@amitp-ai/maddpg-91caa221d75e 7 | 8 | 9 | # Project 3: Collaboration and Competition 10 | 11 | ### Introduction 12 | 13 | For this project, you will work with the [Tennis](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#tennis) environment. 14 | 15 | ![Trained Agent][image1] 16 | 17 | In this environment, two agents control rackets to bounce a ball over a net. If an agent hits the ball over the net, it receives a reward of +0.1. If an agent lets a ball hit the ground or hits the ball out of bounds, it receives a reward of -0.01. Thus, the goal of each agent is to keep the ball in play. 18 | 19 | The observation space consists of 8 variables corresponding to the position and velocity of the ball and racket. 
Each agent receives its own, local observation. Two continuous actions are available, corresponding to movement toward (or away from) the net, and jumping. 20 | 21 | The task is episodic, and in order to solve the environment, your agents must get an average score of +0.5 (over 100 consecutive episodes, after taking the maximum over both agents). Specifically, 22 | 23 | - After each episode, we add up the rewards that each agent received (without discounting), to get a score for each agent. This yields 2 (potentially different) scores. We then take the maximum of these 2 scores. 24 | - This yields a single **score** for each episode. 25 | 26 | The environment is considered solved, when the average (over 100 episodes) of those **scores** is at least +0.5. 27 | 28 | ### Getting Started 29 | 30 | 1. Download the environment from one of the links below. You need only select the environment that matches your operating system: 31 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Linux.zip) 32 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis.app.zip) 33 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Windows_x86.zip) 34 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Windows_x86_64.zip) 35 | 36 | (_For Windows users_) Check out [this link](https://support.microsoft.com/en-us/help/827218/how-to-determine-whether-a-computer-is-running-a-32-bit-version-or-64) if you need help with determining if your computer is running a 32-bit version or 64-bit version of the Windows operating system. 37 | 38 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Linux_NoVis.zip) to obtain the "headless" version of the environment. You will **not** be able to watch the agent without enabling a virtual screen, but you will be able to train the agent. (_To watch the agent, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 39 | 40 | 2. Place the file in the DRLND GitHub repository, in the `p3_collab-compet/` folder, and unzip (or decompress) the file. 41 | 42 | ### Instructions 43 | 44 | Follow the instructions in `Tennis.ipynb` to get started with training your own agent! 45 | 46 | ### (Optional) Challenge: Crawler Environment 47 | 48 | After you have successfully completed the project, you might like to solve the more difficult **Soccer** environment. 49 | 50 | ![Soccer][image2] 51 | 52 | In this environment, the goal is to train a team of agents to play soccer. 53 | 54 | You can read more about this environment in the ML-Agents GitHub [here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#soccer-twos). To solve this harder task, you'll need to download a new Unity environment. (**Note**: Udacity students should not submit a project with this new environment.) 
55 | 56 | You need only select the environment that matches your operating system: 57 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Linux.zip) 58 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer.app.zip) 59 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Windows_x86.zip) 60 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Windows_x86_64.zip) 61 | 62 | Then, place the file in the `p3_collab-compet/` folder in the DRLND GitHub repository, and unzip (or decompress) the file. Next, open `Soccer.ipynb` and follow the instructions to learn how to use the Python API to control the agent. 63 | 64 | (_For AWS_) If you'd like to train the agents on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Linux_NoVis.zip) to obtain the "headless" version of the environment. You will **not** be able to watch the agents without enabling a virtual screen, but you will be able to train the agents. (_To watch the agents, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 65 | -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_actor_local_0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_actor_local_0.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_actor_local_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_actor_local_1.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_critic_local_0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_critic_local_0.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_critic_local_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_critic_local_1.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/workspace_utils.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | from contextlib import contextmanager 4 | 5 | import requests 6 | 7 | 8 | DELAY = INTERVAL = 4 * 60 # interval time in seconds 9 | MIN_DELAY = MIN_INTERVAL = 2 * 60 10 | KEEPALIVE_URL = "https://nebula.udacity.com/api/v1/remote/keep-alive" 11 | TOKEN_URL = 
"http://metadata.google.internal/computeMetadata/v1/instance/attributes/keep_alive_token" 12 | TOKEN_HEADERS = {"Metadata-Flavor":"Google"} 13 | 14 | 15 | def _request_handler(headers): 16 | def _handler(signum, frame): 17 | requests.request("POST", KEEPALIVE_URL, headers=headers) 18 | return _handler 19 | 20 | 21 | @contextmanager 22 | def active_session(delay=DELAY, interval=INTERVAL): 23 | """ 24 | Example: 25 | 26 | from workspace_utils import active session 27 | 28 | with active_session(): 29 | # do long-running work here 30 | """ 31 | token = requests.request("GET", TOKEN_URL, headers=TOKEN_HEADERS).text 32 | headers = {'Authorization': "STAR " + token} 33 | delay = max(delay, MIN_DELAY) 34 | interval = max(interval, MIN_INTERVAL) 35 | original_handler = signal.getsignal(signal.SIGALRM) 36 | try: 37 | signal.signal(signal.SIGALRM, _request_handler(headers)) 38 | signal.setitimer(signal.ITIMER_REAL, delay, interval) 39 | yield 40 | finally: 41 | signal.signal(signal.SIGALRM, original_handler) 42 | signal.setitimer(signal.ITIMER_REAL, 0) 43 | 44 | 45 | def keep_awake(iterable, delay=DELAY, interval=INTERVAL): 46 | """ 47 | Example: 48 | 49 | from workspace_utils import keep_awake 50 | 51 | for i in keep_awake(range(5)): 52 | # do iteration with lots of work here 53 | """ 54 | with active_session(delay, interval): yield from iterable 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning (Fall 2018) 2 | 3 | **Project 1: Value Based RL Methods Including Deep Q-Network (DQN) and Double Deep Q-Network (DDQN)
4 | https://medium.com/@amitp-ai/double-dqn-48562b5f31c1** 5 | 6 | 7 | **Project 2: Policy Based RL Methods Including Advantage Actor-Critic (A2C) and Deep Deterministic Policy Gradient (DDPG)
8 | https://medium.com/@amitp-ai/policy-gradients-1edbbbc8de6b** 9 | 10 | 11 | **Project 3: Multi-Agent RL Methods Such as Multi-Agent DDPG (MADDPG)
12 | https://medium.com/@amitp-ai/maddpg-91caa221d75e** 13 | --------------------------------------------------------------------------------