├── Alpha_Zero
│   ├── MCTS_Basics.py
│   └── alpha_go_zero
│       ├── 6-6-4-pie-0.mypolicy
│       ├── 6-6-4-pie.policy
│       ├── ConnectN.py
│       ├── MCTS.py
│       ├── Play.py
│       ├── alphazero-TicTacToe-advanced.ipynb
│       ├── alphazero-TicTacToe.ipynb
│       └── playground.py
├── P1_Navigation
│   ├── Future_Improvements.md
│   ├── Navigation_Final.ipynb
│   ├── Readme.md
│   ├── ddqn_checkpoint.pth
│   ├── dqn_agent.py
│   ├── model.py
│   └── visual_pixels
│       ├── Navigation_Pixels.ipynb
│       ├── pixel_dqn_agent.py
│       └── pixel_model.py
├── P2_Continuous_Actions
│   ├── Continuous_Control_UdacityWorkspace.ipynb
│   ├── README.md
│   ├── Report.md
│   ├── checkpoint_actor.pth
│   └── checkpoint_critic.pth
├── P3_Collab_Compete
│   ├── Future_Improvements.md
│   ├── README.md
│   ├── Tennis_Udacity_Workspace.ipynb
│   ├── checkpoint_actor_local_0.pth
│   ├── checkpoint_actor_local_1.pth
│   ├── checkpoint_critic_local_0.pth
│   ├── checkpoint_critic_local_1.pth
│   └── workspace_utils.py
└── README.md

/Alpha_Zero/MCTS_Basics.py:
--------------------------------------------------------------------------------
1 | ################################# MCTS #######################################
2 | 
3 | # Version 1 (mcts.ai)
4 | 
5 | from math import *
6 | import random
7 | 
8 | # This is a very simple implementation of the UCT Monte Carlo Tree Search algorithm in Python 2.7.
9 | # The function UCT(rootstate, itermax, verbose = False) is towards the bottom of the code.
10 | # It aims to have the clearest and simplest possible code, and for the sake of clarity, the code
11 | # is orders of magnitude less efficient than it could be made, particularly by using a
12 | # state.GetRandomMove() or state.DoRandomRollout() function.
13 | #
14 | # Example GameState classes for Nim, OXO and Othello are included to give some idea of how you
15 | # can write your own GameState and use UCT in your 2-player game. Change the game to be played in
16 | # the UCTPlayGame() function at the bottom of the code.
17 | #
18 | # Written by Peter Cowling, Ed Powley, Daniel Whitehouse (University of York, UK) September 2012.
19 | #
20 | # Licence is granted to freely use and distribute for any sensible/legal purpose so long as this comment
21 | # remains in any distributed code.
22 | #
23 | # For more information about Monte Carlo Tree Search check out our web site at www.mcts.ai
24 | 
25 | 
26 | class OXOState(object):
27 |     """ A state of the game, i.e. the game board. These are the only functions which are
28 |         absolutely necessary to implement UCT in any 2-player complete information deterministic
29 |         zero-sum game, although they can be enhanced and made quicker, for example by using a
30 |         GetRandomMove() function to generate a random move during rollout.
31 |         By convention the players are numbered 1 and 2.
32 |     """
33 |     """ A state of the game, i.e. the game board.
34 |         Squares in the board are in this arrangement
35 |         012
36 |         345
37 |         678
38 |         where 0 = empty, 1 = player 1 (X), 2 = player 2 (O)
39 |     """
40 |     def __init__(self):
41 |         self.playerJustMoved = 2 # At the root pretend the player just moved is p2 - p1 has the first move
42 |         self.board = [0,0,0,0,0,0,0,0,0] # 0 = empty, 1 = player 1, 2 = player 2
43 | 
44 |     def Clone(self):
45 |         """ Create a deep clone of this game state.
46 |         """
47 |         st = OXOState()
48 |         st.playerJustMoved = self.playerJustMoved
49 |         st.board = self.board[:]
50 |         return st
51 | 
52 |     def DoMove(self, move):
53 |         """ Update a state by carrying out the given move.
54 |             Must update playerJustMoved.
55 |         """
56 |         assert move >= 0 and move <= 8 and move == int(move) and self.board[move] == 0
57 |         self.playerJustMoved = 3 - self.playerJustMoved
58 |         self.board[move] = self.playerJustMoved
59 | 
60 |     def GetMoves(self):
61 |         """ Get all possible moves from this state.
62 |         """
63 |         return [i for i in range(9) if self.board[i] == 0] #empty spots are initialized to 0 (which is printed as '.' by design, see __repr__ function)
64 | 
65 |     def GetResult(self, playerjm):
66 |         """ Get the game result from the viewpoint of playerjm.
67 |         """
68 |         for (x,y,z) in [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]:
69 |             if self.board[x] == self.board[y] == self.board[z]:
70 |                 if self.board[x] == playerjm:
71 |                     return 1.0
72 |                 elif self.board[x] == 3-playerjm:
73 |                     return 0.0
74 |         if self.GetMoves() == []: return 0.5 # draw
75 |         return None #if it comes here then the game is still undecided
76 | 
77 |     def __repr__(self):
78 |         s= ""
79 |         for i in range(9):
80 |             s += ".XO"[self.board[i]]
81 |             if i % 3 == 2: s += "\n"
82 |         return s
83 | 
84 | 
85 | class Node(object):
86 |     """ A node in the game tree. Note wins is always from the viewpoint of playerJustMoved.
87 |         Crashes if state not specified.
88 |     """
89 |     def __init__(self, move = None, parent = None, state = None):
90 |         self.move = move # the move that got us to this node - "None" for the root node
91 |         self.parentNode = parent # "None" for the root node
92 |         self.childNodes = [] #all the explored children
93 |         self.wins = 0
94 |         self.visits = 0
95 |         self.untriedMoves = state.GetMoves() # future child nodes
96 |         self.playerJustMoved = state.playerJustMoved # the only part of the state that the Node needs later
97 | 
98 |     def UCTSelectChild(self):
99 |         """ Use the UCB1 formula to select a child node. Often a constant UCTK is applied so we have
100 |         lambda c: c.wins/c.visits + UCTK * sqrt(2*log(self.visits)/c.visits) to vary the amount of
101 |         exploration versus exploitation.
102 |         """
103 |         s = sorted(self.childNodes, key = lambda c: c.wins/c.visits + sqrt(2*log(self.visits)/c.visits))[-1]
104 |         return s
105 | 
106 |     def AddChild(self, m, s):
107 |         """ Remove m from untriedMoves and add a new child node for this move.
108 |             Return the added child node
109 |         """
110 |         n = Node(move = m, parent = self, state = s)
111 |         self.untriedMoves.remove(m)
112 |         self.childNodes.append(n)
113 |         return n
114 | 
115 |     def Update(self, result):
116 |         """ Update this node - one additional visit and result additional wins. result must be from the viewpoint of playerJustMoved.
117 |         """
118 |         self.visits += 1
119 |         self.wins += result
120 | 
121 |     def __repr__(self):
122 |         return "[M:" + str(self.move) + " W/V:" + str(self.wins) + "/" + str(self.visits) + " U:" + str(self.untriedMoves) + "]"
123 | 
124 |     def TreeToString(self, indent):
125 |         s = self.IndentString(indent) + str(self)
126 |         for c in self.childNodes:
127 |             s += c.TreeToString(indent+1)
128 |         return s
129 | 
130 |     def IndentString(self,indent):
131 |         s = "\n"
132 |         for i in range (1,indent+1):
133 |             s += "| "
134 |         return s
135 | 
136 |     def ChildrenToString(self):
137 |         s = ""
138 |         for c in self.childNodes:
139 |             s += str(c) + "\n"
140 |         return s
141 | 
142 | def UCT(rootstate, itermax, verbose = False):
143 |     """ Conduct a UCT search for itermax iterations starting from rootstate.
144 |         Return the best move from the rootstate.
145 |         Assumes 2 alternating players (player 1 starts), with game results in the range [0.0, 1.0]."""
146 |     #Note: node.childNodes includes all the explored children. Unexplored children are not in it.
147 | 
148 |     if rootstate.GetResult(1) is not None: #it should never come here as the UCT function will not be called if it is
149 |         raise ValueError("Game Has Ended!")
150 | 
151 | 
152 |     rootnode = Node(state = rootstate)
153 |     #node is just a pointer to rootnode. So as node is changed, rootnode is changed too. (before node is assigned to a different object)
154 |     #this is actually how the search tree is built during each iteration.
155 | 
156 |     for i in range(itermax):
157 |         node = rootnode #node is just a pointer to rootnode. So as node is changed, rootnode is changed too. (before node is assigned to a different object)
158 |         state = rootstate.Clone() #state is updated in place when executing the state.DoMove() method below
159 | 
160 |         #print('at beginning', node, '\t', rootnode)
161 |         # Select
162 |         while node.untriedMoves == [] and node.childNodes != [] and state.GetResult(1) is None: # node is fully expanded and non-terminal and game has not yet ended
163 |             node = node.UCTSelectChild() #only updates node and not rootnode
164 |             state.DoMove(node.move) #this updates the variable state in place when executing the state.DoMove() method
165 |         #print('after select', node, '\t', rootnode)
166 | 
167 |         # Expand
168 |         if node.untriedMoves != [] and node.childNodes == [] and state.GetResult(1) is None: # if we can expand (i.e. state/node is non-terminal) and game has not yet ended
169 |             m = random.choice(node.untriedMoves) #randomly pick an unexplored child
170 |             state.DoMove(m) #this updates the variable state in place when executing the move
171 |             node = node.AddChild(m,state) # add child and descend tree. node.AddChild() updates node in place. It changes rootnode only if the 'select' step was not executed
172 |             #node.AddChild() also returns a different node which is assigned to variable 'node.' Thereafter node is no longer pointing to rootnode. They are different.
173 |             #print('Node children: \n'.format(node.ChildrenToString()))
174 |         #print('after expand', node, '\t', rootnode)
175 | 
176 |         # Rollout - this can often be made orders of magnitude quicker using a state.GetRandomMove() function
177 |         # Rollout starts from the selected/expanded node (as state is updated after state.DoMove() in select and expand sections)
178 |         while state.GetMoves() != [] and state.GetResult(1) is None: # while state is non-terminal and game has not yet ended
179 |             m = random.choice(state.GetMoves())
180 |             state.DoMove(m) #this updates the variable state in place when executing the move
181 |         #rollout does not change node or rootnode
182 |         #print('after rollout', node, '\t', rootnode)
183 | 
184 |         # Backpropagate
185 |         while node != None: # backpropagate from the expanded node and work back to the root node
186 |             node.Update(state.GetResult(node.playerJustMoved)) # state is terminal. Update node with result from POV of node.playerJustMoved
187 |             node = node.parentNode
188 |         #print('after backprop', node, '\t', rootnode)
189 |         #after backpropagation, node points to None
190 | 
191 |     # Output some information about the tree - can be omitted
192 |     if (verbose): print(rootnode.TreeToString(0))
193 |     else: print(rootnode.ChildrenToString())
194 | 
195 |     return sorted(rootnode.childNodes, key = lambda c: c.visits)[-1].move # return the move that was most visited
196 | 
197 | 
198 | 
199 | def UCTPlayGame():
200 |     """ Play a sample game between two UCT players where each player gets a different number
201 |         of UCT iterations (= simulations = tree nodes).
202 | """ 203 | state = OXOState() # uncomment to play OXO 204 | while (state.GetMoves() != [] and state.GetResult(1) is None): #doesn't matter for which player 1 or 2 205 | print(str(state)) 206 | if state.playerJustMoved == 1: 207 | print('Next Player is 2') 208 | m = UCT(rootstate = state, itermax = 1000, verbose = False) # play with values for itermax and verbose = True 209 | #m = random.choice(state.GetMoves()) 210 | else: 211 | print('Next Player is 1') 212 | m = UCT(rootstate = state, itermax = 100, verbose = False) 213 | #m = random.choice(state.GetMoves()) 214 | print("Best Move: " + str(m) + "\n") 215 | state.DoMove(m) 216 | if state.GetResult(state.playerJustMoved) == 1.0: 217 | print("Player " + str(state.playerJustMoved) + " wins!") 218 | elif state.GetResult(state.playerJustMoved) == 0.0: 219 | print("Player " + str(3 - state.playerJustMoved) + " wins!") 220 | else: print("Nobody wins!") 221 | 222 | 223 | 224 | if __name__ == "__main__": 225 | """ Play a single game to the end using UCT for both players. 226 | """ 227 | UCTPlayGame() 228 | 229 | 230 | # Version 2 231 | ''' 232 | # Pseudo Code 233 | def monte_carlo_tree_search(root): 234 | while resources_left(time, computational power): 235 | leaf = traverse(root) # leaf = unvisited node 236 | simulation_result = rollout(leaf) 237 | backpropagate(leaf, simulation_result) 238 | return best_child(root) 239 | 240 | def traverse(node): 241 | while fully_expanded(node): 242 | node = best_uct(node) 243 | return pick_univisted(node.children) or node # in case no children are present / node is terminal 244 | 245 | def rollout(node): 246 | while non_terminal(node): 247 | node = rollout_policy(node) 248 | return result(node) 249 | 250 | def rollout_policy(node): 251 | return pick_random(node.children) 252 | 253 | def backpropagate(node, result): 254 | if is_root(node) return 255 | node.stats = update_stats(node, result) 256 | backpropagate(node.parent) 257 | 258 | def best_child(node): 259 | pick child with highest number of visits 260 | ''' 261 | 262 | ''' 263 | import numpy as np 264 | from collections import defaultdict 265 | from games.tictactoe import * 266 | from games.common import TwoPlayersGameState 267 | 268 | class MonteCarloTreeSearchNode: 269 | 270 | def __init__(self, state: TwoPlayersGameState, parent = None): 271 | self.state = state 272 | self.parent = parent 273 | self.children = [] 274 | 275 | @property 276 | def untried_actions(self): 277 | raise NotImplemented() 278 | 279 | @property 280 | def q(self): 281 | raise NotImplemented() 282 | 283 | @property 284 | def n(self): 285 | raise NotImplemented() 286 | 287 | def expand(self): 288 | raise NotImplemented() 289 | 290 | def is_terminal_node(self): 291 | raise NotImplemented() 292 | 293 | def rollout(self): 294 | raise NotImplemented() 295 | 296 | def backpropagate(self, reward): 297 | raise NotImplemented() 298 | 299 | 300 | def is_fully_expanded(self): 301 | return len(self.untried_actions) == 0 302 | 303 | def best_child(self, c_param = 1.4): 304 | choices_weights = [ 305 | (c.q / (c.n)) + c_param * np.sqrt((2 * np.log(self.n) / (c.n))) 306 | for c in self.children 307 | ] 308 | return self.children[np.argmax(choices_weights)] 309 | 310 | def rollout_policy(self, possible_moves): 311 | return possible_moves[np.random.randint(len(possible_moves))] 312 | 313 | class TwoPlayersGameMonteCarloTreeSearchNode(MonteCarloTreeSearchNode): 314 | 315 | def __init__(self, state: TwoPlayersGameState, parent): 316 | super(TwoPlayersGameMonteCarloTreeSearchNode, 
self).__init__(state, parent) 317 | self._number_of_visits = 0. 318 | self._results = defaultdict(int) 319 | 320 | @property 321 | def untried_actions(self): 322 | if not hasattr(self, '_untried_actions'): 323 | self._untried_actions = self.state.get_legal_actions() 324 | return self._untried_actions 325 | 326 | @property 327 | def q(self): 328 | wins = self._results[self.parent.state.next_to_move] 329 | loses = self._results[-1 * self.parent.state.next_to_move] 330 | return wins - loses 331 | 332 | @property 333 | def n(self): 334 | return self._number_of_visits 335 | 336 | def expand(self): 337 | action = self.untried_actions.pop() 338 | next_state = self.state.move(action) 339 | child_node = TwoPlayersGameMonteCarloTreeSearchNode(next_state, parent = self) 340 | self.children.append(child_node) 341 | return child_node 342 | 343 | def is_terminal_node(self): 344 | return self.state.is_game_over() 345 | 346 | def rollout(self): 347 | current_rollout_state = self.state 348 | while not current_rollout_state.is_game_over(): 349 | possible_moves = current_rollout_state.get_legal_actions() 350 | action = self.rollout_policy(possible_moves) 351 | current_rollout_state = current_rollout_state.move(action) 352 | return current_rollout_state.game_result 353 | 354 | def backpropagate(self, result): 355 | self._number_of_visits += 1. 356 | self._results[result] += 1. 357 | if self.parent: 358 | self.parent.backpropagate(result) 359 | 360 | class MonteCarloTreeSearch: 361 | 362 | def __init__(self, node: MonteCarloTreeSearchNode): 363 | self.root = node 364 | 365 | 366 | def best_action(self, simulations_number): 367 | for _ in range(0, simulations_number): 368 | v = self.tree_policy() 369 | reward = v.rollout() 370 | v.backpropagate(reward) 371 | # exploitation only 372 | return self.root.best_child(c_param = 0.) 
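# A minimal usage sketch for the best_action method above (kept as comments, since this
# whole "Version 2" block is quoted out). It assumes `initial_state` is a hypothetical
# TwoPlayersGameState-style object providing get_legal_actions(), move(), is_game_over()
# and game_result, as referenced by the node class above:
#
#   root = TwoPlayersGameMonteCarloTreeSearchNode(state=initial_state, parent=None)
#   mcts = MonteCarloTreeSearch(root)
#   best_node = mcts.best_action(simulations_number=1000)  # child of root chosen by exploitation-only best_child
#   next_state = best_node.state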
373 | 374 | 375 | def tree_policy(self): 376 | current_node = self.root 377 | while not current_node.is_terminal_node(): 378 | if not current_node.is_fully_expanded(): 379 | return current_node.expand() 380 | else: 381 | current_node = current_node.best_child() 382 | return current_node 383 | 384 | 385 | ''' 386 | -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/6-6-4-pie-0.mypolicy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/Alpha_Zero/alpha_go_zero/6-6-4-pie-0.mypolicy -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/6-6-4-pie.policy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/Alpha_Zero/alpha_go_zero/6-6-4-pie.policy -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/ConnectN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import matplotlib.animation as animation 7 | from copy import copy 8 | 9 | 10 | # output the index of when v has a continuous string of i 11 | # get_runs([0,0,1,1,1,0,0],1) gives [2],[5],[3] 12 | def get_runs(v, i): 13 | bounded = np.hstack(([0], (v==i).astype(int), [0])) 14 | difs = np.diff(bounded) 15 | starts, = np.where(difs > 0) 16 | ends, = np.where(difs < 0) 17 | return starts, ends, ends-starts 18 | 19 | # see if vector contains N of certain number in a row 20 | def in_a_row(v, N, i): 21 | if len(v) < N: 22 | return False 23 | else: 24 | _, _, total = get_runs(v,i) 25 | return np.any(total >= N) 26 | 27 | 28 | 29 | def get_lines(matrix, loc): 30 | 31 | i,j=loc 32 | flat = matrix.reshape(-1,*matrix.shape[2:]) 33 | 34 | w = matrix.shape[0] 35 | h = matrix.shape[1] 36 | def flat_pos(pos): 37 | return pos[0]*h+pos[1] 38 | 39 | pos = flat_pos((i,j)) 40 | 41 | # index for flipping matrix across different axis 42 | ic = w-1-i 43 | jc = h-1-j 44 | 45 | # top left 46 | tl = (i-j,0) if i>j else (0, j-i) 47 | tl = flat_pos(tl) 48 | 49 | # bottom left 50 | bl = (w-1-(ic-j),0) if ic>j else (w-1, j-ic) 51 | bl = flat_pos(bl) 52 | 53 | # top right 54 | tr = (i-jc,h-1) if i>jc else (0, h-1-(jc-i)) 55 | tr = flat_pos(tr) 56 | 57 | # bottom right 58 | br = (w-1-(ic-jc),h-1) if ic>jc else (w-1, h-1-(jc-ic)) 59 | br = flat_pos(br) 60 | 61 | hor = matrix[:,j] 62 | ver = matrix[i,:] 63 | diag_right = np.concatenate([flat[tl:pos:h+1],flat[pos:br+1:h+1]]) 64 | diag_left = np.concatenate([flat[tr:pos:h-1],flat[pos:bl+1:h-1]]) 65 | 66 | return hor, ver, diag_right, diag_left 67 | 68 | 69 | 70 | 71 | 72 | 73 | class ConnectN: 74 | 75 | def __init__(self, size, N, pie_rule=False): 76 | self.size = size 77 | self.w, self.h = size 78 | self.N = N 79 | 80 | # make sure game is well defined 81 | if self.w<0 or self.h<0 or self.N<2 or \ 82 | (self.N > self.w and self.N > self.h): 83 | raise ValueError('Game cannot initialize with a {0:d}x{1:d} grid, and winning condition {2:d} in a row'.format(self.w, self.h, self.N)) 84 | 85 | 86 | self.score = None 87 | self.state=np.zeros(size, dtype=np.float) 88 | self.player=1 89 | self.last_move=None 90 | 
self.n_moves=0 91 | self.pie_rule=pie_rule 92 | self.switched_side=False 93 | 94 | # fast deepcopy 95 | def __copy__(self): 96 | cls = self.__class__ 97 | new_game = cls.__new__(cls) 98 | new_game.__dict__.update(self.__dict__) 99 | 100 | new_game.N = self.N 101 | new_game.pie_rule = self.pie_rule 102 | new_game.state = self.state.copy() 103 | new_game.switched_side = self.switched_side 104 | new_game.n_moves = self.n_moves 105 | new_game.last_move = self.last_move 106 | new_game.player = self.player 107 | new_game.score = self.score 108 | return new_game 109 | 110 | # check victory condition 111 | # fast version 112 | def get_score(self): 113 | 114 | # game cannot end beca 115 | if self.n_moves<2*self.N-1: 116 | return None 117 | 118 | i,j = self.last_move 119 | hor, ver, diag_right, diag_left = get_lines(self.state, (i,j)) 120 | 121 | # loop over each possibility 122 | for line in [ver, hor, diag_right, diag_left]: 123 | if in_a_row(line, self.N, self.player): 124 | return self.player 125 | 126 | # no more moves 127 | if np.all(self.state!=0): 128 | return 0 129 | 130 | return None 131 | 132 | # for rendering 133 | # output a list of location for the winning line 134 | def get_winning_loc(self): 135 | 136 | if self.n_moves<2*self.N-1: 137 | return [] 138 | 139 | 140 | loc = self.last_move 141 | hor, ver, diag_right, diag_left = get_lines(self.state, loc) 142 | ind = np.indices(self.state.shape) 143 | ind = np.moveaxis(ind, 0, -1) 144 | hor_ind, ver_ind, diag_right_ind, diag_left_ind = get_lines(ind, loc) 145 | # loop over each possibility 146 | 147 | pieces = [hor, ver, diag_right, diag_left] 148 | indices = [hor_ind, ver_ind, diag_right_ind, diag_left_ind] 149 | 150 | #winning_loc = np.full(self.state.shape, False, dtype=bool) 151 | 152 | for line, index in zip(pieces, indices): 153 | starts, ends, runs = get_runs(line, self.player) 154 | 155 | # get the start and end location 156 | winning = (runs >= self.N) 157 | print(winning) 158 | if not np.any(winning): 159 | continue 160 | 161 | starts_ind = starts[winning][0] 162 | ends_ind = ends[winning][0] 163 | indices = index[starts_ind:ends_ind] 164 | #winning_loc[indices[:,0], indices[:,1]] = True 165 | return indices 166 | 167 | return [] 168 | 169 | 170 | def move(self, loc): 171 | i,j=loc 172 | success = False 173 | if self.w>i>=0 and self.h>j>=0: 174 | if self.state[i,j]==0: 175 | 176 | # make a move 177 | self.state[i,j]=self.player 178 | 179 | # if pie rule is enabled 180 | if self.pie_rule: 181 | if self.n_moves==1: 182 | self.state[tuple(self.last_move)]=-self.player 183 | self.switched_side=False 184 | 185 | elif self.n_moves==0: 186 | # pie rule, make first move 0.5 187 | # this is to let the neural net know 188 | self.state[i,j]=self.player/2.0 189 | self.switched_side=False 190 | 191 | success = True 192 | 193 | # switching side 194 | elif self.pie_rule and self.state[i,j] == -self.player/2.0: 195 | 196 | # make a move 197 | self.state[i,j]=self.player 198 | self.switched_side=True 199 | 200 | success = True 201 | 202 | 203 | 204 | 205 | if success: 206 | self.n_moves += 1 207 | self.last_move = tuple((i,j)) 208 | self.score = self.get_score() 209 | 210 | # if game is not over, switch player 211 | if self.score is None: 212 | self.player *= -1 213 | 214 | return True 215 | 216 | return False 217 | 218 | 219 | def available_moves(self): 220 | indices = np.moveaxis(np.indices(self.state.shape), 0, -1) 221 | return indices[np.abs(self.state) != 1] 222 | 223 | def available_mask(self): 224 | return (np.abs(self.state) != 
1).astype(np.uint8) 225 | -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/MCTS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import matplotlib.animation as animation 7 | from copy import copy 8 | from math import * 9 | import random 10 | 11 | c=1.0 12 | 13 | # transformations 14 | t0= lambda x: x 15 | t1= lambda x: x[:,::-1].copy() 16 | t2= lambda x: x[::-1,:].copy() 17 | t3= lambda x: x[::-1,::-1].copy() 18 | t4= lambda x: x.T 19 | t5= lambda x: x[:,::-1].T.copy() 20 | t6= lambda x: x[::-1,:].T.copy() 21 | t7= lambda x: x[::-1,::-1].T.copy() 22 | 23 | tlist=[t0, t1,t2,t3,t4,t5,t6,t7] 24 | tlist_half=[t0,t1,t2,t3] 25 | 26 | def flip(x, dim): 27 | indices = [slice(None)] * x.dim() 28 | indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, 29 | dtype=torch.long, device=x.device) 30 | return x[tuple(indices)] 31 | 32 | 33 | t0inv= lambda x: x 34 | t1inv= lambda x: flip(x,1) 35 | t2inv= lambda x: flip(x,0) 36 | t3inv= lambda x: flip(flip(x,0),1) 37 | t4inv= lambda x: x.t() 38 | t5inv= lambda x: flip(x,0).t() 39 | t6inv= lambda x: flip(x,1).t() 40 | t7inv= lambda x: flip(flip(x,0),1).t() 41 | 42 | tinvlist = [t0inv, t1inv, t2inv, t3inv, t4inv, t5inv, t6inv, t7inv] 43 | tinvlist_half=[t0inv, t1inv, t2inv, t3inv] 44 | 45 | transformation_list = list(zip(tlist, tinvlist)) 46 | transformation_list_half = list(zip(tlist_half, tinvlist_half)) 47 | 48 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 49 | device ='cpu' 50 | 51 | def process_policy(policy, game): 52 | 53 | # for square board, add rotations as well 54 | if game.size[0]==game.size[1]: 55 | t, tinv = random.choice(transformation_list) 56 | 57 | # otherwise only add reflections 58 | else: 59 | t, tinv = random.choice(transformation_list_half) 60 | 61 | frame=torch.tensor(t(game.state*game.player), dtype=torch.float, device=device) 62 | input=frame.unsqueeze(0).unsqueeze(0) 63 | prob, v = policy(input) 64 | mask = torch.tensor(game.available_mask()) 65 | 66 | # we add a negative sign because when deciding next move, 67 | # the current player is the previous player making the move 68 | return game.available_moves(), tinv(prob)[mask].view(-1), v.squeeze().squeeze() 69 | 70 | class Node: 71 | def __init__(self, game, mother=None, prob=torch.tensor(0., dtype=torch.float)): 72 | self.game = game 73 | 74 | # child nodes 75 | self.child = {} 76 | # numbers for determining which actions to take next 77 | self.U = 0 78 | 79 | # V from neural net output 80 | # it's a torch.tensor object 81 | # has require_grad enabled 82 | self.prob = prob 83 | # the predicted expectation from neural net 84 | self.nn_v = torch.tensor(0., dtype=torch.float) 85 | 86 | # visit count 87 | self.N = 0 88 | 89 | # expected V from MCTS 90 | self.V = 0 91 | 92 | # keeps track of the guaranteed outcome 93 | # initialized to None 94 | # this is for speeding the tree-search up 95 | # but stopping exploration when the outcome is certain 96 | # and there is a known perfect play 97 | self.outcome = self.game.score 98 | 99 | 100 | # if game is won/loss/draw 101 | if self.game.score is not None: 102 | self.V = self.game.score*self.game.player 103 | self.U = 0 if self.game.score is 0 else self.V*float('inf') #this speeds up the training 104 | 105 | # link to previous node 106 | self.mother = mother 107 | 108 | def 
create_child(self, actions, probs): 109 | # create a dictionary of children 110 | games = [ copy(self.game) for a in actions ] 111 | 112 | for action, game in zip(actions, games): 113 | game.move(action) 114 | 115 | child = { tuple(a):Node(g, self, p) for a,g,p in zip(actions, games, probs) } 116 | self.child = child 117 | 118 | def explore(self, policy): 119 | #Utilizes the 4 steps of MCTS, but guided by the network policy and value function for much better search efficiency. 120 | #B'cse it doesn't do MC rollout/simulation, it is more like TDTS and not MCTS. 121 | #See the comments in the 'simulation' section below for details. 122 | 123 | if self.game.score is not None: 124 | raise ValueError("game has ended with score {0:d}".format(self.game.score)) 125 | 126 | current = self 127 | #any modifications made to current while it is pointing to self will also modify self i.e. rootnode 128 | #This is actually how the search tree is build over each iteration of explore 129 | 130 | # 1. Selection step 131 | # explore children of the node 132 | # to speed things up 133 | sel_depth = 0 134 | while current.child and current.outcome is None: 135 | 136 | sel_depth += 1 137 | 138 | child = current.child 139 | max_U = max(c.U for c in child.values()) 140 | #print("current max_U ", max_U) 141 | actions = [ a for a,c in child.items() if c.U == max_U ] 142 | if len(actions) == 0: 143 | print("error zero length ", max_U) 144 | print(current.game.state) 145 | 146 | action = random.choice(actions) 147 | 148 | #this helps speed up the training. Whenever we see a winning move, we don't have to explore the other moves. 149 | if max_U == -float("inf"): #for next player 150 | #current outcome=-current.game.player 151 | current.U = float("inf") #switch to current player with -ve 152 | current.V = 1.0 153 | break 154 | 155 | elif max_U == float("inf"): #for next player 156 | #current outcome=current.game.player 157 | current.U = -float("inf") #switch to current player with -ve 158 | current.V = -1.0 159 | break 160 | 161 | current = child[action] 162 | 163 | # 2. Expansion step 164 | # if node hasn't been expanded 165 | if not current.child and current.outcome is None: 166 | # policy outputs results from the perspective of the next player 167 | # thus extra - sign is needed for the current player's perspective. 168 | next_actions, probs, v = process_policy(policy, current.game) 169 | current.nn_v = -v 170 | current.create_child(next_actions, probs) #will create all the children at once and not just one child 171 | current.V = -float(v) 172 | 173 | 174 | current.N += 1 175 | 176 | #3. Simulation /roll out step: not doing as its very expensive to evaluate the policy network for this 177 | # you can think of it as just a single step roll out since expansion is like the first step in roll out. It's sort of like TD estimate rather than MC estimate of the game score 178 | # without full MC rollout, this is really like a single step TD rollout. It is really TDTS instead of MCTS. 179 | 180 | #4. Backpropagation step 181 | # now update U and back-prop 182 | while current.mother: 183 | mother = current.mother 184 | mother.N += 1 185 | # between mother and child, the player is switched, extra - sign 186 | #mother.V += (-current.V - mother.V)/mother.N #original version but I think it is wrong (this is like TD update) (running average) 187 | mother.V += (-current.V - (mother.N-1)*mother.V)/mother.N #my modified version (this is like TD update) (running average) 188 | # Note: nn_v is not backpropagated. 
It is only updated when the state is being expanded (step 2) using the policy network. 189 | 190 | #update U for all sibling nodes 191 | for sibling in mother.child.values(): 192 | if sibling.U is not float("inf") and sibling.U is not -float("inf"): 193 | sibling.U = sibling.V + c*float(sibling.prob)* sqrt(mother.N)/(1+sibling.N) 194 | 195 | current = current.mother 196 | 197 | #return sel_depth, debug_find_max_tree_depth(current) #for debug only (depth during selection vs tree depth at the end of explore) 198 | 199 | 200 | def next(self, temperature=1.0): 201 | 202 | if self.game.score is not None: 203 | raise ValueError('game has ended with score {0:d}'.format(self.game.score)) 204 | 205 | if not self.child: 206 | print(self.game.state) 207 | raise ValueError('no children found and game hasn\'t ended') 208 | 209 | child=self.child 210 | 211 | 212 | max_U = max(c.U for c in child.values()) 213 | 214 | if max_U == float("inf"): # if there are winning moves, just output those 215 | prob = torch.tensor([ 1.0 if c.U == float("inf") else 0 for c in child.values()], device=device) 216 | 217 | else: 218 | # divide things by maxN for numerical stability 219 | maxN = max(node.N for node in child.values())+1 220 | prob = torch.tensor([ (node.N/maxN)**(1/temperature) for node in child.values() ], device=device) 221 | 222 | # normalize the probability 223 | if torch.sum(prob) > 0: 224 | prob /= torch.sum(prob) 225 | 226 | # if sum is zero, just make things random 227 | else: 228 | prob = torch.tensor(1.0/len(child), device=device).repeat(len(child)) 229 | 230 | nn_prob = torch.stack([ node.prob for node in child.values() ]).to(device) 231 | 232 | nextstate = random.choices(list(child.values()), weights=prob)[0] 233 | 234 | # V was for the current player making a move 235 | # to convert to the next player we add - sign 236 | return nextstate, (-self.V, -self.nn_v, prob, nn_prob) 237 | 238 | def detach_mother(self): 239 | del self.mother 240 | self.mother = None 241 | 242 | 243 | def debug_find_max_tree_depth(current): 244 | children = current.child 245 | if children is not {}: 246 | max_len = -1 247 | for a_,c_ in children.items(): 248 | c_len = 1+debug_find_max_tree_depth(c_) 249 | if max_len < c_len: max_len = c_len 250 | return max_len 251 | else: 252 | max_len = 0 253 | return max_len -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/Play.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.animation import FuncAnimation 3 | 4 | import numpy as np 5 | import time 6 | 7 | from copy import copy 8 | 9 | class Play: 10 | 11 | def __init__(self, game, player1=None, player2=None, name='game'): 12 | self.original_game=game 13 | self.game=copy(game) 14 | self.player1=player1 15 | self.player2=player2 16 | self.player=self.game.player 17 | self.end=False 18 | self.play() 19 | 20 | def reset(self): 21 | self.game=copy(self.original_game) 22 | self.click_cid=None 23 | self.end=False 24 | 25 | def play(self, name='Game'): 26 | 27 | self.reset() 28 | 29 | if self.game.w * self.game.h <25: 30 | figsize=(self.game.w/1.6, self.game.h/1.6) 31 | 32 | else: 33 | figsize=(self.game.w/2.1, self.game.h/2.1) 34 | 35 | 36 | self.fig=plt.figure(name, figsize=figsize) 37 | if self.game.w * self.game.h <25: 38 | self.fig.subplots_adjust(.2,.2,1,1) 39 | else: 40 | self.fig.subplots_adjust(.1,.1,1,1) 41 | 42 | self.fig.show() 43 | w,h=self.game.size 44 | self.ax=self.fig.gca() 45 | 
self.ax.grid() 46 | # remove hovering coordinate tooltips 47 | self.ax.format_coord = lambda x, y: '' 48 | self.ax.set_xlim([-.5,w-.5]) 49 | self.ax.set_ylim([-.5,h-.5]) 50 | self.ax.set_xticks(np.arange(0, w, 1)) 51 | self.ax.set_yticks(np.arange(0, h, 1)) 52 | self.ax.set_aspect('equal') 53 | 54 | for loc in ['top', 'right', 'bottom', 'left']: 55 | self.ax.spines[loc].set_visible(False) 56 | 57 | 58 | # fully AI game 59 | if self.player1 is not None and self.player2 is not None: 60 | self.anim = FuncAnimation(self.fig, self.draw_move, frames=self.move_generator, interval=500, repeat=False) 61 | return 62 | 63 | # at least one human 64 | if self.player1 is not None: 65 | # first move from AI first 66 | succeed = False 67 | while not succeed: 68 | loc = self.player1(self.game) 69 | succeed = self.game.move(loc) 70 | 71 | self.draw_move(loc) 72 | 73 | self.click_cid=self.fig.canvas.mpl_connect('button_press_event', self.click) 74 | 75 | 76 | def move_generator(self): 77 | score = None 78 | # game not concluded yet 79 | while score is None: 80 | self.player = self.game.player 81 | if self.game.player == 1: 82 | loc = self.player1(self.game) 83 | else: 84 | loc = self.player2(self.game) 85 | 86 | success = self.game.move(loc) 87 | 88 | # see if game is done 89 | if success: 90 | score=self.game.score 91 | yield loc 92 | 93 | 94 | def draw_move(self, move=None): 95 | if self.end: 96 | return 97 | 98 | i,j=self.game.last_move if move is None else move 99 | c='salmon' if self.player==1 else 'lightskyblue' 100 | self.ax.scatter(i,j,s=500,marker='o',zorder=3, c=c) 101 | score = self.game.score 102 | self.draw_winner(score) 103 | self.fig.canvas.draw() 104 | 105 | 106 | def draw_winner(self, score): 107 | if score is None: 108 | return 109 | 110 | if score == -1 or score == 1: 111 | locs = self.game.get_winning_loc() 112 | c='darkred' if score==1 else 'darkblue' 113 | self.ax.scatter(locs[:,0],locs[:,1], s=300, marker='*',c=c,zorder=4) 114 | 115 | # try to disconnect if game is over 116 | if hasattr(self, 'click_cid'): 117 | self.fig.canvas.mpl_disconnect(self.click_cid) 118 | 119 | self.end=True 120 | 121 | 122 | def click(self,event): 123 | 124 | loc=(int(round(event.xdata)), int(round(event.ydata))) 125 | self.player = self.game.player 126 | succeed=self.game.move(loc) 127 | 128 | if succeed: 129 | self.draw_move() 130 | 131 | else: 132 | return 133 | 134 | if self.player1 is not None or self.player2 is not None: 135 | 136 | succeed = False 137 | self.player = self.game.player 138 | while not succeed: 139 | if self.game.player == 1: 140 | loc = self.player1(self.game) 141 | else: 142 | loc = self.player2(self.game) 143 | succeed = self.game.move(loc) 144 | 145 | self.draw_move() 146 | -------------------------------------------------------------------------------- /Alpha_Zero/alpha_go_zero/playground.py: -------------------------------------------------------------------------------- 1 | 2 | dict1 = {'k1': 1, 'k2': 2} 3 | print(dict1) 4 | 5 | print(10*float('inf')) 6 | 7 | if not {}: print(1) 8 | else: print(0) 9 | 10 | import numpy as np 11 | 12 | a = np.array([1,2,3,4,5]) 13 | print(a[[False,True,True,False,False]]) 14 | 15 | a = [1,2,3,4,4,4] 16 | b = (i**2 for i in a) 17 | print(max(b)) 18 | 19 | import torch 20 | 21 | a = [torch.tensor(1),torch.tensor(2),torch.tensor(3)] 22 | print(torch.stack(a, dim=0)) 23 | print(float(torch.tensor(1))) 24 | 25 | -------------------------------------------------------------------------------- /P1_Navigation/Future_Improvements.md: 
-------------------------------------------------------------------------------- 1 | It would be very useful to check [Improvements in Deep Q Learning: Dueling Double DQN, Prioritized Experience Replay, and fixed Q-targets](https://medium.freecodecamp.org/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682) 2 | -------------------------------------------------------------------------------- /P1_Navigation/Navigation_Final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigation\n", 8 | "\n", 9 | "---\n", 10 | "\n", 11 | "You are welcome to use this coding environment to train your agent for the project. Follow the instructions below to get started!\n", 12 | "\n", 13 | "### 1. Start the Environment\n", 14 | "\n", 15 | "Run the next code cell to install a few packages. This line will take a few minutes to run!" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "#!pip -q install ./python #to run on udacity workspace" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "The environment is already saved in the Workspace and can be accessed at the file path provided below. Please run the next code cell without making any changes." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "INFO:unityagents:\n", 44 | "'Academy' started successfully!\n", 45 | "Unity Academy name: Academy\n", 46 | " Number of Brains: 1\n", 47 | " Number of External Brains : 1\n", 48 | " Lesson number : 0\n", 49 | " Reset Parameters :\n", 50 | "\t\t\n", 51 | "Unity brain name: BananaBrain\n", 52 | " Number of Visual Observations (per agent): 0\n", 53 | " Vector Observation space type: continuous\n", 54 | " Vector Observation space size (per agent): 37\n", 55 | " Number of stacked Vector Observation: 1\n", 56 | " Vector Action space type: discrete\n", 57 | " Vector Action space size (per agent): 4\n", 58 | " Vector Action descriptions: , , , \n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "from unityagents import UnityEnvironment\n", 64 | "import numpy as np\n", 65 | "import time\n", 66 | "from collections import deque\n", 67 | "import matplotlib.pyplot as plt\n", 68 | "import torch\n", 69 | "\n", 70 | "#env = UnityEnvironment(file_name=\"/data/Banana_Linux_NoVis/Banana.x86_64\") #to run on udacity workspace\n", 71 | "env = UnityEnvironment(file_name=\"./Banana_Linux/Banana.x86_64\") #to run locally" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# get the default brain\n", 88 | "brain_name = env.brain_names[0]\n", 89 | "brain = env.brains[brain_name]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### 2. 
Examine the State and Action Spaces\n", 97 | "\n", 98 | "Run the code cell below to print some information about the environment." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Number of agents: 1\n", 111 | "Number of actions: 4\n", 112 | "States look like: [ 1. 0. 0. 0. 0.84408134 0. 0.\n", 113 | " 1. 0. 0.0748472 0. 1. 0. 0.\n", 114 | " 0.25755 1. 0. 0. 0. 0.74177343\n", 115 | " 0. 1. 0. 0. 0.25854847 0. 0.\n", 116 | " 1. 0. 0.09355672 0. 1. 0. 0.\n", 117 | " 0.31969345 0. 0. ]\n", 118 | "States have length: 37\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# reset the environment\n", 124 | "env_info = env.reset(train_mode=True)[brain_name]\n", 125 | "\n", 126 | "# number of agents in the environment\n", 127 | "print('Number of agents:', len(env_info.agents))\n", 128 | "\n", 129 | "# number of actions\n", 130 | "action_size = brain.vector_action_space_size\n", 131 | "print('Number of actions:', action_size)\n", 132 | "\n", 133 | "# examine the state space \n", 134 | "state = env_info.vector_observations[0]\n", 135 | "print('States look like:', state)\n", 136 | "state_size = len(state)\n", 137 | "print('States have length:', state_size)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### 3. Take Random Actions in the Environment\n", 145 | "\n", 146 | "In the next code cell, you will learn how to use the Python API to control the agent and receive feedback from the environment.\n", 147 | "\n", 148 | "Note that **in this coding environment, you will not be able to watch the agent while it is training**, and you should set `train_mode=True` to restart the environment." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "Score: 0.0\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 166 | "state = env_info.vector_observations[0] # get the current state\n", 167 | "score = 0 # initialize the score\n", 168 | "while True:\n", 169 | " action = np.random.randint(action_size) # select an action\n", 170 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 171 | " next_state = env_info.vector_observations[0] # get the next state\n", 172 | " reward = env_info.rewards[0] # get the reward\n", 173 | " done = env_info.local_done[0] # see if episode has finished\n", 174 | " score += reward # update the score\n", 175 | " state = next_state # roll over the state to next time step\n", 176 | " if done: # exit loop if episode finished\n", 177 | " break\n", 178 | " \n", 179 | "print(\"Score: {}\".format(score))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "When finished, you can close the environment." 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### 4. It's Your Turn!\n", 194 | "\n", 195 | "Now it's your turn to train your own agent to solve the environment! 
A few **important notes**:\n", 196 | "- When training the environment, set `train_mode=True`, so that the line for resetting the environment looks like the following:\n", 197 | "```python\n", 198 | "env_info = env.reset(train_mode=True)[brain_name]\n", 199 | "```\n", 200 | "- To structure your work, you're welcome to work directly in this Jupyter notebook, or you might like to start over with a new file! You can see the list of files in the workspace by clicking on **_Jupyter_** in the top left corner of the notebook.\n", 201 | "- In this coding environment, you will not be able to watch the agent while it is training. However, **_after training the agent_**, you can download the saved model weights to watch the agent on your own machine! " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "Episode 100\tAverage Score: -0.03\n", 214 | "Episode 200\tAverage Score: 2.742\n", 215 | "Episode 300\tAverage Score: 6.36\n", 216 | "Episode 400\tAverage Score: 8.75\n", 217 | "Episode 500\tAverage Score: 12.05\n", 218 | "Episode 600\tAverage Score: 14.01\n", 219 | "Episode 700\tAverage Score: 15.20\n", 220 | "Episode 800\tAverage Score: 15.65\n", 221 | "Episode 900\tAverage Score: 16.08\n", 222 | "Episode 1000\tAverage Score: 15.60\n", 223 | "Episode 1100\tAverage Score: 15.92\n", 224 | "Episode 1200\tAverage Score: 16.57\n", 225 | "Episode 1300\tAverage Score: 16.78\n", 226 | "Episode 1400\tAverage Score: 16.74\n", 227 | "Episode 1500\tAverage Score: 16.81\n", 228 | "Episode 1600\tAverage Score: 16.73\n", 229 | "Episode 1700\tAverage Score: 17.11\n", 230 | "Episode 1800\tAverage Score: 17.14\n", 231 | "Episode 1900\tAverage Score: 15.93\n", 232 | "Episode 2000\tAverage Score: 16.42\n", 233 | "Training Time is 2462.7125222682953\n" 234 | ] 235 | }, 236 | { 237 | "data": { 238 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAEKCAYAAAAfGVI8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJztnXecFdXZx3/PFurSWZC+9KbSVhAFBBEBUbDECDGKLdYYzZuYoImoUSOx19hijyHmjUk0r4mIgiCK4II0kSYuvSxtWWD7nvePmbk7d+70O+3e+3w/H9h7z5yZ89wzM+c55znPeQ4JIcAwDMNkLllhC8AwDMOECysChmGYDIcVAcMwTIbDioBhGCbDYUXAMAyT4bAiYBiGyXBYETAMw2Q4rAgYhmEyHFYEDMMwGU5O2ALYoW3btqKgoCBsMRiGYVKKFStWHBBC5FvlSwlFUFBQgKKiorDFYBiGSSmIaJudfGwaYhiGyXBYETAMw2Q4rAgYhmEyHFYEDMMwGQ4rAoZhmAyHFQHDMEyGw4qAYRgmw2FFwCRFUfEhbNh7NGwxUoKqmjr8rWgHeHtYJmqkxIIyJrr84IWlAIDiOVNCliT6PLtgM55esAWNc7NxwaCOYYvDMDF4RMAwAVFyrBIAcLSiOmRJGCYeVgQMwzAZDisChmGYDMc3RUBEXYhoIRF9S0TfENFtcvq9RLSLiFbJ/87zSwaGYRjGGj8ni2sA/EIIsZKImgFYQUTz5WNPCCEe9bFshmEYxia+jQiEEHuEECvlz2UAvgXQya/yGCZVIJDp8Y++2YvKmtqkyth+8ATW7ix1de7X2w9j5+ETSZVvlyMnqvD5lgOuz/+q+BD2H62wlffj9ftQUV1fr3tLK1BUfMh12V6wblcpth08HqoMQEBzBERUAGAIgGVy0k+JaA0RvUpErQzOuZ6IioioqKSkJAgxGSZ0ln53ENe/tQKPfLgxqeuMeWQhLnh2iatzL/rjFxj1h4VJlW+Xma99hcv/tCyugXbCpS8sxeSnPrPMt2bnEVz3ZhHu+/f6WNqEJxbF3J/D4vxnluCsRz4NVQYgAEVARHkA3gVwuxDiKIDnAfQEMBjAHgCP6Z0nhHhJCFEohCjMz7fcYIdh0oIjJ6oAADsPl4csSTBs2lsGAKitc7/I7uDxKss8ZRU1ABDX+1bSGJ8VARHlQlICbwsh/gEAQoh9QohaIUQdgJcBDPdTBoZJRQR49bGXkGyNS0bhpDN+eg0RgFcAfCuEeFyV3kGV7SIA6/ySgWGY1IDMp02SJlsugKN76OOn19CZAK4AsJaIVslpdwGYQUSDAQgAxQBu8FEGhokMThohqwnldMPvBppkRVDHmkAX3xSBEGIJoPs0/8evMhkmXWDTkLdkyS0RKwJ9eGUxExiHj1dh3S53Lo1hsPS7g6iprQu0TL9NJGr2lJZjy/5jwRWoQ7lLbyEApi6uq3cciYvplJWljAiclXGiqgYrtvnvYnpAjkMVFqwImMC48I+f4/xn3Lk0Bs2KbYcx4+Uv8dj8TZ5fO8jG3oyRDy3AOY8vClsM1xi5uNbWCUx77nNc/dpXsbSs2ByBM03wy/9djUueX2p7rYJbJj6x2NfrW8GKgAmMbQeDWaTkBSVlUg8t6B5zplouvPzZivln1Y4jsbR605Cza63bJe21caIquQV+VthxgfUTVgQMo0NUeu2Mc/RuXZbLyWJFgaS7fmZFwDA6KI1J0D10VkDJo3fLyOWIgFyalFINVgQMowPFWuT0bgCigh8NrfqabucIMuUpYEXAMCZ42T45uVaad0B9Ra/uFEXgeGWxYhpK8/vBiiBNKauoxtaScF0Dzdh9pDw2IRsG1bV1+Ga3sSur1xaag8cqsbs0Pn5QXZ0wdafdvK8M5R5NUq7bVWrYCK7bVYoDxyqxPcTJfAFg7c5ST0cG6ivtL5O8ftyvI7B33v6yCizcsB/HK2uwYe9Rl2UFDyuCNOWHL36Jsx+LrmvgGXMW4LQHPw6t/EfmbcSUp5dgy/4y03xeNUvDHvgYn22OD7f8x0+34PxnluDr7YcT8lfW1GHCE4tx69yvky573a5SnP/MEjz1yWbd4+c/swSFD3yMMY8EE3FUjy+2HMQFzy7Ba58XJ30tvcV4V7yyXDrm8IZmOQxNMfzBT3D1619h4D3zMOnJz3x3O/UKVgRpyrd7Uqc3EgarZdfCkjJ9t70gJm3Xy/do95HExqKqRlrItmzrwaTKEEJgb6l0/Sgv5tt+SIoKunGvuWIOmmTnCI6mSIRTVgRMRkIxt0DzV9xPbxElnlC8uUKTlqRCqhOp4YmkWK28kNXsljm9m5T0HEFqTC6wImAyG4P31M/GkzQf4kUQ+nldkipuj4ri87LevfjpirJ2G/spRaqfFQGTmdS/4Ob4+R6buTQqKZRky6i+cpSVQlCiua0Dt/JFt8bjYUXAZCRW7WsQYaCzdM0O8coh2R5ynRApYRqqx19hgzcNpQasCBhd9pdVoEwVvREAig8cR51DP+yqmjrsOGTulvj9geA371Ze8MMnqrD94InEKKMeNAB6dahTBPaUVmBPqf7WlEdOVBu6kB6tqLZ0wVXLf+hEdWwrTCO0LqbbDh7HntJynKiqkeWpwiFVXBztvdtTWh4nb3lVLVbtOIIV2w5j39EKVNfqPw9Hy6V6OlZZE4vE6ea52H2kPG7/4xpNeXvkiXk9GfYdrcDxSv3JXcU0tONQ/bNi9FvizpOrs7KmFut3mztw7C2tiKu7ujoR2Mb2rAgYXYY/+AnGq9xPN+4tw9hHP8Xzi76Lpb23apflde55/xuMfnghSk/oN4iLN5Vg3KOf4l9fW1/LS5Qe/0//8jXGPLIQc/67QXNcIpmOoLYOE2SQtdEfPtyQsAG7ugG/6rXluuePeXihpQuu+jqrdxzB4N/NN81frVKIm/eV4axHPsXIhxbgB89Lm7wP/t18DL1fusYinXs38qEFcfLOfHU5Lnzuc1zy/BcY8ftPYs/DYU2QtRcXbwUA/Hv1bhQ+8DEWbtyPcY9+ivdX79aV0yg8+BlzFuCGt1bEvt/89kqMfrjeLbaqtg77jlbEpX28fh8AYMTvP8Elz38Rdz21++j+Mum8Bz74FgDwu3+vx+iHF+KgSQhpZe7jifmbcd7Tn5kGMTz9oU8w/eUvY9+fXrAZZz3yaSDrgVgRMIbsV/U2dx2Rej5FxfWx2dfutHZH/GxzCQCgtFxfESjugmG7Nn7xnb6bZrJ29f0mPXa1yeaIRlGqS132vX48fO05eiSzwc3OI/WjlPU67sibDO6dWt7lxfGyL9ooPQ/HDHreClbPhd7AVLlX6vI/kht5NQePxSuhb1Q99Q0a91X1PVI6M8ozrfy14yK6VHYDNhshAvVuzQDwpXyO4v7rJ6wIGFu4bQ9zs6VHrLpOvwfnh7eIHSznCAIQKCuAMlLVtl0f9M/+D7CbVascaw2eTe21jW6XmYzKofo5n2hO2LAiYBzh9EHOkWdEa2r1XxYlNYhGMWpkmfxkrzx86oRwNPGtLtbPO2
KtiKW/RtWgN9KptVln2mzVJvNeeutNnHh0KecpZZrdc2MhXJzjEFYEjK/kKCMCA5tuXUg9JavygmgEzRporzryTq/j917JdhWclXuv3mUcB5STMduONCaHkL6ZCqWDImfsOXfzZAUwqmNFwPhKbrY8IjB4SWMLaIM2DdnM56dpRe83e+2u6PQ6bsp1I6qlInZRD24Dyhk9m3FyIHGTGicOBVE30bEiyHAOHKu05RLq9kFWTEOHjutPmsZsp5r00vJqHKuswZETVSgpq4QQApU1tYbeR1q2Hzyh6wp45EQV9pSWJzTC2k3UtcfN6kkIEXPjPF5ZE3O1VNBGodx5uFwuI74QIUTMddIqWFlljb2opHqboh85URWLZaRFbcIza6z3Ha2IuWkqz4adHrmSw8qNVeGwKl9FdW1ssrVCc78qqmttjwgqNb+9oro2zg1Xmcg+VlkTK2f3kfLYxL82/Mf3JcYunrE5AlXaoeNVOFZZYxrttaSssl6mADpJOf4XwUSVPaXlGPnQAvz8nD647ZzevpSRkyX1Na55vUj3uPLuaucIBt33Udz3ey4YgPnr9+GL7w6ieM4U0zJXbDuES2R3R21eI/dJI591AYHdR8pxxpwF+MWEPrh1fGI9vfXlNsx+7xvM//kYTHhiMbII2PpQfbmTnox3DX1mwRac3KlFgrJ5ftF3ePjDjQCA3RaeIjNf1Xcp1TL+sUU4s1ebuLTBv5uPs/rk6+a/4++rbV13xO8/SUh7ZN5Gy/OUhnHK00tM76OihP759S7MueQUNMzJxvjHFmHXkXIUz5mCqc9+Hpf/wuc+xzs3jLQl+4//tCzu+3urdmPu8h2x7yffMw/Fc6bg5HvmxdJufnul4fWue7MIz/5oCM4/taNhHqXDU1lTG3O/NSPOLZhNQ4yfKG5pCzbut32O486JxQl2J9EWbyoxdPHUYrVwxw5q2/Aei3pavEkKL71VViZ2OqYrtx9OqJpPN5TYlu/LrfoupXp8viWx3hZt0i9L7W7p9F5/9M1eR/nN5gvUZSuLrHap3Fm3axZybdhbZnuxo3b053Rjej2xi4oTQ4kDiXMu2tFIVGBFwPiKVWPiy2SxB9eKt1GbNzCe2fVT3HHKjtdOnPeNSXb1LTSz4TstXw+nnjzKb1CfZuT1pp0sjupcASuCDMbJM+n2+bVqk5VeoZX7qJPyvWhP9a5hdF2v2m+/9IArl0UXOJ2sNcutFtnI9TihfJdeQ4pnm130lh0Y1bEiUf1cQTQ1gW+KgIi6ENFCIvqWiL4hotvk9NZENJ+INst/W/klA+M96vbai0683Tj0Tt5xT0MZB6IuJfzynMoOSBN42dtVjxBrbCz4Apw9I2qyPaj4LIM6Vjo6mTwiqAHwCyFEfwCnA7iFiAYAmAXgEyFEbwCfyN+ZNEXPb1ptG1YaWqu2yskCK08Wp6kuYVW0Vw24X4vqglqsZ+cWqfOYzhGoRLbrDeTWNJTjUlGqlZXhqmODv04IQnf4pgiEEHuEECvlz2UAvgXQCcA0AG/I2d4AcKFfMjDeYachrq6tM1w4ZsTxSmmiLq4HaOMaNXLwMC21dSJOhvIqyeW0prbO0uXyRFUNSk9IbquKe+XR8hpUy6YJpUFy8zvtYLfB1otGqqTV1YkE10q3E5RacYziA5VXS/WlNg1VVNfq5ldP1No1DVXW1MW5mxpFY62sdjbpq5CdnVjvRmUo8lRU18ZFYa2s1q9jodEAdtxm3Zq4kiEQ91EiKgAwBMAyAO2FEHsASVkQUbsgZGC8R6sbBs6eh2aNcrDi7gm2z3v9i+KE471+81/LspQ8f/nJCJzRs20s/SdvFmHBhnrvnv6zPwQAXDSkE/5pEeF0wOx5CWnr9xzFDDki5Go5yN4p985Do9xsrJp9LgDt6lP37DxsHtIYkHzQte6HK7cfxsV//AKvXlWIxZsO6NapF6jdKdXMXb4Dc5fvQPvmDWNp/e7+UDdvmSpAm2l9qbTQ+U8vQZVK8Sr3VMvZJpFenWJUBiCtzTjl3nmxDgIgPceXDO2sk1uo/gdu++sqy7Ive2mpE1E9wffJYiLKA/AugNuFELb9+ojoeiIqIqKikhL7bnWMc7wyHFTV1uGgJrywV1YJI1u9NqyvWgmosVICTqiorouL/KlefeoEbd0UmywwUtBbILZym+S6+NnmA5i7fLtDKYxxGg7B6SDJbP5FXXKVD6MvNW564NU6E9hrdh3RySnhxLT5lYErqp/4qgiIKBeSEnhbCPEPOXkfEXWQj3cAoPvmCiFeEkIUCiEK8/P1F78wYWBtF43LrZPHlZ3U4KQoRXMMYiJQb+JXXQfhTkY69Bqy6T7qN37WWb37qH9leIGfXkME4BUA3wohHlcdeh/ATPnzTADv+SUDkz4YvaxeeHwkS5Ai5GYZv7JCuI+34wVOGztTRRDgogo/66x+iiDamsDPOYIzAVwBYC0RKYaxuwDMAfA3IroWwHYAl/ooA2OCk+ff9ToCC68h++XrnxOUj3xU0JvYVKd42ag5VXBOyzbLH+R99bOJjo0IormgOIZvikAIsQTG5ufxfpXLRAuvestGvc0IDAhUIZP97/VZr9T2XQTjsh0WbqYIgryvXilPrzo9YcAri5nAcfVqqE5SR86MxByBixATRu6GRsQifeoVHys/+UYnmep0HPLaTI5ATUP+lxF1hUBRFxAACgsLRVGRfvRKRp+CWR/EPo/tm4/Xrx4ed7ymti7mgjm4S0v865Yzdc9VIkROe3YJVu8sxejebVFSVpmwt6ua/GYNUVJWidvG98bK7Yfx2eYDlvJ2atk4LqhYVCmeMyVWP7ee3QvPLNiCSQNPwocOA6655aozCgzdQ68c2Q1vLt3mWVlvXzcCl2sidQZFz/ym+M4kvHMq8eY1w3HH31dj31Hj/avNePu6ETizV1vrjDoQ0QohRKFVPh4RZACfbkx0v3Xqkqf40B+tqDFVAgBicdRf+/x729dPBSWg5cXFWwHYD4HgBW996V1Db0WYY610UQIA8NH6vbbjJekRRF+dFUGGQg7dQN0Q/bFmcoQxmrazUToTPaJ+a1gRMP4hImLD9wkl5ESQE7RRb1CYRISI/hwBKwLGV9JXDdQrALebprvBrD3x3GspnW9ewGTsgjIm2rhtNJy0DRF/9j0jzEVcTGqQzDMShFsyK4IMJYi2K8iecphERQ94LkdEfleqI5DcveHJYsZz9pSWo2DWB/j4230Jx/aWVsS5jgLA3f9ah153/cdVWeXVtSg+mD7eHwAS6geIzojg7WXeBZwDgB+F5Dqabvxl2XbDEN52COLpCiQMNRMd1spuoHpRKtftKk1IS9ZdcZuNiJqpTlQUAZOedG/T1PcyeESQYXCT5T0ZYgFjQqJrmya+l8GKIENRd2LZOSQ5wthRimG8hBVBhsJNl3ewaYhJdVgRZBhK7z/qC1xSCR4QMKkOK4IMw6zN4vbMHaxUmVSHvYZSnGVbD+Kyl77EsrvGo33zRgCkzcy1aN0e3bZdq3YY78uaqSgB+RgmVeERQYrzxtJiAECRasPrv9jwJ+c+L
MMwCqwI0hA7XkBszmCCZmjXlmGLwBjAiiANsRPwk9UAEzTpHIk21WFFwDBMIAS5IT3jDFYEaYid/V7ZMsQETZD7EDPOYEWQhjg1DfGQnQkEfswiCyuCiPCzuV/jd/9ebyvv3OXbMenJxXFp/1i5E+Mf+xSAPUWwWuUGWl5Vi6+KD+EnbxbZlpdhnMJ6ILrwOoKI8P7q3QCA2RcMsMx75z/WJqR9smG/6puzV27bweN4Yv4mR+cwjFN44BldeESQZrBbKBNVeI4gurAiSDPc6AFWHUwQ8IgguvimCIjoVSLaT0TrVGn3EtEuIlol/zvPr/IzlTohHL9wPIhgmMzGzxHB6wAm6aQ/IYQYLP9ztwciY4gAT8oxDOMM3xSBEGIxgEN+XZ/Rx9WIAIJHBQyTwYQxR/BTIlojm45ahVB+2rDzcOJ+wP9duxd//tLbTcwZxgt4jiC6BK0IngfQE8BgAHsAPGaUkYiuJ6IiIioqKSkJSr6U4o0vihPSbn9nlePrCMEvaaoxundbW/leumKY7Wu2zWvoVhxLrhvVHY1ysnWP5Wa7f/huGNMDk08+yfX5U07p4PrcdCJQRSCE2CeEqBVC1AF4GcBwk7wvCSEKhRCF+fn5wQmZQnhlzmGrUOrxmyn9LfMM794aQ7raH3S/fKV9paEwcWB7W/l+e77x+phLhnaOfb52VHdH5ffIb4pnfzTUVt7e7fIS0uwooRnDuziSKRUJVBEQkVr9XgRgnVFexhrPGnDWBGmL3+tKvFgboA5x4mdguiyXw95MCMHi28piIpoLYCyAtkS0E8A9AMYS0WBITU8xgBv8Kp+xj2BNkHLYaoBFauynrG5nnTa65EAV6V3aTnnprwZ8VARCiBk6ya/4VV4mkgkPKKOP3fbSiZIPS2dkxSkC5+fbPSdbZ7hh59QMGBDwyuJUJgU6e0yICAjfRwReNJJqk41jUxPZH0XomYbsVE8mhMZgRZAifLx+H+7/P3vRSZ1SXSvwxXcHfbk24w/2tiNNjdhTWUnMETjJ7nb+gUcETGS47s0ivLLk+7DFCJRfTeobtgih8dilg0yP222cOrVsbHisWaMc/HRcLydi+QKZmIaaNYy3Xj9xmXm9mJfjcrLYdYmpAyuCFCYFOnuu+ejnY3Dz2OQaqeI5U5KWo21eg6Sv4YZLhnU2PGbXb17AuPG79exeWHvvRPxyYr2yDavBMzO9LPvN+LjvFw0xrhfLcuRi3rpW32v9zsn9AAADOzbXnJf+qsC2IiCiUUR0tfw5n4icOfwyjAOi8+pFR5J47GxHatxT0Ds7rH6Fup3V2vG9tM9nWzToStkZ0O4nYEsRENE9AH4N4E45KRfAn/0SimEy8WW0i92RYBANuxf3iQw+27m+k9663XUEWuWTCc+i3RHBRQCmAjgOAEKI3QCa+SUUwzDm2NqXOkVMh8mtI3Bejrqh1xs1aUVgr6F6qoRUYwIAiKipfyIxDGOFLa8h0wtEp3FTN/4JjbCHYtofEfgnQ1Sxqwj+RkQvAmhJRD8B8DGkWEGMC8qranHnP9ag9ES1aT4hBB6ZtwErtulH8+YVwf5z4Fhl2CL4gldtmychJkyup/6uvzLYQTkWeWPvU8I8RfpjSxEIIR4F8HcA7wLoC2C2EOIZPwVLZ/5WtANzl+/AEx+bbxhfXSvw3MLvcMnzSwOSLLXo1qZJQtrUQR0xa3I/PDV9sOX5HVo0sp3XCrvRQL1C3Ytu31yKGtqnfR7evekMtGySCwB44ofxrpazzx+QVKTPIPjhafFeQeo2+c/XjgAgeffMPn8ALhjUEZNPlsKXFXYzD67XIDvLWhHIeiCLgLevG6ErQxA8edlgnNq5RaBlWioCIsomoo+FEPOFEHcIIX4phJgfhHDpimKXTIXFPlHmycviG/BTOrXA0zOG4MazemLa4E4AgAsGdTQ8/87z+sfyFugoFSf8aWYh/n7jyLi04jlTUDxnCn42vrfl+cMsGjIjWjXJRYMc6TV+avoQDOvWCqtmn4viOVPQIz8+2uY1o7rjpiRcct+75cy47+2aNfSmu6y6RtOGOXj1qkK9QzhFbhxvOKsnrhnVHc/MGILGDaTQ1n+/6Qx0aNEolrd4zhSsnn0uAGm9xKYHJzsS58xe9YpdrXQvld16FVdTAMjxKFLeqtkTUDxnCi4c0glPTR/iyTXtYqkIhBC1AE4QUbAqKo3h5t8btBOLTk1lXiriLKKkeo51DmQREKbmFDu4kVUvVo/fJOXDrznVqoqVw9oySTdPYpqXBF3TdoPOVQBYS0TzIXsOAYAQ4me+SJUhZMJCFT/R1p7eix5UDUuKwH1pTmMC6a3GtVV8EsoviMc1YaLW5JgWNz9N75SEctSNvjKNYOF5lCxBNw12FcEH8j/GQ6weIKsebqZblrQvi9P68LL+ssh6wZK5LA5GBKqsRO6ma92cped143d7lYxrqRdlAvqNvv8jgmA1gS1FIIR4g4gaAOgjJ20UQpi7vDBJk+kNvRXal8VpdakVbbKNDBEZuyfauJG1DocE6t8eWxGbZONhJaZfpqF4zyDtYi77ZboKYa36rNfb115Xr4r8eE8jOSIgorEA3oC0mQwB6EJEM4UQi/0TLTO4///Wo0mDxL1chRB46pPNpud+s7sUXxUf9ku0ULHzcmlfloY57kNneTLnaaQHbJybjGnIifDJtFl+TRHYndtxXbztldgxTWBYrjKX43a3M9PyQ+z42X1zHgNwrhDiLCHEGAATATzhn1jpjXLDDx6vwitLvsczC7Yk5Nl64Die//Q73fPz5IiM6aoEzLhlXE/DY7+a6CxaqdmLN21wR5zdr52jxk/dOPzWZE9hPS8ixeQwcWB73aireaoonL+aVO+xkqUyDRm1TT8/pw/ukOvmitO7YXCXlpgxInEfXrvhHPIa5mBMn3w8M8OdZ0uPtu7Wozpte5s1zMG4vvl44QppL+ZYj9+ih2+2oOzn5/RBYbdWuGRoZzTI9iZmZ9u8hhjbNx8tGufqlhkEdn9JrhBio/JFCLEJUrwhJgnMOgBmpoL+HaIT3aPfScayPHjRybauUTxnCu46r591RgB3TOyH7nJDon5Z+p3UDGf0SvTld/tC3XVef7x61Wlo1sj+Y56lepuuOqPAMN//TOiTsCG6cr9/cW5f3airt55dn9ZLtQl7brb1JPVt5/TGLXK46XbNG+Fft5yJds3qXS0Hd2lper6CUkoWAW9eMxwjerQxzf/wD07VTb/rvP5x4aXjTEO2JLFHVhbhtauHx7mCmqHIYTZHUNC2Kf5+0xlo0SQXr151miN5/nr96ZhySoeE9OHdW+H1q4cjS9XrCNqRxO5kcRERvQLgLfn75QBW+CNSBmGiCZy4E0YVP4bPQH3vOdnrx1WxgTnAySSuWh4re7pW0dc6/E3K85GTTSpZbQrqEt3JYjN5zeRxZc/3/nnSu6LdoHNePd51dXoyBItdRXATgFsA/AySjIsB/NEvoTIFM9uo2UudKkGwgvQo8asRdHLZ+L131Z4miXm1A766OkUR2CurRs6fm1W/YjbZkCNWdRiI
... (remainder of the base64-encoded PNG for the training-score plot, i.e. Score vs. Episode # produced by the cell below, elided) ...\n", 239 | "text/plain": [ 240 | "" 241 | ] 242 | }, 243 | "metadata": {}, 244 | "output_type": "display_data" 245 | } 246 | ], 247 | "source": [ 248 | "#Training the model\n", 249 | 
"from dqn_agent import Agent\n", 250 | "\n", 251 | "agent = Agent(state_size=37, action_size=4, seed=0)\n", 252 | "#print([p for p in agent.qnetwork_local.parameters()])\n", 253 | "\n", 254 | "# #debug\n", 255 | "# state = env.reset()\n", 256 | "# print(agent.qnetwork_local(state))\n", 257 | "# print(agent.qnetwork_local.forward(state))\n", 258 | "# #end debug\n", 259 | "\n", 260 | "filename_to_save = 'ddqn_checkpoint.pth'\n", 261 | "final_eps = 0.01\n", 262 | "def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=final_eps, eps_decay=0.995):\n", 263 | " \"\"\"Deep Q-Learning.\n", 264 | " \n", 265 | " Params\n", 266 | " ======\n", 267 | " n_episodes (int): maximum number of training episodes\n", 268 | " max_t (int): maximum number of timesteps per episode\n", 269 | " eps_start (float): starting value of epsilon, for epsilon-greedy action selection\n", 270 | " eps_end (float): minimum value of epsilon\n", 271 | " eps_decay (float): multiplicative factor (per episode) for decreasing epsilon\n", 272 | " \"\"\"\n", 273 | " scores = [] # list containing scores from each episode\n", 274 | " scores_window = deque(maxlen=100) # last 100 scores\n", 275 | " eps = eps_start # initialize epsilon\n", 276 | " for i_episode in range(1, n_episodes+1): \n", 277 | " env_info = env.reset(train_mode=True)[brain_name] # reset the environment\n", 278 | " state = env_info.vector_observations[0] # get the current state\n", 279 | " score = 0 # initialize the score\n", 280 | " for t in range(max_t): #this could also be while True instead\n", 281 | " action = agent.act(state, eps) # select an action\n", 282 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 283 | " next_state = env_info.vector_observations[0] # get the next state\n", 284 | " reward = env_info.rewards[0] # get the reward\n", 285 | " done = env_info.local_done[0] # see if episode has finished\n", 286 | " agent.step(state, action, reward, next_state, done) #do the learning\n", 287 | "\n", 288 | " score += reward # update the score\n", 289 | " state = next_state # roll over the state to next time step\n", 290 | " if done: # exit loop if episode finished\n", 291 | " break\n", 292 | " scores_window.append(score) # save most recent score\n", 293 | " scores.append(score) # save most recent score\n", 294 | " eps = max(eps_end, eps_decay*eps) # decrease epsilon\n", 295 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end=\"\")\n", 296 | " if i_episode % 100 == 0:\n", 297 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))\n", 298 | " if np.mean(scores_window)>=25.0:\n", 299 | " print('\\nEnvironment solved in {:d} episodes!\\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))\n", 300 | " torch.save(agent.qnetwork_local.state_dict(), filename_to_save)\n", 301 | " break\n", 302 | " torch.save(agent.qnetwork_local.state_dict(), filename_to_save) #for debug only\n", 303 | " return scores\n", 304 | "\n", 305 | "strt = time.time()\n", 306 | "scores = dqn()\n", 307 | "print('Training Time is {}'.format(time.time()-strt))\n", 308 | "\n", 309 | "# plot the scores\n", 310 | "fig = plt.figure()\n", 311 | "ax = fig.add_subplot(111)\n", 312 | "plt.plot(np.arange(len(scores)), scores)\n", 313 | "plt.ylabel('Score')\n", 314 | "plt.xlabel('Episode #')\n", 315 | "plt.show()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 6, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | }, 325 | { 326 
| "cell_type": "code", 327 | "execution_count": 4, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "Episode 1: 16.0\n", 335 | "Episode 2: 19.0\n", 336 | "Episode 3: 13.0\n", 337 | "All the scores[16.0, 19.0, 13.0]\n", 338 | "Mean Score: 16.0\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "#Testing the model\n", 344 | "from dqn_agent import Agent\n", 345 | "filename_to_load = './ddqn_checkpoint.pth'\n", 346 | "final_eps = 0.01\n", 347 | "\n", 348 | "agent = Agent(state_size=37, action_size=4, seed=0)\n", 349 | "# load the weights from file\n", 350 | "#agent.qnetwork_local.load_state_dict(torch.load(filename_to_load))\n", 351 | "\n", 352 | "#Since the model is trained on gpu, need to load all gpu tensors to cpu:\n", 353 | "agent.qnetwork_local.load_state_dict(torch.load(filename_to_load, map_location=lambda storage, loc: storage))\n", 354 | "\n", 355 | "\n", 356 | "#print([p for p in agent.qnetwork_local.parameters()])\n", 357 | "\n", 358 | "# #debug\n", 359 | "# state = env.reset()\n", 360 | "# print(agent.qnetwork_local(state))\n", 361 | "# print(agent.qnetwork_local.forward(state))\n", 362 | "# #end debug\n", 363 | "\n", 364 | "num_episodes = 100\n", 365 | "scores = []\n", 366 | "for i_episode in range(1,num_episodes+1):\n", 367 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 368 | " state = env_info.vector_observations[0] # get the current state\n", 369 | " score = 0 # initialize the score\n", 370 | " while True:\n", 371 | " action = agent.act(state, eps=final_eps) # select an action\n", 372 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 373 | " next_state = env_info.vector_observations[0] # get the next state\n", 374 | " reward = env_info.rewards[0] # get the reward\n", 375 | " done = env_info.local_done[0] # see if episode has finished\n", 376 | " #agent.step(state, action, reward, next_state, done) #do the learning\n", 377 | "\n", 378 | " score += reward # update the score\n", 379 | " state = next_state # roll over the state to next time step\n", 380 | " if done: # exit loop if episode finished\n", 381 | " print('Episode {}: {}'. 
format(i_episode, score))\n", 382 | " scores.append(score)\n", 383 | " break\n", 384 | "\n", 385 | "print('All the scores{}'.format(scores))\n", 386 | "print(\"Mean Score: {}\".format(np.mean(scores)))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 5, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "env.close()" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [] 411 | } 412 | ], 413 | "metadata": { 414 | "kernelspec": { 415 | "display_name": "Python 3", 416 | "language": "python", 417 | "name": "python3" 418 | }, 419 | "language_info": { 420 | "codemirror_mode": { 421 | "name": "ipython", 422 | "version": 3 423 | }, 424 | "file_extension": ".py", 425 | "mimetype": "text/x-python", 426 | "name": "python", 427 | "nbconvert_exporter": "python", 428 | "pygments_lexer": "ipython3", 429 | "version": "3.6.2" 430 | } 431 | }, 432 | "nbformat": 4, 433 | "nbformat_minor": 2 434 | } 435 | -------------------------------------------------------------------------------- /P1_Navigation/Readme.md: -------------------------------------------------------------------------------- 1 | **Project Report: https://medium.com/@amitp-ai/double-dqn-48562b5f31c1** 2 | 3 | 4 | [//]: # (Image References) 5 | 6 | [image1]: https://user-images.githubusercontent.com/10624937/42135619-d90f2f28-7d12-11e8-8823-82b970a54d7e.gif "Trained Agent" 7 | 8 | # Project 1: Navigation 9 | 10 | ### Introduction 11 | 12 | For this project, you will train an agent to navigate (and collect bananas!) in a large, square world. 13 | 14 | ![Trained Agent][image1] 15 | 16 | A reward of +1 is provided for collecting a yellow banana, and a reward of -1 is provided for collecting a blue banana. Thus, the goal of your agent is to collect as many yellow bananas as possible while avoiding blue bananas. 17 | 18 | The state space has 37 dimensions and contains the agent's velocity, along with ray-based perception of objects around agent's forward direction. Given this information, the agent has to learn how to best select actions. Four discrete actions are available, corresponding to: 19 | - **`0`** - move forward. 20 | - **`1`** - move backward. 21 | - **`2`** - turn left. 22 | - **`3`** - turn right. 23 | 24 | The task is episodic, and in order to solve the environment, your agent must get an average score of +13 over 100 consecutive episodes. 25 | 26 | ### Getting Started 27 | 28 | 1. Download the environment from one of the links below. You need only select the environment that matches your operating system: 29 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Linux.zip) 30 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana.app.zip) 31 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Windows_x86.zip) 32 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Windows_x86_64.zip) 33 | 34 | (_For Windows users_) Check out [this link](https://support.microsoft.com/en-us/help/827218/how-to-determine-whether-a-computer-is-running-a-32-bit-version-or-64) if you need help with determining if your computer is running a 32-bit version or 64-bit version of the Windows operating system. 
35 | 36 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Linux_NoVis.zip) to obtain the environment. 37 | 38 | 2. Place the file in the DRLND GitHub repository, in the `p1_navigation/` folder, and unzip (or decompress) the file. 39 | 40 | ### Instructions 41 | 42 | Follow the instructions in `Navigation_Final.ipynb` to get started with training your own agent! 43 | 44 | ### (Optional) Challenge: Learning from Pixels 45 | 46 | After you have successfully completed the project, if you're looking for an additional challenge, you have come to the right place! In the project, your agent learned from information such as its velocity, along with ray-based perception of objects around its forward direction. A more challenging task would be to learn directly from pixels! 47 | 48 | To solve this harder task, you'll need to download a new Unity environment. This environment is almost identical to the project environment, where the only difference is that the state is an 84 x 84 RGB image, corresponding to the agent's first-person view. (**Note**: Udacity students should not submit a project with this new environment.) 49 | 50 | You need only select the environment that matches your operating system: 51 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana_Linux.zip) 52 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana.app.zip) 53 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana_Windows_x86.zip) 54 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/VisualBanana_Windows_x86_64.zip) 55 | 56 | Then, place the file in the `p1_navigation/` folder in the DRLND GitHub repository, and unzip (or decompress) the file. Next, open `Navigation_Pixels.ipynb` and follow the instructions to learn how to use the Python API to control the agent. 57 | 58 | (_For AWS_) If you'd like to train the agent on AWS, you must follow the instructions to [set up X Server](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above. 
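### (Optional) Quick Environment Check

Before training, it can help to confirm that the Unity build and the `unityagents` Python API are working. The snippet below is only a rough sketch of a random-agent loop built from the same calls used in `Navigation_Final.ipynb`; the `file_name` path is an assumption (the Linux build unzipped into this folder), so point it at whichever build you downloaded above.

```python
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name="./Banana_Linux/Banana.x86_64")  # assumed path; change per OS
brain_name = env.brain_names[0]                     # default brain that controls the agent

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]             # 37-dimensional state vector
score = 0
while True:
    action = np.random.randint(4)                   # pick one of the 4 discrete actions at random
    env_info = env.step(action)[brain_name]         # send the action to the environment
    state = env_info.vector_observations[0]         # roll over to the next state
    score += env_info.rewards[0]                    # +1 for a yellow banana, -1 for a blue one
    if env_info.local_done[0]:                      # episode finished
        break
print('Random-agent score: {}'.format(score))
env.close()
```

A trained agent simply swaps the random choice for `agent.act(state, eps)` from `dqn_agent.py` after loading the saved weights in `ddqn_checkpoint.pth`.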
59 | -------------------------------------------------------------------------------- /P1_Navigation/ddqn_checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P1_Navigation/ddqn_checkpoint.pth -------------------------------------------------------------------------------- /P1_Navigation/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | 5 | from model import QNetwork 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | BUFFER_SIZE = int(1e5) # replay buffer size 12 | BATCH_SIZE = 64 # minibatch size 13 | GAMMA = 0.99 # discount factor 14 | TAU = 1e-3 # for soft update of target parameters 15 | LR = 5e-4 # learning rate 16 | UPDATE_EVERY = 4 # how often to update the network 17 | 18 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | 20 | class Agent(): 21 | """Interacts with and learns from the environment.""" 22 | 23 | def __init__(self, state_size, action_size, seed): 24 | """Initialize an Agent object. 25 | 26 | Params 27 | ====== 28 | state_size (int): dimension of each state 29 | action_size (int): dimension of each action 30 | seed (int): random seed 31 | """ 32 | self.state_size = state_size 33 | self.action_size = action_size 34 | self.seed = random.seed(seed) 35 | 36 | # Q-Network 37 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 38 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 39 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 40 | 41 | # Replay memory 42 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 43 | # Initialize time step (for updating every UPDATE_EVERY steps) 44 | self.t_step = 0 45 | 46 | def step(self, state, action, reward, next_state, done): 47 | # Save experience in replay memory 48 | self.memory.add(state, action, reward, next_state, done) 49 | 50 | # Learn every UPDATE_EVERY time steps. 51 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 52 | if self.t_step == 0: 53 | # If enough samples are available in memory, get random subset and learn 54 | if len(self.memory) > BATCH_SIZE: 55 | experiences = self.memory.sample() 56 | self.learn(experiences, GAMMA) 57 | 58 | def act(self, state, eps=0.): 59 | """Returns actions for given state as per current policy. 60 | 61 | Params 62 | ====== 63 | state (array_like): current state 64 | eps (float): epsilon, for epsilon-greedy action selection 65 | """ 66 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 67 | self.qnetwork_local.eval() 68 | with torch.no_grad(): 69 | action_values = self.qnetwork_local(state) #same as self.qnetwork_local.forward(state) 70 | self.qnetwork_local.train() 71 | 72 | # Epsilon-greedy action selection 73 | if random.random() > eps: 74 | return np.argmax(action_values.cpu().data.numpy()) 75 | else: 76 | return random.choice(np.arange(self.action_size)) 77 | 78 | def learn(self, experiences, gamma): 79 | """Update value parameters using given batch of experience tuples. 
80 | 81 | Params 82 | ====== 83 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 84 | gamma (float): discount factor 85 | """ 86 | states, actions, rewards, next_states, dones = experiences 87 | 88 | ## TODO: compute and minimize the loss 89 | #"*** YOUR CODE HERE ***" 90 | qs_local = self.qnetwork_local.forward(states) 91 | qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] 92 | qsa_local = qsa_local.reshape((BATCH_SIZE,1)) 93 | #print(qsa_local.shape) 94 | 95 | # # DQN Target 96 | # qs_target = self.qnetwork_target.forward(next_states) 97 | # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 98 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 99 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 100 | # TD_target = rewards + gamma * qsa_target 101 | # #print(qsa_target.shape, TD_target.shape, rewards.shape) 102 | 103 | # # Double DQN Target ver 1 104 | # qs_target = self.qnetwork_target.forward(next_states) 105 | # if random.random() > 0.5: 106 | # _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 107 | # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)] 108 | # else: 109 | # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 110 | # #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 111 | # ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 112 | 113 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 114 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 115 | # TD_target = rewards + gamma * qsa_target 116 | 117 | # Double DQN Target ver 2 (based upon double dqn paper) 118 | qs_target = self.qnetwork_target.forward(next_states) 119 | _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 120 | qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 121 | 122 | qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 123 | qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 124 | TD_target = rewards + gamma * qsa_target 125 | 126 | #print(qsa_target.shape, TD_target.shape, rewards.shape) 127 | 128 | # #Udacity's approach 129 | # # Get max predicted Q values (for next states) from target model 130 | # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) 131 | # # Compute Q targets for current states 132 | # TD_target = rewards + (gamma * Q_targets_next * (1 - dones)) 133 | # # Get expected Q values from local model 134 | # qsa_local = self.qnetwork_local(states).gather(1, actions) 135 | 136 | 137 | 138 | #diff = qsa_local - TD_target 139 | #loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar 140 | loss = F.mse_loss(qsa_local, TD_target) #much faster than the above loss function 141 | #print(loss) 142 | #minimize the loss 143 | self.optimizer.zero_grad() #clears the gradients 144 | loss.backward() 145 | self.optimizer.step() 146 | 147 | 148 | # ------------------- update target network ------------------- # 149 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 150 | 151 | def soft_update(self, local_model, 
target_model, tau): 152 | """Soft update model parameters. 153 | θ_target = τ*θ_local + (1 - τ)*θ_target 154 | 155 | Params 156 | ====== 157 | local_model (PyTorch model): weights will be copied from 158 | target_model (PyTorch model): weights will be copied to 159 | tau (float): interpolation parameter 160 | """ 161 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 162 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 163 | 164 | 165 | class ReplayBuffer: 166 | """Fixed-size buffer to store experience tuples.""" 167 | 168 | def __init__(self, action_size, buffer_size, batch_size, seed): 169 | """Initialize a ReplayBuffer object. 170 | 171 | Params 172 | ====== 173 | action_size (int): dimension of each action 174 | buffer_size (int): maximum size of buffer 175 | batch_size (int): size of each training batch 176 | seed (int): random seed 177 | """ 178 | self.action_size = action_size 179 | self.memory = deque(maxlen=buffer_size) 180 | self.batch_size = batch_size 181 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 182 | self.seed = random.seed(seed) 183 | 184 | def add(self, state, action, reward, next_state, done): 185 | """Add a new experience to memory.""" 186 | e = self.experience(state, action, reward, next_state, done) 187 | self.memory.append(e) 188 | 189 | def sample(self): 190 | """Randomly sample a batch of experiences from memory.""" 191 | experiences = random.sample(self.memory, k=self.batch_size) 192 | 193 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 194 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 195 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 196 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 197 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 198 | 199 | return (states, actions, rewards, next_states, dones) 200 | 201 | def __len__(self): 202 | """Return the current size of internal memory.""" 203 | return len(self.memory) -------------------------------------------------------------------------------- /P1_Navigation/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class QNetwork(nn.Module): 6 | """Actor (Policy) Model.""" 7 | 8 | def __init__(self, state_size, action_size, seed, hidden_layers = [64,64]): 9 | """Initialize parameters and build model. 
10 | Params 11 | ====== 12 | state_size (int): Dimension of each state 13 | action_size (int): Dimension of each action 14 | seed (int): Random seed 15 | """ 16 | super(QNetwork, self).__init__() 17 | self.seed = torch.manual_seed(seed) 18 | 19 | #"*** YOUR CODE HERE ***" 20 | self.fc1 = nn.Linear(state_size, hidden_layers[0]) 21 | self.relu1 = nn.ReLU() 22 | self.fc2 = nn.Linear(hidden_layers[0], hidden_layers[1]) 23 | self.relu2 = nn.ReLU() 24 | self.fc3 = nn.Linear(hidden_layers[1], action_size) 25 | 26 | def forward(self, state): 27 | """Build a network that maps state -> action values.""" 28 | state = self.fc1(state) 29 | state = self.relu1(state) 30 | state = self.fc2(state) 31 | state = self.relu2(state) 32 | state = self.fc3(state) 33 | return state 34 | 35 | -------------------------------------------------------------------------------- /P1_Navigation/visual_pixels/Navigation_Pixels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigation\n", 8 | "\n", 9 | "---\n", 10 | "\n", 11 | "Congratulations for completing the first project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893)! In this notebook, you will learn how to control an agent in a more challenging environment, where it can learn directly from raw pixels! **Note that this exercise is optional!**\n", 12 | "\n", 13 | "### 1. Start the Environment\n", 14 | "\n", 15 | "We begin by importing some necessary packages. If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/)." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from unityagents import UnityEnvironment\n", 25 | "import numpy as np\n", 26 | "import time\n", 27 | "from collections import deque\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import torch" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Next, we will start the environment! **_Before running the code cell below_**, change the `file_name` parameter to match the location of the Unity environment that you downloaded.\n", 37 | "\n", 38 | "- **Mac**: `\"path/to/VisualBanana.app\"`\n", 39 | "- **Windows** (x86): `\"path/to/VisualBanana_Windows_x86/Banana.exe\"`\n", 40 | "- **Windows** (x86_64): `\"path/to/VisualBanana_Windows_x86_64/Banana.exe\"`\n", 41 | "- **Linux** (x86): `\"path/to/VisualBanana_Linux/Banana.x86\"`\n", 42 | "- **Linux** (x86_64): `\"path/to/VisualBanana_Linux/Banana.x86_64\"`\n", 43 | "- **Linux** (x86, headless): `\"path/to/VisualBanana_Linux_NoVis/Banana.x86\"`\n", 44 | "- **Linux** (x86_64, headless): `\"path/to/VisualBanana_Linux_NoVis/Banana.x86_64\"`\n", 45 | "\n", 46 | "For instance, if you are using a Mac, then you downloaded `VisualBanana.app`. 
If this file is in the same folder as the notebook, then the line below should appear as follows:\n", 47 | "```\n", 48 | "env = UnityEnvironment(file_name=\"VisualBanana.app\")\n", 49 | "```" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "###env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", no_graphics=False)\n", 59 | "###env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", no_graphics=True)\n", 60 | "env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\") #suggested by Udacity" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# get the default brain\n", 77 | "brain_name = env.brain_names[0]\n", 78 | "brain = env.brains[brain_name]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### 2. Examine the State and Action Spaces\n", 93 | "\n", 94 | "The simulation contains a single agent that navigates a large environment. At each time step, it has four actions at its disposal:\n", 95 | "- `0` - walk forward \n", 96 | "- `1` - walk backward\n", 97 | "- `2` - turn left\n", 98 | "- `3` - turn right\n", 99 | "\n", 100 | "The environment state is an array of raw pixels with shape `(1, 84, 84, 3)`. *Note that this code differs from the notebook for the project, where we are grabbing **`visual_observations`** (the raw pixels) instead of **`vector_observations`**.* A reward of `+1` is provided for collecting a yellow banana, and a reward of `-1` is provided for collecting a blue banana. \n", 101 | "\n", 102 | "Run the code cell below to print some information about the environment." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# reset the environment\n", 112 | "env_info = env.reset(train_mode=True)[brain_name]\n", 113 | "\n", 114 | "# number of agents in the environment\n", 115 | "print('Number of agents:', len(env_info.agents))\n", 116 | "\n", 117 | "# number of actions\n", 118 | "action_size = brain.vector_action_space_size\n", 119 | "print('Number of actions:', action_size)\n", 120 | "\n", 121 | "# examine the state space\n", 122 | "state = env_info.visual_observations[0]\n", 123 | "\n", 124 | "print('States look like:')\n", 125 | "plt.imshow(np.squeeze(state, axis=0))\n", 126 | "plt.show()\n", 127 | "state_size = state.shape\n", 128 | "print('States have shape:', state.shape)\n", 129 | "#print(np.expand_dims(state, axis=4).shape) #this is unsqueeze\n", 130 | "state = state.reshape((-1,3,84,84))\n", 131 | "print('modified state is: ', state.shape)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### 3. 
Take Random Actions in the Environment\n", 139 | "\n", 140 | "In the next code cell, you will learn how to use the Python API to control the agent and receive feedback from the environment.\n", 141 | "\n", 142 | "Once this cell is executed, you will watch the agent's performance, if it selects an action (uniformly) at random with each time step. A window should pop up that allows you to observe the agent, as it moves through the environment. \n", 143 | "\n", 144 | "Of course, you'll have to change the code so that the agent is able to use its experience to gradually choose better actions when interacting with the environment!" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 154 | "state = env_info.visual_observations[0] # get the current state\n", 155 | "score = 0 # initialize the score\n", 156 | "while True:\n", 157 | " action = np.random.randint(action_size) # select an action\n", 158 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 159 | " next_state = env_info.visual_observations[0] # get the next state\n", 160 | " reward = env_info.rewards[0] # get the reward\n", 161 | " done = env_info.local_done[0] # see if episode has finished\n", 162 | " score += reward # update the score\n", 163 | " state = next_state # roll over the state to next time step\n", 164 | " #print(done)\n", 165 | " if done: # exit loop if episode finished\n", 166 | " break\n", 167 | " \n", 168 | "print(\"Score: {}\".format(score))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "When finished, you can close the environment." 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### 4. It's Your Turn!\n", 183 | "\n", 184 | "Now it's your turn to train your own agent to solve the environment! 
When training the environment, set `train_mode=True`, so that the line for resetting the environment looks like the following:\n", 185 | "```python\n", 186 | "env_info = env.reset(train_mode=True)[brain_name]\n", 187 | "```" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "scrolled": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "#Training the agent\n", 199 | "from unityagents import UnityEnvironment\n", 200 | "import numpy as np\n", 201 | "import time\n", 202 | "from collections import deque\n", 203 | "import matplotlib.pyplot as plt\n", 204 | "import torch\n", 205 | "import pickle\n", 206 | "from pixel_dqn_agent import Agent\n", 207 | "\n", 208 | "local_network_fn = './saved_agent/dqn_checkpoint_local.pth'\n", 209 | "target_network_fn = './saved_agent/dqn_checkpoint_target.pth'\n", 210 | "memory_buffer_fn = './saved_agent/memory_buffer'\n", 211 | "scores_fn = './saved_agent/scores.txt'\n", 212 | "#agent_fn = 'ddqn_checkpoint_agent.pth'\n", 213 | "def load_agent(agent): \n", 214 | " agent.qnetwork_local.load_state_dict(torch.load(local_network_fn, map_location=lambda storage, loc: storage))\n", 215 | " agent.qnetwork_target.load_state_dict(torch.load(target_network_fn, map_location=lambda storage, loc: storage))\n", 216 | " with open(scores_fn, \"rb\") as sf:\n", 217 | " agent.scores = pickle.load(sf)\n", 218 | " with open(memory_buffer_fn, \"rb\") as mf:\n", 219 | " agent.memory.memory = pickle.load(mf)\n", 220 | " #agent = Agent.load(agent_fn)\n", 221 | " return agent\n", 222 | "\n", 223 | "def save_agent(agent): \n", 224 | " torch.save(agent.qnetwork_local.state_dict(), local_network_fn)\n", 225 | " torch.save(agent.qnetwork_target.state_dict(), target_network_fn)\n", 226 | " with open(scores_fn, \"wb\") as sf:\n", 227 | " pickle.dump(agent.scores, sf, pickle.HIGHEST_PROTOCOL)\n", 228 | " with open(memory_buffer_fn, \"wb\") as mf:\n", 229 | " mem_to_save = deque(list(agent.memory.memory)[-50000:], maxlen=100000) #agent.memory.memory\n", 230 | " pickle.dump(mem_to_save, mf, pickle.HIGHEST_PROTOCOL)\n", 231 | " #agent.save(agent_fn)\n", 232 | " return None\n", 233 | "\n", 234 | "def train_agent():\n", 235 | " env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", seed=111)\n", 236 | " brain_name = env.brain_names[0]\n", 237 | " brain = env.brains[brain_name]\n", 238 | "\n", 239 | " final_eps = 0.01\n", 240 | " eps_start= 1.0 #0.01 #1.0\n", 241 | " agent = Agent(num_input_chnl=11, action_size=4, seed=0) #create a new agent\n", 242 | " #agent = load_agent(agent)\n", 243 | " \n", 244 | " def dqn(n_episodes=3000, max_t=1000, eps_start=eps_start, eps_end=final_eps, eps_decay=0.995):\n", 245 | " \"\"\"Deep Q-Learning.\n", 246 | "\n", 247 | " Params\n", 248 | " ======\n", 249 | " n_episodes (int): maximum number of training episodes\n", 250 | " max_t (int): maximum number of timesteps per episode\n", 251 | " eps_start (float): starting value of epsilon, for epsilon-greedy action selection\n", 252 | " eps_end (float): minimum value of epsilon\n", 253 | " eps_decay (float): multiplicative factor (per episode) for decreasing epsilon\n", 254 | " \"\"\"\n", 255 | " scores_window = deque(maxlen=100) # last 100 scores\n", 256 | " eps = eps_start # initialize epsilon\n", 257 | " for i_episode in range(1, n_episodes+1):\n", 258 | " env_info = env.reset(train_mode=True)[brain_name] # reset the environment\n", 259 | " state = env_info.visual_observations[0] # get the current state\n", 260 | " 
#print(type(state))\n", 261 | " state = state.reshape((-1,3,84,84))\n", 262 | " #state = np.expand_dims(state, axis=0)\n", 263 | " #print(state.shape)\n", 264 | " #state = torch.from_numpy(state)\n", 265 | " score = 0 # initialize the score\n", 266 | " for t in range(max_t): #this could also be while True instead\n", 267 | " aug_state = agent.augment_state(state) # augment the state\n", 268 | " action = agent.act(aug_state, eps) # select an action using e-greedy policy\n", 269 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 270 | " next_state = env_info.visual_observations[0] # get the next state\n", 271 | " next_state = next_state.reshape((-1,3,84,84))\n", 272 | " reward = env_info.rewards[0] # get the reward\n", 273 | " done = env_info.local_done[0] # see if episode has finished\n", 274 | " agent.step(state, action, reward, next_state, done, is_training=True) #add to experience buffer and do the learning\n", 275 | "\n", 276 | " score += reward # update the score\n", 277 | " state = next_state # roll over the state to next time step\n", 278 | " if done: # exit loop if episode finished\n", 279 | " break \n", 280 | "\n", 281 | " scores_window.append(score) # save most recent score\n", 282 | " agent.scores.append(score) # save most recent score\n", 283 | " eps = max(eps_end, eps_decay*eps) # decrease epsilon\n", 284 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end=\"\")\n", 285 | " if i_episode % 100 == 0:\n", 286 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))\n", 287 | " print(len(agent.memory.memory), agent.memory.memory.maxlen)\n", 288 | " if i_episode % 500 == 0: #save weights every 500 episodes\n", 289 | " save_agent(agent)\n", 290 | " if np.mean(scores_window)>=17.0:\n", 291 | " print('\\nEnvironment solved in {:d} episodes!\\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))\n", 292 | " save_agent(agent)\n", 293 | " break\n", 294 | " save_agent(agent) #save at the end\n", 295 | "\n", 296 | " return agent.scores\n", 297 | "\n", 298 | "\n", 299 | " strt = time.time()\n", 300 | " scores = dqn()\n", 301 | " print('\\nTraining Time is {}'.format(time.time()-strt))\n", 302 | " env.close()\n", 303 | "\n", 304 | "# # plot the scores\n", 305 | "# fig = plt.figure()\n", 306 | "# ax = fig.add_subplot(111)\n", 307 | "# plt.plot(np.arange(len(scores)), scores)\n", 308 | "# plt.ylabel('Score')\n", 309 | "# plt.xlabel('Episode #')\n", 310 | "# plt.show()\n", 311 | "\n", 312 | "\n", 313 | "#Train the agent\n", 314 | "print('Starting Agent Training:')\n", 315 | "train_agent()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 1, 328 | "metadata": { 329 | "scrolled": true 330 | }, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "Starting Agent Testing:\n" 337 | ] 338 | }, 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "INFO:unityagents:\n", 344 | "'Academy' started successfully!\n", 345 | "Unity Academy name: Academy\n", 346 | " Number of Brains: 1\n", 347 | " Number of External Brains : 1\n", 348 | " Lesson number : 0\n", 349 | " Reset Parameters :\n", 350 | "\t\t\n", 351 | "Unity brain name: BananaBrain\n", 352 | " Number of Visual Observations (per agent): 1\n", 353 | " Vector Observation space 
type: continuous\n", 354 | " Vector Observation space size (per agent): 0\n", 355 | " Number of stacked Vector Observation: 1\n", 356 | " Vector Action space type: discrete\n", 357 | " Vector Action space size (per agent): 4\n", 358 | " Vector Action descriptions: , , , \n" 359 | ] 360 | }, 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "Episode 1: 11.0\n", 366 | "Mean Score out of 1 episodes is 11.0\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "#Testing the agent\n", 372 | "from unityagents import UnityEnvironment\n", 373 | "import numpy as np\n", 374 | "import time\n", 375 | "from collections import deque\n", 376 | "import matplotlib.pyplot as plt\n", 377 | "import torch\n", 378 | "import pickle\n", 379 | "from pixel_dqn_agent import Agent\n", 380 | "\n", 381 | "local_network_fn = './saved_agent/dqn_checkpoint_local.pth'\n", 382 | "def test_agent(num_episodes=10):\n", 383 | " #env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\", seed=111)\n", 384 | " env = UnityEnvironment(file_name=\"./VisualBanana_Linux/Banana.x86_64\")\n", 385 | " brain_name = env.brain_names[0]\n", 386 | " brain = env.brains[brain_name]\n", 387 | "\n", 388 | " scores = []\n", 389 | " final_eps=0.01 \n", 390 | " agent = Agent(num_input_chnl=11, action_size=4, seed=0) #create a new agent\n", 391 | " agent.qnetwork_local.load_state_dict(torch.load(local_network_fn, map_location=lambda storage, loc: storage)) #load the weights\n", 392 | " \n", 393 | " for i_episode in range(1,num_episodes+1):\n", 394 | " env_info = env.reset(train_mode=False)[brain_name] # reset the environment\n", 395 | " state = env_info.visual_observations[0] # get the current state\n", 396 | " state = state.reshape((-1,3,84,84))\n", 397 | " score = 0 # initialize the score\n", 398 | " while True:\n", 399 | " aug_state = agent.augment_state(state) # augment the state\n", 400 | " action = agent.act(aug_state, final_eps) # select an action using e-greedy policy\n", 401 | " env_info = env.step(action)[brain_name] # send the action to the environment\n", 402 | " next_state = env_info.visual_observations[0] # get the next state\n", 403 | " next_state = next_state.reshape((-1,3,84,84))\n", 404 | " reward = env_info.rewards[0] # get the reward\n", 405 | " done = env_info.local_done[0] # see if episode has finished\n", 406 | " agent.step(state, action, reward, next_state, done, is_training=False) #only add to experience buffer and don't do learning\n", 407 | "\n", 408 | " score += reward # update the score\n", 409 | " state = next_state # roll over the state to next time step\n", 410 | " if done: # exit loop if episode finished\n", 411 | " scores.append(score)\n", 412 | " print('Episode {}: {}'.format(i_episode, score))\n", 413 | " break\n", 414 | " env.close()\n", 415 | " return np.mean(scores)\n", 416 | "\n", 417 | "print('Starting Agent Testing:')\n", 418 | "num_episodes=1 #100\n", 419 | "mean_score = test_agent(num_episodes)\n", 420 | "print(\"Mean Score out of {} episodes is {}\".format(num_episodes, mean_score))" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 3, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJztnXd8FVXax39PEgKhhkCAUEIQkCIoYARpKkUB0RV39xV1XbvYy77b4q6vdXVZdxXLqohr1wV7wSAKKNIUCb330CGhh5J+3j9m5mbuvTP3zsyddu99vp9PPpl75syc58ycOc8pz3kOCSHAMAzDMCleC8AwDMP4A1YIDMMwDABWCAzDMIwMKwSGYRgGACsEhmEYRoYVAsMwDAOAFQLDMAwjwwqBYRiGAcAKgWEYhpFJ81oAM7Rs2VLk5eV5LQbDMExcsXTp0oNCiOxo8eJKIeTl5aGoqMhrMRiGYeIKItphJB4PGTEMwzAAWCEwDMMwMqwQGIZhGACsEBiGYRgZVggMwzAMAFYIDMMwjAwrBIZhGAYAKwQmASivqsHHS3eDt4NlmNiIq4VpDKPFs7M2Ycq8bWiWUQ8X92zttTgME7dwD4GJe0rLKgAAZeVVHkvCMPENKwQmYeARI4aJDVYITNxDXgvAMAkCKwSGYRgGACsEhmEYRoYVApMw8BQCw8QGKwTGNNU1tbjg6e8xc81+r0WR8MEkwl8/W42CT1Z5LYZv5HCai5/9AZ8u2+21GDFTVVOLoU9/h2/W+uNbYoXAmObIqSrsPHwKD32+2mtRgvByYdr7i3di2pJdnqXvNzmcZnPJCfzvhyu9FiNmjpysxK7Dp/HQ52u8FgUAKwSGYeKMRFyR7pcssUJgLOOXQswwjD2wQmDiHvLDJALjGtwQcQ7HFQIRdSCi74loHRGtJaL75fAsIppFRJvl/82dloVJbLieYOKOQFvGH6XXjR5CNYDfCyF6AjgfwN1E1BNAAYA5QoiuAObIv5k4gLhBzniIP6rOxMRxhSCE2CeEWCYflwFYD6AdgCsAvC1HexvAOKdlYZxhz9HTOK7jWE7r3LHTVdh79HRQ2Ib9x22RpaZWYPOBsrDw8qoafLt2f2BCcvOBMtTUulO11NYKbNKQKVa0nqPXCCGweNsh7Dt22tAz3lJShqqa2ohxyqtqsK30hGEZjpysxIHj5YbjO0l5VQ22HzzptRiGcXUOgYjyAPQFsBhAayHEPvnUfgCafouJaAIRFRFRUWlpqStyMsZQPvXBE7/DmOfma8YZPPE7jJ40Lyhs5LM/YNDE7wK/C1ftw+jn5mPG6n2hlxtC3WP517cbcfGkedgaUoH0f3I2Jry7FK8v2I4N+4/j4knz8MKczZbSM8uU+dtwyaR5WLHrqK33vWRS8HP0A1+s2IvxU37CwL9/h4snzcNzszfpxt11+BRGPjsPf5+xIeI9f//RSgx/5gecrKg2JEPfJ2ZhwFNzTMntFPdPW45h/5qL8qoazfPK/Jdf5kVcUwhE1BjAJwAeEEIENQeF1GzTfCRCiClCiHwhRH52drYLkjJW2BOhpbr3WHBrTXFXrbBRbj3H3IoWwNIdRwAAJceD0zheLlUmq3Yfw76jkjx2V9B6rNotpbPniL2t+QMhefQD60N6est36j/jwycrAQBFOw5HvOeiLQcBABXVUk8insxOF2yWZK92qTcaK64oBCKqB0kZvC+E+FQOPkBEOfL5HAAlbsjC+BCPPnCeC3EAE6/SaNQU+UXVxpEiiFfcsDIiAK8DWC+EeFZ16ksAN8jHNwD4wmlZGHtwqh51y3xUuDwtmcz1mBGlGy0KhSiERHqcfmuUuLGF5mAAvwWwmohWyGF/ATARwIdEdAuAHQCuckEWJgEhnWMz17mB3z7+eCHw3BJJE/gUxxWCEGIB9L+9EU6nzyQPbrf8mXCceAOh+iARe1x+yRKvVGY8J9aPwWzL2+0KJRErMD2cmPBN5DkEv3UaWSEwuhw6UYHHp69DtY6duN0fv1Kxf7Z8N977aQeemrEetRrWGVU1tXhs+tqAlYqmbDpqRqCugiaVJtl/rBxPFq4ztDZh1+FTmPj1hqj5n75yb0QX4St3HcVr87YFhSl5O3QiNguiNxduR1FxuPXOmj3H8MrcrSivqsGfPl4ZZp5rhadnbkBeQSGOna7C9JXGTYeNlh/lNSmvRv1u1+w5Zugeby7cjqUqa6apP+/EQtl6ianDjTkEJk555Mu1+GrVPvTv1Byje+UEwsnmwfDQeuF3H9S5NR7ZozX6d8oKOj9j9T68ubAYx05X4dmr+tgiwx8/Xon5mw9iePfWGNi5RcS4d76/FGv2HMe4vm3RvU1T3Xj3Tl0OABh9VhvN81e8tBAAcNsFZwTCZq07gDcXFuPwyUo8f3Vfs9kI8Nj0dQCA4oljg8Ive3EBAKBXu6b4sGg3TlbW4KVr+1lOBwBenrsVADBp1ibsd2BBWGDISEOBXPbigrA8ahH6PB78dHXQb6/xiykt9xAYXaprZKuOkLLqZuHVGiZQgkJb8+qoehZLerIrq2WNzENUVWs/FztQ8qQ8e6dQ0jl+WnuFuRWcGtJRGiDK7X1Sd9qC3Y2rWGGFwETFy+9P6+MPrbS1Kv9IFbtyxuqnqHzDiVAx2VkhaT0PQ/ePEiclRf/+jL2wQmBM41SrRuuukSr2WKVQZ8PJykbJg5/agk5k17EeAhJ3UtlvsEJgoqJXkdn1eUYcprF2ytZrEhk7lZRZ7wxGowd6ZOZu7yuiDbP6JW+sEJioeDpkZDKukc6L1sdppdMT1+seHBFd47nacNeUwByCc3M3jAQrBMYybgyBRBomUNI3VZkH3c5aDkInOY1fZ08cOwgMY9mYnlMVtSJinPiH0yReRGeFkKD0fvQbXPzsD0FhI56Zi3Me+9a2NEIL+aRZm5BXUIjyqhrU1grkFRSaup9W5fTb13+2LN91/1msKYPWx6lVmY1+bh7OengmAMmvfV5BIfIKCm1RhJ8u220oXpe/zMAV/15gOZ1Ln5+PvIJCrN4dbK9/81tFAMJV4hNfrTP03k5XSs/j4S/WBMKmLdkVFu+HTaW4/MU6+a969UfkFRRiwFOzA2HRfRlJ/1fuOoq8gkIs2hq8fiCvoDDg5fb7jeZ8ZHZ6sBDjZPNfAFi87RDyCgqxfKd0v0MnKpBXUIjHp0vPZeaa6Osshj8zF30eD/7Ozn70W1z16o8ApP0a8goK8dj0tej3xCwAUvl79Mu1pr8Zu2GFkKCUlVdjc0nwoqOtpSdxzISZoV7rUe8DfvvHYgBSZVFjornoSMtSAAs0Fh6ph3m08qe2WNqwvwwnKyU/9naaZwLA5yv2GopXXSuwcrexxVdarNsnuaNevP2QofivL9huKJ5Sjt75cUfUuKtVi8d+3i4tDjPjulvpkf24TcqDVqWvLDL7du0Bw/cFpLKndoM+d5O058qirVJailv3NxZKz6Vwtf5CQ4VtpSdx9FR4eVHyrtzzzYXFQeffWlQMr2GFwOgSS0XthUWNEMY8pkbKl1vzAn
6xONKzGIs2CWrHUJPZ8hVpHYJzw1UU8juxYYXA2IZXk31WKye31iFo2+dbTNwiVitfPdwUPzQtN4tZ6HuyI22td88rlRnf43alZXR9QyzfjnRtpInq6DJYfy6kceRP/FE9SdSZnfpJqsSEFQKjS7SK165GjdXbRFIgRu7ppdcAv7ksCCX6kJF98ke7VUBJG3JdYa/SSCEeMmKYmHDNdNKCRrKzZx5LizXaI7K7Nax3Pz05opl4eqvP3OspOJFPbVcr/oC9nSYQ20pP4OCJyjDvoFaJ9jGEWiypK+jQAr6k+DDmrC9Bv9zMoPDqmlp8tmwPAMld9E2D87TTOlWFRVsPYkzvHM3zgDGTw5lr9yM1VcrYNyqLlN1HTke+kNSH0o+fth1CyfEKNG9UL3CusroWT81Yj6v7dwiEhVqcVNfUYs6GcFlnrN6HHYdOaSY/Y/U+DOnaEmXl1dheehJDuraMLG8I01fuw4QLOodnS+cdK+s/9h49jbd/LEb/vCxsPFCGiqpajOjRCsdPV5tKf+Wuo7qutoUAPlm6G1U1taiorsX8zaV44Zq+aJiehpMV1dh4oAwAsO9YeSB+KAu3HETHFg1RWhZuvaR2e22W0Ofz5cq9eG58H6SkSCdW7DqKWiGw58hp1AqBUSGebRUrNYVlO49g8bZwecrK657nzkOnkNuioWWZY4EVQgIx/Blp3YHdLn3taFX/z+QfNcPfWlQccJm8YX8Zrn1tsWa8e6ctx7xNpVjw52GBsLphC+n/rHUHMCiK62oAKFwVbkuumAIaQUn2qRkbws69MGcz3lpUHGRC+HPIvgR/K1wfdt2WkhO46/1lgd+nQyqSu95fhot7tsaiLQdxsrLG9DtevecY3lpozKQUAGau2Y9xfdth1HPzUFZejVd/qNu34fk5m02lDdS5+g5GKlgrdh0NMv0EgP/7fC2eueocPPR53ToHxexUi5+LD4c9Z4VfvaJd9oyg1ZqfumQnfjOgIwAErWEAgOsHdox4v1++vChqmhf883vP3HLzkBHjKaUhG8FsLdFuRe4+LLWcy6u0N+vxCyVl0fcD2H0kvBdwqjK4xa01ZLPnyOmwFqcZ9mva/mt3EZTen7rl6ib7j0sKeo9Gz81NgxytHtThE/obM+0/Zv9+EG7CCoGJilnTOwLZ/tHWyeCX0VbzRDJX9YnVoW9QWuZacx9uWhuZnULwua1AVFghMFHxQ2Vl1n+QkzLH+TcfhO5q9ETKZAxoWVP54HNwDFYIjKeEjtFG+9i0zrteeTmQYGi+NBcv2Z6qf/FLb8rsqzayjsXPsEJgomL0oxBBx/Z+tXX76hqrEPzcwjXyZNyq9Lx8TFbz6OpKZRfT8gOsEJioeLmnsoLZ1apeDXNZruRCLtR2vGc/flWcflm45xc53ILNThOc4c/MxcOX9cRF3Vq5mu7ny/dEjdPrkW9woiLYikVL2Yx89oegcOUb/Wz5HvzinLZBcRUvlde9Xme+Gsl9dKi74X3HTuO8J2dj8nX9dK9ZGWIiGStXvrwwzLpq7sbSMNkUz6UA8Pj0dYFjoy6TJ/+wNSzsm7UHkFdQiAV/Hob2zets33ccOoX+T84Oi+8WgR6hxrmPlxpzHf787M24Z3gXw2ne8tYSfCevZfnnNxux79hp/KSxZkBZN/H7j1ZGvF+P/5tpOO1QTlfWICM9FQBw39TlyGvZCP978ZmW72cU7iEkONtKT+LhL9a6k5jq6/3zJ6ujRg9VBnpsKTkRMIMMHTL662ero7aczbiP/nTZHpSWVeDGN5cEhRsdG7bSoFy+8yiOmzTvfMPEmgIjhCrw/y7eiRKNRV5uUTeHYL2rN2n2JlPu3udsKAkqW+/9tBNbdMyg9ZSB+v2frrJuIry5pCxw/OXKvXjBwtoPK7BCSAIsewO1uB2lE2aBVncpc5tI8vl58CF0aMTrkRK7kndieDNS+bbruXk1Oc0KIQmwWrT0viUv6uQUnTkEn+sHTfzi6jgSblRHhibXHZfCn3ilkFkhMDFht1mg3qWBhUoeWd+w62V3satH6MhmfBF7gfbU5KFeVt3CcYVARG8QUQkRrVGFPUpEe4hohfx3qdNyJDNWLSUMDRlZurN59GQRwuaN4h3MUTxZrHgta/w8KWdI8aip7kaybwEYrRE+SQjRR/6b4YIcSYvbH5cTVap6HUJwWja7iZZvF1YhcgfBVezSR3EwOqdJws4hCCHmAbDuf5YxRGW1vtM3o9YiQggs3XE4zEFbTchXVav6ffhknaMvJfTAcesOvvQ+4L2y07DSE+UoUrkzPnyyMkiGWFHMVtXWKVtKyrArmntsma80PKmq2XX4lC8d9P207RCOqJ6jUQuwWDimsRG9wtbSkwBi18OhTgPX7DmGnTouxo1yvNy45ZJVSssqsGjrwTD5ncbLdQj3ENH1AIoA/F4IcUQrEhFNADABAHJzc10UL7548FN9M0+jH/cHS3ahQL5P8cSxAZO7F+ZsDrL3f/n7Onv28zRs1cc8P99Qela4+a2ioN9VNQIzVu93LD0AGPnsPMNxo5kaDn36+1jFcYT5mw+i7xOzXE3z1neKdM9tP3hS2scgxib+hf+cG/T7she116Ro7aOgx5sLi/VP2tSwV9bR9Mhpas8NDeLVpPIrADoD6ANgH4Bn9CIKIaYIIfKFEPnZ2dluyRd3LNhSGvM91AufgLoNSUJtsZfv1NTdDGMrepsFOUGZC61+K6wP+SadxhOFIIQ4IISoEULUAngNQH8v5GCC8cqyQQ1b8zAKRG4aLdhT9r3/gmLDE4VAROp9EK8EsEYvLmMMOybPUlOMFWe9pOLBvt5L+PmYhx+Zuzg+h0BEUwFcBKAlEe0G8AiAi4ioD6S6pRjA7U7LwUTHqEJgmESDS76E4wpBCHGNRvDrTqebbNjR4/XBiBG3CJkAbppe+qHs+wFeqcwESOWvwlFY15kn3uaUvF7QFyvs/jqOeWXuVgzrno3crIY4oNpA/devLAqL+9q8bWFhNbUCT8/cgE4tG6FxgzS8PLfOnPTJwnVBG6wfL6/C87M3Y+zZOVhlwnsoU8cnBt02M3Ws2eOOlc0ymyznpq/ca8t9vIIVQhzzj5kbMGnWJtw9LNjne9GO8ML95Iz1YWHzNpXiVQ1FAQCvzQ92rzxp1ia8ubAYry+w1+1yMvHtugNeixBXuNnY/t0Hkfc2SBZ4yChOUSxWKmtqUV1rbeVrTa3x7nh1TfS48dW5ZxgmFFYIcUqiTr4maLYYJi5ghRCn1CaqRmAYxjNYIcQpijogYhtqhmHsgRVCnGJHB8GXFnLc8WEYz2CFEKco9tl+rNMZxg7i3aY/HmGz0zgg/2+z0aJROr753QWBsG4PzQQA1Arghe+2mL5nXkEhcrMaGo5v5Ns8VRnZ9bMRKmv8t1cA4w33TV3utQhJByuEOODgiQocPGHcX7tRdh427l6Y57AZJvHhISOGYZg4wA1vuawQGEPwcC7DeIuZhaRWYYXAMAwTB4Tube4ErBAYhmHiAIseakzBCoFhGCYOsOqzzAysEHzGrHUHcPRUpea5o6cqM
[... base64-encoded PNG image data elided (score vs. episode plot rendered by the cell below) ...]\n", 431 | "text/plain": [ 432 | "
" 433 | ] 434 | }, 435 | "metadata": {}, 436 | "output_type": "display_data" 437 | } 438 | ], 439 | "source": [ 440 | "#Plot the learning behavior\n", 441 | "import pickle\n", 442 | "import matplotlib.pyplot as plt\n", 443 | "\n", 444 | "with open('./saved_agent/scores.txt', 'rb') as f:\n", 445 | " scores = pickle.load(f)\n", 446 | "\n", 447 | "plt.plot(scores)\n", 448 | "plt.ylabel('Score')\n", 449 | "plt.xlabel('Episode #')\n", 450 | "plt.show()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.6.2" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /P1_Navigation/visual_pixels/pixel_dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | 5 | from pixel_model import QNetwork 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | BUFFER_SIZE = int(1e5) # replay buffer size 12 | BATCH_SIZE = 32 # minibatch size 13 | GAMMA = 0.99 # discount factor 14 | TAU = 1e-3 # for soft update of target parameters 15 | LR = 5e-4 # learning rate 16 | UPDATE_EVERY = 4 # how often to update the network 17 | REGULARIZATION = 1e-4 # regularization parameter 18 | 19 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 20 | 21 | class Agent(): 22 | """Interacts with and learns from the environment.""" 23 | 24 | def __init__(self, num_input_chnl, action_size, seed): 25 | """Initialize an Agent object. 26 | 27 | Params 28 | ====== 29 | num_input_chnl (int): number of input channels 30 | action_size (int): dimension of each action 31 | seed (int): random seed 32 | """ 33 | self.num_input_chnl = num_input_chnl 34 | self.action_size = action_size 35 | self.seed = seed 36 | random.seed(seed) #returns None 37 | 38 | # Q-Network 39 | self.qnetwork_local = QNetwork(num_input_chnl, action_size, seed).to(device) 40 | self.qnetwork_target = QNetwork(num_input_chnl, action_size, seed).to(device) 41 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR, weight_decay=REGULARIZATION) 42 | 43 | # Replay memory 44 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 45 | # Initialize time step (for updating every UPDATE_EVERY steps) 46 | self.t_step = 0 47 | # additional parameters (not used for now) 48 | self.episode = 0 49 | self.scores = [] 50 | 51 | def step(self, state, action, reward, next_state, done, is_training=True): 52 | # Save experience in replay memory 53 | self.memory.add(state, action, reward, next_state, done) 54 | 55 | # Learn every UPDATE_EVERY time steps. 
56 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 57 | if self.t_step == 0: 58 | # If enough samples are available in memory and in training mode, then get random subset and learn 59 | if len(self.memory) > BATCH_SIZE and is_training == True: 60 | experiences = self.memory.sample_augmented_experience() #self.memory.sample_old() 61 | self.learn(experiences, GAMMA) 62 | 63 | 64 | def augment_state(self, state): 65 | # Augment the state to include previous observations and actions 66 | input_image_shape = self.memory.input_image_shape 67 | if len(self.memory) >= 2: 68 | prev_idx = len(self.memory)-1 69 | prev_prev_idx = prev_idx-1 70 | prev_e = self.memory.memory[prev_idx] 71 | prev_prev_e = self.memory.memory[prev_prev_idx] 72 | 73 | #e.state and e.next_state is in Nx3xHxW format (augment state in the C dimension) 74 | prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*prev_e.action 75 | prev_prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*prev_prev_e.action 76 | aug_state = np.concatenate((prev_prev_e.state, prev_prev_e_a, prev_e.state, prev_e_a, state), axis=1) 77 | else: 78 | #e.state and e.next_state is in Nx3xHxW format (augment state in the C dimension) 79 | initial_action = 0 80 | prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*initial_action 81 | prev_prev_e_a = np.ones((1,1,input_image_shape[0],input_image_shape[1]))*initial_action 82 | aug_state = np.concatenate((state, prev_prev_e_a, state, prev_e_a, state), axis=1) 83 | 84 | return aug_state 85 | 86 | 87 | def act(self, state, eps=0.): 88 | """Returns actions for given state as per current policy. 89 | 90 | Params 91 | ====== 92 | state (array_like): current state 93 | eps (float): epsilon, for epsilon-greedy action selection 94 | """ 95 | #state = torch.from_numpy(state).float().unsqueeze(0).to(device) 96 | state = torch.from_numpy(state).float().to(device) 97 | #print(state.shape) 98 | 99 | self.qnetwork_local.eval() 100 | with torch.no_grad(): 101 | action_values = self.qnetwork_local(state) #same as self.qnetwork_local.forward(state) 102 | self.qnetwork_local.train() 103 | 104 | # Epsilon-greedy action selection 105 | if random.random() > eps: 106 | return np.argmax(action_values.cpu().data.numpy()) 107 | else: 108 | return random.choice(np.arange(self.action_size)) 109 | 110 | def learn(self, experiences, gamma): 111 | """Update value parameters using given batch of experience tuples. 
112 | 113 | Params 114 | ====== 115 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 116 | gamma (float): discount factor 117 | """ 118 | states, actions, rewards, next_states, dones = experiences 119 | #print(states.shape, next_states.shape) 120 | #print(torch.sum(next_states[0,:,:,:]==states[1,:,:,:])) 121 | 122 | ## TODO: compute and minimize the loss 123 | #"*** YOUR CODE HERE ***" 124 | qs_local = self.qnetwork_local.forward(states) 125 | qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)] 126 | qsa_local = qsa_local.reshape((BATCH_SIZE,1)) 127 | #print(qsa_local.shape) 128 | 129 | # DQN Target 130 | qs_target = self.qnetwork_target.forward(next_states) 131 | qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 132 | qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 133 | qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 134 | TD_target = rewards + gamma * qsa_target 135 | #print(qsa_target.shape, TD_target.shape, rewards.shape) 136 | 137 | # # Double DQN Target ver 1 138 | # qs_target = self.qnetwork_target.forward(next_states) 139 | # if random.random() > 0.5: 140 | # _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning) 141 | # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)] 142 | # else: 143 | # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 144 | # #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 145 | # ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 146 | 147 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 148 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 149 | # TD_target = rewards + gamma * qsa_target 150 | 151 | # # Double DQN Target ver 2 (based upon double dqn paper. Use this version, it's better.) 
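# # (In this version the local network selects the greedy action via argmax and the target network evaluates that action's value, decoupling action selection from evaluation to reduce Q-value overestimation.)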
152 | # qs_target = self.qnetwork_target.forward(next_states) 153 | # _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning) 154 | # qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)] 155 | 156 | # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete 157 | # qsa_target = qsa_target.reshape((BATCH_SIZE,1)) 158 | # TD_target = rewards + gamma * qsa_target 159 | # #print(qsa_target.shape, TD_target.shape, rewards.shape) 160 | 161 | # #Udacity's approach 162 | # # Get max predicted Q values (for next states) from target model 163 | # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) 164 | # # Compute Q targets for current states 165 | # TD_target = rewards + (gamma * Q_targets_next * (1 - dones)) 166 | # # Get expected Q values from local model 167 | # qsa_local = self.qnetwork_local(states).gather(1, actions) 168 | 169 | 170 | 171 | #diff = qsa_local - TD_target 172 | #loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar 173 | loss = F.mse_loss(qsa_local, TD_target) #much faster than the above loss function 174 | #print(loss) 175 | #minimize the loss 176 | self.optimizer.zero_grad() #clears the gradients 177 | loss.backward() 178 | self.optimizer.step() 179 | 180 | 181 | # ------------------- update target network ------------------- # 182 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 183 | 184 | def soft_update(self, local_model, target_model, tau): 185 | """Soft update model parameters. 186 | θ_target = τ*θ_local + (1 - τ)*θ_target 187 | 188 | Params 189 | ====== 190 | local_model (PyTorch model): weights will be copied from 191 | target_model (PyTorch model): weights will be copied to 192 | tau (float): interpolation parameter 193 | """ 194 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 195 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 196 | 197 | @classmethod 198 | def load(cls, path): 199 | checkpoint = torch.load(path, map_location=lambda storage, loc: storage) #helps loading model when testing on local machine with no gpu 200 | my_agent = cls(checkpoint['num_input_chnl'], checkpoint['action_size'], checkpoint['seed']) 201 | my_agent.qnetwork_local.load_state_dict(checkpoint['local_state_dict']) 202 | my_agent.qnetwork_target.load_state_dict(checkpoint['target_state_dict']) 203 | my_agent.memory.memory = checkpoint['memory'] 204 | my_agent.episode = checkpoint['episode'] 205 | my_agent.scores = checkpoint['scores'] 206 | return my_agent 207 | 208 | 209 | def save(self, path): 210 | checkpoint = { 211 | 'num_input_chnl': self.num_input_chnl, 212 | 'action_size': self.action_size, 213 | 'seed': self.seed, 214 | 'local_state_dict': self.qnetwork_local.state_dict(), 215 | 'target_state_dict': self.qnetwork_target.state_dict(), 216 | 'memory': self.memory.memory, 217 | 'episode': self.episode, 218 | 'scores': self.scores 219 | } 220 | torch.save(checkpoint, path) 221 | 222 | Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 223 | class ReplayBuffer: 224 | """Fixed-size buffer to store experience tuples.""" 225 | 226 | def __init__(self, action_size, buffer_size, batch_size, seed): 227 | """Initialize a ReplayBuffer object. 
228 | 229 | Params 230 | ====== 231 | action_size (int): dimension of each action 232 | buffer_size (int): maximum size of buffer 233 | batch_size (int): size of each training batch 234 | seed (int): random seed 235 | """ 236 | self.action_size = action_size 237 | self.memory = deque(maxlen=buffer_size) 238 | self.batch_size = batch_size 239 | ###self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) #doesn't like tuple to be defined inside class when using pickle 240 | self.experience = Experience 241 | self.seed = seed 242 | random.seed(seed) #returns None 243 | self.input_image_shape = (84,84) 244 | 245 | def add(self, state, action, reward, next_state, done): 246 | """Add a new experience to memory.""" 247 | e = self.experience(state, action, reward, next_state, done) 248 | self.memory.append(e) 249 | 250 | def sample_old(self): 251 | """Randomly sample a batch of experiences from memory.""" 252 | experiences = random.sample(self.memory, k=self.batch_size) 253 | 254 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 255 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 256 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 257 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 258 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 259 | 260 | return (states, actions, rewards, next_states, dones) 261 | 262 | def sample_augmented_experience(self): 263 | """Randomly sample a batch of experiences from memory.""" 264 | #Note: the experiences are store in the memory in chronoogical order 265 | 266 | #experiences = list(self.memory)[0:self.batch_size] #get experiences in order 267 | 268 | aug_states = [] #augment state 269 | actions = [] 270 | rewards = [] 271 | aug_next_states = [] #augment next state 272 | dones = [] 273 | while len(aug_states) < self.batch_size: 274 | idx = random.sample(range(len(self.memory)), k=1)[0] 275 | #idx = 3+len(aug_states) #take experiences in order and in agent.step make sure 'len(self.memory) > BATCH_SIZE+5' 276 | e = self.memory[idx] 277 | if e is None or (idx-2) < 0 or (idx+1) >= len(self.memory): 278 | continue 279 | else: 280 | prev_e = self.memory[idx-1] 281 | prev_prev_e = self.memory[idx-2] 282 | next_e = self.memory[idx+1] 283 | 284 | #e.state and e.next_state is in Nx3xHxW format (augment state in the C dimension) 285 | prev_e_a = np.ones((1,1,self.input_image_shape[0],self.input_image_shape[1]))*prev_e.action 286 | prev_prev_e_a = np.ones((1,1,self.input_image_shape[0],self.input_image_shape[1]))*prev_prev_e.action 287 | aug_state = np.concatenate((prev_prev_e.state, prev_prev_e_a, prev_e.state, prev_e_a, e.state), axis=1) 288 | aug_states.append(aug_state) 289 | actions.append(e.action) 290 | rewards.append(e.reward) 291 | e_a = np.ones((1,1,self.input_image_shape[0],self.input_image_shape[1]))*e.action 292 | aug_next_state = np.concatenate((prev_e.state, prev_e_a, e.state, e_a, next_e.state), axis=1) 293 | aug_next_states.append(aug_next_state) 294 | dones.append(e.done) 295 | 296 | #augment state is of shape Nx11x84x84 297 | states = torch.from_numpy(np.vstack([s for s in aug_states])).float().to(device) 298 | actions = torch.from_numpy(np.vstack([a for a in actions])).long().to(device) 299 | 
rewards = torch.from_numpy(np.vstack([r for r in rewards])).float().to(device) 300 | next_states = torch.from_numpy(np.vstack([ns for ns in aug_next_states])).float().to(device) 301 | dones = torch.from_numpy(np.vstack([d for d in dones]).astype(np.uint8)).float().to(device) 302 | 303 | return (states, actions, rewards, next_states, dones) 304 | 305 | 306 | def __len__(self): 307 | """Return the current size of internal memory.""" 308 | return len(self.memory) -------------------------------------------------------------------------------- /P1_Navigation/visual_pixels/pixel_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class QNetwork(nn.Module): 6 | """Actor (Policy) Model.""" 7 | 8 | def __init__(self, num_input_chnl, action_size, seed, num_filters = [16,32], fc_layers=[64,64]): 9 | """Initialize parameters and build model. 10 | Params 11 | ====== 12 | num_input_chnl (int): Number of input channels 13 | action_size (int): Dimension of each action 14 | seed (int): Random seed 15 | """ 16 | super(QNetwork, self).__init__() 17 | self.seed = torch.manual_seed(seed) 18 | 19 | self.conv1 = nn.Conv2d(num_input_chnl, num_filters[0], kernel_size=(3,3), stride=1, padding=(1,1)) 20 | self.conv1bnorm = nn.BatchNorm2d(num_filters[0]) 21 | self.conv1relu = nn.ReLU() 22 | self.conv1maxp = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2)) 23 | #self.conv2d_1 = [self.conv1, self.bnorm1, self.relu1, self.maxp1] 24 | 25 | self.conv2 = nn.Conv2d(num_filters[0], num_filters[1], kernel_size=(3,3), stride=1, padding=(1,1)) 26 | self.conv2bnorm = nn.BatchNorm2d(num_filters[1]) 27 | self.conv2relu = nn.ReLU() 28 | self.conv2maxp = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2)) 29 | 30 | self.fc1 = nn.Linear(num_filters[1]*21*21, fc_layers[0]) 31 | self.fc1bnorm = nn.BatchNorm1d(fc_layers[0]) 32 | self.fc1relu = nn.ReLU() 33 | 34 | self.fc2 = nn.Linear(fc_layers[0], fc_layers[1]) 35 | self.fc2bnorm = nn.BatchNorm1d(fc_layers[1]) 36 | self.fc2relu = nn.ReLU() 37 | 38 | self.fc3 = nn.Linear(fc_layers[1], action_size) 39 | 40 | def forward(self, state): 41 | """Build a network that maps state -> action values.""" 42 | 43 | # for conv_1 in self.conv2d_1: 44 | # state = conv_1(state) 45 | 46 | state = self.conv1(state) 47 | state = self.conv1bnorm(state) 48 | state = self.conv1relu(state) 49 | state = self.conv1maxp(state) 50 | 51 | state = self.conv2(state) 52 | state = self.conv2bnorm(state) 53 | state = self.conv2relu(state) 54 | state = self.conv2maxp(state) 55 | 56 | #print(state.shape) #state is of shape Nx32x21x21 57 | state = state.reshape((-1,32*21*21)) #reshape the output of conv2 before feeding into fc1 layer 58 | 59 | state = self.fc1(state) 60 | state = self.fc1bnorm(state) 61 | state = self.fc1relu(state) 62 | 63 | state = self.fc2(state) 64 | state = self.fc2bnorm(state) 65 | state = self.fc2relu(state) 66 | 67 | state = self.fc3(state) 68 | 69 | return state 70 | 71 | ''' 72 | Note: when training, do model_name.train() to properly update batchnorm variables. 73 | And during inference, do model_name.eval() to us the batch norm statistics from training time. 74 | The dqn_agent's act method already handles this. 75 | 76 | To speed up inference turn off gradients like this: 77 | with torch.no_grad(): 78 | action = model.forward(state) 79 | 80 | ''' 81 | 82 | # If it doesn't work, maybe remove batchnorm. 
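# Hypothetical usage sketch (not part of the original module): a quick shape check for QNetwork,
# assuming the 84x84 frames and the 11-channel augmented state (3 RGB frames + 2 constant action
# planes) built by pixel_dqn_agent.py, and a 4-action environment. Two 2x2 max-pools reduce
# 84 -> 42 -> 21, which is why fc1 expects 32*21*21 = 14112 inputs.
if __name__ == "__main__":
    net = QNetwork(num_input_chnl=11, action_size=4, seed=0)
    net.eval()                           # use running batch-norm statistics (see note above)
    dummy = torch.randn(4, 11, 84, 84)   # (N, C, H, W) batch of augmented pixel observations
    with torch.no_grad():
        q_values = net(dummy)
    print(q_values.shape)                # expected: torch.Size([4, 4])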
83 | 84 | -------------------------------------------------------------------------------- /P2_Continuous_Actions/README.md: -------------------------------------------------------------------------------- 1 | [//]: # (Image References) 2 | 3 | [image1]: https://user-images.githubusercontent.com/10624937/43851024-320ba930-9aff-11e8-8493-ee547c6af349.gif "Trained Agent" 4 | [image2]: https://user-images.githubusercontent.com/10624937/43851646-d899bf20-9b00-11e8-858c-29b5c2c94ccc.png "Crawler" 5 | 6 | Project Report: https://medium.com/@amitp-ai/policy-gradients-1edbbbc8de6b 7 | 8 | # Project 2: Continuous Control 9 | 10 | ### Introduction 11 | 12 | For this project, you will work with the [Reacher](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#reacher) environment. 13 | 14 | ![Trained Agent][image1] 15 | 16 | In this environment, a double-jointed arm can move to target locations. A reward of +0.1 is provided for each step that the agent's hand is in the goal location. Thus, the goal of your agent is to maintain its position at the target location for as many time steps as possible. 17 | 18 | The observation space consists of 33 variables corresponding to position, rotation, velocity, and angular velocities of the arm. Each action is a vector with four numbers, corresponding to torque applicable to two joints. Every entry in the action vector should be a number between -1 and 1. 19 | 20 | ### Distributed Training 21 | 22 | For this project, we will provide you with two separate versions of the Unity environment: 23 | - The first version contains a single agent. 24 | - The second version contains 20 identical agents, each with its own copy of the environment. 25 | 26 | The second version is useful for algorithms like [PPO](https://arxiv.org/pdf/1707.06347.pdf), [A3C](https://arxiv.org/pdf/1602.01783.pdf), and [D4PG](https://openreview.net/pdf?id=SyZipzbCb) that use multiple (non-interacting, parallel) copies of the same agent to distribute the task of gathering experience. 27 | 28 | ### Solving the Environment 29 | 30 | Note that your project submission need only solve one of the two versions of the environment. 31 | 32 | #### Option 1: Solve the First Version 33 | 34 | The task is episodic, and in order to solve the environment, your agent must get an average score of +30 over 100 consecutive episodes. 35 | 36 | #### Option 2: Solve the Second Version 37 | 38 | The barrier for solving the second version of the environment is slightly different, to take into account the presence of many agents. In particular, your agents must get an average score of +30 (over 100 consecutive episodes, and over all agents). Specifically, 39 | - After each episode, we add up the rewards that each agent received (without discounting), to get a score for each agent. This yields 20 (potentially different) scores. We then take the average of these 20 scores. 40 | - This yields an **average score** for each episode (where the average is over all 20 agents). 41 | 42 | The environment is considered solved, when the average (over 100 episodes) of those average scores is at least +30. 43 | 44 | ### Getting Started 45 | 46 | 1. Download the environment from one of the links below. 
You need only select the environment that matches your operating system: 47 | 48 | - **_Version 1: One (1) Agent_** 49 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Linux.zip) 50 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher.app.zip) 51 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Windows_x86.zip) 52 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Windows_x86_64.zip) 53 | 54 | - **_Version 2: Twenty (20) Agents_** 55 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Linux.zip) 56 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher.app.zip) 57 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Windows_x86.zip) 58 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Windows_x86_64.zip) 59 | 60 | (_For Windows users_) Check out [this link](https://support.microsoft.com/en-us/help/827218/how-to-determine-whether-a-computer-is-running-a-32-bit-version-or-64) if you need help with determining if your computer is running a 32-bit version or 64-bit version of the Windows operating system. 61 | 62 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Linux_NoVis.zip) (version 1) or [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/Reacher_Linux_NoVis.zip) (version 2) to obtain the "headless" version of the environment. You will **not** be able to watch the agent without enabling a virtual screen, but you will be able to train the agent. (_To watch the agent, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 63 | 64 | 2. Place the file in the DRLND GitHub repository, in the `p2_continuous-control/` folder, and unzip (or decompress) the file. 65 | 66 | ### Instructions 67 | 68 | Follow the instructions in `Continuous_Control.ipynb` to get started with training your own agent! 69 | 70 | ### (Optional) Challenge: Crawler Environment 71 | 72 | After you have successfully completed the project, you might like to solve the more difficult **Crawler** environment. 73 | 74 | ![Crawler][image2] 75 | 76 | In this continuous control environment, the goal is to teach a creature with four legs to walk forward without falling. 77 | 78 | You can read more about this environment in the ML-Agents GitHub [here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#crawler). To solve this harder task, you'll need to download a new Unity environment. (**Note**: Udacity students should not submit a project with this new environment.) 
79 | 80 | You need only select the environment that matches your operating system: 81 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Linux.zip) 82 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler.app.zip) 83 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Windows_x86.zip) 84 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Windows_x86_64.zip) 85 | 86 | Then, place the file in the `p2_continuous-control/` folder in the DRLND GitHub repository, and unzip (or decompress) the file. Next, open `Crawler.ipynb` and follow the instructions to learn how to use the Python API to control the agent. 87 | 88 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Crawler/Crawler_Linux_NoVis.zip) to obtain the "headless" version of the environment. You will **not** be able to watch the agent without enabling a virtual screen, but you will be able to train the agent. (_To watch the agent, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 89 | 90 | -------------------------------------------------------------------------------- /P2_Continuous_Actions/Report.md: -------------------------------------------------------------------------------- 1 | [image1]: https://user-images.githubusercontent.com/23042512/48657451-cc597f00-e9e5-11e8-8332-bf97ee7da5f8.gif "Trained Agent Perf" 2 | [image2]: https://user-images.githubusercontent.com/23042512/48657452-cc597f00-e9e5-11e8-8776-37a144f24702.png "Trained Agent Scores" 3 | 4 | ## Introduction 5 | ![Trained Agent][image1] 6 | 7 | 8 | This report outlines my implementation for Udacity's Deep Reinforcement Learning Nanodegree's second project on the Reacher environment. In this project, the goal is to train an acrobat arm that has two joints so that it tracks a balloon. As the balloon moves, the two joints at adjusted to track the balloon. So this is a classical robotics problem, and using model-free reinforcement learning, the agent will learn the optimal policy. In particular, the method used is the deep deterministic policy gradient method (DDPG). 9 | 10 | Value based reinforcement learning algorithms such as DQN have shown great performance in many domains. However, they are still limited to discrete action space environments and for deterministic policies (as they are essentially based upon a deterministic greedy policy as epsilon only selects a uniform-random action). Moreover, with value based methods, we first compute the value-function for each state, and use that to determine the best policy. This is an indirect way of finding the optimal policy. 11 | 12 | Using a policy based method, on the other hand, we directly find the policy that yields the most rewards. Policy gradient is one of the more efficient policy based learning algorithms where we directly compute the gradient of the expected reward with respect to the policy parameters. In addition to being direct, it works well with continuous actions as well as stochastic policies. 
Other policy based methods include stochastic optimization methods such as random shooting, cross entropy method, etc. 13 | 14 | Before delving into my implementation for this project, I have included below some basics on policy gradient methods. In particular, this article will walk the reader from the basic objective of Reinforcement Learning (RL) to some of the advanced policy-gradient algorithms, such as Reinforce, Actor-Critic, Advantage Actor-Critic, Deterministic Policy Gradient, and Deep Deterministic Policy Gradient. For thorough understanding, it is assumed the reader is well versed in Probability & Statistics, Linear Algebra, Vector Calculus, and basic Reinforcement Learning terminologies. 15 | 16 | --------------------------------------------------------------- 17 | ## Reinforce Algorithm 18 | Please note, in the below analyses, the discount factor γ is assumed to be 1 for simplicity. But all the analyses can be easily extended to cases where γ is not 1. 19 | 20 | The basic objective in all of Reinforcement Learning (RL) is to maximize the expected total utility Uθ, which is defined as follows [1]: 21 | 22 | 23 | 24 | After doing some Math, it can be shown that Uθ is equal to the expected value of Q(s0,a0). And if the initial state distribution is uniform, then it means the goal in RL is to find a policy which maximizes the q-values of all possible states. 25 | 26 | 27 | 28 | Using the definition of expectation, the above equation 1a can be re-written as: 29 | 30 | 31 | 32 | Using Policy gradient method, we can maximize Uθ by first computing its gradient with respect to θ, which can readily be derived to be: 33 | 34 | 35 | 36 | One approach to improving the expected total reward is to randomly add noise to the current θ and if it results in better total reward, then we keep it, otherwise we ignore it, and we keep repeating this process. This method is called the random shooting method. There are other more sophisticated methods in the same vein such as the Cross Entropy Method. All these methods fall under the domain of stochastic optimization algorithms. However, while these methods are very simple to implement, they are not efficient and don't scale well with high dimensional space. A more efficient approach is to change θ in the direction of the gradient using Stochastic Gradient Ascent as follows: 37 | 38 | 39 | 40 | A basic policy gradient algorithm making use of the above gradient is known as the Reinforce algorithm, and here is how it works: 41 | 42 | ***A Basic Reinforce Algorithm:*** 43 | 44 | Start with a random vector θ and repeat the following 3 steps until convergence: 45 | 46 | 1. Use the policy Pθ(at|st) to collect m trajectories {τ1, τ2, ..., τm}, where each trajectory is as defined above. 47 | 2. Use these trajectories to compute the Monte-Carlo estimator of the gradient as follows: 48 | 49 | 50 | 51 | Note that the reason why the above estimator is valid is because the trajectories are generated by following the policy being learned, i.e. Pθ(τ) -- i.e. it is an on-policy algorithm. Another way to say it is that we sample each of the trajectories in {τ1, τ2, ..., τm} from the probability distribution Pθ(τ). 52 | 53 | 3. 
Update the weights/parameters of the policy network using the above estimator of the gradient: 54 | 55 | 56 | 57 | The intuition behind the reinforce algorithm is that if the total reward is positive, then all the actions taken in that trajectory are reinforced whereas if the total reward is negative, then all the actions taken in the trajectory are inhibited. Moreover, to be computationally efficient, typically m is set to 1. 58 | 59 | While better than stochastic optimization methods, the Reinforce algorithm suffers from a few drawbacks: 60 | 1. The gradient estimator is pretty noisy, especially for the case m=1, because a single trajectory maynot be representative of the policy. 61 | 2. There is no clear credit assignment. A trajectory may contain many good and bad actions, and whether those actions are reinforced or not depend only on the total reward achieved starting from the initial state. 62 | 3. It is very sensitive to the absolute value of the rewards. For example, adding a fixed constant to all the rewards can drastically change the behavior of the algorithm. Such a trivial transformation should have no effect on the optimal policy. 63 | 64 | By the definition of the gradient, ∇θUθ points in the direction of the maximum change in Uθ. However, at a fundamental level, the above drawbacks of Reinforce algorithm are due to the fact that the Monte-Carlo estimator of ∇θUθ (i.e. ĝ) has high variance. If we can reduce its variance, then our estimate of gradient (ĝ) will be closer to the true gradient ∇θUθ. 65 | 66 | While the Monte-Carlo estimator of the gradient (ĝ) is unbiased, it exhibits high variance. As discussed below, there are a few ways of reducing variance without introducing bias: 1) using causality and 2) using a baseline. 67 | 68 | ## Actor-Critic Algorithm 69 | 70 | One way to reduce variance is by taking advantage of causality: ĝ updates all the actions in a trajectory based upon total rewards and not the rewards to go. That is to say, future actions affect past rewards, which is not possible in our causal Universe. So we can make the gradient estimator more realistic by using rewards to go as shown in the below equation. 71 | 72 | 73 | 74 | Note that using the rewards to go instead of the total rewards still results in an unbiased estimator of ∇θUθ because causality is handled in the expectation in Equation 3 using Pθ(τ). Moreover, doing so reduces variance because the rewards to go expression has fewer terms (and thus lower uncertainty) than the total rewards expression. 75 | 76 | An important aside to note is that the rewards to go is really an estimate of the the q-value of (st, at). This is because the q-value is defined as follows: 77 | 78 | 79 | 80 | 81 | And so, if the trajectory τ is sampled from Pθ(τ), then the single-sample Monte-Carlo estimate of QPθ(st, at) is just: 82 | 83 | 84 | 85 | As shown above, instead of using the Monte-Carlo estimator of the rewards to go as in Equation 7, we can use the Q-value estimator of the rewards to go. As a result, Equation 7 can be re-written as: 86 | 87 | 88 | 89 | If QhatPθ(st, at) is modeled using a neural network (parameterized by w), then we get: 90 | 91 | 92 | 93 | Note that because the state-action space can be very high dimensional, it quickly runs into Bellman's curse of dimensionality; and thus, in most practical situations with complex state-transition dynamics, QhatPθ(st, at) is modeled using a neural network based function approximator. 
94 | 95 | Then Equation 10 can be re-written as: 96 | 97 | 98 | 99 | Whereby, Pθ(at | st) is the actor network that is parameterized by θ and QhatPθ(st, at, w) is the critic network that is parameterized by w. This is essentially what is known as the actor-critic algorithm. 100 | 101 | For any visited state-action pair (s,a), the actor network is updated using Equation 6 (utilizing ĝ from Equation 12), and the critic network is typically updated using Temporal-Difference learning (due to its lower variance than Monte-Carlo learning) using the following update equation: 102 | 103 | 104 | 105 | Whereby the weight vector w is updated to reduce the loss L(w), which is defined as: 106 | 107 | 108 | 109 | and using Q-learning (so that the critic is based of off an off-policy algorithm): 110 | 111 | 112 | 113 | and so 114 | 115 | 116 | 117 | whereby 118 | 119 | 120 | 121 | This is the basics of the actor-critic algorithm. While there are many variants of it, as we will see below, this is the basic core of it. 122 | 123 | ## Advantage Actor-Critic Algorithm 124 | 125 | In addition to using the rewards to go (due to causality), another approach to minimizing the variance of ĝ is by subtracting out a baseline b that is not dependent on θ or action a -- and this combined term is known as the Advantage function. It can be mathematically proved that such a transformation is not only unbiased, but it reduces variance. An intuitive explanation for why it reduces variance is because the term multiplying ∇θlog(Pθ(a|s)) has smaller magnitude, which essentially reduces the variance of the overall expression. 126 | 127 | 128 | 129 | There are many choices for the baseline b, and in theory, the optimal value of b can also be computed. However, in the interest of simplicity and to be intuitive, a commonly used baseline is the q-value averaged over all the actions, i.e. the state-value. 130 | 131 | 132 | 133 | The Advantage function is then written as follows: 134 | 135 | 136 | 137 | The basic idea with using this advantage function is that actions with higher q-value than the average (i.e. state-value) are reinforced where as other actions are inhibited. This makes a lot more intuitive sense than the gradient equation used in the original Reinforce algorithm. And so it's not totally surprising that Mathematically it results in lower variance. Moreover, now the gradient is no longer dependent on the absolute value of the rewards. 138 | 139 | One problem with the above Equation is that, in practice, it is very difficult to compute the above expectation -- especially for continuous actions or high dimensional action space. Hence, the state-value function is modeled with a separate neural network that is parameterized by wv as follows: 140 | 141 | 142 | 143 | The advantage function now becomes: 144 | 145 | 146 | 147 | The issue with this advantage function is that it requires two separate neural networks. With some clever re-ordering, we can re-write the Advantage function using a single neural network. However, inorder to do so, let us first re-visit the above analysis. 
Basically, the ideal Advantage function we would like to have is: 148 | 149 | 150 | 151 | As defined in Equation 8 above, state-action value can be further simplified interms of the state-value function as: 152 | 153 | 154 | 155 | The single-sample Monte-Carlo estimate of QPθ(st, at) as defined in the Equation above is: 156 | 157 | 158 | 159 | And so now we just need to represent the state-value function using a neural network parameterized by wv as follows: 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | And thus the Advantage function can now be represented using a single neural network parameterized with wv. Note with the above equation for the Advantage function, it is really just the one-step TD error (i.e. TD(0) error). Additionally, it is also possible to represent it using TD(λ) error. 168 | 169 | The gradient equation for Advantage Actor Critic is now going to be: 170 | 171 | 172 | 173 | And this is going to be a much better estimator of the expected gradient (Equation 3), i.e. with lower variance and still be unbiased, even for m=1. As a result, the algorithm will learn much faster. 174 | 175 | wv is updated as follows: 176 | 177 | 178 | 179 | whereby using one-step TD learning (i.e. TD(0)): 180 | 181 | 182 | 183 | Using the gradient estimator from Equation 29, the weight update from Equation 30, and the remaining steps from the basic Reinforce algorithm results in what is known as the Advantage Actor-Critic algorithm. 184 | 185 | To briefly summarize the above discussion, the main downside of the Reinforce algorithm is that the gradient estimator is based upon the Monte-Carlo estimator of the expected total reward from the initial state-action pair -- which while has low bias, it has high variance. By using causality and subtracting out a baseline from the Monte-Carlo estimator, we can reduce the variance. The variance is further reduced by using TD estimator of the expected total reward to go instead of Monte-Carlo estimator. 186 | 187 | ## Deterministic Policy Gradient (DPG) Algorithm 188 | 189 | For stochastic policies in continuous environments, the actor outputs the mean and variance of a Gaussian distribution. And an action is sampled from this Gaussian distribution. For deterministic actions, while this approach still works as the network will learn to have very low variance, it involves complexity and computational burden that unnecessarily slows down the learning algorithm. To address these short comings, for deterministic actions, we can use what is known as the deterministic policy gradient. 190 | 191 | In stochastic case, the policy gradient integrates over both state and action spaces, whereas in the deterministic case it only integrates over the state space. As a result, computing the deterministic policy gradient can potentially require fewer samples. But in order to fully explore the state space, the basic idea is to choose actions according to a stochastic behavior policy and learn about a deterministic target policy (i.e. needs to be an off-policy algorithm). 192 | 193 | DPG is essentially a deterministic version of Actor-Critic algorithm. For a basic DPG algorithm, we have two neural networks, one network (parameterized by θ) is estimating the optimal target policy and the second network (parameterized by w) is estimating the action-value function corresponding to the target policy. The below equations formalize this. 
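For reference, the standard form of this actor-critic pair, as given in the DPG/DDPG papers cited at the end of this report, can be written as follows (with μθ the deterministic actor and Q̂(s, a; w) the critic):

```latex
% Deterministic policy gradient: the direction used to update the actor parameters \theta
\nabla_\theta J(\theta) \;\approx\; \mathbb{E}_{s_t}\left[ \nabla_a \hat{Q}(s_t, a; w)\big|_{a=\mu_\theta(s_t)} \; \nabla_\theta \mu_\theta(s_t) \right]

% TD error used to train the critic, with the deterministic policy supplying the next action
\delta_t \;=\; r_t + \gamma\, \hat{Q}\big(s_{t+1}, \mu_\theta(s_{t+1}); w\big) \;-\; \hat{Q}(s_t, a_t; w)
```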
194 | 195 | As mentioned above, because the target policy is deterministic, the actor may not explore the state space well enough to find the optimal policy. To address this, we use a behavior policy (b(st)) that is different from the target policy: it is simply the target policy with some additional noise. For simplicity, we will use a Normal distribution as our noise source. Note, however, that the choice of noise process is effectively a hyperparameter, and in the implementation for the Reacher environment below, a different noise process is used. 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | **Deterministic Policy Gradient Update:**
206 | 1. Actor network is updated as follows: 207 | 208 | 209 | 210 | which by chain rule, it becomes: 211 | 212 | 213 | 214 | 2. The critic network is updated as follows: 215 | 216 | The TD error is given by: 217 | 218 | 219 | 220 | and the weight update is: 221 | 222 | 223 | 224 | To reiterate, in order to properly balance exploration-exploitation tradeoff, while the target policy μ is deterministic, the behavior policy is stochastic. So this is an off-policy version of the DPG algorithm. While stochastic off-policy actor-critic algorithms typically use importance sampling for both the actor and the critic, because the deterministic policy gradient removes expectation over the actions, and given the state transition dynamics are same for both the target and behavior policies as they operate in the same environment, importance sampling ratio is not needed. So we can avoid having to use importance sampling in the actor, and with same reasoning, we avoid using importance sampling in the critic [2]. For those who are wondering, similar reasoning applies as to why we don't use importance sampling with Q-learning. 225 | 226 | ## Deep Deterministic Policy Gradient (DDPG) Algorithm 227 | DDPG is basically DPG with a few training changes adopted from the DQN architecture. 228 | 229 | One challenge when using neural networks for reinforcement learning is that most optimization algorithms assume the samples are independently and identically distributed. Obviously this assumption doesn't hold true because the samples are generated by exploring sequentially in an environment. Because DDPG is an off policy algorithm, we can use the replay buffer (a finite sized cache) as in DQN to address this issue. At each timestep the actor and critic are updated by sampling a minibatch uniformly from the buffer [2]. 230 | 231 | For the critic, since the network being updated is also used in calculating the target, this can potentially lead to training instabilities for highly nonlinear function approximators like neural networks. One solution to address this is using a separate target network, as with DQN [2]. Given the target values are determined using both the critic and actor networks, we create a copy of both of these networks and soft update their weights to the respective learned networks. [Please refer to my github code for details.](https://github.com/gtg162y/DRLND/blob/master/P2_Continuous_Actions/Continuous_Control_UdacityWorkspace.ipynb) 232 | 233 | --------------------------------------------------------------- 234 | ## DDPG Implementation for Reacher Environment 235 | 236 | Having now seen some of the commonly used policy gradient algorithms, we can now get to my implementation for the Udacity's Reacher project. In this environment, a double-jointed arm (acrobot) can move to target locations (i.e. where the balloons are). A reward of +0.1 is provided for each step that the agent's hand is in the goal location. Thus, the goal of the agent is to maintain its position at the target location for as many time steps as possible. As the balloon moves, the two joints at adjusted to track the balloon. So this is a classical robotics project, and using model-free reinforcement learning, the agent will learn the optimal policy. In particular, the method used is the deep deterministic policy gradient method (DDPG). The observation space consists of 33 variables corresponding to position, rotation, velocity, and angular velocities of the arm. 
Each action is a vector with four numbers, corresponding to the torque applicable to its two joints. Every entry in the action vector is a number between -1 and 1. 237 | 238 | The Reacher environment used here contains 20 identical agents, each with its own copy of the environment. In order to be considered solved, the agents must get an average score of +30 (over 100 consecutive episodes, and over all 20 agents). In particular, after each episode, we add up the rewards that each agent received (without discounting) to get a score for each agent. This yields 20 (potentially different) scores. We then take the average of these 20 scores. 239 | This yields an average score for each episode (where the average is over all 20 agents). 240 | The environment is considered solved when the average (over 100 episodes) of those average scores is at least +30. 241 | 242 | The DDPG algorithm uses 4 separate neural networks: one to learn the policy, one to learn the action-value function, one for the target action-value function, and one to produce the target action used in the target action-value network. As discussed earlier, we use a separate target network instead of the local network so as to prevent instabilities in learning when the TD target depends on the local network. The weights of the target network are updated very slowly, moving about 0.1% towards the local network at every time step. Slowly changing the target network does slow down learning, but it helps with the learning algorithm's stability [2]. 243 | 244 | To speed up the learning process, since all 20 agents are gathering experience at the same time, the input data stream is quite large. To take advantage of this, I perform the network parameter update 4 times at each iteration. The reason this works without an importance sampling ratio is that both the actor's target policy and the critic's target value are deterministic. Additionally, the critic network's gradients are clipped to 1, which prevents the critic network from changing too fast. Moreover, I initialized the target networks with the same (random) weights as the networks being learned. Finally, instead of a Normal distribution, an Ornstein-Uhlenbeck noise process is used to generate the behavior policy for better exploration. All of these together allowed the agent to achieve the learning objective in just over 100 episodes. [Please refer to my github code for details.](https://github.com/gtg162y/DRLND/blob/master/P2_Continuous_Actions/Continuous_Control_UdacityWorkspace.ipynb) 245 | 246 | Below is the learning performance of the algorithm averaged over all 20 agents. 247 | 248 | ![Scores][image2] 249 | 250 | In terms of hyperparameters, the learning rate for both the actor and critic networks was 1e-4, and no regularization was used as the networks were fairly small. I gradually decayed the exploration probability to get the optimal policy. The Ornstein-Uhlenbeck noise process used a mean of 0 and a sigma of 0.2. The actor network was built using a three-layer neural network (with 256 neurons in the first layer, 128 neurons in the second layer, and 4 neurons in the final output layer). The critic network was also built using three layers and a similar number of neurons as the actor network, except that the actions were concatenated to the output of the first layer and the final layer had a single output neuron. For faster learning, the ELU non-linearity was used for both networks.
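Below is a minimal sketch of the actor and critic architectures just described. It is only an illustration: the layer sizes follow the description above, but details such as the tanh squashing on the actor output (to keep each action entry in [-1, 1]) and the exact layer names are assumptions rather than a copy of the notebook code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Maps a 33-dimensional state to a 4-dimensional action in [-1, 1]."""
    def __init__(self, state_size=33, action_size=4):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, state):
        x = F.elu(self.fc1(state))
        x = F.elu(self.fc2(x))
        return torch.tanh(self.fc3(x))  # assumed: squash to the [-1, 1] action range

class Critic(nn.Module):
    """Maps a (state, action) pair to a scalar Q-value."""
    def __init__(self, state_size=33, action_size=4):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256 + action_size, 128)  # actions are concatenated after the first layer
        self.fc3 = nn.Linear(128, 1)

    def forward(self, state, action):
        x = F.elu(self.fc1(state))
        x = F.elu(self.fc2(torch.cat([x, action], dim=1)))
        return self.fc3(x)
```

For example, `Critic()(torch.randn(2, 33), torch.randn(2, 4))` returns a `(2, 1)` tensor of Q-value estimates for a batch of two state-action pairs.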
251 | 252 | In terms of methods to further improve the agent's performance, a couple of things on my to-do list are: 1) train the agent using Prioritized Experience Replay, and 2) use the Proximal Policy Optimization (PPO) algorithm. 253 | 254 | **References:**
255 | 1. UC Berkeley CS294 Lectures (http://rail.eecs.berkeley.edu/deeprlcourse/)
256 | 2. DDPG paper (https://arxiv.org/pdf/1509.02971.pdf) 257 | 258 | 259 | [comment]: # (Equations generated using: https://stackoverflow.com/questions/11256433/how-to-show-math-equations-in-general-githubs-markdownnot-githubs-blog, 260 | https://www.codecogs.com/latex/eqneditor.php, 261 | http://mathurl.com/) 262 | -------------------------------------------------------------------------------- /P2_Continuous_Actions/checkpoint_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P2_Continuous_Actions/checkpoint_actor.pth -------------------------------------------------------------------------------- /P2_Continuous_Actions/checkpoint_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P2_Continuous_Actions/checkpoint_critic.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/Future_Improvements.md: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------- 2 | 3 | ## Ideas for Future Improvement: 4 | 1. Use parameter space noise rather than noise on action. https://vimeo.com/252185862 https://github.com/jvmancuso/ParamNoise 5 | 2. We can use prioritised experience buffer. https://github.com/Damcy/prioritized-experience-replay 6 | 3. Different replay buffer for actor/critic 7 | 4. Try adding dropouts in critic network 8 | 5. Turn off OU noise and use random noise 9 | 6. You should also try implementing some other algorithms like A3C and PPO. Following are some useful posts. 10 | [Asynchronous Actor-Critic Agents (A3C)](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2) 11 | 12 | [Trust Region Policy Optimization (TRPO) and Proximal Policy Optimization (PPO)](https://medium.com/@sanketgujar95/trust-region-policy-optimization-trpo-and-proximal-policy-optimization-ppo-e6e7075f39ed) 13 | -------------------------------------------------------------------------------- /P3_Collab_Compete/README.md: -------------------------------------------------------------------------------- 1 | [//]: # (Image References) 2 | 3 | [image1]: https://user-images.githubusercontent.com/10624937/42135623-e770e354-7d12-11e8-998d-29fc74429ca2.gif "Trained Agent" 4 | [image2]: https://user-images.githubusercontent.com/10624937/42135622-e55fb586-7d12-11e8-8a54-3c31da15a90a.gif "Soccer" 5 | 6 | Project Report: https://medium.com/@amitp-ai/maddpg-91caa221d75e 7 | 8 | 9 | # Project 3: Collaboration and Competition 10 | 11 | ### Introduction 12 | 13 | For this project, you will work with the [Tennis](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#tennis) environment. 14 | 15 | ![Trained Agent][image1] 16 | 17 | In this environment, two agents control rackets to bounce a ball over a net. If an agent hits the ball over the net, it receives a reward of +0.1. If an agent lets a ball hit the ground or hits the ball out of bounds, it receives a reward of -0.01. Thus, the goal of each agent is to keep the ball in play. 18 | 19 | The observation space consists of 8 variables corresponding to the position and velocity of the ball and racket. 
Each agent receives its own, local observation. Two continuous actions are available, corresponding to movement toward (or away from) the net, and jumping. 20 | 21 | The task is episodic, and in order to solve the environment, your agents must get an average score of +0.5 (over 100 consecutive episodes, after taking the maximum over both agents). Specifically, 22 | 23 | - After each episode, we add up the rewards that each agent received (without discounting), to get a score for each agent. This yields 2 (potentially different) scores. We then take the maximum of these 2 scores. 24 | - This yields a single **score** for each episode. 25 | 26 | The environment is considered solved, when the average (over 100 episodes) of those **scores** is at least +0.5. 27 | 28 | ### Getting Started 29 | 30 | 1. Download the environment from one of the links below. You need only select the environment that matches your operating system: 31 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Linux.zip) 32 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis.app.zip) 33 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Windows_x86.zip) 34 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Windows_x86_64.zip) 35 | 36 | (_For Windows users_) Check out [this link](https://support.microsoft.com/en-us/help/827218/how-to-determine-whether-a-computer-is-running-a-32-bit-version-or-64) if you need help with determining if your computer is running a 32-bit version or 64-bit version of the Windows operating system. 37 | 38 | (_For AWS_) If you'd like to train the agent on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Linux_NoVis.zip) to obtain the "headless" version of the environment. You will **not** be able to watch the agent without enabling a virtual screen, but you will be able to train the agent. (_To watch the agent, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 39 | 40 | 2. Place the file in the DRLND GitHub repository, in the `p3_collab-compet/` folder, and unzip (or decompress) the file. 41 | 42 | ### Instructions 43 | 44 | Follow the instructions in `Tennis.ipynb` to get started with training your own agent! 45 | 46 | ### (Optional) Challenge: Crawler Environment 47 | 48 | After you have successfully completed the project, you might like to solve the more difficult **Soccer** environment. 49 | 50 | ![Soccer][image2] 51 | 52 | In this environment, the goal is to train a team of agents to play soccer. 53 | 54 | You can read more about this environment in the ML-Agents GitHub [here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md#soccer-twos). To solve this harder task, you'll need to download a new Unity environment. (**Note**: Udacity students should not submit a project with this new environment.) 
55 | 56 | You need only select the environment that matches your operating system: 57 | - Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Linux.zip) 58 | - Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer.app.zip) 59 | - Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Windows_x86.zip) 60 | - Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Windows_x86_64.zip) 61 | 62 | Then, place the file in the `p3_collab-compet/` folder in the DRLND GitHub repository, and unzip (or decompress) the file. Next, open `Soccer.ipynb` and follow the instructions to learn how to use the Python API to control the agent. 63 | 64 | (_For AWS_) If you'd like to train the agents on AWS (and have not [enabled a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md)), then please use [this link](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Soccer/Soccer_Linux_NoVis.zip) to obtain the "headless" version of the environment. You will **not** be able to watch the agents without enabling a virtual screen, but you will be able to train the agents. (_To watch the agents, you should follow the instructions to [enable a virtual screen](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-on-Amazon-Web-Service.md), and then download the environment for the **Linux** operating system above._) 65 | -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_actor_local_0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_actor_local_0.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_actor_local_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_actor_local_1.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_critic_local_0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_critic_local_0.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/checkpoint_critic_local_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amitp-ai/Deep_Reinforcement_Learning/68fa57e1be9a7e72887f16a413aa55c85033ec4c/P3_Collab_Compete/checkpoint_critic_local_1.pth -------------------------------------------------------------------------------- /P3_Collab_Compete/workspace_utils.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | from contextlib import contextmanager 4 | 5 | import requests 6 | 7 | 8 | DELAY = INTERVAL = 4 * 60 # interval time in seconds 9 | MIN_DELAY = MIN_INTERVAL = 2 * 60 10 | KEEPALIVE_URL = "https://nebula.udacity.com/api/v1/remote/keep-alive" 11 | TOKEN_URL = 
"http://metadata.google.internal/computeMetadata/v1/instance/attributes/keep_alive_token" 12 | TOKEN_HEADERS = {"Metadata-Flavor":"Google"} 13 | 14 | 15 | def _request_handler(headers): 16 | def _handler(signum, frame): 17 | requests.request("POST", KEEPALIVE_URL, headers=headers) 18 | return _handler 19 | 20 | 21 | @contextmanager 22 | def active_session(delay=DELAY, interval=INTERVAL): 23 | """ 24 | Example: 25 | 26 | from workspace_utils import active session 27 | 28 | with active_session(): 29 | # do long-running work here 30 | """ 31 | token = requests.request("GET", TOKEN_URL, headers=TOKEN_HEADERS).text 32 | headers = {'Authorization': "STAR " + token} 33 | delay = max(delay, MIN_DELAY) 34 | interval = max(interval, MIN_INTERVAL) 35 | original_handler = signal.getsignal(signal.SIGALRM) 36 | try: 37 | signal.signal(signal.SIGALRM, _request_handler(headers)) 38 | signal.setitimer(signal.ITIMER_REAL, delay, interval) 39 | yield 40 | finally: 41 | signal.signal(signal.SIGALRM, original_handler) 42 | signal.setitimer(signal.ITIMER_REAL, 0) 43 | 44 | 45 | def keep_awake(iterable, delay=DELAY, interval=INTERVAL): 46 | """ 47 | Example: 48 | 49 | from workspace_utils import keep_awake 50 | 51 | for i in keep_awake(range(5)): 52 | # do iteration with lots of work here 53 | """ 54 | with active_session(delay, interval): yield from iterable 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning (Fall 2018) 2 | 3 | **Project 1: Value Based RL Methods Including Deep Q-Network (DQN) and Double Deep Q-Network (DDQN)
4 | https://medium.com/@amitp-ai/double-dqn-48562b5f31c1** 5 | 6 | 7 | **Project 2: Policy Based RL Methods Including Advantage Actor-Critic (A2C) and Deep Deterministic Policy Gradient (DDPG)
8 | https://medium.com/@amitp-ai/policy-gradients-1edbbbc8de6b** 9 | 10 | 11 | **Project 3: Multi-Agent RL Methods Such as Multi-Agent DDPG (MADDPG)
12 | https://medium.com/@amitp-ai/maddpg-91caa221d75e** 13 | --------------------------------------------------------------------------------