├── requirements.txt
├── deeprl_prj
│   ├── __init__.py
│   ├── objectives.py
│   ├── utils.py
│   ├── policy.py
│   ├── preprocessors.py
│   ├── core.py
│   ├── dqn_keras.py
│   ├── dqn_tf_temporalAt.py
│   └── dqn_tf_spatialAt.py
├── readme.md
├── helper.py
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-image
2 | attrs
3 | h5py
4 | keras
5 | matplotlib
6 | numpy
7 | pillow
8 | protobuf>=3.0
9 | pydot-ng
10 | scipy
11 | semver
12 | gym[atari]
13 | 
--------------------------------------------------------------------------------
/deeprl_prj/__init__.py:
--------------------------------------------------------------------------------
1 | from . import core
2 | from . import dqn_keras
3 | from . import dqn_tf_temporalAt
4 | from . import dqn_tf_spatialAt
5 | from . import objectives
6 | from . import policy
7 | from . import preprocessors
8 | from . import utils
9 | 
--------------------------------------------------------------------------------
/deeprl_prj/objectives.py:
--------------------------------------------------------------------------------
1 | """Loss functions."""
2 | 
3 | import tensorflow as tf
4 | import semver
5 | 
6 | def huber_loss(y_true, y_pred, max_grad=1.):
7 |     """Calculate the Huber loss.
8 | 
9 |     See https://en.wikipedia.org/wiki/Huber_loss
10 | 
11 |     Parameters
12 |     ----------
13 |     y_true: np.array, tf.Tensor
14 |         Target value.
15 |     y_pred: np.array, tf.Tensor
16 |         Predicted value.
17 |     max_grad: float, optional
18 |         Positive floating point value. Represents the maximum possible
19 |         gradient magnitude.
20 | 
21 |     Returns
22 |     -------
23 |     tf.Tensor
24 |         The Huber loss.
25 |     """
26 |     with tf.variable_scope("HuberLoss"):
27 |         delta = max_grad
28 |         diff = tf.abs(y_true - y_pred, name = "diff")
29 |         mask = diff < delta
30 |         return tf.where(mask, 0.5 * tf.square(diff), delta * (diff - 0.5 * delta))
31 | 
32 | def mean_huber_loss(y_true, y_pred, max_grad=1.):
33 |     """Return the mean Huber loss.
34 | 
35 |     Same as huber_loss, but takes the mean over all values in the
36 |     output tensor.
37 | 
38 |     Parameters
39 |     ----------
40 |     y_true: np.array, tf.Tensor
41 |         Target value.
42 |     y_pred: np.array, tf.Tensor
43 |         Predicted value.
44 |     max_grad: float, optional
45 |         Positive floating point value. Represents the maximum possible
46 |         gradient magnitude.
47 | 
48 |     Returns
49 |     -------
50 |     tf.Tensor
51 |         The mean Huber loss.
52 |     """
53 |     return tf.reduce_mean(huber_loss(y_true, y_pred, max_grad))
54 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # UAV Obstacle Avoidance using Deep Recurrent Reinforcement Learning with Temporal Attention
2 | 
3 | The code is implemented in TensorFlow (version 1.1.0) and Keras.
4 | 
5 | ## Requirements
6 | 
7 | The code is based on **Python 2**. Install the dependencies by running:
8 | 
9 |     pip install --user -r requirements.txt
10 | 
11 | ## How to run
12 | 
13 | There are two GPU-enabled DQN implementations: Keras and TensorFlow; the import block that selects between them is sketched below. 
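As it appears near the top of **main.py** (uncomment exactly one `DQNAgent` import; the Keras implementation is active by default):

    # Keras implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.
    from deeprl_prj.dqn_keras import DQNAgent

    # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.
    # from deeprl_prj.dqn_tf_temporalAt import DQNAgent

    # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Spatial Attention DQN.
    # from deeprl_prj.dqn_tf_spatialAt import DQNAgent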
14 | You can choose different implementation by altering **line 15** in 15 | **main.py** 16 | 17 | Train original DQN: 18 | 19 | python main.py --task_name 'DQN' 20 | 21 | Train Double DQN: 22 | 23 | python main.py --ddqn --task_name 'Double_DQN' 24 | 25 | Train Dueling DQN: 26 | 27 | python main.py --net_mode=duel --task_name 'Dueling_DQN' 28 | 29 | Train Recurrent DQN: 30 | 31 | python main.py --num_frames 10 --recurrent --task_name 'Recurrent_DQN' 32 | 33 | Train Recurrent Temporal Attention DQN: (Using **dqn_tf_temporalAt.py** by uncommenting **line 18** in **main.py**) 34 | 35 | python main.py --num_frames 10 --recurrent --a_t --selector --task_name 'TemporalAt_DQN' 36 | 37 | Train Recurrent Spatial Attention DQN: (Using **dqn_tf_spatialAt.py** by uncommenting **line 21** in **main.py**) 38 | 39 | python main.py --num_frames 10 --recurrent --a_t --selector --task_name 'SpatialAt_DQN' 40 | 41 | Test trained model (e.g. Spatial Attention DQN): 42 | 43 | python main.py --num_frames 10 --recurrent --a_t --selector --test \ 44 | --load_network --load_network_path=PATH_TO_NET 45 | 46 | ## Acknowledgement 47 | 48 | This code repository is highly inspired from work of Rui Zhu et al [link](https://github.com/chasewind007/Attention-DQN). 49 | -------------------------------------------------------------------------------- /deeprl_prj/utils.py: -------------------------------------------------------------------------------- 1 | """Common functions.""" 2 | 3 | import semver 4 | import tensorflow as tf 5 | 6 | def get_uninitialized_variables(variables=None): 7 | """Return a list of uninitialized tf variables. 8 | 9 | Parameters 10 | ---------- 11 | variables: tf.Variable, list(tf.Variable), optional 12 | Filter variable list to only those that are uninitialized. If no 13 | variables are specified the list of all variables in the graph 14 | will be used. 15 | 16 | Returns 17 | ------- 18 | list(tf.Variable) 19 | List of uninitialized tf variables. 20 | """ 21 | sess = tf.get_default_session() 22 | if variables is None: 23 | variables = tf.global_variables() 24 | else: 25 | variables = list(variables) 26 | 27 | if len(variables) == 0: 28 | return [] 29 | 30 | if semver.match(tf.__version__, '<1.0.0'): 31 | init_flag = sess.run( 32 | tf.pack([tf.is_variable_initialized(v) for v in variables])) 33 | else: 34 | init_flag = sess.run( 35 | tf.stack([tf.is_variable_initialized(v) for v in variables])) 36 | return [v for v, f in zip(variables, init_flag) if not f] 37 | 38 | def get_soft_target_model_updates(target, source, tau): 39 | r"""Return list of target model update ops. 40 | 41 | These are soft target updates. Meaning that the target values are 42 | slowly adjusted, rather than directly copied over from the source 43 | model. 44 | 45 | The update is of the form: 46 | 47 | $W' \gets (1- \tau) W' + \tau W$ where $W'$ is the target weight 48 | and $W$ is the source weight. 49 | 50 | Parameters 51 | ---------- 52 | target: keras.models.Model 53 | The target model. Should have same architecture as source model. 54 | source: keras.models.Model 55 | The source model. Should have same architecture as target model. 56 | tau: float 57 | The weight of the source weights to the target weights used 58 | during update. 59 | 60 | Returns 61 | ------- 62 | list(tf.Tensor) 63 | List of tensor update ops. 
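    Examples
    --------
    A minimal usage sketch (the model variables are placeholders). Note that
    the implementation below returns the blended weight arrays, which the
    caller then applies with ``set_weights``:

    >>> new_weights = get_soft_target_model_updates(target_net, source_net, tau=0.01)
    >>> target_net.set_weights(new_weights)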
64 | """ 65 | target_weights = target.get_weights() 66 | source_weights = source.get_weights() 67 | for i in range(len(target_weights)): 68 | target_weights[i] = (1 - tau) * target_weights[i] + tau * source_weights[i] 69 | return target_weights 70 | 71 | def get_hard_target_model_updates(target, source): 72 | """Return list of target model update ops. 73 | 74 | These are hard target updates. The source weights are copied 75 | directly to the target network. 76 | 77 | Parameters 78 | ---------- 79 | target: keras.models.Model 80 | The target model. Should have same architecture as source model. 81 | source: keras.models.Model 82 | The source model. Should have same architecture as target model. 83 | 84 | Returns 85 | ------- 86 | list(tf.Tensor) 87 | List of tensor update ops. 88 | """ 89 | return source.get_weights() 90 | 91 | def compare_model(target, source): 92 | target_weights = target.get_weights() 93 | source_weights = source.get_weights() 94 | print(len(target_weights)) 95 | for i in range(len(target_weights)): 96 | print(target_weights[i].shape, source_weights[i].shape) 97 | if (target_weights[i] != source_weights[i]).any(): 98 | return False 99 | return True 100 | -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import matplotlib.pyplot as plt 5 | import scipy.misc 6 | import os 7 | import csv 8 | import itertools 9 | import tensorflow.contrib.slim as slim 10 | 11 | #This is a simple function to reshape our game frames. 12 | def processState(state1): 13 | return np.reshape(state1,[21168]) 14 | 15 | #These functions allows us to update the parameters of our target network with those of the primary network. 16 | def updateTargetGraph(tfVars,tau): 17 | total_vars = len(tfVars) 18 | op_holder = [] 19 | for idx,var in enumerate(tfVars[0:total_vars//2]): 20 | print("copy from %s ===> %s"%(var.op.name, tfVars[idx+total_vars//2].op.name)) 21 | # op_holder.append(tfVars[idx+total_vars//2].assign((var.value()*tau) + ((1-tau)*tfVars[idx+total_vars//2].value()))) 22 | op_holder.append(tfVars[idx+total_vars//2].assign(var.value())) 23 | return op_holder 24 | 25 | def updateTarget(op_holder,sess): 26 | for op in op_holder: 27 | sess.run(op) 28 | total_vars = len(tf.trainable_variables()) 29 | a = tf.trainable_variables()[0].eval(session=sess) 30 | b = tf.trainable_variables()[total_vars//2].eval(session=sess) 31 | if a.all() == b.all(): 32 | print("Target Set Success") 33 | else: 34 | print("Target Set Failed") 35 | 36 | #Record performance metrics and episode logs for the Control Center. 
37 | def saveToCenter(i,rList,jList,bufferArray,summaryLength,h_size,sess,mainQN,time_per_step): 38 | with open('./Center/log.csv', 'a') as myfile: 39 | state_display = (np.zeros([1,h_size]),np.zeros([1,h_size])) 40 | imagesS = [] 41 | for idx,z in enumerate(np.vstack(bufferArray[:,0])): 42 | img,state_display = sess.run([mainQN.salience,mainQN.rnn_state],\ 43 | feed_dict={mainQN.scalarInput:np.reshape(bufferArray[idx,0],[1,21168])/255.0,\ 44 | mainQN.trainLength:1,mainQN.state_in:state_display,mainQN.batch_size:1}) 45 | imagesS.append(img) 46 | imagesS = (imagesS - np.min(imagesS))/(np.max(imagesS) - np.min(imagesS)) 47 | imagesS = np.vstack(imagesS) 48 | imagesS = np.resize(imagesS,[len(imagesS),84,84,3]) 49 | luminance = np.max(imagesS,3) 50 | imagesS = np.multiply(np.ones([len(imagesS),84,84,3]),np.reshape(luminance,[len(imagesS),84,84,1])) 51 | make_gif(np.ones([len(imagesS),84,84,3]),'./Center/frames/sal'+str(i)+'.gif',duration=len(imagesS)*time_per_step,true_image=False,salience=True,salIMGS=luminance) 52 | 53 | images = zip(bufferArray[:,0]) 54 | images.append(bufferArray[-1,3]) 55 | images = np.vstack(images) 56 | images = np.resize(images,[len(images),84,84,3]) 57 | make_gif(images,'./Center/frames/image'+str(i)+'.gif',duration=len(images)*time_per_step,true_image=True,salience=False) 58 | 59 | wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) 60 | wr.writerow([i,np.mean(jList[-100:]),np.mean(rList[-summaryLength:]),'./frames/image'+str(i)+'.gif','./frames/log'+str(i)+'.csv','./frames/sal'+str(i)+'.gif']) 61 | myfile.close() 62 | with open('./Center/frames/log'+str(i)+'.csv','w') as myfile: 63 | state_train = (np.zeros([1,h_size]),np.zeros([1,h_size])) 64 | wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) 65 | wr.writerow(["ACTION","REWARD","A0","A1",'A2','A3','V']) 66 | a, v = sess.run([mainQN.Advantage,mainQN.Value],\ 67 | feed_dict={mainQN.scalarInput:np.vstack(bufferArray[:,0])/255.0,mainQN.trainLength:len(bufferArray),mainQN.state_in:state_train,mainQN.batch_size:1}) 68 | wr.writerows(zip(bufferArray[:,1],bufferArray[:,2],a[:,0],a[:,1],a[:,2],a[:,3],v[:,0])) 69 | 70 | #This code allows gifs to be saved of the training episode for use in the Control Center. 71 | def make_gif(images, fname, duration=2, true_image=False,salience=False,salIMGS=None): 72 | import moviepy.editor as mpy 73 | 74 | def make_frame(t): 75 | try: 76 | x = images[int(len(images)/duration*t)] 77 | except: 78 | x = images[-1] 79 | 80 | if true_image: 81 | return x.astype(np.uint8) 82 | else: 83 | return ((x+1)/2*255).astype(np.uint8) 84 | 85 | def make_mask(t): 86 | try: 87 | x = salIMGS[int(len(salIMGS)/duration*t)] 88 | except: 89 | x = salIMGS[-1] 90 | return x 91 | 92 | clip = mpy.VideoClip(make_frame, duration=duration) 93 | if salience == True: 94 | mask = mpy.VideoClip(make_mask, ismask=True,duration= duration) 95 | clipB = clip.set_mask(mask) 96 | clipB = clip.set_opacity(0) 97 | mask = mask.set_opacity(0.1) 98 | mask.write_gif(fname, fps = len(images) / duration,verbose=False) 99 | #clipB.write_gif(fname, fps = len(images) / duration,verbose=False) 100 | else: 101 | clip.write_gif(fname, fps = len(images) / duration,verbose=False) 102 | -------------------------------------------------------------------------------- /deeprl_prj/policy.py: -------------------------------------------------------------------------------- 1 | """RL Policy classes.""" 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | class Policy: 8 | """Base class representing an MDP policy. 
9 | 10 | Policies are used by the agent to choose actions. 11 | 12 | Policies are designed to be stacked to get interesting behaviors 13 | of choices. For instances in a discrete action space the lowest 14 | level policy may take in Q-Values and select the action index 15 | corresponding to the largest value. If this policy is wrapped in 16 | an epsilon greedy policy then with some probability epsilon, a 17 | random action will be chosen. 18 | """ 19 | 20 | def select_action(self, **kwargs): 21 | """Used by agents to select actions. 22 | 23 | Returns 24 | ------- 25 | Any: 26 | An object representing the chosen action. Type depends on 27 | the hierarchy of policy instances. 28 | """ 29 | raise NotImplementedError('This method should be overriden.') 30 | 31 | class UniformRandomPolicy(Policy): 32 | """Chooses a discrete action with uniform random probability. 33 | 34 | Parameters 35 | ---------- 36 | num_actions: int 37 | Number of actions to choose from. Must be > 0. 38 | 39 | Raises 40 | ------ 41 | ValueError: 42 | If num_actions <= 0 43 | """ 44 | 45 | def __init__(self, num_actions): 46 | assert num_actions >= 1 47 | self.num_actions = num_actions 48 | 49 | def select_action(self, **kwargs): 50 | """Return a random action index. 51 | 52 | This policy cannot contain others (as they would just be ignored). 53 | 54 | Returns 55 | ------- 56 | int: 57 | Action index in range [0, num_actions) 58 | """ 59 | return np.random.randint(0, self.num_actions) 60 | 61 | def get_config(self): # noqa: D102 62 | return {'num_actions': self.num_actions} 63 | 64 | class GreedyPolicy(Policy): 65 | """Always returns best action according to Q-values. 66 | 67 | This is a pure exploitation policy. 68 | """ 69 | 70 | def select_action(self, q_values, **kwargs): # noqa: D102 71 | return np.argmax(q_values) 72 | 73 | class GreedyEpsilonPolicy(Policy): 74 | """Selects greedy action or with some probability a random action. 75 | 76 | Standard greedy-epsilon implementation. With probability epsilon 77 | choose a random action. Otherwise choose the greedy action. 78 | 79 | Parameters 80 | ---------- 81 | epsilon: float 82 | Initial probability of choosing a random action. Can be changed 83 | over time. 84 | """ 85 | def __init__(self, epsilon): 86 | self.epsilon = epsilon 87 | 88 | def select_action(self, q_values, **kwargs): 89 | """Run Greedy-Epsilon for the given Q-values. 90 | 91 | Parameters 92 | ---------- 93 | q_values: array-like 94 | Array-like structure of floats representing the Q-values for 95 | each action. 96 | 97 | Returns 98 | ------- 99 | int: 100 | The action index chosen. 101 | """ 102 | num_actions = q_values.shape[1] 103 | if np.random.rand() < self.epsilon: 104 | return UniformRandomPolicy(num_actions).select_action() 105 | else: 106 | return GreedyPolicy().select_action(q_values) 107 | 108 | class LinearDecayGreedyEpsilonPolicy(Policy): 109 | """Policy with a parameter that decays linearly. 110 | 111 | Like GreedyEpsilonPolicy but the epsilon decays from a start value 112 | to an end value over k steps. 113 | 114 | Parameters 115 | ---------- 116 | start_value: int, float 117 | The initial value of the parameter 118 | end_value: int, float 119 | The value of the policy at the end of the decay. 120 | num_steps: int 121 | The number of steps over which to decay the value. 
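    Examples
    --------
    An illustrative sketch (values are hypothetical): anneal epsilon from 1.0
    to 0.05 over 1000 training steps, then pick an action for a single set of
    Q-values.

    >>> policy = LinearDecayGreedyEpsilonPolicy(1.0, 0.05, 1000)
    >>> q_values = np.array([[0.1, 0.5, 0.2]])
    >>> action = policy.select_action(q_values, is_training=True)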
122 | 123 | """ 124 | 125 | def __init__(self, start_value, end_value, num_steps): # noqa: D102 126 | self.start_value = start_value 127 | self.decay_rate = float(end_value - start_value) / num_steps 128 | self.end_value = end_value 129 | self.step = 0 130 | 131 | def select_action(self, q_values, is_training = True, **kwargs): 132 | """Decay parameter and select action. 133 | 134 | Parameters 135 | ---------- 136 | q_values: np.array 137 | The Q-values for each action. 138 | is_training: bool, optional 139 | If true then parameter will be decayed. Defaults to true. 140 | 141 | Returns 142 | ------- 143 | Any: 144 | Selected action. 145 | """ 146 | epsilon = self.start_value 147 | if is_training: 148 | epsilon += self.decay_rate * self.step 149 | self.step += 1 150 | epsilon = max(epsilon, self.end_value) 151 | return GreedyEpsilonPolicy(epsilon).select_action(q_values) 152 | 153 | def reset(self): 154 | """Start the decay over at the start value.""" 155 | self.step = 0 156 | -------------------------------------------------------------------------------- /deeprl_prj/preprocessors.py: -------------------------------------------------------------------------------- 1 | """Suggested Preprocessors.""" 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | from deeprl_prj import utils 7 | from deeprl_prj.core import Preprocessor 8 | 9 | 10 | class HistoryPreprocessor(Preprocessor): 11 | """Keeps the last k states. 12 | 13 | Useful for domains where you need velocities, but the state 14 | contains only positions. 15 | 16 | When the environment starts, this will just fill the initial 17 | sequence values with zeros k times. 18 | 19 | Parameters 20 | ---------- 21 | history_length: int 22 | Number of previous states to prepend to state being processed. 23 | 24 | """ 25 | 26 | def __init__(self, history_length=1): 27 | self.history_length = history_length 28 | self.past_states = None 29 | self.past_states_ori = None 30 | 31 | def process_state_for_network(self, state): 32 | """You only want history when you're deciding the current action to take.""" 33 | row, col = state.shape 34 | if self.past_states is None: 35 | self.past_states = np.zeros((row, col, self.history_length)) 36 | history = np.dstack((self.past_states, state)) 37 | self.past_states = history[:, :, 1:] 38 | return history 39 | 40 | def process_state_for_network_ori(self, state): 41 | """You only want history when you're deciding the current action to take.""" 42 | row, col = state.shape 43 | channel = 1 44 | if self.past_states_ori is None: 45 | self.past_states_ori = np.zeros((row, col, channel, self.history_length)) 46 | history = np.concatenate((self.past_states_ori, np.expand_dims(state, -1)), axis=3) 47 | self.past_states_ori = history[:, :, :, 1:] 48 | return history 49 | 50 | def reset(self): 51 | """Reset the history sequence. 52 | 53 | Useful when you start a new episode. 54 | """ 55 | self.past_states = None 56 | self.past_states_ori = None 57 | 58 | def get_config(self): 59 | return {'history_length': self.history_length} 60 | 61 | class AtariPreprocessor(Preprocessor): 62 | """Converts images to greyscale and downscales. 63 | 64 | Based on the preprocessing step described in: 65 | 66 | @article{mnih15_human_level_contr_throug_deep_reinf_learn, 67 | author = {Volodymyr Mnih and Koray Kavukcuoglu and David 68 | Silver and Andrei A. Rusu and Joel Veness and Marc 69 | G. Bellemare and Alex Graves and Martin Riedmiller 70 | and Andreas K. 
Fidjeland and Georg Ostrovski and 71 | Stig Petersen and Charles Beattie and Amir Sadik and 72 | Ioannis Antonoglou and Helen King and Dharshan 73 | Kumaran and Daan Wierstra and Shane Legg and Demis 74 | Hassabis}, 75 | title = {Human-Level Control Through Deep Reinforcement 76 | Learning}, 77 | journal = {Nature}, 78 | volume = 518, 79 | number = 7540, 80 | pages = {529-533}, 81 | year = 2015, 82 | doi = {10.1038/nature14236}, 83 | url = {http://dx.doi.org/10.1038/nature14236}, 84 | } 85 | 86 | You may also want to max over frames to remove flickering. Some 87 | games require this (based on animations and the limited sprite 88 | drawing capabilities of the original Atari). 89 | 90 | Parameters 91 | ---------- 92 | new_size: 2 element tuple 93 | The size that each image in the state should be scaled to. e.g 94 | (84, 84) will make each image in the output have shape (84, 84). 95 | """ 96 | 97 | def process_state_for_memory(self, state): 98 | """Scale, convert to greyscale and store as uint8. 99 | 100 | We don't want to save floating point numbers in the replay 101 | memory. We get the same resolution as uint8, but use a quarter 102 | to an eigth of the bytes (depending on float32 or float64) 103 | 104 | We recommend using the Python Image Library (PIL) to do the 105 | image conversions. 106 | """ 107 | img = Image.fromarray(state).convert('L').resize((84, 84), Image.BILINEAR) 108 | state = np.array(img) 109 | return state 110 | 111 | def process_state_for_network(self, state): 112 | """Scale, convert to greyscale and store as float32. 113 | 114 | Basically same as process state for memory, but this time 115 | outputs float32 images. 116 | """ 117 | return np.float32(self.process_state_for_memory(state) / 255.0) 118 | 119 | def process_state_for_network_ori(self, state): 120 | """Scale, convert to greyscale and store as float32. 121 | 122 | Basically same as process state for memory, but this time 123 | outputs float32 images. 124 | """ 125 | img = Image.fromarray(state) 126 | state = np.float32(np.array(img) / 255.0) 127 | return state 128 | 129 | def process_batch(self, samples): 130 | """The batches from replay memory will be uint8, convert to float32. 131 | 132 | Same as process_state_for_network but works on a batch of 133 | samples from the replay memory. Meaning you need to convert 134 | both state and next state values. 135 | """ 136 | batch_size = len(samples) 137 | for i in range(batch_size): 138 | samples[i].state = np.float32(samples[i].state / 255.0) 139 | samples[i].next_state = np.float32(samples[i].next_state / 255.0) 140 | return samples 141 | 142 | def process_reward(self, reward): 143 | """Clip reward between -1 and 1.""" 144 | # return np.clip(reward, -1, 1) 145 | return reward 146 | 147 | def reset(self): 148 | self.last_state = None 149 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import gym 6 | from gym import wrappers 7 | import tensorflow as tf 8 | from future.builtins import input 9 | 10 | # >>>>>>>>>>>>>>>>>>>>>>>> 11 | # Different implementation of DQNAgent 12 | # Uncomment the one you want to train and test 13 | 14 | # Keras implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN. 15 | from deeprl_prj.dqn_keras import DQNAgent 16 | 17 | # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN. 
18 | # from deeprl_prj.dqn_tf_temporalAt import DQNAgent 19 | 20 | # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Spatial Attention DQN. 21 | # from deeprl_prj.dqn_tf_spatialAt import DQNAgent 22 | 23 | # <<<<<<<<<<<<<<<<<<<<<<<<< 24 | 25 | def get_output_folder(args, parent_dir, env_name, task_name): 26 | """Return save folder. 27 | 28 | Assumes folders in the parent_dir have suffix -run{run 29 | number}. Finds the highest run number and sets the output folder 30 | to that number + 1. This is just convenient so that if you run the 31 | same script multiple times tensorboard can plot all of the results 32 | on the same plots with different names. 33 | 34 | Parameters 35 | ---------- 36 | parent_dir: str 37 | Path of the directory containing all experiment runs. 38 | 39 | Returns 40 | ------- 41 | parent_dir/run_dir 42 | Path to this run's save directory. 43 | """ 44 | if not os.path.exists(parent_dir): 45 | os.makedirs(parent_dir) 46 | print('===== Folder did not exist; creating... %s'%parent_dir) 47 | 48 | experiment_id = 0 49 | for folder_name in os.listdir(parent_dir): 50 | if not os.path.isdir(os.path.join(parent_dir, folder_name)): 51 | continue 52 | try: 53 | folder_name = int(folder_name.split('-run')[-1][0]) 54 | print(folder_name) 55 | if folder_name > experiment_id: 56 | experiment_id = folder_name 57 | except: 58 | pass 59 | experiment_id += 1 60 | 61 | parent_dir = os.path.join(parent_dir, env_name) 62 | parent_dir = parent_dir + '-run{}'.format(experiment_id) + '-' + task_name 63 | if not os.path.exists(parent_dir): 64 | os.makedirs(parent_dir) 65 | print('===== Folder did not exist; creating... %s'%parent_dir) 66 | else: 67 | print('===== Folder exists; delete? %s'%parent_dir) 68 | response = input("Press Enter to continue...") 69 | os.system('rm -rf %s/' % (parent_dir)) 70 | os.makedirs(parent_dir+'/videos/') 71 | os.makedirs(parent_dir+'/images/') 72 | os.makedirs(parent_dir+'/losses/') 73 | return parent_dir 74 | 75 | def main(): 76 | parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout') 77 | parser.add_argument('--env', default='QuadCopter-v4', help='Atari env name') 78 | parser.add_argument('-o', '--output', default='./log/', help='Directory to save data to') 79 | parser.add_argument('--seed', default=0, type=int, help='Random seed') 80 | parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor') 81 | parser.add_argument('--batch_size', default=32, type=int, help='Minibatch size') 82 | parser.add_argument('--learning_rate', default=0.0001, type=float, help='Learning rate') 83 | parser.add_argument('--initial_epsilon', default=1.0, type=float, help='Initial exploration probability in epsilon-greedy') 84 | parser.add_argument('--final_epsilon', default=0.05, type=float, help='Final exploration probability in epsilon-greedy') 85 | parser.add_argument('--exploration_steps', default=24000, type=int, help='Number of steps over which the initial value of epsilon is linearly annealed to its final value') 86 | parser.add_argument('--num_samples', default=40000, type=int, help='Number of training samples from the environment in training') 87 | parser.add_argument('--num_frames', default=4, type=int, help='Number of frames to feed to Q-Network') 88 | parser.add_argument('--frame_width', default=84, type=int, help='Resized frame width') 89 | parser.add_argument('--frame_height', default=84, type=int, help='Resized frame height') 90 | parser.add_argument('--replay_memory_size', default=50000, type=int, help='Number of 
replay memory the agent uses for training') 91 | parser.add_argument('--target_update_freq', default=200, type=int, help='The frequency with which the target network is updated') 92 | parser.add_argument('--train_freq', default=4, type=int, help='The frequency of actions wrt Q-network update') 93 | parser.add_argument('--save_freq', default=500, type=int, help='The frequency with which the network is saved') 94 | parser.add_argument('--eval_freq', default=200, type=int, help='The frequency with which the policy is evlauted') 95 | parser.add_argument('--num_burn_in', default=10000, type=int, help='Number of steps to populate the replay memory before training starts') 96 | parser.add_argument('--load_network', default=False, action='store_true', help='Load trained mode') 97 | parser.add_argument('--load_network_path', default='', help='the path to the trained mode file') 98 | parser.add_argument('--net_mode', default='dqn', help='choose the mode of net, can be linear, dqn, duel') 99 | parser.add_argument('--max_episode_length', default = 1000, type=int, help = 'max length of each episode') 100 | parser.add_argument('--num_episodes_at_test', default = 20, type=int, help='Number of episodes the agent plays at test') 101 | parser.add_argument('--ddqn', default=False, dest='ddqn', action='store_true', help='enable ddqn') 102 | parser.add_argument('--train', default=True, dest='train', action='store_true', help='Train mode') 103 | parser.add_argument('--test', dest='train', action='store_false', help='Test mode') 104 | parser.add_argument('--no_experience', default=False, action='store_true', help='do not use experience replay') 105 | parser.add_argument('--no_target', default=False, action='store_true', help='do not use target fixing') 106 | parser.add_argument('--monitor', default=False, action='store_true', help='record video') 107 | parser.add_argument('--task_name', default='', help='task name') 108 | parser.add_argument('--recurrent', default=False, dest='recurrent', action='store_true', help='enable recurrent DQN') 109 | parser.add_argument('--a_t', default=False, dest='a_t', action='store_true', help='enable temporal/spatial attention') 110 | parser.add_argument('--global_a_t', default=False, dest='global_a_t', action='store_true', help='enable global temporal attention') 111 | parser.add_argument('--selector', default=False, dest='selector', action='store_true', help='enable selector for spatial attention') 112 | parser.add_argument('--bidir', default=False, dest='bidir', action='store_true', help='enable two layer bidirectional lstm') 113 | 114 | args = parser.parse_args() 115 | args.output = get_output_folder(args, args.output, args.env, args.task_name) 116 | 117 | env = gym.make(args.env) 118 | print("==== Output saved to: ", args.output) 119 | print("==== Args used:") 120 | print(args) 121 | 122 | # here is where you should start up a session, 123 | # create your DQN agent, create your model, etc. 124 | # then you can run your fit method. 
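    # (No explicit session setup is needed in this script: the Keras
    # implementation in deeprl_prj/dqn_keras.py creates a TF session with
    # gpu_options.allow_growth when it is imported above.)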
125 | 126 | num_actions = env.action_space.n 127 | print(">>>> Game ", args.env, " #actions: ", num_actions) 128 | 129 | dqn = DQNAgent(args, num_actions) 130 | if args.train: 131 | print(">> Training mode.") 132 | dqn.fit(env, args.num_samples, args.max_episode_length) 133 | else: 134 | print(">> Evaluation mode.") 135 | dqn.evaluate(env, args.num_episodes_at_test, 0, args.max_episode_length, args.monitor) 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /deeprl_prj/core.py: -------------------------------------------------------------------------------- 1 | """Core classes.""" 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | class Sample: 7 | """Represents a reinforcement learning sample. 8 | 9 | Used to store observed experience from an MDP. Represents a 10 | standard `(s, a, r, s', terminal)` tuple. 11 | 12 | Parameters 13 | ---------- 14 | state: array-like 15 | Represents the state of the MDP before taking an action. In most 16 | cases this will be a numpy array. 17 | action: int, float, tuple 18 | For discrete action domains this will be an integer. For 19 | continuous action domains this will be a floating point 20 | number. For a parameterized action MDP this will be a tuple 21 | containing the action and its associated parameters. 22 | reward: float 23 | The reward received for executing the given action in the given 24 | state and transitioning to the resulting state. 25 | next_state: array-like 26 | This is the state the agent transitions to after executing the 27 | `action` in `state`. Expected to be the same type/dimensions as 28 | the state. 29 | is_terminal: boolean 30 | True if this action finished the episode. False otherwise. 31 | """ 32 | def __init__(self, state, action, reward, next_state, is_terminal): 33 | self.state = state 34 | self.action = action 35 | self.reward = reward 36 | self.next_state = next_state 37 | self.is_terminal = is_terminal 38 | 39 | class Preprocessor: 40 | """Preprocessor base class. 41 | 42 | This is a suggested interface for the preprocessing steps. 43 | 44 | Preprocessor can be used to perform some fixed operations on the 45 | raw state from an environment. For example, in ConvNet based 46 | networks which use image as the raw state, it is often useful to 47 | convert the image to greyscale or downsample the image. 48 | 49 | Preprocessors are implemented as class so that they can have 50 | internal state. This can be useful for things like the 51 | AtariPreproccessor which maxes over k frames. 52 | 53 | If you're using internal states, such as for keeping a sequence of 54 | inputs like in Atari, you should probably call reset when a new 55 | episode begins so that state doesn't leak in from episode to 56 | episode. 57 | """ 58 | 59 | def process_state_for_network(self, state): 60 | """Preprocess the given state before giving it to the network. 61 | 62 | Should be called just before the action is selected. 63 | 64 | This is a different method from the process_state_for_memory 65 | because the replay memory may require a different storage 66 | format to reduce memory usage. For example, storing images as 67 | uint8 in memory is a lot more efficient thant float32, but the 68 | networks work better with floating point images. 69 | 70 | Parameters 71 | ---------- 72 | state: np.ndarray 73 | Generally a numpy array. A single state from an environment. 74 | 75 | Returns 76 | ------- 77 | processed_state: np.ndarray 78 | Generally a numpy array. 
The state after processing. Can be 79 | modified in anyway. 80 | """ 81 | return state 82 | 83 | def process_state_for_memory(self, state): 84 | """Preprocess the given state before giving it to the replay memory. 85 | 86 | Should be called just before appending this to the replay memory. 87 | 88 | This is a different method from the process_state_for_network 89 | because the replay memory may require a different storage 90 | format to reduce memory usage. For example, storing images as 91 | uint8 in memory and the network expecting images in floating 92 | point. 93 | 94 | Parameters 95 | ---------- 96 | state: np.ndarray 97 | A single state from an environmnet. Generally a numpy array. 98 | 99 | Returns 100 | ------- 101 | processed_state: np.ndarray 102 | Generally a numpy array. The state after processing. Can be 103 | modified in any manner. 104 | """ 105 | return state 106 | 107 | def process_batch(self, samples): 108 | """Process batch of samples. 109 | 110 | If your replay memory storage format is different than your 111 | network input, you may want to apply this function to your 112 | sampled batch before running it through your update function. 113 | 114 | Parameters 115 | ---------- 116 | samples: list(tensorflow_rl.core.Sample) 117 | List of samples to process 118 | 119 | Returns 120 | ------- 121 | processed_samples: list(tensorflow_rl.core.Sample) 122 | Samples after processing. Can be modified in anyways, but 123 | the list length will generally stay the same. 124 | """ 125 | return samples 126 | 127 | def process_reward(self, reward): 128 | """Process the reward. 129 | 130 | Useful for things like reward clipping. The Atari environments 131 | from DQN paper do this. Instead of taking real score, they 132 | take the sign of the delta of the score. 133 | 134 | Parameters 135 | ---------- 136 | reward: float 137 | Reward to process 138 | 139 | Returns 140 | ------- 141 | processed_reward: float 142 | The processed reward 143 | """ 144 | return reward 145 | 146 | def reset(self): 147 | """Reset any internal state. 148 | 149 | Will be called at the start of every new episode. Makes it 150 | possible to do history snapshots. 151 | """ 152 | pass 153 | 154 | class ReplayMemory: 155 | """Interface for replay memories. 156 | 157 | Methods 158 | ------- 159 | append(state, action, reward, debug_info=None) 160 | Add a sample to the replay memory. 161 | end_episode(final_state, is_terminal, debug_info=None) 162 | Set the final state of an episode and mark whether it was a true 163 | terminal state (i.e. the env returned is_terminal=True), of it 164 | is an artificial terminal state (i.e. agent quit the episode 165 | early, but agent could have kept running episode). 166 | sample(batch_size, indexes=None) 167 | Return list of samples from the memory. Each class will 168 | implement a different method of choosing the 169 | samples. Optionally, specify the sample indexes manually. 170 | clear() 171 | Reset the memory. Deletes all references to the samples. 172 | """ 173 | def __init__(self, args): 174 | """Setup memory. 175 | 176 | You should specify the maximum size o the memory. Once the 177 | memory fills up oldest values should be removed. You can try 178 | the collections.deque class as the underlying storage, but 179 | your sample method will be very slow. 180 | 181 | We recommend using a list as a ring buffer. Just track the 182 | index where the next sample should be inserted in the list. 
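        Examples
        --------
        A usage sketch mirroring how the agent drives this class (``args`` is
        the argparse namespace from main.py; the state/action variables are
        placeholders):

        >>> memory = ReplayMemory(args)
        >>> memory.append(processed_state, action, reward, is_terminal=False)
        >>> batch = memory.sample(args.batch_size)  # list of Sample objects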
183 | """ 184 | self.memory_size = args.replay_memory_size 185 | self.history_length = args.num_frames 186 | self.actions = np.zeros(self.memory_size, dtype = np.int8) 187 | self.rewards = np.zeros(self.memory_size, dtype = np.int8) 188 | self.screens = np.zeros((self.memory_size, args.frame_height, args.frame_width), dtype = np.uint8) 189 | self.terminals = np.zeros(self.memory_size, dtype = np.bool) 190 | self.current = 0 191 | 192 | def append(self, state, action, reward, is_terminal): 193 | self.actions[self.current % self.memory_size] = action 194 | self.rewards[self.current % self.memory_size] = reward 195 | self.screens[self.current % self.memory_size] = state 196 | self.terminals[self.current % self.memory_size] = is_terminal 197 | # img = Image.fromarray(state, mode = 'L') 198 | # path = "./tmp/%05d-%s.png" % (self.current, is_terminal) 199 | # img.save(path) 200 | self.current += 1 201 | 202 | def get_state(self, index): 203 | state = self.screens[index - self.history_length + 1:index + 1, :, :] 204 | # history dimention last 205 | return np.transpose(state, (1, 2, 0)) 206 | 207 | def sample(self, batch_size): 208 | samples = [] 209 | indexes = [] 210 | # ensure enough frames to sample 211 | assert self.current > self.history_length 212 | # -1 because still need next frame 213 | end = min(self.current, self.memory_size) - 1 214 | 215 | while len(indexes) < batch_size: 216 | index = np.random.randint(self.history_length - 1, end) 217 | # sampled state shouldn't contain episode end 218 | if self.terminals[index - self.history_length + 1: index + 1].any(): 219 | continue 220 | indexes.append(index) 221 | 222 | for idx in indexes: 223 | new_sample = Sample(self.get_state(idx), self.actions[idx], 224 | self.rewards[idx], self.get_state(idx + 1), self.terminals[idx]) 225 | samples.append(new_sample) 226 | return samples 227 | 228 | def clear(self): 229 | self.current = 0 230 | -------------------------------------------------------------------------------- /deeprl_prj/dqn_keras.py: -------------------------------------------------------------------------------- 1 | '''Keras DQN Agent implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.''' 2 | 3 | from deeprl_prj.policy import * 4 | from deeprl_prj.objectives import * 5 | from deeprl_prj.preprocessors import * 6 | from deeprl_prj.utils import * 7 | from deeprl_prj.core import * 8 | 9 | import keras 10 | from keras.optimizers import (Adam, RMSprop) 11 | from keras.layers import (Activation, Convolution2D, Dense, Flatten, Input, 12 | Permute, merge, Merge, Lambda, Reshape, TimeDistributed, LSTM, RepeatVector, Permute, multiply) 13 | from keras.layers.wrappers import Bidirectional 14 | from keras.models import Model 15 | from keras import backend as K 16 | from keras.backend.tensorflow_backend import set_session 17 | 18 | import sys 19 | from gym import wrappers 20 | import tensorflow as tf 21 | import numpy as np 22 | 23 | config = tf.ConfigProto() 24 | config.gpu_options.allow_growth = True 25 | config.allow_soft_placement = True 26 | set_session(tf.Session(config=config)) 27 | 28 | def create_model(input_shape, num_actions, mode, args, model_name='q_network'): # noqa: D103 29 | """Create the Q-network model. 30 | 31 | Use Keras to construct a keras.models.Model instance. 32 | 33 | Parameters 34 | ---------- 35 | window: int 36 | Each input to the network is a sequence of frames. This value 37 | defines how many frames are in the sequence. 
38 | input_shape: tuple(int, int, int), rows, cols, channels 39 | The expected input image size. 40 | num_actions: int 41 | Number of possible actions. Defined by the gym environment. 42 | model_name: str 43 | Useful when debugging. Makes the model show up nicer in tensorboard. 44 | 45 | Returns 46 | ------- 47 | keras.models.Model 48 | The Q-model. 49 | """ 50 | assert(mode in ("linear", "duel", "dqn")) 51 | with tf.variable_scope(model_name): 52 | input_data = Input(shape = input_shape, name = "input") 53 | if mode == "linear": # We will never enter this loop 54 | flatten_hidden = Flatten(name = "flatten")(input_data) #(H, W, D, Batch) 55 | output = Dense(num_actions, name = "output")(flatten_hidden) 56 | # Directly come here for DQN 57 | else: 58 | if not(args.recurrent): # Only when "not" using DRQN 59 | h1 = Convolution2D(32, (8, 8), strides = 4, activation = "relu", name = "conv1")(input_data) 60 | h2 = Convolution2D(64, (4, 4), strides = 2, activation = "relu", name = "conv2")(h1) 61 | h3 = Convolution2D(64, (3, 3), strides = 1, activation = "relu", name = "conv3")(h2) 62 | context = Flatten(name = "flatten")(h3) 63 | # ENTER HERE FOR DRQN 64 | else: 65 | print('>>>> Defining Recurrent Modules...') 66 | input_data_expanded = Reshape((input_shape[0], input_shape[1], input_shape[2], 1), input_shape = input_shape) (input_data) 67 | input_data_TimeDistributed = Permute((3, 1, 2, 4), input_shape=input_shape)(input_data_expanded) # (D, H, W, Batch) 68 | h1 = TimeDistributed(Convolution2D(32, (8, 8), strides = 4, activation = "relu", name = "conv1"), \ 69 | input_shape=(args.num_frames, input_shape[0], input_shape[1], 1))(input_data_TimeDistributed) 70 | h2 = TimeDistributed(Convolution2D(64, (4, 4), strides = 2, activation = "relu", name = "conv2"))(h1) 71 | h3 = TimeDistributed(Convolution2D(64, (3, 3), strides = 1, activation = "relu", name = "conv3"))(h2) 72 | flatten_hidden = TimeDistributed(Flatten())(h3) 73 | hidden_input = TimeDistributed(Dense(512, activation = 'relu', name = 'flat_to_512')) (flatten_hidden) 74 | if not(args.a_t): 75 | context = LSTM(512, return_sequences=False, stateful=False, input_shape=(args.num_frames, 512)) (hidden_input) 76 | else: 77 | if args.bidir: 78 | hidden_input = Bidirectional(LSTM(512, return_sequences=True, stateful=False, input_shape=(args.num_frames, 512)), merge_mode='sum') (hidden_input) 79 | all_outs = Bidirectional(LSTM(512, return_sequences=True, stateful=False, input_shape=(args.num_frames, 512)), merge_mode='sum') (hidden_input) 80 | else: 81 | all_outs = LSTM(512, return_sequences=True, stateful=False, input_shape=(args.num_frames, 512)) (hidden_input) 82 | # attention 83 | attention = TimeDistributed(Dense(1, activation='tanh'))(all_outs) 84 | # print(attention.shape) 85 | attention = Flatten()(attention) 86 | attention = Activation('softmax')(attention) 87 | attention = RepeatVector(512)(attention) 88 | attention = Permute([2, 1])(attention) 89 | sent_representation = merge([all_outs, attention], mode='mul') 90 | context = Lambda(lambda xin: K.sum(xin, axis=-2), output_shape=(512,))(sent_representation) 91 | # print(context.shape) 92 | 93 | if mode == "dqn": 94 | h4 = Dense(512, activation='relu', name = "fc")(context) 95 | output = Dense(num_actions, name = "output")(h4) 96 | elif mode == "duel": 97 | value_hidden = Dense(512, activation = 'relu', name = 'value_fc')(context) 98 | value = Dense(1, name = "value")(value_hidden) 99 | action_hidden = Dense(512, activation = 'relu', name = 'action_fc')(context) 100 | action = 
Dense(num_actions, name = "action")(action_hidden) 101 | action_mean = Lambda(lambda x: tf.reduce_mean(x, axis = 1, keep_dims = True), name = 'action_mean')(action) 102 | output = Lambda(lambda x: x[0] + x[1] - x[2], name = 'output')([action, value, action_mean]) 103 | model = Model(inputs = input_data, outputs = output) 104 | print(model.summary()) 105 | return model 106 | 107 | def save_scalar(step, name, value, writer): 108 | """Save a scalar value to tensorboard. 109 | Parameters 110 | ---------- 111 | step: int 112 | Training step (sets the position on x-axis of tensorboard graph. 113 | name: str 114 | Name of variable. Will be the name of the graph in tensorboard. 115 | value: float 116 | The value of the variable at this step. 117 | writer: tf.FileWriter 118 | The tensorboard FileWriter instance. 119 | """ 120 | summary = tf.Summary() 121 | summary_value = summary.value.add() 122 | summary_value.simple_value = float(value) 123 | summary_value.tag = name 124 | writer.add_summary(summary, step) 125 | 126 | class DQNAgent: 127 | """Class implementing DQN. 128 | 129 | This is a basic outline of the functions/parameters to implement the DQNAgnet. 130 | 131 | Parameters 132 | ---------- 133 | q_network: keras.models.Model 134 | Your Q-network model. 135 | preprocessor: deeprl_hw2.core.Preprocessor 136 | The preprocessor class. See the associated classes for more 137 | details. 138 | memory: deeprl_hw2.core.Memory 139 | Your replay memory. 140 | gamma: float 141 | Discount factor. 142 | target_update_freq: float 143 | Frequency to update the target network. You can either provide a 144 | number representing a soft target update (see utils.py) or a 145 | hard target update (see utils.py and Atari paper.) 146 | num_burn_in: int 147 | Before you begin updating the Q-network your replay memory has 148 | to be filled up with some number of samples. This number says 149 | how many. 150 | train_freq: int 151 | How often you actually update your Q-Network. Sometimes 152 | stability is improved if you collect a couple samples for your 153 | replay memory, for every Q-network update that you run. 154 | batch_size: int 155 | How many samples in each minibatch. 
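    Examples
    --------
    Construction and training mirror main.py (``args`` is the parsed argparse
    namespace, ``env`` a gym environment):

    >>> agent = DQNAgent(args, num_actions=env.action_space.n)
    >>> agent.fit(env, args.num_samples, args.max_episode_length)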
156 | """ 157 | def __init__(self, args, num_actions): 158 | self.num_actions = num_actions 159 | input_shape = (args.frame_height, args.frame_width, args.num_frames) 160 | self.history_processor = HistoryPreprocessor(args.num_frames - 1) 161 | self.atari_processor = AtariPreprocessor() 162 | self.memory = ReplayMemory(args) 163 | self.policy = LinearDecayGreedyEpsilonPolicy(args.initial_epsilon, args.final_epsilon, args.exploration_steps) 164 | self.gamma = args.gamma 165 | self.target_update_freq = args.target_update_freq 166 | self.num_burn_in = args.num_burn_in 167 | self.train_freq = args.train_freq 168 | self.batch_size = args.batch_size 169 | self.learning_rate = args.learning_rate 170 | self.frame_width = args.frame_width 171 | self.frame_height = args.frame_height 172 | self.num_frames = args.num_frames 173 | self.output_path = args.output 174 | self.output_path_videos = args.output + '/videos/' 175 | self.save_freq = args.save_freq 176 | self.load_network = args.load_network 177 | self.load_network_path = args.load_network_path 178 | self.enable_ddqn = args.ddqn 179 | self.net_mode = args.net_mode 180 | self.q_network = create_model(input_shape, num_actions, self.net_mode, args, "QNet") 181 | self.target_network = create_model(input_shape, num_actions, self.net_mode, args, "TargetNet") 182 | print(">>>> Net mode: %s, Using double dqn: %s" % (self.net_mode, self.enable_ddqn)) 183 | self.eval_freq = args.eval_freq 184 | self.no_experience = args.no_experience 185 | self.no_target = args.no_target 186 | print(">>>> Target fixing: %s, Experience replay: %s" % (not self.no_target, not self.no_experience)) 187 | 188 | # initialize target network 189 | self.target_network.set_weights(self.q_network.get_weights()) 190 | self.final_model = None 191 | self.compile() 192 | 193 | self.writer = tf.summary.FileWriter(self.output_path) 194 | 195 | print("*******__init__", input_shape) 196 | 197 | def compile(self, optimizer = None, loss_func = None): 198 | """Setup all of the TF graph variables/ops. 199 | 200 | This is inspired by the compile method on the 201 | keras.models.Model class. 202 | 203 | This is the place to create the target network, setup 204 | loss function and any placeholders. 205 | """ 206 | if loss_func is None: 207 | loss_func = mean_huber_loss 208 | # loss_func = 'mse' 209 | if optimizer is None: 210 | optimizer = Adam(lr = self.learning_rate) 211 | # optimizer = RMSprop(lr=0.00025) 212 | with tf.variable_scope("Loss"): 213 | state = Input(shape = (self.frame_height, self.frame_width, self.num_frames) , name = "states") 214 | action_mask = Input(shape = (self.num_actions,), name = "actions") 215 | qa_value = self.q_network(state) 216 | qa_value = merge([qa_value, action_mask], mode = 'mul', name = "multiply") 217 | qa_value = Lambda(lambda x: tf.reduce_sum(x, axis=1, keep_dims = True), name = "sum")(qa_value) 218 | 219 | self.final_model = Model(inputs = [state, action_mask], outputs = qa_value) 220 | self.final_model.compile(loss=loss_func, optimizer=optimizer) 221 | 222 | def calc_q_values(self, state): 223 | """Given a state (or batch of states) calculate the Q-values. 224 | 225 | Basically run your network on these states. 226 | 227 | Return 228 | ------ 229 | Q-values for the state(s) 230 | """ 231 | state = state[None, :, :, :] 232 | return self.q_network.predict_on_batch(state) 233 | 234 | def select_action(self, state, is_training = True, **kwargs): 235 | """Select the action based on the current state. 
236 | 237 | Returns 238 | -------- 239 | selected action 240 | """ 241 | q_values = self.calc_q_values(state) 242 | if is_training: 243 | if kwargs['policy_type'] == 'UniformRandomPolicy': 244 | return UniformRandomPolicy(self.num_actions).select_action() 245 | else: 246 | # linear decay greedy epsilon policy 247 | return self.policy.select_action(q_values, is_training) 248 | else: 249 | # return GreedyEpsilonPolicy(0.05).select_action(q_values) 250 | return GreedyPolicy().select_action(q_values) 251 | 252 | def update_policy(self, current_sample): 253 | """Update your policy. 254 | 255 | Behavior may differ based on what stage of training your 256 | in. If you're in training mode then you should check if you 257 | should update your network parameters based on the current 258 | step and the value you set for train_freq. 259 | 260 | Inside, you'll want to sample a minibatch, calculate the 261 | target values, update your network, and then update your 262 | target values. 263 | 264 | You might want to return the loss and other metrics as an 265 | output. They can help you monitor how training is going. 266 | """ 267 | batch_size = self.batch_size 268 | 269 | if self.no_experience: 270 | states = np.stack([current_sample.state]) 271 | next_states = np.stack([current_sample.next_state]) 272 | rewards = np.asarray([current_sample.reward]) 273 | mask = np.asarray([1 - int(current_sample.is_terminal)]) 274 | 275 | action_mask = np.zeros((1, self.num_actions)) 276 | action_mask[0, current_sample.action] = 1.0 277 | else: 278 | samples = self.memory.sample(batch_size) 279 | samples = self.atari_processor.process_batch(samples) 280 | 281 | states = np.stack([x.state for x in samples]) 282 | actions = np.asarray([x.action for x in samples]) 283 | action_mask = np.zeros((batch_size, self.num_actions)) 284 | action_mask[range(batch_size), actions] = 1.0 285 | 286 | next_states = np.stack([x.next_state for x in samples]) 287 | mask = np.asarray([1 - int(x.is_terminal) for x in samples]) 288 | rewards = np.asarray([x.reward for x in samples]) 289 | 290 | if self.no_target: 291 | next_qa_value = self.q_network.predict_on_batch(next_states) 292 | else: 293 | next_qa_value = self.target_network.predict_on_batch(next_states) 294 | 295 | if self.enable_ddqn: 296 | qa_value = self.q_network.predict_on_batch(next_states) 297 | max_actions = np.argmax(qa_value, axis = 1) 298 | next_qa_value = next_qa_value[range(batch_size), max_actions] 299 | else: 300 | next_qa_value = np.max(next_qa_value, axis = 1) 301 | target = rewards + self.gamma * mask * next_qa_value 302 | 303 | return self.final_model.train_on_batch([states, action_mask], target), np.mean(target) 304 | 305 | def fit(self, env, num_iterations, max_episode_length=None): 306 | """Fit your model to the provided environment. 307 | 308 | This is where you sample actions from your network, 309 | collect experience samples and add them to your replay memory, 310 | and update your network parameters. 311 | 312 | Parameters 313 | ---------- 314 | env: gym.Env 315 | This is the Atari environment. 316 | num_iterations: int 317 | How many samples/updates to perform. 318 | max_episode_length: int 319 | How long a single episode should last before the agent 320 | resets. Can help exploration. 
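        Notes
        -----
        Scheduling used by the loop below: a uniform random policy fills the
        replay memory for the first ``num_burn_in`` steps; after burn-in, the
        online network is updated every ``train_freq`` steps, hard-copied to
        the target network every ``train_freq * target_update_freq`` steps,
        and checkpointed every ``save_freq`` steps.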
321 | """ 322 | is_training = True 323 | print("Training starts.") 324 | self.save_model(0) 325 | eval_count = 0 326 | 327 | state = env.reset() 328 | 329 | burn_in = True 330 | idx_episode = 1 331 | episode_loss = .0 332 | episode_frames = 0 333 | episode_reward = .0 334 | episode_raw_reward = .0 335 | episode_target_value = .0 336 | 337 | # Logs 338 | losses_list = list() 339 | step_loss_list = list() 340 | step_reward = 0.0 341 | step_reward_raw = 0.0 342 | 343 | for t in range(self.num_burn_in + num_iterations): 344 | print ("iteration --> %s, episode --> %s" % (t, idx_episode)) 345 | action_state = self.history_processor.process_state_for_network( 346 | self.atari_processor.process_state_for_network(state)) 347 | policy_type = "UniformRandomPolicy" if burn_in else "LinearDecayGreedyEpsilonPolicy" 348 | action = self.select_action(action_state, is_training, policy_type = policy_type) 349 | processed_state = self.atari_processor.process_state_for_memory(state) 350 | 351 | # print("******* fit_action", action_state.shape) 352 | # print("******* fit_proecess", processed_state.shape) 353 | 354 | env.render() 355 | state, reward, done, info = env.step(action) 356 | 357 | processed_next_state = self.atari_processor.process_state_for_network(state) 358 | action_next_state = np.dstack((action_state, processed_next_state)) 359 | action_next_state = action_next_state[:, :, 1:] 360 | 361 | processed_reward = self.atari_processor.process_reward(reward) 362 | 363 | self.memory.append(processed_state, action, processed_reward, done) 364 | current_sample = Sample(action_state, action, processed_reward, action_next_state, done) 365 | 366 | if not burn_in: 367 | episode_frames += 1 368 | episode_reward += processed_reward 369 | episode_raw_reward += reward 370 | if episode_frames > max_episode_length: 371 | done = True 372 | 373 | if not burn_in: 374 | step_reward += processed_reward 375 | step_reward_raw += reward 376 | step_losses = [t-last_burn-1, step_reward, step_reward_raw, step_reward / (t-last_burn-1), step_reward_raw / (t-last_burn-1)] 377 | step_loss_list.append(step_losses) 378 | 379 | 380 | if done: 381 | # adding last frame only to save last state 382 | last_frame = self.atari_processor.process_state_for_memory(state) 383 | # action, reward, done doesn't matter here 384 | self.memory.append(last_frame, action, 0, done) 385 | if not burn_in: 386 | avg_target_value = episode_target_value / episode_frames 387 | print(">>> Training: time %d, episode %d, length %d, reward %.0f, raw_reward %.0f, loss %.4f, target value %.4f, policy step %d, memory cap %d" % 388 | (t, idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, 389 | avg_target_value, self.policy.step, self.memory.current)) 390 | sys.stdout.flush() 391 | save_scalar(idx_episode, 'train/episode_frames', episode_frames, self.writer) 392 | save_scalar(idx_episode, 'train/episode_reward', episode_reward, self.writer) 393 | save_scalar(idx_episode, 'train/episode_raw_reward', episode_raw_reward, self.writer) 394 | save_scalar(idx_episode, 'train/episode_loss', episode_loss, self.writer) 395 | save_scalar(idx_episode, 'train_avg/avg_reward', episode_reward / episode_frames, self.writer) 396 | save_scalar(idx_episode, 'train_avg/avg_target_value', avg_target_value, self.writer) 397 | save_scalar(idx_episode, 'train_avg/avg_loss', episode_loss / episode_frames, self.writer) 398 | 399 | # log losses 400 | losses = [idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, episode_reward / 
episode_frames, avg_target_value, episode_loss / episode_frames] 401 | losses_list.append(losses) 402 | 403 | # reset values 404 | episode_frames = 0 405 | episode_reward = .0 406 | episode_raw_reward = .0 407 | episode_loss = .0 408 | episode_target_value = .0 409 | idx_episode += 1 410 | burn_in = (t < self.num_burn_in) 411 | state = env.reset() 412 | self.atari_processor.reset() 413 | self.history_processor.reset() 414 | 415 | if burn_in: 416 | last_burn = t 417 | 418 | if not burn_in: 419 | if t % self.train_freq == 0: 420 | loss, target_value = self.update_policy(current_sample) 421 | episode_loss += loss 422 | episode_target_value += target_value 423 | # update freq is based on train_freq 424 | if t % (self.train_freq * self.target_update_freq) == 0: 425 | # target updates can have the option to be hard or soft 426 | # related functions are defined in deeprl_prj.utils 427 | # here we use hard target update as default 428 | self.target_network.set_weights(self.q_network.get_weights()) 429 | if t % self.save_freq == 0: 430 | self.save_model(idx_episode) 431 | 432 | loss_array = np.asarray(losses_list) 433 | print (loss_array.shape) # 10 element vector 434 | 435 | # loss_path = os.path.join('./losses/loss_episode%s.csv' % (idx_episode)) 436 | loss_path = self.output_path + "/losses/loss_episodes" + str(idx_episode) + ".csv" 437 | np.savetxt(loss_path, loss_array, fmt='%.5f', delimiter=',') 438 | 439 | step_loss_array = np.asarray(step_loss_list) 440 | print (step_loss_array.shape) # 10 element vector 441 | 442 | step_loss_path = self.output_path + "/losses/loss_steps" + str(t-last_burn-1) + ".csv" 443 | np.savetxt(step_loss_path, step_loss_array, fmt='%.5f', delimiter=',') 444 | 445 | 446 | # No evaluation while training 447 | # if t % (self.eval_freq * self.train_freq) == 0: 448 | # episode_reward_mean, episode_reward_std, eval_count = self.evaluate(env, 1, eval_count, max_episode_length, True) 449 | # save_scalar(t, 'eval/eval_episode_reward_mean', episode_reward_mean, self.writer) 450 | # save_scalar(t, 'eval/eval_episode_reward_std', episode_reward_std, self.writer) 451 | 452 | self.save_model(idx_episode) 453 | 454 | 455 | def save_model(self, idx_episode): 456 | safe_path = self.output_path + "/qnet" + str(idx_episode) + ".h5" 457 | self.q_network.save_weights(safe_path) 458 | print("Network at", idx_episode, "saved to:", safe_path) 459 | 460 | def evaluate(self, env, num_episodes, eval_count, max_episode_length=None, monitor=False): 461 | """Test your agent with a provided environment. 462 | 463 | Basically run your policy on the environment and collect stats 464 | like cumulative reward, average episode length, etc. 465 | 466 | You can also call the render function here if you want to 467 | visually inspect your policy. 
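        Examples
        --------
        Mirrors the test branch of main.py (the argument values shown are that
        script's defaults):

        >>> reward_mean, reward_std, eval_count = agent.evaluate(
        ...     env, num_episodes=20, eval_count=0,
        ...     max_episode_length=1000, monitor=False)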
468 | """ 469 | print("Evaluation starts.") 470 | 471 | is_training = False 472 | if self.load_network: 473 | self.q_network.load_weights(self.load_network_path) 474 | print("Load network from:", self.load_network_path) 475 | # if monitor: 476 | # env = wrappers.Monitor(env, self.output_path_videos, video_callable=lambda x:True, resume=True) 477 | state = env.reset() 478 | 479 | idx_episode = 1 480 | episode_frames = 0 481 | episode_reward = np.zeros(num_episodes) 482 | t = 0 483 | 484 | while idx_episode <= num_episodes: 485 | t += 1 486 | action_state = self.history_processor.process_state_for_network( 487 | self.atari_processor.process_state_for_network(state)) 488 | action = self.select_action(action_state, is_training, policy_type = 'GreedyEpsilonPolicy') 489 | state, reward, done, info = env.step(action) 490 | episode_frames += 1 491 | episode_reward[idx_episode-1] += reward 492 | if episode_frames > max_episode_length: 493 | done = True 494 | if done: 495 | print("Eval: time %d, episode %d, length %d, reward %.0f" % 496 | (t, idx_episode, episode_frames, episode_reward[idx_episode-1])) 497 | eval_count += 1 498 | save_scalar(eval_count, 'eval/eval_episode_raw_reward', episode_reward[idx_episode-1], self.writer) 499 | save_scalar(eval_count, 'eval/eval_episode_raw_length', episode_frames, self.writer) 500 | sys.stdout.flush() 501 | state = env.reset() 502 | episode_frames = 0 503 | idx_episode += 1 504 | self.atari_processor.reset() 505 | self.history_processor.reset() 506 | 507 | reward_mean = np.mean(episode_reward) 508 | reward_std = np.std(episode_reward) 509 | print("Evaluation summury: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" % 510 | (num_episodes, reward_mean, reward_std)) 511 | sys.stdout.flush() 512 | 513 | return reward_mean, reward_std, eval_count 514 | -------------------------------------------------------------------------------- /deeprl_prj/dqn_tf_temporalAt.py: -------------------------------------------------------------------------------- 1 | '''Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.''' 2 | 3 | from deeprl_prj.policy import * 4 | from deeprl_prj.objectives import * 5 | from deeprl_prj.preprocessors import * 6 | from deeprl_prj.utils import * 7 | from deeprl_prj.core import * 8 | from helper import * 9 | 10 | import numpy as np 11 | import sys 12 | from gym import wrappers 13 | import tensorflow as tf 14 | print(tf.__version__) 15 | 16 | """Main DQN agent.""" 17 | 18 | class Qnetwork(): 19 | def __init__(self, args, h_size, num_frames, num_actions, rnn_cell_1, myScope, rnn_cell_2=None): 20 | #The network recieves a frame from the game, flattened into an array. 21 | #It then resizes it and processes it through four convolutional layers. 
22 | self.imageIn = tf.placeholder(shape=[None,84,84,num_frames],dtype=tf.float32) 23 | self.image_permute = tf.transpose(self.imageIn, perm=[0, 3, 1, 2]) 24 | self.image_reshape = tf.reshape(self.image_permute, [-1, 84, 84, 1]) 25 | self.image_reshape_recoverd = tf.squeeze(tf.gather(tf.reshape(self.image_reshape, [-1, num_frames, 84, 84, 1]), [0]), [0]) 26 | self.summary_merged = tf.summary.merge([tf.summary.image('image_reshape_recoverd', self.image_reshape_recoverd, max_outputs=num_frames)]) 27 | # self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,1]) 28 | self.conv1 = tf.contrib.layers.convolution2d( \ 29 | inputs=self.image_reshape,num_outputs=32,\ 30 | kernel_size=[8,8],stride=[4,4],padding='VALID', \ 31 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv1') 32 | self.conv2 = tf.contrib.layers.convolution2d( \ 33 | inputs=self.conv1,num_outputs=64,\ 34 | kernel_size=[4,4],stride=[2,2],padding='VALID', \ 35 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv2') 36 | self.conv3 = tf.contrib.layers.convolution2d( \ 37 | inputs=self.conv2,num_outputs=64,\ 38 | kernel_size=[3,3],stride=[1,1],padding='VALID', \ 39 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv3') 40 | self.conv4 = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.conv3), h_size, activation_fn=tf.nn.relu) 41 | 42 | #We take the output from the final convolutional layer and send it to a recurrent layer. 43 | #The input must be reshaped into [batch x trace x units] for rnn processing, 44 | #and then returned to [batch x units] when sent through the upper levels. 45 | self.batch_size = tf.placeholder(dtype=tf.int32) 46 | self.convFlat = tf.reshape(self.conv4,[self.batch_size, num_frames, h_size]) 47 | self.state_in_1 = rnn_cell_1.zero_state(self.batch_size, tf.float32) 48 | 49 | if args.bidir: 50 | self.state_in_2 = rnn_cell_2.zero_state(self.batch_size, tf.float32) 51 | self.rnn_outputs_tuple, self.rnn_state = tf.nn.bidirectional_dynamic_rnn(\ 52 | cell_fw=rnn_cell_1, cell_bw=rnn_cell_2, inputs=self.convFlat, dtype=tf.float32, \ 53 | initial_state_fw=self.state_in_1, initial_state_bw=self.state_in_2, scope=myScope+'_rnn') 54 | # print "====== len(self.rnn_outputs_tuple), self.rnn_outputs_tuple[0] ", len(self.rnn_outputs_tuple), self.rnn_outputs_tuple[0].get_shape().as_list(), self.rnn_outputs_tuple[1].get_shape().as_list() # [None, 10, 512] 55 | # As we have Bi-LSTM, we have two output, which are not connected. 
So merge them 56 | self.rnn_outputs = tf.concat([self.rnn_outputs_tuple[0], self.rnn_outputs_tuple[1]], axis=2) 57 | # self.rnn_outputs = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.rnn_outputs_double), h_size, activation_fn=None) 58 | self.rnn_output_dim = h_size * 2 59 | else: 60 | self.rnn_outputs, self.rnn_state = tf.nn.dynamic_rnn(\ 61 | inputs=self.convFlat,cell=rnn_cell_1, dtype=tf.float32, \ 62 | initial_state=self.state_in_1, scope=myScope+'_rnn') 63 | # print "====== self.rnn_outputs ", self.rnn_outputs.get_shape().as_list() # [None, 10, 512] 64 | self.rnn_output_dim = h_size 65 | 66 | # attention machanism 67 | if not(args.a_t): 68 | self.rnn_last_output = tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]) 69 | self.rnn = tf.squeeze(self.rnn_last_output, [1]) 70 | else: 71 | if args.global_a_t: 72 | self.rnn_outputs_before = tf.slice(self.rnn_outputs, [0, 0, 0], [-1, num_frames-1, -1]) 73 | self.attention_v = tf.reshape(tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]), [-1, self.rnn_output_dim, 1]) 74 | self.attention_va = tf.tanh(tf.matmul(self.rnn_outputs_before, self.attention_v)) 75 | self.attention_a = tf.nn.softmax(self.attention_va, dim=1) 76 | self.rnn = tf.reduce_sum(tf.multiply(self.rnn_outputs_before, self.attention_a), axis=1) 77 | self.rnn = tf.concat([self.rnn, tf.squeeze(tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]), [1])], axis=1) 78 | else: 79 | with tf.variable_scope(myScope+'_attention'): 80 | self.attention_v = tf.get_variable(name='atten_v', shape=[self.rnn_output_dim, 1], initializer=tf.contrib.layers.xavier_initializer()) 81 | self.attention_va = tf.tanh(tf.map_fn(lambda x: tf.matmul(x, self.attention_v), self.rnn_outputs)) 82 | self.attention_a = tf.nn.softmax(self.attention_va, dim=1) 83 | self.rnn = tf.reduce_sum(tf.multiply(self.rnn_outputs, self.attention_a), axis=1) 84 | # print "========== self.rnn ", self.rnn.get_shape().as_list() #[None, 1024] 85 | 86 | if args.net_mode == "duel": 87 | #The output from the recurrent player is then split into separate Value and Advantage streams 88 | self.ad_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_advantage_hidden') 89 | self.Advantage = tf.contrib.layers.fully_connected(self.ad_hidden, num_actions, activation_fn=None, scope=myScope+'_fc_advantage') 90 | self.value_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_value_hidden') 91 | self.Value = tf.contrib.layers.fully_connected(self.value_hidden, 1, activation_fn=None, scope=myScope+'_fc_value') 92 | #Then combine them together to get our final Q-values. 93 | self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True)) 94 | else: 95 | self.Qout = tf.contrib.layers.fully_connected(self.rnn, num_actions, activation_fn=None) 96 | 97 | self.predict = tf.argmax(self.Qout,1) 98 | 99 | #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. 
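# Worked example of the quantities computed below (illustrative numbers only):
# with gamma = 0.99, an observed reward r = 1.0, and a non-terminal next state for
# which the target network gives max_a' Q_target(s', a') = 2.0, the training target
# fed into self.targetQ (built in DQNAgent.update_policy further down) is
#   y = 1.0 + 0.99 * 2.0 = 2.98.
# If the online network currently predicts Q(s, a) = 2.5 for the action actually
# taken (selected via the one-hot action mask), that sample contributes
# (2.98 - 2.5)^2 ~= 0.230 to the mean squared TD error minimized below.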
100 | self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32) 101 | self.actions = tf.placeholder(shape=[None],dtype=tf.int32) 102 | self.actions_onehot = tf.one_hot(self.actions, num_actions, dtype=tf.float32) 103 | 104 | self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1) 105 | self.td_error = tf.square(self.targetQ - self.Q) 106 | self.loss = tf.reduce_mean(self.td_error) 107 | 108 | self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001) 109 | self.updateModel = self.trainer.minimize(self.loss) 110 | 111 | def save_scalar(step, name, value, writer): 112 | """Save a scalar value to tensorboard. 113 | Parameters 114 | ---------- 115 | step: int 116 | Training step (sets the position on x-axis of tensorboard graph. 117 | name: str 118 | Name of variable. Will be the name of the graph in tensorboard. 119 | value: float 120 | The value of the variable at this step. 121 | writer: tf.FileWriter 122 | The tensorboard FileWriter instance. 123 | """ 124 | summary = tf.Summary() 125 | summary_value = summary.value.add() 126 | summary_value.simple_value = float(value) 127 | summary_value.tag = name 128 | writer.add_summary(summary, step) 129 | 130 | class DQNAgent: 131 | """Class implementing DQN. 132 | 133 | This is a basic outline of the functions/parameters you will need 134 | in order to implement the DQNAgnet. This is just to get you 135 | started. You may need to tweak the parameters, add new ones, etc. 136 | 137 | Feel free to change the functions and funciton parameters that the class 138 | provides. 139 | 140 | We have provided docstrings to go along with our suggested API. 141 | 142 | Parameters 143 | ---------- 144 | q_network: keras.models.Model 145 | Your Q-network model. 146 | preprocessor: deeprl_hw2.core.Preprocessor 147 | The preprocessor class. See the associated classes for more 148 | details. 149 | memory: deeprl_hw2.core.Memory 150 | Your replay memory. 151 | gamma: float 152 | Discount factor. 153 | target_update_freq: float 154 | Frequency to update the target network. You can either provide a 155 | number representing a soft target update (see utils.py) or a 156 | hard target update (see utils.py and Atari paper.) 157 | num_burn_in: int 158 | Before you begin updating the Q-network your replay memory has 159 | to be filled up with some number of samples. This number says 160 | how many. 161 | train_freq: int 162 | How often you actually update your Q-Network. Sometimes 163 | stability is improved if you collect a couple samples for your 164 | replay memory, for every Q-network update that you run. 165 | batch_size: int 166 | How many samples in each minibatch. 
167 | """ 168 | def __init__(self, args, num_actions): 169 | self.num_actions = num_actions 170 | input_shape = (args.frame_height, args.frame_width, args.num_frames) 171 | self.history_processor = HistoryPreprocessor(args.num_frames - 1) 172 | self.atari_processor = AtariPreprocessor() 173 | self.memory = ReplayMemory(args) 174 | self.policy = LinearDecayGreedyEpsilonPolicy(args.initial_epsilon, args.final_epsilon, args.exploration_steps) 175 | self.gamma = args.gamma 176 | self.target_update_freq = args.target_update_freq 177 | self.num_burn_in = args.num_burn_in 178 | self.train_freq = args.train_freq 179 | self.batch_size = args.batch_size 180 | self.learning_rate = args.learning_rate 181 | self.frame_width = args.frame_width 182 | self.frame_height = args.frame_height 183 | self.num_frames = args.num_frames 184 | self.output_path = args.output 185 | self.output_path_videos = args.output + '/videos/' 186 | self.output_path_images = args.output + '/images/' 187 | self.save_freq = args.save_freq 188 | self.load_network = args.load_network 189 | self.load_network_path = args.load_network_path 190 | self.enable_ddqn = args.ddqn 191 | self.net_mode = args.net_mode 192 | self.args = args 193 | 194 | self.h_size = 512 195 | self.tau = 0.001 196 | # self.q_network = create_model(input_shape, num_actions, self.net_mode, args, "QNet") 197 | # self.target_network = create_model(input_shape, num_actions, self.net_mode, args, "TargetNet") 198 | tf.reset_default_graph() 199 | #We define the cells for the primary and target q-networks 200 | cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 201 | cellT = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 202 | if args.bidir: 203 | cell_2 = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 204 | cellT_2 = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 205 | self.q_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cell, rnn_cell_2=cell_2, myScope="QNet") 206 | self.target_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cellT, rnn_cell_2=cellT_2, myScope="TargetNet") 207 | else: 208 | self.q_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cell, myScope="QNet") 209 | self.target_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cellT, myScope="TargetNet") 210 | 211 | print(">>>> Net mode: %s, Using double dqn: %s" % (self.net_mode, self.enable_ddqn)) 212 | self.eval_freq = args.eval_freq 213 | self.no_experience = args.no_experience 214 | self.no_target = args.no_target 215 | print(">>>> Target fixing: %s, Experience replay: %s" % (not self.no_target, not self.no_experience)) 216 | 217 | # initialize target network 218 | init = tf.global_variables_initializer() 219 | self.saver = tf.train.Saver(max_to_keep=2) 220 | trainables = tf.trainable_variables() 221 | print(trainables, len(trainables)) 222 | self.targetOps = updateTargetGraph(trainables, self.tau) 223 | 224 | config = tf.ConfigProto() 225 | config.gpu_options.allow_growth = True 226 | config.allow_soft_placement = True 227 | self.sess = tf.Session(config=config) 228 | self.sess.run(init) 229 | updateTarget(self.targetOps, self.sess) 230 | self.writer = tf.summary.FileWriter(self.output_path) 231 | 232 | def calc_q_values(self, state): 233 | 
"""Given a state (or batch of states) calculate the Q-values. 234 | 235 | Basically run your network on these states. 236 | 237 | Return 238 | ------ 239 | Q-values for the state(s) 240 | """ 241 | state = state[None, :, :, :] 242 | # return self.q_network.predict_on_batch(state) 243 | # print state.shape 244 | # Qout = self.sess.run(self.q_network.rnn_outputs,\ 245 | # feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 246 | # print Qout.shape 247 | Qout = self.sess.run(self.q_network.Qout,\ 248 | feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 249 | # print Qout.shape 250 | return Qout 251 | 252 | def select_action(self, state, is_training = True, **kwargs): 253 | """Select the action based on the current state. 254 | 255 | You will probably want to vary your behavior here based on 256 | which stage of training your in. For example, if you're still 257 | collecting random samples you might want to use a 258 | UniformRandomPolicy. 259 | 260 | If you're testing, you might want to use a GreedyEpsilonPolicy 261 | with a low epsilon. 262 | 263 | If you're training, you might want to use the 264 | LinearDecayGreedyEpsilonPolicy. 265 | 266 | This would also be a good place to call 267 | process_state_for_network in your preprocessor. 268 | 269 | Returns 270 | -------- 271 | selected action 272 | """ 273 | q_values = self.calc_q_values(state) 274 | if is_training: 275 | if kwargs['policy_type'] == 'UniformRandomPolicy': 276 | return UniformRandomPolicy(self.num_actions).select_action() 277 | else: 278 | # linear decay greedy epsilon policy 279 | return self.policy.select_action(q_values, is_training) 280 | else: 281 | # return GreedyEpsilonPolicy(0.05).select_action(q_values) 282 | return GreedyPolicy().select_action(q_values) 283 | 284 | def update_policy(self, current_sample): 285 | """Update your policy. 286 | 287 | Behavior may differ based on what stage of training your 288 | in. If you're in training mode then you should check if you 289 | should update your network parameters based on the current 290 | step and the value you set for train_freq. 291 | 292 | Inside, you'll want to sample a minibatch, calculate the 293 | target values, update your network, and then update your 294 | target values. 295 | 296 | You might want to return the loss and other metrics as an 297 | output. They can help you monitor how training is going. 
298 | """ 299 | batch_size = self.batch_size 300 | 301 | if self.no_experience: 302 | states = np.stack([current_sample.state]) 303 | next_states = np.stack([current_sample.next_state]) 304 | rewards = np.asarray([current_sample.reward]) 305 | mask = np.asarray([1 - int(current_sample.is_terminal)]) 306 | 307 | action_mask = np.zeros((1, self.num_actions)) 308 | action_mask[0, current_sample.action] = 1.0 309 | else: 310 | samples = self.memory.sample(batch_size) 311 | samples = self.atari_processor.process_batch(samples) 312 | 313 | states = np.stack([x.state for x in samples]) 314 | actions = np.asarray([x.action for x in samples]) 315 | # action_mask = np.zeros((batch_size, self.num_actions)) 316 | # action_mask[range(batch_size), actions] = 1.0 317 | 318 | next_states = np.stack([x.next_state for x in samples]) 319 | mask = np.asarray([1 - int(x.is_terminal) for x in samples]) 320 | rewards = np.asarray([x.reward for x in samples]) 321 | 322 | if self.no_target: 323 | next_qa_value = self.q_network.predict_on_batch(next_states) 324 | else: 325 | # next_qa_value = self.target_network.predict_on_batch(next_states) 326 | next_qa_value = self.sess.run(self.target_network.Qout,\ 327 | feed_dict={self.target_network.imageIn: next_states, self.target_network.batch_size:batch_size}) 328 | 329 | if self.enable_ddqn: 330 | # qa_value = self.q_network.predict_on_batch(next_states) 331 | qa_value = self.sess.run(self.q_network.Qout,\ 332 | feed_dict={self.q_network.imageIn: next_states, self.q_network.batch_size:batch_size}) 333 | max_actions = np.argmax(qa_value, axis = 1) 334 | next_qa_value = next_qa_value[range(batch_size), max_actions] 335 | else: 336 | next_qa_value = np.max(next_qa_value, axis = 1) 337 | # print rewards.shape, mask.shape, next_qa_value.shape, batch_size 338 | target = rewards + self.gamma * mask * next_qa_value 339 | 340 | if self.args.a_t and np.random.random()<1e-3: 341 | loss, _, rnn, attention_v, attention_a = self.sess.run([self.q_network.loss, self.q_network.updateModel, self.q_network.rnn, self.q_network.attention_v, self.q_network.attention_a], \ 342 | feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 343 | self.q_network.actions: actions, self.q_network.targetQ: target}) 344 | # print(attention_a[0]) 345 | else: 346 | loss, _, rnn = self.sess.run([self.q_network.loss, self.q_network.updateModel, self.q_network.rnn], \ 347 | feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 348 | self.q_network.actions: actions, self.q_network.targetQ: target}) 349 | 350 | return loss, np.mean(target) 351 | 352 | def fit(self, env, num_iterations, max_episode_length=None): 353 | """Fit your model to the provided environment. 354 | 355 | Its a good idea to print out things like loss, average reward, 356 | Q-values, etc to see if your agent is actually improving. 357 | 358 | You should probably also periodically save your network 359 | weights and any other useful info. 360 | 361 | This is where you should sample actions from your network, 362 | collect experience samples and add them to your replay memory, 363 | and update your network parameters. 364 | 365 | Parameters 366 | ---------- 367 | env: gym.Env 368 | This is your Atari environment. You should wrap the 369 | environment using the wrap_atari_env function in the 370 | utils.py 371 | num_iterations: int 372 | How many samples/updates to perform. 373 | max_episode_length: int 374 | How long a single episode should last before the agent 375 | resets. 
Can help exploration. 376 | """ 377 | is_training = True 378 | print("Training starts.") 379 | self.save_model(0) 380 | eval_count = 0 381 | 382 | state = env.reset() 383 | burn_in = True 384 | idx_episode = 1 385 | episode_loss = .0 386 | episode_frames = 0 387 | episode_reward = .0 388 | episode_raw_reward = .0 389 | episode_target_value = .0 390 | for t in range(self.num_burn_in + num_iterations): 391 | action_state = self.history_processor.process_state_for_network( 392 | self.atari_processor.process_state_for_network(state)) 393 | policy_type = "UniformRandomPolicy" if burn_in else "LinearDecayGreedyEpsilonPolicy" 394 | action = self.select_action(action_state, is_training, policy_type = policy_type) 395 | processed_state = self.atari_processor.process_state_for_memory(state) 396 | 397 | state, reward, done, info = env.step(action) 398 | 399 | processed_next_state = self.atari_processor.process_state_for_network(state) 400 | action_next_state = np.dstack((action_state, processed_next_state)) 401 | action_next_state = action_next_state[:, :, 1:] 402 | 403 | processed_reward = self.atari_processor.process_reward(reward) 404 | 405 | self.memory.append(processed_state, action, processed_reward, done) 406 | current_sample = Sample(action_state, action, processed_reward, action_next_state, done) 407 | 408 | if not burn_in: 409 | episode_frames += 1 410 | episode_reward += processed_reward 411 | episode_raw_reward += reward 412 | if episode_frames > max_episode_length: 413 | done = True 414 | 415 | if done: 416 | # adding last frame only to save last state 417 | last_frame = self.atari_processor.process_state_for_memory(state) 418 | # action, reward, done doesn't matter here 419 | self.memory.append(last_frame, action, 0, done) 420 | if not burn_in: 421 | avg_target_value = episode_target_value / episode_frames 422 | print(">>> Training: time %d, episode %d, length %d, reward %.0f, raw_reward %.0f, loss %.4f, target value %.4f, policy step %d, memory cap %d" % 423 | (t, idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, 424 | avg_target_value, self.policy.step, self.memory.current)) 425 | sys.stdout.flush() 426 | save_scalar(idx_episode, 'train/episode_frames', episode_frames, self.writer) 427 | save_scalar(idx_episode, 'train/episode_reward', episode_reward, self.writer) 428 | save_scalar(idx_episode, 'train/episode_raw_reward', episode_raw_reward, self.writer) 429 | save_scalar(idx_episode, 'train/episode_loss', episode_loss, self.writer) 430 | save_scalar(idx_episode, 'train_avg/avg_reward', episode_reward / episode_frames, self.writer) 431 | save_scalar(idx_episode, 'train_avg/avg_target_value', avg_target_value, self.writer) 432 | save_scalar(idx_episode, 'train_avg/avg_loss', episode_loss / episode_frames, self.writer) 433 | episode_frames = 0 434 | episode_reward = .0 435 | episode_raw_reward = .0 436 | episode_loss = .0 437 | episode_target_value = .0 438 | idx_episode += 1 439 | burn_in = (t < self.num_burn_in) 440 | state = env.reset() 441 | self.atari_processor.reset() 442 | self.history_processor.reset() 443 | 444 | if not burn_in: 445 | if t % self.train_freq == 0: 446 | loss, target_value = self.update_policy(current_sample) 447 | episode_loss += loss 448 | episode_target_value += target_value 449 | # update freq is based on train_freq 450 | if t % (self.train_freq * self.target_update_freq) == 0: 451 | # self.target_network.set_weights(self.q_network.get_weights()) 452 | updateTarget(self.targetOps, self.sess) 453 | print("----- Synced.") 454 | if t 
% self.save_freq == 0: 455 | self.save_model(idx_episode) 456 | if t % (self.eval_freq * self.train_freq) == 0: 457 | episode_reward_mean, episode_reward_std, eval_count = self.evaluate(env, 20, eval_count, max_episode_length, True) 458 | save_scalar(t, 'eval/eval_episode_reward_mean', episode_reward_mean, self.writer) 459 | save_scalar(t, 'eval/eval_episode_reward_std', episode_reward_std, self.writer) 460 | 461 | self.save_model(idx_episode) 462 | 463 | 464 | def save_model(self, idx_episode): 465 | safe_path = self.output_path + "/qnet" + str(idx_episode) + ".cptk" 466 | self.saver.save(self.sess, safe_path) 467 | # self.q_network.save_weights(safe_path) 468 | print("+++++++++ Network at", idx_episode, "saved to:", safe_path) 469 | 470 | def restore_model(self, restore_path): 471 | self.saver.restore(self.sess, restore_path) 472 | print("+++++++++ Network restored from: %s", restore_path) 473 | 474 | def evaluate(self, env, num_episodes, eval_count, max_episode_length=None, monitor=True): 475 | """Test your agent with a provided environment. 476 | 477 | You shouldn't update your network parameters here. Also if you 478 | have any layers that vary in behavior between train/test time 479 | (such as dropout or batch norm), you should set them to test. 480 | 481 | Basically run your policy on the environment and collect stats 482 | like cumulative reward, average episode length, etc. 483 | 484 | You can also call the render function here if you want to 485 | visually inspect your policy. 486 | """ 487 | print("Evaluation starts.") 488 | plt.figure(1, figsize=(45, 20)) 489 | 490 | is_training = False 491 | if self.load_network: 492 | # self.q_network.load_weights(self.load_network_path) 493 | # print("Load network from:", self.load_network_path) 494 | self.restore_model(self.load_network_path) 495 | if monitor: 496 | env = wrappers.Monitor(env, self.output_path_videos, video_callable=lambda x:True, resume=True) 497 | state = env.reset() 498 | 499 | idx_episode = 1 500 | episode_frames = 0 501 | episode_reward = np.zeros(num_episodes) 502 | t = 0 503 | 504 | while idx_episode <= num_episodes: 505 | t += 1 506 | action_state = self.history_processor.process_state_for_network( 507 | self.atari_processor.process_state_for_network(state)) 508 | action = self.select_action(action_state, is_training, policy_type = 'GreedyEpsilonPolicy') 509 | 510 | action_state_ori = self.history_processor.process_state_for_network_ori( 511 | self.atari_processor.process_state_for_network_ori(state)) 512 | 513 | dice = np.random.random() 514 | 515 | state, reward, done, info = env.step(action) 516 | 517 | if dice < 0.1: 518 | attention_a = self.sess.run(self.q_network.attention_a,\ 519 | feed_dict={self.q_network.imageIn: action_state[None, :, :, :], self.q_network.batch_size:1}) 520 | # print attention_a.shape #(1, 10, 1) 521 | attention_a = np.reshape(attention_a, (-1)) 522 | for alpha_idx in range(action_state_ori.shape[3]): 523 | plt.subplot(2, action_state_ori.shape[3]//2+1, alpha_idx+1) 524 | img = action_state_ori[:, :, :, alpha_idx] #(210, 160, 3) 525 | plt.imshow(img) 526 | # plt.text(0, 1, 'Weight: %.4f'%(att ention_a[alpha_idx]) , color='black', weight='bold', backgroundcolor='white', fontsize=30) 527 | plt.subplot(2, action_state_ori.shape[3]//2+1, action_state_ori.shape[3]+2) 528 | plt.imshow(state) 529 | # plt.text(0, 1, 'Next state after taking the action %s'%(action), color='black', weight='bold', backgroundcolor='white', fontsize=20) 530 | plt.axis('off') 531 | 
plt.savefig('%sattention_ep%d-frame%d.png'%(self.output_path_images, eval_count, episode_frames))
532 | print('---- Image saved at: %sattention_ep%d-frame%d.png'%(self.output_path_images, eval_count, episode_frames))
533 |
534 | episode_frames += 1
535 | episode_reward[idx_episode-1] += reward
536 | if episode_frames > max_episode_length:
537 | done = True
538 | if done:
539 | print("Eval: time %d, episode %d, length %d, reward %.0f. @eval_count %s" %
540 | (t, idx_episode, episode_frames, episode_reward[idx_episode-1], eval_count))
541 | eval_count += 1
542 | save_scalar(eval_count, 'eval/eval_episode_raw_reward', episode_reward[idx_episode-1], self.writer)
543 | save_scalar(eval_count, 'eval/eval_episode_raw_length', episode_frames, self.writer)
544 | sys.stdout.flush()
545 | state = env.reset()
546 | episode_frames = 0
547 | idx_episode += 1
548 | self.atari_processor.reset()
549 | self.history_processor.reset()
550 |
551 |
552 | reward_mean = np.mean(episode_reward)
553 | reward_std = np.std(episode_reward)
554 | print("Evaluation summary: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" %
555 | (num_episodes, reward_mean, reward_std))
556 | sys.stdout.flush()
557 |
558 | return reward_mean, reward_std, eval_count
559 |
--------------------------------------------------------------------------------
/deeprl_prj/dqn_tf_spatialAt.py:
--------------------------------------------------------------------------------
1 | '''Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Spatial Attention DQN.'''
2 |
3 | from deeprl_prj.policy import *
4 | from deeprl_prj.objectives import *
5 | from deeprl_prj.preprocessors import *
6 | from deeprl_prj.utils import *
7 | from deeprl_prj.core import *
8 | from helper import *
9 |
10 | import numpy as np
11 | import sys
12 | from gym import wrappers
13 | import tensorflow as tf
14 | import skimage.transform
15 |
16 | """Main DQN agent."""
17 |
18 | class Qnetwork():
19 | def __init__(self, args, h_size, num_frames, num_actions, rnn_cell, myScope):
20 | #The network receives a frame from the game, flattened into an array.
21 | #It then resizes it and processes it through four convolutional layers.
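# Shape bookkeeping for the spatial-attention branch defined below: the same 'VALID'
# convolutions as in the temporal-attention network give a 7 x 7 x 64 map per frame
# (84 -> 20 -> 9 -> 7). When args.a_t is set, conv3 is kept as a grid of
# L = 7*7 = 49 spatial locations, each a D = 64 dimensional feature vector. At every
# one of the T = num_frames steps, _attention_layer scores the 49 locations against
# the LSTM hidden state h, softmaxes the scores into alpha (a distribution over the
# grid), and the weighted sum context = sum_l alpha_l * feature_l (optionally gated
# by the _selector's beta) is fed together with h into the LSTM cell; the final
# hidden state h becomes the "rnn" feature that the Q-value head consumes.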
22 | self.imageIn = tf.placeholder(shape=[None,84,84,num_frames],dtype=tf.float32) 23 | self.image_permute = tf.transpose(self.imageIn, perm=[0, 3, 1, 2]) 24 | self.image_reshape = tf.reshape(self.image_permute, [-1, 84, 84, 1]) 25 | self.image_reshape_recoverd = tf.squeeze(tf.gather(tf.reshape(self.image_reshape, [-1, num_frames, 84, 84, 1]), [0]), [0]) 26 | self.summary_merged = tf.summary.merge([tf.summary.image('image_reshape_recoverd', self.image_reshape_recoverd, max_outputs=num_frames)]) 27 | # self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,1]) 28 | self.conv1 = tf.contrib.layers.convolution2d( \ 29 | inputs=self.image_reshape,num_outputs=32,\ 30 | kernel_size=[8,8],stride=[4,4],padding='VALID', \ 31 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv1') 32 | self.conv2 = tf.contrib.layers.convolution2d( \ 33 | inputs=self.conv1,num_outputs=64,\ 34 | kernel_size=[4,4],stride=[2,2],padding='VALID', \ 35 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv2') 36 | self.conv3 = tf.contrib.layers.convolution2d( \ 37 | inputs=self.conv2,num_outputs=64,\ 38 | kernel_size=[3,3],stride=[1,1],padding='VALID', \ 39 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv3') # (None, 10, 7, 7, 64) 40 | self.batch_size = tf.placeholder(dtype=tf.int32) 41 | 42 | if not(args.a_t): 43 | self.conv4 = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.conv3), h_size, activation_fn=tf.nn.relu) 44 | 45 | #We take the output from the final convolutional layer and send it to a recurrent layer. 46 | #The input must be reshaped into [batch x trace x units] for rnn processing, 47 | #and then returned to [batch x units] when sent through the upper levles. 48 | self.convFlat = tf.reshape(self.conv4,[self.batch_size, num_frames, h_size]) 49 | self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32) 50 | self.rnn_outputs, self.rnn_state = tf.nn.dynamic_rnn(\ 51 | inputs=self.convFlat,cell=rnn_cell,dtype=tf.float32,initial_state=self.state_in,scope=myScope+'_rnn') 52 | # print("======", self.rnn_outputs.get_shape().as_list()) 53 | 54 | self.rnn_last_output = tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]) 55 | self.rnn = tf.squeeze(self.rnn_last_output, [1]) 56 | # print("==========", self.rnn.get_shape().as_list()) 57 | else: 58 | self.L = 7*7 59 | self.D = 64 60 | self.T = num_frames 61 | self.H = 512 62 | self.selector=args.selector 63 | self.weight_initializer = tf.contrib.layers.xavier_initializer() 64 | self.const_initializer = tf.constant_initializer(0.0) 65 | 66 | self.features = tf.reshape(self.conv3, [self.batch_size, num_frames, self.L, self.D]) 67 | self.features_list = tf.split(self.features, num_frames, axis=1) 68 | # print(len(self.features_list), self.features_list[0].get_shape().as_list()) # 10 [None, 1, 49, 64] 69 | self.alpha_list = [] 70 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.H) 71 | c, h = self._get_initial_lstm(features=tf.squeeze(self.features_list[0], [1]), myScope=myScope) 72 | 73 | for t in range(self.T): 74 | features = tf.squeeze(self.features_list[t], [1]) 75 | features = self._batch_norm(features, mode='train', name=myScope+'conv_features', reuse=(t!=0)) 76 | features_proj = self._project_features(features=features, myScope=myScope, reuse=(t!=0)) 77 | context, alpha = self._attention_layer(features, features_proj, h, myScope=myScope, reuse=(t!=0)) 78 | self.alpha_list.append(alpha) 79 | 80 | if self.selector: 81 | context, beta = self._selector(context, h, 
myScope=myScope, reuse=(t!=0)) 82 | 83 | # print("========== context ", context.get_shape().as_list()) 84 | # print("========== h ", h.get_shape().as_list()) 85 | 86 | with tf.variable_scope(myScope+'_lstmCell', reuse=(t!=0)): 87 | _, (c, h) = lstm_cell(inputs=tf.concat([context, h], 1), state=[c, h]) 88 | # print("========== h ", h.get_shape().as_list()) 89 | 90 | self.rnn = h 91 | 92 | 93 | if args.net_mode == "duel": 94 | #The output from the recurrent player is then split into separate Value and Advantage streams 95 | self.ad_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_advantage_hidden') 96 | self.Advantage = tf.contrib.layers.fully_connected(self.ad_hidden, num_actions, activation_fn=None, scope=myScope+'_fc_advantage') 97 | self.value_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_value_hidden') 98 | self.Value = tf.contrib.layers.fully_connected(self.value_hidden, 1, activation_fn=None, scope=myScope+'_fc_value') 99 | 100 | #Then combine them together to get our final Q-values. 101 | self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True)) 102 | else: 103 | self.Qout = tf.contrib.layers.fully_connected(self.rnn, num_actions, activation_fn=None) 104 | self.predict = tf.argmax(self.Qout,1) 105 | 106 | #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. 107 | self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32) 108 | self.actions = tf.placeholder(shape=[None],dtype=tf.int32) 109 | self.actions_onehot = tf.one_hot(self.actions, num_actions, dtype=tf.float32) 110 | 111 | self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1) 112 | self.td_error = tf.square(self.targetQ - self.Q) 113 | self.loss = tf.reduce_mean(self.td_error) 114 | 115 | self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001) 116 | self.updateModel = self.trainer.minimize(self.loss) 117 | 118 | def _batch_norm(self, x, mode='train', name=None, reuse=False): 119 | return tf.contrib.layers.batch_norm(inputs=x, 120 | decay=0.95, 121 | center=True, 122 | scale=True, 123 | is_training=(mode=='train'), 124 | updates_collections=None, 125 | reuse=reuse, 126 | scope=(name+'batch_norm')) 127 | 128 | def _get_initial_lstm(self, features, myScope): 129 | with tf.variable_scope(myScope+'_initial_lstm'): 130 | features_mean = tf.reduce_mean(features, 1) 131 | 132 | w_h = tf.get_variable('w_h', [self.D, self.H], initializer=self.weight_initializer) 133 | b_h = tf.get_variable('b_h', [self.H], initializer=self.const_initializer) 134 | h = tf.nn.tanh(tf.matmul(features_mean, w_h) + b_h) 135 | 136 | w_c = tf.get_variable('w_c', [self.D, self.H], initializer=self.weight_initializer) 137 | b_c = tf.get_variable('b_c', [self.H], initializer=self.const_initializer) 138 | c = tf.nn.tanh(tf.matmul(features_mean, w_c) + b_c) 139 | return c, h 140 | 141 | def _project_features(self, features, myScope, reuse=False): 142 | with tf.variable_scope(myScope+'_project_features', reuse=reuse): 143 | w = tf.get_variable('w', [self.D, self.D], initializer=self.weight_initializer) 144 | features_flat = tf.reshape(features, [-1, self.D]) 145 | features_proj = tf.matmul(features_flat, w) 146 | features_proj = tf.reshape(features_proj, [-1, self.L, self.D]) 147 | return features_proj 148 | 149 | def _attention_layer(self, features, features_proj, h, myScope, reuse=False): 150 | with 
tf.variable_scope(myScope+'_attention_layer', reuse=reuse): 151 | w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer) 152 | b = tf.get_variable('b', [self.D], initializer=self.const_initializer) 153 | w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer) 154 | 155 | h_att = tf.nn.relu(features_proj + tf.expand_dims(tf.matmul(h, w), 1) + b) # (N, L, D) 156 | out_att = tf.reshape(tf.matmul(tf.reshape(h_att, [-1, self.D]), w_att), [-1, self.L]) # (N, L) 157 | alpha = tf.nn.softmax(out_att) 158 | context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context') #(N, D) 159 | return context, alpha 160 | 161 | def _selector(self, context, h, myScope, reuse=False): 162 | with tf.variable_scope(myScope+'_selector', reuse=reuse): 163 | w = tf.get_variable('w', [self.H, 1], initializer=self.weight_initializer) 164 | b = tf.get_variable('b', [1], initializer=self.const_initializer) 165 | beta = tf.nn.sigmoid(tf.matmul(h, w) + b, 'beta') # (N, 1) 166 | context = tf.multiply(beta, context, name='selected_context') 167 | return context, beta 168 | 169 | def save_scalar(step, name, value, writer): 170 | """Save a scalar value to tensorboard. 171 | Parameters 172 | ---------- 173 | step: int 174 | Training step (sets the position on x-axis of tensorboard graph. 175 | name: str 176 | Name of variable. Will be the name of the graph in tensorboard. 177 | value: float 178 | The value of the variable at this step. 179 | writer: tf.FileWriter 180 | The tensorboard FileWriter instance. 181 | """ 182 | summary = tf.Summary() 183 | summary_value = summary.value.add() 184 | summary_value.simple_value = float(value) 185 | summary_value.tag = name 186 | writer.add_summary(summary, step) 187 | 188 | class DQNAgent: 189 | """Class implementing DQN. 190 | 191 | This is a basic outline of the functions/parameters you will need 192 | in order to implement the DQNAgnet. This is just to get you 193 | started. You may need to tweak the parameters, add new ones, etc. 194 | 195 | Feel free to change the functions and funciton parameters that the class 196 | provides. 197 | 198 | We have provided docstrings to go along with our suggested API. 199 | 200 | Parameters 201 | ---------- 202 | q_network: keras.models.Model 203 | Your Q-network model. 204 | preprocessor: deeprl_hw2.core.Preprocessor 205 | The preprocessor class. See the associated classes for more 206 | details. 207 | memory: deeprl_hw2.core.Memory 208 | Your replay memory. 209 | gamma: float 210 | Discount factor. 211 | target_update_freq: float 212 | Frequency to update the target network. You can either provide a 213 | number representing a soft target update (see utils.py) or a 214 | hard target update (see utils.py and Atari paper.) 215 | num_burn_in: int 216 | Before you begin updating the Q-network your replay memory has 217 | to be filled up with some number of samples. This number says 218 | how many. 219 | train_freq: int 220 | How often you actually update your Q-Network. Sometimes 221 | stability is improved if you collect a couple samples for your 222 | replay memory, for every Q-network update that you run. 223 | batch_size: int 224 | How many samples in each minibatch. 
225 | """ 226 | def __init__(self, args, num_actions): 227 | self.num_actions = num_actions 228 | input_shape = (args.frame_height, args.frame_width, args.num_frames) 229 | self.history_processor = HistoryPreprocessor(args.num_frames - 1) 230 | self.atari_processor = AtariPreprocessor() 231 | self.memory = ReplayMemory(args) 232 | self.policy = LinearDecayGreedyEpsilonPolicy(args.initial_epsilon, args.final_epsilon, args.exploration_steps) 233 | self.gamma = args.gamma 234 | self.target_update_freq = args.target_update_freq 235 | self.num_burn_in = args.num_burn_in 236 | self.train_freq = args.train_freq 237 | self.batch_size = args.batch_size 238 | self.learning_rate = args.learning_rate 239 | self.frame_width = args.frame_width 240 | self.frame_height = args.frame_height 241 | self.num_frames = args.num_frames 242 | self.output_path = args.output 243 | self.output_path_videos = args.output + '/videos/' 244 | self.output_path_images = args.output + '/images/' 245 | self.save_freq = args.save_freq 246 | self.load_network = args.load_network 247 | self.load_network_path = args.load_network_path 248 | self.enable_ddqn = args.ddqn 249 | self.net_mode = args.net_mode 250 | 251 | self.h_size = 512 252 | self.tau = 0.001 253 | tf.reset_default_graph() 254 | #We define the cells for the primary and target q-networks 255 | cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 256 | cellT = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 257 | self.q_network = Qnetwork(args=args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell=cell, myScope="QNet") 258 | self.target_network = Qnetwork(args=args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell=cellT, myScope="TargetNet") 259 | 260 | print(">>>> Net mode: %s, Using double dqn: %s" % (self.net_mode, self.enable_ddqn)) 261 | self.eval_freq = args.eval_freq 262 | self.no_experience = args.no_experience 263 | self.no_target = args.no_target 264 | print(">>>> Target fixing: %s, Experience replay: %s" % (not self.no_target, not self.no_experience)) 265 | 266 | # initialize target network 267 | init = tf.global_variables_initializer() 268 | self.saver = tf.train.Saver(max_to_keep=2) 269 | trainables = tf.trainable_variables() 270 | print(trainables, len(trainables)) 271 | self.targetOps = updateTargetGraph(trainables, self.tau) 272 | 273 | config = tf.ConfigProto() 274 | config.gpu_options.allow_growth = True 275 | config.allow_soft_placement = True 276 | self.sess = tf.Session(config=config) 277 | self.sess.run(init) 278 | updateTarget(self.targetOps, self.sess) 279 | self.writer = tf.summary.FileWriter(self.output_path) 280 | 281 | def calc_q_values(self, state): 282 | """Given a state (or batch of states) calculate the Q-values. 283 | 284 | Basically run your network on these states. 285 | 286 | Return 287 | ------ 288 | Q-values for the state(s) 289 | """ 290 | state = state[None, :, :, :] 291 | # return self.q_network.predict_on_batch(state) 292 | # print state.shape 293 | # Qout = self.sess.run(self.q_network.rnn_outputs,\ 294 | # feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 295 | # print Qout.shape 296 | Qout = self.sess.run(self.q_network.Qout,\ 297 | feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 298 | # print Qout.shape 299 | return Qout 300 | 301 | def select_action(self, state, is_training = True, **kwargs): 302 | """Select the action based on the current state. 
303 | 304 | You will probably want to vary your behavior here based on 305 | which stage of training your in. For example, if you're still 306 | collecting random samples you might want to use a 307 | UniformRandomPolicy. 308 | 309 | If you're testing, you might want to use a GreedyEpsilonPolicy 310 | with a low epsilon. 311 | 312 | If you're training, you might want to use the 313 | LinearDecayGreedyEpsilonPolicy. 314 | 315 | This would also be a good place to call 316 | process_state_for_network in your preprocessor. 317 | 318 | Returns 319 | -------- 320 | selected action 321 | """ 322 | q_values = self.calc_q_values(state) 323 | if is_training: 324 | if kwargs['policy_type'] == 'UniformRandomPolicy': 325 | return UniformRandomPolicy(self.num_actions).select_action() 326 | else: 327 | # linear decay greedy epsilon policy 328 | return self.policy.select_action(q_values, is_training) 329 | else: 330 | # return GreedyEpsilonPolicy(0.05).select_action(q_values) 331 | return GreedyPolicy().select_action(q_values) 332 | 333 | def update_policy(self, current_sample): 334 | """Update your policy. 335 | 336 | Behavior may differ based on what stage of training your 337 | in. If you're in training mode then you should check if you 338 | should update your network parameters based on the current 339 | step and the value you set for train_freq. 340 | 341 | Inside, you'll want to sample a minibatch, calculate the 342 | target values, update your network, and then update your 343 | target values. 344 | 345 | You might want to return the loss and other metrics as an 346 | output. They can help you monitor how training is going. 347 | """ 348 | batch_size = self.batch_size 349 | 350 | if self.no_experience: 351 | states = np.stack([current_sample.state]) 352 | next_states = np.stack([current_sample.next_state]) 353 | rewards = np.asarray([current_sample.reward]) 354 | mask = np.asarray([1 - int(current_sample.is_terminal)]) 355 | 356 | action_mask = np.zeros((1, self.num_actions)) 357 | action_mask[0, current_sample.action] = 1.0 358 | else: 359 | samples = self.memory.sample(batch_size) 360 | samples = self.atari_processor.process_batch(samples) 361 | 362 | states = np.stack([x.state for x in samples]) 363 | actions = np.asarray([x.action for x in samples]) 364 | # action_mask = np.zeros((batch_size, self.num_actions)) 365 | # action_mask[range(batch_size), actions] = 1.0 366 | 367 | next_states = np.stack([x.next_state for x in samples]) 368 | mask = np.asarray([1 - int(x.is_terminal) for x in samples]) 369 | rewards = np.asarray([x.reward for x in samples]) 370 | 371 | if self.no_target: 372 | next_qa_value = self.q_network.predict_on_batch(next_states) 373 | else: 374 | # next_qa_value = self.target_network.predict_on_batch(next_states) 375 | next_qa_value = self.sess.run(self.target_network.Qout,\ 376 | feed_dict={self.target_network.imageIn: next_states, self.target_network.batch_size:batch_size}) 377 | 378 | if self.enable_ddqn: 379 | # qa_value = self.q_network.predict_on_batch(next_states) 380 | qa_value = self.sess.run(self.q_network.Qout,\ 381 | feed_dict={self.q_network.imageIn: next_states, self.q_network.batch_size:batch_size}) 382 | max_actions = np.argmax(qa_value, axis = 1) 383 | next_qa_value = next_qa_value[range(batch_size), max_actions] 384 | else: 385 | next_qa_value = np.max(next_qa_value, axis = 1) 386 | # print rewards.shape, mask.shape, next_qa_value.shape, batch_size 387 | target = rewards + self.gamma * mask * next_qa_value 388 | 389 | loss, _, rnn = 
self.sess.run([self.q_network.loss, self.q_network.updateModel, self.q_network.rnn], \ 390 | feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 391 | self.q_network.actions: actions, self.q_network.targetQ: target}) 392 | # print rnn[:5] 393 | # if np.random.random() < 0.001: 394 | # merged = self.sess.run(self.q_network.summary_merged, \ 395 | # feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 396 | # self.q_network.actions: actions, self.q_network.targetQ: target}) 397 | # self.writer.add_summary(merged) 398 | # self.writer.flush() 399 | # print '----- writer flushed.' 400 | # return self.final_model.train_on_batch([states, action_mask], target), np.mean(target) 401 | return loss, np.mean(target) 402 | 403 | def fit(self, env, num_iterations, max_episode_length=None): 404 | """Fit your model to the provided environment. 405 | 406 | Its a good idea to print out things like loss, average reward, 407 | Q-values, etc to see if your agent is actually improving. 408 | 409 | You should probably also periodically save your network 410 | weights and any other useful info. 411 | 412 | This is where you should sample actions from your network, 413 | collect experience samples and add them to your replay memory, 414 | and update your network parameters. 415 | 416 | Parameters 417 | ---------- 418 | env: gym.Env 419 | This is your Atari environment. You should wrap the 420 | environment using the wrap_atari_env function in the 421 | utils.py 422 | num_iterations: int 423 | How many samples/updates to perform. 424 | max_episode_length: int 425 | How long a single episode should last before the agent 426 | resets. Can help exploration. 427 | """ 428 | is_training = True 429 | print("Training starts.") 430 | self.save_model(0) 431 | eval_count = 0 432 | 433 | state = env.reset() 434 | burn_in = True 435 | idx_episode = 1 436 | episode_loss = .0 437 | episode_frames = 0 438 | episode_reward = .0 439 | episode_raw_reward = .0 440 | episode_target_value = .0 441 | for t in range(self.num_burn_in + num_iterations): 442 | print ("iteration --> %s, episode --> %s" % (t, idx_episode)) 443 | action_state = self.history_processor.process_state_for_network( 444 | self.atari_processor.process_state_for_network(state)) 445 | policy_type = "UniformRandomPolicy" if burn_in else "LinearDecayGreedyEpsilonPolicy" 446 | action = self.select_action(action_state, is_training, policy_type = policy_type) 447 | processed_state = self.atari_processor.process_state_for_memory(state) 448 | 449 | state, reward, done, info = env.step(action) 450 | 451 | processed_next_state = self.atari_processor.process_state_for_network(state) 452 | action_next_state = np.dstack((action_state, processed_next_state)) 453 | action_next_state = action_next_state[:, :, 1:] 454 | 455 | processed_reward = self.atari_processor.process_reward(reward) 456 | 457 | self.memory.append(processed_state, action, processed_reward, done) 458 | current_sample = Sample(action_state, action, processed_reward, action_next_state, done) 459 | 460 | if not burn_in: 461 | episode_frames += 1 462 | episode_reward += processed_reward 463 | episode_raw_reward += reward 464 | if episode_frames > max_episode_length: 465 | done = True 466 | 467 | if done: 468 | # adding last frame only to save last state 469 | last_frame = self.atari_processor.process_state_for_memory(state) 470 | # action, reward, done doesn't matter here 471 | self.memory.append(last_frame, action, 0, done) 472 | if not burn_in: 473 | 
avg_target_value = episode_target_value / episode_frames 474 | print(">>> Training: time %d, episode %d, length %d, reward %.0f, raw_reward %.0f, loss %.4f, target value %.4f, policy step %d, memory cap %d" % 475 | (t, idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, 476 | avg_target_value, self.policy.step, self.memory.current)) 477 | sys.stdout.flush() 478 | save_scalar(idx_episode, 'train/episode_frames', episode_frames, self.writer) 479 | save_scalar(idx_episode, 'train/episode_reward', episode_reward, self.writer) 480 | save_scalar(idx_episode, 'train/episode_raw_reward', episode_raw_reward, self.writer) 481 | save_scalar(idx_episode, 'train/episode_loss', episode_loss, self.writer) 482 | save_scalar(idx_episode, 'train_avg/avg_reward', episode_reward / episode_frames, self.writer) 483 | save_scalar(idx_episode, 'train_avg/avg_target_value', avg_target_value, self.writer) 484 | save_scalar(idx_episode, 'train_avg/avg_loss', episode_loss / episode_frames, self.writer) 485 | episode_frames = 0 486 | episode_reward = .0 487 | episode_raw_reward = .0 488 | episode_loss = .0 489 | episode_target_value = .0 490 | idx_episode += 1 491 | burn_in = (t < self.num_burn_in) 492 | state = env.reset() 493 | self.atari_processor.reset() 494 | self.history_processor.reset() 495 | 496 | if not burn_in: 497 | if t % self.train_freq == 0: 498 | loss, target_value = self.update_policy(current_sample) 499 | episode_loss += loss 500 | episode_target_value += target_value 501 | # update freq is based on train_freq 502 | if t % (self.train_freq * self.target_update_freq) == 0: 503 | # self.target_network.set_weights(self.q_network.get_weights()) 504 | updateTarget(self.targetOps, self.sess) 505 | print("----- Synced.") 506 | if t % self.save_freq == 0: 507 | self.save_model(idx_episode) 508 | # if t % (self.eval_freq * self.train_freq) == 0: 509 | # episode_reward_mean, episode_reward_std, eval_count = self.evaluate(env, 20, eval_count, max_episode_length, True) 510 | # save_scalar(t, 'eval/eval_episode_reward_mean', episode_reward_mean, self.writer) 511 | # save_scalar(t, 'eval/eval_episode_reward_std', episode_reward_std, self.writer) 512 | 513 | self.save_model(idx_episode) 514 | 515 | 516 | def save_model(self, idx_episode): 517 | safe_path = self.output_path + "/qnet" + str(idx_episode) + ".cptk" 518 | self.saver.save(self.sess, safe_path) 519 | # self.q_network.save_weights(safe_path) 520 | print("Network at", idx_episode, "saved to:", safe_path) 521 | 522 | def evaluate(self, env, num_episodes, eval_count, max_episode_length=None, monitor=True): 523 | """Test your agent with a provided environment. 524 | 525 | You shouldn't update your network parameters here. Also if you 526 | have any layers that vary in behavior between train/test time 527 | (such as dropout or batch norm), you should set them to test. 528 | 529 | Basically run your policy on the environment and collect stats 530 | like cumulative reward, average episode length, etc. 531 | 532 | You can also call the render function here if you want to 533 | visually inspect your policy. 
534 | """ 535 | print("Evaluation starts.") 536 | plt.figure(1, figsize=(40, 20)) 537 | 538 | is_training = False 539 | if self.load_network: 540 | self.q_network.load_weights(self.load_network_path) 541 | print("Load network from:", self.load_network_path) 542 | if monitor: 543 | env = wrappers.Monitor(env, self.output_path_videos, video_callable=lambda x:True, resume=True) 544 | state = env.reset() 545 | 546 | idx_episode = 1 547 | episode_frames = 0 548 | episode_reward = np.zeros(num_episodes) 549 | t = 0 550 | 551 | while idx_episode <= num_episodes: 552 | t += 1 553 | action_state = self.history_processor.process_state_for_network( 554 | self.atari_processor.process_state_for_network(state)) 555 | action = self.select_action(action_state, is_training, policy_type = 'GreedyEpsilonPolicy') 556 | 557 | action_state_ori = self.history_processor.process_state_for_network_ori( 558 | self.atari_processor.process_state_for_network_ori(state)) 559 | # print "state.shape", state.shape 560 | # print "action_state_ori.shape", action_state_ori.shape 561 | 562 | if np.random.random() < 1e-3: 563 | alpha_list = self.sess.run(self.q_network.alpha_list,\ 564 | feed_dict={self.q_network.imageIn: action_state[None, :, :, :], self.q_network.batch_size:1}) 565 | # print alpha_list, len(alpha_list), alpha_list[0].shape #10 (1, 49) 566 | for alpha_idx in range(len(alpha_list)): 567 | plt.subplot(2, len(alpha_list)//2, alpha_idx+1) 568 | img = action_state_ori[:, :, :, alpha_idx] #(210, 160, 3) 569 | plt.imshow(img) 570 | alp_curr = alpha_list[alpha_idx].reshape(7, 7) 571 | alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=22, sigma=20) 572 | plt.imshow(scipy.misc.imresize(alp_img, (img.shape[0], img.shape[1])), alpha=0.7, cmap='gray') 573 | plt.axis('off') 574 | # plt.show() 575 | # plt.canvas.draw() 576 | plt.savefig('%sattention_ep%d-frame%d.png'%(self.output_path_images, eval_count, episode_frames)) 577 | 578 | state, reward, done, info = env.step(action) 579 | episode_frames += 1 580 | episode_reward[idx_episode-1] += reward 581 | if episode_frames > max_episode_length: 582 | done = True 583 | if done: 584 | print("Eval: time %d, episode %d, length %d, reward %.0f. @eval_count %s" % 585 | (t, idx_episode, episode_frames, episode_reward[idx_episode-1], eval_count)) 586 | eval_count += 1 587 | save_scalar(eval_count, 'eval/eval_episode_raw_reward', episode_reward[idx_episode-1], self.writer) 588 | save_scalar(eval_count, 'eval/eval_episode_raw_length', episode_frames, self.writer) 589 | sys.stdout.flush() 590 | state = env.reset() 591 | episode_frames = 0 592 | idx_episode += 1 593 | self.atari_processor.reset() 594 | self.history_processor.reset() 595 | 596 | 597 | reward_mean = np.mean(episode_reward) 598 | reward_std = np.std(episode_reward) 599 | print("Evaluation summury: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" % 600 | (num_episodes, reward_mean, reward_std)) 601 | sys.stdout.flush() 602 | 603 | return reward_mean, reward_std, eval_count 604 | --------------------------------------------------------------------------------