├── requirements.txt
├── deeprl_prj
│   ├── __init__.py
│   ├── objectives.py
│   ├── utils.py
│   ├── policy.py
│   ├── preprocessors.py
│   ├── core.py
│   ├── dqn_keras.py
│   ├── dqn_tf_temporalAt.py
│   └── dqn_tf_spatialAt.py
├── readme.md
├── helper.py
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-image
2 | attrs
3 | h5py
4 | keras
5 | matplotlib
6 | numpy
7 | pillow
8 | protobuf>=3.0
9 | pydot-ng
10 | scipy
11 | semver
12 | gym[atari]
13 | 
--------------------------------------------------------------------------------
/deeprl_prj/__init__.py:
--------------------------------------------------------------------------------
1 | from . import core
2 | from . import dqn_keras
3 | from . import dqn_tf_temporalAt
4 | from . import dqn_tf_spatialAt
5 | from . import objectives
6 | from . import policy
7 | from . import preprocessors
8 | from . import utils
9 | 
--------------------------------------------------------------------------------
/deeprl_prj/objectives.py:
--------------------------------------------------------------------------------
1 | """Loss functions."""
2 | 
3 | import tensorflow as tf
4 | import semver
5 | 
6 | def huber_loss(y_true, y_pred, max_grad=1.):
7 |     """Calculate the Huber loss.
8 | 
9 |     See https://en.wikipedia.org/wiki/Huber_loss
10 | 
11 |     Parameters
12 |     ----------
13 |     y_true: np.array, tf.Tensor
14 |         Target value.
15 |     y_pred: np.array, tf.Tensor
16 |         Predicted value.
17 |     max_grad: float, optional
18 |         Positive floating point value. Represents the maximum possible
19 |         gradient magnitude.
20 | 
21 |     Returns
22 |     -------
23 |     tf.Tensor
24 |         The Huber loss.
25 |     """
26 |     with tf.variable_scope("HuberLoss"):
27 |         delta = max_grad
28 |         diff = tf.abs(y_true - y_pred, name = "diff")
29 |         mask = diff < delta
30 |         return tf.where(mask, 0.5 * tf.square(diff), delta * (diff - 0.5 * delta))
31 | 
32 | def mean_huber_loss(y_true, y_pred, max_grad=1.):
33 |     """Return the mean Huber loss.
34 | 
35 |     Same as huber_loss, but takes the mean over all values in the
36 |     output tensor.
37 | 
38 |     Parameters
39 |     ----------
40 |     y_true: np.array, tf.Tensor
41 |         Target value.
42 |     y_pred: np.array, tf.Tensor
43 |         Predicted value.
44 |     max_grad: float, optional
45 |         Positive floating point value. Represents the maximum possible
46 |         gradient magnitude.
47 | 
48 |     Returns
49 |     -------
50 |     tf.Tensor
51 |         The mean Huber loss.
52 |     """
53 |     return tf.reduce_mean(huber_loss(y_true, y_pred, max_grad))
54 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # UAV Obstacle Avoidance using Deep Recurrent Reinforcement Learning with Temporal Attention
2 | 
3 | The code is implemented in TensorFlow (version 1.1.0) and Keras.
4 | 
5 | ## Requirements
6 | 
7 | The code is based on **Python 2**. Install the dependencies by running:
8 | 
9 |     pip install --user -r requirements.txt
10 | 
11 | ## How to run
12 | 
13 | There are two GPU-enabled DQN implementations: Keras and TensorFlow; the import block that selects between them is sketched below. 
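As it appears near the top of **main.py** (uncomment exactly one `DQNAgent` import; the Keras implementation is active by default):

    # Keras implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.
    from deeprl_prj.dqn_keras import DQNAgent

    # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.
    # from deeprl_prj.dqn_tf_temporalAt import DQNAgent

    # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Spatial Attention DQN.
    # from deeprl_prj.dqn_tf_spatialAt import DQNAgent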
14 | You can choose different implementation by altering **line 15** in 15 | **main.py** 16 | 17 | Train original DQN: 18 | 19 | python main.py --task_name 'DQN' 20 | 21 | Train Double DQN: 22 | 23 | python main.py --ddqn --task_name 'Double_DQN' 24 | 25 | Train Dueling DQN: 26 | 27 | python main.py --net_mode=duel --task_name 'Dueling_DQN' 28 | 29 | Train Recurrent DQN: 30 | 31 | python main.py --num_frames 10 --recurrent --task_name 'Recurrent_DQN' 32 | 33 | Train Recurrent Temporal Attention DQN: (Using **dqn_tf_temporalAt.py** by uncommenting **line 18** in **main.py**) 34 | 35 | python main.py --num_frames 10 --recurrent --a_t --selector --task_name 'TemporalAt_DQN' 36 | 37 | Train Recurrent Spatial Attention DQN: (Using **dqn_tf_spatialAt.py** by uncommenting **line 21** in **main.py**) 38 | 39 | python main.py --num_frames 10 --recurrent --a_t --selector --task_name 'SpatialAt_DQN' 40 | 41 | Test trained model (e.g. Spatial Attention DQN): 42 | 43 | python main.py --num_frames 10 --recurrent --a_t --selector --test \ 44 | --load_network --load_network_path=PATH_TO_NET 45 | 46 | ## Acknowledgement 47 | 48 | This code repository is highly inspired from work of Rui Zhu et al [link](https://github.com/chasewind007/Attention-DQN). 49 | -------------------------------------------------------------------------------- /deeprl_prj/utils.py: -------------------------------------------------------------------------------- 1 | """Common functions.""" 2 | 3 | import semver 4 | import tensorflow as tf 5 | 6 | def get_uninitialized_variables(variables=None): 7 | """Return a list of uninitialized tf variables. 8 | 9 | Parameters 10 | ---------- 11 | variables: tf.Variable, list(tf.Variable), optional 12 | Filter variable list to only those that are uninitialized. If no 13 | variables are specified the list of all variables in the graph 14 | will be used. 15 | 16 | Returns 17 | ------- 18 | list(tf.Variable) 19 | List of uninitialized tf variables. 20 | """ 21 | sess = tf.get_default_session() 22 | if variables is None: 23 | variables = tf.global_variables() 24 | else: 25 | variables = list(variables) 26 | 27 | if len(variables) == 0: 28 | return [] 29 | 30 | if semver.match(tf.__version__, '<1.0.0'): 31 | init_flag = sess.run( 32 | tf.pack([tf.is_variable_initialized(v) for v in variables])) 33 | else: 34 | init_flag = sess.run( 35 | tf.stack([tf.is_variable_initialized(v) for v in variables])) 36 | return [v for v, f in zip(variables, init_flag) if not f] 37 | 38 | def get_soft_target_model_updates(target, source, tau): 39 | r"""Return list of target model update ops. 40 | 41 | These are soft target updates. Meaning that the target values are 42 | slowly adjusted, rather than directly copied over from the source 43 | model. 44 | 45 | The update is of the form: 46 | 47 | $W' \gets (1- \tau) W' + \tau W$ where $W'$ is the target weight 48 | and $W$ is the source weight. 49 | 50 | Parameters 51 | ---------- 52 | target: keras.models.Model 53 | The target model. Should have same architecture as source model. 54 | source: keras.models.Model 55 | The source model. Should have same architecture as target model. 56 | tau: float 57 | The weight of the source weights to the target weights used 58 | during update. 59 | 60 | Returns 61 | ------- 62 | list(tf.Tensor) 63 | List of tensor update ops. 
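    Examples
    --------
    A minimal usage sketch (the model variables are placeholders). Note that
    the implementation below returns the blended weight arrays, which the
    caller then applies with ``set_weights``:

    >>> new_weights = get_soft_target_model_updates(target_net, source_net, tau=0.01)
    >>> target_net.set_weights(new_weights)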
64 | """ 65 | target_weights = target.get_weights() 66 | source_weights = source.get_weights() 67 | for i in range(len(target_weights)): 68 | target_weights[i] = (1 - tau) * target_weights[i] + tau * source_weights[i] 69 | return target_weights 70 | 71 | def get_hard_target_model_updates(target, source): 72 | """Return list of target model update ops. 73 | 74 | These are hard target updates. The source weights are copied 75 | directly to the target network. 76 | 77 | Parameters 78 | ---------- 79 | target: keras.models.Model 80 | The target model. Should have same architecture as source model. 81 | source: keras.models.Model 82 | The source model. Should have same architecture as target model. 83 | 84 | Returns 85 | ------- 86 | list(tf.Tensor) 87 | List of tensor update ops. 88 | """ 89 | return source.get_weights() 90 | 91 | def compare_model(target, source): 92 | target_weights = target.get_weights() 93 | source_weights = source.get_weights() 94 | print(len(target_weights)) 95 | for i in range(len(target_weights)): 96 | print(target_weights[i].shape, source_weights[i].shape) 97 | if (target_weights[i] != source_weights[i]).any(): 98 | return False 99 | return True 100 | -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import matplotlib.pyplot as plt 5 | import scipy.misc 6 | import os 7 | import csv 8 | import itertools 9 | import tensorflow.contrib.slim as slim 10 | 11 | #This is a simple function to reshape our game frames. 12 | def processState(state1): 13 | return np.reshape(state1,[21168]) 14 | 15 | #These functions allows us to update the parameters of our target network with those of the primary network. 16 | def updateTargetGraph(tfVars,tau): 17 | total_vars = len(tfVars) 18 | op_holder = [] 19 | for idx,var in enumerate(tfVars[0:total_vars//2]): 20 | print("copy from %s ===> %s"%(var.op.name, tfVars[idx+total_vars//2].op.name)) 21 | # op_holder.append(tfVars[idx+total_vars//2].assign((var.value()*tau) + ((1-tau)*tfVars[idx+total_vars//2].value()))) 22 | op_holder.append(tfVars[idx+total_vars//2].assign(var.value())) 23 | return op_holder 24 | 25 | def updateTarget(op_holder,sess): 26 | for op in op_holder: 27 | sess.run(op) 28 | total_vars = len(tf.trainable_variables()) 29 | a = tf.trainable_variables()[0].eval(session=sess) 30 | b = tf.trainable_variables()[total_vars//2].eval(session=sess) 31 | if a.all() == b.all(): 32 | print("Target Set Success") 33 | else: 34 | print("Target Set Failed") 35 | 36 | #Record performance metrics and episode logs for the Control Center. 
37 | def saveToCenter(i,rList,jList,bufferArray,summaryLength,h_size,sess,mainQN,time_per_step): 38 | with open('./Center/log.csv', 'a') as myfile: 39 | state_display = (np.zeros([1,h_size]),np.zeros([1,h_size])) 40 | imagesS = [] 41 | for idx,z in enumerate(np.vstack(bufferArray[:,0])): 42 | img,state_display = sess.run([mainQN.salience,mainQN.rnn_state],\ 43 | feed_dict={mainQN.scalarInput:np.reshape(bufferArray[idx,0],[1,21168])/255.0,\ 44 | mainQN.trainLength:1,mainQN.state_in:state_display,mainQN.batch_size:1}) 45 | imagesS.append(img) 46 | imagesS = (imagesS - np.min(imagesS))/(np.max(imagesS) - np.min(imagesS)) 47 | imagesS = np.vstack(imagesS) 48 | imagesS = np.resize(imagesS,[len(imagesS),84,84,3]) 49 | luminance = np.max(imagesS,3) 50 | imagesS = np.multiply(np.ones([len(imagesS),84,84,3]),np.reshape(luminance,[len(imagesS),84,84,1])) 51 | make_gif(np.ones([len(imagesS),84,84,3]),'./Center/frames/sal'+str(i)+'.gif',duration=len(imagesS)*time_per_step,true_image=False,salience=True,salIMGS=luminance) 52 | 53 | images = zip(bufferArray[:,0]) 54 | images.append(bufferArray[-1,3]) 55 | images = np.vstack(images) 56 | images = np.resize(images,[len(images),84,84,3]) 57 | make_gif(images,'./Center/frames/image'+str(i)+'.gif',duration=len(images)*time_per_step,true_image=True,salience=False) 58 | 59 | wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) 60 | wr.writerow([i,np.mean(jList[-100:]),np.mean(rList[-summaryLength:]),'./frames/image'+str(i)+'.gif','./frames/log'+str(i)+'.csv','./frames/sal'+str(i)+'.gif']) 61 | myfile.close() 62 | with open('./Center/frames/log'+str(i)+'.csv','w') as myfile: 63 | state_train = (np.zeros([1,h_size]),np.zeros([1,h_size])) 64 | wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) 65 | wr.writerow(["ACTION","REWARD","A0","A1",'A2','A3','V']) 66 | a, v = sess.run([mainQN.Advantage,mainQN.Value],\ 67 | feed_dict={mainQN.scalarInput:np.vstack(bufferArray[:,0])/255.0,mainQN.trainLength:len(bufferArray),mainQN.state_in:state_train,mainQN.batch_size:1}) 68 | wr.writerows(zip(bufferArray[:,1],bufferArray[:,2],a[:,0],a[:,1],a[:,2],a[:,3],v[:,0])) 69 | 70 | #This code allows gifs to be saved of the training episode for use in the Control Center. 71 | def make_gif(images, fname, duration=2, true_image=False,salience=False,salIMGS=None): 72 | import moviepy.editor as mpy 73 | 74 | def make_frame(t): 75 | try: 76 | x = images[int(len(images)/duration*t)] 77 | except: 78 | x = images[-1] 79 | 80 | if true_image: 81 | return x.astype(np.uint8) 82 | else: 83 | return ((x+1)/2*255).astype(np.uint8) 84 | 85 | def make_mask(t): 86 | try: 87 | x = salIMGS[int(len(salIMGS)/duration*t)] 88 | except: 89 | x = salIMGS[-1] 90 | return x 91 | 92 | clip = mpy.VideoClip(make_frame, duration=duration) 93 | if salience == True: 94 | mask = mpy.VideoClip(make_mask, ismask=True,duration= duration) 95 | clipB = clip.set_mask(mask) 96 | clipB = clip.set_opacity(0) 97 | mask = mask.set_opacity(0.1) 98 | mask.write_gif(fname, fps = len(images) / duration,verbose=False) 99 | #clipB.write_gif(fname, fps = len(images) / duration,verbose=False) 100 | else: 101 | clip.write_gif(fname, fps = len(images) / duration,verbose=False) 102 | -------------------------------------------------------------------------------- /deeprl_prj/policy.py: -------------------------------------------------------------------------------- 1 | """RL Policy classes.""" 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | class Policy: 8 | """Base class representing an MDP policy. 
9 | 10 | Policies are used by the agent to choose actions. 11 | 12 | Policies are designed to be stacked to get interesting behaviors 13 | of choices. For instances in a discrete action space the lowest 14 | level policy may take in Q-Values and select the action index 15 | corresponding to the largest value. If this policy is wrapped in 16 | an epsilon greedy policy then with some probability epsilon, a 17 | random action will be chosen. 18 | """ 19 | 20 | def select_action(self, **kwargs): 21 | """Used by agents to select actions. 22 | 23 | Returns 24 | ------- 25 | Any: 26 | An object representing the chosen action. Type depends on 27 | the hierarchy of policy instances. 28 | """ 29 | raise NotImplementedError('This method should be overriden.') 30 | 31 | class UniformRandomPolicy(Policy): 32 | """Chooses a discrete action with uniform random probability. 33 | 34 | Parameters 35 | ---------- 36 | num_actions: int 37 | Number of actions to choose from. Must be > 0. 38 | 39 | Raises 40 | ------ 41 | ValueError: 42 | If num_actions <= 0 43 | """ 44 | 45 | def __init__(self, num_actions): 46 | assert num_actions >= 1 47 | self.num_actions = num_actions 48 | 49 | def select_action(self, **kwargs): 50 | """Return a random action index. 51 | 52 | This policy cannot contain others (as they would just be ignored). 53 | 54 | Returns 55 | ------- 56 | int: 57 | Action index in range [0, num_actions) 58 | """ 59 | return np.random.randint(0, self.num_actions) 60 | 61 | def get_config(self): # noqa: D102 62 | return {'num_actions': self.num_actions} 63 | 64 | class GreedyPolicy(Policy): 65 | """Always returns best action according to Q-values. 66 | 67 | This is a pure exploitation policy. 68 | """ 69 | 70 | def select_action(self, q_values, **kwargs): # noqa: D102 71 | return np.argmax(q_values) 72 | 73 | class GreedyEpsilonPolicy(Policy): 74 | """Selects greedy action or with some probability a random action. 75 | 76 | Standard greedy-epsilon implementation. With probability epsilon 77 | choose a random action. Otherwise choose the greedy action. 78 | 79 | Parameters 80 | ---------- 81 | epsilon: float 82 | Initial probability of choosing a random action. Can be changed 83 | over time. 84 | """ 85 | def __init__(self, epsilon): 86 | self.epsilon = epsilon 87 | 88 | def select_action(self, q_values, **kwargs): 89 | """Run Greedy-Epsilon for the given Q-values. 90 | 91 | Parameters 92 | ---------- 93 | q_values: array-like 94 | Array-like structure of floats representing the Q-values for 95 | each action. 96 | 97 | Returns 98 | ------- 99 | int: 100 | The action index chosen. 101 | """ 102 | num_actions = q_values.shape[1] 103 | if np.random.rand() < self.epsilon: 104 | return UniformRandomPolicy(num_actions).select_action() 105 | else: 106 | return GreedyPolicy().select_action(q_values) 107 | 108 | class LinearDecayGreedyEpsilonPolicy(Policy): 109 | """Policy with a parameter that decays linearly. 110 | 111 | Like GreedyEpsilonPolicy but the epsilon decays from a start value 112 | to an end value over k steps. 113 | 114 | Parameters 115 | ---------- 116 | start_value: int, float 117 | The initial value of the parameter 118 | end_value: int, float 119 | The value of the policy at the end of the decay. 120 | num_steps: int 121 | The number of steps over which to decay the value. 
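    Examples
    --------
    An illustrative sketch (values are hypothetical): anneal epsilon from 1.0
    to 0.05 over 1000 training steps, then pick an action for a single set of
    Q-values.

    >>> policy = LinearDecayGreedyEpsilonPolicy(1.0, 0.05, 1000)
    >>> q_values = np.array([[0.1, 0.5, 0.2]])
    >>> action = policy.select_action(q_values, is_training=True)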
122 | 123 | """ 124 | 125 | def __init__(self, start_value, end_value, num_steps): # noqa: D102 126 | self.start_value = start_value 127 | self.decay_rate = float(end_value - start_value) / num_steps 128 | self.end_value = end_value 129 | self.step = 0 130 | 131 | def select_action(self, q_values, is_training = True, **kwargs): 132 | """Decay parameter and select action. 133 | 134 | Parameters 135 | ---------- 136 | q_values: np.array 137 | The Q-values for each action. 138 | is_training: bool, optional 139 | If true then parameter will be decayed. Defaults to true. 140 | 141 | Returns 142 | ------- 143 | Any: 144 | Selected action. 145 | """ 146 | epsilon = self.start_value 147 | if is_training: 148 | epsilon += self.decay_rate * self.step 149 | self.step += 1 150 | epsilon = max(epsilon, self.end_value) 151 | return GreedyEpsilonPolicy(epsilon).select_action(q_values) 152 | 153 | def reset(self): 154 | """Start the decay over at the start value.""" 155 | self.step = 0 156 | -------------------------------------------------------------------------------- /deeprl_prj/preprocessors.py: -------------------------------------------------------------------------------- 1 | """Suggested Preprocessors.""" 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | from deeprl_prj import utils 7 | from deeprl_prj.core import Preprocessor 8 | 9 | 10 | class HistoryPreprocessor(Preprocessor): 11 | """Keeps the last k states. 12 | 13 | Useful for domains where you need velocities, but the state 14 | contains only positions. 15 | 16 | When the environment starts, this will just fill the initial 17 | sequence values with zeros k times. 18 | 19 | Parameters 20 | ---------- 21 | history_length: int 22 | Number of previous states to prepend to state being processed. 23 | 24 | """ 25 | 26 | def __init__(self, history_length=1): 27 | self.history_length = history_length 28 | self.past_states = None 29 | self.past_states_ori = None 30 | 31 | def process_state_for_network(self, state): 32 | """You only want history when you're deciding the current action to take.""" 33 | row, col = state.shape 34 | if self.past_states is None: 35 | self.past_states = np.zeros((row, col, self.history_length)) 36 | history = np.dstack((self.past_states, state)) 37 | self.past_states = history[:, :, 1:] 38 | return history 39 | 40 | def process_state_for_network_ori(self, state): 41 | """You only want history when you're deciding the current action to take.""" 42 | row, col = state.shape 43 | channel = 1 44 | if self.past_states_ori is None: 45 | self.past_states_ori = np.zeros((row, col, channel, self.history_length)) 46 | history = np.concatenate((self.past_states_ori, np.expand_dims(state, -1)), axis=3) 47 | self.past_states_ori = history[:, :, :, 1:] 48 | return history 49 | 50 | def reset(self): 51 | """Reset the history sequence. 52 | 53 | Useful when you start a new episode. 54 | """ 55 | self.past_states = None 56 | self.past_states_ori = None 57 | 58 | def get_config(self): 59 | return {'history_length': self.history_length} 60 | 61 | class AtariPreprocessor(Preprocessor): 62 | """Converts images to greyscale and downscales. 63 | 64 | Based on the preprocessing step described in: 65 | 66 | @article{mnih15_human_level_contr_throug_deep_reinf_learn, 67 | author = {Volodymyr Mnih and Koray Kavukcuoglu and David 68 | Silver and Andrei A. Rusu and Joel Veness and Marc 69 | G. Bellemare and Alex Graves and Martin Riedmiller 70 | and Andreas K. 
Fidjeland and Georg Ostrovski and 71 | Stig Petersen and Charles Beattie and Amir Sadik and 72 | Ioannis Antonoglou and Helen King and Dharshan 73 | Kumaran and Daan Wierstra and Shane Legg and Demis 74 | Hassabis}, 75 | title = {Human-Level Control Through Deep Reinforcement 76 | Learning}, 77 | journal = {Nature}, 78 | volume = 518, 79 | number = 7540, 80 | pages = {529-533}, 81 | year = 2015, 82 | doi = {10.1038/nature14236}, 83 | url = {http://dx.doi.org/10.1038/nature14236}, 84 | } 85 | 86 | You may also want to max over frames to remove flickering. Some 87 | games require this (based on animations and the limited sprite 88 | drawing capabilities of the original Atari). 89 | 90 | Parameters 91 | ---------- 92 | new_size: 2 element tuple 93 | The size that each image in the state should be scaled to. e.g 94 | (84, 84) will make each image in the output have shape (84, 84). 95 | """ 96 | 97 | def process_state_for_memory(self, state): 98 | """Scale, convert to greyscale and store as uint8. 99 | 100 | We don't want to save floating point numbers in the replay 101 | memory. We get the same resolution as uint8, but use a quarter 102 | to an eigth of the bytes (depending on float32 or float64) 103 | 104 | We recommend using the Python Image Library (PIL) to do the 105 | image conversions. 106 | """ 107 | img = Image.fromarray(state).convert('L').resize((84, 84), Image.BILINEAR) 108 | state = np.array(img) 109 | return state 110 | 111 | def process_state_for_network(self, state): 112 | """Scale, convert to greyscale and store as float32. 113 | 114 | Basically same as process state for memory, but this time 115 | outputs float32 images. 116 | """ 117 | return np.float32(self.process_state_for_memory(state) / 255.0) 118 | 119 | def process_state_for_network_ori(self, state): 120 | """Scale, convert to greyscale and store as float32. 121 | 122 | Basically same as process state for memory, but this time 123 | outputs float32 images. 124 | """ 125 | img = Image.fromarray(state) 126 | state = np.float32(np.array(img) / 255.0) 127 | return state 128 | 129 | def process_batch(self, samples): 130 | """The batches from replay memory will be uint8, convert to float32. 131 | 132 | Same as process_state_for_network but works on a batch of 133 | samples from the replay memory. Meaning you need to convert 134 | both state and next state values. 135 | """ 136 | batch_size = len(samples) 137 | for i in range(batch_size): 138 | samples[i].state = np.float32(samples[i].state / 255.0) 139 | samples[i].next_state = np.float32(samples[i].next_state / 255.0) 140 | return samples 141 | 142 | def process_reward(self, reward): 143 | """Clip reward between -1 and 1.""" 144 | # return np.clip(reward, -1, 1) 145 | return reward 146 | 147 | def reset(self): 148 | self.last_state = None 149 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import gym 6 | from gym import wrappers 7 | import tensorflow as tf 8 | from future.builtins import input 9 | 10 | # >>>>>>>>>>>>>>>>>>>>>>>> 11 | # Different implementation of DQNAgent 12 | # Uncomment the one you want to train and test 13 | 14 | # Keras implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN. 15 | from deeprl_prj.dqn_keras import DQNAgent 16 | 17 | # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN. 
18 | # from deeprl_prj.dqn_tf_temporalAt import DQNAgent 19 | 20 | # Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Spatial Attention DQN. 21 | # from deeprl_prj.dqn_tf_spatialAt import DQNAgent 22 | 23 | # <<<<<<<<<<<<<<<<<<<<<<<<< 24 | 25 | def get_output_folder(args, parent_dir, env_name, task_name): 26 | """Return save folder. 27 | 28 | Assumes folders in the parent_dir have suffix -run{run 29 | number}. Finds the highest run number and sets the output folder 30 | to that number + 1. This is just convenient so that if you run the 31 | same script multiple times tensorboard can plot all of the results 32 | on the same plots with different names. 33 | 34 | Parameters 35 | ---------- 36 | parent_dir: str 37 | Path of the directory containing all experiment runs. 38 | 39 | Returns 40 | ------- 41 | parent_dir/run_dir 42 | Path to this run's save directory. 43 | """ 44 | if not os.path.exists(parent_dir): 45 | os.makedirs(parent_dir) 46 | print('===== Folder did not exist; creating... %s'%parent_dir) 47 | 48 | experiment_id = 0 49 | for folder_name in os.listdir(parent_dir): 50 | if not os.path.isdir(os.path.join(parent_dir, folder_name)): 51 | continue 52 | try: 53 | folder_name = int(folder_name.split('-run')[-1][0]) 54 | print(folder_name) 55 | if folder_name > experiment_id: 56 | experiment_id = folder_name 57 | except: 58 | pass 59 | experiment_id += 1 60 | 61 | parent_dir = os.path.join(parent_dir, env_name) 62 | parent_dir = parent_dir + '-run{}'.format(experiment_id) + '-' + task_name 63 | if not os.path.exists(parent_dir): 64 | os.makedirs(parent_dir) 65 | print('===== Folder did not exist; creating... %s'%parent_dir) 66 | else: 67 | print('===== Folder exists; delete? %s'%parent_dir) 68 | response = input("Press Enter to continue...") 69 | os.system('rm -rf %s/' % (parent_dir)) 70 | os.makedirs(parent_dir+'/videos/') 71 | os.makedirs(parent_dir+'/images/') 72 | os.makedirs(parent_dir+'/losses/') 73 | return parent_dir 74 | 75 | def main(): 76 | parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout') 77 | parser.add_argument('--env', default='QuadCopter-v4', help='Atari env name') 78 | parser.add_argument('-o', '--output', default='./log/', help='Directory to save data to') 79 | parser.add_argument('--seed', default=0, type=int, help='Random seed') 80 | parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor') 81 | parser.add_argument('--batch_size', default=32, type=int, help='Minibatch size') 82 | parser.add_argument('--learning_rate', default=0.0001, type=float, help='Learning rate') 83 | parser.add_argument('--initial_epsilon', default=1.0, type=float, help='Initial exploration probability in epsilon-greedy') 84 | parser.add_argument('--final_epsilon', default=0.05, type=float, help='Final exploration probability in epsilon-greedy') 85 | parser.add_argument('--exploration_steps', default=24000, type=int, help='Number of steps over which the initial value of epsilon is linearly annealed to its final value') 86 | parser.add_argument('--num_samples', default=40000, type=int, help='Number of training samples from the environment in training') 87 | parser.add_argument('--num_frames', default=4, type=int, help='Number of frames to feed to Q-Network') 88 | parser.add_argument('--frame_width', default=84, type=int, help='Resized frame width') 89 | parser.add_argument('--frame_height', default=84, type=int, help='Resized frame height') 90 | parser.add_argument('--replay_memory_size', default=50000, type=int, help='Number of 
replay memory the agent uses for training') 91 | parser.add_argument('--target_update_freq', default=200, type=int, help='The frequency with which the target network is updated') 92 | parser.add_argument('--train_freq', default=4, type=int, help='The frequency of actions wrt Q-network update') 93 | parser.add_argument('--save_freq', default=500, type=int, help='The frequency with which the network is saved') 94 | parser.add_argument('--eval_freq', default=200, type=int, help='The frequency with which the policy is evlauted') 95 | parser.add_argument('--num_burn_in', default=10000, type=int, help='Number of steps to populate the replay memory before training starts') 96 | parser.add_argument('--load_network', default=False, action='store_true', help='Load trained mode') 97 | parser.add_argument('--load_network_path', default='', help='the path to the trained mode file') 98 | parser.add_argument('--net_mode', default='dqn', help='choose the mode of net, can be linear, dqn, duel') 99 | parser.add_argument('--max_episode_length', default = 1000, type=int, help = 'max length of each episode') 100 | parser.add_argument('--num_episodes_at_test', default = 20, type=int, help='Number of episodes the agent plays at test') 101 | parser.add_argument('--ddqn', default=False, dest='ddqn', action='store_true', help='enable ddqn') 102 | parser.add_argument('--train', default=True, dest='train', action='store_true', help='Train mode') 103 | parser.add_argument('--test', dest='train', action='store_false', help='Test mode') 104 | parser.add_argument('--no_experience', default=False, action='store_true', help='do not use experience replay') 105 | parser.add_argument('--no_target', default=False, action='store_true', help='do not use target fixing') 106 | parser.add_argument('--monitor', default=False, action='store_true', help='record video') 107 | parser.add_argument('--task_name', default='', help='task name') 108 | parser.add_argument('--recurrent', default=False, dest='recurrent', action='store_true', help='enable recurrent DQN') 109 | parser.add_argument('--a_t', default=False, dest='a_t', action='store_true', help='enable temporal/spatial attention') 110 | parser.add_argument('--global_a_t', default=False, dest='global_a_t', action='store_true', help='enable global temporal attention') 111 | parser.add_argument('--selector', default=False, dest='selector', action='store_true', help='enable selector for spatial attention') 112 | parser.add_argument('--bidir', default=False, dest='bidir', action='store_true', help='enable two layer bidirectional lstm') 113 | 114 | args = parser.parse_args() 115 | args.output = get_output_folder(args, args.output, args.env, args.task_name) 116 | 117 | env = gym.make(args.env) 118 | print("==== Output saved to: ", args.output) 119 | print("==== Args used:") 120 | print(args) 121 | 122 | # here is where you should start up a session, 123 | # create your DQN agent, create your model, etc. 124 | # then you can run your fit method. 
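    # (No explicit session setup is needed in this script: the Keras
    # implementation in deeprl_prj/dqn_keras.py creates a TF session with
    # gpu_options.allow_growth when it is imported above.)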
125 | 126 | num_actions = env.action_space.n 127 | print(">>>> Game ", args.env, " #actions: ", num_actions) 128 | 129 | dqn = DQNAgent(args, num_actions) 130 | if args.train: 131 | print(">> Training mode.") 132 | dqn.fit(env, args.num_samples, args.max_episode_length) 133 | else: 134 | print(">> Evaluation mode.") 135 | dqn.evaluate(env, args.num_episodes_at_test, 0, args.max_episode_length, args.monitor) 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /deeprl_prj/core.py: -------------------------------------------------------------------------------- 1 | """Core classes.""" 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | class Sample: 7 | """Represents a reinforcement learning sample. 8 | 9 | Used to store observed experience from an MDP. Represents a 10 | standard `(s, a, r, s', terminal)` tuple. 11 | 12 | Parameters 13 | ---------- 14 | state: array-like 15 | Represents the state of the MDP before taking an action. In most 16 | cases this will be a numpy array. 17 | action: int, float, tuple 18 | For discrete action domains this will be an integer. For 19 | continuous action domains this will be a floating point 20 | number. For a parameterized action MDP this will be a tuple 21 | containing the action and its associated parameters. 22 | reward: float 23 | The reward received for executing the given action in the given 24 | state and transitioning to the resulting state. 25 | next_state: array-like 26 | This is the state the agent transitions to after executing the 27 | `action` in `state`. Expected to be the same type/dimensions as 28 | the state. 29 | is_terminal: boolean 30 | True if this action finished the episode. False otherwise. 31 | """ 32 | def __init__(self, state, action, reward, next_state, is_terminal): 33 | self.state = state 34 | self.action = action 35 | self.reward = reward 36 | self.next_state = next_state 37 | self.is_terminal = is_terminal 38 | 39 | class Preprocessor: 40 | """Preprocessor base class. 41 | 42 | This is a suggested interface for the preprocessing steps. 43 | 44 | Preprocessor can be used to perform some fixed operations on the 45 | raw state from an environment. For example, in ConvNet based 46 | networks which use image as the raw state, it is often useful to 47 | convert the image to greyscale or downsample the image. 48 | 49 | Preprocessors are implemented as class so that they can have 50 | internal state. This can be useful for things like the 51 | AtariPreproccessor which maxes over k frames. 52 | 53 | If you're using internal states, such as for keeping a sequence of 54 | inputs like in Atari, you should probably call reset when a new 55 | episode begins so that state doesn't leak in from episode to 56 | episode. 57 | """ 58 | 59 | def process_state_for_network(self, state): 60 | """Preprocess the given state before giving it to the network. 61 | 62 | Should be called just before the action is selected. 63 | 64 | This is a different method from the process_state_for_memory 65 | because the replay memory may require a different storage 66 | format to reduce memory usage. For example, storing images as 67 | uint8 in memory is a lot more efficient thant float32, but the 68 | networks work better with floating point images. 69 | 70 | Parameters 71 | ---------- 72 | state: np.ndarray 73 | Generally a numpy array. A single state from an environment. 74 | 75 | Returns 76 | ------- 77 | processed_state: np.ndarray 78 | Generally a numpy array. 
The state after processing. Can be 79 | modified in anyway. 80 | """ 81 | return state 82 | 83 | def process_state_for_memory(self, state): 84 | """Preprocess the given state before giving it to the replay memory. 85 | 86 | Should be called just before appending this to the replay memory. 87 | 88 | This is a different method from the process_state_for_network 89 | because the replay memory may require a different storage 90 | format to reduce memory usage. For example, storing images as 91 | uint8 in memory and the network expecting images in floating 92 | point. 93 | 94 | Parameters 95 | ---------- 96 | state: np.ndarray 97 | A single state from an environmnet. Generally a numpy array. 98 | 99 | Returns 100 | ------- 101 | processed_state: np.ndarray 102 | Generally a numpy array. The state after processing. Can be 103 | modified in any manner. 104 | """ 105 | return state 106 | 107 | def process_batch(self, samples): 108 | """Process batch of samples. 109 | 110 | If your replay memory storage format is different than your 111 | network input, you may want to apply this function to your 112 | sampled batch before running it through your update function. 113 | 114 | Parameters 115 | ---------- 116 | samples: list(tensorflow_rl.core.Sample) 117 | List of samples to process 118 | 119 | Returns 120 | ------- 121 | processed_samples: list(tensorflow_rl.core.Sample) 122 | Samples after processing. Can be modified in anyways, but 123 | the list length will generally stay the same. 124 | """ 125 | return samples 126 | 127 | def process_reward(self, reward): 128 | """Process the reward. 129 | 130 | Useful for things like reward clipping. The Atari environments 131 | from DQN paper do this. Instead of taking real score, they 132 | take the sign of the delta of the score. 133 | 134 | Parameters 135 | ---------- 136 | reward: float 137 | Reward to process 138 | 139 | Returns 140 | ------- 141 | processed_reward: float 142 | The processed reward 143 | """ 144 | return reward 145 | 146 | def reset(self): 147 | """Reset any internal state. 148 | 149 | Will be called at the start of every new episode. Makes it 150 | possible to do history snapshots. 151 | """ 152 | pass 153 | 154 | class ReplayMemory: 155 | """Interface for replay memories. 156 | 157 | Methods 158 | ------- 159 | append(state, action, reward, debug_info=None) 160 | Add a sample to the replay memory. 161 | end_episode(final_state, is_terminal, debug_info=None) 162 | Set the final state of an episode and mark whether it was a true 163 | terminal state (i.e. the env returned is_terminal=True), of it 164 | is an artificial terminal state (i.e. agent quit the episode 165 | early, but agent could have kept running episode). 166 | sample(batch_size, indexes=None) 167 | Return list of samples from the memory. Each class will 168 | implement a different method of choosing the 169 | samples. Optionally, specify the sample indexes manually. 170 | clear() 171 | Reset the memory. Deletes all references to the samples. 172 | """ 173 | def __init__(self, args): 174 | """Setup memory. 175 | 176 | You should specify the maximum size o the memory. Once the 177 | memory fills up oldest values should be removed. You can try 178 | the collections.deque class as the underlying storage, but 179 | your sample method will be very slow. 180 | 181 | We recommend using a list as a ring buffer. Just track the 182 | index where the next sample should be inserted in the list. 
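        Examples
        --------
        A usage sketch mirroring how the agent drives this class (``args`` is
        the argparse namespace from main.py; the state/action variables are
        placeholders):

        >>> memory = ReplayMemory(args)
        >>> memory.append(processed_state, action, reward, is_terminal=False)
        >>> batch = memory.sample(args.batch_size)  # list of Sample objects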
183 | """ 184 | self.memory_size = args.replay_memory_size 185 | self.history_length = args.num_frames 186 | self.actions = np.zeros(self.memory_size, dtype = np.int8) 187 | self.rewards = np.zeros(self.memory_size, dtype = np.int8) 188 | self.screens = np.zeros((self.memory_size, args.frame_height, args.frame_width), dtype = np.uint8) 189 | self.terminals = np.zeros(self.memory_size, dtype = np.bool) 190 | self.current = 0 191 | 192 | def append(self, state, action, reward, is_terminal): 193 | self.actions[self.current % self.memory_size] = action 194 | self.rewards[self.current % self.memory_size] = reward 195 | self.screens[self.current % self.memory_size] = state 196 | self.terminals[self.current % self.memory_size] = is_terminal 197 | # img = Image.fromarray(state, mode = 'L') 198 | # path = "./tmp/%05d-%s.png" % (self.current, is_terminal) 199 | # img.save(path) 200 | self.current += 1 201 | 202 | def get_state(self, index): 203 | state = self.screens[index - self.history_length + 1:index + 1, :, :] 204 | # history dimention last 205 | return np.transpose(state, (1, 2, 0)) 206 | 207 | def sample(self, batch_size): 208 | samples = [] 209 | indexes = [] 210 | # ensure enough frames to sample 211 | assert self.current > self.history_length 212 | # -1 because still need next frame 213 | end = min(self.current, self.memory_size) - 1 214 | 215 | while len(indexes) < batch_size: 216 | index = np.random.randint(self.history_length - 1, end) 217 | # sampled state shouldn't contain episode end 218 | if self.terminals[index - self.history_length + 1: index + 1].any(): 219 | continue 220 | indexes.append(index) 221 | 222 | for idx in indexes: 223 | new_sample = Sample(self.get_state(idx), self.actions[idx], 224 | self.rewards[idx], self.get_state(idx + 1), self.terminals[idx]) 225 | samples.append(new_sample) 226 | return samples 227 | 228 | def clear(self): 229 | self.current = 0 230 | -------------------------------------------------------------------------------- /deeprl_prj/dqn_keras.py: -------------------------------------------------------------------------------- 1 | '''Keras DQN Agent implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.''' 2 | 3 | from deeprl_prj.policy import * 4 | from deeprl_prj.objectives import * 5 | from deeprl_prj.preprocessors import * 6 | from deeprl_prj.utils import * 7 | from deeprl_prj.core import * 8 | 9 | import keras 10 | from keras.optimizers import (Adam, RMSprop) 11 | from keras.layers import (Activation, Convolution2D, Dense, Flatten, Input, 12 | Permute, merge, Merge, Lambda, Reshape, TimeDistributed, LSTM, RepeatVector, Permute, multiply) 13 | from keras.layers.wrappers import Bidirectional 14 | from keras.models import Model 15 | from keras import backend as K 16 | from keras.backend.tensorflow_backend import set_session 17 | 18 | import sys 19 | from gym import wrappers 20 | import tensorflow as tf 21 | import numpy as np 22 | 23 | config = tf.ConfigProto() 24 | config.gpu_options.allow_growth = True 25 | config.allow_soft_placement = True 26 | set_session(tf.Session(config=config)) 27 | 28 | def create_model(input_shape, num_actions, mode, args, model_name='q_network'): # noqa: D103 29 | """Create the Q-network model. 30 | 31 | Use Keras to construct a keras.models.Model instance. 32 | 33 | Parameters 34 | ---------- 35 | window: int 36 | Each input to the network is a sequence of frames. This value 37 | defines how many frames are in the sequence. 
38 | input_shape: tuple(int, int, int), rows, cols, channels 39 | The expected input image size. 40 | num_actions: int 41 | Number of possible actions. Defined by the gym environment. 42 | model_name: str 43 | Useful when debugging. Makes the model show up nicer in tensorboard. 44 | 45 | Returns 46 | ------- 47 | keras.models.Model 48 | The Q-model. 49 | """ 50 | assert(mode in ("linear", "duel", "dqn")) 51 | with tf.variable_scope(model_name): 52 | input_data = Input(shape = input_shape, name = "input") 53 | if mode == "linear": # We will never enter this loop 54 | flatten_hidden = Flatten(name = "flatten")(input_data) #(H, W, D, Batch) 55 | output = Dense(num_actions, name = "output")(flatten_hidden) 56 | # Directly come here for DQN 57 | else: 58 | if not(args.recurrent): # Only when "not" using DRQN 59 | h1 = Convolution2D(32, (8, 8), strides = 4, activation = "relu", name = "conv1")(input_data) 60 | h2 = Convolution2D(64, (4, 4), strides = 2, activation = "relu", name = "conv2")(h1) 61 | h3 = Convolution2D(64, (3, 3), strides = 1, activation = "relu", name = "conv3")(h2) 62 | context = Flatten(name = "flatten")(h3) 63 | # ENTER HERE FOR DRQN 64 | else: 65 | print('>>>> Defining Recurrent Modules...') 66 | input_data_expanded = Reshape((input_shape[0], input_shape[1], input_shape[2], 1), input_shape = input_shape) (input_data) 67 | input_data_TimeDistributed = Permute((3, 1, 2, 4), input_shape=input_shape)(input_data_expanded) # (D, H, W, Batch) 68 | h1 = TimeDistributed(Convolution2D(32, (8, 8), strides = 4, activation = "relu", name = "conv1"), \ 69 | input_shape=(args.num_frames, input_shape[0], input_shape[1], 1))(input_data_TimeDistributed) 70 | h2 = TimeDistributed(Convolution2D(64, (4, 4), strides = 2, activation = "relu", name = "conv2"))(h1) 71 | h3 = TimeDistributed(Convolution2D(64, (3, 3), strides = 1, activation = "relu", name = "conv3"))(h2) 72 | flatten_hidden = TimeDistributed(Flatten())(h3) 73 | hidden_input = TimeDistributed(Dense(512, activation = 'relu', name = 'flat_to_512')) (flatten_hidden) 74 | if not(args.a_t): 75 | context = LSTM(512, return_sequences=False, stateful=False, input_shape=(args.num_frames, 512)) (hidden_input) 76 | else: 77 | if args.bidir: 78 | hidden_input = Bidirectional(LSTM(512, return_sequences=True, stateful=False, input_shape=(args.num_frames, 512)), merge_mode='sum') (hidden_input) 79 | all_outs = Bidirectional(LSTM(512, return_sequences=True, stateful=False, input_shape=(args.num_frames, 512)), merge_mode='sum') (hidden_input) 80 | else: 81 | all_outs = LSTM(512, return_sequences=True, stateful=False, input_shape=(args.num_frames, 512)) (hidden_input) 82 | # attention 83 | attention = TimeDistributed(Dense(1, activation='tanh'))(all_outs) 84 | # print(attention.shape) 85 | attention = Flatten()(attention) 86 | attention = Activation('softmax')(attention) 87 | attention = RepeatVector(512)(attention) 88 | attention = Permute([2, 1])(attention) 89 | sent_representation = merge([all_outs, attention], mode='mul') 90 | context = Lambda(lambda xin: K.sum(xin, axis=-2), output_shape=(512,))(sent_representation) 91 | # print(context.shape) 92 | 93 | if mode == "dqn": 94 | h4 = Dense(512, activation='relu', name = "fc")(context) 95 | output = Dense(num_actions, name = "output")(h4) 96 | elif mode == "duel": 97 | value_hidden = Dense(512, activation = 'relu', name = 'value_fc')(context) 98 | value = Dense(1, name = "value")(value_hidden) 99 | action_hidden = Dense(512, activation = 'relu', name = 'action_fc')(context) 100 | action = 
Dense(num_actions, name = "action")(action_hidden) 101 | action_mean = Lambda(lambda x: tf.reduce_mean(x, axis = 1, keep_dims = True), name = 'action_mean')(action) 102 | output = Lambda(lambda x: x[0] + x[1] - x[2], name = 'output')([action, value, action_mean]) 103 | model = Model(inputs = input_data, outputs = output) 104 | print(model.summary()) 105 | return model 106 | 107 | def save_scalar(step, name, value, writer): 108 | """Save a scalar value to tensorboard. 109 | Parameters 110 | ---------- 111 | step: int 112 | Training step (sets the position on x-axis of tensorboard graph. 113 | name: str 114 | Name of variable. Will be the name of the graph in tensorboard. 115 | value: float 116 | The value of the variable at this step. 117 | writer: tf.FileWriter 118 | The tensorboard FileWriter instance. 119 | """ 120 | summary = tf.Summary() 121 | summary_value = summary.value.add() 122 | summary_value.simple_value = float(value) 123 | summary_value.tag = name 124 | writer.add_summary(summary, step) 125 | 126 | class DQNAgent: 127 | """Class implementing DQN. 128 | 129 | This is a basic outline of the functions/parameters to implement the DQNAgnet. 130 | 131 | Parameters 132 | ---------- 133 | q_network: keras.models.Model 134 | Your Q-network model. 135 | preprocessor: deeprl_hw2.core.Preprocessor 136 | The preprocessor class. See the associated classes for more 137 | details. 138 | memory: deeprl_hw2.core.Memory 139 | Your replay memory. 140 | gamma: float 141 | Discount factor. 142 | target_update_freq: float 143 | Frequency to update the target network. You can either provide a 144 | number representing a soft target update (see utils.py) or a 145 | hard target update (see utils.py and Atari paper.) 146 | num_burn_in: int 147 | Before you begin updating the Q-network your replay memory has 148 | to be filled up with some number of samples. This number says 149 | how many. 150 | train_freq: int 151 | How often you actually update your Q-Network. Sometimes 152 | stability is improved if you collect a couple samples for your 153 | replay memory, for every Q-network update that you run. 154 | batch_size: int 155 | How many samples in each minibatch. 
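    Examples
    --------
    Construction and training mirror main.py (``args`` is the parsed argparse
    namespace, ``env`` a gym environment):

    >>> agent = DQNAgent(args, num_actions=env.action_space.n)
    >>> agent.fit(env, args.num_samples, args.max_episode_length)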
156 | """ 157 | def __init__(self, args, num_actions): 158 | self.num_actions = num_actions 159 | input_shape = (args.frame_height, args.frame_width, args.num_frames) 160 | self.history_processor = HistoryPreprocessor(args.num_frames - 1) 161 | self.atari_processor = AtariPreprocessor() 162 | self.memory = ReplayMemory(args) 163 | self.policy = LinearDecayGreedyEpsilonPolicy(args.initial_epsilon, args.final_epsilon, args.exploration_steps) 164 | self.gamma = args.gamma 165 | self.target_update_freq = args.target_update_freq 166 | self.num_burn_in = args.num_burn_in 167 | self.train_freq = args.train_freq 168 | self.batch_size = args.batch_size 169 | self.learning_rate = args.learning_rate 170 | self.frame_width = args.frame_width 171 | self.frame_height = args.frame_height 172 | self.num_frames = args.num_frames 173 | self.output_path = args.output 174 | self.output_path_videos = args.output + '/videos/' 175 | self.save_freq = args.save_freq 176 | self.load_network = args.load_network 177 | self.load_network_path = args.load_network_path 178 | self.enable_ddqn = args.ddqn 179 | self.net_mode = args.net_mode 180 | self.q_network = create_model(input_shape, num_actions, self.net_mode, args, "QNet") 181 | self.target_network = create_model(input_shape, num_actions, self.net_mode, args, "TargetNet") 182 | print(">>>> Net mode: %s, Using double dqn: %s" % (self.net_mode, self.enable_ddqn)) 183 | self.eval_freq = args.eval_freq 184 | self.no_experience = args.no_experience 185 | self.no_target = args.no_target 186 | print(">>>> Target fixing: %s, Experience replay: %s" % (not self.no_target, not self.no_experience)) 187 | 188 | # initialize target network 189 | self.target_network.set_weights(self.q_network.get_weights()) 190 | self.final_model = None 191 | self.compile() 192 | 193 | self.writer = tf.summary.FileWriter(self.output_path) 194 | 195 | print("*******__init__", input_shape) 196 | 197 | def compile(self, optimizer = None, loss_func = None): 198 | """Setup all of the TF graph variables/ops. 199 | 200 | This is inspired by the compile method on the 201 | keras.models.Model class. 202 | 203 | This is the place to create the target network, setup 204 | loss function and any placeholders. 205 | """ 206 | if loss_func is None: 207 | loss_func = mean_huber_loss 208 | # loss_func = 'mse' 209 | if optimizer is None: 210 | optimizer = Adam(lr = self.learning_rate) 211 | # optimizer = RMSprop(lr=0.00025) 212 | with tf.variable_scope("Loss"): 213 | state = Input(shape = (self.frame_height, self.frame_width, self.num_frames) , name = "states") 214 | action_mask = Input(shape = (self.num_actions,), name = "actions") 215 | qa_value = self.q_network(state) 216 | qa_value = merge([qa_value, action_mask], mode = 'mul', name = "multiply") 217 | qa_value = Lambda(lambda x: tf.reduce_sum(x, axis=1, keep_dims = True), name = "sum")(qa_value) 218 | 219 | self.final_model = Model(inputs = [state, action_mask], outputs = qa_value) 220 | self.final_model.compile(loss=loss_func, optimizer=optimizer) 221 | 222 | def calc_q_values(self, state): 223 | """Given a state (or batch of states) calculate the Q-values. 224 | 225 | Basically run your network on these states. 226 | 227 | Return 228 | ------ 229 | Q-values for the state(s) 230 | """ 231 | state = state[None, :, :, :] 232 | return self.q_network.predict_on_batch(state) 233 | 234 | def select_action(self, state, is_training = True, **kwargs): 235 | """Select the action based on the current state. 
236 | 237 | Returns 238 | -------- 239 | selected action 240 | """ 241 | q_values = self.calc_q_values(state) 242 | if is_training: 243 | if kwargs['policy_type'] == 'UniformRandomPolicy': 244 | return UniformRandomPolicy(self.num_actions).select_action() 245 | else: 246 | # linear decay greedy epsilon policy 247 | return self.policy.select_action(q_values, is_training) 248 | else: 249 | # return GreedyEpsilonPolicy(0.05).select_action(q_values) 250 | return GreedyPolicy().select_action(q_values) 251 | 252 | def update_policy(self, current_sample): 253 | """Update your policy. 254 | 255 | Behavior may differ based on what stage of training your 256 | in. If you're in training mode then you should check if you 257 | should update your network parameters based on the current 258 | step and the value you set for train_freq. 259 | 260 | Inside, you'll want to sample a minibatch, calculate the 261 | target values, update your network, and then update your 262 | target values. 263 | 264 | You might want to return the loss and other metrics as an 265 | output. They can help you monitor how training is going. 266 | """ 267 | batch_size = self.batch_size 268 | 269 | if self.no_experience: 270 | states = np.stack([current_sample.state]) 271 | next_states = np.stack([current_sample.next_state]) 272 | rewards = np.asarray([current_sample.reward]) 273 | mask = np.asarray([1 - int(current_sample.is_terminal)]) 274 | 275 | action_mask = np.zeros((1, self.num_actions)) 276 | action_mask[0, current_sample.action] = 1.0 277 | else: 278 | samples = self.memory.sample(batch_size) 279 | samples = self.atari_processor.process_batch(samples) 280 | 281 | states = np.stack([x.state for x in samples]) 282 | actions = np.asarray([x.action for x in samples]) 283 | action_mask = np.zeros((batch_size, self.num_actions)) 284 | action_mask[range(batch_size), actions] = 1.0 285 | 286 | next_states = np.stack([x.next_state for x in samples]) 287 | mask = np.asarray([1 - int(x.is_terminal) for x in samples]) 288 | rewards = np.asarray([x.reward for x in samples]) 289 | 290 | if self.no_target: 291 | next_qa_value = self.q_network.predict_on_batch(next_states) 292 | else: 293 | next_qa_value = self.target_network.predict_on_batch(next_states) 294 | 295 | if self.enable_ddqn: 296 | qa_value = self.q_network.predict_on_batch(next_states) 297 | max_actions = np.argmax(qa_value, axis = 1) 298 | next_qa_value = next_qa_value[range(batch_size), max_actions] 299 | else: 300 | next_qa_value = np.max(next_qa_value, axis = 1) 301 | target = rewards + self.gamma * mask * next_qa_value 302 | 303 | return self.final_model.train_on_batch([states, action_mask], target), np.mean(target) 304 | 305 | def fit(self, env, num_iterations, max_episode_length=None): 306 | """Fit your model to the provided environment. 307 | 308 | This is where you sample actions from your network, 309 | collect experience samples and add them to your replay memory, 310 | and update your network parameters. 311 | 312 | Parameters 313 | ---------- 314 | env: gym.Env 315 | This is the Atari environment. 316 | num_iterations: int 317 | How many samples/updates to perform. 318 | max_episode_length: int 319 | How long a single episode should last before the agent 320 | resets. Can help exploration. 
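        Notes
        -----
        Scheduling used by the loop below: a uniform random policy fills the
        replay memory for the first ``num_burn_in`` steps; after burn-in, the
        online network is updated every ``train_freq`` steps, hard-copied to
        the target network every ``train_freq * target_update_freq`` steps,
        and checkpointed every ``save_freq`` steps.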
321 | """ 322 | is_training = True 323 | print("Training starts.") 324 | self.save_model(0) 325 | eval_count = 0 326 | 327 | state = env.reset() 328 | 329 | burn_in = True 330 | idx_episode = 1 331 | episode_loss = .0 332 | episode_frames = 0 333 | episode_reward = .0 334 | episode_raw_reward = .0 335 | episode_target_value = .0 336 | 337 | # Logs 338 | losses_list = list() 339 | step_loss_list = list() 340 | step_reward = 0.0 341 | step_reward_raw = 0.0 342 | 343 | for t in range(self.num_burn_in + num_iterations): 344 | print ("iteration --> %s, episode --> %s" % (t, idx_episode)) 345 | action_state = self.history_processor.process_state_for_network( 346 | self.atari_processor.process_state_for_network(state)) 347 | policy_type = "UniformRandomPolicy" if burn_in else "LinearDecayGreedyEpsilonPolicy" 348 | action = self.select_action(action_state, is_training, policy_type = policy_type) 349 | processed_state = self.atari_processor.process_state_for_memory(state) 350 | 351 | # print("******* fit_action", action_state.shape) 352 | # print("******* fit_proecess", processed_state.shape) 353 | 354 | env.render() 355 | state, reward, done, info = env.step(action) 356 | 357 | processed_next_state = self.atari_processor.process_state_for_network(state) 358 | action_next_state = np.dstack((action_state, processed_next_state)) 359 | action_next_state = action_next_state[:, :, 1:] 360 | 361 | processed_reward = self.atari_processor.process_reward(reward) 362 | 363 | self.memory.append(processed_state, action, processed_reward, done) 364 | current_sample = Sample(action_state, action, processed_reward, action_next_state, done) 365 | 366 | if not burn_in: 367 | episode_frames += 1 368 | episode_reward += processed_reward 369 | episode_raw_reward += reward 370 | if episode_frames > max_episode_length: 371 | done = True 372 | 373 | if not burn_in: 374 | step_reward += processed_reward 375 | step_reward_raw += reward 376 | step_losses = [t-last_burn-1, step_reward, step_reward_raw, step_reward / (t-last_burn-1), step_reward_raw / (t-last_burn-1)] 377 | step_loss_list.append(step_losses) 378 | 379 | 380 | if done: 381 | # adding last frame only to save last state 382 | last_frame = self.atari_processor.process_state_for_memory(state) 383 | # action, reward, done doesn't matter here 384 | self.memory.append(last_frame, action, 0, done) 385 | if not burn_in: 386 | avg_target_value = episode_target_value / episode_frames 387 | print(">>> Training: time %d, episode %d, length %d, reward %.0f, raw_reward %.0f, loss %.4f, target value %.4f, policy step %d, memory cap %d" % 388 | (t, idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, 389 | avg_target_value, self.policy.step, self.memory.current)) 390 | sys.stdout.flush() 391 | save_scalar(idx_episode, 'train/episode_frames', episode_frames, self.writer) 392 | save_scalar(idx_episode, 'train/episode_reward', episode_reward, self.writer) 393 | save_scalar(idx_episode, 'train/episode_raw_reward', episode_raw_reward, self.writer) 394 | save_scalar(idx_episode, 'train/episode_loss', episode_loss, self.writer) 395 | save_scalar(idx_episode, 'train_avg/avg_reward', episode_reward / episode_frames, self.writer) 396 | save_scalar(idx_episode, 'train_avg/avg_target_value', avg_target_value, self.writer) 397 | save_scalar(idx_episode, 'train_avg/avg_loss', episode_loss / episode_frames, self.writer) 398 | 399 | # log losses 400 | losses = [idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, episode_reward / 
episode_frames, avg_target_value, episode_loss / episode_frames] 401 | losses_list.append(losses) 402 | 403 | # reset values 404 | episode_frames = 0 405 | episode_reward = .0 406 | episode_raw_reward = .0 407 | episode_loss = .0 408 | episode_target_value = .0 409 | idx_episode += 1 410 | burn_in = (t < self.num_burn_in) 411 | state = env.reset() 412 | self.atari_processor.reset() 413 | self.history_processor.reset() 414 | 415 | if burn_in: 416 | last_burn = t 417 | 418 | if not burn_in: 419 | if t % self.train_freq == 0: 420 | loss, target_value = self.update_policy(current_sample) 421 | episode_loss += loss 422 | episode_target_value += target_value 423 | # update freq is based on train_freq 424 | if t % (self.train_freq * self.target_update_freq) == 0: 425 | # target updates can have the option to be hard or soft 426 | # related functions are defined in deeprl_prj.utils 427 | # here we use hard target update as default 428 | self.target_network.set_weights(self.q_network.get_weights()) 429 | if t % self.save_freq == 0: 430 | self.save_model(idx_episode) 431 | 432 | loss_array = np.asarray(losses_list) 433 | print (loss_array.shape) # 10 element vector 434 | 435 | # loss_path = os.path.join('./losses/loss_episode%s.csv' % (idx_episode)) 436 | loss_path = self.output_path + "/losses/loss_episodes" + str(idx_episode) + ".csv" 437 | np.savetxt(loss_path, loss_array, fmt='%.5f', delimiter=',') 438 | 439 | step_loss_array = np.asarray(step_loss_list) 440 | print (step_loss_array.shape) # 10 element vector 441 | 442 | step_loss_path = self.output_path + "/losses/loss_steps" + str(t-last_burn-1) + ".csv" 443 | np.savetxt(step_loss_path, step_loss_array, fmt='%.5f', delimiter=',') 444 | 445 | 446 | # No evaluation while training 447 | # if t % (self.eval_freq * self.train_freq) == 0: 448 | # episode_reward_mean, episode_reward_std, eval_count = self.evaluate(env, 1, eval_count, max_episode_length, True) 449 | # save_scalar(t, 'eval/eval_episode_reward_mean', episode_reward_mean, self.writer) 450 | # save_scalar(t, 'eval/eval_episode_reward_std', episode_reward_std, self.writer) 451 | 452 | self.save_model(idx_episode) 453 | 454 | 455 | def save_model(self, idx_episode): 456 | safe_path = self.output_path + "/qnet" + str(idx_episode) + ".h5" 457 | self.q_network.save_weights(safe_path) 458 | print("Network at", idx_episode, "saved to:", safe_path) 459 | 460 | def evaluate(self, env, num_episodes, eval_count, max_episode_length=None, monitor=False): 461 | """Test your agent with a provided environment. 462 | 463 | Basically run your policy on the environment and collect stats 464 | like cumulative reward, average episode length, etc. 465 | 466 | You can also call the render function here if you want to 467 | visually inspect your policy. 
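        Examples
        --------
        Mirrors the test branch of main.py (the argument values shown are that
        script's defaults):

        >>> reward_mean, reward_std, eval_count = agent.evaluate(
        ...     env, num_episodes=20, eval_count=0,
        ...     max_episode_length=1000, monitor=False)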
468 | """ 469 | print("Evaluation starts.") 470 | 471 | is_training = False 472 | if self.load_network: 473 | self.q_network.load_weights(self.load_network_path) 474 | print("Load network from:", self.load_network_path) 475 | # if monitor: 476 | # env = wrappers.Monitor(env, self.output_path_videos, video_callable=lambda x:True, resume=True) 477 | state = env.reset() 478 | 479 | idx_episode = 1 480 | episode_frames = 0 481 | episode_reward = np.zeros(num_episodes) 482 | t = 0 483 | 484 | while idx_episode <= num_episodes: 485 | t += 1 486 | action_state = self.history_processor.process_state_for_network( 487 | self.atari_processor.process_state_for_network(state)) 488 | action = self.select_action(action_state, is_training, policy_type = 'GreedyEpsilonPolicy') 489 | state, reward, done, info = env.step(action) 490 | episode_frames += 1 491 | episode_reward[idx_episode-1] += reward 492 | if episode_frames > max_episode_length: 493 | done = True 494 | if done: 495 | print("Eval: time %d, episode %d, length %d, reward %.0f" % 496 | (t, idx_episode, episode_frames, episode_reward[idx_episode-1])) 497 | eval_count += 1 498 | save_scalar(eval_count, 'eval/eval_episode_raw_reward', episode_reward[idx_episode-1], self.writer) 499 | save_scalar(eval_count, 'eval/eval_episode_raw_length', episode_frames, self.writer) 500 | sys.stdout.flush() 501 | state = env.reset() 502 | episode_frames = 0 503 | idx_episode += 1 504 | self.atari_processor.reset() 505 | self.history_processor.reset() 506 | 507 | reward_mean = np.mean(episode_reward) 508 | reward_std = np.std(episode_reward) 509 | print("Evaluation summury: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" % 510 | (num_episodes, reward_mean, reward_std)) 511 | sys.stdout.flush() 512 | 513 | return reward_mean, reward_std, eval_count 514 | -------------------------------------------------------------------------------- /deeprl_prj/dqn_tf_temporalAt.py: -------------------------------------------------------------------------------- 1 | '''Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Temporal Attention DQN.''' 2 | 3 | from deeprl_prj.policy import * 4 | from deeprl_prj.objectives import * 5 | from deeprl_prj.preprocessors import * 6 | from deeprl_prj.utils import * 7 | from deeprl_prj.core import * 8 | from helper import * 9 | 10 | import numpy as np 11 | import sys 12 | from gym import wrappers 13 | import tensorflow as tf 14 | print(tf.__version__) 15 | 16 | """Main DQN agent.""" 17 | 18 | class Qnetwork(): 19 | def __init__(self, args, h_size, num_frames, num_actions, rnn_cell_1, myScope, rnn_cell_2=None): 20 | #The network recieves a frame from the game, flattened into an array. 21 | #It then resizes it and processes it through four convolutional layers. 
22 | self.imageIn = tf.placeholder(shape=[None,84,84,num_frames],dtype=tf.float32) 23 | self.image_permute = tf.transpose(self.imageIn, perm=[0, 3, 1, 2]) 24 | self.image_reshape = tf.reshape(self.image_permute, [-1, 84, 84, 1]) 25 | self.image_reshape_recoverd = tf.squeeze(tf.gather(tf.reshape(self.image_reshape, [-1, num_frames, 84, 84, 1]), [0]), [0]) 26 | self.summary_merged = tf.summary.merge([tf.summary.image('image_reshape_recoverd', self.image_reshape_recoverd, max_outputs=num_frames)]) 27 | # self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,1]) 28 | self.conv1 = tf.contrib.layers.convolution2d( \ 29 | inputs=self.image_reshape,num_outputs=32,\ 30 | kernel_size=[8,8],stride=[4,4],padding='VALID', \ 31 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv1') 32 | self.conv2 = tf.contrib.layers.convolution2d( \ 33 | inputs=self.conv1,num_outputs=64,\ 34 | kernel_size=[4,4],stride=[2,2],padding='VALID', \ 35 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv2') 36 | self.conv3 = tf.contrib.layers.convolution2d( \ 37 | inputs=self.conv2,num_outputs=64,\ 38 | kernel_size=[3,3],stride=[1,1],padding='VALID', \ 39 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv3') 40 | self.conv4 = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.conv3), h_size, activation_fn=tf.nn.relu) 41 | 42 | #We take the output from the final convolutional layer and send it to a recurrent layer. 43 | #The input must be reshaped into [batch x trace x units] for rnn processing, 44 | #and then returned to [batch x units] when sent through the upper levels. 45 | self.batch_size = tf.placeholder(dtype=tf.int32) 46 | self.convFlat = tf.reshape(self.conv4,[self.batch_size, num_frames, h_size]) 47 | self.state_in_1 = rnn_cell_1.zero_state(self.batch_size, tf.float32) 48 | 49 | if args.bidir: 50 | self.state_in_2 = rnn_cell_2.zero_state(self.batch_size, tf.float32) 51 | self.rnn_outputs_tuple, self.rnn_state = tf.nn.bidirectional_dynamic_rnn(\ 52 | cell_fw=rnn_cell_1, cell_bw=rnn_cell_2, inputs=self.convFlat, dtype=tf.float32, \ 53 | initial_state_fw=self.state_in_1, initial_state_bw=self.state_in_2, scope=myScope+'_rnn') 54 | # print "====== len(self.rnn_outputs_tuple), self.rnn_outputs_tuple[0] ", len(self.rnn_outputs_tuple), self.rnn_outputs_tuple[0].get_shape().as_list(), self.rnn_outputs_tuple[1].get_shape().as_list() # [None, 10, 512] 55 | # As we have Bi-LSTM, we have two output, which are not connected. 
So merge them 56 | self.rnn_outputs = tf.concat([self.rnn_outputs_tuple[0], self.rnn_outputs_tuple[1]], axis=2) 57 | # self.rnn_outputs = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.rnn_outputs_double), h_size, activation_fn=None) 58 | self.rnn_output_dim = h_size * 2 59 | else: 60 | self.rnn_outputs, self.rnn_state = tf.nn.dynamic_rnn(\ 61 | inputs=self.convFlat,cell=rnn_cell_1, dtype=tf.float32, \ 62 | initial_state=self.state_in_1, scope=myScope+'_rnn') 63 | # print "====== self.rnn_outputs ", self.rnn_outputs.get_shape().as_list() # [None, 10, 512] 64 | self.rnn_output_dim = h_size 65 | 66 | # attention machanism 67 | if not(args.a_t): 68 | self.rnn_last_output = tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]) 69 | self.rnn = tf.squeeze(self.rnn_last_output, [1]) 70 | else: 71 | if args.global_a_t: 72 | self.rnn_outputs_before = tf.slice(self.rnn_outputs, [0, 0, 0], [-1, num_frames-1, -1]) 73 | self.attention_v = tf.reshape(tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]), [-1, self.rnn_output_dim, 1]) 74 | self.attention_va = tf.tanh(tf.matmul(self.rnn_outputs_before, self.attention_v)) 75 | self.attention_a = tf.nn.softmax(self.attention_va, dim=1) 76 | self.rnn = tf.reduce_sum(tf.multiply(self.rnn_outputs_before, self.attention_a), axis=1) 77 | self.rnn = tf.concat([self.rnn, tf.squeeze(tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]), [1])], axis=1) 78 | else: 79 | with tf.variable_scope(myScope+'_attention'): 80 | self.attention_v = tf.get_variable(name='atten_v', shape=[self.rnn_output_dim, 1], initializer=tf.contrib.layers.xavier_initializer()) 81 | self.attention_va = tf.tanh(tf.map_fn(lambda x: tf.matmul(x, self.attention_v), self.rnn_outputs)) 82 | self.attention_a = tf.nn.softmax(self.attention_va, dim=1) 83 | self.rnn = tf.reduce_sum(tf.multiply(self.rnn_outputs, self.attention_a), axis=1) 84 | # print "========== self.rnn ", self.rnn.get_shape().as_list() #[None, 1024] 85 | 86 | if args.net_mode == "duel": 87 | #The output from the recurrent player is then split into separate Value and Advantage streams 88 | self.ad_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_advantage_hidden') 89 | self.Advantage = tf.contrib.layers.fully_connected(self.ad_hidden, num_actions, activation_fn=None, scope=myScope+'_fc_advantage') 90 | self.value_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_value_hidden') 91 | self.Value = tf.contrib.layers.fully_connected(self.value_hidden, 1, activation_fn=None, scope=myScope+'_fc_value') 92 | #Then combine them together to get our final Q-values. 93 | self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True)) 94 | else: 95 | self.Qout = tf.contrib.layers.fully_connected(self.rnn, num_actions, activation_fn=None) 96 | 97 | self.predict = tf.argmax(self.Qout,1) 98 | 99 | #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. 
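# Worked example of the quantities computed below (illustrative numbers only):
# with gamma = 0.99, an observed reward r = 1.0, and a non-terminal next state for
# which the target network gives max_a' Q_target(s', a') = 2.0, the training target
# fed into self.targetQ (built in DQNAgent.update_policy further down) is
#   y = 1.0 + 0.99 * 2.0 = 2.98.
# If the online network currently predicts Q(s, a) = 2.5 for the action actually
# taken (selected via the one-hot action mask), that sample contributes
# (2.98 - 2.5)^2 ~= 0.230 to the mean squared TD error minimized below.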
100 | self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32) 101 | self.actions = tf.placeholder(shape=[None],dtype=tf.int32) 102 | self.actions_onehot = tf.one_hot(self.actions, num_actions, dtype=tf.float32) 103 | 104 | self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1) 105 | self.td_error = tf.square(self.targetQ - self.Q) 106 | self.loss = tf.reduce_mean(self.td_error) 107 | 108 | self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001) 109 | self.updateModel = self.trainer.minimize(self.loss) 110 | 111 | def save_scalar(step, name, value, writer): 112 | """Save a scalar value to tensorboard. 113 | Parameters 114 | ---------- 115 | step: int 116 | Training step (sets the position on x-axis of tensorboard graph. 117 | name: str 118 | Name of variable. Will be the name of the graph in tensorboard. 119 | value: float 120 | The value of the variable at this step. 121 | writer: tf.FileWriter 122 | The tensorboard FileWriter instance. 123 | """ 124 | summary = tf.Summary() 125 | summary_value = summary.value.add() 126 | summary_value.simple_value = float(value) 127 | summary_value.tag = name 128 | writer.add_summary(summary, step) 129 | 130 | class DQNAgent: 131 | """Class implementing DQN. 132 | 133 | This is a basic outline of the functions/parameters you will need 134 | in order to implement the DQNAgnet. This is just to get you 135 | started. You may need to tweak the parameters, add new ones, etc. 136 | 137 | Feel free to change the functions and funciton parameters that the class 138 | provides. 139 | 140 | We have provided docstrings to go along with our suggested API. 141 | 142 | Parameters 143 | ---------- 144 | q_network: keras.models.Model 145 | Your Q-network model. 146 | preprocessor: deeprl_hw2.core.Preprocessor 147 | The preprocessor class. See the associated classes for more 148 | details. 149 | memory: deeprl_hw2.core.Memory 150 | Your replay memory. 151 | gamma: float 152 | Discount factor. 153 | target_update_freq: float 154 | Frequency to update the target network. You can either provide a 155 | number representing a soft target update (see utils.py) or a 156 | hard target update (see utils.py and Atari paper.) 157 | num_burn_in: int 158 | Before you begin updating the Q-network your replay memory has 159 | to be filled up with some number of samples. This number says 160 | how many. 161 | train_freq: int 162 | How often you actually update your Q-Network. Sometimes 163 | stability is improved if you collect a couple samples for your 164 | replay memory, for every Q-network update that you run. 165 | batch_size: int 166 | How many samples in each minibatch. 
167 | """ 168 | def __init__(self, args, num_actions): 169 | self.num_actions = num_actions 170 | input_shape = (args.frame_height, args.frame_width, args.num_frames) 171 | self.history_processor = HistoryPreprocessor(args.num_frames - 1) 172 | self.atari_processor = AtariPreprocessor() 173 | self.memory = ReplayMemory(args) 174 | self.policy = LinearDecayGreedyEpsilonPolicy(args.initial_epsilon, args.final_epsilon, args.exploration_steps) 175 | self.gamma = args.gamma 176 | self.target_update_freq = args.target_update_freq 177 | self.num_burn_in = args.num_burn_in 178 | self.train_freq = args.train_freq 179 | self.batch_size = args.batch_size 180 | self.learning_rate = args.learning_rate 181 | self.frame_width = args.frame_width 182 | self.frame_height = args.frame_height 183 | self.num_frames = args.num_frames 184 | self.output_path = args.output 185 | self.output_path_videos = args.output + '/videos/' 186 | self.output_path_images = args.output + '/images/' 187 | self.save_freq = args.save_freq 188 | self.load_network = args.load_network 189 | self.load_network_path = args.load_network_path 190 | self.enable_ddqn = args.ddqn 191 | self.net_mode = args.net_mode 192 | self.args = args 193 | 194 | self.h_size = 512 195 | self.tau = 0.001 196 | # self.q_network = create_model(input_shape, num_actions, self.net_mode, args, "QNet") 197 | # self.target_network = create_model(input_shape, num_actions, self.net_mode, args, "TargetNet") 198 | tf.reset_default_graph() 199 | #We define the cells for the primary and target q-networks 200 | cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 201 | cellT = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 202 | if args.bidir: 203 | cell_2 = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 204 | cellT_2 = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 205 | self.q_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cell, rnn_cell_2=cell_2, myScope="QNet") 206 | self.target_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cellT, rnn_cell_2=cellT_2, myScope="TargetNet") 207 | else: 208 | self.q_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cell, myScope="QNet") 209 | self.target_network = Qnetwork(args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell_1=cellT, myScope="TargetNet") 210 | 211 | print(">>>> Net mode: %s, Using double dqn: %s" % (self.net_mode, self.enable_ddqn)) 212 | self.eval_freq = args.eval_freq 213 | self.no_experience = args.no_experience 214 | self.no_target = args.no_target 215 | print(">>>> Target fixing: %s, Experience replay: %s" % (not self.no_target, not self.no_experience)) 216 | 217 | # initialize target network 218 | init = tf.global_variables_initializer() 219 | self.saver = tf.train.Saver(max_to_keep=2) 220 | trainables = tf.trainable_variables() 221 | print(trainables, len(trainables)) 222 | self.targetOps = updateTargetGraph(trainables, self.tau) 223 | 224 | config = tf.ConfigProto() 225 | config.gpu_options.allow_growth = True 226 | config.allow_soft_placement = True 227 | self.sess = tf.Session(config=config) 228 | self.sess.run(init) 229 | updateTarget(self.targetOps, self.sess) 230 | self.writer = tf.summary.FileWriter(self.output_path) 231 | 232 | def calc_q_values(self, state): 233 | 
"""Given a state (or batch of states) calculate the Q-values. 234 | 235 | Basically run your network on these states. 236 | 237 | Return 238 | ------ 239 | Q-values for the state(s) 240 | """ 241 | state = state[None, :, :, :] 242 | # return self.q_network.predict_on_batch(state) 243 | # print state.shape 244 | # Qout = self.sess.run(self.q_network.rnn_outputs,\ 245 | # feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 246 | # print Qout.shape 247 | Qout = self.sess.run(self.q_network.Qout,\ 248 | feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 249 | # print Qout.shape 250 | return Qout 251 | 252 | def select_action(self, state, is_training = True, **kwargs): 253 | """Select the action based on the current state. 254 | 255 | You will probably want to vary your behavior here based on 256 | which stage of training your in. For example, if you're still 257 | collecting random samples you might want to use a 258 | UniformRandomPolicy. 259 | 260 | If you're testing, you might want to use a GreedyEpsilonPolicy 261 | with a low epsilon. 262 | 263 | If you're training, you might want to use the 264 | LinearDecayGreedyEpsilonPolicy. 265 | 266 | This would also be a good place to call 267 | process_state_for_network in your preprocessor. 268 | 269 | Returns 270 | -------- 271 | selected action 272 | """ 273 | q_values = self.calc_q_values(state) 274 | if is_training: 275 | if kwargs['policy_type'] == 'UniformRandomPolicy': 276 | return UniformRandomPolicy(self.num_actions).select_action() 277 | else: 278 | # linear decay greedy epsilon policy 279 | return self.policy.select_action(q_values, is_training) 280 | else: 281 | # return GreedyEpsilonPolicy(0.05).select_action(q_values) 282 | return GreedyPolicy().select_action(q_values) 283 | 284 | def update_policy(self, current_sample): 285 | """Update your policy. 286 | 287 | Behavior may differ based on what stage of training your 288 | in. If you're in training mode then you should check if you 289 | should update your network parameters based on the current 290 | step and the value you set for train_freq. 291 | 292 | Inside, you'll want to sample a minibatch, calculate the 293 | target values, update your network, and then update your 294 | target values. 295 | 296 | You might want to return the loss and other metrics as an 297 | output. They can help you monitor how training is going. 
298 | """ 299 | batch_size = self.batch_size 300 | 301 | if self.no_experience: 302 | states = np.stack([current_sample.state]) 303 | next_states = np.stack([current_sample.next_state]) 304 | rewards = np.asarray([current_sample.reward]) 305 | mask = np.asarray([1 - int(current_sample.is_terminal)]) 306 | 307 | action_mask = np.zeros((1, self.num_actions)) 308 | action_mask[0, current_sample.action] = 1.0 309 | else: 310 | samples = self.memory.sample(batch_size) 311 | samples = self.atari_processor.process_batch(samples) 312 | 313 | states = np.stack([x.state for x in samples]) 314 | actions = np.asarray([x.action for x in samples]) 315 | # action_mask = np.zeros((batch_size, self.num_actions)) 316 | # action_mask[range(batch_size), actions] = 1.0 317 | 318 | next_states = np.stack([x.next_state for x in samples]) 319 | mask = np.asarray([1 - int(x.is_terminal) for x in samples]) 320 | rewards = np.asarray([x.reward for x in samples]) 321 | 322 | if self.no_target: 323 | next_qa_value = self.q_network.predict_on_batch(next_states) 324 | else: 325 | # next_qa_value = self.target_network.predict_on_batch(next_states) 326 | next_qa_value = self.sess.run(self.target_network.Qout,\ 327 | feed_dict={self.target_network.imageIn: next_states, self.target_network.batch_size:batch_size}) 328 | 329 | if self.enable_ddqn: 330 | # qa_value = self.q_network.predict_on_batch(next_states) 331 | qa_value = self.sess.run(self.q_network.Qout,\ 332 | feed_dict={self.q_network.imageIn: next_states, self.q_network.batch_size:batch_size}) 333 | max_actions = np.argmax(qa_value, axis = 1) 334 | next_qa_value = next_qa_value[range(batch_size), max_actions] 335 | else: 336 | next_qa_value = np.max(next_qa_value, axis = 1) 337 | # print rewards.shape, mask.shape, next_qa_value.shape, batch_size 338 | target = rewards + self.gamma * mask * next_qa_value 339 | 340 | if self.args.a_t and np.random.random()<1e-3: 341 | loss, _, rnn, attention_v, attention_a = self.sess.run([self.q_network.loss, self.q_network.updateModel, self.q_network.rnn, self.q_network.attention_v, self.q_network.attention_a], \ 342 | feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 343 | self.q_network.actions: actions, self.q_network.targetQ: target}) 344 | # print(attention_a[0]) 345 | else: 346 | loss, _, rnn = self.sess.run([self.q_network.loss, self.q_network.updateModel, self.q_network.rnn], \ 347 | feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 348 | self.q_network.actions: actions, self.q_network.targetQ: target}) 349 | 350 | return loss, np.mean(target) 351 | 352 | def fit(self, env, num_iterations, max_episode_length=None): 353 | """Fit your model to the provided environment. 354 | 355 | Its a good idea to print out things like loss, average reward, 356 | Q-values, etc to see if your agent is actually improving. 357 | 358 | You should probably also periodically save your network 359 | weights and any other useful info. 360 | 361 | This is where you should sample actions from your network, 362 | collect experience samples and add them to your replay memory, 363 | and update your network parameters. 364 | 365 | Parameters 366 | ---------- 367 | env: gym.Env 368 | This is your Atari environment. You should wrap the 369 | environment using the wrap_atari_env function in the 370 | utils.py 371 | num_iterations: int 372 | How many samples/updates to perform. 373 | max_episode_length: int 374 | How long a single episode should last before the agent 375 | resets. 
Can help exploration. 376 | """ 377 | is_training = True 378 | print("Training starts.") 379 | self.save_model(0) 380 | eval_count = 0 381 | 382 | state = env.reset() 383 | burn_in = True 384 | idx_episode = 1 385 | episode_loss = .0 386 | episode_frames = 0 387 | episode_reward = .0 388 | episode_raw_reward = .0 389 | episode_target_value = .0 390 | for t in range(self.num_burn_in + num_iterations): 391 | action_state = self.history_processor.process_state_for_network( 392 | self.atari_processor.process_state_for_network(state)) 393 | policy_type = "UniformRandomPolicy" if burn_in else "LinearDecayGreedyEpsilonPolicy" 394 | action = self.select_action(action_state, is_training, policy_type = policy_type) 395 | processed_state = self.atari_processor.process_state_for_memory(state) 396 | 397 | state, reward, done, info = env.step(action) 398 | 399 | processed_next_state = self.atari_processor.process_state_for_network(state) 400 | action_next_state = np.dstack((action_state, processed_next_state)) 401 | action_next_state = action_next_state[:, :, 1:] 402 | 403 | processed_reward = self.atari_processor.process_reward(reward) 404 | 405 | self.memory.append(processed_state, action, processed_reward, done) 406 | current_sample = Sample(action_state, action, processed_reward, action_next_state, done) 407 | 408 | if not burn_in: 409 | episode_frames += 1 410 | episode_reward += processed_reward 411 | episode_raw_reward += reward 412 | if episode_frames > max_episode_length: 413 | done = True 414 | 415 | if done: 416 | # adding last frame only to save last state 417 | last_frame = self.atari_processor.process_state_for_memory(state) 418 | # action, reward, done doesn't matter here 419 | self.memory.append(last_frame, action, 0, done) 420 | if not burn_in: 421 | avg_target_value = episode_target_value / episode_frames 422 | print(">>> Training: time %d, episode %d, length %d, reward %.0f, raw_reward %.0f, loss %.4f, target value %.4f, policy step %d, memory cap %d" % 423 | (t, idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, 424 | avg_target_value, self.policy.step, self.memory.current)) 425 | sys.stdout.flush() 426 | save_scalar(idx_episode, 'train/episode_frames', episode_frames, self.writer) 427 | save_scalar(idx_episode, 'train/episode_reward', episode_reward, self.writer) 428 | save_scalar(idx_episode, 'train/episode_raw_reward', episode_raw_reward, self.writer) 429 | save_scalar(idx_episode, 'train/episode_loss', episode_loss, self.writer) 430 | save_scalar(idx_episode, 'train_avg/avg_reward', episode_reward / episode_frames, self.writer) 431 | save_scalar(idx_episode, 'train_avg/avg_target_value', avg_target_value, self.writer) 432 | save_scalar(idx_episode, 'train_avg/avg_loss', episode_loss / episode_frames, self.writer) 433 | episode_frames = 0 434 | episode_reward = .0 435 | episode_raw_reward = .0 436 | episode_loss = .0 437 | episode_target_value = .0 438 | idx_episode += 1 439 | burn_in = (t < self.num_burn_in) 440 | state = env.reset() 441 | self.atari_processor.reset() 442 | self.history_processor.reset() 443 | 444 | if not burn_in: 445 | if t % self.train_freq == 0: 446 | loss, target_value = self.update_policy(current_sample) 447 | episode_loss += loss 448 | episode_target_value += target_value 449 | # update freq is based on train_freq 450 | if t % (self.train_freq * self.target_update_freq) == 0: 451 | # self.target_network.set_weights(self.q_network.get_weights()) 452 | updateTarget(self.targetOps, self.sess) 453 | print("----- Synced.") 454 | if t 
% self.save_freq == 0: 455 | self.save_model(idx_episode) 456 | if t % (self.eval_freq * self.train_freq) == 0: 457 | episode_reward_mean, episode_reward_std, eval_count = self.evaluate(env, 20, eval_count, max_episode_length, True) 458 | save_scalar(t, 'eval/eval_episode_reward_mean', episode_reward_mean, self.writer) 459 | save_scalar(t, 'eval/eval_episode_reward_std', episode_reward_std, self.writer) 460 | 461 | self.save_model(idx_episode) 462 | 463 | 464 | def save_model(self, idx_episode): 465 | safe_path = self.output_path + "/qnet" + str(idx_episode) + ".cptk" 466 | self.saver.save(self.sess, safe_path) 467 | # self.q_network.save_weights(safe_path) 468 | print("+++++++++ Network at", idx_episode, "saved to:", safe_path) 469 | 470 | def restore_model(self, restore_path): 471 | self.saver.restore(self.sess, restore_path) 472 | print("+++++++++ Network restored from: %s", restore_path) 473 | 474 | def evaluate(self, env, num_episodes, eval_count, max_episode_length=None, monitor=True): 475 | """Test your agent with a provided environment. 476 | 477 | You shouldn't update your network parameters here. Also if you 478 | have any layers that vary in behavior between train/test time 479 | (such as dropout or batch norm), you should set them to test. 480 | 481 | Basically run your policy on the environment and collect stats 482 | like cumulative reward, average episode length, etc. 483 | 484 | You can also call the render function here if you want to 485 | visually inspect your policy. 486 | """ 487 | print("Evaluation starts.") 488 | plt.figure(1, figsize=(45, 20)) 489 | 490 | is_training = False 491 | if self.load_network: 492 | # self.q_network.load_weights(self.load_network_path) 493 | # print("Load network from:", self.load_network_path) 494 | self.restore_model(self.load_network_path) 495 | if monitor: 496 | env = wrappers.Monitor(env, self.output_path_videos, video_callable=lambda x:True, resume=True) 497 | state = env.reset() 498 | 499 | idx_episode = 1 500 | episode_frames = 0 501 | episode_reward = np.zeros(num_episodes) 502 | t = 0 503 | 504 | while idx_episode <= num_episodes: 505 | t += 1 506 | action_state = self.history_processor.process_state_for_network( 507 | self.atari_processor.process_state_for_network(state)) 508 | action = self.select_action(action_state, is_training, policy_type = 'GreedyEpsilonPolicy') 509 | 510 | action_state_ori = self.history_processor.process_state_for_network_ori( 511 | self.atari_processor.process_state_for_network_ori(state)) 512 | 513 | dice = np.random.random() 514 | 515 | state, reward, done, info = env.step(action) 516 | 517 | if dice < 0.1: 518 | attention_a = self.sess.run(self.q_network.attention_a,\ 519 | feed_dict={self.q_network.imageIn: action_state[None, :, :, :], self.q_network.batch_size:1}) 520 | # print attention_a.shape #(1, 10, 1) 521 | attention_a = np.reshape(attention_a, (-1)) 522 | for alpha_idx in range(action_state_ori.shape[3]): 523 | plt.subplot(2, action_state_ori.shape[3]//2+1, alpha_idx+1) 524 | img = action_state_ori[:, :, :, alpha_idx] #(210, 160, 3) 525 | plt.imshow(img) 526 | # plt.text(0, 1, 'Weight: %.4f'%(att ention_a[alpha_idx]) , color='black', weight='bold', backgroundcolor='white', fontsize=30) 527 | plt.subplot(2, action_state_ori.shape[3]//2+1, action_state_ori.shape[3]+2) 528 | plt.imshow(state) 529 | # plt.text(0, 1, 'Next state after taking the action %s'%(action), color='black', weight='bold', backgroundcolor='white', fontsize=20) 530 | plt.axis('off') 531 | 
plt.savefig('%sattention_ep%d-frame%d.png'%(self.output_path_images, eval_count, episode_frames))
532 | print('---- Image saved at: %sattention_ep%d-frame%d.png'%(self.output_path_images, eval_count, episode_frames))
533 |
534 | episode_frames += 1
535 | episode_reward[idx_episode-1] += reward
536 | if episode_frames > max_episode_length:
537 | done = True
538 | if done:
539 | print("Eval: time %d, episode %d, length %d, reward %.0f. @eval_count %s" %
540 | (t, idx_episode, episode_frames, episode_reward[idx_episode-1], eval_count))
541 | eval_count += 1
542 | save_scalar(eval_count, 'eval/eval_episode_raw_reward', episode_reward[idx_episode-1], self.writer)
543 | save_scalar(eval_count, 'eval/eval_episode_raw_length', episode_frames, self.writer)
544 | sys.stdout.flush()
545 | state = env.reset()
546 | episode_frames = 0
547 | idx_episode += 1
548 | self.atari_processor.reset()
549 | self.history_processor.reset()
550 |
551 |
552 | reward_mean = np.mean(episode_reward)
553 | reward_std = np.std(episode_reward)
554 | print("Evaluation summary: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" %
555 | (num_episodes, reward_mean, reward_std))
556 | sys.stdout.flush()
557 |
558 | return reward_mean, reward_std, eval_count
559 |
--------------------------------------------------------------------------------
/deeprl_prj/dqn_tf_spatialAt.py:
--------------------------------------------------------------------------------
1 | '''Pure Tensorflow implementation. Includes Basic Dueling Double DQN and Spatial Attention DQN.'''
2 |
3 | from deeprl_prj.policy import *
4 | from deeprl_prj.objectives import *
5 | from deeprl_prj.preprocessors import *
6 | from deeprl_prj.utils import *
7 | from deeprl_prj.core import *
8 | from helper import *
9 |
10 | import numpy as np
11 | import sys
12 | from gym import wrappers
13 | import tensorflow as tf
14 | import skimage.transform
15 |
16 | """Main DQN agent."""
17 |
18 | class Qnetwork():
19 | def __init__(self, args, h_size, num_frames, num_actions, rnn_cell, myScope):
20 | #The network receives a frame from the game, flattened into an array.
21 | #It then resizes it and processes it through four convolutional layers.
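# Shape bookkeeping for the spatial-attention branch defined below: the same 'VALID'
# convolutions as in the temporal-attention network give a 7 x 7 x 64 map per frame
# (84 -> 20 -> 9 -> 7). When args.a_t is set, conv3 is kept as a grid of
# L = 7*7 = 49 spatial locations, each a D = 64 dimensional feature vector. At every
# one of the T = num_frames steps, _attention_layer scores the 49 locations against
# the LSTM hidden state h, softmaxes the scores into alpha (a distribution over the
# grid), and the weighted sum context = sum_l alpha_l * feature_l (optionally gated
# by the _selector's beta) is fed together with h into the LSTM cell; the final
# hidden state h becomes the "rnn" feature that the Q-value head consumes.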
22 | self.imageIn = tf.placeholder(shape=[None,84,84,num_frames],dtype=tf.float32) 23 | self.image_permute = tf.transpose(self.imageIn, perm=[0, 3, 1, 2]) 24 | self.image_reshape = tf.reshape(self.image_permute, [-1, 84, 84, 1]) 25 | self.image_reshape_recoverd = tf.squeeze(tf.gather(tf.reshape(self.image_reshape, [-1, num_frames, 84, 84, 1]), [0]), [0]) 26 | self.summary_merged = tf.summary.merge([tf.summary.image('image_reshape_recoverd', self.image_reshape_recoverd, max_outputs=num_frames)]) 27 | # self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,1]) 28 | self.conv1 = tf.contrib.layers.convolution2d( \ 29 | inputs=self.image_reshape,num_outputs=32,\ 30 | kernel_size=[8,8],stride=[4,4],padding='VALID', \ 31 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv1') 32 | self.conv2 = tf.contrib.layers.convolution2d( \ 33 | inputs=self.conv1,num_outputs=64,\ 34 | kernel_size=[4,4],stride=[2,2],padding='VALID', \ 35 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv2') 36 | self.conv3 = tf.contrib.layers.convolution2d( \ 37 | inputs=self.conv2,num_outputs=64,\ 38 | kernel_size=[3,3],stride=[1,1],padding='VALID', \ 39 | activation_fn=tf.nn.relu, biases_initializer=None,scope=myScope+'_conv3') # (None, 10, 7, 7, 64) 40 | self.batch_size = tf.placeholder(dtype=tf.int32) 41 | 42 | if not(args.a_t): 43 | self.conv4 = tf.contrib.layers.fully_connected(tf.contrib.layers.flatten(self.conv3), h_size, activation_fn=tf.nn.relu) 44 | 45 | #We take the output from the final convolutional layer and send it to a recurrent layer. 46 | #The input must be reshaped into [batch x trace x units] for rnn processing, 47 | #and then returned to [batch x units] when sent through the upper levles. 48 | self.convFlat = tf.reshape(self.conv4,[self.batch_size, num_frames, h_size]) 49 | self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32) 50 | self.rnn_outputs, self.rnn_state = tf.nn.dynamic_rnn(\ 51 | inputs=self.convFlat,cell=rnn_cell,dtype=tf.float32,initial_state=self.state_in,scope=myScope+'_rnn') 52 | # print("======", self.rnn_outputs.get_shape().as_list()) 53 | 54 | self.rnn_last_output = tf.slice(self.rnn_outputs, [0, num_frames-1, 0], [-1, 1, -1]) 55 | self.rnn = tf.squeeze(self.rnn_last_output, [1]) 56 | # print("==========", self.rnn.get_shape().as_list()) 57 | else: 58 | self.L = 7*7 59 | self.D = 64 60 | self.T = num_frames 61 | self.H = 512 62 | self.selector=args.selector 63 | self.weight_initializer = tf.contrib.layers.xavier_initializer() 64 | self.const_initializer = tf.constant_initializer(0.0) 65 | 66 | self.features = tf.reshape(self.conv3, [self.batch_size, num_frames, self.L, self.D]) 67 | self.features_list = tf.split(self.features, num_frames, axis=1) 68 | # print(len(self.features_list), self.features_list[0].get_shape().as_list()) # 10 [None, 1, 49, 64] 69 | self.alpha_list = [] 70 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.H) 71 | c, h = self._get_initial_lstm(features=tf.squeeze(self.features_list[0], [1]), myScope=myScope) 72 | 73 | for t in range(self.T): 74 | features = tf.squeeze(self.features_list[t], [1]) 75 | features = self._batch_norm(features, mode='train', name=myScope+'conv_features', reuse=(t!=0)) 76 | features_proj = self._project_features(features=features, myScope=myScope, reuse=(t!=0)) 77 | context, alpha = self._attention_layer(features, features_proj, h, myScope=myScope, reuse=(t!=0)) 78 | self.alpha_list.append(alpha) 79 | 80 | if self.selector: 81 | context, beta = self._selector(context, h, 
myScope=myScope, reuse=(t!=0)) 82 | 83 | # print("========== context ", context.get_shape().as_list()) 84 | # print("========== h ", h.get_shape().as_list()) 85 | 86 | with tf.variable_scope(myScope+'_lstmCell', reuse=(t!=0)): 87 | _, (c, h) = lstm_cell(inputs=tf.concat([context, h], 1), state=[c, h]) 88 | # print("========== h ", h.get_shape().as_list()) 89 | 90 | self.rnn = h 91 | 92 | 93 | if args.net_mode == "duel": 94 | #The output from the recurrent player is then split into separate Value and Advantage streams 95 | self.ad_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_advantage_hidden') 96 | self.Advantage = tf.contrib.layers.fully_connected(self.ad_hidden, num_actions, activation_fn=None, scope=myScope+'_fc_advantage') 97 | self.value_hidden = tf.contrib.layers.fully_connected(self.rnn, h_size, activation_fn=tf.nn.relu, scope=myScope+'_fc_value_hidden') 98 | self.Value = tf.contrib.layers.fully_connected(self.value_hidden, 1, activation_fn=None, scope=myScope+'_fc_value') 99 | 100 | #Then combine them together to get our final Q-values. 101 | self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True)) 102 | else: 103 | self.Qout = tf.contrib.layers.fully_connected(self.rnn, num_actions, activation_fn=None) 104 | self.predict = tf.argmax(self.Qout,1) 105 | 106 | #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. 107 | self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32) 108 | self.actions = tf.placeholder(shape=[None],dtype=tf.int32) 109 | self.actions_onehot = tf.one_hot(self.actions, num_actions, dtype=tf.float32) 110 | 111 | self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1) 112 | self.td_error = tf.square(self.targetQ - self.Q) 113 | self.loss = tf.reduce_mean(self.td_error) 114 | 115 | self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001) 116 | self.updateModel = self.trainer.minimize(self.loss) 117 | 118 | def _batch_norm(self, x, mode='train', name=None, reuse=False): 119 | return tf.contrib.layers.batch_norm(inputs=x, 120 | decay=0.95, 121 | center=True, 122 | scale=True, 123 | is_training=(mode=='train'), 124 | updates_collections=None, 125 | reuse=reuse, 126 | scope=(name+'batch_norm')) 127 | 128 | def _get_initial_lstm(self, features, myScope): 129 | with tf.variable_scope(myScope+'_initial_lstm'): 130 | features_mean = tf.reduce_mean(features, 1) 131 | 132 | w_h = tf.get_variable('w_h', [self.D, self.H], initializer=self.weight_initializer) 133 | b_h = tf.get_variable('b_h', [self.H], initializer=self.const_initializer) 134 | h = tf.nn.tanh(tf.matmul(features_mean, w_h) + b_h) 135 | 136 | w_c = tf.get_variable('w_c', [self.D, self.H], initializer=self.weight_initializer) 137 | b_c = tf.get_variable('b_c', [self.H], initializer=self.const_initializer) 138 | c = tf.nn.tanh(tf.matmul(features_mean, w_c) + b_c) 139 | return c, h 140 | 141 | def _project_features(self, features, myScope, reuse=False): 142 | with tf.variable_scope(myScope+'_project_features', reuse=reuse): 143 | w = tf.get_variable('w', [self.D, self.D], initializer=self.weight_initializer) 144 | features_flat = tf.reshape(features, [-1, self.D]) 145 | features_proj = tf.matmul(features_flat, w) 146 | features_proj = tf.reshape(features_proj, [-1, self.L, self.D]) 147 | return features_proj 148 | 149 | def _attention_layer(self, features, features_proj, h, myScope, reuse=False): 150 | with 
tf.variable_scope(myScope+'_attention_layer', reuse=reuse): 151 | w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer) 152 | b = tf.get_variable('b', [self.D], initializer=self.const_initializer) 153 | w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer) 154 | 155 | h_att = tf.nn.relu(features_proj + tf.expand_dims(tf.matmul(h, w), 1) + b) # (N, L, D) 156 | out_att = tf.reshape(tf.matmul(tf.reshape(h_att, [-1, self.D]), w_att), [-1, self.L]) # (N, L) 157 | alpha = tf.nn.softmax(out_att) 158 | context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context') #(N, D) 159 | return context, alpha 160 | 161 | def _selector(self, context, h, myScope, reuse=False): 162 | with tf.variable_scope(myScope+'_selector', reuse=reuse): 163 | w = tf.get_variable('w', [self.H, 1], initializer=self.weight_initializer) 164 | b = tf.get_variable('b', [1], initializer=self.const_initializer) 165 | beta = tf.nn.sigmoid(tf.matmul(h, w) + b, 'beta') # (N, 1) 166 | context = tf.multiply(beta, context, name='selected_context') 167 | return context, beta 168 | 169 | def save_scalar(step, name, value, writer): 170 | """Save a scalar value to tensorboard. 171 | Parameters 172 | ---------- 173 | step: int 174 | Training step (sets the position on x-axis of tensorboard graph. 175 | name: str 176 | Name of variable. Will be the name of the graph in tensorboard. 177 | value: float 178 | The value of the variable at this step. 179 | writer: tf.FileWriter 180 | The tensorboard FileWriter instance. 181 | """ 182 | summary = tf.Summary() 183 | summary_value = summary.value.add() 184 | summary_value.simple_value = float(value) 185 | summary_value.tag = name 186 | writer.add_summary(summary, step) 187 | 188 | class DQNAgent: 189 | """Class implementing DQN. 190 | 191 | This is a basic outline of the functions/parameters you will need 192 | in order to implement the DQNAgnet. This is just to get you 193 | started. You may need to tweak the parameters, add new ones, etc. 194 | 195 | Feel free to change the functions and funciton parameters that the class 196 | provides. 197 | 198 | We have provided docstrings to go along with our suggested API. 199 | 200 | Parameters 201 | ---------- 202 | q_network: keras.models.Model 203 | Your Q-network model. 204 | preprocessor: deeprl_hw2.core.Preprocessor 205 | The preprocessor class. See the associated classes for more 206 | details. 207 | memory: deeprl_hw2.core.Memory 208 | Your replay memory. 209 | gamma: float 210 | Discount factor. 211 | target_update_freq: float 212 | Frequency to update the target network. You can either provide a 213 | number representing a soft target update (see utils.py) or a 214 | hard target update (see utils.py and Atari paper.) 215 | num_burn_in: int 216 | Before you begin updating the Q-network your replay memory has 217 | to be filled up with some number of samples. This number says 218 | how many. 219 | train_freq: int 220 | How often you actually update your Q-Network. Sometimes 221 | stability is improved if you collect a couple samples for your 222 | replay memory, for every Q-network update that you run. 223 | batch_size: int 224 | How many samples in each minibatch. 
225 | """ 226 | def __init__(self, args, num_actions): 227 | self.num_actions = num_actions 228 | input_shape = (args.frame_height, args.frame_width, args.num_frames) 229 | self.history_processor = HistoryPreprocessor(args.num_frames - 1) 230 | self.atari_processor = AtariPreprocessor() 231 | self.memory = ReplayMemory(args) 232 | self.policy = LinearDecayGreedyEpsilonPolicy(args.initial_epsilon, args.final_epsilon, args.exploration_steps) 233 | self.gamma = args.gamma 234 | self.target_update_freq = args.target_update_freq 235 | self.num_burn_in = args.num_burn_in 236 | self.train_freq = args.train_freq 237 | self.batch_size = args.batch_size 238 | self.learning_rate = args.learning_rate 239 | self.frame_width = args.frame_width 240 | self.frame_height = args.frame_height 241 | self.num_frames = args.num_frames 242 | self.output_path = args.output 243 | self.output_path_videos = args.output + '/videos/' 244 | self.output_path_images = args.output + '/images/' 245 | self.save_freq = args.save_freq 246 | self.load_network = args.load_network 247 | self.load_network_path = args.load_network_path 248 | self.enable_ddqn = args.ddqn 249 | self.net_mode = args.net_mode 250 | 251 | self.h_size = 512 252 | self.tau = 0.001 253 | tf.reset_default_graph() 254 | #We define the cells for the primary and target q-networks 255 | cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 256 | cellT = tf.contrib.rnn.BasicLSTMCell(num_units=self.h_size, state_is_tuple=True) 257 | self.q_network = Qnetwork(args=args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell=cell, myScope="QNet") 258 | self.target_network = Qnetwork(args=args, h_size=self.h_size, num_frames=self.num_frames, num_actions=self.num_actions, rnn_cell=cellT, myScope="TargetNet") 259 | 260 | print(">>>> Net mode: %s, Using double dqn: %s" % (self.net_mode, self.enable_ddqn)) 261 | self.eval_freq = args.eval_freq 262 | self.no_experience = args.no_experience 263 | self.no_target = args.no_target 264 | print(">>>> Target fixing: %s, Experience replay: %s" % (not self.no_target, not self.no_experience)) 265 | 266 | # initialize target network 267 | init = tf.global_variables_initializer() 268 | self.saver = tf.train.Saver(max_to_keep=2) 269 | trainables = tf.trainable_variables() 270 | print(trainables, len(trainables)) 271 | self.targetOps = updateTargetGraph(trainables, self.tau) 272 | 273 | config = tf.ConfigProto() 274 | config.gpu_options.allow_growth = True 275 | config.allow_soft_placement = True 276 | self.sess = tf.Session(config=config) 277 | self.sess.run(init) 278 | updateTarget(self.targetOps, self.sess) 279 | self.writer = tf.summary.FileWriter(self.output_path) 280 | 281 | def calc_q_values(self, state): 282 | """Given a state (or batch of states) calculate the Q-values. 283 | 284 | Basically run your network on these states. 285 | 286 | Return 287 | ------ 288 | Q-values for the state(s) 289 | """ 290 | state = state[None, :, :, :] 291 | # return self.q_network.predict_on_batch(state) 292 | # print state.shape 293 | # Qout = self.sess.run(self.q_network.rnn_outputs,\ 294 | # feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 295 | # print Qout.shape 296 | Qout = self.sess.run(self.q_network.Qout,\ 297 | feed_dict={self.q_network.imageIn: state, self.q_network.batch_size:1}) 298 | # print Qout.shape 299 | return Qout 300 | 301 | def select_action(self, state, is_training = True, **kwargs): 302 | """Select the action based on the current state. 
303 | 304 | You will probably want to vary your behavior here based on 305 | which stage of training your in. For example, if you're still 306 | collecting random samples you might want to use a 307 | UniformRandomPolicy. 308 | 309 | If you're testing, you might want to use a GreedyEpsilonPolicy 310 | with a low epsilon. 311 | 312 | If you're training, you might want to use the 313 | LinearDecayGreedyEpsilonPolicy. 314 | 315 | This would also be a good place to call 316 | process_state_for_network in your preprocessor. 317 | 318 | Returns 319 | -------- 320 | selected action 321 | """ 322 | q_values = self.calc_q_values(state) 323 | if is_training: 324 | if kwargs['policy_type'] == 'UniformRandomPolicy': 325 | return UniformRandomPolicy(self.num_actions).select_action() 326 | else: 327 | # linear decay greedy epsilon policy 328 | return self.policy.select_action(q_values, is_training) 329 | else: 330 | # return GreedyEpsilonPolicy(0.05).select_action(q_values) 331 | return GreedyPolicy().select_action(q_values) 332 | 333 | def update_policy(self, current_sample): 334 | """Update your policy. 335 | 336 | Behavior may differ based on what stage of training your 337 | in. If you're in training mode then you should check if you 338 | should update your network parameters based on the current 339 | step and the value you set for train_freq. 340 | 341 | Inside, you'll want to sample a minibatch, calculate the 342 | target values, update your network, and then update your 343 | target values. 344 | 345 | You might want to return the loss and other metrics as an 346 | output. They can help you monitor how training is going. 347 | """ 348 | batch_size = self.batch_size 349 | 350 | if self.no_experience: 351 | states = np.stack([current_sample.state]) 352 | next_states = np.stack([current_sample.next_state]) 353 | rewards = np.asarray([current_sample.reward]) 354 | mask = np.asarray([1 - int(current_sample.is_terminal)]) 355 | 356 | action_mask = np.zeros((1, self.num_actions)) 357 | action_mask[0, current_sample.action] = 1.0 358 | else: 359 | samples = self.memory.sample(batch_size) 360 | samples = self.atari_processor.process_batch(samples) 361 | 362 | states = np.stack([x.state for x in samples]) 363 | actions = np.asarray([x.action for x in samples]) 364 | # action_mask = np.zeros((batch_size, self.num_actions)) 365 | # action_mask[range(batch_size), actions] = 1.0 366 | 367 | next_states = np.stack([x.next_state for x in samples]) 368 | mask = np.asarray([1 - int(x.is_terminal) for x in samples]) 369 | rewards = np.asarray([x.reward for x in samples]) 370 | 371 | if self.no_target: 372 | next_qa_value = self.q_network.predict_on_batch(next_states) 373 | else: 374 | # next_qa_value = self.target_network.predict_on_batch(next_states) 375 | next_qa_value = self.sess.run(self.target_network.Qout,\ 376 | feed_dict={self.target_network.imageIn: next_states, self.target_network.batch_size:batch_size}) 377 | 378 | if self.enable_ddqn: 379 | # qa_value = self.q_network.predict_on_batch(next_states) 380 | qa_value = self.sess.run(self.q_network.Qout,\ 381 | feed_dict={self.q_network.imageIn: next_states, self.q_network.batch_size:batch_size}) 382 | max_actions = np.argmax(qa_value, axis = 1) 383 | next_qa_value = next_qa_value[range(batch_size), max_actions] 384 | else: 385 | next_qa_value = np.max(next_qa_value, axis = 1) 386 | # print rewards.shape, mask.shape, next_qa_value.shape, batch_size 387 | target = rewards + self.gamma * mask * next_qa_value 388 | 389 | loss, _, rnn = 
self.sess.run([self.q_network.loss, self.q_network.updateModel, self.q_network.rnn], \ 390 | feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 391 | self.q_network.actions: actions, self.q_network.targetQ: target}) 392 | # print rnn[:5] 393 | # if np.random.random() < 0.001: 394 | # merged = self.sess.run(self.q_network.summary_merged, \ 395 | # feed_dict={self.q_network.imageIn: states, self.q_network.batch_size:batch_size, \ 396 | # self.q_network.actions: actions, self.q_network.targetQ: target}) 397 | # self.writer.add_summary(merged) 398 | # self.writer.flush() 399 | # print '----- writer flushed.' 400 | # return self.final_model.train_on_batch([states, action_mask], target), np.mean(target) 401 | return loss, np.mean(target) 402 | 403 | def fit(self, env, num_iterations, max_episode_length=None): 404 | """Fit your model to the provided environment. 405 | 406 | Its a good idea to print out things like loss, average reward, 407 | Q-values, etc to see if your agent is actually improving. 408 | 409 | You should probably also periodically save your network 410 | weights and any other useful info. 411 | 412 | This is where you should sample actions from your network, 413 | collect experience samples and add them to your replay memory, 414 | and update your network parameters. 415 | 416 | Parameters 417 | ---------- 418 | env: gym.Env 419 | This is your Atari environment. You should wrap the 420 | environment using the wrap_atari_env function in the 421 | utils.py 422 | num_iterations: int 423 | How many samples/updates to perform. 424 | max_episode_length: int 425 | How long a single episode should last before the agent 426 | resets. Can help exploration. 427 | """ 428 | is_training = True 429 | print("Training starts.") 430 | self.save_model(0) 431 | eval_count = 0 432 | 433 | state = env.reset() 434 | burn_in = True 435 | idx_episode = 1 436 | episode_loss = .0 437 | episode_frames = 0 438 | episode_reward = .0 439 | episode_raw_reward = .0 440 | episode_target_value = .0 441 | for t in range(self.num_burn_in + num_iterations): 442 | print ("iteration --> %s, episode --> %s" % (t, idx_episode)) 443 | action_state = self.history_processor.process_state_for_network( 444 | self.atari_processor.process_state_for_network(state)) 445 | policy_type = "UniformRandomPolicy" if burn_in else "LinearDecayGreedyEpsilonPolicy" 446 | action = self.select_action(action_state, is_training, policy_type = policy_type) 447 | processed_state = self.atari_processor.process_state_for_memory(state) 448 | 449 | state, reward, done, info = env.step(action) 450 | 451 | processed_next_state = self.atari_processor.process_state_for_network(state) 452 | action_next_state = np.dstack((action_state, processed_next_state)) 453 | action_next_state = action_next_state[:, :, 1:] 454 | 455 | processed_reward = self.atari_processor.process_reward(reward) 456 | 457 | self.memory.append(processed_state, action, processed_reward, done) 458 | current_sample = Sample(action_state, action, processed_reward, action_next_state, done) 459 | 460 | if not burn_in: 461 | episode_frames += 1 462 | episode_reward += processed_reward 463 | episode_raw_reward += reward 464 | if episode_frames > max_episode_length: 465 | done = True 466 | 467 | if done: 468 | # adding last frame only to save last state 469 | last_frame = self.atari_processor.process_state_for_memory(state) 470 | # action, reward, done doesn't matter here 471 | self.memory.append(last_frame, action, 0, done) 472 | if not burn_in: 473 | 
avg_target_value = episode_target_value / episode_frames 474 | print(">>> Training: time %d, episode %d, length %d, reward %.0f, raw_reward %.0f, loss %.4f, target value %.4f, policy step %d, memory cap %d" % 475 | (t, idx_episode, episode_frames, episode_reward, episode_raw_reward, episode_loss, 476 | avg_target_value, self.policy.step, self.memory.current)) 477 | sys.stdout.flush() 478 | save_scalar(idx_episode, 'train/episode_frames', episode_frames, self.writer) 479 | save_scalar(idx_episode, 'train/episode_reward', episode_reward, self.writer) 480 | save_scalar(idx_episode, 'train/episode_raw_reward', episode_raw_reward, self.writer) 481 | save_scalar(idx_episode, 'train/episode_loss', episode_loss, self.writer) 482 | save_scalar(idx_episode, 'train_avg/avg_reward', episode_reward / episode_frames, self.writer) 483 | save_scalar(idx_episode, 'train_avg/avg_target_value', avg_target_value, self.writer) 484 | save_scalar(idx_episode, 'train_avg/avg_loss', episode_loss / episode_frames, self.writer) 485 | episode_frames = 0 486 | episode_reward = .0 487 | episode_raw_reward = .0 488 | episode_loss = .0 489 | episode_target_value = .0 490 | idx_episode += 1 491 | burn_in = (t < self.num_burn_in) 492 | state = env.reset() 493 | self.atari_processor.reset() 494 | self.history_processor.reset() 495 | 496 | if not burn_in: 497 | if t % self.train_freq == 0: 498 | loss, target_value = self.update_policy(current_sample) 499 | episode_loss += loss 500 | episode_target_value += target_value 501 | # update freq is based on train_freq 502 | if t % (self.train_freq * self.target_update_freq) == 0: 503 | # self.target_network.set_weights(self.q_network.get_weights()) 504 | updateTarget(self.targetOps, self.sess) 505 | print("----- Synced.") 506 | if t % self.save_freq == 0: 507 | self.save_model(idx_episode) 508 | # if t % (self.eval_freq * self.train_freq) == 0: 509 | # episode_reward_mean, episode_reward_std, eval_count = self.evaluate(env, 20, eval_count, max_episode_length, True) 510 | # save_scalar(t, 'eval/eval_episode_reward_mean', episode_reward_mean, self.writer) 511 | # save_scalar(t, 'eval/eval_episode_reward_std', episode_reward_std, self.writer) 512 | 513 | self.save_model(idx_episode) 514 | 515 | 516 | def save_model(self, idx_episode): 517 | safe_path = self.output_path + "/qnet" + str(idx_episode) + ".cptk" 518 | self.saver.save(self.sess, safe_path) 519 | # self.q_network.save_weights(safe_path) 520 | print("Network at", idx_episode, "saved to:", safe_path) 521 | 522 | def evaluate(self, env, num_episodes, eval_count, max_episode_length=None, monitor=True): 523 | """Test your agent with a provided environment. 524 | 525 | You shouldn't update your network parameters here. Also if you 526 | have any layers that vary in behavior between train/test time 527 | (such as dropout or batch norm), you should set them to test. 528 | 529 | Basically run your policy on the environment and collect stats 530 | like cumulative reward, average episode length, etc. 531 | 532 | You can also call the render function here if you want to 533 | visually inspect your policy. 
534 | """ 535 | print("Evaluation starts.") 536 | plt.figure(1, figsize=(40, 20)) 537 | 538 | is_training = False 539 | if self.load_network: 540 | self.q_network.load_weights(self.load_network_path) 541 | print("Load network from:", self.load_network_path) 542 | if monitor: 543 | env = wrappers.Monitor(env, self.output_path_videos, video_callable=lambda x:True, resume=True) 544 | state = env.reset() 545 | 546 | idx_episode = 1 547 | episode_frames = 0 548 | episode_reward = np.zeros(num_episodes) 549 | t = 0 550 | 551 | while idx_episode <= num_episodes: 552 | t += 1 553 | action_state = self.history_processor.process_state_for_network( 554 | self.atari_processor.process_state_for_network(state)) 555 | action = self.select_action(action_state, is_training, policy_type = 'GreedyEpsilonPolicy') 556 | 557 | action_state_ori = self.history_processor.process_state_for_network_ori( 558 | self.atari_processor.process_state_for_network_ori(state)) 559 | # print "state.shape", state.shape 560 | # print "action_state_ori.shape", action_state_ori.shape 561 | 562 | if np.random.random() < 1e-3: 563 | alpha_list = self.sess.run(self.q_network.alpha_list,\ 564 | feed_dict={self.q_network.imageIn: action_state[None, :, :, :], self.q_network.batch_size:1}) 565 | # print alpha_list, len(alpha_list), alpha_list[0].shape #10 (1, 49) 566 | for alpha_idx in range(len(alpha_list)): 567 | plt.subplot(2, len(alpha_list)//2, alpha_idx+1) 568 | img = action_state_ori[:, :, :, alpha_idx] #(210, 160, 3) 569 | plt.imshow(img) 570 | alp_curr = alpha_list[alpha_idx].reshape(7, 7) 571 | alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=22, sigma=20) 572 | plt.imshow(scipy.misc.imresize(alp_img, (img.shape[0], img.shape[1])), alpha=0.7, cmap='gray') 573 | plt.axis('off') 574 | # plt.show() 575 | # plt.canvas.draw() 576 | plt.savefig('%sattention_ep%d-frame%d.png'%(self.output_path_images, eval_count, episode_frames)) 577 | 578 | state, reward, done, info = env.step(action) 579 | episode_frames += 1 580 | episode_reward[idx_episode-1] += reward 581 | if episode_frames > max_episode_length: 582 | done = True 583 | if done: 584 | print("Eval: time %d, episode %d, length %d, reward %.0f. @eval_count %s" % 585 | (t, idx_episode, episode_frames, episode_reward[idx_episode-1], eval_count)) 586 | eval_count += 1 587 | save_scalar(eval_count, 'eval/eval_episode_raw_reward', episode_reward[idx_episode-1], self.writer) 588 | save_scalar(eval_count, 'eval/eval_episode_raw_length', episode_frames, self.writer) 589 | sys.stdout.flush() 590 | state = env.reset() 591 | episode_frames = 0 592 | idx_episode += 1 593 | self.atari_processor.reset() 594 | self.history_processor.reset() 595 | 596 | 597 | reward_mean = np.mean(episode_reward) 598 | reward_std = np.std(episode_reward) 599 | print("Evaluation summury: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" % 600 | (num_episodes, reward_mean, reward_std)) 601 | sys.stdout.flush() 602 | 603 | return reward_mean, reward_std, eval_count 604 | --------------------------------------------------------------------------------