├── Chapter02 ├── Frozen_Lake_QLearning.py └── Frozen_Lake_QNetwork.py ├── Chapter03 └── Frozen_Lake_Value_Iteration_MDP.py ├── Chapter04 └── PlayPongPolicyGradients.py ├── Chapter05 ├── Atari_Breakout.py ├── DeepQNetwork_Cartpole.py ├── Deep_Q_Network_Mountain_Car.py ├── Deep_Q_Network_Mountain_Car.pyc ├── MountainCar_SARSA.py └── Mountain_Car_Problem_QLearning.py ├── Chapter06 └── A3C_Pong.py ├── LICENSE ├── README.md └── requirements.sh /Chapter02/Frozen_Lake_QLearning.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from __future__ import print_function 7 | import gym 8 | import numpy as np 9 | import time 10 | 11 | 12 | 13 | # ## Load the environment 14 | 15 | # In[2]: 16 | 17 | 18 | env = gym.make('FrozenLake-v0') 19 | 20 | 21 | # In[5]: 22 | 23 | 24 | s = env.reset() 25 | print(s) 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | env.render() 32 | 33 | 34 | # In[7]: 35 | 36 | 37 | print(env.action_space) #number of actions 38 | 39 | 40 | # In[8]: 41 | 42 | 43 | print(env.observation_space) #number of states 44 | 45 | 46 | # In[9]: 47 | 48 | 49 | print("Number of actions : ",env.action_space.n) 50 | print("Number of states : ",env.observation_space.n) 51 | 52 | 53 | # ## Epsilon Greedy 54 | 55 | # In[40]: 56 | 57 | 58 | def epsilon_greedy(Q,s,na): 59 | epsilon = 0.3 60 | p = np.random.uniform(low=0,high=1) 61 | #print(p) 62 | if p > epsilon: 63 | return np.argmax(Q[s,:]) #exploit: pick the action with the highest Q-value for this state 64 | else: 65 | return env.action_space.sample() 66 | 67 | 68 | # ## Q-Learning Implementation 69 | 70 | # In[47]: 71 | 72 | 73 | #Initializing Q-table with zeros 74 | Q = np.zeros([env.observation_space.n,env.action_space.n]) 75 | 76 | #set hyperparameters 77 | lr = 0.5 #learning rate 78 | y = 0.9 #discount factor gamma 79 | eps = 100000 #total number of episodes 80 | 81 | 82 | for i in range(eps): 83 | s = env.reset() 84 | t = False 85 | while(True): 86 | a = epsilon_greedy(Q,s,env.action_space.n) 87 | s_,r,t,_ = env.step(a) 88 | if (r==0): 89 | if t==True: 90 | r = -5 #negative reward for falling into a hole 91 | Q[s_] = np.ones(env.action_space.n)*r #in terminal state Q value equals the reward 92 | else: 93 | r = -1 #small negative reward to discourage long routes 94 | if (r==1): 95 | r = 100 96 | Q[s_] = np.ones(env.action_space.n)*r #in terminal state Q value equals the reward 97 | Q[s,a] = Q[s,a] + lr * (r + y*np.max(Q[s_,:]) - Q[s,a]) 98 | s = s_ 99 | if (t == True) : 100 | break 101 | 102 | 103 | # In[48]: 104 | 105 | 106 | print("Q-table") 107 | print(Q) 108 | 109 | 110 | # In[49]: 111 | 112 | 113 | s = env.reset() 114 | env.render() 115 | while(True): 116 | a = np.argmax(Q[s]) 117 | s_,r,t,_ = env.step(a) 118 | env.render() 119 | s = s_ 120 | if(t==True) : 121 | break 122 | 123 | 124 | # In[ ]: 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /Chapter02/Frozen_Lake_QNetwork.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | 5 | import gym 6 | import numpy as np 7 | import tensorflow as tf 8 | import random 9 | from matplotlib import pyplot as plt 10 | #get_ipython().magic(u'matplotlib inline') 11 | 12 | 13 | # ## Load the Environment 14 | 15 | 16 | 17 | env = gym.make('FrozenLake-v0') 18 | 19 | 20 | # ## Q - Network Implementation 21 | 22 | # ### Creating Neural Network 23 | 24 | 25 | 26 | tf.reset_default_graph() 27 | 
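# Illustration (a minimal sketch added for clarity, assuming the 16-state FrozenLake-v0 grid
# and the `import numpy as np` at the top of this file): the network defined below is a single
# linear layer that maps a one-hot encoded state to one Q-value per action,
# i.e. Q(s, :) = one_hot(s) . W + b, with output shape [1, n_actions].
def one_hot_state(s, n_states=16):
    """Return a [1, n_states] one-hot row vector for the integer state index s."""
    return np.identity(n_states, dtype=np.float32)[s:s + 1]
# one_hot_state(3) is the same row that the training loop further below feeds to the
# `inputs` placeholder via np.identity(env.observation_space.n)[s:s+1].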
28 | #tensors for inputs, weights, biases, Qtarget 29 | inputs = tf.placeholder(shape=[None,env.observation_space.n],dtype=tf.float32) 30 | W = tf.get_variable(name="W",dtype=tf.float32,shape=[env.observation_space.n,env.action_space.n],initializer=tf.contrib.layers.xavier_initializer()) 31 | b = tf.Variable(tf.zeros(shape=[env.action_space.n]),dtype=tf.float32) 32 | qpred = tf.add(tf.matmul(inputs,W),b) 33 | apred = tf.argmax(qpred,1) 34 | 35 | qtar = tf.placeholder(shape=[1,env.action_space.n],dtype=tf.float32) 36 | loss = tf.reduce_sum(tf.square(qtar-qpred)) 37 | 38 | train = tf.train.AdamOptimizer(learning_rate=0.001) 39 | minimizer = train.minimize(loss) 40 | 41 | 42 | # ## Training the neural network 43 | 44 | 45 | 46 | init = tf.global_variables_initializer() 47 | 48 | #learning parameters 49 | y = 0.5 50 | e = 0.3 51 | episodes = 10000 52 | 53 | #list to capture total steps and rewards per episodes 54 | slist = [] 55 | rlist = [] 56 | 57 | with tf.Session() as sess: 58 | sess.run(init) 59 | for i in range(episodes): 60 | s = env.reset() #resetting the environment at the start of each episode 61 | r_total = 0 #to calculate the sum of rewards in the current episode 62 | while(True): 63 | #running the Q-network created above 64 | a_pred,q_pred = sess.run([apred,qpred],feed_dict={inputs:np.identity(env.observation_space.n)[s:s+1]}) 65 | #a_pred is the action prediction by the neural network 66 | #q_pred contains q_values of the actions at current state 's' 67 | if np.random.uniform(low=0,high=1) < e: 68 | a_pred[0] = env.action_space.sample() 69 | #exploring different action by randomly assigning them as the next action 70 | s_,r,t,_ = env.step(a_pred[0]) #action taken and new state 's_' is encountered with a feedback reward 'r' 71 | if r==0: 72 | if t==True: 73 | r=-5 #if hole make the reward more negative 74 | else: 75 | r=-1 #if block is fine/frozen then give slight negative reward to optimise the path 76 | if r==1: 77 | r=5 #good positive goat state reward 78 | 79 | q_pred_new = sess.run(qpred,feed_dict={inputs:np.identity(env.observation_space.n)[s_:s_+1]}) 80 | #q_pred_new contains q_values of the actions at the new state 81 | 82 | #update the Q-target value for action taken 83 | targetQ = q_pred 84 | max_qpredn = np.max(q_pred_new) 85 | targetQ[0,a_pred[0]] = r + y*max_qpredn 86 | #this gives our targetQ 87 | 88 | #train the neural network to minimise the loss 89 | _ = sess.run(minimizer,feed_dict={inputs:np.identity(env.observation_space.n)[s:s+1],qtar:targetQ}) 90 | r_total+=r 91 | 92 | s=s_ 93 | if t==True: 94 | break 95 | 96 | #learning ends with the end of the loop of several episodes above 97 | #let's check how much our agent has learned 98 | s = env.reset() 99 | env.render() 100 | while(True): 101 | a = sess.run(apred,feed_dict={inputs:np.identity(env.observation_space.n)[s:s+1]}) 102 | s_,r,t,_ = env.step(a[0]) 103 | env.render() 104 | s = s_ 105 | if t==True: 106 | break 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Chapter03/Frozen_Lake_Value_Iteration_MDP.py: -------------------------------------------------------------------------------- 1 | # importing dependency libraries 2 | from __future__ import print_function 3 | import gym 4 | import numpy as np 5 | import time 6 | 7 | #Load the environment 8 | env = gym.make('FrozenLake-v0') 9 | 10 | s = env.reset() 11 | print(s) 12 | print() 13 | 14 | env.render() 15 | print() 16 | 17 | print(env.action_space) #number of actions 18 | 
print(env.observation_space) #number of states 19 | print() 20 | 21 | print("Number of actions : ",env.action_space.n) 22 | print("Number of states : ",env.observation_space.n) 23 | print() 24 | 25 | # Value Iteration Implementation 26 | 27 | #Initializing Utilities of all states with zeros 28 | U = np.zeros([env.observation_space.n]) 29 | 30 | #since terminal states have utility values equal to their reward 31 | U[15] = 1 #goal state 32 | U[[5,7,11,12]] = -1 #hole states 33 | termS = [5,7,11,12,15] #terminal states 34 | #set hyperparameters 35 | y = 0.8 #discount factor lambda 36 | 37 | eps = 1e-3 #threshold if the learning difference i.e. prev_u - U goes below this value break the learning 38 | 39 | i=0 40 | while(True): 41 | i+=1 42 | prev_u = np.copy(U) 43 | for s in range(env.observation_space.n): 44 | q_sa = [sum([p*(r + y*prev_u[s_]) for p, s_, r, _ in env.env.P[s][a]]) for a in range(env.action_space.n)] 45 | if s not in termS: 46 | U[s] = max(q_sa) 47 | if (np.sum(np.fabs(prev_u - U)) <= eps): 48 | print ('Value-iteration converged at iteration# %d.' %(i+1)) 49 | break 50 | 51 | print("After learning completion printing the utilities for each states below from state ids 0-15") 52 | print() 53 | print(U[:4]) 54 | print(U[4:8]) 55 | print(U[8:12]) 56 | print(U[12:16]) 57 | -------------------------------------------------------------------------------- /Chapter04/PlayPongPolicyGradients.py: -------------------------------------------------------------------------------- 1 | #import dependencies 2 | import numpy as np #for matrix math 3 | import cPickle as pickle #to save/load model 4 | import gym 5 | 6 | 7 | #hyperparameters 8 | H = 200 #number of nodes in the hidden layer 9 | batch_size = 10 10 | learning_rate = 1e-4 11 | gamma = 0.99 #discount factor 12 | decay_rate = 0.99 #for RMS Prop Optimiser for Gradient Descent 13 | resume = False #to resume from previous checkpoint or not 14 | 15 | 16 | #initialise : init model 17 | D = 80*80 #input dimension 18 | if resume: 19 | model = pickle.load(open('model.v','rb')) 20 | else: 21 | model = {} 22 | #xavier initialisation of weights 23 | model['W1'] = np.random.randn(H,D)*np.sqrt(2.0/D) 24 | model['W2'] = np.random.randn(H)*np.sqrt(2.0/H) 25 | grad_buffer = {k: np.zeros_like(v) for k,v in model.iteritems()} #to store our gradients which can be summed up over a batch 26 | rmsprop_cache = {k: np.zeros_like(v) for k,v in model.iteritems()} #to store the value of rms prop formula 27 | 28 | 29 | #activation function 30 | def sigmoid(x): 31 | return 1.0/(1.0+np.exp(-x)) #adding non linearing + squashing 32 | 33 | def relu(x): 34 | x[x<0] = 0 35 | return x 36 | 37 | 38 | #preprocessing function 39 | def prepro(I): #where I is the single frame of the game as the input 40 | """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """ 41 | #the values below have been precomputed through trail and error by OpenAI team members 42 | I = I[35:195] #cropping the image frame to an extent where it contains on the paddles and ball and area between them 43 | I = I[::2,::2,0] #downsample by the factor of 2 and take only the R of the RGB channel.Therefore, now 2D frame 44 | I[I==144] = 0 #erase background type 1 45 | I[I==109] = 0 #erase background type 2 46 | I[I!=0] = 1 #everything else(other than paddles and ball) set to 1 47 | return I.astype('float').ravel() #flattening to 1D 48 | 49 | 50 | def discount_rewards(r): 51 | """ take 1D float array of rewards and compute discounted reward """ 52 | discount_r = np.zeros_like(r) 53 | running_add = 0 
#addition of rewards 54 | for t in reversed(xrange(0,r.size)): 55 | if r[t] != 0: #episode ends 56 | running_add = 0 57 | running_add = gamma*running_add+r[t] 58 | discount_r[t] = running_add 59 | return discount_r 60 | 61 | 62 | def policy_forward(x): 63 | h = np.dot(model['W1'],x) 64 | h = relu(h) 65 | logit = np.dot(model['W2'],h) 66 | p = sigmoid(logit) 67 | return p,h #probability of action 2(i.e. UP) and hidden layer state i.e. hidden state 68 | 69 | 70 | def policy_backward(arr_hidden_state,gradient_logp,observation_values): 71 | """ backward pass """ 72 | #arr_hidden_state is array of intermediate hidden states shape [200x1] 73 | #gradient_logp is the loss value [1x1] 74 | dW2 = np.dot(arr_hidden_state.T,gradient_logp).ravel() # [200x1].[1x1] => [200x1] =>flatten=>[1x200] 75 | dh = np.outer(gradient_logp,model['W2']) # [1x1]outer[1x200] => [1x200] 76 | dh = relu(dh) #[1x200] 77 | dW1 = np.dot(dh.T,observation_values) #[200x1].[1x6400] => [200x6400] 78 | return {'W1':dW1,'W2':dW2} 79 | 80 | 81 | #implementation details 82 | env = gym.make('Pong-v0') 83 | observation = env.reset() 84 | prev_x = None #prev frame value in order to compute the difference between current and previous frame 85 | #as discussed frames are static and the difference is used to capture the motion 86 | #Intially None because there's no previous frame if the current frame is the 1st frame of the game 87 | episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] 88 | running_reward = None 89 | reward_sum = 0 90 | episode_number = 0 91 | 92 | 93 | #begin training 94 | while True: 95 | env.render() 96 | #get the input and preprocess it 97 | cur_x = prepro(observation) 98 | #get the frame difference which would be the input to the network 99 | if prev_x is None: 100 | prev_x = np.zeros(D) 101 | x = cur_x - prev_x 102 | prev_x = cur_x 103 | 104 | #forward propagation of the policy network 105 | #sample an action from the returned probability 106 | aprob, h = policy_forward(x) 107 | #stochastic part 108 | if np.random.uniform() < aprob: 109 | action = 2 110 | else: 111 | action = 3 112 | 113 | episode_observations.append(x) #record observation 114 | episode_hidden_layer_values.append(h) #record hidden state 115 | if action == 2: 116 | y = 1 117 | else: 118 | y = 0 119 | 120 | episode_gradient_log_ps.append(y-aprob) #record the gradient 121 | 122 | #new step in the environment 123 | observation,reward,done,info = env.step(action) 124 | reward_sum+=reward #for advantage purpose 125 | episode_rewards.append(reward) #record the reward 126 | 127 | if done: #if the episode is over 128 | episode_number+=1 129 | 130 | #stack inputs,hidden_states,actions,gradients_logp,rewards for the episode 131 | arr_hidden_state = np.vstack(episode_hidden_layer_values) 132 | gradient_logp = np.vstack(episode_gradient_log_ps) 133 | observation_values = np.vstack(episode_observations) 134 | reward_values = np.vstack(episode_rewards) 135 | 136 | #reset the memory arrays 137 | episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] 138 | 139 | #discounted reward computation 140 | discounted_episoderewards = discount_rewards(reward_values) 141 | #normalise discounted_episoderewards i.e. 
we obtain Advantage 142 | discounted_episoderewards = (discounted_episoderewards - np.mean(discounted_episoderewards))/np.std(discounted_episoderewards) 143 | 144 | #modulate the gradient with the advantage 145 | gradient_logp *= discounted_episoderewards 146 | 147 | grad = policy_backward(arr_hidden_state,gradient_logp,observation_values) 148 | 149 | #summing the gradients over the batch size 150 | for layer in model: 151 | grad_buffer[layer]+=grad[layer] 152 | 153 | #perform RMS prop to update weights after every 10 episodes 154 | if episode_number % batch_size == 0: 155 | epsilon = 1e-5 156 | for weight in model.keys(): 157 | g = grad_buffer[weight] #gradient 158 | rmsprop_cache[weight] = decay_rate*rmsprop_cache[weight]+(1-decay_rate)*g**2 159 | model[weight]+=learning_rate*g/(np.sqrt(rmsprop_cache[weight]) + epsilon) 160 | grad_buffer[weight] = np.zeros_like(model[weight]) 161 | 162 | 163 | if running_reward is None: 164 | running_reward = reward_sum 165 | else: 166 | running_reward = running_reward*learning_rate+reward_sum*(1-learning_rate) 167 | 168 | print('Episode Reward : {}, Running Mean Award : {}'.format(reward_sum,running_reward)) 169 | if episode_number % 100 == 0: 170 | pickle.dump(model,open('model.v','wb')) 171 | 172 | reward_sum = 0 173 | prev_x = None 174 | observation = env.reset() #resetting the environment since episode has ended 175 | 176 | 177 | if reward != 0: #if reward is either +1 or -1 i.e. an episode has ended 178 | print("Episode {} ended with reward {}".format(episode_number,reward)) 179 | -------------------------------------------------------------------------------- /Chapter05/Atari_Breakout.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import gym 6 | from scipy.misc import imresize 7 | 8 | class DQN: 9 | def __init__(self, 10 | learning_rate, 11 | gamma, 12 | n_features, 13 | n_actions, 14 | epsilon, 15 | parameter_changing_pointer, 16 | memory_size, 17 | epsilon_incrementer): 18 | 19 | tf.reset_default_graph() 20 | self.learning_rate = learning_rate 21 | self.gamma = gamma 22 | self.n_features = n_features 23 | self.n_actions = n_actions 24 | self.epsilon = epsilon 25 | self.batch_size = 32 26 | self.experience_counter = 0 27 | self.epsilon_incrementer = epsilon_incrementer 28 | self.experience_limit = memory_size 29 | self.replace_target_pointer = parameter_changing_pointer 30 | self.learning_counter = 0 31 | self.memory = [] #np.zeros([self.experience_limit,4]) #for experience replay 32 | 33 | self.build_networks() 34 | p_params = tf.get_collection('primary_network_parameters') 35 | t_params = tf.get_collection('target_network_parameters') 36 | self.replacing_target_parameters = [tf.assign(t,p) for t,p in zip(t_params,p_params)] 37 | 38 | self.sess = tf.Session() 39 | self.sess.run(tf.global_variables_initializer()) 40 | 41 | def add_layer(self,inputs,w_shape=None,b_shape=None,layer=None,activation_fn=None,c=None,isconv=False): 42 | w = self.weight_variable(w_shape,layer,c) 43 | b = self.bias_variable(b_shape,layer,c) 44 | eps = tf.constant(value=0.000001, shape=b.shape) 45 | if isconv: 46 | if activation_fn is None: 47 | return self.conv(inputs,w)+b+eps 48 | else: 49 | h_conv = activation_fn(self.conv(inputs,w)+b+eps) 50 | return h_conv 51 | if activation_fn is None: 52 | return tf.matmul(inputs,w)+b+eps 53 | outputs = activation_fn(tf.matmul(inputs,w)+b+eps) 54 | return outputs 55 | 56 | def weight_variable(self,w_shape,layer,c): 57 
| return tf.get_variable('w'+layer,w_shape,initializer=tf.contrib.layers.xavier_initializer(), 58 | dtype=tf.float32,collections=c) 59 | 60 | def bias_variable(self,b_shape,layer,c): 61 | return tf.get_variable('b'+layer,b_shape,initializer=tf.contrib.layers.xavier_initializer(), 62 | dtype=tf.float32,collections=c) 63 | 64 | def conv(self,inputs,w): 65 | #strides [1,x_movement,y_movement,1] 66 | #stride[0] = stride[3] = 1 67 | return tf.nn.conv2d(inputs,w,strides=[1,1,1,1],padding='SAME') 68 | 69 | def build_networks(self): 70 | #primary network 71 | shape = [None] + self.n_features 72 | self.s = tf.placeholder(tf.float32,shape) 73 | self.qtarget = tf.placeholder(tf.float32,[None,self.n_actions]) 74 | 75 | with tf.variable_scope('primary_network'): 76 | c = ['primary_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 77 | #first convolutional layer 78 | with tf.variable_scope('convlayer1'): 79 | l1 = self.add_layer(self.s,w_shape=[5,5,4,32],b_shape=[32],layer='convL1',activation_fn=tf.nn.relu,c=c,isconv=True) 80 | 81 | #first convolutional layer 82 | with tf.variable_scope('convlayer2'): 83 | l2 = self.add_layer(l1,w_shape=[5,5,32,64],b_shape=[64],layer='convL2',activation_fn=tf.nn.relu,c=c,isconv=True) 84 | 85 | #first fully-connected layer 86 | l2 = tf.reshape(l2,[-1,80*80*64]) 87 | with tf.variable_scope('FClayer1'): 88 | l3 = self.add_layer(l2,w_shape=[80*80*64,128],b_shape=[128],layer='fclayer1',activation_fn=tf.nn.relu,c=c) 89 | 90 | #second fully-connected layer 91 | with tf.variable_scope('FClayer2'): 92 | self.qeval = self.add_layer(l3,w_shape=[128,self.n_actions],b_shape=[self.n_actions],layer='fclayer2',c=c) 93 | 94 | with tf.variable_scope('loss'): 95 | self.loss = tf.reduce_mean(tf.squared_difference(self.qtarget,self.qeval)) 96 | 97 | with tf.variable_scope('optimiser'): 98 | self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 99 | 100 | #target network 101 | self.st = tf.placeholder(tf.float32,shape) 102 | 103 | with tf.variable_scope('target_network'): 104 | c = ['target_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 105 | #first convolutional layer 106 | with tf.variable_scope('convlayer1'): 107 | l1 = self.add_layer(self.st,w_shape=[5,5,4,32],b_shape=[32],layer='convL1',activation_fn=tf.nn.relu,c=c,isconv=True) 108 | 109 | #first convolutional layer 110 | with tf.variable_scope('convlayer2'): 111 | l2 = self.add_layer(l1,w_shape=[5,5,32,64],b_shape=[64],layer='convL2',activation_fn=tf.nn.relu,c=c,isconv=True) 112 | 113 | #first fully-connected layer 114 | l2 = tf.reshape(l2,[-1,80*80*64]) 115 | with tf.variable_scope('FClayer1'): 116 | l3 = self.add_layer(l2,w_shape=[80*80*64,128],b_shape=[128],layer='fclayer1',activation_fn=tf.nn.relu,c=c) 117 | 118 | #second fully-connected layer 119 | with tf.variable_scope('FClayer2'): 120 | self.qt = self.add_layer(l3,w_shape=[128,self.n_actions],b_shape=[self.n_actions],layer='fclayer2',c=c) 121 | 122 | def target_params_replaced(self): 123 | self.sess.run(self.replacing_target_parameters) 124 | 125 | def store_experience(self,obs,a,r,obs_): 126 | if len(obs.shape)<3 or len(obs_.shape)<3: 127 | print("Wrong shape entered : ",obs.shape,obs_.shape,len(self.memory)) 128 | else: 129 | index = self.experience_counter % self.experience_limit 130 | if self.experience_counter < self.experience_limit: 131 | self.memory.append([obs,a,r,obs_]) 132 | else: 133 | self.memory[index] = [obs,a,r,obs_] 134 | self.experience_counter+=1 135 | 136 | def fit(self): 137 | # sample batch memory from all memory 138 | 139 | 
#if self.experience_counter < self.experience_limit: 140 | # indices = np.random.choice(self.experience_counter, size=self.batch_size) 141 | #else: 142 | # indices = np.random.choice(self.experience_limit, size=self.batch_size) 143 | 144 | indices = np.random.choice(len(self.memory), size=self.batch_size) 145 | batch = [self.memory[i] for i in indices] 146 | obs_nlist = np.array([i[3] for i in batch]) 147 | obs_list = np.array([i[0] for i in batch]) 148 | qt,qeval = self.sess.run([self.qt,self.qeval],feed_dict={self.st:obs_nlist,self.s:obs_list}) 149 | 150 | qtarget = qeval.copy() 151 | batch_indices = np.arange(self.batch_size, dtype=np.int32) 152 | actions = np.array([int(i[1]) for i in batch])#self.memory[indices,self.n_features].astype(int) 153 | rewards = np.array([int(i[2]) for i in batch])#self.memory[indices,self.n_features+1] 154 | qtarget[batch_indices,actions] = rewards + self.gamma * np.max(qt,axis=1) 155 | 156 | _ = self.sess.run(self.train,feed_dict = {self.s:obs_list,self.qtarget:qtarget}) 157 | print(self.learning_counter+1," learning done") 158 | #increasing epsilon 159 | if self.epsilon < 0.9: 160 | self.epsilon += self.epsilon_incrementer 161 | 162 | #replacing target network parameters with primary network parameters 163 | if self.learning_counter % self.replace_target_pointer == 0: 164 | self.target_params_replaced() 165 | print("target parameters changed") 166 | 167 | self.learning_counter += 1 168 | 169 | def epsilon_greedy(self,obs): 170 | new_shape = [1]+list(obs.shape) 171 | obs = obs.reshape(new_shape) 172 | #epsilon greedy implementation to choose action 173 | if np.random.uniform(low=0,high=1) < self.epsilon: 174 | return np.argmax(self.sess.run(self.qeval,feed_dict={self.s:obs})) #[np.newaxis,:] 175 | else: 176 | return np.random.choice(self.n_actions) 177 | 178 | def preprocessing_image(s): 179 | s = s[31:195] 180 | s = s.mean(axis=2) 181 | s = imresize(s,size=(80,80),interp='nearest') 182 | s = s/255.0 183 | return s 184 | 185 | if __name__ == "__main__": 186 | env = gym.make('Breakout-v0') 187 | env = env.unwrapped 188 | epsilon_rate_change = 0.9/500000.0 189 | dqn = DQN(learning_rate=0.0001, 190 | gamma=0.9, 191 | n_features=[80,80,4], 192 | n_actions=env.action_space.n, 193 | epsilon=0.0, 194 | parameter_changing_pointer=100, 195 | memory_size=50000, 196 | epsilon_incrementer=epsilon_rate_change) 197 | 198 | episodes = 100000 199 | total_steps = 0 200 | 201 | for episode in range(episodes): 202 | steps = 0 203 | 204 | obs = preprocessing_image(env.reset()) 205 | s_rec = np.stack([obs]*4,axis=0) 206 | s = np.stack([obs]*4,axis=0) 207 | s = s.transpose([1,2,0]) 208 | episode_reward = 0 209 | while True: 210 | env.render() 211 | action = dqn.epsilon_greedy(s) 212 | obs_,reward,terminate,_ = env.step(action) 213 | obs_ = preprocessing_image(obs_) 214 | 215 | a = s_rec[1:] 216 | a = a.tolist() 217 | a.append(obs_) 218 | s_rec = np.array(a) 219 | 220 | s_ = s_rec.transpose([1,2,0]) 221 | dqn.store_experience(s,action,reward,s_) 222 | if total_steps > 1999 and total_steps%500==0: 223 | dqn.fit() 224 | episode_reward+=reward 225 | if terminate: 226 | break 227 | s = s_ 228 | total_steps+=1 229 | steps+=1 230 | print("Episode {} with Reward : {} at epsilon {} in steps {}".format(episode+1,episode_reward,dqn.epsilon,steps)) 231 | 232 | while True: #to hold the render at the last step when Car passes the flag 233 | env.render() 234 | -------------------------------------------------------------------------------- /Chapter05/DeepQNetwork_Cartpole.py: 
-------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | import numpy as np 4 | import gym 5 | from Deep_Q_Network_Mountain_Car import DQN 6 | 7 | env = gym.make('CartPole-v0') 8 | env = env.unwrapped 9 | 10 | print(env.action_space) 11 | print(env.observation_space) 12 | print(env.observation_space.high) 13 | print(env.observation_space.low) 14 | 15 | 16 | dqn = DQN(learning_rate=0.01,gamma=0.9,n_features=env.observation_space.shape[0],n_actions=env.action_space.n,epsilon=0.0,parameter_changing_pointer=100,memory_size=2000) 17 | 18 | episodes = 150 19 | total_steps = 0 20 | rew_ep = [] 21 | for episode in range(episodes): 22 | steps = 0 23 | obs = env.reset() 24 | episode_reward = 0 25 | while True: 26 | env.render() 27 | action = dqn.epsilon_greedy(obs) 28 | obs_,reward,terminate,_ = env.step(action) 29 | 30 | #smaller the theta angle and closer to center then better should be the reward 31 | x, vel, angle, ang_vel = obs_ 32 | r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8 33 | r2 = (env.theta_threshold_radians - abs(angle))/env.theta_threshold_radians - 0.5 34 | reward_ = r1 + r2 35 | 36 | dqn.store_experience(obs,action,reward_,obs_) 37 | if total_steps > 1000: 38 | dqn.fit() 39 | episode_reward+=reward 40 | if terminate: 41 | break 42 | obs = obs_ 43 | total_steps+=1 44 | steps+=1 45 | print("Episode {} with Reward : {} at epsilon {} in steps {}".format(episode+1,episode_reward,dqn.epsilon,steps)) 46 | rew_ep.append(episode_reward) 47 | print("Mean over last 100 episodes are: ",np.mean(rew_ep[50:])) 48 | 49 | while True: #to hold the render at the last step when Car passes the flag 50 | env.render() 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /Chapter05/Deep_Q_Network_Mountain_Car.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import gym 6 | 7 | class DQN: 8 | def __init__(self,learning_rate,gamma,n_features,n_actions,epsilon,parameter_changing_pointer,memory_size): 9 | 10 | self.learning_rate = learning_rate 11 | self.gamma = gamma 12 | self.n_features = n_features 13 | self.n_actions = n_actions 14 | self.epsilon = epsilon 15 | self.batch_size = 100 16 | self.experience_counter = 0 17 | self.experience_limit = memory_size 18 | self.replace_target_pointer = parameter_changing_pointer 19 | self.learning_counter = 0 20 | self.memory = np.zeros([self.experience_limit,self.n_features*2+2]) #for experience replay 21 | 22 | self.build_networks() 23 | p_params = tf.get_collection('primary_network_parameters') 24 | t_params = tf.get_collection('target_network_parameters') 25 | self.replacing_target_parameters = [tf.assign(t,p) for t,p in zip(t_params,p_params)] 26 | 27 | self.sess = tf.Session() 28 | self.sess.run(tf.global_variables_initializer()) 29 | 30 | 31 | def build_networks(self): 32 | #primary network 33 | hidden_units = 10 34 | self.s = tf.placeholder(tf.float32,[None,self.n_features]) 35 | self.qtarget = tf.placeholder(tf.float32,[None,self.n_actions]) 36 | 37 | with tf.variable_scope('primary_network'): 38 | c = ['primary_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 39 | # first layer 40 | with tf.variable_scope('layer1'): 41 | w1 = tf.get_variable('w1', [self.n_features, hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 42 | dtype=tf.float32,collections=c) 43 | b1 = tf.get_variable('b1', [1, 
hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 44 | dtype=tf.float32,collections=c) 45 | l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1) 46 | 47 | # second layer 48 | with tf.variable_scope('layer2'): 49 | w2 = tf.get_variable('w2', [hidden_units, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 50 | dtype=tf.float32,collections=c) 51 | b2 = tf.get_variable('b2', [1, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 52 | dtype=tf.float32,collections=c) 53 | self.qeval = tf.matmul(l1, w2) + b2 54 | 55 | 56 | with tf.variable_scope('loss'): 57 | self.loss = tf.reduce_mean(tf.squared_difference(self.qtarget,self.qeval)) 58 | 59 | with tf.variable_scope('optimiser'): 60 | self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 61 | 62 | #target network 63 | self.st = tf.placeholder(tf.float32,[None,self.n_features]) 64 | 65 | with tf.variable_scope('target_network'): 66 | c = ['target_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 67 | # first layer 68 | with tf.variable_scope('layer1'): 69 | w1 = tf.get_variable('w1', [self.n_features, hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 70 | dtype=tf.float32,collections=c) 71 | b1 = tf.get_variable('b1', [1, hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 72 | dtype=tf.float32,collections=c) 73 | l1 = tf.nn.relu(tf.matmul(self.st, w1) + b1) 74 | 75 | # second layer 76 | with tf.variable_scope('layer2'): 77 | w2 = tf.get_variable('w2', [hidden_units, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 78 | dtype=tf.float32,collections=c) 79 | b2 = tf.get_variable('b2', [1, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 80 | dtype=tf.float32,collections=c) 81 | self.qt = tf.matmul(l1, w2) + b2 82 | 83 | def target_params_replaced(self): 84 | self.sess.run(self.replacing_target_parameters) 85 | 86 | def store_experience(self,obs,a,r,obs_): 87 | index = self.experience_counter % self.experience_limit 88 | self.memory[index,:] = np.hstack((obs,[a,r],obs_)) 89 | self.experience_counter+=1 90 | 91 | def fit(self): 92 | # sample batch memory from all memory 93 | if self.experience_counter < self.experience_limit: 94 | indices = np.random.choice(self.experience_counter, size=self.batch_size) 95 | else: 96 | indices = np.random.choice(self.experience_limit, size=self.batch_size) 97 | 98 | batch = self.memory[indices,:] 99 | qt,qeval = self.sess.run([self.qt,self.qeval],feed_dict={self.st:batch[:,-self.n_features:],self.s:batch[:,:self.n_features]}) 100 | 101 | qtarget = qeval.copy() 102 | batch_indices = np.arange(self.batch_size, dtype=np.int32) 103 | actions = self.memory[indices,self.n_features].astype(int) 104 | rewards = self.memory[indices,self.n_features+1] 105 | qtarget[batch_indices,actions] = rewards + self.gamma * np.max(qt,axis=1) 106 | 107 | _ = self.sess.run(self.train,feed_dict = {self.s:batch[:,:self.n_features],self.qtarget:qtarget}) 108 | 109 | #increasing epsilon 110 | if self.epsilon < 0.9: 111 | self.epsilon += 0.0002 112 | 113 | #replacing target network parameters with primary network parameters 114 | if self.learning_counter % self.replace_target_pointer == 0: 115 | self.target_params_replaced() 116 | print("target parameters changed") 117 | 118 | self.learning_counter += 1 119 | 120 | def epsilon_greedy(self,obs): 121 | #epsilon greedy implementation to choose action 122 | if np.random.uniform(low=0,high=1) < self.epsilon: 123 | return 
np.argmax(self.sess.run(self.qeval,feed_dict={self.s:obs[np.newaxis,:]})) 124 | else: 125 | return np.random.choice(self.n_actions) 126 | 127 | 128 | 129 | if __name__ == "__main__": 130 | env = gym.make('MountainCar-v0') 131 | env = env.unwrapped 132 | dqn = DQN(learning_rate=0.001,gamma=0.9,n_features=env.observation_space.shape[0],n_actions=env.action_space.n,epsilon=0.0,parameter_changing_pointer=500,memory_size=5000) 133 | 134 | episodes = 10 135 | total_steps = 0 136 | 137 | for episode in range(episodes): 138 | steps = 0 139 | obs = env.reset() 140 | episode_reward = 0 141 | while True: 142 | env.render() 143 | action = dqn.epsilon_greedy(obs) 144 | obs_,reward,terminate,_ = env.step(action) 145 | reward = abs(obs_[0]+0.5) 146 | dqn.store_experience(obs,action,reward,obs_) 147 | if total_steps > 1000: 148 | dqn.fit() 149 | episode_reward+=reward 150 | if terminate: 151 | break 152 | obs = obs_ 153 | total_steps+=1 154 | steps+=1 155 | print("Episode {} with Reward : {} at epsilon {} in steps {}".format(episode+1,episode_reward,dqn.epsilon,steps)) 156 | 157 | while True: #to hold the render at the last step when Car passes the flag 158 | env.render() 159 | 160 | -------------------------------------------------------------------------------- /Chapter05/Deep_Q_Network_Mountain_Car.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-with-TensorFlow/abb2873f892dc5232c3c5352e165de37e6f2a6fe/Chapter05/Deep_Q_Network_Mountain_Car.pyc -------------------------------------------------------------------------------- /Chapter05/MountainCar_SARSA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | #Q-Learning example using OpenAI gym MountainCar environment 5 | 6 | import gym 7 | import numpy as np 8 | 9 | 10 | n_states = 40 11 | episodes = 10 #number of episodes 12 | 13 | initial_lr = 1.0 # Learning rate 14 | min_lr = 0.005 15 | gamma = 0.99 16 | max_steps = 300 17 | epsilon = 0.05 18 | 19 | 20 | #exploring Mountain Car environment 21 | 22 | env_name = 'MountainCar-v0' 23 | env = gym.make(env_name) 24 | 25 | 26 | #One-dimensional discrete action space. 27 | #left,neutral,right 28 | print("Action Set size :",env.action_space) 29 | 30 | 31 | #Two-dimensional continuous state space. 32 | #Velocity=(-0.07,0.07) 33 | #Position=(-1.2,0.6) 34 | print("Observation set shape :",env.observation_space) # 2 (position,velocity) 35 | print("Highest state feature value :",env.observation_space.high) # i.e. 
(position = 0.6, velocity = 0.07) 36 | print("Lowest state feature value:",env.observation_space.low) #(position = -1.2, velocity = -0.07) 37 | print(env.observation_space.shape) # 2 38 | 39 | 40 | 41 | #Discretization of continuous state space : Converting continuous state space observation to a discrete set of state space 42 | 43 | def discretization(env, obs): 44 | 45 | env_low = env.observation_space.low 46 | env_high = env.observation_space.high 47 | 48 | env_den = (env_high - env_low) / n_states 49 | pos_den = env_den[0] 50 | vel_den = env_den[1] 51 | 52 | pos_high = env_high[0] 53 | pos_low = env_low[0] 54 | vel_high = env_high[1] 55 | vel_low = env_low[1] 56 | 57 | pos_scaled = int((obs[0] - pos_low)/pos_den) 58 | vel_scaled = int((obs[1] - vel_low)/vel_den) 59 | 60 | return pos_scaled,vel_scaled 61 | 62 | env = env.unwrapped 63 | env.seed(0) 64 | np.random.seed(0) 65 | #Q table 66 | #rows are states but here state is 2-D pos,vel 67 | #columns are actions 68 | #therefore, Q- table would be 3-D 69 | 70 | q_table = np.zeros((n_states,n_states,env.action_space.n)) 71 | total_steps = 0 72 | for episode in range(episodes): 73 | obs = env.reset() 74 | total_reward = 0 75 | # decreasing learning rate alpha over time 76 | alpha = max(min_lr,initial_lr*(gamma**(episode//100))) 77 | steps = 0 78 | 79 | #action for the initial state using epsilon greedy 80 | if np.random.uniform(low=0,high=1) < epsilon: 81 | a = np.random.choice(env.action_space.n) 82 | else: 83 | pos,vel = discretization(env,obs) 84 | a = np.argmax(q_table[pos][vel]) 85 | 86 | while True: 87 | env.render() 88 | pos,vel = discretization(env,obs) 89 | 90 | obs,reward,terminate,_ = env.step(a) 91 | total_reward += abs(obs[0]+0.5) 92 | pos_,vel_ = discretization(env,obs) 93 | 94 | #action for the next state using epsilon greedy 95 | if np.random.uniform(low=0,high=1) < epsilon: 96 | a_ = np.random.choice(env.action_space.n) 97 | else: 98 | a_ = np.argmax(q_table[pos_][vel_]) 99 | 100 | #q-table update 101 | q_table[pos][vel][a] = (1-alpha)*q_table[pos][vel][a] + alpha*(reward+gamma*q_table[pos_][vel_][a_]) 102 | steps+=1 103 | if terminate: 104 | break 105 | a = a_ 106 | print("Episode {} completed with total reward {} in {} steps".format(episode+1,total_reward,steps)) 107 | 108 | while True: #to hold the render at the last step when Car passes the flag 109 | env.render() 110 | -------------------------------------------------------------------------------- /Chapter05/Mountain_Car_Problem_QLearning.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | #Q-Learning example using OpenAI gym MountainCar environment 5 | 6 | import gym 7 | import numpy as np 8 | 9 | 10 | n_states = 40 11 | episodes = 10 #number of episodes 12 | 13 | initial_lr = 1.0 # Learning rate 14 | min_lr = 0.005 15 | gamma = 0.99 16 | max_steps = 300 17 | epsilon = 0.05 18 | 19 | 20 | #exploring Mountain Car environment 21 | 22 | env_name = 'MountainCar-v0' 23 | env = gym.make(env_name) 24 | 25 | 26 | #One-dimensional discrete action space. 27 | #left,neutral,right 28 | print("Action Set size :",env.action_space) 29 | 30 | 31 | #Two-dimensional continuous state space. 32 | #Velocity=(-0.07,0.07) 33 | #Position=(-1.2,0.6) 34 | print("Observation set shape :",env.observation_space) # 2 (position,velocity) 35 | print("Highest state feature value :",env.observation_space.high) # i.e. 
(position = 0.6, velocity = 0.07) 36 | print("Lowest state feature value:",env.observation_space.low) #(position = -1.2, velocity = -0.07) 37 | print(env.observation_space.shape) # 2 38 | 39 | 40 | 41 | #Discretization of continuous state space : Converting continuous state space observation to a discrete set of state space 42 | 43 | def discretization(env, obs): 44 | env_low = env.observation_space.low 45 | env_high = env.observation_space.high 46 | 47 | env_den = (env_high - env_low) / n_states 48 | pos_den = env_den[0] 49 | vel_den = env_den[1] 50 | 51 | pos_high = env_high[0] 52 | pos_low = env_low[0] 53 | vel_high = env_high[1] 54 | vel_low = env_low[1] 55 | 56 | pos_scaled = int((obs[0] - pos_low)/pos_den) 57 | vel_scaled = int((obs[1] - vel_low)/vel_den) 58 | 59 | return pos_scaled,vel_scaled 60 | 61 | env = env.unwrapped 62 | env.seed(0) 63 | np.random.seed(0) 64 | #Q table 65 | #rows are states but here state is 2-D pos,vel 66 | #columns are actions 67 | #therefore, Q- table would be 3-D 68 | 69 | q_table = np.zeros((n_states,n_states,env.action_space.n)) 70 | total_steps = 0 71 | for episode in range(episodes): 72 | obs = env.reset() 73 | total_reward = 0 74 | # decreasing learning rate alpha over time 75 | alpha = max(min_lr,initial_lr*(gamma**(episode//100))) 76 | steps = 0 77 | while True: 78 | env.render() 79 | pos,vel = discretization(env,obs) 80 | 81 | if np.random.uniform(low=0,high=1) < epsilon: 82 | a = np.random.choice(env.action_space.n) 83 | else: 84 | a = np.argmax(q_table[pos][vel]) 85 | #q_val = q_table[pos][vel] 86 | #logits = np.exp(q_val) 87 | #probabilities = logits/np.sum(logits) 88 | #a = np.random.choice(env.action_space.n,p=probabilities) 89 | 90 | obs,reward,terminate,_ = env.step(a) 91 | total_reward += abs(obs[0]+0.5)#reward 92 | 93 | #q-table update 94 | pos_,vel_ = discretization(env,obs) 95 | q_table[pos][vel][a] = (1-alpha)*q_table[pos][vel][a] + alpha*(reward+gamma*np.max(q_table[pos_][vel_])) 96 | steps+=1 97 | if terminate: 98 | break 99 | print("Episode {} completed with total reward {} in {} steps".format(episode+1,total_reward,steps)) 100 | 101 | while True: #to hold the render at the last step when Car passes the flag 102 | env.render() 103 | -------------------------------------------------------------------------------- /Chapter06/A3C_Pong.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import multiprocessing 4 | import threading 5 | import tensorflow as tf 6 | import numpy as np 7 | import gym 8 | import os 9 | import shutil 10 | import matplotlib.pyplot as plt 11 | 12 | def preprocessing_image(obs): #where I is the single frame of the game as the input 13 | """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """ 14 | #the values below have been precomputed through trail and error by OpenAI team members 15 | obs = obs[35:195] #cropping the image frame to an extent where it contains on the paddles and ball and area between them 16 | obs = obs[::2,::2,0] #downsample by the factor of 2 and take only the R of the RGB channel.Therefore, now 2D frame 17 | obs[obs==144] = 0 #erase background type 1 18 | obs[obs==109] = 0 #erase background type 2 19 | obs[obs!=0] = 1 #everything else(other than paddles and ball) set to 1 20 | return obs.astype('float').ravel() #flattening to 1D 21 | 22 | 23 | game_env = 'Pong-v0' 24 | num_workers = multiprocessing.cpu_count() 25 | max_global_episodes = 100000 26 | global_network_scope = 'globalnet' 27 | global_iteration_update = 20 28 | 
gamma = 0.9 29 | beta = 0.0001 30 | lr_actor = 0.0001 # learning rate for actor 31 | lr_critic = 0.0001 # learning rate for critic 32 | global_running_rate = [] 33 | global_episode = 0 34 | 35 | env = gym.make(game_env) 36 | 37 | num_actions = env.action_space.n 38 | 39 | 40 | tf.reset_default_graph() 41 | 42 | 43 | class ActorCriticNetwork(object): 44 | def __init__(self, scope, globalAC=None): 45 | if scope == global_network_scope: # get global network 46 | with tf.variable_scope(scope): 47 | self.s = tf.placeholder(tf.float32, [None,6400], 'state') 48 | self.a_params, self.c_params = self._build_net(scope)[-2:] 49 | else: # local net, calculate losses 50 | with tf.variable_scope(scope): 51 | self.s = tf.placeholder(tf.float32, [None,6400], 'state') 52 | self.a_his = tf.placeholder(tf.int32, [None,], 'action') 53 | self.v_target = tf.placeholder(tf.float32, [None, 1], 'target_vector') 54 | 55 | self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope) 56 | 57 | td = tf.subtract(self.v_target, self.v, name='temporal_difference_error') 58 | with tf.name_scope('critic_loss'): 59 | self.c_loss = tf.reduce_mean(tf.square(td)) 60 | 61 | with tf.name_scope('actor_loss'): 62 | log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, num_actions, dtype=tf.float32), axis=1, keep_dims=True) 63 | exp_v = log_prob * td 64 | entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5), 65 | axis=1, keep_dims=True) #exploration 66 | self.exp_v = beta * entropy + exp_v 67 | self.a_loss = tf.reduce_mean(-self.exp_v) 68 | 69 | with tf.name_scope('local_grad'): 70 | self.a_grads = tf.gradients(self.a_loss, self.a_params) 71 | self.c_grads = tf.gradients(self.c_loss, self.c_params) 72 | 73 | with tf.name_scope('sync'): 74 | with tf.name_scope('pull'): 75 | self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] 76 | self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] 77 | with tf.name_scope('push'): 78 | self.update_a_op = actor_train.apply_gradients(zip(self.a_grads, globalAC.a_params)) 79 | self.update_c_op = critic_train.apply_gradients(zip(self.c_grads, globalAC.c_params)) 80 | 81 | def _build_net(self, scope): 82 | w_init = tf.random_normal_initializer(0., .1) 83 | with tf.variable_scope('actor_network'): 84 | l_a = tf.layers.dense(self.s, 300, tf.nn.relu6, kernel_initializer=w_init, name='actor_layer') 85 | a_prob = tf.layers.dense(l_a, num_actions, tf.nn.softmax, kernel_initializer=w_init, name='ap') 86 | with tf.variable_scope('critic_network'): 87 | l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='critic_layer') 88 | v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value 89 | a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') 90 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') 91 | return a_prob, v, a_params, c_params 92 | 93 | def update_global(self, feed_dict): # run local 94 | session.run([self.update_a_op, self.update_c_op], feed_dict) # local gradient applied to global net 95 | 96 | def pull_global(self): # run local 97 | session.run([self.pull_a_params_op, self.pull_c_params_op]) 98 | 99 | def choose_action(self, s): # run local 100 | s = np.reshape(s,[-1]) 101 | prob_weights = session.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]}) 102 | action = np.random.choice(range(prob_weights.shape[1]),p=prob_weights.ravel()) # select action w.r.t 
the actions prob 103 | return action 104 | 105 | 106 | 107 | class Worker(object): 108 | def __init__(self, name, globalAC): 109 | self.env = gym.make(game_env).unwrapped 110 | self.name = name 111 | self.AC = ActorCriticNetwork(name, globalAC) 112 | 113 | def work(self): 114 | global global_running_rate, global_episode 115 | total_step = 1 116 | buffer_s, buffer_a, buffer_r = [], [], [] 117 | while not coordinator.should_stop() and global_episode < max_global_episodes: 118 | obs = self.env.reset() 119 | s = preprocessing_image(obs) 120 | ep_r = 0 121 | while True: 122 | if self.name == 'W_0': 123 | self.env.render() 124 | a = self.AC.choose_action(s) 125 | 126 | #print(a.shape) 127 | 128 | obs_, r, done, info = self.env.step(a) 129 | s_ = preprocessing_image(obs_) 130 | if done and r<=0: 131 | r = -20 132 | ep_r += r 133 | buffer_s.append(np.reshape(s,[-1])) 134 | buffer_a.append(a) 135 | buffer_r.append(r) 136 | 137 | if total_step % global_iteration_update == 0 or done: # update global and assign to local net 138 | if done: 139 | v_s_ = 0 # terminal 140 | else: 141 | s_ = np.reshape(s_,[-1]) 142 | v_s_ = session.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] 143 | buffer_v_target = [] 144 | for r in buffer_r[::-1]: # reverse buffer r 145 | v_s_ = r + gamma * v_s_ 146 | buffer_v_target.append(v_s_) 147 | buffer_v_target.reverse() 148 | 149 | buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target) 150 | feed_dict = { 151 | self.AC.s: buffer_s, 152 | self.AC.a_his: buffer_a, 153 | self.AC.v_target: buffer_v_target, 154 | } 155 | self.AC.update_global(feed_dict) 156 | 157 | buffer_s, buffer_a, buffer_r = [], [], [] 158 | self.AC.pull_global() 159 | 160 | s = s_ 161 | total_step += 1 162 | if done: 163 | if len(global_running_rate) == 0: # record running episode reward 164 | global_running_rate.append(ep_r) 165 | else: 166 | global_running_rate.append(0.99 * global_running_rate[-1] + 0.01 * ep_r) 167 | print( 168 | self.name, 169 | "Ep:", global_episode, 170 | "| Ep_r: %i" % global_running_rate[-1], 171 | ) 172 | global_episode += 1 173 | break 174 | 175 | 176 | if __name__ == "__main__": 177 | session = tf.Session() 178 | 179 | with tf.device("/cpu:0"): 180 | actor_train = tf.train.RMSPropOptimizer(lr_actor, name='RMSPropOptimiserActor') 181 | critic_train = tf.train.RMSPropOptimizer(lr_critic, name='RMSPropOptimiserCritic') 182 | acn_global = ActorCriticNetwork(global_network_scope) # we only need its params 183 | workers = [] 184 | # Create worker 185 | for i in range(num_workers): 186 | i_name = 'W_%i' % i # worker name 187 | workers.append(Worker(i_name, acn_global)) 188 | 189 | coordinator = tf.train.Coordinator() 190 | session.run(tf.global_variables_initializer()) 191 | 192 | worker_threads = [] 193 | for worker in workers: 194 | job = lambda: worker.work() 195 | t = threading.Thread(target=job) 196 | t.start() 197 | worker_threads.append(t) 198 | coordinator.join(worker_threads) 199 | 200 | plt.plot(np.arange(len(global_running_rate)), global_running_rate) 201 | plt.xlabel('step') 202 | plt.ylabel('Total moving reward') 203 | plt.show() 204 | 205 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to 
deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Reinforcement Learning with Tensorflow 5 | This is the code repository for [Reinforcement Learning with Tensorflow](https://www.packtpub.com/big-data-and-business-intelligence/reinforcement-learning-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781788835725), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | Reinforcement Learning (RL) allows you to develop smart, quick, self-learning systems in your business environment. It is an effective way to train your learning agents and to solve a variety of problems in Artificial Intelligence, from games, self-driving cars, and robots to enterprise applications that range from data center energy saving (cooling data centers) to smart warehousing solutions. 8 | 9 | The book covers the major advancements and successes achieved in deep reinforcement learning by combining deep neural network architectures with reinforcement learning. It introduces readers to the concept of Reinforcement Learning, its advantages, and why it is gaining so much popularity. It also discusses MDPs, Monte Carlo tree search, dynamic programming methods such as policy and value iteration, and temporal-difference learning methods such as Q-learning and SARSA. You will use TensorFlow and OpenAI Gym to build simple neural network models that learn from their own actions. You will also see how reinforcement learning algorithms play a role in games, image processing, and NLP. 10 | ## Instructions and Navigation 11 | All of the code is organized into folders, one folder per chapter. For example, Chapter02.
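As a quick taste of the temporal-difference methods mentioned above, the sketch below (an illustration, not taken from the chapter code) contrasts the tabular Q-learning and SARSA updates that Chapter05 implements for MountainCar; the Q-table sizes and the transition values are made up for the example:

```
import numpy as np

alpha, gamma = 0.5, 0.9              # learning rate and discount factor
Q_qlearning = np.zeros((16, 4))      # toy Q-tables: 16 states x 4 actions
Q_sarsa = np.zeros((16, 4))
s, a, r, s_, a_ = 0, 1, -1.0, 4, 2   # one made-up transition (s, a, r, s', a')

# Q-learning (off-policy): bootstrap from the greedy action in s'
Q_qlearning[s, a] += alpha * (r + gamma * np.max(Q_qlearning[s_]) - Q_qlearning[s, a])

# SARSA (on-policy): bootstrap from the action a' actually chosen in s'
Q_sarsa[s, a] += alpha * (r + gamma * Q_sarsa[s_, a_] - Q_sarsa[s, a])
```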
12 | 13 | 14 | 15 | The code will look like the following: 16 | ``` 17 | def discretization(env, obs): 18 | env_low = env.observation_space.low 19 | env_high = env.observation_space.high 20 | ``` 21 | 22 | The following are the requirements to get the most out of this book: 23 | * Python and TensorFlow 24 | * Linear algebra as a prerequisite for neural networks 25 | * Installation bundle: Python, TensorFlow, and OpenAI gym (shown in Chapter 1, Deep Learning – Architectures and Frameworks and Chapter 2, Training Reinforcement Learning Agents Using OpenAI Gym) 26 | 27 | ## Related Products 28 | * [Deep Learning with TensorFlow - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/deep-learning-tensorflow-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788831109) 29 | 30 | * [TensorFlow: Powerful Predictive Analytics with TensorFlow](https://www.packtpub.com/big-data-and-business-intelligence/tensorflow-powerful-predictive-analytics-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781789136913) 31 | 32 | * [Hands-On Deep Learning with TensorFlow](https://www.packtpub.com/big-data-and-business-intelligence/hands-deep-learning-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781787282773) 33 | 34 | ### Download a free PDF 35 | 36 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
37 | https://packt.link/free-ebook/9781788835725
-------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get install pip 2 | sudo pip install --upgrade pip 3 | sudo pip install numpy 4 | sudo pip install pandas 5 | sudo pip install tensorflow 6 | sudo pip install gym 7 | --------------------------------------------------------------------------------
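A quick way to confirm that the packages installed by requirements.sh work together is to import them and run a single environment step (a minimal sketch, assuming the Gym 0.x API used by the chapter scripts):

```
import gym
import numpy as np
import tensorflow as tf

print("numpy", np.__version__, "| tensorflow", tf.__version__, "| gym", gym.__version__)
env = gym.make('FrozenLake-v0')                            # any environment used in the chapters
s = env.reset()
s_, r, done, info = env.step(env.action_space.sample())   # one random step (old 4-tuple step API)
print("rollout ok:", s, "->", s_, "| reward", r, "| done", done)
```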