├── Chapter02 ├── Frozen_Lake_QLearning.py └── Frozen_Lake_QNetwork.py ├── Chapter03 └── Frozen_Lake_Value_Iteration_MDP.py ├── Chapter04 └── PlayPongPolicyGradients.py ├── Chapter05 ├── Atari_Breakout.py ├── DeepQNetwork_Cartpole.py ├── Deep_Q_Network_Mountain_Car.py ├── Deep_Q_Network_Mountain_Car.pyc ├── MountainCar_SARSA.py └── Mountain_Car_Problem_QLearning.py ├── Chapter06 └── A3C_Pong.py ├── LICENSE ├── README.md └── requirements.sh /Chapter02/Frozen_Lake_QLearning.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from __future__ import print_function 7 | import gym 8 | import numpy as np 9 | import time 10 | 11 | 12 | 13 | # ## Load the environment 14 | 15 | # In[2]: 16 | 17 | 18 | env = gym.make('FrozenLake-v0') 19 | 20 | 21 | # In[5]: 22 | 23 | 24 | s = env.reset() 25 | print(s) 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | env.render() 32 | 33 | 34 | # In[7]: 35 | 36 | 37 | print(env.action_space) #number of actions 38 | 39 | 40 | # In[8]: 41 | 42 | 43 | print(env.observation_space) #number of states 44 | 45 | 46 | # In[9]: 47 | 48 | 49 | print("Number of actions : ",env.action_space.n) 50 | print("Number of states : ",env.observation_space.n) 51 | 52 | 53 | # ## Epsilon Greedy 54 | 55 | # In[40]: 56 | 57 | 58 | def epsilon_greedy(Q,s,na): 59 | epsilon = 0.3 60 | p = np.random.uniform(low=0,high=1) 61 | #print(p) 62 | if p > epsilon: 63 | return np.argmax(Q[s,:]) #exploit: pick the action with the highest Q-value for this state 64 | else: 65 | return env.action_space.sample() 66 | 67 | 68 | # ## Q-Learning Implementation 69 | 70 | # In[47]: 71 | 72 | 73 | #Initializing Q-table with zeros 74 | Q = np.zeros([env.observation_space.n,env.action_space.n]) 75 | 76 | #set hyperparameters 77 | lr = 0.5 #learning rate 78 | y = 0.9 #discount factor gamma 79 | eps = 100000 #total number of episodes 80 | 81 | 82 | for i in range(eps): 83 | s = env.reset() 84 | t = False 85 | while(True): 86 | a = epsilon_greedy(Q,s,env.action_space.n) 87 | s_,r,t,_ = env.step(a) 88 | if (r==0): 89 | if t==True: 90 | r = -5 #negative reward for falling into a hole 91 | Q[s_] = np.ones(env.action_space.n)*r #in terminal state Q value equals the reward 92 | else: 93 | r = -1 #small negative reward to discourage long routes 94 | if (r==1): 95 | r = 100 96 | Q[s_] = np.ones(env.action_space.n)*r #in terminal state Q value equals the reward 97 | Q[s,a] = Q[s,a] + lr * (r + y*np.max(Q[s_,:]) - Q[s,a]) 98 | s = s_ 99 | if (t == True) : 100 | break 101 | 102 | 103 | # In[48]: 104 | 105 | 106 | print("Q-table") 107 | print(Q) 108 | 109 | 110 | # In[49]: 111 | 112 | 113 | s = env.reset() 114 | env.render() 115 | while(True): 116 | a = np.argmax(Q[s]) 117 | s_,r,t,_ = env.step(a) 118 | env.render() 119 | s = s_ 120 | if(t==True) : 121 | break 122 | 123 | 124 | # In[ ]: 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /Chapter02/Frozen_Lake_QNetwork.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | 5 | import gym 6 | import numpy as np 7 | import tensorflow as tf 8 | import random 9 | from matplotlib import pyplot as plt 10 | #get_ipython().magic(u'matplotlib inline') 11 | 12 | 13 | # ## Load the Environment 14 | 15 | 16 | 17 | env = gym.make('FrozenLake-v0') 18 | 19 | 20 | # ## Q - Network Implementation 21 | 22 | # ### Creating Neural Network 23 | 24 | 25 | 26 | tf.reset_default_graph() 27 | 
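# Illustration (a minimal sketch added for clarity, assuming the 16-state FrozenLake-v0 grid
# and the `import numpy as np` at the top of this file): the network defined below is a single
# linear layer that maps a one-hot encoded state to one Q-value per action,
# i.e. Q(s, :) = one_hot(s) . W + b, with output shape [1, n_actions].
def one_hot_state(s, n_states=16):
    """Return a [1, n_states] one-hot row vector for the integer state index s."""
    return np.identity(n_states, dtype=np.float32)[s:s + 1]
# one_hot_state(3) is the same row that the training loop further below feeds to the
# `inputs` placeholder via np.identity(env.observation_space.n)[s:s+1].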
28 | #tensors for inputs, weights, biases, Qtarget 29 | inputs = tf.placeholder(shape=[None,env.observation_space.n],dtype=tf.float32) 30 | W = tf.get_variable(name="W",dtype=tf.float32,shape=[env.observation_space.n,env.action_space.n],initializer=tf.contrib.layers.xavier_initializer()) 31 | b = tf.Variable(tf.zeros(shape=[env.action_space.n]),dtype=tf.float32) 32 | qpred = tf.add(tf.matmul(inputs,W),b) 33 | apred = tf.argmax(qpred,1) 34 | 35 | qtar = tf.placeholder(shape=[1,env.action_space.n],dtype=tf.float32) 36 | loss = tf.reduce_sum(tf.square(qtar-qpred)) 37 | 38 | train = tf.train.AdamOptimizer(learning_rate=0.001) 39 | minimizer = train.minimize(loss) 40 | 41 | 42 | # ## Training the neural network 43 | 44 | 45 | 46 | init = tf.global_variables_initializer() 47 | 48 | #learning parameters 49 | y = 0.5 50 | e = 0.3 51 | episodes = 10000 52 | 53 | #list to capture total steps and rewards per episodes 54 | slist = [] 55 | rlist = [] 56 | 57 | with tf.Session() as sess: 58 | sess.run(init) 59 | for i in range(episodes): 60 | s = env.reset() #resetting the environment at the start of each episode 61 | r_total = 0 #to calculate the sum of rewards in the current episode 62 | while(True): 63 | #running the Q-network created above 64 | a_pred,q_pred = sess.run([apred,qpred],feed_dict={inputs:np.identity(env.observation_space.n)[s:s+1]}) 65 | #a_pred is the action prediction by the neural network 66 | #q_pred contains q_values of the actions at current state 's' 67 | if np.random.uniform(low=0,high=1) < e: 68 | a_pred[0] = env.action_space.sample() 69 | #exploring different action by randomly assigning them as the next action 70 | s_,r,t,_ = env.step(a_pred[0]) #action taken and new state 's_' is encountered with a feedback reward 'r' 71 | if r==0: 72 | if t==True: 73 | r=-5 #if hole make the reward more negative 74 | else: 75 | r=-1 #if block is fine/frozen then give slight negative reward to optimise the path 76 | if r==1: 77 | r=5 #good positive goat state reward 78 | 79 | q_pred_new = sess.run(qpred,feed_dict={inputs:np.identity(env.observation_space.n)[s_:s_+1]}) 80 | #q_pred_new contains q_values of the actions at the new state 81 | 82 | #update the Q-target value for action taken 83 | targetQ = q_pred 84 | max_qpredn = np.max(q_pred_new) 85 | targetQ[0,a_pred[0]] = r + y*max_qpredn 86 | #this gives our targetQ 87 | 88 | #train the neural network to minimise the loss 89 | _ = sess.run(minimizer,feed_dict={inputs:np.identity(env.observation_space.n)[s:s+1],qtar:targetQ}) 90 | r_total+=r 91 | 92 | s=s_ 93 | if t==True: 94 | break 95 | 96 | #learning ends with the end of the loop of several episodes above 97 | #let's check how much our agent has learned 98 | s = env.reset() 99 | env.render() 100 | while(True): 101 | a = sess.run(apred,feed_dict={inputs:np.identity(env.observation_space.n)[s:s+1]}) 102 | s_,r,t,_ = env.step(a[0]) 103 | env.render() 104 | s = s_ 105 | if t==True: 106 | break 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Chapter03/Frozen_Lake_Value_Iteration_MDP.py: -------------------------------------------------------------------------------- 1 | # importing dependency libraries 2 | from __future__ import print_function 3 | import gym 4 | import numpy as np 5 | import time 6 | 7 | #Load the environment 8 | env = gym.make('FrozenLake-v0') 9 | 10 | s = env.reset() 11 | print(s) 12 | print() 13 | 14 | env.render() 15 | print() 16 | 17 | print(env.action_space) #number of actions 18 | 
print(env.observation_space) #number of states 19 | print() 20 | 21 | print("Number of actions : ",env.action_space.n) 22 | print("Number of states : ",env.observation_space.n) 23 | print() 24 | 25 | # Value Iteration Implementation 26 | 27 | #Initializing Utilities of all states with zeros 28 | U = np.zeros([env.observation_space.n]) 29 | 30 | #since terminal states have utility values equal to their reward 31 | U[15] = 1 #goal state 32 | U[[5,7,11,12]] = -1 #hole states 33 | termS = [5,7,11,12,15] #terminal states 34 | #set hyperparameters 35 | y = 0.8 #discount factor lambda 36 | 37 | eps = 1e-3 #threshold if the learning difference i.e. prev_u - U goes below this value break the learning 38 | 39 | i=0 40 | while(True): 41 | i+=1 42 | prev_u = np.copy(U) 43 | for s in range(env.observation_space.n): 44 | q_sa = [sum([p*(r + y*prev_u[s_]) for p, s_, r, _ in env.env.P[s][a]]) for a in range(env.action_space.n)] 45 | if s not in termS: 46 | U[s] = max(q_sa) 47 | if (np.sum(np.fabs(prev_u - U)) <= eps): 48 | print ('Value-iteration converged at iteration# %d.' %(i+1)) 49 | break 50 | 51 | print("After learning completion printing the utilities for each states below from state ids 0-15") 52 | print() 53 | print(U[:4]) 54 | print(U[4:8]) 55 | print(U[8:12]) 56 | print(U[12:16]) 57 | -------------------------------------------------------------------------------- /Chapter04/PlayPongPolicyGradients.py: -------------------------------------------------------------------------------- 1 | #import dependencies 2 | import numpy as np #for matrix math 3 | import cPickle as pickle #to save/load model 4 | import gym 5 | 6 | 7 | #hyperparameters 8 | H = 200 #number of nodes in the hidden layer 9 | batch_size = 10 10 | learning_rate = 1e-4 11 | gamma = 0.99 #discount factor 12 | decay_rate = 0.99 #for RMS Prop Optimiser for Gradient Descent 13 | resume = False #to resume from previous checkpoint or not 14 | 15 | 16 | #initialise : init model 17 | D = 80*80 #input dimension 18 | if resume: 19 | model = pickle.load(open('model.v','rb')) 20 | else: 21 | model = {} 22 | #xavier initialisation of weights 23 | model['W1'] = np.random.randn(H,D)*np.sqrt(2.0/D) 24 | model['W2'] = np.random.randn(H)*np.sqrt(2.0/H) 25 | grad_buffer = {k: np.zeros_like(v) for k,v in model.iteritems()} #to store our gradients which can be summed up over a batch 26 | rmsprop_cache = {k: np.zeros_like(v) for k,v in model.iteritems()} #to store the value of rms prop formula 27 | 28 | 29 | #activation function 30 | def sigmoid(x): 31 | return 1.0/(1.0+np.exp(-x)) #adding non linearing + squashing 32 | 33 | def relu(x): 34 | x[x<0] = 0 35 | return x 36 | 37 | 38 | #preprocessing function 39 | def prepro(I): #where I is the single frame of the game as the input 40 | """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """ 41 | #the values below have been precomputed through trail and error by OpenAI team members 42 | I = I[35:195] #cropping the image frame to an extent where it contains on the paddles and ball and area between them 43 | I = I[::2,::2,0] #downsample by the factor of 2 and take only the R of the RGB channel.Therefore, now 2D frame 44 | I[I==144] = 0 #erase background type 1 45 | I[I==109] = 0 #erase background type 2 46 | I[I!=0] = 1 #everything else(other than paddles and ball) set to 1 47 | return I.astype('float').ravel() #flattening to 1D 48 | 49 | 50 | def discount_rewards(r): 51 | """ take 1D float array of rewards and compute discounted reward """ 52 | discount_r = np.zeros_like(r) 53 | running_add = 0 
#addition of rewards 54 | for t in reversed(xrange(0,r.size)): 55 | if r[t] != 0: #episode ends 56 | running_add = 0 57 | running_add = gamma*running_add+r[t] 58 | discount_r[t] = running_add 59 | return discount_r 60 | 61 | 62 | def policy_forward(x): 63 | h = np.dot(model['W1'],x) 64 | h = relu(h) 65 | logit = np.dot(model['W2'],h) 66 | p = sigmoid(logit) 67 | return p,h #probability of action 2(i.e. UP) and hidden layer state i.e. hidden state 68 | 69 | 70 | def policy_backward(arr_hidden_state,gradient_logp,observation_values): 71 | """ backward pass """ 72 | #arr_hidden_state is array of intermediate hidden states shape [200x1] 73 | #gradient_logp is the loss value [1x1] 74 | dW2 = np.dot(arr_hidden_state.T,gradient_logp).ravel() # [200x1].[1x1] => [200x1] =>flatten=>[1x200] 75 | dh = np.outer(gradient_logp,model['W2']) # [1x1]outer[1x200] => [1x200] 76 | dh = relu(dh) #[1x200] 77 | dW1 = np.dot(dh.T,observation_values) #[200x1].[1x6400] => [200x6400] 78 | return {'W1':dW1,'W2':dW2} 79 | 80 | 81 | #implementation details 82 | env = gym.make('Pong-v0') 83 | observation = env.reset() 84 | prev_x = None #prev frame value in order to compute the difference between current and previous frame 85 | #as discussed frames are static and the difference is used to capture the motion 86 | #Intially None because there's no previous frame if the current frame is the 1st frame of the game 87 | episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] 88 | running_reward = None 89 | reward_sum = 0 90 | episode_number = 0 91 | 92 | 93 | #begin training 94 | while True: 95 | env.render() 96 | #get the input and preprocess it 97 | cur_x = prepro(observation) 98 | #get the frame difference which would be the input to the network 99 | if prev_x is None: 100 | prev_x = np.zeros(D) 101 | x = cur_x - prev_x 102 | prev_x = cur_x 103 | 104 | #forward propagation of the policy network 105 | #sample an action from the returned probability 106 | aprob, h = policy_forward(x) 107 | #stochastic part 108 | if np.random.uniform() < aprob: 109 | action = 2 110 | else: 111 | action = 3 112 | 113 | episode_observations.append(x) #record observation 114 | episode_hidden_layer_values.append(h) #record hidden state 115 | if action == 2: 116 | y = 1 117 | else: 118 | y = 0 119 | 120 | episode_gradient_log_ps.append(y-aprob) #record the gradient 121 | 122 | #new step in the environment 123 | observation,reward,done,info = env.step(action) 124 | reward_sum+=reward #for advantage purpose 125 | episode_rewards.append(reward) #record the reward 126 | 127 | if done: #if the episode is over 128 | episode_number+=1 129 | 130 | #stack inputs,hidden_states,actions,gradients_logp,rewards for the episode 131 | arr_hidden_state = np.vstack(episode_hidden_layer_values) 132 | gradient_logp = np.vstack(episode_gradient_log_ps) 133 | observation_values = np.vstack(episode_observations) 134 | reward_values = np.vstack(episode_rewards) 135 | 136 | #reset the memory arrays 137 | episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] 138 | 139 | #discounted reward computation 140 | discounted_episoderewards = discount_rewards(reward_values) 141 | #normalise discounted_episoderewards i.e. 
we obtain Advantage 142 | discounted_episoderewards = (discounted_episoderewards - np.mean(discounted_episoderewards))/np.std(discounted_episoderewards) 143 | 144 | #modulate the gradient with the advantage 145 | gradient_logp *= discounted_episoderewards 146 | 147 | grad = policy_backward(arr_hidden_state,gradient_logp,observation_values) 148 | 149 | #summing the gradients over the batch size 150 | for layer in model: 151 | grad_buffer[layer]+=grad[layer] 152 | 153 | #perform RMS prop to update weights after every 10 episodes 154 | if episode_number % batch_size == 0: 155 | epsilon = 1e-5 156 | for weight in model.keys(): 157 | g = grad_buffer[weight] #gradient 158 | rmsprop_cache[weight] = decay_rate*rmsprop_cache[weight]+(1-decay_rate)*g**2 159 | model[weight]+=learning_rate*g/(np.sqrt(rmsprop_cache[weight]) + epsilon) 160 | grad_buffer[weight] = np.zeros_like(model[weight]) 161 | 162 | 163 | if running_reward is None: 164 | running_reward = reward_sum 165 | else: 166 | running_reward = running_reward*learning_rate+reward_sum*(1-learning_rate) 167 | 168 | print('Episode Reward : {}, Running Mean Award : {}'.format(reward_sum,running_reward)) 169 | if episode_number % 100 == 0: 170 | pickle.dump(model,open('model.v','wb')) 171 | 172 | reward_sum = 0 173 | prev_x = None 174 | observation = env.reset() #resetting the environment since episode has ended 175 | 176 | 177 | if reward != 0: #if reward is either +1 or -1 i.e. an episode has ended 178 | print("Episode {} ended with reward {}".format(episode_number,reward)) 179 | -------------------------------------------------------------------------------- /Chapter05/Atari_Breakout.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import gym 6 | from scipy.misc import imresize 7 | 8 | class DQN: 9 | def __init__(self, 10 | learning_rate, 11 | gamma, 12 | n_features, 13 | n_actions, 14 | epsilon, 15 | parameter_changing_pointer, 16 | memory_size, 17 | epsilon_incrementer): 18 | 19 | tf.reset_default_graph() 20 | self.learning_rate = learning_rate 21 | self.gamma = gamma 22 | self.n_features = n_features 23 | self.n_actions = n_actions 24 | self.epsilon = epsilon 25 | self.batch_size = 32 26 | self.experience_counter = 0 27 | self.epsilon_incrementer = epsilon_incrementer 28 | self.experience_limit = memory_size 29 | self.replace_target_pointer = parameter_changing_pointer 30 | self.learning_counter = 0 31 | self.memory = [] #np.zeros([self.experience_limit,4]) #for experience replay 32 | 33 | self.build_networks() 34 | p_params = tf.get_collection('primary_network_parameters') 35 | t_params = tf.get_collection('target_network_parameters') 36 | self.replacing_target_parameters = [tf.assign(t,p) for t,p in zip(t_params,p_params)] 37 | 38 | self.sess = tf.Session() 39 | self.sess.run(tf.global_variables_initializer()) 40 | 41 | def add_layer(self,inputs,w_shape=None,b_shape=None,layer=None,activation_fn=None,c=None,isconv=False): 42 | w = self.weight_variable(w_shape,layer,c) 43 | b = self.bias_variable(b_shape,layer,c) 44 | eps = tf.constant(value=0.000001, shape=b.shape) 45 | if isconv: 46 | if activation_fn is None: 47 | return self.conv(inputs,w)+b+eps 48 | else: 49 | h_conv = activation_fn(self.conv(inputs,w)+b+eps) 50 | return h_conv 51 | if activation_fn is None: 52 | return tf.matmul(inputs,w)+b+eps 53 | outputs = activation_fn(tf.matmul(inputs,w)+b+eps) 54 | return outputs 55 | 56 | def weight_variable(self,w_shape,layer,c): 57 
| return tf.get_variable('w'+layer,w_shape,initializer=tf.contrib.layers.xavier_initializer(), 58 | dtype=tf.float32,collections=c) 59 | 60 | def bias_variable(self,b_shape,layer,c): 61 | return tf.get_variable('b'+layer,b_shape,initializer=tf.contrib.layers.xavier_initializer(), 62 | dtype=tf.float32,collections=c) 63 | 64 | def conv(self,inputs,w): 65 | #strides [1,x_movement,y_movement,1] 66 | #stride[0] = stride[3] = 1 67 | return tf.nn.conv2d(inputs,w,strides=[1,1,1,1],padding='SAME') 68 | 69 | def build_networks(self): 70 | #primary network 71 | shape = [None] + self.n_features 72 | self.s = tf.placeholder(tf.float32,shape) 73 | self.qtarget = tf.placeholder(tf.float32,[None,self.n_actions]) 74 | 75 | with tf.variable_scope('primary_network'): 76 | c = ['primary_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 77 | #first convolutional layer 78 | with tf.variable_scope('convlayer1'): 79 | l1 = self.add_layer(self.s,w_shape=[5,5,4,32],b_shape=[32],layer='convL1',activation_fn=tf.nn.relu,c=c,isconv=True) 80 | 81 | #first convolutional layer 82 | with tf.variable_scope('convlayer2'): 83 | l2 = self.add_layer(l1,w_shape=[5,5,32,64],b_shape=[64],layer='convL2',activation_fn=tf.nn.relu,c=c,isconv=True) 84 | 85 | #first fully-connected layer 86 | l2 = tf.reshape(l2,[-1,80*80*64]) 87 | with tf.variable_scope('FClayer1'): 88 | l3 = self.add_layer(l2,w_shape=[80*80*64,128],b_shape=[128],layer='fclayer1',activation_fn=tf.nn.relu,c=c) 89 | 90 | #second fully-connected layer 91 | with tf.variable_scope('FClayer2'): 92 | self.qeval = self.add_layer(l3,w_shape=[128,self.n_actions],b_shape=[self.n_actions],layer='fclayer2',c=c) 93 | 94 | with tf.variable_scope('loss'): 95 | self.loss = tf.reduce_mean(tf.squared_difference(self.qtarget,self.qeval)) 96 | 97 | with tf.variable_scope('optimiser'): 98 | self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 99 | 100 | #target network 101 | self.st = tf.placeholder(tf.float32,shape) 102 | 103 | with tf.variable_scope('target_network'): 104 | c = ['target_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 105 | #first convolutional layer 106 | with tf.variable_scope('convlayer1'): 107 | l1 = self.add_layer(self.st,w_shape=[5,5,4,32],b_shape=[32],layer='convL1',activation_fn=tf.nn.relu,c=c,isconv=True) 108 | 109 | #first convolutional layer 110 | with tf.variable_scope('convlayer2'): 111 | l2 = self.add_layer(l1,w_shape=[5,5,32,64],b_shape=[64],layer='convL2',activation_fn=tf.nn.relu,c=c,isconv=True) 112 | 113 | #first fully-connected layer 114 | l2 = tf.reshape(l2,[-1,80*80*64]) 115 | with tf.variable_scope('FClayer1'): 116 | l3 = self.add_layer(l2,w_shape=[80*80*64,128],b_shape=[128],layer='fclayer1',activation_fn=tf.nn.relu,c=c) 117 | 118 | #second fully-connected layer 119 | with tf.variable_scope('FClayer2'): 120 | self.qt = self.add_layer(l3,w_shape=[128,self.n_actions],b_shape=[self.n_actions],layer='fclayer2',c=c) 121 | 122 | def target_params_replaced(self): 123 | self.sess.run(self.replacing_target_parameters) 124 | 125 | def store_experience(self,obs,a,r,obs_): 126 | if len(obs.shape)<3 or len(obs_.shape)<3: 127 | print("Wrong shape entered : ",obs.shape,obs_.shape,len(self.memory)) 128 | else: 129 | index = self.experience_counter % self.experience_limit 130 | if self.experience_counter < self.experience_limit: 131 | self.memory.append([obs,a,r,obs_]) 132 | else: 133 | self.memory[index] = [obs,a,r,obs_] 134 | self.experience_counter+=1 135 | 136 | def fit(self): 137 | # sample batch memory from all memory 138 | 139 | 
#if self.experience_counter < self.experience_limit: 140 | # indices = np.random.choice(self.experience_counter, size=self.batch_size) 141 | #else: 142 | # indices = np.random.choice(self.experience_limit, size=self.batch_size) 143 | 144 | indices = np.random.choice(len(self.memory), size=self.batch_size) 145 | batch = [self.memory[i] for i in indices] 146 | obs_nlist = np.array([i[3] for i in batch]) 147 | obs_list = np.array([i[0] for i in batch]) 148 | qt,qeval = self.sess.run([self.qt,self.qeval],feed_dict={self.st:obs_nlist,self.s:obs_list}) 149 | 150 | qtarget = qeval.copy() 151 | batch_indices = np.arange(self.batch_size, dtype=np.int32) 152 | actions = np.array([int(i[1]) for i in batch])#self.memory[indices,self.n_features].astype(int) 153 | rewards = np.array([int(i[2]) for i in batch])#self.memory[indices,self.n_features+1] 154 | qtarget[batch_indices,actions] = rewards + self.gamma * np.max(qt,axis=1) 155 | 156 | _ = self.sess.run(self.train,feed_dict = {self.s:obs_list,self.qtarget:qtarget}) 157 | print(self.learning_counter+1," learning done") 158 | #increasing epsilon 159 | if self.epsilon < 0.9: 160 | self.epsilon += self.epsilon_incrementer 161 | 162 | #replacing target network parameters with primary network parameters 163 | if self.learning_counter % self.replace_target_pointer == 0: 164 | self.target_params_replaced() 165 | print("target parameters changed") 166 | 167 | self.learning_counter += 1 168 | 169 | def epsilon_greedy(self,obs): 170 | new_shape = [1]+list(obs.shape) 171 | obs = obs.reshape(new_shape) 172 | #epsilon greedy implementation to choose action 173 | if np.random.uniform(low=0,high=1) < self.epsilon: 174 | return np.argmax(self.sess.run(self.qeval,feed_dict={self.s:obs})) #[np.newaxis,:] 175 | else: 176 | return np.random.choice(self.n_actions) 177 | 178 | def preprocessing_image(s): 179 | s = s[31:195] 180 | s = s.mean(axis=2) 181 | s = imresize(s,size=(80,80),interp='nearest') 182 | s = s/255.0 183 | return s 184 | 185 | if __name__ == "__main__": 186 | env = gym.make('Breakout-v0') 187 | env = env.unwrapped 188 | epsilon_rate_change = 0.9/500000.0 189 | dqn = DQN(learning_rate=0.0001, 190 | gamma=0.9, 191 | n_features=[80,80,4], 192 | n_actions=env.action_space.n, 193 | epsilon=0.0, 194 | parameter_changing_pointer=100, 195 | memory_size=50000, 196 | epsilon_incrementer=epsilon_rate_change) 197 | 198 | episodes = 100000 199 | total_steps = 0 200 | 201 | for episode in range(episodes): 202 | steps = 0 203 | 204 | obs = preprocessing_image(env.reset()) 205 | s_rec = np.stack([obs]*4,axis=0) 206 | s = np.stack([obs]*4,axis=0) 207 | s = s.transpose([1,2,0]) 208 | episode_reward = 0 209 | while True: 210 | env.render() 211 | action = dqn.epsilon_greedy(s) 212 | obs_,reward,terminate,_ = env.step(action) 213 | obs_ = preprocessing_image(obs_) 214 | 215 | a = s_rec[1:] 216 | a = a.tolist() 217 | a.append(obs_) 218 | s_rec = np.array(a) 219 | 220 | s_ = s_rec.transpose([1,2,0]) 221 | dqn.store_experience(s,action,reward,s_) 222 | if total_steps > 1999 and total_steps%500==0: 223 | dqn.fit() 224 | episode_reward+=reward 225 | if terminate: 226 | break 227 | s = s_ 228 | total_steps+=1 229 | steps+=1 230 | print("Episode {} with Reward : {} at epsilon {} in steps {}".format(episode+1,episode_reward,dqn.epsilon,steps)) 231 | 232 | while True: #to hold the render at the last step when Car passes the flag 233 | env.render() 234 | -------------------------------------------------------------------------------- /Chapter05/DeepQNetwork_Cartpole.py: 
-------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | import numpy as np 4 | import gym 5 | from Deep_Q_Network_Mountain_Car import DQN 6 | 7 | env = gym.make('CartPole-v0') 8 | env = env.unwrapped 9 | 10 | print(env.action_space) 11 | print(env.observation_space) 12 | print(env.observation_space.high) 13 | print(env.observation_space.low) 14 | 15 | 16 | dqn = DQN(learning_rate=0.01,gamma=0.9,n_features=env.observation_space.shape[0],n_actions=env.action_space.n,epsilon=0.0,parameter_changing_pointer=100,memory_size=2000) 17 | 18 | episodes = 150 19 | total_steps = 0 20 | rew_ep = [] 21 | for episode in range(episodes): 22 | steps = 0 23 | obs = env.reset() 24 | episode_reward = 0 25 | while True: 26 | env.render() 27 | action = dqn.epsilon_greedy(obs) 28 | obs_,reward,terminate,_ = env.step(action) 29 | 30 | #smaller the theta angle and closer to center then better should be the reward 31 | x, vel, angle, ang_vel = obs_ 32 | r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8 33 | r2 = (env.theta_threshold_radians - abs(angle))/env.theta_threshold_radians - 0.5 34 | reward_ = r1 + r2 35 | 36 | dqn.store_experience(obs,action,reward_,obs_) 37 | if total_steps > 1000: 38 | dqn.fit() 39 | episode_reward+=reward 40 | if terminate: 41 | break 42 | obs = obs_ 43 | total_steps+=1 44 | steps+=1 45 | print("Episode {} with Reward : {} at epsilon {} in steps {}".format(episode+1,episode_reward,dqn.epsilon,steps)) 46 | rew_ep.append(episode_reward) 47 | print("Mean over last 100 episodes are: ",np.mean(rew_ep[50:])) 48 | 49 | while True: #to hold the render at the last step when Car passes the flag 50 | env.render() 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /Chapter05/Deep_Q_Network_Mountain_Car.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import gym 6 | 7 | class DQN: 8 | def __init__(self,learning_rate,gamma,n_features,n_actions,epsilon,parameter_changing_pointer,memory_size): 9 | 10 | self.learning_rate = learning_rate 11 | self.gamma = gamma 12 | self.n_features = n_features 13 | self.n_actions = n_actions 14 | self.epsilon = epsilon 15 | self.batch_size = 100 16 | self.experience_counter = 0 17 | self.experience_limit = memory_size 18 | self.replace_target_pointer = parameter_changing_pointer 19 | self.learning_counter = 0 20 | self.memory = np.zeros([self.experience_limit,self.n_features*2+2]) #for experience replay 21 | 22 | self.build_networks() 23 | p_params = tf.get_collection('primary_network_parameters') 24 | t_params = tf.get_collection('target_network_parameters') 25 | self.replacing_target_parameters = [tf.assign(t,p) for t,p in zip(t_params,p_params)] 26 | 27 | self.sess = tf.Session() 28 | self.sess.run(tf.global_variables_initializer()) 29 | 30 | 31 | def build_networks(self): 32 | #primary network 33 | hidden_units = 10 34 | self.s = tf.placeholder(tf.float32,[None,self.n_features]) 35 | self.qtarget = tf.placeholder(tf.float32,[None,self.n_actions]) 36 | 37 | with tf.variable_scope('primary_network'): 38 | c = ['primary_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 39 | # first layer 40 | with tf.variable_scope('layer1'): 41 | w1 = tf.get_variable('w1', [self.n_features, hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 42 | dtype=tf.float32,collections=c) 43 | b1 = tf.get_variable('b1', [1, 
hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 44 | dtype=tf.float32,collections=c) 45 | l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1) 46 | 47 | # second layer 48 | with tf.variable_scope('layer2'): 49 | w2 = tf.get_variable('w2', [hidden_units, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 50 | dtype=tf.float32,collections=c) 51 | b2 = tf.get_variable('b2', [1, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 52 | dtype=tf.float32,collections=c) 53 | self.qeval = tf.matmul(l1, w2) + b2 54 | 55 | 56 | with tf.variable_scope('loss'): 57 | self.loss = tf.reduce_mean(tf.squared_difference(self.qtarget,self.qeval)) 58 | 59 | with tf.variable_scope('optimiser'): 60 | self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 61 | 62 | #target network 63 | self.st = tf.placeholder(tf.float32,[None,self.n_features]) 64 | 65 | with tf.variable_scope('target_network'): 66 | c = ['target_network_parameters', tf.GraphKeys.GLOBAL_VARIABLES] 67 | # first layer 68 | with tf.variable_scope('layer1'): 69 | w1 = tf.get_variable('w1', [self.n_features, hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 70 | dtype=tf.float32,collections=c) 71 | b1 = tf.get_variable('b1', [1, hidden_units],initializer=tf.contrib.layers.xavier_initializer(), 72 | dtype=tf.float32,collections=c) 73 | l1 = tf.nn.relu(tf.matmul(self.st, w1) + b1) 74 | 75 | # second layer 76 | with tf.variable_scope('layer2'): 77 | w2 = tf.get_variable('w2', [hidden_units, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 78 | dtype=tf.float32,collections=c) 79 | b2 = tf.get_variable('b2', [1, self.n_actions],initializer=tf.contrib.layers.xavier_initializer(), 80 | dtype=tf.float32,collections=c) 81 | self.qt = tf.matmul(l1, w2) + b2 82 | 83 | def target_params_replaced(self): 84 | self.sess.run(self.replacing_target_parameters) 85 | 86 | def store_experience(self,obs,a,r,obs_): 87 | index = self.experience_counter % self.experience_limit 88 | self.memory[index,:] = np.hstack((obs,[a,r],obs_)) 89 | self.experience_counter+=1 90 | 91 | def fit(self): 92 | # sample batch memory from all memory 93 | if self.experience_counter < self.experience_limit: 94 | indices = np.random.choice(self.experience_counter, size=self.batch_size) 95 | else: 96 | indices = np.random.choice(self.experience_limit, size=self.batch_size) 97 | 98 | batch = self.memory[indices,:] 99 | qt,qeval = self.sess.run([self.qt,self.qeval],feed_dict={self.st:batch[:,-self.n_features:],self.s:batch[:,:self.n_features]}) 100 | 101 | qtarget = qeval.copy() 102 | batch_indices = np.arange(self.batch_size, dtype=np.int32) 103 | actions = self.memory[indices,self.n_features].astype(int) 104 | rewards = self.memory[indices,self.n_features+1] 105 | qtarget[batch_indices,actions] = rewards + self.gamma * np.max(qt,axis=1) 106 | 107 | _ = self.sess.run(self.train,feed_dict = {self.s:batch[:,:self.n_features],self.qtarget:qtarget}) 108 | 109 | #increasing epsilon 110 | if self.epsilon < 0.9: 111 | self.epsilon += 0.0002 112 | 113 | #replacing target network parameters with primary network parameters 114 | if self.learning_counter % self.replace_target_pointer == 0: 115 | self.target_params_replaced() 116 | print("target parameters changed") 117 | 118 | self.learning_counter += 1 119 | 120 | def epsilon_greedy(self,obs): 121 | #epsilon greedy implementation to choose action 122 | if np.random.uniform(low=0,high=1) < self.epsilon: 123 | return 
np.argmax(self.sess.run(self.qeval,feed_dict={self.s:obs[np.newaxis,:]})) 124 | else: 125 | return np.random.choice(self.n_actions) 126 | 127 | 128 | 129 | if __name__ == "__main__": 130 | env = gym.make('MountainCar-v0') 131 | env = env.unwrapped 132 | dqn = DQN(learning_rate=0.001,gamma=0.9,n_features=env.observation_space.shape[0],n_actions=env.action_space.n,epsilon=0.0,parameter_changing_pointer=500,memory_size=5000) 133 | 134 | episodes = 10 135 | total_steps = 0 136 | 137 | for episode in range(episodes): 138 | steps = 0 139 | obs = env.reset() 140 | episode_reward = 0 141 | while True: 142 | env.render() 143 | action = dqn.epsilon_greedy(obs) 144 | obs_,reward,terminate,_ = env.step(action) 145 | reward = abs(obs_[0]+0.5) 146 | dqn.store_experience(obs,action,reward,obs_) 147 | if total_steps > 1000: 148 | dqn.fit() 149 | episode_reward+=reward 150 | if terminate: 151 | break 152 | obs = obs_ 153 | total_steps+=1 154 | steps+=1 155 | print("Episode {} with Reward : {} at epsilon {} in steps {}".format(episode+1,episode_reward,dqn.epsilon,steps)) 156 | 157 | while True: #to hold the render at the last step when Car passes the flag 158 | env.render() 159 | 160 | -------------------------------------------------------------------------------- /Chapter05/Deep_Q_Network_Mountain_Car.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Reinforcement-Learning-with-TensorFlow/abb2873f892dc5232c3c5352e165de37e6f2a6fe/Chapter05/Deep_Q_Network_Mountain_Car.pyc -------------------------------------------------------------------------------- /Chapter05/MountainCar_SARSA.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | #Q-Learning example using OpenAI gym MountainCar environment 5 | 6 | import gym 7 | import numpy as np 8 | 9 | 10 | n_states = 40 11 | episodes = 10 #number of episodes 12 | 13 | initial_lr = 1.0 # Learning rate 14 | min_lr = 0.005 15 | gamma = 0.99 16 | max_steps = 300 17 | epsilon = 0.05 18 | 19 | 20 | #exploring Mountain Car environment 21 | 22 | env_name = 'MountainCar-v0' 23 | env = gym.make(env_name) 24 | 25 | 26 | #One-dimensional discrete action space. 27 | #left,neutral,right 28 | print("Action Set size :",env.action_space) 29 | 30 | 31 | #Two-dimensional continuous state space. 32 | #Velocity=(-0.07,0.07) 33 | #Position=(-1.2,0.6) 34 | print("Observation set shape :",env.observation_space) # 2 (position,velocity) 35 | print("Highest state feature value :",env.observation_space.high) # i.e. 
(position = 0.6, velocity = 0.07) 36 | print("Lowest state feature value:",env.observation_space.low) #(position = -1.2, velocity = -0.07) 37 | print(env.observation_space.shape) # 2 38 | 39 | 40 | 41 | #Discretization of continuous state space : Converting continuous state space observation to a discrete set of state space 42 | 43 | def discretization(env, obs): 44 | 45 | env_low = env.observation_space.low 46 | env_high = env.observation_space.high 47 | 48 | env_den = (env_high - env_low) / n_states 49 | pos_den = env_den[0] 50 | vel_den = env_den[1] 51 | 52 | pos_high = env_high[0] 53 | pos_low = env_low[0] 54 | vel_high = env_high[1] 55 | vel_low = env_low[1] 56 | 57 | pos_scaled = int((obs[0] - pos_low)/pos_den) 58 | vel_scaled = int((obs[1] - vel_low)/vel_den) 59 | 60 | return pos_scaled,vel_scaled 61 | 62 | env = env.unwrapped 63 | env.seed(0) 64 | np.random.seed(0) 65 | #Q table 66 | #rows are states but here state is 2-D pos,vel 67 | #columns are actions 68 | #therefore, Q- table would be 3-D 69 | 70 | q_table = np.zeros((n_states,n_states,env.action_space.n)) 71 | total_steps = 0 72 | for episode in range(episodes): 73 | obs = env.reset() 74 | total_reward = 0 75 | # decreasing learning rate alpha over time 76 | alpha = max(min_lr,initial_lr*(gamma**(episode//100))) 77 | steps = 0 78 | 79 | #action for the initial state using epsilon greedy 80 | if np.random.uniform(low=0,high=1) < epsilon: 81 | a = np.random.choice(env.action_space.n) 82 | else: 83 | pos,vel = discretization(env,obs) 84 | a = np.argmax(q_table[pos][vel]) 85 | 86 | while True: 87 | env.render() 88 | pos,vel = discretization(env,obs) 89 | 90 | obs,reward,terminate,_ = env.step(a) 91 | total_reward += abs(obs[0]+0.5) 92 | pos_,vel_ = discretization(env,obs) 93 | 94 | #action for the next state using epsilon greedy 95 | if np.random.uniform(low=0,high=1) < epsilon: 96 | a_ = np.random.choice(env.action_space.n) 97 | else: 98 | a_ = np.argmax(q_table[pos_][vel_]) 99 | 100 | #q-table update 101 | q_table[pos][vel][a] = (1-alpha)*q_table[pos][vel][a] + alpha*(reward+gamma*q_table[pos_][vel_][a_]) 102 | steps+=1 103 | if terminate: 104 | break 105 | a = a_ 106 | print("Episode {} completed with total reward {} in {} steps".format(episode+1,total_reward,steps)) 107 | 108 | while True: #to hold the render at the last step when Car passes the flag 109 | env.render() 110 | -------------------------------------------------------------------------------- /Chapter05/Mountain_Car_Problem_QLearning.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | #Q-Learning example using OpenAI gym MountainCar environment 5 | 6 | import gym 7 | import numpy as np 8 | 9 | 10 | n_states = 40 11 | episodes = 10 #number of episodes 12 | 13 | initial_lr = 1.0 # Learning rate 14 | min_lr = 0.005 15 | gamma = 0.99 16 | max_steps = 300 17 | epsilon = 0.05 18 | 19 | 20 | #exploring Mountain Car environment 21 | 22 | env_name = 'MountainCar-v0' 23 | env = gym.make(env_name) 24 | 25 | 26 | #One-dimensional discrete action space. 27 | #left,neutral,right 28 | print("Action Set size :",env.action_space) 29 | 30 | 31 | #Two-dimensional continuous state space. 32 | #Velocity=(-0.07,0.07) 33 | #Position=(-1.2,0.6) 34 | print("Observation set shape :",env.observation_space) # 2 (position,velocity) 35 | print("Highest state feature value :",env.observation_space.high) # i.e. 
(position = 0.6, velocity = 0.07) 36 | print("Lowest state feature value:",env.observation_space.low) #(position = -1.2, velocity = -0.07) 37 | print(env.observation_space.shape) # 2 38 | 39 | 40 | 41 | #Discretization of continuous state space : Converting continuous state space observation to a discrete set of state space 42 | 43 | def discretization(env, obs): 44 | env_low = env.observation_space.low 45 | env_high = env.observation_space.high 46 | 47 | env_den = (env_high - env_low) / n_states 48 | pos_den = env_den[0] 49 | vel_den = env_den[1] 50 | 51 | pos_high = env_high[0] 52 | pos_low = env_low[0] 53 | vel_high = env_high[1] 54 | vel_low = env_low[1] 55 | 56 | pos_scaled = int((obs[0] - pos_low)/pos_den) 57 | vel_scaled = int((obs[1] - vel_low)/vel_den) 58 | 59 | return pos_scaled,vel_scaled 60 | 61 | env = env.unwrapped 62 | env.seed(0) 63 | np.random.seed(0) 64 | #Q table 65 | #rows are states but here state is 2-D pos,vel 66 | #columns are actions 67 | #therefore, Q- table would be 3-D 68 | 69 | q_table = np.zeros((n_states,n_states,env.action_space.n)) 70 | total_steps = 0 71 | for episode in range(episodes): 72 | obs = env.reset() 73 | total_reward = 0 74 | # decreasing learning rate alpha over time 75 | alpha = max(min_lr,initial_lr*(gamma**(episode//100))) 76 | steps = 0 77 | while True: 78 | env.render() 79 | pos,vel = discretization(env,obs) 80 | 81 | if np.random.uniform(low=0,high=1) < epsilon: 82 | a = np.random.choice(env.action_space.n) 83 | else: 84 | a = np.argmax(q_table[pos][vel]) 85 | #q_val = q_table[pos][vel] 86 | #logits = np.exp(q_val) 87 | #probabilities = logits/np.sum(logits) 88 | #a = np.random.choice(env.action_space.n,p=probabilities) 89 | 90 | obs,reward,terminate,_ = env.step(a) 91 | total_reward += abs(obs[0]+0.5)#reward 92 | 93 | #q-table update 94 | pos_,vel_ = discretization(env,obs) 95 | q_table[pos][vel][a] = (1-alpha)*q_table[pos][vel][a] + alpha*(reward+gamma*np.max(q_table[pos_][vel_])) 96 | steps+=1 97 | if terminate: 98 | break 99 | print("Episode {} completed with total reward {} in {} steps".format(episode+1,total_reward,steps)) 100 | 101 | while True: #to hold the render at the last step when Car passes the flag 102 | env.render() 103 | -------------------------------------------------------------------------------- /Chapter06/A3C_Pong.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import multiprocessing 4 | import threading 5 | import tensorflow as tf 6 | import numpy as np 7 | import gym 8 | import os 9 | import shutil 10 | import matplotlib.pyplot as plt 11 | 12 | def preprocessing_image(obs): #where I is the single frame of the game as the input 13 | """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """ 14 | #the values below have been precomputed through trail and error by OpenAI team members 15 | obs = obs[35:195] #cropping the image frame to an extent where it contains on the paddles and ball and area between them 16 | obs = obs[::2,::2,0] #downsample by the factor of 2 and take only the R of the RGB channel.Therefore, now 2D frame 17 | obs[obs==144] = 0 #erase background type 1 18 | obs[obs==109] = 0 #erase background type 2 19 | obs[obs!=0] = 1 #everything else(other than paddles and ball) set to 1 20 | return obs.astype('float').ravel() #flattening to 1D 21 | 22 | 23 | game_env = 'Pong-v0' 24 | num_workers = multiprocessing.cpu_count() 25 | max_global_episodes = 100000 26 | global_network_scope = 'globalnet' 27 | global_iteration_update = 20 28 | 
gamma = 0.9 29 | beta = 0.0001 30 | lr_actor = 0.0001 # learning rate for actor 31 | lr_critic = 0.0001 # learning rate for critic 32 | global_running_rate = [] 33 | global_episode = 0 34 | 35 | env = gym.make(game_env) 36 | 37 | num_actions = env.action_space.n 38 | 39 | 40 | tf.reset_default_graph() 41 | 42 | 43 | class ActorCriticNetwork(object): 44 | def __init__(self, scope, globalAC=None): 45 | if scope == global_network_scope: # get global network 46 | with tf.variable_scope(scope): 47 | self.s = tf.placeholder(tf.float32, [None,6400], 'state') 48 | self.a_params, self.c_params = self._build_net(scope)[-2:] 49 | else: # local net, calculate losses 50 | with tf.variable_scope(scope): 51 | self.s = tf.placeholder(tf.float32, [None,6400], 'state') 52 | self.a_his = tf.placeholder(tf.int32, [None,], 'action') 53 | self.v_target = tf.placeholder(tf.float32, [None, 1], 'target_vector') 54 | 55 | self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope) 56 | 57 | td = tf.subtract(self.v_target, self.v, name='temporal_difference_error') 58 | with tf.name_scope('critic_loss'): 59 | self.c_loss = tf.reduce_mean(tf.square(td)) 60 | 61 | with tf.name_scope('actor_loss'): 62 | log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, num_actions, dtype=tf.float32), axis=1, keep_dims=True) 63 | exp_v = log_prob * td 64 | entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5), 65 | axis=1, keep_dims=True) #exploration 66 | self.exp_v = beta * entropy + exp_v 67 | self.a_loss = tf.reduce_mean(-self.exp_v) 68 | 69 | with tf.name_scope('local_grad'): 70 | self.a_grads = tf.gradients(self.a_loss, self.a_params) 71 | self.c_grads = tf.gradients(self.c_loss, self.c_params) 72 | 73 | with tf.name_scope('sync'): 74 | with tf.name_scope('pull'): 75 | self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] 76 | self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] 77 | with tf.name_scope('push'): 78 | self.update_a_op = actor_train.apply_gradients(zip(self.a_grads, globalAC.a_params)) 79 | self.update_c_op = critic_train.apply_gradients(zip(self.c_grads, globalAC.c_params)) 80 | 81 | def _build_net(self, scope): 82 | w_init = tf.random_normal_initializer(0., .1) 83 | with tf.variable_scope('actor_network'): 84 | l_a = tf.layers.dense(self.s, 300, tf.nn.relu6, kernel_initializer=w_init, name='actor_layer') 85 | a_prob = tf.layers.dense(l_a, num_actions, tf.nn.softmax, kernel_initializer=w_init, name='ap') 86 | with tf.variable_scope('critic_network'): 87 | l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='critic_layer') 88 | v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value 89 | a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') 90 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') 91 | return a_prob, v, a_params, c_params 92 | 93 | def update_global(self, feed_dict): # run local 94 | session.run([self.update_a_op, self.update_c_op], feed_dict) # local gradient applied to global net 95 | 96 | def pull_global(self): # run local 97 | session.run([self.pull_a_params_op, self.pull_c_params_op]) 98 | 99 | def choose_action(self, s): # run local 100 | s = np.reshape(s,[-1]) 101 | prob_weights = session.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]}) 102 | action = np.random.choice(range(prob_weights.shape[1]),p=prob_weights.ravel()) # select action w.r.t 
the actions prob 103 | return action 104 | 105 | 106 | 107 | class Worker(object): 108 | def __init__(self, name, globalAC): 109 | self.env = gym.make(game_env).unwrapped 110 | self.name = name 111 | self.AC = ActorCriticNetwork(name, globalAC) 112 | 113 | def work(self): 114 | global global_running_rate, global_episode 115 | total_step = 1 116 | buffer_s, buffer_a, buffer_r = [], [], [] 117 | while not coordinator.should_stop() and global_episode < max_global_episodes: 118 | obs = self.env.reset() 119 | s = preprocessing_image(obs) 120 | ep_r = 0 121 | while True: 122 | if self.name == 'W_0': 123 | self.env.render() 124 | a = self.AC.choose_action(s) 125 | 126 | #print(a.shape) 127 | 128 | obs_, r, done, info = self.env.step(a) 129 | s_ = preprocessing_image(obs_) 130 | if done and r<=0: 131 | r = -20 132 | ep_r += r 133 | buffer_s.append(np.reshape(s,[-1])) 134 | buffer_a.append(a) 135 | buffer_r.append(r) 136 | 137 | if total_step % global_iteration_update == 0 or done: # update global and assign to local net 138 | if done: 139 | v_s_ = 0 # terminal 140 | else: 141 | s_ = np.reshape(s_,[-1]) 142 | v_s_ = session.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] 143 | buffer_v_target = [] 144 | for r in buffer_r[::-1]: # reverse buffer r 145 | v_s_ = r + gamma * v_s_ 146 | buffer_v_target.append(v_s_) 147 | buffer_v_target.reverse() 148 | 149 | buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target) 150 | feed_dict = { 151 | self.AC.s: buffer_s, 152 | self.AC.a_his: buffer_a, 153 | self.AC.v_target: buffer_v_target, 154 | } 155 | self.AC.update_global(feed_dict) 156 | 157 | buffer_s, buffer_a, buffer_r = [], [], [] 158 | self.AC.pull_global() 159 | 160 | s = s_ 161 | total_step += 1 162 | if done: 163 | if len(global_running_rate) == 0: # record running episode reward 164 | global_running_rate.append(ep_r) 165 | else: 166 | global_running_rate.append(0.99 * global_running_rate[-1] + 0.01 * ep_r) 167 | print( 168 | self.name, 169 | "Ep:", global_episode, 170 | "| Ep_r: %i" % global_running_rate[-1], 171 | ) 172 | global_episode += 1 173 | break 174 | 175 | 176 | if __name__ == "__main__": 177 | session = tf.Session() 178 | 179 | with tf.device("/cpu:0"): 180 | actor_train = tf.train.RMSPropOptimizer(lr_actor, name='RMSPropOptimiserActor') 181 | critic_train = tf.train.RMSPropOptimizer(lr_critic, name='RMSPropOptimiserCritic') 182 | acn_global = ActorCriticNetwork(global_network_scope) # we only need its params 183 | workers = [] 184 | # Create worker 185 | for i in range(num_workers): 186 | i_name = 'W_%i' % i # worker name 187 | workers.append(Worker(i_name, acn_global)) 188 | 189 | coordinator = tf.train.Coordinator() 190 | session.run(tf.global_variables_initializer()) 191 | 192 | worker_threads = [] 193 | for worker in workers: 194 | job = lambda: worker.work() 195 | t = threading.Thread(target=job) 196 | t.start() 197 | worker_threads.append(t) 198 | coordinator.join(worker_threads) 199 | 200 | plt.plot(np.arange(len(global_running_rate)), global_running_rate) 201 | plt.xlabel('step') 202 | plt.ylabel('Total moving reward') 203 | plt.show() 204 | 205 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to 
deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Reinforcement Learning with Tensorflow 5 | This is the code repository for [Reinforcement Learning with Tensorflow](https://www.packtpub.com/big-data-and-business-intelligence/reinforcement-learning-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781788835725), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | Reinforcement Learning (RL) allows you to develop smart, quick, self-learning systems in your business environment. It is an effective way to train your learning agents and to solve a variety of problems in Artificial Intelligence, from games, self-driving cars, and robots to enterprise applications that range from data center energy saving (cooling data centers) to smart warehousing solutions. 8 | 9 | The book covers the major advancements and successes achieved in deep reinforcement learning by combining deep neural network architectures with reinforcement learning. It introduces readers to the concept of Reinforcement Learning, its advantages, and why it is gaining so much popularity. It also discusses MDPs, Monte Carlo tree search, dynamic programming methods such as policy and value iteration, and temporal-difference learning methods such as Q-learning and SARSA. You will use TensorFlow and OpenAI Gym to build simple neural network models that learn from their own actions. You will also see how reinforcement learning algorithms play a role in games, image processing, and NLP. 10 | ## Instructions and Navigation 11 | All of the code is organized into folders, one folder per chapter. For example, Chapter02.
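As a quick taste of the temporal-difference methods mentioned above, the sketch below (an illustration, not taken from the chapter code) contrasts the tabular Q-learning and SARSA updates that Chapter05 implements for MountainCar; the Q-table sizes and the transition values are made up for the example:

```
import numpy as np

alpha, gamma = 0.5, 0.9              # learning rate and discount factor
Q_qlearning = np.zeros((16, 4))      # toy Q-tables: 16 states x 4 actions
Q_sarsa = np.zeros((16, 4))
s, a, r, s_, a_ = 0, 1, -1.0, 4, 2   # one made-up transition (s, a, r, s', a')

# Q-learning (off-policy): bootstrap from the greedy action in s'
Q_qlearning[s, a] += alpha * (r + gamma * np.max(Q_qlearning[s_]) - Q_qlearning[s, a])

# SARSA (on-policy): bootstrap from the action a' actually chosen in s'
Q_sarsa[s, a] += alpha * (r + gamma * Q_sarsa[s_, a_] - Q_sarsa[s, a])
```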
12 | 13 | 14 | 15 | The code will look like the following: 16 | ``` 17 | def discretization(env, obs): 18 | env_low = env.observation_space.low 19 | env_high = env.observation_space.high 20 | ``` 21 | 22 | The following are the requirements to get the most out of this book: 23 | * Python and TensorFlow 24 | * Linear algebra as a prerequisite for neural networks 25 | * Installation bundle: Python, TensorFlow, and OpenAI gym (shown in Chapter 1, Deep Learning – Architectures and Frameworks and Chapter 2, Training Reinforcement Learning Agents Using OpenAI Gym) 26 | 27 | ## Related Products 28 | * [Deep Learning with TensorFlow - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/deep-learning-tensorflow-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788831109) 29 | 30 | * [TensorFlow: Powerful Predictive Analytics with TensorFlow](https://www.packtpub.com/big-data-and-business-intelligence/tensorflow-powerful-predictive-analytics-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781789136913) 31 | 32 | * [Hands-On Deep Learning with TensorFlow](https://www.packtpub.com/big-data-and-business-intelligence/hands-deep-learning-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781787282773) 33 | 34 | ### Download a free PDF 35 | 36 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
37 | https://packt.link/free-ebook/9781788835725
-------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get install pip 2 | sudo pip install --upgrade pip 3 | sudo pip install numpy 4 | sudo pip install pandas 5 | sudo pip install tensorflow 6 | sudo pip install gym 7 | --------------------------------------------------------------------------------
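A quick way to confirm that the packages installed by requirements.sh work together is to import them and run a single environment step (a minimal sketch, assuming the Gym 0.x API used by the chapter scripts):

```
import gym
import numpy as np
import tensorflow as tf

print("numpy", np.__version__, "| tensorflow", tf.__version__, "| gym", gym.__version__)
env = gym.make('FrozenLake-v0')                            # any environment used in the chapters
s = env.reset()
s_, r, done, info = env.step(env.action_space.sample())   # one random step (old 4-tuple step API)
print("rollout ok:", s, "->", s_, "| reward", r, "| done", done)
```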