├── Acrobot.gif
├── LICENSE
├── MountainCar-v0-video.mp4
├── MountainCar.gif
├── RBF.py
├── README.md
├── ezgif.com-video-cutter.mp4
├── lspi.py
├── plot.py
├── policy.py
├── rbf.py
├── replay_memory.py
├── run.py
└── video0.mp4
/Acrobot.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/Acrobot.gif
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 Regis Jean-Pierre Boudinot (selfup)
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
--------------------------------------------------------------------------------
/MountainCar-v0-video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/MountainCar-v0-video.mp4
--------------------------------------------------------------------------------
/MountainCar.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/MountainCar.gif
--------------------------------------------------------------------------------
/RBF.py:
--------------------------------------------------------------------------------
1 | from scipy import *
2 | from scipy.linalg import norm, pinv
3 | 
4 | import numpy as np
5 | 
6 | 
7 | class BasisFunction:
8 |     def __init__(self, indim, numCenters, outdim):
9 |         self.indim = indim
10 |         self.outdim = outdim
11 |         self.numCenters = numCenters
12 |         self.centers = [np.random.uniform(-1, 1, indim) for i in xrange(numCenters)]
13 |         print "Centers", self.centers
14 |         self.beta = 8
15 |         self.W = np.random.random((self.numCenters, self.outdim))
16 | 
17 |     def _basisfunc(self, c, d):
18 |         assert len(d) == self.indim
19 |         norm_1 = (c - d) / ((c ** 2) + (d ** 2)) ** 0.5   # normalized difference (0.5, not 1/2, to avoid integer division)
20 |         print (c - d), (norm_1)
21 |         return np.exp(-self.beta * ((c - d)[0] ** 2))
22 | 
23 |     # compute the basis function (Gaussian) for each sample and each center
24 |     def _calcAct(self, X):
25 |         # calculate activations of RBFs
26 |         G = np.zeros((X.shape[0], self.numCenters), float)
27 |         for ci, c in enumerate(self.centers):
28 |             for xi, x in enumerate(X):
29 |                 G[xi, ci] = self._basisfunc(c, x)
30 |         return G
31 | 
32 |     def train(self, X, Y):
33 |         """ X: matrix of dimensions n x indim
34 |             Y: column vector of dimension n x 1 """
35 | 
36 |         # choose random center vectors from the training set
37 |         rnd_idx = np.random.permutation(X.shape[0])[:self.numCenters]
38 |         self.centers = [X[i, :] for i in rnd_idx]
39 | 
40 |         #print "center", self.centers
41 |         # calculate activations of RBFs
42 |         G = self._calcAct(X)
43 |         #print G
44 | 
45 | 
46 |         # calculate output weights via the Moore-Penrose pseudoinverse (least-squares solution)
47 |         pseudo_inverse = pinv(G)
48 |         self.W = np.dot(pseudo_inverse, Y)
49 | 
50 |     def test(self, X):
51 |         """ X: matrix of dimensions n x indim """
52 | 
53 |         G = self._calcAct(X)
54 |         Y = np.dot(G, self.W)
55 |         return Y
56 | 
57 | 
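# --- Usage sketch (illustrative addition): fit the RBF network to 1-D data,
# --- in the spirit of the 1-D example in run.py. Shapes follow the
# --- train()/test() docstrings above (X: n x indim, Y: n x 1).
if __name__ == '__main__':
    n = 200
    X = np.linspace(-1, 1, n).reshape(n, 1)      # n x indim inputs
    Y = np.sin(3 * (X + 0.5) ** 3 - 50)          # n x 1 targets
    rbf = BasisFunction(indim=1, numCenters=13, outdim=1)
    rbf.train(X, Y)                              # pick centers from the data, fit W via pinv
    Y_hat = rbf.test(X)                          # RBF reconstruction of the targets
    print "max abs error:", np.abs(Y_hat - Y).max()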
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LSPI: Least-Squares Policy Iteration
2 | 
3 | Least-Squares Policy Iteration (LSPI) is a model-free, off-policy reinforcement learning algorithm:
4 | 
5 | https://www2.cs.duke.edu/research/AI/LSPI/nips01.pdf
6 | 
7 | The goal of the algorithm is to learn a good control policy from sampled transitions alone, by repeatedly fitting a least-squares approximation of the state-action value function of the current policy and then acting greedily with respect to it.
8 | 
9 | LSPI is model-free and uses the results of LSQ to form an approximate policy iteration algorithm.
10 | This algorithm combines the policy search efficiency of policy iteration with the data efficiency of LSTD.
11 | 
12 | Since LSPI uses LSQ to compute approximate Q-functions, it can use any data source for samples.
13 | A single set of samples may be used for the entire optimization, or additional samples may be acquired,
14 | either through trajectories or some other scheme, for each iteration of policy iteration.
15 | 
16 | LSQ: learning the state-action value function.
17 | LSPI uses LSQ to compute the approximate Q-function of each policy it evaluates.
18 | 
19 | 
20 | 
21 | - Solving the Acrobot env with LSPI
22 | 
23 | ![](Acrobot.gif)
24 | 
25 | 
26 | - Solving the MountainCar-v0 env with LSPI
27 | 
28 | ![](MountainCar.gif)
29 | 
30 | 
31 | ## TODO
32 | - Weighted importance sampling for off-policy learning
--------------------------------------------------------------------------------
/ezgif.com-video-cutter.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/ezgif.com-video-cutter.mp4
--------------------------------------------------------------------------------
/lspi.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | from rbf import Basis_Function
4 | from policy import Policy
5 | 
6 | """
7 | An important property of LSPI is that it does not require an approximate policy representation.
8 | At each iteration a different policy is evaluated,
9 | and certain sets of basis functions may be more appropriate than others for representing
10 | the state-action value function for each of these policies.
11 | 
12 | Since LSPI approximates state-action value
13 | functions, it can use samples from any policy to estimate the state-action value function of
14 | another policy. This focuses attention more clearly on the issue of exploration, since any
15 | policy can be followed while collecting samples.
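As a sketch of the LSTDQ solve implemented below (notation loosely following the
paper linked in the README): with basis vector phi(s, a) and linear value function
Q(s, a) ~ phi(s, a)^T w, the weight vector w is obtained from the linear system
A w = b, accumulated over the collected samples (s_i, a_i, r_i, s'_i):

    A = sum_i  phi(s_i, a_i) * (phi(s_i, a_i) - gamma * phi(s'_i, pi(s'_i)))^T
    b = sum_i  phi(s_i, a_i) * r_i

where pi(s') is the greedy action of the policy being evaluated; train_parameter()
below builds exactly these A and b and returns w = A^-1 b.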
16 | """
17 | 
18 | 
19 | class LSPI:
20 | 
21 |     def __init__(self, num_actions=3, num_means=2, gamma=0.99):
22 | 
23 |         print num_actions, num_means
24 | 
25 |         self.basis_function = Basis_Function(num_means, num_means, num_actions, gamma)
26 |         num_basis = self.basis_function._num_basis()
27 | 
28 |         self.policy = Policy(self.basis_function, num_basis)
29 |         self.lstdq = LSTDQ(self.basis_function, gamma, self.policy)
30 | 
31 |         self.stop_criterium = 10**-5
32 |         self.gamma = gamma
33 | 
34 | 
35 | 
36 |     #def agent(self, sample, total_iterations):
37 | 
38 | 
39 |     def _act(self, state):
40 |         index = self.policy.get_actions(state)  # TODO: validation for random actions
41 |         action = self.policy.actions[index[0]]
42 |         return action
43 | 
44 | 
45 | 
46 |     def train(self, sample, total_iterations, w_importance_sampling=False):
47 | 
48 |         error = float('inf')
49 |         num_iteration = 0
50 |         epsilon = 0.001
51 | 
52 |         #print "policy weights", self.policy.weights
53 | 
54 |         while (epsilon * (1 - self.gamma) / self.gamma) < error and num_iteration < total_iterations:
55 | 
56 |             if w_importance_sampling:
57 |                 new_weights = self.lstdq.train_weight_parameter(sample,
58 |                                                                 self.policy,
59 |                                                                 self.basis_function)
60 |             else:
61 |                 new_weights = self.lstdq.train_parameter(sample,
62 |                                                          self.policy,
63 |                                                          self.basis_function)
64 | 
65 | 
66 |             error = np.linalg.norm((new_weights - self.policy.weights))  # difference between current policy and target policy
67 |             self.policy.theta_behavior = self.policy.weights
68 |             self.policy.weights = new_weights
69 |             #print "new weights", self.policy.weights
70 | 
71 | 
72 |             num_iteration += 1
73 | 
74 | 
75 |         return self.policy
76 | 
77 | 
78 |     def td_error(self, sample):
79 | 
80 |         states = sample[0]
81 |         actions = sample[1]
82 |         rewards = sample[2]
83 |         next_states = sample[3]
84 |         sample_size = len(states)
85 |         td_e = 0.0
86 | 
87 |         for i in range(sample_size):
88 | 
89 |             index = self.policy.get_actions(next_states[i])  # TODO: validation in case of more actions
90 |             action = self.policy.actions[index[0]]
91 | 
92 |             index = self.policy.get_actions(states[i])  # TODO: validation in case of more actions
93 |             act = self.policy.actions[index[0]]
94 | 
95 |             Vst = self.policy.q_value_function(next_states[i], action)
96 |             Vs = self.policy.q_value_function(states[i], act)
97 | 
98 |             td_e += ((rewards[i] + self.gamma * Vst) - Vs) ** 2
99 |             # td_e = (rewards[i] - Vs)**2
100 | 
101 |         print "td_error=", (td_e / float(sample_size))
102 | 
103 | 
104 |         # return (td_e/sample_size)
105 | 
106 | 
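# --- Usage sketch (illustrative; mirrors how run.py drives LSPI; the `memory`,
# --- `batch_size` and `state` names below are assumed to come from the caller) ---
#
#   agent = LSPI(num_actions=3, num_means=4, gamma=0.99)
#   sample = memory.select_sample(batch_size)         # [states, actions, rewards, next_states, done]
#   policy = agent.train(sample, total_iterations=20)
#   action = agent._act(state)                        # greedy action of the learned policy
#
# Note on conditioning: LSTDQ below seeds A with a small diagonal
# (np.fill_diagonal(A, .1)), which keeps the linear solve well posed; if A is
# still close to singular, np.linalg.pinv(A) (an SVD-based pseudoinverse) is a
# more robust drop-in for np.linalg.inv(A), as the closing comment of this file suggests.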
107 | class LSTDQ:
108 |     def __init__(self, basis_function, gamma, init_policy):
109 |         self.basis_function = basis_function
110 |         self.gamma = gamma
111 |         self.policy = init_policy
112 |         self.greedy = []
113 | 
114 | 
115 | 
116 | 
117 | 
118 |     def train_parameter(self, sample, policy, basis_function):
119 |         r""" Compute the Q-value function of the current policy
120 |         in order to obtain the greedy policy
121 |         """
122 |         k = basis_function._num_basis()
123 | 
124 |         A = np.zeros([k, k])
125 |         b = np.zeros([k, 1])
126 |         np.fill_diagonal(A, .1)  # small ridge term so A stays invertible
127 | 
128 |         states = sample[0]
129 |         actions = sample[1]
130 |         rewards = sample[2]
131 |         next_states = sample[3]
132 | 
133 |         for i in range(len(states)):
134 | 
135 |             # take the action the greedy target policy would choose in the next state
136 | 
137 |             index = policy.get_actions(next_states[i])  # TODO: validation in case of more actions
138 |             action = policy.actions[index[0]]
139 | 
140 | 
141 |             phi = self.basis_function.evaluate(states[i], actions[i])
142 |             phi_next = self.basis_function.evaluate(next_states[i], action)
143 | 
144 |             loss = (phi - self.gamma * phi_next)
145 |             # reshape phi to a column and loss to a row, so that
146 |             # A accumulates phi * (phi - gamma * phi_next)^T and b accumulates phi * r
147 |             phi = np.resize(phi, [k, 1])
148 |             loss = np.resize(loss, [1, len(loss)])
149 | 
150 |             A = A + np.dot(phi, loss)
151 |             b = b + (phi * rewards[i])
152 | 
153 |             #A = A + np.dot(loss.transpose(), loss)
154 |             #b = b + (loss.transpose() * rewards[i])
155 | 
156 |         inv_A = np.linalg.inv(A)
157 | 
158 |         theta = np.dot(inv_A, b)
159 | 
160 | 
161 |         return theta
162 | 
163 |     def train_weight_parameter(self, sample, policy, basis_function):
164 |         r""" Compute the Q-value function of the current policy
165 |         in order to obtain the greedy policy; here each transition
166 |         is weighted by a normalized importance ratio """
167 | 
168 |         k = basis_function._num_basis()
169 |         A = np.zeros([k, k])
170 |         b = np.zeros([k, 1])
171 |         np.fill_diagonal(A, .1)
172 | 
173 |         states = sample[0]
174 |         actions = sample[1]
175 |         rewards = sample[2]
176 |         next_states = sample[3]
177 |         sample_size = len(states)
178 | 
179 |         self.greedy = np.zeros_like(actions)
180 |         self.greedy = np.reshape(self.greedy, [1, len(actions)])
181 |         self.greedy = self.greedy[0]
182 | 
183 | 
184 |         sum_W = 0.0
185 |         W = 1.0
186 |         for i in range(sample_size):
187 | 
188 |             act = policy.get_actions(states[i])
189 |             prob_target = policy.q_value_function(states[i], act[0])
190 |             prob_behavior = policy.behavior(states[i], actions[i])
191 | 
192 | 
193 |             #exp = (i - sample_size)
194 |             if prob_behavior == 0.0:
195 |                 W = 0
196 |             else:
197 | 
198 |                 W = (prob_target / prob_behavior)
199 |             sum_W = sum_W + W
200 | 
201 | 
202 |         for i in range(sample_size):
203 |             # take the action from the greedy target policy
204 | 
205 |             index = policy.get_actions(next_states[i])
206 |             action = policy.actions[index[0]]
207 | 
208 |             phi = self.basis_function.evaluate(states[i], actions[i])
209 |             phi_next = self.basis_function.evaluate(next_states[i], action)
210 | 
211 |             act = policy.get_actions(states[i])
212 | 
213 |             prob_target = policy.q_value_function(states[i], act[0])
214 |             prob_behavior = policy.behavior(states[i], actions[i])
215 | 
216 |             self.greedy[i] = act[0]
217 | 
218 | 
219 |             #print "prob target", prob_target, "behavior", prob_behavior
220 | 
221 |             exp = (i - sample_size)
222 | 
223 |             norm_W = (prob_target / prob_behavior) / sum_W
224 | 
225 | 
226 |             # importance weighting of the whole transition
227 | 
228 |             loss = norm_W * (phi - self.gamma * phi_next)
229 |             #print "norm W", norm_W
230 | 
231 | 
232 |             phi = np.resize(phi, [k, 1])
233 |             loss = np.resize(loss, [1, len(loss)])
234 | 
235 |             A = A + np.dot(phi, loss)
236 |             b = b + (phi * rewards[i])
237 |             #print "b=", (phi * rewards[i])
238 |             #print "b_norm=", norm_W * (phi * rewards[i])
239 | 
240 | 
241 |         inv_A = np.linalg.inv(A)
242 | 
243 |         theta = np.dot(inv_A, b)
244 |         policy.theta_behavior = policy.weights
245 | 
246 |         #print "actions=", np.reshape(actions, [1, len(actions)])
247 |         #print "greedy=", self.greedy, "\n"
248 | 
249 | 
250 |         return theta
251 | """
252 | LSTD can run into trouble with a singular matrix A.
253 | 
254 | One way to avoid this problem is to initialize the A matrix
255 | as a small multiple of the identity matrix.
256 | For a robust inversion, use the singular value decomposition (pseudoinverse).
257 | 
258 | Learn the value function Q.
259 | """
--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | import numpy as np
4 | 
5 | class Plot:
6 | 
7 |     def plot_reward(self, x, y1, y2):
8 | 
9 |         plt.ylim(-220, 100)
10 | 
11 |         plt.plot(x, y1, 'bo-', linewidth=2.5, linestyle="-", label="LSPI-model-based LSTDQ")
12 |         plt.plot(x, y2, 'ro-', linewidth=2.5, linestyle="-", label="LSPI-IS")
plt.legend(loc='upper left') 14 | 15 | plt.show() 16 | 17 | 18 | 19 | 20 | 21 | def plot(self): 22 | x = np.linspace(0, 30, 30) 23 | y = np.cos(x / 6 * np.pi) + np.sin(x / 3 * np.pi) 24 | 25 | error = np.random.rand(len(y)) * 2 26 | y += np.random.normal(0, 0.1, size=y.shape) 27 | print np.random.normal(0, 0.1, size=y.shape) 28 | print "\n", np.random.rand(len(y)) * 2 29 | plt.plot(x, y, 'k', color='#CC4F1B') # color='#3F7F4C')color="#4682b4" 30 | plt.fill_between(x, y - error, y + error, 31 | edgecolor='#3F7F4C', facecolor='#7EFF99', linewidth=1, 32 | ) 33 | plt.show() 34 | 35 | def plot_function(self,x,y,z,rbf): 36 | # plot original data 37 | plt.figure(figsize=(12, 8)) 38 | plt.plot(x, y, 'k-') 39 | 40 | # plot learned model 41 | plt.plot(x, z, 'r-', linewidth=2) 42 | 43 | # plot rbfs 44 | #plt.plot(rbf.centers, np.zeros(rbf.numCenters), 'gs') 45 | 46 | for c in rbf.centers: 47 | # RF prediction lines 48 | cx = np.arange(c - 0.7, c + 0.7, 0.01) 49 | cy = [rbf._basisfunc(np.array([cx_]), np.array([c])) for cx_ in cx] 50 | # print "-----",cx.shape,len(cy)," " 51 | 52 | #plt.plot(cx, cy, '-', color='gray', linewidth=0.2) 53 | 54 | # print "\n",cx, cy 55 | plt.plot(cx, cy, '-', color='gray', linewidth=0.2) 56 | plt.xlim(-1.2, 1.2) 57 | # print "plottt" 58 | plt.show() 59 | -------------------------------------------------------------------------------- /policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | from rbf import Basis_Function 5 | 6 | 7 | class Policy: 8 | 9 | def __init__(self,basis, num_theta, theta=None ): 10 | self.basis_function=basis 11 | self.actions = [0, 1, 2] 12 | 13 | self.num_theta=num_theta 14 | 15 | # uniform distribution of the actions 16 | 17 | 18 | self.theta_behavior= theta 19 | 20 | if theta is None: 21 | self.weights = np.random.uniform(-1.0, 1.0, size=(num_theta,)) 22 | 23 | else: 24 | self.weights=theta 25 | 26 | def set_theta(self, theta): 27 | self.weights = (self.weights+theta)*0*5 28 | 29 | 30 | def behavior(self,state,action): 31 | prob=0.0 32 | if self.theta_behavior is None: 33 | self.theta_behavior = np.random.uniform(-1.0, 1.0, size=(self.num_theta,)) 34 | 35 | 36 | vector_basis = self.basis_function.evaluate(state, action) 37 | return np.dot(vector_basis, self.theta_behavior) 38 | 39 | 40 | 41 | 42 | def q_value_function(self, state, action ): 43 | vector_basis = self.basis_function.evaluate(state,action) 44 | return np.dot(vector_basis,self.weights) 45 | 46 | def get_actions(self, state): 47 | 48 | 49 | q_state_action=[self.q_value_function(state,self.actions[i]) for i in range(len(self.actions))] 50 | q_state_action = np.reshape(q_state_action,[len(q_state_action),1])# convert to column vector 51 | 52 | index = np.argmax(q_state_action) 53 | q_max = q_state_action[index] 54 | 55 | 56 | best_actions = [self.actions[index]] 57 | ind =[index] 58 | 59 | for i in range(len(q_state_action)): 60 | if q_state_action[i]==q_max and index!=i: 61 | best_actions.append(self.actions[i]) 62 | ind.append(i) 63 | 64 | 65 | 66 | return best_actions 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /rbf.py: -------------------------------------------------------------------------------- 1 | from scipy import * 2 | from scipy.linalg import norm, pinv 3 | 4 | import numpy as np 5 | 6 | 7 | class BasisFunction: 8 | def __init__(self, indim, numCenters, outdim): 9 | self.indim = indim 10 | 
self.outdim = outdim 11 | self.numCenters = numCenters 12 | self.centers = [np.random.uniform(-1, 1, indim) for i in xrange(numCenters)] 13 | print "Centers",self.centers 14 | self.beta = 8 15 | self.W = np.random.random((self.numCenters, self.outdim)) 16 | 17 | def _basisfunc(self, c, d): 18 | assert len(d) == self.indim 19 | norm_1 = (c-d)/((c**2)+(d**2))**(1/2) 20 | print (c-d),(norm_1) 21 | return np.exp(-self.beta *( (c-d)[0]** 2)) 22 | 23 | # Berechnen de basis FUnction for each sample and gaussion 24 | def _calcAct(self, X): 25 | # calculate activations of RBFs 26 | G = np.zeros((X.shape[0], self.numCenters), float) 27 | for ci, c in enumerate(self.centers): 28 | for xi, x in enumerate(X): 29 | G[xi, ci] = self._basisfunc(c, x) 30 | return G 31 | 32 | def train(self, X, Y): 33 | """ X: matrix of dimensions n x indim 34 | y: column vector of dimension n x 1 """ 35 | 36 | # choose random center vectors from training set 37 | rnd_idx = np.random.permutation(X.shape[0])[:self.numCenters] 38 | self.centers = [X[i, :] for i in rnd_idx] 39 | 40 | #print "center", self.centers 41 | # calculate activations of RBFs 42 | G = self._calcAct(X) 43 | #print G 44 | 45 | 46 | # calculate output weights (pseudoinverse) 47 | Maximun_likelihood= pinv(G) #pseudoinverse 48 | self.W = np.dot(Maximun_likelihood, Y) 49 | 50 | def test(self, X): 51 | """ X: matrix of dimensions n x indim """ 52 | 53 | G = self._calcAct(X) 54 | Y = np.dot(G, self.W) 55 | return Y 56 | 57 | -------------------------------------------------------------------------------- /replay_memory.py: -------------------------------------------------------------------------------- 1 | from random import sample as random 2 | import collections as memory 3 | import numpy as np 4 | import random 5 | 6 | alpha=0.7 7 | beta=0.5 8 | 9 | 10 | from collections import defaultdict 11 | 12 | class Memory: 13 | # This is unsere MDP model 14 | #TODO: contruct the MDP chain of state 15 | 16 | def __init__(self,MemorySize, batch_size, act_dim,obs_dim): 17 | self.Memorysize = MemorySize 18 | #self.batch_size = batch_size 19 | self.container= memory.deque() 20 | self.containerSize = 0 21 | self.priority=1 22 | self.act_dim=act_dim 23 | self.obs_dim=obs_dim 24 | 25 | 26 | 27 | def get_size(self): 28 | return self.batch_size 29 | 30 | def size(self): 31 | return self.containerSize 32 | 33 | def select_batch(self, batchSize): 34 | return random.sample(self.container, batchSize) 35 | 36 | def add(self, experience): 37 | 38 | experience.append(self.priority) 39 | 40 | if self.containerSize < self.Memorysize: 41 | self.container.append(experience) 42 | self.containerSize = self.containerSize+1 43 | 44 | 45 | else: 46 | self.container.popleft() 47 | self.container.append(experience) 48 | 49 | 50 | def transform_sample(self,sample,batch_size): 51 | 52 | 53 | 54 | 55 | obs_dim=self.obs_dim 56 | act_dim = self.act_dim 57 | 58 | current_state= [x[0] for x in sample] 59 | actions = np.asarray([x[1] for x in sample]) 60 | rewards = [x[2] for x in sample] 61 | next_state= [x[3] for x in sample] 62 | done = [x[4] for x in sample] 63 | 64 | 65 | current_state = np.resize(current_state,[batch_size,obs_dim]) 66 | actions = np.resize(actions, [batch_size, act_dim]) 67 | rewards = np.resize(rewards, [batch_size, act_dim]) 68 | next_state = np.resize(next_state, [batch_size, obs_dim]) 69 | done = np.resize(done, [batch_size, act_dim]) 70 | 71 | 72 | return [current_state,actions,rewards,next_state,done] 73 | 74 | def select_sample(self,batch_size): 75 | #print "container 
size",self.containerSize 76 | sample = random.sample(self.container, batch_size) 77 | return self.transform_sample(sample,batch_size) 78 | 79 | def clear_memory(self): 80 | self.container = memory.deque() 81 | self.containerSize=0 82 | self.num_experiences = 0 83 | 84 | 85 | def important_sampling(self, batch_size,policy): 86 | current_state, actions, rewards, next_state, done\ 87 | = self.select_sample(batch_size) 88 | discount_factor=0.8 89 | 90 | G = 0.0 91 | W = 1.0 92 | C = np.zeros(3) 93 | 94 | for i in range(batch_size): 95 | G =+ discount_factor*rewards[i] 96 | C+=W 97 | q_state_action=policy.q_value_function(current_state[i],actions[i]) 98 | new_q = (W/C)*(G - q_state_action) 99 | W = W * 1. / behavior_policy(state)[action] 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as pl 2 | 3 | 4 | from replay_memory import Memory 5 | from gym.monitoring.video_recorder import VideoRecorder 6 | from policy import Policy 7 | from rbf import Basis_Function 8 | from lspi import LSPI, LSTDQ 9 | import gym 10 | import scipy 11 | import numpy as np 12 | 13 | 14 | 15 | import matplotlib.pyplot as plt 16 | from collections import defaultdict 17 | 18 | TRANSITION =15000 19 | EPISODE = 1000 20 | BATCH_SIZE = 20 21 | MEMORY_SIZE=TRANSITION+1000 22 | 23 | 24 | important_sampling = None 25 | lspi_interation = 20 26 | num_actions = 3 27 | num_means = 4 28 | gamma = 0.99 29 | 30 | mean_reward1=[] 31 | mean_reward2=[] 32 | 33 | def test_policy(policy, env, state, agent): 34 | 35 | print ("Test") 36 | total_reward = 0.0 37 | state = env.reset() 38 | 39 | for j in range(1): 40 | state = env.reset() 41 | for i in range(15000): 42 | env.render() 43 | index = policy.get_actions(state) # TODO: valication for random actions 44 | #action = policy.actions[index[0]] # todo. 
take just one action 45 | action=agent._act(state) 46 | next_state, reward, done, info = env.step(action) 47 | state = next_state 48 | total_reward += gamma * reward 49 | Best_policy=0 50 | 51 | if done: 52 | print ("Done",policy.weights) 53 | Best_policy = agent.policy 54 | print ("Done", total_reward) 55 | break 56 | 57 | return total_reward, Best_policy 58 | 59 | def _initial_sample2(env, memory, agent ): 60 | 61 | state = env.reset() 62 | ##action = env.action_space.sample() 63 | total_reward = -4000 64 | best_reward=-4000 65 | Best_agent=None 66 | found=False 67 | best_theta=False 68 | 69 | for j in range(EPISODE): 70 | 71 | state = env.reset() 72 | best_theta = False 73 | for i in range(TRANSITION): 74 | # env.render() 75 | #action = agent._act(state) 76 | if best_reward >= total_reward and found==False: 77 | action = env.action_space.sample() 78 | else: 79 | agent = Best_agent 80 | best_theta = True 81 | action = agent._act(state) 82 | next_state, reward, done, info = env.step(action) 83 | memory.add([state, action, reward, next_state, done]) 84 | state = next_state 85 | if done: 86 | print "done interation=",i 87 | break 88 | if j>0: 89 | if done: 90 | sample = memory.select_sample(j) 91 | else: 92 | sample = memory.select_sample(TRANSITION) 93 | 94 | policy = agent.train(sample, lspi_interation, important_sampling) 95 | total_reward, policy_test = test_policy(policy, env, state, agent) 96 | if best_reward < total_reward: 97 | Best_agent = agent 98 | best_reward = total_reward 99 | total_reward = -4950.0 100 | #found=True 101 | print "TEST---",j 102 | print "total_reward",total_reward 103 | if best_theta: 104 | memory.clear_memory() 105 | 106 | memory.clear_memory() 107 | 108 | return mean_reward1 109 | 110 | def _reuse_sample2(env, memory, agent ): 111 | state = env.reset() 112 | total_reward = 0.0 113 | important_sampling = False 114 | reward = 0.0 115 | 116 | for j in range(EPISODE): 117 | print ("episode", j, "/", EPISODE) 118 | state = env.reset() 119 | total_reward = 0.0 120 | policy=[] 121 | video_recorder = None 122 | video_recorder = VideoRecorder(env, path="/home/yeimy/Documents/LSPI/Video/video"+str(j)+".mp4",enabled=True) 123 | for i in range(TRANSITION): 124 | 125 | if i50: 247 | print ("episode-", j) 248 | #env.render() 249 | return policy 250 | 251 | def _reuse_sample(env, memory): 252 | total_reward = 0.0 253 | lspi = LSPI() 254 | policy = lspi.policy 255 | state = env.reset() 256 | env.unwrapped.render() 257 | for j in range(EPISODE): 258 | print "episode-",j,"/",EPISODE ,j/float(EPISODE) 259 | 260 | for i in range(TRANSITION): 261 | #env.render() 262 | if memory.containerSize < BATCH_SIZE: 263 | 264 | index = policy.get_actions(state) # TODO: validation for random actions 265 | action = policy.actions[index[0]] 266 | 267 | else: 268 | 269 | sample = memory.select_sample(BATCH_SIZE) # [current_state, actions, rewards, next_state, done] 270 | policy = lspi.train(sample, lspi_interation, important_sampling) 271 | index = policy.get_actions(state) 272 | action = policy.actions[index[0]] 273 | memory.clear_memory() 274 | 275 | next_state, reward, done, info = env.step(action) 276 | memory.add([state, action, reward, next_state, done]) 277 | state = next_state 278 | 279 | if done: 280 | print ("done") 281 | state = env.reset() 282 | break 283 | 284 | mean_reward1.append(total_reward) 285 | total_reward = 0.0 286 | 287 | # R=test_policy(policy, env,state) 288 | # mean_reward.append([np.mean(R),np.max(R),np.min(R)]) 289 | if j>10: 290 | print ("episode-", j) 291 | #R = 
test_policy(policy, env, state)
292 | 
293 | def example_RBF():
294 |     # ----- 1D RBF regression example ------------------------------------------------
295 |     from rbf import BasisFunction    # the plain RBF network defined in rbf.py
296 |     from plot import Plot            # plot_function() lives on the Plot class
297 |     rbf = BasisFunction(1, 13, 1)    # indim=1, numCenters=13, outdim=1
298 |     n = 500
299 | 
300 |     x = np.mgrid[-1:1:complex(0, n)].reshape(n, 1)
301 |     #print "vector", np.mgrid[-1:1:complex(0, n)]
302 |     # set y and add random noise
303 |     y = np.sin(3 * (x + 0.5) ** 3 - 50)
304 |     # y += np.random.normal(0, 0.1, y.shape)
305 | 
306 |     # rbf regression
307 | 
308 |     rbf.train(x, y)
309 |     z = rbf.test(x)
310 |     Plot().plot_function(x, y, z, rbf)
311 | 
312 | 
313 | if __name__ == '__main__':
314 |     main()
315 | 
316 | 
317 | 
318 | 
319 | 
320 | 
321 | 
--------------------------------------------------------------------------------
/video0.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/video0.mp4
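A note on the README TODO (weighted importance sampling for off-policy learning):
train_weight_parameter() in lspi.py normalizes per-transition ratios before
weighting the LSTDQ statistics. A minimal, self-contained sketch of those
normalized weights is below; the callables target_prob and behavior_prob are
assumed to return action probabilities under the target and behavior policies,
which the current Policy class only approximates with Q-values.

import numpy as np

def normalized_importance_weights(states, actions, target_prob, behavior_prob):
    """w_i = rho_i / sum_j rho_j, with rho_i = pi_target(a_i|s_i) / pi_behavior(a_i|s_i)."""
    rho = np.array([
        0.0 if behavior_prob(s, a) == 0.0 else target_prob(s, a) / behavior_prob(s, a)
        for s, a in zip(states, actions)
    ])
    total = rho.sum()
    return rho / total if total > 0 else rho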