├── Acrobot.gif
├── LICENSE
├── MountainCar-v0-video.mp4
├── MountainCar.gif
├── RBF.py
├── README.md
├── ezgif.com-video-cutter.mp4
├── lspi.py
├── plot.py
├── policy.py
├── rbf.py
├── replay_memory.py
├── run.py
└── video0.mp4
/Acrobot.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/Acrobot.gif
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 Regis Jean-Pierre Boudinot (selfup)
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
--------------------------------------------------------------------------------
/MountainCar-v0-video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/MountainCar-v0-video.mp4
--------------------------------------------------------------------------------
/MountainCar.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/MountainCar.gif
--------------------------------------------------------------------------------
/RBF.py:
--------------------------------------------------------------------------------
1 | from scipy import *
2 | from scipy.linalg import norm, pinv
3 | 
4 | import numpy as np
5 | 
6 | 
7 | class BasisFunction:
8 |     def __init__(self, indim, numCenters, outdim):
9 |         self.indim = indim
10 |         self.outdim = outdim
11 |         self.numCenters = numCenters
12 |         self.centers = [np.random.uniform(-1, 1, indim) for i in xrange(numCenters)]
13 |         print "Centers", self.centers
14 |         self.beta = 8
15 |         self.W = np.random.random((self.numCenters, self.outdim))
16 | 
17 |     def _basisfunc(self, c, d):
18 |         assert len(d) == self.indim
19 |         norm_1 = (c - d) / ((c ** 2) + (d ** 2)) ** 0.5   # normalized difference (0.5, not 1/2, to avoid integer division)
20 |         print (c - d), (norm_1)
21 |         return np.exp(-self.beta * ((c - d)[0] ** 2))
22 | 
23 |     # compute the basis function (Gaussian) for each sample and each center
24 |     def _calcAct(self, X):
25 |         # calculate activations of RBFs
26 |         G = np.zeros((X.shape[0], self.numCenters), float)
27 |         for ci, c in enumerate(self.centers):
28 |             for xi, x in enumerate(X):
29 |                 G[xi, ci] = self._basisfunc(c, x)
30 |         return G
31 | 
32 |     def train(self, X, Y):
33 |         """ X: matrix of dimensions n x indim
34 |             Y: column vector of dimension n x 1 """
35 | 
36 |         # choose random center vectors from the training set
37 |         rnd_idx = np.random.permutation(X.shape[0])[:self.numCenters]
38 |         self.centers = [X[i, :] for i in rnd_idx]
39 | 
40 |         #print "center", self.centers
41 |         # calculate activations of RBFs
42 |         G = self._calcAct(X)
43 |         #print G
44 | 
45 | 
46 |         # calculate output weights via the Moore-Penrose pseudoinverse (least-squares solution)
47 |         pseudo_inverse = pinv(G)
48 |         self.W = np.dot(pseudo_inverse, Y)
49 | 
50 |     def test(self, X):
51 |         """ X: matrix of dimensions n x indim """
52 | 
53 |         G = self._calcAct(X)
54 |         Y = np.dot(G, self.W)
55 |         return Y
56 | 
57 | 
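# --- Usage sketch (illustrative addition): fit the RBF network to 1-D data,
# --- in the spirit of the 1-D example in run.py. Shapes follow the
# --- train()/test() docstrings above (X: n x indim, Y: n x 1).
if __name__ == '__main__':
    n = 200
    X = np.linspace(-1, 1, n).reshape(n, 1)      # n x indim inputs
    Y = np.sin(3 * (X + 0.5) ** 3 - 50)          # n x 1 targets
    rbf = BasisFunction(indim=1, numCenters=13, outdim=1)
    rbf.train(X, Y)                              # pick centers from the data, fit W via pinv
    Y_hat = rbf.test(X)                          # RBF reconstruction of the targets
    print "max abs error:", np.abs(Y_hat - Y).max()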
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LSPI: Least-Squares Policy Iteration
2 | 
3 | Least-Squares Policy Iteration (LSPI) is a model-free, off-policy reinforcement learning algorithm:
4 | 
5 | https://www2.cs.duke.edu/research/AI/LSPI/nips01.pdf
6 | 
7 | The goal of the algorithm is to learn a good control policy from sampled transitions alone, by repeatedly fitting a least-squares approximation of the state-action value function of the current policy and then acting greedily with respect to it.
8 | 
9 | LSPI is model-free and uses the results of LSQ to form an approximate policy iteration algorithm.
10 | This algorithm combines the policy search efficiency of policy iteration with the data efficiency of LSTD.
11 | 
12 | Since LSPI uses LSQ to compute approximate Q-functions, it can use any data source for samples.
13 | A single set of samples may be used for the entire optimization, or additional samples may be acquired,
14 | either through trajectories or some other scheme, for each iteration of policy iteration.
15 | 
16 | LSQ: learning the state-action value function.
17 | LSPI uses LSQ to compute the approximate Q-function of each policy it evaluates.
18 | 
19 | 
20 | 
21 | - Solving the Acrobot env with LSPI
22 | 
23 | ![](Acrobot.gif)
24 | 
25 | 
26 | - Solving the MountainCar-v0 env with LSPI
27 | 
28 | ![](MountainCar.gif)
29 | 
30 | 
31 | ## TODO
32 | - Weighted importance sampling for off-policy learning
--------------------------------------------------------------------------------
/ezgif.com-video-cutter.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/ezgif.com-video-cutter.mp4
--------------------------------------------------------------------------------
/lspi.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | from rbf import Basis_Function
4 | from policy import Policy
5 | 
6 | """
7 | An important property of LSPI is that it does not require an approximate policy representation.
8 | At each iteration a different policy is evaluated,
9 | and certain sets of basis functions may be more appropriate than others for representing
10 | the state-action value function for each of these policies.
11 | 
12 | Since LSPI approximates state-action value
13 | functions, it can use samples from any policy to estimate the state-action value function of
14 | another policy. This focuses attention more clearly on the issue of exploration, since any
15 | policy can be followed while collecting samples.
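As a sketch of the LSTDQ solve implemented below (notation loosely following the
paper linked in the README): with basis vector phi(s, a) and linear value function
Q(s, a) ~ phi(s, a)^T w, the weight vector w is obtained from the linear system
A w = b, accumulated over the collected samples (s_i, a_i, r_i, s'_i):

    A = sum_i  phi(s_i, a_i) * (phi(s_i, a_i) - gamma * phi(s'_i, pi(s'_i)))^T
    b = sum_i  phi(s_i, a_i) * r_i

where pi(s') is the greedy action of the policy being evaluated; train_parameter()
below builds exactly these A and b and returns w = A^-1 b.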
16 | """
17 | 
18 | 
19 | class LSPI:
20 | 
21 |     def __init__(self, num_actions=3, num_means=2, gamma=0.99):
22 | 
23 |         print num_actions, num_means
24 | 
25 |         self.basis_function = Basis_Function(num_means, num_means, num_actions, gamma)
26 |         num_basis = self.basis_function._num_basis()
27 | 
28 |         self.policy = Policy(self.basis_function, num_basis)
29 |         self.lstdq = LSTDQ(self.basis_function, gamma, self.policy)
30 | 
31 |         self.stop_criterium = 10**-5
32 |         self.gamma = gamma
33 | 
34 | 
35 | 
36 |     #def agent(self, sample, total_iterations):
37 | 
38 | 
39 |     def _act(self, state):
40 |         index = self.policy.get_actions(state)  # TODO: validation for random actions
41 |         action = self.policy.actions[index[0]]
42 |         return action
43 | 
44 | 
45 | 
46 |     def train(self, sample, total_iterations, w_importance_sampling=False):
47 | 
48 |         error = float('inf')
49 |         num_iteration = 0
50 |         epsilon = 0.001
51 | 
52 |         #print "policy weights", self.policy.weights
53 | 
54 |         while (epsilon * (1 - self.gamma) / self.gamma) < error and num_iteration < total_iterations:
55 | 
56 |             if w_importance_sampling:
57 |                 new_weights = self.lstdq.train_weight_parameter(sample,
58 |                                                                 self.policy,
59 |                                                                 self.basis_function)
60 |             else:
61 |                 new_weights = self.lstdq.train_parameter(sample,
62 |                                                          self.policy,
63 |                                                          self.basis_function)
64 | 
65 | 
66 |             error = np.linalg.norm((new_weights - self.policy.weights))  # difference between current policy and target policy
67 |             self.policy.theta_behavior = self.policy.weights
68 |             self.policy.weights = new_weights
69 |             #print "new weights", self.policy.weights
70 | 
71 | 
72 |             num_iteration += 1
73 | 
74 | 
75 |         return self.policy
76 | 
77 | 
78 |     def td_error(self, sample):
79 | 
80 |         states = sample[0]
81 |         actions = sample[1]
82 |         rewards = sample[2]
83 |         next_states = sample[3]
84 |         sample_size = len(states)
85 |         td_e = 0.0
86 | 
87 |         for i in range(sample_size):
88 | 
89 |             index = self.policy.get_actions(next_states[i])  # TODO: validation in case of more actions
90 |             action = self.policy.actions[index[0]]
91 | 
92 |             index = self.policy.get_actions(states[i])  # TODO: validation in case of more actions
93 |             act = self.policy.actions[index[0]]
94 | 
95 |             Vst = self.policy.q_value_function(next_states[i], action)
96 |             Vs = self.policy.q_value_function(states[i], act)
97 | 
98 |             td_e += ((rewards[i] + self.gamma * Vst) - Vs) ** 2
99 |             # td_e = (rewards[i] - Vs)**2
100 | 
101 |         print "td_error=", (td_e / float(sample_size))
102 | 
103 | 
104 |         # return (td_e/sample_size)
105 | 
106 | 
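# --- Usage sketch (illustrative; mirrors how run.py drives LSPI; the `memory`,
# --- `batch_size` and `state` names below are assumed to come from the caller) ---
#
#   agent = LSPI(num_actions=3, num_means=4, gamma=0.99)
#   sample = memory.select_sample(batch_size)         # [states, actions, rewards, next_states, done]
#   policy = agent.train(sample, total_iterations=20)
#   action = agent._act(state)                        # greedy action of the learned policy
#
# Note on conditioning: LSTDQ below seeds A with a small diagonal
# (np.fill_diagonal(A, .1)), which keeps the linear solve well posed; if A is
# still close to singular, np.linalg.pinv(A) (an SVD-based pseudoinverse) is a
# more robust drop-in for np.linalg.inv(A), as the closing comment of this file suggests.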
107 | class LSTDQ:
108 |     def __init__(self, basis_function, gamma, init_policy):
109 |         self.basis_function = basis_function
110 |         self.gamma = gamma
111 |         self.policy = init_policy
112 |         self.greedy = []
113 | 
114 | 
115 | 
116 | 
117 | 
118 |     def train_parameter(self, sample, policy, basis_function):
119 |         r""" Compute the Q-value function of the current policy
120 |         in order to obtain the greedy policy
121 |         """
122 |         k = basis_function._num_basis()
123 | 
124 |         A = np.zeros([k, k])
125 |         b = np.zeros([k, 1])
126 |         np.fill_diagonal(A, .1)  # small ridge term so A stays invertible
127 | 
128 |         states = sample[0]
129 |         actions = sample[1]
130 |         rewards = sample[2]
131 |         next_states = sample[3]
132 | 
133 |         for i in range(len(states)):
134 | 
135 |             # take the action the greedy target policy would choose in the next state
136 | 
137 |             index = policy.get_actions(next_states[i])  # TODO: validation in case of more actions
138 |             action = policy.actions[index[0]]
139 | 
140 | 
141 |             phi = self.basis_function.evaluate(states[i], actions[i])
142 |             phi_next = self.basis_function.evaluate(next_states[i], action)
143 | 
144 |             loss = (phi - self.gamma * phi_next)
145 |             # reshape phi to a column and loss to a row, so that
146 |             # A accumulates phi * (phi - gamma * phi_next)^T and b accumulates phi * r
147 |             phi = np.resize(phi, [k, 1])
148 |             loss = np.resize(loss, [1, len(loss)])
149 | 
150 |             A = A + np.dot(phi, loss)
151 |             b = b + (phi * rewards[i])
152 | 
153 |             #A = A + np.dot(loss.transpose(), loss)
154 |             #b = b + (loss.transpose() * rewards[i])
155 | 
156 |         inv_A = np.linalg.inv(A)
157 | 
158 |         theta = np.dot(inv_A, b)
159 | 
160 | 
161 |         return theta
162 | 
163 |     def train_weight_parameter(self, sample, policy, basis_function):
164 |         r""" Compute the Q-value function of the current policy
165 |         in order to obtain the greedy policy; here each transition
166 |         is weighted by a normalized importance ratio """
167 | 
168 |         k = basis_function._num_basis()
169 |         A = np.zeros([k, k])
170 |         b = np.zeros([k, 1])
171 |         np.fill_diagonal(A, .1)
172 | 
173 |         states = sample[0]
174 |         actions = sample[1]
175 |         rewards = sample[2]
176 |         next_states = sample[3]
177 |         sample_size = len(states)
178 | 
179 |         self.greedy = np.zeros_like(actions)
180 |         self.greedy = np.reshape(self.greedy, [1, len(actions)])
181 |         self.greedy = self.greedy[0]
182 | 
183 | 
184 |         sum_W = 0.0
185 |         W = 1.0
186 |         for i in range(sample_size):
187 | 
188 |             act = policy.get_actions(states[i])
189 |             prob_target = policy.q_value_function(states[i], act[0])
190 |             prob_behavior = policy.behavior(states[i], actions[i])
191 | 
192 | 
193 |             #exp = (i - sample_size)
194 |             if prob_behavior == 0.0:
195 |                 W = 0
196 |             else:
197 | 
198 |                 W = (prob_target / prob_behavior)
199 |             sum_W = sum_W + W
200 | 
201 | 
202 |         for i in range(sample_size):
203 |             # take the action from the greedy target policy
204 | 
205 |             index = policy.get_actions(next_states[i])
206 |             action = policy.actions[index[0]]
207 | 
208 |             phi = self.basis_function.evaluate(states[i], actions[i])
209 |             phi_next = self.basis_function.evaluate(next_states[i], action)
210 | 
211 |             act = policy.get_actions(states[i])
212 | 
213 |             prob_target = policy.q_value_function(states[i], act[0])
214 |             prob_behavior = policy.behavior(states[i], actions[i])
215 | 
216 |             self.greedy[i] = act[0]
217 | 
218 | 
219 |             #print "prob target", prob_target, "behavior", prob_behavior
220 | 
221 |             exp = (i - sample_size)
222 | 
223 |             norm_W = (prob_target / prob_behavior) / sum_W
224 | 
225 | 
226 |             # importance weighting of the whole transition
227 | 
228 |             loss = norm_W * (phi - self.gamma * phi_next)
229 |             #print "norm W", norm_W
230 | 
231 | 
232 |             phi = np.resize(phi, [k, 1])
233 |             loss = np.resize(loss, [1, len(loss)])
234 | 
235 |             A = A + np.dot(phi, loss)
236 |             b = b + (phi * rewards[i])
237 |             #print "b=", (phi * rewards[i])
238 |             #print "b_norm=", norm_W * (phi * rewards[i])
239 | 
240 | 
241 |         inv_A = np.linalg.inv(A)
242 | 
243 |         theta = np.dot(inv_A, b)
244 |         policy.theta_behavior = policy.weights
245 | 
246 |         #print "actions=", np.reshape(actions, [1, len(actions)])
247 |         #print "greedy=", self.greedy, "\n"
248 | 
249 | 
250 |         return theta
251 | """
252 | LSTD can run into trouble with a singular matrix A.
253 | 
254 | One way to avoid this problem is to initialize the A matrix
255 | as a small multiple of the identity matrix.
256 | For a robust inversion, use the singular value decomposition (pseudoinverse).
257 | 
258 | Learn the value function Q.
259 | """
--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | import numpy as np
4 | 
5 | class Plot:
6 | 
7 |     def plot_reward(self, x, y1, y2):
8 | 
9 |         plt.ylim(-220, 100)
10 | 
11 |         plt.plot(x, y1, 'bo-', linewidth=2.5, linestyle="-", label="LSPI-model-based LSTDQ")
12 |         plt.plot(x, y2, 'ro-', linewidth=2.5, linestyle="-", label="LSPI-IS")
plt.legend(loc='upper left') 14 | 15 | plt.show() 16 | 17 | 18 | 19 | 20 | 21 | def plot(self): 22 | x = np.linspace(0, 30, 30) 23 | y = np.cos(x / 6 * np.pi) + np.sin(x / 3 * np.pi) 24 | 25 | error = np.random.rand(len(y)) * 2 26 | y += np.random.normal(0, 0.1, size=y.shape) 27 | print np.random.normal(0, 0.1, size=y.shape) 28 | print "\n", np.random.rand(len(y)) * 2 29 | plt.plot(x, y, 'k', color='#CC4F1B') # color='#3F7F4C')color="#4682b4" 30 | plt.fill_between(x, y - error, y + error, 31 | edgecolor='#3F7F4C', facecolor='#7EFF99', linewidth=1, 32 | ) 33 | plt.show() 34 | 35 | def plot_function(self,x,y,z,rbf): 36 | # plot original data 37 | plt.figure(figsize=(12, 8)) 38 | plt.plot(x, y, 'k-') 39 | 40 | # plot learned model 41 | plt.plot(x, z, 'r-', linewidth=2) 42 | 43 | # plot rbfs 44 | #plt.plot(rbf.centers, np.zeros(rbf.numCenters), 'gs') 45 | 46 | for c in rbf.centers: 47 | # RF prediction lines 48 | cx = np.arange(c - 0.7, c + 0.7, 0.01) 49 | cy = [rbf._basisfunc(np.array([cx_]), np.array([c])) for cx_ in cx] 50 | # print "-----",cx.shape,len(cy)," " 51 | 52 | #plt.plot(cx, cy, '-', color='gray', linewidth=0.2) 53 | 54 | # print "\n",cx, cy 55 | plt.plot(cx, cy, '-', color='gray', linewidth=0.2) 56 | plt.xlim(-1.2, 1.2) 57 | # print "plottt" 58 | plt.show() 59 | -------------------------------------------------------------------------------- /policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | from rbf import Basis_Function 5 | 6 | 7 | class Policy: 8 | 9 | def __init__(self,basis, num_theta, theta=None ): 10 | self.basis_function=basis 11 | self.actions = [0, 1, 2] 12 | 13 | self.num_theta=num_theta 14 | 15 | # uniform distribution of the actions 16 | 17 | 18 | self.theta_behavior= theta 19 | 20 | if theta is None: 21 | self.weights = np.random.uniform(-1.0, 1.0, size=(num_theta,)) 22 | 23 | else: 24 | self.weights=theta 25 | 26 | def set_theta(self, theta): 27 | self.weights = (self.weights+theta)*0*5 28 | 29 | 30 | def behavior(self,state,action): 31 | prob=0.0 32 | if self.theta_behavior is None: 33 | self.theta_behavior = np.random.uniform(-1.0, 1.0, size=(self.num_theta,)) 34 | 35 | 36 | vector_basis = self.basis_function.evaluate(state, action) 37 | return np.dot(vector_basis, self.theta_behavior) 38 | 39 | 40 | 41 | 42 | def q_value_function(self, state, action ): 43 | vector_basis = self.basis_function.evaluate(state,action) 44 | return np.dot(vector_basis,self.weights) 45 | 46 | def get_actions(self, state): 47 | 48 | 49 | q_state_action=[self.q_value_function(state,self.actions[i]) for i in range(len(self.actions))] 50 | q_state_action = np.reshape(q_state_action,[len(q_state_action),1])# convert to column vector 51 | 52 | index = np.argmax(q_state_action) 53 | q_max = q_state_action[index] 54 | 55 | 56 | best_actions = [self.actions[index]] 57 | ind =[index] 58 | 59 | for i in range(len(q_state_action)): 60 | if q_state_action[i]==q_max and index!=i: 61 | best_actions.append(self.actions[i]) 62 | ind.append(i) 63 | 64 | 65 | 66 | return best_actions 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /rbf.py: -------------------------------------------------------------------------------- 1 | from scipy import * 2 | from scipy.linalg import norm, pinv 3 | 4 | import numpy as np 5 | 6 | 7 | class BasisFunction: 8 | def __init__(self, indim, numCenters, outdim): 9 | self.indim = indim 10 | 
self.outdim = outdim 11 | self.numCenters = numCenters 12 | self.centers = [np.random.uniform(-1, 1, indim) for i in xrange(numCenters)] 13 | print "Centers",self.centers 14 | self.beta = 8 15 | self.W = np.random.random((self.numCenters, self.outdim)) 16 | 17 | def _basisfunc(self, c, d): 18 | assert len(d) == self.indim 19 | norm_1 = (c-d)/((c**2)+(d**2))**(1/2) 20 | print (c-d),(norm_1) 21 | return np.exp(-self.beta *( (c-d)[0]** 2)) 22 | 23 | # Berechnen de basis FUnction for each sample and gaussion 24 | def _calcAct(self, X): 25 | # calculate activations of RBFs 26 | G = np.zeros((X.shape[0], self.numCenters), float) 27 | for ci, c in enumerate(self.centers): 28 | for xi, x in enumerate(X): 29 | G[xi, ci] = self._basisfunc(c, x) 30 | return G 31 | 32 | def train(self, X, Y): 33 | """ X: matrix of dimensions n x indim 34 | y: column vector of dimension n x 1 """ 35 | 36 | # choose random center vectors from training set 37 | rnd_idx = np.random.permutation(X.shape[0])[:self.numCenters] 38 | self.centers = [X[i, :] for i in rnd_idx] 39 | 40 | #print "center", self.centers 41 | # calculate activations of RBFs 42 | G = self._calcAct(X) 43 | #print G 44 | 45 | 46 | # calculate output weights (pseudoinverse) 47 | Maximun_likelihood= pinv(G) #pseudoinverse 48 | self.W = np.dot(Maximun_likelihood, Y) 49 | 50 | def test(self, X): 51 | """ X: matrix of dimensions n x indim """ 52 | 53 | G = self._calcAct(X) 54 | Y = np.dot(G, self.W) 55 | return Y 56 | 57 | -------------------------------------------------------------------------------- /replay_memory.py: -------------------------------------------------------------------------------- 1 | from random import sample as random 2 | import collections as memory 3 | import numpy as np 4 | import random 5 | 6 | alpha=0.7 7 | beta=0.5 8 | 9 | 10 | from collections import defaultdict 11 | 12 | class Memory: 13 | # This is unsere MDP model 14 | #TODO: contruct the MDP chain of state 15 | 16 | def __init__(self,MemorySize, batch_size, act_dim,obs_dim): 17 | self.Memorysize = MemorySize 18 | #self.batch_size = batch_size 19 | self.container= memory.deque() 20 | self.containerSize = 0 21 | self.priority=1 22 | self.act_dim=act_dim 23 | self.obs_dim=obs_dim 24 | 25 | 26 | 27 | def get_size(self): 28 | return self.batch_size 29 | 30 | def size(self): 31 | return self.containerSize 32 | 33 | def select_batch(self, batchSize): 34 | return random.sample(self.container, batchSize) 35 | 36 | def add(self, experience): 37 | 38 | experience.append(self.priority) 39 | 40 | if self.containerSize < self.Memorysize: 41 | self.container.append(experience) 42 | self.containerSize = self.containerSize+1 43 | 44 | 45 | else: 46 | self.container.popleft() 47 | self.container.append(experience) 48 | 49 | 50 | def transform_sample(self,sample,batch_size): 51 | 52 | 53 | 54 | 55 | obs_dim=self.obs_dim 56 | act_dim = self.act_dim 57 | 58 | current_state= [x[0] for x in sample] 59 | actions = np.asarray([x[1] for x in sample]) 60 | rewards = [x[2] for x in sample] 61 | next_state= [x[3] for x in sample] 62 | done = [x[4] for x in sample] 63 | 64 | 65 | current_state = np.resize(current_state,[batch_size,obs_dim]) 66 | actions = np.resize(actions, [batch_size, act_dim]) 67 | rewards = np.resize(rewards, [batch_size, act_dim]) 68 | next_state = np.resize(next_state, [batch_size, obs_dim]) 69 | done = np.resize(done, [batch_size, act_dim]) 70 | 71 | 72 | return [current_state,actions,rewards,next_state,done] 73 | 74 | def select_sample(self,batch_size): 75 | #print "container 
size",self.containerSize 76 | sample = random.sample(self.container, batch_size) 77 | return self.transform_sample(sample,batch_size) 78 | 79 | def clear_memory(self): 80 | self.container = memory.deque() 81 | self.containerSize=0 82 | self.num_experiences = 0 83 | 84 | 85 | def important_sampling(self, batch_size,policy): 86 | current_state, actions, rewards, next_state, done\ 87 | = self.select_sample(batch_size) 88 | discount_factor=0.8 89 | 90 | G = 0.0 91 | W = 1.0 92 | C = np.zeros(3) 93 | 94 | for i in range(batch_size): 95 | G =+ discount_factor*rewards[i] 96 | C+=W 97 | q_state_action=policy.q_value_function(current_state[i],actions[i]) 98 | new_q = (W/C)*(G - q_state_action) 99 | W = W * 1. / behavior_policy(state)[action] 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as pl 2 | 3 | 4 | from replay_memory import Memory 5 | from gym.monitoring.video_recorder import VideoRecorder 6 | from policy import Policy 7 | from rbf import Basis_Function 8 | from lspi import LSPI, LSTDQ 9 | import gym 10 | import scipy 11 | import numpy as np 12 | 13 | 14 | 15 | import matplotlib.pyplot as plt 16 | from collections import defaultdict 17 | 18 | TRANSITION =15000 19 | EPISODE = 1000 20 | BATCH_SIZE = 20 21 | MEMORY_SIZE=TRANSITION+1000 22 | 23 | 24 | important_sampling = None 25 | lspi_interation = 20 26 | num_actions = 3 27 | num_means = 4 28 | gamma = 0.99 29 | 30 | mean_reward1=[] 31 | mean_reward2=[] 32 | 33 | def test_policy(policy, env, state, agent): 34 | 35 | print ("Test") 36 | total_reward = 0.0 37 | state = env.reset() 38 | 39 | for j in range(1): 40 | state = env.reset() 41 | for i in range(15000): 42 | env.render() 43 | index = policy.get_actions(state) # TODO: valication for random actions 44 | #action = policy.actions[index[0]] # todo. 
take just one action 45 | action=agent._act(state) 46 | next_state, reward, done, info = env.step(action) 47 | state = next_state 48 | total_reward += gamma * reward 49 | Best_policy=0 50 | 51 | if done: 52 | print ("Done",policy.weights) 53 | Best_policy = agent.policy 54 | print ("Done", total_reward) 55 | break 56 | 57 | return total_reward, Best_policy 58 | 59 | def _initial_sample2(env, memory, agent ): 60 | 61 | state = env.reset() 62 | ##action = env.action_space.sample() 63 | total_reward = -4000 64 | best_reward=-4000 65 | Best_agent=None 66 | found=False 67 | best_theta=False 68 | 69 | for j in range(EPISODE): 70 | 71 | state = env.reset() 72 | best_theta = False 73 | for i in range(TRANSITION): 74 | # env.render() 75 | #action = agent._act(state) 76 | if best_reward >= total_reward and found==False: 77 | action = env.action_space.sample() 78 | else: 79 | agent = Best_agent 80 | best_theta = True 81 | action = agent._act(state) 82 | next_state, reward, done, info = env.step(action) 83 | memory.add([state, action, reward, next_state, done]) 84 | state = next_state 85 | if done: 86 | print "done interation=",i 87 | break 88 | if j>0: 89 | if done: 90 | sample = memory.select_sample(j) 91 | else: 92 | sample = memory.select_sample(TRANSITION) 93 | 94 | policy = agent.train(sample, lspi_interation, important_sampling) 95 | total_reward, policy_test = test_policy(policy, env, state, agent) 96 | if best_reward < total_reward: 97 | Best_agent = agent 98 | best_reward = total_reward 99 | total_reward = -4950.0 100 | #found=True 101 | print "TEST---",j 102 | print "total_reward",total_reward 103 | if best_theta: 104 | memory.clear_memory() 105 | 106 | memory.clear_memory() 107 | 108 | return mean_reward1 109 | 110 | def _reuse_sample2(env, memory, agent ): 111 | state = env.reset() 112 | total_reward = 0.0 113 | important_sampling = False 114 | reward = 0.0 115 | 116 | for j in range(EPISODE): 117 | print ("episode", j, "/", EPISODE) 118 | state = env.reset() 119 | total_reward = 0.0 120 | policy=[] 121 | video_recorder = None 122 | video_recorder = VideoRecorder(env, path="/home/yeimy/Documents/LSPI/Video/video"+str(j)+".mp4",enabled=True) 123 | for i in range(TRANSITION): 124 | 125 | if i50: 247 | print ("episode-", j) 248 | #env.render() 249 | return policy 250 | 251 | def _reuse_sample(env, memory): 252 | total_reward = 0.0 253 | lspi = LSPI() 254 | policy = lspi.policy 255 | state = env.reset() 256 | env.unwrapped.render() 257 | for j in range(EPISODE): 258 | print "episode-",j,"/",EPISODE ,j/float(EPISODE) 259 | 260 | for i in range(TRANSITION): 261 | #env.render() 262 | if memory.containerSize < BATCH_SIZE: 263 | 264 | index = policy.get_actions(state) # TODO: validation for random actions 265 | action = policy.actions[index[0]] 266 | 267 | else: 268 | 269 | sample = memory.select_sample(BATCH_SIZE) # [current_state, actions, rewards, next_state, done] 270 | policy = lspi.train(sample, lspi_interation, important_sampling) 271 | index = policy.get_actions(state) 272 | action = policy.actions[index[0]] 273 | memory.clear_memory() 274 | 275 | next_state, reward, done, info = env.step(action) 276 | memory.add([state, action, reward, next_state, done]) 277 | state = next_state 278 | 279 | if done: 280 | print ("done") 281 | state = env.reset() 282 | break 283 | 284 | mean_reward1.append(total_reward) 285 | total_reward = 0.0 286 | 287 | # R=test_policy(policy, env,state) 288 | # mean_reward.append([np.mean(R),np.max(R),np.min(R)]) 289 | if j>10: 290 | print ("episode-", j) 291 | #R = 
test_policy(policy, env, state)
292 | 
293 | def example_RBF():
294 |     # ----- 1D RBF regression example ------------------------------------------------
295 |     from rbf import BasisFunction    # the plain RBF network defined in rbf.py
296 |     from plot import Plot            # plot_function() lives on the Plot class
297 |     rbf = BasisFunction(1, 13, 1)    # indim=1, numCenters=13, outdim=1
298 |     n = 500
299 | 
300 |     x = np.mgrid[-1:1:complex(0, n)].reshape(n, 1)
301 |     #print "vector", np.mgrid[-1:1:complex(0, n)]
302 |     # set y and add random noise
303 |     y = np.sin(3 * (x + 0.5) ** 3 - 50)
304 |     # y += np.random.normal(0, 0.1, y.shape)
305 | 
306 |     # rbf regression
307 | 
308 |     rbf.train(x, y)
309 |     z = rbf.test(x)
310 |     Plot().plot_function(x, y, z, rbf)
311 | 
312 | 
313 | if __name__ == '__main__':
314 |     main()
315 | 
316 | 
317 | 
318 | 
319 | 
320 | 
321 | 
--------------------------------------------------------------------------------
/video0.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusme/LSPI/0e06dff817ac494b1aa37b9c176726069814614c/video0.mp4
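A note on the README TODO (weighted importance sampling for off-policy learning):
train_weight_parameter() in lspi.py normalizes per-transition ratios before
weighting the LSTDQ statistics. A minimal, self-contained sketch of those
normalized weights is below; the callables target_prob and behavior_prob are
assumed to return action probabilities under the target and behavior policies,
which the current Policy class only approximates with Q-values.

import numpy as np

def normalized_importance_weights(states, actions, target_prob, behavior_prob):
    """w_i = rho_i / sum_j rho_j, with rho_i = pi_target(a_i|s_i) / pi_behavior(a_i|s_i)."""
    rho = np.array([
        0.0 if behavior_prob(s, a) == 0.0 else target_prob(s, a) / behavior_prob(s, a)
        for s, a in zip(states, actions)
    ])
    total = rho.sum()
    return rho / total if total > 0 else rho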