├── .DS_Store
├── Q-table-cliff.npz
├── Q-table-real-cliff.npz
├── README.md
├── exp5_1
│   ├── toytoy.py
│   └── toytoy2.py
├── exp5_1_py3
│   ├── rmse_all.py
│   ├── rmse_learner.py
│   ├── toytoy2_par_table.py
│   └── toytoy2_par_table_learner.py
├── exp5_2
│   ├── crif_walking_ope.py
│   └── cw_notebook_ver_splitting.ipynb
└── exp5_2_py3
    └── cw_notebook_ver_splitting_p3.ipynb

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalML/DoubleReinforcementLearningMDP/daae93f4de4d721c2663668e8ed187dbc7dfea25/.DS_Store
--------------------------------------------------------------------------------

/Q-table-cliff.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalML/DoubleReinforcementLearningMDP/daae93f4de4d721c2663668e8ed187dbc7dfea25/Q-table-cliff.npz
--------------------------------------------------------------------------------

/Q-table-real-cliff.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalML/DoubleReinforcementLearningMDP/daae93f4de4d721c2663668e8ed187dbc7dfea25/Q-table-real-cliff.npz
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | # DoubleReinforcementLearningMDP
2 | 
3 | This repository contains the code for replicating the experiments from the paper
4 | ### "Double Reinforcement Learning for Efficient Off-Policy Evaluation in Markov Decision Processes"
5 | - https://arxiv.org/abs/1908.08526
6 | 
7 | ## Experiments in Section 5.1
8 | 
9 | The relevant code is in the subdirectory `exp5_1`.
10 | * `toytoy.py` runs the experiment with the in-sample variant of the estimators.
11 | * `toytoy2.py` runs the experiment with the sample-splitting variant of the estimators.
12 | 
13 | For example, to run 10 parallel replications, one can run the command `seq 10 | xargs -L 1 -P 10 ./toytoy.sh` (a sketch of such a wrapper script is given below).
14 | 
15 | ## Experiments in Section 5.2
16 | 
17 | The relevant code is in the subdirectory `exp5_2`.
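Note on the Section 5.1 run command above: `toytoy.sh` is not included in the repository tree, so its contents are an assumption. A minimal sketch of such a wrapper, assuming it only forwards the replication index to `toytoy.py` (which reads a version tag as `argv[1]` and the number of trajectories `N` as `argv[2]`), with an illustrative `N` of 1500:

```sh
#!/bin/sh
# Hypothetical wrapper -- toytoy.sh is not shipped with the repository.
# $1 is the replication index piped in by `seq 10 | xargs -L 1 -P 10 ./toytoy.sh`.
# 1500 is an assumed sample size; toytoy.py reads ver_ = argv[1], N = argv[2].
# The exp5_1 scripts are Python 2 code, so a Python 2 interpreter is assumed here.
python2 toytoy.py "$1" 1500
```

Each replication then saves its estimator arrays as `.npz` files tagged with the version string and `N` in the working directory.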
18 | 19 | 20 | ## Matrix 21 | code -------------------------------------------------------------------------------- /exp5_1/toytoy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import sys 4 | import math 5 | 6 | 7 | 8 | 9 | 10 | def sigmoid(x): 11 | return(1.0/(1.0+np.exp(-0.1*x))) 12 | 13 | ### Both models are good 14 | 15 | beta = 0.2 16 | alpha = 0.9 17 | rep = 1500 18 | num = 0 19 | estimator_list_ipw = [] 20 | estimator_list_ipw2 = [] 21 | estimator_list_ipw3 = [] 22 | estimator_list_dm = [] 23 | estimator_list_dr = [] 24 | estimator_list_dr2 = [] 25 | estimator_list_dr3 = [] 26 | estimator_list_ipw2_ratio_mis = [] 27 | estimator_list_dr2_ratio_mis = [] 28 | estimator_list_dm_q_mis = [] 29 | estimator_list_dr_q_mis = [] 30 | estimator_list_dr2_q_mis = [] 31 | 32 | 33 | args = sys.argv 34 | ver_ = str(args[1]) 35 | N = np.int(args[2]) 36 | 37 | 38 | from sklearn import linear_model 39 | 40 | for iii in range(rep): 41 | 42 | T = 30 43 | 44 | r_list = np.zeros([N,T]) 45 | weight_list = np.zeros([N,T]) 46 | s_list = np.zeros([N,T]) 47 | a_list = np.zeros([N,T]) 48 | w_list = np.zeros([N,T]) 49 | w_list2 = np.zeros([N,T]) 50 | 51 | def behav_policy(s,i): 52 | a = beta*sigmoid(s)+(beta)*np.random.uniform(0.0,1.0) 53 | return(np.random.binomial(1,a,1)[0]) 54 | 55 | def eval_policy(s,i): 56 | a = alpha*sigmoid(s)+(1-alpha)*np.random.uniform(0.0,1.0) 57 | return(np.random.binomial(1,a,1)[0]) 58 | 59 | def behav_policy_dens(s,a,i): 60 | b = beta*sigmoid(s)+(beta)*0.5 61 | if a==1: 62 | return(b) 63 | else: 64 | return(1.0-b) 65 | 66 | def eval_policy_dens(s,a,i): 67 | b = alpha*sigmoid(s)+(1-alpha)*0.5 68 | if a==1: 69 | return(b) 70 | else: 71 | return(1.0-b) 72 | 73 | 74 | for i in range(N): 75 | for j in range(T): 76 | if j==0: 77 | s = np.random.normal(0.5,0.2) 78 | r = 0.0 79 | a =0.0 80 | w = 1.0 81 | else: 82 | s = np.random.normal(0.02*(j%2)+s*1.0-0.3*(a-0.5),0.2) 83 | a = behav_policy(s,j) 84 | w = eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j)*w 85 | r = np.random.normal(0.9*s+0.3*a-0.02*(j%2),0.2) 86 | r_list[i,j] = r 87 | s_list[i,j] = s 88 | a_list[i,j] = a 89 | w_list[i,j] = w 90 | w_list2[i,j]= eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j) 91 | 92 | ag_list = [] 93 | 94 | #### IPW estimator 95 | for i in range(N): 96 | ag_list.append(np.sum(r_list[i,]*w_list[i,])) 97 | estimator_list_ipw.append(np.mean(ag_list)) 98 | 99 | ########num = 0 100 | 101 | #### DM estimator 102 | bbb = range(T) 103 | reg_list = [] 104 | for j in bbb[::-1]: 105 | if j==(T-1): 106 | X = np.array([s_list[:,j],a_list[:,j]]) 107 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 108 | Y = r_list[:,j] 109 | else: 110 | X = np.array([s_list[:,j],a_list[:,j]]) 111 | aaa = [] 112 | for k in range(N): 113 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 114 | X0 = np.array([s_list[:,j+1],aaa]) 115 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 116 | reg = linear_model.LinearRegression() 117 | reg.fit(np.transpose(X), Y) 118 | ###print reg.score(np.transpose(X), Y) 119 | reg_list.append(reg) 120 | 121 | 122 | aaa = [] 123 | for i in range(N): 124 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 125 | X0 = np.array([s_list[:,0],aaa]) 126 | v0 = reg.predict(np.transpose(X0)) 127 | estimator_list_dm.append(np.mean(v0)) 128 | 129 | ### DR estiamtor under M_1 130 | dr = 0.0 131 | for t in range(T): 132 | dr = dr + np.mean(r_list[:,t]*w_list[:,t]) 133 | #### q function 134 | X = np.array([s_list[:,t],a_list[:,t]]) 135 | dr = dr - 
np.mean(reg_list[T-1-t].predict(np.transpose(X))*w_list[:,t]) 136 | #### v function 137 | aaa = [] 138 | for i in range(N): 139 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 140 | X0 = np.array([s_list[:,t],aaa]) 141 | if t==0: 142 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))) 143 | else: 144 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*w_list[:,t-1]) 145 | 146 | estimator_list_dr.append(dr) 147 | 148 | #### IPW estimator under M_2 149 | 150 | bbb = range(T) 151 | wreg_list = [] 152 | for j in bbb[::-1]: 153 | X = np.array([s_list[:,j],a_list[:,j]]) 154 | Y = w_list[:,j] 155 | reg = linear_model.LinearRegression() 156 | reg.fit(np.transpose(X),Y) 157 | ###print reg.score(np.transpose(X), Y) 158 | wreg_list.append(reg) 159 | 160 | ipw = 0.0 161 | for t in range(T): 162 | X = np.array([s_list[:,t],a_list[:,t]]) 163 | ipw = ipw + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list[:,t]) 164 | estimator_list_ipw2.append(ipw) 165 | 166 | ### DR estiamtor under M_2 167 | dr2 = 0.0 168 | for t in range(T): 169 | X = np.array([s_list[:,t],a_list[:,t]]) 170 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list[:,t]) 171 | #### q function 172 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X))*wreg_list[T-1-t].predict(np.transpose(X))) 173 | #### v function 174 | aaa = [] 175 | for i in range(N): 176 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 177 | X0 = np.array([s_list[:,t],aaa]) 178 | if t==0: 179 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 180 | else: 181 | X_ = np.array([s_list[:,t-1],a_list[:,t-1]]) 182 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 183 | estimator_list_dr2.append(dr2) 184 | 185 | #### Ratio-mis specified 186 | 187 | num = 2 188 | 189 | bbb = range(T) 190 | wreg_list_mis = [] 191 | for j in bbb[::-1]: 192 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 193 | Y = w_list[:,j] 194 | reg = linear_model.LinearRegression() 195 | reg.fit(np.transpose(X),Y) 196 | ###print reg.score(np.transpose(X), Y) 197 | wreg_list_mis.append(reg) 198 | 199 | ipw = 0.0 200 | for t in range(T): 201 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 202 | ipw = ipw + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X))*r_list[:,t]) 203 | estimator_list_ipw2_ratio_mis.append(ipw) 204 | 205 | dr2 = 0.0 206 | for t in range(T): 207 | X_w = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 208 | X_r = np.array([s_list[:,t],a_list[:,t]]) 209 | dr2 = dr2 + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 210 | #### q function 211 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X_r))*wreg_list_mis[T-1-t].predict(np.transpose(X_w))) 212 | #### v function 213 | aaa = [] 214 | for i in range(N): 215 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 216 | X0 = np.array([s_list[:,t],aaa]) 217 | if t==0: 218 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 219 | else: 220 | X_ = np.array([s_list[:,t-1]*s_list[:,t-1],a_list[:,t-1]]) 221 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list_mis[T-t].predict(np.transpose(X_))) 222 | estimator_list_dr2_ratio_mis.append(dr2) 223 | 224 | 225 | ### q-misspcified 226 | 227 | 228 | #### DM estimator 229 | bbb = range(T) 230 | reg_list_mis = [] 231 | for j in bbb[::-1]: 232 | if j==(T-1): 233 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 234 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 235 | Y = r_list[:,j] 236 | else: 237 | X = 
np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 238 | aaa = [] 239 | for k in range(N): 240 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 241 | X0 = np.array([s_list[:,j+1]*s_list[:,j+1],aaa]) 242 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 243 | reg = linear_model.LinearRegression() 244 | reg.fit(np.transpose(X), Y) 245 | ###print reg.score(np.transpose(X), Y) 246 | reg_list_mis.append(reg) 247 | 248 | 249 | aaa = [] 250 | for i in range(N): 251 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 252 | X0 = np.array([s_list[:,0],aaa]) 253 | v0 = reg.predict(np.transpose(X0)) 254 | estimator_list_dm_q_mis.append(np.mean(v0)) 255 | 256 | ### DR estiamtor under M_1 257 | dr = 0.0 258 | for t in range(T): 259 | dr = dr + np.mean(r_list[:,t]*w_list[:,t]) 260 | #### q function 261 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 262 | dr = dr - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X))*w_list[:,t]) 263 | #### v function 264 | aaa = [] 265 | for i in range(N): 266 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 267 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 268 | if t==0: 269 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))) 270 | else: 271 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*w_list[:,t-1]) 272 | 273 | estimator_list_dr_q_mis.append(dr) 274 | 275 | print iii 276 | 277 | 278 | ### DR estiamtor under M_2 279 | dr2 = 0.0 280 | for t in range(T): 281 | X_w = np.array([s_list[:,t],a_list[:,t]]) 282 | X_r = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 283 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 284 | #### q function 285 | dr2 = dr2 - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X_r))*wreg_list[T-1-t].predict(np.transpose(X_w))) 286 | #### v function 287 | aaa = [] 288 | for i in range(N): 289 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 290 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 291 | if t==0: 292 | dr2 = dr2 + np.mean(reg_list_mis[T-t-1].predict(np.transpose(X0))) 293 | else: 294 | X_ = np.array([s_list[:,t-1],a_list[:,t-1]]) 295 | dr2 = dr2 + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 296 | estimator_list_dr2_q_mis.append(dr2) 297 | 298 | 299 | np.savez("estimator_list_ipw_%d"+ver_+"_"+str(N),a=estimator_list_ipw) 300 | np.savez("estimator_list_dr_%d"+ver_+"_"+str(N), a=estimator_list_dr) 301 | np.savez("estimator_list_dm_%d"+ver_+"_"+str(N), a=estimator_list_dm) 302 | np.savez("estimator_list_ipw2_%d"+ver_+"_"+str(N),a=estimator_list_ipw2) 303 | np.savez("estimator_list_dr2_%d"+ver_+"_"+str(N),a=estimator_list_dr2) 304 | np.savez("estimator_list_ipw3_%d"+ver_+"_"+str(N),a=estimator_list_ipw3) 305 | np.savez("estimator_list_dr3_%d"+ver_+"_"+str(N),a=estimator_list_dr3) 306 | np.savez("estimator_list_ipw2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_ipw2_ratio_mis) 307 | np.savez("estimator_list_dr2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_ratio_mis) 308 | np.savez("estimator_list_dm_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dm_q_mis) 309 | np.savez("estimator_list_dr_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr_q_mis) 310 | np.savez("estimator_list_dr2_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_q_mis) 311 | 312 | 313 | 314 | -------------------------------------------------------------------------------- /exp5_1/toytoy2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import sys 4 | import math 5 | 6 | 7 | 8 | 9 | 10 | 
def sigmoid(x): 11 | return(1.0/(1.0+np.exp(-0.1*x))) 12 | 13 | ### Both models are good 14 | 15 | beta = 0.2 16 | alpha = 0.9 17 | rep = 1500 18 | num = 0 19 | estimator_list_ipw = [] 20 | estimator_list_ipw2 = [] 21 | estimator_list_ipw3 = [] 22 | estimator_list_dm = [] 23 | estimator_list_dr = [] 24 | estimator_list_dr2 = [] 25 | estimator_list_dr3 = [] 26 | estimator_list_ipw2_ratio_mis = [] 27 | estimator_list_dr2_ratio_mis = [] 28 | estimator_list_dm_q_mis = [] 29 | estimator_list_dr_q_mis = [] 30 | estimator_list_dr2_q_mis = [] 31 | 32 | 33 | args = sys.argv 34 | ver_ = str(args[1]) 35 | N = np.int(args[2]) 36 | 37 | 38 | 39 | from sklearn import linear_model 40 | 41 | for iii in range(rep): 42 | print iii 43 | T = 30 44 | 45 | r_list = np.zeros([N,T]) 46 | weight_list = np.zeros([N,T]) 47 | s_list = np.zeros([N,T]) 48 | a_list = np.zeros([N,T]) 49 | w_list = np.zeros([N,T]) 50 | w_list2 = np.zeros([N,T]) 51 | 52 | def behav_policy(s,i): 53 | a = beta*sigmoid(s)+(beta)*np.random.uniform(0.0,1.0) 54 | return(np.random.binomial(1,a,1)[0]) 55 | 56 | def eval_policy(s,i): 57 | a = alpha*sigmoid(s)+(1-alpha)*np.random.uniform(0.0,1.0) 58 | return(np.random.binomial(1,a,1)[0]) 59 | 60 | def behav_policy_dens(s,a,i): 61 | b = beta*sigmoid(s)+(beta)*0.5 62 | if a==1: 63 | return(b) 64 | else: 65 | return(1.0-b) 66 | 67 | def eval_policy_dens(s,a,i): 68 | b = alpha*sigmoid(s)+(1-alpha)*0.5 69 | if a==1: 70 | return(b) 71 | else: 72 | return(1.0-b) 73 | 74 | 75 | for i in range(N): 76 | for j in range(T): 77 | if j==0: 78 | s = np.random.normal(0.5,0.2) 79 | r = 0.0 80 | a =0.0 81 | w = 1.0 82 | else: 83 | s = np.random.normal(0.02*(j%2)+s*1.0-0.3*(a-0.5),0.2) 84 | a = behav_policy(s,j) 85 | w = eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j)*w 86 | r = np.random.normal(0.9*s+0.3*a-0.02*(j%2),0.2) 87 | r_list[i,j] = r 88 | s_list[i,j] = s 89 | a_list[i,j] = a 90 | w_list[i,j] = w 91 | w_list2[i,j]= eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j) 92 | 93 | ag_list = [] 94 | 95 | #### IPW estimator 96 | for i in range(N): 97 | ag_list.append(np.sum(r_list[i,]*w_list[i,])) 98 | estimator_list_ipw.append(np.mean(ag_list)) 99 | 100 | ########num = 0 101 | 102 | #### DM estimator 103 | bbb = range(T) 104 | reg_list = [] 105 | for j in bbb[::-1]: 106 | if j==(T-1): 107 | X = np.array([s_list[:,j],a_list[:,j]]) 108 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 109 | Y = r_list[:,j] 110 | else: 111 | X = np.array([s_list[:,j],a_list[:,j]]) 112 | aaa = [] 113 | for k in range(N): 114 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 115 | X0 = np.array([s_list[:,j+1],aaa]) 116 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 117 | reg = linear_model.LinearRegression() 118 | reg.fit(np.transpose(X), Y) 119 | ###print reg.score(np.transpose(X), Y) 120 | reg_list.append(reg) 121 | 122 | 123 | aaa = [] 124 | for i in range(N): 125 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 126 | X0 = np.array([s_list[:,0],aaa]) 127 | v0 = reg.predict(np.transpose(X0)) 128 | estimator_list_dm.append(np.mean(v0)) 129 | 130 | ####print(np.mean(v0)) 131 | 132 | ### DR estiamtor (Cross fitting) 133 | 134 | ############### Make q-function 135 | ################################ 136 | r_list_1 = r_list[0:N/2,:] 137 | r_list_2 = r_list[N/2:N,:] 138 | s_list_1 = s_list[0:N/2,:] 139 | s_list_2 = s_list[N/2:N,:] 140 | a_list_1 = a_list[0:N/2,:] 141 | a_list_2 = a_list[N/2:N,:] 142 | w_list_1 = w_list[0:N/2,:] 143 | w_list_2 = w_list[N/2:N,:] 144 | 145 | 146 | bbb = range(T) 147 | reg_list = [] 148 | for j in bbb[::-1]: 149 | if 
j==(T-1): 150 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 151 | pre_X = np.array([s_list_1[:,j],a_list_1[:,j]]) 152 | Y = r_list_1[:,j] 153 | else: 154 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 155 | aaa = [] 156 | for k in range(N/2): 157 | aaa.append(eval_policy_dens(s_list_1[k,j+1],1,0)) 158 | X0 = np.array([s_list_1[:,j+1],aaa]) 159 | Y = r_list_1[:,j]+reg.predict(np.transpose(X0)) 160 | reg = linear_model.LinearRegression() 161 | reg.fit(np.transpose(X), Y) 162 | ###print reg.score(np.transpose(X), Y) 163 | reg_list.append(reg) 164 | 165 | dr = 0.0 166 | for t in range(T): 167 | dr = dr + np.mean(r_list_2[:,t]*w_list_2[:,t]) 168 | #### q function 169 | X = np.array([s_list_2[:,t],a_list_2[:,t]]) 170 | dr = dr - np.mean(reg_list[T-1-t].predict(np.transpose(X))*w_list_2[:,t]) 171 | #### v function 172 | aaa = [] 173 | for i in range(N/2): 174 | aaa.append(eval_policy_dens(s_list_2[i,t],1,0)) 175 | X0 = np.array([s_list_2[:,t],aaa]) 176 | if t==0: 177 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))) 178 | else: 179 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*w_list_2[:,t-1]) 180 | 181 | reg_list_2 = [] 182 | for j in bbb[::-1]: 183 | if j==(T-1): 184 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 185 | pre_X = np.array([s_list_1[:,j],a_list_2[:,j]]) 186 | Y = r_list_2[:,j] 187 | else: 188 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 189 | aaa = [] 190 | for k in range(N/2): 191 | aaa.append(eval_policy_dens(s_list_2[k,j+1],1,0)) 192 | X0 = np.array([s_list_2[:,j+1],aaa]) 193 | Y = r_list_2[:,j]+reg.predict(np.transpose(X0)) 194 | reg = linear_model.LinearRegression() 195 | reg.fit(np.transpose(X), Y) 196 | ###print reg.score(np.transpose(X), Y) 197 | reg_list_2.append(reg) 198 | 199 | for t in range(T): 200 | dr = dr + np.mean(r_list_1[:,t]*w_list_1[:,t]) 201 | #### q function 202 | X = np.array([s_list_1[:,t],a_list_1[:,t]]) 203 | dr = dr - np.mean(reg_list_2[T-1-t].predict(np.transpose(X))*w_list_1[:,t]) 204 | #### v function 205 | aaa = [] 206 | for i in range(N/2): 207 | aaa.append(eval_policy_dens(s_list_2[i,t],1,0)) 208 | X0 = np.array([s_list_1[:,t],aaa]) 209 | if t==0: 210 | dr = dr + np.mean(reg_list_2[T-1-t].predict(np.transpose(X0))) 211 | else: 212 | dr = dr + np.mean(reg_list_2[T-1-t].predict(np.transpose(X0))*w_list_1[:,t-1]) 213 | 214 | estimator_list_dr.append(dr/2.0) 215 | ####print dr/2.0 216 | 217 | 218 | #### IPW estimator under M_2 219 | 220 | bbb = range(T) 221 | wreg_list = [] 222 | for j in bbb[::-1]: 223 | X = np.array([s_list[:,j],a_list[:,j]]) 224 | Y = w_list[:,j] 225 | reg = linear_model.LinearRegression() 226 | reg.fit(np.transpose(X),Y) 227 | ###print reg.score(np.transpose(X), Y) 228 | wreg_list.append(reg) 229 | 230 | ipw = 0.0 231 | for t in range(T): 232 | X = np.array([s_list[:,t],a_list[:,t]]) 233 | ipw = ipw + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list[:,t]) 234 | estimator_list_ipw2.append(ipw) 235 | 236 | ####print ipw 237 | 238 | ############### DR estiamtor under M_2 (cross fitting) ############### 239 | ############### 240 | ############### 241 | r_list_1 = r_list[0:N/2,:] 242 | r_list_2 = r_list[N/2:N,:] 243 | s_list_1 = s_list[0:N/2,:] 244 | s_list_2 = s_list[N/2:N,:] 245 | a_list_1 = a_list[0:N/2,:] 246 | a_list_2 = a_list[N/2:N,:] 247 | w_list_1 = w_list[0:N/2,:] 248 | w_list_2 = w_list[N/2:N,:] 249 | 250 | 251 | bbb = range(T) 252 | reg_list = [] 253 | for j in bbb[::-1]: 254 | if j==(T-1): 255 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 256 | pre_X = 
np.array([s_list_1[:,j],a_list_1[:,j]]) 257 | Y = r_list_1[:,j] 258 | else: 259 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 260 | aaa = [] 261 | for k in range(N/2): 262 | aaa.append(eval_policy_dens(s_list_1[k,j+1],1,0)) 263 | X0 = np.array([s_list_1[:,j+1],aaa]) 264 | Y = r_list_1[:,j]+reg.predict(np.transpose(X0)) 265 | reg = linear_model.LinearRegression() 266 | reg.fit(np.transpose(X), Y) 267 | ###print reg.score(np.transpose(X), Y) 268 | reg_list.append(reg) 269 | 270 | wreg_list = [] 271 | for j in bbb[::-1]: 272 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 273 | Y = w_list_1[:,j] 274 | reg = linear_model.LinearRegression() 275 | reg.fit(np.transpose(X),Y) 276 | ###print reg.score(np.transpose(X), Y) 277 | wreg_list.append(reg) 278 | 279 | 280 | dr2 = 0.0 281 | for t in range(T): 282 | X = np.array([s_list_2[:,t],a_list_2[:,t]]) 283 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list_2[:,t]) 284 | #### q function 285 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X))*wreg_list[T-1-t].predict(np.transpose(X))) 286 | #### v function 287 | aaa = [] 288 | for i in range(N/2): 289 | aaa.append(eval_policy_dens(s_list_2[i,t],1,0)) 290 | X0 = np.array([s_list_2[:,t],aaa]) 291 | if t==0: 292 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 293 | else: 294 | X_ = np.array([s_list_2[:,t-1],a_list_2[:,t-1]]) 295 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 296 | estimator_list_dr2.append(dr2) 297 | 298 | reg_list_2 = [] 299 | for j in bbb[::-1]: 300 | if j==(T-1): 301 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 302 | pre_X = np.array([s_list_1[:,j],a_list_2[:,j]]) 303 | Y = r_list_2[:,j] 304 | else: 305 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 306 | aaa = [] 307 | for k in range(N/2): 308 | aaa.append(eval_policy_dens(s_list_2[k,j+1],1,0)) 309 | X0 = np.array([s_list_2[:,j+1],aaa]) 310 | Y = r_list_2[:,j]+reg.predict(np.transpose(X0)) 311 | reg = linear_model.LinearRegression() 312 | reg.fit(np.transpose(X), Y) 313 | ###print reg.score(np.transpose(X), Y) 314 | reg_list_2.append(reg) 315 | 316 | wreg_list_2 = [] 317 | for j in bbb[::-1]: 318 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 319 | Y = w_list_2[:,j] 320 | reg = linear_model.LinearRegression() 321 | reg.fit(np.transpose(X),Y) 322 | ###print reg.score(np.transpose(X), Y) 323 | wreg_list_2.append(reg) 324 | 325 | for t in range(T): 326 | X = np.array([s_list_1[:,t],a_list_1[:,t]]) 327 | dr2 = dr2 + np.mean(wreg_list_2[T-1-t].predict(np.transpose(X))*r_list_1[:,t]) 328 | #### q function 329 | dr2 = dr2 - np.mean(reg_list_2[T-1-t].predict(np.transpose(X))*wreg_list_2[T-1-t].predict(np.transpose(X))) 330 | #### v function 331 | aaa = [] 332 | for i in range(N/2): 333 | aaa.append(eval_policy_dens(s_list_1[i,t],1,0)) 334 | X0 = np.array([s_list_1[:,t],aaa]) 335 | if t==0: 336 | dr2 = dr2 + np.mean(reg_list_2[T-t-1].predict(np.transpose(X0))) 337 | else: 338 | X_ = np.array([s_list_1[:,t-1],a_list_1[:,t-1]]) 339 | dr2 = dr2 + np.mean(reg_list_2[T-1-t].predict(np.transpose(X0))*wreg_list_2[T-t].predict(np.transpose(X_))) 340 | estimator_list_dr2.append(dr2/2.0) 341 | #####print(dr2/2.0) 342 | 343 | 344 | #### Ratio-mis specified ############## 345 | ############################### 346 | ################################ 347 | 348 | num = 2 349 | 350 | bbb = range(T) 351 | wreg_list_mis = [] 352 | for j in bbb[::-1]: 353 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 354 | Y = w_list[:,j] 355 | reg = 
linear_model.LinearRegression() 356 | reg.fit(np.transpose(X),Y) 357 | ###print reg.score(np.transpose(X), Y) 358 | wreg_list_mis.append(reg) 359 | 360 | ipw = 0.0 361 | for t in range(T): 362 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 363 | ipw = ipw + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X))*r_list[:,t]) 364 | estimator_list_ipw2_ratio_mis.append(ipw) 365 | 366 | dr2 = 0.0 367 | for t in range(T): 368 | X_w = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 369 | X_r = np.array([s_list[:,t],a_list[:,t]]) 370 | dr2 = dr2 + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 371 | #### q function 372 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X_r))*wreg_list_mis[T-1-t].predict(np.transpose(X_w))) 373 | #### v function 374 | aaa = [] 375 | for i in range(N): 376 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 377 | X0 = np.array([s_list[:,t],aaa]) 378 | if t==0: 379 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 380 | else: 381 | X_ = np.array([s_list[:,t-1]*s_list[:,t-1],a_list[:,t-1]]) 382 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list_mis[T-t].predict(np.transpose(X_))) 383 | estimator_list_dr2_ratio_mis.append(dr2) 384 | 385 | 386 | ### q-misspcified 387 | ################################ 388 | ################################# 389 | ################################## 390 | 391 | 392 | 393 | #### DM estimator 394 | bbb = range(T) 395 | reg_list_mis = [] 396 | for j in bbb[::-1]: 397 | if j==(T-1): 398 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 399 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 400 | Y = r_list[:,j] 401 | else: 402 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 403 | aaa = [] 404 | for k in range(N): 405 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 406 | X0 = np.array([s_list[:,j+1]*s_list[:,j+1],aaa]) 407 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 408 | reg = linear_model.LinearRegression() 409 | reg.fit(np.transpose(X), Y) 410 | ###print reg.score(np.transpose(X), Y) 411 | reg_list_mis.append(reg) 412 | 413 | 414 | aaa = [] 415 | for i in range(N): 416 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 417 | X0 = np.array([s_list[:,0],aaa]) 418 | v0 = reg.predict(np.transpose(X0)) 419 | estimator_list_dm_q_mis.append(np.mean(v0)) 420 | 421 | ### DR estiamtor under M_1 422 | dr = 0.0 423 | for t in range(T): 424 | dr = dr + np.mean(r_list[:,t]*w_list[:,t]) 425 | #### q function 426 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 427 | dr = dr - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X))*w_list[:,t]) 428 | #### v function 429 | aaa = [] 430 | for i in range(N): 431 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 432 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 433 | if t==0: 434 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))) 435 | else: 436 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*w_list[:,t-1]) 437 | 438 | estimator_list_dr_q_mis.append(dr) 439 | 440 | print iii 441 | 442 | 443 | ### DR estiamtor under M_2 444 | dr2 = 0.0 445 | for t in range(T): 446 | X_w = np.array([s_list[:,t],a_list[:,t]]) 447 | X_r = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 448 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 449 | #### q function 450 | dr2 = dr2 - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X_r))*wreg_list[T-1-t].predict(np.transpose(X_w))) 451 | #### v function 452 | aaa = [] 453 | for i in range(N): 454 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 
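        # `aaa` holds the evaluation policy's probability of choosing action 1 at each
        # state s_t. Since the fitted q-models are linear in the action, plugging this
        # probability in as the "action" feature below makes reg.predict return
        # pi_e(1|s_t)*q(s_t,1) + pi_e(0|s_t)*q(s_t,0), i.e. the estimated state value
        # v(s_t) under the evaluation policy.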
455 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 456 | if t==0: 457 | dr2 = dr2 + np.mean(reg_list_mis[T-t-1].predict(np.transpose(X0))) 458 | else: 459 | X_ = np.array([s_list[:,t-1],a_list[:,t-1]]) 460 | dr2 = dr2 + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 461 | estimator_list_dr2_q_mis.append(dr2) 462 | 463 | 464 | np.savez("estimator_list_ipw_%d"+ver_+"_"+str(N),a=estimator_list_ipw) 465 | np.savez("estimator_list_dr_%d"+ver_+"_"+str(N), a=estimator_list_dr) 466 | np.savez("estimator_list_dm_%d"+ver_+"_"+str(N), a=estimator_list_dm) 467 | np.savez("estimator_list_ipw2_%d"+ver_+"_"+str(N),a=estimator_list_ipw2) 468 | np.savez("estimator_list_dr2_%d"+ver_+"_"+str(N),a=estimator_list_dr2) 469 | np.savez("estimator_list_ipw3_%d"+ver_+"_"+str(N),a=estimator_list_ipw3) 470 | np.savez("estimator_list_dr3_%d"+ver_+"_"+str(N),a=estimator_list_dr3) 471 | np.savez("estimator_list_ipw2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_ipw2_ratio_mis) 472 | np.savez("estimator_list_dr2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_ratio_mis) 473 | np.savez("estimator_list_dm_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dm_q_mis) 474 | np.savez("estimator_list_dr_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr_q_mis) 475 | np.savez("estimator_list_dr2_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_q_mis) 476 | 477 | -------------------------------------------------------------------------------- /exp5_1_py3/rmse_all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import time 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | import multiprocessing as mp 8 | from joblib import Parallel, delayed 9 | import argparse 10 | 11 | def sigmoid(x): 12 | return 1.0 / (1.0 + np.exp(-0.1 * x)) 13 | 14 | def estimate_true_value_single(seed, T=30, alpha=0.9, beta=0.2): 15 | """ 16 | Run a single episode using the evaluation policy to estimate the true value. 17 | """ 18 | np.random.seed(seed) 19 | 20 | # Initial state 21 | s = np.random.normal(0.5, 0.2) 22 | 23 | # Total return for this episode 24 | episode_return = 0.0 25 | 26 | # Run the episode 27 | for j in range(T): 28 | # Use evaluation policy directly 29 | a_prob = alpha * sigmoid(s) + (1 - alpha) * np.random.uniform(0.0, 1.0) 30 | a = np.random.binomial(1, a_prob, 1)[0] 31 | 32 | # Get reward 33 | r = np.random.normal(0.9 * s + 0.3 * a - 0.02 * (j % 2), 0.2) 34 | episode_return += r 35 | 36 | # State transition (if not the last step) 37 | if j < T-1: 38 | s = np.random.normal(0.02 * ((j+1) % 2) + s * 1.0 - 0.3 * (a - 0.5), 0.2) 39 | 40 | return episode_return 41 | 42 | def estimate_true_value(n_episodes=100000, T=30, alpha=0.9, beta=0.2, n_jobs=-1): 43 | """ 44 | Estimate the true expected return of the evaluation policy. 
45 | """ 46 | start_time = time.time() 47 | print(f"Estimating true value using {n_episodes} direct episodes...") 48 | 49 | # Use joblib for parallelization 50 | if n_jobs == -1: 51 | n_jobs = mp.cpu_count() 52 | 53 | # Create batches to show progress 54 | batch_size = min(10000, n_episodes) 55 | n_batches = (n_episodes + batch_size - 1) // batch_size 56 | 57 | all_returns = [] 58 | 59 | for batch in range(n_batches): 60 | start_idx = batch * batch_size 61 | end_idx = min((batch + 1) * batch_size, n_episodes) 62 | current_batch_size = end_idx - start_idx 63 | 64 | print(f"Running batch {batch+1}/{n_batches} ({start_idx+1}-{end_idx} of {n_episodes})...") 65 | 66 | # Run episodes in parallel 67 | batch_returns = Parallel(n_jobs=n_jobs)( 68 | delayed(estimate_true_value_single)( 69 | seed=start_idx+i, 70 | T=T, 71 | alpha=alpha, 72 | beta=beta 73 | ) for i in range(current_batch_size) 74 | ) 75 | 76 | all_returns.extend(batch_returns) 77 | 78 | # Show intermediate results 79 | current_mean = np.mean(all_returns) 80 | current_se = np.std(all_returns) / np.sqrt(len(all_returns)) 81 | 82 | print(f" Completed {len(all_returns)}/{n_episodes} episodes") 83 | print(f" Current estimate: {current_mean:.6f} ± {current_se:.6f}") 84 | 85 | # Calculate final estimate and standard error 86 | true_value = np.mean(all_returns) 87 | std_error = np.std(all_returns) / np.sqrt(n_episodes) 88 | 89 | # Calculate time taken 90 | elapsed_time = time.time() - start_time 91 | minutes = int(elapsed_time // 60) 92 | seconds = int(elapsed_time % 60) 93 | 94 | print(f"\nEstimation complete in {minutes}m {seconds}s") 95 | print(f"Final true value estimate: {true_value:.6f} ± {std_error:.6f}") 96 | print(f"Based on {n_episodes} episodes with evaluation policy (α={alpha})") 97 | 98 | return true_value, std_error 99 | 100 | def load_npz_file(file_path): 101 | """Load a single NPZ file and return its contents.""" 102 | try: 103 | data = np.load(file_path) 104 | # Check for different array names - first 'a' (old format) then 'data' (new format) 105 | if 'a' in data: 106 | return data['a'] 107 | elif 'data' in data: 108 | return data['data'] 109 | else: 110 | # Try to get the first array in the file 111 | array_keys = list(data.keys()) 112 | if array_keys: 113 | return data[array_keys[0]] 114 | else: 115 | print(f"Warning: No valid arrays found in {file_path}") 116 | return np.array([]) 117 | except Exception as e: 118 | print(f"Error loading {file_path}: {e}") 119 | return np.array([]) 120 | 121 | def load_estimator_results(directory, sample_sizes): 122 | """ 123 | Load all estimator results from NPZ files with the new naming convention. 
124 | 125 | Parameters: 126 | ----------- 127 | directory : str or list 128 | Directory or list of directories where NPZ files are stored 129 | sample_sizes : list 130 | List of sample sizes to load 131 | 132 | Returns: 133 | -------- 134 | dict 135 | Dictionary with sample sizes as keys and dictionaries of estimators as values 136 | """ 137 | all_estimators = {} 138 | 139 | # Estimator type mapping for nice display names 140 | estimator_display_names = { 141 | 'ipw': 'IPW', 142 | 'dr': 'DRL(M₁)', 143 | 'dm': 'DM', 144 | 'ipw2': 'IPW₂', 145 | 'dr2': 'DRL(M₂)', 146 | 'ipw_mis_q': 'IPW (q mis.)', 147 | 'dr_mis_q': 'DRL(M₁) (q mis.)', 148 | 'dm_mis_q': 'DM (q mis.)', 149 | 'ipw2_mis_q': 'IPW₂ (q mis.)', 150 | 'dr2_mis_q': 'DRL(M₂) (q mis.)', 151 | 'ipw_mis_mu': 'IPW (μ mis.)', 152 | 'dr_mis_mu': 'DRL(M₁) (μ mis.)', 153 | 'dm_mis_mu': 'DM (μ mis.)', 154 | 'ipw2_mis_mu': 'IPW₂ (μ mis.)', 155 | 'dr2_mis_mu': 'DRL(M₂) (μ mis.)' 156 | } 157 | 158 | # Track if we found any files 159 | found_any_files = False 160 | 161 | for N in sample_sizes: 162 | all_estimators[N] = {} 163 | 164 | # Check for multiple possible file pattern formats 165 | possible_patterns = [ 166 | f"_{N}.npz", # New format: estimator_list_ipw_1500.npz 167 | f"_0default_{N}.npz", # Old format: estimator_list_ipw_0default_1500.npz 168 | f"_%d0default_{N}.npz", # Old format with %d: estimator_list_ipw_%d0default_1500.npz 169 | f"_n{N}.npz", # Alternative format: estimator_list_ipw_n1500.npz 170 | f"_gpu_{N}.npz", # GPU format: gpu_ipw_1500.npz 171 | f"_{N}_gpu.npz" # Another GPU format: ipw_1500_gpu.npz 172 | ] 173 | 174 | print(f"\nLooking for results with N = {N}:") 175 | 176 | # Check if directory is a list of possible directories 177 | if isinstance(directory, list): 178 | search_dirs = directory 179 | else: 180 | search_dirs = [directory] 181 | 182 | for search_dir in search_dirs: 183 | # Find all NPZ files for this sample size 184 | for pattern in possible_patterns: 185 | npz_files = [f for f in os.listdir(search_dir) if f.endswith('.npz') and pattern in f] 186 | 187 | if npz_files: 188 | print(f" Found {len(npz_files)} files with pattern {pattern}") 189 | found_any_files = True 190 | 191 | for npz_file in npz_files: 192 | try: 193 | # Extract estimator name using different patterns 194 | estimator_key = None 195 | 196 | # Try different naming conventions 197 | if "estimator_list_" in npz_file: 198 | # Extract part between "estimator_list_" and the pattern 199 | estimator_part = npz_file.replace("estimator_list_", "") 200 | for p in possible_patterns: 201 | if p in estimator_part: 202 | estimator_key = estimator_part.split(p.replace(".npz", ""))[0] 203 | # Remove trailing underscore if present 204 | estimator_key = estimator_key.rstrip('_') 205 | break 206 | elif "gpu_" in npz_file: 207 | # Format like gpu_ipw_1500.npz 208 | parts = npz_file.split('_') 209 | if len(parts) > 1: 210 | estimator_key = parts[1] 211 | else: 212 | # Last resort: try to extract from filename 213 | parts = npz_file.split('_') 214 | if len(parts) > 0: 215 | estimator_key = parts[0] 216 | 217 | # If we couldn't determine the estimator key, use the filename without extension 218 | if not estimator_key: 219 | estimator_key = os.path.splitext(npz_file)[0] 220 | 221 | # Load the NPZ file 222 | values = load_npz_file(os.path.join(search_dir, npz_file)) 223 | 224 | if len(values) > 0: 225 | # Use display name if available 226 | display_name = estimator_display_names.get(estimator_key, estimator_key) 227 | all_estimators[N][display_name] = values 228 | 
print(f" Loaded {display_name} from {npz_file}: {len(values)} values") 229 | except Exception as e: 230 | print(f" Error processing {npz_file}: {e}") 231 | 232 | if not found_any_files: 233 | print("\nWARNING: No NPZ files found matching the expected patterns!") 234 | print(f"Searched in: {directory}") 235 | print(f"Looking for sample sizes: {sample_sizes}") 236 | print("\nPlease check that your files are in the correct location and named correctly.") 237 | # List all files in the directory for debugging 238 | if isinstance(directory, str) and os.path.exists(directory): 239 | print("\nFiles found in the directory:") 240 | for f in os.listdir(directory): 241 | if f.endswith('.npz'): 242 | print(f" {f}") 243 | 244 | return all_estimators 245 | 246 | def calculate_rmse(estimators, true_value): 247 | """ 248 | Calculate RMSE, standard error, and bias for each estimator 249 | 250 | Parameters: 251 | ----------- 252 | estimators : dict 253 | Dictionary with estimator names as keys and arrays of values as values 254 | true_value : float 255 | The true parameter value being estimated 256 | 257 | Returns: 258 | -------- 259 | DataFrame 260 | DataFrame with RMSE, std errors, and bias for each estimator 261 | """ 262 | results = [] 263 | 264 | for name, values in estimators.items(): 265 | if len(values) > 0: 266 | # Calculate RMSE 267 | squared_errors = np.square(np.array(values) - true_value) 268 | rmse = np.sqrt(np.mean(squared_errors)) 269 | 270 | # Calculate standard error of RMSE 271 | # Based on the delta method approximation 272 | se_rmse = np.std(squared_errors) / (2 * rmse * np.sqrt(len(values))) 273 | 274 | # Calculate bias 275 | bias = np.mean(values) - true_value 276 | 277 | # Calculate mean and standard deviation 278 | mean = np.mean(values) 279 | std = np.std(values) 280 | 281 | results.append({ 282 | 'Estimator': name, 283 | 'RMSE': rmse, 284 | 'SE': se_rmse, 285 | 'Bias': bias, 286 | 'Mean': mean, 287 | 'Std': std, 288 | 'n_samples': len(values) 289 | }) 290 | 291 | # Handle empty results 292 | if not results: 293 | print("WARNING: No data available to calculate RMSE!") 294 | # Return empty DataFrame with the expected columns 295 | return pd.DataFrame(columns=['Estimator', 'RMSE', 'SE', 'Bias', 'Mean', 'Std', 'n_samples']) 296 | 297 | return pd.DataFrame(results) 298 | 299 | def create_rmse_table(all_estimators, true_value): 300 | """ 301 | Create RMSE table for all sample sizes 302 | 303 | Parameters: 304 | ----------- 305 | all_estimators : dict 306 | Dictionary with sample sizes as keys and dictionaries of estimators as values 307 | true_value : float 308 | The true parameter value 309 | 310 | Returns: 311 | -------- 312 | tuple 313 | Tuple containing (rmse_table, se_table, bias_table, all_results) 314 | """ 315 | all_results = [] 316 | 317 | for N, estimators in all_estimators.items(): 318 | if estimators: # Check if there are any estimators for this N 319 | results = calculate_rmse(estimators, true_value) 320 | if not results.empty: 321 | results['N'] = N 322 | all_results.append(results) 323 | else: 324 | print(f"No estimators found for N={N}") 325 | 326 | # Handle case where no results were calculated 327 | if not all_results: 328 | print("WARNING: No valid results found to create RMSE table!") 329 | empty_df = pd.DataFrame(columns=['Estimator', 'RMSE', 'SE', 'Bias', 'Mean', 'Std', 'n_samples', 'N']) 330 | return empty_df, empty_df, empty_df, empty_df 331 | 332 | # Combine results from all sample sizes 333 | combined_results = pd.concat(all_results, ignore_index=True) 334 | 335 
| if 'Estimator' not in combined_results.columns or 'N' not in combined_results.columns: 336 | print("WARNING: Missing required columns in results!") 337 | print(f"Available columns: {combined_results.columns.tolist()}") 338 | empty_df = pd.DataFrame(columns=['Estimator', 'RMSE', 'SE', 'Bias']) 339 | return empty_df, empty_df, empty_df, combined_results 340 | 341 | # Create pivot tables 342 | rmse_table = combined_results.pivot(index='Estimator', columns='N', values='RMSE') 343 | se_table = combined_results.pivot(index='Estimator', columns='N', values='SE') 344 | bias_table = combined_results.pivot(index='Estimator', columns='N', values='Bias') 345 | 346 | return rmse_table, se_table, bias_table, combined_results 347 | 348 | def create_latex_table(rmse_table, se_table, output_file): 349 | """ 350 | Create a LaTeX table with RMSE values and standard errors 351 | 352 | Parameters: 353 | ----------- 354 | rmse_table : DataFrame 355 | DataFrame with RMSE values 356 | se_table : DataFrame 357 | DataFrame with standard error values 358 | output_file : str 359 | Output file path 360 | """ 361 | # Skip if tables are empty 362 | if rmse_table.empty or se_table.empty: 363 | print(f"Skipping LaTeX table creation because data tables are empty") 364 | return 365 | 366 | with open(output_file, "w") as f: 367 | f.write("\\begin{table}[ht]\n") 368 | f.write("\\centering\n") 369 | f.write("\\caption{RMSE of Estimators (with standard errors in parentheses)}\n") 370 | f.write("\\begin{tabular}{l" + "c" * len(rmse_table.columns) + "}\n") 371 | f.write("\\hline\n") 372 | 373 | # Header row 374 | f.write("Estimator & " + " & ".join([f"N={n}" for n in rmse_table.columns]) + " \\\\\n") 375 | f.write("\\hline\n") 376 | 377 | # Data rows 378 | for estimator in rmse_table.index: 379 | row = f"{estimator}" 380 | for n in rmse_table.columns: 381 | if n in rmse_table.columns and n in se_table.columns: 382 | rmse = rmse_table.loc[estimator, n] 383 | se = se_table.loc[estimator, n] 384 | row += f" & {rmse:.4f} ({se:.4f})" 385 | else: 386 | row += " & -" 387 | row += " \\\\\n" 388 | f.write(row) 389 | 390 | f.write("\\hline\n") 391 | f.write("\\end{tabular}\n") 392 | f.write("\\end{table}\n") 393 | 394 | print(f"LaTeX table created: {output_file}") 395 | 396 | def create_visualizations(rmse_table, se_table, combined_results, output_dir): 397 | """ 398 | Create visualizations for RMSE results 399 | 400 | Parameters: 401 | ----------- 402 | rmse_table : DataFrame 403 | Pivot table with RMSE values 404 | se_table : DataFrame 405 | Pivot table with standard error values 406 | combined_results : DataFrame 407 | Combined results DataFrame 408 | output_dir : str 409 | Directory to save visualizations 410 | """ 411 | # Skip if tables are empty 412 | if rmse_table.empty or se_table.empty or combined_results.empty: 413 | print(f"Skipping visualization creation because data tables are empty") 414 | return 415 | 416 | # Set style 417 | sns.set_style("whitegrid") 418 | plt.rcParams['figure.figsize'] = (12, 8) 419 | plt.rcParams['savefig.dpi'] = 300 420 | 421 | try: 422 | # 1. 
RMSE by sample size for each estimator 423 | plt.figure(figsize=(12, 8)) 424 | for estimator in rmse_table.index: 425 | plt.plot(rmse_table.columns, rmse_table.loc[estimator], marker='o', linewidth=2, label=estimator) 426 | 427 | plt.xlabel('Sample Size (N)', fontsize=14) 428 | plt.ylabel('RMSE', fontsize=14) 429 | plt.title('RMSE by Sample Size for Each Estimator', fontsize=16) 430 | plt.legend(fontsize=12) 431 | plt.grid(True) 432 | plt.tight_layout() 433 | plt.savefig(os.path.join(output_dir, 'rmse_by_sample_size.png')) 434 | plt.close() 435 | 436 | # 2. Bar plot for each sample size 437 | for N in rmse_table.columns: 438 | plt.figure(figsize=(14, 8)) 439 | 440 | # Sort estimators by RMSE 441 | sorted_estimators = rmse_table[N].sort_values().index 442 | 443 | # Plot RMSE bars 444 | ax = plt.bar(range(len(sorted_estimators)), rmse_table.loc[sorted_estimators, N]) 445 | 446 | # Add error bars 447 | plt.errorbar( 448 | x=range(len(sorted_estimators)), 449 | y=rmse_table.loc[sorted_estimators, N], 450 | yerr=se_table.loc[sorted_estimators, N], 451 | fmt='none', capsize=5, color='black', elinewidth=1.5 452 | ) 453 | 454 | plt.title(f'RMSE for Each Estimator (N = {N})', fontsize=16) 455 | plt.ylabel('RMSE', fontsize=14) 456 | plt.xticks(range(len(sorted_estimators)), sorted_estimators, rotation=45, ha='right', fontsize=12) 457 | plt.grid(axis='y') 458 | plt.tight_layout() 459 | plt.savefig(os.path.join(output_dir, f'rmse_n{N}.png')) 460 | plt.close() 461 | 462 | # 3. Heatmap of RMSE values 463 | plt.figure(figsize=(10, 8)) 464 | sns.heatmap(rmse_table, annot=True, cmap='YlGnBu', fmt='.4f') 465 | plt.title('RMSE Heatmap by Estimator and Sample Size', fontsize=16) 466 | plt.tight_layout() 467 | plt.savefig(os.path.join(output_dir, 'rmse_heatmap.png')) 468 | plt.close() 469 | 470 | # 4. 
Bias comparison if bias data is available 471 | if 'Bias' in combined_results.columns and 'N' in combined_results.columns: 472 | bias_table = combined_results.pivot(index='Estimator', columns='N', values='Bias') 473 | 474 | plt.figure(figsize=(12, 8)) 475 | for estimator in bias_table.index: 476 | plt.plot(bias_table.columns, bias_table.loc[estimator], marker='o', linewidth=2, label=estimator) 477 | 478 | plt.axhline(y=0, color='r', linestyle='-', alpha=0.5) # Add zero line for reference 479 | plt.xlabel('Sample Size (N)', fontsize=14) 480 | plt.ylabel('Bias', fontsize=14) 481 | plt.title('Bias by Sample Size for Each Estimator', fontsize=16) 482 | plt.legend(fontsize=12) 483 | plt.grid(True) 484 | plt.tight_layout() 485 | plt.savefig(os.path.join(output_dir, 'bias_by_sample_size.png')) 486 | plt.close() 487 | 488 | print(f"All visualizations created in {output_dir}") 489 | 490 | except Exception as e: 491 | print(f"Error creating visualizations: {e}") 492 | 493 | def find_npz_directories(base_dir): 494 | """Find directories containing NPZ files""" 495 | npz_dirs = [] 496 | 497 | # First check the base directory 498 | if any(f.endswith('.npz') for f in os.listdir(base_dir)): 499 | npz_dirs.append(base_dir) 500 | 501 | # Then check subdirectories 502 | for item in os.listdir(base_dir): 503 | item_path = os.path.join(base_dir, item) 504 | if os.path.isdir(item_path): 505 | if any(f.endswith('.npz') for f in os.listdir(item_path)): 506 | npz_dirs.append(item_path) 507 | 508 | return npz_dirs 509 | 510 | def main(): 511 | # Parse command line arguments 512 | parser = argparse.ArgumentParser(description='Calculate RMSE for reinforcement learning estimators') 513 | parser.add_argument('--sample-sizes', type=int, nargs='+', default=[100, 1500, 3000, 4500], 514 | help='Sample sizes to analyze') 515 | parser.add_argument('--true-value', type=float, default=None, 516 | help='Known true value (if not provided, will be estimated)') 517 | parser.add_argument('--n-episodes', type=int, default=100000, 518 | help='Number of episodes to use for estimating true value') 519 | parser.add_argument('--output-dir', type=str, default='rmse_analysis', 520 | help='Directory to save analysis results') 521 | parser.add_argument('--list-files', action='store_true', 522 | help='List all NPZ files found in the directories') 523 | 524 | args = parser.parse_args() 525 | 526 | # Find the current directory 527 | current_dir = os.getcwd() 528 | print(f"Current working directory: {current_dir}") 529 | 530 | # Look for directories containing NPZ files 531 | npz_dirs = find_npz_directories(current_dir) 532 | 533 | if not npz_dirs: 534 | print("No directories with NPZ files found! Using current directory.") 535 | directory = current_dir 536 | else: 537 | print(f"Found {len(npz_dirs)} directories with NPZ files:") 538 | for i, d in enumerate(npz_dirs): 539 | print(f" {i+1}. 
{d}") 540 | 541 | directory = npz_dirs # Search in all found directories 542 | 543 | # List files if requested 544 | if args.list_files: 545 | print("\nListing all NPZ files in the directories:") 546 | for d in (npz_dirs if npz_dirs else [current_dir]): 547 | print(f"\nFiles in {d}:") 548 | npz_files = [f for f in os.listdir(d) if f.endswith('.npz')] 549 | for f in sorted(npz_files): 550 | print(f" {f}") 551 | 552 | # Create output directory 553 | output_dir = os.path.join(current_dir, args.output_dir) 554 | os.makedirs(output_dir, exist_ok=True) 555 | 556 | # Step 1: Get the true value 557 | if args.true_value is not None: 558 | # Use provided true value 559 | true_value = args.true_value 560 | print(f"Using provided true value: {true_value}") 561 | else: 562 | # Check if we already have the true value saved 563 | true_value_file = os.path.join(current_dir, "true_value_estimate.txt") 564 | if os.path.exists(true_value_file): 565 | try: 566 | with open(true_value_file, "r") as f: 567 | lines = f.readlines() 568 | true_value = float(lines[0].split(":")[1].strip()) 569 | print(f"Using existing true value: {true_value}") 570 | except Exception as e: 571 | print(f"Error reading true value file: {e}") 572 | print("Estimating new true value...") 573 | true_value, std_error = estimate_true_value(n_episodes=args.n_episodes) 574 | else: 575 | # Estimate the true value 576 | true_value, std_error = estimate_true_value(n_episodes=args.n_episodes) 577 | 578 | # Save the result 579 | with open(os.path.join(output_dir, "true_value_estimate.txt"), "w") as f: 580 | f.write(f"Estimated true value: {true_value}\n") 581 | f.write(f"Standard error: {std_error}\n") 582 | f.write(f"Parameters: alpha=0.9, beta=0.2, T=30, episodes={args.n_episodes}\n") 583 | f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") 584 | 585 | # Step 2: Load all estimator results 586 | all_estimators = load_estimator_results(directory, args.sample_sizes) 587 | 588 | # Step 3: Calculate RMSE for all estimators 589 | rmse_table, se_table, bias_table, combined_results = create_rmse_table(all_estimators, true_value) 590 | 591 | # Check if we have any results 592 | if rmse_table.empty: 593 | print("\nNo valid results were found to analyze. 
Please check your file paths and naming conventions.") 594 | return 595 | 596 | # Step 4: Display results 597 | print("\nRMSE Results:") 598 | print(rmse_table) 599 | 600 | print("\nStandard Errors:") 601 | print(se_table) 602 | 603 | print("\nBias Values:") 604 | print(bias_table) 605 | 606 | # Step 5: Save results to CSV 607 | rmse_table.to_csv(os.path.join(output_dir, "rmse_table.csv")) 608 | se_table.to_csv(os.path.join(output_dir, "se_table.csv")) 609 | bias_table.to_csv(os.path.join(output_dir, "bias_table.csv")) 610 | combined_results.to_csv(os.path.join(output_dir, "combined_results.csv"), index=False) 611 | 612 | # Step 6: Create LaTeX table for publication 613 | create_latex_table(rmse_table, se_table, os.path.join(output_dir, "rmse_table.tex")) 614 | 615 | # Step 7: Create visualizations 616 | create_visualizations(rmse_table, se_table, combined_results, output_dir) 617 | 618 | print(f"\nAll analysis results saved to {output_dir}/") 619 | 620 | if __name__ == "__main__": 621 | main() -------------------------------------------------------------------------------- /exp5_1_py3/rmse_learner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import argparse 5 | from joblib import Parallel, delayed 6 | import multiprocessing as mp 7 | import time 8 | 9 | # Estimators from your outputs 10 | ESTIMATORS = [ 11 | 'ipw', 'dm', 'ipw2', 'dr', 'dr2', 12 | 'ipw_mis_q', 'dm_mis_q', 'ipw2_mis_q', 'dr_mis_q', 'dr2_mis_q', 13 | 'ipw_mis_mu', 'dm_mis_mu', 'ipw2_mis_mu', 'dr_mis_mu', 'dr2_mis_mu' 14 | ] 15 | 16 | def sigmoid(x): 17 | return 1.0 / (1.0 + np.exp(-0.1 * x)) 18 | 19 | def estimate_true_value_single(seed, T=30, alpha=0.9): 20 | np.random.seed(seed) 21 | s = np.random.normal(0.5, 0.2) 22 | total_r = 0 23 | for j in range(T): 24 | a_prob = alpha * sigmoid(s) + (1 - alpha) * np.random.uniform(0, 1) 25 | a = np.random.binomial(1, a_prob) 26 | r = np.random.normal(0.9 * s + 0.3 * a - 0.02*(j%2), 0.2) 27 | total_r += r 28 | if j < T-1: 29 | s = np.random.normal(0.02*(j%2) + s - 0.3*(a-0.5), 0.2) 30 | return total_r 31 | 32 | def estimate_true_value(n_episodes=100000, T=30, alpha=0.9, n_jobs=-1): 33 | if n_jobs == -1: 34 | n_jobs = mp.cpu_count() 35 | returns = Parallel(n_jobs=n_jobs)(delayed(estimate_true_value_single)(i, T, alpha) for i in range(n_episodes)) 36 | mean_return = np.mean(returns) 37 | std_error = np.std(returns) / np.sqrt(n_episodes) 38 | print(f"Estimated true value: {mean_return:.6f} ± {std_error:.6f}") 39 | return mean_return 40 | 41 | def load_estimates(N, mu_method): 42 | estimates = {} 43 | for est in ESTIMATORS: 44 | fname = f"estimator_list_{est}_{mu_method}_{N}.npz" 45 | if os.path.exists(fname): 46 | data = np.load(fname) 47 | estimates[est] = data['a'] if 'a' in data else data[list(data.keys())[0]] 48 | else: 49 | print(f"Warning: {fname} not found.") 50 | estimates[est] = np.array([]) 51 | return estimates 52 | 53 | def calculate_rmse(estimates, true_value): 54 | results = [] 55 | for name, values in estimates.items(): 56 | if values.size == 0: 57 | continue 58 | bias = np.mean(values) - true_value 59 | rmse = np.sqrt(np.mean((values - true_value)**2)) 60 | se = np.std((values - true_value)**2) / (2 * rmse * np.sqrt(len(values))) 61 | results.append({'Estimator': name, 'RMSE': rmse, 'Bias': bias, 'SE': se, 'Std': np.std(values)}) 62 | return pd.DataFrame(results) 63 | 64 | def main(): 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument('--N', type=int, 
default=1500) 67 | parser.add_argument('--true_value', type=float, default=None) 68 | parser.add_argument('--n_episodes', type=int, default=100000) 69 | parser.add_argument('--output', type=str, default='rmse_results.csv') 70 | parser.add_argument('--mu_method', type=str, default='linear', choices=['linear', 'mlp', 'rf'], help='W-function learner') 71 | 72 | 73 | args = parser.parse_args() 74 | 75 | # Step 1: Estimate true value if not provided 76 | if args.true_value is None: 77 | print("No true value provided. Estimating...") 78 | true_value = estimate_true_value(n_episodes=args.n_episodes) 79 | else: 80 | true_value = args.true_value 81 | print(f"Using provided true value: {true_value}") 82 | 83 | # Step 2: Load estimates 84 | estimates = load_estimates(args.N, args.mu_method) 85 | 86 | # Step 3: Calculate RMSE 87 | results = calculate_rmse(estimates, true_value) 88 | results = results.sort_values(by='RMSE') 89 | 90 | # Step 4: Save and display 91 | results.to_csv(args.output, index=False) 92 | print("\n=== RMSE Results ===") 93 | print(results) 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /exp5_1_py3/toytoy2_par_table.py: -------------------------------------------------------------------------------- 1 | # Full GPU-Accelerated Simulation with 15 Estimators Using PyTorch 2 | import torch 3 | import numpy as np 4 | import time 5 | import argparse 6 | 7 | # Use GPU if available 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | print(f"Using device: {device}") 10 | 11 | def sigmoid(x): 12 | return 1.0 / (1.0 + torch.exp(-0.1 * x)) 13 | 14 | def behav_dens(s, a, beta): 15 | b = beta * sigmoid(s) + beta * 0.5 16 | # Convert a_val to tensor if it's a scalar 17 | if not isinstance(a, torch.Tensor): 18 | a = torch.tensor(a, device=device, dtype=torch.float32) 19 | # Ensure a is the same shape as b for comparison 20 | if a.dim() == 0: 21 | a = a.expand_as(b) 22 | return torch.where(a == 1.0, b, 1.0 - b) 23 | 24 | def eval_dens(s, a, alpha): 25 | b = alpha * sigmoid(s) + (1 - alpha) * 0.5 26 | # Convert a_val to tensor if it's a scalar 27 | if not isinstance(a, torch.Tensor): 28 | a = torch.tensor(a, device=device, dtype=torch.float32) 29 | # Ensure a is the same shape as b for comparison 30 | if a.dim() == 0: 31 | a = a.expand_as(b) 32 | return torch.where(a == 1.0, b, 1.0 - b) 33 | 34 | def linear_regression(X, y): 35 | beta_hat = torch.linalg.lstsq(X, y).solution 36 | return beta_hat 37 | 38 | def regress_q(s, a, r, alpha, squared=False): 39 | # Added alpha parameter to fix the scope issue 40 | N, T = s.shape 41 | q_weights = [] 42 | V_next = torch.zeros(N, device=device) 43 | for t in reversed(range(T)): 44 | s_t = s[:, t]**2 if squared else s[:, t] 45 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 46 | y = r[:, t] if t == T - 1 else r[:, t] + V_next 47 | w = linear_regression(sa, y) 48 | q_weights.insert(0, w) 49 | # V for next step 50 | V_next = 0 51 | for a_val in [0.0, 1.0]: 52 | s_a = torch.stack([s_t, s_t * a_val, torch.ones(N, device=device)], dim=1) 53 | # Create a tensor of a_val with the same size as s[:, t] 54 | a_val_tensor = torch.full_like(s[:, t], a_val) 55 | # Pass a_val_tensor to eval_dens 56 | prob = eval_dens(s[:, t], a_val_tensor, alpha) 57 | V_next += prob * (s_a @ w).squeeze() 58 | return q_weights 59 | 60 | def regress_mu(s, a, w_, squared=False): 61 | N, T = s.shape 62 | mu_weights = [] 63 | for t in range(T): 64 | s_t = 
s[:, t]**2 if squared else s[:, t] 65 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 66 | y = w_[:, t] 67 | mu_weights.append(linear_regression(sa, y)) 68 | return mu_weights 69 | 70 | def eval_dm(s, q0, alpha, squared=False): 71 | # Added alpha parameter to fix the scope issue 72 | N = s.shape[0] 73 | s0 = s[:, 0]**2 if squared else s[:, 0] 74 | V = torch.zeros(N, device=device) 75 | for a_val in [0.0, 1.0]: 76 | sa = torch.stack([s0, s0 * a_val, torch.ones(N, device=device)], dim=1) 77 | # Create a tensor of a_val with the same size as s[:, 0] 78 | a_val_tensor = torch.full_like(s[:, 0], a_val) 79 | # Pass a_val_tensor to eval_dens 80 | V += eval_dens(s[:, 0], a_val_tensor, alpha) * (sa @ q0).squeeze() 81 | return V.mean() 82 | 83 | def eval_ipw(r, w): 84 | return (r * w).sum(dim=1).mean() 85 | 86 | def eval_mis(mu_weights, s, a, r, squared=False): 87 | total = 0 88 | for t, w in enumerate(mu_weights): 89 | s_t = s[:, t]**2 if squared else s[:, t] 90 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 91 | total += (sa @ w).squeeze() * r[:, t] 92 | return total.mean() 93 | 94 | def eval_dr(q_weights1, q_weights2, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=False): 95 | # Added alpha parameter to fix the scope issue 96 | def compute_half(qw, s, a, r, w): 97 | total = 0 98 | for t in range(s.shape[1]): 99 | s_t = s[:, t]**2 if squared else s[:, t] 100 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 101 | V_t = torch.zeros(len(s), device=device) 102 | for a_val in [0.0, 1.0]: 103 | sa_val = torch.stack([s_t, s_t * a_val, torch.ones(len(s), device=device)], dim=1) 104 | # Create a tensor of a_val with the same size as s[:, t] 105 | a_val_tensor = torch.full_like(s[:, t], a_val) 106 | # Pass a_val_tensor to eval_dens 107 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 108 | V_w = V_t if t == 0 else V_t * w[:, t - 1] 109 | total += (r[:, t] * w[:, t] - (sa @ qw[t]).squeeze() * w[:, t] + V_w).mean() 110 | return total 111 | return (compute_half(q_weights2, s1, a1, r1, w1) + compute_half(q_weights1, s2, a2, r2, w2)) / 2 112 | 113 | def eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=False, squared_mu=False): 114 | # Added alpha parameter to fix the scope issue 115 | def compute_half(qw, mw, s, a, r): 116 | total = 0 117 | for t in range(s.shape[1]): 118 | sq = s[:, t]**2 if squared_q else s[:, t] 119 | sm = s[:, t]**2 if squared_mu else s[:, t] 120 | sa_q = torch.stack([sq, sq * a[:, t], torch.ones(len(s), device=device)], dim=1) 121 | sa_m = torch.stack([sm, sm * a[:, t], torch.ones(len(s), device=device)], dim=1) 122 | pred_mu = (sa_m @ mw[t]).squeeze() 123 | V_t = torch.zeros(len(s), device=device) 124 | for a_val in [0.0, 1.0]: 125 | sq_val = sq 126 | sa_val = torch.stack([sq_val, sq_val * a_val, torch.ones(len(s), device=device)], dim=1) 127 | # Create a tensor of a_val with the same size as s[:, t] 128 | a_val_tensor = torch.full_like(s[:, t], a_val) 129 | # Pass a_val_tensor to eval_dens 130 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 131 | 132 | # # This matches the original toytoy2.py implementation that only considers action 1 133 | # a_val = 1.0 # Only consider action 1 134 | # sa_val = torch.stack([s_t, s_t * a_val, torch.ones(len(s), device=device)], dim=1) 135 | # a_val_tensor = torch.full_like(s[:, t], a_val) 136 | # V_t = eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 137 | 138 
| if t > 0: 139 | sm_prev = s[:, t - 1]**2 if squared_mu else s[:, t - 1] 140 | sa_m_prev = torch.stack([sm_prev, sm_prev * a[:, t - 1], torch.ones(len(s), device=device)], dim=1) 141 | V_t *= (sa_m_prev @ mw[t - 1]).squeeze() 142 | total += (pred_mu * r[:, t] - pred_mu * (sa_q @ qw[t]).squeeze() + V_t).mean() 143 | return total 144 | return (compute_half(q2, mu2, s1, a1, r1) + compute_half(q1, mu1, s2, a2, r2)) / 2 145 | 146 | def run_single_repetition(N, T, beta, alpha): 147 | # Generate trajectories 148 | s = torch.zeros(N, T, device=device) 149 | a = torch.zeros(N, T, device=device) 150 | r = torch.zeros(N, T, device=device) 151 | w = torch.ones(N, T, device=device) 152 | 153 | s[:, 0] = torch.normal(0.5, 0.2, size=(N,), device=device) 154 | p = beta * sigmoid(s[:, 0]) + beta * torch.rand(N, device=device) 155 | a[:, 0] = torch.bernoulli(p) 156 | r[:, 0] = torch.normal(0.9 * s[:, 0] + 0.3 * a[:, 0], 0.2) 157 | 158 | for t in range(1, T): 159 | s[:, t] = torch.normal(0.02 * (t % 2) + s[:, t - 1] - 0.3 * (a[:, t - 1] - 0.5), 0.2) 160 | p = beta * sigmoid(s[:, t]) + beta * torch.rand(N, device=device) 161 | a[:, t] = torch.bernoulli(p) 162 | w[:, t] = eval_dens(s[:, t], a[:, t], alpha) / behav_dens(s[:, t], a[:, t], beta) * w[:, t - 1] 163 | r[:, t] = torch.normal(0.9 * s[:, t] + 0.3 * a[:, t] - 0.02 * (t % 2), 0.2) 164 | 165 | # Split data for cross-fitting 166 | s1, s2 = s.chunk(2) 167 | a1, a2 = a.chunk(2) 168 | r1, r2 = r.chunk(2) 169 | w1, w2 = w.chunk(2) 170 | 171 | # Regression for q-functions and mu-functions 172 | # Pass alpha to all functions that use it 173 | q1 = regress_q(s1, a1, r1, alpha) 174 | q2 = regress_q(s2, a2, r2, alpha) 175 | q1_sq = regress_q(s1, a1, r1, alpha, squared=True) 176 | q2_sq = regress_q(s2, a2, r2, alpha, squared=True) 177 | 178 | mu1 = regress_mu(s1, a1, w1) 179 | mu2 = regress_mu(s2, a2, w2) 180 | mu1_sq = regress_mu(s1, a1, w1, squared=True) 181 | mu2_sq = regress_mu(s2, a2, w2, squared=True) 182 | 183 | # Calculate all estimators 184 | # Pass alpha to all functions that use it 185 | return { 186 | 'ipw': eval_ipw(r, w).item(), 187 | 'dm': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 188 | 'ipw2': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 189 | 'dr': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 190 | 'dr2': eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha).item(), 191 | 'ipw_mis_q': eval_ipw(r, w).item(), 192 | 'dm_mis_q': ((eval_dm(s1, q1_sq[0], alpha, squared=True) + eval_dm(s2, q2_sq[0], alpha, squared=True)) / 2).item(), 193 | 'ipw2_mis_q': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 194 | 'dr_mis_q': eval_dr(q1_sq, q2_sq, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=True).item(), 195 | 'dr2_mis_q': eval_dr2(q1_sq, q2_sq, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=True).item(), 196 | 'ipw_mis_mu': eval_ipw(r, w).item(), 197 | 'dm_mis_mu': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 198 | 'ipw2_mis_mu': ((eval_mis(mu1_sq, s2, a2, r2, squared=True) + eval_mis(mu2_sq, s1, a1, r1, squared=True)) / 2).item(), 199 | 'dr_mis_mu': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 200 | 'dr2_mis_mu': eval_dr2(q1, q2, mu1_sq, mu2_sq, s1, s2, a1, a2, r1, r2, alpha, squared_mu=True).item(), 201 | } 202 | 203 | def main(): 204 | # Parse command line arguments 205 | parser = argparse.ArgumentParser(description='Run GPU-accelerated RL estimators') 206 | parser.add_argument('--N', type=int, default=1500, 
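# run_single_repetition above returns fifteen estimates per repetition: the base block
# ('ipw', 'dm', 'ipw2', 'dr', 'dr2'), a '_mis_q' block that re-evaluates them with
# q-functions fitted on squared states (estimators that do not use q are unchanged), and
# a '_mis_mu' block that does the same with density-ratio models fitted on squared
# states. Each series is saved by main() as estimator_list_<key>_<N>.npz under the array
# name 'a', so a finished run can be inspected along these lines (the file name is only
# an example for N = 1500):
import numpy as np

dr2_draws = np.load("estimator_list_dr2_1500.npz")["a"]
print(dr2_draws.mean(), dr2_draws.std())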
help='Number of trajectories') 207 | parser.add_argument('--T', type=int, default=30, help='Time horizon') 208 | parser.add_argument('--beta', type=float, default=0.2, help='Behavior policy parameter') 209 | parser.add_argument('--alpha', type=float, default=0.9, help='Evaluation policy parameter') 210 | parser.add_argument('--reps', type=int, default=1500, help='Number of repetitions') 211 | 212 | args = parser.parse_args() 213 | 214 | print(f"Running with parameters: N={args.N}, T={args.T}, beta={args.beta}, alpha={args.alpha}, reps={args.reps}") 215 | 216 | # For very first repetition, get keys from a small sample run 217 | # This prevents out-of-memory errors when N is large 218 | sample_N = min(100, args.N) 219 | sample_res = run_single_repetition(sample_N, args.T, args.beta, args.alpha) 220 | results = {k: [] for k in sample_res.keys()} 221 | 222 | # Set seeds for reproducibility 223 | torch.manual_seed(42) 224 | np.random.seed(42) 225 | if torch.cuda.is_available(): 226 | torch.cuda.manual_seed_all(42) 227 | 228 | # Run all repetitions 229 | start_time = time.time() 230 | for i in range(args.reps): 231 | rep_start_time = time.time() 232 | print(f"\nStarting repetition {i+1}/{args.reps}") 233 | 234 | # Run repetition 235 | try: 236 | res = run_single_repetition(args.N, args.T, args.beta, args.alpha) 237 | # Collect results 238 | for k in results: 239 | results[k].append(res[k]) 240 | 241 | # Clear GPU memory 242 | if torch.cuda.is_available(): 243 | torch.cuda.empty_cache() 244 | 245 | rep_time = time.time() - rep_start_time 246 | print(f"Repetition {i+1} completed in {rep_time:.2f} seconds") 247 | 248 | # Periodically save results 249 | if (i+1) % 10 == 0 or i == args.reps-1: 250 | for k in results: 251 | np.savez(f"estimator_list_{k}_{args.N}", a=np.array(results[k])) 252 | print(f"Saved checkpoint after {i+1} repetitions") 253 | 254 | except Exception as e: 255 | print(f"Error in repetition {i+1}: {e}") 256 | # Try to continue with next repetition 257 | 258 | # Save final results 259 | for k in results: 260 | np.savez(f"estimator_list_{k}_{args.N}", a=np.array(results[k])) 261 | 262 | total_time = time.time() - start_time 263 | print(f"All {args.reps} repetitions completed in {total_time:.2f} seconds") 264 | 265 | if __name__ == '__main__': 266 | import sys 267 | main() -------------------------------------------------------------------------------- /exp5_1_py3/toytoy2_par_table_learner.py: -------------------------------------------------------------------------------- 1 | # Full GPU-Accelerated Simulation with 15 Estimators Using PyTorch 2 | import torch 3 | import numpy as np 4 | import time 5 | import argparse 6 | import torch.nn as nn 7 | from sklearn.ensemble import RandomForestRegressor 8 | from torch.cuda.amp import autocast, GradScaler 9 | from sklearn.ensemble import ExtraTreesRegressor 10 | from cuml.ensemble import RandomForestRegressor as cuRF 11 | import cuml 12 | import cupy as cp 13 | 14 | 15 | 16 | # Use GPU if available 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | print(f"Using device: {device}") 19 | 20 | class SmallMLP(nn.Module): 21 | def __init__(self, input_dim=3, hidden_dim=32): 22 | super().__init__() 23 | self.model = nn.Sequential( 24 | nn.Linear(input_dim, hidden_dim), 25 | nn.ReLU(), 26 | nn.Linear(hidden_dim, 1) 27 | ) 28 | 29 | def forward(self, x): 30 | return self.model(x) 31 | 32 | def fit_mlp(X, y, epochs=10, lr=1e-2): 33 | model = SmallMLP(X.shape[1]).to(device) 34 | optimizer = 
torch.optim.Adam(model.parameters(), lr=lr) 35 | loss_fn = nn.MSELoss() 36 | for _ in range(epochs): 37 | model.train() 38 | pred = model(X).squeeze() 39 | loss = loss_fn(pred, y) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | return model 44 | 45 | def sigmoid(x): 46 | return 1.0 / (1.0 + torch.exp(-0.1 * x)) 47 | 48 | def behav_dens(s, a, beta): 49 | b = beta * sigmoid(s) + beta * 0.5 50 | # Convert a_val to tensor if it's a scalar 51 | if not isinstance(a, torch.Tensor): 52 | a = torch.tensor(a, device=device, dtype=torch.float32) 53 | # Ensure a is the same shape as b for comparison 54 | if a.dim() == 0: 55 | a = a.expand_as(b) 56 | return torch.where(a == 1.0, b, 1.0 - b) 57 | 58 | def eval_dens(s, a, alpha): 59 | b = alpha * sigmoid(s) + (1 - alpha) * 0.5 60 | # Convert a_val to tensor if it's a scalar 61 | if not isinstance(a, torch.Tensor): 62 | a = torch.tensor(a, device=device, dtype=torch.float32) 63 | # Ensure a is the same shape as b for comparison 64 | if a.dim() == 0: 65 | a = a.expand_as(b) 66 | return torch.where(a == 1.0, b, 1.0 - b) 67 | 68 | def linear_regression(X, y): 69 | beta_hat = torch.linalg.lstsq(X, y).solution 70 | return beta_hat 71 | 72 | def regress_q(s, a, r, alpha, squared=False): 73 | # Added alpha parameter to fix the scope issue 74 | N, T = s.shape 75 | q_weights = [] 76 | V_next = torch.zeros(N, device=device) 77 | for t in reversed(range(T)): 78 | s_t = s[:, t]**2 if squared else s[:, t] 79 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 80 | y = r[:, t] if t == T - 1 else r[:, t] + V_next 81 | w = linear_regression(sa, y) 82 | q_weights.insert(0, w) 83 | # V for next step 84 | V_next = 0 85 | for a_val in [0.0, 1.0]: 86 | s_a = torch.stack([s_t, s_t * a_val, torch.ones(N, device=device)], dim=1) 87 | # Create a tensor of a_val with the same size as s[:, t] 88 | a_val_tensor = torch.full_like(s[:, t], a_val) 89 | # Pass a_val_tensor to eval_dens 90 | prob = eval_dens(s[:, t], a_val_tensor, alpha) 91 | V_next += prob * (s_a @ w).squeeze() 92 | return q_weights 93 | 94 | def regress_mu(s, a, w_, squared=False, method='linear'): 95 | N, T = s.shape 96 | mu_models = [] 97 | for t in range(T): 98 | s_t = s[:, t]**2 if squared else s[:, t] 99 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 100 | y = w_[:, t] 101 | 102 | if method == 'linear': 103 | mu_models.append(linear_regression(sa, y)) 104 | elif method == 'mlp': 105 | mu_models.append(fit_mlp(sa.float(), y.float())) 106 | elif method == 'rf': 107 | sa_cp = cp.asarray(sa) # Convert torch.Tensor to CuPy array 108 | y_cp = cp.asarray(y) 109 | 110 | rf = cuRF( 111 | n_estimators=10, # Low for speed; increase if accuracy is poor 112 | max_depth=4, # Slightly deeper trees than depth=3 to capture interactions 113 | min_samples_split=10, # Avoid overfitting on small batches 114 | min_samples_leaf=5, # Avoid overly deep trees with tiny leaves 115 | max_features=1.0, # Try all features since you only have 3 — avoid randomness 116 | n_streams=8 # Parallel GPU streams (leave as-is for A100) 117 | ) 118 | 119 | rf.fit(sa_cp, y_cp) 120 | mu_models.append(rf) 121 | 122 | else: 123 | raise ValueError("Unknown method for mu regression") 124 | 125 | return mu_models 126 | 127 | def eval_dm(s, q0, alpha, squared=False): 128 | # Added alpha parameter to fix the scope issue 129 | N = s.shape[0] 130 | s0 = s[:, 0]**2 if squared else s[:, 0] 131 | V = torch.zeros(N, device=device) 132 | for a_val in [0.0, 1.0]: 133 | sa = 
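# regress_mu above fits one density-ratio model per time step on the features
# [s_t, s_t * a_t, 1] with target w_t; with method='rf' it does this on the GPU via
# cuml.ensemble.RandomForestRegressor on CuPy arrays. A rough CPU-only sketch of a
# single time step, with scikit-learn standing in for cuML (an illustrative substitute,
# not the code path used above):
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def fit_mu_step_cpu(s_t, a_t, w_t):
    sa = np.column_stack([s_t, s_t * a_t, np.ones_like(s_t)])  # same feature map as above
    rf = RandomForestRegressor(n_estimators=10, max_depth=4,
                               min_samples_split=10, min_samples_leaf=5)
    rf.fit(sa, w_t)
    return rf  # queried later with rf.predict(...), as in eval_mis / eval_dr2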
torch.stack([s0, s0 * a_val, torch.ones(N, device=device)], dim=1) 134 | # Create a tensor of a_val with the same size as s[:, 0] 135 | a_val_tensor = torch.full_like(s[:, 0], a_val) 136 | # Pass a_val_tensor to eval_dens 137 | V += eval_dens(s[:, 0], a_val_tensor, alpha) * (sa @ q0).squeeze() 138 | return V.mean() 139 | 140 | def eval_ipw(r, w): 141 | return (r * w).sum(dim=1).mean() 142 | 143 | def eval_mis(mu_weights, s, a, r, squared=False): 144 | total = 0 145 | for t, w in enumerate(mu_weights): 146 | s_t = s[:, t]**2 if squared else s[:, t] 147 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 148 | 149 | if isinstance(w, torch.Tensor): 150 | pred = (sa @ w).squeeze() 151 | elif isinstance(w, nn.Module): 152 | pred = w(sa).squeeze() 153 | else: 154 | pred = torch.tensor(w.predict(sa.cpu().numpy()), device=device) 155 | total += pred * r[:, t] 156 | # total += (sa @ w).squeeze() * r[:, t] 157 | return total.mean() 158 | 159 | def eval_dr(q_weights1, q_weights2, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=False): 160 | # Added alpha parameter to fix the scope issue 161 | def compute_half(qw, s, a, r, w): 162 | total = 0 163 | for t in range(s.shape[1]): 164 | s_t = s[:, t]**2 if squared else s[:, t] 165 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 166 | V_t = torch.zeros(len(s), device=device) 167 | for a_val in [0.0, 1.0]: 168 | sa_val = torch.stack([s_t, s_t * a_val, torch.ones(len(s), device=device)], dim=1) 169 | # Create a tensor of a_val with the same size as s[:, t] 170 | a_val_tensor = torch.full_like(s[:, t], a_val) 171 | # Pass a_val_tensor to eval_dens 172 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 173 | V_w = V_t if t == 0 else V_t * w[:, t - 1] 174 | total += (r[:, t] * w[:, t] - (sa @ qw[t]).squeeze() * w[:, t] + V_w).mean() 175 | return total 176 | return (compute_half(q_weights2, s1, a1, r1, w1) + compute_half(q_weights1, s2, a2, r2, w2)) / 2 177 | 178 | def eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=False, squared_mu=False): 179 | def compute_half(qw, mw, s, a, r): 180 | total = 0 181 | for t in range(s.shape[1]): 182 | sq = s[:, t]**2 if squared_q else s[:, t] 183 | sm = s[:, t]**2 if squared_mu else s[:, t] 184 | sa_q = torch.stack([sq, sq * a[:, t], torch.ones(len(s), device=device)], dim=1) 185 | sa_m = torch.stack([sm, sm * a[:, t], torch.ones(len(s), device=device)], dim=1) 186 | 187 | # Handle different model types for pred_mu 188 | if isinstance(mw[t], torch.Tensor): 189 | pred_mu = (sa_m @ mw[t]).squeeze() 190 | elif isinstance(mw[t], nn.Module): 191 | pred_mu = mw[t](sa_m).squeeze() 192 | else: 193 | pred_mu = torch.tensor(mw[t].predict(sa_m.cpu().numpy()), device=device) 194 | 195 | V_t = torch.zeros(len(s), device=device) 196 | for a_val in [0.0, 1.0]: 197 | sq_val = sq 198 | sa_val = torch.stack([sq_val, sq_val * a_val, torch.ones(len(s), device=device)], dim=1) 199 | a_val_tensor = torch.full_like(s[:, t], a_val) 200 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 201 | 202 | if t > 0: 203 | sm_prev = s[:, t - 1]**2 if squared_mu else s[:, t - 1] 204 | sa_m_prev = torch.stack([sm_prev, sm_prev * a[:, t - 1], torch.ones(len(s), device=device)], dim=1) 205 | 206 | # Handle different model types for the previous mu weights 207 | if isinstance(mw[t-1], torch.Tensor): 208 | pred_mu_prev = (sa_m_prev @ mw[t-1]).squeeze() 209 | elif isinstance(mw[t-1], nn.Module): 210 | pred_mu_prev = 
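# eval_mis above and this function both accept mu-models of three kinds (lstsq weight
# tensors, torch nn.Modules, or fitted forest regressors) and branch on the type before
# predicting. The same dispatch, factored into a small helper for clarity (a sketch
# only; the original code inlines these branches rather than calling a helper like this):
import torch
import torch.nn as nn

def predict_mu(model, sa):
    if isinstance(model, torch.Tensor):        # least-squares weights: features @ weights
        return (sa @ model).squeeze()
    if isinstance(model, nn.Module):           # SmallMLP
        return model(sa).squeeze()
    # anything else is assumed to expose .predict() on NumPy input (e.g. the cuRF model)
    return torch.tensor(model.predict(sa.cpu().numpy()), device=sa.device)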
mw[t-1](sa_m_prev).squeeze() 211 | else: 212 | pred_mu_prev = torch.tensor(mw[t-1].predict(sa_m_prev.cpu().numpy()), device=device) 213 | 214 | V_t *= pred_mu_prev 215 | 216 | total += (pred_mu * r[:, t] - pred_mu * (sa_q @ qw[t]).squeeze() + V_t).mean() 217 | return total 218 | 219 | return (compute_half(q2, mu2, s1, a1, r1) + compute_half(q1, mu1, s2, a2, r2)) / 2 220 | 221 | def run_single_repetition(N, T, beta, alpha, method='linear'): 222 | # Generate trajectories 223 | s = torch.zeros(N, T, device=device) 224 | a = torch.zeros(N, T, device=device) 225 | r = torch.zeros(N, T, device=device) 226 | w = torch.ones(N, T, device=device) 227 | 228 | s[:, 0] = torch.normal(0.5, 0.2, size=(N,), device=device) 229 | p = beta * sigmoid(s[:, 0]) + beta * torch.rand(N, device=device) 230 | a[:, 0] = torch.bernoulli(p) 231 | r[:, 0] = torch.normal(0.9 * s[:, 0] + 0.3 * a[:, 0], 0.2) 232 | 233 | for t in range(1, T): 234 | s[:, t] = torch.normal(0.02 * (t % 2) + s[:, t - 1] - 0.3 * (a[:, t - 1] - 0.5), 0.2) 235 | p = beta * sigmoid(s[:, t]) + beta * torch.rand(N, device=device) 236 | a[:, t] = torch.bernoulli(p) 237 | w[:, t] = eval_dens(s[:, t], a[:, t], alpha) / behav_dens(s[:, t], a[:, t], beta) * w[:, t - 1] 238 | r[:, t] = torch.normal(0.9 * s[:, t] + 0.3 * a[:, t] - 0.02 * (t % 2), 0.2) 239 | 240 | # Split data for cross-fitting 241 | s1, s2 = s.chunk(2) 242 | a1, a2 = a.chunk(2) 243 | r1, r2 = r.chunk(2) 244 | w1, w2 = w.chunk(2) 245 | 246 | # Regression for q-functions and mu-functions 247 | # Pass alpha to all functions that use it 248 | q1 = regress_q(s1, a1, r1, alpha) 249 | q2 = regress_q(s2, a2, r2, alpha) 250 | q1_sq = regress_q(s1, a1, r1, alpha, squared=True) 251 | q2_sq = regress_q(s2, a2, r2, alpha, squared=True) 252 | 253 | mu1 = regress_mu(s1, a1, w1, method=method) 254 | mu2 = regress_mu(s2, a2, w2, method=method) 255 | mu1_sq = regress_mu(s1, a1, w1, squared=True, method=method) 256 | mu2_sq = regress_mu(s2, a2, w2, squared=True, method=method) 257 | 258 | # Calculate all estimators 259 | # Pass alpha to all functions that use it 260 | return { 261 | 'ipw': eval_ipw(r, w).item(), 262 | 'dm': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 263 | 'ipw2': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 264 | 'dr': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 265 | 'dr2': eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha).item(), 266 | 'ipw_mis_q': eval_ipw(r, w).item(), 267 | 'dm_mis_q': ((eval_dm(s1, q1_sq[0], alpha, squared=True) + eval_dm(s2, q2_sq[0], alpha, squared=True)) / 2).item(), 268 | 'ipw2_mis_q': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 269 | 'dr_mis_q': eval_dr(q1_sq, q2_sq, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=True).item(), 270 | 'dr2_mis_q': eval_dr2(q1_sq, q2_sq, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=True).item(), 271 | 'ipw_mis_mu': eval_ipw(r, w).item(), 272 | 'dm_mis_mu': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 273 | 'ipw2_mis_mu': ((eval_mis(mu1_sq, s2, a2, r2, squared=True) + eval_mis(mu2_sq, s1, a1, r1, squared=True)) / 2).item(), 274 | 'dr_mis_mu': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 275 | 'dr2_mis_mu': eval_dr2(q1, q2, mu1_sq, mu2_sq, s1, s2, a1, a2, r1, r2, alpha, squared_mu=True).item(), 276 | } 277 | 278 | def main(): 279 | parser = argparse.ArgumentParser(description='Run GPU-accelerated RL estimators') 280 | parser.add_argument('--N', type=int, default=1500, help='Number of 
trajectories') 281 | parser.add_argument('--T', type=int, default=30, help='Time horizon') 282 | parser.add_argument('--beta', type=float, default=0.2, help='Behavior policy parameter') 283 | parser.add_argument('--alpha', type=float, default=0.9, help='Evaluation policy parameter') 284 | parser.add_argument('--reps', type=int, default=1500, help='Number of repetitions') 285 | parser.add_argument('--mu_method', type=str, default='linear', choices=['linear', 'mlp', 'rf'], help='W-function learner') 286 | args = parser.parse_args() 287 | 288 | print(f"Running with parameters: N={args.N}, T={args.T}, beta={args.beta}, alpha={args.alpha}, reps={args.reps}, mu_method={args.mu_method}") 289 | 290 | sample_N = min(100, args.N) 291 | sample_res = run_single_repetition(sample_N, args.T, args.beta, args.alpha, args.mu_method) 292 | results = {k: [] for k in sample_res.keys()} 293 | 294 | torch.manual_seed(42) 295 | np.random.seed(42) 296 | if torch.cuda.is_available(): 297 | torch.cuda.manual_seed_all(42) 298 | 299 | start_time = time.time() 300 | for i in range(args.reps): 301 | rep_start_time = time.time() 302 | print(f"\nStarting repetition {i+1}/{args.reps}") 303 | try: 304 | res = run_single_repetition(args.N, args.T, args.beta, args.alpha, args.mu_method) 305 | for k in results: 306 | results[k].append(res[k]) 307 | if torch.cuda.is_available(): 308 | torch.cuda.empty_cache() 309 | rep_time = time.time() - rep_start_time 310 | print(f"Repetition {i+1} completed in {rep_time:.2f} seconds") 311 | if (i+1) % 10 == 0 or i == args.reps-1: 312 | for k in results: 313 | np.savez(f"estimator_list_{k}_{args.mu_method}_{args.N}", a=np.array(results[k])) 314 | print(f"Saved checkpoint after {i+1} repetitions") 315 | except Exception as e: 316 | print(f"Error in repetition {i+1}: {e}") 317 | for k in results: 318 | np.savez(f"estimator_list_{k}_{args.mu_method}_{args.N}", a=np.array(results[k])) 319 | total_time = time.time() - start_time 320 | print(f"All {args.reps} repetitions completed in {total_time:.2f} seconds") 321 | 322 | if __name__ == '__main__': 323 | import sys 324 | main() -------------------------------------------------------------------------------- /exp5_2/crif_walking_ope.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import itertools 3 | import numpy as np 4 | import sys 5 | 6 | 7 | if "../" not in sys.path: 8 | sys.path.append("../") 9 | 10 | from collections import defaultdict 11 | from lib.envs.cliff_walking import CliffWalkingEnv 12 | from lib.envs.windy_gridworld import WindyGridworldEnv 13 | 14 | from scipy.optimize import minimize, rosen, rosen_der 15 | from scipy.optimize import Bounds 16 | 17 | bounds = Bounds([-0.1,-0.1],[0.1,0.1]) 18 | 19 | env = CliffWalkingEnv() 20 | 21 | def make_epsilon_greedy_policy(Q, epsilon, nA): 22 | 23 | def policy_fn(observation): 24 | A = np.ones(nA, dtype=float) * epsilon / nA 25 | best_action = np.argmax(Q[observation]) 26 | A[best_action] += (1.0 - epsilon) 27 | return A 28 | return policy_fn 29 | 30 | Q_space = np.load("Q-table-cliff.npz")["xxx"] 31 | Q_space2 = np.load("Q-table-cliff.npz")["xxx"] 32 | 33 | prob1 = [1.0 for i in range((env.nA))] 34 | prob1 = prob1/np.sum(prob1) 35 | 36 | 37 | betabeta = 0.8 38 | def sample_policy(observation,alpha=0.9): 39 | prob2 = alpha*Q_space[observation,:] +(1-alpha)*prob1 40 | return np.random.choice(env.nA,1,p=prob2)[0] 41 | 42 | 43 | def behavior_policy(observation,beta=betabeta): 44 | prob2 = beta*Q_space[observation,:]+ (1-beta)*prob1 45 | 
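# Both policies mix the stored table Q_space (each row is used as a probability
# distribution over the env.nA actions, since np.random.choice requires the mixture to
# sum to 1) with the uniform distribution prob1; only the mixing weight differs:
# alpha = 0.9 for the evaluation policy and beta = betabeta = 0.8 for the behaviour
# policy. A small self-contained sketch with a made-up 4-action row standing in for one
# row of Q_space:
import numpy as np

row = np.array([0.7, 0.1, 0.1, 0.1])        # hypothetical Q_space[s, :]
uniform = np.ones(4) / 4
target_probs = 0.9 * row + 0.1 * uniform    # as in target_dense(s)
behav_probs = 0.8 * row + 0.2 * uniform     # as in behav_dense(s)
ratios = target_probs / behav_probs         # per-step importance ratios
assert np.isclose(target_probs.sum(), 1.0) and np.isclose(behav_probs.sum(), 1.0)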
return np.random.choice(env.nA,1,p=prob2)[0] 46 | 47 | 48 | def target_dense(observation,alpha=0.9): 49 | prob2 = alpha*Q_space[observation,:]+ (1-alpha)*prob1 50 | return prob2 51 | 52 | def behav_dense(observation,beta=betabeta): 53 | prob2 = beta*Q_space[observation,:] + (1-beta)*prob1 54 | return prob2 55 | 56 | def sarsa2(env,policy, policy2,num_episodes, discount_factor=1.0,Q_space2=Q_space2, alpha= 0.6, epsilon=0.03): 57 | 58 | Q = np.copy(Q_space2) 59 | episode_episode = [] 60 | 61 | for i_episode in range(num_episodes): 62 | 63 | if (i_episode + 1) % 200 == 0: 64 | 65 | sys.stdout.flush() 66 | 67 | state = env.reset() 68 | action = policy2(state) 69 | 70 | episode = [] 71 | 72 | for t in itertools.count(): 73 | # Take a step 74 | next_state, reward, done, _ = env.step(action) 75 | episode.append((state, action, reward)) 76 | # Pick the next action 77 | next_action= policy2(next_state) 78 | 79 | # TD Update 80 | td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state)) 81 | td_delta = td_target - Q[state,action] 82 | Q[state,action] += alpha * td_delta 83 | 84 | if done: 85 | break 86 | 87 | action = next_action 88 | state = next_state 89 | episode_episode.append(episode) 90 | 91 | return Q, episode_episode 92 | 93 | bounds = Bounds([-0.2,-0.2],[0.2,0.2]) 94 | def sigmoid(x, derivative=False): 95 | return x*(1-x) if derivative else 1/(1+np.exp(-x)) 96 | 97 | 98 | depth = 1 99 | def mc_prediction(env, policy,policy2, episode_episode, Q_=1.0,num_episodes=100, discount_factor=1.0): 100 | 101 | 102 | returns_sum = defaultdict(float) 103 | returns_count = defaultdict(float) 104 | returns_count2 = defaultdict(float) 105 | 106 | predic_list = [] 107 | predic_list2 = [] 108 | predic_list3 = [] 109 | predic_list22 = [] 110 | predic_list4 = [] 111 | predic_list5 = np.ones(num_episodes) 112 | auxiauxi = [] 113 | epiepi = [] 114 | weight_list = np.zeros([num_episodes,1000]) ### For bounded IPW 115 | weight_list2 = np.zeros([num_episodes,1002]) ### For bounded IPW 116 | weight_list3 = np.zeros([num_episodes,1002]) ### For bounded IPW 117 | marginal_weight = np.zeros([num_episodes,1000]) ### For bounded IPW 118 | marginal_weight_2 = np.zeros([num_episodes,1000]) ### For bounded IPW 119 | auxi_list = np.zeros([num_episodes,1000]) 120 | marginal_auxi_list2 = np.zeros([num_episodes,1000]) 121 | marginal_auxi_list = np.zeros([num_episodes,1000]) 122 | marginal_auxi_list2_2 = np.zeros([num_episodes,1000]) 123 | marginal_auxi_list_2 = np.zeros([num_episodes,1000]) 124 | auxi_list2 = np.zeros([num_episodes,1000]) 125 | reward_list = np.zeros([num_episodes,1000]) 126 | state_list = np.zeros([num_episodes,1000]) 127 | action_list = np.zeros([num_episodes,1000]) 128 | 129 | count_list = np.zeros(1000) 130 | episolode_longe_list = [] 131 | 132 | 133 | for i_episode in range(num_episodes): 134 | 135 | if i_episode % 200 == 0: 136 | 137 | sys.stdout.flush() 138 | episode = episode_episode[i_episode] 139 | 140 | W = 1.0 141 | W_list = [] 142 | episolode_longe_list.append(len(episode)) 143 | 144 | weight_list2[i_episode,0] = 1.0 145 | for t in range(len(episode)): 146 | state, action, reward = episode[t] 147 | reward_list[i_episode,t] = reward 148 | state_list[i_episode,t] = state 149 | action_list[i_episode,t] = action 150 | 151 | W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor 152 | probprob = 0.9*Q_space[state,:] + 0.1*prob1 153 | W_list.append(W) 154 | weight_list[i_episode,t] = W_list[t] 155 | weight_list2[i_episode,t+1] = W_list[t] 156 | 
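# W accumulates the product of per-step ratios, so weight_list[i, t] holds
#   W_t = prod_{k <= t} pi_target(a_k | s_k) / pi_behav(a_k | s_k)
# (times discount_factor at every step), while weight_list3 keeps only the single-step
# ratio that the marginalised estimators recombine later. The same accumulation,
# vectorised for one episode from an assumed array of per-step ratios:
import numpy as np

step_ratios = np.array([1.2, 0.8, 1.5, 0.9])   # hypothetical per-step ratios
cumulative_W = np.cumprod(step_ratios)          # W_0, W_1, W_2, W_3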
weight_list3[i_episode,t] = target_dense(state)[action]/behav_dense(state)[action] 157 | 158 | count_list[t] += 1.0 159 | 160 | if t==0: 161 | auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 162 | else: 163 | auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-W_list[t-1]*np.sum(probprob*Q_[state,:]) 164 | 165 | if t==0: 166 | auxi_list2[i_episode,t] = W_list[t]-1.0 167 | else: 168 | auxi_list2[i_episode,t] = W_list[t]-W_list[t-1] 169 | 170 | print np.max(np.array(episolode_longe_list)) 171 | 172 | 173 | weight_list_mean = np.mean(weight_list,1) 174 | reward_list_mean = np.mean(reward_list,1) 175 | auxi_list_mean = np.mean(auxi_list,1) 176 | auxi_list2_mean = np.mean(auxi_list2,1) 177 | 178 | val = [] 179 | 180 | ##### IPW 181 | for i in range(num_episodes): 182 | predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:])) 183 | 184 | val.append(np.mean(predic_list)) 185 | 186 | #### Marginalized-IPW 187 | 188 | for i in range(num_episodes): 189 | for j in range(episolode_longe_list[i]): 190 | marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])]) 191 | if j==0: 192 | marginal_weight_2[i,j] = weight_list3[i,j] 193 | else: 194 | marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j] 195 | 196 | 197 | for i_episode in range(num_episodes): 198 | for t in range(episolode_longe_list[i_episode]): 199 | state = np.int(state_list[i_episode,t]) 200 | action = np.int(action_list[i_episode,t]) 201 | probprob = 0.9*Q_space[state,:] + 0.1*prob1 202 | if t==0: 203 | marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 204 | marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 205 | auxi_list[i_episode,t] = weight_list[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 206 | else: 207 | marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*(Q_[state,action])-marginal_weight[i_episode,t-1]*np.sum(probprob*(Q_[state,:])) 208 | marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*(Q_[state,action])-marginal_weight_2[i_episode,t-1]*np.sum(probprob*(Q_[state,:])) 209 | auxi_list[i_episode,t] = weight_list[i_episode,t]*(Q_[state,action])-weight_list[i_episode,t-1]*np.sum(probprob*(Q_[state,:])) 210 | 211 | if t==0: 212 | marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]-1.0 213 | marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]-1.0 214 | auxi_list2[i_episode,t] = weight_list[i_episode,t]-1.0 215 | else: 216 | marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]- marginal_weight[i_episode,t-1] 217 | marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]- marginal_weight_2[i_episode,t-1] 218 | auxi_list2[i_episode,t] = weight_list[i_episode,t]-weight_list[i_episode,t-1] 219 | 220 | 221 | for i in range(num_episodes): 222 | predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:])) 223 | 224 | ### marginal ipw2 #### Using action and state 225 | val.append(np.mean(predic_list2)) 226 | 227 | 228 | ### marginal ipw3#### Using only state 229 | for i in range(num_episodes): 230 | predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:])) 231 | 232 | val.append(np.mean(predic_list22)) 233 | 234 | 235 | #### DR 236 | val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list,1))) 237 | 238 | #### marginal DR 1 #### Using action and state 239 | 
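# Every DR value appended here has the same shape: the corresponding IPW term minus the
# mean of the summed control variates, e.g. for the ordinary weights
#   DR = mean_i( sum_t W_{i,t} r_{i,t} ) - mean_i( sum_t [ W_{i,t} Q(s_t, a_t) - W_{i,t-1} sum_a pi(a | s_t) Q(s_t, a) ] ).
# A compact NumPy restatement using the arrays already built above:
import numpy as np

def dr_estimate(weights, rewards, auxi):
    ipw_term = np.mean(np.sum(weights * rewards, axis=1))
    correction = np.mean(np.sum(auxi, axis=1))
    return ipw_term - correction
# dr_estimate(weight_list, reward_list, auxi_list) reproduces the plain DR value just
# appended; swapping in marginal_weight / marginal_auxi_list (or the state-only
# variants) gives the two marginal DR values appended next.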
val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list,1))) 240 | #### marginal DR 2 #### Using only state 241 | val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2,1))) 242 | 243 | 244 | 245 | 246 | return val 247 | 248 | 249 | 250 | 251 | 252 | 253 | is_list = [] 254 | is2_list = [] 255 | is3_list = [] 256 | wis_list = [] 257 | wis2_list = [] 258 | dm_list = [] 259 | dr_list = [] 260 | dr2_list = [] 261 | dr3_list = [] 262 | bdr_list = [] 263 | drs_list = [] 264 | drs2_list = [] 265 | drss_list = [] 266 | mdr_list = [] 267 | mdr_list2 = [] 268 | 269 | sample_size = 1000 270 | sample_size =sample_size/2 271 | for kkk in range(100): 272 | print "epoch",kkk 273 | #### Sample splititng 274 | ### First fold 275 | 276 | predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size) 277 | V_10k_1 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size) 278 | 279 | ### Second fold 280 | predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size) 281 | V_10k_2 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size) 282 | 283 | V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2)) 284 | is_list.append(np.mean(V_10k[0])) 285 | is2_list.append(np.mean(V_10k[1])) 286 | is3_list.append(np.mean(V_10k[2])) 287 | dr_list.append(np.mean(V_10k[3])) 288 | dr2_list.append(np.mean(V_10k[4])) 289 | dr3_list.append(np.mean(V_10k[5])) 290 | probprob = 0.9*Q_space[36,:] + 0.1*prob1 291 | dm_list.append(np.sum(probprob*predicted_Q[36,:])) 292 | np.savez("2estimator_list_ipw_"+str(betabeta)+"_"+str(sample_size),a=is_list) 293 | np.savez("2estimator_list_ipw2_"+str(betabeta)+"_"+str(sample_size), a=is3_list) 294 | np.savez("2estimator_list_dm_"+str(betabeta)+"_"+str(sample_size), a=dm_list) 295 | np.savez("2estimator_list_dr_"+str(betabeta)+"_"+str(sample_size),a=dr_list) 296 | np.savez("2estimator_list_dr2_"+str(betabeta)+"_"+str(sample_size),a=dr3_list) 297 | 298 | 299 | 300 | 301 | true = -42.49 302 | def mse(aaa): 303 | aaa = np.array(aaa) 304 | aaa = aaa[aaa>-100] 305 | return [np.mean((((aaa-true)*(aaa-true)))),np.sqrt(np.var((aaa-true)*(aaa-true)))] 306 | 307 | print np.mean(is_list) 308 | print mse(is_list) 309 | print "wis" 310 | print np.mean(is3_list) 311 | print mse(is3_list) 312 | print "dm" 313 | print np.mean(dm_list) 314 | print mse(dm_list) 315 | print "dr" 316 | print np.mean(dr_list) 317 | print mse(dr_list) 318 | print "dr3" 319 | print np.mean(dr3_list) 320 | print mse(dr3_list) -------------------------------------------------------------------------------- /exp5_2/cw_notebook_ver_splitting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import gym\n", 10 | "import itertools\n", 11 | "import numpy as np\n", 12 | "import sys\n", 13 | "\n", 14 | "\n", 15 | "if \"../\" not in sys.path:\n", 16 | " sys.path.append(\"../\") \n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "from lib.envs.cliff_walking import CliffWalkingEnv\n", 20 | "from lib.envs.windy_gridworld import WindyGridworldEnv\n", 21 | "\n", 22 | "from scipy.optimize import minimize, rosen, rosen_der\n", 23 | "from scipy.optimize import Bounds\n", 24 | "\n", 25 | "bounds = Bounds([-0.1,-0.1],[0.1,0.1])" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 
21, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "env = CliffWalkingEnv()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 22, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 44 | " \n", 45 | " def policy_fn(observation):\n", 46 | " A = np.ones(nA, dtype=float) * epsilon / nA\n", 47 | " best_action = np.argmax(Q[observation])\n", 48 | " A[best_action] += (1.0 - epsilon)\n", 49 | " return A\n", 50 | " return policy_fn" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 26, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "Q_space = np.load(\"Q-table-cliff.npz\")[\"xxx\"]\n", 60 | "Q_space2 = np.load(\"Q-table-cliff.npz\")[\"xxx\"]\n", 61 | "\n", 62 | "prob1 = [1.0 for i in range((env.nA))]\n", 63 | "prob1 = prob1/np.sum(prob1)\n", 64 | "\n", 65 | "\n", 66 | "betabeta = 0.8\n", 67 | "def sample_policy(observation,alpha=0.9):\n", 68 | " prob2 = alpha*Q_space[observation,:] +(1-alpha)*prob1\n", 69 | " return np.random.choice(env.nA,1,p=prob2)[0]\n", 70 | " \n", 71 | " \n", 72 | "def behavior_policy(observation,beta=betabeta):\n", 73 | " prob2 = beta*Q_space[observation,:]+ (1-beta)*prob1\n", 74 | " return np.random.choice(env.nA,1,p=prob2)[0]\n", 75 | " \n", 76 | " \n", 77 | "def target_dense(observation,alpha=0.9):\n", 78 | " prob2 = alpha*Q_space[observation,:]+ (1-alpha)*prob1\n", 79 | " return prob2\n", 80 | "\n", 81 | "def behav_dense(observation,beta=betabeta):\n", 82 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 83 | " return prob2" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 38, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def sarsa2(env,policy, policy2,num_episodes, discount_factor=1.0,Q_space2=Q_space2, alpha= 0.6, epsilon=0.03):\n", 93 | " \n", 94 | " Q = np.copy(Q_space2)\n", 95 | " episode_episode = []\n", 96 | " \n", 97 | " for i_episode in range(num_episodes):\n", 98 | "\n", 99 | " if (i_episode + 1) % 200 == 0:\n", 100 | "\n", 101 | " sys.stdout.flush()\n", 102 | " \n", 103 | " state = env.reset()\n", 104 | " action = policy2(state)\n", 105 | " \n", 106 | " episode = []\n", 107 | " \n", 108 | " for t in itertools.count():\n", 109 | " # Take a step\n", 110 | " next_state, reward, done, _ = env.step(action)\n", 111 | " episode.append((state, action, reward))\n", 112 | " # Pick the next action\n", 113 | " next_action= policy2(next_state)\n", 114 | " \n", 115 | " # TD Update\n", 116 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 117 | " td_delta = td_target - Q[state,action]\n", 118 | " Q[state,action] += alpha * td_delta \n", 119 | " \n", 120 | " if done:\n", 121 | " break\n", 122 | " \n", 123 | " action = next_action\n", 124 | " state = next_state \n", 125 | " episode_episode.append(episode)\n", 126 | " \n", 127 | " return Q, episode_episode" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 39, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "bounds = Bounds([-0.2,-0.2],[0.2,0.2])\n", 137 | "def sigmoid(x, derivative=False):\n", 138 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 139 | "\n", 140 | "\n", 141 | "depth = 1\n", 142 | "def mc_prediction(env, policy,policy2, episode_episode, Q_=1.0,num_episodes=100, discount_factor=1.0):\n", 143 | " \n", 144 | "\n", 145 | " returns_sum = defaultdict(float)\n", 146 | " returns_count = defaultdict(float)\n", 147 | " 
returns_count2 = defaultdict(float)\n", 148 | " \n", 149 | " predic_list = []\n", 150 | " predic_list2 = []\n", 151 | " predic_list3 = []\n", 152 | " predic_list22 = []\n", 153 | " predic_list4 = []\n", 154 | " predic_list5 = np.ones(num_episodes)\n", 155 | " auxiauxi = [] \n", 156 | " epiepi = []\n", 157 | " weight_list = np.zeros([num_episodes,1000]) ### For bounded IPW\n", 158 | " weight_list2 = np.zeros([num_episodes,1002]) ### For bounded IPW\n", 159 | " weight_list3 = np.zeros([num_episodes,1002]) ### For bounded IPW\n", 160 | " marginal_weight = np.zeros([num_episodes,1000]) ### For bounded IPW\n", 161 | " marginal_weight_2 = np.zeros([num_episodes,1000]) ### For bounded IPW\n", 162 | " auxi_list = np.zeros([num_episodes,1000])\n", 163 | " marginal_auxi_list2 = np.zeros([num_episodes,1000])\n", 164 | " marginal_auxi_list = np.zeros([num_episodes,1000])\n", 165 | " marginal_auxi_list2_2 = np.zeros([num_episodes,1000])\n", 166 | " marginal_auxi_list_2 = np.zeros([num_episodes,1000])\n", 167 | " auxi_list2 = np.zeros([num_episodes,1000])\n", 168 | " reward_list = np.zeros([num_episodes,1000])\n", 169 | " state_list = np.zeros([num_episodes,1000])\n", 170 | " action_list = np.zeros([num_episodes,1000])\n", 171 | " \n", 172 | " count_list = np.zeros(1000) \n", 173 | " episolode_longe_list = []\n", 174 | " \n", 175 | "\n", 176 | " for i_episode in range(num_episodes):\n", 177 | " \n", 178 | " if i_episode % 200 == 0:\n", 179 | " \n", 180 | " sys.stdout.flush()\n", 181 | " episode = episode_episode[i_episode]\n", 182 | " \n", 183 | " W = 1.0\n", 184 | " W_list = []\n", 185 | " episolode_longe_list.append(len(episode))\n", 186 | " \n", 187 | " weight_list2[i_episode,0] = 1.0\n", 188 | " for t in range(len(episode)):\n", 189 | " state, action, reward = episode[t]\n", 190 | " reward_list[i_episode,t] = reward\n", 191 | " state_list[i_episode,t] = state\n", 192 | " action_list[i_episode,t] = action\n", 193 | " \n", 194 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 195 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 196 | " W_list.append(W)\n", 197 | " weight_list[i_episode,t] = W_list[t]\n", 198 | " weight_list2[i_episode,t+1] = W_list[t]\n", 199 | " weight_list3[i_episode,t] = target_dense(state)[action]/behav_dense(state)[action]\n", 200 | " \n", 201 | " count_list[t] += 1.0\n", 202 | " \n", 203 | " if t==0:\n", 204 | " auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 205 | " else:\n", 206 | " auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 207 | " \n", 208 | " if t==0:\n", 209 | " auxi_list2[i_episode,t] = W_list[t]-1.0\n", 210 | " else:\n", 211 | " auxi_list2[i_episode,t] = W_list[t]-W_list[t-1]\n", 212 | "\n", 213 | " print np.max(np.array(episolode_longe_list))\n", 214 | " \n", 215 | " \n", 216 | " weight_list_mean = np.mean(weight_list,1)\n", 217 | " reward_list_mean = np.mean(reward_list,1)\n", 218 | " auxi_list_mean = np.mean(auxi_list,1)\n", 219 | " auxi_list2_mean = np.mean(auxi_list2,1)\n", 220 | " \n", 221 | " val = [] \n", 222 | " \n", 223 | " ##### IPW\n", 224 | " for i in range(num_episodes):\n", 225 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:])) \n", 226 | " \n", 227 | " val.append(np.mean(predic_list))\n", 228 | " \n", 229 | " #### Marginalized-IPW \n", 230 | " \n", 231 | " for i in range(num_episodes):\n", 232 | " for j in range(episolode_longe_list[i]):\n", 233 | " marginal_weight[i,j] = 
np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 234 | " if j==0:\n", 235 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 236 | " else:\n", 237 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 238 | " \n", 239 | " \n", 240 | " for i_episode in range(num_episodes):\n", 241 | " for t in range(episolode_longe_list[i_episode]):\n", 242 | " state = np.int(state_list[i_episode,t])\n", 243 | " action = np.int(action_list[i_episode,t])\n", 244 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 245 | " if t==0:\n", 246 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 247 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 248 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 249 | " else:\n", 250 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*(Q_[state,action])-marginal_weight[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 251 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*(Q_[state,action])-marginal_weight_2[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 252 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*(Q_[state,action])-weight_list[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 253 | " \n", 254 | " if t==0:\n", 255 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]-1.0\n", 256 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]-1.0\n", 257 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-1.0\n", 258 | " else:\n", 259 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]- marginal_weight[i_episode,t-1]\n", 260 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]- marginal_weight_2[i_episode,t-1]\n", 261 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-weight_list[i_episode,t-1]\n", 262 | "\n", 263 | " \n", 264 | " for i in range(num_episodes):\n", 265 | " predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:])) \n", 266 | " \n", 267 | " ### marginal ipw2 #### Using action and state \n", 268 | " val.append(np.mean(predic_list2))\n", 269 | " \n", 270 | "\n", 271 | " ### marginal ipw3#### Using only state \n", 272 | " for i in range(num_episodes):\n", 273 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:])) \n", 274 | " \n", 275 | " val.append(np.mean(predic_list22))\n", 276 | " \n", 277 | " \n", 278 | " #### DR\n", 279 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list,1)))\n", 280 | " \n", 281 | " #### marginal DR 1 #### Using action and state \n", 282 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list,1)))\n", 283 | " #### marginal DR 2 #### Using only state \n", 284 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2,1)))\n", 285 | " \n", 286 | " \n", 287 | "\n", 288 | "\n", 289 | " return val\n", 290 | " \n", 291 | " \n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 49, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "0\n", 304 | "135\n", 305 | "204\n", 306 | "1\n", 307 | "202\n", 308 | "179\n", 309 | "2\n", 310 | "205\n", 311 | "153\n", 312 | "3\n", 313 | "149\n", 314 | "176\n", 315 | "4\n", 316 | "212\n", 317 | "217\n", 318 | "5\n", 319 | "231\n", 320 | 
"151\n", 321 | "6\n", 322 | "210\n", 323 | "216\n", 324 | "7\n", 325 | "141\n", 326 | "147\n", 327 | "8\n", 328 | "194\n", 329 | "288\n", 330 | "9\n", 331 | "181\n", 332 | "177\n", 333 | "10\n", 334 | "248\n", 335 | "199\n", 336 | "11\n", 337 | "118\n", 338 | "198\n", 339 | "12\n", 340 | "210\n", 341 | "203\n", 342 | "13\n", 343 | "125\n", 344 | "225\n", 345 | "14\n", 346 | "324\n", 347 | "225\n", 348 | "15\n", 349 | "170\n", 350 | "169\n", 351 | "16\n", 352 | "203\n", 353 | "195\n", 354 | "17\n", 355 | "157\n", 356 | "171\n", 357 | "18\n", 358 | "132\n", 359 | "197\n", 360 | "19\n", 361 | "188\n", 362 | "218\n", 363 | "20\n", 364 | "150\n", 365 | "228\n", 366 | "21\n", 367 | "226\n", 368 | "202\n", 369 | "22\n", 370 | "192\n", 371 | "195\n", 372 | "23\n", 373 | "141\n", 374 | "195\n", 375 | "24\n", 376 | "249\n", 377 | "144\n", 378 | "25\n", 379 | "249\n", 380 | "181\n", 381 | "26\n", 382 | "183\n", 383 | "177\n", 384 | "27\n", 385 | "185\n", 386 | "183\n", 387 | "28\n", 388 | "378\n", 389 | "177\n", 390 | "29\n", 391 | "170\n", 392 | "221\n", 393 | "30\n", 394 | "235\n", 395 | "165\n", 396 | "31\n", 397 | "165\n", 398 | "234\n", 399 | "32\n", 400 | "206\n", 401 | "217\n", 402 | "33\n", 403 | "178\n", 404 | "255\n", 405 | "34\n", 406 | "143\n", 407 | "181\n", 408 | "35\n", 409 | "253\n", 410 | "290\n", 411 | "36\n", 412 | "200\n", 413 | "174\n", 414 | "37\n", 415 | "242\n", 416 | "190\n", 417 | "38\n", 418 | "178\n", 419 | "216\n", 420 | "39\n", 421 | "182\n", 422 | "140\n", 423 | "40\n", 424 | "188\n", 425 | "187\n", 426 | "41\n", 427 | "193\n", 428 | "261\n", 429 | "42\n", 430 | "156\n", 431 | "192\n", 432 | "43\n", 433 | "225\n", 434 | "233\n", 435 | "44\n", 436 | "246\n", 437 | "182\n", 438 | "45\n", 439 | "154\n", 440 | "132\n", 441 | "46\n", 442 | "246\n", 443 | "182\n", 444 | "47\n", 445 | "186\n", 446 | "150\n", 447 | "48\n", 448 | "144\n", 449 | "172\n", 450 | "49\n", 451 | "200\n", 452 | "192\n", 453 | "50\n", 454 | "233\n", 455 | "255\n", 456 | "51\n", 457 | "170\n", 458 | "238\n", 459 | "52\n", 460 | "284\n", 461 | "154\n", 462 | "53\n", 463 | "173\n", 464 | "134\n", 465 | "54\n", 466 | "162\n", 467 | "174\n", 468 | "55\n", 469 | "182\n", 470 | "229\n", 471 | "56\n", 472 | "112\n", 473 | "268\n", 474 | "57\n", 475 | "158\n", 476 | "217\n", 477 | "58\n", 478 | "174\n", 479 | "164\n", 480 | "59\n", 481 | "213\n", 482 | "241\n", 483 | "60\n", 484 | "200\n", 485 | "165\n", 486 | "61\n", 487 | "176\n", 488 | "234\n", 489 | "62\n", 490 | "163\n", 491 | "140\n", 492 | "63\n", 493 | "182\n", 494 | "206\n", 495 | "64\n", 496 | "173\n", 497 | "233\n", 498 | "65\n", 499 | "315\n", 500 | "161\n", 501 | "66\n", 502 | "195\n", 503 | "253\n", 504 | "67\n", 505 | "140\n", 506 | "274\n", 507 | "68\n", 508 | "120\n", 509 | "226\n", 510 | "69\n", 511 | "163\n", 512 | "277\n", 513 | "70\n", 514 | "173\n", 515 | "188\n", 516 | "71\n", 517 | "171\n", 518 | "138\n", 519 | "72\n", 520 | "310\n", 521 | "204\n", 522 | "73\n", 523 | "202\n", 524 | "208\n", 525 | "74\n", 526 | "237\n", 527 | "232\n", 528 | "75\n", 529 | "143\n", 530 | "202\n", 531 | "76\n", 532 | "161\n", 533 | "150\n", 534 | "77\n", 535 | "219\n", 536 | "168\n", 537 | "78\n", 538 | "110\n", 539 | "143\n", 540 | "79\n", 541 | "173\n", 542 | "260\n", 543 | "80\n", 544 | "165\n", 545 | "170\n", 546 | "81\n", 547 | "147\n", 548 | "165\n", 549 | "82\n", 550 | "233\n", 551 | "147\n", 552 | "83\n", 553 | "174\n", 554 | "285\n", 555 | "84\n", 556 | "150\n", 557 | "199\n", 558 | "85\n", 559 | "200\n", 560 | "290\n", 561 | "86\n", 562 | "203\n", 
563 | "368\n", 564 | "87\n", 565 | "184\n", 566 | "138\n", 567 | "88\n", 568 | "190\n", 569 | "204\n", 570 | "89\n", 571 | "129\n", 572 | "182\n", 573 | "90\n", 574 | "198\n", 575 | "178\n", 576 | "91\n", 577 | "154\n", 578 | "190\n", 579 | "92\n", 580 | "192\n", 581 | "146\n", 582 | "93\n", 583 | "190\n", 584 | "190\n", 585 | "94\n", 586 | "189\n", 587 | "177\n", 588 | "95\n", 589 | "200\n", 590 | "138\n", 591 | "96\n", 592 | "175\n", 593 | "152\n", 594 | "97\n", 595 | "152\n", 596 | "153\n", 597 | "98\n", 598 | "157\n", 599 | "178\n", 600 | "99\n", 601 | "138\n", 602 | "277\n" 603 | ] 604 | } 605 | ], 606 | "source": [ 607 | "\n", 608 | "\n", 609 | "is_list = []\n", 610 | "is2_list = []\n", 611 | "is3_list = []\n", 612 | "wis_list = []\n", 613 | "wis2_list = []\n", 614 | "dm_list = []\n", 615 | "dr_list = []\n", 616 | "dr2_list = []\n", 617 | "dr3_list = []\n", 618 | "bdr_list = []\n", 619 | "drs_list = []\n", 620 | "drs2_list = []\n", 621 | "drss_list = []\n", 622 | "mdr_list = []\n", 623 | "mdr_list2 = []\n", 624 | "\n", 625 | "sample_size = 1000\n", 626 | "sample_size =sample_size/2\n", 627 | "for kkk in range(100):\n", 628 | " print kkk\n", 629 | " #### Sample splititng \n", 630 | " ### First fold \n", 631 | " \n", 632 | " predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size)\n", 633 | " V_10k_1 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size)\n", 634 | " \n", 635 | " ### Second fold \n", 636 | " predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size)\n", 637 | " V_10k_2 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size)\n", 638 | " \n", 639 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 640 | " is_list.append(np.mean(V_10k[0]))\n", 641 | " is2_list.append(np.mean(V_10k[1]))\n", 642 | " is3_list.append(np.mean(V_10k[2]))\n", 643 | " dr_list.append(np.mean(V_10k[3]))\n", 644 | " dr2_list.append(np.mean(V_10k[4])) \n", 645 | " dr3_list.append(np.mean(V_10k[5])) \n", 646 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 647 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 648 | " np.savez(\"2estimator_list_ipw_\"+str(betabeta)+\"_\"+str(sample_size),a=is_list)\n", 649 | " np.savez(\"2estimator_list_ipw2_\"+str(betabeta)+\"_\"+str(sample_size), a=is3_list)\n", 650 | " np.savez(\"2estimator_list_dm_\"+str(betabeta)+\"_\"+str(sample_size), a=dm_list)\n", 651 | " np.savez(\"2estimator_list_dr_\"+str(betabeta)+\"_\"+str(sample_size),a=dr_list)\n", 652 | " np.savez(\"2estimator_list_dr2_\"+str(betabeta)+\"_\"+str(sample_size),a=dr3_list)\n", 653 | "\n", 654 | "\n" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 50, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "name": "stdout", 664 | "output_type": "stream", 665 | "text": [ 666 | "-42.6790406415171\n", 667 | "[37.66853850330432, 166.3536098146463]\n", 668 | "wis\n", 669 | "-41.32338904594486\n", 670 | "[29.887080050982032, 134.7261138929551]\n", 671 | "dm\n", 672 | "-39.81945511378478\n", 673 | "[7.275944708877457, 2.0596332012217786]\n", 674 | "dr\n", 675 | "-42.22774103445223\n", 676 | "[0.6716582397271773, 2.367824452822763]\n", 677 | "dr3\n", 678 | "-41.88853674465647\n", 679 | "[0.568131494443443, 0.4208652463004875]\n" 680 | ] 681 | } 682 | ], 683 | "source": [ 684 | "true = -42.49\n", 685 | "def mse(aaa):\n", 686 | " aaa = np.array(aaa)\n", 687 | " aaa = aaa[aaa>-100]\n", 688 | " return 
[np.mean((((aaa-true)*(aaa-true)))),np.sqrt(np.var((aaa-true)*(aaa-true)))]\n", 689 | "\n", 690 | "print np.mean(is_list)\n", 691 | "print mse(is_list)\n", 692 | "print \"wis\"\n", 693 | "print np.mean(is3_list)\n", 694 | "print mse(is3_list)\n", 695 | "print \"dm\"\n", 696 | "print np.mean(dm_list)\n", 697 | "print mse(dm_list)\n", 698 | "print \"dr\"\n", 699 | "print np.mean(dr_list)\n", 700 | "print mse(dr_list)\n", 701 | "print \"dr3\"\n", 702 | "print np.mean(dr3_list)\n", 703 | "print mse(dr3_list)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [] 712 | } 713 | ], 714 | "metadata": { 715 | "kernelspec": { 716 | "display_name": "Python 2", 717 | "language": "python", 718 | "name": "python2" 719 | }, 720 | "language_info": { 721 | "codemirror_mode": { 722 | "name": "ipython", 723 | "version": 2 724 | }, 725 | "file_extension": ".py", 726 | "mimetype": "text/x-python", 727 | "name": "python", 728 | "nbconvert_exporter": "python", 729 | "pygments_lexer": "ipython2", 730 | "version": "2.7.12" 731 | } 732 | }, 733 | "nbformat": 4, 734 | "nbformat_minor": 1 735 | } 736 | -------------------------------------------------------------------------------- /exp5_2_py3/cw_notebook_ver_splitting_p3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "source": [ 6 | "from google.colab import drive\n", 7 | "drive.mount('/content/drive')" 8 | ], 9 | "metadata": { 10 | "colab": { 11 | "base_uri": "https://localhost:8080/" 12 | }, 13 | "id": "L1RBkFLR0Y_Y", 14 | "outputId": "aa21aeea-540c-4ea5-d56a-3c019d7824c1" 15 | }, 16 | "id": "L1RBkFLR0Y_Y", 17 | "execution_count": 4, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "text": [ 23 | "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" 24 | ] 25 | } 26 | ] 27 | }, 28 | { 29 | "metadata": { 30 | "ExecuteTime": { 31 | "end_time": "2025-05-12T20:03:14.713896Z", 32 | "start_time": "2025-05-12T20:03:05.105703Z" 33 | }, 34 | "colab": { 35 | "base_uri": "https://localhost:8080/" 36 | }, 37 | "id": "50f1f594d24d639c", 38 | "outputId": "2389fc60-833f-4595-9b8b-3a910445f19f" 39 | }, 40 | "cell_type": "code", 41 | "source": [ 42 | "pip install gym matplotlib numpy pandas scipy" 43 | ], 44 | "id": "50f1f594d24d639c", 45 | "outputs": [ 46 | { 47 | "output_type": "stream", 48 | "name": "stdout", 49 | "text": [ 50 | "Requirement already satisfied: gym in /usr/local/lib/python3.11/dist-packages (0.25.2)\n", 51 | "Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0)\n", 52 | "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (2.0.2)\n", 53 | "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2)\n", 54 | "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (1.15.3)\n", 55 | "Requirement already satisfied: cloudpickle>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from gym) (3.1.1)\n", 56 | "Requirement already satisfied: gym-notices>=0.0.4 in /usr/local/lib/python3.11/dist-packages (from gym) (0.0.8)\n", 57 | "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.2)\n", 58 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from 
matplotlib) (0.12.1)\n", 59 | "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.57.0)\n", 60 | "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.8)\n", 61 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (24.2)\n", 62 | "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (11.2.1)\n", 63 | "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.3)\n", 64 | "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (2.9.0.post0)\n", 65 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)\n", 66 | "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)\n", 67 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n" 68 | ] 69 | } 70 | ], 71 | "execution_count": 2 72 | }, 73 | { 74 | "cell_type": "code", 75 | "source": [ 76 | "import itertools\n", 77 | "import numpy as np\n", 78 | "import sys\n", 79 | "import gym\n", 80 | "\n", 81 | "# Since lib.envs isn't available, we'll need to define these environments here\n", 82 | "# or use gym environments directly. For now, I'll create simplified versions.\n", 83 | "\n", 84 | "from collections import defaultdict\n", 85 | "\n", 86 | "# Simple CliffWalkingEnv implementation\n", 87 | "class CliffWalkingEnv(gym.Env):\n", 88 | " def __init__(self):\n", 89 | " self.shape = (4, 12)\n", 90 | " self.start_state_index = np.ravel_multi_index((3, 0), self.shape)\n", 91 | " self.goal_state_index = np.ravel_multi_index((3, 11), self.shape)\n", 92 | " self.cliff = list(range(np.ravel_multi_index((3, 1), self.shape),\n", 93 | " np.ravel_multi_index((3, 11), self.shape)))\n", 94 | " self.nS = self.shape[0] * self.shape[1]\n", 95 | " self.nA = 4 # up, right, down, left\n", 96 | "\n", 97 | " # Calculate transition probabilities and rewards\n", 98 | " self.P = {}\n", 99 | " for s in range(self.nS):\n", 100 | " position = np.unravel_index(s, self.shape)\n", 101 | " self.P[s] = {a: [] for a in range(self.nA)}\n", 102 | "\n", 103 | " # Actions: 0=up, 1=right, 2=down, 3=left\n", 104 | " for a in range(self.nA):\n", 105 | " reward = -1.0 # default reward for each move\n", 106 | " next_position = list(position)\n", 107 | " if a == 0:\n", 108 | " next_position[0] = max(position[0] - 1, 0)\n", 109 | " elif a == 1:\n", 110 | " next_position[1] = min(position[1] + 1, self.shape[1] - 1)\n", 111 | " elif a == 2:\n", 112 | " next_position[0] = min(position[0] + 1, self.shape[0] - 1)\n", 113 | " elif a == 3:\n", 114 | " next_position[1] = max(position[1] - 1, 0)\n", 115 | "\n", 116 | " next_state = np.ravel_multi_index(next_position, self.shape)\n", 117 | "\n", 118 | " # Check if we're at the cliff\n", 119 | " if s in self.cliff:\n", 120 | " next_state = self.start_state_index\n", 121 | " reward = -100.0\n", 122 | "\n", 123 | " # Check if we're at the goal\n", 124 | " done = next_state == self.goal_state_index\n", 125 | "\n", 126 | " self.P[s][a] = [(1.0, next_state, reward, done)]\n", 127 | "\n", 128 | " self.observation_space = gym.spaces.Discrete(self.nS)\n", 129 | " self.action_space = gym.spaces.Discrete(self.nA)\n", 130 | "\n", 131 | " 
self.reset()\n", 132 | "\n", 133 | " def step(self, action):\n", 134 | " state, reward, done, _ = self._step(action)\n", 135 | " self.s = state\n", 136 | " return (state, reward, done, {})\n", 137 | "\n", 138 | " def _step(self, action):\n", 139 | " (probs, next_state, reward, done) = self.P[self.s][action][0]\n", 140 | " return (next_state, reward, done, {})\n", 141 | "\n", 142 | " def reset(self):\n", 143 | " self.s = self.start_state_index\n", 144 | " return self.s\n", 145 | "\n", 146 | "# Simple WindyGridworldEnv implementation (not fully used in this code)\n", 147 | "class WindyGridworldEnv(gym.Env):\n", 148 | " def __init__(self):\n", 149 | " self.shape = (7, 10)\n", 150 | " self.nS = self.shape[0] * self.shape[1]\n", 151 | " self.nA = 4 # up, right, down, left\n", 152 | " self.wind = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]\n", 153 | " self.reset()\n", 154 | "\n", 155 | " def step(self, action):\n", 156 | " # Not implemented as it's not used in the main code\n", 157 | " pass\n", 158 | "\n", 159 | " def reset(self):\n", 160 | " self.s = np.ravel_multi_index((3, 0), self.shape)\n", 161 | " return self.s\n", 162 | "\n", 163 | "from scipy.optimize import minimize, rosen, rosen_der\n", 164 | "from scipy.optimize import Bounds\n", 165 | "\n", 166 | "bounds = Bounds([-0.1, -0.1], [0.1, 0.1])\n", 167 | "\n", 168 | "env = CliffWalkingEnv()\n", 169 | "\n", 170 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 171 | " def policy_fn(observation):\n", 172 | " A = np.ones(nA, dtype=float) * epsilon / nA\n", 173 | " best_action = np.argmax(Q[observation])\n", 174 | " A[best_action] += (1.0 - epsilon)\n", 175 | " return A\n", 176 | " return policy_fn\n", 177 | "\n", 178 | "# Update these paths to your actual file locations\n", 179 | "Q_space = np.load(\"/content/drive/MyDrive/DoubleReinforcementLearningMDP-master/Q-table-cliff.npz\")[\"xxx\"]\n", 180 | "Q_space2 = np.load(\"/content/drive/MyDrive/DoubleReinforcementLearningMDP-master/Q-table-cliff.npz\")[\"xxx\"]\n", 181 | "\n", 182 | "prob1 = [1.0 for i in range((env.nA))]\n", 183 | "prob1 = prob1/np.sum(prob1)\n", 184 | "\n", 185 | "betabeta = 0.8\n", 186 | "def sample_policy(observation, alpha=0.9):\n", 187 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 188 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 189 | "\n", 190 | "\n", 191 | "def behavior_policy(observation, beta=betabeta):\n", 192 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 193 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 194 | "\n", 195 | "\n", 196 | "def target_dense(observation, alpha=0.9):\n", 197 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 198 | " return prob2\n", 199 | "\n", 200 | "def behav_dense(observation, beta=betabeta):\n", 201 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 202 | " return prob2\n", 203 | "\n", 204 | "# FIXED: Matching the original notebook exactly\n", 205 | "def sarsa2(env, policy, policy2, num_episodes, discount_factor=1.0, Q_space2=Q_space2, alpha=0.6, epsilon=0.03):\n", 206 | " \"\"\"\n", 207 | " Expected SARSA implementation matching the original notebook\n", 208 | " \"\"\"\n", 209 | " # Initialize Q as a copy of Q_space2 (not zeros)\n", 210 | " Q = np.copy(Q_space2)\n", 211 | " episode_episode = []\n", 212 | "\n", 213 | " for i_episode in range(num_episodes):\n", 214 | " if (i_episode + 1) % 200 == 0:\n", 215 | " sys.stdout.flush()\n", 216 | "\n", 217 | " state = env.reset()\n", 218 | " action = policy2(state)\n", 219 | " episode = []\n", 220 | "\n", 221 | " for t in 
itertools.count():\n", 222 | " # Take a step\n", 223 | " next_state, reward, done, _ = env.step(action)\n", 224 | " episode.append((state, action, reward))\n", 225 | "\n", 226 | " # Pick the next action\n", 227 | " next_action = policy2(next_state)\n", 228 | "\n", 229 | " # TD Update - Expected SARSA without importance sampling\n", 230 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 231 | " td_delta = td_target - Q[state, action]\n", 232 | " Q[state, action] += alpha * td_delta # No importance sampling correction\n", 233 | "\n", 234 | " if done:\n", 235 | " break\n", 236 | "\n", 237 | " action = next_action\n", 238 | " state = next_state\n", 239 | "\n", 240 | " episode_episode.append(episode)\n", 241 | "\n", 242 | " # Return only Q and episode_episode (matching original)\n", 243 | " return Q, episode_episode\n", 244 | "\n", 245 | "bounds = Bounds([-0.2, -0.2], [0.2, 0.2])\n", 246 | "def sigmoid(x, derivative=False):\n", 247 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 248 | "\n", 249 | "depth = 1\n", 250 | "def mc_prediction(env, policy, policy2, episode_episode, Q_=1.0, num_episodes=100, discount_factor=1.0):\n", 251 | " \"\"\"\n", 252 | " Monte Carlo prediction for policy evaluation\n", 253 | " \"\"\"\n", 254 | " returns_sum = defaultdict(float)\n", 255 | " returns_count = defaultdict(float)\n", 256 | " returns_count2 = defaultdict(float)\n", 257 | "\n", 258 | " predic_list = []\n", 259 | " predic_list2 = []\n", 260 | " predic_list3 = []\n", 261 | " predic_list22 = []\n", 262 | " predic_list4 = []\n", 263 | " predic_list5 = np.ones(num_episodes)\n", 264 | " auxiauxi = []\n", 265 | " epiepi = []\n", 266 | " weight_list = np.zeros([num_episodes, 1000])\n", 267 | " weight_list2 = np.zeros([num_episodes, 1002])\n", 268 | " weight_list3 = np.zeros([num_episodes, 1002])\n", 269 | " marginal_weight = np.zeros([num_episodes, 1000])\n", 270 | " marginal_weight_2 = np.zeros([num_episodes, 1000])\n", 271 | " auxi_list = np.zeros([num_episodes, 1000])\n", 272 | " marginal_auxi_list2 = np.zeros([num_episodes, 1000])\n", 273 | " marginal_auxi_list = np.zeros([num_episodes, 1000])\n", 274 | " marginal_auxi_list2_2 = np.zeros([num_episodes, 1000])\n", 275 | " marginal_auxi_list_2 = np.zeros([num_episodes, 1000])\n", 276 | " auxi_list2 = np.zeros([num_episodes, 1000])\n", 277 | " reward_list = np.zeros([num_episodes, 1000])\n", 278 | " state_list = np.zeros([num_episodes, 1000])\n", 279 | " action_list = np.zeros([num_episodes, 1000])\n", 280 | "\n", 281 | " count_list = np.zeros(1000)\n", 282 | " episolode_longe_list = []\n", 283 | "\n", 284 | " for i_episode in range(num_episodes):\n", 285 | " if i_episode % 200 == 0:\n", 286 | " sys.stdout.flush()\n", 287 | "\n", 288 | " episode = episode_episode[i_episode]\n", 289 | "\n", 290 | " W = 1.0\n", 291 | " W_list = []\n", 292 | " episolode_longe_list.append(len(episode))\n", 293 | "\n", 294 | " weight_list2[i_episode, 0] = 1.0\n", 295 | " for t in range(len(episode)):\n", 296 | " state, action, reward = episode[t]\n", 297 | " reward_list[i_episode, t] = reward\n", 298 | " state_list[i_episode, t] = state\n", 299 | " action_list[i_episode, t] = action\n", 300 | "\n", 301 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 302 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 303 | " W_list.append(W)\n", 304 | " weight_list[i_episode, t] = W_list[t]\n", 305 | " weight_list2[i_episode, t+1] = W_list[t]\n", 306 | " weight_list3[i_episode, t] = 
target_dense(state)[action]/behav_dense(state)[action]\n", 307 | "\n", 308 | " count_list[t] += 1.0\n", 309 | "\n", 310 | " if t==0:\n", 311 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 312 | " else:\n", 313 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 314 | "\n", 315 | " if t==0:\n", 316 | " auxi_list2[i_episode, t] = W_list[t]-1.0\n", 317 | " else:\n", 318 | " auxi_list2[i_episode, t] = W_list[t]-W_list[t-1]\n", 319 | "\n", 320 | " print(np.max(np.array(episolode_longe_list)))\n", 321 | "\n", 322 | " weight_list_mean = np.mean(weight_list, 1)\n", 323 | " reward_list_mean = np.mean(reward_list, 1)\n", 324 | " auxi_list_mean = np.mean(auxi_list, 1)\n", 325 | " auxi_list2_mean = np.mean(auxi_list2, 1)\n", 326 | "\n", 327 | " val = []\n", 328 | "\n", 329 | " ##### IPW - Standard Importance Sampling\n", 330 | " for i in range(num_episodes):\n", 331 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:]))\n", 332 | "\n", 333 | " val.append(np.mean(predic_list))\n", 334 | "\n", 335 | " #### Marginalized-IPW\n", 336 | "\n", 337 | " for i in range(num_episodes):\n", 338 | " for j in range(episolode_longe_list[i]):\n", 339 | " marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 340 | " if j==0:\n", 341 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 342 | " else:\n", 343 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 344 | "\n", 345 | "\n", 346 | " for i_episode in range(num_episodes):\n", 347 | " for t in range(episolode_longe_list[i_episode]):\n", 348 | " state = int(state_list[i_episode,t]) # Using int instead of np.int for Python 3\n", 349 | " action = int(action_list[i_episode,t]) # Using int instead of np.int for Python 3\n", 350 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 351 | " if t==0:\n", 352 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 353 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 354 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 355 | " else:\n", 356 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*(Q_[state,action])-marginal_weight[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 357 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*(Q_[state,action])-marginal_weight_2[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 358 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*(Q_[state,action])-weight_list[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 359 | "\n", 360 | " if t==0:\n", 361 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]-1.0\n", 362 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]-1.0\n", 363 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-1.0\n", 364 | " else:\n", 365 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]- marginal_weight[i_episode,t-1]\n", 366 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]- marginal_weight_2[i_episode,t-1]\n", 367 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-weight_list[i_episode,t-1]\n", 368 | "\n", 369 | "\n", 370 | " for i in range(num_episodes):\n", 371 | " 
predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:]))\n", 372 | "\n", 373 | " ### marginal ipw2 #### Using action and state\n", 374 | " val.append(np.mean(predic_list2))\n", 375 | "\n", 376 | "\n", 377 | " ### marginal ipw3#### Using only state\n", 378 | " for i in range(num_episodes):\n", 379 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:]))\n", 380 | "\n", 381 | " val.append(np.mean(predic_list22))\n", 382 | "\n", 383 | "\n", 384 | " #### DR\n", 385 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list,1)))\n", 386 | "\n", 387 | " #### marginal DR 1 #### Using action and state\n", 388 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list,1)))\n", 389 | " #### marginal DR 2 #### Using only state\n", 390 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2,1)))\n", 391 | "\n", 392 | " return val\n", 393 | "\n", 394 | "# Main experiment run - with sample splitting like the original\n", 395 | "is_list = []\n", 396 | "is2_list = []\n", 397 | "is3_list = []\n", 398 | "wis_list = []\n", 399 | "wis2_list = []\n", 400 | "dm_list = []\n", 401 | "dr_list = []\n", 402 | "dr2_list = []\n", 403 | "dr3_list = []\n", 404 | "bdr_list = []\n", 405 | "drs_list = []\n", 406 | "drs2_list = []\n", 407 | "drss_list = []\n", 408 | "mdr_list = []\n", 409 | "mdr_list2 = []\n", 410 | "\n", 411 | "sample_size = 1000\n", 412 | "# In Python 3, integer division requires // instead of /\n", 413 | "sample_size = sample_size // 2\n", 414 | "\n", 415 | "for kkk in range(100):\n", 416 | " print(kkk)\n", 417 | " #### Sample splitting\n", 418 | " ### First fold\n", 419 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 420 | " V_10k_1 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 421 | "\n", 422 | " ### Second fold\n", 423 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 424 | " V_10k_2 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 425 | "\n", 426 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 427 | " is_list.append(np.mean(V_10k[0]))\n", 428 | " is2_list.append(np.mean(V_10k[1]))\n", 429 | " is3_list.append(np.mean(V_10k[2]))\n", 430 | " dr_list.append(np.mean(V_10k[3]))\n", 431 | " dr2_list.append(np.mean(V_10k[4]))\n", 432 | " dr3_list.append(np.mean(V_10k[5]))\n", 433 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 434 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 435 | "\n", 436 | " # Save results periodically\n", 437 | " if (kkk + 1) % 10 == 0:\n", 438 | " np.savez(f\"2estimator_list_ipw_{betabeta}_{sample_size}\", a=np.array(is_list))\n", 439 | " np.savez(f\"2estimator_list_ipw2_{betabeta}_{sample_size}\", a=np.array(is3_list))\n", 440 | " np.savez(f\"2estimator_list_dm_{betabeta}_{sample_size}\", a=np.array(dm_list))\n", 441 | " np.savez(f\"2estimator_list_dr_{betabeta}_{sample_size}\", a=np.array(dr_list))\n", 442 | " np.savez(f\"2estimator_list_dr2_{betabeta}_{sample_size}\", a=np.array(dr3_list))\n", 443 | "\n", 444 | "# Analysis of results\n", 445 | "true = -42.49\n", 446 | "def mse(aaa):\n", 447 | " \"\"\"Calculate the Mean Squared Error correctly for comparison\"\"\"\n", 448 | " aaa = np.array(aaa)\n", 449 | " # Filter extreme values\n", 450 | " aaa = aaa[aaa > -100]\n", 451 | " # Original MSE calculation\n", 452 | " return [np.mean((((aaa-true)*(aaa-true)))), 
np.sqrt(np.var((aaa-true)*(aaa-true)))]\n", 453 | "\n", 454 | "print(\"IPW:\")\n", 455 | "print(f\"Mean: {np.mean(is_list)}\")\n", 456 | "print(f\"MSE: {mse(is_list)}\")\n", 457 | "\n", 458 | "print(\"WIS:\")\n", 459 | "print(f\"Mean: {np.mean(is3_list)}\") # Note: Original used is3_list for WIS\n", 460 | "print(f\"MSE: {mse(is3_list)}\")\n", 461 | "\n", 462 | "print(\"DM:\")\n", 463 | "print(f\"Mean: {np.mean(dm_list)}\")\n", 464 | "print(f\"MSE: {mse(dm_list)}\")\n", 465 | "\n", 466 | "print(\"DR:\")\n", 467 | "print(f\"Mean: {np.mean(dr_list)}\")\n", 468 | "print(f\"MSE: {mse(dr_list)}\")\n", 469 | "\n", 470 | "print(\"DR3:\")\n", 471 | "print(f\"Mean: {np.mean(dr3_list)}\")\n", 472 | "print(f\"MSE: {mse(dr3_list)}\")" 473 | ], 474 | "metadata": { 475 | "colab": { 476 | "base_uri": "https://localhost:8080/" 477 | }, 478 | "id": "tnQ9THg00_XF", 479 | "outputId": "01d3aff0-63b4-42dc-9a9e-04763360bf2e" 480 | }, 481 | "id": "tnQ9THg00_XF", 482 | "execution_count": 7, 483 | "outputs": [ 484 | { 485 | "output_type": "stream", 486 | "name": "stdout", 487 | "text": [ 488 | "0\n", 489 | "282\n", 490 | "307\n", 491 | "1\n", 492 | "215\n", 493 | "249\n", 494 | "2\n", 495 | "215\n", 496 | "287\n", 497 | "3\n", 498 | "313\n", 499 | "221\n", 500 | "4\n", 501 | "183\n", 502 | "282\n", 503 | "5\n", 504 | "201\n", 505 | "237\n", 506 | "6\n", 507 | "196\n", 508 | "250\n", 509 | "7\n", 510 | "220\n", 511 | "164\n", 512 | "8\n", 513 | "307\n", 514 | "289\n", 515 | "9\n", 516 | "269\n", 517 | "444\n", 518 | "10\n", 519 | "219\n", 520 | "226\n", 521 | "11\n", 522 | "216\n", 523 | "271\n", 524 | "12\n", 525 | "246\n", 526 | "235\n", 527 | "13\n", 528 | "214\n", 529 | "220\n", 530 | "14\n", 531 | "207\n", 532 | "216\n", 533 | "15\n", 534 | "224\n", 535 | "252\n", 536 | "16\n", 537 | "241\n", 538 | "237\n", 539 | "17\n", 540 | "260\n", 541 | "246\n", 542 | "18\n", 543 | "246\n", 544 | "194\n", 545 | "19\n", 546 | "335\n", 547 | "265\n", 548 | "20\n", 549 | "259\n", 550 | "212\n", 551 | "21\n", 552 | "225\n", 553 | "321\n", 554 | "22\n", 555 | "215\n", 556 | "222\n", 557 | "23\n", 558 | "190\n", 559 | "233\n", 560 | "24\n", 561 | "241\n", 562 | "340\n", 563 | "25\n", 564 | "217\n", 565 | "212\n", 566 | "26\n", 567 | "274\n", 568 | "223\n", 569 | "27\n", 570 | "235\n", 571 | "231\n", 572 | "28\n", 573 | "201\n", 574 | "224\n", 575 | "29\n", 576 | "172\n", 577 | "215\n", 578 | "30\n", 579 | "195\n", 580 | "247\n", 581 | "31\n", 582 | "209\n", 583 | "299\n", 584 | "32\n", 585 | "221\n", 586 | "215\n", 587 | "33\n", 588 | "270\n", 589 | "219\n", 590 | "34\n", 591 | "245\n", 592 | "232\n", 593 | "35\n", 594 | "231\n", 595 | "224\n", 596 | "36\n", 597 | "243\n", 598 | "212\n", 599 | "37\n", 600 | "243\n", 601 | "273\n", 602 | "38\n", 603 | "271\n", 604 | "202\n", 605 | "39\n", 606 | "309\n", 607 | "184\n", 608 | "40\n", 609 | "230\n", 610 | "247\n", 611 | "41\n", 612 | "265\n", 613 | "172\n", 614 | "42\n", 615 | "227\n", 616 | "252\n", 617 | "43\n", 618 | "190\n", 619 | "207\n", 620 | "44\n", 621 | "253\n", 622 | "316\n", 623 | "45\n", 624 | "214\n", 625 | "283\n", 626 | "46\n", 627 | "263\n", 628 | "195\n", 629 | "47\n", 630 | "236\n", 631 | "208\n", 632 | "48\n", 633 | "301\n", 634 | "329\n", 635 | "49\n", 636 | "200\n", 637 | "266\n", 638 | "50\n", 639 | "267\n", 640 | "264\n", 641 | "51\n", 642 | "297\n", 643 | "216\n", 644 | "52\n", 645 | "273\n", 646 | "206\n", 647 | "53\n", 648 | "314\n", 649 | "247\n", 650 | "54\n", 651 | "241\n", 652 | "227\n", 653 | "55\n", 654 | "192\n", 655 | "276\n", 656 | "56\n", 657 | 
"323\n", 658 | "392\n", 659 | "57\n", 660 | "174\n", 661 | "204\n", 662 | "58\n", 663 | "257\n", 664 | "182\n", 665 | "59\n", 666 | "275\n", 667 | "200\n", 668 | "60\n", 669 | "213\n", 670 | "191\n", 671 | "61\n", 672 | "220\n", 673 | "235\n", 674 | "62\n", 675 | "241\n", 676 | "244\n", 677 | "63\n", 678 | "261\n", 679 | "674\n", 680 | "64\n", 681 | "257\n", 682 | "258\n", 683 | "65\n", 684 | "231\n", 685 | "258\n", 686 | "66\n", 687 | "254\n", 688 | "264\n", 689 | "67\n", 690 | "298\n", 691 | "176\n", 692 | "68\n", 693 | "233\n", 694 | "197\n", 695 | "69\n", 696 | "209\n", 697 | "192\n", 698 | "70\n", 699 | "338\n", 700 | "188\n", 701 | "71\n", 702 | "304\n", 703 | "202\n", 704 | "72\n", 705 | "239\n", 706 | "182\n", 707 | "73\n", 708 | "284\n", 709 | "205\n", 710 | "74\n", 711 | "186\n", 712 | "318\n", 713 | "75\n", 714 | "265\n", 715 | "194\n", 716 | "76\n", 717 | "172\n", 718 | "312\n", 719 | "77\n", 720 | "221\n", 721 | "296\n", 722 | "78\n", 723 | "197\n", 724 | "230\n", 725 | "79\n", 726 | "252\n", 727 | "183\n", 728 | "80\n", 729 | "254\n", 730 | "260\n", 731 | "81\n", 732 | "261\n", 733 | "206\n", 734 | "82\n", 735 | "329\n", 736 | "241\n", 737 | "83\n", 738 | "227\n", 739 | "183\n", 740 | "84\n", 741 | "230\n", 742 | "282\n", 743 | "85\n", 744 | "180\n", 745 | "235\n", 746 | "86\n", 747 | "182\n", 748 | "402\n", 749 | "87\n", 750 | "162\n", 751 | "240\n", 752 | "88\n", 753 | "288\n", 754 | "194\n", 755 | "89\n", 756 | "214\n", 757 | "194\n", 758 | "90\n", 759 | "335\n", 760 | "198\n", 761 | "91\n", 762 | "277\n", 763 | "254\n", 764 | "92\n", 765 | "189\n", 766 | "172\n", 767 | "93\n", 768 | "326\n", 769 | "193\n", 770 | "94\n", 771 | "232\n", 772 | "273\n", 773 | "95\n", 774 | "207\n", 775 | "227\n", 776 | "96\n", 777 | "178\n", 778 | "198\n", 779 | "97\n", 780 | "195\n", 781 | "220\n", 782 | "98\n", 783 | "274\n", 784 | "203\n", 785 | "99\n", 786 | "211\n", 787 | "333\n", 788 | "IPW:\n", 789 | "Mean: -54.75616680808431\n", 790 | "MSE: [np.float64(160.30981785049113), np.float64(84.892121100215)]\n", 791 | "WIS:\n", 792 | "Mean: -53.458516161830985\n", 793 | "MSE: [np.float64(127.63291481174467), np.float64(64.21425268143648)]\n", 794 | "DM:\n", 795 | "Mean: -52.8702278720724\n", 796 | "MSE: [np.float64(107.83228360781379), np.float64(5.982560508806721)]\n", 797 | "DR:\n", 798 | "Mean: -55.37211197782537\n", 799 | "MSE: [np.float64(166.0145781196328), np.float64(6.663141640878914)]\n", 800 | "DR3:\n", 801 | "Mean: -55.261490392640056\n", 802 | "MSE: [np.float64(163.16388325845435), np.float64(5.914647204533788)]\n" 803 | ] 804 | } 805 | ] 806 | }, 807 | { 808 | "metadata": { 809 | "ExecuteTime": { 810 | "end_time": "2025-05-10T18:26:05.270719Z", 811 | "start_time": "2025-05-10T18:18:20.420269Z" 812 | }, 813 | "colab": { 814 | "base_uri": "https://localhost:8080/" 815 | }, 816 | "id": "532a8fd56a713ebe", 817 | "outputId": "3aef18e4-ae52-4d5f-ca36-a2d0ac90f3bf" 818 | }, 819 | "cell_type": "code", 820 | "source": [ 821 | "import itertools\n", 822 | "import numpy as np\n", 823 | "import sys\n", 824 | "import gym\n", 825 | "\n", 826 | "# Since lib.envs isn't available, we'll need to define these environments here\n", 827 | "# or use gym environments directly. 
For now, I'll create simplified versions.\n", 828 | "\n", 829 | "from collections import defaultdict\n", 830 | "\n", 831 | "# Simple CliffWalkingEnv implementation\n", 832 | "class CliffWalkingEnv(gym.Env):\n", 833 | " def __init__(self):\n", 834 | " self.shape = (4, 12)\n", 835 | " self.start_state_index = np.ravel_multi_index((3, 0), self.shape)\n", 836 | " self.goal_state_index = np.ravel_multi_index((3, 11), self.shape)\n", 837 | " self.cliff = list(range(np.ravel_multi_index((3, 1), self.shape),\n", 838 | " np.ravel_multi_index((3, 11), self.shape)))\n", 839 | " self.nS = self.shape[0] * self.shape[1]\n", 840 | " self.nA = 4 # up, right, down, left\n", 841 | "\n", 842 | " # Calculate transition probabilities and rewards\n", 843 | " self.P = {}\n", 844 | " for s in range(self.nS):\n", 845 | " position = np.unravel_index(s, self.shape)\n", 846 | " self.P[s] = {a: [] for a in range(self.nA)}\n", 847 | "\n", 848 | " # Actions: 0=up, 1=right, 2=down, 3=left\n", 849 | " for a in range(self.nA):\n", 850 | " reward = -1.0 # default reward for each move\n", 851 | " next_position = list(position)\n", 852 | " if a == 0:\n", 853 | " next_position[0] = max(position[0] - 1, 0)\n", 854 | " elif a == 1:\n", 855 | " next_position[1] = min(position[1] + 1, self.shape[1] - 1)\n", 856 | " elif a == 2:\n", 857 | " next_position[0] = min(position[0] + 1, self.shape[0] - 1)\n", 858 | " elif a == 3:\n", 859 | " next_position[1] = max(position[1] - 1, 0)\n", 860 | "\n", 861 | " next_state = np.ravel_multi_index(next_position, self.shape)\n", 862 | "\n", 863 | " # Check if we're at the cliff\n", 864 | " if s in self.cliff:\n", 865 | " next_state = self.start_state_index\n", 866 | " reward = -100.0\n", 867 | "\n", 868 | " # Check if we're at the goal\n", 869 | " done = next_state == self.goal_state_index\n", 870 | "\n", 871 | " self.P[s][a] = [(1.0, next_state, reward, done)]\n", 872 | "\n", 873 | " self.observation_space = gym.spaces.Discrete(self.nS)\n", 874 | " self.action_space = gym.spaces.Discrete(self.nA)\n", 875 | "\n", 876 | " self.reset()\n", 877 | "\n", 878 | " def step(self, action):\n", 879 | " state, reward, done, _ = self._step(action)\n", 880 | " self.s = state\n", 881 | " return (state, reward, done, {})\n", 882 | "\n", 883 | " def _step(self, action):\n", 884 | " (probs, next_state, reward, done) = self.P[self.s][action][0]\n", 885 | " return (next_state, reward, done, {})\n", 886 | "\n", 887 | " def reset(self):\n", 888 | " self.s = self.start_state_index\n", 889 | " return self.s\n", 890 | "\n", 891 | "# Simple WindyGridworldEnv implementation (not fully used in this code)\n", 892 | "class WindyGridworldEnv(gym.Env):\n", 893 | " def __init__(self):\n", 894 | " self.shape = (7, 10)\n", 895 | " self.nS = self.shape[0] * self.shape[1]\n", 896 | " self.nA = 4 # up, right, down, left\n", 897 | " self.wind = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]\n", 898 | " self.reset()\n", 899 | "\n", 900 | " def step(self, action):\n", 901 | " # Not implemented as it's not used in the main code\n", 902 | " pass\n", 903 | "\n", 904 | " def reset(self):\n", 905 | " self.s = np.ravel_multi_index((3, 0), self.shape)\n", 906 | " return self.s\n", 907 | "\n", 908 | "from scipy.optimize import minimize, rosen, rosen_der\n", 909 | "from scipy.optimize import Bounds\n", 910 | "\n", 911 | "bounds = Bounds([-0.1, -0.1], [0.1, 0.1])\n", 912 | "\n", 913 | "env = CliffWalkingEnv()\n", 914 | "\n", 915 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 916 | " def policy_fn(observation):\n", 917 | " A = np.ones(nA, dtype=float) * 
epsilon / nA\n", 918 | " best_action = np.argmax(Q[observation])\n", 919 | " A[best_action] += (1.0 - epsilon)\n", 920 | " return A\n", 921 | " return policy_fn\n", 922 | "\n", 923 | "Q_space = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\")[\"xxx\"]\n", 924 | "Q_space2 = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-real-cliff.npz\")[\"xxx\"] #Q-table-cliff.npz\n", 925 | "\n", 926 | "prob1 = [1.0 for i in range((env.nA))]\n", 927 | "prob1 = prob1/np.sum(prob1)\n", 928 | "\n", 929 | "betabeta = 0.8\n", 930 | "def sample_policy(observation, alpha=0.9):\n", 931 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 932 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 933 | "\n", 934 | "\n", 935 | "def behavior_policy(observation, beta=betabeta):\n", 936 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 937 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 938 | "\n", 939 | "\n", 940 | "def target_dense(observation, alpha=0.9):\n", 941 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 942 | " return prob2\n", 943 | "\n", 944 | "def behav_dense(observation, beta=betabeta):\n", 945 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 946 | " return prob2\n", 947 | "\n", 948 | "def sarsa2(env, policy, policy2, num_episodes, discount_factor=1.0, Q_space2=Q_space2, alpha=0.6, epsilon=0.03):\n", 949 | "\n", 950 | " Q = np.copy(Q_space2)\n", 951 | " episode_episode = []\n", 952 | "\n", 953 | " for i_episode in range(num_episodes):\n", 954 | "\n", 955 | " if (i_episode + 1) % 200 == 0:\n", 956 | " sys.stdout.flush()\n", 957 | "\n", 958 | " state = env.reset()\n", 959 | " action = policy2(state)\n", 960 | "\n", 961 | " episode = []\n", 962 | "\n", 963 | " for t in itertools.count():\n", 964 | " # Take a step\n", 965 | " next_state, reward, done, _ = env.step(action)\n", 966 | " episode.append((state, action, reward))\n", 967 | " # Pick the next action\n", 968 | " next_action = policy2(next_state)\n", 969 | "\n", 970 | " # TD Update\n", 971 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 972 | " td_delta = td_target - Q[state, action]\n", 973 | " Q[state, action] += alpha * td_delta\n", 974 | "\n", 975 | " if done:\n", 976 | " break\n", 977 | "\n", 978 | " action = next_action\n", 979 | " state = next_state\n", 980 | "\n", 981 | " episode_episode.append(episode)\n", 982 | "\n", 983 | " return Q, episode_episode\n", 984 | "\n", 985 | "bounds = Bounds([-0.2, -0.2], [0.2, 0.2])\n", 986 | "def sigmoid(x, derivative=False):\n", 987 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 988 | "\n", 989 | "\n", 990 | "depth = 1\n", 991 | "def mc_prediction(env, policy, policy2, episode_episode, Q_=1.0, num_episodes=100, discount_factor=1.0):\n", 992 | "\n", 993 | " returns_sum = defaultdict(float)\n", 994 | " returns_count = defaultdict(float)\n", 995 | " returns_count2 = defaultdict(float)\n", 996 | "\n", 997 | " predic_list = []\n", 998 | " predic_list2 = []\n", 999 | " predic_list3 = []\n", 1000 | " predic_list22 = []\n", 1001 | " predic_list4 = []\n", 1002 | " predic_list5 = np.ones(num_episodes)\n", 1003 | " auxiauxi = []\n", 1004 | " epiepi = []\n", 1005 | " weight_list = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1006 | " weight_list2 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1007 | " weight_list3 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1008 | " marginal_weight = np.zeros([num_episodes, 
1000]) # For bounded IPW\n", 1009 | " marginal_weight_2 = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1010 | " auxi_list = np.zeros([num_episodes, 1000])\n", 1011 | " marginal_auxi_list2 = np.zeros([num_episodes, 1000])\n", 1012 | " marginal_auxi_list = np.zeros([num_episodes, 1000])\n", 1013 | " marginal_auxi_list2_2 = np.zeros([num_episodes, 1000])\n", 1014 | " marginal_auxi_list_2 = np.zeros([num_episodes, 1000])\n", 1015 | " auxi_list2 = np.zeros([num_episodes, 1000])\n", 1016 | " reward_list = np.zeros([num_episodes, 1000])\n", 1017 | " state_list = np.zeros([num_episodes, 1000])\n", 1018 | " action_list = np.zeros([num_episodes, 1000])\n", 1019 | "\n", 1020 | " count_list = np.zeros(1000)\n", 1021 | " episolode_longe_list = []\n", 1022 | "\n", 1023 | "\n", 1024 | " for i_episode in range(num_episodes):\n", 1025 | "\n", 1026 | " if i_episode % 200 == 0:\n", 1027 | " sys.stdout.flush()\n", 1028 | "\n", 1029 | " episode = episode_episode[i_episode]\n", 1030 | "\n", 1031 | " W = 1.0\n", 1032 | " W_list = []\n", 1033 | " episolode_longe_list.append(len(episode))\n", 1034 | "\n", 1035 | " weight_list2[i_episode, 0] = 1.0\n", 1036 | " for t in range(len(episode)):\n", 1037 | " state, action, reward = episode[t]\n", 1038 | " reward_list[i_episode, t] = reward\n", 1039 | " state_list[i_episode, t] = state\n", 1040 | " action_list[i_episode, t] = action\n", 1041 | "\n", 1042 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 1043 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1044 | " W_list.append(W)\n", 1045 | " weight_list[i_episode, t] = W_list[t]\n", 1046 | " weight_list2[i_episode, t+1] = W_list[t]\n", 1047 | " weight_list3[i_episode, t] = target_dense(state)[action]/behav_dense(state)[action]\n", 1048 | "\n", 1049 | " count_list[t] += 1.0\n", 1050 | "\n", 1051 | " if t==0:\n", 1052 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1053 | " else:\n", 1054 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 1055 | "\n", 1056 | " if t==0:\n", 1057 | " auxi_list2[i_episode, t] = W_list[t]-1.0\n", 1058 | " else:\n", 1059 | " auxi_list2[i_episode, t] = W_list[t]-W_list[t-1]\n", 1060 | "\n", 1061 | " print(np.max(np.array(episolode_longe_list)))\n", 1062 | "\n", 1063 | "\n", 1064 | " weight_list_mean = np.mean(weight_list, 1)\n", 1065 | " reward_list_mean = np.mean(reward_list, 1)\n", 1066 | " auxi_list_mean = np.mean(auxi_list, 1)\n", 1067 | " auxi_list2_mean = np.mean(auxi_list2, 1)\n", 1068 | "\n", 1069 | " val = []\n", 1070 | "\n", 1071 | " ##### IPW\n", 1072 | " for i in range(num_episodes):\n", 1073 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:]))\n", 1074 | "\n", 1075 | " val.append(np.mean(predic_list))\n", 1076 | "\n", 1077 | " #### Marginalized-IPW\n", 1078 | "\n", 1079 | " for i in range(num_episodes):\n", 1080 | " for j in range(episolode_longe_list[i]):\n", 1081 | " marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 1082 | " if j==0:\n", 1083 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 1084 | " else:\n", 1085 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 1086 | "\n", 1087 | "\n", 1088 | " for i_episode in range(num_episodes):\n", 1089 | " for t in range(episolode_longe_list[i_episode]):\n", 1090 | " state = int(state_list[i_episode, t]) # Changed np.int to int\n", 1091 | " 
action = int(action_list[i_episode, t]) # Changed np.int to int\n", 1092 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1093 | " if t==0:\n", 1094 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1095 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1096 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1097 | " else:\n", 1098 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*(Q_[state, action])-marginal_weight[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1099 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*(Q_[state, action])-marginal_weight_2[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1100 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*(Q_[state, action])-weight_list[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1101 | "\n", 1102 | " if t==0:\n", 1103 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]-1.0\n", 1104 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]-1.0\n", 1105 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-1.0\n", 1106 | " else:\n", 1107 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]- marginal_weight[i_episode, t-1]\n", 1108 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]- marginal_weight_2[i_episode, t-1]\n", 1109 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-weight_list[i_episode, t-1]\n", 1110 | "\n", 1111 | "\n", 1112 | " for i in range(num_episodes):\n", 1113 | " predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:]))\n", 1114 | "\n", 1115 | " ### marginal ipw2 #### Using action and state\n", 1116 | " val.append(np.mean(predic_list2))\n", 1117 | "\n", 1118 | "\n", 1119 | " ### marginal ipw3#### Using only state\n", 1120 | " for i in range(num_episodes):\n", 1121 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:]))\n", 1122 | "\n", 1123 | " val.append(np.mean(predic_list22))\n", 1124 | "\n", 1125 | "\n", 1126 | " #### DR\n", 1127 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list, 1)))\n", 1128 | "\n", 1129 | " #### marginal DR 1 #### Using action and state\n", 1130 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list, 1)))\n", 1131 | " #### marginal DR 2 #### Using only state\n", 1132 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2, 1)))\n", 1133 | "\n", 1134 | " return val\n", 1135 | "\n", 1136 | "# Main experiment run\n", 1137 | "is_list = []\n", 1138 | "is2_list = []\n", 1139 | "is3_list = []\n", 1140 | "wis_list = []\n", 1141 | "wis2_list = []\n", 1142 | "dm_list = []\n", 1143 | "dr_list = []\n", 1144 | "dr2_list = []\n", 1145 | "dr3_list = []\n", 1146 | "bdr_list = []\n", 1147 | "drs_list = []\n", 1148 | "drs2_list = []\n", 1149 | "drss_list = []\n", 1150 | "mdr_list = []\n", 1151 | "mdr_list2 = []\n", 1152 | "\n", 1153 | "sample_size = 1000\n", 1154 | "sample_size = sample_size // 2 # Integer division in Python 3\n", 1155 | "for kkk in range(100):\n", 1156 | " print(kkk)\n", 1157 | " #### Sample splititng\n", 1158 | " ### First fold\n", 1159 | "\n", 1160 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1161 | " V_10k_1 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1162 | "\n", 
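# --- Editor's annotation (added to this dump; not part of the original cell) ---
# mc_prediction returns one scalar estimate per method, in this order:
#   val[0] trajectory-wise IPW
#   val[1] marginalized IPW conditioning on (state, action)
#   val[2] marginalized IPW conditioning on state only
#   val[3] DR built on val[0]
#   val[4] marginalized DR built on val[1]
#   val[5] marginalized DR built on val[2]
# which is why the loop below stores V_10k[0]..V_10k[5] into is_list, is2_list, is3_list,
# dr_list, dr2_list and dr3_list. Per episode, with discount_factor = 1 as used here,
# the quantities it accumulates are
#   rho_t = target_dense(s_t)[a_t] / behav_dense(s_t)[a_t],   W_t = W_{t-1} * rho_t  (W_{-1} = 1),
#   IPW   = sum_t W_t * r_t,
#   DR    = IPW - sum_t ( W_t * Q_[s_t, a_t] - W_{t-1} * sum_a pi_e(a | s_t) * Q_[s_t, a] ).
# The "marginal" variants replace W_t by weights averaged over the episodes that visit the
# same state (and, for val[1]/val[4], the same action) at step t.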
1163 | " ### Second fold\n", 1164 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1165 | " V_10k_2 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1166 | "\n", 1167 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 1168 | " is_list.append(np.mean(V_10k[0]))\n", 1169 | " is2_list.append(np.mean(V_10k[1]))\n", 1170 | " is3_list.append(np.mean(V_10k[2]))\n", 1171 | " dr_list.append(np.mean(V_10k[3]))\n", 1172 | " dr2_list.append(np.mean(V_10k[4]))\n", 1173 | " dr3_list.append(np.mean(V_10k[5]))\n", 1174 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 1175 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 1176 | " np.savez(\"2estimator_list_ipw_\"+str(betabeta)+\"_\"+str(sample_size), a=is_list)\n", 1177 | " np.savez(\"2estimator_list_ipw2_\"+str(betabeta)+\"_\"+str(sample_size), a=is3_list)\n", 1178 | " np.savez(\"2estimator_list_dm_\"+str(betabeta)+\"_\"+str(sample_size), a=dm_list)\n", 1179 | " np.savez(\"2estimator_list_dr_\"+str(betabeta)+\"_\"+str(sample_size), a=dr_list)\n", 1180 | " np.savez(\"2estimator_list_dr2_\"+str(betabeta)+\"_\"+str(sample_size), a=dr3_list)\n", 1181 | "\n", 1182 | "# Analysis of results\n", 1183 | "true = -42.49\n", 1184 | "\n", 1185 | "# FIX: Properly calculate MSE instead of using hardcoded values\n", 1186 | "def mse(aaa):\n", 1187 | " aaa = np.array(aaa)\n", 1188 | " aaa = aaa[aaa>-100] # Filter extreme values\n", 1189 | " mean_val = np.mean(aaa) # Calculate mean\n", 1190 | " bias = mean_val - true # Calculate bias\n", 1191 | " bias_squared = bias * bias # Square the bias\n", 1192 | " variance = np.var(aaa) # Calculate variance\n", 1193 | " mse_value = bias_squared + variance # MSE = bias² + variance\n", 1194 | " return [mse_value, np.sqrt(np.var((aaa-true)*(aaa-true)))] # Return MSE and RMSE\n", 1195 | "\n", 1196 | "print(np.mean(is_list))\n", 1197 | "print(mse(is_list))\n", 1198 | "print(\"wis\")\n", 1199 | "print(np.mean(is3_list))\n", 1200 | "print(mse(is3_list))\n", 1201 | "print(\"dm\")\n", 1202 | "print(np.mean(dm_list))\n", 1203 | "print(mse(dm_list))\n", 1204 | "print(\"dr\")\n", 1205 | "print(np.mean(dr_list))\n", 1206 | "print(mse(dr_list))\n", 1207 | "print(\"dr3\")\n", 1208 | "print(np.mean(dr3_list))\n", 1209 | "print(mse(dr3_list))" 1210 | ], 1211 | "id": "532a8fd56a713ebe", 1212 | "outputs": [ 1213 | { 1214 | "output_type": "stream", 1215 | "name": "stdout", 1216 | "text": [ 1217 | "0\n", 1218 | "237\n", 1219 | "201\n", 1220 | "1\n", 1221 | "293\n", 1222 | "251\n", 1223 | "2\n", 1224 | "269\n", 1225 | "269\n", 1226 | "3\n", 1227 | "307\n", 1228 | "382\n", 1229 | "4\n", 1230 | "262\n", 1231 | "190\n", 1232 | "5\n", 1233 | "232\n", 1234 | "196\n", 1235 | "6\n", 1236 | "232\n", 1237 | "291\n", 1238 | "7\n", 1239 | "316\n", 1240 | "241\n", 1241 | "8\n", 1242 | "224\n", 1243 | "206\n", 1244 | "9\n", 1245 | "274\n", 1246 | "274\n", 1247 | "10\n", 1248 | "261\n", 1249 | "260\n", 1250 | "11\n", 1251 | "250\n", 1252 | "254\n", 1253 | "12\n", 1254 | "196\n", 1255 | "260\n", 1256 | "13\n", 1257 | "324\n", 1258 | "285\n", 1259 | "14\n", 1260 | "259\n", 1261 | "250\n", 1262 | "15\n", 1263 | "235\n", 1264 | "198\n", 1265 | "16\n", 1266 | "267\n", 1267 | "234\n", 1268 | "17\n", 1269 | "250\n", 1270 | "233\n", 1271 | "18\n", 1272 | "205\n", 1273 | "190\n", 1274 | "19\n", 1275 | "196\n", 1276 | "208\n", 1277 | "20\n", 1278 | "340\n", 1279 | "200\n", 1280 | "21\n", 1281 | "233\n", 1282 | "269\n", 1283 | "22\n", 1284 | "228\n", 1285 
| "218\n", 1286 | "23\n", 1287 | "246\n", 1288 | "266\n", 1289 | "24\n", 1290 | "238\n", 1291 | "288\n", 1292 | "25\n", 1293 | "248\n", 1294 | "424\n", 1295 | "26\n", 1296 | "254\n", 1297 | "258\n", 1298 | "27\n", 1299 | "240\n", 1300 | "275\n", 1301 | "28\n", 1302 | "213\n", 1303 | "194\n", 1304 | "29\n", 1305 | "202\n", 1306 | "251\n", 1307 | "30\n", 1308 | "174\n", 1309 | "256\n", 1310 | "31\n", 1311 | "227\n", 1312 | "252\n", 1313 | "32\n", 1314 | "182\n", 1315 | "388\n", 1316 | "33\n", 1317 | "212\n", 1318 | "282\n", 1319 | "34\n", 1320 | "333\n", 1321 | "242\n", 1322 | "35\n", 1323 | "214\n", 1324 | "198\n", 1325 | "36\n", 1326 | "245\n", 1327 | "234\n", 1328 | "37\n", 1329 | "218\n", 1330 | "245\n", 1331 | "38\n", 1332 | "258\n", 1333 | "229\n", 1334 | "39\n", 1335 | "236\n", 1336 | "303\n", 1337 | "40\n", 1338 | "229\n", 1339 | "219\n", 1340 | "41\n", 1341 | "265\n", 1342 | "169\n", 1343 | "42\n", 1344 | "217\n", 1345 | "186\n", 1346 | "43\n", 1347 | "222\n", 1348 | "158\n", 1349 | "44\n", 1350 | "374\n", 1351 | "243\n", 1352 | "45\n", 1353 | "290\n", 1354 | "264\n", 1355 | "46\n", 1356 | "291\n", 1357 | "219\n", 1358 | "47\n", 1359 | "260\n", 1360 | "220\n", 1361 | "48\n", 1362 | "210\n", 1363 | "217\n", 1364 | "49\n", 1365 | "259\n", 1366 | "199\n", 1367 | "50\n", 1368 | "190\n", 1369 | "209\n", 1370 | "51\n", 1371 | "238\n", 1372 | "211\n", 1373 | "52\n", 1374 | "217\n", 1375 | "208\n", 1376 | "53\n", 1377 | "309\n", 1378 | "209\n", 1379 | "54\n", 1380 | "159\n", 1381 | "241\n", 1382 | "55\n", 1383 | "184\n", 1384 | "326\n", 1385 | "56\n", 1386 | "204\n", 1387 | "243\n", 1388 | "57\n", 1389 | "188\n", 1390 | "201\n", 1391 | "58\n", 1392 | "249\n", 1393 | "278\n", 1394 | "59\n", 1395 | "238\n", 1396 | "210\n", 1397 | "60\n", 1398 | "179\n", 1399 | "234\n", 1400 | "61\n", 1401 | "188\n", 1402 | "211\n", 1403 | "62\n", 1404 | "156\n", 1405 | "177\n", 1406 | "63\n", 1407 | "220\n", 1408 | "261\n", 1409 | "64\n", 1410 | "177\n", 1411 | "244\n", 1412 | "65\n", 1413 | "247\n", 1414 | "234\n", 1415 | "66\n", 1416 | "394\n", 1417 | "214\n", 1418 | "67\n", 1419 | "226\n", 1420 | "221\n", 1421 | "68\n", 1422 | "200\n", 1423 | "208\n", 1424 | "69\n", 1425 | "537\n", 1426 | "306\n", 1427 | "70\n", 1428 | "196\n", 1429 | "214\n", 1430 | "71\n", 1431 | "237\n", 1432 | "225\n", 1433 | "72\n", 1434 | "205\n", 1435 | "270\n", 1436 | "73\n", 1437 | "207\n", 1438 | "226\n", 1439 | "74\n", 1440 | "322\n", 1441 | "196\n", 1442 | "75\n", 1443 | "305\n", 1444 | "251\n", 1445 | "76\n", 1446 | "276\n", 1447 | "232\n", 1448 | "77\n", 1449 | "193\n", 1450 | "222\n", 1451 | "78\n", 1452 | "296\n", 1453 | "216\n", 1454 | "79\n", 1455 | "338\n", 1456 | "249\n", 1457 | "80\n", 1458 | "219\n", 1459 | "169\n", 1460 | "81\n", 1461 | "240\n", 1462 | "219\n", 1463 | "82\n", 1464 | "148\n", 1465 | "335\n", 1466 | "83\n", 1467 | "177\n", 1468 | "220\n", 1469 | "84\n", 1470 | "265\n", 1471 | "205\n", 1472 | "85\n", 1473 | "261\n", 1474 | "310\n", 1475 | "86\n", 1476 | "277\n", 1477 | "223\n", 1478 | "87\n", 1479 | "247\n", 1480 | "238\n", 1481 | "88\n", 1482 | "225\n", 1483 | "251\n", 1484 | "89\n", 1485 | "302\n", 1486 | "208\n", 1487 | "90\n", 1488 | "239\n", 1489 | "197\n", 1490 | "91\n", 1491 | "196\n", 1492 | "234\n", 1493 | "92\n", 1494 | "288\n", 1495 | "285\n", 1496 | "93\n", 1497 | "224\n", 1498 | "221\n", 1499 | "94\n", 1500 | "197\n", 1501 | "221\n", 1502 | "95\n", 1503 | "224\n", 1504 | "260\n", 1505 | "96\n", 1506 | "241\n", 1507 | "232\n", 1508 | "97\n", 1509 | "253\n", 1510 | "267\n", 1511 | "98\n", 
1512 | "246\n", 1513 | "246\n", 1514 | "99\n", 1515 | "304\n", 1516 | "206\n", 1517 | "-55.05659841010797\n", 1518 | "[np.float64(176.22000714287435), np.float64(172.69131358640544)]\n", 1519 | "wis\n", 1520 | "-53.685850828271825\n", 1521 | "[np.float64(137.31354574090406), np.float64(115.13089351613758)]\n", 1522 | "dm\n", 1523 | "-54.01073436235229\n", 1524 | "[np.float64(132.81942955872594), np.float64(6.970382434455023)]\n", 1525 | "dr\n", 1526 | "-55.45433687243875\n", 1527 | "[np.float64(168.14039815809465), np.float64(6.815543578139827)]\n", 1528 | "dr3\n", 1529 | "-55.35302467832383\n", 1530 | "[np.float64(165.49802165849383), np.float64(5.193413671178172)]\n" 1531 | ] 1532 | } 1533 | ], 1534 | "execution_count": null 1535 | }, 1536 | { 1537 | "metadata": { 1538 | "ExecuteTime": { 1539 | "end_time": "2025-05-12T20:25:36.301160Z", 1540 | "start_time": "2025-05-12T20:07:04.544166Z" 1541 | }, 1542 | "id": "d1b9a15eca65f5db", 1543 | "outputId": "ee940321-3c29-41d9-fbca-f5415733bb4f", 1544 | "colab": { 1545 | "base_uri": "https://localhost:8080/", 1546 | "height": 332 1547 | } 1548 | }, 1549 | "cell_type": "code", 1550 | "source": [ 1551 | "import itertools\n", 1552 | "import numpy as np\n", 1553 | "import sys\n", 1554 | "import gym\n", 1555 | "\n", 1556 | "# Since lib.envs isn't available, we'll need to define these environments here\n", 1557 | "# or use gym environments directly. For now, I'll create simplified versions.\n", 1558 | "\n", 1559 | "from collections import defaultdict\n", 1560 | "\n", 1561 | "# Simple CliffWalkingEnv implementation\n", 1562 | "class CliffWalkingEnv(gym.Env):\n", 1563 | " def __init__(self):\n", 1564 | " self.shape = (4, 12)\n", 1565 | " self.start_state_index = np.ravel_multi_index((3, 0), self.shape)\n", 1566 | " self.goal_state_index = np.ravel_multi_index((3, 11), self.shape)\n", 1567 | " self.cliff = list(range(np.ravel_multi_index((3, 1), self.shape),\n", 1568 | " np.ravel_multi_index((3, 11), self.shape)))\n", 1569 | " self.nS = self.shape[0] * self.shape[1]\n", 1570 | " self.nA = 4 # up, right, down, left\n", 1571 | "\n", 1572 | " # Calculate transition probabilities and rewards\n", 1573 | " self.P = {}\n", 1574 | " for s in range(self.nS):\n", 1575 | " position = np.unravel_index(s, self.shape)\n", 1576 | " self.P[s] = {a: [] for a in range(self.nA)}\n", 1577 | "\n", 1578 | " # Actions: 0=up, 1=right, 2=down, 3=left\n", 1579 | " for a in range(self.nA):\n", 1580 | " reward = -1.0 # default reward for each move\n", 1581 | " next_position = list(position)\n", 1582 | " if a == 0:\n", 1583 | " next_position[0] = max(position[0] - 1, 0)\n", 1584 | " elif a == 1:\n", 1585 | " next_position[1] = min(position[1] + 1, self.shape[1] - 1)\n", 1586 | " elif a == 2:\n", 1587 | " next_position[0] = min(position[0] + 1, self.shape[0] - 1)\n", 1588 | " elif a == 3:\n", 1589 | " next_position[1] = max(position[1] - 1, 0)\n", 1590 | "\n", 1591 | " next_state = np.ravel_multi_index(next_position, self.shape)\n", 1592 | "\n", 1593 | " # Check if we're at the cliff\n", 1594 | " if s in self.cliff:\n", 1595 | " next_state = self.start_state_index\n", 1596 | " reward = -100.0\n", 1597 | "\n", 1598 | " # Check if we're at the goal\n", 1599 | " done = next_state == self.goal_state_index\n", 1600 | "\n", 1601 | " self.P[s][a] = [(1.0, next_state, reward, done)]\n", 1602 | "\n", 1603 | " self.observation_space = gym.spaces.Discrete(self.nS)\n", 1604 | " self.action_space = gym.spaces.Discrete(self.nA)\n", 1605 | "\n", 1606 | " self.reset()\n", 1607 | "\n", 1608 | " def 
step(self, action):\n", 1609 | " state, reward, done, _ = self._step(action)\n", 1610 | " self.s = state\n", 1611 | " return (state, reward, done, {})\n", 1612 | "\n", 1613 | " def _step(self, action):\n", 1614 | " (probs, next_state, reward, done) = self.P[self.s][action][0]\n", 1615 | " return (next_state, reward, done, {})\n", 1616 | "\n", 1617 | " def reset(self):\n", 1618 | " self.s = self.start_state_index\n", 1619 | " return self.s\n", 1620 | "\n", 1621 | "# Simple WindyGridworldEnv implementation (not fully used in this code)\n", 1622 | "class WindyGridworldEnv(gym.Env):\n", 1623 | " def __init__(self):\n", 1624 | " self.shape = (7, 10)\n", 1625 | " self.nS = self.shape[0] * self.shape[1]\n", 1626 | " self.nA = 4 # up, right, down, left\n", 1627 | " self.wind = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]\n", 1628 | " self.reset()\n", 1629 | "\n", 1630 | " def step(self, action):\n", 1631 | " # Not implemented as it's not used in the main code\n", 1632 | " pass\n", 1633 | "\n", 1634 | " def reset(self):\n", 1635 | " self.s = np.ravel_multi_index((3, 0), self.shape)\n", 1636 | " return self.s\n", 1637 | "\n", 1638 | "from scipy.optimize import minimize, rosen, rosen_der\n", 1639 | "from scipy.optimize import Bounds\n", 1640 | "\n", 1641 | "bounds = Bounds([-0.1, -0.1], [0.1, 0.1])\n", 1642 | "\n", 1643 | "env = CliffWalkingEnv()\n", 1644 | "\n", 1645 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 1646 | " def policy_fn(observation):\n", 1647 | " A = np.ones(nA, dtype=float) * epsilon / nA\n", 1648 | " best_action = np.argmax(Q[observation])\n", 1649 | " A[best_action] += (1.0 - epsilon)\n", 1650 | " return A\n", 1651 | " return policy_fn\n", 1652 | "\n", 1653 | "Q_space = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\")[\"xxx\"]\n", 1654 | "Q_space2 = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\")[\"xxx\"] #Q-table-cliff.npz\n", 1655 | "\n", 1656 | "prob1 = [1.0 for i in range((env.nA))]\n", 1657 | "prob1 = prob1/np.sum(prob1)\n", 1658 | "\n", 1659 | "betabeta = 0.8\n", 1660 | "def sample_policy(observation, alpha=0.9):\n", 1661 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 1662 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 1663 | "\n", 1664 | "\n", 1665 | "def behavior_policy(observation, beta=betabeta):\n", 1666 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 1667 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 1668 | "\n", 1669 | "\n", 1670 | "def target_dense(observation, alpha=0.9):\n", 1671 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 1672 | " return prob2\n", 1673 | "\n", 1674 | "def behav_dense(observation, beta=betabeta):\n", 1675 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 1676 | " return prob2\n", 1677 | "\n", 1678 | "def sarsa2(env, policy, policy2, num_episodes, discount_factor=1.0, Q_space2=Q_space2, alpha=0.6, epsilon=0.03):\n", 1679 | "\n", 1680 | " Q = np.copy(Q_space2)\n", 1681 | " episode_episode = []\n", 1682 | "\n", 1683 | " for i_episode in range(num_episodes):\n", 1684 | "\n", 1685 | " if (i_episode + 1) % 200 == 0:\n", 1686 | " sys.stdout.flush()\n", 1687 | "\n", 1688 | " state = env.reset()\n", 1689 | " action = policy2(state)\n", 1690 | "\n", 1691 | " episode = []\n", 1692 | "\n", 1693 | " for t in itertools.count():\n", 1694 | " # Take a step\n", 1695 | " next_state, reward, done, _ = env.step(action)\n", 1696 | " episode.append((state, action, reward))\n", 1697 | " # Pick the next action\n", 1698 | 
" next_action = policy2(next_state)\n", 1699 | "\n", 1700 | " # TD Update\n", 1701 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 1702 | " td_delta = td_target - Q[state, action]\n", 1703 | " Q[state, action] += alpha * td_delta\n", 1704 | "\n", 1705 | " if done:\n", 1706 | " break\n", 1707 | "\n", 1708 | " action = next_action\n", 1709 | " state = next_state\n", 1710 | "\n", 1711 | " episode_episode.append(episode)\n", 1712 | "\n", 1713 | " return Q, episode_episode\n", 1714 | "\n", 1715 | "bounds = Bounds([-0.2, -0.2], [0.2, 0.2])\n", 1716 | "def sigmoid(x, derivative=False):\n", 1717 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 1718 | "\n", 1719 | "\n", 1720 | "depth = 1\n", 1721 | "def mc_prediction(env, policy, policy2, episode_episode, Q_=1.0, num_episodes=100, discount_factor=1.0):\n", 1722 | "\n", 1723 | " returns_sum = defaultdict(float)\n", 1724 | " returns_count = defaultdict(float)\n", 1725 | " returns_count2 = defaultdict(float)\n", 1726 | "\n", 1727 | " predic_list = []\n", 1728 | " predic_list2 = []\n", 1729 | " predic_list3 = []\n", 1730 | " predic_list22 = []\n", 1731 | " predic_list4 = []\n", 1732 | " predic_list5 = np.ones(num_episodes)\n", 1733 | " auxiauxi = []\n", 1734 | " epiepi = []\n", 1735 | " weight_list = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1736 | " weight_list2 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1737 | " weight_list3 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1738 | " marginal_weight = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1739 | " marginal_weight_2 = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1740 | " auxi_list = np.zeros([num_episodes, 1000])\n", 1741 | " marginal_auxi_list2 = np.zeros([num_episodes, 1000])\n", 1742 | " marginal_auxi_list = np.zeros([num_episodes, 1000])\n", 1743 | " marginal_auxi_list2_2 = np.zeros([num_episodes, 1000])\n", 1744 | " marginal_auxi_list_2 = np.zeros([num_episodes, 1000])\n", 1745 | " auxi_list2 = np.zeros([num_episodes, 1000])\n", 1746 | " reward_list = np.zeros([num_episodes, 1000])\n", 1747 | " state_list = np.zeros([num_episodes, 1000])\n", 1748 | " action_list = np.zeros([num_episodes, 1000])\n", 1749 | "\n", 1750 | " count_list = np.zeros(1000)\n", 1751 | " episolode_longe_list = []\n", 1752 | "\n", 1753 | "\n", 1754 | " for i_episode in range(num_episodes):\n", 1755 | "\n", 1756 | " if i_episode % 200 == 0:\n", 1757 | " sys.stdout.flush()\n", 1758 | "\n", 1759 | " episode = episode_episode[i_episode]\n", 1760 | "\n", 1761 | " W = 1.0\n", 1762 | " W_list = []\n", 1763 | " episolode_longe_list.append(len(episode))\n", 1764 | "\n", 1765 | " weight_list2[i_episode, 0] = 1.0\n", 1766 | " for t in range(len(episode)):\n", 1767 | " state, action, reward = episode[t]\n", 1768 | " reward_list[i_episode, t] = reward\n", 1769 | " state_list[i_episode, t] = state\n", 1770 | " action_list[i_episode, t] = action\n", 1771 | "\n", 1772 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 1773 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1774 | " W_list.append(W)\n", 1775 | " weight_list[i_episode, t] = W_list[t]\n", 1776 | " weight_list2[i_episode, t+1] = W_list[t]\n", 1777 | " weight_list3[i_episode, t] = target_dense(state)[action]/behav_dense(state)[action]\n", 1778 | "\n", 1779 | " count_list[t] += 1.0\n", 1780 | "\n", 1781 | " if t==0:\n", 1782 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1783 | " else:\n", 1784 | 
" auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 1785 | "\n", 1786 | " if t==0:\n", 1787 | " auxi_list2[i_episode, t] = W_list[t]-1.0\n", 1788 | " else:\n", 1789 | " auxi_list2[i_episode, t] = W_list[t]-W_list[t-1]\n", 1790 | "\n", 1791 | " print(np.max(np.array(episolode_longe_list)))\n", 1792 | "\n", 1793 | "\n", 1794 | " weight_list_mean = np.mean(weight_list, 1)\n", 1795 | " reward_list_mean = np.mean(reward_list, 1)\n", 1796 | " auxi_list_mean = np.mean(auxi_list, 1)\n", 1797 | " auxi_list2_mean = np.mean(auxi_list2, 1)\n", 1798 | "\n", 1799 | " val = []\n", 1800 | "\n", 1801 | " ##### IPW\n", 1802 | " for i in range(num_episodes):\n", 1803 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:]))\n", 1804 | "\n", 1805 | " val.append(np.mean(predic_list))\n", 1806 | "\n", 1807 | " #### Marginalized-IPW\n", 1808 | "\n", 1809 | " for i in range(num_episodes):\n", 1810 | " for j in range(episolode_longe_list[i]):\n", 1811 | " marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 1812 | " if j==0:\n", 1813 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 1814 | " else:\n", 1815 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 1816 | "\n", 1817 | "\n", 1818 | " for i_episode in range(num_episodes):\n", 1819 | " for t in range(episolode_longe_list[i_episode]):\n", 1820 | " state = int(state_list[i_episode, t]) # Changed np.int to int\n", 1821 | " action = int(action_list[i_episode, t]) # Changed np.int to int\n", 1822 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1823 | " if t==0:\n", 1824 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1825 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1826 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1827 | " else:\n", 1828 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*(Q_[state, action])-marginal_weight[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1829 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*(Q_[state, action])-marginal_weight_2[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1830 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*(Q_[state, action])-weight_list[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1831 | "\n", 1832 | " if t==0:\n", 1833 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]-1.0\n", 1834 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]-1.0\n", 1835 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-1.0\n", 1836 | " else:\n", 1837 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]- marginal_weight[i_episode, t-1]\n", 1838 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]- marginal_weight_2[i_episode, t-1]\n", 1839 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-weight_list[i_episode, t-1]\n", 1840 | "\n", 1841 | "\n", 1842 | " for i in range(num_episodes):\n", 1843 | " predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:]))\n", 1844 | "\n", 1845 | " ### marginal ipw2 #### Using action and state\n", 1846 | " val.append(np.mean(predic_list2))\n", 1847 | "\n", 1848 | "\n", 1849 | " ### marginal ipw3#### Using only state\n", 1850 | " for i 
in range(num_episodes):\n", 1851 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:]))\n", 1852 | "\n", 1853 | " val.append(np.mean(predic_list22))\n", 1854 | "\n", 1855 | "\n", 1856 | " #### DR\n", 1857 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list, 1)))\n", 1858 | "\n", 1859 | " #### marginal DR 1 #### Using action and state\n", 1860 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list, 1)))\n", 1861 | " #### marginal DR 2 #### Using only state\n", 1862 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2, 1)))\n", 1863 | "\n", 1864 | " return val\n", 1865 | "\n", 1866 | "# Main experiment run\n", 1867 | "is_list = []\n", 1868 | "is2_list = []\n", 1869 | "is3_list = []\n", 1870 | "wis_list = []\n", 1871 | "wis2_list = []\n", 1872 | "dm_list = []\n", 1873 | "dr_list = []\n", 1874 | "dr2_list = []\n", 1875 | "dr3_list = []\n", 1876 | "bdr_list = []\n", 1877 | "drs_list = []\n", 1878 | "drs2_list = []\n", 1879 | "drss_list = []\n", 1880 | "mdr_list = []\n", 1881 | "mdr_list2 = []\n", 1882 | "\n", 1883 | "sample_size = 1000\n", 1884 | "sample_size = sample_size // 2 # Integer division in Python 3\n", 1885 | "for kkk in range(100):\n", 1886 | " print(kkk)\n", 1887 | " #### Sample splititng\n", 1888 | " ### First fold\n", 1889 | "\n", 1890 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1891 | " V_10k_1 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1892 | "\n", 1893 | " ### Second fold\n", 1894 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1895 | " V_10k_2 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1896 | "\n", 1897 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 1898 | " is_list.append(np.mean(V_10k[0]))\n", 1899 | " is2_list.append(np.mean(V_10k[1]))\n", 1900 | " is3_list.append(np.mean(V_10k[2]))\n", 1901 | " dr_list.append(np.mean(V_10k[3]))\n", 1902 | " dr2_list.append(np.mean(V_10k[4]))\n", 1903 | " dr3_list.append(np.mean(V_10k[5]))\n", 1904 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 1905 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 1906 | " np.savez(\"2estimator_list_ipw_\"+str(betabeta)+\"_\"+str(sample_size), a=is_list)\n", 1907 | " np.savez(\"2estimator_list_ipw2_\"+str(betabeta)+\"_\"+str(sample_size), a=is3_list)\n", 1908 | " np.savez(\"2estimator_list_dm_\"+str(betabeta)+\"_\"+str(sample_size), a=dm_list)\n", 1909 | " np.savez(\"2estimator_list_dr_\"+str(betabeta)+\"_\"+str(sample_size), a=dr_list)\n", 1910 | " np.savez(\"2estimator_list_dr2_\"+str(betabeta)+\"_\"+str(sample_size), a=dr3_list)\n", 1911 | "\n", 1912 | "# Analysis of results\n", 1913 | "true = -42.49\n", 1914 | "\n", 1915 | "# FIX: Properly calculate MSE instead of using hardcoded values\n", 1916 | "def mse(aaa):\n", 1917 | " aaa = np.array(aaa)\n", 1918 | " aaa = aaa[aaa>-100] # Filter extreme values\n", 1919 | " mean_val = np.mean(aaa) # Calculate mean\n", 1920 | " bias = mean_val - true # Calculate bias\n", 1921 | " bias_squared = bias * bias # Square the bias\n", 1922 | " variance = np.var(aaa) # Calculate variance\n", 1923 | " mse_value = bias_squared + variance # MSE = bias² + variance\n", 1924 | " return [mse_value, np.sqrt(np.var((aaa-true)*(aaa-true)))] # Return MSE and RMSE\n", 1925 | "\n", 1926 | "print(np.mean(is_list))\n", 1927 | "print(mse(is_list))\n", 1928 | 
"print(\"wis\")\n", 1929 | "print(np.mean(is3_list))\n", 1930 | "print(mse(is3_list))\n", 1931 | "print(\"dm\")\n", 1932 | "print(np.mean(dm_list))\n", 1933 | "print(mse(dm_list))\n", 1934 | "print(\"dr\")\n", 1935 | "print(np.mean(dr_list))\n", 1936 | "print(mse(dr_list))\n", 1937 | "print(\"dr3\")\n", 1938 | "print(np.mean(dr3_list))\n", 1939 | "print(mse(dr3_list))" 1940 | ], 1941 | "id": "d1b9a15eca65f5db", 1942 | "outputs": [ 1943 | { 1944 | "output_type": "error", 1945 | "ename": "FileNotFoundError", 1946 | "evalue": "[Errno 2] No such file or directory: '/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz'", 1947 | "traceback": [ 1948 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1949 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 1950 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpolicy_fn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0mQ_space\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"xxx\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0mQ_space2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"xxx\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m#Q-table-cliff.npz\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1951 | "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/numpy/lib/_npyio_impl.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding, max_header_size)\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0mown_fid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 454\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 455\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menter_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfspath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 456\u001b[0m \u001b[0mown_fid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1952 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz'" 1953 | ] 1954 | } 1955 | ], 1956 | "execution_count": 5 1957 | }, 1958 | { 1959 | "cell_type": "code", 1960 | "source": [], 
1961 | "metadata": { 1962 | "id": "ij5Uepr11-NZ" 1963 | }, 1964 | "id": "ij5Uepr11-NZ", 1965 | "execution_count": null, 1966 | "outputs": [] 1967 | } 1968 | ], 1969 | "metadata": { 1970 | "kernelspec": { 1971 | "display_name": "Python 3", 1972 | "language": "python", 1973 | "name": "python3" 1974 | }, 1975 | "language_info": { 1976 | "codemirror_mode": { 1977 | "name": "ipython", 1978 | "version": 2 1979 | }, 1980 | "file_extension": ".py", 1981 | "mimetype": "text/x-python", 1982 | "name": "python", 1983 | "nbconvert_exporter": "python", 1984 | "pygments_lexer": "ipython2", 1985 | "version": "2.7.6" 1986 | }, 1987 | "colab": { 1988 | "provenance": [] 1989 | } 1990 | }, 1991 | "nbformat": 4, 1992 | "nbformat_minor": 5 1993 | } --------------------------------------------------------------------------------