├── .DS_Store
├── Q-table-cliff.npz
├── Q-table-real-cliff.npz
├── README.md
├── exp5_1
│   ├── toytoy.py
│   └── toytoy2.py
├── exp5_1_py3
│   ├── rmse_all.py
│   ├── rmse_learner.py
│   ├── toytoy2_par_table.py
│   └── toytoy2_par_table_learner.py
├── exp5_2
│   ├── crif_walking_ope.py
│   └── cw_notebook_ver_splitting.ipynb
└── exp5_2_py3
    └── cw_notebook_ver_splitting_p3.ipynb

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalML/DoubleReinforcementLearningMDP/daae93f4de4d721c2663668e8ed187dbc7dfea25/.DS_Store
--------------------------------------------------------------------------------

/Q-table-cliff.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalML/DoubleReinforcementLearningMDP/daae93f4de4d721c2663668e8ed187dbc7dfea25/Q-table-cliff.npz
--------------------------------------------------------------------------------

/Q-table-real-cliff.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CausalML/DoubleReinforcementLearningMDP/daae93f4de4d721c2663668e8ed187dbc7dfea25/Q-table-real-cliff.npz
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | # DoubleReinforcementLearningMDP
2 | 
3 | This repository contains the code for replicating the experiments from the paper
4 | ### "Double Reinforcement Learning for Efficient Off-Policy Evaluation in Markov Decision Processes"
5 | - https://arxiv.org/abs/1908.08526
6 | 
7 | ## Experiments in Section 5.1
8 | 
9 | The relevant code is in the subdirectory `exp5_1`.
10 | * `toytoy.py` runs the experiment with the in-sample variant of the estimators.
11 | * `toytoy2.py` runs the experiment with the sample-splitting variant of the estimators.
12 | 
13 | For example, to run 10 parallel replications, one can run the command `seq 10 | xargs -L 1 -P 10 ./toytoy.sh` (a sketch of such a wrapper script is given below).
14 | 
15 | ## Experiments in Section 5.2
16 | 
17 | The relevant code is in the subdirectory `exp5_2`.
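Note on the Section 5.1 run command above: `toytoy.sh` is not included in the repository tree, so its contents are an assumption. A minimal sketch of such a wrapper, assuming it only forwards the replication index to `toytoy.py` (which reads a version tag as `argv[1]` and the number of trajectories `N` as `argv[2]`), with an illustrative `N` of 1500:

```sh
#!/bin/sh
# Hypothetical wrapper -- toytoy.sh is not shipped with the repository.
# $1 is the replication index piped in by `seq 10 | xargs -L 1 -P 10 ./toytoy.sh`.
# 1500 is an assumed sample size; toytoy.py reads ver_ = argv[1], N = argv[2].
# The exp5_1 scripts are Python 2 code, so a Python 2 interpreter is assumed here.
python2 toytoy.py "$1" 1500
```

Each replication then saves its estimator arrays as `.npz` files tagged with the version string and `N` in the working directory.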
18 | 19 | 20 | ## Matrix 21 | code -------------------------------------------------------------------------------- /exp5_1/toytoy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import sys 4 | import math 5 | 6 | 7 | 8 | 9 | 10 | def sigmoid(x): 11 | return(1.0/(1.0+np.exp(-0.1*x))) 12 | 13 | ### Both models are good 14 | 15 | beta = 0.2 16 | alpha = 0.9 17 | rep = 1500 18 | num = 0 19 | estimator_list_ipw = [] 20 | estimator_list_ipw2 = [] 21 | estimator_list_ipw3 = [] 22 | estimator_list_dm = [] 23 | estimator_list_dr = [] 24 | estimator_list_dr2 = [] 25 | estimator_list_dr3 = [] 26 | estimator_list_ipw2_ratio_mis = [] 27 | estimator_list_dr2_ratio_mis = [] 28 | estimator_list_dm_q_mis = [] 29 | estimator_list_dr_q_mis = [] 30 | estimator_list_dr2_q_mis = [] 31 | 32 | 33 | args = sys.argv 34 | ver_ = str(args[1]) 35 | N = np.int(args[2]) 36 | 37 | 38 | from sklearn import linear_model 39 | 40 | for iii in range(rep): 41 | 42 | T = 30 43 | 44 | r_list = np.zeros([N,T]) 45 | weight_list = np.zeros([N,T]) 46 | s_list = np.zeros([N,T]) 47 | a_list = np.zeros([N,T]) 48 | w_list = np.zeros([N,T]) 49 | w_list2 = np.zeros([N,T]) 50 | 51 | def behav_policy(s,i): 52 | a = beta*sigmoid(s)+(beta)*np.random.uniform(0.0,1.0) 53 | return(np.random.binomial(1,a,1)[0]) 54 | 55 | def eval_policy(s,i): 56 | a = alpha*sigmoid(s)+(1-alpha)*np.random.uniform(0.0,1.0) 57 | return(np.random.binomial(1,a,1)[0]) 58 | 59 | def behav_policy_dens(s,a,i): 60 | b = beta*sigmoid(s)+(beta)*0.5 61 | if a==1: 62 | return(b) 63 | else: 64 | return(1.0-b) 65 | 66 | def eval_policy_dens(s,a,i): 67 | b = alpha*sigmoid(s)+(1-alpha)*0.5 68 | if a==1: 69 | return(b) 70 | else: 71 | return(1.0-b) 72 | 73 | 74 | for i in range(N): 75 | for j in range(T): 76 | if j==0: 77 | s = np.random.normal(0.5,0.2) 78 | r = 0.0 79 | a =0.0 80 | w = 1.0 81 | else: 82 | s = np.random.normal(0.02*(j%2)+s*1.0-0.3*(a-0.5),0.2) 83 | a = behav_policy(s,j) 84 | w = eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j)*w 85 | r = np.random.normal(0.9*s+0.3*a-0.02*(j%2),0.2) 86 | r_list[i,j] = r 87 | s_list[i,j] = s 88 | a_list[i,j] = a 89 | w_list[i,j] = w 90 | w_list2[i,j]= eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j) 91 | 92 | ag_list = [] 93 | 94 | #### IPW estimator 95 | for i in range(N): 96 | ag_list.append(np.sum(r_list[i,]*w_list[i,])) 97 | estimator_list_ipw.append(np.mean(ag_list)) 98 | 99 | ########num = 0 100 | 101 | #### DM estimator 102 | bbb = range(T) 103 | reg_list = [] 104 | for j in bbb[::-1]: 105 | if j==(T-1): 106 | X = np.array([s_list[:,j],a_list[:,j]]) 107 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 108 | Y = r_list[:,j] 109 | else: 110 | X = np.array([s_list[:,j],a_list[:,j]]) 111 | aaa = [] 112 | for k in range(N): 113 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 114 | X0 = np.array([s_list[:,j+1],aaa]) 115 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 116 | reg = linear_model.LinearRegression() 117 | reg.fit(np.transpose(X), Y) 118 | ###print reg.score(np.transpose(X), Y) 119 | reg_list.append(reg) 120 | 121 | 122 | aaa = [] 123 | for i in range(N): 124 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 125 | X0 = np.array([s_list[:,0],aaa]) 126 | v0 = reg.predict(np.transpose(X0)) 127 | estimator_list_dm.append(np.mean(v0)) 128 | 129 | ### DR estiamtor under M_1 130 | dr = 0.0 131 | for t in range(T): 132 | dr = dr + np.mean(r_list[:,t]*w_list[:,t]) 133 | #### q function 134 | X = np.array([s_list[:,t],a_list[:,t]]) 135 | dr = dr - 
np.mean(reg_list[T-1-t].predict(np.transpose(X))*w_list[:,t]) 136 | #### v function 137 | aaa = [] 138 | for i in range(N): 139 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 140 | X0 = np.array([s_list[:,t],aaa]) 141 | if t==0: 142 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))) 143 | else: 144 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*w_list[:,t-1]) 145 | 146 | estimator_list_dr.append(dr) 147 | 148 | #### IPW estimator under M_2 149 | 150 | bbb = range(T) 151 | wreg_list = [] 152 | for j in bbb[::-1]: 153 | X = np.array([s_list[:,j],a_list[:,j]]) 154 | Y = w_list[:,j] 155 | reg = linear_model.LinearRegression() 156 | reg.fit(np.transpose(X),Y) 157 | ###print reg.score(np.transpose(X), Y) 158 | wreg_list.append(reg) 159 | 160 | ipw = 0.0 161 | for t in range(T): 162 | X = np.array([s_list[:,t],a_list[:,t]]) 163 | ipw = ipw + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list[:,t]) 164 | estimator_list_ipw2.append(ipw) 165 | 166 | ### DR estiamtor under M_2 167 | dr2 = 0.0 168 | for t in range(T): 169 | X = np.array([s_list[:,t],a_list[:,t]]) 170 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list[:,t]) 171 | #### q function 172 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X))*wreg_list[T-1-t].predict(np.transpose(X))) 173 | #### v function 174 | aaa = [] 175 | for i in range(N): 176 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 177 | X0 = np.array([s_list[:,t],aaa]) 178 | if t==0: 179 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 180 | else: 181 | X_ = np.array([s_list[:,t-1],a_list[:,t-1]]) 182 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 183 | estimator_list_dr2.append(dr2) 184 | 185 | #### Ratio-mis specified 186 | 187 | num = 2 188 | 189 | bbb = range(T) 190 | wreg_list_mis = [] 191 | for j in bbb[::-1]: 192 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 193 | Y = w_list[:,j] 194 | reg = linear_model.LinearRegression() 195 | reg.fit(np.transpose(X),Y) 196 | ###print reg.score(np.transpose(X), Y) 197 | wreg_list_mis.append(reg) 198 | 199 | ipw = 0.0 200 | for t in range(T): 201 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 202 | ipw = ipw + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X))*r_list[:,t]) 203 | estimator_list_ipw2_ratio_mis.append(ipw) 204 | 205 | dr2 = 0.0 206 | for t in range(T): 207 | X_w = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 208 | X_r = np.array([s_list[:,t],a_list[:,t]]) 209 | dr2 = dr2 + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 210 | #### q function 211 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X_r))*wreg_list_mis[T-1-t].predict(np.transpose(X_w))) 212 | #### v function 213 | aaa = [] 214 | for i in range(N): 215 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 216 | X0 = np.array([s_list[:,t],aaa]) 217 | if t==0: 218 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 219 | else: 220 | X_ = np.array([s_list[:,t-1]*s_list[:,t-1],a_list[:,t-1]]) 221 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list_mis[T-t].predict(np.transpose(X_))) 222 | estimator_list_dr2_ratio_mis.append(dr2) 223 | 224 | 225 | ### q-misspcified 226 | 227 | 228 | #### DM estimator 229 | bbb = range(T) 230 | reg_list_mis = [] 231 | for j in bbb[::-1]: 232 | if j==(T-1): 233 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 234 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 235 | Y = r_list[:,j] 236 | else: 237 | X = 
np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 238 | aaa = [] 239 | for k in range(N): 240 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 241 | X0 = np.array([s_list[:,j+1]*s_list[:,j+1],aaa]) 242 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 243 | reg = linear_model.LinearRegression() 244 | reg.fit(np.transpose(X), Y) 245 | ###print reg.score(np.transpose(X), Y) 246 | reg_list_mis.append(reg) 247 | 248 | 249 | aaa = [] 250 | for i in range(N): 251 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 252 | X0 = np.array([s_list[:,0],aaa]) 253 | v0 = reg.predict(np.transpose(X0)) 254 | estimator_list_dm_q_mis.append(np.mean(v0)) 255 | 256 | ### DR estiamtor under M_1 257 | dr = 0.0 258 | for t in range(T): 259 | dr = dr + np.mean(r_list[:,t]*w_list[:,t]) 260 | #### q function 261 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 262 | dr = dr - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X))*w_list[:,t]) 263 | #### v function 264 | aaa = [] 265 | for i in range(N): 266 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 267 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 268 | if t==0: 269 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))) 270 | else: 271 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*w_list[:,t-1]) 272 | 273 | estimator_list_dr_q_mis.append(dr) 274 | 275 | print iii 276 | 277 | 278 | ### DR estiamtor under M_2 279 | dr2 = 0.0 280 | for t in range(T): 281 | X_w = np.array([s_list[:,t],a_list[:,t]]) 282 | X_r = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 283 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 284 | #### q function 285 | dr2 = dr2 - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X_r))*wreg_list[T-1-t].predict(np.transpose(X_w))) 286 | #### v function 287 | aaa = [] 288 | for i in range(N): 289 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 290 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 291 | if t==0: 292 | dr2 = dr2 + np.mean(reg_list_mis[T-t-1].predict(np.transpose(X0))) 293 | else: 294 | X_ = np.array([s_list[:,t-1],a_list[:,t-1]]) 295 | dr2 = dr2 + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 296 | estimator_list_dr2_q_mis.append(dr2) 297 | 298 | 299 | np.savez("estimator_list_ipw_%d"+ver_+"_"+str(N),a=estimator_list_ipw) 300 | np.savez("estimator_list_dr_%d"+ver_+"_"+str(N), a=estimator_list_dr) 301 | np.savez("estimator_list_dm_%d"+ver_+"_"+str(N), a=estimator_list_dm) 302 | np.savez("estimator_list_ipw2_%d"+ver_+"_"+str(N),a=estimator_list_ipw2) 303 | np.savez("estimator_list_dr2_%d"+ver_+"_"+str(N),a=estimator_list_dr2) 304 | np.savez("estimator_list_ipw3_%d"+ver_+"_"+str(N),a=estimator_list_ipw3) 305 | np.savez("estimator_list_dr3_%d"+ver_+"_"+str(N),a=estimator_list_dr3) 306 | np.savez("estimator_list_ipw2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_ipw2_ratio_mis) 307 | np.savez("estimator_list_dr2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_ratio_mis) 308 | np.savez("estimator_list_dm_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dm_q_mis) 309 | np.savez("estimator_list_dr_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr_q_mis) 310 | np.savez("estimator_list_dr2_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_q_mis) 311 | 312 | 313 | 314 | -------------------------------------------------------------------------------- /exp5_1/toytoy2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import sys 4 | import math 5 | 6 | 7 | 8 | 9 | 10 | 
def sigmoid(x): 11 | return(1.0/(1.0+np.exp(-0.1*x))) 12 | 13 | ### Both models are good 14 | 15 | beta = 0.2 16 | alpha = 0.9 17 | rep = 1500 18 | num = 0 19 | estimator_list_ipw = [] 20 | estimator_list_ipw2 = [] 21 | estimator_list_ipw3 = [] 22 | estimator_list_dm = [] 23 | estimator_list_dr = [] 24 | estimator_list_dr2 = [] 25 | estimator_list_dr3 = [] 26 | estimator_list_ipw2_ratio_mis = [] 27 | estimator_list_dr2_ratio_mis = [] 28 | estimator_list_dm_q_mis = [] 29 | estimator_list_dr_q_mis = [] 30 | estimator_list_dr2_q_mis = [] 31 | 32 | 33 | args = sys.argv 34 | ver_ = str(args[1]) 35 | N = np.int(args[2]) 36 | 37 | 38 | 39 | from sklearn import linear_model 40 | 41 | for iii in range(rep): 42 | print iii 43 | T = 30 44 | 45 | r_list = np.zeros([N,T]) 46 | weight_list = np.zeros([N,T]) 47 | s_list = np.zeros([N,T]) 48 | a_list = np.zeros([N,T]) 49 | w_list = np.zeros([N,T]) 50 | w_list2 = np.zeros([N,T]) 51 | 52 | def behav_policy(s,i): 53 | a = beta*sigmoid(s)+(beta)*np.random.uniform(0.0,1.0) 54 | return(np.random.binomial(1,a,1)[0]) 55 | 56 | def eval_policy(s,i): 57 | a = alpha*sigmoid(s)+(1-alpha)*np.random.uniform(0.0,1.0) 58 | return(np.random.binomial(1,a,1)[0]) 59 | 60 | def behav_policy_dens(s,a,i): 61 | b = beta*sigmoid(s)+(beta)*0.5 62 | if a==1: 63 | return(b) 64 | else: 65 | return(1.0-b) 66 | 67 | def eval_policy_dens(s,a,i): 68 | b = alpha*sigmoid(s)+(1-alpha)*0.5 69 | if a==1: 70 | return(b) 71 | else: 72 | return(1.0-b) 73 | 74 | 75 | for i in range(N): 76 | for j in range(T): 77 | if j==0: 78 | s = np.random.normal(0.5,0.2) 79 | r = 0.0 80 | a =0.0 81 | w = 1.0 82 | else: 83 | s = np.random.normal(0.02*(j%2)+s*1.0-0.3*(a-0.5),0.2) 84 | a = behav_policy(s,j) 85 | w = eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j)*w 86 | r = np.random.normal(0.9*s+0.3*a-0.02*(j%2),0.2) 87 | r_list[i,j] = r 88 | s_list[i,j] = s 89 | a_list[i,j] = a 90 | w_list[i,j] = w 91 | w_list2[i,j]= eval_policy_dens(s,a,j)/behav_policy_dens(s,a,j) 92 | 93 | ag_list = [] 94 | 95 | #### IPW estimator 96 | for i in range(N): 97 | ag_list.append(np.sum(r_list[i,]*w_list[i,])) 98 | estimator_list_ipw.append(np.mean(ag_list)) 99 | 100 | ########num = 0 101 | 102 | #### DM estimator 103 | bbb = range(T) 104 | reg_list = [] 105 | for j in bbb[::-1]: 106 | if j==(T-1): 107 | X = np.array([s_list[:,j],a_list[:,j]]) 108 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 109 | Y = r_list[:,j] 110 | else: 111 | X = np.array([s_list[:,j],a_list[:,j]]) 112 | aaa = [] 113 | for k in range(N): 114 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 115 | X0 = np.array([s_list[:,j+1],aaa]) 116 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 117 | reg = linear_model.LinearRegression() 118 | reg.fit(np.transpose(X), Y) 119 | ###print reg.score(np.transpose(X), Y) 120 | reg_list.append(reg) 121 | 122 | 123 | aaa = [] 124 | for i in range(N): 125 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 126 | X0 = np.array([s_list[:,0],aaa]) 127 | v0 = reg.predict(np.transpose(X0)) 128 | estimator_list_dm.append(np.mean(v0)) 129 | 130 | ####print(np.mean(v0)) 131 | 132 | ### DR estiamtor (Cross fitting) 133 | 134 | ############### Make q-function 135 | ################################ 136 | r_list_1 = r_list[0:N/2,:] 137 | r_list_2 = r_list[N/2:N,:] 138 | s_list_1 = s_list[0:N/2,:] 139 | s_list_2 = s_list[N/2:N,:] 140 | a_list_1 = a_list[0:N/2,:] 141 | a_list_2 = a_list[N/2:N,:] 142 | w_list_1 = w_list[0:N/2,:] 143 | w_list_2 = w_list[N/2:N,:] 144 | 145 | 146 | bbb = range(T) 147 | reg_list = [] 148 | for j in bbb[::-1]: 149 | if 
j==(T-1): 150 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 151 | pre_X = np.array([s_list_1[:,j],a_list_1[:,j]]) 152 | Y = r_list_1[:,j] 153 | else: 154 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 155 | aaa = [] 156 | for k in range(N/2): 157 | aaa.append(eval_policy_dens(s_list_1[k,j+1],1,0)) 158 | X0 = np.array([s_list_1[:,j+1],aaa]) 159 | Y = r_list_1[:,j]+reg.predict(np.transpose(X0)) 160 | reg = linear_model.LinearRegression() 161 | reg.fit(np.transpose(X), Y) 162 | ###print reg.score(np.transpose(X), Y) 163 | reg_list.append(reg) 164 | 165 | dr = 0.0 166 | for t in range(T): 167 | dr = dr + np.mean(r_list_2[:,t]*w_list_2[:,t]) 168 | #### q function 169 | X = np.array([s_list_2[:,t],a_list_2[:,t]]) 170 | dr = dr - np.mean(reg_list[T-1-t].predict(np.transpose(X))*w_list_2[:,t]) 171 | #### v function 172 | aaa = [] 173 | for i in range(N/2): 174 | aaa.append(eval_policy_dens(s_list_2[i,t],1,0)) 175 | X0 = np.array([s_list_2[:,t],aaa]) 176 | if t==0: 177 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))) 178 | else: 179 | dr = dr + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*w_list_2[:,t-1]) 180 | 181 | reg_list_2 = [] 182 | for j in bbb[::-1]: 183 | if j==(T-1): 184 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 185 | pre_X = np.array([s_list_1[:,j],a_list_2[:,j]]) 186 | Y = r_list_2[:,j] 187 | else: 188 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 189 | aaa = [] 190 | for k in range(N/2): 191 | aaa.append(eval_policy_dens(s_list_2[k,j+1],1,0)) 192 | X0 = np.array([s_list_2[:,j+1],aaa]) 193 | Y = r_list_2[:,j]+reg.predict(np.transpose(X0)) 194 | reg = linear_model.LinearRegression() 195 | reg.fit(np.transpose(X), Y) 196 | ###print reg.score(np.transpose(X), Y) 197 | reg_list_2.append(reg) 198 | 199 | for t in range(T): 200 | dr = dr + np.mean(r_list_1[:,t]*w_list_1[:,t]) 201 | #### q function 202 | X = np.array([s_list_1[:,t],a_list_1[:,t]]) 203 | dr = dr - np.mean(reg_list_2[T-1-t].predict(np.transpose(X))*w_list_1[:,t]) 204 | #### v function 205 | aaa = [] 206 | for i in range(N/2): 207 | aaa.append(eval_policy_dens(s_list_2[i,t],1,0)) 208 | X0 = np.array([s_list_1[:,t],aaa]) 209 | if t==0: 210 | dr = dr + np.mean(reg_list_2[T-1-t].predict(np.transpose(X0))) 211 | else: 212 | dr = dr + np.mean(reg_list_2[T-1-t].predict(np.transpose(X0))*w_list_1[:,t-1]) 213 | 214 | estimator_list_dr.append(dr/2.0) 215 | ####print dr/2.0 216 | 217 | 218 | #### IPW estimator under M_2 219 | 220 | bbb = range(T) 221 | wreg_list = [] 222 | for j in bbb[::-1]: 223 | X = np.array([s_list[:,j],a_list[:,j]]) 224 | Y = w_list[:,j] 225 | reg = linear_model.LinearRegression() 226 | reg.fit(np.transpose(X),Y) 227 | ###print reg.score(np.transpose(X), Y) 228 | wreg_list.append(reg) 229 | 230 | ipw = 0.0 231 | for t in range(T): 232 | X = np.array([s_list[:,t],a_list[:,t]]) 233 | ipw = ipw + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list[:,t]) 234 | estimator_list_ipw2.append(ipw) 235 | 236 | ####print ipw 237 | 238 | ############### DR estiamtor under M_2 (cross fitting) ############### 239 | ############### 240 | ############### 241 | r_list_1 = r_list[0:N/2,:] 242 | r_list_2 = r_list[N/2:N,:] 243 | s_list_1 = s_list[0:N/2,:] 244 | s_list_2 = s_list[N/2:N,:] 245 | a_list_1 = a_list[0:N/2,:] 246 | a_list_2 = a_list[N/2:N,:] 247 | w_list_1 = w_list[0:N/2,:] 248 | w_list_2 = w_list[N/2:N,:] 249 | 250 | 251 | bbb = range(T) 252 | reg_list = [] 253 | for j in bbb[::-1]: 254 | if j==(T-1): 255 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 256 | pre_X = 
np.array([s_list_1[:,j],a_list_1[:,j]]) 257 | Y = r_list_1[:,j] 258 | else: 259 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 260 | aaa = [] 261 | for k in range(N/2): 262 | aaa.append(eval_policy_dens(s_list_1[k,j+1],1,0)) 263 | X0 = np.array([s_list_1[:,j+1],aaa]) 264 | Y = r_list_1[:,j]+reg.predict(np.transpose(X0)) 265 | reg = linear_model.LinearRegression() 266 | reg.fit(np.transpose(X), Y) 267 | ###print reg.score(np.transpose(X), Y) 268 | reg_list.append(reg) 269 | 270 | wreg_list = [] 271 | for j in bbb[::-1]: 272 | X = np.array([s_list_1[:,j],a_list_1[:,j]]) 273 | Y = w_list_1[:,j] 274 | reg = linear_model.LinearRegression() 275 | reg.fit(np.transpose(X),Y) 276 | ###print reg.score(np.transpose(X), Y) 277 | wreg_list.append(reg) 278 | 279 | 280 | dr2 = 0.0 281 | for t in range(T): 282 | X = np.array([s_list_2[:,t],a_list_2[:,t]]) 283 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X))*r_list_2[:,t]) 284 | #### q function 285 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X))*wreg_list[T-1-t].predict(np.transpose(X))) 286 | #### v function 287 | aaa = [] 288 | for i in range(N/2): 289 | aaa.append(eval_policy_dens(s_list_2[i,t],1,0)) 290 | X0 = np.array([s_list_2[:,t],aaa]) 291 | if t==0: 292 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 293 | else: 294 | X_ = np.array([s_list_2[:,t-1],a_list_2[:,t-1]]) 295 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 296 | estimator_list_dr2.append(dr2) 297 | 298 | reg_list_2 = [] 299 | for j in bbb[::-1]: 300 | if j==(T-1): 301 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 302 | pre_X = np.array([s_list_1[:,j],a_list_2[:,j]]) 303 | Y = r_list_2[:,j] 304 | else: 305 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 306 | aaa = [] 307 | for k in range(N/2): 308 | aaa.append(eval_policy_dens(s_list_2[k,j+1],1,0)) 309 | X0 = np.array([s_list_2[:,j+1],aaa]) 310 | Y = r_list_2[:,j]+reg.predict(np.transpose(X0)) 311 | reg = linear_model.LinearRegression() 312 | reg.fit(np.transpose(X), Y) 313 | ###print reg.score(np.transpose(X), Y) 314 | reg_list_2.append(reg) 315 | 316 | wreg_list_2 = [] 317 | for j in bbb[::-1]: 318 | X = np.array([s_list_2[:,j],a_list_2[:,j]]) 319 | Y = w_list_2[:,j] 320 | reg = linear_model.LinearRegression() 321 | reg.fit(np.transpose(X),Y) 322 | ###print reg.score(np.transpose(X), Y) 323 | wreg_list_2.append(reg) 324 | 325 | for t in range(T): 326 | X = np.array([s_list_1[:,t],a_list_1[:,t]]) 327 | dr2 = dr2 + np.mean(wreg_list_2[T-1-t].predict(np.transpose(X))*r_list_1[:,t]) 328 | #### q function 329 | dr2 = dr2 - np.mean(reg_list_2[T-1-t].predict(np.transpose(X))*wreg_list_2[T-1-t].predict(np.transpose(X))) 330 | #### v function 331 | aaa = [] 332 | for i in range(N/2): 333 | aaa.append(eval_policy_dens(s_list_1[i,t],1,0)) 334 | X0 = np.array([s_list_1[:,t],aaa]) 335 | if t==0: 336 | dr2 = dr2 + np.mean(reg_list_2[T-t-1].predict(np.transpose(X0))) 337 | else: 338 | X_ = np.array([s_list_1[:,t-1],a_list_1[:,t-1]]) 339 | dr2 = dr2 + np.mean(reg_list_2[T-1-t].predict(np.transpose(X0))*wreg_list_2[T-t].predict(np.transpose(X_))) 340 | estimator_list_dr2.append(dr2/2.0) 341 | #####print(dr2/2.0) 342 | 343 | 344 | #### Ratio-mis specified ############## 345 | ############################### 346 | ################################ 347 | 348 | num = 2 349 | 350 | bbb = range(T) 351 | wreg_list_mis = [] 352 | for j in bbb[::-1]: 353 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 354 | Y = w_list[:,j] 355 | reg = 
linear_model.LinearRegression() 356 | reg.fit(np.transpose(X),Y) 357 | ###print reg.score(np.transpose(X), Y) 358 | wreg_list_mis.append(reg) 359 | 360 | ipw = 0.0 361 | for t in range(T): 362 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 363 | ipw = ipw + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X))*r_list[:,t]) 364 | estimator_list_ipw2_ratio_mis.append(ipw) 365 | 366 | dr2 = 0.0 367 | for t in range(T): 368 | X_w = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 369 | X_r = np.array([s_list[:,t],a_list[:,t]]) 370 | dr2 = dr2 + np.mean(wreg_list_mis[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 371 | #### q function 372 | dr2 = dr2 - np.mean(reg_list[T-1-t].predict(np.transpose(X_r))*wreg_list_mis[T-1-t].predict(np.transpose(X_w))) 373 | #### v function 374 | aaa = [] 375 | for i in range(N): 376 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 377 | X0 = np.array([s_list[:,t],aaa]) 378 | if t==0: 379 | dr2 = dr2 + np.mean(reg_list[T-t-1].predict(np.transpose(X0))) 380 | else: 381 | X_ = np.array([s_list[:,t-1]*s_list[:,t-1],a_list[:,t-1]]) 382 | dr2 = dr2 + np.mean(reg_list[T-1-t].predict(np.transpose(X0))*wreg_list_mis[T-t].predict(np.transpose(X_))) 383 | estimator_list_dr2_ratio_mis.append(dr2) 384 | 385 | 386 | ### q-misspcified 387 | ################################ 388 | ################################# 389 | ################################## 390 | 391 | 392 | 393 | #### DM estimator 394 | bbb = range(T) 395 | reg_list_mis = [] 396 | for j in bbb[::-1]: 397 | if j==(T-1): 398 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 399 | pre_X = np.array([s_list[:,j],a_list[:,j]]) 400 | Y = r_list[:,j] 401 | else: 402 | X = np.array([s_list[:,j]*s_list[:,j],a_list[:,j]]) 403 | aaa = [] 404 | for k in range(N): 405 | aaa.append(eval_policy_dens(s_list[k,j+1],1,0)) 406 | X0 = np.array([s_list[:,j+1]*s_list[:,j+1],aaa]) 407 | Y = r_list[:,j]+reg.predict(np.transpose(X0)) 408 | reg = linear_model.LinearRegression() 409 | reg.fit(np.transpose(X), Y) 410 | ###print reg.score(np.transpose(X), Y) 411 | reg_list_mis.append(reg) 412 | 413 | 414 | aaa = [] 415 | for i in range(N): 416 | aaa.append(eval_policy_dens(s_list[i,0],1,0)) 417 | X0 = np.array([s_list[:,0],aaa]) 418 | v0 = reg.predict(np.transpose(X0)) 419 | estimator_list_dm_q_mis.append(np.mean(v0)) 420 | 421 | ### DR estiamtor under M_1 422 | dr = 0.0 423 | for t in range(T): 424 | dr = dr + np.mean(r_list[:,t]*w_list[:,t]) 425 | #### q function 426 | X = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 427 | dr = dr - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X))*w_list[:,t]) 428 | #### v function 429 | aaa = [] 430 | for i in range(N): 431 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 432 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 433 | if t==0: 434 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))) 435 | else: 436 | dr = dr + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*w_list[:,t-1]) 437 | 438 | estimator_list_dr_q_mis.append(dr) 439 | 440 | print iii 441 | 442 | 443 | ### DR estiamtor under M_2 444 | dr2 = 0.0 445 | for t in range(T): 446 | X_w = np.array([s_list[:,t],a_list[:,t]]) 447 | X_r = np.array([s_list[:,t]*s_list[:,t],a_list[:,t]]) 448 | dr2 = dr2 + np.mean(wreg_list[T-1-t].predict(np.transpose(X_w))*r_list[:,t]) 449 | #### q function 450 | dr2 = dr2 - np.mean(reg_list_mis[T-1-t].predict(np.transpose(X_r))*wreg_list[T-1-t].predict(np.transpose(X_w))) 451 | #### v function 452 | aaa = [] 453 | for i in range(N): 454 | aaa.append(eval_policy_dens(s_list[i,t],1,0)) 
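        # `aaa` holds the evaluation policy's probability of choosing action 1 at each
        # state s_t. Since the fitted q-models are linear in the action, plugging this
        # probability in as the "action" feature below makes reg.predict return
        # pi_e(1|s_t)*q(s_t,1) + pi_e(0|s_t)*q(s_t,0), i.e. the estimated state value
        # v(s_t) under the evaluation policy.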
455 | X0 = np.array([s_list[:,t]*s_list[:,t],aaa]) 456 | if t==0: 457 | dr2 = dr2 + np.mean(reg_list_mis[T-t-1].predict(np.transpose(X0))) 458 | else: 459 | X_ = np.array([s_list[:,t-1],a_list[:,t-1]]) 460 | dr2 = dr2 + np.mean(reg_list_mis[T-1-t].predict(np.transpose(X0))*wreg_list[T-t].predict(np.transpose(X_))) 461 | estimator_list_dr2_q_mis.append(dr2) 462 | 463 | 464 | np.savez("estimator_list_ipw_%d"+ver_+"_"+str(N),a=estimator_list_ipw) 465 | np.savez("estimator_list_dr_%d"+ver_+"_"+str(N), a=estimator_list_dr) 466 | np.savez("estimator_list_dm_%d"+ver_+"_"+str(N), a=estimator_list_dm) 467 | np.savez("estimator_list_ipw2_%d"+ver_+"_"+str(N),a=estimator_list_ipw2) 468 | np.savez("estimator_list_dr2_%d"+ver_+"_"+str(N),a=estimator_list_dr2) 469 | np.savez("estimator_list_ipw3_%d"+ver_+"_"+str(N),a=estimator_list_ipw3) 470 | np.savez("estimator_list_dr3_%d"+ver_+"_"+str(N),a=estimator_list_dr3) 471 | np.savez("estimator_list_ipw2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_ipw2_ratio_mis) 472 | np.savez("estimator_list_dr2_ratio_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_ratio_mis) 473 | np.savez("estimator_list_dm_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dm_q_mis) 474 | np.savez("estimator_list_dr_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr_q_mis) 475 | np.savez("estimator_list_dr2_q_mis_%d"+ver_+"_"+str(N),a=estimator_list_dr2_q_mis) 476 | 477 | -------------------------------------------------------------------------------- /exp5_1_py3/rmse_all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import time 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | import multiprocessing as mp 8 | from joblib import Parallel, delayed 9 | import argparse 10 | 11 | def sigmoid(x): 12 | return 1.0 / (1.0 + np.exp(-0.1 * x)) 13 | 14 | def estimate_true_value_single(seed, T=30, alpha=0.9, beta=0.2): 15 | """ 16 | Run a single episode using the evaluation policy to estimate the true value. 17 | """ 18 | np.random.seed(seed) 19 | 20 | # Initial state 21 | s = np.random.normal(0.5, 0.2) 22 | 23 | # Total return for this episode 24 | episode_return = 0.0 25 | 26 | # Run the episode 27 | for j in range(T): 28 | # Use evaluation policy directly 29 | a_prob = alpha * sigmoid(s) + (1 - alpha) * np.random.uniform(0.0, 1.0) 30 | a = np.random.binomial(1, a_prob, 1)[0] 31 | 32 | # Get reward 33 | r = np.random.normal(0.9 * s + 0.3 * a - 0.02 * (j % 2), 0.2) 34 | episode_return += r 35 | 36 | # State transition (if not the last step) 37 | if j < T-1: 38 | s = np.random.normal(0.02 * ((j+1) % 2) + s * 1.0 - 0.3 * (a - 0.5), 0.2) 39 | 40 | return episode_return 41 | 42 | def estimate_true_value(n_episodes=100000, T=30, alpha=0.9, beta=0.2, n_jobs=-1): 43 | """ 44 | Estimate the true expected return of the evaluation policy. 
45 | """ 46 | start_time = time.time() 47 | print(f"Estimating true value using {n_episodes} direct episodes...") 48 | 49 | # Use joblib for parallelization 50 | if n_jobs == -1: 51 | n_jobs = mp.cpu_count() 52 | 53 | # Create batches to show progress 54 | batch_size = min(10000, n_episodes) 55 | n_batches = (n_episodes + batch_size - 1) // batch_size 56 | 57 | all_returns = [] 58 | 59 | for batch in range(n_batches): 60 | start_idx = batch * batch_size 61 | end_idx = min((batch + 1) * batch_size, n_episodes) 62 | current_batch_size = end_idx - start_idx 63 | 64 | print(f"Running batch {batch+1}/{n_batches} ({start_idx+1}-{end_idx} of {n_episodes})...") 65 | 66 | # Run episodes in parallel 67 | batch_returns = Parallel(n_jobs=n_jobs)( 68 | delayed(estimate_true_value_single)( 69 | seed=start_idx+i, 70 | T=T, 71 | alpha=alpha, 72 | beta=beta 73 | ) for i in range(current_batch_size) 74 | ) 75 | 76 | all_returns.extend(batch_returns) 77 | 78 | # Show intermediate results 79 | current_mean = np.mean(all_returns) 80 | current_se = np.std(all_returns) / np.sqrt(len(all_returns)) 81 | 82 | print(f" Completed {len(all_returns)}/{n_episodes} episodes") 83 | print(f" Current estimate: {current_mean:.6f} ± {current_se:.6f}") 84 | 85 | # Calculate final estimate and standard error 86 | true_value = np.mean(all_returns) 87 | std_error = np.std(all_returns) / np.sqrt(n_episodes) 88 | 89 | # Calculate time taken 90 | elapsed_time = time.time() - start_time 91 | minutes = int(elapsed_time // 60) 92 | seconds = int(elapsed_time % 60) 93 | 94 | print(f"\nEstimation complete in {minutes}m {seconds}s") 95 | print(f"Final true value estimate: {true_value:.6f} ± {std_error:.6f}") 96 | print(f"Based on {n_episodes} episodes with evaluation policy (α={alpha})") 97 | 98 | return true_value, std_error 99 | 100 | def load_npz_file(file_path): 101 | """Load a single NPZ file and return its contents.""" 102 | try: 103 | data = np.load(file_path) 104 | # Check for different array names - first 'a' (old format) then 'data' (new format) 105 | if 'a' in data: 106 | return data['a'] 107 | elif 'data' in data: 108 | return data['data'] 109 | else: 110 | # Try to get the first array in the file 111 | array_keys = list(data.keys()) 112 | if array_keys: 113 | return data[array_keys[0]] 114 | else: 115 | print(f"Warning: No valid arrays found in {file_path}") 116 | return np.array([]) 117 | except Exception as e: 118 | print(f"Error loading {file_path}: {e}") 119 | return np.array([]) 120 | 121 | def load_estimator_results(directory, sample_sizes): 122 | """ 123 | Load all estimator results from NPZ files with the new naming convention. 
124 | 125 | Parameters: 126 | ----------- 127 | directory : str or list 128 | Directory or list of directories where NPZ files are stored 129 | sample_sizes : list 130 | List of sample sizes to load 131 | 132 | Returns: 133 | -------- 134 | dict 135 | Dictionary with sample sizes as keys and dictionaries of estimators as values 136 | """ 137 | all_estimators = {} 138 | 139 | # Estimator type mapping for nice display names 140 | estimator_display_names = { 141 | 'ipw': 'IPW', 142 | 'dr': 'DRL(M₁)', 143 | 'dm': 'DM', 144 | 'ipw2': 'IPW₂', 145 | 'dr2': 'DRL(M₂)', 146 | 'ipw_mis_q': 'IPW (q mis.)', 147 | 'dr_mis_q': 'DRL(M₁) (q mis.)', 148 | 'dm_mis_q': 'DM (q mis.)', 149 | 'ipw2_mis_q': 'IPW₂ (q mis.)', 150 | 'dr2_mis_q': 'DRL(M₂) (q mis.)', 151 | 'ipw_mis_mu': 'IPW (μ mis.)', 152 | 'dr_mis_mu': 'DRL(M₁) (μ mis.)', 153 | 'dm_mis_mu': 'DM (μ mis.)', 154 | 'ipw2_mis_mu': 'IPW₂ (μ mis.)', 155 | 'dr2_mis_mu': 'DRL(M₂) (μ mis.)' 156 | } 157 | 158 | # Track if we found any files 159 | found_any_files = False 160 | 161 | for N in sample_sizes: 162 | all_estimators[N] = {} 163 | 164 | # Check for multiple possible file pattern formats 165 | possible_patterns = [ 166 | f"_{N}.npz", # New format: estimator_list_ipw_1500.npz 167 | f"_0default_{N}.npz", # Old format: estimator_list_ipw_0default_1500.npz 168 | f"_%d0default_{N}.npz", # Old format with %d: estimator_list_ipw_%d0default_1500.npz 169 | f"_n{N}.npz", # Alternative format: estimator_list_ipw_n1500.npz 170 | f"_gpu_{N}.npz", # GPU format: gpu_ipw_1500.npz 171 | f"_{N}_gpu.npz" # Another GPU format: ipw_1500_gpu.npz 172 | ] 173 | 174 | print(f"\nLooking for results with N = {N}:") 175 | 176 | # Check if directory is a list of possible directories 177 | if isinstance(directory, list): 178 | search_dirs = directory 179 | else: 180 | search_dirs = [directory] 181 | 182 | for search_dir in search_dirs: 183 | # Find all NPZ files for this sample size 184 | for pattern in possible_patterns: 185 | npz_files = [f for f in os.listdir(search_dir) if f.endswith('.npz') and pattern in f] 186 | 187 | if npz_files: 188 | print(f" Found {len(npz_files)} files with pattern {pattern}") 189 | found_any_files = True 190 | 191 | for npz_file in npz_files: 192 | try: 193 | # Extract estimator name using different patterns 194 | estimator_key = None 195 | 196 | # Try different naming conventions 197 | if "estimator_list_" in npz_file: 198 | # Extract part between "estimator_list_" and the pattern 199 | estimator_part = npz_file.replace("estimator_list_", "") 200 | for p in possible_patterns: 201 | if p in estimator_part: 202 | estimator_key = estimator_part.split(p.replace(".npz", ""))[0] 203 | # Remove trailing underscore if present 204 | estimator_key = estimator_key.rstrip('_') 205 | break 206 | elif "gpu_" in npz_file: 207 | # Format like gpu_ipw_1500.npz 208 | parts = npz_file.split('_') 209 | if len(parts) > 1: 210 | estimator_key = parts[1] 211 | else: 212 | # Last resort: try to extract from filename 213 | parts = npz_file.split('_') 214 | if len(parts) > 0: 215 | estimator_key = parts[0] 216 | 217 | # If we couldn't determine the estimator key, use the filename without extension 218 | if not estimator_key: 219 | estimator_key = os.path.splitext(npz_file)[0] 220 | 221 | # Load the NPZ file 222 | values = load_npz_file(os.path.join(search_dir, npz_file)) 223 | 224 | if len(values) > 0: 225 | # Use display name if available 226 | display_name = estimator_display_names.get(estimator_key, estimator_key) 227 | all_estimators[N][display_name] = values 228 | 
print(f" Loaded {display_name} from {npz_file}: {len(values)} values") 229 | except Exception as e: 230 | print(f" Error processing {npz_file}: {e}") 231 | 232 | if not found_any_files: 233 | print("\nWARNING: No NPZ files found matching the expected patterns!") 234 | print(f"Searched in: {directory}") 235 | print(f"Looking for sample sizes: {sample_sizes}") 236 | print("\nPlease check that your files are in the correct location and named correctly.") 237 | # List all files in the directory for debugging 238 | if isinstance(directory, str) and os.path.exists(directory): 239 | print("\nFiles found in the directory:") 240 | for f in os.listdir(directory): 241 | if f.endswith('.npz'): 242 | print(f" {f}") 243 | 244 | return all_estimators 245 | 246 | def calculate_rmse(estimators, true_value): 247 | """ 248 | Calculate RMSE, standard error, and bias for each estimator 249 | 250 | Parameters: 251 | ----------- 252 | estimators : dict 253 | Dictionary with estimator names as keys and arrays of values as values 254 | true_value : float 255 | The true parameter value being estimated 256 | 257 | Returns: 258 | -------- 259 | DataFrame 260 | DataFrame with RMSE, std errors, and bias for each estimator 261 | """ 262 | results = [] 263 | 264 | for name, values in estimators.items(): 265 | if len(values) > 0: 266 | # Calculate RMSE 267 | squared_errors = np.square(np.array(values) - true_value) 268 | rmse = np.sqrt(np.mean(squared_errors)) 269 | 270 | # Calculate standard error of RMSE 271 | # Based on the delta method approximation 272 | se_rmse = np.std(squared_errors) / (2 * rmse * np.sqrt(len(values))) 273 | 274 | # Calculate bias 275 | bias = np.mean(values) - true_value 276 | 277 | # Calculate mean and standard deviation 278 | mean = np.mean(values) 279 | std = np.std(values) 280 | 281 | results.append({ 282 | 'Estimator': name, 283 | 'RMSE': rmse, 284 | 'SE': se_rmse, 285 | 'Bias': bias, 286 | 'Mean': mean, 287 | 'Std': std, 288 | 'n_samples': len(values) 289 | }) 290 | 291 | # Handle empty results 292 | if not results: 293 | print("WARNING: No data available to calculate RMSE!") 294 | # Return empty DataFrame with the expected columns 295 | return pd.DataFrame(columns=['Estimator', 'RMSE', 'SE', 'Bias', 'Mean', 'Std', 'n_samples']) 296 | 297 | return pd.DataFrame(results) 298 | 299 | def create_rmse_table(all_estimators, true_value): 300 | """ 301 | Create RMSE table for all sample sizes 302 | 303 | Parameters: 304 | ----------- 305 | all_estimators : dict 306 | Dictionary with sample sizes as keys and dictionaries of estimators as values 307 | true_value : float 308 | The true parameter value 309 | 310 | Returns: 311 | -------- 312 | tuple 313 | Tuple containing (rmse_table, se_table, bias_table, all_results) 314 | """ 315 | all_results = [] 316 | 317 | for N, estimators in all_estimators.items(): 318 | if estimators: # Check if there are any estimators for this N 319 | results = calculate_rmse(estimators, true_value) 320 | if not results.empty: 321 | results['N'] = N 322 | all_results.append(results) 323 | else: 324 | print(f"No estimators found for N={N}") 325 | 326 | # Handle case where no results were calculated 327 | if not all_results: 328 | print("WARNING: No valid results found to create RMSE table!") 329 | empty_df = pd.DataFrame(columns=['Estimator', 'RMSE', 'SE', 'Bias', 'Mean', 'Std', 'n_samples', 'N']) 330 | return empty_df, empty_df, empty_df, empty_df 331 | 332 | # Combine results from all sample sizes 333 | combined_results = pd.concat(all_results, ignore_index=True) 334 | 335 
| if 'Estimator' not in combined_results.columns or 'N' not in combined_results.columns: 336 | print("WARNING: Missing required columns in results!") 337 | print(f"Available columns: {combined_results.columns.tolist()}") 338 | empty_df = pd.DataFrame(columns=['Estimator', 'RMSE', 'SE', 'Bias']) 339 | return empty_df, empty_df, empty_df, combined_results 340 | 341 | # Create pivot tables 342 | rmse_table = combined_results.pivot(index='Estimator', columns='N', values='RMSE') 343 | se_table = combined_results.pivot(index='Estimator', columns='N', values='SE') 344 | bias_table = combined_results.pivot(index='Estimator', columns='N', values='Bias') 345 | 346 | return rmse_table, se_table, bias_table, combined_results 347 | 348 | def create_latex_table(rmse_table, se_table, output_file): 349 | """ 350 | Create a LaTeX table with RMSE values and standard errors 351 | 352 | Parameters: 353 | ----------- 354 | rmse_table : DataFrame 355 | DataFrame with RMSE values 356 | se_table : DataFrame 357 | DataFrame with standard error values 358 | output_file : str 359 | Output file path 360 | """ 361 | # Skip if tables are empty 362 | if rmse_table.empty or se_table.empty: 363 | print(f"Skipping LaTeX table creation because data tables are empty") 364 | return 365 | 366 | with open(output_file, "w") as f: 367 | f.write("\\begin{table}[ht]\n") 368 | f.write("\\centering\n") 369 | f.write("\\caption{RMSE of Estimators (with standard errors in parentheses)}\n") 370 | f.write("\\begin{tabular}{l" + "c" * len(rmse_table.columns) + "}\n") 371 | f.write("\\hline\n") 372 | 373 | # Header row 374 | f.write("Estimator & " + " & ".join([f"N={n}" for n in rmse_table.columns]) + " \\\\\n") 375 | f.write("\\hline\n") 376 | 377 | # Data rows 378 | for estimator in rmse_table.index: 379 | row = f"{estimator}" 380 | for n in rmse_table.columns: 381 | if n in rmse_table.columns and n in se_table.columns: 382 | rmse = rmse_table.loc[estimator, n] 383 | se = se_table.loc[estimator, n] 384 | row += f" & {rmse:.4f} ({se:.4f})" 385 | else: 386 | row += " & -" 387 | row += " \\\\\n" 388 | f.write(row) 389 | 390 | f.write("\\hline\n") 391 | f.write("\\end{tabular}\n") 392 | f.write("\\end{table}\n") 393 | 394 | print(f"LaTeX table created: {output_file}") 395 | 396 | def create_visualizations(rmse_table, se_table, combined_results, output_dir): 397 | """ 398 | Create visualizations for RMSE results 399 | 400 | Parameters: 401 | ----------- 402 | rmse_table : DataFrame 403 | Pivot table with RMSE values 404 | se_table : DataFrame 405 | Pivot table with standard error values 406 | combined_results : DataFrame 407 | Combined results DataFrame 408 | output_dir : str 409 | Directory to save visualizations 410 | """ 411 | # Skip if tables are empty 412 | if rmse_table.empty or se_table.empty or combined_results.empty: 413 | print(f"Skipping visualization creation because data tables are empty") 414 | return 415 | 416 | # Set style 417 | sns.set_style("whitegrid") 418 | plt.rcParams['figure.figsize'] = (12, 8) 419 | plt.rcParams['savefig.dpi'] = 300 420 | 421 | try: 422 | # 1. 
RMSE by sample size for each estimator 423 | plt.figure(figsize=(12, 8)) 424 | for estimator in rmse_table.index: 425 | plt.plot(rmse_table.columns, rmse_table.loc[estimator], marker='o', linewidth=2, label=estimator) 426 | 427 | plt.xlabel('Sample Size (N)', fontsize=14) 428 | plt.ylabel('RMSE', fontsize=14) 429 | plt.title('RMSE by Sample Size for Each Estimator', fontsize=16) 430 | plt.legend(fontsize=12) 431 | plt.grid(True) 432 | plt.tight_layout() 433 | plt.savefig(os.path.join(output_dir, 'rmse_by_sample_size.png')) 434 | plt.close() 435 | 436 | # 2. Bar plot for each sample size 437 | for N in rmse_table.columns: 438 | plt.figure(figsize=(14, 8)) 439 | 440 | # Sort estimators by RMSE 441 | sorted_estimators = rmse_table[N].sort_values().index 442 | 443 | # Plot RMSE bars 444 | ax = plt.bar(range(len(sorted_estimators)), rmse_table.loc[sorted_estimators, N]) 445 | 446 | # Add error bars 447 | plt.errorbar( 448 | x=range(len(sorted_estimators)), 449 | y=rmse_table.loc[sorted_estimators, N], 450 | yerr=se_table.loc[sorted_estimators, N], 451 | fmt='none', capsize=5, color='black', elinewidth=1.5 452 | ) 453 | 454 | plt.title(f'RMSE for Each Estimator (N = {N})', fontsize=16) 455 | plt.ylabel('RMSE', fontsize=14) 456 | plt.xticks(range(len(sorted_estimators)), sorted_estimators, rotation=45, ha='right', fontsize=12) 457 | plt.grid(axis='y') 458 | plt.tight_layout() 459 | plt.savefig(os.path.join(output_dir, f'rmse_n{N}.png')) 460 | plt.close() 461 | 462 | # 3. Heatmap of RMSE values 463 | plt.figure(figsize=(10, 8)) 464 | sns.heatmap(rmse_table, annot=True, cmap='YlGnBu', fmt='.4f') 465 | plt.title('RMSE Heatmap by Estimator and Sample Size', fontsize=16) 466 | plt.tight_layout() 467 | plt.savefig(os.path.join(output_dir, 'rmse_heatmap.png')) 468 | plt.close() 469 | 470 | # 4. 
Bias comparison if bias data is available 471 | if 'Bias' in combined_results.columns and 'N' in combined_results.columns: 472 | bias_table = combined_results.pivot(index='Estimator', columns='N', values='Bias') 473 | 474 | plt.figure(figsize=(12, 8)) 475 | for estimator in bias_table.index: 476 | plt.plot(bias_table.columns, bias_table.loc[estimator], marker='o', linewidth=2, label=estimator) 477 | 478 | plt.axhline(y=0, color='r', linestyle='-', alpha=0.5) # Add zero line for reference 479 | plt.xlabel('Sample Size (N)', fontsize=14) 480 | plt.ylabel('Bias', fontsize=14) 481 | plt.title('Bias by Sample Size for Each Estimator', fontsize=16) 482 | plt.legend(fontsize=12) 483 | plt.grid(True) 484 | plt.tight_layout() 485 | plt.savefig(os.path.join(output_dir, 'bias_by_sample_size.png')) 486 | plt.close() 487 | 488 | print(f"All visualizations created in {output_dir}") 489 | 490 | except Exception as e: 491 | print(f"Error creating visualizations: {e}") 492 | 493 | def find_npz_directories(base_dir): 494 | """Find directories containing NPZ files""" 495 | npz_dirs = [] 496 | 497 | # First check the base directory 498 | if any(f.endswith('.npz') for f in os.listdir(base_dir)): 499 | npz_dirs.append(base_dir) 500 | 501 | # Then check subdirectories 502 | for item in os.listdir(base_dir): 503 | item_path = os.path.join(base_dir, item) 504 | if os.path.isdir(item_path): 505 | if any(f.endswith('.npz') for f in os.listdir(item_path)): 506 | npz_dirs.append(item_path) 507 | 508 | return npz_dirs 509 | 510 | def main(): 511 | # Parse command line arguments 512 | parser = argparse.ArgumentParser(description='Calculate RMSE for reinforcement learning estimators') 513 | parser.add_argument('--sample-sizes', type=int, nargs='+', default=[100, 1500, 3000, 4500], 514 | help='Sample sizes to analyze') 515 | parser.add_argument('--true-value', type=float, default=None, 516 | help='Known true value (if not provided, will be estimated)') 517 | parser.add_argument('--n-episodes', type=int, default=100000, 518 | help='Number of episodes to use for estimating true value') 519 | parser.add_argument('--output-dir', type=str, default='rmse_analysis', 520 | help='Directory to save analysis results') 521 | parser.add_argument('--list-files', action='store_true', 522 | help='List all NPZ files found in the directories') 523 | 524 | args = parser.parse_args() 525 | 526 | # Find the current directory 527 | current_dir = os.getcwd() 528 | print(f"Current working directory: {current_dir}") 529 | 530 | # Look for directories containing NPZ files 531 | npz_dirs = find_npz_directories(current_dir) 532 | 533 | if not npz_dirs: 534 | print("No directories with NPZ files found! Using current directory.") 535 | directory = current_dir 536 | else: 537 | print(f"Found {len(npz_dirs)} directories with NPZ files:") 538 | for i, d in enumerate(npz_dirs): 539 | print(f" {i+1}. 
{d}") 540 | 541 | directory = npz_dirs # Search in all found directories 542 | 543 | # List files if requested 544 | if args.list_files: 545 | print("\nListing all NPZ files in the directories:") 546 | for d in (npz_dirs if npz_dirs else [current_dir]): 547 | print(f"\nFiles in {d}:") 548 | npz_files = [f for f in os.listdir(d) if f.endswith('.npz')] 549 | for f in sorted(npz_files): 550 | print(f" {f}") 551 | 552 | # Create output directory 553 | output_dir = os.path.join(current_dir, args.output_dir) 554 | os.makedirs(output_dir, exist_ok=True) 555 | 556 | # Step 1: Get the true value 557 | if args.true_value is not None: 558 | # Use provided true value 559 | true_value = args.true_value 560 | print(f"Using provided true value: {true_value}") 561 | else: 562 | # Check if we already have the true value saved 563 | true_value_file = os.path.join(current_dir, "true_value_estimate.txt") 564 | if os.path.exists(true_value_file): 565 | try: 566 | with open(true_value_file, "r") as f: 567 | lines = f.readlines() 568 | true_value = float(lines[0].split(":")[1].strip()) 569 | print(f"Using existing true value: {true_value}") 570 | except Exception as e: 571 | print(f"Error reading true value file: {e}") 572 | print("Estimating new true value...") 573 | true_value, std_error = estimate_true_value(n_episodes=args.n_episodes) 574 | else: 575 | # Estimate the true value 576 | true_value, std_error = estimate_true_value(n_episodes=args.n_episodes) 577 | 578 | # Save the result 579 | with open(os.path.join(output_dir, "true_value_estimate.txt"), "w") as f: 580 | f.write(f"Estimated true value: {true_value}\n") 581 | f.write(f"Standard error: {std_error}\n") 582 | f.write(f"Parameters: alpha=0.9, beta=0.2, T=30, episodes={args.n_episodes}\n") 583 | f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") 584 | 585 | # Step 2: Load all estimator results 586 | all_estimators = load_estimator_results(directory, args.sample_sizes) 587 | 588 | # Step 3: Calculate RMSE for all estimators 589 | rmse_table, se_table, bias_table, combined_results = create_rmse_table(all_estimators, true_value) 590 | 591 | # Check if we have any results 592 | if rmse_table.empty: 593 | print("\nNo valid results were found to analyze. 
Please check your file paths and naming conventions.") 594 | return 595 | 596 | # Step 4: Display results 597 | print("\nRMSE Results:") 598 | print(rmse_table) 599 | 600 | print("\nStandard Errors:") 601 | print(se_table) 602 | 603 | print("\nBias Values:") 604 | print(bias_table) 605 | 606 | # Step 5: Save results to CSV 607 | rmse_table.to_csv(os.path.join(output_dir, "rmse_table.csv")) 608 | se_table.to_csv(os.path.join(output_dir, "se_table.csv")) 609 | bias_table.to_csv(os.path.join(output_dir, "bias_table.csv")) 610 | combined_results.to_csv(os.path.join(output_dir, "combined_results.csv"), index=False) 611 | 612 | # Step 6: Create LaTeX table for publication 613 | create_latex_table(rmse_table, se_table, os.path.join(output_dir, "rmse_table.tex")) 614 | 615 | # Step 7: Create visualizations 616 | create_visualizations(rmse_table, se_table, combined_results, output_dir) 617 | 618 | print(f"\nAll analysis results saved to {output_dir}/") 619 | 620 | if __name__ == "__main__": 621 | main() -------------------------------------------------------------------------------- /exp5_1_py3/rmse_learner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import argparse 5 | from joblib import Parallel, delayed 6 | import multiprocessing as mp 7 | import time 8 | 9 | # Estimators from your outputs 10 | ESTIMATORS = [ 11 | 'ipw', 'dm', 'ipw2', 'dr', 'dr2', 12 | 'ipw_mis_q', 'dm_mis_q', 'ipw2_mis_q', 'dr_mis_q', 'dr2_mis_q', 13 | 'ipw_mis_mu', 'dm_mis_mu', 'ipw2_mis_mu', 'dr_mis_mu', 'dr2_mis_mu' 14 | ] 15 | 16 | def sigmoid(x): 17 | return 1.0 / (1.0 + np.exp(-0.1 * x)) 18 | 19 | def estimate_true_value_single(seed, T=30, alpha=0.9): 20 | np.random.seed(seed) 21 | s = np.random.normal(0.5, 0.2) 22 | total_r = 0 23 | for j in range(T): 24 | a_prob = alpha * sigmoid(s) + (1 - alpha) * np.random.uniform(0, 1) 25 | a = np.random.binomial(1, a_prob) 26 | r = np.random.normal(0.9 * s + 0.3 * a - 0.02*(j%2), 0.2) 27 | total_r += r 28 | if j < T-1: 29 | s = np.random.normal(0.02*(j%2) + s - 0.3*(a-0.5), 0.2) 30 | return total_r 31 | 32 | def estimate_true_value(n_episodes=100000, T=30, alpha=0.9, n_jobs=-1): 33 | if n_jobs == -1: 34 | n_jobs = mp.cpu_count() 35 | returns = Parallel(n_jobs=n_jobs)(delayed(estimate_true_value_single)(i, T, alpha) for i in range(n_episodes)) 36 | mean_return = np.mean(returns) 37 | std_error = np.std(returns) / np.sqrt(n_episodes) 38 | print(f"Estimated true value: {mean_return:.6f} ± {std_error:.6f}") 39 | return mean_return 40 | 41 | def load_estimates(N, mu_method): 42 | estimates = {} 43 | for est in ESTIMATORS: 44 | fname = f"estimator_list_{est}_{mu_method}_{N}.npz" 45 | if os.path.exists(fname): 46 | data = np.load(fname) 47 | estimates[est] = data['a'] if 'a' in data else data[list(data.keys())[0]] 48 | else: 49 | print(f"Warning: {fname} not found.") 50 | estimates[est] = np.array([]) 51 | return estimates 52 | 53 | def calculate_rmse(estimates, true_value): 54 | results = [] 55 | for name, values in estimates.items(): 56 | if values.size == 0: 57 | continue 58 | bias = np.mean(values) - true_value 59 | rmse = np.sqrt(np.mean((values - true_value)**2)) 60 | se = np.std((values - true_value)**2) / (2 * rmse * np.sqrt(len(values))) 61 | results.append({'Estimator': name, 'RMSE': rmse, 'Bias': bias, 'SE': se, 'Std': np.std(values)}) 62 | return pd.DataFrame(results) 63 | 64 | def main(): 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument('--N', type=int, 
default=1500) 67 | parser.add_argument('--true_value', type=float, default=None) 68 | parser.add_argument('--n_episodes', type=int, default=100000) 69 | parser.add_argument('--output', type=str, default='rmse_results.csv') 70 | parser.add_argument('--mu_method', type=str, default='linear', choices=['linear', 'mlp', 'rf'], help='W-function learner') 71 | 72 | 73 | args = parser.parse_args() 74 | 75 | # Step 1: Estimate true value if not provided 76 | if args.true_value is None: 77 | print("No true value provided. Estimating...") 78 | true_value = estimate_true_value(n_episodes=args.n_episodes) 79 | else: 80 | true_value = args.true_value 81 | print(f"Using provided true value: {true_value}") 82 | 83 | # Step 2: Load estimates 84 | estimates = load_estimates(args.N, args.mu_method) 85 | 86 | # Step 3: Calculate RMSE 87 | results = calculate_rmse(estimates, true_value) 88 | results = results.sort_values(by='RMSE') 89 | 90 | # Step 4: Save and display 91 | results.to_csv(args.output, index=False) 92 | print("\n=== RMSE Results ===") 93 | print(results) 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /exp5_1_py3/toytoy2_par_table.py: -------------------------------------------------------------------------------- 1 | # Full GPU-Accelerated Simulation with 15 Estimators Using PyTorch 2 | import torch 3 | import numpy as np 4 | import time 5 | import argparse 6 | 7 | # Use GPU if available 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | print(f"Using device: {device}") 10 | 11 | def sigmoid(x): 12 | return 1.0 / (1.0 + torch.exp(-0.1 * x)) 13 | 14 | def behav_dens(s, a, beta): 15 | b = beta * sigmoid(s) + beta * 0.5 16 | # Convert a_val to tensor if it's a scalar 17 | if not isinstance(a, torch.Tensor): 18 | a = torch.tensor(a, device=device, dtype=torch.float32) 19 | # Ensure a is the same shape as b for comparison 20 | if a.dim() == 0: 21 | a = a.expand_as(b) 22 | return torch.where(a == 1.0, b, 1.0 - b) 23 | 24 | def eval_dens(s, a, alpha): 25 | b = alpha * sigmoid(s) + (1 - alpha) * 0.5 26 | # Convert a_val to tensor if it's a scalar 27 | if not isinstance(a, torch.Tensor): 28 | a = torch.tensor(a, device=device, dtype=torch.float32) 29 | # Ensure a is the same shape as b for comparison 30 | if a.dim() == 0: 31 | a = a.expand_as(b) 32 | return torch.where(a == 1.0, b, 1.0 - b) 33 | 34 | def linear_regression(X, y): 35 | beta_hat = torch.linalg.lstsq(X, y).solution 36 | return beta_hat 37 | 38 | def regress_q(s, a, r, alpha, squared=False): 39 | # Added alpha parameter to fix the scope issue 40 | N, T = s.shape 41 | q_weights = [] 42 | V_next = torch.zeros(N, device=device) 43 | for t in reversed(range(T)): 44 | s_t = s[:, t]**2 if squared else s[:, t] 45 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 46 | y = r[:, t] if t == T - 1 else r[:, t] + V_next 47 | w = linear_regression(sa, y) 48 | q_weights.insert(0, w) 49 | # V for next step 50 | V_next = 0 51 | for a_val in [0.0, 1.0]: 52 | s_a = torch.stack([s_t, s_t * a_val, torch.ones(N, device=device)], dim=1) 53 | # Create a tensor of a_val with the same size as s[:, t] 54 | a_val_tensor = torch.full_like(s[:, t], a_val) 55 | # Pass a_val_tensor to eval_dens 56 | prob = eval_dens(s[:, t], a_val_tensor, alpha) 57 | V_next += prob * (s_a @ w).squeeze() 58 | return q_weights 59 | 60 | def regress_mu(s, a, w_, squared=False): 61 | N, T = s.shape 62 | mu_weights = [] 63 | for t in range(T): 64 | s_t = 
s[:, t]**2 if squared else s[:, t] 65 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 66 | y = w_[:, t] 67 | mu_weights.append(linear_regression(sa, y)) 68 | return mu_weights 69 | 70 | def eval_dm(s, q0, alpha, squared=False): 71 | # Added alpha parameter to fix the scope issue 72 | N = s.shape[0] 73 | s0 = s[:, 0]**2 if squared else s[:, 0] 74 | V = torch.zeros(N, device=device) 75 | for a_val in [0.0, 1.0]: 76 | sa = torch.stack([s0, s0 * a_val, torch.ones(N, device=device)], dim=1) 77 | # Create a tensor of a_val with the same size as s[:, 0] 78 | a_val_tensor = torch.full_like(s[:, 0], a_val) 79 | # Pass a_val_tensor to eval_dens 80 | V += eval_dens(s[:, 0], a_val_tensor, alpha) * (sa @ q0).squeeze() 81 | return V.mean() 82 | 83 | def eval_ipw(r, w): 84 | return (r * w).sum(dim=1).mean() 85 | 86 | def eval_mis(mu_weights, s, a, r, squared=False): 87 | total = 0 88 | for t, w in enumerate(mu_weights): 89 | s_t = s[:, t]**2 if squared else s[:, t] 90 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 91 | total += (sa @ w).squeeze() * r[:, t] 92 | return total.mean() 93 | 94 | def eval_dr(q_weights1, q_weights2, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=False): 95 | # Added alpha parameter to fix the scope issue 96 | def compute_half(qw, s, a, r, w): 97 | total = 0 98 | for t in range(s.shape[1]): 99 | s_t = s[:, t]**2 if squared else s[:, t] 100 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 101 | V_t = torch.zeros(len(s), device=device) 102 | for a_val in [0.0, 1.0]: 103 | sa_val = torch.stack([s_t, s_t * a_val, torch.ones(len(s), device=device)], dim=1) 104 | # Create a tensor of a_val with the same size as s[:, t] 105 | a_val_tensor = torch.full_like(s[:, t], a_val) 106 | # Pass a_val_tensor to eval_dens 107 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 108 | V_w = V_t if t == 0 else V_t * w[:, t - 1] 109 | total += (r[:, t] * w[:, t] - (sa @ qw[t]).squeeze() * w[:, t] + V_w).mean() 110 | return total 111 | return (compute_half(q_weights2, s1, a1, r1, w1) + compute_half(q_weights1, s2, a2, r2, w2)) / 2 112 | 113 | def eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=False, squared_mu=False): 114 | # Added alpha parameter to fix the scope issue 115 | def compute_half(qw, mw, s, a, r): 116 | total = 0 117 | for t in range(s.shape[1]): 118 | sq = s[:, t]**2 if squared_q else s[:, t] 119 | sm = s[:, t]**2 if squared_mu else s[:, t] 120 | sa_q = torch.stack([sq, sq * a[:, t], torch.ones(len(s), device=device)], dim=1) 121 | sa_m = torch.stack([sm, sm * a[:, t], torch.ones(len(s), device=device)], dim=1) 122 | pred_mu = (sa_m @ mw[t]).squeeze() 123 | V_t = torch.zeros(len(s), device=device) 124 | for a_val in [0.0, 1.0]: 125 | sq_val = sq 126 | sa_val = torch.stack([sq_val, sq_val * a_val, torch.ones(len(s), device=device)], dim=1) 127 | # Create a tensor of a_val with the same size as s[:, t] 128 | a_val_tensor = torch.full_like(s[:, t], a_val) 129 | # Pass a_val_tensor to eval_dens 130 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 131 | 132 | # # This matches the original toytoy2.py implementation that only considers action 1 133 | # a_val = 1.0 # Only consider action 1 134 | # sa_val = torch.stack([s_t, s_t * a_val, torch.ones(len(s), device=device)], dim=1) 135 | # a_val_tensor = torch.full_like(s[:, t], a_val) 136 | # V_t = eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 137 | 138 
| if t > 0: 139 | sm_prev = s[:, t - 1]**2 if squared_mu else s[:, t - 1] 140 | sa_m_prev = torch.stack([sm_prev, sm_prev * a[:, t - 1], torch.ones(len(s), device=device)], dim=1) 141 | V_t *= (sa_m_prev @ mw[t - 1]).squeeze() 142 | total += (pred_mu * r[:, t] - pred_mu * (sa_q @ qw[t]).squeeze() + V_t).mean() 143 | return total 144 | return (compute_half(q2, mu2, s1, a1, r1) + compute_half(q1, mu1, s2, a2, r2)) / 2 145 | 146 | def run_single_repetition(N, T, beta, alpha): 147 | # Generate trajectories 148 | s = torch.zeros(N, T, device=device) 149 | a = torch.zeros(N, T, device=device) 150 | r = torch.zeros(N, T, device=device) 151 | w = torch.ones(N, T, device=device) 152 | 153 | s[:, 0] = torch.normal(0.5, 0.2, size=(N,), device=device) 154 | p = beta * sigmoid(s[:, 0]) + beta * torch.rand(N, device=device) 155 | a[:, 0] = torch.bernoulli(p) 156 | r[:, 0] = torch.normal(0.9 * s[:, 0] + 0.3 * a[:, 0], 0.2) 157 | 158 | for t in range(1, T): 159 | s[:, t] = torch.normal(0.02 * (t % 2) + s[:, t - 1] - 0.3 * (a[:, t - 1] - 0.5), 0.2) 160 | p = beta * sigmoid(s[:, t]) + beta * torch.rand(N, device=device) 161 | a[:, t] = torch.bernoulli(p) 162 | w[:, t] = eval_dens(s[:, t], a[:, t], alpha) / behav_dens(s[:, t], a[:, t], beta) * w[:, t - 1] 163 | r[:, t] = torch.normal(0.9 * s[:, t] + 0.3 * a[:, t] - 0.02 * (t % 2), 0.2) 164 | 165 | # Split data for cross-fitting 166 | s1, s2 = s.chunk(2) 167 | a1, a2 = a.chunk(2) 168 | r1, r2 = r.chunk(2) 169 | w1, w2 = w.chunk(2) 170 | 171 | # Regression for q-functions and mu-functions 172 | # Pass alpha to all functions that use it 173 | q1 = regress_q(s1, a1, r1, alpha) 174 | q2 = regress_q(s2, a2, r2, alpha) 175 | q1_sq = regress_q(s1, a1, r1, alpha, squared=True) 176 | q2_sq = regress_q(s2, a2, r2, alpha, squared=True) 177 | 178 | mu1 = regress_mu(s1, a1, w1) 179 | mu2 = regress_mu(s2, a2, w2) 180 | mu1_sq = regress_mu(s1, a1, w1, squared=True) 181 | mu2_sq = regress_mu(s2, a2, w2, squared=True) 182 | 183 | # Calculate all estimators 184 | # Pass alpha to all functions that use it 185 | return { 186 | 'ipw': eval_ipw(r, w).item(), 187 | 'dm': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 188 | 'ipw2': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 189 | 'dr': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 190 | 'dr2': eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha).item(), 191 | 'ipw_mis_q': eval_ipw(r, w).item(), 192 | 'dm_mis_q': ((eval_dm(s1, q1_sq[0], alpha, squared=True) + eval_dm(s2, q2_sq[0], alpha, squared=True)) / 2).item(), 193 | 'ipw2_mis_q': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 194 | 'dr_mis_q': eval_dr(q1_sq, q2_sq, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=True).item(), 195 | 'dr2_mis_q': eval_dr2(q1_sq, q2_sq, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=True).item(), 196 | 'ipw_mis_mu': eval_ipw(r, w).item(), 197 | 'dm_mis_mu': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 198 | 'ipw2_mis_mu': ((eval_mis(mu1_sq, s2, a2, r2, squared=True) + eval_mis(mu2_sq, s1, a1, r1, squared=True)) / 2).item(), 199 | 'dr_mis_mu': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 200 | 'dr2_mis_mu': eval_dr2(q1, q2, mu1_sq, mu2_sq, s1, s2, a1, a2, r1, r2, alpha, squared_mu=True).item(), 201 | } 202 | 203 | def main(): 204 | # Parse command line arguments 205 | parser = argparse.ArgumentParser(description='Run GPU-accelerated RL estimators') 206 | parser.add_argument('--N', type=int, default=1500, 
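# run_single_repetition above returns fifteen estimates per repetition: the base block
# ('ipw', 'dm', 'ipw2', 'dr', 'dr2'), a '_mis_q' block that re-evaluates them with
# q-functions fitted on squared states (estimators that do not use q are unchanged), and
# a '_mis_mu' block that does the same with density-ratio models fitted on squared
# states. Each series is saved by main() as estimator_list_<key>_<N>.npz under the array
# name 'a', so a finished run can be inspected along these lines (the file name is only
# an example for N = 1500):
import numpy as np

dr2_draws = np.load("estimator_list_dr2_1500.npz")["a"]
print(dr2_draws.mean(), dr2_draws.std())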
help='Number of trajectories') 207 | parser.add_argument('--T', type=int, default=30, help='Time horizon') 208 | parser.add_argument('--beta', type=float, default=0.2, help='Behavior policy parameter') 209 | parser.add_argument('--alpha', type=float, default=0.9, help='Evaluation policy parameter') 210 | parser.add_argument('--reps', type=int, default=1500, help='Number of repetitions') 211 | 212 | args = parser.parse_args() 213 | 214 | print(f"Running with parameters: N={args.N}, T={args.T}, beta={args.beta}, alpha={args.alpha}, reps={args.reps}") 215 | 216 | # For very first repetition, get keys from a small sample run 217 | # This prevents out-of-memory errors when N is large 218 | sample_N = min(100, args.N) 219 | sample_res = run_single_repetition(sample_N, args.T, args.beta, args.alpha) 220 | results = {k: [] for k in sample_res.keys()} 221 | 222 | # Set seeds for reproducibility 223 | torch.manual_seed(42) 224 | np.random.seed(42) 225 | if torch.cuda.is_available(): 226 | torch.cuda.manual_seed_all(42) 227 | 228 | # Run all repetitions 229 | start_time = time.time() 230 | for i in range(args.reps): 231 | rep_start_time = time.time() 232 | print(f"\nStarting repetition {i+1}/{args.reps}") 233 | 234 | # Run repetition 235 | try: 236 | res = run_single_repetition(args.N, args.T, args.beta, args.alpha) 237 | # Collect results 238 | for k in results: 239 | results[k].append(res[k]) 240 | 241 | # Clear GPU memory 242 | if torch.cuda.is_available(): 243 | torch.cuda.empty_cache() 244 | 245 | rep_time = time.time() - rep_start_time 246 | print(f"Repetition {i+1} completed in {rep_time:.2f} seconds") 247 | 248 | # Periodically save results 249 | if (i+1) % 10 == 0 or i == args.reps-1: 250 | for k in results: 251 | np.savez(f"estimator_list_{k}_{args.N}", a=np.array(results[k])) 252 | print(f"Saved checkpoint after {i+1} repetitions") 253 | 254 | except Exception as e: 255 | print(f"Error in repetition {i+1}: {e}") 256 | # Try to continue with next repetition 257 | 258 | # Save final results 259 | for k in results: 260 | np.savez(f"estimator_list_{k}_{args.N}", a=np.array(results[k])) 261 | 262 | total_time = time.time() - start_time 263 | print(f"All {args.reps} repetitions completed in {total_time:.2f} seconds") 264 | 265 | if __name__ == '__main__': 266 | import sys 267 | main() -------------------------------------------------------------------------------- /exp5_1_py3/toytoy2_par_table_learner.py: -------------------------------------------------------------------------------- 1 | # Full GPU-Accelerated Simulation with 15 Estimators Using PyTorch 2 | import torch 3 | import numpy as np 4 | import time 5 | import argparse 6 | import torch.nn as nn 7 | from sklearn.ensemble import RandomForestRegressor 8 | from torch.cuda.amp import autocast, GradScaler 9 | from sklearn.ensemble import ExtraTreesRegressor 10 | from cuml.ensemble import RandomForestRegressor as cuRF 11 | import cuml 12 | import cupy as cp 13 | 14 | 15 | 16 | # Use GPU if available 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | print(f"Using device: {device}") 19 | 20 | class SmallMLP(nn.Module): 21 | def __init__(self, input_dim=3, hidden_dim=32): 22 | super().__init__() 23 | self.model = nn.Sequential( 24 | nn.Linear(input_dim, hidden_dim), 25 | nn.ReLU(), 26 | nn.Linear(hidden_dim, 1) 27 | ) 28 | 29 | def forward(self, x): 30 | return self.model(x) 31 | 32 | def fit_mlp(X, y, epochs=10, lr=1e-2): 33 | model = SmallMLP(X.shape[1]).to(device) 34 | optimizer = 
torch.optim.Adam(model.parameters(), lr=lr) 35 | loss_fn = nn.MSELoss() 36 | for _ in range(epochs): 37 | model.train() 38 | pred = model(X).squeeze() 39 | loss = loss_fn(pred, y) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | return model 44 | 45 | def sigmoid(x): 46 | return 1.0 / (1.0 + torch.exp(-0.1 * x)) 47 | 48 | def behav_dens(s, a, beta): 49 | b = beta * sigmoid(s) + beta * 0.5 50 | # Convert a_val to tensor if it's a scalar 51 | if not isinstance(a, torch.Tensor): 52 | a = torch.tensor(a, device=device, dtype=torch.float32) 53 | # Ensure a is the same shape as b for comparison 54 | if a.dim() == 0: 55 | a = a.expand_as(b) 56 | return torch.where(a == 1.0, b, 1.0 - b) 57 | 58 | def eval_dens(s, a, alpha): 59 | b = alpha * sigmoid(s) + (1 - alpha) * 0.5 60 | # Convert a_val to tensor if it's a scalar 61 | if not isinstance(a, torch.Tensor): 62 | a = torch.tensor(a, device=device, dtype=torch.float32) 63 | # Ensure a is the same shape as b for comparison 64 | if a.dim() == 0: 65 | a = a.expand_as(b) 66 | return torch.where(a == 1.0, b, 1.0 - b) 67 | 68 | def linear_regression(X, y): 69 | beta_hat = torch.linalg.lstsq(X, y).solution 70 | return beta_hat 71 | 72 | def regress_q(s, a, r, alpha, squared=False): 73 | # Added alpha parameter to fix the scope issue 74 | N, T = s.shape 75 | q_weights = [] 76 | V_next = torch.zeros(N, device=device) 77 | for t in reversed(range(T)): 78 | s_t = s[:, t]**2 if squared else s[:, t] 79 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 80 | y = r[:, t] if t == T - 1 else r[:, t] + V_next 81 | w = linear_regression(sa, y) 82 | q_weights.insert(0, w) 83 | # V for next step 84 | V_next = 0 85 | for a_val in [0.0, 1.0]: 86 | s_a = torch.stack([s_t, s_t * a_val, torch.ones(N, device=device)], dim=1) 87 | # Create a tensor of a_val with the same size as s[:, t] 88 | a_val_tensor = torch.full_like(s[:, t], a_val) 89 | # Pass a_val_tensor to eval_dens 90 | prob = eval_dens(s[:, t], a_val_tensor, alpha) 91 | V_next += prob * (s_a @ w).squeeze() 92 | return q_weights 93 | 94 | def regress_mu(s, a, w_, squared=False, method='linear'): 95 | N, T = s.shape 96 | mu_models = [] 97 | for t in range(T): 98 | s_t = s[:, t]**2 if squared else s[:, t] 99 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(N, device=device)], dim=1) 100 | y = w_[:, t] 101 | 102 | if method == 'linear': 103 | mu_models.append(linear_regression(sa, y)) 104 | elif method == 'mlp': 105 | mu_models.append(fit_mlp(sa.float(), y.float())) 106 | elif method == 'rf': 107 | sa_cp = cp.asarray(sa) # Convert torch.Tensor to CuPy array 108 | y_cp = cp.asarray(y) 109 | 110 | rf = cuRF( 111 | n_estimators=10, # Low for speed; increase if accuracy is poor 112 | max_depth=4, # Slightly deeper trees than depth=3 to capture interactions 113 | min_samples_split=10, # Avoid overfitting on small batches 114 | min_samples_leaf=5, # Avoid overly deep trees with tiny leaves 115 | max_features=1.0, # Try all features since you only have 3 — avoid randomness 116 | n_streams=8 # Parallel GPU streams (leave as-is for A100) 117 | ) 118 | 119 | rf.fit(sa_cp, y_cp) 120 | mu_models.append(rf) 121 | 122 | else: 123 | raise ValueError("Unknown method for mu regression") 124 | 125 | return mu_models 126 | 127 | def eval_dm(s, q0, alpha, squared=False): 128 | # Added alpha parameter to fix the scope issue 129 | N = s.shape[0] 130 | s0 = s[:, 0]**2 if squared else s[:, 0] 131 | V = torch.zeros(N, device=device) 132 | for a_val in [0.0, 1.0]: 133 | sa = 
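# regress_mu above fits one density-ratio model per time step on the features
# [s_t, s_t * a_t, 1] with target w_t; with method='rf' it does this on the GPU via
# cuml.ensemble.RandomForestRegressor on CuPy arrays. A rough CPU-only sketch of a
# single time step, with scikit-learn standing in for cuML (an illustrative substitute,
# not the code path used above):
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def fit_mu_step_cpu(s_t, a_t, w_t):
    sa = np.column_stack([s_t, s_t * a_t, np.ones_like(s_t)])  # same feature map as above
    rf = RandomForestRegressor(n_estimators=10, max_depth=4,
                               min_samples_split=10, min_samples_leaf=5)
    rf.fit(sa, w_t)
    return rf  # queried later with rf.predict(...), as in eval_mis / eval_dr2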
torch.stack([s0, s0 * a_val, torch.ones(N, device=device)], dim=1) 134 | # Create a tensor of a_val with the same size as s[:, 0] 135 | a_val_tensor = torch.full_like(s[:, 0], a_val) 136 | # Pass a_val_tensor to eval_dens 137 | V += eval_dens(s[:, 0], a_val_tensor, alpha) * (sa @ q0).squeeze() 138 | return V.mean() 139 | 140 | def eval_ipw(r, w): 141 | return (r * w).sum(dim=1).mean() 142 | 143 | def eval_mis(mu_weights, s, a, r, squared=False): 144 | total = 0 145 | for t, w in enumerate(mu_weights): 146 | s_t = s[:, t]**2 if squared else s[:, t] 147 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 148 | 149 | if isinstance(w, torch.Tensor): 150 | pred = (sa @ w).squeeze() 151 | elif isinstance(w, nn.Module): 152 | pred = w(sa).squeeze() 153 | else: 154 | pred = torch.tensor(w.predict(sa.cpu().numpy()), device=device) 155 | total += pred * r[:, t] 156 | # total += (sa @ w).squeeze() * r[:, t] 157 | return total.mean() 158 | 159 | def eval_dr(q_weights1, q_weights2, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=False): 160 | # Added alpha parameter to fix the scope issue 161 | def compute_half(qw, s, a, r, w): 162 | total = 0 163 | for t in range(s.shape[1]): 164 | s_t = s[:, t]**2 if squared else s[:, t] 165 | sa = torch.stack([s_t, s_t * a[:, t], torch.ones(len(s), device=device)], dim=1) 166 | V_t = torch.zeros(len(s), device=device) 167 | for a_val in [0.0, 1.0]: 168 | sa_val = torch.stack([s_t, s_t * a_val, torch.ones(len(s), device=device)], dim=1) 169 | # Create a tensor of a_val with the same size as s[:, t] 170 | a_val_tensor = torch.full_like(s[:, t], a_val) 171 | # Pass a_val_tensor to eval_dens 172 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 173 | V_w = V_t if t == 0 else V_t * w[:, t - 1] 174 | total += (r[:, t] * w[:, t] - (sa @ qw[t]).squeeze() * w[:, t] + V_w).mean() 175 | return total 176 | return (compute_half(q_weights2, s1, a1, r1, w1) + compute_half(q_weights1, s2, a2, r2, w2)) / 2 177 | 178 | def eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=False, squared_mu=False): 179 | def compute_half(qw, mw, s, a, r): 180 | total = 0 181 | for t in range(s.shape[1]): 182 | sq = s[:, t]**2 if squared_q else s[:, t] 183 | sm = s[:, t]**2 if squared_mu else s[:, t] 184 | sa_q = torch.stack([sq, sq * a[:, t], torch.ones(len(s), device=device)], dim=1) 185 | sa_m = torch.stack([sm, sm * a[:, t], torch.ones(len(s), device=device)], dim=1) 186 | 187 | # Handle different model types for pred_mu 188 | if isinstance(mw[t], torch.Tensor): 189 | pred_mu = (sa_m @ mw[t]).squeeze() 190 | elif isinstance(mw[t], nn.Module): 191 | pred_mu = mw[t](sa_m).squeeze() 192 | else: 193 | pred_mu = torch.tensor(mw[t].predict(sa_m.cpu().numpy()), device=device) 194 | 195 | V_t = torch.zeros(len(s), device=device) 196 | for a_val in [0.0, 1.0]: 197 | sq_val = sq 198 | sa_val = torch.stack([sq_val, sq_val * a_val, torch.ones(len(s), device=device)], dim=1) 199 | a_val_tensor = torch.full_like(s[:, t], a_val) 200 | V_t += eval_dens(s[:, t], a_val_tensor, alpha) * (sa_val @ qw[t]).squeeze() 201 | 202 | if t > 0: 203 | sm_prev = s[:, t - 1]**2 if squared_mu else s[:, t - 1] 204 | sa_m_prev = torch.stack([sm_prev, sm_prev * a[:, t - 1], torch.ones(len(s), device=device)], dim=1) 205 | 206 | # Handle different model types for the previous mu weights 207 | if isinstance(mw[t-1], torch.Tensor): 208 | pred_mu_prev = (sa_m_prev @ mw[t-1]).squeeze() 209 | elif isinstance(mw[t-1], nn.Module): 210 | pred_mu_prev = 
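# eval_mis above and this function both accept mu-models of three kinds (lstsq weight
# tensors, torch nn.Modules, or fitted forest regressors) and branch on the type before
# predicting. The same dispatch, factored into a small helper for clarity (a sketch
# only; the original code inlines these branches rather than calling a helper like this):
import torch
import torch.nn as nn

def predict_mu(model, sa):
    if isinstance(model, torch.Tensor):        # least-squares weights: features @ weights
        return (sa @ model).squeeze()
    if isinstance(model, nn.Module):           # SmallMLP
        return model(sa).squeeze()
    # anything else is assumed to expose .predict() on NumPy input (e.g. the cuRF model)
    return torch.tensor(model.predict(sa.cpu().numpy()), device=sa.device)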
mw[t-1](sa_m_prev).squeeze() 211 | else: 212 | pred_mu_prev = torch.tensor(mw[t-1].predict(sa_m_prev.cpu().numpy()), device=device) 213 | 214 | V_t *= pred_mu_prev 215 | 216 | total += (pred_mu * r[:, t] - pred_mu * (sa_q @ qw[t]).squeeze() + V_t).mean() 217 | return total 218 | 219 | return (compute_half(q2, mu2, s1, a1, r1) + compute_half(q1, mu1, s2, a2, r2)) / 2 220 | 221 | def run_single_repetition(N, T, beta, alpha, method='linear'): 222 | # Generate trajectories 223 | s = torch.zeros(N, T, device=device) 224 | a = torch.zeros(N, T, device=device) 225 | r = torch.zeros(N, T, device=device) 226 | w = torch.ones(N, T, device=device) 227 | 228 | s[:, 0] = torch.normal(0.5, 0.2, size=(N,), device=device) 229 | p = beta * sigmoid(s[:, 0]) + beta * torch.rand(N, device=device) 230 | a[:, 0] = torch.bernoulli(p) 231 | r[:, 0] = torch.normal(0.9 * s[:, 0] + 0.3 * a[:, 0], 0.2) 232 | 233 | for t in range(1, T): 234 | s[:, t] = torch.normal(0.02 * (t % 2) + s[:, t - 1] - 0.3 * (a[:, t - 1] - 0.5), 0.2) 235 | p = beta * sigmoid(s[:, t]) + beta * torch.rand(N, device=device) 236 | a[:, t] = torch.bernoulli(p) 237 | w[:, t] = eval_dens(s[:, t], a[:, t], alpha) / behav_dens(s[:, t], a[:, t], beta) * w[:, t - 1] 238 | r[:, t] = torch.normal(0.9 * s[:, t] + 0.3 * a[:, t] - 0.02 * (t % 2), 0.2) 239 | 240 | # Split data for cross-fitting 241 | s1, s2 = s.chunk(2) 242 | a1, a2 = a.chunk(2) 243 | r1, r2 = r.chunk(2) 244 | w1, w2 = w.chunk(2) 245 | 246 | # Regression for q-functions and mu-functions 247 | # Pass alpha to all functions that use it 248 | q1 = regress_q(s1, a1, r1, alpha) 249 | q2 = regress_q(s2, a2, r2, alpha) 250 | q1_sq = regress_q(s1, a1, r1, alpha, squared=True) 251 | q2_sq = regress_q(s2, a2, r2, alpha, squared=True) 252 | 253 | mu1 = regress_mu(s1, a1, w1, method=method) 254 | mu2 = regress_mu(s2, a2, w2, method=method) 255 | mu1_sq = regress_mu(s1, a1, w1, squared=True, method=method) 256 | mu2_sq = regress_mu(s2, a2, w2, squared=True, method=method) 257 | 258 | # Calculate all estimators 259 | # Pass alpha to all functions that use it 260 | return { 261 | 'ipw': eval_ipw(r, w).item(), 262 | 'dm': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 263 | 'ipw2': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 264 | 'dr': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 265 | 'dr2': eval_dr2(q1, q2, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha).item(), 266 | 'ipw_mis_q': eval_ipw(r, w).item(), 267 | 'dm_mis_q': ((eval_dm(s1, q1_sq[0], alpha, squared=True) + eval_dm(s2, q2_sq[0], alpha, squared=True)) / 2).item(), 268 | 'ipw2_mis_q': ((eval_mis(mu1, s2, a2, r2) + eval_mis(mu2, s1, a1, r1)) / 2).item(), 269 | 'dr_mis_q': eval_dr(q1_sq, q2_sq, s1, s2, a1, a2, r1, r2, w1, w2, alpha, squared=True).item(), 270 | 'dr2_mis_q': eval_dr2(q1_sq, q2_sq, mu1, mu2, s1, s2, a1, a2, r1, r2, alpha, squared_q=True).item(), 271 | 'ipw_mis_mu': eval_ipw(r, w).item(), 272 | 'dm_mis_mu': ((eval_dm(s1, q1[0], alpha) + eval_dm(s2, q2[0], alpha)) / 2).item(), 273 | 'ipw2_mis_mu': ((eval_mis(mu1_sq, s2, a2, r2, squared=True) + eval_mis(mu2_sq, s1, a1, r1, squared=True)) / 2).item(), 274 | 'dr_mis_mu': eval_dr(q1, q2, s1, s2, a1, a2, r1, r2, w1, w2, alpha).item(), 275 | 'dr2_mis_mu': eval_dr2(q1, q2, mu1_sq, mu2_sq, s1, s2, a1, a2, r1, r2, alpha, squared_mu=True).item(), 276 | } 277 | 278 | def main(): 279 | parser = argparse.ArgumentParser(description='Run GPU-accelerated RL estimators') 280 | parser.add_argument('--N', type=int, default=1500, help='Number of 
trajectories') 281 | parser.add_argument('--T', type=int, default=30, help='Time horizon') 282 | parser.add_argument('--beta', type=float, default=0.2, help='Behavior policy parameter') 283 | parser.add_argument('--alpha', type=float, default=0.9, help='Evaluation policy parameter') 284 | parser.add_argument('--reps', type=int, default=1500, help='Number of repetitions') 285 | parser.add_argument('--mu_method', type=str, default='linear', choices=['linear', 'mlp', 'rf'], help='W-function learner') 286 | args = parser.parse_args() 287 | 288 | print(f"Running with parameters: N={args.N}, T={args.T}, beta={args.beta}, alpha={args.alpha}, reps={args.reps}, mu_method={args.mu_method}") 289 | 290 | sample_N = min(100, args.N) 291 | sample_res = run_single_repetition(sample_N, args.T, args.beta, args.alpha, args.mu_method) 292 | results = {k: [] for k in sample_res.keys()} 293 | 294 | torch.manual_seed(42) 295 | np.random.seed(42) 296 | if torch.cuda.is_available(): 297 | torch.cuda.manual_seed_all(42) 298 | 299 | start_time = time.time() 300 | for i in range(args.reps): 301 | rep_start_time = time.time() 302 | print(f"\nStarting repetition {i+1}/{args.reps}") 303 | try: 304 | res = run_single_repetition(args.N, args.T, args.beta, args.alpha, args.mu_method) 305 | for k in results: 306 | results[k].append(res[k]) 307 | if torch.cuda.is_available(): 308 | torch.cuda.empty_cache() 309 | rep_time = time.time() - rep_start_time 310 | print(f"Repetition {i+1} completed in {rep_time:.2f} seconds") 311 | if (i+1) % 10 == 0 or i == args.reps-1: 312 | for k in results: 313 | np.savez(f"estimator_list_{k}_{args.mu_method}_{args.N}", a=np.array(results[k])) 314 | print(f"Saved checkpoint after {i+1} repetitions") 315 | except Exception as e: 316 | print(f"Error in repetition {i+1}: {e}") 317 | for k in results: 318 | np.savez(f"estimator_list_{k}_{args.mu_method}_{args.N}", a=np.array(results[k])) 319 | total_time = time.time() - start_time 320 | print(f"All {args.reps} repetitions completed in {total_time:.2f} seconds") 321 | 322 | if __name__ == '__main__': 323 | import sys 324 | main() -------------------------------------------------------------------------------- /exp5_2/crif_walking_ope.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import itertools 3 | import numpy as np 4 | import sys 5 | 6 | 7 | if "../" not in sys.path: 8 | sys.path.append("../") 9 | 10 | from collections import defaultdict 11 | from lib.envs.cliff_walking import CliffWalkingEnv 12 | from lib.envs.windy_gridworld import WindyGridworldEnv 13 | 14 | from scipy.optimize import minimize, rosen, rosen_der 15 | from scipy.optimize import Bounds 16 | 17 | bounds = Bounds([-0.1,-0.1],[0.1,0.1]) 18 | 19 | env = CliffWalkingEnv() 20 | 21 | def make_epsilon_greedy_policy(Q, epsilon, nA): 22 | 23 | def policy_fn(observation): 24 | A = np.ones(nA, dtype=float) * epsilon / nA 25 | best_action = np.argmax(Q[observation]) 26 | A[best_action] += (1.0 - epsilon) 27 | return A 28 | return policy_fn 29 | 30 | Q_space = np.load("Q-table-cliff.npz")["xxx"] 31 | Q_space2 = np.load("Q-table-cliff.npz")["xxx"] 32 | 33 | prob1 = [1.0 for i in range((env.nA))] 34 | prob1 = prob1/np.sum(prob1) 35 | 36 | 37 | betabeta = 0.8 38 | def sample_policy(observation,alpha=0.9): 39 | prob2 = alpha*Q_space[observation,:] +(1-alpha)*prob1 40 | return np.random.choice(env.nA,1,p=prob2)[0] 41 | 42 | 43 | def behavior_policy(observation,beta=betabeta): 44 | prob2 = beta*Q_space[observation,:]+ (1-beta)*prob1 45 | 
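# Both policies mix the stored table Q_space (each row is used as a probability
# distribution over the env.nA actions, since np.random.choice requires the mixture to
# sum to 1) with the uniform distribution prob1; only the mixing weight differs:
# alpha = 0.9 for the evaluation policy and beta = betabeta = 0.8 for the behaviour
# policy. A small self-contained sketch with a made-up 4-action row standing in for one
# row of Q_space:
import numpy as np

row = np.array([0.7, 0.1, 0.1, 0.1])        # hypothetical Q_space[s, :]
uniform = np.ones(4) / 4
target_probs = 0.9 * row + 0.1 * uniform    # as in target_dense(s)
behav_probs = 0.8 * row + 0.2 * uniform     # as in behav_dense(s)
ratios = target_probs / behav_probs         # per-step importance ratios
assert np.isclose(target_probs.sum(), 1.0) and np.isclose(behav_probs.sum(), 1.0)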
return np.random.choice(env.nA,1,p=prob2)[0] 46 | 47 | 48 | def target_dense(observation,alpha=0.9): 49 | prob2 = alpha*Q_space[observation,:]+ (1-alpha)*prob1 50 | return prob2 51 | 52 | def behav_dense(observation,beta=betabeta): 53 | prob2 = beta*Q_space[observation,:] + (1-beta)*prob1 54 | return prob2 55 | 56 | def sarsa2(env,policy, policy2,num_episodes, discount_factor=1.0,Q_space2=Q_space2, alpha= 0.6, epsilon=0.03): 57 | 58 | Q = np.copy(Q_space2) 59 | episode_episode = [] 60 | 61 | for i_episode in range(num_episodes): 62 | 63 | if (i_episode + 1) % 200 == 0: 64 | 65 | sys.stdout.flush() 66 | 67 | state = env.reset() 68 | action = policy2(state) 69 | 70 | episode = [] 71 | 72 | for t in itertools.count(): 73 | # Take a step 74 | next_state, reward, done, _ = env.step(action) 75 | episode.append((state, action, reward)) 76 | # Pick the next action 77 | next_action= policy2(next_state) 78 | 79 | # TD Update 80 | td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state)) 81 | td_delta = td_target - Q[state,action] 82 | Q[state,action] += alpha * td_delta 83 | 84 | if done: 85 | break 86 | 87 | action = next_action 88 | state = next_state 89 | episode_episode.append(episode) 90 | 91 | return Q, episode_episode 92 | 93 | bounds = Bounds([-0.2,-0.2],[0.2,0.2]) 94 | def sigmoid(x, derivative=False): 95 | return x*(1-x) if derivative else 1/(1+np.exp(-x)) 96 | 97 | 98 | depth = 1 99 | def mc_prediction(env, policy,policy2, episode_episode, Q_=1.0,num_episodes=100, discount_factor=1.0): 100 | 101 | 102 | returns_sum = defaultdict(float) 103 | returns_count = defaultdict(float) 104 | returns_count2 = defaultdict(float) 105 | 106 | predic_list = [] 107 | predic_list2 = [] 108 | predic_list3 = [] 109 | predic_list22 = [] 110 | predic_list4 = [] 111 | predic_list5 = np.ones(num_episodes) 112 | auxiauxi = [] 113 | epiepi = [] 114 | weight_list = np.zeros([num_episodes,1000]) ### For bounded IPW 115 | weight_list2 = np.zeros([num_episodes,1002]) ### For bounded IPW 116 | weight_list3 = np.zeros([num_episodes,1002]) ### For bounded IPW 117 | marginal_weight = np.zeros([num_episodes,1000]) ### For bounded IPW 118 | marginal_weight_2 = np.zeros([num_episodes,1000]) ### For bounded IPW 119 | auxi_list = np.zeros([num_episodes,1000]) 120 | marginal_auxi_list2 = np.zeros([num_episodes,1000]) 121 | marginal_auxi_list = np.zeros([num_episodes,1000]) 122 | marginal_auxi_list2_2 = np.zeros([num_episodes,1000]) 123 | marginal_auxi_list_2 = np.zeros([num_episodes,1000]) 124 | auxi_list2 = np.zeros([num_episodes,1000]) 125 | reward_list = np.zeros([num_episodes,1000]) 126 | state_list = np.zeros([num_episodes,1000]) 127 | action_list = np.zeros([num_episodes,1000]) 128 | 129 | count_list = np.zeros(1000) 130 | episolode_longe_list = [] 131 | 132 | 133 | for i_episode in range(num_episodes): 134 | 135 | if i_episode % 200 == 0: 136 | 137 | sys.stdout.flush() 138 | episode = episode_episode[i_episode] 139 | 140 | W = 1.0 141 | W_list = [] 142 | episolode_longe_list.append(len(episode)) 143 | 144 | weight_list2[i_episode,0] = 1.0 145 | for t in range(len(episode)): 146 | state, action, reward = episode[t] 147 | reward_list[i_episode,t] = reward 148 | state_list[i_episode,t] = state 149 | action_list[i_episode,t] = action 150 | 151 | W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor 152 | probprob = 0.9*Q_space[state,:] + 0.1*prob1 153 | W_list.append(W) 154 | weight_list[i_episode,t] = W_list[t] 155 | weight_list2[i_episode,t+1] = W_list[t] 156 | 
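# W accumulates the product of per-step ratios, so weight_list[i, t] holds
#   W_t = prod_{k <= t} pi_target(a_k | s_k) / pi_behav(a_k | s_k)
# (times discount_factor at every step), while weight_list3 keeps only the single-step
# ratio that the marginalised estimators recombine later. The same accumulation,
# vectorised for one episode from an assumed array of per-step ratios:
import numpy as np

step_ratios = np.array([1.2, 0.8, 1.5, 0.9])   # hypothetical per-step ratios
cumulative_W = np.cumprod(step_ratios)          # W_0, W_1, W_2, W_3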
weight_list3[i_episode,t] = target_dense(state)[action]/behav_dense(state)[action] 157 | 158 | count_list[t] += 1.0 159 | 160 | if t==0: 161 | auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 162 | else: 163 | auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-W_list[t-1]*np.sum(probprob*Q_[state,:]) 164 | 165 | if t==0: 166 | auxi_list2[i_episode,t] = W_list[t]-1.0 167 | else: 168 | auxi_list2[i_episode,t] = W_list[t]-W_list[t-1] 169 | 170 | print np.max(np.array(episolode_longe_list)) 171 | 172 | 173 | weight_list_mean = np.mean(weight_list,1) 174 | reward_list_mean = np.mean(reward_list,1) 175 | auxi_list_mean = np.mean(auxi_list,1) 176 | auxi_list2_mean = np.mean(auxi_list2,1) 177 | 178 | val = [] 179 | 180 | ##### IPW 181 | for i in range(num_episodes): 182 | predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:])) 183 | 184 | val.append(np.mean(predic_list)) 185 | 186 | #### Marginalized-IPW 187 | 188 | for i in range(num_episodes): 189 | for j in range(episolode_longe_list[i]): 190 | marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])]) 191 | if j==0: 192 | marginal_weight_2[i,j] = weight_list3[i,j] 193 | else: 194 | marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j] 195 | 196 | 197 | for i_episode in range(num_episodes): 198 | for t in range(episolode_longe_list[i_episode]): 199 | state = np.int(state_list[i_episode,t]) 200 | action = np.int(action_list[i_episode,t]) 201 | probprob = 0.9*Q_space[state,:] + 0.1*prob1 202 | if t==0: 203 | marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 204 | marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 205 | auxi_list[i_episode,t] = weight_list[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:]) 206 | else: 207 | marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*(Q_[state,action])-marginal_weight[i_episode,t-1]*np.sum(probprob*(Q_[state,:])) 208 | marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*(Q_[state,action])-marginal_weight_2[i_episode,t-1]*np.sum(probprob*(Q_[state,:])) 209 | auxi_list[i_episode,t] = weight_list[i_episode,t]*(Q_[state,action])-weight_list[i_episode,t-1]*np.sum(probprob*(Q_[state,:])) 210 | 211 | if t==0: 212 | marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]-1.0 213 | marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]-1.0 214 | auxi_list2[i_episode,t] = weight_list[i_episode,t]-1.0 215 | else: 216 | marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]- marginal_weight[i_episode,t-1] 217 | marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]- marginal_weight_2[i_episode,t-1] 218 | auxi_list2[i_episode,t] = weight_list[i_episode,t]-weight_list[i_episode,t-1] 219 | 220 | 221 | for i in range(num_episodes): 222 | predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:])) 223 | 224 | ### marginal ipw2 #### Using action and state 225 | val.append(np.mean(predic_list2)) 226 | 227 | 228 | ### marginal ipw3#### Using only state 229 | for i in range(num_episodes): 230 | predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:])) 231 | 232 | val.append(np.mean(predic_list22)) 233 | 234 | 235 | #### DR 236 | val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list,1))) 237 | 238 | #### marginal DR 1 #### Using action and state 239 | 
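# Every DR value appended here has the same shape: the corresponding IPW term minus the
# mean of the summed control variates, e.g. for the ordinary weights
#   DR = mean_i( sum_t W_{i,t} r_{i,t} ) - mean_i( sum_t [ W_{i,t} Q(s_t, a_t) - W_{i,t-1} sum_a pi(a | s_t) Q(s_t, a) ] ).
# A compact NumPy restatement using the arrays already built above:
import numpy as np

def dr_estimate(weights, rewards, auxi):
    ipw_term = np.mean(np.sum(weights * rewards, axis=1))
    correction = np.mean(np.sum(auxi, axis=1))
    return ipw_term - correction
# dr_estimate(weight_list, reward_list, auxi_list) reproduces the plain DR value just
# appended; swapping in marginal_weight / marginal_auxi_list (or the state-only
# variants) gives the two marginal DR values appended next.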
val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list,1))) 240 | #### marginal DR 2 #### Using only state 241 | val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2,1))) 242 | 243 | 244 | 245 | 246 | return val 247 | 248 | 249 | 250 | 251 | 252 | 253 | is_list = [] 254 | is2_list = [] 255 | is3_list = [] 256 | wis_list = [] 257 | wis2_list = [] 258 | dm_list = [] 259 | dr_list = [] 260 | dr2_list = [] 261 | dr3_list = [] 262 | bdr_list = [] 263 | drs_list = [] 264 | drs2_list = [] 265 | drss_list = [] 266 | mdr_list = [] 267 | mdr_list2 = [] 268 | 269 | sample_size = 1000 270 | sample_size =sample_size/2 271 | for kkk in range(100): 272 | print "epoch",kkk 273 | #### Sample splititng 274 | ### First fold 275 | 276 | predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size) 277 | V_10k_1 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size) 278 | 279 | ### Second fold 280 | predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size) 281 | V_10k_2 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size) 282 | 283 | V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2)) 284 | is_list.append(np.mean(V_10k[0])) 285 | is2_list.append(np.mean(V_10k[1])) 286 | is3_list.append(np.mean(V_10k[2])) 287 | dr_list.append(np.mean(V_10k[3])) 288 | dr2_list.append(np.mean(V_10k[4])) 289 | dr3_list.append(np.mean(V_10k[5])) 290 | probprob = 0.9*Q_space[36,:] + 0.1*prob1 291 | dm_list.append(np.sum(probprob*predicted_Q[36,:])) 292 | np.savez("2estimator_list_ipw_"+str(betabeta)+"_"+str(sample_size),a=is_list) 293 | np.savez("2estimator_list_ipw2_"+str(betabeta)+"_"+str(sample_size), a=is3_list) 294 | np.savez("2estimator_list_dm_"+str(betabeta)+"_"+str(sample_size), a=dm_list) 295 | np.savez("2estimator_list_dr_"+str(betabeta)+"_"+str(sample_size),a=dr_list) 296 | np.savez("2estimator_list_dr2_"+str(betabeta)+"_"+str(sample_size),a=dr3_list) 297 | 298 | 299 | 300 | 301 | true = -42.49 302 | def mse(aaa): 303 | aaa = np.array(aaa) 304 | aaa = aaa[aaa>-100] 305 | return [np.mean((((aaa-true)*(aaa-true)))),np.sqrt(np.var((aaa-true)*(aaa-true)))] 306 | 307 | print np.mean(is_list) 308 | print mse(is_list) 309 | print "wis" 310 | print np.mean(is3_list) 311 | print mse(is3_list) 312 | print "dm" 313 | print np.mean(dm_list) 314 | print mse(dm_list) 315 | print "dr" 316 | print np.mean(dr_list) 317 | print mse(dr_list) 318 | print "dr3" 319 | print np.mean(dr3_list) 320 | print mse(dr3_list) -------------------------------------------------------------------------------- /exp5_2/cw_notebook_ver_splitting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import gym\n", 10 | "import itertools\n", 11 | "import numpy as np\n", 12 | "import sys\n", 13 | "\n", 14 | "\n", 15 | "if \"../\" not in sys.path:\n", 16 | " sys.path.append(\"../\") \n", 17 | "\n", 18 | "from collections import defaultdict\n", 19 | "from lib.envs.cliff_walking import CliffWalkingEnv\n", 20 | "from lib.envs.windy_gridworld import WindyGridworldEnv\n", 21 | "\n", 22 | "from scipy.optimize import minimize, rosen, rosen_der\n", 23 | "from scipy.optimize import Bounds\n", 24 | "\n", 25 | "bounds = Bounds([-0.1,-0.1],[0.1,0.1])" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 
21, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "env = CliffWalkingEnv()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 22, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 44 | " \n", 45 | " def policy_fn(observation):\n", 46 | " A = np.ones(nA, dtype=float) * epsilon / nA\n", 47 | " best_action = np.argmax(Q[observation])\n", 48 | " A[best_action] += (1.0 - epsilon)\n", 49 | " return A\n", 50 | " return policy_fn" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 26, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "Q_space = np.load(\"Q-table-cliff.npz\")[\"xxx\"]\n", 60 | "Q_space2 = np.load(\"Q-table-cliff.npz\")[\"xxx\"]\n", 61 | "\n", 62 | "prob1 = [1.0 for i in range((env.nA))]\n", 63 | "prob1 = prob1/np.sum(prob1)\n", 64 | "\n", 65 | "\n", 66 | "betabeta = 0.8\n", 67 | "def sample_policy(observation,alpha=0.9):\n", 68 | " prob2 = alpha*Q_space[observation,:] +(1-alpha)*prob1\n", 69 | " return np.random.choice(env.nA,1,p=prob2)[0]\n", 70 | " \n", 71 | " \n", 72 | "def behavior_policy(observation,beta=betabeta):\n", 73 | " prob2 = beta*Q_space[observation,:]+ (1-beta)*prob1\n", 74 | " return np.random.choice(env.nA,1,p=prob2)[0]\n", 75 | " \n", 76 | " \n", 77 | "def target_dense(observation,alpha=0.9):\n", 78 | " prob2 = alpha*Q_space[observation,:]+ (1-alpha)*prob1\n", 79 | " return prob2\n", 80 | "\n", 81 | "def behav_dense(observation,beta=betabeta):\n", 82 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 83 | " return prob2" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 38, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def sarsa2(env,policy, policy2,num_episodes, discount_factor=1.0,Q_space2=Q_space2, alpha= 0.6, epsilon=0.03):\n", 93 | " \n", 94 | " Q = np.copy(Q_space2)\n", 95 | " episode_episode = []\n", 96 | " \n", 97 | " for i_episode in range(num_episodes):\n", 98 | "\n", 99 | " if (i_episode + 1) % 200 == 0:\n", 100 | "\n", 101 | " sys.stdout.flush()\n", 102 | " \n", 103 | " state = env.reset()\n", 104 | " action = policy2(state)\n", 105 | " \n", 106 | " episode = []\n", 107 | " \n", 108 | " for t in itertools.count():\n", 109 | " # Take a step\n", 110 | " next_state, reward, done, _ = env.step(action)\n", 111 | " episode.append((state, action, reward))\n", 112 | " # Pick the next action\n", 113 | " next_action= policy2(next_state)\n", 114 | " \n", 115 | " # TD Update\n", 116 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 117 | " td_delta = td_target - Q[state,action]\n", 118 | " Q[state,action] += alpha * td_delta \n", 119 | " \n", 120 | " if done:\n", 121 | " break\n", 122 | " \n", 123 | " action = next_action\n", 124 | " state = next_state \n", 125 | " episode_episode.append(episode)\n", 126 | " \n", 127 | " return Q, episode_episode" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 39, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "bounds = Bounds([-0.2,-0.2],[0.2,0.2])\n", 137 | "def sigmoid(x, derivative=False):\n", 138 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 139 | "\n", 140 | "\n", 141 | "depth = 1\n", 142 | "def mc_prediction(env, policy,policy2, episode_episode, Q_=1.0,num_episodes=100, discount_factor=1.0):\n", 143 | " \n", 144 | "\n", 145 | " returns_sum = defaultdict(float)\n", 146 | " returns_count = defaultdict(float)\n", 147 | " 
returns_count2 = defaultdict(float)\n", 148 | " \n", 149 | " predic_list = []\n", 150 | " predic_list2 = []\n", 151 | " predic_list3 = []\n", 152 | " predic_list22 = []\n", 153 | " predic_list4 = []\n", 154 | " predic_list5 = np.ones(num_episodes)\n", 155 | " auxiauxi = [] \n", 156 | " epiepi = []\n", 157 | " weight_list = np.zeros([num_episodes,1000]) ### For bounded IPW\n", 158 | " weight_list2 = np.zeros([num_episodes,1002]) ### For bounded IPW\n", 159 | " weight_list3 = np.zeros([num_episodes,1002]) ### For bounded IPW\n", 160 | " marginal_weight = np.zeros([num_episodes,1000]) ### For bounded IPW\n", 161 | " marginal_weight_2 = np.zeros([num_episodes,1000]) ### For bounded IPW\n", 162 | " auxi_list = np.zeros([num_episodes,1000])\n", 163 | " marginal_auxi_list2 = np.zeros([num_episodes,1000])\n", 164 | " marginal_auxi_list = np.zeros([num_episodes,1000])\n", 165 | " marginal_auxi_list2_2 = np.zeros([num_episodes,1000])\n", 166 | " marginal_auxi_list_2 = np.zeros([num_episodes,1000])\n", 167 | " auxi_list2 = np.zeros([num_episodes,1000])\n", 168 | " reward_list = np.zeros([num_episodes,1000])\n", 169 | " state_list = np.zeros([num_episodes,1000])\n", 170 | " action_list = np.zeros([num_episodes,1000])\n", 171 | " \n", 172 | " count_list = np.zeros(1000) \n", 173 | " episolode_longe_list = []\n", 174 | " \n", 175 | "\n", 176 | " for i_episode in range(num_episodes):\n", 177 | " \n", 178 | " if i_episode % 200 == 0:\n", 179 | " \n", 180 | " sys.stdout.flush()\n", 181 | " episode = episode_episode[i_episode]\n", 182 | " \n", 183 | " W = 1.0\n", 184 | " W_list = []\n", 185 | " episolode_longe_list.append(len(episode))\n", 186 | " \n", 187 | " weight_list2[i_episode,0] = 1.0\n", 188 | " for t in range(len(episode)):\n", 189 | " state, action, reward = episode[t]\n", 190 | " reward_list[i_episode,t] = reward\n", 191 | " state_list[i_episode,t] = state\n", 192 | " action_list[i_episode,t] = action\n", 193 | " \n", 194 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 195 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 196 | " W_list.append(W)\n", 197 | " weight_list[i_episode,t] = W_list[t]\n", 198 | " weight_list2[i_episode,t+1] = W_list[t]\n", 199 | " weight_list3[i_episode,t] = target_dense(state)[action]/behav_dense(state)[action]\n", 200 | " \n", 201 | " count_list[t] += 1.0\n", 202 | " \n", 203 | " if t==0:\n", 204 | " auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 205 | " else:\n", 206 | " auxi_list[i_episode,t] = W_list[t]*Q_[state,action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 207 | " \n", 208 | " if t==0:\n", 209 | " auxi_list2[i_episode,t] = W_list[t]-1.0\n", 210 | " else:\n", 211 | " auxi_list2[i_episode,t] = W_list[t]-W_list[t-1]\n", 212 | "\n", 213 | " print np.max(np.array(episolode_longe_list))\n", 214 | " \n", 215 | " \n", 216 | " weight_list_mean = np.mean(weight_list,1)\n", 217 | " reward_list_mean = np.mean(reward_list,1)\n", 218 | " auxi_list_mean = np.mean(auxi_list,1)\n", 219 | " auxi_list2_mean = np.mean(auxi_list2,1)\n", 220 | " \n", 221 | " val = [] \n", 222 | " \n", 223 | " ##### IPW\n", 224 | " for i in range(num_episodes):\n", 225 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:])) \n", 226 | " \n", 227 | " val.append(np.mean(predic_list))\n", 228 | " \n", 229 | " #### Marginalized-IPW \n", 230 | " \n", 231 | " for i in range(num_episodes):\n", 232 | " for j in range(episolode_longe_list[i]):\n", 233 | " marginal_weight[i,j] = 
np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 234 | " if j==0:\n", 235 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 236 | " else:\n", 237 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 238 | " \n", 239 | " \n", 240 | " for i_episode in range(num_episodes):\n", 241 | " for t in range(episolode_longe_list[i_episode]):\n", 242 | " state = np.int(state_list[i_episode,t])\n", 243 | " action = np.int(action_list[i_episode,t])\n", 244 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 245 | " if t==0:\n", 246 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 247 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 248 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 249 | " else:\n", 250 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*(Q_[state,action])-marginal_weight[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 251 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*(Q_[state,action])-marginal_weight_2[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 252 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*(Q_[state,action])-weight_list[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 253 | " \n", 254 | " if t==0:\n", 255 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]-1.0\n", 256 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]-1.0\n", 257 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-1.0\n", 258 | " else:\n", 259 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]- marginal_weight[i_episode,t-1]\n", 260 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]- marginal_weight_2[i_episode,t-1]\n", 261 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-weight_list[i_episode,t-1]\n", 262 | "\n", 263 | " \n", 264 | " for i in range(num_episodes):\n", 265 | " predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:])) \n", 266 | " \n", 267 | " ### marginal ipw2 #### Using action and state \n", 268 | " val.append(np.mean(predic_list2))\n", 269 | " \n", 270 | "\n", 271 | " ### marginal ipw3#### Using only state \n", 272 | " for i in range(num_episodes):\n", 273 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:])) \n", 274 | " \n", 275 | " val.append(np.mean(predic_list22))\n", 276 | " \n", 277 | " \n", 278 | " #### DR\n", 279 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list,1)))\n", 280 | " \n", 281 | " #### marginal DR 1 #### Using action and state \n", 282 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list,1)))\n", 283 | " #### marginal DR 2 #### Using only state \n", 284 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2,1)))\n", 285 | " \n", 286 | " \n", 287 | "\n", 288 | "\n", 289 | " return val\n", 290 | " \n", 291 | " \n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 49, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "0\n", 304 | "135\n", 305 | "204\n", 306 | "1\n", 307 | "202\n", 308 | "179\n", 309 | "2\n", 310 | "205\n", 311 | "153\n", 312 | "3\n", 313 | "149\n", 314 | "176\n", 315 | "4\n", 316 | "212\n", 317 | "217\n", 318 | "5\n", 319 | "231\n", 320 | 
"151\n", 321 | "6\n", 322 | "210\n", 323 | "216\n", 324 | "7\n", 325 | "141\n", 326 | "147\n", 327 | "8\n", 328 | "194\n", 329 | "288\n", 330 | "9\n", 331 | "181\n", 332 | "177\n", 333 | "10\n", 334 | "248\n", 335 | "199\n", 336 | "11\n", 337 | "118\n", 338 | "198\n", 339 | "12\n", 340 | "210\n", 341 | "203\n", 342 | "13\n", 343 | "125\n", 344 | "225\n", 345 | "14\n", 346 | "324\n", 347 | "225\n", 348 | "15\n", 349 | "170\n", 350 | "169\n", 351 | "16\n", 352 | "203\n", 353 | "195\n", 354 | "17\n", 355 | "157\n", 356 | "171\n", 357 | "18\n", 358 | "132\n", 359 | "197\n", 360 | "19\n", 361 | "188\n", 362 | "218\n", 363 | "20\n", 364 | "150\n", 365 | "228\n", 366 | "21\n", 367 | "226\n", 368 | "202\n", 369 | "22\n", 370 | "192\n", 371 | "195\n", 372 | "23\n", 373 | "141\n", 374 | "195\n", 375 | "24\n", 376 | "249\n", 377 | "144\n", 378 | "25\n", 379 | "249\n", 380 | "181\n", 381 | "26\n", 382 | "183\n", 383 | "177\n", 384 | "27\n", 385 | "185\n", 386 | "183\n", 387 | "28\n", 388 | "378\n", 389 | "177\n", 390 | "29\n", 391 | "170\n", 392 | "221\n", 393 | "30\n", 394 | "235\n", 395 | "165\n", 396 | "31\n", 397 | "165\n", 398 | "234\n", 399 | "32\n", 400 | "206\n", 401 | "217\n", 402 | "33\n", 403 | "178\n", 404 | "255\n", 405 | "34\n", 406 | "143\n", 407 | "181\n", 408 | "35\n", 409 | "253\n", 410 | "290\n", 411 | "36\n", 412 | "200\n", 413 | "174\n", 414 | "37\n", 415 | "242\n", 416 | "190\n", 417 | "38\n", 418 | "178\n", 419 | "216\n", 420 | "39\n", 421 | "182\n", 422 | "140\n", 423 | "40\n", 424 | "188\n", 425 | "187\n", 426 | "41\n", 427 | "193\n", 428 | "261\n", 429 | "42\n", 430 | "156\n", 431 | "192\n", 432 | "43\n", 433 | "225\n", 434 | "233\n", 435 | "44\n", 436 | "246\n", 437 | "182\n", 438 | "45\n", 439 | "154\n", 440 | "132\n", 441 | "46\n", 442 | "246\n", 443 | "182\n", 444 | "47\n", 445 | "186\n", 446 | "150\n", 447 | "48\n", 448 | "144\n", 449 | "172\n", 450 | "49\n", 451 | "200\n", 452 | "192\n", 453 | "50\n", 454 | "233\n", 455 | "255\n", 456 | "51\n", 457 | "170\n", 458 | "238\n", 459 | "52\n", 460 | "284\n", 461 | "154\n", 462 | "53\n", 463 | "173\n", 464 | "134\n", 465 | "54\n", 466 | "162\n", 467 | "174\n", 468 | "55\n", 469 | "182\n", 470 | "229\n", 471 | "56\n", 472 | "112\n", 473 | "268\n", 474 | "57\n", 475 | "158\n", 476 | "217\n", 477 | "58\n", 478 | "174\n", 479 | "164\n", 480 | "59\n", 481 | "213\n", 482 | "241\n", 483 | "60\n", 484 | "200\n", 485 | "165\n", 486 | "61\n", 487 | "176\n", 488 | "234\n", 489 | "62\n", 490 | "163\n", 491 | "140\n", 492 | "63\n", 493 | "182\n", 494 | "206\n", 495 | "64\n", 496 | "173\n", 497 | "233\n", 498 | "65\n", 499 | "315\n", 500 | "161\n", 501 | "66\n", 502 | "195\n", 503 | "253\n", 504 | "67\n", 505 | "140\n", 506 | "274\n", 507 | "68\n", 508 | "120\n", 509 | "226\n", 510 | "69\n", 511 | "163\n", 512 | "277\n", 513 | "70\n", 514 | "173\n", 515 | "188\n", 516 | "71\n", 517 | "171\n", 518 | "138\n", 519 | "72\n", 520 | "310\n", 521 | "204\n", 522 | "73\n", 523 | "202\n", 524 | "208\n", 525 | "74\n", 526 | "237\n", 527 | "232\n", 528 | "75\n", 529 | "143\n", 530 | "202\n", 531 | "76\n", 532 | "161\n", 533 | "150\n", 534 | "77\n", 535 | "219\n", 536 | "168\n", 537 | "78\n", 538 | "110\n", 539 | "143\n", 540 | "79\n", 541 | "173\n", 542 | "260\n", 543 | "80\n", 544 | "165\n", 545 | "170\n", 546 | "81\n", 547 | "147\n", 548 | "165\n", 549 | "82\n", 550 | "233\n", 551 | "147\n", 552 | "83\n", 553 | "174\n", 554 | "285\n", 555 | "84\n", 556 | "150\n", 557 | "199\n", 558 | "85\n", 559 | "200\n", 560 | "290\n", 561 | "86\n", 562 | "203\n", 
563 | "368\n", 564 | "87\n", 565 | "184\n", 566 | "138\n", 567 | "88\n", 568 | "190\n", 569 | "204\n", 570 | "89\n", 571 | "129\n", 572 | "182\n", 573 | "90\n", 574 | "198\n", 575 | "178\n", 576 | "91\n", 577 | "154\n", 578 | "190\n", 579 | "92\n", 580 | "192\n", 581 | "146\n", 582 | "93\n", 583 | "190\n", 584 | "190\n", 585 | "94\n", 586 | "189\n", 587 | "177\n", 588 | "95\n", 589 | "200\n", 590 | "138\n", 591 | "96\n", 592 | "175\n", 593 | "152\n", 594 | "97\n", 595 | "152\n", 596 | "153\n", 597 | "98\n", 598 | "157\n", 599 | "178\n", 600 | "99\n", 601 | "138\n", 602 | "277\n" 603 | ] 604 | } 605 | ], 606 | "source": [ 607 | "\n", 608 | "\n", 609 | "is_list = []\n", 610 | "is2_list = []\n", 611 | "is3_list = []\n", 612 | "wis_list = []\n", 613 | "wis2_list = []\n", 614 | "dm_list = []\n", 615 | "dr_list = []\n", 616 | "dr2_list = []\n", 617 | "dr3_list = []\n", 618 | "bdr_list = []\n", 619 | "drs_list = []\n", 620 | "drs2_list = []\n", 621 | "drss_list = []\n", 622 | "mdr_list = []\n", 623 | "mdr_list2 = []\n", 624 | "\n", 625 | "sample_size = 1000\n", 626 | "sample_size =sample_size/2\n", 627 | "for kkk in range(100):\n", 628 | " print kkk\n", 629 | " #### Sample splititng \n", 630 | " ### First fold \n", 631 | " \n", 632 | " predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size)\n", 633 | " V_10k_1 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size)\n", 634 | " \n", 635 | " ### Second fold \n", 636 | " predicted_Q ,episode_episode = sarsa2(env,sample_policy,behavior_policy, sample_size)\n", 637 | " V_10k_2 = mc_prediction(env,sample_policy,behavior_policy, episode_episode, predicted_Q,num_episodes=sample_size)\n", 638 | " \n", 639 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 640 | " is_list.append(np.mean(V_10k[0]))\n", 641 | " is2_list.append(np.mean(V_10k[1]))\n", 642 | " is3_list.append(np.mean(V_10k[2]))\n", 643 | " dr_list.append(np.mean(V_10k[3]))\n", 644 | " dr2_list.append(np.mean(V_10k[4])) \n", 645 | " dr3_list.append(np.mean(V_10k[5])) \n", 646 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 647 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 648 | " np.savez(\"2estimator_list_ipw_\"+str(betabeta)+\"_\"+str(sample_size),a=is_list)\n", 649 | " np.savez(\"2estimator_list_ipw2_\"+str(betabeta)+\"_\"+str(sample_size), a=is3_list)\n", 650 | " np.savez(\"2estimator_list_dm_\"+str(betabeta)+\"_\"+str(sample_size), a=dm_list)\n", 651 | " np.savez(\"2estimator_list_dr_\"+str(betabeta)+\"_\"+str(sample_size),a=dr_list)\n", 652 | " np.savez(\"2estimator_list_dr2_\"+str(betabeta)+\"_\"+str(sample_size),a=dr3_list)\n", 653 | "\n", 654 | "\n" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 50, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "name": "stdout", 664 | "output_type": "stream", 665 | "text": [ 666 | "-42.6790406415171\n", 667 | "[37.66853850330432, 166.3536098146463]\n", 668 | "wis\n", 669 | "-41.32338904594486\n", 670 | "[29.887080050982032, 134.7261138929551]\n", 671 | "dm\n", 672 | "-39.81945511378478\n", 673 | "[7.275944708877457, 2.0596332012217786]\n", 674 | "dr\n", 675 | "-42.22774103445223\n", 676 | "[0.6716582397271773, 2.367824452822763]\n", 677 | "dr3\n", 678 | "-41.88853674465647\n", 679 | "[0.568131494443443, 0.4208652463004875]\n" 680 | ] 681 | } 682 | ], 683 | "source": [ 684 | "true = -42.49\n", 685 | "def mse(aaa):\n", 686 | " aaa = np.array(aaa)\n", 687 | " aaa = aaa[aaa>-100]\n", 688 | " return 
[np.mean((((aaa-true)*(aaa-true)))),np.sqrt(np.var((aaa-true)*(aaa-true)))]\n", 689 | "\n", 690 | "print np.mean(is_list)\n", 691 | "print mse(is_list)\n", 692 | "print \"wis\"\n", 693 | "print np.mean(is3_list)\n", 694 | "print mse(is3_list)\n", 695 | "print \"dm\"\n", 696 | "print np.mean(dm_list)\n", 697 | "print mse(dm_list)\n", 698 | "print \"dr\"\n", 699 | "print np.mean(dr_list)\n", 700 | "print mse(dr_list)\n", 701 | "print \"dr3\"\n", 702 | "print np.mean(dr3_list)\n", 703 | "print mse(dr3_list)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [] 712 | } 713 | ], 714 | "metadata": { 715 | "kernelspec": { 716 | "display_name": "Python 2", 717 | "language": "python", 718 | "name": "python2" 719 | }, 720 | "language_info": { 721 | "codemirror_mode": { 722 | "name": "ipython", 723 | "version": 2 724 | }, 725 | "file_extension": ".py", 726 | "mimetype": "text/x-python", 727 | "name": "python", 728 | "nbconvert_exporter": "python", 729 | "pygments_lexer": "ipython2", 730 | "version": "2.7.12" 731 | } 732 | }, 733 | "nbformat": 4, 734 | "nbformat_minor": 1 735 | } 736 | -------------------------------------------------------------------------------- /exp5_2_py3/cw_notebook_ver_splitting_p3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "source": [ 6 | "from google.colab import drive\n", 7 | "drive.mount('/content/drive')" 8 | ], 9 | "metadata": { 10 | "colab": { 11 | "base_uri": "https://localhost:8080/" 12 | }, 13 | "id": "L1RBkFLR0Y_Y", 14 | "outputId": "aa21aeea-540c-4ea5-d56a-3c019d7824c1" 15 | }, 16 | "id": "L1RBkFLR0Y_Y", 17 | "execution_count": 4, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "text": [ 23 | "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" 24 | ] 25 | } 26 | ] 27 | }, 28 | { 29 | "metadata": { 30 | "ExecuteTime": { 31 | "end_time": "2025-05-12T20:03:14.713896Z", 32 | "start_time": "2025-05-12T20:03:05.105703Z" 33 | }, 34 | "colab": { 35 | "base_uri": "https://localhost:8080/" 36 | }, 37 | "id": "50f1f594d24d639c", 38 | "outputId": "2389fc60-833f-4595-9b8b-3a910445f19f" 39 | }, 40 | "cell_type": "code", 41 | "source": [ 42 | "pip install gym matplotlib numpy pandas scipy" 43 | ], 44 | "id": "50f1f594d24d639c", 45 | "outputs": [ 46 | { 47 | "output_type": "stream", 48 | "name": "stdout", 49 | "text": [ 50 | "Requirement already satisfied: gym in /usr/local/lib/python3.11/dist-packages (0.25.2)\n", 51 | "Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0)\n", 52 | "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (2.0.2)\n", 53 | "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2)\n", 54 | "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (1.15.3)\n", 55 | "Requirement already satisfied: cloudpickle>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from gym) (3.1.1)\n", 56 | "Requirement already satisfied: gym-notices>=0.0.4 in /usr/local/lib/python3.11/dist-packages (from gym) (0.0.8)\n", 57 | "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.2)\n", 58 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from 
matplotlib) (0.12.1)\n", 59 | "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.57.0)\n", 60 | "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.8)\n", 61 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (24.2)\n", 62 | "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (11.2.1)\n", 63 | "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.3)\n", 64 | "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (2.9.0.post0)\n", 65 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)\n", 66 | "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)\n", 67 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n" 68 | ] 69 | } 70 | ], 71 | "execution_count": 2 72 | }, 73 | { 74 | "cell_type": "code", 75 | "source": [ 76 | "import itertools\n", 77 | "import numpy as np\n", 78 | "import sys\n", 79 | "import gym\n", 80 | "\n", 81 | "# Since lib.envs isn't available, we'll need to define these environments here\n", 82 | "# or use gym environments directly. For now, I'll create simplified versions.\n", 83 | "\n", 84 | "from collections import defaultdict\n", 85 | "\n", 86 | "# Simple CliffWalkingEnv implementation\n", 87 | "class CliffWalkingEnv(gym.Env):\n", 88 | " def __init__(self):\n", 89 | " self.shape = (4, 12)\n", 90 | " self.start_state_index = np.ravel_multi_index((3, 0), self.shape)\n", 91 | " self.goal_state_index = np.ravel_multi_index((3, 11), self.shape)\n", 92 | " self.cliff = list(range(np.ravel_multi_index((3, 1), self.shape),\n", 93 | " np.ravel_multi_index((3, 11), self.shape)))\n", 94 | " self.nS = self.shape[0] * self.shape[1]\n", 95 | " self.nA = 4 # up, right, down, left\n", 96 | "\n", 97 | " # Calculate transition probabilities and rewards\n", 98 | " self.P = {}\n", 99 | " for s in range(self.nS):\n", 100 | " position = np.unravel_index(s, self.shape)\n", 101 | " self.P[s] = {a: [] for a in range(self.nA)}\n", 102 | "\n", 103 | " # Actions: 0=up, 1=right, 2=down, 3=left\n", 104 | " for a in range(self.nA):\n", 105 | " reward = -1.0 # default reward for each move\n", 106 | " next_position = list(position)\n", 107 | " if a == 0:\n", 108 | " next_position[0] = max(position[0] - 1, 0)\n", 109 | " elif a == 1:\n", 110 | " next_position[1] = min(position[1] + 1, self.shape[1] - 1)\n", 111 | " elif a == 2:\n", 112 | " next_position[0] = min(position[0] + 1, self.shape[0] - 1)\n", 113 | " elif a == 3:\n", 114 | " next_position[1] = max(position[1] - 1, 0)\n", 115 | "\n", 116 | " next_state = np.ravel_multi_index(next_position, self.shape)\n", 117 | "\n", 118 | " # Check if we're at the cliff\n", 119 | " if s in self.cliff:\n", 120 | " next_state = self.start_state_index\n", 121 | " reward = -100.0\n", 122 | "\n", 123 | " # Check if we're at the goal\n", 124 | " done = next_state == self.goal_state_index\n", 125 | "\n", 126 | " self.P[s][a] = [(1.0, next_state, reward, done)]\n", 127 | "\n", 128 | " self.observation_space = gym.spaces.Discrete(self.nS)\n", 129 | " self.action_space = gym.spaces.Discrete(self.nA)\n", 130 | "\n", 131 | " 
self.reset()\n", 132 | "\n", 133 | " def step(self, action):\n", 134 | " state, reward, done, _ = self._step(action)\n", 135 | " self.s = state\n", 136 | " return (state, reward, done, {})\n", 137 | "\n", 138 | " def _step(self, action):\n", 139 | " (probs, next_state, reward, done) = self.P[self.s][action][0]\n", 140 | " return (next_state, reward, done, {})\n", 141 | "\n", 142 | " def reset(self):\n", 143 | " self.s = self.start_state_index\n", 144 | " return self.s\n", 145 | "\n", 146 | "# Simple WindyGridworldEnv implementation (not fully used in this code)\n", 147 | "class WindyGridworldEnv(gym.Env):\n", 148 | " def __init__(self):\n", 149 | " self.shape = (7, 10)\n", 150 | " self.nS = self.shape[0] * self.shape[1]\n", 151 | " self.nA = 4 # up, right, down, left\n", 152 | " self.wind = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]\n", 153 | " self.reset()\n", 154 | "\n", 155 | " def step(self, action):\n", 156 | " # Not implemented as it's not used in the main code\n", 157 | " pass\n", 158 | "\n", 159 | " def reset(self):\n", 160 | " self.s = np.ravel_multi_index((3, 0), self.shape)\n", 161 | " return self.s\n", 162 | "\n", 163 | "from scipy.optimize import minimize, rosen, rosen_der\n", 164 | "from scipy.optimize import Bounds\n", 165 | "\n", 166 | "bounds = Bounds([-0.1, -0.1], [0.1, 0.1])\n", 167 | "\n", 168 | "env = CliffWalkingEnv()\n", 169 | "\n", 170 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 171 | " def policy_fn(observation):\n", 172 | " A = np.ones(nA, dtype=float) * epsilon / nA\n", 173 | " best_action = np.argmax(Q[observation])\n", 174 | " A[best_action] += (1.0 - epsilon)\n", 175 | " return A\n", 176 | " return policy_fn\n", 177 | "\n", 178 | "# Update these paths to your actual file locations\n", 179 | "Q_space = np.load(\"/content/drive/MyDrive/DoubleReinforcementLearningMDP-master/Q-table-cliff.npz\")[\"xxx\"]\n", 180 | "Q_space2 = np.load(\"/content/drive/MyDrive/DoubleReinforcementLearningMDP-master/Q-table-cliff.npz\")[\"xxx\"]\n", 181 | "\n", 182 | "prob1 = [1.0 for i in range((env.nA))]\n", 183 | "prob1 = prob1/np.sum(prob1)\n", 184 | "\n", 185 | "betabeta = 0.8\n", 186 | "def sample_policy(observation, alpha=0.9):\n", 187 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 188 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 189 | "\n", 190 | "\n", 191 | "def behavior_policy(observation, beta=betabeta):\n", 192 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 193 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 194 | "\n", 195 | "\n", 196 | "def target_dense(observation, alpha=0.9):\n", 197 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 198 | " return prob2\n", 199 | "\n", 200 | "def behav_dense(observation, beta=betabeta):\n", 201 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 202 | " return prob2\n", 203 | "\n", 204 | "# FIXED: Matching the original notebook exactly\n", 205 | "def sarsa2(env, policy, policy2, num_episodes, discount_factor=1.0, Q_space2=Q_space2, alpha=0.6, epsilon=0.03):\n", 206 | " \"\"\"\n", 207 | " Expected SARSA implementation matching the original notebook\n", 208 | " \"\"\"\n", 209 | " # Initialize Q as a copy of Q_space2 (not zeros)\n", 210 | " Q = np.copy(Q_space2)\n", 211 | " episode_episode = []\n", 212 | "\n", 213 | " for i_episode in range(num_episodes):\n", 214 | " if (i_episode + 1) % 200 == 0:\n", 215 | " sys.stdout.flush()\n", 216 | "\n", 217 | " state = env.reset()\n", 218 | " action = policy2(state)\n", 219 | " episode = []\n", 220 | "\n", 221 | " for t in 
itertools.count():\n", 222 | " # Take a step\n", 223 | " next_state, reward, done, _ = env.step(action)\n", 224 | " episode.append((state, action, reward))\n", 225 | "\n", 226 | " # Pick the next action\n", 227 | " next_action = policy2(next_state)\n", 228 | "\n", 229 | " # TD Update - Expected SARSA without importance sampling\n", 230 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 231 | " td_delta = td_target - Q[state, action]\n", 232 | " Q[state, action] += alpha * td_delta # No importance sampling correction\n", 233 | "\n", 234 | " if done:\n", 235 | " break\n", 236 | "\n", 237 | " action = next_action\n", 238 | " state = next_state\n", 239 | "\n", 240 | " episode_episode.append(episode)\n", 241 | "\n", 242 | " # Return only Q and episode_episode (matching original)\n", 243 | " return Q, episode_episode\n", 244 | "\n", 245 | "bounds = Bounds([-0.2, -0.2], [0.2, 0.2])\n", 246 | "def sigmoid(x, derivative=False):\n", 247 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 248 | "\n", 249 | "depth = 1\n", 250 | "def mc_prediction(env, policy, policy2, episode_episode, Q_=1.0, num_episodes=100, discount_factor=1.0):\n", 251 | " \"\"\"\n", 252 | " Monte Carlo prediction for policy evaluation\n", 253 | " \"\"\"\n", 254 | " returns_sum = defaultdict(float)\n", 255 | " returns_count = defaultdict(float)\n", 256 | " returns_count2 = defaultdict(float)\n", 257 | "\n", 258 | " predic_list = []\n", 259 | " predic_list2 = []\n", 260 | " predic_list3 = []\n", 261 | " predic_list22 = []\n", 262 | " predic_list4 = []\n", 263 | " predic_list5 = np.ones(num_episodes)\n", 264 | " auxiauxi = []\n", 265 | " epiepi = []\n", 266 | " weight_list = np.zeros([num_episodes, 1000])\n", 267 | " weight_list2 = np.zeros([num_episodes, 1002])\n", 268 | " weight_list3 = np.zeros([num_episodes, 1002])\n", 269 | " marginal_weight = np.zeros([num_episodes, 1000])\n", 270 | " marginal_weight_2 = np.zeros([num_episodes, 1000])\n", 271 | " auxi_list = np.zeros([num_episodes, 1000])\n", 272 | " marginal_auxi_list2 = np.zeros([num_episodes, 1000])\n", 273 | " marginal_auxi_list = np.zeros([num_episodes, 1000])\n", 274 | " marginal_auxi_list2_2 = np.zeros([num_episodes, 1000])\n", 275 | " marginal_auxi_list_2 = np.zeros([num_episodes, 1000])\n", 276 | " auxi_list2 = np.zeros([num_episodes, 1000])\n", 277 | " reward_list = np.zeros([num_episodes, 1000])\n", 278 | " state_list = np.zeros([num_episodes, 1000])\n", 279 | " action_list = np.zeros([num_episodes, 1000])\n", 280 | "\n", 281 | " count_list = np.zeros(1000)\n", 282 | " episolode_longe_list = []\n", 283 | "\n", 284 | " for i_episode in range(num_episodes):\n", 285 | " if i_episode % 200 == 0:\n", 286 | " sys.stdout.flush()\n", 287 | "\n", 288 | " episode = episode_episode[i_episode]\n", 289 | "\n", 290 | " W = 1.0\n", 291 | " W_list = []\n", 292 | " episolode_longe_list.append(len(episode))\n", 293 | "\n", 294 | " weight_list2[i_episode, 0] = 1.0\n", 295 | " for t in range(len(episode)):\n", 296 | " state, action, reward = episode[t]\n", 297 | " reward_list[i_episode, t] = reward\n", 298 | " state_list[i_episode, t] = state\n", 299 | " action_list[i_episode, t] = action\n", 300 | "\n", 301 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 302 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 303 | " W_list.append(W)\n", 304 | " weight_list[i_episode, t] = W_list[t]\n", 305 | " weight_list2[i_episode, t+1] = W_list[t]\n", 306 | " weight_list3[i_episode, t] = 
target_dense(state)[action]/behav_dense(state)[action]\n", 307 | "\n", 308 | " count_list[t] += 1.0\n", 309 | "\n", 310 | " if t==0:\n", 311 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 312 | " else:\n", 313 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 314 | "\n", 315 | " if t==0:\n", 316 | " auxi_list2[i_episode, t] = W_list[t]-1.0\n", 317 | " else:\n", 318 | " auxi_list2[i_episode, t] = W_list[t]-W_list[t-1]\n", 319 | "\n", 320 | " print(np.max(np.array(episolode_longe_list)))\n", 321 | "\n", 322 | " weight_list_mean = np.mean(weight_list, 1)\n", 323 | " reward_list_mean = np.mean(reward_list, 1)\n", 324 | " auxi_list_mean = np.mean(auxi_list, 1)\n", 325 | " auxi_list2_mean = np.mean(auxi_list2, 1)\n", 326 | "\n", 327 | " val = []\n", 328 | "\n", 329 | " ##### IPW - Standard Importance Sampling\n", 330 | " for i in range(num_episodes):\n", 331 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:]))\n", 332 | "\n", 333 | " val.append(np.mean(predic_list))\n", 334 | "\n", 335 | " #### Marginalized-IPW\n", 336 | "\n", 337 | " for i in range(num_episodes):\n", 338 | " for j in range(episolode_longe_list[i]):\n", 339 | " marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 340 | " if j==0:\n", 341 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 342 | " else:\n", 343 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 344 | "\n", 345 | "\n", 346 | " for i_episode in range(num_episodes):\n", 347 | " for t in range(episolode_longe_list[i_episode]):\n", 348 | " state = int(state_list[i_episode,t]) # Using int instead of np.int for Python 3\n", 349 | " action = int(action_list[i_episode,t]) # Using int instead of np.int for Python 3\n", 350 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 351 | " if t==0:\n", 352 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 353 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 354 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*Q_[state,action]-np.sum(probprob*Q_[state,:])\n", 355 | " else:\n", 356 | " marginal_auxi_list[i_episode,t] = marginal_weight[i_episode,t]*(Q_[state,action])-marginal_weight[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 357 | " marginal_auxi_list_2[i_episode,t] = marginal_weight_2[i_episode,t]*(Q_[state,action])-marginal_weight_2[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 358 | " auxi_list[i_episode,t] = weight_list[i_episode,t]*(Q_[state,action])-weight_list[i_episode,t-1]*np.sum(probprob*(Q_[state,:]))\n", 359 | "\n", 360 | " if t==0:\n", 361 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]-1.0\n", 362 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]-1.0\n", 363 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-1.0\n", 364 | " else:\n", 365 | " marginal_auxi_list2[i_episode,t] = marginal_weight[i_episode,t]- marginal_weight[i_episode,t-1]\n", 366 | " marginal_auxi_list2_2[i_episode,t] = marginal_weight_2[i_episode,t]- marginal_weight_2[i_episode,t-1]\n", 367 | " auxi_list2[i_episode,t] = weight_list[i_episode,t]-weight_list[i_episode,t-1]\n", 368 | "\n", 369 | "\n", 370 | " for i in range(num_episodes):\n", 371 | " 
predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:]))\n", 372 | "\n", 373 | " ### marginal ipw2 #### Using action and state\n", 374 | " val.append(np.mean(predic_list2))\n", 375 | "\n", 376 | "\n", 377 | " ### marginal ipw3#### Using only state\n", 378 | " for i in range(num_episodes):\n", 379 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:]))\n", 380 | "\n", 381 | " val.append(np.mean(predic_list22))\n", 382 | "\n", 383 | "\n", 384 | " #### DR\n", 385 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list,1)))\n", 386 | "\n", 387 | " #### marginal DR 1 #### Using action and state\n", 388 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list,1)))\n", 389 | " #### marginal DR 2 #### Using only state\n", 390 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2,1)))\n", 391 | "\n", 392 | " return val\n", 393 | "\n", 394 | "# Main experiment run - with sample splitting like the original\n", 395 | "is_list = []\n", 396 | "is2_list = []\n", 397 | "is3_list = []\n", 398 | "wis_list = []\n", 399 | "wis2_list = []\n", 400 | "dm_list = []\n", 401 | "dr_list = []\n", 402 | "dr2_list = []\n", 403 | "dr3_list = []\n", 404 | "bdr_list = []\n", 405 | "drs_list = []\n", 406 | "drs2_list = []\n", 407 | "drss_list = []\n", 408 | "mdr_list = []\n", 409 | "mdr_list2 = []\n", 410 | "\n", 411 | "sample_size = 1000\n", 412 | "# In Python 3, integer division requires // instead of /\n", 413 | "sample_size = sample_size // 2\n", 414 | "\n", 415 | "for kkk in range(100):\n", 416 | " print(kkk)\n", 417 | " #### Sample splitting\n", 418 | " ### First fold\n", 419 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 420 | " V_10k_1 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 421 | "\n", 422 | " ### Second fold\n", 423 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 424 | " V_10k_2 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 425 | "\n", 426 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 427 | " is_list.append(np.mean(V_10k[0]))\n", 428 | " is2_list.append(np.mean(V_10k[1]))\n", 429 | " is3_list.append(np.mean(V_10k[2]))\n", 430 | " dr_list.append(np.mean(V_10k[3]))\n", 431 | " dr2_list.append(np.mean(V_10k[4]))\n", 432 | " dr3_list.append(np.mean(V_10k[5]))\n", 433 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 434 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 435 | "\n", 436 | " # Save results periodically\n", 437 | " if (kkk + 1) % 10 == 0:\n", 438 | " np.savez(f\"2estimator_list_ipw_{betabeta}_{sample_size}\", a=np.array(is_list))\n", 439 | " np.savez(f\"2estimator_list_ipw2_{betabeta}_{sample_size}\", a=np.array(is3_list))\n", 440 | " np.savez(f\"2estimator_list_dm_{betabeta}_{sample_size}\", a=np.array(dm_list))\n", 441 | " np.savez(f\"2estimator_list_dr_{betabeta}_{sample_size}\", a=np.array(dr_list))\n", 442 | " np.savez(f\"2estimator_list_dr2_{betabeta}_{sample_size}\", a=np.array(dr3_list))\n", 443 | "\n", 444 | "# Analysis of results\n", 445 | "true = -42.49\n", 446 | "def mse(aaa):\n", 447 | " \"\"\"Calculate the Mean Squared Error correctly for comparison\"\"\"\n", 448 | " aaa = np.array(aaa)\n", 449 | " # Filter extreme values\n", 450 | " aaa = aaa[aaa > -100]\n", 451 | " # Original MSE calculation\n", 452 | " return [np.mean((((aaa-true)*(aaa-true)))), 
np.sqrt(np.var((aaa-true)*(aaa-true)))]\n", 453 | "\n", 454 | "print(\"IPW:\")\n", 455 | "print(f\"Mean: {np.mean(is_list)}\")\n", 456 | "print(f\"MSE: {mse(is_list)}\")\n", 457 | "\n", 458 | "print(\"WIS:\")\n", 459 | "print(f\"Mean: {np.mean(is3_list)}\") # Note: Original used is3_list for WIS\n", 460 | "print(f\"MSE: {mse(is3_list)}\")\n", 461 | "\n", 462 | "print(\"DM:\")\n", 463 | "print(f\"Mean: {np.mean(dm_list)}\")\n", 464 | "print(f\"MSE: {mse(dm_list)}\")\n", 465 | "\n", 466 | "print(\"DR:\")\n", 467 | "print(f\"Mean: {np.mean(dr_list)}\")\n", 468 | "print(f\"MSE: {mse(dr_list)}\")\n", 469 | "\n", 470 | "print(\"DR3:\")\n", 471 | "print(f\"Mean: {np.mean(dr3_list)}\")\n", 472 | "print(f\"MSE: {mse(dr3_list)}\")" 473 | ], 474 | "metadata": { 475 | "colab": { 476 | "base_uri": "https://localhost:8080/" 477 | }, 478 | "id": "tnQ9THg00_XF", 479 | "outputId": "01d3aff0-63b4-42dc-9a9e-04763360bf2e" 480 | }, 481 | "id": "tnQ9THg00_XF", 482 | "execution_count": 7, 483 | "outputs": [ 484 | { 485 | "output_type": "stream", 486 | "name": "stdout", 487 | "text": [ 488 | "0\n", 489 | "282\n", 490 | "307\n", 491 | "1\n", 492 | "215\n", 493 | "249\n", 494 | "2\n", 495 | "215\n", 496 | "287\n", 497 | "3\n", 498 | "313\n", 499 | "221\n", 500 | "4\n", 501 | "183\n", 502 | "282\n", 503 | "5\n", 504 | "201\n", 505 | "237\n", 506 | "6\n", 507 | "196\n", 508 | "250\n", 509 | "7\n", 510 | "220\n", 511 | "164\n", 512 | "8\n", 513 | "307\n", 514 | "289\n", 515 | "9\n", 516 | "269\n", 517 | "444\n", 518 | "10\n", 519 | "219\n", 520 | "226\n", 521 | "11\n", 522 | "216\n", 523 | "271\n", 524 | "12\n", 525 | "246\n", 526 | "235\n", 527 | "13\n", 528 | "214\n", 529 | "220\n", 530 | "14\n", 531 | "207\n", 532 | "216\n", 533 | "15\n", 534 | "224\n", 535 | "252\n", 536 | "16\n", 537 | "241\n", 538 | "237\n", 539 | "17\n", 540 | "260\n", 541 | "246\n", 542 | "18\n", 543 | "246\n", 544 | "194\n", 545 | "19\n", 546 | "335\n", 547 | "265\n", 548 | "20\n", 549 | "259\n", 550 | "212\n", 551 | "21\n", 552 | "225\n", 553 | "321\n", 554 | "22\n", 555 | "215\n", 556 | "222\n", 557 | "23\n", 558 | "190\n", 559 | "233\n", 560 | "24\n", 561 | "241\n", 562 | "340\n", 563 | "25\n", 564 | "217\n", 565 | "212\n", 566 | "26\n", 567 | "274\n", 568 | "223\n", 569 | "27\n", 570 | "235\n", 571 | "231\n", 572 | "28\n", 573 | "201\n", 574 | "224\n", 575 | "29\n", 576 | "172\n", 577 | "215\n", 578 | "30\n", 579 | "195\n", 580 | "247\n", 581 | "31\n", 582 | "209\n", 583 | "299\n", 584 | "32\n", 585 | "221\n", 586 | "215\n", 587 | "33\n", 588 | "270\n", 589 | "219\n", 590 | "34\n", 591 | "245\n", 592 | "232\n", 593 | "35\n", 594 | "231\n", 595 | "224\n", 596 | "36\n", 597 | "243\n", 598 | "212\n", 599 | "37\n", 600 | "243\n", 601 | "273\n", 602 | "38\n", 603 | "271\n", 604 | "202\n", 605 | "39\n", 606 | "309\n", 607 | "184\n", 608 | "40\n", 609 | "230\n", 610 | "247\n", 611 | "41\n", 612 | "265\n", 613 | "172\n", 614 | "42\n", 615 | "227\n", 616 | "252\n", 617 | "43\n", 618 | "190\n", 619 | "207\n", 620 | "44\n", 621 | "253\n", 622 | "316\n", 623 | "45\n", 624 | "214\n", 625 | "283\n", 626 | "46\n", 627 | "263\n", 628 | "195\n", 629 | "47\n", 630 | "236\n", 631 | "208\n", 632 | "48\n", 633 | "301\n", 634 | "329\n", 635 | "49\n", 636 | "200\n", 637 | "266\n", 638 | "50\n", 639 | "267\n", 640 | "264\n", 641 | "51\n", 642 | "297\n", 643 | "216\n", 644 | "52\n", 645 | "273\n", 646 | "206\n", 647 | "53\n", 648 | "314\n", 649 | "247\n", 650 | "54\n", 651 | "241\n", 652 | "227\n", 653 | "55\n", 654 | "192\n", 655 | "276\n", 656 | "56\n", 657 | 
"323\n", 658 | "392\n", 659 | "57\n", 660 | "174\n", 661 | "204\n", 662 | "58\n", 663 | "257\n", 664 | "182\n", 665 | "59\n", 666 | "275\n", 667 | "200\n", 668 | "60\n", 669 | "213\n", 670 | "191\n", 671 | "61\n", 672 | "220\n", 673 | "235\n", 674 | "62\n", 675 | "241\n", 676 | "244\n", 677 | "63\n", 678 | "261\n", 679 | "674\n", 680 | "64\n", 681 | "257\n", 682 | "258\n", 683 | "65\n", 684 | "231\n", 685 | "258\n", 686 | "66\n", 687 | "254\n", 688 | "264\n", 689 | "67\n", 690 | "298\n", 691 | "176\n", 692 | "68\n", 693 | "233\n", 694 | "197\n", 695 | "69\n", 696 | "209\n", 697 | "192\n", 698 | "70\n", 699 | "338\n", 700 | "188\n", 701 | "71\n", 702 | "304\n", 703 | "202\n", 704 | "72\n", 705 | "239\n", 706 | "182\n", 707 | "73\n", 708 | "284\n", 709 | "205\n", 710 | "74\n", 711 | "186\n", 712 | "318\n", 713 | "75\n", 714 | "265\n", 715 | "194\n", 716 | "76\n", 717 | "172\n", 718 | "312\n", 719 | "77\n", 720 | "221\n", 721 | "296\n", 722 | "78\n", 723 | "197\n", 724 | "230\n", 725 | "79\n", 726 | "252\n", 727 | "183\n", 728 | "80\n", 729 | "254\n", 730 | "260\n", 731 | "81\n", 732 | "261\n", 733 | "206\n", 734 | "82\n", 735 | "329\n", 736 | "241\n", 737 | "83\n", 738 | "227\n", 739 | "183\n", 740 | "84\n", 741 | "230\n", 742 | "282\n", 743 | "85\n", 744 | "180\n", 745 | "235\n", 746 | "86\n", 747 | "182\n", 748 | "402\n", 749 | "87\n", 750 | "162\n", 751 | "240\n", 752 | "88\n", 753 | "288\n", 754 | "194\n", 755 | "89\n", 756 | "214\n", 757 | "194\n", 758 | "90\n", 759 | "335\n", 760 | "198\n", 761 | "91\n", 762 | "277\n", 763 | "254\n", 764 | "92\n", 765 | "189\n", 766 | "172\n", 767 | "93\n", 768 | "326\n", 769 | "193\n", 770 | "94\n", 771 | "232\n", 772 | "273\n", 773 | "95\n", 774 | "207\n", 775 | "227\n", 776 | "96\n", 777 | "178\n", 778 | "198\n", 779 | "97\n", 780 | "195\n", 781 | "220\n", 782 | "98\n", 783 | "274\n", 784 | "203\n", 785 | "99\n", 786 | "211\n", 787 | "333\n", 788 | "IPW:\n", 789 | "Mean: -54.75616680808431\n", 790 | "MSE: [np.float64(160.30981785049113), np.float64(84.892121100215)]\n", 791 | "WIS:\n", 792 | "Mean: -53.458516161830985\n", 793 | "MSE: [np.float64(127.63291481174467), np.float64(64.21425268143648)]\n", 794 | "DM:\n", 795 | "Mean: -52.8702278720724\n", 796 | "MSE: [np.float64(107.83228360781379), np.float64(5.982560508806721)]\n", 797 | "DR:\n", 798 | "Mean: -55.37211197782537\n", 799 | "MSE: [np.float64(166.0145781196328), np.float64(6.663141640878914)]\n", 800 | "DR3:\n", 801 | "Mean: -55.261490392640056\n", 802 | "MSE: [np.float64(163.16388325845435), np.float64(5.914647204533788)]\n" 803 | ] 804 | } 805 | ] 806 | }, 807 | { 808 | "metadata": { 809 | "ExecuteTime": { 810 | "end_time": "2025-05-10T18:26:05.270719Z", 811 | "start_time": "2025-05-10T18:18:20.420269Z" 812 | }, 813 | "colab": { 814 | "base_uri": "https://localhost:8080/" 815 | }, 816 | "id": "532a8fd56a713ebe", 817 | "outputId": "3aef18e4-ae52-4d5f-ca36-a2d0ac90f3bf" 818 | }, 819 | "cell_type": "code", 820 | "source": [ 821 | "import itertools\n", 822 | "import numpy as np\n", 823 | "import sys\n", 824 | "import gym\n", 825 | "\n", 826 | "# Since lib.envs isn't available, we'll need to define these environments here\n", 827 | "# or use gym environments directly. 
For now, I'll create simplified versions.\n", 828 | "\n", 829 | "from collections import defaultdict\n", 830 | "\n", 831 | "# Simple CliffWalkingEnv implementation\n", 832 | "class CliffWalkingEnv(gym.Env):\n", 833 | " def __init__(self):\n", 834 | " self.shape = (4, 12)\n", 835 | " self.start_state_index = np.ravel_multi_index((3, 0), self.shape)\n", 836 | " self.goal_state_index = np.ravel_multi_index((3, 11), self.shape)\n", 837 | " self.cliff = list(range(np.ravel_multi_index((3, 1), self.shape),\n", 838 | " np.ravel_multi_index((3, 11), self.shape)))\n", 839 | " self.nS = self.shape[0] * self.shape[1]\n", 840 | " self.nA = 4 # up, right, down, left\n", 841 | "\n", 842 | " # Calculate transition probabilities and rewards\n", 843 | " self.P = {}\n", 844 | " for s in range(self.nS):\n", 845 | " position = np.unravel_index(s, self.shape)\n", 846 | " self.P[s] = {a: [] for a in range(self.nA)}\n", 847 | "\n", 848 | " # Actions: 0=up, 1=right, 2=down, 3=left\n", 849 | " for a in range(self.nA):\n", 850 | " reward = -1.0 # default reward for each move\n", 851 | " next_position = list(position)\n", 852 | " if a == 0:\n", 853 | " next_position[0] = max(position[0] - 1, 0)\n", 854 | " elif a == 1:\n", 855 | " next_position[1] = min(position[1] + 1, self.shape[1] - 1)\n", 856 | " elif a == 2:\n", 857 | " next_position[0] = min(position[0] + 1, self.shape[0] - 1)\n", 858 | " elif a == 3:\n", 859 | " next_position[1] = max(position[1] - 1, 0)\n", 860 | "\n", 861 | " next_state = np.ravel_multi_index(next_position, self.shape)\n", 862 | "\n", 863 | " # Check if we're at the cliff\n", 864 | " if s in self.cliff:\n", 865 | " next_state = self.start_state_index\n", 866 | " reward = -100.0\n", 867 | "\n", 868 | " # Check if we're at the goal\n", 869 | " done = next_state == self.goal_state_index\n", 870 | "\n", 871 | " self.P[s][a] = [(1.0, next_state, reward, done)]\n", 872 | "\n", 873 | " self.observation_space = gym.spaces.Discrete(self.nS)\n", 874 | " self.action_space = gym.spaces.Discrete(self.nA)\n", 875 | "\n", 876 | " self.reset()\n", 877 | "\n", 878 | " def step(self, action):\n", 879 | " state, reward, done, _ = self._step(action)\n", 880 | " self.s = state\n", 881 | " return (state, reward, done, {})\n", 882 | "\n", 883 | " def _step(self, action):\n", 884 | " (probs, next_state, reward, done) = self.P[self.s][action][0]\n", 885 | " return (next_state, reward, done, {})\n", 886 | "\n", 887 | " def reset(self):\n", 888 | " self.s = self.start_state_index\n", 889 | " return self.s\n", 890 | "\n", 891 | "# Simple WindyGridworldEnv implementation (not fully used in this code)\n", 892 | "class WindyGridworldEnv(gym.Env):\n", 893 | " def __init__(self):\n", 894 | " self.shape = (7, 10)\n", 895 | " self.nS = self.shape[0] * self.shape[1]\n", 896 | " self.nA = 4 # up, right, down, left\n", 897 | " self.wind = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]\n", 898 | " self.reset()\n", 899 | "\n", 900 | " def step(self, action):\n", 901 | " # Not implemented as it's not used in the main code\n", 902 | " pass\n", 903 | "\n", 904 | " def reset(self):\n", 905 | " self.s = np.ravel_multi_index((3, 0), self.shape)\n", 906 | " return self.s\n", 907 | "\n", 908 | "from scipy.optimize import minimize, rosen, rosen_der\n", 909 | "from scipy.optimize import Bounds\n", 910 | "\n", 911 | "bounds = Bounds([-0.1, -0.1], [0.1, 0.1])\n", 912 | "\n", 913 | "env = CliffWalkingEnv()\n", 914 | "\n", 915 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 916 | " def policy_fn(observation):\n", 917 | " A = np.ones(nA, dtype=float) * 
epsilon / nA\n", 918 | " best_action = np.argmax(Q[observation])\n", 919 | " A[best_action] += (1.0 - epsilon)\n", 920 | " return A\n", 921 | " return policy_fn\n", 922 | "\n", 923 | "Q_space = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\")[\"xxx\"]\n", 924 | "Q_space2 = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-real-cliff.npz\")[\"xxx\"] #Q-table-cliff.npz\n", 925 | "\n", 926 | "prob1 = [1.0 for i in range((env.nA))]\n", 927 | "prob1 = prob1/np.sum(prob1)\n", 928 | "\n", 929 | "betabeta = 0.8\n", 930 | "def sample_policy(observation, alpha=0.9):\n", 931 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 932 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 933 | "\n", 934 | "\n", 935 | "def behavior_policy(observation, beta=betabeta):\n", 936 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 937 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 938 | "\n", 939 | "\n", 940 | "def target_dense(observation, alpha=0.9):\n", 941 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 942 | " return prob2\n", 943 | "\n", 944 | "def behav_dense(observation, beta=betabeta):\n", 945 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 946 | " return prob2\n", 947 | "\n", 948 | "def sarsa2(env, policy, policy2, num_episodes, discount_factor=1.0, Q_space2=Q_space2, alpha=0.6, epsilon=0.03):\n", 949 | "\n", 950 | " Q = np.copy(Q_space2)\n", 951 | " episode_episode = []\n", 952 | "\n", 953 | " for i_episode in range(num_episodes):\n", 954 | "\n", 955 | " if (i_episode + 1) % 200 == 0:\n", 956 | " sys.stdout.flush()\n", 957 | "\n", 958 | " state = env.reset()\n", 959 | " action = policy2(state)\n", 960 | "\n", 961 | " episode = []\n", 962 | "\n", 963 | " for t in itertools.count():\n", 964 | " # Take a step\n", 965 | " next_state, reward, done, _ = env.step(action)\n", 966 | " episode.append((state, action, reward))\n", 967 | " # Pick the next action\n", 968 | " next_action = policy2(next_state)\n", 969 | "\n", 970 | " # TD Update\n", 971 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 972 | " td_delta = td_target - Q[state, action]\n", 973 | " Q[state, action] += alpha * td_delta\n", 974 | "\n", 975 | " if done:\n", 976 | " break\n", 977 | "\n", 978 | " action = next_action\n", 979 | " state = next_state\n", 980 | "\n", 981 | " episode_episode.append(episode)\n", 982 | "\n", 983 | " return Q, episode_episode\n", 984 | "\n", 985 | "bounds = Bounds([-0.2, -0.2], [0.2, 0.2])\n", 986 | "def sigmoid(x, derivative=False):\n", 987 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 988 | "\n", 989 | "\n", 990 | "depth = 1\n", 991 | "def mc_prediction(env, policy, policy2, episode_episode, Q_=1.0, num_episodes=100, discount_factor=1.0):\n", 992 | "\n", 993 | " returns_sum = defaultdict(float)\n", 994 | " returns_count = defaultdict(float)\n", 995 | " returns_count2 = defaultdict(float)\n", 996 | "\n", 997 | " predic_list = []\n", 998 | " predic_list2 = []\n", 999 | " predic_list3 = []\n", 1000 | " predic_list22 = []\n", 1001 | " predic_list4 = []\n", 1002 | " predic_list5 = np.ones(num_episodes)\n", 1003 | " auxiauxi = []\n", 1004 | " epiepi = []\n", 1005 | " weight_list = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1006 | " weight_list2 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1007 | " weight_list3 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1008 | " marginal_weight = np.zeros([num_episodes, 
1000]) # For bounded IPW\n", 1009 | " marginal_weight_2 = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1010 | " auxi_list = np.zeros([num_episodes, 1000])\n", 1011 | " marginal_auxi_list2 = np.zeros([num_episodes, 1000])\n", 1012 | " marginal_auxi_list = np.zeros([num_episodes, 1000])\n", 1013 | " marginal_auxi_list2_2 = np.zeros([num_episodes, 1000])\n", 1014 | " marginal_auxi_list_2 = np.zeros([num_episodes, 1000])\n", 1015 | " auxi_list2 = np.zeros([num_episodes, 1000])\n", 1016 | " reward_list = np.zeros([num_episodes, 1000])\n", 1017 | " state_list = np.zeros([num_episodes, 1000])\n", 1018 | " action_list = np.zeros([num_episodes, 1000])\n", 1019 | "\n", 1020 | " count_list = np.zeros(1000)\n", 1021 | " episolode_longe_list = []\n", 1022 | "\n", 1023 | "\n", 1024 | " for i_episode in range(num_episodes):\n", 1025 | "\n", 1026 | " if i_episode % 200 == 0:\n", 1027 | " sys.stdout.flush()\n", 1028 | "\n", 1029 | " episode = episode_episode[i_episode]\n", 1030 | "\n", 1031 | " W = 1.0\n", 1032 | " W_list = []\n", 1033 | " episolode_longe_list.append(len(episode))\n", 1034 | "\n", 1035 | " weight_list2[i_episode, 0] = 1.0\n", 1036 | " for t in range(len(episode)):\n", 1037 | " state, action, reward = episode[t]\n", 1038 | " reward_list[i_episode, t] = reward\n", 1039 | " state_list[i_episode, t] = state\n", 1040 | " action_list[i_episode, t] = action\n", 1041 | "\n", 1042 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 1043 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1044 | " W_list.append(W)\n", 1045 | " weight_list[i_episode, t] = W_list[t]\n", 1046 | " weight_list2[i_episode, t+1] = W_list[t]\n", 1047 | " weight_list3[i_episode, t] = target_dense(state)[action]/behav_dense(state)[action]\n", 1048 | "\n", 1049 | " count_list[t] += 1.0\n", 1050 | "\n", 1051 | " if t==0:\n", 1052 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1053 | " else:\n", 1054 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 1055 | "\n", 1056 | " if t==0:\n", 1057 | " auxi_list2[i_episode, t] = W_list[t]-1.0\n", 1058 | " else:\n", 1059 | " auxi_list2[i_episode, t] = W_list[t]-W_list[t-1]\n", 1060 | "\n", 1061 | " print(np.max(np.array(episolode_longe_list)))\n", 1062 | "\n", 1063 | "\n", 1064 | " weight_list_mean = np.mean(weight_list, 1)\n", 1065 | " reward_list_mean = np.mean(reward_list, 1)\n", 1066 | " auxi_list_mean = np.mean(auxi_list, 1)\n", 1067 | " auxi_list2_mean = np.mean(auxi_list2, 1)\n", 1068 | "\n", 1069 | " val = []\n", 1070 | "\n", 1071 | " ##### IPW\n", 1072 | " for i in range(num_episodes):\n", 1073 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:]))\n", 1074 | "\n", 1075 | " val.append(np.mean(predic_list))\n", 1076 | "\n", 1077 | " #### Marginalized-IPW\n", 1078 | "\n", 1079 | " for i in range(num_episodes):\n", 1080 | " for j in range(episolode_longe_list[i]):\n", 1081 | " marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 1082 | " if j==0:\n", 1083 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 1084 | " else:\n", 1085 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 1086 | "\n", 1087 | "\n", 1088 | " for i_episode in range(num_episodes):\n", 1089 | " for t in range(episolode_longe_list[i_episode]):\n", 1090 | " state = int(state_list[i_episode, t]) # Changed np.int to int\n", 1091 | " 
action = int(action_list[i_episode, t]) # Changed np.int to int\n", 1092 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1093 | " if t==0:\n", 1094 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1095 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1096 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1097 | " else:\n", 1098 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*(Q_[state, action])-marginal_weight[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1099 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*(Q_[state, action])-marginal_weight_2[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1100 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*(Q_[state, action])-weight_list[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1101 | "\n", 1102 | " if t==0:\n", 1103 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]-1.0\n", 1104 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]-1.0\n", 1105 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-1.0\n", 1106 | " else:\n", 1107 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]- marginal_weight[i_episode, t-1]\n", 1108 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]- marginal_weight_2[i_episode, t-1]\n", 1109 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-weight_list[i_episode, t-1]\n", 1110 | "\n", 1111 | "\n", 1112 | " for i in range(num_episodes):\n", 1113 | " predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:]))\n", 1114 | "\n", 1115 | " ### marginal ipw2 #### Using action and state\n", 1116 | " val.append(np.mean(predic_list2))\n", 1117 | "\n", 1118 | "\n", 1119 | " ### marginal ipw3#### Using only state\n", 1120 | " for i in range(num_episodes):\n", 1121 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:]))\n", 1122 | "\n", 1123 | " val.append(np.mean(predic_list22))\n", 1124 | "\n", 1125 | "\n", 1126 | " #### DR\n", 1127 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list, 1)))\n", 1128 | "\n", 1129 | " #### marginal DR 1 #### Using action and state\n", 1130 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list, 1)))\n", 1131 | " #### marginal DR 2 #### Using only state\n", 1132 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2, 1)))\n", 1133 | "\n", 1134 | " return val\n", 1135 | "\n", 1136 | "# Main experiment run\n", 1137 | "is_list = []\n", 1138 | "is2_list = []\n", 1139 | "is3_list = []\n", 1140 | "wis_list = []\n", 1141 | "wis2_list = []\n", 1142 | "dm_list = []\n", 1143 | "dr_list = []\n", 1144 | "dr2_list = []\n", 1145 | "dr3_list = []\n", 1146 | "bdr_list = []\n", 1147 | "drs_list = []\n", 1148 | "drs2_list = []\n", 1149 | "drss_list = []\n", 1150 | "mdr_list = []\n", 1151 | "mdr_list2 = []\n", 1152 | "\n", 1153 | "sample_size = 1000\n", 1154 | "sample_size = sample_size // 2 # Integer division in Python 3\n", 1155 | "for kkk in range(100):\n", 1156 | " print(kkk)\n", 1157 | " #### Sample splititng\n", 1158 | " ### First fold\n", 1159 | "\n", 1160 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1161 | " V_10k_1 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1162 | "\n", 
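# --- Editor's annotation (added to this dump; not part of the original cell) ---
# mc_prediction returns one scalar estimate per method, in this order:
#   val[0] trajectory-wise IPW
#   val[1] marginalized IPW conditioning on (state, action)
#   val[2] marginalized IPW conditioning on state only
#   val[3] DR built on val[0]
#   val[4] marginalized DR built on val[1]
#   val[5] marginalized DR built on val[2]
# which is why the loop below stores V_10k[0]..V_10k[5] into is_list, is2_list, is3_list,
# dr_list, dr2_list and dr3_list. Per episode, with discount_factor = 1 as used here,
# the quantities it accumulates are
#   rho_t = target_dense(s_t)[a_t] / behav_dense(s_t)[a_t],   W_t = W_{t-1} * rho_t  (W_{-1} = 1),
#   IPW   = sum_t W_t * r_t,
#   DR    = IPW - sum_t ( W_t * Q_[s_t, a_t] - W_{t-1} * sum_a pi_e(a | s_t) * Q_[s_t, a] ).
# The "marginal" variants replace W_t by weights averaged over the episodes that visit the
# same state (and, for val[1]/val[4], the same action) at step t.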
1163 | " ### Second fold\n", 1164 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1165 | " V_10k_2 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1166 | "\n", 1167 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 1168 | " is_list.append(np.mean(V_10k[0]))\n", 1169 | " is2_list.append(np.mean(V_10k[1]))\n", 1170 | " is3_list.append(np.mean(V_10k[2]))\n", 1171 | " dr_list.append(np.mean(V_10k[3]))\n", 1172 | " dr2_list.append(np.mean(V_10k[4]))\n", 1173 | " dr3_list.append(np.mean(V_10k[5]))\n", 1174 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 1175 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 1176 | " np.savez(\"2estimator_list_ipw_\"+str(betabeta)+\"_\"+str(sample_size), a=is_list)\n", 1177 | " np.savez(\"2estimator_list_ipw2_\"+str(betabeta)+\"_\"+str(sample_size), a=is3_list)\n", 1178 | " np.savez(\"2estimator_list_dm_\"+str(betabeta)+\"_\"+str(sample_size), a=dm_list)\n", 1179 | " np.savez(\"2estimator_list_dr_\"+str(betabeta)+\"_\"+str(sample_size), a=dr_list)\n", 1180 | " np.savez(\"2estimator_list_dr2_\"+str(betabeta)+\"_\"+str(sample_size), a=dr3_list)\n", 1181 | "\n", 1182 | "# Analysis of results\n", 1183 | "true = -42.49\n", 1184 | "\n", 1185 | "# FIX: Properly calculate MSE instead of using hardcoded values\n", 1186 | "def mse(aaa):\n", 1187 | " aaa = np.array(aaa)\n", 1188 | " aaa = aaa[aaa>-100] # Filter extreme values\n", 1189 | " mean_val = np.mean(aaa) # Calculate mean\n", 1190 | " bias = mean_val - true # Calculate bias\n", 1191 | " bias_squared = bias * bias # Square the bias\n", 1192 | " variance = np.var(aaa) # Calculate variance\n", 1193 | " mse_value = bias_squared + variance # MSE = bias² + variance\n", 1194 | " return [mse_value, np.sqrt(np.var((aaa-true)*(aaa-true)))] # Return MSE and RMSE\n", 1195 | "\n", 1196 | "print(np.mean(is_list))\n", 1197 | "print(mse(is_list))\n", 1198 | "print(\"wis\")\n", 1199 | "print(np.mean(is3_list))\n", 1200 | "print(mse(is3_list))\n", 1201 | "print(\"dm\")\n", 1202 | "print(np.mean(dm_list))\n", 1203 | "print(mse(dm_list))\n", 1204 | "print(\"dr\")\n", 1205 | "print(np.mean(dr_list))\n", 1206 | "print(mse(dr_list))\n", 1207 | "print(\"dr3\")\n", 1208 | "print(np.mean(dr3_list))\n", 1209 | "print(mse(dr3_list))" 1210 | ], 1211 | "id": "532a8fd56a713ebe", 1212 | "outputs": [ 1213 | { 1214 | "output_type": "stream", 1215 | "name": "stdout", 1216 | "text": [ 1217 | "0\n", 1218 | "237\n", 1219 | "201\n", 1220 | "1\n", 1221 | "293\n", 1222 | "251\n", 1223 | "2\n", 1224 | "269\n", 1225 | "269\n", 1226 | "3\n", 1227 | "307\n", 1228 | "382\n", 1229 | "4\n", 1230 | "262\n", 1231 | "190\n", 1232 | "5\n", 1233 | "232\n", 1234 | "196\n", 1235 | "6\n", 1236 | "232\n", 1237 | "291\n", 1238 | "7\n", 1239 | "316\n", 1240 | "241\n", 1241 | "8\n", 1242 | "224\n", 1243 | "206\n", 1244 | "9\n", 1245 | "274\n", 1246 | "274\n", 1247 | "10\n", 1248 | "261\n", 1249 | "260\n", 1250 | "11\n", 1251 | "250\n", 1252 | "254\n", 1253 | "12\n", 1254 | "196\n", 1255 | "260\n", 1256 | "13\n", 1257 | "324\n", 1258 | "285\n", 1259 | "14\n", 1260 | "259\n", 1261 | "250\n", 1262 | "15\n", 1263 | "235\n", 1264 | "198\n", 1265 | "16\n", 1266 | "267\n", 1267 | "234\n", 1268 | "17\n", 1269 | "250\n", 1270 | "233\n", 1271 | "18\n", 1272 | "205\n", 1273 | "190\n", 1274 | "19\n", 1275 | "196\n", 1276 | "208\n", 1277 | "20\n", 1278 | "340\n", 1279 | "200\n", 1280 | "21\n", 1281 | "233\n", 1282 | "269\n", 1283 | "22\n", 1284 | "228\n", 1285 
| "218\n", 1286 | "23\n", 1287 | "246\n", 1288 | "266\n", 1289 | "24\n", 1290 | "238\n", 1291 | "288\n", 1292 | "25\n", 1293 | "248\n", 1294 | "424\n", 1295 | "26\n", 1296 | "254\n", 1297 | "258\n", 1298 | "27\n", 1299 | "240\n", 1300 | "275\n", 1301 | "28\n", 1302 | "213\n", 1303 | "194\n", 1304 | "29\n", 1305 | "202\n", 1306 | "251\n", 1307 | "30\n", 1308 | "174\n", 1309 | "256\n", 1310 | "31\n", 1311 | "227\n", 1312 | "252\n", 1313 | "32\n", 1314 | "182\n", 1315 | "388\n", 1316 | "33\n", 1317 | "212\n", 1318 | "282\n", 1319 | "34\n", 1320 | "333\n", 1321 | "242\n", 1322 | "35\n", 1323 | "214\n", 1324 | "198\n", 1325 | "36\n", 1326 | "245\n", 1327 | "234\n", 1328 | "37\n", 1329 | "218\n", 1330 | "245\n", 1331 | "38\n", 1332 | "258\n", 1333 | "229\n", 1334 | "39\n", 1335 | "236\n", 1336 | "303\n", 1337 | "40\n", 1338 | "229\n", 1339 | "219\n", 1340 | "41\n", 1341 | "265\n", 1342 | "169\n", 1343 | "42\n", 1344 | "217\n", 1345 | "186\n", 1346 | "43\n", 1347 | "222\n", 1348 | "158\n", 1349 | "44\n", 1350 | "374\n", 1351 | "243\n", 1352 | "45\n", 1353 | "290\n", 1354 | "264\n", 1355 | "46\n", 1356 | "291\n", 1357 | "219\n", 1358 | "47\n", 1359 | "260\n", 1360 | "220\n", 1361 | "48\n", 1362 | "210\n", 1363 | "217\n", 1364 | "49\n", 1365 | "259\n", 1366 | "199\n", 1367 | "50\n", 1368 | "190\n", 1369 | "209\n", 1370 | "51\n", 1371 | "238\n", 1372 | "211\n", 1373 | "52\n", 1374 | "217\n", 1375 | "208\n", 1376 | "53\n", 1377 | "309\n", 1378 | "209\n", 1379 | "54\n", 1380 | "159\n", 1381 | "241\n", 1382 | "55\n", 1383 | "184\n", 1384 | "326\n", 1385 | "56\n", 1386 | "204\n", 1387 | "243\n", 1388 | "57\n", 1389 | "188\n", 1390 | "201\n", 1391 | "58\n", 1392 | "249\n", 1393 | "278\n", 1394 | "59\n", 1395 | "238\n", 1396 | "210\n", 1397 | "60\n", 1398 | "179\n", 1399 | "234\n", 1400 | "61\n", 1401 | "188\n", 1402 | "211\n", 1403 | "62\n", 1404 | "156\n", 1405 | "177\n", 1406 | "63\n", 1407 | "220\n", 1408 | "261\n", 1409 | "64\n", 1410 | "177\n", 1411 | "244\n", 1412 | "65\n", 1413 | "247\n", 1414 | "234\n", 1415 | "66\n", 1416 | "394\n", 1417 | "214\n", 1418 | "67\n", 1419 | "226\n", 1420 | "221\n", 1421 | "68\n", 1422 | "200\n", 1423 | "208\n", 1424 | "69\n", 1425 | "537\n", 1426 | "306\n", 1427 | "70\n", 1428 | "196\n", 1429 | "214\n", 1430 | "71\n", 1431 | "237\n", 1432 | "225\n", 1433 | "72\n", 1434 | "205\n", 1435 | "270\n", 1436 | "73\n", 1437 | "207\n", 1438 | "226\n", 1439 | "74\n", 1440 | "322\n", 1441 | "196\n", 1442 | "75\n", 1443 | "305\n", 1444 | "251\n", 1445 | "76\n", 1446 | "276\n", 1447 | "232\n", 1448 | "77\n", 1449 | "193\n", 1450 | "222\n", 1451 | "78\n", 1452 | "296\n", 1453 | "216\n", 1454 | "79\n", 1455 | "338\n", 1456 | "249\n", 1457 | "80\n", 1458 | "219\n", 1459 | "169\n", 1460 | "81\n", 1461 | "240\n", 1462 | "219\n", 1463 | "82\n", 1464 | "148\n", 1465 | "335\n", 1466 | "83\n", 1467 | "177\n", 1468 | "220\n", 1469 | "84\n", 1470 | "265\n", 1471 | "205\n", 1472 | "85\n", 1473 | "261\n", 1474 | "310\n", 1475 | "86\n", 1476 | "277\n", 1477 | "223\n", 1478 | "87\n", 1479 | "247\n", 1480 | "238\n", 1481 | "88\n", 1482 | "225\n", 1483 | "251\n", 1484 | "89\n", 1485 | "302\n", 1486 | "208\n", 1487 | "90\n", 1488 | "239\n", 1489 | "197\n", 1490 | "91\n", 1491 | "196\n", 1492 | "234\n", 1493 | "92\n", 1494 | "288\n", 1495 | "285\n", 1496 | "93\n", 1497 | "224\n", 1498 | "221\n", 1499 | "94\n", 1500 | "197\n", 1501 | "221\n", 1502 | "95\n", 1503 | "224\n", 1504 | "260\n", 1505 | "96\n", 1506 | "241\n", 1507 | "232\n", 1508 | "97\n", 1509 | "253\n", 1510 | "267\n", 1511 | "98\n", 
1512 | "246\n", 1513 | "246\n", 1514 | "99\n", 1515 | "304\n", 1516 | "206\n", 1517 | "-55.05659841010797\n", 1518 | "[np.float64(176.22000714287435), np.float64(172.69131358640544)]\n", 1519 | "wis\n", 1520 | "-53.685850828271825\n", 1521 | "[np.float64(137.31354574090406), np.float64(115.13089351613758)]\n", 1522 | "dm\n", 1523 | "-54.01073436235229\n", 1524 | "[np.float64(132.81942955872594), np.float64(6.970382434455023)]\n", 1525 | "dr\n", 1526 | "-55.45433687243875\n", 1527 | "[np.float64(168.14039815809465), np.float64(6.815543578139827)]\n", 1528 | "dr3\n", 1529 | "-55.35302467832383\n", 1530 | "[np.float64(165.49802165849383), np.float64(5.193413671178172)]\n" 1531 | ] 1532 | } 1533 | ], 1534 | "execution_count": null 1535 | }, 1536 | { 1537 | "metadata": { 1538 | "ExecuteTime": { 1539 | "end_time": "2025-05-12T20:25:36.301160Z", 1540 | "start_time": "2025-05-12T20:07:04.544166Z" 1541 | }, 1542 | "id": "d1b9a15eca65f5db", 1543 | "outputId": "ee940321-3c29-41d9-fbca-f5415733bb4f", 1544 | "colab": { 1545 | "base_uri": "https://localhost:8080/", 1546 | "height": 332 1547 | } 1548 | }, 1549 | "cell_type": "code", 1550 | "source": [ 1551 | "import itertools\n", 1552 | "import numpy as np\n", 1553 | "import sys\n", 1554 | "import gym\n", 1555 | "\n", 1556 | "# Since lib.envs isn't available, we'll need to define these environments here\n", 1557 | "# or use gym environments directly. For now, I'll create simplified versions.\n", 1558 | "\n", 1559 | "from collections import defaultdict\n", 1560 | "\n", 1561 | "# Simple CliffWalkingEnv implementation\n", 1562 | "class CliffWalkingEnv(gym.Env):\n", 1563 | " def __init__(self):\n", 1564 | " self.shape = (4, 12)\n", 1565 | " self.start_state_index = np.ravel_multi_index((3, 0), self.shape)\n", 1566 | " self.goal_state_index = np.ravel_multi_index((3, 11), self.shape)\n", 1567 | " self.cliff = list(range(np.ravel_multi_index((3, 1), self.shape),\n", 1568 | " np.ravel_multi_index((3, 11), self.shape)))\n", 1569 | " self.nS = self.shape[0] * self.shape[1]\n", 1570 | " self.nA = 4 # up, right, down, left\n", 1571 | "\n", 1572 | " # Calculate transition probabilities and rewards\n", 1573 | " self.P = {}\n", 1574 | " for s in range(self.nS):\n", 1575 | " position = np.unravel_index(s, self.shape)\n", 1576 | " self.P[s] = {a: [] for a in range(self.nA)}\n", 1577 | "\n", 1578 | " # Actions: 0=up, 1=right, 2=down, 3=left\n", 1579 | " for a in range(self.nA):\n", 1580 | " reward = -1.0 # default reward for each move\n", 1581 | " next_position = list(position)\n", 1582 | " if a == 0:\n", 1583 | " next_position[0] = max(position[0] - 1, 0)\n", 1584 | " elif a == 1:\n", 1585 | " next_position[1] = min(position[1] + 1, self.shape[1] - 1)\n", 1586 | " elif a == 2:\n", 1587 | " next_position[0] = min(position[0] + 1, self.shape[0] - 1)\n", 1588 | " elif a == 3:\n", 1589 | " next_position[1] = max(position[1] - 1, 0)\n", 1590 | "\n", 1591 | " next_state = np.ravel_multi_index(next_position, self.shape)\n", 1592 | "\n", 1593 | " # Check if we're at the cliff\n", 1594 | " if s in self.cliff:\n", 1595 | " next_state = self.start_state_index\n", 1596 | " reward = -100.0\n", 1597 | "\n", 1598 | " # Check if we're at the goal\n", 1599 | " done = next_state == self.goal_state_index\n", 1600 | "\n", 1601 | " self.P[s][a] = [(1.0, next_state, reward, done)]\n", 1602 | "\n", 1603 | " self.observation_space = gym.spaces.Discrete(self.nS)\n", 1604 | " self.action_space = gym.spaces.Discrete(self.nA)\n", 1605 | "\n", 1606 | " self.reset()\n", 1607 | "\n", 1608 | " def 
step(self, action):\n", 1609 | " state, reward, done, _ = self._step(action)\n", 1610 | " self.s = state\n", 1611 | " return (state, reward, done, {})\n", 1612 | "\n", 1613 | " def _step(self, action):\n", 1614 | " (probs, next_state, reward, done) = self.P[self.s][action][0]\n", 1615 | " return (next_state, reward, done, {})\n", 1616 | "\n", 1617 | " def reset(self):\n", 1618 | " self.s = self.start_state_index\n", 1619 | " return self.s\n", 1620 | "\n", 1621 | "# Simple WindyGridworldEnv implementation (not fully used in this code)\n", 1622 | "class WindyGridworldEnv(gym.Env):\n", 1623 | " def __init__(self):\n", 1624 | " self.shape = (7, 10)\n", 1625 | " self.nS = self.shape[0] * self.shape[1]\n", 1626 | " self.nA = 4 # up, right, down, left\n", 1627 | " self.wind = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]\n", 1628 | " self.reset()\n", 1629 | "\n", 1630 | " def step(self, action):\n", 1631 | " # Not implemented as it's not used in the main code\n", 1632 | " pass\n", 1633 | "\n", 1634 | " def reset(self):\n", 1635 | " self.s = np.ravel_multi_index((3, 0), self.shape)\n", 1636 | " return self.s\n", 1637 | "\n", 1638 | "from scipy.optimize import minimize, rosen, rosen_der\n", 1639 | "from scipy.optimize import Bounds\n", 1640 | "\n", 1641 | "bounds = Bounds([-0.1, -0.1], [0.1, 0.1])\n", 1642 | "\n", 1643 | "env = CliffWalkingEnv()\n", 1644 | "\n", 1645 | "def make_epsilon_greedy_policy(Q, epsilon, nA):\n", 1646 | " def policy_fn(observation):\n", 1647 | " A = np.ones(nA, dtype=float) * epsilon / nA\n", 1648 | " best_action = np.argmax(Q[observation])\n", 1649 | " A[best_action] += (1.0 - epsilon)\n", 1650 | " return A\n", 1651 | " return policy_fn\n", 1652 | "\n", 1653 | "Q_space = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\")[\"xxx\"]\n", 1654 | "Q_space2 = np.load(\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\")[\"xxx\"] #Q-table-cliff.npz\n", 1655 | "\n", 1656 | "prob1 = [1.0 for i in range((env.nA))]\n", 1657 | "prob1 = prob1/np.sum(prob1)\n", 1658 | "\n", 1659 | "betabeta = 0.8\n", 1660 | "def sample_policy(observation, alpha=0.9):\n", 1661 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 1662 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 1663 | "\n", 1664 | "\n", 1665 | "def behavior_policy(observation, beta=betabeta):\n", 1666 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 1667 | " return np.random.choice(env.nA, 1, p=prob2)[0]\n", 1668 | "\n", 1669 | "\n", 1670 | "def target_dense(observation, alpha=0.9):\n", 1671 | " prob2 = alpha*Q_space[observation,:] + (1-alpha)*prob1\n", 1672 | " return prob2\n", 1673 | "\n", 1674 | "def behav_dense(observation, beta=betabeta):\n", 1675 | " prob2 = beta*Q_space[observation,:] + (1-beta)*prob1\n", 1676 | " return prob2\n", 1677 | "\n", 1678 | "def sarsa2(env, policy, policy2, num_episodes, discount_factor=1.0, Q_space2=Q_space2, alpha=0.6, epsilon=0.03):\n", 1679 | "\n", 1680 | " Q = np.copy(Q_space2)\n", 1681 | " episode_episode = []\n", 1682 | "\n", 1683 | " for i_episode in range(num_episodes):\n", 1684 | "\n", 1685 | " if (i_episode + 1) % 200 == 0:\n", 1686 | " sys.stdout.flush()\n", 1687 | "\n", 1688 | " state = env.reset()\n", 1689 | " action = policy2(state)\n", 1690 | "\n", 1691 | " episode = []\n", 1692 | "\n", 1693 | " for t in itertools.count():\n", 1694 | " # Take a step\n", 1695 | " next_state, reward, done, _ = env.step(action)\n", 1696 | " episode.append((state, action, reward))\n", 1697 | " # Pick the next action\n", 1698 | 
" next_action = policy2(next_state)\n", 1699 | "\n", 1700 | " # TD Update\n", 1701 | " td_target = reward + discount_factor * np.sum(Q[next_state,:]*target_dense(next_state))\n", 1702 | " td_delta = td_target - Q[state, action]\n", 1703 | " Q[state, action] += alpha * td_delta\n", 1704 | "\n", 1705 | " if done:\n", 1706 | " break\n", 1707 | "\n", 1708 | " action = next_action\n", 1709 | " state = next_state\n", 1710 | "\n", 1711 | " episode_episode.append(episode)\n", 1712 | "\n", 1713 | " return Q, episode_episode\n", 1714 | "\n", 1715 | "bounds = Bounds([-0.2, -0.2], [0.2, 0.2])\n", 1716 | "def sigmoid(x, derivative=False):\n", 1717 | " return x*(1-x) if derivative else 1/(1+np.exp(-x))\n", 1718 | "\n", 1719 | "\n", 1720 | "depth = 1\n", 1721 | "def mc_prediction(env, policy, policy2, episode_episode, Q_=1.0, num_episodes=100, discount_factor=1.0):\n", 1722 | "\n", 1723 | " returns_sum = defaultdict(float)\n", 1724 | " returns_count = defaultdict(float)\n", 1725 | " returns_count2 = defaultdict(float)\n", 1726 | "\n", 1727 | " predic_list = []\n", 1728 | " predic_list2 = []\n", 1729 | " predic_list3 = []\n", 1730 | " predic_list22 = []\n", 1731 | " predic_list4 = []\n", 1732 | " predic_list5 = np.ones(num_episodes)\n", 1733 | " auxiauxi = []\n", 1734 | " epiepi = []\n", 1735 | " weight_list = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1736 | " weight_list2 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1737 | " weight_list3 = np.zeros([num_episodes, 1002]) # For bounded IPW\n", 1738 | " marginal_weight = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1739 | " marginal_weight_2 = np.zeros([num_episodes, 1000]) # For bounded IPW\n", 1740 | " auxi_list = np.zeros([num_episodes, 1000])\n", 1741 | " marginal_auxi_list2 = np.zeros([num_episodes, 1000])\n", 1742 | " marginal_auxi_list = np.zeros([num_episodes, 1000])\n", 1743 | " marginal_auxi_list2_2 = np.zeros([num_episodes, 1000])\n", 1744 | " marginal_auxi_list_2 = np.zeros([num_episodes, 1000])\n", 1745 | " auxi_list2 = np.zeros([num_episodes, 1000])\n", 1746 | " reward_list = np.zeros([num_episodes, 1000])\n", 1747 | " state_list = np.zeros([num_episodes, 1000])\n", 1748 | " action_list = np.zeros([num_episodes, 1000])\n", 1749 | "\n", 1750 | " count_list = np.zeros(1000)\n", 1751 | " episolode_longe_list = []\n", 1752 | "\n", 1753 | "\n", 1754 | " for i_episode in range(num_episodes):\n", 1755 | "\n", 1756 | " if i_episode % 200 == 0:\n", 1757 | " sys.stdout.flush()\n", 1758 | "\n", 1759 | " episode = episode_episode[i_episode]\n", 1760 | "\n", 1761 | " W = 1.0\n", 1762 | " W_list = []\n", 1763 | " episolode_longe_list.append(len(episode))\n", 1764 | "\n", 1765 | " weight_list2[i_episode, 0] = 1.0\n", 1766 | " for t in range(len(episode)):\n", 1767 | " state, action, reward = episode[t]\n", 1768 | " reward_list[i_episode, t] = reward\n", 1769 | " state_list[i_episode, t] = state\n", 1770 | " action_list[i_episode, t] = action\n", 1771 | "\n", 1772 | " W = W*target_dense(state)[action]/behav_dense(state)[action]*discount_factor\n", 1773 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1774 | " W_list.append(W)\n", 1775 | " weight_list[i_episode, t] = W_list[t]\n", 1776 | " weight_list2[i_episode, t+1] = W_list[t]\n", 1777 | " weight_list3[i_episode, t] = target_dense(state)[action]/behav_dense(state)[action]\n", 1778 | "\n", 1779 | " count_list[t] += 1.0\n", 1780 | "\n", 1781 | " if t==0:\n", 1782 | " auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1783 | " else:\n", 1784 | 
" auxi_list[i_episode, t] = W_list[t]*Q_[state, action]-W_list[t-1]*np.sum(probprob*Q_[state,:])\n", 1785 | "\n", 1786 | " if t==0:\n", 1787 | " auxi_list2[i_episode, t] = W_list[t]-1.0\n", 1788 | " else:\n", 1789 | " auxi_list2[i_episode, t] = W_list[t]-W_list[t-1]\n", 1790 | "\n", 1791 | " print(np.max(np.array(episolode_longe_list)))\n", 1792 | "\n", 1793 | "\n", 1794 | " weight_list_mean = np.mean(weight_list, 1)\n", 1795 | " reward_list_mean = np.mean(reward_list, 1)\n", 1796 | " auxi_list_mean = np.mean(auxi_list, 1)\n", 1797 | " auxi_list2_mean = np.mean(auxi_list2, 1)\n", 1798 | "\n", 1799 | " val = []\n", 1800 | "\n", 1801 | " ##### IPW\n", 1802 | " for i in range(num_episodes):\n", 1803 | " predic_list.append(np.sum(weight_list[i,:]*reward_list[i,:]))\n", 1804 | "\n", 1805 | " val.append(np.mean(predic_list))\n", 1806 | "\n", 1807 | " #### Marginalized-IPW\n", 1808 | "\n", 1809 | " for i in range(num_episodes):\n", 1810 | " for j in range(episolode_longe_list[i]):\n", 1811 | " marginal_weight[i,j] = np.mean(weight_list[:,j][(state_list[:,j]==state_list[i,j]) & (action_list[:,j]==action_list[i,j])])\n", 1812 | " if j==0:\n", 1813 | " marginal_weight_2[i,j] = weight_list3[i,j]\n", 1814 | " else:\n", 1815 | " marginal_weight_2[i,j] = np.mean(weight_list[:,j-1][(state_list[:,j]==state_list[i,j])])*weight_list3[i,j]\n", 1816 | "\n", 1817 | "\n", 1818 | " for i_episode in range(num_episodes):\n", 1819 | " for t in range(episolode_longe_list[i_episode]):\n", 1820 | " state = int(state_list[i_episode, t]) # Changed np.int to int\n", 1821 | " action = int(action_list[i_episode, t]) # Changed np.int to int\n", 1822 | " probprob = 0.9*Q_space[state,:] + 0.1*prob1\n", 1823 | " if t==0:\n", 1824 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1825 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1826 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*Q_[state, action]-np.sum(probprob*Q_[state,:])\n", 1827 | " else:\n", 1828 | " marginal_auxi_list[i_episode, t] = marginal_weight[i_episode, t]*(Q_[state, action])-marginal_weight[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1829 | " marginal_auxi_list_2[i_episode, t] = marginal_weight_2[i_episode, t]*(Q_[state, action])-marginal_weight_2[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1830 | " auxi_list[i_episode, t] = weight_list[i_episode, t]*(Q_[state, action])-weight_list[i_episode, t-1]*np.sum(probprob*(Q_[state,:]))\n", 1831 | "\n", 1832 | " if t==0:\n", 1833 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]-1.0\n", 1834 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]-1.0\n", 1835 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-1.0\n", 1836 | " else:\n", 1837 | " marginal_auxi_list2[i_episode, t] = marginal_weight[i_episode, t]- marginal_weight[i_episode, t-1]\n", 1838 | " marginal_auxi_list2_2[i_episode, t] = marginal_weight_2[i_episode, t]- marginal_weight_2[i_episode, t-1]\n", 1839 | " auxi_list2[i_episode, t] = weight_list[i_episode, t]-weight_list[i_episode, t-1]\n", 1840 | "\n", 1841 | "\n", 1842 | " for i in range(num_episodes):\n", 1843 | " predic_list2.append(np.sum(marginal_weight[i,:]*reward_list[i,:]))\n", 1844 | "\n", 1845 | " ### marginal ipw2 #### Using action and state\n", 1846 | " val.append(np.mean(predic_list2))\n", 1847 | "\n", 1848 | "\n", 1849 | " ### marginal ipw3#### Using only state\n", 1850 | " for i 
in range(num_episodes):\n", 1851 | " predic_list22.append(np.sum(marginal_weight_2[i,:]*reward_list[i,:]))\n", 1852 | "\n", 1853 | " val.append(np.mean(predic_list22))\n", 1854 | "\n", 1855 | "\n", 1856 | " #### DR\n", 1857 | " val.append(np.mean(predic_list)-np.mean(np.sum(auxi_list, 1)))\n", 1858 | "\n", 1859 | " #### marginal DR 1 #### Using action and state\n", 1860 | " val.append(np.mean(predic_list2)-np.mean(np.sum(marginal_auxi_list, 1)))\n", 1861 | " #### marginal DR 2 #### Using only state\n", 1862 | " val.append(np.mean(predic_list22)-np.mean(np.sum(marginal_auxi_list_2, 1)))\n", 1863 | "\n", 1864 | " return val\n", 1865 | "\n", 1866 | "# Main experiment run\n", 1867 | "is_list = []\n", 1868 | "is2_list = []\n", 1869 | "is3_list = []\n", 1870 | "wis_list = []\n", 1871 | "wis2_list = []\n", 1872 | "dm_list = []\n", 1873 | "dr_list = []\n", 1874 | "dr2_list = []\n", 1875 | "dr3_list = []\n", 1876 | "bdr_list = []\n", 1877 | "drs_list = []\n", 1878 | "drs2_list = []\n", 1879 | "drss_list = []\n", 1880 | "mdr_list = []\n", 1881 | "mdr_list2 = []\n", 1882 | "\n", 1883 | "sample_size = 1000\n", 1884 | "sample_size = sample_size // 2 # Integer division in Python 3\n", 1885 | "for kkk in range(100):\n", 1886 | " print(kkk)\n", 1887 | " #### Sample splititng\n", 1888 | " ### First fold\n", 1889 | "\n", 1890 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1891 | " V_10k_1 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1892 | "\n", 1893 | " ### Second fold\n", 1894 | " predicted_Q, episode_episode = sarsa2(env, sample_policy, behavior_policy, sample_size)\n", 1895 | " V_10k_2 = mc_prediction(env, sample_policy, behavior_policy, episode_episode, predicted_Q, num_episodes=sample_size)\n", 1896 | "\n", 1897 | " V_10k = 0.5*(np.array(V_10k_1)+np.array(V_10k_2))\n", 1898 | " is_list.append(np.mean(V_10k[0]))\n", 1899 | " is2_list.append(np.mean(V_10k[1]))\n", 1900 | " is3_list.append(np.mean(V_10k[2]))\n", 1901 | " dr_list.append(np.mean(V_10k[3]))\n", 1902 | " dr2_list.append(np.mean(V_10k[4]))\n", 1903 | " dr3_list.append(np.mean(V_10k[5]))\n", 1904 | " probprob = 0.9*Q_space[36,:] + 0.1*prob1\n", 1905 | " dm_list.append(np.sum(probprob*predicted_Q[36,:]))\n", 1906 | " np.savez(\"2estimator_list_ipw_\"+str(betabeta)+\"_\"+str(sample_size), a=is_list)\n", 1907 | " np.savez(\"2estimator_list_ipw2_\"+str(betabeta)+\"_\"+str(sample_size), a=is3_list)\n", 1908 | " np.savez(\"2estimator_list_dm_\"+str(betabeta)+\"_\"+str(sample_size), a=dm_list)\n", 1909 | " np.savez(\"2estimator_list_dr_\"+str(betabeta)+\"_\"+str(sample_size), a=dr_list)\n", 1910 | " np.savez(\"2estimator_list_dr2_\"+str(betabeta)+\"_\"+str(sample_size), a=dr3_list)\n", 1911 | "\n", 1912 | "# Analysis of results\n", 1913 | "true = -42.49\n", 1914 | "\n", 1915 | "# FIX: Properly calculate MSE instead of using hardcoded values\n", 1916 | "def mse(aaa):\n", 1917 | " aaa = np.array(aaa)\n", 1918 | " aaa = aaa[aaa>-100] # Filter extreme values\n", 1919 | " mean_val = np.mean(aaa) # Calculate mean\n", 1920 | " bias = mean_val - true # Calculate bias\n", 1921 | " bias_squared = bias * bias # Square the bias\n", 1922 | " variance = np.var(aaa) # Calculate variance\n", 1923 | " mse_value = bias_squared + variance # MSE = bias² + variance\n", 1924 | " return [mse_value, np.sqrt(np.var((aaa-true)*(aaa-true)))] # Return MSE and RMSE\n", 1925 | "\n", 1926 | "print(np.mean(is_list))\n", 1927 | "print(mse(is_list))\n", 1928 | 
"print(\"wis\")\n", 1929 | "print(np.mean(is3_list))\n", 1930 | "print(mse(is3_list))\n", 1931 | "print(\"dm\")\n", 1932 | "print(np.mean(dm_list))\n", 1933 | "print(mse(dm_list))\n", 1934 | "print(\"dr\")\n", 1935 | "print(np.mean(dr_list))\n", 1936 | "print(mse(dr_list))\n", 1937 | "print(\"dr3\")\n", 1938 | "print(np.mean(dr3_list))\n", 1939 | "print(mse(dr3_list))" 1940 | ], 1941 | "id": "d1b9a15eca65f5db", 1942 | "outputs": [ 1943 | { 1944 | "output_type": "error", 1945 | "ename": "FileNotFoundError", 1946 | "evalue": "[Errno 2] No such file or directory: '/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz'", 1947 | "traceback": [ 1948 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1949 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 1950 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpolicy_fn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0mQ_space\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"xxx\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0mQ_space2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"xxx\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m#Q-table-cliff.npz\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1951 | "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/numpy/lib/_npyio_impl.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding, max_header_size)\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0mown_fid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 454\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 455\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menter_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfspath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 456\u001b[0m \u001b[0mown_fid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1952 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/drive/MyDrive/Work/Estimators/DoubleReinforcement/Q-table-cliff.npz'" 1953 | ] 1954 | } 1955 | ], 1956 | "execution_count": 5 1957 | }, 1958 | { 1959 | "cell_type": "code", 1960 | "source": [], 
1961 | "metadata": { 1962 | "id": "ij5Uepr11-NZ" 1963 | }, 1964 | "id": "ij5Uepr11-NZ", 1965 | "execution_count": null, 1966 | "outputs": [] 1967 | } 1968 | ], 1969 | "metadata": { 1970 | "kernelspec": { 1971 | "display_name": "Python 3", 1972 | "language": "python", 1973 | "name": "python3" 1974 | }, 1975 | "language_info": { 1976 | "codemirror_mode": { 1977 | "name": "ipython", 1978 | "version": 2 1979 | }, 1980 | "file_extension": ".py", 1981 | "mimetype": "text/x-python", 1982 | "name": "python", 1983 | "nbconvert_exporter": "python", 1984 | "pygments_lexer": "ipython2", 1985 | "version": "2.7.6" 1986 | }, 1987 | "colab": { 1988 | "provenance": [] 1989 | } 1990 | }, 1991 | "nbformat": 4, 1992 | "nbformat_minor": 5 1993 | } --------------------------------------------------------------------------------