├── .gitignore
├── README.md
├── requirements.txt
└── src
    ├── conf
    │   ├── policy_params.yaml
    │   └── q_func_hyperparams.yaml
    ├── data.py
    ├── ope.py
    ├── policy.py
    └── run_sims.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# others
.DS_Store
.vscode/
pageblock/
log/
data/
*.ipynb
note.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Optimal Off-Policy Evaluation from Multiple Logging Policies

## Overview
This repository contains the code for replicating the experiments of the paper
**"Optimal Off-Policy Evaluation from Multiple Logging Policies"** (ICML 2021, proceedings.mlr.press/v139/kallus21a.html).

If you find this code useful in your research, please cite:
```
@inproceedings{kallus2021optimal,
  title={Optimal Off-Policy Evaluation from Multiple Logging Policies},
  author={Kallus, Nathan and Saito, Yuta and Uehara, Masatoshi},
  booktitle={Proceedings of the 38th International Conference on Machine Learning},
  pages={5247--5256},
  year={2021},
  volume={139},
  publisher={PMLR},
}
```

## Dependencies
- python==3.7.3
- numpy==1.18.1
- pandas==0.25.1
- scikit-learn==0.23.1
- tensorflow==1.15.4
- pyyaml==5.1
- seaborn==0.10.1
- matplotlib==3.2.2

## Running the code

To run the simulations with the multi-class classification datasets, run the following commands in the `./src/` directory:

```
for data in optdigits pendigits
do
    python run_sims.py --num_sims 200 --data $data --is_estimate_pi_b
done
```

Note that the configurations used in the experiments can be found in `./conf/policy_params.yaml`.
Once the simulations have finished running, the summarized results can be found in the `../log/{data}` directory for each dataset.
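
As a rough guide, the summarized relative-RMSE table written by `run_sims.py` can be inspected as in the minimal sketch below. It assumes the command above was used (i.e., `--is_estimate_pi_b` with the default `--test_size 0.7`) and that it is run from the `./src/` directory; adjust the path otherwise.

```
# Illustrative sketch (not part of the experiment pipeline): load the
# rel_rmse.csv table that run_sims.py writes for one dataset.
import pandas as pd

rel_rmse = pd.read_csv(
    "../log/optdigits/test_size=0.7/estimated_pi_b/rel_rmse.csv", index_col=0
)
# Rows are the estimators (IS, DR, MRDR, SMRDR, ...); columns are the
# sample-size ratios between the two logging policies.
print(rel_rmse.round(5))
```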
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python==3.7.3
numpy==1.18.1
pandas==0.25.1
scikit-learn==0.23.1
tensorflow==1.15.4
pyyaml==5.1
seaborn==0.10.1
matplotlib==3.2.2
--------------------------------------------------------------------------------
/src/conf/policy_params.yaml:
--------------------------------------------------------------------------------
evaluation: 1.0
behavior1: 0.95
behavior2: 0.05
--------------------------------------------------------------------------------
/src/conf/q_func_hyperparams.yaml:
--------------------------------------------------------------------------------
eta: 0.01
std: 0.01
lam: 0.001
batch_size: 256
epochs: 200
--------------------------------------------------------------------------------
/src/data.py:
--------------------------------------------------------------------------------
from typing import Dict
from pathlib import Path

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


def load_datasets(
    data: str, ratio: float, test_size: float = 0.5, random_state: int = 12345
):
    """Load and preprocess raw multiclass classification data."""
    data_path = Path(f"../data/{data}")
    le = LabelEncoder()
    if data == "optdigits":
        data_ = np.r_[
            np.loadtxt(data_path / f"{data}.tra", delimiter=","),
            np.loadtxt(data_path / f"{data}.tes", delimiter=","),
        ]
    elif data == "pendigits":
        data_ = np.r_[
            np.loadtxt(data_path / f"{data}.tra", delimiter=","),
            np.loadtxt(data_path / f"{data}.tes", delimiter=","),
        ]
    elif data == "sat":
        data_ = np.r_[
            np.loadtxt(data_path / f"{data}.trn", delimiter=" "),
            np.loadtxt(data_path / f"{data}.tst", delimiter=" "),
        ]
        data_[:, -1] = np.where(data_[:, -1] == 7, 5, data_[:, -1] - 1)
    elif data == "letter":
        data_ = np.genfromtxt(
            data_path / "letter-recognition.data", delimiter=",", dtype="str"
        )
        data_ = np.c_[data_[:, 1:], le.fit_transform(data_[:, 0])].astype(float)

    np.random.shuffle(data_)
    data_tr, data_ev = train_test_split(
        data_, test_size=test_size, random_state=random_state
    )
    n_train, n_eval = data_tr.shape[0], data_ev.shape[0]
    n_dim = np.int(data_tr.shape[1] / 2)
    y_tr, y_ev = data_tr[:, -1].astype(int), data_ev[:, -1].astype(int)
    n_class = np.unique(y_tr).shape[0]
    y_full_ev = np.zeros((n_eval, n_class))
    y_full_ev[np.arange(n_eval), y_ev] = 1
    X_tr, X_ev = data_tr[:, :-1], data_ev[:, :-1]
    X_tr1, X_tr2 = data_tr[:, :n_dim], data_tr[:, n_dim:]
    X_ev1, X_ev2 = data_ev[:, :n_dim], data_ev[:, n_dim:]

    # multiple logger index generation
    ratio1 = ratio / (1 + ratio)
    n_eval1 = np.int(n_eval * ratio1)
    idx1 = np.ones(n_eval, dtype=bool)
    idx1[n_eval1:] = False

    return dict(
        n_train=n_train,
        n_eval=n_eval,
        n_dim=n_dim,
        n_class=n_class,
        n_behavior_policies=2,
        X_tr=X_tr,
        X_tr1=X_tr1,
        X_tr2=X_tr2,
        X_ev=X_ev,
        X_ev1=X_ev1,
        X_ev2=X_ev2,
        y_tr=y_tr,
        y_ev=y_ev,
        y_full_ev=y_full_ev,
        idx1=idx1,
        ratio1=(n_eval1 / n_eval),
    )


def generate_bandit_feedback(data_dict: Dict, pi_b1: np.ndarray, pi_b2: np.ndarray):
    """Generate logged bandit feedback data."""
    n_eval = data_dict["n_eval"]
    idx1, ratio1 = data_dict["idx1"], data_dict["ratio1"]
    idx1_expanded = np.expand_dims(idx1, 1)
    pi_b = pi_b1 * idx1_expanded + pi_b2 * (1 - idx1_expanded)
    pi_b_star = pi_b1 * ratio1 + pi_b2 * (1.0 - ratio1)
    action_set = np.arange(data_dict["n_class"])
    actions = np.zeros(data_dict["n_eval"], dtype=int)
    for i, pvals in enumerate(pi_b):
        actions[i] = np.random.choice(action_set, p=pvals)
    rewards = data_dict["y_full_ev"][np.arange(n_eval), actions]
    return dict(
        n_eval=data_dict["n_eval"],
        n_class=data_dict["n_class"],
        X_ev=data_dict["X_ev"],
        pi_b=pi_b,
        pi_b_star=pi_b_star,
        actions=actions,
        idx1=idx1,
        rewards=rewards,
    )
--------------------------------------------------------------------------------
/src/ope.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
import yaml

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework import ops

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression


def calc_ground_truth(y_true: np.ndarray, pi: np.ndarray) -> float:
    """Calculate the ground-truth policy value of an eval policy."""
    return pi[np.arange(y_true.shape[0]), y_true].mean()


def calc_ipw(
    rewards: np.ndarray,
    actions: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
) -> float:
    n_data = actions.shape[0]
    iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
    return (rewards * iw).mean()


def calc_var(
    rewards: np.ndarray,
    actions: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
    estimated_q_func: np.ndarray,
):
    n_data = actions.shape[0]
    v = np.average(estimated_q_func, weights=pi_e, axis=1)
    shifted_rewards = rewards - estimated_q_func[np.arange(n_data), actions]
    iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
    return np.var(shifted_rewards * iw + v)


def calc_weighted(
    rewards: np.ndarray,
    actions: np.ndarray,
    idx1: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
    estimated_q_func: np.ndarray = None,
    n_fold: int = 2,
) -> float:
    estimated_rewards_list = list()
    if estimated_q_func is None:
        estimated_q_func = np.zeros((actions.shape[0], np.int(actions.max() + 1)))
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=12345)
    for train_idx, test_idx in kf.split(rewards):
        rewards_tr, rewards_ev = rewards[train_idx], rewards[test_idx]
        actions_tr, actions_ev = actions[train_idx], actions[test_idx]
        idx1_tr, idx1_ev = idx1[train_idx], idx1[test_idx]
        pi_b_tr, pi_b_ev = pi_b[train_idx], pi_b[test_idx]
        pi_e_tr, pi_e_ev = pi_e[train_idx], pi_e[test_idx]
        estimated_q_func_tr = estimated_q_func[train_idx]
        estimated_q_func_ev = estimated_q_func[test_idx]
        # estimate lambda with one of the folds
        n_data1, n_data2 = idx1_tr.sum(), (~idx1_tr).sum()
        var1 = calc_var(
            rewards=rewards_tr[idx1_tr],
            actions=actions_tr[idx1_tr],
            pi_b=pi_b_tr[idx1_tr],
            pi_e=pi_e_tr[idx1_tr],
            estimated_q_func=estimated_q_func_tr[idx1_tr],
        )
        var2 = calc_var(
            rewards=rewards_tr[~idx1_tr],
            actions=actions_tr[~idx1_tr],
            pi_b=pi_b_tr[~idx1_tr],
            pi_e=pi_e_tr[~idx1_tr],
            estimated_q_func=estimated_q_func_tr[~idx1_tr],
        )
        denominator = (n_data1 / var1) + (n_data2 / var2)
        lam1 = (n_data1 / var1) / denominator
        lam2 = (n_data2 / var2) / denominator
        # estimate the policy value with the other fold
        iw1 = (
            pi_e_ev[idx1_ev, actions_ev[idx1_ev]]
            / pi_b_ev[idx1_ev, actions_ev[idx1_ev]]
        )
        iw2 = (
            pi_e_ev[~idx1_ev, actions_ev[~idx1_ev]]
            / pi_b_ev[~idx1_ev, actions_ev[~idx1_ev]]
        )
        v1 = np.average(estimated_q_func_ev[idx1_ev], weights=pi_e_ev[idx1_ev], axis=1)
        v2 = np.average(
            estimated_q_func_ev[~idx1_ev], weights=pi_e_ev[~idx1_ev], axis=1
        )
        shifted_rewards1 = (
            rewards_ev[idx1_ev] - estimated_q_func_ev[idx1_ev, actions_ev[idx1_ev]]
        )
        shifted_rewards2 = (
            rewards_ev[~idx1_ev] - estimated_q_func_ev[~idx1_ev, actions_ev[~idx1_ev]]
        )
        estimated_rewards = lam1 * (iw1 * shifted_rewards1 + v1).mean()
        estimated_rewards += lam2 * (iw2 * shifted_rewards2 + v2).mean()
        estimated_rewards_list.append(estimated_rewards)
    return np.mean(estimated_rewards_list)


def calc_dr(
    rewards: np.ndarray,
    actions: np.ndarray,
    estimated_q_func: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
) -> float:
    n_data = actions.shape[0]
    v = np.average(estimated_q_func, weights=pi_e, axis=1)
    iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
    shifted_rewards = rewards - estimated_q_func[np.arange(n_data), actions]
    return (iw * shifted_rewards + v).mean()


def estimate_q_func(
    bandit_feedback,
    pi_e: np.ndarray,
    fitting_method: str = "naive",
    k_fold: int = 2,
) -> np.ndarray:
    # hyperparam
    with open("./conf/q_func_hyperparams.yaml", "rb") as f:
        q_func_hyperparams = yaml.safe_load(f)

    X = bandit_feedback["X_ev"]
    y = bandit_feedback["rewards"]
    pi_b_star = bandit_feedback["pi_b_star"]
    idx1 = bandit_feedback["idx1"].astype(int)
    a = pd.get_dummies(bandit_feedback["actions"]).values
    skf = StratifiedKFold(n_splits=k_fold)
    skf.get_n_splits(X, y)
    estimated_q_func = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
    for train_idx, test_idx in skf.split(X, y):
        X_tr, X_ev = X[train_idx], X[test_idx]
        y_tr, a_tr = y[train_idx], a[train_idx].astype(float)
        pi_e_tr = pi_e[train_idx]
        pi_b_star_tr = pi_b_star[train_idx]
        idx1_tr = idx1[train_idx]
        ops.reset_default_graph()
        clf = QFuncEstimator(
            num_features=X_tr.shape[1],
            num_classes=bandit_feedback["n_class"],
            fitting_method=fitting_method,
            eta=q_func_hyperparams["eta"],
            std=q_func_hyperparams["std"],
            lam=q_func_hyperparams["lam"],
            batch_size=q_func_hyperparams["batch_size"],
            epochs=q_func_hyperparams["epochs"],
        )
        clf.train(
            X=X_tr,
            a=a_tr,
            y=y_tr,
            pi_e=pi_e_tr,
            pi_b_star=pi_b_star_tr,
            idx1=idx1_tr,
        )
        for a_idx in np.arange(bandit_feedback["n_class"]):
            estimated_q_func_for_a = clf.predict(X=X_ev, a_idx=a_idx)[:, a_idx]
            estimated_q_func[test_idx, a_idx] = estimated_q_func_for_a
        clf.s.close()
    return estimated_q_func


@dataclass
class QFuncEstimator:
    num_features: int
    num_classes: int
    eta: float = 0.01
    std: float = 0.01
    lam: float = 0.001
    batch_size: int = 256
    epochs: int = 200
    fitting_method: str = "stratified"

    def __post_init__(self) -> None:
        """Initialize Class."""
        tf.set_random_seed(0)
        self.s = tf.Session()
        self.create_placeholders()
        self.build_graph()
        self.create_losses()
        self.add_optimizer()

    def create_placeholders(self) -> None:
        """Create the placeholders to be used."""
        self.input_X = tf.placeholder(
            "float32", shape=(None, self.num_features), name="input_X"
        )
        self.input_A = tf.placeholder(
            "float32", shape=(None, self.num_classes), name="input_A"
        )
        self.input_R = tf.placeholder("float32", shape=(None,), name="input_R")
        self.input_pi_e = tf.placeholder(
            "float32", shape=(None, self.num_classes), name="input_pi_e"
        )
        self.input_pi_b_star = tf.placeholder(
            "float32", shape=(None, self.num_classes), name="input_pi_b_star"
        )
        self.input_idx1 = tf.placeholder("float32", shape=(None,), name="input_idx1")

    def build_graph(self) -> None:
        """Build the main tensorflow graph with embedding layers."""
        self.weights = tf.Variable(
            tf.random_normal(
                [self.num_features + self.num_classes, self.num_classes],
                stddev=self.std,
            )
        )
        self.bias = tf.Variable(tf.random_normal([self.num_classes], stddev=self.std))

        with tf.variable_scope("prediction"):
            input_X = tf.concat([self.input_X, self.input_A], axis=1)
            self.preds = tf.sigmoid(tf.matmul(input_X, self.weights) + self.bias)

    def create_losses(self) -> None:
        """Create the losses."""
        with tf.name_scope("loss"):
            shifted_rewards = self.input_R - tf.reduce_sum(
                self.preds * self.input_A, axis=1
            )
            if self.fitting_method == "normal":
                self.loss = tf.reduce_mean(tf.square(shifted_rewards))
            else:
                ratio1 = tf.reduce_mean(self.input_idx1)
                input_idx2 = tf.ones_like(self.input_idx1) - self.input_idx1
                ratio2 = tf.reduce_mean(input_idx2)
                pi_e = tf.reduce_sum(self.input_pi_e * self.input_A, 1)
                pi_b_star = tf.reduce_sum(self.input_pi_b_star * self.input_A, 1)
                v = tf.reduce_sum(self.input_pi_e * self.preds, 1)
                phi = (pi_e / pi_b_star) * shifted_rewards + v
                phi1 = self.input_idx1 * phi
                phi2 = input_idx2 * phi
                if self.fitting_method == "stratified":
                    self.loss = ratio1 * tf.reduce_mean(tf.square(phi1))
                    self.loss += ratio2 * tf.reduce_mean(tf.square(phi2))
                    self.loss -= ratio1 * tf.square(tf.reduce_mean(phi1))
                    self.loss -= ratio2 * tf.square(tf.reduce_mean(phi2))
                elif self.fitting_method == "naive":
                    self.loss = tf.reduce_mean(tf.square(phi))
                    self.loss -= tf.square(tf.reduce_mean(phi))

            self.var_list = [self.weights, self.bias]
            l2_reg = [tf.nn.l2_loss(v) for v in self.var_list]
            self.loss += self.lam * tf.add_n(l2_reg)

    def add_optimizer(self) -> None:
        """Add the required optimizer to the graph."""
        with tf.name_scope("optimizer"):
            self.apply_grads = tf.train.MomentumOptimizer(
                learning_rate=self.eta, momentum=0.8
            ).minimize(self.loss, var_list=self.var_list)

    def train(
        self,
        X: np.ndarray,
        a: np.ndarray,
        y: np.ndarray,
        pi_e: np.ndarray,
        pi_b_star: np.ndarray,
        idx1: np.ndarray,
    ) -> None:
        self.s.run(tf.global_variables_initializer())
        for _ in np.arange(self.epochs):
            arr = np.arange(X.shape[0])
            np.random.shuffle(arr)
            for idx in np.arange(0, X.shape[0], self.batch_size):
                arr_ = arr[idx : idx + self.batch_size]
                self.s.run(
                    self.apply_grads,
                    feed_dict={
                        self.input_X: X[arr_],
                        self.input_A: a[arr_],
                        self.input_R: y[arr_],
                        self.input_pi_e: pi_e[arr_],
                        self.input_pi_b_star: pi_b_star[arr_],
                        self.input_idx1: idx1[arr_],
                    },
                )

    def predict(self, X: np.ndarray, a_idx: int):
        a_ = np.zeros((X.shape[0], self.num_classes))
        a_[:, a_idx] = 1
        return self.s.run(self.preds, feed_dict={self.input_X: X, self.input_A: a_})


def estimate_pi_b(bandit_feedback, k_fold: int = 2):
    X = bandit_feedback["X_ev"]
    idx1 = bandit_feedback["idx1"]
    a = bandit_feedback["actions"]
    skf = StratifiedKFold(n_splits=k_fold, shuffle=True)
    skf.get_n_splits(X, a)
    estimated_pi_b1 = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
    estimated_pi_b2 = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
    estimated_pi_b_star = np.zeros(
        (bandit_feedback["n_eval"], bandit_feedback["n_class"])
    )
    for train_idx, test_idx in skf.split(X, a):
        X_tr, X_ev = X[train_idx], X[test_idx]
        idx1_tr, a_tr = idx1[train_idx], a[train_idx]
        clf = LogisticRegression(random_state=12345)
        clf.fit(X=X_tr[idx1_tr], y=a_tr[idx1_tr])
        estimated_pi_b1[test_idx, :] = clf.predict_proba(X_ev)
        clf = LogisticRegression(random_state=12345)
        clf.fit(X=X_tr[~idx1_tr], y=a_tr[~idx1_tr])
        estimated_pi_b2[test_idx, :] = clf.predict_proba(X_ev)
        clf = LogisticRegression(random_state=12345)
        clf.fit(X=X_tr, y=a_tr)
        estimated_pi_b_star[test_idx, :] = clf.predict_proba(X_ev)
    idx1 = np.expand_dims(idx1.astype(int), 1)
    bandit_feedback["pi_b"] = np.clip(
        idx1 * estimated_pi_b1 + (1 - idx1) * estimated_pi_b2, 1e-6, 1.0
    )
    bandit_feedback["pi_b_star"] = np.clip(estimated_pi_b_star, 1e-6, 1.0)
    return bandit_feedback
--------------------------------------------------------------------------------
/src/policy.py:
--------------------------------------------------------------------------------
from typing import Dict, List
import yaml

import numpy as np
from sklearn.linear_model import LogisticRegression


def train_policies(data_dict: Dict, random_state: int = 0) -> List[np.ndarray]:
    """Train evaluation and behavior policies."""
    with open("./conf/policy_params.yaml", "rb") as f:
        policy_params = yaml.safe_load(f)

    policy_list = list()
    for pol in policy_params.keys():
        # make label predictions
        X_tr, y_tr = data_dict["X_tr"], data_dict["y_tr"]
        clf = LogisticRegression(
            random_state=random_state,
            solver="lbfgs",
            multi_class="multinomial",
        ).fit(X=X_tr, y=y_tr)
        preds = clf.predict(X=data_dict["X_ev"]).astype(int)
        # transform predictions into distribution over actions
        alpha = policy_params[pol]
        pi = np.zeros((data_dict["n_eval"], data_dict["n_class"]))
        pi[:, :] = (1.0 - alpha) / data_dict["n_class"]
        pi[np.arange(data_dict["n_eval"]), preds] = (
            alpha + (1.0 - alpha) / data_dict["n_class"]
        )
        policy_list.append(pi)
    return policy_list
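
For intuition, `train_policies` turns each classifier prediction into a softened argmax distribution: the predicted label receives `alpha + (1 - alpha) / n_class` and every other action receives `(1 - alpha) / n_class`, with `alpha` taken from `conf/policy_params.yaml`. The snippet below is an illustrative sketch only (not part of the module), using a hypothetical 10-class problem and the alpha values from that config.

```
# Illustrative only: the per-context action distribution built by train_policies.
import numpy as np

n_class, pred = 10, 3  # hypothetical number of classes and predicted label
for name, alpha in [("evaluation", 1.0), ("behavior1", 0.95), ("behavior2", 0.05)]:
    pi = np.full(n_class, (1.0 - alpha) / n_class)
    pi[pred] = alpha + (1.0 - alpha) / n_class
    # e.g. behavior1 puts 0.955 on the predicted label and 0.005 on each other action
    print(name, pi.round(3), pi.sum())
```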
--------------------------------------------------------------------------------
/src/run_sims.py:
--------------------------------------------------------------------------------
import argparse
import time
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

from data import load_datasets, generate_bandit_feedback
from ope import (
    calc_ipw,
    calc_weighted,
    calc_dr,
    estimate_q_func,
    estimate_pi_b,
    calc_ground_truth,
)
from policy import train_policies


def calc_rel_rmse(policy_value_true: float, policy_value_estimated: float) -> float:
    return np.sqrt(
        (((policy_value_true - policy_value_estimated) / policy_value_true) ** 2).mean()
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_sims", "-n", type=int, required=True)
    parser.add_argument("--data", "-d", type=str, required=True)
    parser.add_argument("--test_size", "-t", type=float, default=0.7)
    parser.add_argument("--is_estimate_pi_b", "-i", action="store_true")
    args = parser.parse_args()
    print(args)

    # configurations
    num_sims = args.num_sims
    data = args.data
    test_size = args.test_size
    is_estimate_pi_b = args.is_estimate_pi_b
    np.random.seed(12345)
    ratio_list = [0.1, 0.2, 0.5, 1, 2, 4, 10]
    estimator_names = [
        "ground_truth",
        "IS-Avg",
        "IS",
        "IS-PW(f)",
        "DR-Avg",
        "DR-PW",
        "DR",
        "MRDR",
        "SMRDR",
    ]
    log_path = (
        Path("../log") / data / f"test_size={test_size}" / "estimated_pi_b"
        if is_estimate_pi_b
        else Path("../log") / data / f"test_size={test_size}" / "true_pi_b"
    )
    log_path.mkdir(parents=True, exist_ok=True)
    raw_results_path = log_path / "raw_results"
    raw_results_path.mkdir(parents=True, exist_ok=True)

    rel_rmse_results = {
        name: {r: np.zeros(num_sims) for r in ratio_list} for name in estimator_names
    }
    for ratio in ratio_list:
        start = time.time()
        ope_results = {name: np.zeros(num_sims) for name in estimator_names}
        for sim_id in np.arange(num_sims):
            # load and split data
            data_dict = load_datasets(
                data=data, test_size=test_size, ratio=ratio, random_state=sim_id
            )
            # train eval and two behavior policies
            pi_e, pi_b1, pi_b2 = train_policies(
                data_dict=data_dict,
                random_state=sim_id,
            )
            # generate bandit feedback
            bandit_feedback_ = generate_bandit_feedback(
                data_dict=data_dict, pi_b1=pi_b1, pi_b2=pi_b2
            )
            # estimate pi_b1, pi_b2, and pi_b_star with 2-fold cross-fitting
            if is_estimate_pi_b:
                bandit_feedback = estimate_pi_b(bandit_feedback=bandit_feedback_)
            else:
                bandit_feedback = bandit_feedback_
            # estimate q-function with 2-fold cross-fitting
            estimated_q_func = estimate_q_func(
                bandit_feedback=bandit_feedback,
                pi_e=pi_e,
                fitting_method="normal",
            )
            estimated_q_func_with_mrdr_wrong = estimate_q_func(
                bandit_feedback=bandit_feedback,
                pi_e=pi_e,
                fitting_method="naive",
            )
            estimated_q_func_with_mrdr = estimate_q_func(
                bandit_feedback=bandit_feedback,
                pi_e=pi_e,
                fitting_method="stratified",
            )
            # off-policy evaluation
ope_results["ground_truth"][sim_id] = calc_ground_truth( 110 | y_true=data_dict["y_ev"], pi=pi_e 111 | ) 112 | ope_results["IS-Avg"][sim_id] = calc_ipw( 113 | rewards=bandit_feedback["rewards"], 114 | actions=bandit_feedback["actions"], 115 | pi_b=bandit_feedback["pi_b"], 116 | pi_e=pi_e, 117 | ) 118 | ope_results["IS"][sim_id] = calc_ipw( 119 | rewards=bandit_feedback["rewards"], 120 | actions=bandit_feedback["actions"], 121 | pi_b=bandit_feedback["pi_b_star"], 122 | pi_e=pi_e, 123 | ) 124 | ope_results["IS-PW(f)"][sim_id] = calc_weighted( 125 | rewards=bandit_feedback["rewards"], 126 | actions=bandit_feedback["actions"], 127 | idx1=bandit_feedback["idx1"], 128 | pi_b=bandit_feedback["pi_b"], 129 | pi_e=pi_e, 130 | ) 131 | ope_results["DR-Avg"][sim_id] = calc_dr( 132 | rewards=bandit_feedback["rewards"], 133 | actions=bandit_feedback["actions"], 134 | estimated_q_func=estimated_q_func, 135 | pi_b=bandit_feedback["pi_b"], 136 | pi_e=pi_e, 137 | ) 138 | ope_results["DR-PW"][sim_id] = calc_weighted( 139 | rewards=bandit_feedback["rewards"], 140 | actions=bandit_feedback["actions"], 141 | idx1=bandit_feedback["idx1"], 142 | pi_b=bandit_feedback["pi_b"], 143 | pi_e=pi_e, 144 | estimated_q_func=estimated_q_func, 145 | ) 146 | ope_results["DR"][sim_id] = calc_dr( 147 | rewards=bandit_feedback["rewards"], 148 | actions=bandit_feedback["actions"], 149 | estimated_q_func=estimated_q_func, 150 | pi_b=bandit_feedback["pi_b_star"], 151 | pi_e=pi_e, 152 | ) 153 | ope_results["MRDR"][sim_id] = calc_dr( 154 | rewards=bandit_feedback["rewards"], 155 | actions=bandit_feedback["actions"], 156 | estimated_q_func=estimated_q_func_with_mrdr_wrong, 157 | pi_b=bandit_feedback["pi_b_star"], 158 | pi_e=pi_e, 159 | ) 160 | ope_results["SMRDR"][sim_id] = calc_dr( 161 | rewards=bandit_feedback["rewards"], 162 | actions=bandit_feedback["actions"], 163 | estimated_q_func=estimated_q_func_with_mrdr, 164 | pi_b=bandit_feedback["pi_b_star"], 165 | pi_e=pi_e, 166 | ) 167 | if ((sim_id + 1) % 20) == 0: 168 | print( 169 | f"ratio={ratio}-{sim_id+1}th: {np.round((time.time() - start) / 60, 2)}min" 170 | ) 171 | # save raw off-policy evaluation results. 172 | with open(raw_results_path / f"ratio={ratio}.pkl", mode="wb") as f: 173 | pickle.dump(ope_results, f) 174 | for estimator in estimator_names: 175 | rel_rmse_results[estimator][ratio] = calc_rel_rmse( 176 | policy_value_true=ope_results["ground_truth"], 177 | policy_value_estimated=ope_results[estimator], 178 | ) 179 | print(f"finish ratio={ratio}: {np.round((time.time() - start) / 60, 2)}min") 180 | print("=" * 50) 181 | 182 | # save results of the evaluation of OPE 183 | rel_rmse_results_df = pd.DataFrame(rel_rmse_results).drop("ground_truth", 1) 184 | rel_rmse_results_df.T.round(5).to_csv(log_path / f"rel_rmse.csv") 185 | --------------------------------------------------------------------------------