├── .gitignore
├── README.md
├── requirements.txt
└── src
    ├── conf
    │   ├── policy_params.yaml
    │   └── q_func_hyperparams.yaml
    ├── data.py
    ├── ope.py
    ├── policy.py
    └── run_sims.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# others
.DS_Store
.vscode/
pageblock/
log/
data/
*.ipynb
note.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Optimal Off-Policy Evaluation from Multiple Logging Policies

## Overview
This repository contains the code for replicating the experiments of the paper
**"Optimal Off-Policy Evaluation from Multiple Logging Policies"** (ICML 2021, proceedings.mlr.press/v139/kallus21a.html).

If you find this code useful in your research, please cite:
```
@inproceedings{kallus2021optimal,
  title={Optimal Off-Policy Evaluation from Multiple Logging Policies},
  author={Kallus, Nathan and Saito, Yuta and Uehara, Masatoshi},
  booktitle={Proceedings of the 38th International Conference on Machine Learning},
  pages={5247--5256},
  year={2021},
  volume={139},
  publisher={PMLR},
}
```

## Dependencies
- python==3.7.3
- numpy==1.18.1
- pandas==0.25.1
- scikit-learn==0.23.1
- tensorflow==1.15.4
- pyyaml==5.1
- seaborn==0.10.1
- matplotlib==3.2.2

## Running the code

To run the simulations with the multi-class classification datasets, run the following commands in the `./src/` directory:

```
for data in optdigits pendigits
do
    python run_sims.py --num_sims 200 --data $data --is_estimate_pi_b
done
```

Note that the configurations used in the experiments can be found in `./conf/policy_params.yaml`.
Once the simulations have finished running, the summarized results can be found in the `../log/{data}` directory for each dataset.
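
As a rough guide, the summarized relative-RMSE table written by `run_sims.py` can be inspected as in the minimal sketch below. It assumes the command above was used (i.e., `--is_estimate_pi_b` with the default `--test_size 0.7`) and that it is run from the `./src/` directory; adjust the path otherwise.

```
# Illustrative sketch (not part of the experiment pipeline): load the
# rel_rmse.csv table that run_sims.py writes for one dataset.
import pandas as pd

rel_rmse = pd.read_csv(
    "../log/optdigits/test_size=0.7/estimated_pi_b/rel_rmse.csv", index_col=0
)
# Rows are the estimators (IS, DR, MRDR, SMRDR, ...); columns are the
# sample-size ratios between the two logging policies.
print(rel_rmse.round(5))
```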
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python==3.7.3
numpy==1.18.1
pandas==0.25.1
scikit-learn==0.23.1
tensorflow==1.15.4
pyyaml==5.1
seaborn==0.10.1
matplotlib==3.2.2
--------------------------------------------------------------------------------
/src/conf/policy_params.yaml:
--------------------------------------------------------------------------------
evaluation: 1.0
behavior1: 0.95
behavior2: 0.05
--------------------------------------------------------------------------------
/src/conf/q_func_hyperparams.yaml:
--------------------------------------------------------------------------------
eta: 0.01
std: 0.01
lam: 0.001
batch_size: 256
epochs: 200
--------------------------------------------------------------------------------
/src/data.py:
--------------------------------------------------------------------------------
from typing import Dict
from pathlib import Path

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


def load_datasets(
    data: str, ratio: float, test_size: float = 0.5, random_state: int = 12345
):
    """Load and preprocess raw multiclass classification data."""
    data_path = Path(f"../data/{data}")
    le = LabelEncoder()
    if data == "optdigits":
        data_ = np.r_[
            np.loadtxt(data_path / f"{data}.tra", delimiter=","),
            np.loadtxt(data_path / f"{data}.tes", delimiter=","),
        ]
    elif data == "pendigits":
        data_ = np.r_[
            np.loadtxt(data_path / f"{data}.tra", delimiter=","),
            np.loadtxt(data_path / f"{data}.tes", delimiter=","),
        ]
    elif data == "sat":
        data_ = np.r_[
            np.loadtxt(data_path / f"{data}.trn", delimiter=" "),
            np.loadtxt(data_path / f"{data}.tst", delimiter=" "),
        ]
        data_[:, -1] = np.where(data_[:, -1] == 7, 5, data_[:, -1] - 1)
    elif data == "letter":
        data_ = np.genfromtxt(
            data_path / "letter-recognition.data", delimiter=",", dtype="str"
        )
        data_ = np.c_[data_[:, 1:], le.fit_transform(data_[:, 0])].astype(float)

    np.random.shuffle(data_)
    data_tr, data_ev = train_test_split(
        data_, test_size=test_size, random_state=random_state
    )
    n_train, n_eval = data_tr.shape[0], data_ev.shape[0]
    n_dim = np.int(data_tr.shape[1] / 2)
    y_tr, y_ev = data_tr[:, -1].astype(int), data_ev[:, -1].astype(int)
    n_class = np.unique(y_tr).shape[0]
    y_full_ev = np.zeros((n_eval, n_class))
    y_full_ev[np.arange(n_eval), y_ev] = 1
    X_tr, X_ev = data_tr[:, :-1], data_ev[:, :-1]
    X_tr1, X_tr2 = data_tr[:, :n_dim], data_tr[:, n_dim:]
    X_ev1, X_ev2 = data_ev[:, :n_dim], data_ev[:, n_dim:]

    # multiple logger index generation
    ratio1 = ratio / (1 + ratio)
    n_eval1 = np.int(n_eval * ratio1)
    idx1 = np.ones(n_eval, dtype=bool)
    idx1[n_eval1:] = False

    return dict(
        n_train=n_train,
        n_eval=n_eval,
        n_dim=n_dim,
        n_class=n_class,
        n_behavior_policies=2,
        X_tr=X_tr,
        X_tr1=X_tr1,
        X_tr2=X_tr2,
        X_ev=X_ev,
        X_ev1=X_ev1,
        X_ev2=X_ev2,
        y_tr=y_tr,
        y_ev=y_ev,
        y_full_ev=y_full_ev,
        idx1=idx1,
        ratio1=(n_eval1 / n_eval),
    )


def generate_bandit_feedback(data_dict: Dict, pi_b1: np.ndarray, pi_b2: np.ndarray):
    """Generate logged bandit feedback data."""
    n_eval = data_dict["n_eval"]
    idx1, ratio1 = data_dict["idx1"], data_dict["ratio1"]
    idx1_expanded = np.expand_dims(idx1, 1)
    pi_b = pi_b1 * idx1_expanded + pi_b2 * (1 - idx1_expanded)
    pi_b_star = pi_b1 * ratio1 + pi_b2 * (1.0 - ratio1)
    action_set = np.arange(data_dict["n_class"])
    actions = np.zeros(data_dict["n_eval"], dtype=int)
    for i, pvals in enumerate(pi_b):
        actions[i] = np.random.choice(action_set, p=pvals)
    rewards = data_dict["y_full_ev"][np.arange(n_eval), actions]
    return dict(
        n_eval=data_dict["n_eval"],
        n_class=data_dict["n_class"],
        X_ev=data_dict["X_ev"],
        pi_b=pi_b,
        pi_b_star=pi_b_star,
        actions=actions,
        idx1=idx1,
        rewards=rewards,
    )
--------------------------------------------------------------------------------
/src/ope.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
import yaml

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework import ops

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression


def calc_ground_truth(y_true: np.ndarray, pi: np.ndarray) -> float:
    """Calculate the ground-truth policy value of an eval policy."""
    return pi[np.arange(y_true.shape[0]), y_true].mean()


def calc_ipw(
    rewards: np.ndarray,
    actions: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
) -> float:
    n_data = actions.shape[0]
    iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
    return (rewards * iw).mean()


def calc_var(
    rewards: np.ndarray,
    actions: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
    estimated_q_func: np.ndarray,
):
    n_data = actions.shape[0]
    v = np.average(estimated_q_func, weights=pi_e, axis=1)
    shifted_rewards = rewards - estimated_q_func[np.arange(n_data), actions]
    iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
    return np.var(shifted_rewards * iw + v)


def calc_weighted(
    rewards: np.ndarray,
    actions: np.ndarray,
    idx1: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
    estimated_q_func: np.ndarray = None,
    n_fold: int = 2,
) -> float:
    estimated_rewards_list = list()
    if estimated_q_func is None:
        estimated_q_func = np.zeros((actions.shape[0], np.int(actions.max() + 1)))
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=12345)
    for train_idx, test_idx in kf.split(rewards):
        rewards_tr, rewards_ev = rewards[train_idx], rewards[test_idx]
        actions_tr, actions_ev = actions[train_idx], actions[test_idx]
        idx1_tr, idx1_ev = idx1[train_idx], idx1[test_idx]
        pi_b_tr, pi_b_ev = pi_b[train_idx], pi_b[test_idx]
        pi_e_tr, pi_e_ev = pi_e[train_idx], pi_e[test_idx]
        estimated_q_func_tr = estimated_q_func[train_idx]
        estimated_q_func_ev = estimated_q_func[test_idx]
        # estimate lambda with one of the folds
        n_data1, n_data2 = idx1_tr.sum(), (~idx1_tr).sum()
        var1 = calc_var(
            rewards=rewards_tr[idx1_tr],
            actions=actions_tr[idx1_tr],
            pi_b=pi_b_tr[idx1_tr],
            pi_e=pi_e_tr[idx1_tr],
            estimated_q_func=estimated_q_func_tr[idx1_tr],
        )
        var2 = calc_var(
            rewards=rewards_tr[~idx1_tr],
            actions=actions_tr[~idx1_tr],
            pi_b=pi_b_tr[~idx1_tr],
            pi_e=pi_e_tr[~idx1_tr],
            estimated_q_func=estimated_q_func_tr[~idx1_tr],
        )
        denominator = (n_data1 / var1) + (n_data2 / var2)
        lam1 = (n_data1 / var1) / denominator
        lam2 = (n_data2 / var2) / denominator
        # estimate the policy value with the other fold
        iw1 = (
            pi_e_ev[idx1_ev, actions_ev[idx1_ev]]
            / pi_b_ev[idx1_ev, actions_ev[idx1_ev]]
        )
        iw2 = (
            pi_e_ev[~idx1_ev, actions_ev[~idx1_ev]]
            / pi_b_ev[~idx1_ev, actions_ev[~idx1_ev]]
        )
        v1 = np.average(estimated_q_func_ev[idx1_ev], weights=pi_e_ev[idx1_ev], axis=1)
        v2 = np.average(
            estimated_q_func_ev[~idx1_ev], weights=pi_e_ev[~idx1_ev], axis=1
        )
        shifted_rewards1 = (
            rewards_ev[idx1_ev] - estimated_q_func_ev[idx1_ev, actions_ev[idx1_ev]]
        )
        shifted_rewards2 = (
            rewards_ev[~idx1_ev] - estimated_q_func_ev[~idx1_ev, actions_ev[~idx1_ev]]
        )
        estimated_rewards = lam1 * (iw1 * shifted_rewards1 + v1).mean()
        estimated_rewards += lam2 * (iw2 * shifted_rewards2 + v2).mean()
        estimated_rewards_list.append(estimated_rewards)
    return np.mean(estimated_rewards_list)


def calc_dr(
    rewards: np.ndarray,
    actions: np.ndarray,
    estimated_q_func: np.ndarray,
    pi_b: np.ndarray,
    pi_e: np.ndarray,
) -> float:
    n_data = actions.shape[0]
    v = np.average(estimated_q_func, weights=pi_e, axis=1)
    iw = pi_e[np.arange(n_data), actions] / pi_b[np.arange(n_data), actions]
    shifted_rewards = rewards - estimated_q_func[np.arange(n_data), actions]
    return (iw * shifted_rewards + v).mean()


def estimate_q_func(
    bandit_feedback,
    pi_e: np.ndarray,
    fitting_method: str = "naive",
    k_fold: int = 2,
) -> np.ndarray:
    # hyperparam
    with open("./conf/q_func_hyperparams.yaml", "rb") as f:
        q_func_hyperparams = yaml.safe_load(f)

    X = bandit_feedback["X_ev"]
    y = bandit_feedback["rewards"]
    pi_b_star = bandit_feedback["pi_b_star"]
    idx1 = bandit_feedback["idx1"].astype(int)
    a = pd.get_dummies(bandit_feedback["actions"]).values
    skf = StratifiedKFold(n_splits=k_fold)
    skf.get_n_splits(X, y)
    estimated_q_func = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
    for train_idx, test_idx in skf.split(X, y):
        X_tr, X_ev = X[train_idx], X[test_idx]
        y_tr, a_tr = y[train_idx], a[train_idx].astype(float)
        pi_e_tr = pi_e[train_idx]
        pi_b_star_tr = pi_b_star[train_idx]
        idx1_tr = idx1[train_idx]
        ops.reset_default_graph()
        clf = QFuncEstimator(
            num_features=X_tr.shape[1],
            num_classes=bandit_feedback["n_class"],
            fitting_method=fitting_method,
            eta=q_func_hyperparams["eta"],
            std=q_func_hyperparams["std"],
            lam=q_func_hyperparams["lam"],
            batch_size=q_func_hyperparams["batch_size"],
            epochs=q_func_hyperparams["epochs"],
        )
        clf.train(
            X=X_tr,
            a=a_tr,
            y=y_tr,
            pi_e=pi_e_tr,
            pi_b_star=pi_b_star_tr,
            idx1=idx1_tr,
        )
        for a_idx in np.arange(bandit_feedback["n_class"]):
            estimated_q_func_for_a = clf.predict(X=X_ev, a_idx=a_idx)[:, a_idx]
            estimated_q_func[test_idx, a_idx] = estimated_q_func_for_a
        clf.s.close()
    return estimated_q_func


@dataclass
class QFuncEstimator:
    num_features: int
    num_classes: int
    eta: float = 0.01
    std: float = 0.01
    lam: float = 0.001
    batch_size: int = 256
    epochs: int = 200
    fitting_method: str = "stratified"

    def __post_init__(self) -> None:
        """Initialize Class."""
        tf.set_random_seed(0)
        self.s = tf.Session()
        self.create_placeholders()
        self.build_graph()
        self.create_losses()
        self.add_optimizer()

    def create_placeholders(self) -> None:
        """Create the placeholders to be used."""
        self.input_X = tf.placeholder(
            "float32", shape=(None, self.num_features), name="input_X"
        )
        self.input_A = tf.placeholder(
            "float32", shape=(None, self.num_classes), name="input_A"
        )
        self.input_R = tf.placeholder("float32", shape=(None,), name="input_R")
        self.input_pi_e = tf.placeholder(
            "float32", shape=(None, self.num_classes), name="input_pi_e"
        )
        self.input_pi_b_star = tf.placeholder(
            "float32", shape=(None, self.num_classes), name="input_pi_b_star"
        )
        self.input_idx1 = tf.placeholder("float32", shape=(None,), name="input_idx1")

    def build_graph(self) -> None:
        """Build the main tensorflow graph with embedding layers."""
        self.weights = tf.Variable(
            tf.random_normal(
                [self.num_features + self.num_classes, self.num_classes],
                stddev=self.std,
            )
        )
        self.bias = tf.Variable(tf.random_normal([self.num_classes], stddev=self.std))

        with tf.variable_scope("prediction"):
            input_X = tf.concat([self.input_X, self.input_A], axis=1)
            self.preds = tf.sigmoid(tf.matmul(input_X, self.weights) + self.bias)

    def create_losses(self) -> None:
        """Create the losses."""
        with tf.name_scope("loss"):
            shifted_rewards = self.input_R - tf.reduce_sum(
                self.preds * self.input_A, axis=1
            )
            if self.fitting_method == "normal":
                self.loss = tf.reduce_mean(tf.square(shifted_rewards))
            else:
                ratio1 = tf.reduce_mean(self.input_idx1)
                input_idx2 = tf.ones_like(self.input_idx1) - self.input_idx1
                ratio2 = tf.reduce_mean(input_idx2)
                pi_e = tf.reduce_sum(self.input_pi_e * self.input_A, 1)
                pi_b_star = tf.reduce_sum(self.input_pi_b_star * self.input_A, 1)
                v = tf.reduce_sum(self.input_pi_e * self.preds, 1)
                phi = (pi_e / pi_b_star) * shifted_rewards + v
                phi1 = self.input_idx1 * phi
                phi2 = input_idx2 * phi
                if self.fitting_method == "stratified":
                    self.loss = ratio1 * tf.reduce_mean(tf.square(phi1))
                    self.loss += ratio2 * tf.reduce_mean(tf.square(phi2))
                    self.loss -= ratio1 * tf.square(tf.reduce_mean(phi1))
                    self.loss -= ratio2 * tf.square(tf.reduce_mean(phi2))
                elif self.fitting_method == "naive":
                    self.loss = tf.reduce_mean(tf.square(phi))
                    self.loss -= tf.square(tf.reduce_mean(phi))

            self.var_list = [self.weights, self.bias]
            l2_reg = [tf.nn.l2_loss(v) for v in self.var_list]
            self.loss += self.lam * tf.add_n(l2_reg)

    def add_optimizer(self) -> None:
        """Add the required optimizer to the graph."""
        with tf.name_scope("optimizer"):
            self.apply_grads = tf.train.MomentumOptimizer(
                learning_rate=self.eta, momentum=0.8
            ).minimize(self.loss, var_list=self.var_list)

    def train(
        self,
        X: np.ndarray,
        a: np.ndarray,
        y: np.ndarray,
        pi_e: np.ndarray,
        pi_b_star: np.ndarray,
        idx1: np.ndarray,
    ) -> None:
        self.s.run(tf.global_variables_initializer())
        for _ in np.arange(self.epochs):
            arr = np.arange(X.shape[0])
            np.random.shuffle(arr)
            for idx in np.arange(0, X.shape[0], self.batch_size):
                arr_ = arr[idx : idx + self.batch_size]
                self.s.run(
                    self.apply_grads,
                    feed_dict={
                        self.input_X: X[arr_],
                        self.input_A: a[arr_],
                        self.input_R: y[arr_],
                        self.input_pi_e: pi_e[arr_],
                        self.input_pi_b_star: pi_b_star[arr_],
                        self.input_idx1: idx1[arr_],
                    },
                )

    def predict(self, X: np.ndarray, a_idx: int):
        a_ = np.zeros((X.shape[0], self.num_classes))
        a_[:, a_idx] = 1
        return self.s.run(self.preds, feed_dict={self.input_X: X, self.input_A: a_})


def estimate_pi_b(bandit_feedback, k_fold: int = 2):
    X = bandit_feedback["X_ev"]
    idx1 = bandit_feedback["idx1"]
    a = bandit_feedback["actions"]
    skf = StratifiedKFold(n_splits=k_fold, shuffle=True)
    skf.get_n_splits(X, a)
    estimated_pi_b1 = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
    estimated_pi_b2 = np.zeros((bandit_feedback["n_eval"], bandit_feedback["n_class"]))
    estimated_pi_b_star = np.zeros(
        (bandit_feedback["n_eval"], bandit_feedback["n_class"])
    )
    for train_idx, test_idx in skf.split(X, a):
        X_tr, X_ev = X[train_idx], X[test_idx]
        idx1_tr, a_tr = idx1[train_idx], a[train_idx]
        clf = LogisticRegression(random_state=12345)
        clf.fit(X=X_tr[idx1_tr], y=a_tr[idx1_tr])
        estimated_pi_b1[test_idx, :] = clf.predict_proba(X_ev)
        clf = LogisticRegression(random_state=12345)
        clf.fit(X=X_tr[~idx1_tr], y=a_tr[~idx1_tr])
        estimated_pi_b2[test_idx, :] = clf.predict_proba(X_ev)
        clf = LogisticRegression(random_state=12345)
        clf.fit(X=X_tr, y=a_tr)
        estimated_pi_b_star[test_idx, :] = clf.predict_proba(X_ev)
    idx1 = np.expand_dims(idx1.astype(int), 1)
    bandit_feedback["pi_b"] = np.clip(
        idx1 * estimated_pi_b1 + (1 - idx1) * estimated_pi_b2, 1e-6, 1.0
    )
    bandit_feedback["pi_b_star"] = np.clip(estimated_pi_b_star, 1e-6, 1.0)
    return bandit_feedback
--------------------------------------------------------------------------------
/src/policy.py:
--------------------------------------------------------------------------------
from typing import Dict, List
import yaml

import numpy as np
from sklearn.linear_model import LogisticRegression


def train_policies(data_dict: Dict, random_state: int = 0) -> List[np.ndarray]:
    """Train evaluation and behavior policies."""
    with open("./conf/policy_params.yaml", "rb") as f:
        policy_params = yaml.safe_load(f)

    policy_list = list()
    for pol in policy_params.keys():
        # make label predictions
        X_tr, y_tr = data_dict["X_tr"], data_dict["y_tr"]
        clf = LogisticRegression(
            random_state=random_state,
            solver="lbfgs",
            multi_class="multinomial",
        ).fit(X=X_tr, y=y_tr)
        preds = clf.predict(X=data_dict["X_ev"]).astype(int)
        # transform predictions into distribution over actions
        alpha = policy_params[pol]
        pi = np.zeros((data_dict["n_eval"], data_dict["n_class"]))
        pi[:, :] = (1.0 - alpha) / data_dict["n_class"]
        pi[np.arange(data_dict["n_eval"]), preds] = (
            alpha + (1.0 - alpha) / data_dict["n_class"]
        )
        policy_list.append(pi)
    return policy_list
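
For intuition, `train_policies` turns each classifier prediction into a softened argmax distribution: the predicted label receives `alpha + (1 - alpha) / n_class` and every other action receives `(1 - alpha) / n_class`, with `alpha` taken from `conf/policy_params.yaml`. The snippet below is an illustrative sketch only (not part of the module), using a hypothetical 10-class problem and the alpha values from that config.

```
# Illustrative only: the per-context action distribution built by train_policies.
import numpy as np

n_class, pred = 10, 3  # hypothetical number of classes and predicted label
for name, alpha in [("evaluation", 1.0), ("behavior1", 0.95), ("behavior2", 0.05)]:
    pi = np.full(n_class, (1.0 - alpha) / n_class)
    pi[pred] = alpha + (1.0 - alpha) / n_class
    # e.g. behavior1 puts 0.955 on the predicted label and 0.005 on each other action
    print(name, pi.round(3), pi.sum())
```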
--------------------------------------------------------------------------------
/src/run_sims.py:
--------------------------------------------------------------------------------
import argparse
import time
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

from data import load_datasets, generate_bandit_feedback
from ope import (
    calc_ipw,
    calc_weighted,
    calc_dr,
    estimate_q_func,
    estimate_pi_b,
    calc_ground_truth,
)
from policy import train_policies


def calc_rel_rmse(policy_value_true: float, policy_value_estimated: float) -> float:
    return np.sqrt(
        (((policy_value_true - policy_value_estimated) / policy_value_true) ** 2).mean()
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_sims", "-n", type=int, required=True)
    parser.add_argument("--data", "-d", type=str, required=True)
    parser.add_argument("--test_size", "-t", type=float, default=0.7)
    parser.add_argument("--is_estimate_pi_b", "-i", action="store_true")
    args = parser.parse_args()
    print(args)

    # configurations
    num_sims = args.num_sims
    data = args.data
    test_size = args.test_size
    is_estimate_pi_b = args.is_estimate_pi_b
    np.random.seed(12345)
    ratio_list = [0.1, 0.2, 0.5, 1, 2, 4, 10]
    estimator_names = [
        "ground_truth",
        "IS-Avg",
        "IS",
        "IS-PW(f)",
        "DR-Avg",
        "DR-PW",
        "DR",
        "MRDR",
        "SMRDR",
    ]
    log_path = (
        Path("../log") / data / f"test_size={test_size}" / "estimated_pi_b"
        if is_estimate_pi_b
        else Path("../log") / data / f"test_size={test_size}" / "true_pi_b"
    )
    log_path.mkdir(parents=True, exist_ok=True)
    raw_results_path = log_path / "raw_results"
    raw_results_path.mkdir(parents=True, exist_ok=True)

    rel_rmse_results = {
        name: {r: np.zeros(num_sims) for r in ratio_list} for name in estimator_names
    }
    for ratio in ratio_list:
        start = time.time()
        ope_results = {name: np.zeros(num_sims) for name in estimator_names}
        for sim_id in np.arange(num_sims):
            # load and split data
            data_dict = load_datasets(
                data=data, test_size=test_size, ratio=ratio, random_state=sim_id
            )
            # train eval and two behavior policies
            pi_e, pi_b1, pi_b2 = train_policies(
                data_dict=data_dict,
                random_state=sim_id,
            )
            # generate bandit feedback
            bandit_feedback_ = generate_bandit_feedback(
                data_dict=data_dict, pi_b1=pi_b1, pi_b2=pi_b2
            )
            # estimate pi_b1, pi_b2, and pi_b_star with 2-fold cross-fitting
            if is_estimate_pi_b:
                bandit_feedback = estimate_pi_b(bandit_feedback=bandit_feedback_)
            else:
                bandit_feedback = bandit_feedback_
            # estimate q-function with 2-fold cross-fitting
            estimated_q_func = estimate_q_func(
                bandit_feedback=bandit_feedback,
                pi_e=pi_e,
                fitting_method="normal",
            )
            estimated_q_func_with_mrdr_wrong = estimate_q_func(
                bandit_feedback=bandit_feedback,
                pi_e=pi_e,
                fitting_method="naive",
            )
            estimated_q_func_with_mrdr = estimate_q_func(
                bandit_feedback=bandit_feedback,
                pi_e=pi_e,
                fitting_method="stratified",
            )
            # off-policy evaluation
ope_results["ground_truth"][sim_id] = calc_ground_truth( 110 | y_true=data_dict["y_ev"], pi=pi_e 111 | ) 112 | ope_results["IS-Avg"][sim_id] = calc_ipw( 113 | rewards=bandit_feedback["rewards"], 114 | actions=bandit_feedback["actions"], 115 | pi_b=bandit_feedback["pi_b"], 116 | pi_e=pi_e, 117 | ) 118 | ope_results["IS"][sim_id] = calc_ipw( 119 | rewards=bandit_feedback["rewards"], 120 | actions=bandit_feedback["actions"], 121 | pi_b=bandit_feedback["pi_b_star"], 122 | pi_e=pi_e, 123 | ) 124 | ope_results["IS-PW(f)"][sim_id] = calc_weighted( 125 | rewards=bandit_feedback["rewards"], 126 | actions=bandit_feedback["actions"], 127 | idx1=bandit_feedback["idx1"], 128 | pi_b=bandit_feedback["pi_b"], 129 | pi_e=pi_e, 130 | ) 131 | ope_results["DR-Avg"][sim_id] = calc_dr( 132 | rewards=bandit_feedback["rewards"], 133 | actions=bandit_feedback["actions"], 134 | estimated_q_func=estimated_q_func, 135 | pi_b=bandit_feedback["pi_b"], 136 | pi_e=pi_e, 137 | ) 138 | ope_results["DR-PW"][sim_id] = calc_weighted( 139 | rewards=bandit_feedback["rewards"], 140 | actions=bandit_feedback["actions"], 141 | idx1=bandit_feedback["idx1"], 142 | pi_b=bandit_feedback["pi_b"], 143 | pi_e=pi_e, 144 | estimated_q_func=estimated_q_func, 145 | ) 146 | ope_results["DR"][sim_id] = calc_dr( 147 | rewards=bandit_feedback["rewards"], 148 | actions=bandit_feedback["actions"], 149 | estimated_q_func=estimated_q_func, 150 | pi_b=bandit_feedback["pi_b_star"], 151 | pi_e=pi_e, 152 | ) 153 | ope_results["MRDR"][sim_id] = calc_dr( 154 | rewards=bandit_feedback["rewards"], 155 | actions=bandit_feedback["actions"], 156 | estimated_q_func=estimated_q_func_with_mrdr_wrong, 157 | pi_b=bandit_feedback["pi_b_star"], 158 | pi_e=pi_e, 159 | ) 160 | ope_results["SMRDR"][sim_id] = calc_dr( 161 | rewards=bandit_feedback["rewards"], 162 | actions=bandit_feedback["actions"], 163 | estimated_q_func=estimated_q_func_with_mrdr, 164 | pi_b=bandit_feedback["pi_b_star"], 165 | pi_e=pi_e, 166 | ) 167 | if ((sim_id + 1) % 20) == 0: 168 | print( 169 | f"ratio={ratio}-{sim_id+1}th: {np.round((time.time() - start) / 60, 2)}min" 170 | ) 171 | # save raw off-policy evaluation results. 172 | with open(raw_results_path / f"ratio={ratio}.pkl", mode="wb") as f: 173 | pickle.dump(ope_results, f) 174 | for estimator in estimator_names: 175 | rel_rmse_results[estimator][ratio] = calc_rel_rmse( 176 | policy_value_true=ope_results["ground_truth"], 177 | policy_value_estimated=ope_results[estimator], 178 | ) 179 | print(f"finish ratio={ratio}: {np.round((time.time() - start) / 60, 2)}min") 180 | print("=" * 50) 181 | 182 | # save results of the evaluation of OPE 183 | rel_rmse_results_df = pd.DataFrame(rel_rmse_results).drop("ground_truth", 1) 184 | rel_rmse_results_df.T.round(5).to_csv(log_path / f"rel_rmse.csv") 185 | --------------------------------------------------------------------------------