├── BernoulliMAB.py ├── ExponentialMAB.py ├── GaussianMAB.py ├── MAB.py ├── README.md ├── Trunc_GaussianMAB.py ├── __main__.py ├── arms.py ├── paper └── SDA_final.pdf ├── tracker.py ├── utils.py └── xp_helpers.py /BernoulliMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | from utils import rollavg_bottlneck, rd_choice, hypergeom_sample 4 | from scipy.optimize import brentq 5 | 6 | 7 | class BetaBernoulliMAB(GenericMAB): 8 | """ 9 | Bernoulli Bandit Problem 10 | """ 11 | def __init__(self, p): 12 | """ 13 | Initialization 14 | :param p: np.array, true probabilities of success for each arm 15 | """ 16 | # Initialization of arms from GenericMAB 17 | super().__init__(methods=['B']*len(p), p=p) 18 | # Complexity 19 | self.Cp = sum([(self.mu_max-x)/self.kl(x, self.mu_max) for x in self.means if x != self.mu_max]) 20 | 21 | @staticmethod 22 | def kl(x, y): 23 | """ 24 | Implementation of the Kullback-Leibler divergence for two Bernoulli distributions (B(x),B(y)) 25 | :param x: float 26 | :param y: float 27 | :return: float, KL(B(x), B(y)) 28 | """ 29 | if x == y: 30 | return 0 31 | elif x > 1-1e-6: 32 | return 0 33 | elif y == 0 or y == 1: 34 | return np.inf 35 | elif x < 1e-6: 36 | return (1-x) * np.log((1-x)/(1-y)) 37 | return x * np.log(x/y) + (1-x) * np.log((1-x)/(1-y)) 38 | 39 | def TS(self, T): 40 | """ 41 | Beta-Bernoulli Thompson Sampling 42 | :param T: Time Horizon 43 | :return: Tracker2 object 44 | """ 45 | def f(x): 46 | return np.random.beta(x.Sa+1, x.Na-x.Sa+1) 47 | return self.Index_Policy(T, f) 48 | 49 | def BESA_duel(self, indices, tracker): 50 | """ 51 | More efficient implementation of the BESA duel in the Bernoulli case 52 | :param indices: indices of arms of the duel 53 | :param tracker: Tracker2 object 54 | :return: winner of the duel 55 | """ 56 | i, j = indices[0], indices[1] 57 | ni, nj = tracker.Na[i], tracker.Na[j] 58 | si, sj = tracker.Sa[i], tracker.Sa[j] 59 | idx_min = np.argmin([ni, nj]) 60 | if idx_min == 0: 61 | sj = hypergeom_sample(sj, nj, ni) 62 | else: 63 | si = hypergeom_sample(si, ni, nj) 64 | return indices[rd_argmax(np.array([si, sj]))] 65 | 66 | def SSMC(self, T, explo_func=lambda x: np.sqrt(np.log(x))): 67 | """ 68 | More efficient implementation of SSMC for the Bernoulli case 69 | :param T: Time Horizon 70 | :param explo_func: Forced exploration function 71 | :return: Tracker2 object 72 | """ 73 | tr = Tracker2(self.means, T, store_rewards_arm=True) 74 | r, t, l = 1, 0, -1 75 | while t < self.nb_arms: 76 | arm = t 77 | tr.update(t, arm, self.MAB[arm].sample()[0]) 78 | t += 1 79 | while t < T: 80 | l_prev = l 81 | l = get_leader(tr.Na, tr.Sa, l_prev) 82 | t_prev, forced_explo = t, explo_func(r) 83 | 84 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
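# SSMC duel step (Bernoulli-optimized version of MAB.SSMC): indic currently flags challengers
# forced to be drawn (fewer pulls than the leader and below the forced-exploration threshold).
# The loop below also flags any challenger j whose empirical mean Sa[j]/Na[j] is at least lead_min,
# i.e. the mean of the leader's last Na[j] rewards when the leader is unchanged, or the minimum
# over all sliding windows of size Na[j] of the leader's history after a leader change.
# These duels are only re-evaluated when the leader changed or its last reward was 0.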
85 | if l_prev != l or tr.rewards_arm[l][-1] == 0: 86 | for j in range(self.nb_arms): 87 | if indic[j] == 0 and j != l: 88 | if l_prev == l: 89 | lead_min = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 90 | else: 91 | lead_min = rollavg_bottlneck(tr.rewards_arm[l], int(tr.Na[j]))[(int(tr.Na[j])-1):].min() 92 | if tr.Sa[j]/tr.Na[j] >= lead_min and t < T: 93 | indic[j] = 1 94 | if indic.sum() == 0: 95 | tr.update(t, l, self.MAB[l].sample()[0]) 96 | t += 1 97 | else: 98 | to_draw = np.where(indic == 1)[0] 99 | np.random.shuffle(to_draw) 100 | for i in to_draw: 101 | if t < T: 102 | tr.update(t, i, self.MAB[i].sample()[0]) 103 | t += 1 104 | r += 1 105 | return tr 106 | 107 | def PHE(self, T, a, distrib=None): 108 | """ 109 | More efficient version of PHE for Bernoulli bandits 110 | :param T: Time Horizon 111 | :param a: proportion of perturbed history. a=1 -> same proportion, a=0-> no perturbed history 112 | :param distrib: distribution of the perturbed history 113 | :return: Tracker2 object 114 | """ 115 | tr = Tracker2(self.means, T, store_rewards_arm=True) 116 | for t in range(T): 117 | if t < self.nb_arms: 118 | arm = t 119 | else: 120 | idx_mean = np.zeros(self.nb_arms) 121 | for k in range(self.nb_arms): 122 | ph = np.random.binomial(n=np.int(a*tr.Na[k])+1, p=0.5) 123 | idx_mean[k] = (tr.Sa[k]+ph)/(tr.Na[k]+np.int(a*tr.Na[k])+1) 124 | arm = rd_argmax(idx_mean) 125 | reward = self.MAB[arm].sample()[0] 126 | tr.update(t, arm, reward) 127 | return tr 128 | 129 | def kl_ucb(self, T, f): 130 | """ 131 | Implementation of the KL-UCB algorithm for Bernoulli bandits 132 | :param T: Time Horizon 133 | :param f: Function in the minimization problem 134 | :return: Tracker2 object 135 | """ 136 | def index_func(x): 137 | res = [] 138 | for k in range(self.nb_arms): 139 | if x.Sa[k]/x.Na[k] < 1e-6: 140 | res.append(1) 141 | elif x.Sa[k]/x.Na[k] > 1-1e-6: 142 | res.append(1) 143 | else: 144 | def kl_shift(y): 145 | return self.kl(x.Sa[k]/x.Na[k], y) - f(x.t)/x.Na[k] 146 | res.append(brentq(kl_shift, x.Sa[k]/x.Na[k]-1e-7, 1 - 1e-10)) 147 | return np.array(res) 148 | return self.Index_Policy(T, index_func) 149 | -------------------------------------------------------------------------------- /ExponentialMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | from scipy.optimize import brentq 4 | 5 | 6 | class ExponentialMAB(GenericMAB): 7 | """ 8 | Gaussian Bandit Problem 9 | """ 10 | def __init__(self, p): 11 | """ 12 | Initialization 13 | :param p: np.array, true values of (mu, sigma) for each arm with mean sampled from N(mu, sigma) 14 | """ 15 | # Initialization of arms from GenericMAB 16 | super().__init__(methods=['Exp']*len(p), p=p) 17 | # Parameters used for stop learning policy 18 | self.Cp = sum([(self.mu_max-x)/self.kl(1/x, 1/self.mu_max) for x in self.means if x != self.mu_max]) 19 | 20 | 21 | @staticmethod 22 | def kl(x, y): 23 | """ 24 | Implementation of the Kullback-Leibler divergence for two Exponential Distributions 25 | WARNING: x, y are the inverse of the means of the distributions 26 | :param x: float 27 | :param y: float 28 | :return: float, KL(E(x), E(y)) 29 | """ 30 | return np.log(x/y) + y/x - 1 31 | 32 | def TS(self, T): 33 | """ 34 | Thompson Sampling with known variance, and an inproper uniform prior 35 | on the mean 36 | :param T: Time Horizon 37 | :return: Tracker2 object 38 | """ 39 | def f(x): 40 | return 1/np.random.gamma(shape=x.Na, scale=1/x.Sa) 41 | return self.Index_Policy(T, f) 42 
| 43 | def kl_ucb(self, T, f): 44 | """ 45 | Implementation of KL-UCB for Exponential distributions 46 | :param T: Time Horizon 47 | :param f: function in the minimization problem 48 | :return: Tracker2 object 49 | """ 50 | def index_func(x): 51 | res = [] 52 | for k in range(self.nb_arms): 53 | mu = x.Sa[k] / x.Na[k] 54 | def kl_shift(y): 55 | return np.log(y/mu) + mu/y-1 - f(x.t) / x.Na[k] 56 | res.append(brentq(kl_shift, mu*np.exp(f(x.t)/x.Na[k]), mu*np.exp(f(x.t)/x.Na[k]+1))) 57 | return np.array(res) 58 | 59 | return self.Index_Policy(T, index_func) 60 | 61 | def IMED(self, T): 62 | """ 63 | Implementation of IMED for Exponential distributions 64 | :param T: Time Horizon 65 | :return: Tracker2 object 66 | """ 67 | def index_func(x): 68 | mu_max = np.max(x.Sa/x.Na) 69 | idx = [] 70 | for k in range(self.nb_arms): 71 | idx.append(x.Na[k]*self.kl(mu_max, x.Sa[k]/x.Na[k])+np.log(x.Na[k])) 72 | return -np.array(idx) 73 | return self.Index_Policy(T, index_func) 74 | -------------------------------------------------------------------------------- /GaussianMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | 4 | 5 | class GaussianMAB(GenericMAB): 6 | """ 7 | Gaussian Bandit Problem 8 | """ 9 | def __init__(self, p): 10 | """ 11 | Initialization 12 | :param p: np.array, true values of 1/lambda for each arm 13 | """ 14 | # Initialization of arms from GenericMAB 15 | super().__init__(methods=['G']*len(p), p=p) 16 | # Parameters used for stop learning policy 17 | self.best_arm = self.get_best_arm() 18 | # Careful: Cp is the bound only with same variance for each arm 19 | self.Cp = sum([(self.mu_max - arm.mu) / self.kl2(arm.mu, self.mu_max, arm.eta, self.MAB[self.best_arm].eta) 20 | for arm in self.MAB if arm.mu != self.mu_max]) 21 | 22 | def get_best_arm(self): 23 | ind = np.nonzero(self.means == np.amax(self.means))[0] 24 | std = [self.MAB[arm].eta for arm in ind] 25 | u = np.argmin(std) 26 | return ind[u] 27 | 28 | @staticmethod 29 | def kl(mu1, mu2): 30 | """ 31 | Implementation of the Kullback-Leibler divergence for two Gaussian N(mu, 1) 32 | :param x: float 33 | :param y: float 34 | :return: float, KL(B(x), B(y)) 35 | """ 36 | return (mu2-mu1)**2/2 37 | 38 | @staticmethod 39 | def kl2(mu1, mu2, sigma1, sigma2): 40 | """ 41 | Implementation of the Kullback-Leibler divergence for two Gaussian with different std 42 | :param x: float 43 | :param y: float 44 | :return: float, KL(B(x), B(y)) 45 | """ 46 | return np.log(sigma2/sigma1) + 0.5 * (sigma1**2/sigma2**2 + (mu2-mu1)**2/sigma2**2 - 1) 47 | 48 | def TS(self, T): 49 | """ 50 | Thompson Sampling for Gaussian distributions with known variance, and an inproper uniform prior 51 | on the mean 52 | :param T: Time Horizon 53 | :return: Tracker2 object 54 | """ 55 | eta = np.array([arm.eta for arm in self.MAB]) 56 | 57 | def f(x): 58 | return np.random.normal(x.Sa/x.Na, eta/np.sqrt(x.Na)) 59 | return self.Index_Policy(T, f) 60 | 61 | def kl_ucb(self, T, f): 62 | """ 63 | Implementation of KL-UCB for Gaussian bandits 64 | :param T: Time Horizon 65 | :param rho: coefficient for the upper bound 66 | :return: 67 | """ 68 | def index_func(x): 69 | return x.Sa / x.Na + np.sqrt(f(x.t)*2 / x.Na) 70 | return self.Index_Policy(T, index_func) 71 | -------------------------------------------------------------------------------- /MAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | import numpy as np 3 | 
import arms 4 | from tqdm import tqdm 5 | from utils import rd_argmax, rd_choice, rollavg_bottlneck, get_leader 6 | from tracker import Tracker2 7 | from utils import get_SSMC_star_min 8 | #import sobol_seq # for LDS-SDA 9 | 10 | mapping = {'B': arms.ArmBernoulli, 'beta': arms.ArmBeta, 'F': arms.ArmFinite, 'G': arms.ArmGaussian, 11 | 'Exp': arms.ArmExponential, 'dirac': arms.dirac, 'TG': arms.ArmTG} 12 | 13 | 14 | def default_exp(x): 15 | """ 16 | :param x: float 17 | :return: default exploration function for SDA algorithms 18 | """ 19 | return 0 20 | # return np.sqrt(np.log(x)) 21 | 22 | 23 | class GenericMAB: 24 | """ 25 | Generic class to simulate a Multi-Arm Bandit problem 26 | """ 27 | def __init__(self, methods, p): 28 | """ 29 | Initialization of the arms 30 | :param methods: string, probability distribution of each arm 31 | :param p: np.array or list, parameters of the probability distribution of each arm 32 | """ 33 | self.MAB = self.generate_arms(methods, p) 34 | self.nb_arms = len(self.MAB) 35 | self.means = np.array([el.mean for el in self.MAB]) 36 | self.mu_max = np.max(self.means) 37 | self.mc_regret = None 38 | 39 | @staticmethod 40 | def generate_arms(methods, p): 41 | """ 42 | Method for generating different arms 43 | :param methods: string, probability distribution of each arm 44 | :param p: np.array or list, parameters of the probability distribution of each arm 45 | :return: list of class objects, list of arms 46 | """ 47 | arms_list = list() 48 | for i, m in enumerate(methods): 49 | args = [p[i]] + [[np.random.randint(1, 312414)]] 50 | args = sum(args, []) if type(p[i]) == list else args 51 | alg = mapping[m] 52 | arms_list.append(alg(*args)) 53 | return arms_list 54 | 55 | @staticmethod 56 | def kl(x, y): 57 | return None 58 | 59 | def MC_regret(self, method, N, T, param_dic, store_step=-1): 60 | """ 61 | Average Regret on a Number of Experiments 62 | :param method: string, method used (UCB, Thomson Sampling, etc..) 63 | :param N: int, number of independent experiments 64 | :param T: int, time horizon 65 | :param param_dic: dict, parameters for the different methods 66 | """ 67 | mc_regret = np.zeros(T) 68 | store = store_step > 0 69 | if store: 70 | all_regret = np.zeros((np.arange(T)[::store_step].shape[0], N)) 71 | alg = self.__getattribute__(method) 72 | for i in tqdm(range(N), desc='Computing ' + str(N) + ' simulations'): 73 | tr = alg(T, **param_dic) 74 | regret = tr.regret() 75 | mc_regret += regret 76 | if store: 77 | all_regret[:, i] = regret[::store_step] 78 | if store: 79 | return mc_regret / N, all_regret 80 | return mc_regret / N 81 | 82 | def DummyPolicy(self, T): 83 | """ 84 | Implementation of a random policy consisting in randomly choosing one of the available arms. 
Only useful 85 | for checking that the behavior of the different policies is normal 86 | :param T: int, time horizon 87 | :return: means, arm sequence 88 | """ 89 | tr = Tracker2(self.means, T) 90 | tr.arm_sequence = np.random.randint(self.nb_arms, size=T) 91 | return tr 92 | 93 | def ExploreCommit(self, T, m): 94 | """ 95 | Implementation of Explore-then-Commit algorithm 96 | :param T: int, time horizon 97 | :param m: int, number of rounds before choosing the best action 98 | :return: np.arrays, reward obtained by the policy and sequence of chosen arms 99 | """ 100 | tr = Tracker2(self.means, T) 101 | for t in range(m * self.nb_arms): 102 | arm = t % self.nb_arms 103 | tr.update(t, arm, self.MAB[arm].sample()[0]) 104 | arm = rd_argmax(tr.Sa / tr.Na) 105 | for t in range(m * self.nb_arms, T): 106 | tr.update(t, arm, self.MAB[arm].sample()[0]) 107 | return tr 108 | 109 | def Index_Policy(self, T, index_func, start_explo=1, store_rewards_arm=False): 110 | """ 111 | Implementation of UCB1 algorithm 112 | :param T: int, time horizon 113 | :param start_explo: number of time to explore each arm before comparing index 114 | :param index_func: function which computes the index with the tracker 115 | :return: np.arrays, reward obtained by the policy and sequence of chosen arms 116 | """ 117 | tr = Tracker2(self.means, T, store_rewards_arm) 118 | for t in range(T): 119 | if t < self.nb_arms*start_explo: 120 | arm = t % self.nb_arms 121 | else: 122 | arm = rd_argmax(index_func(tr)) 123 | reward = self.MAB[arm].sample()[0] 124 | tr.update(t, arm, reward) 125 | return tr 126 | 127 | def UCB1(self, T, rho=1.): 128 | """ 129 | :param T: Time Horizon 130 | :param rho: coefficient for the upper bound 131 | :return: 132 | """ 133 | def index_func(x): 134 | return x.Sa / x.Na + rho * np.sqrt(np.log(x.t + 1)*2 / x.Na) 135 | return self.Index_Policy(T, index_func) 136 | 137 | def BESA_duel(self, indices, tracker): 138 | """ 139 | :param indices: indices of the 2 competing arms 140 | :param tracker: Tracker2 object 141 | :return: winner arm of a single dual in BESA 142 | """ 143 | i, j = indices[0], indices[1] 144 | r_i, r_j = tracker.rewards_arm[i], tracker.rewards_arm[j] 145 | ni, nj = tracker.Na[i], tracker.Na[j] 146 | idx_max = rd_argmax(np.array([ni, nj])) 147 | if idx_max == 1: 148 | r_j = rd_choice(np.array(r_j), size=int(ni)) 149 | else: 150 | r_i = rd_choice(np.array(r_i), size=int(nj)) 151 | return indices[rd_argmax(np.array([np.mean(r_i), np.mean(r_j)]))] 152 | 153 | def BESA_step(self, tracker): 154 | """ 155 | :param tracker: Tracker2 object 156 | :return: Implementation of the tournament in BESA 157 | """ 158 | indices = list(np.arange(self.nb_arms)) 159 | while len(indices) > 1: 160 | np.random.shuffle(indices) # Changement pour enlever le biais 161 | winners = [] 162 | if len(indices) % 2 == 1: 163 | winners.append(indices[-1]) 164 | for i in range(len(indices)//2): 165 | winners.append(self.BESA_duel((indices[2*i], indices[2*i+1]), tracker)) 166 | indices = winners 167 | return indices[0] 168 | 169 | def BESA(self, T, n0=1): 170 | """ 171 | Implementation of the BESA algorithm 172 | :param T: Time Horizon 173 | :param n0: Number of time to pull each arm before starting the algorithm 174 | :return: Tracker object with the results of the run 175 | """ 176 | tr = Tracker2(self.means, T, store_rewards_arm=True) 177 | for t in range(T): 178 | if t < self.nb_arms * n0: 179 | arm = t % self.nb_arms 180 | else: 181 | arm = self.BESA_step(tr) 182 | tr.update(t, arm, self.MAB[arm].sample()[0]) 183 | 
return tr 184 | 185 | def SSMC(self, T, explo_func=lambda x: np.sqrt(np.log(x))): 186 | """ 187 | Implementation of the SSMC algorithm 188 | :param T: Time Horizon 189 | :param explo_func: Forced exploration function 190 | :return: Tracker object with the results of the run 191 | """ 192 | tr = Tracker2(self.means, T, store_rewards_arm=True) 193 | r, t, l = 1, 0, -1 194 | while t < self.nb_arms: 195 | arm = t 196 | tr.update(t, arm, self.MAB[arm].sample()[0]) 197 | t += 1 198 | while t < T: 199 | l_prev = l 200 | l = get_leader(tr.Na, tr.Sa, l_prev) 201 | t_prev, forced_explo = t, explo_func(r) 202 | 203 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 204 | for j in range(self.nb_arms): 205 | if indic[j] == 0 and j != l: 206 | if l_prev == l: 207 | lead_min = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 208 | else: 209 | lead_min = rollavg_bottlneck(tr.rewards_arm[l], int(tr.Na[j]))[(int(tr.Na[j])-1):].min() 210 | if tr.Sa[j]/tr.Na[j] >= lead_min and t < T: 211 | indic[j] = 1 212 | if indic.sum() == 0: 213 | tr.update(t, l, self.MAB[l].sample()[0]) 214 | t += 1 215 | else: 216 | to_draw = np.where(indic == 1)[0] 217 | np.random.shuffle(to_draw) 218 | for i in to_draw: 219 | if t < T: 220 | tr.update(t, i, self.MAB[i].sample()[0]) 221 | t += 1 222 | r += 1 223 | return tr 224 | 225 | def SSMC_star(self, T, explo_func=default_exp): 226 | """ 227 | Implemention of SSMC*, a slightly modified version of SSMC 228 | :param T: Time Horizon 229 | :param explo_func: Forced Exploration function 230 | :return: Tracker object with the results of the run 231 | """ 232 | tr = Tracker2(self.means, T, store_rewards_arm=True) 233 | r, t, l = 1, 0, -1 234 | while t < self.nb_arms: 235 | arm = t 236 | tr.update(t, arm, self.MAB[arm].sample()[0]) 237 | t += 1 238 | while t < T: 239 | l_prev = l 240 | l = get_leader(tr.Na, tr.Sa, l_prev) 241 | t_prev, forced_explo = t, explo_func(r) 242 | all_reshape_size = np.zeros(self.nb_arms) 243 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
244 | for j in range(self.nb_arms): 245 | reshape_size = len(tr.rewards_arm[l]) // tr.Na[j] 246 | if indic[j] == 0 and j != l: 247 | if l_prev == l and reshape_size == all_reshape_size[j]: 248 | lead_min = np.inf 249 | elif l_prev == l: 250 | lead_min = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 251 | else: 252 | lead_min = get_SSMC_star_min(tr.rewards_arm[l], 253 | int(tr.Na[j]), int(reshape_size)) 254 | if tr.Sa[j]/tr.Na[j] >= lead_min and t < T: 255 | indic[j] = 1 256 | all_reshape_size[j] = reshape_size 257 | if indic.sum() == 0: 258 | tr.update(t, l, self.MAB[l].sample()[0]) 259 | t += 1 260 | else: 261 | to_draw = np.where(indic == 1)[0] 262 | np.random.shuffle(to_draw) 263 | for i in to_draw: 264 | if t < T: 265 | tr.update(t, i, self.MAB[i].sample()[0]) 266 | t += 1 267 | r += 1 268 | return tr 269 | 270 | def non_parametric_TS(self, T, upper_bound=1): 271 | """ 272 | Implementation of the Non-parametric Thompson Sampling algorithm 273 | :param T: Time Horizon 274 | :param upper_bound: Upper bound for the reward 275 | :return: Tracker object with the results of the run 276 | """ 277 | tr = Tracker2(self.means, T) 278 | if upper_bound is not None: 279 | X = [[upper_bound] for _ in range(self.nb_arms)] 280 | tr.Na = tr.Na + 1 281 | for t in range(T): 282 | V = np.zeros(self.nb_arms) 283 | for i in range(self.nb_arms): 284 | V[i] = np.inner(np.random.dirichlet(np.ones(int(tr.Na[i]))), np.array(X[i])) 285 | arm = rd_argmax(V) 286 | tr.update(t, arm, self.MAB[arm].sample()[0]) 287 | X[arm].append(tr.reward[t]) 288 | return tr 289 | 290 | def WR_SDA(self, T, explo_func=default_exp): 291 | """ 292 | Implementation of WR-SDA 293 | :param T: Time Horizon 294 | :param explo_func: Forced exploration function 295 | :return: Tracker object with the results of the run 296 | """ 297 | tr = Tracker2(self.means, T, store_rewards_arm=True) 298 | r, t, l = 1, 0, -1 299 | while t < self.nb_arms: 300 | arm = t 301 | tr.update(t, arm, self.MAB[arm].sample()[0]) 302 | t += 1 303 | while t < T: 304 | l_prev = l 305 | l = get_leader(tr.Na, tr.Sa, l_prev) 306 | t_prev, forced_explo = t, explo_func(r) 307 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 308 | for j in range(self.nb_arms): 309 | if indic[j] == 0 and j != l: 310 | if self.BESA_duel([l, j], tracker=tr) == j: 311 | indic[j] = 1 312 | if indic.sum() == 0: 313 | tr.update(t, l, self.MAB[l].sample()[0]) 314 | t += 1 315 | else: 316 | to_draw = np.where(indic == 1)[0] 317 | np.random.shuffle(to_draw) 318 | for i in to_draw: 319 | if t < T: 320 | tr.update(t, i, self.MAB[i].sample()[0]) 321 | t += 1 322 | r += 1 323 | return tr 324 | 325 | def RB_SDA(self, T, explo_func=default_exp): 326 | """ 327 | Implementation of RB-SDA 328 | :param T: Time Horizon 329 | :param explo_func: Forced exploration function 330 | :return: Tracker object with the results of the run 331 | """ 332 | tr = Tracker2(self.means, T, store_rewards_arm=True) 333 | r, t, l = 1, 0, -1 334 | while t < self.nb_arms: 335 | arm = t 336 | tr.update(t, arm, self.MAB[arm].sample()[0]) 337 | t += 1 338 | while t < T: 339 | l_prev = l 340 | l = get_leader(tr.Na, tr.Sa, l_prev) 341 | t_prev, forced_explo = t, explo_func(r) 342 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
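# Random Block sampler (RB-SDA): for each challenger j that is not already flagged and has fewer
# pulls than the leader, draw a uniformly random starting index tj and compare the challenger's
# empirical mean with the mean of the leader's block of Na[j] consecutive rewards starting at tj.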
343 | for j in range(self.nb_arms): 344 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 345 | tj = np.random.randint(tr.Na[l]-tr.Na[j]) 346 | lead_mean = np.mean(tr.rewards_arm[l][tj: tj+int(tr.Na[j])]) 347 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 348 | indic[j] = 1 349 | if indic.sum() == 0: 350 | tr.update(t, l, self.MAB[l].sample()[0]) 351 | t += 1 352 | else: 353 | to_draw = np.where(indic == 1)[0] 354 | np.random.shuffle(to_draw) 355 | for i in to_draw: 356 | if t < T: 357 | tr.update(t, i, self.MAB[i].sample()[0]) 358 | t += 1 359 | r += 1 360 | return tr 361 | 362 | def IB_SDA(self, T, explo_func=default_exp): 363 | """ 364 | Implementation of IB-SDA (Independent Blocks-SDA): an algorithm not introduced in the paper 365 | using a SWR sampler which discards elements that were previously drawn until there are no 366 | more available elements. It is a way to enforce the diversity of sample. 367 | We did not present this sampler as it is not an independent sampler. 368 | :param T: Time Horizon 369 | :param explo_func: Forced exploration function 370 | :return: Tracker object with the results of the run 371 | """ 372 | tr = Tracker2(self.means, T, store_rewards_arm=True) 373 | r, t, l = 1, 0, -1 374 | while t < self.nb_arms: 375 | arm = t 376 | tr.update(t, arm, self.MAB[arm].sample()[0]) 377 | t += 1 378 | while t < T: 379 | l_prev = l 380 | l = get_leader(tr.Na, tr.Sa, l_prev) 381 | t_prev, forced_explo = t, explo_func(r) 382 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 383 | if l_prev != l: 384 | weight_dic = np.ones((self.nb_arms, int(tr.Na[l]))) 385 | else: 386 | if weight_dic.shape[1] < tr.Na[l]: 387 | weight_dic = np.concatenate([weight_dic, 388 | np.ones((self.nb_arms, 1))], axis=1) 389 | for k in range(self.nb_arms): 390 | if k != l and weight_dic[k].sum() < tr.Na[k]: 391 | weight_dic[k] = np.ones(int(tr.Na[l])) 392 | for j in range(self.nb_arms): 393 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 394 | besa_indices = np.random.choice( 395 | np.arange(tr.Na[l]).astype('int'), size=int(tr.Na[j]), 396 | replace=False, p=weight_dic[j]/weight_dic[j].sum()) 397 | lead_mean = np.mean(np.array(tr.rewards_arm[l])[besa_indices]) 398 | weight_dic[j][besa_indices] = 0 399 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 400 | indic[j] = 1 401 | if indic.sum() == 0: 402 | tr.update(t, l, self.MAB[l].sample()[0]) 403 | t += 1 404 | else: 405 | to_draw = np.where(indic == 1)[0] 406 | np.random.shuffle(to_draw) 407 | for i in to_draw: 408 | if t < T: 409 | tr.update(t, i, self.MAB[i].sample()[0]) 410 | t += 1 411 | r += 1 412 | return tr 413 | 414 | def LB_SDA(self, T, explo_func=default_exp): 415 | """ 416 | Implementation of the LB-SDA algorithm 417 | :param T: Time Horizon 418 | :param explo_func: Forced exploration function 419 | :return: Tracker object with the results of the run 420 | """ 421 | tr = Tracker2(self.means, T, store_rewards_arm=True) 422 | r, t, l = 1, 0, -1 423 | while t < self.nb_arms: 424 | arm = t 425 | tr.update(t, arm, self.MAB[arm].sample()[0]) 426 | t += 1 427 | while t < T: 428 | l_prev = l 429 | l = get_leader(tr.Na, tr.Sa, l_prev) 430 | t_prev, forced_explo = t, explo_func(r) 431 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
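# Last Block sampler (LB-SDA): each challenger j with fewer pulls than the leader is compared
# with the mean of the leader's Na[j] most recent rewards, and is flagged for drawing if its own
# empirical mean is at least as large.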
432 | for j in range(self.nb_arms): 433 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 434 | lead_mean = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 435 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 436 | indic[j] = 1 437 | if indic.sum() == 0: 438 | tr.update(t, l, self.MAB[l].sample()[0]) 439 | t += 1 440 | else: 441 | to_draw = np.where(indic == 1)[0] 442 | np.random.shuffle(to_draw) 443 | for i in to_draw: 444 | if t < T: 445 | tr.update(t, i, self.MAB[i].sample()[0]) 446 | t += 1 447 | r += 1 448 | return tr 449 | 450 | def LDS_SDA(self, T, explo_func=default_exp): 451 | """ 452 | Implementation of the LDS-SDA algorithm using a Sobol sequence 453 | :param T: Time Horizon 454 | :param explo_func: Forced exploration function 455 | :return: Tracker object with the results of the run 456 | """ 457 | tr = Tracker2(self.means, T, store_rewards_arm=True) 458 | r, t, l = 1, 0, -1 459 | while t < self.nb_arms: 460 | arm = t 461 | tr.update(t, arm, self.MAB[arm].sample()[0]) 462 | t += 1 463 | while t < T: 464 | l_prev = l 465 | l = get_leader(tr.Na, tr.Sa, l_prev) 466 | t_prev, forced_explo = t, explo_func(r) 467 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 468 | t_b = int(sobol_seq.i4_sobol(1, seed=r)[0][0]*tr.Na[l]) 469 | for j in range(self.nb_arms): 470 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 471 | b_0 = tr.rewards_arm[l][t_b:t_b+int(tr.Na[j])] 472 | if len(b_0) < tr.Na[j]: 473 | b_1 = tr.rewards_arm[l][:int(tr.Na[j])-int(tr.Na[l]-t_b)] 474 | lead_mean = (np.sum(b_0)+np.sum(b_1))/tr.Na[j] 475 | else: 476 | lead_mean = np.mean(b_0) 477 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 478 | indic[j] = 1 479 | if indic.sum() == 0: 480 | tr.update(t, l, self.MAB[l].sample()[0]) 481 | t += 1 482 | else: 483 | to_draw = np.where(indic == 1)[0] 484 | np.random.shuffle(to_draw) 485 | for i in to_draw: 486 | if t < T: 487 | tr.update(t, i, self.MAB[i].sample()[0]) 488 | t += 1 489 | r += 1 490 | return tr 491 | 492 | def vanilla_bootstrap(self, T): 493 | """ 494 | Implementation of the Vanilla Bootstrap bandit algorithm 495 | :param T: Time Horizon 496 | :return: Tracker object with the results of the run 497 | """ 498 | tr = Tracker2(self.means, T, store_rewards_arm=True) 499 | for t in range(T): 500 | if t < self.nb_arms: 501 | arm = t 502 | else: 503 | bts_mean = np.zeros(self.nb_arms) 504 | for k in range(self.nb_arms): 505 | bts_mean[k] = np.random.choice(tr.rewards_arm[k], size=int(tr.Na[k]), replace=True).mean() 506 | arm = rd_argmax(bts_mean) 507 | reward = self.MAB[arm].sample()[0] 508 | tr.update(t, arm, reward) 509 | return tr 510 | 511 | def PHE(self, T, a, distrib): 512 | """ 513 | Implementation of the Perturbed History Exploration algorithm 514 | :param T: Time Horizon 515 | :param a: proportion of perturbed history. 
a=1 -> same proportion, a=0-> no perturbed history 516 | :param distrib: Distribution of the fake rewards 517 | :return: Tracker2 object 518 | """ 519 | tr = Tracker2(self.means, T, store_rewards_arm=True) 520 | for t in range(T): 521 | if t < self.nb_arms: 522 | arm = t 523 | else: 524 | idx_mean = np.zeros(self.nb_arms) 525 | for k in range(self.nb_arms): 526 | ph = distrib.rvs(size=np.int(a*tr.Na[k])+1) 527 | idx_mean[k] = (tr.Sa[k]+ph.sum())/(tr.Na[k]+np.int(a*tr.Na[k])+1) 528 | arm = rd_argmax(idx_mean) 529 | reward = self.MAB[arm].sample()[0] 530 | tr.update(t, arm, reward) 531 | return tr 532 | 533 | def ReBoot(self, T, sigma, weight_func=np.random.normal): 534 | """ 535 | Implementation of the Reboot algorithm 536 | :param T: Time Horizon 537 | :param sigma: sigma and -sigma are added to the rewards list before bootstrapping 538 | :param weight_func: a function of mean 0 and std 1 539 | :return: Tracker2 object 540 | """ 541 | def index_func(x): 542 | avg = x.Sa/x.Na 543 | idx = np.zeros(self.nb_arms) 544 | for k in range(self.nb_arms): 545 | s = int(x.Na[k]) + 2 546 | e = np.zeros(s) 547 | e[:-2] = np.array(x.rewards_arm[k])-avg[k] 548 | e[-2] = np.sqrt(s) * sigma 549 | e[-1] = -np.sqrt(s) * sigma 550 | w = weight_func(size=s) 551 | idx[k] = avg[k]+np.mean(w*e) 552 | return idx 553 | return self.Index_Policy(T, index_func, store_rewards_arm=True) 554 | 555 | def ReBootG(self, T, sigma): 556 | """ 557 | More efficient version of ReBoot with the gaussian bootstrap 558 | :param T: Time Horizon 559 | :param sigma: standard deviation of perturbation 560 | :return: Tracker2 object 561 | """ 562 | def index_func(x): 563 | avg = x.Sa/x.Na 564 | idx = np.zeros(self.nb_arms) 565 | for k in range(self.nb_arms): 566 | s = int(x.Na[k]) + 2 567 | e = np.zeros(s) 568 | e[:-2] = np.array(x.rewards_arm[k])-avg[k] 569 | e[-2] = np.sqrt(s) * sigma 570 | e[-1] = -np.sqrt(s) * sigma 571 | idx[k] = avg[k]+np.random.normal(loc=0, scale=1/(e.shape[0])*np.sqrt((e**2).sum())) 572 | return idx 573 | return self.Index_Policy(T, index_func, store_rewards_arm=True) 574 | 575 | def IMED(self, T): 576 | """ 577 | Implementation of the IMED algorithm 578 | :param T: Time Horizon 579 | :return: Tracker2 object 580 | """ 581 | def index_func(x): 582 | mu_max = np.max(x.Sa/x.Na) 583 | idx = [] 584 | for k in range(self.nb_arms): 585 | idx.append(x.Na[k]*self.kl(x.Sa[k]/x.Na[k], mu_max)+np.log(x.Na[k])) 586 | return -np.array(idx) 587 | return self.Index_Policy(T, index_func) 588 | 589 | def Bootstrapped_TS(self, T, prior, M): 590 | """ 591 | Implementation of the Bootstrapped Thompson Sampling (Osband et al., 2017) 592 | :param T: Time Horizon 593 | :param prior: prior for the fake history 594 | :param M: number of fake samples at each step 595 | :return: Tracker2 object 596 | """ 597 | # éventuellement rajouter l'algo de bootstrap en param. 
For now: with replacement 598 | def index_func(x): 599 | idx = [] 600 | for k in range(self.nb_arms): 601 | artificial_hist = list(prior(size=M)) 602 | n_tot = int(M + x.Na[k]) 603 | bts_sample = np.random.choice(x.rewards_arm[k]+artificial_hist, replace=True, size=n_tot) 604 | idx.append(np.mean(bts_sample)) 605 | return np.array(idx) 606 | return self.Index_Policy(T, index_func, store_rewards_arm=True) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sub-Sampling Dueling Algorithms, NeurIPS 2020 2 | 3 | This repository contains the code associated with the paper 4 | "Sub-sampling for efficient Non-Parametric Bandit Exploration" presented at NeurIPS 2020. We provide a description of the code structure and a short guide to run some experiments. 5 | 6 | ## How to run experiments 7 | 8 | The *__main__.py* file contains different blocks of code that can be directly executed. This file relies on *xp_helpers.py*, which contains functions to 9 | run two types of experiments: 10 | * Frequentist experiments: the user defines a bandit model and performs a number of runs 11 | of each algorithm on this particular model 12 | * Bayesian experiments: the user defines a prior distribution on the bandit model and draws a number of problems from this distribution. 13 | Then, each bandit algorithm runs once on each of these problems. 14 | 15 | The file is divided into three blocks. The __xp_type__ parameter selects which block to run. Several examples are proposed in each block. 16 | 17 | ## Code Structure 18 | 19 | ### Bandit algorithms 20 | 21 | Our implementation of the multi-armed bandit problem has its core structure in the *MAB.py* file. The initialization of a bandit relies on the *arms.py* file, which defines objects representing the arms and their properties (mean, how to sample rewards, etc.). 22 | 23 | The __GenericMAB__ object is designed as a parent class for any bandit model. Several algorithms are already implemented in this class when they do not have to be calibrated for specific distributions. The function __MC_regret__ runs a single bandit algorithm for a given number of runs and time horizon and returns the average regret. 24 | 25 | The classes in __BernoulliMAB.py__, __GaussianMAB.py__, __ExponentialMAB.py__ and __Trunc_GaussianMAB.py__ 26 | inherit from __GenericMAB__ and adapt it to the Bernoulli, Gaussian, Exponential and Truncated Gaussian distributions. In particular, 27 | they contain the algorithms that are specific to the arms' distribution family, as well as optimized versions of algorithms that are already in __GenericMAB__ (for instance for Bernoulli arms). 28 | 29 | ### Helpers 30 | 31 | The __Tracker2__ object defined in *tracker.py* is used in all of our bandit algorithms to store the state of an experiment during a run. 32 | In particular, it stores the number of pulls, the cumulative reward and the reward history of each arm. 33 | 34 | *utils.py* contains several functions that are useful in the bandit algorithms. Some of these functions use the *numba* package for faster computation. 35 | 36 | Finally, *xp_helpers.py* provides useful functions to perform large-scale experiments in the frequentist and Bayesian settings. Some of these functions use libraries that allow multiprocessing for parallel computation.
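
## Minimal example

As a quick sketch (the arm means and horizon below are just illustrative values, and the empty `param_dic` matches the default parameters used for RB-SDA in *__main__.py*), a single frequentist experiment can be run directly:

```python
import numpy as np
from BernoulliMAB import BetaBernoulliMAB

# Two-armed Bernoulli bandit with means 0.9 and 0.8 (experiment 'xp1' in __main__.py)
model = BetaBernoulliMAB([0.9, 0.8])
# Average regret of RB-SDA over N=100 independent runs with horizon T=1000
avg_regret = model.MC_regret(method='RB_SDA', N=100, T=1000, param_dic={})
print(avg_regret[-1])             # estimated cumulative regret at time T
print(model.Cp * np.log(1000))    # asymptotic (Burnetas-Katehakis) lower bound, for comparison
```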
37 | -------------------------------------------------------------------------------- /Trunc_GaussianMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | from scipy.stats import norm 4 | from tracker import Tracker2 5 | import numpy as np 6 | 7 | 8 | class TruncGaussianMAB(GenericMAB): 9 | """ 10 | Truncated Gaussian Bandit Problem 11 | """ 12 | def __init__(self, p): 13 | """ 14 | Initialization 15 | :param p: np.array, true values of (mu, scale) of the underlying Gaussian for each arm 16 | """ 17 | # Initialization of arms from GenericMAB 18 | super().__init__(methods=['TG']*len(p), p=p) 19 | self.best_arm = self.get_best_arm() 20 | self.Cp = self.get_complexity() 21 | 22 | def get_complexity(self): 23 | """ 24 | :return: the constant in the Burnetas and Katehakis lower bound for TG arms 25 | """ 26 | Cp = 0 27 | for arm in self.MAB: 28 | if arm.mean != self.mu_max: 29 | gap = self.mu_max-arm.mean 30 | kl = self.KL_tg(arm.mu, self.MAB[self.best_arm].mu, arm.scale) 31 | Cp += gap/kl 32 | return Cp 33 | 34 | @staticmethod 35 | def KL_tg(mu1, mu2, scale, step=1e-6): 36 | """ 37 | :param mu1: mean of underlying Gaussian r.v of arm 1 38 | :param mu2: mean of underlying Gaussian r.v of arm 2 39 | :param scale: scale of underlying Gaussian r.v 40 | :param step: precision of numerical integration 41 | :return: KL divergence of two TG arms 42 | """ 43 | phi01 = norm.cdf(0, loc=mu1, scale=scale) 44 | phi02 = norm.cdf(0, loc=mu2, scale=scale) 45 | phi11 = 1 - norm.cdf(1, loc=mu1, scale=scale) 46 | phi12 = 1 - norm.cdf(1, loc=mu2, scale=scale) 47 | kl_1 = phi01 * np.log(phi01 / phi02) + phi11 * np.log(phi11 / phi12) 48 | X = np.arange(0, 1, step) 49 | kl_2 = (norm.pdf(X, loc=mu1, scale=scale) * np.log(norm.pdf( 50 | X, loc=mu1, scale=scale) / norm.pdf(X, loc=mu2, scale=scale))).mean() 51 | return kl_1 + kl_2 52 | 53 | def get_best_arm(self): 54 | """ 55 | :return: best arm of the bandit problem 56 | """ 57 | ind = np.nonzero(self.means == np.amax(self.means))[0] 58 | std = [self.MAB[arm].scale for arm in ind] 59 | u = np.argmin(std) 60 | return ind[u] 61 | 62 | def PHE(self, T, a, distrib=None): 63 | """ 64 | Optimized version of PHE for TG arms 65 | :param T: Time Horizon 66 | :param a: proportion of perturbed history. 
a=1 -> same proportion, a=0-> no perturbed history 67 | :param distrib: distribution of the perturbed history 68 | :return: 69 | """ 70 | tr = Tracker2(self.means, T, store_rewards_arm=True) 71 | for t in range(T): 72 | if t < self.nb_arms: 73 | arm = t 74 | else: 75 | idx_mean = np.zeros(self.nb_arms) 76 | for k in range(self.nb_arms): 77 | ph = np.random.binomial(n=np.int(a*tr.Na[k]), p=0.5) 78 | idx_mean[k] = (tr.Sa[k]+ph)/(tr.Na[k]+np.int(a*tr.Na[k])) 79 | arm = rd_argmax(idx_mean) 80 | reward = self.MAB[arm].sample()[0] 81 | tr.update(t, arm, reward) 82 | return tr 83 | 84 | def TS(self, T): 85 | """ 86 | Implementation of Thompson Sampling with a Binarization trick 87 | :param T: Time Horizon 88 | :return: Tracker2 object 89 | """ 90 | def f(S, N): 91 | return np.random.beta(S+1, N-S+1) 92 | tr = Tracker2(self.means, T) 93 | bin_Sa = np.zeros(self.nb_arms) 94 | for t in range(T): 95 | if t < self.nb_arms: 96 | arm = t % self.nb_arms 97 | else: 98 | arm = rd_argmax(f(bin_Sa, tr.Na)) 99 | reward = self.MAB[arm].sample()[0] 100 | bin_Sa[arm] += np.random.binomial(n=1, p=reward) 101 | tr.update(t, arm, reward) 102 | return tr 103 | 104 | def IMED(self, T): 105 | """ 106 | Implementation of IMED with a binarization trick 107 | :param T: 108 | :return: 109 | """ 110 | def kl_ber(x, y): 111 | if x == y: 112 | return 0 113 | elif x > 1 - 1e-6: 114 | return 0 115 | elif y == 0 or y == 1: 116 | return np.inf 117 | elif x < 1e-6: 118 | return (1 - x) * np.log((1 - x) / (1 - y)) 119 | return x * np.log(x / y) + (1 - x) * np.log((1 - x) / (1 - y)) 120 | 121 | def index_func(bin_Sa, x): 122 | mu_max = np.max(bin_Sa/x.Na) 123 | idx = [] 124 | for k in range(self.nb_arms): 125 | idx.append(x.Na[k]*kl_ber(bin_Sa[k]/x.Na[k], mu_max)+np.log(x.Na[k])) 126 | return -np.array(idx) 127 | tr = Tracker2(self.means, T) 128 | bin_Sa = np.zeros(self.nb_arms) 129 | for t in range(T): 130 | if t < self.nb_arms: 131 | arm = t % self.nb_arms 132 | else: 133 | arm = rd_argmax(index_func(bin_Sa, tr)) 134 | reward = self.MAB[arm].sample()[0] 135 | bin_Sa[arm] += np.random.binomial(n=1, p=reward) 136 | tr.update(t, arm, reward) 137 | return tr -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | from BernoulliMAB import BetaBernoulliMAB 2 | from GaussianMAB import GaussianMAB 3 | from ExponentialMAB import ExponentialMAB 4 | from Trunc_GaussianMAB import TruncGaussianMAB 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from xp_helpers import multiprocess_MC, Bayesian_MC_regret, Bayesian_multiprocess_MC 9 | from time import time 10 | import pickle as pkl 11 | import os 12 | 13 | results_path = "xp_results" 14 | 15 | xp_type = "test" 16 | #xp_type = "frequentist" 17 | #xp_type = "bayesian" 18 | 19 | # Enter the parameters of each algorithms 20 | param = {'SSMC': {}, 'WR_SDA': {}, 'RB_SDA': {}, 'BESA': {}, 21 | 'TS': {}, 'kl_ucb': {'f': np.log}, 'PHE': {'a': 1.1}, 'vanilla_bootstrap': {}, 'LB_SDA': {}, 22 | 'LDS_SDA': {}, 'non_parametric_TS': {}, 'IMED': {}, 'Bootstrapped_TS': {'M': 10, 'prior': np.random.random}, 23 | 'ReBoot': {'sigma': 1}, 'ReBootG': {'sigma': 2.}} 24 | 25 | if __name__ == '__main__' and xp_type == "frequentist": 26 | # Settings of the experiments of the paper 27 | xp_bernoulli = {'xp1': [0.9, 0.8], 'xp2': [0.5, 0.6], 'xp3': [0.01]*3+[0.03]*3+[0.1]+[0.05]*3, 28 | 'xp4': [0.85]*7+[0.9]} 29 | xp_gaussian = {'xp1': [[0., 1.], [0.5, 1.]], 
'xp2': [[0., 1.], [0., 1.], [0., 1.], [0.5, 1.]], 30 | 'xp3': [[0., 1.], [0.5, 1.], [1.0, 1.], [1.5, 1.]]} 31 | xp_expo = {'xp1': [1, 1.5], 'xp2': [0.1, 0.2], 'xp3': [10, 11], 'xp4': [1, 2, 3, 4], 32 | 'xp5': [0.1, 0.2, 0.3, 0.4], 'xp6': [4, 4, 4, 5]} 33 | xp_TG = {'xp1': [[0.5, 0.1], [0.6, 0.1]], 'xp2': [[0., 0.3], [0.2, 0.3]], 34 | 'xp3': [[1.5, 1.], [2., 1.]], 'xp4': [[0.4, 1.], [0.5, 1.], [0.6, 1.], [0.7, 1.]]} 35 | xp_settings = {'TG': xp_TG, 'G': xp_gaussian, 'Exp': xp_expo, 'B': xp_bernoulli} 36 | 37 | # General Parameters 38 | algs = ['TS', 'RB_SDA', 'WR_SDA', 'LB_SDA', 'BESA', 'SSMC'] # Select some Algorithms (check param file for availability) 39 | T, N = 1000, 100 # Time Horizon and Number of runs 40 | step = 25 # If results are saved trajectories are stored for all rounds such that t%step=0 41 | 42 | # Run 43 | xp_family = 'B' 44 | # xp = [(xp_family, x) for x in xp_settings[xp_family]] # To run all xp defined for a family 45 | xp = [('B', 'xp1'), ('G', 'xp2'), ('Exp', 'xp3'), ('TG', 'xp4')] # To run any subset of xp 46 | for x in xp: 47 | caption = x[1] + '_' + x[0] + str(int(np.random.uniform() * 1e6)) # Name of the results file 48 | print(caption) # caption=None allow to avoid saving the results 49 | res, traj = multiprocess_MC((x[0], xp_settings[x[0]][x[1]], T, N, 50 | algs, param, step), plot=True, pickle_path=results_path, caption=caption) 51 | 52 | 53 | if __name__ == "__main__" and xp_type == "bayesian": 54 | # Possible Samplers for the experiments 55 | def sp_xp_B(size): # Generate means uniformly in [0, 1] 56 | return np.random.uniform(0., 1., size=size) 57 | 58 | def sp_xp_G(size): # Generate means with a gaussian distribution and add a variance param of 1 59 | a = np.random.normal(0., 1., size=size) 60 | return [[x, 1] for x in a] 61 | 62 | # General Parameters 63 | n_arms = 3 # number of arms 64 | bandit = GaussianMAB # bandit model (object inherited from class MAB) 65 | xp_sampler = sp_xp_G # function to generate the problems 66 | N, T = 100, 2000 # Number of problems generate and Time Horizon of each run 67 | step = 100 68 | algs = ['TS', 'BESA', 'RB_SDA', 'WR_SDA', 'SSMC'] 69 | args = (bandit, algs, n_arms, N, T, param, xp_sampler, step) # do not modify 70 | caption = 'Gaussian_' + str(np.random.randint(1e5)) 71 | b = Bayesian_multiprocess_MC(args, pickle_path=results_path, caption=caption) 72 | 73 | if __name__ == "__main__" and xp_type == "test": 74 | # # Test anything here 75 | # model = TruncGaussianMAB([[0.5, 1], [0.6, 1]]) 76 | # print(model.MC_regret(method='RB_SDA', N=500, T=20000, param_dic=param['WR_SDA'])) 77 | # 78 | # # Example to load the pickles and read the results 79 | # name = 'xp1_B161845.pkl' 80 | # res = pkl.load(open(os.path.join(results_path, name), 'rb')) 81 | # print(res['info']) # parameters dic 82 | # # Working with every runs of some algorithm 83 | # print(res['trajectories']['BESA'][-1].mean(), res['trajectories']['BESA'][-1].std()) 84 | # # The average regret dataframe 85 | # res['df_regret'].plot() 86 | # plt.show() 87 | model = GaussianMAB([[0., 1], [0, 1]]) 88 | res = model.RB_SDA(T=2000) 89 | n = np.array([np.cumsum(res.arm_sequence == i) for i in range(2)]).T 90 | l = np.argmax(n, axis=1) 91 | count_change = np.sum([l[i] != l[i-1] for i in range(1, len(l))]) 92 | count_l_draw = np.sum([np.sum((l==i)*0) for i in range(2)]) 93 | rw = [np.cumsum(x)/(np.arange(len(x))+1) for x in res.rewards_arm] 94 | print(res) 95 | -------------------------------------------------------------------------------- /arms.py: 
-------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | import numpy as np 3 | from numba import jit 4 | from scipy.stats import truncnorm as trunc_norm 5 | from utils import convert_tg_mean 6 | 7 | 8 | class AbstractArm(object): 9 | def __init__(self, mean, variance, random_state): 10 | """ 11 | :param mean: float, expectation of the arm 12 | :param variance: float, variance of the arm 13 | :param random_state: int, seed to make experiments reproducible 14 | """ 15 | self.mean = mean 16 | self.variance = variance 17 | self.local_random = np.random.RandomState(random_state) 18 | 19 | def sample(self): 20 | pass 21 | 22 | 23 | class ArmBernoulli(AbstractArm): 24 | def __init__(self, p, random_state=0): 25 | """ 26 | :param p: float, mean parameter 27 | :param random_state: int, seed to make experiments reproducible 28 | """ 29 | self.p = p 30 | super(ArmBernoulli, self).__init__(mean=p, 31 | variance=p * (1. - p), 32 | random_state=random_state) 33 | 34 | def sample(self): 35 | """ 36 | Sampling strategy 37 | :return: float, a sample from the arm 38 | """ 39 | return (self.local_random.rand(1) < self.p)*1. 40 | 41 | 42 | class ArmBeta(AbstractArm): 43 | def __init__(self, a, b, random_state=0): 44 | """ 45 | :param a: int, alpha coefficient in beta distribution 46 | :param b: int, beta coefficient in beta distribution 47 | :param random_state: int, seed to make experiments reproducible 48 | """ 49 | self.a = a 50 | self.b = b 51 | super(ArmBeta, self).__init__(mean=a/(a + b), 52 | variance=(a * b)/((a + b) ** 2 * (a + b + 1)), 53 | random_state=random_state) 54 | 55 | def sample(self): 56 | """ 57 | Sampling strategy 58 | :return: float, a sample from the arm 59 | """ 60 | return self.local_random.beta(self.a, self.b, 1) 61 | 62 | 63 | class ArmGaussian(AbstractArm): 64 | def __init__(self, mu, eta, random_state=0): 65 | """ 66 | :param mu: float, mean parameter in gaussian distribution 67 | :param eta: float, std parameter in gaussian distribution 68 | :param random_state: int, seed to make experiments reproducible 69 | """ 70 | self.mu = mu 71 | self.eta = eta 72 | super(ArmGaussian, self).__init__(mean=mu, 73 | variance=eta**2, 74 | random_state=random_state) 75 | 76 | def sample(self): 77 | """ 78 | Sampling strategy 79 | :return: float, a sample from the arm 80 | """ 81 | return self.local_random.normal(self.mu, self.eta, 1) 82 | 83 | 84 | class ArmFinite(AbstractArm): 85 | def __init__(self, X, P, random_state=0): 86 | """ 87 | :param X: np.array, support of the distribution 88 | :param P: np.array, associated probabilities 89 | :param random_state: int, seed to make experiments reproducible 90 | """ 91 | self.X = X 92 | self.P = P 93 | mean = np.sum(X * P) 94 | super(ArmFinite, self).__init__(mean=mean, 95 | variance=np.sum(X ** 2 * P) - mean ** 2, 96 | random_state=random_state) 97 | 98 | def sample(self): 99 | """ 100 | Sampling strategy for an arm with a finite support and the associated probability distribution 101 | :return: float, a sample from the arm 102 | """ 103 | i = self.local_random.choice(len(self.P), size=1, p=self.P) 104 | reward = self.X[i] 105 | return reward 106 | 107 | 108 | class ArmExponential(AbstractArm): 109 | def __init__(self, p, random_state=0): 110 | """ 111 | :param mu: float, mean parameter in gaussian distribution 112 | :param eta: float, std parameter in gaussian distribution 113 | :param random_state: int, seed to make experiments reproducible 114 | """ 115 | self.p = p 116 | 
super(ArmExponential, self).__init__(mean=p, 117 | variance=p**2, 118 | random_state=random_state) 119 | 120 | def sample(self): 121 | """ 122 | Sampling strategy 123 | :return: float, a sample from the arm 124 | """ 125 | return self.local_random.exponential(self.p, 1) 126 | 127 | 128 | class dirac(): 129 | def __init__(self, c, random_state): 130 | """ 131 | :param mean: float, expectation of the arm 132 | :param variance: float, variance of the arm 133 | :param random_state: int, seed to make experiments reproducible 134 | """ 135 | self.mean = c 136 | self.variance = 0 137 | self.local_random = np.random.RandomState(random_state) 138 | 139 | def sample(self): 140 | return [self.mean] 141 | 142 | 143 | class ArmTG(AbstractArm): 144 | def __init__(self, mu, scale, random_state=0): 145 | """ 146 | :param mu: mean 147 | :param random_state: int, seed to make experiments reproducible 148 | """ 149 | self.mu = mu 150 | self.scale = scale 151 | self.dist = trunc_norm(-mu/scale, b=(1-mu)/scale, loc=mu, scale=scale) 152 | self.dist.random_state = random_state 153 | super(ArmTG, self).__init__(mean=convert_tg_mean(mu, scale), variance=scale**2, 154 | random_state=random_state) 155 | 156 | def sample(self): 157 | """ 158 | Sampling strategy 159 | :return: float, a sample from the arm 160 | """ 161 | x = self.local_random.normal(self.mu, self.scale, 1) 162 | return x * (x > 0) * (x < 1) + (x > 1) 163 | -------------------------------------------------------------------------------- /paper/SDA_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DBaudry/Sub-Sampling-Dueling-Algorithms-Neurips20/c362488a273a01d41ac517b2a8e1ca8899647ad6/paper/SDA_final.pdf -------------------------------------------------------------------------------- /tracker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Tracker2: 4 | """ 5 | This object is used in bandit models to store useful quantities to run the algorithm and report the experiment. 
6 | """ 7 | def __init__(self, means, T, store_rewards_arm=False): 8 | self.means = means 9 | self.nb_arms = means.shape[0] 10 | self.T = T 11 | self.Sa = np.zeros(self.nb_arms) 12 | self.Na = np.zeros(self.nb_arms) 13 | self.reward = np.zeros(self.T) 14 | self.arm_sequence = np.empty(self.T, dtype=int) 15 | self.t = 0 16 | self.store_rewards_arm = store_rewards_arm 17 | if store_rewards_arm: 18 | self.rewards_arm = [[] for _ in range(self.nb_arms)] 19 | 20 | def reset(self): 21 | """ 22 | Initialization of quantities of interest used for all methods 23 | :param T: int, time horizon 24 | :return: - Sa: np.array, cumulative reward of arm a 25 | - Na: np.array, number of times arm a has been pulled 26 | - reward: np.array, rewards 27 | - arm_sequence: np.array, arm chose at each step 28 | """ 29 | self.Sa = np.zeros(self.nb_arms) 30 | self.Na = np.zeros(self.nb_arms) 31 | self.reward = np.zeros(self.T) 32 | self.arm_sequence = np.zeros(self.T, dtype=int) 33 | self.rewards_arm = [[]]*self.nb_arms 34 | if self.store_rewards_arm: 35 | self.rewards_arm = [[] for _ in range(self.nb_arms)] 36 | 37 | def update(self, t, arm, reward): 38 | """ 39 | Update all the parameters of interest after choosing the correct arm 40 | :param t: int, current time/round 41 | :param arm: int, arm chose at this round 42 | :param Sa: np.array, cumulative reward array up to time t-1 43 | :param Na: np.array, number of times arm has been pulled up to time t-1 44 | :param reward: np.array, rewards obtained with the policy up to time t-1 45 | :param arm_sequence: np.array, arm chose at each step up to time t-1 46 | """ 47 | self.Na[arm] += 1 48 | self.arm_sequence[t] = arm 49 | self.reward[t] = reward 50 | self.Sa[arm] += reward 51 | self.t = t 52 | if self.store_rewards_arm: 53 | self.rewards_arm[arm].append(reward) 54 | 55 | def regret(self): 56 | """ 57 | Compute the regret of a single experiment 58 | :param reward: np.array, the array of reward obtained from the policy up to time T 59 | :param T: int, time horizon 60 | :return: np.array, cumulative regret for a single experiment 61 | """ 62 | return self.means.max() * np.arange(1, self.T + 1) - np.cumsum(np.array(self.means)[self.arm_sequence]) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | import numpy as np 3 | from numba import jit 4 | import bottleneck as bn 5 | import scipy.stats as sc 6 | 7 | @jit(nopython=True) 8 | def rd_argmax(vector): 9 | """ 10 | Compute random among eligible maximum indices 11 | :param vector: np.array 12 | :return: int, random index among eligible maximum indices 13 | """ 14 | m = np.amax(vector) 15 | indices = np.nonzero(vector == m)[0] 16 | return np.random.choice(indices) 17 | 18 | 19 | @jit(nopython=True) 20 | def rd_choice(vec, size): 21 | """ 22 | jit version of np.random.choice (slightly improve the computation time) 23 | """ 24 | return np.random.choice(vec, size=size, replace=False) 25 | 26 | 27 | @jit(nopython=True) 28 | def hypergeom_sample(s1, n1, n2): 29 | """ 30 | jit version of np.random.choice (slightly improve the computation time) 31 | """ 32 | return np.random.hypergeometric(s1, n1 - s1, nsample=n2) 33 | 34 | 35 | def rollavg_bottlneck(a, n): 36 | """ 37 | :param a: array 38 | :param n: window of the rolling average 39 | :return: A fast function for computing moving averages 40 | """ 41 | return bn.move_mean(a, window=n, min_count=n) 42 | 43 | 44 | 
@jit(nopython=True) 45 | def get_leader(Na, Sa, l_prev): 46 | """ 47 | :param Na: Number of pulls of each arm (array) 48 | :param Sa: Sum of rewards of each arm (array) 49 | :param l_prev: previous leader 50 | :return: Leader for SSMC and SDA algorithms 51 | """ 52 | m = np.amax(Na) 53 | n_argmax = np.nonzero(Na == m)[0] 54 | if n_argmax.shape[0] == 1: 55 | l = n_argmax[0] 56 | return l 57 | else: 58 | s_max = Sa[n_argmax].max() 59 | s_argmax = np.nonzero(Sa[n_argmax] == s_max)[0] 60 | if np.nonzero(n_argmax[s_argmax] == l_prev)[0].shape[0] > 0: 61 | return l_prev 62 | return n_argmax[np.random.choice(s_argmax)] 63 | 64 | 65 | def get_SSMC_star_min(rewards_l, n_challenger, reshape_size): 66 | """ 67 | little helper for SSMC* 68 | """ 69 | return (np.array(rewards_l)[:n_challenger * reshape_size].reshape( 70 | (reshape_size, n_challenger))).mean(axis=1).min() 71 | 72 | 73 | def convert_tg_mean(mu, scale, step=1e-7): 74 | """ 75 | :param mu: mean of the underlying gaussian r.v 76 | :param scale: scale of the underlying gaussian r.v 77 | :param step: precision of the numerical integration 78 | :return: compute the mean of the Truncated Gaussian r.v knowing the parameters of its 79 | associated Gaussian r.v 80 | """ 81 | X = np.arange(0, 1, step) 82 | return (X * sc.norm.pdf(X, loc=mu, scale=scale)).mean()+ 1 - sc.norm.cdf(1, loc=mu, scale=scale) -------------------------------------------------------------------------------- /xp_helpers.py: -------------------------------------------------------------------------------- 1 | from joblib import Parallel, delayed 2 | import multiprocessing as mp 3 | import numpy as np 4 | import pandas as pd 5 | import pickle as pkl 6 | from time import time 7 | import os 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from GaussianMAB import GaussianMAB 11 | from BernoulliMAB import BetaBernoulliMAB 12 | from ExponentialMAB import ExponentialMAB 13 | from Trunc_GaussianMAB import TruncGaussianMAB 14 | from tqdm import tqdm 15 | 16 | mapping = {'B': BetaBernoulliMAB, 'G': GaussianMAB, 'Exp': ExponentialMAB, 'TG': TruncGaussianMAB} 17 | mapping_name = {'B': 'Bernoulli', 'G': 'Gaussian', 'Exp': 'Exponential', 'TG': 'Truncated Gaussian'} 18 | 19 | def MC_xp(args, plot=False, pickle_path=None, caption='xp'): 20 | """ 21 | :param args: parameters of the experiment 22 | :param plot: Boolean, plot the average regret if True 23 | :param pickle_path: if not None, path to store the results 24 | :param caption: name of the file if pickle_path is not None 25 | :return: average regret, dict with all trajectories 26 | """ 27 | bandit, p, T, n_xp, methods, param, store_step = args 28 | model = mapping[bandit](p) 29 | all_r = [] 30 | all_traj = {} 31 | for x in methods: 32 | r, traj = model.MC_regret(x, n_xp, T, param[x], store_step) 33 | all_r.append(r) 34 | all_traj[x] = traj 35 | all_r.append(model.Cp*np.log(1+np.arange(T))) 36 | df_r = pd.DataFrame(all_r).T 37 | df_r.columns = methods + ['lower bound'] 38 | df_r['lower bound'].iloc[0] = 0 39 | if plot: 40 | df_r.plot(figsize=(10, 8), logx=True) 41 | if pickle_path is not None: 42 | pkl.dump(df_r, open(os.path.join(pickle_path, caption+'.pkl'), 'wb')) 43 | return df_r, all_traj 44 | 45 | 46 | def multiprocess_MC(args, plot=False, pickle_path=None, caption='xp'): 47 | """ 48 | Same function as MC_xp, but including multiprocessing tools to allow parallelization 49 | """ 50 | t0 = time() 51 | cpu = mp.cpu_count() 52 | print('Running on %i cores' % cpu) 53 | bandit, p, T, n_xp, methods, param, store_step = 
args 54 | new_args = (bandit, p, T, n_xp//cpu+1, methods, param, store_step) 55 | res = Parallel(n_jobs=cpu)(delayed(MC_xp)(new_args) for _ in range(cpu)) 56 | df_r = res[0][0] 57 | for i in range(cpu-1): 58 | df_r += res[i+1][0] 59 | df_r = df_r/cpu 60 | traj = {} 61 | for x in methods: 62 | traj[x] = np.concatenate([res[i][1][x] for i in range(cpu)], axis=1) 63 | if plot: 64 | df_r.index = 1 + df_r.index 65 | df_r.plot(figsize=(10, 8), logx=True) 66 | plt.title('Average Regret for experiment ' + caption.split('_')[0] + ', ' + mapping_name[bandit] + ' arms (log scale)') 67 | plt.show() 68 | if pickle_path is not None: 69 | info = {'proba': p, 'N_xp': n_xp, 'T': T, 'methods': methods, 'param': param, 'step_traj': store_step} 70 | my_pkl_obj = {'df_regret': df_r, 'trajectories': traj, 'info': info} 71 | pkl.dump(my_pkl_obj, open(os.path.join(pickle_path, caption+'.pkl'), 'wb')) 72 | print('Execution time: %s seconds' % str(time()-t0)) 73 | return df_r, traj 74 | 75 | 76 | def Bayesian_MC_regret(args): 77 | """ 78 | Implementation of Monte Carlo method to approximate the expectation of the regret 79 | :param method: list, methods used (UCB, Thomson Sampling, etc..) 80 | :param n_arms: number of arms for each experiment 81 | :param N: int, number of independent Monte Carlo simulation (one simul=one parameter) 82 | :param T: int, time horizon 83 | :param param_dic: dict, parameters for the different methods, can be the value of rho for UCB model or an int 84 | corresponding to the number of rounds of exploration for the ExploreCommit method 85 | """ 86 | bandit, methods, n_arms, N, T, param, xp_sampler, step = args 87 | store_xp = np.zeros((len(methods), N, np.arange(T)[::step].shape[0])) 88 | mc_regret = pd.DataFrame(np.zeros((T, len(methods))), columns=methods) 89 | xp_list = [] 90 | for n in tqdm(range(N)): 91 | p = xp_sampler(size=n_arms) 92 | xp_list.append(p) 93 | model = bandit(p) 94 | for i, m in enumerate(methods): 95 | alg = model.__getattribute__(m) 96 | tr = alg(T, **param[m]) 97 | regret = tr.regret() 98 | mc_regret[m] += regret 99 | store_xp[i, n, :] = regret[::step] 100 | return {'regret': mc_regret/N, 'traj': store_xp, 'xp_list': xp_list} 101 | 102 | def Bayesian_multiprocess_MC(args, pickle_path=None, plot=True, caption='xp'): 103 | """ 104 | :param args: parameters of the experiments 105 | :param pickle_path: If not None, path where the results are stored 106 | :param caption: Name of the file to store the results if pickle path is not none 107 | :return: dataframe of average regret, results for each trajectory/alg, xp settings 108 | """ 109 | t0 = time() 110 | cpu = mp.cpu_count() 111 | print('Running on %i cores' % cpu) 112 | bandit, methods, n_arms, N, T, param, xp_sampler, step = args 113 | new_args = (bandit, methods, n_arms, N//cpu+1, T, param, xp_sampler, step) 114 | res = Parallel(n_jobs=cpu)(delayed(Bayesian_MC_regret)(new_args) for _ in range(cpu)) 115 | 116 | df_r = res[0]['regret'] 117 | xp_list = res[0]['xp_list'] 118 | for i in range(cpu-1): 119 | df_r += res[i+1]['regret'] 120 | xp_list += res[i+1]['xp_list'] 121 | df_r = df_r/cpu 122 | traj = np.concatenate([res[i]['traj'] for i in range(cpu)], axis=1) 123 | if pickle_path is not None: 124 | info = {'type': 'Bayesian', 'N_xp': N, 'T': T, 'methods': methods, 'step_traj': step} 125 | my_pkl_obj = {'df_regret': df_r, 'trajectories': traj, 'info': info} 126 | pkl.dump(my_pkl_obj, open(os.path.join(pickle_path, caption+'.pkl'), 'wb')) 127 | if plot: 128 | df_r.index = 1 + df_r.index 129 | df_r.plot(figsize=(10, 8), 
logx=True) 130 | plt.title('Average Regret for Bayesian experiment '+caption.split('_')[0] + ', ' + str(n_arms)+' arms (log scale)') 131 | plt.show() 132 | print('Execution time: %s seconds' % str(time()-t0)) 133 | return df_r, traj, xp_list --------------------------------------------------------------------------------
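
For reference, the multiprocessing helper above is normally driven from *__main__.py*; the sketch below shows a direct call with a reduced algorithm list (the parameter dictionary mirrors the one defined in *__main__.py*, and `pickle_path=None` simply skips saving the results):

```python
import numpy as np
from xp_helpers import multiprocess_MC

# Parameters of each algorithm (a subset of the `param` dictionary of __main__.py)
param = {'TS': {}, 'RB_SDA': {}, 'kl_ucb': {'f': np.log}}

if __name__ == '__main__':
    # (bandit family, arm parameters, horizon T, number of runs, algorithms, parameters, storage step)
    args = ('B', [0.9, 0.8], 1000, 100, ['TS', 'RB_SDA', 'kl_ucb'], param, 25)
    df_regret, trajectories = multiprocess_MC(args, plot=False, pickle_path=None, caption='demo')
    print(df_regret.iloc[-1])  # average final regret of each algorithm (plus the 'lower bound' column)
```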