├── BernoulliMAB.py ├── ExponentialMAB.py ├── GaussianMAB.py ├── MAB.py ├── README.md ├── Trunc_GaussianMAB.py ├── __main__.py ├── arms.py ├── paper └── SDA_final.pdf ├── tracker.py ├── utils.py └── xp_helpers.py /BernoulliMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | from utils import rollavg_bottlneck, rd_choice, hypergeom_sample 4 | from scipy.optimize import brentq 5 | 6 | 7 | class BetaBernoulliMAB(GenericMAB): 8 | """ 9 | Bernoulli Bandit Problem 10 | """ 11 | def __init__(self, p): 12 | """ 13 | Initialization 14 | :param p: np.array, true probabilities of success for each arm 15 | """ 16 | # Initialization of arms from GenericMAB 17 | super().__init__(methods=['B']*len(p), p=p) 18 | # Complexity 19 | self.Cp = sum([(self.mu_max-x)/self.kl(x, self.mu_max) for x in self.means if x != self.mu_max]) 20 | 21 | @staticmethod 22 | def kl(x, y): 23 | """ 24 | Implementation of the Kullback-Leibler divergence for two Bernoulli distributions (B(x),B(y)) 25 | :param x: float 26 | :param y: float 27 | :return: float, KL(B(x), B(y)) 28 | """ 29 | if x == y: 30 | return 0 31 | elif x > 1-1e-6: 32 | return 0 33 | elif y == 0 or y == 1: 34 | return np.inf 35 | elif x < 1e-6: 36 | return (1-x) * np.log((1-x)/(1-y)) 37 | return x * np.log(x/y) + (1-x) * np.log((1-x)/(1-y)) 38 | 39 | def TS(self, T): 40 | """ 41 | Beta-Bernoulli Thompson Sampling 42 | :param T: Time Horizon 43 | :return: Tracker2 object 44 | """ 45 | def f(x): 46 | return np.random.beta(x.Sa+1, x.Na-x.Sa+1) 47 | return self.Index_Policy(T, f) 48 | 49 | def BESA_duel(self, indices, tracker): 50 | """ 51 | More efficient implementation of the BESA duel in the Bernoulli case 52 | :param indices: indices of arms of the duel 53 | :param tracker: Tracker2 object 54 | :return: winner of the duel 55 | """ 56 | i, j = indices[0], indices[1] 57 | ni, nj = tracker.Na[i], tracker.Na[j] 58 | si, sj = tracker.Sa[i], tracker.Sa[j] 59 | idx_min = np.argmin([ni, nj]) 60 | if idx_min == 0: 61 | sj = hypergeom_sample(sj, nj, ni) 62 | else: 63 | si = hypergeom_sample(si, ni, nj) 64 | return indices[rd_argmax(np.array([si, sj]))] 65 | 66 | def SSMC(self, T, explo_func=lambda x: np.sqrt(np.log(x))): 67 | """ 68 | More efficient implementation of SSMC for the Bernoulli case 69 | :param T: Time Horizon 70 | :param explo_func: Forced exploration function 71 | :return: Tracker2 object 72 | """ 73 | tr = Tracker2(self.means, T, store_rewards_arm=True) 74 | r, t, l = 1, 0, -1 75 | while t < self.nb_arms: 76 | arm = t 77 | tr.update(t, arm, self.MAB[arm].sample()[0]) 78 | t += 1 79 | while t < T: 80 | l_prev = l 81 | l = get_leader(tr.Na, tr.Sa, l_prev) 82 | t_prev, forced_explo = t, explo_func(r) 83 | 84 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
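# SSMC duel step (Bernoulli-optimized version of MAB.SSMC): indic currently flags challengers
# forced to be drawn (fewer pulls than the leader and below the forced-exploration threshold).
# The loop below also flags any challenger j whose empirical mean Sa[j]/Na[j] is at least lead_min,
# i.e. the mean of the leader's last Na[j] rewards when the leader is unchanged, or the minimum
# over all sliding windows of size Na[j] of the leader's history after a leader change.
# These duels are only re-evaluated when the leader changed or its last reward was 0.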
85 | if l_prev != l or tr.rewards_arm[l][-1] == 0: 86 | for j in range(self.nb_arms): 87 | if indic[j] == 0 and j != l: 88 | if l_prev == l: 89 | lead_min = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 90 | else: 91 | lead_min = rollavg_bottlneck(tr.rewards_arm[l], int(tr.Na[j]))[(int(tr.Na[j])-1):].min() 92 | if tr.Sa[j]/tr.Na[j] >= lead_min and t < T: 93 | indic[j] = 1 94 | if indic.sum() == 0: 95 | tr.update(t, l, self.MAB[l].sample()[0]) 96 | t += 1 97 | else: 98 | to_draw = np.where(indic == 1)[0] 99 | np.random.shuffle(to_draw) 100 | for i in to_draw: 101 | if t < T: 102 | tr.update(t, i, self.MAB[i].sample()[0]) 103 | t += 1 104 | r += 1 105 | return tr 106 | 107 | def PHE(self, T, a, distrib=None): 108 | """ 109 | More efficient version of PHE for Bernoulli bandits 110 | :param T: Time Horizon 111 | :param a: proportion of perturbed history. a=1 -> same proportion, a=0-> no perturbed history 112 | :param distrib: distribution of the perturbed history 113 | :return: Tracker2 object 114 | """ 115 | tr = Tracker2(self.means, T, store_rewards_arm=True) 116 | for t in range(T): 117 | if t < self.nb_arms: 118 | arm = t 119 | else: 120 | idx_mean = np.zeros(self.nb_arms) 121 | for k in range(self.nb_arms): 122 | ph = np.random.binomial(n=np.int(a*tr.Na[k])+1, p=0.5) 123 | idx_mean[k] = (tr.Sa[k]+ph)/(tr.Na[k]+np.int(a*tr.Na[k])+1) 124 | arm = rd_argmax(idx_mean) 125 | reward = self.MAB[arm].sample()[0] 126 | tr.update(t, arm, reward) 127 | return tr 128 | 129 | def kl_ucb(self, T, f): 130 | """ 131 | Implementation of the KL-UCB algorithm for Bernoulli bandits 132 | :param T: Time Horizon 133 | :param f: Function in the minimization problem 134 | :return: Tracker2 object 135 | """ 136 | def index_func(x): 137 | res = [] 138 | for k in range(self.nb_arms): 139 | if x.Sa[k]/x.Na[k] < 1e-6: 140 | res.append(1) 141 | elif x.Sa[k]/x.Na[k] > 1-1e-6: 142 | res.append(1) 143 | else: 144 | def kl_shift(y): 145 | return self.kl(x.Sa[k]/x.Na[k], y) - f(x.t)/x.Na[k] 146 | res.append(brentq(kl_shift, x.Sa[k]/x.Na[k]-1e-7, 1 - 1e-10)) 147 | return np.array(res) 148 | return self.Index_Policy(T, index_func) 149 | -------------------------------------------------------------------------------- /ExponentialMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | from scipy.optimize import brentq 4 | 5 | 6 | class ExponentialMAB(GenericMAB): 7 | """ 8 | Gaussian Bandit Problem 9 | """ 10 | def __init__(self, p): 11 | """ 12 | Initialization 13 | :param p: np.array, true values of (mu, sigma) for each arm with mean sampled from N(mu, sigma) 14 | """ 15 | # Initialization of arms from GenericMAB 16 | super().__init__(methods=['Exp']*len(p), p=p) 17 | # Parameters used for stop learning policy 18 | self.Cp = sum([(self.mu_max-x)/self.kl(1/x, 1/self.mu_max) for x in self.means if x != self.mu_max]) 19 | 20 | 21 | @staticmethod 22 | def kl(x, y): 23 | """ 24 | Implementation of the Kullback-Leibler divergence for two Exponential Distributions 25 | WARNING: x, y are the inverse of the means of the distributions 26 | :param x: float 27 | :param y: float 28 | :return: float, KL(E(x), E(y)) 29 | """ 30 | return np.log(x/y) + y/x - 1 31 | 32 | def TS(self, T): 33 | """ 34 | Thompson Sampling with known variance, and an inproper uniform prior 35 | on the mean 36 | :param T: Time Horizon 37 | :return: Tracker2 object 38 | """ 39 | def f(x): 40 | return 1/np.random.gamma(shape=x.Na, scale=1/x.Sa) 41 | return self.Index_Policy(T, f) 42 
| 43 | def kl_ucb(self, T, f): 44 | """ 45 | Implementation of KL-UCB for Exponential distributions 46 | :param T: Time Horizon 47 | :param f: function in the minimization problem 48 | :return: Tracker2 object 49 | """ 50 | def index_func(x): 51 | res = [] 52 | for k in range(self.nb_arms): 53 | mu = x.Sa[k] / x.Na[k] 54 | def kl_shift(y): 55 | return np.log(y/mu) + mu/y-1 - f(x.t) / x.Na[k] 56 | res.append(brentq(kl_shift, mu*np.exp(f(x.t)/x.Na[k]), mu*np.exp(f(x.t)/x.Na[k]+1))) 57 | return np.array(res) 58 | 59 | return self.Index_Policy(T, index_func) 60 | 61 | def IMED(self, T): 62 | """ 63 | Implementation of IMED for Exponential distributions 64 | :param T: Time Horizon 65 | :return: Tracker2 object 66 | """ 67 | def index_func(x): 68 | mu_max = np.max(x.Sa/x.Na) 69 | idx = [] 70 | for k in range(self.nb_arms): 71 | idx.append(x.Na[k]*self.kl(mu_max, x.Sa[k]/x.Na[k])+np.log(x.Na[k])) 72 | return -np.array(idx) 73 | return self.Index_Policy(T, index_func) 74 | -------------------------------------------------------------------------------- /GaussianMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | 4 | 5 | class GaussianMAB(GenericMAB): 6 | """ 7 | Gaussian Bandit Problem 8 | """ 9 | def __init__(self, p): 10 | """ 11 | Initialization 12 | :param p: np.array, true values of 1/lambda for each arm 13 | """ 14 | # Initialization of arms from GenericMAB 15 | super().__init__(methods=['G']*len(p), p=p) 16 | # Parameters used for stop learning policy 17 | self.best_arm = self.get_best_arm() 18 | # Careful: Cp is the bound only with same variance for each arm 19 | self.Cp = sum([(self.mu_max - arm.mu) / self.kl2(arm.mu, self.mu_max, arm.eta, self.MAB[self.best_arm].eta) 20 | for arm in self.MAB if arm.mu != self.mu_max]) 21 | 22 | def get_best_arm(self): 23 | ind = np.nonzero(self.means == np.amax(self.means))[0] 24 | std = [self.MAB[arm].eta for arm in ind] 25 | u = np.argmin(std) 26 | return ind[u] 27 | 28 | @staticmethod 29 | def kl(mu1, mu2): 30 | """ 31 | Implementation of the Kullback-Leibler divergence for two Gaussian N(mu, 1) 32 | :param x: float 33 | :param y: float 34 | :return: float, KL(B(x), B(y)) 35 | """ 36 | return (mu2-mu1)**2/2 37 | 38 | @staticmethod 39 | def kl2(mu1, mu2, sigma1, sigma2): 40 | """ 41 | Implementation of the Kullback-Leibler divergence for two Gaussian with different std 42 | :param x: float 43 | :param y: float 44 | :return: float, KL(B(x), B(y)) 45 | """ 46 | return np.log(sigma2/sigma1) + 0.5 * (sigma1**2/sigma2**2 + (mu2-mu1)**2/sigma2**2 - 1) 47 | 48 | def TS(self, T): 49 | """ 50 | Thompson Sampling for Gaussian distributions with known variance, and an inproper uniform prior 51 | on the mean 52 | :param T: Time Horizon 53 | :return: Tracker2 object 54 | """ 55 | eta = np.array([arm.eta for arm in self.MAB]) 56 | 57 | def f(x): 58 | return np.random.normal(x.Sa/x.Na, eta/np.sqrt(x.Na)) 59 | return self.Index_Policy(T, f) 60 | 61 | def kl_ucb(self, T, f): 62 | """ 63 | Implementation of KL-UCB for Gaussian bandits 64 | :param T: Time Horizon 65 | :param rho: coefficient for the upper bound 66 | :return: 67 | """ 68 | def index_func(x): 69 | return x.Sa / x.Na + np.sqrt(f(x.t)*2 / x.Na) 70 | return self.Index_Policy(T, index_func) 71 | -------------------------------------------------------------------------------- /MAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | import numpy as np 3 | 
import arms 4 | from tqdm import tqdm 5 | from utils import rd_argmax, rd_choice, rollavg_bottlneck, get_leader 6 | from tracker import Tracker2 7 | from utils import get_SSMC_star_min 8 | #import sobol_seq # for LDS-SDA 9 | 10 | mapping = {'B': arms.ArmBernoulli, 'beta': arms.ArmBeta, 'F': arms.ArmFinite, 'G': arms.ArmGaussian, 11 | 'Exp': arms.ArmExponential, 'dirac': arms.dirac, 'TG': arms.ArmTG} 12 | 13 | 14 | def default_exp(x): 15 | """ 16 | :param x: float 17 | :return: default exploration function for SDA algorithms 18 | """ 19 | return 0 20 | # return np.sqrt(np.log(x)) 21 | 22 | 23 | class GenericMAB: 24 | """ 25 | Generic class to simulate a Multi-Arm Bandit problem 26 | """ 27 | def __init__(self, methods, p): 28 | """ 29 | Initialization of the arms 30 | :param methods: string, probability distribution of each arm 31 | :param p: np.array or list, parameters of the probability distribution of each arm 32 | """ 33 | self.MAB = self.generate_arms(methods, p) 34 | self.nb_arms = len(self.MAB) 35 | self.means = np.array([el.mean for el in self.MAB]) 36 | self.mu_max = np.max(self.means) 37 | self.mc_regret = None 38 | 39 | @staticmethod 40 | def generate_arms(methods, p): 41 | """ 42 | Method for generating different arms 43 | :param methods: string, probability distribution of each arm 44 | :param p: np.array or list, parameters of the probability distribution of each arm 45 | :return: list of class objects, list of arms 46 | """ 47 | arms_list = list() 48 | for i, m in enumerate(methods): 49 | args = [p[i]] + [[np.random.randint(1, 312414)]] 50 | args = sum(args, []) if type(p[i]) == list else args 51 | alg = mapping[m] 52 | arms_list.append(alg(*args)) 53 | return arms_list 54 | 55 | @staticmethod 56 | def kl(x, y): 57 | return None 58 | 59 | def MC_regret(self, method, N, T, param_dic, store_step=-1): 60 | """ 61 | Average Regret on a Number of Experiments 62 | :param method: string, method used (UCB, Thomson Sampling, etc..) 63 | :param N: int, number of independent experiments 64 | :param T: int, time horizon 65 | :param param_dic: dict, parameters for the different methods 66 | """ 67 | mc_regret = np.zeros(T) 68 | store = store_step > 0 69 | if store: 70 | all_regret = np.zeros((np.arange(T)[::store_step].shape[0], N)) 71 | alg = self.__getattribute__(method) 72 | for i in tqdm(range(N), desc='Computing ' + str(N) + ' simulations'): 73 | tr = alg(T, **param_dic) 74 | regret = tr.regret() 75 | mc_regret += regret 76 | if store: 77 | all_regret[:, i] = regret[::store_step] 78 | if store: 79 | return mc_regret / N, all_regret 80 | return mc_regret / N 81 | 82 | def DummyPolicy(self, T): 83 | """ 84 | Implementation of a random policy consisting in randomly choosing one of the available arms. 
Only useful 85 | for checking that the behavior of the different policies is normal 86 | :param T: int, time horizon 87 | :return: means, arm sequence 88 | """ 89 | tr = Tracker2(self.means, T) 90 | tr.arm_sequence = np.random.randint(self.nb_arms, size=T) 91 | return tr 92 | 93 | def ExploreCommit(self, T, m): 94 | """ 95 | Implementation of Explore-then-Commit algorithm 96 | :param T: int, time horizon 97 | :param m: int, number of rounds before choosing the best action 98 | :return: np.arrays, reward obtained by the policy and sequence of chosen arms 99 | """ 100 | tr = Tracker2(self.means, T) 101 | for t in range(m * self.nb_arms): 102 | arm = t % self.nb_arms 103 | tr.update(t, arm, self.MAB[arm].sample()[0]) 104 | arm = rd_argmax(tr.Sa / tr.Na) 105 | for t in range(m * self.nb_arms, T): 106 | tr.update(t, arm, self.MAB[arm].sample()[0]) 107 | return tr 108 | 109 | def Index_Policy(self, T, index_func, start_explo=1, store_rewards_arm=False): 110 | """ 111 | Implementation of UCB1 algorithm 112 | :param T: int, time horizon 113 | :param start_explo: number of time to explore each arm before comparing index 114 | :param index_func: function which computes the index with the tracker 115 | :return: np.arrays, reward obtained by the policy and sequence of chosen arms 116 | """ 117 | tr = Tracker2(self.means, T, store_rewards_arm) 118 | for t in range(T): 119 | if t < self.nb_arms*start_explo: 120 | arm = t % self.nb_arms 121 | else: 122 | arm = rd_argmax(index_func(tr)) 123 | reward = self.MAB[arm].sample()[0] 124 | tr.update(t, arm, reward) 125 | return tr 126 | 127 | def UCB1(self, T, rho=1.): 128 | """ 129 | :param T: Time Horizon 130 | :param rho: coefficient for the upper bound 131 | :return: 132 | """ 133 | def index_func(x): 134 | return x.Sa / x.Na + rho * np.sqrt(np.log(x.t + 1)*2 / x.Na) 135 | return self.Index_Policy(T, index_func) 136 | 137 | def BESA_duel(self, indices, tracker): 138 | """ 139 | :param indices: indices of the 2 competing arms 140 | :param tracker: Tracker2 object 141 | :return: winner arm of a single dual in BESA 142 | """ 143 | i, j = indices[0], indices[1] 144 | r_i, r_j = tracker.rewards_arm[i], tracker.rewards_arm[j] 145 | ni, nj = tracker.Na[i], tracker.Na[j] 146 | idx_max = rd_argmax(np.array([ni, nj])) 147 | if idx_max == 1: 148 | r_j = rd_choice(np.array(r_j), size=int(ni)) 149 | else: 150 | r_i = rd_choice(np.array(r_i), size=int(nj)) 151 | return indices[rd_argmax(np.array([np.mean(r_i), np.mean(r_j)]))] 152 | 153 | def BESA_step(self, tracker): 154 | """ 155 | :param tracker: Tracker2 object 156 | :return: Implementation of the tournament in BESA 157 | """ 158 | indices = list(np.arange(self.nb_arms)) 159 | while len(indices) > 1: 160 | np.random.shuffle(indices) # Changement pour enlever le biais 161 | winners = [] 162 | if len(indices) % 2 == 1: 163 | winners.append(indices[-1]) 164 | for i in range(len(indices)//2): 165 | winners.append(self.BESA_duel((indices[2*i], indices[2*i+1]), tracker)) 166 | indices = winners 167 | return indices[0] 168 | 169 | def BESA(self, T, n0=1): 170 | """ 171 | Implementation of the BESA algorithm 172 | :param T: Time Horizon 173 | :param n0: Number of time to pull each arm before starting the algorithm 174 | :return: Tracker object with the results of the run 175 | """ 176 | tr = Tracker2(self.means, T, store_rewards_arm=True) 177 | for t in range(T): 178 | if t < self.nb_arms * n0: 179 | arm = t % self.nb_arms 180 | else: 181 | arm = self.BESA_step(tr) 182 | tr.update(t, arm, self.MAB[arm].sample()[0]) 183 | 
return tr 184 | 185 | def SSMC(self, T, explo_func=lambda x: np.sqrt(np.log(x))): 186 | """ 187 | Implementation of the SSMC algorithm 188 | :param T: Time Horizon 189 | :param explo_func: Forced exploration function 190 | :return: Tracker object with the results of the run 191 | """ 192 | tr = Tracker2(self.means, T, store_rewards_arm=True) 193 | r, t, l = 1, 0, -1 194 | while t < self.nb_arms: 195 | arm = t 196 | tr.update(t, arm, self.MAB[arm].sample()[0]) 197 | t += 1 198 | while t < T: 199 | l_prev = l 200 | l = get_leader(tr.Na, tr.Sa, l_prev) 201 | t_prev, forced_explo = t, explo_func(r) 202 | 203 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 204 | for j in range(self.nb_arms): 205 | if indic[j] == 0 and j != l: 206 | if l_prev == l: 207 | lead_min = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 208 | else: 209 | lead_min = rollavg_bottlneck(tr.rewards_arm[l], int(tr.Na[j]))[(int(tr.Na[j])-1):].min() 210 | if tr.Sa[j]/tr.Na[j] >= lead_min and t < T: 211 | indic[j] = 1 212 | if indic.sum() == 0: 213 | tr.update(t, l, self.MAB[l].sample()[0]) 214 | t += 1 215 | else: 216 | to_draw = np.where(indic == 1)[0] 217 | np.random.shuffle(to_draw) 218 | for i in to_draw: 219 | if t < T: 220 | tr.update(t, i, self.MAB[i].sample()[0]) 221 | t += 1 222 | r += 1 223 | return tr 224 | 225 | def SSMC_star(self, T, explo_func=default_exp): 226 | """ 227 | Implemention of SSMC*, a slightly modified version of SSMC 228 | :param T: Time Horizon 229 | :param explo_func: Forced Exploration function 230 | :return: Tracker object with the results of the run 231 | """ 232 | tr = Tracker2(self.means, T, store_rewards_arm=True) 233 | r, t, l = 1, 0, -1 234 | while t < self.nb_arms: 235 | arm = t 236 | tr.update(t, arm, self.MAB[arm].sample()[0]) 237 | t += 1 238 | while t < T: 239 | l_prev = l 240 | l = get_leader(tr.Na, tr.Sa, l_prev) 241 | t_prev, forced_explo = t, explo_func(r) 242 | all_reshape_size = np.zeros(self.nb_arms) 243 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
244 | for j in range(self.nb_arms): 245 | reshape_size = len(tr.rewards_arm[l]) // tr.Na[j] 246 | if indic[j] == 0 and j != l: 247 | if l_prev == l and reshape_size == all_reshape_size[j]: 248 | lead_min = np.inf 249 | elif l_prev == l: 250 | lead_min = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 251 | else: 252 | lead_min = get_SSMC_star_min(tr.rewards_arm[l], 253 | int(tr.Na[j]), int(reshape_size)) 254 | if tr.Sa[j]/tr.Na[j] >= lead_min and t < T: 255 | indic[j] = 1 256 | all_reshape_size[j] = reshape_size 257 | if indic.sum() == 0: 258 | tr.update(t, l, self.MAB[l].sample()[0]) 259 | t += 1 260 | else: 261 | to_draw = np.where(indic == 1)[0] 262 | np.random.shuffle(to_draw) 263 | for i in to_draw: 264 | if t < T: 265 | tr.update(t, i, self.MAB[i].sample()[0]) 266 | t += 1 267 | r += 1 268 | return tr 269 | 270 | def non_parametric_TS(self, T, upper_bound=1): 271 | """ 272 | Implementation of the Non-parametric Thompson Sampling algorithm 273 | :param T: Time Horizon 274 | :param upper_bound: Upper bound for the reward 275 | :return: Tracker object with the results of the run 276 | """ 277 | tr = Tracker2(self.means, T) 278 | if upper_bound is not None: 279 | X = [[upper_bound] for _ in range(self.nb_arms)] 280 | tr.Na = tr.Na + 1 281 | for t in range(T): 282 | V = np.zeros(self.nb_arms) 283 | for i in range(self.nb_arms): 284 | V[i] = np.inner(np.random.dirichlet(np.ones(int(tr.Na[i]))), np.array(X[i])) 285 | arm = rd_argmax(V) 286 | tr.update(t, arm, self.MAB[arm].sample()[0]) 287 | X[arm].append(tr.reward[t]) 288 | return tr 289 | 290 | def WR_SDA(self, T, explo_func=default_exp): 291 | """ 292 | Implementation of WR-SDA 293 | :param T: Time Horizon 294 | :param explo_func: Forced exploration function 295 | :return: Tracker object with the results of the run 296 | """ 297 | tr = Tracker2(self.means, T, store_rewards_arm=True) 298 | r, t, l = 1, 0, -1 299 | while t < self.nb_arms: 300 | arm = t 301 | tr.update(t, arm, self.MAB[arm].sample()[0]) 302 | t += 1 303 | while t < T: 304 | l_prev = l 305 | l = get_leader(tr.Na, tr.Sa, l_prev) 306 | t_prev, forced_explo = t, explo_func(r) 307 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 308 | for j in range(self.nb_arms): 309 | if indic[j] == 0 and j != l: 310 | if self.BESA_duel([l, j], tracker=tr) == j: 311 | indic[j] = 1 312 | if indic.sum() == 0: 313 | tr.update(t, l, self.MAB[l].sample()[0]) 314 | t += 1 315 | else: 316 | to_draw = np.where(indic == 1)[0] 317 | np.random.shuffle(to_draw) 318 | for i in to_draw: 319 | if t < T: 320 | tr.update(t, i, self.MAB[i].sample()[0]) 321 | t += 1 322 | r += 1 323 | return tr 324 | 325 | def RB_SDA(self, T, explo_func=default_exp): 326 | """ 327 | Implementation of RB-SDA 328 | :param T: Time Horizon 329 | :param explo_func: Forced exploration function 330 | :return: Tracker object with the results of the run 331 | """ 332 | tr = Tracker2(self.means, T, store_rewards_arm=True) 333 | r, t, l = 1, 0, -1 334 | while t < self.nb_arms: 335 | arm = t 336 | tr.update(t, arm, self.MAB[arm].sample()[0]) 337 | t += 1 338 | while t < T: 339 | l_prev = l 340 | l = get_leader(tr.Na, tr.Sa, l_prev) 341 | t_prev, forced_explo = t, explo_func(r) 342 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
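# Random Block sampler (RB-SDA): for each challenger j that is not already flagged and has fewer
# pulls than the leader, draw a uniformly random starting index tj and compare the challenger's
# empirical mean with the mean of the leader's block of Na[j] consecutive rewards starting at tj.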
343 | for j in range(self.nb_arms): 344 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 345 | tj = np.random.randint(tr.Na[l]-tr.Na[j]) 346 | lead_mean = np.mean(tr.rewards_arm[l][tj: tj+int(tr.Na[j])]) 347 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 348 | indic[j] = 1 349 | if indic.sum() == 0: 350 | tr.update(t, l, self.MAB[l].sample()[0]) 351 | t += 1 352 | else: 353 | to_draw = np.where(indic == 1)[0] 354 | np.random.shuffle(to_draw) 355 | for i in to_draw: 356 | if t < T: 357 | tr.update(t, i, self.MAB[i].sample()[0]) 358 | t += 1 359 | r += 1 360 | return tr 361 | 362 | def IB_SDA(self, T, explo_func=default_exp): 363 | """ 364 | Implementation of IB-SDA (Independent Blocks-SDA): an algorithm not introduced in the paper 365 | using a SWR sampler which discards elements that were previously drawn until there are no 366 | more available elements. It is a way to enforce the diversity of sample. 367 | We did not present this sampler as it is not an independent sampler. 368 | :param T: Time Horizon 369 | :param explo_func: Forced exploration function 370 | :return: Tracker object with the results of the run 371 | """ 372 | tr = Tracker2(self.means, T, store_rewards_arm=True) 373 | r, t, l = 1, 0, -1 374 | while t < self.nb_arms: 375 | arm = t 376 | tr.update(t, arm, self.MAB[arm].sample()[0]) 377 | t += 1 378 | while t < T: 379 | l_prev = l 380 | l = get_leader(tr.Na, tr.Sa, l_prev) 381 | t_prev, forced_explo = t, explo_func(r) 382 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 383 | if l_prev != l: 384 | weight_dic = np.ones((self.nb_arms, int(tr.Na[l]))) 385 | else: 386 | if weight_dic.shape[1] < tr.Na[l]: 387 | weight_dic = np.concatenate([weight_dic, 388 | np.ones((self.nb_arms, 1))], axis=1) 389 | for k in range(self.nb_arms): 390 | if k != l and weight_dic[k].sum() < tr.Na[k]: 391 | weight_dic[k] = np.ones(int(tr.Na[l])) 392 | for j in range(self.nb_arms): 393 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 394 | besa_indices = np.random.choice( 395 | np.arange(tr.Na[l]).astype('int'), size=int(tr.Na[j]), 396 | replace=False, p=weight_dic[j]/weight_dic[j].sum()) 397 | lead_mean = np.mean(np.array(tr.rewards_arm[l])[besa_indices]) 398 | weight_dic[j][besa_indices] = 0 399 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 400 | indic[j] = 1 401 | if indic.sum() == 0: 402 | tr.update(t, l, self.MAB[l].sample()[0]) 403 | t += 1 404 | else: 405 | to_draw = np.where(indic == 1)[0] 406 | np.random.shuffle(to_draw) 407 | for i in to_draw: 408 | if t < T: 409 | tr.update(t, i, self.MAB[i].sample()[0]) 410 | t += 1 411 | r += 1 412 | return tr 413 | 414 | def LB_SDA(self, T, explo_func=default_exp): 415 | """ 416 | Implementation of the LB-SDA algorithm 417 | :param T: Time Horizon 418 | :param explo_func: Forced exploration function 419 | :return: Tracker object with the results of the run 420 | """ 421 | tr = Tracker2(self.means, T, store_rewards_arm=True) 422 | r, t, l = 1, 0, -1 423 | while t < self.nb_arms: 424 | arm = t 425 | tr.update(t, arm, self.MAB[arm].sample()[0]) 426 | t += 1 427 | while t < T: 428 | l_prev = l 429 | l = get_leader(tr.Na, tr.Sa, l_prev) 430 | t_prev, forced_explo = t, explo_func(r) 431 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 
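# Last Block sampler (LB-SDA): each challenger j with fewer pulls than the leader is compared
# with the mean of the leader's Na[j] most recent rewards, and is flagged for drawing if its own
# empirical mean is at least as large.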
432 | for j in range(self.nb_arms): 433 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 434 | lead_mean = np.mean(tr.rewards_arm[l][-int(tr.Na[j]):]) 435 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 436 | indic[j] = 1 437 | if indic.sum() == 0: 438 | tr.update(t, l, self.MAB[l].sample()[0]) 439 | t += 1 440 | else: 441 | to_draw = np.where(indic == 1)[0] 442 | np.random.shuffle(to_draw) 443 | for i in to_draw: 444 | if t < T: 445 | tr.update(t, i, self.MAB[i].sample()[0]) 446 | t += 1 447 | r += 1 448 | return tr 449 | 450 | def LDS_SDA(self, T, explo_func=default_exp): 451 | """ 452 | Implementation of the LDS-SDA algorithm using a Sobol sequence 453 | :param T: Time Horizon 454 | :param explo_func: Forced exploration function 455 | :return: Tracker object with the results of the run 456 | """ 457 | tr = Tracker2(self.means, T, store_rewards_arm=True) 458 | r, t, l = 1, 0, -1 459 | while t < self.nb_arms: 460 | arm = t 461 | tr.update(t, arm, self.MAB[arm].sample()[0]) 462 | t += 1 463 | while t < T: 464 | l_prev = l 465 | l = get_leader(tr.Na, tr.Sa, l_prev) 466 | t_prev, forced_explo = t, explo_func(r) 467 | indic = (tr.Na < tr.Na[l]) * (tr.Na < forced_explo) * 1. 468 | t_b = int(sobol_seq.i4_sobol(1, seed=r)[0][0]*tr.Na[l]) 469 | for j in range(self.nb_arms): 470 | if indic[j] == 0 and j != l and tr.Na[j] < tr.Na[l]: 471 | b_0 = tr.rewards_arm[l][t_b:t_b+int(tr.Na[j])] 472 | if len(b_0) < tr.Na[j]: 473 | b_1 = tr.rewards_arm[l][:int(tr.Na[j])-int(tr.Na[l]-t_b)] 474 | lead_mean = (np.sum(b_0)+np.sum(b_1))/tr.Na[j] 475 | else: 476 | lead_mean = np.mean(b_0) 477 | if tr.Sa[j]/tr.Na[j] >= lead_mean and t < T: 478 | indic[j] = 1 479 | if indic.sum() == 0: 480 | tr.update(t, l, self.MAB[l].sample()[0]) 481 | t += 1 482 | else: 483 | to_draw = np.where(indic == 1)[0] 484 | np.random.shuffle(to_draw) 485 | for i in to_draw: 486 | if t < T: 487 | tr.update(t, i, self.MAB[i].sample()[0]) 488 | t += 1 489 | r += 1 490 | return tr 491 | 492 | def vanilla_bootstrap(self, T): 493 | """ 494 | Implementation of the Vanilla Bootstrap bandit algorithm 495 | :param T: Time Horizon 496 | :return: Tracker object with the results of the run 497 | """ 498 | tr = Tracker2(self.means, T, store_rewards_arm=True) 499 | for t in range(T): 500 | if t < self.nb_arms: 501 | arm = t 502 | else: 503 | bts_mean = np.zeros(self.nb_arms) 504 | for k in range(self.nb_arms): 505 | bts_mean[k] = np.random.choice(tr.rewards_arm[k], size=int(tr.Na[k]), replace=True).mean() 506 | arm = rd_argmax(bts_mean) 507 | reward = self.MAB[arm].sample()[0] 508 | tr.update(t, arm, reward) 509 | return tr 510 | 511 | def PHE(self, T, a, distrib): 512 | """ 513 | Implementation of the Perturbed History Exploration algorithm 514 | :param T: Time Horizon 515 | :param a: proportion of perturbed history. 
a=1 -> same proportion, a=0-> no perturbed history 516 | :param distrib: Distribution of the fake rewards 517 | :return: Tracker2 object 518 | """ 519 | tr = Tracker2(self.means, T, store_rewards_arm=True) 520 | for t in range(T): 521 | if t < self.nb_arms: 522 | arm = t 523 | else: 524 | idx_mean = np.zeros(self.nb_arms) 525 | for k in range(self.nb_arms): 526 | ph = distrib.rvs(size=np.int(a*tr.Na[k])+1) 527 | idx_mean[k] = (tr.Sa[k]+ph.sum())/(tr.Na[k]+np.int(a*tr.Na[k])+1) 528 | arm = rd_argmax(idx_mean) 529 | reward = self.MAB[arm].sample()[0] 530 | tr.update(t, arm, reward) 531 | return tr 532 | 533 | def ReBoot(self, T, sigma, weight_func=np.random.normal): 534 | """ 535 | Implementation of the Reboot algorithm 536 | :param T: Time Horizon 537 | :param sigma: sigma and -sigma are added to the rewards list before bootstrapping 538 | :param weight_func: a function of mean 0 and std 1 539 | :return: Tracker2 object 540 | """ 541 | def index_func(x): 542 | avg = x.Sa/x.Na 543 | idx = np.zeros(self.nb_arms) 544 | for k in range(self.nb_arms): 545 | s = int(x.Na[k]) + 2 546 | e = np.zeros(s) 547 | e[:-2] = np.array(x.rewards_arm[k])-avg[k] 548 | e[-2] = np.sqrt(s) * sigma 549 | e[-1] = -np.sqrt(s) * sigma 550 | w = weight_func(size=s) 551 | idx[k] = avg[k]+np.mean(w*e) 552 | return idx 553 | return self.Index_Policy(T, index_func, store_rewards_arm=True) 554 | 555 | def ReBootG(self, T, sigma): 556 | """ 557 | More efficient version of ReBoot with the gaussian bootstrap 558 | :param T: Time Horizon 559 | :param sigma: standard deviation of perturbation 560 | :return: Tracker2 object 561 | """ 562 | def index_func(x): 563 | avg = x.Sa/x.Na 564 | idx = np.zeros(self.nb_arms) 565 | for k in range(self.nb_arms): 566 | s = int(x.Na[k]) + 2 567 | e = np.zeros(s) 568 | e[:-2] = np.array(x.rewards_arm[k])-avg[k] 569 | e[-2] = np.sqrt(s) * sigma 570 | e[-1] = -np.sqrt(s) * sigma 571 | idx[k] = avg[k]+np.random.normal(loc=0, scale=1/(e.shape[0])*np.sqrt((e**2).sum())) 572 | return idx 573 | return self.Index_Policy(T, index_func, store_rewards_arm=True) 574 | 575 | def IMED(self, T): 576 | """ 577 | Implementation of the IMED algorithm 578 | :param T: Time Horizon 579 | :return: Tracker2 object 580 | """ 581 | def index_func(x): 582 | mu_max = np.max(x.Sa/x.Na) 583 | idx = [] 584 | for k in range(self.nb_arms): 585 | idx.append(x.Na[k]*self.kl(x.Sa[k]/x.Na[k], mu_max)+np.log(x.Na[k])) 586 | return -np.array(idx) 587 | return self.Index_Policy(T, index_func) 588 | 589 | def Bootstrapped_TS(self, T, prior, M): 590 | """ 591 | Implementation of the Bootstrapped Thompson Sampling (Osband et al., 2017) 592 | :param T: Time Horizon 593 | :param prior: prior for the fake history 594 | :param M: number of fake samples at each step 595 | :return: Tracker2 object 596 | """ 597 | # éventuellement rajouter l'algo de bootstrap en param. 
For now: with replacement 598 | def index_func(x): 599 | idx = [] 600 | for k in range(self.nb_arms): 601 | artificial_hist = list(prior(size=M)) 602 | n_tot = int(M + x.Na[k]) 603 | bts_sample = np.random.choice(x.rewards_arm[k]+artificial_hist, replace=True, size=n_tot) 604 | idx.append(np.mean(bts_sample)) 605 | return np.array(idx) 606 | return self.Index_Policy(T, index_func, store_rewards_arm=True) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sub-Sampling Dueling Algorithms, NeurIPS 2020 2 | 3 | This repository contains the code associated with the paper 4 | "Sub-sampling for efficient Non-Parametric Bandit Exploration" presented at NeurIPS 2020. We provide a description of the code structure and a short guide to run some experiments. 5 | 6 | ## How to run experiments 7 | 8 | The *__main__.py* file contains different blocks of code that can be directly executed. This file relies on *xp_helpers.py*, which contains functions to 9 | run two types of experiments: 10 | * Frequentist experiments: the user defines a bandit model and performs a number of runs 11 | of each algorithm on this particular model 12 | * Bayesian experiments: the user defines a prior distribution on the bandit model and draws a number of problems from this distribution. 13 | Then, each bandit algorithm runs once on each of these problems. 14 | 15 | The file is divided into three blocks. The __xp_type__ parameter selects which block to run. Several examples are proposed in each block. 16 | 17 | ## Code Structure 18 | 19 | ### Bandit algorithms 20 | 21 | Our implementation of the multi-armed bandit problem has its core structure in the *MAB.py* file. The initialization of a bandit relies on the *arms.py* file, which defines objects representing the arms and their properties (mean, how to sample rewards, etc.). 22 | 23 | The __GenericMAB__ object is designed as a parent class for any bandit model. Several algorithms are already implemented in this class when they do not have to be calibrated for specific distributions. The function __MC_regret__ runs a single bandit algorithm for a given number of runs and time horizon and returns the average regret. 24 | 25 | The classes in __BernoulliMAB.py__, __GaussianMAB.py__, __ExponentialMAB.py__ and __Trunc_GaussianMAB.py__ 26 | inherit from __GenericMAB__ and adapt it to the Bernoulli, Gaussian, Exponential and Truncated Gaussian distributions. In particular, 27 | they contain the algorithms that are specific to the arms' distribution family, as well as optimized versions of algorithms that are already in __GenericMAB__ (for instance for Bernoulli arms). 28 | 29 | ### Helpers 30 | 31 | The __Tracker2__ object defined in *tracker.py* is used in all of our bandit algorithms to store the state of an experiment during a run. 32 | In particular, it stores the number of pulls, the cumulative reward and the reward history of each arm. 33 | 34 | *utils.py* contains several functions that are useful in the bandit algorithms. Some of these functions use the *numba* package for faster computation. 35 | 36 | Finally, *xp_helpers.py* provides useful functions to perform large-scale experiments in the frequentist and Bayesian settings. Some of these functions use libraries that allow multiprocessing for parallel computation.
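
## Minimal example

As a quick sketch (the arm means and horizon below are just illustrative values, and the empty `param_dic` matches the default parameters used for RB-SDA in *__main__.py*), a single frequentist experiment can be run directly:

```python
import numpy as np
from BernoulliMAB import BetaBernoulliMAB

# Two-armed Bernoulli bandit with means 0.9 and 0.8 (experiment 'xp1' in __main__.py)
model = BetaBernoulliMAB([0.9, 0.8])
# Average regret of RB-SDA over N=100 independent runs with horizon T=1000
avg_regret = model.MC_regret(method='RB_SDA', N=100, T=1000, param_dic={})
print(avg_regret[-1])             # estimated cumulative regret at time T
print(model.Cp * np.log(1000))    # asymptotic (Burnetas-Katehakis) lower bound, for comparison
```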
37 | -------------------------------------------------------------------------------- /Trunc_GaussianMAB.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | from MAB import * 3 | from scipy.stats import norm 4 | from tracker import Tracker2 5 | import numpy as np 6 | 7 | 8 | class TruncGaussianMAB(GenericMAB): 9 | """ 10 | Truncated Gaussian Bandit Problem 11 | """ 12 | def __init__(self, p): 13 | """ 14 | Initialization 15 | :param p: np.array, true values of (mu, scale) of the underlying Gaussian for each arm 16 | """ 17 | # Initialization of arms from GenericMAB 18 | super().__init__(methods=['TG']*len(p), p=p) 19 | self.best_arm = self.get_best_arm() 20 | self.Cp = self.get_complexity() 21 | 22 | def get_complexity(self): 23 | """ 24 | :return: the constant in the Burnetas and Katehakis lower bound for TG arms 25 | """ 26 | Cp = 0 27 | for arm in self.MAB: 28 | if arm.mean != self.mu_max: 29 | gap = self.mu_max-arm.mean 30 | kl = self.KL_tg(arm.mu, self.MAB[self.best_arm].mu, arm.scale) 31 | Cp += gap/kl 32 | return Cp 33 | 34 | @staticmethod 35 | def KL_tg(mu1, mu2, scale, step=1e-6): 36 | """ 37 | :param mu1: mean of underlying Gaussian r.v of arm 1 38 | :param mu2: mean of underlying Gaussian r.v of arm 2 39 | :param scale: scale of underlying Gaussian r.v 40 | :param step: precision of numerical integration 41 | :return: KL divergence of two TG arms 42 | """ 43 | phi01 = norm.cdf(0, loc=mu1, scale=scale) 44 | phi02 = norm.cdf(0, loc=mu2, scale=scale) 45 | phi11 = 1 - norm.cdf(1, loc=mu1, scale=scale) 46 | phi12 = 1 - norm.cdf(1, loc=mu2, scale=scale) 47 | kl_1 = phi01 * np.log(phi01 / phi02) + phi11 * np.log(phi11 / phi12) 48 | X = np.arange(0, 1, step) 49 | kl_2 = (norm.pdf(X, loc=mu1, scale=scale) * np.log(norm.pdf( 50 | X, loc=mu1, scale=scale) / norm.pdf(X, loc=mu2, scale=scale))).mean() 51 | return kl_1 + kl_2 52 | 53 | def get_best_arm(self): 54 | """ 55 | :return: best arm of the bandit problem 56 | """ 57 | ind = np.nonzero(self.means == np.amax(self.means))[0] 58 | std = [self.MAB[arm].scale for arm in ind] 59 | u = np.argmin(std) 60 | return ind[u] 61 | 62 | def PHE(self, T, a, distrib=None): 63 | """ 64 | Optimized version of PHE for TG arms 65 | :param T: Time Horizon 66 | :param a: proportion of perturbed history. 
a=1 -> same proportion, a=0-> no perturbed history 67 | :param distrib: distribution of the perturbed history 68 | :return: 69 | """ 70 | tr = Tracker2(self.means, T, store_rewards_arm=True) 71 | for t in range(T): 72 | if t < self.nb_arms: 73 | arm = t 74 | else: 75 | idx_mean = np.zeros(self.nb_arms) 76 | for k in range(self.nb_arms): 77 | ph = np.random.binomial(n=np.int(a*tr.Na[k]), p=0.5) 78 | idx_mean[k] = (tr.Sa[k]+ph)/(tr.Na[k]+np.int(a*tr.Na[k])) 79 | arm = rd_argmax(idx_mean) 80 | reward = self.MAB[arm].sample()[0] 81 | tr.update(t, arm, reward) 82 | return tr 83 | 84 | def TS(self, T): 85 | """ 86 | Implementation of Thompson Sampling with a Binarization trick 87 | :param T: Time Horizon 88 | :return: Tracker2 object 89 | """ 90 | def f(S, N): 91 | return np.random.beta(S+1, N-S+1) 92 | tr = Tracker2(self.means, T) 93 | bin_Sa = np.zeros(self.nb_arms) 94 | for t in range(T): 95 | if t < self.nb_arms: 96 | arm = t % self.nb_arms 97 | else: 98 | arm = rd_argmax(f(bin_Sa, tr.Na)) 99 | reward = self.MAB[arm].sample()[0] 100 | bin_Sa[arm] += np.random.binomial(n=1, p=reward) 101 | tr.update(t, arm, reward) 102 | return tr 103 | 104 | def IMED(self, T): 105 | """ 106 | Implementation of IMED with a binarization trick 107 | :param T: 108 | :return: 109 | """ 110 | def kl_ber(x, y): 111 | if x == y: 112 | return 0 113 | elif x > 1 - 1e-6: 114 | return 0 115 | elif y == 0 or y == 1: 116 | return np.inf 117 | elif x < 1e-6: 118 | return (1 - x) * np.log((1 - x) / (1 - y)) 119 | return x * np.log(x / y) + (1 - x) * np.log((1 - x) / (1 - y)) 120 | 121 | def index_func(bin_Sa, x): 122 | mu_max = np.max(bin_Sa/x.Na) 123 | idx = [] 124 | for k in range(self.nb_arms): 125 | idx.append(x.Na[k]*kl_ber(bin_Sa[k]/x.Na[k], mu_max)+np.log(x.Na[k])) 126 | return -np.array(idx) 127 | tr = Tracker2(self.means, T) 128 | bin_Sa = np.zeros(self.nb_arms) 129 | for t in range(T): 130 | if t < self.nb_arms: 131 | arm = t % self.nb_arms 132 | else: 133 | arm = rd_argmax(index_func(bin_Sa, tr)) 134 | reward = self.MAB[arm].sample()[0] 135 | bin_Sa[arm] += np.random.binomial(n=1, p=reward) 136 | tr.update(t, arm, reward) 137 | return tr -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | from BernoulliMAB import BetaBernoulliMAB 2 | from GaussianMAB import GaussianMAB 3 | from ExponentialMAB import ExponentialMAB 4 | from Trunc_GaussianMAB import TruncGaussianMAB 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from xp_helpers import multiprocess_MC, Bayesian_MC_regret, Bayesian_multiprocess_MC 9 | from time import time 10 | import pickle as pkl 11 | import os 12 | 13 | results_path = "xp_results" 14 | 15 | xp_type = "test" 16 | #xp_type = "frequentist" 17 | #xp_type = "bayesian" 18 | 19 | # Enter the parameters of each algorithms 20 | param = {'SSMC': {}, 'WR_SDA': {}, 'RB_SDA': {}, 'BESA': {}, 21 | 'TS': {}, 'kl_ucb': {'f': np.log}, 'PHE': {'a': 1.1}, 'vanilla_bootstrap': {}, 'LB_SDA': {}, 22 | 'LDS_SDA': {}, 'non_parametric_TS': {}, 'IMED': {}, 'Bootstrapped_TS': {'M': 10, 'prior': np.random.random}, 23 | 'ReBoot': {'sigma': 1}, 'ReBootG': {'sigma': 2.}} 24 | 25 | if __name__ == '__main__' and xp_type == "frequentist": 26 | # Settings of the experiments of the paper 27 | xp_bernoulli = {'xp1': [0.9, 0.8], 'xp2': [0.5, 0.6], 'xp3': [0.01]*3+[0.03]*3+[0.1]+[0.05]*3, 28 | 'xp4': [0.85]*7+[0.9]} 29 | xp_gaussian = {'xp1': [[0., 1.], [0.5, 1.]], 
'xp2': [[0., 1.], [0., 1.], [0., 1.], [0.5, 1.]], 30 | 'xp3': [[0., 1.], [0.5, 1.], [1.0, 1.], [1.5, 1.]]} 31 | xp_expo = {'xp1': [1, 1.5], 'xp2': [0.1, 0.2], 'xp3': [10, 11], 'xp4': [1, 2, 3, 4], 32 | 'xp5': [0.1, 0.2, 0.3, 0.4], 'xp6': [4, 4, 4, 5]} 33 | xp_TG = {'xp1': [[0.5, 0.1], [0.6, 0.1]], 'xp2': [[0., 0.3], [0.2, 0.3]], 34 | 'xp3': [[1.5, 1.], [2., 1.]], 'xp4': [[0.4, 1.], [0.5, 1.], [0.6, 1.], [0.7, 1.]]} 35 | xp_settings = {'TG': xp_TG, 'G': xp_gaussian, 'Exp': xp_expo, 'B': xp_bernoulli} 36 | 37 | # General Parameters 38 | algs = ['TS', 'RB_SDA', 'WR_SDA', 'LB_SDA', 'BESA', 'SSMC'] # Select some Algorithms (check param file for availability) 39 | T, N = 1000, 100 # Time Horizon and Number of runs 40 | step = 25 # If results are saved trajectories are stored for all rounds such that t%step=0 41 | 42 | # Run 43 | xp_family = 'B' 44 | # xp = [(xp_family, x) for x in xp_settings[xp_family]] # To run all xp defined for a family 45 | xp = [('B', 'xp1'), ('G', 'xp2'), ('Exp', 'xp3'), ('TG', 'xp4')] # To run any subset of xp 46 | for x in xp: 47 | caption = x[1] + '_' + x[0] + str(int(np.random.uniform() * 1e6)) # Name of the results file 48 | print(caption) # caption=None allow to avoid saving the results 49 | res, traj = multiprocess_MC((x[0], xp_settings[x[0]][x[1]], T, N, 50 | algs, param, step), plot=True, pickle_path=results_path, caption=caption) 51 | 52 | 53 | if __name__ == "__main__" and xp_type == "bayesian": 54 | # Possible Samplers for the experiments 55 | def sp_xp_B(size): # Generate means uniformly in [0, 1] 56 | return np.random.uniform(0., 1., size=size) 57 | 58 | def sp_xp_G(size): # Generate means with a gaussian distribution and add a variance param of 1 59 | a = np.random.normal(0., 1., size=size) 60 | return [[x, 1] for x in a] 61 | 62 | # General Parameters 63 | n_arms = 3 # number of arms 64 | bandit = GaussianMAB # bandit model (object inherited from class MAB) 65 | xp_sampler = sp_xp_G # function to generate the problems 66 | N, T = 100, 2000 # Number of problems generate and Time Horizon of each run 67 | step = 100 68 | algs = ['TS', 'BESA', 'RB_SDA', 'WR_SDA', 'SSMC'] 69 | args = (bandit, algs, n_arms, N, T, param, xp_sampler, step) # do not modify 70 | caption = 'Gaussian_' + str(np.random.randint(1e5)) 71 | b = Bayesian_multiprocess_MC(args, pickle_path=results_path, caption=caption) 72 | 73 | if __name__ == "__main__" and xp_type == "test": 74 | # # Test anything here 75 | # model = TruncGaussianMAB([[0.5, 1], [0.6, 1]]) 76 | # print(model.MC_regret(method='RB_SDA', N=500, T=20000, param_dic=param['WR_SDA'])) 77 | # 78 | # # Example to load the pickles and read the results 79 | # name = 'xp1_B161845.pkl' 80 | # res = pkl.load(open(os.path.join(results_path, name), 'rb')) 81 | # print(res['info']) # parameters dic 82 | # # Working with every runs of some algorithm 83 | # print(res['trajectories']['BESA'][-1].mean(), res['trajectories']['BESA'][-1].std()) 84 | # # The average regret dataframe 85 | # res['df_regret'].plot() 86 | # plt.show() 87 | model = GaussianMAB([[0., 1], [0, 1]]) 88 | res = model.RB_SDA(T=2000) 89 | n = np.array([np.cumsum(res.arm_sequence == i) for i in range(2)]).T 90 | l = np.argmax(n, axis=1) 91 | count_change = np.sum([l[i] != l[i-1] for i in range(1, len(l))]) 92 | count_l_draw = np.sum([np.sum((l==i)*0) for i in range(2)]) 93 | rw = [np.cumsum(x)/(np.arange(len(x))+1) for x in res.rewards_arm] 94 | print(res) 95 | -------------------------------------------------------------------------------- /arms.py: 
-------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | import numpy as np 3 | from numba import jit 4 | from scipy.stats import truncnorm as trunc_norm 5 | from utils import convert_tg_mean 6 | 7 | 8 | class AbstractArm(object): 9 | def __init__(self, mean, variance, random_state): 10 | """ 11 | :param mean: float, expectation of the arm 12 | :param variance: float, variance of the arm 13 | :param random_state: int, seed to make experiments reproducible 14 | """ 15 | self.mean = mean 16 | self.variance = variance 17 | self.local_random = np.random.RandomState(random_state) 18 | 19 | def sample(self): 20 | pass 21 | 22 | 23 | class ArmBernoulli(AbstractArm): 24 | def __init__(self, p, random_state=0): 25 | """ 26 | :param p: float, mean parameter 27 | :param random_state: int, seed to make experiments reproducible 28 | """ 29 | self.p = p 30 | super(ArmBernoulli, self).__init__(mean=p, 31 | variance=p * (1. - p), 32 | random_state=random_state) 33 | 34 | def sample(self): 35 | """ 36 | Sampling strategy 37 | :return: float, a sample from the arm 38 | """ 39 | return (self.local_random.rand(1) < self.p)*1. 40 | 41 | 42 | class ArmBeta(AbstractArm): 43 | def __init__(self, a, b, random_state=0): 44 | """ 45 | :param a: int, alpha coefficient in beta distribution 46 | :param b: int, beta coefficient in beta distribution 47 | :param random_state: int, seed to make experiments reproducible 48 | """ 49 | self.a = a 50 | self.b = b 51 | super(ArmBeta, self).__init__(mean=a/(a + b), 52 | variance=(a * b)/((a + b) ** 2 * (a + b + 1)), 53 | random_state=random_state) 54 | 55 | def sample(self): 56 | """ 57 | Sampling strategy 58 | :return: float, a sample from the arm 59 | """ 60 | return self.local_random.beta(self.a, self.b, 1) 61 | 62 | 63 | class ArmGaussian(AbstractArm): 64 | def __init__(self, mu, eta, random_state=0): 65 | """ 66 | :param mu: float, mean parameter in gaussian distribution 67 | :param eta: float, std parameter in gaussian distribution 68 | :param random_state: int, seed to make experiments reproducible 69 | """ 70 | self.mu = mu 71 | self.eta = eta 72 | super(ArmGaussian, self).__init__(mean=mu, 73 | variance=eta**2, 74 | random_state=random_state) 75 | 76 | def sample(self): 77 | """ 78 | Sampling strategy 79 | :return: float, a sample from the arm 80 | """ 81 | return self.local_random.normal(self.mu, self.eta, 1) 82 | 83 | 84 | class ArmFinite(AbstractArm): 85 | def __init__(self, X, P, random_state=0): 86 | """ 87 | :param X: np.array, support of the distribution 88 | :param P: np.array, associated probabilities 89 | :param random_state: int, seed to make experiments reproducible 90 | """ 91 | self.X = X 92 | self.P = P 93 | mean = np.sum(X * P) 94 | super(ArmFinite, self).__init__(mean=mean, 95 | variance=np.sum(X ** 2 * P) - mean ** 2, 96 | random_state=random_state) 97 | 98 | def sample(self): 99 | """ 100 | Sampling strategy for an arm with a finite support and the associated probability distribution 101 | :return: float, a sample from the arm 102 | """ 103 | i = self.local_random.choice(len(self.P), size=1, p=self.P) 104 | reward = self.X[i] 105 | return reward 106 | 107 | 108 | class ArmExponential(AbstractArm): 109 | def __init__(self, p, random_state=0): 110 | """ 111 | :param mu: float, mean parameter in gaussian distribution 112 | :param eta: float, std parameter in gaussian distribution 113 | :param random_state: int, seed to make experiments reproducible 114 | """ 115 | self.p = p 116 | 
super(ArmExponential, self).__init__(mean=p, 117 | variance=p**2, 118 | random_state=random_state) 119 | 120 | def sample(self): 121 | """ 122 | Sampling strategy 123 | :return: float, a sample from the arm 124 | """ 125 | return self.local_random.exponential(self.p, 1) 126 | 127 | 128 | class dirac(): 129 | def __init__(self, c, random_state): 130 | """ 131 | :param mean: float, expectation of the arm 132 | :param variance: float, variance of the arm 133 | :param random_state: int, seed to make experiments reproducible 134 | """ 135 | self.mean = c 136 | self.variance = 0 137 | self.local_random = np.random.RandomState(random_state) 138 | 139 | def sample(self): 140 | return [self.mean] 141 | 142 | 143 | class ArmTG(AbstractArm): 144 | def __init__(self, mu, scale, random_state=0): 145 | """ 146 | :param mu: mean 147 | :param random_state: int, seed to make experiments reproducible 148 | """ 149 | self.mu = mu 150 | self.scale = scale 151 | self.dist = trunc_norm(-mu/scale, b=(1-mu)/scale, loc=mu, scale=scale) 152 | self.dist.random_state = random_state 153 | super(ArmTG, self).__init__(mean=convert_tg_mean(mu, scale), variance=scale**2, 154 | random_state=random_state) 155 | 156 | def sample(self): 157 | """ 158 | Sampling strategy 159 | :return: float, a sample from the arm 160 | """ 161 | x = self.local_random.normal(self.mu, self.scale, 1) 162 | return x * (x > 0) * (x < 1) + (x > 1) 163 | -------------------------------------------------------------------------------- /paper/SDA_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DBaudry/Sub-Sampling-Dueling-Algorithms-Neurips20/c362488a273a01d41ac517b2a8e1ca8899647ad6/paper/SDA_final.pdf -------------------------------------------------------------------------------- /tracker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Tracker2: 4 | """ 5 | This object is used in bandit models to store useful quantities to run the algorithm and report the experiment. 
6 | """ 7 | def __init__(self, means, T, store_rewards_arm=False): 8 | self.means = means 9 | self.nb_arms = means.shape[0] 10 | self.T = T 11 | self.Sa = np.zeros(self.nb_arms) 12 | self.Na = np.zeros(self.nb_arms) 13 | self.reward = np.zeros(self.T) 14 | self.arm_sequence = np.empty(self.T, dtype=int) 15 | self.t = 0 16 | self.store_rewards_arm = store_rewards_arm 17 | if store_rewards_arm: 18 | self.rewards_arm = [[] for _ in range(self.nb_arms)] 19 | 20 | def reset(self): 21 | """ 22 | Initialization of quantities of interest used for all methods 23 | :param T: int, time horizon 24 | :return: - Sa: np.array, cumulative reward of arm a 25 | - Na: np.array, number of times arm a has been pulled 26 | - reward: np.array, rewards 27 | - arm_sequence: np.array, arm chose at each step 28 | """ 29 | self.Sa = np.zeros(self.nb_arms) 30 | self.Na = np.zeros(self.nb_arms) 31 | self.reward = np.zeros(self.T) 32 | self.arm_sequence = np.zeros(self.T, dtype=int) 33 | self.rewards_arm = [[]]*self.nb_arms 34 | if self.store_rewards_arm: 35 | self.rewards_arm = [[] for _ in range(self.nb_arms)] 36 | 37 | def update(self, t, arm, reward): 38 | """ 39 | Update all the parameters of interest after choosing the correct arm 40 | :param t: int, current time/round 41 | :param arm: int, arm chose at this round 42 | :param Sa: np.array, cumulative reward array up to time t-1 43 | :param Na: np.array, number of times arm has been pulled up to time t-1 44 | :param reward: np.array, rewards obtained with the policy up to time t-1 45 | :param arm_sequence: np.array, arm chose at each step up to time t-1 46 | """ 47 | self.Na[arm] += 1 48 | self.arm_sequence[t] = arm 49 | self.reward[t] = reward 50 | self.Sa[arm] += reward 51 | self.t = t 52 | if self.store_rewards_arm: 53 | self.rewards_arm[arm].append(reward) 54 | 55 | def regret(self): 56 | """ 57 | Compute the regret of a single experiment 58 | :param reward: np.array, the array of reward obtained from the policy up to time T 59 | :param T: int, time horizon 60 | :return: np.array, cumulative regret for a single experiment 61 | """ 62 | return self.means.max() * np.arange(1, self.T + 1) - np.cumsum(np.array(self.means)[self.arm_sequence]) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ Packages import """ 2 | import numpy as np 3 | from numba import jit 4 | import bottleneck as bn 5 | import scipy.stats as sc 6 | 7 | @jit(nopython=True) 8 | def rd_argmax(vector): 9 | """ 10 | Compute random among eligible maximum indices 11 | :param vector: np.array 12 | :return: int, random index among eligible maximum indices 13 | """ 14 | m = np.amax(vector) 15 | indices = np.nonzero(vector == m)[0] 16 | return np.random.choice(indices) 17 | 18 | 19 | @jit(nopython=True) 20 | def rd_choice(vec, size): 21 | """ 22 | jit version of np.random.choice (slightly improve the computation time) 23 | """ 24 | return np.random.choice(vec, size=size, replace=False) 25 | 26 | 27 | @jit(nopython=True) 28 | def hypergeom_sample(s1, n1, n2): 29 | """ 30 | jit version of np.random.choice (slightly improve the computation time) 31 | """ 32 | return np.random.hypergeometric(s1, n1 - s1, nsample=n2) 33 | 34 | 35 | def rollavg_bottlneck(a, n): 36 | """ 37 | :param a: array 38 | :param n: window of the rolling average 39 | :return: A fast function for computing moving averages 40 | """ 41 | return bn.move_mean(a, window=n, min_count=n) 42 | 43 | 44 | 
@jit(nopython=True) 45 | def get_leader(Na, Sa, l_prev): 46 | """ 47 | :param Na: Number of pulls of each arm (array) 48 | :param Sa: Sum of rewards of each arm (array) 49 | :param l_prev: previous leader 50 | :return: Leader for SSMC and SDA algorithms 51 | """ 52 | m = np.amax(Na) 53 | n_argmax = np.nonzero(Na == m)[0] 54 | if n_argmax.shape[0] == 1: 55 | l = n_argmax[0] 56 | return l 57 | else: 58 | s_max = Sa[n_argmax].max() 59 | s_argmax = np.nonzero(Sa[n_argmax] == s_max)[0] 60 | if np.nonzero(n_argmax[s_argmax] == l_prev)[0].shape[0] > 0: 61 | return l_prev 62 | return n_argmax[np.random.choice(s_argmax)] 63 | 64 | 65 | def get_SSMC_star_min(rewards_l, n_challenger, reshape_size): 66 | """ 67 | little helper for SSMC* 68 | """ 69 | return (np.array(rewards_l)[:n_challenger * reshape_size].reshape( 70 | (reshape_size, n_challenger))).mean(axis=1).min() 71 | 72 | 73 | def convert_tg_mean(mu, scale, step=1e-7): 74 | """ 75 | :param mu: mean of the underlying gaussian r.v 76 | :param scale: scale of the underlying gaussian r.v 77 | :param step: precision of the numerical integration 78 | :return: compute the mean of the Truncated Gaussian r.v knowing the parameters of its 79 | associated Gaussian r.v 80 | """ 81 | X = np.arange(0, 1, step) 82 | return (X * sc.norm.pdf(X, loc=mu, scale=scale)).mean()+ 1 - sc.norm.cdf(1, loc=mu, scale=scale) -------------------------------------------------------------------------------- /xp_helpers.py: -------------------------------------------------------------------------------- 1 | from joblib import Parallel, delayed 2 | import multiprocessing as mp 3 | import numpy as np 4 | import pandas as pd 5 | import pickle as pkl 6 | from time import time 7 | import os 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from GaussianMAB import GaussianMAB 11 | from BernoulliMAB import BetaBernoulliMAB 12 | from ExponentialMAB import ExponentialMAB 13 | from Trunc_GaussianMAB import TruncGaussianMAB 14 | from tqdm import tqdm 15 | 16 | mapping = {'B': BetaBernoulliMAB, 'G': GaussianMAB, 'Exp': ExponentialMAB, 'TG': TruncGaussianMAB} 17 | mapping_name = {'B': 'Bernoulli', 'G': 'Gaussian', 'Exp': 'Exponential', 'TG': 'Truncated Gaussian'} 18 | 19 | def MC_xp(args, plot=False, pickle_path=None, caption='xp'): 20 | """ 21 | :param args: parameters of the experiment 22 | :param plot: Boolean, plot the average regret if True 23 | :param pickle_path: if not None, path to store the results 24 | :param caption: name of the file if pickle_path is not None 25 | :return: average regret, dict with all trajectories 26 | """ 27 | bandit, p, T, n_xp, methods, param, store_step = args 28 | model = mapping[bandit](p) 29 | all_r = [] 30 | all_traj = {} 31 | for x in methods: 32 | r, traj = model.MC_regret(x, n_xp, T, param[x], store_step) 33 | all_r.append(r) 34 | all_traj[x] = traj 35 | all_r.append(model.Cp*np.log(1+np.arange(T))) 36 | df_r = pd.DataFrame(all_r).T 37 | df_r.columns = methods + ['lower bound'] 38 | df_r['lower bound'].iloc[0] = 0 39 | if plot: 40 | df_r.plot(figsize=(10, 8), logx=True) 41 | if pickle_path is not None: 42 | pkl.dump(df_r, open(os.path.join(pickle_path, caption+'.pkl'), 'wb')) 43 | return df_r, all_traj 44 | 45 | 46 | def multiprocess_MC(args, plot=False, pickle_path=None, caption='xp'): 47 | """ 48 | Same function as MC_xp, but including multiprocessing tools to allow parallelization 49 | """ 50 | t0 = time() 51 | cpu = mp.cpu_count() 52 | print('Running on %i cores' % cpu) 53 | bandit, p, T, n_xp, methods, param, store_step = 
args 54 | new_args = (bandit, p, T, n_xp//cpu+1, methods, param, store_step) 55 | res = Parallel(n_jobs=cpu)(delayed(MC_xp)(new_args) for _ in range(cpu)) 56 | df_r = res[0][0] 57 | for i in range(cpu-1): 58 | df_r += res[i+1][0] 59 | df_r = df_r/cpu 60 | traj = {} 61 | for x in methods: 62 | traj[x] = np.concatenate([res[i][1][x] for i in range(cpu)], axis=1) 63 | if plot: 64 | df_r.index = 1 + df_r.index 65 | df_r.plot(figsize=(10, 8), logx=True) 66 | plt.title('Average Regret for experiment ' + caption.split('_')[0] + ', ' + mapping_name[bandit] + ' arms (log scale)') 67 | plt.show() 68 | if pickle_path is not None: 69 | info = {'proba': p, 'N_xp': n_xp, 'T': T, 'methods': methods, 'param': param, 'step_traj': store_step} 70 | my_pkl_obj = {'df_regret': df_r, 'trajectories': traj, 'info': info} 71 | pkl.dump(my_pkl_obj, open(os.path.join(pickle_path, caption+'.pkl'), 'wb')) 72 | print('Execution time: %s seconds' % str(time()-t0)) 73 | return df_r, traj 74 | 75 | 76 | def Bayesian_MC_regret(args): 77 | """ 78 | Implementation of Monte Carlo method to approximate the expectation of the regret 79 | :param method: list, methods used (UCB, Thomson Sampling, etc..) 80 | :param n_arms: number of arms for each experiment 81 | :param N: int, number of independent Monte Carlo simulation (one simul=one parameter) 82 | :param T: int, time horizon 83 | :param param_dic: dict, parameters for the different methods, can be the value of rho for UCB model or an int 84 | corresponding to the number of rounds of exploration for the ExploreCommit method 85 | """ 86 | bandit, methods, n_arms, N, T, param, xp_sampler, step = args 87 | store_xp = np.zeros((len(methods), N, np.arange(T)[::step].shape[0])) 88 | mc_regret = pd.DataFrame(np.zeros((T, len(methods))), columns=methods) 89 | xp_list = [] 90 | for n in tqdm(range(N)): 91 | p = xp_sampler(size=n_arms) 92 | xp_list.append(p) 93 | model = bandit(p) 94 | for i, m in enumerate(methods): 95 | alg = model.__getattribute__(m) 96 | tr = alg(T, **param[m]) 97 | regret = tr.regret() 98 | mc_regret[m] += regret 99 | store_xp[i, n, :] = regret[::step] 100 | return {'regret': mc_regret/N, 'traj': store_xp, 'xp_list': xp_list} 101 | 102 | def Bayesian_multiprocess_MC(args, pickle_path=None, plot=True, caption='xp'): 103 | """ 104 | :param args: parameters of the experiments 105 | :param pickle_path: If not None, path where the results are stored 106 | :param caption: Name of the file to store the results if pickle path is not none 107 | :return: dataframe of average regret, results for each trajectory/alg, xp settings 108 | """ 109 | t0 = time() 110 | cpu = mp.cpu_count() 111 | print('Running on %i cores' % cpu) 112 | bandit, methods, n_arms, N, T, param, xp_sampler, step = args 113 | new_args = (bandit, methods, n_arms, N//cpu+1, T, param, xp_sampler, step) 114 | res = Parallel(n_jobs=cpu)(delayed(Bayesian_MC_regret)(new_args) for _ in range(cpu)) 115 | 116 | df_r = res[0]['regret'] 117 | xp_list = res[0]['xp_list'] 118 | for i in range(cpu-1): 119 | df_r += res[i+1]['regret'] 120 | xp_list += res[i+1]['xp_list'] 121 | df_r = df_r/cpu 122 | traj = np.concatenate([res[i]['traj'] for i in range(cpu)], axis=1) 123 | if pickle_path is not None: 124 | info = {'type': 'Bayesian', 'N_xp': N, 'T': T, 'methods': methods, 'step_traj': step} 125 | my_pkl_obj = {'df_regret': df_r, 'trajectories': traj, 'info': info} 126 | pkl.dump(my_pkl_obj, open(os.path.join(pickle_path, caption+'.pkl'), 'wb')) 127 | if plot: 128 | df_r.index = 1 + df_r.index 129 | df_r.plot(figsize=(10, 8), 
logx=True) 130 | plt.title('Average Regret for Bayesian experiment '+caption.split('_')[0] + ', ' + str(n_arms)+' arms (log scale)') 131 | plt.show() 132 | print('Execution time: %s seconds' % str(time()-t0)) 133 | return df_r, traj, xp_list --------------------------------------------------------------------------------
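
For reference, the multiprocessing helper above is normally driven from *__main__.py*; the sketch below shows a direct call with a reduced algorithm list (the parameter dictionary mirrors the one defined in *__main__.py*, and `pickle_path=None` simply skips saving the results):

```python
import numpy as np
from xp_helpers import multiprocess_MC

# Parameters of each algorithm (a subset of the `param` dictionary of __main__.py)
param = {'TS': {}, 'RB_SDA': {}, 'kl_ucb': {'f': np.log}}

if __name__ == '__main__':
    # (bandit family, arm parameters, horizon T, number of runs, algorithms, parameters, storage step)
    args = ('B', [0.9, 0.8], 1000, 100, ['TS', 'RB_SDA', 'kl_ucb'], param, 25)
    df_regret, trajectories = multiprocess_MC(args, plot=False, pickle_path=None, caption='demo')
    print(df_regret.iloc[-1])  # average final regret of each algorithm (plus the 'lower bound' column)
```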