├── tests ├── __init__.py └── multi_armed_bandit │ ├── __init__.py │ ├── arms │ ├── __init__.py │ ├── test_bernoulli_arm.py │ └── test_normal_arm.py │ └── algorithm │ ├── __init__.py │ └── test_epsilon_greedy.py ├── .version ├── multi_armed_bandit ├── __init__.py ├── arms │ ├── __init__.py │ ├── bernoulli_arm.py │ ├── normal_arm.py │ └── abstract_arm.py └── algorithm │ ├── __init__.py │ ├── epsilon_decreasing.py │ ├── softmax_decreasing.py │ ├── epsilon_first.py │ ├── softmix.py │ ├── greedy_mix.py │ ├── ucb1.py │ ├── epsilon_greedy.py │ ├── ucb1_tuned.py │ ├── softmax.py │ ├── exp3.py │ ├── ucb2.py │ └── abstract_algorithm.py ├── MANIFEST.in ├── requirements-dev.txt ├── .coveragerc ├── requirements.txt ├── tox.ini ├── README.md ├── main.py ├── LICENSE ├── setup.py └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.version: -------------------------------------------------------------------------------- 1 | version==1.0 -------------------------------------------------------------------------------- /multi_armed_bandit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/arms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include .version -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==3.8.0 2 | pytest-cov==2.6.0 -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tests\* 4 | .tox\* 5 | setup.py 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<=1.14.5,>=1.13.3 2 | scipy==1.1.0 3 | pandas==0.23.4 4 | setuptools>=65.5.1 5 | tensorflow>=1.12.2 6 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py366 3 | skipsdist = False 4 | 5 | [testenv] 6 | deps 
= -r requirements.txt 7 | -r requirements-dev.txt 8 | setenv = 9 | PYTHONPATH = {toxinidir} 10 | commands = py.test {posargs} --cov-config .coveragerc --cov='{toxinidir}' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi Armed Bandit Algorithm w/ Deep Learning 2 | 3 | Built with Python 3.6.6, tox 3.4.0 and TensorFlow 1.12.2 4 | 5 | Policies implemented: 6 | * Epsilon Decreasing 7 | * Epsilon First 8 | * Epsilon Greedy 9 | * EXP3 10 | * GreedyMix 11 | * Softmax 12 | * Softmax Decreasing 13 | * SoftMix 14 | * UCB1 15 | * UCB1-Tuned 16 | * UCB2 17 | 18 | Arms: 19 | * Bernoulli 20 | * Normal 21 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/epsilon_decreasing.py: -------------------------------------------------------------------------------- 1 | from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy 2 | 3 | 4 | class EpsilonDecreasing(EpsilonGreedy): 5 | """ 6 | The lever with the highest estimated mean is always pulled, except that with frequency epsilon_t a random lever 7 | is pulled instead. epsilon_t is defined as min(1, epsilon_0/t), where epsilon_0 is the initial epsilon given to 8 | the policy and t is the iteration number. 9 | """ 10 | 11 | def _get_epsilon(self, iteration_number): 12 | return min(1.0, self._epsilon / (iteration_number + 1)) 13 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/softmax_decreasing.py: -------------------------------------------------------------------------------- 1 | from multi_armed_bandit.algorithm.softmax import Softmax 2 | 3 | 4 | class SoftmaxDecreasing(Softmax): 5 | """ 6 | The softmax strategy modified in the same way as epsilon-greedy: the temperature decreases with the number of 7 | rounds played. Decreasing softmax is identical to softmax except that the temperature tau_t is 8 | min(1, tau_0/t), where tau_0 is the initial temperature given to the policy and t is the iteration number. 9 | """ 10 | 11 | def _get_temperature(self, iteration_number: int): 12 | return min(1.0, self._temperature / (iteration_number + 1)) 13 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/epsilon_first.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy 4 | 5 | 6 | class EpsilonFirst(EpsilonGreedy): 7 | """ 8 | Does all of the exploration at once at the beginning: for the first epsilon * T rounds the levers are pulled 9 | at random; for the remaining (1 - epsilon) * T rounds the lever with the highest estimated mean is pulled.
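For example (illustrative numbers, matching select_arm below): with epsilon = 0.2 and T = 1000 iterations, rounds 0-199 pull a uniformly random lever and rounds 200-999 always pull the lever with the highest estimated mean.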
10 | """ 11 | 12 | def select_arm(self, iteration_number: int) -> int: 13 | if iteration_number < round(self._epsilon * self._iterations): 14 | return numpy.random.choice(len(self._arms)) 15 | return int(numpy.argmax(self._states)) 16 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/arms/test_bernoulli_arm.py: -------------------------------------------------------------------------------- 1 | from multi_armed_bandit.arms.bernoulli_arm import BernoulliArm 2 | 3 | 4 | def test_draw_bernoulli_arm(): 5 | arm = BernoulliArm(0, 0.5) 6 | assert 0 == arm.draw() 7 | 8 | 9 | def test_draw_bernoulli_arm_size_2(): 10 | arm = BernoulliArm(0, 0.5) 11 | rewards = arm.draw(2) 12 | assert 0 == rewards[0] 13 | assert 1 == rewards[1] 14 | 15 | 16 | def test_eq(): 17 | assert BernoulliArm(0, 0.5) == BernoulliArm(0, 0.5) 18 | 19 | 20 | def test_not_eq(): 21 | assert not BernoulliArm(1, 0.5) == BernoulliArm(0, 0.5) 22 | 23 | 24 | def test_not_object(): 25 | assert not 1 == BernoulliArm(0, 0.5) 26 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/softmix.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.softmax_decreasing import SoftmaxDecreasing 4 | 5 | 6 | class SoftMix(SoftmaxDecreasing): 7 | """ 8 | The SoftMix slightly differs from the decreasing softmax since it uses a temperature decreasing with a log(t)/t 9 | factor instead of 1/t factor. 10 | see: Cesa-Bianchi, Nicolo, and Paul Fischer. "Finite-Time Regret Bounds for the Multiarmed Bandit Problem." In ICML, 11 | pp. 100-108. 1998. 12 | """ 13 | 14 | def _get_temperature(self, iteration_number: int): 15 | return min(1.0, (self._temperature * numpy.log(iteration_number + 1)) / (iteration_number + 1)) 16 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/greedy_mix.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.epsilon_decreasing import EpsilonDecreasing 4 | 5 | 6 | class GreedyMix(EpsilonDecreasing): 7 | """ 8 | GreedyMix slightly differs from the epsilon-decreasing strategy as just presented because it uses a decreasing 9 | method of log(t) / t instead of 1/t, where t is the iteratation number. 10 | see: Cesa-Bianchi, Nicolo, and Paul Fischer. "Finite-Time Regret Bounds for the Multiarmed Bandit Problem." In ICML, 11 | pp. 100-108. 1998. 12 | """ 13 | 14 | def _get_epsilon(self, iteration_number): 15 | return min(1.0, (self._epsilon * numpy.log(iteration_number + 1)) / (iteration_number + 1)) 16 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/bernoulli_arm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from scipy.stats import bernoulli 3 | 4 | from multi_armed_bandit.arms.abstract_arm import Arm 5 | 6 | 7 | class BernoulliArm(Arm): 8 | def __init__(self, name: int, p: float) -> None: 9 | super().__init__(name) 10 | self._p = p 11 | 12 | def _get_rewards(self, size: int) -> numpy.ndarray: 13 | """ 14 | Extracts rewards from Bernoulli distribution. 15 | :param size: number of draws. 16 | :return: rewards. 
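Note: random_state is pinned to 42, so repeated calls return the same deterministic reward sequence; the unit tests under tests/multi_armed_bandit/arms rely on this.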
17 | """ 18 | return bernoulli.rvs(self._p, size=size, random_state=42) 19 | 20 | def __eq__(self, other: object) -> bool: 21 | if not isinstance(other, self.__class__): 22 | return False 23 | return self._name == other._name and self._probability == other._probability and self._p == other._p 24 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/normal_arm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from scipy.stats import norm 3 | 4 | from multi_armed_bandit.arms.abstract_arm import Arm 5 | 6 | 7 | class NormalArm(Arm): 8 | def __init__(self, name: int, mu: float, sigma: float) -> None: 9 | super().__init__(name) 10 | self._mu = mu 11 | self._sigma = sigma 12 | 13 | def _get_rewards(self, size: int) -> numpy.ndarray: 14 | """ 15 | Extracts rewards from Normal distribution. 16 | :param size: number of draws. 17 | :return: rewards. 18 | """ 19 | return norm.rvs(self._mu, self._sigma, size=size, random_state=42) 20 | 21 | def __eq__(self, other: object) -> bool: 22 | if not isinstance(other, self.__class__): 23 | return False 24 | return self._name == other._name and self._probability == other._probability and self._mu == other._mu \ 25 | and self._sigma == other._sigma 26 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/arms/test_normal_arm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from multi_armed_bandit.arms.normal_arm import NormalArm 4 | 5 | 6 | def test_draw_normal_arm(): 7 | arm = NormalArm(0, 0, 1) 8 | assert pytest.approx(0.4967, 0.0001) == round(arm.draw(), 4) 9 | 10 | 11 | def test_draw_normal_arm_size_2(): 12 | arm = NormalArm(0, 0, 1) 13 | rewards = arm.draw(2) 14 | assert pytest.approx(0.4967, 0.0001) == round(rewards[0], 4) 15 | assert pytest.approx(-0.1382, 0.001) == round(rewards[1], 4) 16 | 17 | 18 | def test_get_dict(): 19 | arm = NormalArm(0, 0, 1) 20 | arm.set_probability(0.5) 21 | assert {"name": 0, "probability": 0.5} == arm.get_dict() 22 | 23 | 24 | def test_eq(): 25 | assert NormalArm(0, 0, 1) == NormalArm(0, 0, 1) 26 | 27 | 28 | def test_not_eq(): 29 | assert not NormalArm(0, 0, 1) == NormalArm(1, 0, 1) 30 | 31 | 32 | def test_not_object(): 33 | assert not 1 == NormalArm(1, 0, 1) 34 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/ucb1.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 4 | 5 | 6 | class UCB1(MABAlgorithm): 7 | """ 8 | Upper Confidence Bounds. Initially, each arm is played once. Afterwards at round t, the algorithm greedily picks 9 | the i'th arm from max(mu^hat_i(t)+sqrt(2*ln(t)/n_i)). Where mu^hat_i(t) is the estimated mean reward of the i'th 10 | arm, t is the iteration number and n_i is the number of times the arm was played. 
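For example (illustrative numbers): with estimated means [0.4, 0.6], counts [10, 90] and t = 100, the indices are 0.4 + sqrt(2*ln(100)/10) ≈ 1.36 and 0.6 + sqrt(2*ln(100)/90) ≈ 0.92, so the under-played first arm is selected despite its lower empirical mean.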
11 | """ 12 | 13 | def select_arm(self, iteration_number: int) -> int: 14 | if 0 in self._counts: 15 | return int(numpy.argmin(self._counts)) 16 | ucb_values = self._states + numpy.sqrt(2 * numpy.log(iteration_number + 1) / self._counts) 17 | return int(numpy.argmax(ucb_values)) 18 | 19 | def __eq__(self, other: object) -> bool: 20 | return isinstance(other, self.__class__) and self._arms == other._arms 21 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | from multi_armed_bandit.algorithm.epsilon_first import EpsilonFirst 4 | from multi_armed_bandit.algorithm.exp3 import EXP3 5 | from multi_armed_bandit.algorithm.greedy_mix import GreedyMix 6 | from multi_armed_bandit.algorithm.softmax import Softmax 7 | from multi_armed_bandit.algorithm.softmax_decreasing import SoftmaxDecreasing 8 | from multi_armed_bandit.algorithm.softmix import SoftMix 9 | from multi_armed_bandit.algorithm.ucb1 import UCB1 10 | from multi_armed_bandit.algorithm.ucb1_tuned import UCB1Tuned 11 | from multi_armed_bandit.algorithm.ucb2 import UCB2 12 | from multi_armed_bandit.arms.normal_arm import NormalArm 13 | from multi_armed_bandit.algorithm.epsilon_decreasing import EpsilonDecreasing 14 | 15 | if __name__ == '__main__': 16 | arms = [NormalArm(0, 0.3, 1), NormalArm(1, 0.5, 1), NormalArm(2, 1, 1)] 17 | epsilon = 0.5 18 | algorithm = UCB2(arms, epsilon) 19 | results = algorithm.run_simulation(1000) 20 | # df = pandas.DataFrame(results) 21 | # df.to_csv("results.csv", index=False) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Idan Morad 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import setup, find_packages 4 | 5 | version = Path(__file__).parents[0].joinpath(".version").read_text().split("==")[1] 6 | long_description = Path(__file__).parents[0].joinpath("README.md").read_text() 7 | 8 | setup(name="MultiArmedBandit-DeepLearning", 9 | version=version, 10 | author="Idan Morad", 11 | description="Multi Armed Bandit Algorithm w/ Deep Learning " 12 | "Built with Python 3.6.6, tox 3.4.0 and TensorFlow 1.12.2", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | classifiers=["Development Status :: 5 - Production/Stable", 16 | "Framework :: tox", 17 | "Intended Audience :: Developers", 18 | "Intended Audience :: Education", 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: MIT License", 21 | "Programming Language :: Python :: 3.6", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | "Topic :: Software Development :: Testing :: Unit", 24 | "Topic :: Software Development :: Version Control :: Git"], 25 | keywords="multi-armed-bandit deep-learning", 26 | packages=find_packages(exclude=['contrib', 'docs', 'tests'])) 27 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class EpsilonGreedy(MABAlgorithm): 10 | """ 11 | Epsilon-greedy strategy - The best lever is selected for a proportion 1 - epsilon of the trials, and a lever is 12 | selected at random (with uniform probability) for a proportion epsilon. 13 | """ 14 | 15 | def __init__(self, arms: List[Arm], epsilon: float) -> None: 16 | super().__init__(arms) 17 | 18 | if not (0.0 < epsilon < 1.0): 19 | raise ValueError("epsilon is not between 0 and 1") 20 | 21 | self._epsilon = epsilon 22 | 23 | def select_arm(self, iteration_number: int) -> int: 24 | """ 25 | Select an arm: a random arm with probability epsilon, otherwise the arm with the highest estimated 26 | mean. 27 | :return: index of chosen arm. 28 | """ 29 | if numpy.random.random() > self._get_epsilon(iteration_number): 30 | return int(numpy.argmax(self._states)) 31 | else: 32 | return numpy.random.choice(len(self._arms)) 33 | 34 | def _get_epsilon(self, iteration_number: int) -> float: 35 | return self._epsilon 36 | 37 | def __eq__(self, other: object) -> bool: 38 | if not isinstance(other, self.__class__): 39 | return False 40 | return self._epsilon == other._epsilon and self._arms == other._arms 41 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/ucb1_tuned.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.ucb1 import UCB1 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class UCB1Tuned(UCB1): 10 | """ 11 | The main feature of UCB1-Tuned is that it takes into account the variance of each arm and not only the empirical 12 | mean.
More specifically, at round t the algorithm picks the i'th arm from max(mu^hat_i + sqrt(ln(t)/n_i * 13 | min(1/4, V_i(n_i)))). Where V_i(t)=sigma^2_i(t)+sqrt(2ln(t)/n_i(t)). The estimated of variance sigma^2_i(t) can 14 | be computed as usually by maintaining the empirical sum of squares of the reward, in addition to the empirical mean. 15 | """ 16 | 17 | def __init__(self, arms: List[Arm]) -> None: 18 | super().__init__(arms) 19 | self._reward_squares = numpy.zeros(len(self._arms)) 20 | 21 | def select_arm(self, iteration_number: int) -> int: 22 | if 0 in self._counts: 23 | return int(numpy.argmin(self._counts)) 24 | variances = (self._reward_squares / self._counts) - self._states ** 2 25 | vi = variances + numpy.sqrt(2 * numpy.log(iteration_number + 1) / self._counts) 26 | vi = numpy.array([min(0.25, value) for value in vi]) 27 | ucb_values = self._states + numpy.sqrt((2 * numpy.log(iteration_number + 1) / self._counts) * vi) 28 | return int(numpy.argmax(ucb_values)) 29 | 30 | def _after_draw(self, reward: float, chosen_arm_index: int) -> None: 31 | self._reward_squares[chosen_arm_index] += (reward ** 2) 32 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/abstract_arm.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy 4 | 5 | 6 | class Arm(object): 7 | def __init__(self, name: Union[int, float, str]) -> None: 8 | self._name = name 9 | self._probability = 0.0 10 | 11 | def get_name(self) -> int: 12 | """ 13 | Returns the arm name. 14 | :return: the arm name. 15 | """ 16 | return self._name 17 | 18 | def set_probability(self, probability) -> None: 19 | """ 20 | Sets the arm probability. 21 | :param probability: the arm probability. 22 | """ 23 | self._probability = probability 24 | 25 | def get_probability(self) -> float: 26 | """ 27 | Returns the arm probability. 28 | :return: the arm probability. 29 | """ 30 | return self._probability 31 | 32 | def draw(self, size: int = 1) -> Union[float, numpy.ndarray]: 33 | """ 34 | Pull the level of the arm. 35 | :param size: number of draws. 36 | :return: the rewards from the draw. 37 | """ 38 | rewards = self._get_rewards(size) 39 | if size == 1: 40 | return rewards[0] 41 | return rewards 42 | 43 | def _get_rewards(self, size: int) -> numpy.ndarray: 44 | """ 45 | Extracts rewards per arm distribution. 46 | :param size: number of draws. 47 | :return: rewards. 48 | """ 49 | raise NotImplementedError 50 | 51 | def get_dict(self) -> dict: 52 | """ 53 | Retrn the arm values as dictionary with name and probability. 54 | :return: 55 | """ 56 | return {"name": self.get_name(), "probability": float(self.get_probability())} 57 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/softmax.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List 3 | 4 | import numpy 5 | 6 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 7 | from multi_armed_bandit.arms.abstract_arm import Arm 8 | 9 | 10 | class Softmax(MABAlgorithm): 11 | """ 12 | Also known as Boltzmann Exploration. Each arm is assign with probability that is proportional to its average reward. 13 | Arms with greater empirical mean are therefore picked with higher probability. The probability to be picked is 14 | defined: p_i(t+1) = (e^(mu^hat_i(t)/tua)/sum^K_j=1(e^(mu^hat_j(t)/tua))). 
Where K is number of arms, mu^hat_i(t) is 15 | the estimated mean reward of the i'th arm, and tua is a temperature parameter controlling the randomness of the 16 | choice. when tau is close to 0 the policy acts like pure greedy, and as tau tends to infinity, the algorithm picks 17 | arms uniformly at random. 18 | """ 19 | 20 | def __init__(self, arms: List[Arm], temperature) -> None: 21 | super().__init__(arms) 22 | if temperature <= 0.0: 23 | raise ValueError("temperature must be positive") 24 | self._temperature = temperature 25 | 26 | def select_arm(self, iteration_number: int) -> int: 27 | exp = numpy.exp(self._states / self._get_temperature(iteration_number)) 28 | probabilities = exp / numpy.sum(exp, axis=0) 29 | return int(numpy.argmax(numpy.random.multinomial(1, probabilities))) 30 | 31 | def _get_temperature(self, iteration_number: int): 32 | return self._temperature 33 | 34 | def __eq__(self, other: object) -> bool: 35 | if not isinstance(other, self.__class__): 36 | return False 37 | return self._temperature == other._temperature and self._arms == other._arms 38 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/algorithm/test_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy 4 | from multi_armed_bandit.arms.normal_arm import NormalArm 5 | 6 | 7 | def test_epsilon_greedy_algorithm(): 8 | arms = [NormalArm(name=0, mu=0.5, sigma=1.0), NormalArm(1, 0.3, 1.0), NormalArm(2, 1.0, 1.0)] 9 | algorithm = EpsilonGreedy(arms, 0.9) 10 | algorithm.run_simulation(1000) 11 | assert pytest.approx(0.31, 0.1) == round(arms[0].get_probability(), 2) 12 | assert pytest.approx(0.30, 0.1) == round(arms[1].get_probability(), 2) 13 | assert pytest.approx(0.38, 0.1) == round(arms[2].get_probability(), 2) 14 | 15 | 16 | def test_epsilon_out_of_range(): 17 | with pytest.raises(ValueError): 18 | EpsilonGreedy([NormalArm(name=0, mu=0.5, sigma=1.0)], 0.0) 19 | 20 | with pytest.raises(ValueError): 21 | EpsilonGreedy([NormalArm(name=0, mu=0.5, sigma=1.0)], -5.0) 22 | 23 | with pytest.raises(ValueError): 24 | EpsilonGreedy([NormalArm(name=0, mu=0.5, sigma=1.0)], 1.1) 25 | 26 | 27 | def test_one_arm(): 28 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 29 | algorithm = EpsilonGreedy([arm], 0.9) 30 | algorithm.run_simulation(1000) 31 | assert 1.0 == arm.get_probability() 32 | 33 | 34 | def test_eq(): 35 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 36 | assert EpsilonGreedy([arm], 0.9) == EpsilonGreedy([arm], 0.9) 37 | 38 | 39 | def test_not_object(): 40 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 41 | assert not (1 == EpsilonGreedy([arm], 0.3)) 42 | 43 | 44 | def test_iteration_negative(): 45 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 46 | algorithm = EpsilonGreedy([arm], 0.9) 47 | 48 | with pytest.raises(ValueError): 49 | algorithm.run_simulation(0) 50 | 51 | with pytest.raises(ValueError): 52 | algorithm.run_simulation(-1) 53 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/exp3.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class EXP3(MABAlgorithm): 10 | """ 11 | Exponential weight algorithm. 
The probability of choosing lever i at round t is: 12 | p_i(t) = (1 - gamma) * (weight_i(t) / sum^K_j=1(weight_j(t))) + (gamma / K), where K is the number of arms. 13 | If lever i has been pulled: weight_i(t + 1) = weight_i(t) * e^((gamma * r_i(t)) / (probability_i(t) * K)), 14 | where r_i(t) is the reward drawn in round t. Otherwise weight_i(t + 1) = weight_i(t). 15 | """ 16 | 17 | def __init__(self, arms: List[Arm], gamma: float) -> None: 18 | super().__init__(arms) 19 | 20 | if not (0.0 <= gamma < 1.0): 21 | raise ValueError("gamma must be between 0 and 1") 22 | self._gamma = gamma 23 | self._weights = numpy.ones(len(self._arms)) 24 | self._probabilities = numpy.zeros(len(self._arms)) 25 | 26 | def select_arm(self, iteration_number: int) -> int: 27 | self._probabilities = ((1 - self._gamma) * self._weights / numpy.sum(self._weights, axis=0)) + ( 28 | self._gamma / len(self._arms)) 29 | return int(numpy.argmax(numpy.random.multinomial(1, self._probabilities))) 30 | 31 | def _after_draw(self, reward: float, chosen_arm_index: int) -> None: 32 | probability = self._probabilities[chosen_arm_index] 33 | growth_factor = numpy.exp(self._gamma * (reward / (probability * len(self._arms)))) 34 | self._weights[chosen_arm_index] = self._weights[chosen_arm_index] * growth_factor 35 | 36 | def __eq__(self, other: object) -> bool: 37 | if not isinstance(other, self.__class__): 38 | return False 39 | return self._gamma == other._gamma and self._arms == other._arms 40 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/ucb2.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class UCB2(MABAlgorithm): 10 | """ 11 | Plays the chosen arm for an epoch of tau(r + 1) - tau(r) rounds, where tau(r) = ceil((1 + alpha)^r), before re-computing the upper confidence bounds. see: https://webdocs.cs.ualberta.ca/~games/go/seminar/notes/2007/slides_ucb.pdf 12 | """ 13 | 14 | def __init__(self, arms: List[Arm], alpha: float) -> None: 15 | super().__init__(arms) 16 | 17 | if not (0.0 <= alpha < 1.0): 18 | raise ValueError("alpha must be between 0 and 1") 19 | 20 | self._alpha = alpha 21 | self._r = numpy.zeros(len(self._arms)) 22 | self._next_iteration_to_calc = 0 23 | self._current_arm_index = 0 24 | 25 | def select_arm(self, iteration_number: int) -> int: 26 | if 0 in self._counts: 27 | chosen_arm = int(numpy.argmin(self._counts)) 28 | self._r[chosen_arm] += 1 29 | return chosen_arm 30 | 31 | if self._next_iteration_to_calc > iteration_number + 1: 32 | return self._current_arm_index 33 | 34 | ucb_values = self._states + numpy.sqrt( 35 | ((1 + self._alpha) * numpy.log(numpy.e * (iteration_number + 1) / self._get_tau())) / (2 * self._get_tau())) 36 | chosen_arm = int(numpy.argmax(ucb_values)) 37 | self._update_next_time_to_calc(chosen_arm) 38 | return chosen_arm 39 | 40 | def _get_tau(self) -> numpy.ndarray: 41 | return numpy.ceil((1 + self._alpha) ** self._r) 42 | 43 | def _update_next_time_to_calc(self, arm_index: int) -> None: 44 | self._current_arm_index = arm_index 45 | self._next_iteration_to_calc += max(1, self._get_tau_for_single_r( 46 | self._r[arm_index] + 1) - self._get_tau_for_single_r(self._r[arm_index])) 47 | self._r[arm_index] += 1 48 | 49 | def _get_tau_for_single_r(self, r: int) -> int: 50 | return int(numpy.ceil((1 + self._alpha) ** r)) 51 | 52 | def __eq__(self, other: object) -> bool: 53 | if not isinstance(other, self.__class__): 54 | return False 55 | return
self._alpha == other._alpha and self._arms == other._arms 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Windows template 3 | # Windows thumbnail cache files 4 | Thumbs.db 5 | ehthumbs.db 6 | ehthumbs_vista.db 7 | 8 | # Dump file 9 | *.stackdump 10 | 11 | # Folder config file 12 | [Dd]esktop.ini 13 | 14 | # Recycle Bin used on file shares 15 | $RECYCLE.BIN/ 16 | 17 | # Windows Installer files 18 | *.cab 19 | *.msi 20 | *.msix 21 | *.msm 22 | *.msp 23 | 24 | # Windows shortcuts 25 | *.lnk 26 | ### Linux template 27 | *~ 28 | 29 | # temporary files which can be created if a process still has a handle open of a deleted file 30 | .fuse_hidden* 31 | 32 | # KDE directory preferences 33 | .directory 34 | 35 | # Linux trash folder which might appear on any partition or disk 36 | .Trash-* 37 | 38 | # .nfs files are created when an open file is removed but is still being accessed 39 | .nfs* 40 | ### JetBrains template 41 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 42 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 43 | 44 | # User-specific stuff 45 | .idea/ 46 | 47 | # CMake 48 | cmake-build-debug/ 49 | cmake-build-release/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | ### macOS template 78 | # General 79 | .DS_Store 80 | .AppleDouble 81 | .LSOverride 82 | 83 | # Icon must end with two \r 84 | Icon 85 | 86 | # Thumbnails 87 | ._* 88 | 89 | # Files that might appear in the root of a volume 90 | .DocumentRevisions-V100 91 | .fseventsd 92 | .Spotlight-V100 93 | .TemporaryItems 94 | .Trashes 95 | .VolumeIcon.icns 96 | .com.apple.timemachine.donotpresent 97 | 98 | # Directories potentially created on remote AFP share 99 | .AppleDB 100 | .AppleDesktop 101 | Network Trash Folder 102 | Temporary Items 103 | .apdisk 104 | ### Python template 105 | # Byte-compiled / optimized / DLL files 106 | __pycache__/ 107 | *.py[cod] 108 | *$py.class 109 | 110 | # C extensions 111 | *.so 112 | 113 | # Distribution / packaging 114 | .Python 115 | build/ 116 | develop-eggs/ 117 | dist/ 118 | downloads/ 119 | eggs/ 120 | .eggs/ 121 | lib/ 122 | lib64/ 123 | parts/ 124 | sdist/ 125 | var/ 126 | wheels/ 127 | *.egg-info/ 128 | .installed.cfg 129 | *.egg 130 | MANIFEST 131 | 132 | # PyInstaller 133 | # Usually these files are written by a python script from a template 134 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
135 | *.manifest 136 | *.spec 137 | 138 | # Installer logs 139 | pip-log.txt 140 | pip-delete-this-directory.txt 141 | 142 | # Unit test / coverage reports 143 | htmlcov/ 144 | .tox/ 145 | .coverage 146 | .coverage.* 147 | .cache 148 | nosetests.xml 149 | coverage.xml 150 | *.cover 151 | .hypothesis/ 152 | .pytest_cache/ 153 | 154 | # Translations 155 | *.mo 156 | *.pot 157 | 158 | # Django stuff: 159 | *.log 160 | local_settings.py 161 | db.sqlite3 162 | 163 | # Flask stuff: 164 | instance/ 165 | .webassets-cache 166 | 167 | # Scrapy stuff: 168 | .scrapy 169 | 170 | # Sphinx documentation 171 | docs/_build/ 172 | 173 | # PyBuilder 174 | target/ 175 | 176 | # Jupyter Notebook 177 | .ipynb_checkpoints 178 | 179 | # pyenv 180 | .python-version 181 | 182 | # celery beat schedule file 183 | celerybeat-schedule 184 | 185 | # SageMath parsed files 186 | *.sage.py 187 | 188 | # Environments 189 | .env 190 | .venv 191 | env/ 192 | venv/ 193 | ENV/ 194 | env.bak/ 195 | venv.bak/ 196 | 197 | # Spyder project settings 198 | .spyderproject 199 | .spyproject 200 | 201 | # Rope project settings 202 | .ropeproject 203 | 204 | # mkdocs documentation 205 | /site 206 | 207 | # mypy 208 | .mypy_cache/ 209 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/abstract_algorithm.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | import tensorflow 5 | 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class MABAlgorithm(object): 10 | """ 11 | Multi-armed bandit - In probability theory, the multi-armed bandit problem (sometimes called the K- or N-armed 12 | bandit problem) is a problem in which a fixed limited set of resources must be allocated between competing 13 | (alternative) choices in a way that maximizes their expected gain, when each choice's properties are only partially 14 | known at the time of allocation, and may become better understood as time passes or by allocating resources to the 15 | choice. taken from https://en.wikipedia.org/wiki/Multi-armed_bandit 16 | """ 17 | 18 | def __init__(self, arms: List[Arm]) -> None: 19 | self._arms = arms 20 | self._states = numpy.zeros(len(self._arms)) 21 | self._counts = numpy.zeros(len(self._arms)) 22 | self._iterations = 0 23 | 24 | def select_arm(self, iteration_number: int) -> int: 25 | """ 26 | The method that returns the index of the Arm that the algorithm selects on the current play. 27 | :return: index of chosen arm. 28 | """ 29 | raise NotImplementedError 30 | 31 | def run_simulation(self, iterations) -> List[dict]: 32 | """ 33 | Run simulation and update the probabilities to pull for each arm. 34 | :param iterations: number of iterations. 
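:return: a list with one dict per iteration containing iteration, chosen_arm, regret, avg_regret and avg_collected_rewards.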
35 | """ 36 | if iterations < 1: 37 | raise ValueError("Iterations must be positive") 38 | 39 | self._iterations = iterations 40 | 41 | number_of_arms = len(self._arms) 42 | 43 | results = [] 44 | optimal_strategy_rewards = 0.0 45 | collected_rewards = 0.0 46 | 47 | rewards = numpy.zeros([number_of_arms, self._iterations]) 48 | 49 | tensorflow.reset_default_graph() 50 | weights = tensorflow.Variable(tensorflow.ones([number_of_arms])) 51 | 52 | reward_holder = tensorflow.placeholder(shape=[1], dtype=tensorflow.float32) 53 | action_holder = tensorflow.placeholder(shape=[1], dtype=tensorflow.int32) 54 | responsible_weight = tensorflow.slice(weights, action_holder, [1]) 55 | loss = -(tensorflow.log(responsible_weight) * reward_holder) 56 | optimizer = tensorflow.train.AdamOptimizer(learning_rate=0.001) 57 | update = optimizer.minimize(loss) 58 | init = tensorflow.global_variables_initializer() 59 | tensorflow.set_random_seed(42) 60 | 61 | ww = numpy.zeros(number_of_arms) 62 | 63 | with tensorflow.Session() as sess: 64 | sess.run(init) 65 | 66 | for arm_index in range(0, number_of_arms): 67 | rewards[arm_index] = self._arms[arm_index].draw(self._iterations) 68 | 69 | self._states = numpy.zeros(number_of_arms) 70 | self._counts = numpy.zeros(number_of_arms) 71 | 72 | for iteration in range(0, self._iterations): 73 | chosen_arm_index = self.select_arm(iteration) 74 | reward = rewards[chosen_arm_index, iteration] 75 | 76 | _, _, ww = sess.run([update, responsible_weight, weights], 77 | feed_dict={reward_holder: [reward], action_holder: [chosen_arm_index]}) 78 | 79 | self._counts[chosen_arm_index] += 1 80 | count = self._counts[chosen_arm_index] 81 | 82 | self._update_current_states(chosen_arm_index, count, reward) 83 | 84 | self._after_draw(reward, chosen_arm_index) 85 | 86 | collected_rewards += reward 87 | optimal_strategy_rewards += numpy.max(rewards[:, iteration]) 88 | regret = optimal_strategy_rewards - collected_rewards 89 | 90 | results.append( 91 | {"iteration": iteration, "chosen_arm": self._arms[chosen_arm_index].get_name(), "regret": regret, 92 | "avg_regret": regret / (iteration + 1), 93 | "avg_collected_rewards": collected_rewards / (iteration + 1)}) 94 | 95 | exp = numpy.exp(ww) 96 | probabilities = exp / numpy.sum(exp, axis=0) 97 | 98 | for index in range(0, number_of_arms): 99 | self._arms[index].set_probability(probabilities[index]) 100 | 101 | return results 102 | 103 | def _update_current_states(self, chosen_arm_index, count, reward) -> None: 104 | self._states[chosen_arm_index] = ((count - 1) / count) * self._states[chosen_arm_index] + ( 105 | 1 / count) * reward 106 | 107 | def _after_draw(self, reward: float, chosen_arm_index: int) -> None: 108 | """ 109 | After process method. 110 | :param reward: the reward drawn. 111 | :param chosen_arm_index: the chosen arm index. 112 | """ 113 | pass 114 | --------------------------------------------------------------------------------
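A minimal end-to-end usage sketch (not a file in this repository; it assumes the multi_armed_bandit package above is on the import path and the TensorFlow 1.x API used by MABAlgorithm is installed per requirements.txt):

from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy
from multi_armed_bandit.arms.bernoulli_arm import BernoulliArm

# Three Bernoulli levers with different success probabilities.
arms = [BernoulliArm(0, 0.2), BernoulliArm(1, 0.5), BernoulliArm(2, 0.8)]

# Explore 10% of the time; otherwise pull the empirically best lever.
algorithm = EpsilonGreedy(arms, 0.1)

# run_simulation drives the TensorFlow policy-gradient update inside
# MABAlgorithm and returns one result dict per iteration.
results = algorithm.run_simulation(1000)
print(results[-1]["avg_regret"])

# After the run each arm carries a softmax-normalised pull probability.
for arm in arms:
    print(arm.get_dict())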