├── tests ├── __init__.py └── multi_armed_bandit │ ├── __init__.py │ ├── arms │ ├── __init__.py │ ├── test_bernoulli_arm.py │ └── test_normal_arm.py │ └── algorithm │ ├── __init__.py │ └── test_epsilon_greedy.py ├── .version ├── multi_armed_bandit ├── __init__.py ├── arms │ ├── __init__.py │ ├── bernoulli_arm.py │ ├── normal_arm.py │ └── abstract_arm.py └── algorithm │ ├── __init__.py │ ├── epsilon_decreasing.py │ ├── softmax_decreasing.py │ ├── epsilon_first.py │ ├── softmix.py │ ├── greedy_mix.py │ ├── ucb1.py │ ├── epsilon_greedy.py │ ├── ucb1_tuned.py │ ├── softmax.py │ ├── exp3.py │ ├── ucb2.py │ └── abstract_algorithm.py ├── MANIFEST.in ├── requirements-dev.txt ├── .coveragerc ├── requirements.txt ├── tox.ini ├── README.md ├── main.py ├── LICENSE ├── setup.py └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.version: -------------------------------------------------------------------------------- 1 | version==1.0 -------------------------------------------------------------------------------- /multi_armed_bandit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/arms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include .version -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==3.8.0 2 | pytest-cov==2.6.0 -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tests\* 4 | .tox\* 5 | setup.py 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<=1.14.5,>=1.13.3 2 | scipy==1.1.0 3 | pandas==0.23.4 4 | setuptools>=65.5.1 5 | tensorflow>=1.12.2 6 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py366 3 | skipsdist = False 4 | 5 | [testenv] 6 | deps 
= -r requirements.txt 7 | -r requirements-dev.txt 8 | setenv = 9 | PYTHONPATH = {toxinidir} 10 | commands = py.test {posargs} --cov-config .coveragerc --cov='{toxinidir}' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi Armed Bandit Algorithm w/ Deep Learning 2 | 3 | Built with Python 3.6.6, tox 3.4.0 and TensorFlow 1.12.2 4 | 5 | Policies implemented: 6 | * Epsilon Decreasing 7 | * Epsilon First 8 | * Epsilon Greedy 9 | * EXP3 10 | * GreedyMix 11 | * Softmax 12 | * Softmax Decreasing 13 | * SoftMix 14 | * UCB1 15 | * UCB1-Tuned 16 | * UCB2 17 | 18 | Arms: 19 | * Bernoulli 20 | * Normal 21 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/epsilon_decreasing.py: -------------------------------------------------------------------------------- 1 | from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy 2 | 3 | 4 | class EpsilonDecreasing(EpsilonGreedy): 5 | """ 6 | The lever with the highest estimated mean is always pulled, except that with frequency epsilon_t a random lever 7 | is pulled instead. epsilon_t is defined as min(1, epsilon_0/t), where epsilon_0 is the initial epsilon given to 8 | the policy and t is the iteration number. 9 | """ 10 | 11 | def _get_epsilon(self, iteration_number): 12 | return min(1.0, self._epsilon / (iteration_number + 1)) 13 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/softmax_decreasing.py: -------------------------------------------------------------------------------- 1 | from multi_armed_bandit.algorithm.softmax import Softmax 2 | 3 | 4 | class SoftmaxDecreasing(Softmax): 5 | """ 6 | The softmax strategy modified in the same way as epsilon-greedy: the temperature decreases with the number of 7 | rounds played. Decreasing softmax is identical to softmax except that the temperature tau_t is 8 | min(1, tau_0/t), where tau_0 is the initial temperature given to the policy and t is the iteration number. 9 | """ 10 | 11 | def _get_temperature(self, iteration_number: int): 12 | return min(1.0, self._temperature / (iteration_number + 1)) 13 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/epsilon_first.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy 4 | 5 | 6 | class EpsilonFirst(EpsilonGreedy): 7 | """ 8 | Does all of the exploration at once at the beginning: for the first epsilon * T rounds the levers are pulled 9 | at random; for the remaining (1 - epsilon) * T rounds the lever with the highest estimated mean is pulled.
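For example (illustrative numbers, matching select_arm below): with epsilon = 0.2 and T = 1000 iterations, rounds 0-199 pull a uniformly random lever and rounds 200-999 always pull the lever with the highest estimated mean.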
10 | """ 11 | 12 | def select_arm(self, iteration_number: int) -> int: 13 | if iteration_number < round(self._epsilon * self._iterations): 14 | return numpy.random.choice(len(self._arms)) 15 | return int(numpy.argmax(self._states)) 16 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/arms/test_bernoulli_arm.py: -------------------------------------------------------------------------------- 1 | from multi_armed_bandit.arms.bernoulli_arm import BernoulliArm 2 | 3 | 4 | def test_draw_bernoulli_arm(): 5 | arm = BernoulliArm(0, 0.5) 6 | assert 0 == arm.draw() 7 | 8 | 9 | def test_draw_bernoulli_arm_size_2(): 10 | arm = BernoulliArm(0, 0.5) 11 | rewards = arm.draw(2) 12 | assert 0 == rewards[0] 13 | assert 1 == rewards[1] 14 | 15 | 16 | def test_eq(): 17 | assert BernoulliArm(0, 0.5) == BernoulliArm(0, 0.5) 18 | 19 | 20 | def test_not_eq(): 21 | assert not BernoulliArm(1, 0.5) == BernoulliArm(0, 0.5) 22 | 23 | 24 | def test_not_object(): 25 | assert not 1 == BernoulliArm(0, 0.5) 26 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/softmix.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.softmax_decreasing import SoftmaxDecreasing 4 | 5 | 6 | class SoftMix(SoftmaxDecreasing): 7 | """ 8 | The SoftMix slightly differs from the decreasing softmax since it uses a temperature decreasing with a log(t)/t 9 | factor instead of 1/t factor. 10 | see: Cesa-Bianchi, Nicolo, and Paul Fischer. "Finite-Time Regret Bounds for the Multiarmed Bandit Problem." In ICML, 11 | pp. 100-108. 1998. 12 | """ 13 | 14 | def _get_temperature(self, iteration_number: int): 15 | return min(1.0, (self._temperature * numpy.log(iteration_number + 1)) / (iteration_number + 1)) 16 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/greedy_mix.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.epsilon_decreasing import EpsilonDecreasing 4 | 5 | 6 | class GreedyMix(EpsilonDecreasing): 7 | """ 8 | GreedyMix slightly differs from the epsilon-decreasing strategy as just presented because it uses a decreasing 9 | method of log(t) / t instead of 1/t, where t is the iteratation number. 10 | see: Cesa-Bianchi, Nicolo, and Paul Fischer. "Finite-Time Regret Bounds for the Multiarmed Bandit Problem." In ICML, 11 | pp. 100-108. 1998. 12 | """ 13 | 14 | def _get_epsilon(self, iteration_number): 15 | return min(1.0, (self._epsilon * numpy.log(iteration_number + 1)) / (iteration_number + 1)) 16 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/bernoulli_arm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from scipy.stats import bernoulli 3 | 4 | from multi_armed_bandit.arms.abstract_arm import Arm 5 | 6 | 7 | class BernoulliArm(Arm): 8 | def __init__(self, name: int, p: float) -> None: 9 | super().__init__(name) 10 | self._p = p 11 | 12 | def _get_rewards(self, size: int) -> numpy.ndarray: 13 | """ 14 | Extracts rewards from Bernoulli distribution. 15 | :param size: number of draws. 16 | :return: rewards. 
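Note: random_state is pinned to 42, so repeated calls return the same deterministic reward sequence; the unit tests under tests/multi_armed_bandit/arms rely on this.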
17 | """ 18 | return bernoulli.rvs(self._p, size=size, random_state=42) 19 | 20 | def __eq__(self, other: object) -> bool: 21 | if not isinstance(other, self.__class__): 22 | return False 23 | return self._name == other._name and self._probability == other._probability and self._p == other._p 24 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/normal_arm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from scipy.stats import norm 3 | 4 | from multi_armed_bandit.arms.abstract_arm import Arm 5 | 6 | 7 | class NormalArm(Arm): 8 | def __init__(self, name: int, mu: float, sigma: float) -> None: 9 | super().__init__(name) 10 | self._mu = mu 11 | self._sigma = sigma 12 | 13 | def _get_rewards(self, size: int) -> numpy.ndarray: 14 | """ 15 | Extracts rewards from Normal distribution. 16 | :param size: number of draws. 17 | :return: rewards. 18 | """ 19 | return norm.rvs(self._mu, self._sigma, size=size, random_state=42) 20 | 21 | def __eq__(self, other: object) -> bool: 22 | if not isinstance(other, self.__class__): 23 | return False 24 | return self._name == other._name and self._probability == other._probability and self._mu == other._mu \ 25 | and self._sigma == other._sigma 26 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/arms/test_normal_arm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from multi_armed_bandit.arms.normal_arm import NormalArm 4 | 5 | 6 | def test_draw_normal_arm(): 7 | arm = NormalArm(0, 0, 1) 8 | assert pytest.approx(0.4967, 0.0001) == round(arm.draw(), 4) 9 | 10 | 11 | def test_draw_normal_arm_size_2(): 12 | arm = NormalArm(0, 0, 1) 13 | rewards = arm.draw(2) 14 | assert pytest.approx(0.4967, 0.0001) == round(rewards[0], 4) 15 | assert pytest.approx(-0.1382, 0.001) == round(rewards[1], 4) 16 | 17 | 18 | def test_get_dict(): 19 | arm = NormalArm(0, 0, 1) 20 | arm.set_probability(0.5) 21 | assert {"name": 0, "probability": 0.5} == arm.get_dict() 22 | 23 | 24 | def test_eq(): 25 | assert NormalArm(0, 0, 1) == NormalArm(0, 0, 1) 26 | 27 | 28 | def test_not_eq(): 29 | assert not NormalArm(0, 0, 1) == NormalArm(1, 0, 1) 30 | 31 | 32 | def test_not_object(): 33 | assert not 1 == NormalArm(1, 0, 1) 34 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/ucb1.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 4 | 5 | 6 | class UCB1(MABAlgorithm): 7 | """ 8 | Upper Confidence Bounds. Initially, each arm is played once. Afterwards at round t, the algorithm greedily picks 9 | the i'th arm from max(mu^hat_i(t)+sqrt(2*ln(t)/n_i)). Where mu^hat_i(t) is the estimated mean reward of the i'th 10 | arm, t is the iteration number and n_i is the number of times the arm was played. 
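For example (illustrative numbers): with estimated means [0.4, 0.6], counts [10, 90] and t = 100, the indices are 0.4 + sqrt(2*ln(100)/10) ≈ 1.36 and 0.6 + sqrt(2*ln(100)/90) ≈ 0.92, so the under-played first arm is selected despite its lower empirical mean.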
11 | """ 12 | 13 | def select_arm(self, iteration_number: int) -> int: 14 | if 0 in self._counts: 15 | return int(numpy.argmin(self._counts)) 16 | ucb_values = self._states + numpy.sqrt(2 * numpy.log(iteration_number + 1) / self._counts) 17 | return int(numpy.argmax(ucb_values)) 18 | 19 | def __eq__(self, other: object) -> bool: 20 | return isinstance(other, self.__class__) and self._arms == other._arms 21 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | from multi_armed_bandit.algorithm.epsilon_first import EpsilonFirst 4 | from multi_armed_bandit.algorithm.exp3 import EXP3 5 | from multi_armed_bandit.algorithm.greedy_mix import GreedyMix 6 | from multi_armed_bandit.algorithm.softmax import Softmax 7 | from multi_armed_bandit.algorithm.softmax_decreasing import SoftmaxDecreasing 8 | from multi_armed_bandit.algorithm.softmix import SoftMix 9 | from multi_armed_bandit.algorithm.ucb1 import UCB1 10 | from multi_armed_bandit.algorithm.ucb1_tuned import UCB1Tuned 11 | from multi_armed_bandit.algorithm.ucb2 import UCB2 12 | from multi_armed_bandit.arms.normal_arm import NormalArm 13 | from multi_armed_bandit.algorithm.epsilon_decreasing import EpsilonDecreasing 14 | 15 | if __name__ == '__main__': 16 | arms = [NormalArm(0, 0.3, 1), NormalArm(1, 0.5, 1), NormalArm(2, 1, 1)] 17 | epsilon = 0.5 18 | algorithm = UCB2(arms, epsilon) 19 | results = algorithm.run_simulation(1000) 20 | # df = pandas.DataFrame(results) 21 | # df.to_csv("results.csv", index=False) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Idan Morad 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import setup, find_packages 4 | 5 | version = Path(__file__).parents[0].joinpath(".version").read_text().split("==")[1] 6 | long_description = Path(__file__).parents[0].joinpath("README.md").read_text() 7 | 8 | setup(name="MultiArmedBandit-DeepLearning", 9 | version=version, 10 | author="Idan Morad", 11 | description="Multi Armed Bandit Algorithm w/ Deep Learning " 12 | "Built with Python 3.6.6, tox 3.4.0 and TensorFlow 1.12.2", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | classifiers=["Development Status :: 5 - Production/Stable", 16 | "Framework :: tox", 17 | "Intended Audience :: Developers", 18 | "Intended Audience :: Education", 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: MIT License", 21 | "Programming Language :: Python :: 3.6", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | "Topic :: Software Development :: Testing :: Unit", 24 | "Topic :: Software Development :: Version Control :: Git"], 25 | keywords="multi-armed-bandit deep-learning", 26 | packages=find_packages(exclude=['contrib', 'docs', 'tests'])) 27 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class EpsilonGreedy(MABAlgorithm): 10 | """ 11 | Epsilon-greedy strategy - The best lever is selected for a proportion 1 - epsilon of the trials, and a lever is 12 | selected at random (with uniform probability) for a proportion epsilon. 13 | """ 14 | 15 | def __init__(self, arms: List[Arm], epsilon: float) -> None: 16 | super().__init__(arms) 17 | 18 | if not (0.0 < epsilon < 1.0): 19 | raise ValueError("epsilon is not between 0 and 1") 20 | 21 | self._epsilon = epsilon 22 | 23 | def select_arm(self, iteration_number: int) -> int: 24 | """ 25 | Select an arm: a random arm with probability epsilon, otherwise the arm with the highest estimated 26 | mean. 27 | :return: index of chosen arm. 28 | """ 29 | if numpy.random.random() > self._get_epsilon(iteration_number): 30 | return int(numpy.argmax(self._states)) 31 | else: 32 | return numpy.random.choice(len(self._arms)) 33 | 34 | def _get_epsilon(self, iteration_number: int) -> float: 35 | return self._epsilon 36 | 37 | def __eq__(self, other: object) -> bool: 38 | if not isinstance(other, self.__class__): 39 | return False 40 | return self._epsilon == other._epsilon and self._arms == other._arms 41 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/ucb1_tuned.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.ucb1 import UCB1 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class UCB1Tuned(UCB1): 10 | """ 11 | The main feature of UCB1-Tuned is that it takes into account the variance of each arm and not only the empirical 12 | mean.
More specifically, at round t the algorithm picks the i'th arm from max(mu^hat_i + sqrt(ln(t)/n_i * 13 | min(1/4, V_i(n_i)))). Where V_i(t)=sigma^2_i(t)+sqrt(2ln(t)/n_i(t)). The estimated of variance sigma^2_i(t) can 14 | be computed as usually by maintaining the empirical sum of squares of the reward, in addition to the empirical mean. 15 | """ 16 | 17 | def __init__(self, arms: List[Arm]) -> None: 18 | super().__init__(arms) 19 | self._reward_squares = numpy.zeros(len(self._arms)) 20 | 21 | def select_arm(self, iteration_number: int) -> int: 22 | if 0 in self._counts: 23 | return int(numpy.argmin(self._counts)) 24 | variances = (self._reward_squares / self._counts) - self._states ** 2 25 | vi = variances + numpy.sqrt(2 * numpy.log(iteration_number + 1) / self._counts) 26 | vi = numpy.array([min(0.25, value) for value in vi]) 27 | ucb_values = self._states + numpy.sqrt((2 * numpy.log(iteration_number + 1) / self._counts) * vi) 28 | return int(numpy.argmax(ucb_values)) 29 | 30 | def _after_draw(self, reward: float, chosen_arm_index: int) -> None: 31 | self._reward_squares[chosen_arm_index] += (reward ** 2) 32 | -------------------------------------------------------------------------------- /multi_armed_bandit/arms/abstract_arm.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy 4 | 5 | 6 | class Arm(object): 7 | def __init__(self, name: Union[int, float, str]) -> None: 8 | self._name = name 9 | self._probability = 0.0 10 | 11 | def get_name(self) -> int: 12 | """ 13 | Returns the arm name. 14 | :return: the arm name. 15 | """ 16 | return self._name 17 | 18 | def set_probability(self, probability) -> None: 19 | """ 20 | Sets the arm probability. 21 | :param probability: the arm probability. 22 | """ 23 | self._probability = probability 24 | 25 | def get_probability(self) -> float: 26 | """ 27 | Returns the arm probability. 28 | :return: the arm probability. 29 | """ 30 | return self._probability 31 | 32 | def draw(self, size: int = 1) -> Union[float, numpy.ndarray]: 33 | """ 34 | Pull the level of the arm. 35 | :param size: number of draws. 36 | :return: the rewards from the draw. 37 | """ 38 | rewards = self._get_rewards(size) 39 | if size == 1: 40 | return rewards[0] 41 | return rewards 42 | 43 | def _get_rewards(self, size: int) -> numpy.ndarray: 44 | """ 45 | Extracts rewards per arm distribution. 46 | :param size: number of draws. 47 | :return: rewards. 48 | """ 49 | raise NotImplementedError 50 | 51 | def get_dict(self) -> dict: 52 | """ 53 | Retrn the arm values as dictionary with name and probability. 54 | :return: 55 | """ 56 | return {"name": self.get_name(), "probability": float(self.get_probability())} 57 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/softmax.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List 3 | 4 | import numpy 5 | 6 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 7 | from multi_armed_bandit.arms.abstract_arm import Arm 8 | 9 | 10 | class Softmax(MABAlgorithm): 11 | """ 12 | Also known as Boltzmann Exploration. Each arm is assign with probability that is proportional to its average reward. 13 | Arms with greater empirical mean are therefore picked with higher probability. The probability to be picked is 14 | defined: p_i(t+1) = (e^(mu^hat_i(t)/tua)/sum^K_j=1(e^(mu^hat_j(t)/tua))). 
Where K is number of arms, mu^hat_i(t) is 15 | the estimated mean reward of the i'th arm, and tua is a temperature parameter controlling the randomness of the 16 | choice. when tau is close to 0 the policy acts like pure greedy, and as tau tends to infinity, the algorithm picks 17 | arms uniformly at random. 18 | """ 19 | 20 | def __init__(self, arms: List[Arm], temperature) -> None: 21 | super().__init__(arms) 22 | if temperature <= 0.0: 23 | raise ValueError("temperature must be positive") 24 | self._temperature = temperature 25 | 26 | def select_arm(self, iteration_number: int) -> int: 27 | exp = numpy.exp(self._states / self._get_temperature(iteration_number)) 28 | probabilities = exp / numpy.sum(exp, axis=0) 29 | return int(numpy.argmax(numpy.random.multinomial(1, probabilities))) 30 | 31 | def _get_temperature(self, iteration_number: int): 32 | return self._temperature 33 | 34 | def __eq__(self, other: object) -> bool: 35 | if not isinstance(other, self.__class__): 36 | return False 37 | return self._temperature == other._temperature and self._arms == other._arms 38 | -------------------------------------------------------------------------------- /tests/multi_armed_bandit/algorithm/test_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy 4 | from multi_armed_bandit.arms.normal_arm import NormalArm 5 | 6 | 7 | def test_epsilon_greedy_algorithm(): 8 | arms = [NormalArm(name=0, mu=0.5, sigma=1.0), NormalArm(1, 0.3, 1.0), NormalArm(2, 1.0, 1.0)] 9 | algorithm = EpsilonGreedy(arms, 0.9) 10 | algorithm.run_simulation(1000) 11 | assert pytest.approx(0.31, 0.1) == round(arms[0].get_probability(), 2) 12 | assert pytest.approx(0.30, 0.1) == round(arms[1].get_probability(), 2) 13 | assert pytest.approx(0.38, 0.1) == round(arms[2].get_probability(), 2) 14 | 15 | 16 | def test_epsilon_out_of_range(): 17 | with pytest.raises(ValueError): 18 | EpsilonGreedy([NormalArm(name=0, mu=0.5, sigma=1.0)], 0.0) 19 | 20 | with pytest.raises(ValueError): 21 | EpsilonGreedy([NormalArm(name=0, mu=0.5, sigma=1.0)], -5.0) 22 | 23 | with pytest.raises(ValueError): 24 | EpsilonGreedy([NormalArm(name=0, mu=0.5, sigma=1.0)], 1.1) 25 | 26 | 27 | def test_one_arm(): 28 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 29 | algorithm = EpsilonGreedy([arm], 0.9) 30 | algorithm.run_simulation(1000) 31 | assert 1.0 == arm.get_probability() 32 | 33 | 34 | def test_eq(): 35 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 36 | assert EpsilonGreedy([arm], 0.9) == EpsilonGreedy([arm], 0.9) 37 | 38 | 39 | def test_not_object(): 40 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 41 | assert not (1 == EpsilonGreedy([arm], 0.3)) 42 | 43 | 44 | def test_iteration_negative(): 45 | arm = NormalArm(name=0, mu=0.5, sigma=1.0) 46 | algorithm = EpsilonGreedy([arm], 0.9) 47 | 48 | with pytest.raises(ValueError): 49 | algorithm.run_simulation(0) 50 | 51 | with pytest.raises(ValueError): 52 | algorithm.run_simulation(-1) 53 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/exp3.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class EXP3(MABAlgorithm): 10 | """ 11 | Exponential weight algorithm. 
The probability of choosing lever i at round t is: 12 | p_i(t) = (1 - gamma) * (weight_i(t) / sum^K_j=1(weight_j(t))) + (gamma / K), where K is the number of arms. 13 | If lever i has been pulled: weight_i(t + 1) = weight_i(t) * e^((gamma * r_i(t)) / (probability_i(t) * K)), 14 | where r_i(t) is the reward drawn in round t. Otherwise weight_i(t + 1) = weight_i(t). 15 | """ 16 | 17 | def __init__(self, arms: List[Arm], gamma: float) -> None: 18 | super().__init__(arms) 19 | 20 | if not (0.0 <= gamma < 1.0): 21 | raise ValueError("gamma must be between 0 and 1") 22 | self._gamma = gamma 23 | self._weights = numpy.ones(len(self._arms)) 24 | self._probabilities = numpy.zeros(len(self._arms)) 25 | 26 | def select_arm(self, iteration_number: int) -> int: 27 | self._probabilities = ((1 - self._gamma) * self._weights / numpy.sum(self._weights, axis=0)) + ( 28 | self._gamma / len(self._arms)) 29 | return int(numpy.argmax(numpy.random.multinomial(1, self._probabilities))) 30 | 31 | def _after_draw(self, reward: float, chosen_arm_index: int) -> None: 32 | probability = self._probabilities[chosen_arm_index] 33 | growth_factor = numpy.exp(self._gamma * (reward / (probability * len(self._arms)))) 34 | self._weights[chosen_arm_index] = self._weights[chosen_arm_index] * growth_factor 35 | 36 | def __eq__(self, other: object) -> bool: 37 | if not isinstance(other, self.__class__): 38 | return False 39 | return self._gamma == other._gamma and self._arms == other._arms 40 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/ucb2.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | 5 | from multi_armed_bandit.algorithm.abstract_algorithm import MABAlgorithm 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class UCB2(MABAlgorithm): 10 | """ 11 | Plays the chosen arm for an epoch of tau(r + 1) - tau(r) rounds, where tau(r) = ceil((1 + alpha)^r), before re-computing the upper confidence bounds. see: https://webdocs.cs.ualberta.ca/~games/go/seminar/notes/2007/slides_ucb.pdf 12 | """ 13 | 14 | def __init__(self, arms: List[Arm], alpha: float) -> None: 15 | super().__init__(arms) 16 | 17 | if not (0.0 <= alpha < 1.0): 18 | raise ValueError("alpha must be between 0 and 1") 19 | 20 | self._alpha = alpha 21 | self._r = numpy.zeros(len(self._arms)) 22 | self._next_iteration_to_calc = 0 23 | self._current_arm_index = 0 24 | 25 | def select_arm(self, iteration_number: int) -> int: 26 | if 0 in self._counts: 27 | chosen_arm = int(numpy.argmin(self._counts)) 28 | self._r[chosen_arm] += 1 29 | return chosen_arm 30 | 31 | if self._next_iteration_to_calc > iteration_number + 1: 32 | return self._current_arm_index 33 | 34 | ucb_values = self._states + numpy.sqrt( 35 | ((1 + self._alpha) * numpy.log(numpy.e * (iteration_number + 1) / self._get_tau())) / (2 * self._get_tau())) 36 | chosen_arm = int(numpy.argmax(ucb_values)) 37 | self._update_next_time_to_calc(chosen_arm) 38 | return chosen_arm 39 | 40 | def _get_tau(self) -> numpy.ndarray: 41 | return numpy.ceil((1 + self._alpha) ** self._r) 42 | 43 | def _update_next_time_to_calc(self, arm_index: int) -> None: 44 | self._current_arm_index = arm_index 45 | self._next_iteration_to_calc += max(1, self._get_tau_for_single_r( 46 | self._r[arm_index] + 1) - self._get_tau_for_single_r(self._r[arm_index])) 47 | self._r[arm_index] += 1 48 | 49 | def _get_tau_for_single_r(self, r: int) -> int: 50 | return int(numpy.ceil((1 + self._alpha) ** r)) 51 | 52 | def __eq__(self, other: object) -> bool: 53 | if not isinstance(other, self.__class__): 54 | return False 55 | return
self._alpha == other._alpha and self._arms == other._arms 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Windows template 3 | # Windows thumbnail cache files 4 | Thumbs.db 5 | ehthumbs.db 6 | ehthumbs_vista.db 7 | 8 | # Dump file 9 | *.stackdump 10 | 11 | # Folder config file 12 | [Dd]esktop.ini 13 | 14 | # Recycle Bin used on file shares 15 | $RECYCLE.BIN/ 16 | 17 | # Windows Installer files 18 | *.cab 19 | *.msi 20 | *.msix 21 | *.msm 22 | *.msp 23 | 24 | # Windows shortcuts 25 | *.lnk 26 | ### Linux template 27 | *~ 28 | 29 | # temporary files which can be created if a process still has a handle open of a deleted file 30 | .fuse_hidden* 31 | 32 | # KDE directory preferences 33 | .directory 34 | 35 | # Linux trash folder which might appear on any partition or disk 36 | .Trash-* 37 | 38 | # .nfs files are created when an open file is removed but is still being accessed 39 | .nfs* 40 | ### JetBrains template 41 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 42 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 43 | 44 | # User-specific stuff 45 | .idea/ 46 | 47 | # CMake 48 | cmake-build-debug/ 49 | cmake-build-release/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | ### macOS template 78 | # General 79 | .DS_Store 80 | .AppleDouble 81 | .LSOverride 82 | 83 | # Icon must end with two \r 84 | Icon 85 | 86 | # Thumbnails 87 | ._* 88 | 89 | # Files that might appear in the root of a volume 90 | .DocumentRevisions-V100 91 | .fseventsd 92 | .Spotlight-V100 93 | .TemporaryItems 94 | .Trashes 95 | .VolumeIcon.icns 96 | .com.apple.timemachine.donotpresent 97 | 98 | # Directories potentially created on remote AFP share 99 | .AppleDB 100 | .AppleDesktop 101 | Network Trash Folder 102 | Temporary Items 103 | .apdisk 104 | ### Python template 105 | # Byte-compiled / optimized / DLL files 106 | __pycache__/ 107 | *.py[cod] 108 | *$py.class 109 | 110 | # C extensions 111 | *.so 112 | 113 | # Distribution / packaging 114 | .Python 115 | build/ 116 | develop-eggs/ 117 | dist/ 118 | downloads/ 119 | eggs/ 120 | .eggs/ 121 | lib/ 122 | lib64/ 123 | parts/ 124 | sdist/ 125 | var/ 126 | wheels/ 127 | *.egg-info/ 128 | .installed.cfg 129 | *.egg 130 | MANIFEST 131 | 132 | # PyInstaller 133 | # Usually these files are written by a python script from a template 134 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
135 | *.manifest 136 | *.spec 137 | 138 | # Installer logs 139 | pip-log.txt 140 | pip-delete-this-directory.txt 141 | 142 | # Unit test / coverage reports 143 | htmlcov/ 144 | .tox/ 145 | .coverage 146 | .coverage.* 147 | .cache 148 | nosetests.xml 149 | coverage.xml 150 | *.cover 151 | .hypothesis/ 152 | .pytest_cache/ 153 | 154 | # Translations 155 | *.mo 156 | *.pot 157 | 158 | # Django stuff: 159 | *.log 160 | local_settings.py 161 | db.sqlite3 162 | 163 | # Flask stuff: 164 | instance/ 165 | .webassets-cache 166 | 167 | # Scrapy stuff: 168 | .scrapy 169 | 170 | # Sphinx documentation 171 | docs/_build/ 172 | 173 | # PyBuilder 174 | target/ 175 | 176 | # Jupyter Notebook 177 | .ipynb_checkpoints 178 | 179 | # pyenv 180 | .python-version 181 | 182 | # celery beat schedule file 183 | celerybeat-schedule 184 | 185 | # SageMath parsed files 186 | *.sage.py 187 | 188 | # Environments 189 | .env 190 | .venv 191 | env/ 192 | venv/ 193 | ENV/ 194 | env.bak/ 195 | venv.bak/ 196 | 197 | # Spyder project settings 198 | .spyderproject 199 | .spyproject 200 | 201 | # Rope project settings 202 | .ropeproject 203 | 204 | # mkdocs documentation 205 | /site 206 | 207 | # mypy 208 | .mypy_cache/ 209 | -------------------------------------------------------------------------------- /multi_armed_bandit/algorithm/abstract_algorithm.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy 4 | import tensorflow 5 | 6 | from multi_armed_bandit.arms.abstract_arm import Arm 7 | 8 | 9 | class MABAlgorithm(object): 10 | """ 11 | Multi-armed bandit - In probability theory, the multi-armed bandit problem (sometimes called the K- or N-armed 12 | bandit problem) is a problem in which a fixed limited set of resources must be allocated between competing 13 | (alternative) choices in a way that maximizes their expected gain, when each choice's properties are only partially 14 | known at the time of allocation, and may become better understood as time passes or by allocating resources to the 15 | choice. taken from https://en.wikipedia.org/wiki/Multi-armed_bandit 16 | """ 17 | 18 | def __init__(self, arms: List[Arm]) -> None: 19 | self._arms = arms 20 | self._states = numpy.zeros(len(self._arms)) 21 | self._counts = numpy.zeros(len(self._arms)) 22 | self._iterations = 0 23 | 24 | def select_arm(self, iteration_number: int) -> int: 25 | """ 26 | The method that returns the index of the Arm that the algorithm selects on the current play. 27 | :return: index of chosen arm. 28 | """ 29 | raise NotImplementedError 30 | 31 | def run_simulation(self, iterations) -> List[dict]: 32 | """ 33 | Run simulation and update the probabilities to pull for each arm. 34 | :param iterations: number of iterations. 
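:return: a list with one dict per iteration containing iteration, chosen_arm, regret, avg_regret and avg_collected_rewards.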
35 | """ 36 | if iterations < 1: 37 | raise ValueError("Iterations must be positive") 38 | 39 | self._iterations = iterations 40 | 41 | number_of_arms = len(self._arms) 42 | 43 | results = [] 44 | optimal_strategy_rewards = 0.0 45 | collected_rewards = 0.0 46 | 47 | rewards = numpy.zeros([number_of_arms, self._iterations]) 48 | 49 | tensorflow.reset_default_graph() 50 | weights = tensorflow.Variable(tensorflow.ones([number_of_arms])) 51 | 52 | reward_holder = tensorflow.placeholder(shape=[1], dtype=tensorflow.float32) 53 | action_holder = tensorflow.placeholder(shape=[1], dtype=tensorflow.int32) 54 | responsible_weight = tensorflow.slice(weights, action_holder, [1]) 55 | loss = -(tensorflow.log(responsible_weight) * reward_holder) 56 | optimizer = tensorflow.train.AdamOptimizer(learning_rate=0.001) 57 | update = optimizer.minimize(loss) 58 | init = tensorflow.global_variables_initializer() 59 | tensorflow.set_random_seed(42) 60 | 61 | ww = numpy.zeros(number_of_arms) 62 | 63 | with tensorflow.Session() as sess: 64 | sess.run(init) 65 | 66 | for arm_index in range(0, number_of_arms): 67 | rewards[arm_index] = self._arms[arm_index].draw(self._iterations) 68 | 69 | self._states = numpy.zeros(number_of_arms) 70 | self._counts = numpy.zeros(number_of_arms) 71 | 72 | for iteration in range(0, self._iterations): 73 | chosen_arm_index = self.select_arm(iteration) 74 | reward = rewards[chosen_arm_index, iteration] 75 | 76 | _, _, ww = sess.run([update, responsible_weight, weights], 77 | feed_dict={reward_holder: [reward], action_holder: [chosen_arm_index]}) 78 | 79 | self._counts[chosen_arm_index] += 1 80 | count = self._counts[chosen_arm_index] 81 | 82 | self._update_current_states(chosen_arm_index, count, reward) 83 | 84 | self._after_draw(reward, chosen_arm_index) 85 | 86 | collected_rewards += reward 87 | optimal_strategy_rewards += numpy.max(rewards[:, iteration]) 88 | regret = optimal_strategy_rewards - collected_rewards 89 | 90 | results.append( 91 | {"iteration": iteration, "chosen_arm": self._arms[chosen_arm_index].get_name(), "regret": regret, 92 | "avg_regret": regret / (iteration + 1), 93 | "avg_collected_rewards": collected_rewards / (iteration + 1)}) 94 | 95 | exp = numpy.exp(ww) 96 | probabilities = exp / numpy.sum(exp, axis=0) 97 | 98 | for index in range(0, number_of_arms): 99 | self._arms[index].set_probability(probabilities[index]) 100 | 101 | return results 102 | 103 | def _update_current_states(self, chosen_arm_index, count, reward) -> None: 104 | self._states[chosen_arm_index] = ((count - 1) / count) * self._states[chosen_arm_index] + ( 105 | 1 / count) * reward 106 | 107 | def _after_draw(self, reward: float, chosen_arm_index: int) -> None: 108 | """ 109 | After process method. 110 | :param reward: the reward drawn. 111 | :param chosen_arm_index: the chosen arm index. 112 | """ 113 | pass 114 | --------------------------------------------------------------------------------
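A minimal end-to-end usage sketch (not a file in this repository; it assumes the multi_armed_bandit package above is on the import path and the TensorFlow 1.x API used by MABAlgorithm is installed per requirements.txt):

from multi_armed_bandit.algorithm.epsilon_greedy import EpsilonGreedy
from multi_armed_bandit.arms.bernoulli_arm import BernoulliArm

# Three Bernoulli levers with different success probabilities.
arms = [BernoulliArm(0, 0.2), BernoulliArm(1, 0.5), BernoulliArm(2, 0.8)]

# Explore 10% of the time; otherwise pull the empirically best lever.
algorithm = EpsilonGreedy(arms, 0.1)

# run_simulation drives the TensorFlow policy-gradient update inside
# MABAlgorithm and returns one result dict per iteration.
results = algorithm.run_simulation(1000)
print(results[-1]["avg_regret"])

# After the run each arm carries a softmax-normalised pull probability.
for arm in arms:
    print(arm.get_dict())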