├── setup.py
├── LICENSE
├── gym_bandits
    ├── __init__.py
    ├── bandit.py
    └── scoreboard.py
├── .gitignore
└── README.md


/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='gym_bandits',
 4 |       version='0.01',
 5 |       description='Gym User Env - Various N-Armed Bandit Problems',
 6 |       url='https://github.com/jkcooper2/gym_bandits',
 7 |       author='Jesse Cooper',
 8 |       packages=['gym_bandits'],
 9 |       author_email='jesse_cooper@hotmail.com',
10 |       license='MIT License',
11 |       install_requires=['gym>=0.2.3'],
12 | )
13 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Jesse Cooper
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/gym_bandits/__init__.py:
--------------------------------------------------------------------------------
 1 | from gym.envs.registration import register
 2 | 
 3 | from .bandit import BanditTenArmedRandomFixed
 4 | from .bandit import BanditTenArmedRandomRandom
 5 | from .bandit import BanditTenArmedGaussian
 6 | from .bandit import BanditTenArmedUniformDistributedReward
 7 | from .bandit import BanditTwoArmedDeterministicFixed
 8 | from .bandit import BanditTwoArmedHighHighFixed
 9 | from .bandit import BanditTwoArmedHighLowFixed
10 | from .bandit import BanditTwoArmedLowLowFixed
11 | from .bandit import BanditTwoArmedUniform
12 | 
13 | environments = [
14 |   ['BanditTenArmedRandomFixed', 'v0'],
15 |   ['BanditTenArmedRandomRandom', 'v0'],
16 |   ['BanditTenArmedGaussian', 'v0'],
17 |   ['BanditTenArmedUniformDistributedReward', 'v0'],
18 |   ['BanditTwoArmedDeterministicFixed', 'v0'],
19 |   ['BanditTwoArmedHighHighFixed', 'v0'],
20 |   ['BanditTwoArmedHighLowFixed', 'v0'],
21 |   ['BanditTwoArmedLowLowFixed', 'v0'],
22 |   ['BanditTwoArmedUniform', 'v0'],
23 | ]
24 | 
25 | for environment in environments:
26 |     register(
27 |         id='{}-{}'.format(environment[0], environment[1]),
28 |         entry_point='gym_bandits:{}'.format(environment[0]),
29 |         max_episode_steps=1,
30 |         nondeterministic=False,
31 |     )
32 |     
33 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | # PyCharm Project Files
92 | .idea
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bandit Environments
 2 | 
 3 | Series of n-armed bandit environments for the OpenAI Gym
 4 | 
 5 | ## Environments
 6 | * `BanditTwoArmedDeterministicFixed-v0`: Simplest case where one bandit always pays, and the other always doesn't
 7 | * `BanditTwoArmedHighLowFixed-v0`: Stochastic version with a large difference between which bandit pays out of two choices
 8 | * `BanditTwoArmedHighHighFixed-v0`: Stochastic version with a small difference between which bandit pays where both are good
 9 | * `BanditTwoArmedLowLowFixed-v0`: Stochastic version with a small difference between which bandit pays where both are bad
10 | * `BanditTwoArmedUniform-v0`: Stochastic version both arms pay between 0 and 1
11 | * `BanditTenArmedRandomFixed-v0`: 10 armed bandit with random probabilities assigned to payouts
12 | * `BanditTenArmedRandomRandom-v0`: 10 armed bandit with random probabilities assigned to both payouts and rewards
13 | * `BanditTenArmedUniformDistributedReward-v0`: 10 armed bandit with that always pays out with a reward selected from a uniform distribution
14 | * `BanditTenArmedGaussian-v0`: 10 armed bandit mentioned on page 30 of [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/the-book-2nd.html) (Sutton and Barto)
15 | 
16 | ## Installation
17 | ```bash
18 | git clone git@github.com:mimoralea/gym-bandits.git
19 | cd gym-bandits
20 | pip install .
21 | ```
22 | 
23 | or:
24 | 
25 | ```bash
26 | pip install git+https://github.com/mimoralea/gym-bandits#egg=gym-bandits
27 | ```
28 | 
29 | 
30 | In your gym environment
31 | ```python
32 | import gym, gym_bandits
33 | env = gym.make("BanditTenArmedGaussian-v0")
34 | ```
35 | 


--------------------------------------------------------------------------------
/gym_bandits/bandit.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import gym
  3 | from gym import spaces
  4 | from gym.utils import seeding
  5 | 
  6 | 
  7 | class BanditEnv(gym.Env):
  8 |     """
  9 |     Bandit environment base to allow agents to interact with the class n-armed bandit
 10 |     in different variations
 11 | 
 12 |     p_dist:
 13 |         A list of probabilities of the likelihood that a particular bandit will pay out
 14 |     r_dist:
 15 |         A list of either rewards (if number) or means and standard deviations (if list)
 16 |         of the payout that bandit has
 17 |     """
 18 |     def __init__(self, p_dist, r_dist):
 19 |         if len(p_dist) != len(r_dist):
 20 |             raise ValueError("Probability and Reward distribution must be the same length")
 21 | 
 22 |         if min(p_dist) < 0 or max(p_dist) > 1:
 23 |             raise ValueError("All probabilities must be between 0 and 1")
 24 | 
 25 |         for reward in r_dist:
 26 |             if isinstance(reward, list) and reward[1] <= 0:
 27 |                 raise ValueError("Standard deviation in rewards must all be greater than 0")
 28 | 
 29 |         self.p_dist = p_dist
 30 |         self.r_dist = r_dist
 31 | 
 32 |         self.n_bandits = len(p_dist)
 33 |         self.action_space = spaces.Discrete(self.n_bandits)
 34 |         self.observation_space = spaces.Discrete(1)
 35 | 
 36 |         self.seed()
 37 | 
 38 |     def seed(self, seed=None):
 39 |         self.np_random, seed = seeding.np_random(seed)
 40 |         return [seed]
 41 | 
 42 |     def step(self, action):
 43 |         assert self.action_space.contains(action)
 44 | 
 45 |         reward = 0
 46 |         done = True
 47 | 
 48 |         if self.np_random.uniform() < self.p_dist[action]:
 49 |             if not isinstance(self.r_dist[action], list):
 50 |                 reward = self.r_dist[action]
 51 |             else:
 52 |                 reward = self.np_random.normal(self.r_dist[action][0], self.r_dist[action][1])
 53 | 
 54 |         return 0, reward, done, {}
 55 | 
 56 |     def reset(self):
 57 |         return 0
 58 | 
 59 |     def render(self, mode='human', close=False):
 60 |         pass
 61 | 
 62 | 
 63 | class BanditTwoArmedDeterministicFixed(BanditEnv):
 64 |     """Simplest case where one bandit always pays, and the other always doesn't"""
 65 |     def __init__(self):
 66 |         BanditEnv.__init__(self, p_dist=[1, 0], r_dist=[1, 1])
 67 | 
 68 | class BanditTwoArmedHighLowFixed(BanditEnv):
 69 |     """Stochastic version with a large difference between which bandit pays out of two choices"""
 70 |     def __init__(self):
 71 |         BanditEnv.__init__(self, p_dist=[0.8, 0.2], r_dist=[1, 1])
 72 | 
 73 | class BanditTwoArmedHighHighFixed(BanditEnv):
 74 |     """Stochastic version with a small difference between which bandit pays where both are good"""
 75 |     def __init__(self):
 76 |         BanditEnv.__init__(self, p_dist=[0.8, 0.9], r_dist=[1, 1])
 77 | 
 78 | class BanditTwoArmedLowLowFixed(BanditEnv):
 79 |     """Stochastic version with a small difference between which bandit pays where both are bad"""
 80 |     def __init__(self):
 81 |         BanditEnv.__init__(self, p_dist=[0.1, 0.2], r_dist=[1, 1])
 82 |         
 83 | class BanditTwoArmedUniform(BanditEnv):
 84 |     """Stochastic version with rewards of one and random probabilities assigned to both payouts"""
 85 |     def __init__(self, bandits=2, seed=1):
 86 |         self.seed(seed)
 87 |         p_dist = self.np_random.uniform(size=bandits)
 88 |         r_dist = np.full(bandits, 1)
 89 |         BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
 90 | 
 91 | class BanditTenArmedRandomFixed(BanditEnv):
 92 |     """10 armed bandit with random probabilities assigned to payouts"""
 93 |     def __init__(self, bandits=10, seed=1):
 94 |         self.seed(seed)
 95 |         p_dist = self.np_random.uniform(size=bandits)
 96 |         r_dist = np.full(bandits, 1)
 97 |         BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
 98 | 
 99 | class BanditTenArmedUniformDistributedReward(BanditEnv):
100 |     """10 armed bandit that always pays out with a reward selected from a uniform distribution"""
101 |     def __init__(self, bandits=10, seed=1):
102 |         self.seed(seed)
103 |         p_dist = np.full(bandits, 1)
104 |         r_dist = self.np_random.uniform(size=bandits)
105 |         BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
106 | 
107 | class BanditTenArmedRandomRandom(BanditEnv):
108 |     """10 armed bandit with random probabilities assigned to both payouts and rewards"""
109 |     def __init__(self, bandits=10, seed=1):
110 |         self.seed(seed)
111 |         p_dist = self.np_random.uniform(size=bandits)
112 |         r_dist = self.np_random.uniform(size=bandits)
113 |         BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
114 | 
115 | class BanditTenArmedGaussian(BanditEnv):
116 |     """
117 |     10 armed bandit mentioned on page 30 of Sutton and Barto's
118 |     [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)
119 | 
120 |     Actions always pay out
121 |     Mean of payout is pulled from a normal distribution (0, 1) (called q*(a))
122 |     Actual reward is drawn from a normal distribution (q*(a), 1)
123 |     """
124 |     def __init__(self, bandits=10, seed=1):
125 |         self.seed(seed)
126 |         p_dist = np.full(bandits, 1)
127 |         r_dist = []
128 | 
129 |         for i in range(bandits):
130 |             r_dist.append([self.np_random.normal(0, 1), 1])
131 | 
132 |         BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist)
133 | 


--------------------------------------------------------------------------------
/gym_bandits/scoreboard.py:
--------------------------------------------------------------------------------
  1 | from gym.scoreboard.registration import add_task, add_group
  2 | 
  3 | 
  4 | add_group(
  5 |     id='bandits',
  6 |     name='Bandits',
  7 |     description='Various N-Armed Bandit environments'
  8 | )
  9 | 
 10 | add_task(
 11 |     id='BanditTwoArmedDeterministicFixed-v0',
 12 |     group='bandits',
 13 |     experimental=True,
 14 |     contributor='jkcooper2',
 15 |     summary="Simplest bandit where one action always pays, and the other never does.",
 16 |     description="""
 17 |     Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
 18 |     and a reward distribution, which is the value or distribution of what the agent will be rewarded
 19 |     the bandit does payout.
 20 | 
 21 |     p_dist = [1, 0]
 22 |     r_dist = [1, 1]
 23 |     """,
 24 |     background=""
 25 | )
 26 | 
 27 | add_task(
 28 |     id='BanditTwoArmedHighHighFixed-v0',
 29 |     group='bandits',
 30 |     experimental=True,
 31 |     contributor='jkcooper2',
 32 |     summary="Stochastic version with a small difference between which bandit pays where both are likely",
 33 |     description="""
 34 |     Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
 35 |     and a reward distribution, which is the value or distribution of what the agent will be rewarded
 36 |     the bandit does payout.
 37 | 
 38 |     p_dist = [0.8, 0.9]
 39 |     r_dist = [1, 1]
 40 |     """,
 41 |     background="Bandit B Figure 2.3 from Reinforcement Learning: An Introduction (Sutton & Barto) [link](https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html)"
 42 | )
 43 | 
 44 | add_task(
 45 |     id='BanditTwoArmedLowLowFixed-v0',
 46 |     group='bandits',
 47 |     experimental=True,
 48 |     contributor='jkcooper2',
 49 |     summary="Stochastic version with a small difference between which bandit pays where both are unlikley",
 50 |     description="""
 51 |     Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
 52 |     and a reward distribution, which is the value or distribution of what the agent will be rewarded
 53 |     the bandit does payout.
 54 | 
 55 |     p_dist = [0.1, 0.2]
 56 |     r_dist = [1, 1]
 57 |     """,
 58 |     background="Bandit A Figure 2.3 from Reinforcement Learning: An Introduction (Sutton & Barto) [link](https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html)"
 59 | )
 60 | 
 61 | add_task(
 62 |     id='BanditTwoArmedHighLowFixed-v0',
 63 |     group='bandits',
 64 |     experimental=True,
 65 |     contributor='jkcooper2',
 66 |     summary="Stochastic version with a large difference between which bandit pays out of two choices",
 67 |     description="""
 68 |     Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
 69 |     and a reward distribution, which is the value or distribution of what the agent will be rewarded
 70 |     the bandit does payout.
 71 | 
 72 |     p_dist = [0.8, 0.2]
 73 |     r_dist = [1, 1]
 74 |     """,
 75 |     background=""
 76 | )
 77 | 
 78 | add_task(
 79 |     id='BanditTenArmedGaussian-v0',
 80 |     group='bandits',
 81 |     experimental=True,
 82 |     contributor='jkcooper2',
 83 |     summary="10 armed bandit mentioned with reward based on a Gaussian distribution",
 84 |     description="""
 85 |     Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
 86 |     and a reward distribution, which is the value or distribution of what the agent will be rewarded
 87 |     the bandit does payout.
 88 | 
 89 |     p_dist = [1] (* 10)
 90 |     r_dist = [numpy.random.normal(0, 1), 1] (* 10)
 91 | 
 92 |     Every bandit always pays out
 93 |     Each action has a reward mean (selected from a normal distribution with mean 0 and std 1), and the actual
 94 |     reward returns is selected with a std of 1 around the selected mean
 95 |     """,
 96 |     background="Described on page 30 of Sutton and Barto's [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)"
 97 | )
 98 | 
 99 | add_task(
100 |     id='BanditTenArmedRandomRandom-v0',
101 |     group='bandits',
102 |     experimental=True,
103 |     contributor='jkcooper2',
104 |     summary="10 armed bandit with random probabilities assigned to both payouts and rewards",
105 |     description="""
106 |     Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
107 |     and a reward distribution, which is the value or distribution of what the agent will be rewarded
108 |     the bandit does payout.
109 | 
110 |     p_dist = numpy.random.uniform(size=10)
111 |     r_dist = numpy.random.uniform(size=10)
112 | 
113 |     Bandits have uniform probability of paying out and payout a reward of uniform probability
114 |     """,
115 |     background=""
116 | )
117 | 
118 | add_task(
119 |     id='BanditTenArmedRandomFixed-v0',
120 |     group='bandits',
121 |     experimental=True,
122 |     contributor='jkcooper2',
123 |     summary="10 armed bandit with random probabilities assigned to how often the action will provide a reward",
124 |     description="""
125 |         Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
126 |         and a reward distribution, which is the value or distribution of what the agent will be rewarded
127 |         the bandit does payout.
128 | 
129 |         p_dist = numpy.random.uniform(size=10)
130 |         r_dist = numpy.full(bandits, 1)
131 | 
132 |         Bandits have a uniform probability of rewarding and always reward 1
133 |         """,
134 |     background=""
135 | )
136 | 
137 | add_task(
138 |     id='BanditTenArmedUniformDistributedReward-v0',
139 |     group='bandits',
140 |     experimental=True,
141 |     contributor='jkcooper2',
142 |     summary="10 armed bandit with that always pays out with a reward selected from a uniform distribution",
143 |     description="""
144 |         Each bandit takes in a probability distribution, which is the likelihood of the action paying out,
145 |         and a reward distribution, which is the value or distribution of what the agent will be rewarded
146 |         the bandit does payout.
147 | 
148 |         p_dist = numpy.full(bandits, 1)
149 |         r_dist = numpy.random.uniform(size=10)
150 | 
151 |         Bandits always pay out. Reward is selected from uniform distribution
152 |         """,
153 |     background="Based on comparisons from http://sudeepraja.github.io/Bandits/"
154 | )
155 | 
156 | 
157 | 
158 | 
159 | 


--------------------------------------------------------------------------------