├── setup.py ├── LICENSE ├── gym_bandits ├── __init__.py ├── bandit.py └── scoreboard.py ├── .gitignore └── README.md /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='gym_bandits', 4 | version='0.01', 5 | description='Gym User Env - Various N-Armed Bandit Problems', 6 | url='https://github.com/jkcooper2/gym_bandits', 7 | author='Jesse Cooper', 8 | packages=['gym_bandits'], 9 | author_email='jesse_cooper@hotmail.com', 10 | license='MIT License', 11 | install_requires=['gym>=0.2.3'], 12 | ) 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Jesse Cooper 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /gym_bandits/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | from .bandit import BanditTenArmedRandomFixed 4 | from .bandit import BanditTenArmedRandomRandom 5 | from .bandit import BanditTenArmedGaussian 6 | from .bandit import BanditTenArmedUniformDistributedReward 7 | from .bandit import BanditTwoArmedDeterministicFixed 8 | from .bandit import BanditTwoArmedHighHighFixed 9 | from .bandit import BanditTwoArmedHighLowFixed 10 | from .bandit import BanditTwoArmedLowLowFixed 11 | from .bandit import BanditTwoArmedUniform 12 | 13 | environments = [ 14 | ['BanditTenArmedRandomFixed', 'v0'], 15 | ['BanditTenArmedRandomRandom', 'v0'], 16 | ['BanditTenArmedGaussian', 'v0'], 17 | ['BanditTenArmedUniformDistributedReward', 'v0'], 18 | ['BanditTwoArmedDeterministicFixed', 'v0'], 19 | ['BanditTwoArmedHighHighFixed', 'v0'], 20 | ['BanditTwoArmedHighLowFixed', 'v0'], 21 | ['BanditTwoArmedLowLowFixed', 'v0'], 22 | ['BanditTwoArmedUniform', 'v0'], 23 | ] 24 | 25 | for environment in environments: 26 | register( 27 | id='{}-{}'.format(environment[0], environment[1]), 28 | entry_point='gym_bandits:{}'.format(environment[0]), 29 | max_episode_steps=1, 30 | nondeterministic=False, 31 | ) 32 | 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # PyCharm Project Files 92 | .idea 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bandit Environments 2 | 3 | Series of n-armed bandit environments for the OpenAI Gym 4 | 5 | ## Environments 6 | * `BanditTwoArmedDeterministicFixed-v0`: Simplest case where one bandit always pays, and the other always doesn't 7 | * `BanditTwoArmedHighLowFixed-v0`: Stochastic version with a large difference between which bandit pays out of two choices 8 | * `BanditTwoArmedHighHighFixed-v0`: Stochastic version with a small difference between which bandit pays where both are good 9 | * `BanditTwoArmedLowLowFixed-v0`: Stochastic version with a small difference between which bandit pays where both are bad 10 | * `BanditTwoArmedUniform-v0`: Stochastic version both arms pay between 0 and 1 11 | * `BanditTenArmedRandomFixed-v0`: 10 armed bandit with random probabilities assigned to payouts 12 | * `BanditTenArmedRandomRandom-v0`: 10 armed bandit with random probabilities assigned to both payouts and rewards 13 | * `BanditTenArmedUniformDistributedReward-v0`: 10 armed bandit with that always pays out with a reward selected from a uniform distribution 14 | * `BanditTenArmedGaussian-v0`: 10 armed bandit mentioned on page 30 of [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/the-book-2nd.html) (Sutton and Barto) 15 | 16 | ## Installation 17 | ```bash 18 | git clone git@github.com:mimoralea/gym-bandits.git 19 | cd gym-bandits 20 | pip install . 21 | ``` 22 | 23 | or: 24 | 25 | ```bash 26 | pip install git+https://github.com/mimoralea/gym-bandits#egg=gym-bandits 27 | ``` 28 | 29 | 30 | In your gym environment 31 | ```python 32 | import gym, gym_bandits 33 | env = gym.make("BanditTenArmedGaussian-v0") 34 | ``` 35 | -------------------------------------------------------------------------------- /gym_bandits/bandit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | from gym.utils import seeding 5 | 6 | 7 | class BanditEnv(gym.Env): 8 | """ 9 | Bandit environment base to allow agents to interact with the class n-armed bandit 10 | in different variations 11 | 12 | p_dist: 13 | A list of probabilities of the likelihood that a particular bandit will pay out 14 | r_dist: 15 | A list of either rewards (if number) or means and standard deviations (if list) 16 | of the payout that bandit has 17 | """ 18 | def __init__(self, p_dist, r_dist): 19 | if len(p_dist) != len(r_dist): 20 | raise ValueError("Probability and Reward distribution must be the same length") 21 | 22 | if min(p_dist) < 0 or max(p_dist) > 1: 23 | raise ValueError("All probabilities must be between 0 and 1") 24 | 25 | for reward in r_dist: 26 | if isinstance(reward, list) and reward[1] <= 0: 27 | raise ValueError("Standard deviation in rewards must all be greater than 0") 28 | 29 | self.p_dist = p_dist 30 | self.r_dist = r_dist 31 | 32 | self.n_bandits = len(p_dist) 33 | self.action_space = spaces.Discrete(self.n_bandits) 34 | self.observation_space = spaces.Discrete(1) 35 | 36 | self.seed() 37 | 38 | def seed(self, seed=None): 39 | self.np_random, seed = seeding.np_random(seed) 40 | return [seed] 41 | 42 | def step(self, action): 43 | assert self.action_space.contains(action) 44 | 45 | reward = 0 46 | done = True 47 | 48 | if self.np_random.uniform() < self.p_dist[action]: 49 | if not isinstance(self.r_dist[action], list): 50 | reward = self.r_dist[action] 51 | else: 52 | reward = self.np_random.normal(self.r_dist[action][0], self.r_dist[action][1]) 53 | 54 | return 0, reward, done, {} 55 | 56 | def reset(self): 57 | return 0 58 | 59 | def render(self, mode='human', close=False): 60 | pass 61 | 62 | 63 | class BanditTwoArmedDeterministicFixed(BanditEnv): 64 | """Simplest case where one bandit always pays, and the other always doesn't""" 65 | def __init__(self): 66 | BanditEnv.__init__(self, p_dist=[1, 0], r_dist=[1, 1]) 67 | 68 | class BanditTwoArmedHighLowFixed(BanditEnv): 69 | """Stochastic version with a large difference between which bandit pays out of two choices""" 70 | def __init__(self): 71 | BanditEnv.__init__(self, p_dist=[0.8, 0.2], r_dist=[1, 1]) 72 | 73 | class BanditTwoArmedHighHighFixed(BanditEnv): 74 | """Stochastic version with a small difference between which bandit pays where both are good""" 75 | def __init__(self): 76 | BanditEnv.__init__(self, p_dist=[0.8, 0.9], r_dist=[1, 1]) 77 | 78 | class BanditTwoArmedLowLowFixed(BanditEnv): 79 | """Stochastic version with a small difference between which bandit pays where both are bad""" 80 | def __init__(self): 81 | BanditEnv.__init__(self, p_dist=[0.1, 0.2], r_dist=[1, 1]) 82 | 83 | class BanditTwoArmedUniform(BanditEnv): 84 | """Stochastic version with rewards of one and random probabilities assigned to both payouts""" 85 | def __init__(self, bandits=2, seed=1): 86 | self.seed(seed) 87 | p_dist = self.np_random.uniform(size=bandits) 88 | r_dist = np.full(bandits, 1) 89 | BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist) 90 | 91 | class BanditTenArmedRandomFixed(BanditEnv): 92 | """10 armed bandit with random probabilities assigned to payouts""" 93 | def __init__(self, bandits=10, seed=1): 94 | self.seed(seed) 95 | p_dist = self.np_random.uniform(size=bandits) 96 | r_dist = np.full(bandits, 1) 97 | BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist) 98 | 99 | class BanditTenArmedUniformDistributedReward(BanditEnv): 100 | """10 armed bandit that always pays out with a reward selected from a uniform distribution""" 101 | def __init__(self, bandits=10, seed=1): 102 | self.seed(seed) 103 | p_dist = np.full(bandits, 1) 104 | r_dist = self.np_random.uniform(size=bandits) 105 | BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist) 106 | 107 | class BanditTenArmedRandomRandom(BanditEnv): 108 | """10 armed bandit with random probabilities assigned to both payouts and rewards""" 109 | def __init__(self, bandits=10, seed=1): 110 | self.seed(seed) 111 | p_dist = self.np_random.uniform(size=bandits) 112 | r_dist = self.np_random.uniform(size=bandits) 113 | BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist) 114 | 115 | class BanditTenArmedGaussian(BanditEnv): 116 | """ 117 | 10 armed bandit mentioned on page 30 of Sutton and Barto's 118 | [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0) 119 | 120 | Actions always pay out 121 | Mean of payout is pulled from a normal distribution (0, 1) (called q*(a)) 122 | Actual reward is drawn from a normal distribution (q*(a), 1) 123 | """ 124 | def __init__(self, bandits=10, seed=1): 125 | self.seed(seed) 126 | p_dist = np.full(bandits, 1) 127 | r_dist = [] 128 | 129 | for i in range(bandits): 130 | r_dist.append([self.np_random.normal(0, 1), 1]) 131 | 132 | BanditEnv.__init__(self, p_dist=p_dist, r_dist=r_dist) 133 | -------------------------------------------------------------------------------- /gym_bandits/scoreboard.py: -------------------------------------------------------------------------------- 1 | from gym.scoreboard.registration import add_task, add_group 2 | 3 | 4 | add_group( 5 | id='bandits', 6 | name='Bandits', 7 | description='Various N-Armed Bandit environments' 8 | ) 9 | 10 | add_task( 11 | id='BanditTwoArmedDeterministicFixed-v0', 12 | group='bandits', 13 | experimental=True, 14 | contributor='jkcooper2', 15 | summary="Simplest bandit where one action always pays, and the other never does.", 16 | description=""" 17 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 18 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 19 | the bandit does payout. 20 | 21 | p_dist = [1, 0] 22 | r_dist = [1, 1] 23 | """, 24 | background="" 25 | ) 26 | 27 | add_task( 28 | id='BanditTwoArmedHighHighFixed-v0', 29 | group='bandits', 30 | experimental=True, 31 | contributor='jkcooper2', 32 | summary="Stochastic version with a small difference between which bandit pays where both are likely", 33 | description=""" 34 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 35 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 36 | the bandit does payout. 37 | 38 | p_dist = [0.8, 0.9] 39 | r_dist = [1, 1] 40 | """, 41 | background="Bandit B Figure 2.3 from Reinforcement Learning: An Introduction (Sutton & Barto) [link](https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html)" 42 | ) 43 | 44 | add_task( 45 | id='BanditTwoArmedLowLowFixed-v0', 46 | group='bandits', 47 | experimental=True, 48 | contributor='jkcooper2', 49 | summary="Stochastic version with a small difference between which bandit pays where both are unlikley", 50 | description=""" 51 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 52 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 53 | the bandit does payout. 54 | 55 | p_dist = [0.1, 0.2] 56 | r_dist = [1, 1] 57 | """, 58 | background="Bandit A Figure 2.3 from Reinforcement Learning: An Introduction (Sutton & Barto) [link](https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html)" 59 | ) 60 | 61 | add_task( 62 | id='BanditTwoArmedHighLowFixed-v0', 63 | group='bandits', 64 | experimental=True, 65 | contributor='jkcooper2', 66 | summary="Stochastic version with a large difference between which bandit pays out of two choices", 67 | description=""" 68 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 69 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 70 | the bandit does payout. 71 | 72 | p_dist = [0.8, 0.2] 73 | r_dist = [1, 1] 74 | """, 75 | background="" 76 | ) 77 | 78 | add_task( 79 | id='BanditTenArmedGaussian-v0', 80 | group='bandits', 81 | experimental=True, 82 | contributor='jkcooper2', 83 | summary="10 armed bandit mentioned with reward based on a Gaussian distribution", 84 | description=""" 85 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 86 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 87 | the bandit does payout. 88 | 89 | p_dist = [1] (* 10) 90 | r_dist = [numpy.random.normal(0, 1), 1] (* 10) 91 | 92 | Every bandit always pays out 93 | Each action has a reward mean (selected from a normal distribution with mean 0 and std 1), and the actual 94 | reward returns is selected with a std of 1 around the selected mean 95 | """, 96 | background="Described on page 30 of Sutton and Barto's [Reinforcement Learning: An Introduction](https://www.dropbox.com/s/b3psxv2r0ccmf80/book2015oct.pdf?dl=0)" 97 | ) 98 | 99 | add_task( 100 | id='BanditTenArmedRandomRandom-v0', 101 | group='bandits', 102 | experimental=True, 103 | contributor='jkcooper2', 104 | summary="10 armed bandit with random probabilities assigned to both payouts and rewards", 105 | description=""" 106 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 107 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 108 | the bandit does payout. 109 | 110 | p_dist = numpy.random.uniform(size=10) 111 | r_dist = numpy.random.uniform(size=10) 112 | 113 | Bandits have uniform probability of paying out and payout a reward of uniform probability 114 | """, 115 | background="" 116 | ) 117 | 118 | add_task( 119 | id='BanditTenArmedRandomFixed-v0', 120 | group='bandits', 121 | experimental=True, 122 | contributor='jkcooper2', 123 | summary="10 armed bandit with random probabilities assigned to how often the action will provide a reward", 124 | description=""" 125 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 126 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 127 | the bandit does payout. 128 | 129 | p_dist = numpy.random.uniform(size=10) 130 | r_dist = numpy.full(bandits, 1) 131 | 132 | Bandits have a uniform probability of rewarding and always reward 1 133 | """, 134 | background="" 135 | ) 136 | 137 | add_task( 138 | id='BanditTenArmedUniformDistributedReward-v0', 139 | group='bandits', 140 | experimental=True, 141 | contributor='jkcooper2', 142 | summary="10 armed bandit with that always pays out with a reward selected from a uniform distribution", 143 | description=""" 144 | Each bandit takes in a probability distribution, which is the likelihood of the action paying out, 145 | and a reward distribution, which is the value or distribution of what the agent will be rewarded 146 | the bandit does payout. 147 | 148 | p_dist = numpy.full(bandits, 1) 149 | r_dist = numpy.random.uniform(size=10) 150 | 151 | Bandits always pay out. Reward is selected from uniform distribution 152 | """, 153 | background="Based on comparisons from http://sudeepraja.github.io/Bandits/" 154 | ) 155 | 156 | 157 | 158 | 159 | --------------------------------------------------------------------------------