├── .github └── workflows │ ├── coverage.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── bayesian_testing ├── __init__.py ├── experiments │ ├── __init__.py │ ├── base.py │ ├── binary.py │ ├── delta_lognormal.py │ ├── delta_normal.py │ ├── discrete.py │ ├── exponential.py │ ├── normal.py │ └── poisson.py ├── metrics │ ├── __init__.py │ ├── evaluation.py │ └── posteriors.py └── utilities │ ├── __init__.py │ ├── common.py │ ├── logging.conf │ └── logging.py ├── codecov.yml ├── examples ├── README.md ├── data │ └── session_data.csv ├── dice_rolls_ab_testing.ipynb ├── goals_scored_ab_testing.ipynb ├── session_data_ab_testing.ipynb ├── session_data_manual_pbbs.ipynb └── waiting_time_ab_testing.ipynb ├── poetry.lock ├── pyproject.toml └── tests ├── README.md ├── test_binary.py ├── test_delta_lognormal.py ├── test_delta_normal.py ├── test_discrete.py ├── test_evaluation.py ├── test_exponential.py ├── test_normal.py ├── test_poisson.py ├── test_posteriors.py └── test_validators.py /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | on: push 3 | 4 | jobs: 5 | coverage: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Check out repository code 9 | uses: actions/checkout@v2 10 | - name: Set up Python 11 | uses: actions/setup-python@v1 12 | with: 13 | python-version: '3.10' 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install poetry==2.* 18 | poetry install 19 | - name: Test with pytest 20 | run: | 21 | poetry run coverage run -m pytest 22 | poetry run coverage report -i 23 | poetry run coverage xml -i 24 | - name: Upload coverage to Codecov 25 | uses: codecov/codecov-action@v2 26 | with: 27 | fail_ci_if_error: true 28 | token: ${{ secrets.CODECOV_TOKEN }} 29 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | release: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v1 11 | with: 12 | python-version: '3.10' 13 | architecture: x64 14 | - run: python -m pip install --upgrade pip 15 | - run: pip install poetry==2.* 16 | - run: poetry install 17 | - run: poetry run coverage run -m pytest 18 | - run: poetry build 19 | - run: poetry publish --username=__token__ --password=${{ secrets.PYPI_TOKEN }} 20 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | tests: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out repository code 12 | uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: '3.10' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install poetry==2.* 21 | poetry install 22 | - name: Test with pytest 23 | run: | 24 | poetry run coverage run -m pytest 25 | poetry run coverage report -i 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized 
/ DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # other 132 | .DS_Store 133 | .idea 134 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.3.0 10 | hooks: 11 | - id: black 12 | args: [--line-length=100] 13 | - repo: https://github.com/pycqa/flake8 14 | rev: 6.1.0 15 | hooks: 16 | - id: flake8 17 | args: [--max-line-length=100] 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Matus Baniar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/Matt52/bayesian-testing/workflows/Tests/badge.svg)](https://github.com/Matt52/bayesian-testing/actions?workflow=Tests) 2 | [![Codecov](https://codecov.io/gh/Matt52/bayesian-testing/branch/main/graph/badge.svg)](https://codecov.io/gh/Matt52/bayesian-testing) 3 | [![PyPI](https://img.shields.io/pypi/v/bayesian-testing.svg)](https://pypi.org/project/bayesian-testing/) 4 | # Bayesian A/B testing 5 | `bayesian_testing` is a small package for a quick evaluation of A/B (or A/B/C/...) tests using 6 | Bayesian approach. 7 | 8 | **Implemented tests:** 9 | - [BinaryDataTest](bayesian_testing/experiments/binary.py) 10 | - **_Input data_** - binary data (`[0, 1, 0, ...]`) 11 | - Designed for conversion-like data A/B testing. 
12 | - [NormalDataTest](bayesian_testing/experiments/normal.py)
13 |   - **_Input data_** - normal data with unknown variance
14 |   - Designed for normal data A/B testing.
15 | - [DeltaLognormalDataTest](bayesian_testing/experiments/delta_lognormal.py)
16 |   - **_Input data_** - lognormal data with zeros
17 |   - Designed for revenue-like data A/B testing.
18 | - [DeltaNormalDataTest](bayesian_testing/experiments/delta_normal.py)
19 |   - **_Input data_** - normal data with zeros
20 |   - Designed for profit-like data A/B testing.
21 | - [DiscreteDataTest](bayesian_testing/experiments/discrete.py)
22 |   - **_Input data_** - categorical data with numerical categories
23 |   - Designed for discrete data A/B testing (e.g. dice rolls, star ratings, 1-10 ratings, etc.).
24 | - [PoissonDataTest](bayesian_testing/experiments/poisson.py)
25 |   - **_Input data_** - non-negative integers (`[1, 0, 3, ...]`)
26 |   - Designed for Poisson data A/B testing.
27 | - [ExponentialDataTest](bayesian_testing/experiments/exponential.py)
28 |   - **_Input data_** - exponential data (non-negative real numbers)
29 |   - Designed for exponential data A/B testing (e.g. session/waiting time, time between events,
30 | etc.).
31 | 
32 | **Implemented evaluation metrics:**
33 | - `Posterior Mean`
34 |   - Expected value from the posterior distribution for a given variant.
35 | - `Credible Interval`
36 |   - Quantile-based credible intervals based on simulations from posterior distributions (i.e.
37 | empirical).
38 |   - Interval probability (`interval_alpha`) can be set during the evaluation (default value is 95%).
39 | - `Probability of Being Best`
40 |   - Probability that a given variant is best among all variants.
41 |   - By default, `the best` is equivalent to `the greatest` (from a data/metric point of view);
42 | however, it is possible to change this by using `min_is_best=True` in the evaluation method
43 | (this can be useful when looking for the variant with the smallest tested measure).
44 | - `Expected Loss`
45 |   - "Risk" of choosing a particular variant over the other variants in the test.
46 |   - Measured in the same units as the tested measure (e.g. positive rate or average value).
47 | 
48 | `Credible Interval`, `Probability of Being Best` and `Expected Loss` are calculated using
49 | simulations from posterior distributions (considering the given data).
50 | 
51 | 
52 | ## Installation
53 | `bayesian_testing` can be installed using pip:
54 | ```console
55 | pip install bayesian_testing
56 | ```
57 | Alternatively, you can clone the repository and use `poetry` manually:
58 | ```console
59 | cd bayesian_testing
60 | pip install poetry
61 | poetry install
62 | poetry shell
63 | ```
64 | 
65 | ## Basic Usage
66 | The primary features are the classes:
67 | - `BinaryDataTest`
68 | - `NormalDataTest`
69 | - `DeltaLognormalDataTest`
70 | - `DeltaNormalDataTest`
71 | - `DiscreteDataTest`
72 | - `PoissonDataTest`
73 | - `ExponentialDataTest`
74 | 
75 | All test classes support two methods to insert the data:
76 | - `add_variant_data` - Adding raw data for a variant as a list of observations (or a numpy 1-D array).
77 | - `add_variant_data_agg` - Adding aggregated variant data (this can be practical for large data,
78 | as the aggregation can already be done at the database level).
79 | 
80 | Both methods for adding data allow specification of prior distributions
81 | (see details in the respective docstrings). The default prior setup should be sufficient for most
82 | cases (e.g. cases with unknown priors or large amounts of data).
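For illustration, a minimal sketch of overriding the default prior for one variant of a `BinaryDataTest` (the parameter names match the docstrings; the chosen numbers are purely illustrative):

```python
from bayesian_testing.experiments import BinaryDataTest

test = BinaryDataTest()
# a Beta(10, 190) prior encodes a prior belief of a ~5% positive rate
# with an effective prior sample size of 200 observations:
test.add_variant_data_agg("A", totals=1000, positives=50, a_prior=10, b_prior=190)
```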
83 | 
84 | To get the results of the test, simply call the `evaluate` method.
85 | 
86 | Probability of being best, expected loss and credible intervals are approximated using simulations,
87 | hence the `evaluate` method can return slightly different values for different runs. To stabilize
88 | the results, you can set the `sim_count` parameter of `evaluate` to a higher value (the default is
89 | 20K), or even use the `seed` parameter to fix them completely.
90 | 
91 | ### BinaryDataTest
92 | Class for a Bayesian A/B test for binary-like data (e.g. conversions, successes, etc.).
93 | 
94 | **Example:**
95 | ```python
96 | import numpy as np
97 | from bayesian_testing.experiments import BinaryDataTest
98 | 
99 | # generating some random data
100 | rng = np.random.default_rng(52)
101 | # random 1x1500 array of 0/1 data with 5.2% probability for 1:
102 | data_a = rng.binomial(n=1, p=0.052, size=1500)
103 | # random 1x1200 array of 0/1 data with 6.7% probability for 1:
104 | data_b = rng.binomial(n=1, p=0.067, size=1200)
105 | 
106 | # initialize a test:
107 | test = BinaryDataTest()
108 | 
109 | # add variants using raw data (arrays of zeros and ones):
110 | test.add_variant_data("A", data_a)
111 | test.add_variant_data("B", data_b)
112 | # priors can be specified like this (the default for this test is a=b=1/2):
113 | # test.add_variant_data("B", data_b, a_prior=1, b_prior=20)
114 | 
115 | # add a variant using aggregated data (same as raw data with 950 zeros and 50 ones):
116 | test.add_variant_data_agg("C", totals=1000, positives=50)
117 | 
118 | # evaluate test:
119 | results = test.evaluate()
120 | results
121 | # print(pd.DataFrame(results).set_index('variant').T.to_markdown(tablefmt="grid"))
122 | ```
123 | 
124 | +-------------------+-----------+-------------+-------------+
125 | |                   | A         | B           | C           |
126 | +===================+===========+=============+=============+
127 | | totals            | 1500      | 1200        | 1000        |
128 | +-------------------+-----------+-------------+-------------+
129 | | positives         | 80        | 80          | 50          |
130 | +-------------------+-----------+-------------+-------------+
131 | | positive_rate     | 0.05333   | 0.06667     | 0.05        |
132 | +-------------------+-----------+-------------+-------------+
133 | | posterior_mean    | 0.05363   | 0.06703     | 0.05045     |
134 | +-------------------+-----------+-------------+-------------+
135 | | credible_interval | [0.04284, | [0.0535309, | [0.0379814, |
136 | |                   | 0.065501] | 0.0816476]  | 0.0648625]  |
137 | +-------------------+-----------+-------------+-------------+
138 | | prob_being_best   | 0.06485   | 0.89295     | 0.0422      |
139 | +-------------------+-----------+-------------+-------------+
140 | | expected_loss     | 0.0139248 | 0.0004693   | 0.0170767   |
141 | +-------------------+-----------+-------------+-------------+
142 | 
143 | ### NormalDataTest
144 | Class for a Bayesian A/B test for normal data.
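The aggregated input for this test consists of simple sufficient statistics - the observation count, the sum of values, and the sum of squared values (see `add_variant_data_agg` in the example below). A small sketch of precomputing them, e.g. before shipping data out of a database or worker (the helper name is ours, not part of the package):

```python
import numpy as np

def normal_sufficient_stats(values):
    # count, sum and sum of squares - the aggregated inputs
    # expected by NormalDataTest.add_variant_data_agg
    arr = np.asarray(values, dtype=float)
    return len(arr), float(arr.sum()), float(np.square(arr).sum())
```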
145 | 
146 | **Example:**
147 | ```python
148 | import numpy as np
149 | from bayesian_testing.experiments import NormalDataTest
150 | 
151 | # generating some random data
152 | rng = np.random.default_rng(21)
153 | data_a = rng.normal(7.2, 2, 1000)
154 | data_b = rng.normal(7.1, 2, 800)
155 | data_c = rng.normal(7.0, 4, 500)
156 | 
157 | # initialize a test:
158 | test = NormalDataTest()
159 | 
160 | # add variants using raw data:
161 | test.add_variant_data("A", data_a)
162 | test.add_variant_data("B", data_b)
163 | # test.add_variant_data("C", data_c)
164 | 
165 | # add a variant using aggregated data:
166 | test.add_variant_data_agg("C", len(data_c), sum(data_c), sum(np.square(data_c)))
167 | 
168 | # evaluate test:
169 | results = test.evaluate(sim_count=20000, seed=52, min_is_best=False, interval_alpha=0.99)
170 | results
171 | # print(pd.DataFrame(results).set_index('variant').T.to_markdown(tablefmt="grid"))
172 | ```
173 | 
174 | +-------------------+-------------+-------------+-------------+
175 | |                   | A           | B           | C           |
176 | +===================+=============+=============+=============+
177 | | totals            | 1000        | 800         | 500         |
178 | +-------------------+-------------+-------------+-------------+
179 | | sum_values        | 7294.67901  | 5685.86168  | 3736.91581  |
180 | +-------------------+-------------+-------------+-------------+
181 | | avg_values        | 7.29468     | 7.10733     | 7.47383     |
182 | +-------------------+-------------+-------------+-------------+
183 | | posterior_mean    | 7.29462     | 7.10725     | 7.4737      |
184 | +-------------------+-------------+-------------+-------------+
185 | | credible_interval | [7.1359436, | [6.9324733, | [7.0240102, |
186 | |                   | 7.4528369]  | 7.2779293]  | 7.9379341]  |
187 | +-------------------+-------------+-------------+-------------+
188 | | prob_being_best   | 0.1707      | 0.00125     | 0.82805     |
189 | +-------------------+-------------+-------------+-------------+
190 | | expected_loss     | 0.1968735   | 0.385112    | 0.0169998   |
191 | +-------------------+-------------+-------------+-------------+
192 | 
193 | ### DeltaLognormalDataTest
194 | Class for a Bayesian A/B test for delta-lognormal data (log-normal with zeros).
195 | Delta-lognormal data is a typical case of revenue-per-session data, where many sessions have 0
196 | revenue and the non-zero values are positive, following an approximately log-normal distribution.
197 | To handle such data, the calculation combines a binary Bayes model for zero vs. non-zero
198 | "conversions" with a log-normal model for the non-zero values.
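Conceptually, each posterior draw of the average value multiplies a conversion-probability draw by a draw of the log-normal mean. Below is a deliberately simplified sketch of a single such draw - it plugs point estimates in for the log-normal parameters, whereas the package also samples those from their (Normal-inverse-gamma) posterior:

```python
import numpy as np

rng = np.random.default_rng(52)

def simplified_posterior_draw(totals, positives, sum_logs, sum_logs_2, a=0.5, b=0.5):
    # binary part: Beta posterior draw for P(non-zero value)
    conv = rng.beta(a + positives, b + totals - positives)
    # log-normal part: point estimates of the log-mean and log-variance
    mu = sum_logs / positives
    var = sum_logs_2 / positives - mu ** 2
    # mean of a lognormal(mu, var), weighted by the conversion probability
    return conv * np.exp(mu + var / 2)
```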
199 | 
200 | **Example:**
201 | ```python
202 | import numpy as np
203 | from bayesian_testing.experiments import DeltaLognormalDataTest
204 | 
205 | test = DeltaLognormalDataTest()
206 | 
207 | data_a = [7.1, 0.3, 5.9, 0, 1.3, 0.3, 0, 1.2, 0, 3.6, 0, 1.5,
208 |           2.2, 0, 4.9, 0, 0, 1.1, 0, 0, 7.1, 0, 6.9, 0]
209 | data_b = [4.0, 0, 3.3, 19.3, 18.5, 0, 0, 0, 12.9, 0, 0, 0, 10.2,
210 |           0, 0, 23.1, 0, 3.7, 0, 0, 11.3, 10.0, 0, 18.3, 12.1]
211 | 
212 | # adding a variant using raw data:
213 | test.add_variant_data("A", data_a)
214 | # test.add_variant_data("B", data_b)
215 | 
216 | # alternatively, a variant can also be added using aggregated data
217 | # (it looks more complicated, but it can be quite handy for large data):
218 | test.add_variant_data_agg(
219 |     name="B",
220 |     totals=len(data_b),
221 |     positives=sum(x > 0 for x in data_b),
222 |     sum_values=sum(data_b),
223 |     sum_logs=sum([np.log(x) for x in data_b if x > 0]),
224 |     sum_logs_2=sum([np.square(np.log(x)) for x in data_b if x > 0])
225 | )
226 | 
227 | # evaluate test:
228 | results = test.evaluate(seed=21)
229 | results
230 | # print(pd.DataFrame(results).set_index('variant').T.to_markdown(tablefmt="grid"))
231 | ```
232 | 
233 | +---------------------+-------------+-------------+
234 | |                     | A           | B           |
235 | +=====================+=============+=============+
236 | | totals              | 24          | 25          |
237 | +---------------------+-------------+-------------+
238 | | positives           | 13          | 12          |
239 | +---------------------+-------------+-------------+
240 | | sum_values          | 43.4        | 146.7       |
241 | +---------------------+-------------+-------------+
242 | | avg_values          | 1.80833     | 5.868       |
243 | +---------------------+-------------+-------------+
244 | | avg_positive_values | 3.33846     | 12.225      |
245 | +---------------------+-------------+-------------+
246 | | posterior_mean      | 2.09766     | 6.19017     |
247 | +---------------------+-------------+-------------+
248 | | credible_interval   | [0.9884509, | [3.3746212, |
249 | |                     | 6.9054963]  | 11.7349253] |
250 | +---------------------+-------------+-------------+
251 | | prob_being_best     | 0.04815     | 0.95185     |
252 | +---------------------+-------------+-------------+
253 | | expected_loss       | 4.0941101   | 0.1588627   |
254 | +---------------------+-------------+-------------+
255 | 
256 | ***Note**: Alternatively, `DeltaNormalDataTest` can be used for cases where conversions are not
257 | necessarily positive values.*
258 | 
259 | ### DiscreteDataTest
260 | Class for a Bayesian A/B test for discrete data with a finite number of numerical categories
261 | (states), each representing some value.
262 | This test can be used, for instance, for dice roll data (when looking for the "best" of multiple
263 | dice) or rating data (e.g. 1-5 stars or a 1-10 scale).
264 | 
265 | **Example:**
266 | ```python
267 | from bayesian_testing.experiments import DiscreteDataTest
268 | 
269 | # dice rolls data for 3 dice - A, B, C
270 | data_a = [2, 5, 1, 4, 6, 2, 2, 6, 3, 2, 6, 3, 4, 6, 3, 1, 6, 3, 5, 6]
271 | data_b = [1, 2, 2, 2, 2, 3, 2, 3, 4, 2]
272 | data_c = [1, 3, 6, 5, 4]
273 | 
274 | # initialize a test with all possible states (i.e. numerical categories):
275 | test = DiscreteDataTest(states=[1, 2, 3, 4, 5, 6])
276 | 
277 | # add variants using raw data:
278 | test.add_variant_data("A", data_a)
279 | test.add_variant_data("B", data_b)
280 | test.add_variant_data("C", data_c)
281 | 
282 | # add a variant using aggregated data:
283 | # test.add_variant_data_agg("C", [1, 0, 1, 1, 1, 1])  # equivalent to rolls in data_c
284 | 
285 | # evaluate test:
286 | results = test.evaluate(sim_count=20000, seed=52, min_is_best=False, interval_alpha=0.95)
287 | results
288 | # print(pd.DataFrame(results).set_index('variant').T.to_markdown(tablefmt="grid"))
289 | ```
290 | 
291 | +-------------------+------------------+------------------+------------------+
292 | |                   | A                | B                | C                |
293 | +===================+==================+==================+==================+
294 | | concentration     | {1: 2.0, 2: 4.0, | {1: 1.0, 2: 6.0, | {1: 1.0, 2: 0.0, |
295 | |                   | 3: 4.0, 4: 2.0,  | 3: 2.0, 4: 1.0,  | 3: 1.0, 4: 1.0,  |
296 | |                   | 5: 2.0, 6: 6.0}  | 5: 0.0, 6: 0.0}  | 5: 1.0, 6: 1.0}  |
297 | +-------------------+------------------+------------------+------------------+
298 | | average_value     | 3.8              | 2.3              | 3.8              |
299 | +-------------------+------------------+------------------+------------------+
300 | | posterior_mean    | 3.73077          | 2.75             | 3.63636          |
301 | +-------------------+------------------+------------------+------------------+
302 | | credible_interval | [3.0710797,      | [2.1791584,      | [2.6556465,      |
303 | |                   | 4.3888021]       | 3.4589178]       | 4.5784839]       |
304 | +-------------------+------------------+------------------+------------------+
305 | | prob_being_best   | 0.54685          | 0.008            | 0.44515          |
306 | +-------------------+------------------+------------------+------------------+
307 | | expected_loss     | 0.199953         | 1.1826766        | 0.2870247        |
308 | +-------------------+------------------+------------------+------------------+
309 | 
310 | ### PoissonDataTest
311 | Class for a Bayesian A/B test for Poisson data.
312 | 
313 | **Example:**
314 | ```python
315 | from bayesian_testing.experiments import PoissonDataTest
316 | 
317 | # goals received - so less is better (duh...)
318 | psg_goals_against = [0, 2, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 3, 1, 0]
319 | city_goals_against = [0, 0, 3, 2, 0, 1, 0, 3, 0, 1, 1, 0, 1, 2]
320 | bayern_goals_against = [1, 0, 0, 1, 1, 2, 1, 0, 2, 0, 0, 2, 2, 1, 0]
321 | 
322 | # initialize a test:
323 | test = PoissonDataTest()
324 | 
325 | # add a variant using raw data:
326 | test.add_variant_data('psg', psg_goals_against)
327 | 
328 | # example with specific priors
329 | # ("b_prior" as an effective sample size, and "a_prior/b_prior" as a prior mean):
330 | test.add_variant_data('city', city_goals_against, a_prior=3, b_prior=1)
331 | # test.add_variant_data('bayern', bayern_goals_against)
332 | 
333 | # add a variant using aggregated data:
334 | test.add_variant_data_agg("bayern", len(bayern_goals_against), sum(bayern_goals_against))
335 | 
336 | # evaluate test (since fewer goals is better, we explicitly set min_is_best to True)
337 | results = test.evaluate(sim_count=20000, seed=52, min_is_best=True)
338 | results
339 | # print(pd.DataFrame(results).set_index('variant').T.to_markdown(tablefmt="grid"))
340 | ```
341 | 
342 | +-------------------+-------------+-------------+------------+
343 | |                   | psg         | city        | bayern     |
344 | +===================+=============+=============+============+
345 | | totals            | 15          | 14          | 15         |
346 | +-------------------+-------------+-------------+------------+
347 | | sum_values        | 9           | 14          | 13         |
348 | +-------------------+-------------+-------------+------------+
349 | | observed_average  | 0.6         | 1.0         | 0.86667    |
350 | +-------------------+-------------+-------------+------------+
351 | | posterior_mean    | 0.60265     | 1.13333     | 0.86755    |
352 | +-------------------+-------------+-------------+------------+
353 | | credible_interval | [0.2800848, | [0.6562029, | [0.465913, |
354 | |                   | 1.0570327]  | 1.7265045]  | 1.3964389] |
355 | +-------------------+-------------+-------------+------------+
356 | | prob_being_best   | 0.78175     | 0.0344      | 0.18385    |
357 | +-------------------+-------------+-------------+------------+
358 | | expected_loss     | 0.0369998   | 0.5620553   | 0.3003345  |
359 | +-------------------+-------------+-------------+------------+
360 | 
361 | _Note: Since we set `min_is_best=True` (because received goals are "bad"), the probability and
362 | loss are in favor of variants with lower posterior means._
363 | 
364 | ### ExponentialDataTest
365 | Class for a Bayesian A/B test for exponential data.
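For reference, the exponential likelihood has a conjugate Gamma prior on its rate, so the posterior update reduces to simple counting. A sketch of that update (the prior values here are illustrative and not necessarily the package defaults):

```python
import numpy as np

def exponential_rate_posterior(data, a_prior=0.5, b_prior=0.5):
    # rate ~ Gamma(a, b) prior; after n observations with sum s,
    # the posterior is Gamma(a + n, b + s) (shape/rate parametrization)
    n, s = len(data), float(np.sum(data))
    return a_prior + n, b_prior + s

a_post, b_post = exponential_rate_posterior([9.1, 12.3, 7.8])
mean_rate = a_post / b_post          # posterior mean of the rate
mean_scale = b_post / (a_post - 1)   # posterior mean of the scale (needs a_post > 1)
```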
366 | 367 | **Example:** 368 | ```python 369 | import numpy as np 370 | from bayesian_testing.experiments import ExponentialDataTest 371 | 372 | # waiting times for 3 different variants, each with many observations, 373 | # generated using exponential distributions with defined scales (expected values) 374 | waiting_times_a = np.random.exponential(scale=10, size=200) 375 | waiting_times_b = np.random.exponential(scale=11, size=210) 376 | waiting_times_c = np.random.exponential(scale=11, size=220) 377 | 378 | # initialize a test: 379 | test = ExponentialDataTest() 380 | # adding variants using the observation data: 381 | test.add_variant_data('A', waiting_times_a) 382 | test.add_variant_data('B', waiting_times_b) 383 | test.add_variant_data('C', waiting_times_c) 384 | 385 | # alternatively, add variants using aggregated data: 386 | # test.add_variant_data_agg('A', len(waiting_times_a), sum(waiting_times_a)) 387 | 388 | # evaluate test (since a lower waiting time is better, we set the min_is_best to True) 389 | results = test.evaluate(sim_count=20000, min_is_best=True) 390 | results 391 | # print(pd.DataFrame(results).set_index('variant').T.to_markdown(tablefmt="grid")) 392 | ``` 393 | 394 | +-------------------+-------------+-------------+-------------+ 395 | | | A | B | C | 396 | +===================+=============+=============+=============+ 397 | | totals | 200 | 210 | 220 | 398 | +-------------------+-------------+-------------+-------------+ 399 | | sum_values | 1827.81709 | 2217.46016 | 2160.73134 | 400 | +-------------------+-------------+-------------+-------------+ 401 | | observed_average | 9.13909 | 10.55933 | 9.82151 | 402 | +-------------------+-------------+-------------+-------------+ 403 | | posterior_mean | 9.13502 | 10.55478 | 9.8175 | 404 | +-------------------+-------------+-------------+-------------+ 405 | | credible_interval | [7.994178, | [9.2543372, | [8.6184821, | 406 | | | 10.5410967] | 12.1527256] | 11.2566538] | 407 | +-------------------+-------------+-------------+-------------+ 408 | | prob_being_best | 0.7456 | 0.0405 | 0.2139 | 409 | +-------------------+-------------+-------------+-------------+ 410 | | expected_loss | 0.1428729 | 1.5674747 | 0.8230728 | 411 | +-------------------+-------------+-------------+-------------+ 412 | 413 | ## Development 414 | To set up a development environment, use [Poetry](https://python-poetry.org/) 415 | and [pre-commit](https://pre-commit.com): 416 | ```console 417 | pip install poetry 418 | poetry install 419 | poetry run pre-commit install 420 | ``` 421 | 422 | ## To be implemented 423 | 424 | Additional metrics: 425 | - `Potential Value Remaining` 426 | 427 | ## References 428 | - `bayesian_testing` package itself depends only on [numpy](https://numpy.org) package. 429 | - Work on this package (including default priors selection) was inspired mainly by a Coursera 430 | course [Bayesian Statistics: From Concept to Data Analysis](https://www.coursera.org/learn/bayesian-statistics). 
431 | -------------------------------------------------------------------------------- /bayesian_testing/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.metadata import version, PackageNotFoundError # type: ignore 3 | except ImportError: # pragma: no cover 4 | from importlib_metadata import version, PackageNotFoundError # type: ignore 5 | 6 | try: 7 | __version__ = version(__name__) 8 | except PackageNotFoundError: # pragma: no cover 9 | __version__ = "unknown" 10 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .binary import BinaryDataTest 2 | from .normal import NormalDataTest 3 | from .delta_lognormal import DeltaLognormalDataTest 4 | from .discrete import DiscreteDataTest 5 | from .poisson import PoissonDataTest 6 | from .delta_normal import DeltaNormalDataTest 7 | from .exponential import ExponentialDataTest 8 | 9 | __all__ = [ 10 | "BinaryDataTest", 11 | "NormalDataTest", 12 | "DeltaLognormalDataTest", 13 | "DeltaNormalDataTest", 14 | "DiscreteDataTest", 15 | "PoissonDataTest", 16 | "ExponentialDataTest", 17 | ] 18 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/base.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import warnings 3 | 4 | 5 | class BaseDataTest: 6 | """ 7 | Base class for Bayesian A/B test. 8 | """ 9 | 10 | def __init__(self) -> None: 11 | """ 12 | Initialize BaseDataTest class. 13 | """ 14 | self.data = {} 15 | 16 | @property 17 | def variant_names(self): 18 | return [k for k in self.data] 19 | 20 | def eval_simulation( 21 | self, 22 | sim_count: int = 20000, 23 | seed: int = None, 24 | min_is_best: bool = False, 25 | interval_alpha: float = 0.95, 26 | ) -> Tuple[dict, dict, dict]: 27 | """ 28 | Should be implemented in each individual experiment. 29 | """ 30 | raise NotImplementedError 31 | 32 | def probabs_of_being_best( 33 | self, 34 | sim_count: int = 20000, 35 | seed: int = None, 36 | min_is_best: bool = False, 37 | interval_alpha: float = 0.95, 38 | ) -> dict: 39 | """ 40 | Calculate probabilities of being best for a current class state. 41 | 42 | Parameters 43 | ---------- 44 | sim_count : Number of simulations to be used for probability estimation. 45 | seed : Random seed. 46 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 47 | interval_alpha : Credible interval probability (value between 0 and 1). 48 | 49 | Returns 50 | ------- 51 | pbbs : Dictionary with probabilities of being best for all variants in experiment. 52 | """ 53 | pbbs, loss, intervals = self.eval_simulation(sim_count, seed, min_is_best, interval_alpha) 54 | 55 | return pbbs 56 | 57 | def expected_loss( 58 | self, 59 | sim_count: int = 20000, 60 | seed: int = None, 61 | min_is_best: bool = False, 62 | interval_alpha: float = 0.95, 63 | ) -> dict: 64 | """ 65 | Calculate expected loss for a current class state. 66 | 67 | Parameters 68 | ---------- 69 | sim_count : Number of simulations to be used for probability estimation. 70 | seed : Random seed. 71 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 72 | interval_alpha : Credible interval probability (value between 0 and 1). 
73 | 74 | Returns 75 | ------- 76 | loss : Dictionary with expected loss for all variants in experiment. 77 | """ 78 | pbbs, loss, intervals = self.eval_simulation(sim_count, seed, min_is_best, interval_alpha) 79 | 80 | return loss 81 | 82 | def credible_intervals( 83 | self, 84 | sim_count: int = 20000, 85 | seed: int = None, 86 | min_is_best: bool = False, 87 | interval_alpha: float = 0.95, 88 | ) -> dict: 89 | """ 90 | Calculate quantile-based credible intervals for a current class state. 91 | 92 | Parameters 93 | ---------- 94 | sim_count : Number of simulations to be used for probability estimation. 95 | seed : Random seed. 96 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 97 | interval_alpha : Credible interval probability (value between 0 and 1). 98 | 99 | Returns 100 | ------- 101 | intervals : Dictionary with quantile-based credible intervals for all variants. 102 | """ 103 | pbbs, loss, intervals = self.eval_simulation(sim_count, seed, min_is_best, interval_alpha) 104 | 105 | return intervals 106 | 107 | def delete_variant(self, name: str) -> None: 108 | """ 109 | Delete variant and all its data from experiment. 110 | 111 | Parameters 112 | ---------- 113 | name : Variant name. 114 | """ 115 | if not isinstance(name, str): 116 | raise ValueError("Variant name has to be a string.") 117 | if name not in self.variant_names: 118 | warnings.warn(f"Nothing to be deleted. Variant {name} is not in experiment.") 119 | else: 120 | del self.data[name] 121 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/binary.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import List, Tuple 3 | 4 | from bayesian_testing.experiments.base import BaseDataTest 5 | from bayesian_testing.metrics import eval_bernoulli_agg 6 | from bayesian_testing.utilities import get_logger 7 | 8 | logger = get_logger("bayesian_testing") 9 | 10 | 11 | class BinaryDataTest(BaseDataTest): 12 | """ 13 | Class for Bayesian A/B test for binary-like data (conversions, successes, etc.). 14 | 15 | After class initialization, use add_variant methods to insert variant data. 16 | Then to get results of the test, use for instance `evaluate` method. 17 | """ 18 | 19 | def __init__(self) -> None: 20 | """ 21 | Initialize BinaryDataTest class. 22 | """ 23 | super().__init__() 24 | 25 | @property 26 | def totals(self): 27 | return [self.data[k]["totals"] for k in self.data] 28 | 29 | @property 30 | def positives(self): 31 | return [self.data[k]["positives"] for k in self.data] 32 | 33 | @property 34 | def a_priors(self): 35 | return [self.data[k]["a_prior"] for k in self.data] 36 | 37 | @property 38 | def b_priors(self): 39 | return [self.data[k]["b_prior"] for k in self.data] 40 | 41 | def eval_simulation( 42 | self, 43 | sim_count: int = 20000, 44 | seed: int = None, 45 | min_is_best: bool = False, 46 | interval_alpha: float = 0.95, 47 | ) -> Tuple[dict, dict, dict]: 48 | """ 49 | Calculate probabilities of being best, expected loss and credible intervals for a current 50 | class state. 51 | 52 | Parameters 53 | ---------- 54 | sim_count : Number of simulations to be used for probability estimation. 55 | seed : Random seed. 56 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 57 | interval_alpha : Credible interval probability (value between 0 and 1). 
58 | 59 | Returns 60 | ------- 61 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 62 | res_loss : Dictionary with expected loss for all variants in experiment. 63 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 64 | """ 65 | pbbs, loss, intervals = eval_bernoulli_agg( 66 | self.totals, 67 | self.positives, 68 | self.a_priors, 69 | self.b_priors, 70 | sim_count, 71 | seed, 72 | min_is_best, 73 | interval_alpha, 74 | ) 75 | res_pbbs = dict(zip(self.variant_names, pbbs)) 76 | res_loss = dict(zip(self.variant_names, loss)) 77 | res_intervals = dict(zip(self.variant_names, intervals)) 78 | 79 | return res_pbbs, res_loss, res_intervals 80 | 81 | def evaluate( 82 | self, 83 | sim_count: int = 20000, 84 | seed: int = None, 85 | min_is_best: bool = False, 86 | interval_alpha: float = 0.95, 87 | ) -> List[dict]: 88 | """ 89 | Evaluation of experiment. 90 | 91 | Parameters 92 | ---------- 93 | sim_count : Number of simulations to be used for probability estimation. 94 | seed : Random seed. 95 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 96 | interval_alpha : Credible interval probability (value between 0 and 1). 97 | 98 | Returns 99 | ------- 100 | res : List of dictionaries with results per variant. 101 | """ 102 | keys = [ 103 | "variant", 104 | "totals", 105 | "positives", 106 | "positive_rate", 107 | "posterior_mean", 108 | "credible_interval", 109 | "prob_being_best", 110 | "expected_loss", 111 | ] 112 | positive_rate = [round(i[0] / i[1], 5) for i in zip(self.positives, self.totals)] 113 | posterior_mean = [ 114 | round((i[2] + i[0]) / (i[2] + i[3] + i[1]), 5) 115 | for i in zip(self.positives, self.totals, self.a_priors, self.b_priors) 116 | ] 117 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 118 | sim_count, seed, min_is_best, interval_alpha 119 | ) 120 | pbbs = list(eval_pbbs.values()) 121 | loss = list(eval_loss.values()) 122 | intervals = list(eval_intervals.values()) 123 | data = [ 124 | self.variant_names, 125 | self.totals, 126 | self.positives, 127 | positive_rate, 128 | posterior_mean, 129 | intervals, 130 | pbbs, 131 | loss, 132 | ] 133 | res = [dict(zip(keys, item)) for item in zip(*data)] 134 | 135 | return res 136 | 137 | def add_variant_data_agg( 138 | self, 139 | name: str, 140 | totals: int, 141 | positives: int, 142 | a_prior: Number = 0.5, 143 | b_prior: Number = 0.5, 144 | replace: bool = True, 145 | ) -> None: 146 | """ 147 | Add variant data to test class using aggregated binary data. 148 | This can be convenient as aggregation can be done on database level. 149 | 150 | Default prior setup is set for Beta(1/2, 1/2) which is non-information prior. 151 | 152 | Parameters 153 | ---------- 154 | name : Variant name. 155 | totals : Total number of experiment observations (e.g. number of sessions). 156 | positives : Total number of 1s for a given variant (e.g. number of conversions). 157 | a_prior : Prior alpha parameter of a Beta distribution (conjugate prior). 158 | Default value 0.5 is based on non-information prior Beta(0.5, 0.5). 159 | b_prior : Prior beta parameter of a Beta distribution (conjugate prior). 160 | Default value 0.5 is based on non-information prior Beta(0.5, 0.5). 161 | replace : Replace data if variant already exists. 162 | If set to False, data of existing variant will be appended to existing data. 
163 | """ 164 | if not isinstance(name, str): 165 | raise ValueError("Variant name has to be a string.") 166 | if a_prior <= 0 or b_prior <= 0: 167 | raise ValueError("Both [a_prior, b_prior] have to be positive numbers.") 168 | if totals <= 0: 169 | raise ValueError("Input variable 'totals' is expected to be positive integer.") 170 | if positives < 0: 171 | raise ValueError("Input variable 'positives' is expected to be non-negative integer.") 172 | if totals < positives: 173 | raise ValueError("Not possible to have more positives that totals!") 174 | 175 | if name not in self.variant_names: 176 | self.data[name] = { 177 | "totals": totals, 178 | "positives": positives, 179 | "a_prior": a_prior, 180 | "b_prior": b_prior, 181 | } 182 | elif name in self.variant_names and replace: 183 | msg = ( 184 | f"Variant {name} already exists - new data is replacing it. " 185 | "If you wish to append instead, use replace=False." 186 | ) 187 | logger.info(msg) 188 | self.data[name] = { 189 | "totals": totals, 190 | "positives": positives, 191 | "a_prior": a_prior, 192 | "b_prior": b_prior, 193 | } 194 | elif name in self.variant_names and not replace: 195 | msg = ( 196 | f"Variant {name} already exists - new data is appended to variant, " 197 | "keeping its original prior setup. " 198 | "If you wish to replace data instead, use replace=True." 199 | ) 200 | logger.info(msg) 201 | self.data[name]["totals"] += totals 202 | self.data[name]["positives"] += positives 203 | 204 | def add_variant_data( 205 | self, 206 | name: str, 207 | data: List[int], 208 | a_prior: Number = 0.5, 209 | b_prior: Number = 0.5, 210 | replace: bool = True, 211 | ) -> None: 212 | """ 213 | Add variant data to test class using raw binary data. 214 | 215 | Default prior setup is set for Beta(1/2, 1/2) which is non-information prior. 216 | 217 | Parameters 218 | ---------- 219 | name : Variant name. 220 | data : List of binary data containing zeros (non-conversion) and ones (conversions). 221 | a_prior : Prior alpha parameter of a Beta distribution (conjugate prior). 222 | Default value 0.5 is based on non-information prior Beta(0.5, 0.5). 223 | b_prior : Prior beta parameter of a Beta distribution (conjugate prior). 224 | Default value 0.5 is based on non-information prior Beta(0.5, 0.5). 225 | replace : Replace data if variant already exists. 226 | If set to False, data of existing variant will be appended to existing data. 227 | """ 228 | if len(data) == 0: 229 | raise ValueError("Data of added variant needs to have some observations.") 230 | if not min([i in [0, 1] for i in data]): 231 | raise ValueError("Input data needs to be a list of zeros and ones.") 232 | 233 | totals = len(data) 234 | positives = sum(data) 235 | 236 | self.add_variant_data_agg(name, totals, positives, a_prior, b_prior, replace) 237 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/delta_lognormal.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import List, Tuple 3 | 4 | import numpy as np 5 | 6 | from bayesian_testing.experiments.base import BaseDataTest 7 | from bayesian_testing.metrics import eval_delta_lognormal_agg 8 | from bayesian_testing.utilities import get_logger 9 | 10 | logger = get_logger("bayesian_testing") 11 | 12 | 13 | class DeltaLognormalDataTest(BaseDataTest): 14 | """ 15 | Class for Bayesian A/B test for Delta-LogNormal data (Log-Normal with possible zeros). 
16 | Delta-lognormal data is typical case of revenue/session data where many 17 | sessions are with 0 revenue (meaning non-conversions). 18 | To handle this data, the evaluation methods are combining binary bayes model for 19 | zero vs non-zero "conversion" and log-normal model for non-zero values. 20 | 21 | After class initialization, use add_variant methods to insert variant data. 22 | Then to get results of the test, use for instance `evaluate` method. 23 | """ 24 | 25 | def __init__(self) -> None: 26 | """ 27 | Initialize DeltaLognormalDataTest class. 28 | """ 29 | super().__init__() 30 | 31 | @property 32 | def totals(self): 33 | return [self.data[k]["totals"] for k in self.data] 34 | 35 | @property 36 | def positives(self): 37 | return [self.data[k]["positives"] for k in self.data] 38 | 39 | @property 40 | def sum_values(self): 41 | return [self.data[k]["sum_values"] for k in self.data] 42 | 43 | @property 44 | def sum_logs(self): 45 | return [self.data[k]["sum_logs"] for k in self.data] 46 | 47 | @property 48 | def sum_logs_2(self): 49 | return [self.data[k]["sum_logs_2"] for k in self.data] 50 | 51 | @property 52 | def a_priors_beta(self): 53 | return [self.data[k]["a_prior_beta"] for k in self.data] 54 | 55 | @property 56 | def b_priors_beta(self): 57 | return [self.data[k]["b_prior_beta"] for k in self.data] 58 | 59 | @property 60 | def m_priors(self): 61 | return [self.data[k]["m_prior"] for k in self.data] 62 | 63 | @property 64 | def a_priors_ig(self): 65 | return [self.data[k]["a_prior_ig"] for k in self.data] 66 | 67 | @property 68 | def b_priors_ig(self): 69 | return [self.data[k]["b_prior_ig"] for k in self.data] 70 | 71 | @property 72 | def w_priors(self): 73 | return [self.data[k]["w_prior"] for k in self.data] 74 | 75 | def eval_simulation( 76 | self, 77 | sim_count: int = 20000, 78 | seed: int = None, 79 | min_is_best: bool = False, 80 | interval_alpha: float = 0.95, 81 | ) -> Tuple[dict, dict, dict]: 82 | """ 83 | Calculate probabilities of being best, expected loss and credible intervals for a current 84 | class state. 85 | 86 | Parameters 87 | ---------- 88 | sim_count : Number of simulations to be used for probability estimation. 89 | seed : Random seed. 90 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 91 | interval_alpha : Credible interval probability (value between 0 and 1). 92 | 93 | Returns 94 | ------- 95 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 96 | res_loss : Dictionary with expected loss for all variants in experiment. 97 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 
98 | """ 99 | pbbs, loss, intervals = eval_delta_lognormal_agg( 100 | self.totals, 101 | self.positives, 102 | self.sum_logs, 103 | self.sum_logs_2, 104 | sim_count=sim_count, 105 | a_priors_beta=self.a_priors_beta, 106 | b_priors_beta=self.b_priors_beta, 107 | m_priors=self.m_priors, 108 | a_priors_ig=self.a_priors_ig, 109 | b_priors_ig=self.b_priors_ig, 110 | w_priors=self.w_priors, 111 | seed=seed, 112 | min_is_best=min_is_best, 113 | interval_alpha=interval_alpha, 114 | ) 115 | res_pbbs = dict(zip(self.variant_names, pbbs)) 116 | res_loss = dict(zip(self.variant_names, loss)) 117 | res_intervals = dict(zip(self.variant_names, intervals)) 118 | 119 | return res_pbbs, res_loss, res_intervals 120 | 121 | def evaluate( 122 | self, 123 | sim_count: int = 20000, 124 | seed: int = None, 125 | min_is_best: bool = False, 126 | interval_alpha: float = 0.95, 127 | ) -> List[dict]: 128 | """ 129 | Evaluation of experiment. 130 | 131 | Parameters 132 | ---------- 133 | sim_count : Number of simulations to be used for probability estimation. 134 | seed : Random seed. 135 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 136 | interval_alpha : Credible interval probability (value between 0 and 1). 137 | 138 | Returns 139 | ------- 140 | res : List of dictionaries with results per variant. 141 | """ 142 | keys = [ 143 | "variant", 144 | "totals", 145 | "positives", 146 | "sum_values", 147 | "avg_values", 148 | "avg_positive_values", 149 | "posterior_mean", 150 | "credible_interval", 151 | "prob_being_best", 152 | "expected_loss", 153 | ] 154 | avg_values = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.totals)] 155 | avg_pos_values = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.positives)] 156 | a_posterior_ig = [i[0] + (i[1] / 2) for i in zip(self.a_priors_ig, self.positives)] 157 | x_ig = [i[0] / i[1] for i in zip(self.sum_logs, self.positives)] 158 | b_posterior_ig = [ 159 | ( 160 | i[6] 161 | + (1 / 2) * (i[1] - 2 * i[0] * i[3] + i[2] * (i[3] ** 2)) 162 | + ((i[2] * i[5]) / (2 * (i[2] + i[5]))) * ((i[3] - i[4]) ** 2) 163 | ) 164 | for i in zip( 165 | self.sum_logs, 166 | self.sum_logs_2, 167 | self.positives, 168 | x_ig, 169 | self.m_priors, 170 | self.w_priors, 171 | self.b_priors_ig, 172 | ) 173 | ] 174 | posterior_mean = [ 175 | round( 176 | np.exp(((i[0] + i[3] * i[4]) / (i[1] + i[4])) + i[8] / (2 * i[7])) 177 | * ((i[5] + i[1]) / (i[6] + i[2])), 178 | 5, 179 | ) 180 | for i in zip( 181 | self.sum_logs, 182 | self.positives, 183 | self.totals, 184 | self.m_priors, 185 | self.w_priors, 186 | self.a_priors_beta, 187 | self.b_priors_beta, 188 | a_posterior_ig, 189 | b_posterior_ig, 190 | ) 191 | ] 192 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 193 | sim_count, seed, min_is_best, interval_alpha 194 | ) 195 | pbbs = list(eval_pbbs.values()) 196 | loss = list(eval_loss.values()) 197 | intervals = list(eval_intervals.values()) 198 | data = [ 199 | self.variant_names, 200 | self.totals, 201 | self.positives, 202 | [round(i, 5) for i in self.sum_values], 203 | avg_values, 204 | avg_pos_values, 205 | posterior_mean, 206 | intervals, 207 | pbbs, 208 | loss, 209 | ] 210 | res = [dict(zip(keys, item)) for item in zip(*data)] 211 | 212 | return res 213 | 214 | def add_variant_data_agg( 215 | self, 216 | name: str, 217 | totals: int, 218 | positives: int, 219 | sum_values: float, 220 | sum_logs: float, 221 | sum_logs_2: float, 222 | a_prior_beta: Number = 0.5, 223 | b_prior_beta: Number = 0.5, 224 | m_prior: Number = 1, 225 | 
a_prior_ig: Number = 0, 226 | b_prior_ig: Number = 0, 227 | w_prior: Number = 0.01, 228 | replace: bool = True, 229 | ) -> None: 230 | """ 231 | Add variant data to test class using aggregated Delta-LogNormal data. 232 | This can be convenient as aggregation can be done on database level. 233 | 234 | The goal of default prior setup is to be low information. 235 | It should be tuned with caution. 236 | 237 | Parameters 238 | ---------- 239 | name : Variant name. 240 | totals : Total number of experiment observations (e.g. number of sessions). 241 | positives : Total number of non-zero values for a given variant. 242 | sum_values : Sum of non-zero values for a given variant. 243 | sum_logs : Sum of logarithms of non-zero data values for a given variant. 244 | sum_logs_2 : Sum of logarithms squrared of non-zero data values for a given variant. 245 | a_prior_beta : Prior alpha parameter from Beta distribution for conversion part. 246 | b_prior_beta : Prior beta parameter from Beta distribution for conversion part. 247 | m_prior : Prior normal mean for logarithms of non-zero data. 248 | a_prior_ig : Prior alpha from inverse gamma dist. for unknown variance of logarithms. 249 | In theory a > 0, but as we always have at least one observation, we can start at 0. 250 | b_prior_ig : Prior beta from inverse gamma dist. for unknown variance of logarithms. 251 | In theory b > 0, but as we always have at least one observation, we can start at 0. 252 | w_prior : Prior effective sample sizes for normal distribution of logarithms of data. 253 | replace : Replace data if variant already exists. 254 | If set to False, data of existing variant will be appended to existing data. 255 | """ 256 | if not isinstance(name, str): 257 | raise ValueError("Variant name has to be a string.") 258 | if a_prior_beta <= 0 or b_prior_beta <= 0: 259 | raise ValueError("Both [a_prior_beta, b_prior_beta] have to be positive numbers.") 260 | if m_prior < 0 or a_prior_ig < 0 or b_prior_ig < 0 or w_prior < 0: 261 | raise ValueError("All priors of [m, a_ig, b_ig, w] have to be non-negative numbers.") 262 | if positives == 0: 263 | raise ValueError("Variant has to have some non-zero (positive) values.") 264 | if positives < 0: 265 | raise ValueError("Input variable 'positives' is expected to be a positive integer.") 266 | if totals < positives: 267 | raise ValueError("Not possible to have more positives that totals!") 268 | 269 | if name not in self.variant_names: 270 | self.data[name] = { 271 | "totals": totals, 272 | "positives": positives, 273 | "sum_values": sum_values, 274 | "sum_logs": sum_logs, 275 | "sum_logs_2": sum_logs_2, 276 | "a_prior_beta": a_prior_beta, 277 | "b_prior_beta": b_prior_beta, 278 | "m_prior": m_prior, 279 | "a_prior_ig": a_prior_ig, 280 | "b_prior_ig": b_prior_ig, 281 | "w_prior": w_prior, 282 | } 283 | elif name in self.variant_names and replace: 284 | msg = ( 285 | f"Variant {name} already exists - new data is replacing it. " 286 | "If you wish to append instead, use replace=False." 
287 | ) 288 | logger.info(msg) 289 | self.data[name] = { 290 | "totals": totals, 291 | "positives": positives, 292 | "sum_values": sum_values, 293 | "sum_logs": sum_logs, 294 | "sum_logs_2": sum_logs_2, 295 | "a_prior_beta": a_prior_beta, 296 | "b_prior_beta": b_prior_beta, 297 | "m_prior": m_prior, 298 | "a_prior_ig": a_prior_ig, 299 | "b_prior_ig": b_prior_ig, 300 | "w_prior": w_prior, 301 | } 302 | elif name in self.variant_names and not replace: 303 | msg = ( 304 | f"Variant {name} already exists - new data is appended to variant, " 305 | "keeping its original prior setup. " 306 | "If you wish to replace data instead, use replace=True." 307 | ) 308 | logger.info(msg) 309 | self.data[name]["totals"] += totals 310 | self.data[name]["positives"] += positives 311 | self.data[name]["sum_values"] += sum_values 312 | self.data[name]["sum_logs"] += sum_logs 313 | self.data[name]["sum_logs_2"] += sum_logs_2 314 | 315 | def add_variant_data( 316 | self, 317 | name: str, 318 | data: List[Number], 319 | a_prior_beta: Number = 0.5, 320 | b_prior_beta: Number = 0.5, 321 | m_prior: Number = 1, 322 | a_prior_ig: Number = 0, 323 | b_prior_ig: Number = 0, 324 | w_prior: Number = 0.01, 325 | replace: bool = True, 326 | ) -> None: 327 | """ 328 | Add variant data to test class using raw Delta-LogNormal data. 329 | 330 | The goal of default prior setup is to be low information. It should be tuned with caution. 331 | 332 | Parameters 333 | ---------- 334 | name : Variant name. 335 | data : List of delta-lognormal data (e.g. revenues in sessions). 336 | a_prior_beta : Prior alpha parameter from Beta distribution for conversion part. 337 | b_prior_beta : Prior beta parameter from Beta distribution for conversion part. 338 | m_prior : Prior mean for logarithms of non-zero data. 339 | a_prior_ig : Prior alpha from inverse gamma dist. for unknown variance of logarithms. 340 | In theory a > 0, but as we always have at least one observation, we can start at 0. 341 | b_prior_ig : Prior beta from inverse gamma dist. for unknown variance of logarithms. 342 | In theory b > 0, but as we always have at least one observation, we can start at 0. 343 | w_prior : Prior effective sample sizes for normal distribution of logarithms of data. 344 | replace : Replace data if variant already exists. 345 | If set to False, data of existing variant will be appended to existing data. 
346 | """ 347 | if len(data) == 0: 348 | raise ValueError("Data of added variant needs to have some observations.") 349 | if min(data) < 0: 350 | raise ValueError("Input data needs to be a list of non-negative numbers.") 351 | 352 | totals = len(data) 353 | positives = sum(x > 0 for x in data) 354 | sum_values = sum(data) 355 | sum_logs = sum([np.log(x) for x in data if x > 0]) 356 | sum_logs_2 = sum([np.square(np.log(x)) for x in data if x > 0]) 357 | 358 | self.add_variant_data_agg( 359 | name, 360 | totals, 361 | positives, 362 | sum_values, 363 | sum_logs, 364 | sum_logs_2, 365 | a_prior_beta, 366 | b_prior_beta, 367 | m_prior, 368 | a_prior_ig, 369 | b_prior_ig, 370 | w_prior, 371 | replace, 372 | ) 373 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/delta_normal.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import List, Tuple 3 | import numpy as np 4 | from bayesian_testing.experiments.base import BaseDataTest 5 | from bayesian_testing.metrics import eval_delta_normal_agg 6 | from bayesian_testing.utilities import get_logger 7 | 8 | logger = get_logger("bayesian_testing") 9 | 10 | 11 | class DeltaNormalDataTest(BaseDataTest): 12 | """ 13 | Class for Bayesian A/B test for Delta-Normal data (Normally distributed conversions). 14 | Delta-normal data is typical case of net profit data where many sessions have 0 values 15 | (meaning non-conversions), and the remaining revenue data is normally distributed. 16 | To handle this data, the evaluation methods are combining binary bayes model 17 | for zero vs non-zero “conversion” and normal model. 18 | 19 | After class initialization, use add_variant methods to insert variant data. 20 | Then to get results of the test, use for instance `evaluate` method. 21 | """ 22 | 23 | def __init__(self) -> None: 24 | """ 25 | Initialize DeltaNormalDataTest class. 26 | """ 27 | super().__init__() 28 | 29 | @property 30 | def totals(self): 31 | return [self.data[k]["totals"] for k in self.data] 32 | 33 | @property 34 | def non_zeros(self): 35 | return [self.data[k]["non_zeros"] for k in self.data] 36 | 37 | @property 38 | def sum_values(self): 39 | return [self.data[k]["sum_values"] for k in self.data] 40 | 41 | @property 42 | def sum_values_2(self): 43 | return [self.data[k]["sum_values_2"] for k in self.data] 44 | 45 | @property 46 | def a_priors_beta(self): 47 | return [self.data[k]["a_prior_beta"] for k in self.data] 48 | 49 | @property 50 | def b_priors_beta(self): 51 | return [self.data[k]["b_prior_beta"] for k in self.data] 52 | 53 | @property 54 | def m_priors(self): 55 | return [self.data[k]["m_prior"] for k in self.data] 56 | 57 | @property 58 | def a_priors_ig(self): 59 | return [self.data[k]["a_prior_ig"] for k in self.data] 60 | 61 | @property 62 | def b_priors_ig(self): 63 | return [self.data[k]["b_prior_ig"] for k in self.data] 64 | 65 | @property 66 | def w_priors(self): 67 | return [self.data[k]["w_prior"] for k in self.data] 68 | 69 | def eval_simulation( 70 | self, 71 | sim_count: int = 20000, 72 | seed: int = None, 73 | min_is_best: bool = False, 74 | interval_alpha: float = 0.95, 75 | ) -> Tuple[dict, dict, dict]: 76 | """ 77 | Calculate probabilities of being best, expected loss and credible intervals for a current 78 | class state. 79 | 80 | Parameters 81 | ---------- 82 | sim_count : Number of simulations to be used for probability estimation. 83 | seed : Random seed. 
84 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 85 | interval_alpha : Credible interval probability (value between 0 and 1). 86 | 87 | Returns 88 | ------- 89 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 90 | res_loss : Dictionary with expected loss for all variants in experiment. 91 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 92 | """ 93 | pbbs, loss, intervals = eval_delta_normal_agg( 94 | self.totals, 95 | self.non_zeros, 96 | self.sum_values, 97 | self.sum_values_2, 98 | sim_count=sim_count, 99 | a_priors_beta=self.a_priors_beta, 100 | b_priors_beta=self.b_priors_beta, 101 | m_priors=self.m_priors, 102 | a_priors_ig=self.a_priors_ig, 103 | b_priors_ig=self.b_priors_ig, 104 | w_priors=self.w_priors, 105 | seed=seed, 106 | min_is_best=min_is_best, 107 | interval_alpha=interval_alpha, 108 | ) 109 | res_pbbs = dict(zip(self.variant_names, pbbs)) 110 | res_loss = dict(zip(self.variant_names, loss)) 111 | res_intervals = dict(zip(self.variant_names, intervals)) 112 | 113 | return res_pbbs, res_loss, res_intervals 114 | 115 | def evaluate( 116 | self, 117 | sim_count: int = 20000, 118 | seed: int = None, 119 | min_is_best: bool = False, 120 | interval_alpha: float = 0.95, 121 | ) -> List[dict]: 122 | """ 123 | Evaluation of experiment. 124 | 125 | Parameters 126 | ---------- 127 | sim_count : Number of simulations to be used for probability estimation. 128 | seed : Random seed. 129 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 130 | interval_alpha : Credible interval probability (value between 0 and 1). 131 | 132 | Returns 133 | ------- 134 | res : List of dictionaries with results per variant. 135 | """ 136 | keys = [ 137 | "variant", 138 | "totals", 139 | "non_zeros", 140 | "sum_values", 141 | "avg_values", 142 | "avg_non_zero_values", 143 | "posterior_mean", 144 | "credible_interval", 145 | "prob_being_best", 146 | "expected_loss", 147 | ] 148 | avg_values = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.totals)] 149 | avg_pos_values = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.non_zeros)] 150 | posterior_mean = [ 151 | round(((i[0] + i[3] * i[4]) / (i[1] + i[4])) * ((i[5] + i[1]) / (i[6] + i[2])), 5) 152 | for i in zip( 153 | self.sum_values, 154 | self.non_zeros, 155 | self.totals, 156 | self.m_priors, 157 | self.w_priors, 158 | self.a_priors_beta, 159 | self.b_priors_beta, 160 | ) 161 | ] 162 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 163 | sim_count, seed, min_is_best, interval_alpha 164 | ) 165 | pbbs = list(eval_pbbs.values()) 166 | loss = list(eval_loss.values()) 167 | intervals = list(eval_intervals.values()) 168 | data = [ 169 | self.variant_names, 170 | self.totals, 171 | self.non_zeros, 172 | [round(i, 5) for i in self.sum_values], 173 | avg_values, 174 | avg_pos_values, 175 | posterior_mean, 176 | intervals, 177 | pbbs, 178 | loss, 179 | ] 180 | res = [dict(zip(keys, item)) for item in zip(*data)] 181 | 182 | return res 183 | 184 | def add_variant_data_agg( 185 | self, 186 | name: str, 187 | totals: int, 188 | non_zeros: int, 189 | sum_values: float, 190 | sum_values_2: float, 191 | a_prior_beta: Number = 0.5, 192 | b_prior_beta: Number = 0.5, 193 | m_prior: Number = 1, 194 | a_prior_ig: Number = 0, 195 | b_prior_ig: Number = 0, 196 | w_prior: Number = 0.01, 197 | replace: bool = True, 198 | ) -> None: 199 | """ 200 | Add variant data to test class using aggregated Delta-Normal 
data.
201 |         This can be convenient as aggregation can be done on database level.
202 | 
203 |         The goal of default prior setup is to be low information.
204 |         It should be tuned with caution.
205 | 
206 |         Parameters
207 |         ----------
208 |         name : Variant name.
209 |         totals : Total number of experiment observations (e.g. number of sessions).
210 |         non_zeros : Total number of non-zero values for a given variant.
211 |         sum_values : Sum of non-zero values for a given variant.
212 |         sum_values_2 : Sum of values squared for a given variant.
213 |         a_prior_beta : Prior alpha parameter from Beta distribution for conversion part.
214 |         b_prior_beta : Prior beta parameter from Beta distribution for conversion part.
215 |         m_prior : Prior normal mean.
216 |         a_prior_ig : Prior alpha from inverse gamma dist. for unknown variance.
217 |             In theory a > 0, but as we always have at least one observation, we can start at 0.
218 |         b_prior_ig : Prior beta from inverse gamma dist. for unknown variance.
219 |             In theory b > 0, but as we always have at least one observation, we can start at 0.
220 |         w_prior : Prior effective sample sizes.
221 |         replace : Replace data if variant already exists.
222 |             If set to False, data of existing variant will be appended to existing data.
223 |         """
224 |         if not isinstance(name, str):
225 |             raise ValueError("Variant name has to be a string.")
226 |         if a_prior_beta <= 0 or b_prior_beta <= 0:
227 |             raise ValueError("Both [a_prior_beta, b_prior_beta] have to be positive numbers.")
228 |         if m_prior < 0 or a_prior_ig < 0 or b_prior_ig < 0 or w_prior < 0:
229 |             raise ValueError("All priors of [m, a_ig, b_ig, w] have to be non-negative numbers.")
230 |         if non_zeros == 0:
231 |             raise ValueError("Variant has to have some non-zero values.")
232 |         if non_zeros < 0:
233 |             raise ValueError("Input variable 'non_zeros' is expected to be positive integer.")
234 |         if totals < non_zeros:
235 |             raise ValueError("Not possible to have more non_zero numbers than totals!")
236 | 
237 |         if name not in self.variant_names:
238 |             self.data[name] = {
239 |                 "totals": totals,
240 |                 "non_zeros": non_zeros,
241 |                 "sum_values": sum_values,
242 |                 "sum_values_2": sum_values_2,
243 |                 "a_prior_beta": a_prior_beta,
244 |                 "b_prior_beta": b_prior_beta,
245 |                 "m_prior": m_prior,
246 |                 "a_prior_ig": a_prior_ig,
247 |                 "b_prior_ig": b_prior_ig,
248 |                 "w_prior": w_prior,
249 |             }
250 |         elif name in self.variant_names and replace:
251 |             msg = (
252 |                 f"Variant {name} already exists - new data is replacing it. "
253 |                 "If you wish to append instead, use replace=False."
254 |             )
255 |             logger.info(msg)
256 |             self.data[name] = {
257 |                 "totals": totals,
258 |                 "non_zeros": non_zeros,
259 |                 "sum_values": sum_values,
260 |                 "sum_values_2": sum_values_2,
261 |                 "a_prior_beta": a_prior_beta,
262 |                 "b_prior_beta": b_prior_beta,
263 |                 "m_prior": m_prior,
264 |                 "a_prior_ig": a_prior_ig,
265 |                 "b_prior_ig": b_prior_ig,
266 |                 "w_prior": w_prior,
267 |             }
268 |         elif name in self.variant_names and not replace:
269 |             msg = (
270 |                 f"Variant {name} already exists - new data is appended to variant, "
271 |                 "keeping its original prior setup. "
272 |                 "If you wish to replace data instead, use replace=True."
273 | ) 274 | logger.info(msg) 275 | self.data[name]["totals"] += totals 276 | self.data[name]["non_zeros"] += non_zeros 277 | self.data[name]["sum_values"] += sum_values 278 | self.data[name]["sum_values_2"] += sum_values_2 279 | 280 | def add_variant_data( 281 | self, 282 | name: str, 283 | data: List[Number], 284 | a_prior_beta: Number = 0.5, 285 | b_prior_beta: Number = 0.5, 286 | m_prior: Number = 1, 287 | a_prior_ig: Number = 0, 288 | b_prior_ig: Number = 0, 289 | w_prior: Number = 0.01, 290 | replace: bool = True, 291 | ) -> None: 292 | """ 293 | Add variant data to test class using raw Delta-Normal data. 294 | 295 | The goal of default prior setup is to be low information. It should be tuned with caution. 296 | 297 | Parameters 298 | ---------- 299 | name : Variant name. 300 | data : List of delta-normal data (e.g. revenues in sessions). 301 | a_prior_beta : Prior alpha parameter from Beta distribution for conversion part. 302 | b_prior_beta : Prior beta parameter from Beta distribution for conversion part. 303 | m_prior : Prior normal mean. 304 | a_prior_ig : Prior alpha from inverse gamma dist. for unknown variance. 305 | In theory a > 0, but as we always have at least one observation, we can start at 0. 306 | b_prior_ig : Prior beta from inverse gamma dist. for unknown variance. 307 | In theory b > 0, but as we always have at least one observation, we can start at 0. 308 | w_prior : Prior effective sample sizes. 309 | replace : Replace data if variant already exists. 310 | If set to False, data of existing variant will be appended to existing data. 311 | """ 312 | if len(data) == 0: 313 | raise ValueError("Data of added variant needs to have some observations.") 314 | 315 | totals = len(data) 316 | non_zeros = sum(x != 0 for x in data) 317 | sum_values = sum(data) 318 | sum_values_2 = sum(np.square(data)) 319 | 320 | self.add_variant_data_agg( 321 | name, 322 | totals, 323 | non_zeros, 324 | sum_values, 325 | sum_values_2, 326 | a_prior_beta, 327 | b_prior_beta, 328 | m_prior, 329 | a_prior_ig, 330 | b_prior_ig, 331 | w_prior, 332 | replace, 333 | ) 334 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/discrete.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import List, Tuple, Union 3 | import numpy as np 4 | 5 | from bayesian_testing.experiments.base import BaseDataTest 6 | from bayesian_testing.metrics import eval_numerical_dirichlet_agg 7 | from bayesian_testing.utilities import get_logger 8 | 9 | logger = get_logger("bayesian_testing") 10 | 11 | 12 | class DiscreteDataTest(BaseDataTest): 13 | """ 14 | Class for Bayesian A/B test for data with finite discrete states (i.e. categorical data 15 | with numerical categories). As a real world examples we can think of dice rolls, 16 | 1-5 star ratings, 1-10 ratings, etc. 17 | 18 | After class initialization, use add_variant methods to insert variant data. 19 | Then to get results of the test, use for instance `evaluate` method. 20 | """ 21 | 22 | def __init__(self, states: List[Union[float, int]]) -> None: 23 | """ 24 | Initialize DiscreteDataTest class. 25 | 26 | Parameters 27 | ---------- 28 | states : List of all possible states for a given discrete variable. 
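(Usage aside: with delta_normal.py complete above, a minimal end-to-end run of `DeltaNormalDataTest` could look as follows. The package-level export from `bayesian_testing.experiments` is an assumption here, inferred from the import style used in the example notebooks.)

```python
import numpy as np
from bayesian_testing.experiments import DeltaNormalDataTest  # assumed export

rng = np.random.default_rng(21)
# net-profit-like data: roughly 80% zeros, non-zero values approximately normal
data_a = [0 if rng.random() < 0.80 else rng.normal(50, 10) for _ in range(1000)]
data_b = [0 if rng.random() < 0.75 else rng.normal(48, 10) for _ in range(1000)]

test = DeltaNormalDataTest()
test.add_variant_data("A", data_a)
test.add_variant_data("B", data_b)
results = test.evaluate(sim_count=20000, seed=52)  # list of per-variant result dicts
```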
29 | """ 30 | super().__init__() 31 | if not self.check_if_numerical(states): 32 | raise ValueError("States in the test have to be numbers (int or float).") 33 | self.states = states 34 | 35 | @property 36 | def concentrations(self): 37 | return [self.data[k]["concentration"] for k in self.data] 38 | 39 | @property 40 | def prior_alphas(self): 41 | return [self.data[k]["prior"] for k in self.data] 42 | 43 | @staticmethod 44 | def check_if_numerical(values): 45 | res = True 46 | for v in values: 47 | if not isinstance(v, Number): 48 | res = False 49 | return res 50 | 51 | def eval_simulation( 52 | self, 53 | sim_count: int = 20000, 54 | seed: int = None, 55 | min_is_best: bool = False, 56 | interval_alpha: float = 0.95, 57 | ) -> Tuple[dict, dict, dict]: 58 | """ 59 | Calculate probabilities of being best, expected loss and credible intervals for a current 60 | class state. 61 | 62 | Parameters 63 | ---------- 64 | sim_count : Number of simulations to be used for probability estimation. 65 | seed : Random seed. 66 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 67 | interval_alpha : Credible interval probability (value between 0 and 1). 68 | 69 | Returns 70 | ------- 71 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 72 | res_loss : Dictionary with expected loss for all variants in experiment. 73 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 74 | """ 75 | pbbs, loss, intervals = eval_numerical_dirichlet_agg( 76 | self.states, 77 | self.concentrations, 78 | self.prior_alphas, 79 | sim_count, 80 | seed, 81 | min_is_best, 82 | interval_alpha, 83 | ) 84 | res_pbbs = dict(zip(self.variant_names, pbbs)) 85 | res_loss = dict(zip(self.variant_names, loss)) 86 | res_intervals = dict(zip(self.variant_names, intervals)) 87 | 88 | return res_pbbs, res_loss, res_intervals 89 | 90 | def evaluate( 91 | self, 92 | sim_count: int = 20000, 93 | seed: int = None, 94 | min_is_best: bool = False, 95 | interval_alpha: float = 0.95, 96 | ) -> List[dict]: 97 | """ 98 | Evaluation of experiment. 99 | 100 | Parameters 101 | ---------- 102 | sim_count : Number of simulations to be used for probability estimation. 103 | seed : Random seed. 104 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 105 | interval_alpha : Credible interval probability (value between 0 and 1). 106 | 107 | Returns 108 | ------- 109 | res : List of dictionaries with results per variant. 
110 | """ 111 | keys = [ 112 | "variant", 113 | "concentration", 114 | "average_value", 115 | "posterior_mean", 116 | "credible_interval", 117 | "prob_being_best", 118 | "expected_loss", 119 | ] 120 | posterior_alphas = [ 121 | list(np.array(i[0]) + np.array(i[1])) 122 | for i in zip(self.concentrations, self.prior_alphas) 123 | ] 124 | posterior_mean = [ 125 | round(sum(np.multiply(np.array(self.states), np.array(i[0]) / sum(np.array(i[0])))), 5) 126 | for i in zip(posterior_alphas) 127 | ] 128 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 129 | sim_count, seed, min_is_best, interval_alpha 130 | ) 131 | pbbs = list(eval_pbbs.values()) 132 | loss = list(eval_loss.values()) 133 | intervals = list(eval_intervals.values()) 134 | average_values = [ 135 | np.sum(np.multiply(i, self.states)) / np.sum(i) for i in self.concentrations 136 | ] 137 | data = [ 138 | self.variant_names, 139 | [dict(zip(self.states, i)) for i in self.concentrations], 140 | average_values, 141 | posterior_mean, 142 | intervals, 143 | pbbs, 144 | loss, 145 | ] 146 | res = [dict(zip(keys, item)) for item in zip(*data)] 147 | 148 | return res 149 | 150 | def add_variant_data_agg( 151 | self, 152 | name: str, 153 | concentration: List[int], 154 | prior: List[Union[float, int]] = None, 155 | replace: bool = True, 156 | ) -> None: 157 | """ 158 | Add variant data to test class using aggregated discrete data. 159 | This can be convenient as aggregation can be done on database level. 160 | 161 | Default prior setup is Dirichlet(1,...,1) which is low information prior 162 | (we can interpret it as prior 1 observation of each state). 163 | 164 | Parameters 165 | ---------- 166 | name : Variant name. 167 | concentration : Total number of experiment observations for each state 168 | (e.g. number of rolls for each side in a die roll data). 169 | prior : Prior alpha parameters of a Dirichlet distribution (conjugate prior). 170 | replace : Replace data if variant already exists. 171 | If set to False, data of existing variant will be appended to existing data. 172 | """ 173 | if not isinstance(name, str): 174 | raise ValueError("Variant name has to be a string.") 175 | if not len(self.states) == len(concentration): 176 | msg = ( 177 | f"Concentration list has to have same size as number of states in a test " 178 | f"{len(concentration)} != {len(self.states)}." 179 | ) 180 | raise ValueError(msg) 181 | if not self.check_if_numerical(concentration): 182 | raise ValueError("Concentration parameter has to be a list of integer values.") 183 | 184 | if not prior: 185 | prior = [1] * len(self.states) 186 | 187 | if name not in self.variant_names: 188 | self.data[name] = {"concentration": concentration, "prior": prior} 189 | elif name in self.variant_names and replace: 190 | msg = ( 191 | f"Variant {name} already exists - new data is replacing it. " 192 | "If you wish to append instead, use replace=False." 193 | ) 194 | logger.info(msg) 195 | self.data[name] = {"concentration": concentration, "prior": prior} 196 | elif name in self.variant_names and not replace: 197 | msg = ( 198 | f"Variant {name} already exists - new data is appended to variant, " 199 | "keeping its original prior setup. " 200 | "If you wish to replace data instead, use replace=True." 
201 |             )
202 |             logger.info(msg)
203 |             self.data[name]["concentration"] = [
204 |                 sum(x) for x in zip(self.data[name]["concentration"], concentration)
205 |             ]
206 | 
207 |     def add_variant_data(
208 |         self,
209 |         name: str,
210 |         data: List[int],
211 |         prior: List[Union[float, int]] = None,
212 |         replace: bool = True,
213 |     ) -> None:
214 |         """
215 |         Add variant data to test class using raw discrete data.
216 | 
217 |         Default prior setup is Dirichlet(1,...,1) which is a low-information prior
218 |         (we can interpret it as prior 1 observation of each state).
219 | 
220 |         Parameters
221 |         ----------
222 |         name : Variant name.
223 |         data : List of numerical data observations from possible states.
224 |         prior : Prior alpha parameters of a Dirichlet distribution (conjugate prior).
225 |         replace : Replace data if variant already exists.
226 |             If set to False, data of existing variant will be appended to existing data.
227 |         """
228 |         if len(data) == 0:
229 |             raise ValueError("Data of added variant needs to have some observations.")
230 |         if not all(i in self.states for i in data):
231 |             msg = (
232 |                 f"Input data needs to be a list of numbers from possible states: {self.states}."
233 |             )
234 |             raise ValueError(msg)
235 | 
236 |         counter_dict = dict(zip(self.states, np.zeros(len(self.states))))
237 |         for i in data:
238 |             counter_dict[i] += 1
239 |         concentration = [counter_dict[i] for i in self.states]
240 | 
241 |         self.add_variant_data_agg(name, concentration, prior, replace)
242 | 
--------------------------------------------------------------------------------
/bayesian_testing/experiments/exponential.py:
--------------------------------------------------------------------------------
1 | from numbers import Number
2 | from typing import List, Tuple, Union
3 | 
4 | from bayesian_testing.experiments.base import BaseDataTest
5 | from bayesian_testing.metrics import eval_exponential_agg
6 | from bayesian_testing.utilities import get_logger
7 | 
8 | logger = get_logger("bayesian_testing")
9 | 
10 | 
11 | class ExponentialDataTest(BaseDataTest):
12 |     """
13 |     Class for Bayesian A/B test for Exponential data (e.g. session time, waiting time, etc.).
14 | 
15 |     After class initialization, use add_variant methods to insert variant data.
16 |     Then to get results of the test, use for instance `evaluate` method.
17 |     """
18 | 
19 |     def __init__(self) -> None:
20 |         """
21 |         Initialize ExponentialDataTest class.
22 |         """
23 |         super().__init__()
24 | 
25 |     @property
26 |     def totals(self):
27 |         return [self.data[k]["totals"] for k in self.data]
28 | 
29 |     @property
30 |     def sum_values(self):
31 |         return [self.data[k]["sum_values"] for k in self.data]
32 | 
33 |     @property
34 |     def a_priors(self):
35 |         return [self.data[k]["a_prior"] for k in self.data]
36 | 
37 |     @property
38 |     def b_priors(self):
39 |         return [self.data[k]["b_prior"] for k in self.data]
40 | 
41 |     def eval_simulation(
42 |         self,
43 |         sim_count: int = 20000,
44 |         seed: int = None,
45 |         min_is_best: bool = False,
46 |         interval_alpha: float = 0.95,
47 |     ) -> Tuple[dict, dict, dict]:
48 |         """
49 |         Calculate probabilities of being best, expected loss and credible intervals for a current
50 |         class state.
51 | 
52 |         Parameters
53 |         ----------
54 |         sim_count : Number of simulations to be used for probability estimation.
55 |         seed : Random seed.
56 |         min_is_best : Option to change "being best" to a minimum. Default is maximum.
57 |         interval_alpha : Credible interval probability (value between 0 and 1).
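(Aside on the model behind this class, a standard conjugacy result stated here for reference: for exponential observations with rate λ and a Gamma(a, b) prior on λ, the posterior after n observations with sum Σx is Gamma(a + n, b + Σx). A minimal sketch:)

```python
import numpy as np

a_prior, b_prior = 0.1, 0.1
data = [1.2, 0.4, 2.8, 0.9]   # e.g. waiting times in minutes
a_post = a_prior + len(data)  # a + n = 4.1
b_post = b_prior + sum(data)  # b + sum(x) = 5.4

rng = np.random.default_rng(0)
rate_samples = rng.gamma(a_post, 1 / b_post, 20000)  # numpy gamma takes a *scale*
mean_time_samples = 1 / rate_samples  # posterior draws of the mean waiting time
```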
58 | 59 | Returns 60 | ------- 61 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 62 | res_loss : Dictionary with expected loss for all variants in experiment. 63 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 64 | """ 65 | pbbs, loss, intervals = eval_exponential_agg( 66 | self.totals, 67 | self.sum_values, 68 | self.a_priors, 69 | self.b_priors, 70 | sim_count, 71 | seed, 72 | min_is_best, 73 | interval_alpha, 74 | ) 75 | res_pbbs = dict(zip(self.variant_names, pbbs)) 76 | res_loss = dict(zip(self.variant_names, loss)) 77 | res_intervals = dict(zip(self.variant_names, intervals)) 78 | 79 | return res_pbbs, res_loss, res_intervals 80 | 81 | def evaluate( 82 | self, 83 | sim_count: int = 20000, 84 | seed: int = None, 85 | min_is_best: bool = False, 86 | interval_alpha: float = 0.95, 87 | ) -> List[dict]: 88 | """ 89 | Evaluation of experiment. 90 | 91 | Parameters 92 | ---------- 93 | sim_count : Number of simulations to be used for probability estimation. 94 | seed : Random seed. 95 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 96 | interval_alpha : Credible interval probability (value between 0 and 1). 97 | 98 | Returns 99 | ------- 100 | res : List of dictionaries with results per variant. 101 | """ 102 | keys = [ 103 | "variant", 104 | "totals", 105 | "sum_values", 106 | "observed_average", 107 | "posterior_mean", 108 | "credible_interval", 109 | "prob_being_best", 110 | "expected_loss", 111 | ] 112 | observed_average = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.totals)] 113 | posterior_mean = [ 114 | round((i[3] + i[1]) / (i[2] + i[0]), 5) 115 | for i in zip(self.totals, self.sum_values, self.a_priors, self.b_priors) 116 | ] 117 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 118 | sim_count, seed, min_is_best, interval_alpha 119 | ) 120 | pbbs = list(eval_pbbs.values()) 121 | loss = list(eval_loss.values()) 122 | intervals = list(eval_intervals.values()) 123 | data = [ 124 | self.variant_names, 125 | self.totals, 126 | [round(i, 5) for i in self.sum_values], 127 | observed_average, 128 | posterior_mean, 129 | intervals, 130 | pbbs, 131 | loss, 132 | ] 133 | res = [dict(zip(keys, item)) for item in zip(*data)] 134 | 135 | return res 136 | 137 | def add_variant_data_agg( 138 | self, 139 | name: str, 140 | totals: int, 141 | sum_values: Union[float, int], 142 | a_prior: Number = 0.1, 143 | b_prior: Number = 0.1, 144 | replace: bool = True, 145 | ) -> None: 146 | """ 147 | Add variant data to a test class using aggregated Exponential data. 148 | This can be convenient as aggregation can be done on database level. 149 | 150 | Default prior setup is set for Gamma(0.1, 0.1) which is on purpose very vague prior. 151 | 152 | Parameters 153 | ---------- 154 | name : Variant name. 155 | totals : Total number of experiment observations (e.g. number of sessions). 156 | sum_values : Sum of values for a given variant (e.g. total sum of waiting time). 157 | a_prior : Prior alpha parameter of a Gamma distribution (conjugate prior). 158 | Default value 0.1 is on purpose to be vague (lower information). 159 | b_prior : Prior beta parameter (rate) of a Gamma distribution (conjugate prior). 160 | Default value 0.1 is on purpose to be vague (lower information). 161 | replace : Replace data if variant already exists. 162 | If set to False, data of existing variant will be appended to existing data. 
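(Usage aside: a minimal sketch of feeding pre-aggregated data into this method, e.g. totals and sums computed by a database query. `ExponentialDataTest` is the class defined in this file; its package-level export is assumed.)

```python
from bayesian_testing.experiments import ExponentialDataTest  # assumed export

test = ExponentialDataTest()
# totals = number of sessions, sum_values = total waiting time per variant
test.add_variant_data_agg("A", totals=1000, sum_values=1520.3)
test.add_variant_data_agg("B", totals=980, sum_values=1390.7)
pbbs, loss, intervals = test.eval_simulation(sim_count=20000, seed=52)
```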
163 | """ 164 | if not isinstance(name, str): 165 | raise ValueError("Variant name has to be a string.") 166 | if a_prior <= 0 or b_prior <= 0: 167 | raise ValueError("Both [a_prior, b_prior] have to be positive numbers.") 168 | if totals <= 0: 169 | raise ValueError("Input variable 'totals' is expected to be positive integer.") 170 | if sum_values < 0: 171 | raise ValueError("Input variable 'sum_values' is expected to be non-negative number.") 172 | 173 | if name not in self.variant_names: 174 | self.data[name] = { 175 | "totals": totals, 176 | "sum_values": sum_values, 177 | "a_prior": a_prior, 178 | "b_prior": b_prior, 179 | } 180 | elif name in self.variant_names and replace: 181 | msg = ( 182 | f"Variant {name} already exists - new data is replacing it. " 183 | "If you wish to append instead, use replace=False." 184 | ) 185 | logger.info(msg) 186 | self.data[name] = { 187 | "totals": totals, 188 | "sum_values": sum_values, 189 | "a_prior": a_prior, 190 | "b_prior": b_prior, 191 | } 192 | elif name in self.variant_names and not replace: 193 | msg = ( 194 | f"Variant {name} already exists - new data is appended to variant, " 195 | "keeping its original prior setup. " 196 | "If you wish to replace data instead, use replace=True." 197 | ) 198 | logger.info(msg) 199 | self.data[name]["totals"] += totals 200 | self.data[name]["sum_values"] += sum_values 201 | 202 | def add_variant_data( 203 | self, 204 | name: str, 205 | data: List[Union[float, int]], 206 | a_prior: Number = 0.1, 207 | b_prior: Number = 0.1, 208 | replace: bool = True, 209 | ) -> None: 210 | """ 211 | Add variant data to a test class using raw Exponential data. 212 | 213 | Default prior setup is set for Gamma(0.1, 0.1) which is non-information prior. 214 | 215 | Parameters 216 | ---------- 217 | name : Variant name.s 218 | data : List of Exponential data. 219 | a_prior : Prior alpha parameter of a Gamma distribution (conjugate prior). 220 | Default value 0.1 is on purpose to be vague (lower information). 221 | b_prior : Prior beta parameter (rate) of a Gamma distribution (conjugate prior). 222 | Default value 0.1 is on purpose to be vague (lower information). 223 | replace : Replace data if variant already exists. 224 | If set to False, data of existing variant will be appended to existing data. 225 | """ 226 | if len(data) == 0: 227 | raise ValueError("Data of added variant needs to have some observations.") 228 | if not min([i >= 0 for i in data]): 229 | raise ValueError("Input data needs to be a list of non-negative integers.") 230 | 231 | totals = len(data) 232 | sum_values = sum(data) 233 | 234 | self.add_variant_data_agg(name, totals, sum_values, a_prior, b_prior, replace) 235 | -------------------------------------------------------------------------------- /bayesian_testing/experiments/normal.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import List, Tuple 3 | 4 | import numpy as np 5 | 6 | from bayesian_testing.experiments.base import BaseDataTest 7 | from bayesian_testing.metrics import eval_normal_agg 8 | from bayesian_testing.utilities import get_logger 9 | 10 | logger = get_logger("bayesian_testing") 11 | 12 | 13 | class NormalDataTest(BaseDataTest): 14 | """ 15 | Class for Bayesian A/B test for Normal data. 16 | 17 | After class initialization, use add_variant methods to insert variant data. 18 | Then to get results of the test, use for instance `evaluate` method. 
19 | """ 20 | 21 | def __init__(self) -> None: 22 | """ 23 | Initialize NormalDataTest class. 24 | """ 25 | super().__init__() 26 | 27 | @property 28 | def totals(self): 29 | return [self.data[k]["totals"] for k in self.data] 30 | 31 | @property 32 | def sum_values(self): 33 | return [self.data[k]["sum_values"] for k in self.data] 34 | 35 | @property 36 | def sum_values_2(self): 37 | return [self.data[k]["sum_values_2"] for k in self.data] 38 | 39 | @property 40 | def m_priors(self): 41 | return [self.data[k]["m_prior"] for k in self.data] 42 | 43 | @property 44 | def a_priors_ig(self): 45 | return [self.data[k]["a_prior_ig"] for k in self.data] 46 | 47 | @property 48 | def b_priors_ig(self): 49 | return [self.data[k]["b_prior_ig"] for k in self.data] 50 | 51 | @property 52 | def w_priors(self): 53 | return [self.data[k]["w_prior"] for k in self.data] 54 | 55 | def eval_simulation( 56 | self, 57 | sim_count: int = 20000, 58 | seed: int = None, 59 | min_is_best: bool = False, 60 | interval_alpha: float = 0.95, 61 | ) -> Tuple[dict, dict, dict]: 62 | """ 63 | Calculate probabilities of being best, expected loss and credible intervals for a current 64 | class state. 65 | 66 | Parameters 67 | ---------- 68 | sim_count : Number of simulations to be used for probability estimation. 69 | seed : Random seed. 70 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 71 | interval_alpha : Credible interval probability (value between 0 and 1). 72 | 73 | Returns 74 | ------- 75 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 76 | res_loss : Dictionary with expected loss for all variants in experiment. 77 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 78 | """ 79 | pbbs, loss, intervals = eval_normal_agg( 80 | self.totals, 81 | self.sum_values, 82 | self.sum_values_2, 83 | sim_count=sim_count, 84 | m_priors=self.m_priors, 85 | a_priors_ig=self.a_priors_ig, 86 | b_priors_ig=self.b_priors_ig, 87 | w_priors=self.w_priors, 88 | seed=seed, 89 | min_is_best=min_is_best, 90 | interval_alpha=interval_alpha, 91 | ) 92 | res_pbbs = dict(zip(self.variant_names, pbbs)) 93 | res_loss = dict(zip(self.variant_names, loss)) 94 | res_intervals = dict(zip(self.variant_names, intervals)) 95 | 96 | return res_pbbs, res_loss, res_intervals 97 | 98 | def evaluate( 99 | self, 100 | sim_count: int = 20000, 101 | seed: int = None, 102 | min_is_best: bool = False, 103 | interval_alpha: float = 0.95, 104 | ) -> List[dict]: 105 | """ 106 | Evaluation of experiment. 107 | 108 | Parameters 109 | ---------- 110 | sim_count : Number of simulations to be used for probability estimation. 111 | seed : Random seed. 112 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 113 | interval_alpha : Credible interval probability (value between 0 and 1). 114 | 115 | Returns 116 | ------- 117 | res : List of dictionaries with results per variant. 
118 | """ 119 | keys = [ 120 | "variant", 121 | "totals", 122 | "sum_values", 123 | "avg_values", 124 | "posterior_mean", 125 | "credible_interval", 126 | "prob_being_best", 127 | "expected_loss", 128 | ] 129 | avg_values = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.totals)] 130 | posterior_mean = [ 131 | round((i[0] + i[3] * i[2]) / (i[1] + i[3]), 5) 132 | for i in zip(self.sum_values, self.totals, self.m_priors, self.w_priors) 133 | ] 134 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 135 | sim_count, seed, min_is_best, interval_alpha 136 | ) 137 | pbbs = list(eval_pbbs.values()) 138 | loss = list(eval_loss.values()) 139 | intervals = list(eval_intervals.values()) 140 | data = [ 141 | self.variant_names, 142 | self.totals, 143 | [round(i, 5) for i in self.sum_values], 144 | avg_values, 145 | posterior_mean, 146 | intervals, 147 | pbbs, 148 | loss, 149 | ] 150 | res = [dict(zip(keys, item)) for item in zip(*data)] 151 | 152 | return res 153 | 154 | def add_variant_data_agg( 155 | self, 156 | name: str, 157 | totals: int, 158 | sum_values: float, 159 | sum_values_2: float, 160 | m_prior: Number = 1, 161 | a_prior_ig: Number = 0, 162 | b_prior_ig: Number = 0, 163 | w_prior: Number = 0.01, 164 | replace: bool = True, 165 | ) -> None: 166 | """ 167 | Add variant data to test class using aggregated Normal data. 168 | This can be convenient as aggregation can be done on database level. 169 | 170 | The goal of default prior setup is to be low information. 171 | It should be tuned with caution. 172 | 173 | Parameters 174 | ---------- 175 | name : Variant name. 176 | totals : Total number of experiment observations (e.g. number of sessions). 177 | sum_values : Sum of values for a given variant. 178 | sum_values_2 : Sum of values squared for a given variant. 179 | m_prior : Prior normal mean. 180 | a_prior_ig : Prior alpha from inverse gamma dist. for unknown variance. 181 | In theory a > 0, but as we always have at least one observation, we can start at 0. 182 | b_prior_ig : Prior beta from inverse gamma dist. for unknown variance. 183 | In theory b > 0, but as we always have at least one observation, we can start at 0. 184 | w_prior : Prior effective sample sizes. 185 | replace : Replace data if variant already exists. 186 | If set to False, data of existing variant will be appended to existing data. 187 | """ 188 | if not isinstance(name, str): 189 | raise ValueError("Variant name has to be a string.") 190 | if m_prior < 0 or a_prior_ig < 0 or b_prior_ig < 0 or w_prior < 0: 191 | raise ValueError("All priors of [m, a_ig, b_ig, w] have to be non-negative numbers.") 192 | if totals <= 0: 193 | raise ValueError("Input variable 'totals' is expected to be positive integer.") 194 | 195 | if name not in self.variant_names: 196 | self.data[name] = { 197 | "totals": totals, 198 | "sum_values": sum_values, 199 | "sum_values_2": sum_values_2, 200 | "m_prior": m_prior, 201 | "a_prior_ig": a_prior_ig, 202 | "b_prior_ig": b_prior_ig, 203 | "w_prior": w_prior, 204 | } 205 | elif name in self.variant_names and replace: 206 | msg = ( 207 | f"Variant {name} already exists - new data is replacing it. " 208 | "If you wish to append instead, use replace=False." 
209 |             )
210 |             logger.info(msg)
211 |             self.data[name] = {
212 |                 "totals": totals,
213 |                 "sum_values": sum_values,
214 |                 "sum_values_2": sum_values_2,
215 |                 "m_prior": m_prior,
216 |                 "a_prior_ig": a_prior_ig,
217 |                 "b_prior_ig": b_prior_ig,
218 |                 "w_prior": w_prior,
219 |             }
220 |         elif name in self.variant_names and not replace:
221 |             msg = (
222 |                 f"Variant {name} already exists - new data is appended to variant, "
223 |                 "keeping its original prior setup. "
224 |                 "If you wish to replace data instead, use replace=True."
225 |             )
226 |             logger.info(msg)
227 |             self.data[name]["totals"] += totals
228 |             self.data[name]["sum_values"] += sum_values
229 |             self.data[name]["sum_values_2"] += sum_values_2
230 | 
231 |     def add_variant_data(
232 |         self,
233 |         name: str,
234 |         data: List[Number],
235 |         m_prior: Number = 1,
236 |         a_prior_ig: Number = 0,
237 |         b_prior_ig: Number = 0,
238 |         w_prior: Number = 0.01,
239 |         replace: bool = True,
240 |     ) -> None:
241 |         """
242 |         Add variant data to test class using raw Normal data.
243 | 
244 |         The goal of default prior setup is to be low information. It should be tuned with caution.
245 | 
246 |         Parameters
247 |         ----------
248 |         name : Variant name.
249 |         data : List of normal data.
250 |         m_prior : Prior mean.
251 |         a_prior_ig : Prior alpha from inverse gamma dist. for unknown variance.
252 |             In theory a > 0, but as we always have at least one observation, we can start at 0.
253 |         b_prior_ig : Prior beta from inverse gamma dist. for unknown variance.
254 |             In theory b > 0, but as we always have at least one observation, we can start at 0.
255 |         w_prior : Prior effective sample sizes.
256 |         replace : Replace data if variant already exists.
257 |             If set to False, data of existing variant will be appended to existing data.
258 |         """
259 |         if len(data) == 0:
260 |             raise ValueError("Data of added variant needs to have some observations.")
261 | 
262 |         totals = len(data)
263 |         sum_values = sum(data)
264 |         sum_values_2 = sum(np.square(data))
265 | 
266 |         self.add_variant_data_agg(
267 |             name,
268 |             totals,
269 |             sum_values,
270 |             sum_values_2,
271 |             m_prior,
272 |             a_prior_ig,
273 |             b_prior_ig,
274 |             w_prior,
275 |             replace,
276 |         )
277 | 
--------------------------------------------------------------------------------
/bayesian_testing/experiments/poisson.py:
--------------------------------------------------------------------------------
1 | from numbers import Number
2 | from typing import List, Tuple, Union
3 | 
4 | from bayesian_testing.experiments.base import BaseDataTest
5 | from bayesian_testing.metrics import eval_poisson_agg
6 | from bayesian_testing.utilities import get_logger
7 | 
8 | logger = get_logger("bayesian_testing")
9 | 
10 | 
11 | class PoissonDataTest(BaseDataTest):
12 |     """
13 |     Class for Bayesian A/B test for Poisson data (i.e. numbers of events, e.g. goals scored).
14 | 
15 |     After class initialization, use add_variant methods to insert variant data.
16 |     Then to get results of the test, use for instance `evaluate` method.
17 |     """
18 | 
19 |     def __init__(self) -> None:
20 |         """
21 |         Initialize PoissonDataTest class.
22 | """ 23 | super().__init__() 24 | 25 | @property 26 | def totals(self): 27 | return [self.data[k]["totals"] for k in self.data] 28 | 29 | @property 30 | def sum_values(self): 31 | return [self.data[k]["sum_values"] for k in self.data] 32 | 33 | @property 34 | def a_priors(self): 35 | return [self.data[k]["a_prior"] for k in self.data] 36 | 37 | @property 38 | def b_priors(self): 39 | return [self.data[k]["b_prior"] for k in self.data] 40 | 41 | def eval_simulation( 42 | self, 43 | sim_count: int = 20000, 44 | seed: int = None, 45 | min_is_best: bool = False, 46 | interval_alpha: float = 0.95, 47 | ) -> Tuple[dict, dict, dict]: 48 | """ 49 | Calculate probabilities of being best, expected loss and credible intervals for a current 50 | class state. 51 | 52 | Parameters 53 | ---------- 54 | sim_count : Number of simulations to be used for probability estimation. 55 | seed : Random seed. 56 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 57 | interval_alpha : Credible interval probability (value between 0 and 1). 58 | 59 | Returns 60 | ------- 61 | res_pbbs : Dictionary with probabilities of being best for all variants in experiment. 62 | res_loss : Dictionary with expected loss for all variants in experiment. 63 | res_intervals : Dictionary with quantile-based credible intervals for all variants. 64 | """ 65 | pbbs, loss, intervals = eval_poisson_agg( 66 | self.totals, 67 | self.sum_values, 68 | self.a_priors, 69 | self.b_priors, 70 | sim_count, 71 | seed, 72 | min_is_best, 73 | interval_alpha, 74 | ) 75 | res_pbbs = dict(zip(self.variant_names, pbbs)) 76 | res_loss = dict(zip(self.variant_names, loss)) 77 | res_intervals = dict(zip(self.variant_names, intervals)) 78 | 79 | return res_pbbs, res_loss, res_intervals 80 | 81 | def evaluate( 82 | self, 83 | sim_count: int = 20000, 84 | seed: int = None, 85 | min_is_best: bool = False, 86 | interval_alpha: float = 0.95, 87 | ) -> List[dict]: 88 | """ 89 | Evaluation of experiment. 90 | 91 | Parameters 92 | ---------- 93 | sim_count : Number of simulations to be used for probability estimation. 94 | seed : Random seed. 95 | min_is_best : Option to change "being best" to a minimum. Default is maximum. 96 | interval_alpha : Credible interval probability (value between 0 and 1). 97 | 98 | Returns 99 | ------- 100 | res : List of dictionaries with results per variant. 
101 | """ 102 | keys = [ 103 | "variant", 104 | "totals", 105 | "sum_values", 106 | "observed_average", 107 | "posterior_mean", 108 | "credible_interval", 109 | "prob_being_best", 110 | "expected_loss", 111 | ] 112 | observed_average = [round(i[0] / i[1], 5) for i in zip(self.sum_values, self.totals)] 113 | posterior_mean = [ 114 | round((i[2] + i[0]) / (i[3] + i[1]), 5) 115 | for i in zip(self.sum_values, self.totals, self.a_priors, self.b_priors) 116 | ] 117 | eval_pbbs, eval_loss, eval_intervals = self.eval_simulation( 118 | sim_count, seed, min_is_best, interval_alpha 119 | ) 120 | pbbs = list(eval_pbbs.values()) 121 | loss = list(eval_loss.values()) 122 | intervals = list(eval_intervals.values()) 123 | data = [ 124 | self.variant_names, 125 | self.totals, 126 | self.sum_values, 127 | observed_average, 128 | posterior_mean, 129 | intervals, 130 | pbbs, 131 | loss, 132 | ] 133 | res = [dict(zip(keys, item)) for item in zip(*data)] 134 | 135 | return res 136 | 137 | def add_variant_data_agg( 138 | self, 139 | name: str, 140 | totals: int, 141 | sum_values: Union[float, int], 142 | a_prior: Number = 0.1, 143 | b_prior: Number = 0.1, 144 | replace: bool = True, 145 | ) -> None: 146 | """ 147 | Add variant data to test class using aggregated Poisson data. 148 | This can be convenient as aggregation can be done on database level. 149 | 150 | Default prior setup is set for Gamma(0.1, 0.1) which is on purpose very vague prior. 151 | 152 | Parameters 153 | ---------- 154 | name : Variant name. 155 | totals : Total number of experiment observations (e.g. number of matches). 156 | sum_values : Sum of values for a given variant (e.g. total number of goals). 157 | a_prior : Prior alpha parameter of a Gamma distribution (conjugate prior). 158 | Default value 0.1 is on purpose to be vague (lower information). 159 | b_prior : Prior beta parameter (rate) of a Gamma distribution (conjugate prior). 160 | Default value 0.1 is on purpose to be vague (lower information). 161 | replace : Replace data if variant already exists. 162 | If set to False, data of existing variant will be appended to existing data. 163 | """ 164 | if not isinstance(name, str): 165 | raise ValueError("Variant name has to be a string.") 166 | if a_prior <= 0 or b_prior <= 0: 167 | raise ValueError("Both [a_prior, b_prior] have to be positive numbers.") 168 | if totals <= 0: 169 | raise ValueError("Input variable 'totals' is expected to be positive integer.") 170 | if sum_values < 0: 171 | raise ValueError("Input variable 'sum_values' is expected to be non-negative number.") 172 | 173 | if name not in self.variant_names: 174 | self.data[name] = { 175 | "totals": totals, 176 | "sum_values": sum_values, 177 | "a_prior": a_prior, 178 | "b_prior": b_prior, 179 | } 180 | elif name in self.variant_names and replace: 181 | msg = ( 182 | f"Variant {name} already exists - new data is replacing it. " 183 | "If you wish to append instead, use replace=False." 184 | ) 185 | logger.info(msg) 186 | self.data[name] = { 187 | "totals": totals, 188 | "sum_values": sum_values, 189 | "a_prior": a_prior, 190 | "b_prior": b_prior, 191 | } 192 | elif name in self.variant_names and not replace: 193 | msg = ( 194 | f"Variant {name} already exists - new data is appended to variant, " 195 | "keeping its original prior setup. " 196 | "If you wish to replace data instead, use replace=True." 
197 |             )
198 |             logger.info(msg)
199 |             self.data[name]["totals"] += totals
200 |             self.data[name]["sum_values"] += sum_values
201 | 
202 |     def add_variant_data(
203 |         self,
204 |         name: str,
205 |         data: List[int],
206 |         a_prior: Number = 0.1,
207 |         b_prior: Number = 0.1,
208 |         replace: bool = True,
209 |     ) -> None:
210 |         """
211 |         Add variant data to test class using raw Poisson data.
212 | 
213 |         Default prior setup is set for Gamma(0.1, 0.1) which is a non-informative (vague) prior.
214 | 
215 |         Parameters
216 |         ----------
217 |         name : Variant name.
218 |         data : List of Poisson data.
219 |         a_prior : Prior alpha parameter of a Gamma distribution (conjugate prior).
220 |             Default value 0.1 is on purpose to be vague (lower information).
221 |         b_prior : Prior beta parameter (rate) of a Gamma distribution (conjugate prior).
222 |             Default value 0.1 is on purpose to be vague (lower information).
223 |         replace : Replace data if variant already exists.
224 |             If set to False, data of existing variant will be appended to existing data.
225 |         """
226 |         if len(data) == 0:
227 |             raise ValueError("Data of added variant needs to have some observations.")
228 |         if not all(i >= 0 for i in data):
229 |             raise ValueError("Input data needs to be a list of non-negative integers.")
230 | 
231 |         totals = len(data)
232 |         sum_values = sum(data)
233 | 
234 |         self.add_variant_data_agg(name, totals, sum_values, a_prior, b_prior, replace)
235 | 
--------------------------------------------------------------------------------
/bayesian_testing/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluation import (
2 |     eval_bernoulli_agg,
3 |     eval_normal_agg,
4 |     eval_delta_lognormal_agg,
5 |     eval_numerical_dirichlet_agg,
6 |     eval_poisson_agg,
7 |     eval_delta_normal_agg,
8 |     eval_exponential_agg,
9 | )
10 | 
11 | __all__ = [
12 |     "eval_bernoulli_agg",
13 |     "eval_normal_agg",
14 |     "eval_delta_lognormal_agg",
15 |     "eval_delta_normal_agg",
16 |     "eval_numerical_dirichlet_agg",
17 |     "eval_poisson_agg",
18 |     "eval_exponential_agg",
19 | ]
20 | 
--------------------------------------------------------------------------------
/bayesian_testing/metrics/posteriors.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Union
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def beta_posteriors_all(
7 |     totals: List[int],
8 |     positives: List[int],
9 |     sim_count: int,
10 |     a_priors_beta: List[Union[float, int]],
11 |     b_priors_beta: List[Union[float, int]],
12 |     seed: Union[int, np.random.bit_generator.SeedSequence] = None,
13 | ) -> np.ndarray:
14 |     """
15 |     Draw from Beta posterior distributions for all variants at once.
16 | 
17 |     Parameters
18 |     ----------
19 |     totals : List of total experiment observations (e.g. number of sessions) for each variant.
20 |     positives : List of total number of ones (e.g. number of conversions) for each variant.
21 |     sim_count : Number of simulations to be used for probability estimation.
22 |     a_priors_beta : List of prior alpha parameters of Beta distributions for each variant.
23 |     b_priors_beta : List of prior beta parameters of Beta distributions for each variant.
24 |     seed : Random seed.
25 | 
26 |     Returns
27 |     -------
28 |     beta_samples : List of lists of beta distribution samples for all variants.
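(Aside: `beta_posteriors_all` below can also be used directly to estimate probabilities of being best by hand, in the spirit of the examples/session_data_manual_pbbs.ipynb notebook. A sketch:)

```python
import numpy as np
from bayesian_testing.metrics.posteriors import beta_posteriors_all

samples = beta_posteriors_all(
    totals=[1000, 1000],
    positives=[100, 120],
    sim_count=20000,
    a_priors_beta=[0.5, 0.5],
    b_priors_beta=[0.5, 0.5],
    seed=52,
)
best = samples.argmax(axis=0)  # index of the winning variant in each simulation
pbbs = np.bincount(best, minlength=samples.shape[0]) / samples.shape[1]
```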
29 | """ 30 | rng = np.random.default_rng(seed) 31 | 32 | beta_samples = np.array( 33 | [ 34 | rng.beta( 35 | positives[i] + a_priors_beta[i], 36 | totals[i] - positives[i] + b_priors_beta[i], 37 | sim_count, 38 | ) 39 | for i in range(len(totals)) 40 | ] 41 | ) 42 | return beta_samples 43 | 44 | 45 | def normal_posteriors( 46 | total: int, 47 | sums: float, 48 | sums_2: float, 49 | sim_count: int = 20000, 50 | prior_m: Union[float, int] = 1, 51 | prior_a: Union[float, int] = 0, 52 | prior_b: Union[float, int] = 0, 53 | prior_w: Union[float, int] = 0.01, 54 | seed: Union[int, np.random.bit_generator.SeedSequence] = None, 55 | ) -> Tuple[List[Union[float, int]], List[Union[float, int]]]: 56 | """ 57 | Drawing mus and sigmas from posterior Normal distribution considering given aggregated data. 58 | 59 | Parameters 60 | ---------- 61 | total : Number of data observations from normal data. 62 | sums : Sum of original data. 63 | sums_2 : Sum of squares of original data. 64 | sim_count : Number of simulations. 65 | prior_m : Prior mean. 66 | prior_a : Prior alpha from inverse gamma dist. for unknown variance of original data. 67 | In theory a > 0, but as we always have at least one observation, we can start at 0. 68 | prior_b : Prior beta from inverse gamma dist. for unknown variance of original data. 69 | In theory b > 0, but as we always have at least one observation, we can start at 0. 70 | prior_w : Prior effective sample size. 71 | seed : Random seed. 72 | 73 | Returns 74 | ------- 75 | mu_post : List of size sim_count with mus drawn from normal distribution. 76 | sig_2_post : List of size sim_count with mus drawn from normal distribution. 77 | """ 78 | rng = np.random.default_rng(seed) 79 | 80 | x_bar = sums / total 81 | a_post = prior_a + (total / 2) 82 | b_post = ( 83 | prior_b 84 | + (1 / 2) * (sums_2 - 2 * sums * x_bar + total * (x_bar**2)) 85 | + ((total * prior_w) / (2 * (total + prior_w))) * ((x_bar - prior_m) ** 2) 86 | ) 87 | 88 | # here it has to be 1/b as it is a scale, and not a rate 89 | sig_2_post = 1 / rng.gamma(a_post, 1 / b_post, sim_count) 90 | 91 | m_post = (total * x_bar + prior_w * prior_m) / (total + prior_w) 92 | 93 | mu_post = rng.normal(m_post, np.sqrt(sig_2_post / (total + prior_w))) 94 | 95 | return mu_post, sig_2_post 96 | 97 | 98 | def lognormal_posteriors( 99 | total: int, 100 | sum_logs: float, 101 | sum_logs_2: float, 102 | sim_count: int = 20000, 103 | prior_m: Union[float, int] = 1, 104 | prior_a: Union[float, int] = 0, 105 | prior_b: Union[float, int] = 0, 106 | prior_w: Union[float, int] = 0.01, 107 | seed: Union[int, np.random.bit_generator.SeedSequence] = None, 108 | ) -> List[float]: 109 | """ 110 | Drawing from posterior LogNormal distribution using logarithms of original (lognormal) data 111 | (logarithms of lognormal data are normal). Input data is in aggregated form. 112 | 113 | Parameters 114 | ---------- 115 | total : Number of lognormal data observations. 116 | Could be number of conversions in session data. 117 | sum_logs : Sum of logarithms of original data. 118 | sum_logs_2 : Sum of logarithms squared of original data. 119 | sim_count : Number of simulations. 120 | prior_m : Prior mean of logarithms of original data. 121 | prior_a : Prior alpha from inverse gamma dist. for unknown variance of logarithms 122 | of original data. In theory a > 0, but as we always have at least one observation, 123 | we can start at 0. 124 | prior_b : Prior beta from inverse gamma dist. for unknown variance of logarithms 125 | of original data. 
In theory b > 0, but as we always have at least one observation, 126 | we can start at 0. 127 | prior_w : Prior effective sample size. 128 | seed : Random seed. 129 | 130 | Returns 131 | ------- 132 | res : List of sim_count numbers drawn from lognormal distribution. 133 | """ 134 | if total <= 0: 135 | return list(np.zeros(sim_count)) 136 | 137 | # normal posterior for aggregated data of logarithms of original data 138 | normal_mu_post, normal_sig_2_post = normal_posteriors( 139 | total, sum_logs, sum_logs_2, sim_count, prior_m, prior_a, prior_b, prior_w, seed 140 | ) 141 | 142 | # final simulated lognormal means using simulated normal means and sigmas 143 | res = np.exp(normal_mu_post + (normal_sig_2_post / 2)) 144 | 145 | return res 146 | 147 | 148 | def dirichlet_posteriors( 149 | concentration: List[int], 150 | prior: List[Union[float, int]], 151 | sim_count: int = 20000, 152 | seed: Union[int, np.random.bit_generator.SeedSequence] = None, 153 | ) -> np.ndarray: 154 | """ 155 | Drawing from Dirichlet posterior for a single variant. 156 | 157 | Parameters 158 | ---------- 159 | concentration : List of numbers of observation for each possible category. 160 | In dice example it would be numbers of observations for each possible face. 161 | prior : List of prior values for each category in dirichlet distribution. 162 | sim_count : Number of simulations. 163 | seed : Random seed. 164 | 165 | Returns 166 | ------- 167 | res : List of lists of dirichlet samples. 168 | """ 169 | rng = np.random.default_rng(seed) 170 | 171 | posterior_concentration = [sum(x) for x in zip(prior, concentration)] 172 | res = rng.dirichlet(posterior_concentration, sim_count) 173 | 174 | return res 175 | 176 | 177 | def pois_gamma_posteriors_all( 178 | totals: List[int], 179 | sums: List[Union[float, int]], 180 | sim_count: int, 181 | a_priors_gamma: List[Union[float, int]], 182 | b_priors_gamma: List[Union[float, int]], 183 | seed: Union[int, np.random.bit_generator.SeedSequence] = None, 184 | ) -> np.ndarray: 185 | """ 186 | Draw from Gamma posterior distributions for all variants of Poisson data at once. 187 | 188 | Parameters 189 | ---------- 190 | totals : List of total experiment observations (e.g. number of matches) for each variant. 191 | sums : List of sums of observations (e.g. number of goals) for each variant. 192 | sim_count : Number of simulations to be used for probability estimation. 193 | a_priors_gamma : List of prior alpha parameters of Gamma distributions for each variant. 194 | b_priors_gamma : List of prior beta parameters (rates) of Gamma distributions for each variant. 195 | seed : Random seed. 196 | 197 | Returns 198 | ------- 199 | gamma_samples : List of lists of Gamma distribution samples for all variants. 200 | """ 201 | rng = np.random.default_rng(seed) 202 | 203 | gamma_samples = np.array( 204 | [ 205 | rng.gamma( 206 | sums[i] + a_priors_gamma[i], 207 | # here it has to be 1/(...) as it is a scale, and not a rate 208 | 1 / (totals[i] + b_priors_gamma[i]), 209 | sim_count, 210 | ) 211 | for i in range(len(totals)) 212 | ] 213 | ) 214 | return gamma_samples 215 | 216 | 217 | def exp_gamma_posteriors_all( 218 | totals: List[int], 219 | sums: List[Union[float, int]], 220 | sim_count: int, 221 | a_priors_gamma: List[Union[float, int]], 222 | b_priors_gamma: List[Union[float, int]], 223 | seed: Union[int, np.random.bit_generator.SeedSequence] = None, 224 | ) -> np.ndarray: 225 | """ 226 | Draw from Gamma posterior distributions for all variants of Exponential data at once. 
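(Aside: the `np.exp(mu + sig_2 / 2)` step in `lognormal_posteriors` above uses the lognormal mean identity E[X] = exp(μ + σ²/2) for log X ~ N(μ, σ²). A quick numerical check, a sketch rather than repo code:)

```python
import numpy as np

rng = np.random.default_rng(3)
mu, sigma = 1.0, 0.5
x = rng.lognormal(mu, sigma, 1_000_000)
print(x.mean())                   # ≈ 3.08 from simulation
print(np.exp(mu + sigma**2 / 2))  # 3.0802..., the exact lognormal mean
```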
227 | 228 | Parameters 229 | ---------- 230 | totals : List of total experiment observations (e.g. number of sessions) for each variant. 231 | sums : List of sums of observations (e.g. total time spent) for each variant. 232 | sim_count : Number of simulations to be used for probability estimation. 233 | a_priors_gamma : List of prior alpha parameters of Gamma distributions for each variant. 234 | b_priors_gamma : List of prior beta parameters (rates) of Gamma distributions for each variant. 235 | seed : Random seed. 236 | 237 | Returns 238 | ------- 239 | gamma_samples : List of lists of Gamma distribution samples for all variants. 240 | """ 241 | rng = np.random.default_rng(seed) 242 | 243 | gamma_samples = np.array( 244 | [ 245 | rng.gamma( 246 | totals[i] + a_priors_gamma[i], 247 | # here it has to be 1/(...) as it is a scale, and not a rate 248 | 1 / (sums[i] + b_priors_gamma[i]), 249 | sim_count, 250 | ) 251 | for i in range(len(totals)) 252 | ] 253 | ) 254 | return gamma_samples 255 | -------------------------------------------------------------------------------- /bayesian_testing/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .logging import get_logger 2 | 3 | __all__ = ["get_logger"] 4 | -------------------------------------------------------------------------------- /bayesian_testing/utilities/common.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | def check_list_lengths(lists: List[List]) -> None: 5 | """ 6 | Check if input lists are all of same length. 7 | Parameters 8 | ---------- 9 | lists : List of lists of different possible types. 10 | """ 11 | it = iter(lists) 12 | the_len = len(next(it)) 13 | if not all(len(i) == the_len for i in it): 14 | raise ValueError("Not all lists have same length!") 15 | -------------------------------------------------------------------------------- /bayesian_testing/utilities/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,bayesian_testing 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=consoleHandler 13 | 14 | [logger_bayesian_testing] 15 | level=INFO 16 | handlers=consoleHandler 17 | qualname=bayesian_testing 18 | propagate=0 19 | 20 | [handler_consoleHandler] 21 | class=StreamHandler 22 | level=INFO 23 | formatter=simpleFormatter 24 | args=(sys.stdout,) 25 | 26 | [formatter_simpleFormatter] 27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s 28 | -------------------------------------------------------------------------------- /bayesian_testing/utilities/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | from os import path 4 | 5 | log_file_path = path.join(path.dirname(path.abspath(__file__)), "logging.conf") 6 | 7 | logging.config.fileConfig(log_file_path, disable_existing_loggers=False) 8 | 9 | 10 | def get_logger(logger_name): 11 | return logging.getLogger(logger_name) 12 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Matt52/bayesian-testing/cea9afa5d7e3321d159d7b387ff57803467a18d5/codecov.yml 
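(Aside: both Gamma samplers above flag the same gotcha in their comments — NumPy's `Generator.gamma` is parametrized by shape and *scale*, while the conjugate updates are written in terms of a rate, so the rate must be inverted before sampling. A quick check:)

```python
import numpy as np

rng = np.random.default_rng(0)
shape, rate = 5.0, 2.0
samples = rng.gamma(shape, 1 / rate, 1_000_000)
print(samples.mean())  # ≈ 2.5 = shape / rate, as expected for Gamma(shape, rate)
```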
-------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | -------------------------------------------------------------------------------- /examples/dice_rolls_ab_testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "45ce22be-8ae0-4b0e-bce5-9e9aab6f105f", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from IPython.core.interactiveshell import InteractiveShell\n", 13 | "InteractiveShell.ast_node_interactivity = \"all\"\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "np.set_printoptions(legacy=\"1.25\")\n", 17 | "import pandas as pd\n", 18 | "from bayesian_testing.experiments import DiscreteDataTest" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "f888f299-69a0-4f3c-bd57-af3a59bedba0", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "Generator(PCG64) at 0x132BA19E0" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "np.random.default_rng(52)\n", 42 | "\n", 43 | "values = [1,2,3,4,5,6]" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "24a15d66-d928-432c-beb3-e25e3be10cc0", 50 | "metadata": { 51 | "tags": [] 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "die_A_rolls = list(np.random.choice(values, 1000, p=[1/6, 1/6, 1/6, 1/6, 1/6, 1/6]))\n", 56 | "die_B_rolls = list(np.random.choice(values, 1200, p=[0.2, 0.2, 0.1, 0.1, 0.2, 0.2]))\n", 57 | "die_C_rolls = list(np.random.choice(values, 500, p=[0.2, 0.1, 0.1, 0.2, 0.2, 0.2]))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "id": "35989040-af25-4129-9678-de04c0397c32", 64 | "metadata": { 65 | "tags": [] 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "discrete_test = DiscreteDataTest(values)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "id": "e902885d-7382-42c8-af7f-1d82fba06bb4", 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "discrete_test.add_variant_data('A', die_A_rolls)\n", 82 | "discrete_test.add_variant_data('B', die_B_rolls)\n", 83 | "discrete_test.add_variant_data('C', die_C_rolls)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "id": "a3ee97a1-d48c-407b-b13c-5cfb11e6591f", 90 | "metadata": { 91 | "tags": [] 92 | }, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "{'A': {'concentration': [168.0, 166.0, 176.0, 172.0, 168.0, 150.0],\n", 98 | " 'prior': [1, 1, 1, 1, 1, 1]},\n", 99 | " 'B': {'concentration': [256.0, 246.0, 111.0, 116.0, 239.0, 232.0],\n", 100 | " 'prior': [1, 1, 1, 1, 1, 1]},\n", 101 | " 'C': {'concentration': [84.0, 57.0, 58.0, 100.0, 100.0, 101.0],\n", 102 | " 'prior': [1, 1, 1, 1, 1, 1]}}" 103 | ] 104 | }, 105 | "execution_count": 6, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "discrete_test.data" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "id": "23484578-dc84-4325-9aa0-7a1498ee161b", 118 | "metadata": { 119 | "tags": [] 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "{'A': 0.00065, 'B': 0.00035, 'C': 
0.999}" 126 | ] 127 | }, 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "discrete_test.probabs_of_being_best(sim_count = 20000, seed=52)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 8, 140 | "id": "7001814b-7705-420a-813d-b65393e68288", 141 | "metadata": { 142 | "tags": [] 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "{'A': 0.2964593, 'B': 0.309296, 'C': 3.45e-05}" 149 | ] 150 | }, 151 | "execution_count": 8, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "discrete_test.expected_loss(sim_count = 20000, seed=52)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 9, 163 | "id": "86cb2b3b-cc93-489f-ae1d-7becac229c33", 164 | "metadata": { 165 | "tags": [] 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n", 173 | "| variant | concentration | average_value | posterior_mean | credible_interval | prob_being_best | expected_loss |\n", 174 | "+===========+==============================================================+=================+==================+========================+===================+=================+\n", 175 | "| A | {1: 168.0, 2: 166.0, 3: 176.0, 4: 172.0, 5: 168.0, 6: 150.0} | 3.456 | 3.45626 | [3.3530612, 3.559381] | 0.0006 | 0.296753 |\n", 176 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n", 177 | "| B | {1: 256.0, 2: 246.0, 3: 111.0, 4: 116.0, 5: 239.0, 6: 232.0} | 3.44333 | 3.44362 | [3.3386877, 3.5493953] | 0.0006 | 0.309481 |\n", 178 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n", 179 | "| C | {1: 84.0, 2: 57.0, 3: 58.0, 4: 100.0, 5: 100.0, 6: 101.0} | 3.756 | 3.75296 | [3.5993774, 3.904388] | 0.9988 | 4.27e-05 |\n", 180 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "results = discrete_test.evaluate()\n", 186 | "print(pd.DataFrame(results).to_markdown(tablefmt=\"grid\", index=False))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 10, 192 | "id": "616e35ba-26d3-4d10-ad65-4dc37e5771a6", 193 | "metadata": { 194 | "tags": [] 195 | }, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "{'A': 0.4319, 'B': 0.568, 'C': 0.0001}" 201 | ] 202 | }, 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | }, 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "{'A': 0.0371495, 'B': 0.0243128, 'C': 0.3335743}" 211 | ] 212 | }, 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | }, 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n", 222 
| "| variant | concentration | average_value | posterior_mean | credible_interval | prob_being_best | expected_loss |\n", 223 | "+===========+==============================================================+=================+==================+========================+===================+=================+\n", 224 | "| A | {1: 168.0, 2: 166.0, 3: 176.0, 4: 172.0, 5: 168.0, 6: 150.0} | 3.456 | 3.45626 | [3.3515318, 3.5614544] | 0.4304 | 0.0370878 |\n", 225 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n", 226 | "| B | {1: 256.0, 2: 246.0, 3: 111.0, 4: 116.0, 5: 239.0, 6: 232.0} | 3.44333 | 3.44362 | [3.3376023, 3.5515158] | 0.56955 | 0.0246001 |\n", 227 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n", 228 | "| C | {1: 84.0, 2: 57.0, 3: 58.0, 4: 100.0, 5: 100.0, 6: 101.0} | 3.756 | 3.75296 | [3.6002351, 3.9037053] | 5e-05 | 0.33356 |\n", 229 | "+-----------+--------------------------------------------------------------+-----------------+------------------+------------------------+-------------------+-----------------+\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "# reversed test (where minimum is best)\n", 235 | "discrete_test.probabs_of_being_best(sim_count = 20000, seed=52, min_is_best=True)\n", 236 | "discrete_test.expected_loss(sim_count = 20000, seed=52, min_is_best=True)\n", 237 | "results_min = discrete_test.evaluate(min_is_best=True)\n", 238 | "print(pd.DataFrame(results_min).to_markdown(tablefmt=\"grid\", index=False))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "19b56eb8-143e-47aa-9a22-a2473f91cfa1", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 3 (ipykernel)", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.10.12" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 5 271 | } 272 | -------------------------------------------------------------------------------- /examples/goals_scored_ab_testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c5f8cedc-94d6-4805-90d4-466d4de6b293", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from IPython.core.interactiveshell import InteractiveShell\n", 13 | "InteractiveShell.ast_node_interactivity = \"all\"\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "from bayesian_testing.experiments import PoissonDataTest" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "8e57546e-4b90-4c89-8668-aafe4aff6485", 24 | "metadata": { 25 | "tags": [] 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "Generator(PCG64) at 0x111F9C660" 32 | ] 33 | }, 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | 
"np.random.default_rng(52)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "1cc3f939-43a0-4d19-af63-9ae632861dee", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "# goals scored - more is better (duh...)\n", 53 | "psg_goals_for = [5, 5, 7, 1, 3, 3, 1, 1, 2, 0, 1, 3, 4, 2, 5]\n", 54 | "city_goals_for = [2, 4, 3, 4, 6, 1, 3, 6, 4, 0, 3, 1, 2, 1]\n", 55 | "bayern_goals_for = [6, 2, 7, 1, 1, 2, 0, 4, 2, 5, 2, 6, 3, 6, 2]\n", 56 | "\n", 57 | "\n", 58 | "# goals received - so less is better\n", 59 | "psg_goals_against = [0, 2, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 3, 1, 0]\n", 60 | "city_goals_against = [0, 0, 3, 2, 0, 1, 0, 3, 0, 1, 1, 0, 1, 2]\n", 61 | "bayern_goals_against = [1, 0, 0, 1, 1, 2, 1, 0, 2, 0, 0, 2, 2, 1, 0]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "id": "fe532f01-6c91-4462-9213-e33379be1f9e", 68 | "metadata": { 69 | "tags": [] 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "# Poisson test for \"goals for\"\n", 74 | "poisson_test_gf = PoissonDataTest()\n", 75 | "poisson_test_gf.add_variant_data('psg', psg_goals_for)\n", 76 | "# adding \"city\" with effective sample size 10 and the prior mean 2 (20/10):\n", 77 | "poisson_test_gf.add_variant_data('city', city_goals_for, a_prior=20, b_prior=10)\n", 78 | "# adding \"bayern\" with aggregated data instead of list of all observations\n", 79 | "poisson_test_gf.add_variant_data_agg('bayern', totals=len(bayern_goals_for), sum_values=sum(bayern_goals_for))\n", 80 | "\n", 81 | "\n", 82 | "# Poisson test for \"goals against\"\n", 83 | "poisson_test_ga = PoissonDataTest()\n", 84 | "poisson_test_ga.add_variant_data('psg', psg_goals_against)\n", 85 | "poisson_test_ga.add_variant_data('city', city_goals_against)\n", 86 | "poisson_test_ga.add_variant_data('bayern', bayern_goals_against)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "id": "93cd6353-01c6-4873-a62e-9816932679fe", 93 | "metadata": { 94 | "tags": [] 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n", 102 | "| variant | totals | sum_values | observed_average | posterior_mean | credible_interval | prob_being_best | expected_loss |\n", 103 | "+===========+==========+==============+====================+==================+========================+===================+=================+\n", 104 | "| psg | 15 | 43 | 2.86667 | 2.8543 | [2.0701365, 3.7817813] | 0.24485 | 0.512094 |\n", 105 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n", 106 | "| city | 14 | 40 | 2.85714 | 2.5 | [1.9035733, 3.1737824] | 0.04655 | 0.870001 |\n", 107 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n", 108 | "| bayern | 15 | 49 | 3.26667 | 3.25166 | [2.4038302, 4.2176997] | 0.7086 | 0.109746 |\n", 109 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# poisson_test_gf.probabs_of_being_best(sim_count = 20000, seed=52)\n", 115 | "# poisson_test_gf.expected_loss(sim_count = 20000, seed=52)\n", 116 | "results_gf = 
poisson_test_gf.evaluate()\n", 117 | "print(pd.DataFrame(results_gf).to_markdown(tablefmt=\"grid\", index=False))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "id": "39220217-6553-4f88-b537-064ade561996", 124 | "metadata": { 125 | "tags": [] 126 | }, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n", 133 | "| variant | totals | sum_values | observed_average | posterior_mean | credible_interval | prob_being_best | expected_loss |\n", 134 | "+===========+==========+==============+====================+==================+========================+===================+=================+\n", 135 | "| psg | 15 | 9 | 0.6 | 0.60265 | [0.2140532, 1.2324781] | 0.756 | 0.0425375 |\n", 136 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n", 137 | "| city | 14 | 14 | 1 | 1 | [0.4487859, 1.8478473] | 0.07585 | 0.439937 |\n", 138 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n", 139 | "| bayern | 15 | 13 | 0.86667 | 0.86755 | [0.3680665, 1.6067354] | 0.16815 | 0.30884 |\n", 140 | "+-----------+----------+--------------+--------------------+------------------+------------------------+-------------------+-----------------+\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "# poisson_test_ga.probabs_of_being_best(sim_count = 20000, seed=52, min_is_best=True)\n", 146 | "# poisson_test_ga.expected_loss(sim_count = 20000, seed=52, min_is_best=True)\n", 147 | "results_ga = poisson_test_ga.evaluate(min_is_best=True, interval_alpha=0.99)\n", 148 | "print(pd.DataFrame(results_ga).to_markdown(tablefmt=\"grid\", index=False))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "d36d68d3-d119-49a4-b757-016da25f6f28", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3 (ipykernel)", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.10.12" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 5 181 | } 182 | -------------------------------------------------------------------------------- /examples/session_data_manual_pbbs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8b11e1e0-ccc4-4fc9-9cdd-9f906e64b1c7", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from IPython.core.interactiveshell import InteractiveShell\n", 13 | "InteractiveShell.ast_node_interactivity = \"all\"\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "np.set_printoptions(legacy=\"1.25\")\n", 17 | "import pandas as pd\n", 18 | "from bayesian_testing.metrics import eval_bernoulli_agg, eval_delta_lognormal_agg" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": 
"b31da712-cdb9-4671-b3ed-63e351896915", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "df = pd.read_csv(\"data/session_data.csv\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "6032fc6e-3a4c-47f3-830f-1a85d49c253c", 37 | "metadata": { 38 | "tags": [] 39 | }, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "94500" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | }, 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | "
conversiondaterevenuesourcevariant
002021-08-070.000000desktopB
112021-08-057.241015desktopC
202021-08-060.000000desktopA
302021-08-050.000000desktopC
402021-08-030.000000desktopA
\n", 122 | "
" 123 | ], 124 | "text/plain": [ 125 | " conversion date revenue source variant\n", 126 | "0 0 2021-08-07 0.000000 desktop B\n", 127 | "1 1 2021-08-05 7.241015 desktop C\n", 128 | "2 0 2021-08-06 0.000000 desktop A\n", 129 | "3 0 2021-08-05 0.000000 desktop C\n", 130 | "4 0 2021-08-03 0.000000 desktop A" 131 | ] 132 | }, 133 | "execution_count": 3, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "# example session data - each row represent one session\n", 140 | "len(df)\n", 141 | "df.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 4, 147 | "id": "744e5833-cbc3-45d3-963d-11c2a92acff2", 148 | "metadata": { 149 | "tags": [] 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "
\n", 156 | "\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | "
sessionsconversionsrevenueconversion_raterevenue_per_sessionrevenue_per_converted_sessions
variant
A31500158030830.0256130.0501590.97873119.512674
B32000170035203.2168880.0531251.10010120.707775
C31000155037259.5633640.0500001.20192124.038428
\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " sessions conversions revenue conversion_rate \\\n", 224 | "variant \n", 225 | "A 31500 1580 30830.025613 0.050159 \n", 226 | "B 32000 1700 35203.216888 0.053125 \n", 227 | "C 31000 1550 37259.563364 0.050000 \n", 228 | "\n", 229 | " revenue_per_session revenue_per_converted_sessions \n", 230 | "variant \n", 231 | "A 0.978731 19.512674 \n", 232 | "B 1.100101 20.707775 \n", 233 | "C 1.201921 24.038428 " 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "# summary statistics per variant\n", 243 | "\n", 244 | "summary = df.groupby('variant')[['variant', 'conversion', 'revenue']]\\\n", 245 | " .agg({'variant': 'count', 'conversion': 'sum','revenue': 'sum'})\\\n", 246 | " .rename(columns = {'variant': 'sessions', 'conversion': 'conversions'})\n", 247 | "\n", 248 | "summary['conversion_rate'] = summary['conversions'] / summary['sessions']\n", 249 | "summary['revenue_per_session'] = summary['revenue'] / summary['sessions']\n", 250 | "summary['revenue_per_converted_sessions'] = summary['revenue'] / summary['conversions']\n", 251 | "\n", 252 | "summary" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 5, 258 | "id": "be57dc82-8958-4118-aab0-71122490d17a", 259 | "metadata": { 260 | "tags": [] 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "[31500, 32000, 31000]" 267 | ] 268 | }, 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | }, 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "[1580, 1700, 1550]" 277 | ] 278 | }, 279 | "execution_count": 5, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | }, 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[3831.806394737816, 4211.72986767986, 4055.965234848171]" 287 | ] 288 | }, 289 | "execution_count": 5, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | }, 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "[11029.923165846496, 12259.51868396913, 12357.911862914]" 297 | ] 298 | }, 299 | "execution_count": 5, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "variant_A = df['revenue'][(df.variant == 'A')].values\n", 306 | "variant_B = df['revenue'][(df.variant == 'B')].values\n", 307 | "variant_C = df['revenue'][(df.variant == 'C')].values\n", 308 | "\n", 309 | "sessions = [\n", 310 | " variant_A.size,\n", 311 | " variant_B.size,\n", 312 | " variant_C.size\n", 313 | "]\n", 314 | "\n", 315 | "conversions = [\n", 316 | " sum(variant_A > 0),\n", 317 | " sum(variant_B > 0),\n", 318 | " sum(variant_C > 0)\n", 319 | "]\n", 320 | "\n", 321 | "sum_log_revenue = [\n", 322 | " np.log(variant_A[variant_A > 0]).sum(),\n", 323 | " np.log(variant_B[variant_B > 0]).sum(),\n", 324 | " np.log(variant_C[variant_C > 0]).sum()\n", 325 | "]\n", 326 | "\n", 327 | "sum_log_2_revenue = [\n", 328 | " np.square(np.log(variant_A[variant_A > 0])).sum(),\n", 329 | " np.square(np.log(variant_B[variant_B > 0])).sum(),\n", 330 | " np.square(np.log(variant_C[variant_C > 0])).sum()\n", 331 | "]\n", 332 | "\n", 333 | "sessions\n", 334 | "conversions\n", 335 | "sum_log_revenue\n", 336 | "sum_log_2_revenue" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "a68cc3a7-1c6e-40c4-b5af-59a7fb9fb548", 342 | "metadata": {}, 343 | "source": [ 344 | "## Results" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 6, 350 | "id": 
"3ade0625-fb50-434f-93f6-e70c3c543713", 351 | "metadata": { 352 | "tags": [] 353 | }, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "Probabilities of being best: [0.043, 0.92335, 0.03365]\n", 360 | "Expected loss: [0.0030022, 5.89e-05, 0.0031487]\n", 361 | "95% credible intervals: [[0.0477987, 0.0525911], [0.0506903, 0.0556017], [0.0476257, 0.0524881]]\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "# conversion rate probabilities of being best, expected loss and credible intervals for each variant\n", 367 | "pbbs, loss, intervals = eval_bernoulli_agg(sessions, conversions)\n", 368 | "print(f\"Probabilities of being best: {pbbs}\")\n", 369 | "print(f\"Expected loss: {loss}\")\n", 370 | "print(f\"95% credible intervals: {intervals}\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 7, 376 | "id": "21c3ae3c-46b3-4bc7-bd33-5306d3e20506", 377 | "metadata": { 378 | "tags": [] 379 | }, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "Probabilities of being best: [0.0002, 0.03395, 0.96585]\n", 386 | "Expected loss: [0.2212336, 0.1210695, 0.0008982]\n", 387 | "95% credible intervals: [[0.9086416, 1.0649507], [1.0043019, 1.170394], [1.1094296, 1.3069562]]\n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "# revenue per session probabilities of being best, expected loss and credible intervals for each variant\n", 393 | "pbbs, loss, intervals = eval_delta_lognormal_agg(sessions, conversions, sum_log_revenue, sum_log_2_revenue)\n", 394 | "print(f\"Probabilities of being best: {pbbs}\")\n", 395 | "print(f\"Expected loss: {loss}\")\n", 396 | "print(f\"95% credible intervals: {intervals}\")" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "id": "bdb6c1f2-f144-4cfa-9808-b429ceed6354", 402 | "metadata": {}, 403 | "source": [ 404 | "### Results for \"being best\" = \"being minimum\"" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 8, 410 | "id": "b651c9d9-6d51-4ad1-aabb-475296963a88", 411 | "metadata": { 412 | "tags": [] 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "Probabilities of being best: [0.4572, 0.00945, 0.53335]\n", 420 | "Expected loss: [0.0007868, 0.00374, 0.00062]\n", 421 | "95% credible intervals: [[0.0478316, 0.0526332], [0.050685, 0.0556378], [0.0476584, 0.0524571]]\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "# conversion rate probabilities of being best, expected loss and credible intervals for each variant\n", 427 | "pbbs, loss, intervals = eval_bernoulli_agg(sessions, conversions, min_is_best=True)\n", 428 | "print(f\"Probabilities of being best: {pbbs}\")\n", 429 | "print(f\"Expected loss: {loss}\")\n", 430 | "print(f\"95% credible intervals: {intervals}\")" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 9, 436 | "id": "f6573b6b-314e-49de-ac63-c75201eac707", 437 | "metadata": { 438 | "tags": [] 439 | }, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "Probabilities of being best: [0.95695, 0.04285, 0.0002]\n", 446 | "Expected loss: [0.0010886, 0.1012619, 0.2202282]\n", 447 | "95% credible intervals: [[0.9073725, 1.0666041], [1.0044587, 1.1692741], [1.1082288, 1.305592]]\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "# revenue per session probabilities of being best, expected loss and credible intervals for each variant\n", 453 | 
"pbbs, loss, intervals = eval_delta_lognormal_agg(sessions, conversions, sum_log_revenue, sum_log_2_revenue, min_is_best=True)\n", 454 | "print(f\"Probabilities of being best: {pbbs}\")\n", 455 | "print(f\"Expected loss: {loss}\")\n", 456 | "print(f\"95% credible intervals: {intervals}\")" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "id": "bd9b3af3-d34c-4781-a05a-d94a7bc7ee1c", 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [] 466 | } 467 | ], 468 | "metadata": { 469 | "kernelspec": { 470 | "display_name": "Python 3 (ipykernel)", 471 | "language": "python", 472 | "name": "python3" 473 | }, 474 | "language_info": { 475 | "codemirror_mode": { 476 | "name": "ipython", 477 | "version": 3 478 | }, 479 | "file_extension": ".py", 480 | "mimetype": "text/x-python", 481 | "name": "python", 482 | "nbconvert_exporter": "python", 483 | "pygments_lexer": "ipython3", 484 | "version": "3.10.12" 485 | } 486 | }, 487 | "nbformat": 4, 488 | "nbformat_minor": 5 489 | } 490 | -------------------------------------------------------------------------------- /examples/waiting_time_ab_testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "ffef50bb-d334-438c-b170-4d70c2d6d19e", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from IPython.core.interactiveshell import InteractiveShell\n", 13 | "InteractiveShell.ast_node_interactivity = \"all\"\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "np.set_printoptions(legacy=\"1.25\")\n", 17 | "import pandas as pd\n", 18 | "from bayesian_testing.experiments import ExponentialDataTest" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "133748f0-26cd-4647-9cf3-e0b7646a51af", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "Generator(PCG64) at 0x132F99AC0" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# optionally stabilize the random seed:\n", 42 | "np.random.default_rng(100)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": "98cbecdc-69a9-48f4-a95e-d5c71644f00c", 49 | "metadata": { 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# waiting times for 3 different variants, each with many observations\n", 55 | "# generated using exponential distributions with defined scales (expected values)\n", 56 | "waiting_times_a = np.random.exponential(scale=10, size=200)\n", 57 | "waiting_times_b = np.random.exponential(scale=11, size=210)\n", 58 | "waiting_times_c = np.random.exponential(scale=11, size=220)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "id": "6510ba7f-d854-4a88-b063-eb44fc59cf1b", 65 | "metadata": { 66 | "tags": [] 67 | }, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "9.547258592723825" 73 | ] 74 | }, 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | }, 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "11.761611555402082" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | }, 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "12.042807741815093" 93 | ] 94 | }, 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | 
"source": [ 101 | "np.mean(waiting_times_a)\n", 102 | "np.mean(waiting_times_b)\n", 103 | "np.mean(waiting_times_c)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "id": "fde94e6d-a05b-4863-8c85-002e623ca2fb", 110 | "metadata": { 111 | "tags": [] 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# Exponential A/B/C test\n", 116 | "exponential_test = ExponentialDataTest()\n", 117 | "exponential_test.add_variant_data('A', waiting_times_a)\n", 118 | "exponential_test.add_variant_data('B', waiting_times_b)\n", 119 | "exponential_test.add_variant_data('C', waiting_times_c)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "id": "86c03a05-d091-4de5-a223-27efdfbe0615", 126 | "metadata": { 127 | "tags": [] 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "+-----------+----------+--------------+--------------------+------------------+--------------------------+-------------------+-----------------+\n", 135 | "| variant | totals | sum_values | observed_average | posterior_mean | credible_interval | prob_being_best | expected_loss |\n", 136 | "+===========+==========+==============+====================+==================+==========================+===================+=================+\n", 137 | "| A | 200 | 1909.45 | 9.54726 | 9.54299 | [8.3546163, 11.024919] | 0.97495 | 0.0094311 |\n", 138 | "+-----------+----------+--------------+--------------------+------------------+--------------------------+-------------------+-----------------+\n", 139 | "| B | 210 | 2469.94 | 11.7616 | 11.7565 | [10.3265266, 13.5271393] | 0.0177 | 2.23267 |\n", 140 | "+-----------+----------+--------------+--------------------+------------------+--------------------------+-------------------+-----------------+\n", 141 | "| C | 220 | 2649.42 | 12.0428 | 12.0378 | [10.5696647, 13.8087663] | 0.00735 | 2.50462 |\n", 142 | "+-----------+----------+--------------+--------------------+------------------+--------------------------+-------------------+-----------------+\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "# evaluate test (using min_is_best=True as a lower waiting time is better)\n", 148 | "results = exponential_test.evaluate(min_is_best=True)\n", 149 | "print(pd.DataFrame(results).to_markdown(tablefmt=\"grid\", index=False))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "5cbe6fe3-d6c8-422a-ab62-ffd87b345459", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3 (ipykernel)", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.10.12" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 5 182 | } 183 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "bayesian-testing" 3 | version = "0.9.1" 4 | description = "Bayesian A/B testing with simple probabilities." 
5 | authors = ["Matus Baniar"] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage = "https://github.com/Matt52/bayesian-testing" 9 | repository = "https://github.com/Matt52/bayesian-testing" 10 | keywords = ["ab testing", "bayes", "bayesian statistics"] 11 | include = [ 12 | "LICENSE", 13 | ] 14 | 15 | packages = [ 16 | {include = "bayesian_testing"} 17 | ] 18 | 19 | [tool.poetry.dependencies] 20 | python = ">=3.8" 21 | numpy = ">=1.19" 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | jupyter = ">=1.1" 25 | jupyterlab = ">=4.3" 26 | black = ">=23.1" 27 | pytest = ">=8.3" 28 | coverage = ">=7.6" 29 | pandas = ">=1.5" 30 | pre-commit = ">=3.1" 31 | isort = ">=5.10" 32 | tabulate = ">=0.9.0" 33 | setuptools = { version = "^78.1.1", markers = "python_version >= '3.9'" } 34 | tornado = { version = "^6.5.0", markers = "python_version >= '3.9'" } 35 | 36 | [build-system] 37 | requires = ["poetry-core>=1.0.0"] 38 | build-backend = "poetry.core.masonry.api" 39 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | Run in top level directory: 4 | ```bash 5 | python -m pytest 6 | ``` 7 | 8 | or: 9 | ```bash 10 | coverage run -m pytest 11 | coverage report 12 | coverage html 13 | ``` 14 | -------------------------------------------------------------------------------- /tests/test_binary.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.experiments import BinaryDataTest 4 | 5 | 6 | @pytest.fixture 7 | def conv_test(): 8 | cv = BinaryDataTest() 9 | cv.add_variant_data("A", [0, 1, 0, 1, 0, 0, 0, 0, 0, 1]) 10 | cv.add_variant_data("B", [0, 0, 0, 1, 0, 0, 0, 0, 0, 1]) 11 | cv.add_variant_data_agg("C", 11, 2, a_prior=1, b_prior=2) 12 | cv.add_variant_data_agg("D", 10, 10) 13 | cv.add_variant_data_agg("D", 20, 20, replace=False) 14 | cv.add_variant_data_agg("D", 20, 20, replace=True) 15 | cv.delete_variant("D") 16 | return cv 17 | 18 | 19 | def test_variants(conv_test): 20 | assert conv_test.variant_names == ["A", "B", "C"] 21 | 22 | 23 | def test_totals(conv_test): 24 | assert conv_test.totals == [10, 10, 11] 25 | 26 | 27 | def test_positives(conv_test): 28 | assert conv_test.positives == [3, 2, 2] 29 | 30 | 31 | def test_a_priors(conv_test): 32 | assert conv_test.a_priors == [0.5, 0.5, 1] 33 | 34 | 35 | def test_b_priors(conv_test): 36 | assert conv_test.b_priors == [0.5, 0.5, 2] 37 | 38 | 39 | def test_probabs_of_being_best(conv_test): 40 | pbbs = conv_test.probabs_of_being_best(sim_count=20000, seed=52) 41 | assert pbbs == {"A": 0.57225, "B": 0.233, "C": 0.19475} 42 | 43 | 44 | def test_expected_loss(conv_test): 45 | loss = conv_test.expected_loss(sim_count=20000, seed=52) 46 | assert loss == {"A": 0.0529281, "B": 0.1452113, "C": 0.1557502} 47 | 48 | 49 | def test_credible_intervals_95(conv_test): 50 | ci = conv_test.credible_intervals(sim_count=20000, seed=52) 51 | assert ci == { 52 | "A": [0.0917579, 0.6028411], 53 | "B": [0.0442435, 0.5032699], 54 | "C": [0.0522996, 0.452392], 55 | } 56 | 57 | 58 | def test_credible_intervals_99(conv_test): 59 | ci = conv_test.credible_intervals(sim_count=20000, seed=52, interval_alpha=0.99) 60 | assert ci == { 61 | "A": [0.0552614, 0.6892976], 62 | "B": [0.0214602, 0.6045644], 63 | "C": [0.0300364, 0.5320378], 64 | } 65 | 66 | 67 | def test_evaluate(conv_test): 68 | eval_report = conv_test.evaluate(sim_count=20000, seed=52) 69 | 
assert eval_report == [ 70 | { 71 | "variant": "A", 72 | "totals": 10, 73 | "positives": 3, 74 | "positive_rate": 0.3, 75 | "posterior_mean": 0.31818, 76 | "credible_interval": [0.0917579, 0.6028411], 77 | "prob_being_best": 0.57225, 78 | "expected_loss": 0.0529281, 79 | }, 80 | { 81 | "variant": "B", 82 | "totals": 10, 83 | "positives": 2, 84 | "positive_rate": 0.2, 85 | "posterior_mean": 0.22727, 86 | "credible_interval": [0.0442435, 0.5032699], 87 | "prob_being_best": 0.233, 88 | "expected_loss": 0.1452113, 89 | }, 90 | { 91 | "variant": "C", 92 | "totals": 11, 93 | "positives": 2, 94 | "positive_rate": 0.18182, 95 | "posterior_mean": 0.21429, 96 | "credible_interval": [0.0522996, 0.452392], 97 | "prob_being_best": 0.19475, 98 | "expected_loss": 0.1557502, 99 | }, 100 | ] 101 | 102 | 103 | def test_wrong_inputs(): 104 | cv = BinaryDataTest() 105 | with pytest.raises(ValueError): 106 | cv.add_variant_data(10, [1, 0, 1]) 107 | with pytest.raises(ValueError): 108 | cv.add_variant_data("A", [1, 0, 1], a_prior=-1) 109 | with pytest.raises(ValueError): 110 | cv.add_variant_data_agg("A", -1, 7) 111 | with pytest.raises(ValueError): 112 | cv.add_variant_data_agg("A", 1, -7) 113 | with pytest.raises(ValueError): 114 | cv.add_variant_data("A", []) 115 | with pytest.raises(ValueError): 116 | cv.add_variant_data("A", [1, 2, 0]) 117 | 118 | 119 | def test_wrong_credible_interval_input(conv_test): 120 | with pytest.raises(ValueError): 121 | conv_test.evaluate(interval_alpha=2) 122 | with pytest.raises(ValueError): 123 | conv_test.evaluate(interval_alpha=-1) 124 | -------------------------------------------------------------------------------- /tests/test_delta_lognormal.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.experiments import DeltaLognormalDataTest 4 | 5 | 6 | @pytest.fixture 7 | def rev_test(): 8 | rev = DeltaLognormalDataTest() 9 | rev.add_variant_data_agg( 10 | "A", 31500, 1580, 30830.02561, 3831.806394737816, 11029.923165846496, a_prior_beta=1 11 | ) 12 | rev.add_variant_data_agg( 13 | "B", 32000, 1700, 35203.21689, 4211.72986767986, 12259.51868396913, m_prior=2, w_prior=0.02 14 | ) 15 | rev.add_variant_data_agg( 16 | "C", 17 | 31000, 18 | 1550, 19 | 37259.56336, 20 | 4055.965234848171, 21 | 12357.911862914, 22 | a_prior_ig=1, 23 | b_prior_ig=2, 24 | ) 25 | rev.add_variant_data("D", [0, 10.7, 0, 8, 0, 0, 0, 0, 0, 11.22]) 26 | rev.add_variant_data("D", [0, 10.7, 0, 8, 0, 0, 0, 0, 0, 11.22], replace=False) 27 | rev.add_variant_data("D", [0, 10.7, 0, 8, 0, 0, 0, 0, 0, 11.22], replace=True) 28 | rev.delete_variant("D") 29 | return rev 30 | 31 | 32 | def test_variants(rev_test): 33 | assert rev_test.variant_names == ["A", "B", "C"] 34 | 35 | 36 | def test_totals(rev_test): 37 | assert rev_test.totals == [31500, 32000, 31000] 38 | 39 | 40 | def test_positives(rev_test): 41 | assert rev_test.positives == [1580, 1700, 1550] 42 | 43 | 44 | def test_sum_values(rev_test): 45 | assert rev_test.sum_values == [30830.02561, 35203.21689, 37259.56336] 46 | 47 | 48 | def test_sum_logs(rev_test): 49 | assert [round(i, 5) for i in rev_test.sum_logs] == [3831.80639, 4211.72987, 4055.96523] 50 | 51 | 52 | def test_sum_logs_2(rev_test): 53 | assert [round(i, 5) for i in rev_test.sum_logs_2] == [11029.92317, 12259.51868, 12357.91186] 54 | 55 | 56 | def test_a_priors_beta(rev_test): 57 | assert rev_test.a_priors_beta == [1, 0.5, 0.5] 58 | 59 | 60 | def test_b_priors_beta(rev_test): 61 | assert rev_test.b_priors_beta == [0.5, 
0.5, 0.5] 62 | 63 | 64 | def test_m_priors(rev_test): 65 | assert rev_test.m_priors == [1, 2, 1] 66 | 67 | 68 | def test_a_priors_ig(rev_test): 69 | assert rev_test.a_priors_ig == [0, 0, 1] 70 | 71 | 72 | def test_b_priors_ig(rev_test): 73 | assert rev_test.b_priors_ig == [0, 0, 2] 74 | 75 | 76 | def test_w_priors(rev_test): 77 | assert rev_test.w_priors == [0.01, 0.02, 0.01] 78 | 79 | 80 | def test_probabs_of_being_best(rev_test): 81 | pbbs = rev_test.probabs_of_being_best(sim_count=20000, seed=152) 82 | assert pbbs == {"A": 0.0004, "B": 0.03355, "C": 0.96605} 83 | 84 | 85 | def test_expected_loss(rev_test): 86 | loss = rev_test.expected_loss(sim_count=20000, seed=152) 87 | assert loss == {"A": 0.2214416, "B": 0.1212818, "C": 0.0008639} 88 | 89 | 90 | def test_credible_intervals_95(rev_test): 91 | ci = rev_test.credible_intervals(sim_count=20000, seed=152) 92 | assert ci == { 93 | "A": [0.9084717, 1.0661301], 94 | "B": [1.0038179, 1.1705975], 95 | "C": [1.1097381, 1.3084524], 96 | } 97 | 98 | 99 | def test_credible_intervals_99(rev_test): 100 | ci = rev_test.credible_intervals(sim_count=20000, seed=152, interval_alpha=0.99) 101 | assert ci == { 102 | "A": [0.8847602, 1.0948976], 103 | "B": [0.9789665, 1.1996421], 104 | "C": [1.0813447, 1.3416523], 105 | } 106 | 107 | 108 | def test_evaluate(rev_test): 109 | eval_report = rev_test.evaluate(sim_count=20000, seed=152) 110 | assert eval_report == [ 111 | { 112 | "variant": "A", 113 | "totals": 31500, 114 | "positives": 1580, 115 | "sum_values": 30830.02561, 116 | "avg_values": 0.97873, 117 | "avg_positive_values": 19.51267, 118 | "posterior_mean": 0.98309, 119 | "credible_interval": [0.9084717, 1.0661301], 120 | "prob_being_best": 0.0004, 121 | "expected_loss": 0.2214416, 122 | }, 123 | { 124 | "variant": "B", 125 | "totals": 32000, 126 | "positives": 1700, 127 | "sum_values": 35203.21689, 128 | "avg_values": 1.1001, 129 | "avg_positive_values": 20.70777, 130 | "posterior_mean": 1.08266, 131 | "credible_interval": [1.0038179, 1.1705975], 132 | "prob_being_best": 0.03355, 133 | "expected_loss": 0.1212818, 134 | }, 135 | { 136 | "variant": "C", 137 | "totals": 31000, 138 | "positives": 1550, 139 | "sum_values": 37259.56336, 140 | "avg_values": 1.20192, 141 | "avg_positive_values": 24.03843, 142 | "posterior_mean": 1.20276, 143 | "credible_interval": [1.1097381, 1.3084524], 144 | "prob_being_best": 0.96605, 145 | "expected_loss": 0.0008639, 146 | }, 147 | ] 148 | 149 | 150 | def test_wrong_inputs(): 151 | dl_test = DeltaLognormalDataTest() 152 | with pytest.raises(ValueError): 153 | dl_test.add_variant_data(10, [1, 2, 3]) 154 | with pytest.raises(ValueError): 155 | dl_test.add_variant_data("A", [1, 2, 3], a_prior_beta=-1) 156 | with pytest.raises(ValueError): 157 | dl_test.add_variant_data("A", []) 158 | with pytest.raises(ValueError): 159 | dl_test.add_variant_data("A", [0, 0, 0]) 160 | with pytest.raises(ValueError): 161 | dl_test.add_variant_data("C", [0, 10.7, -1]) 162 | -------------------------------------------------------------------------------- /tests/test_delta_normal.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from bayesian_testing.experiments import DeltaNormalDataTest 3 | 4 | 5 | @pytest.fixture 6 | def delta_norm_test(): 7 | delta_norm = DeltaNormalDataTest() 8 | delta_norm.add_variant_data_agg( 9 | name="A", 10 | totals=31500, 11 | non_zeros=10, 12 | sum_values=102.02561, 13 | sum_values_2=1700.8, 14 | a_prior_beta=1, 15 | ) 16 | delta_norm.add_variant_data_agg( 17 | 
name="B", 18 | totals=32000, 19 | non_zeros=40, 20 | sum_values=273.02, 21 | sum_values_2=3567.5, 22 | a_prior_beta=0.02, 23 | m_prior=2, 24 | w_prior=0.02, 25 | ) 26 | 27 | delta_norm.add_variant_data("C", [0, 10.7, -1, 8, 0, -3, 0, -10, 0, 11.22]) 28 | delta_norm.add_variant_data("C", [0, 10.7, -1, 8, 0, -3, 0, -10, 0, 11.22], replace=False) 29 | delta_norm.add_variant_data("C", [0, 10.7, -1, 8, 0, -3, 0, -10, 0, 11.22], replace=True) 30 | delta_norm.delete_variant("C") 31 | return delta_norm 32 | 33 | 34 | def test_variants(delta_norm_test): 35 | assert delta_norm_test.variant_names == ["A", "B"] 36 | 37 | 38 | def test_totals(delta_norm_test): 39 | assert delta_norm_test.totals == [31500, 32000] 40 | 41 | 42 | def test_non_zeros(delta_norm_test): 43 | assert delta_norm_test.non_zeros == [10, 40] 44 | 45 | 46 | def test_sum_values(delta_norm_test): 47 | assert delta_norm_test.sum_values == [102.02561, 273.02] 48 | 49 | 50 | def test_sum_values_2(delta_norm_test): 51 | assert delta_norm_test.sum_values_2 == [1700.8, 3567.5] 52 | 53 | 54 | def test_a_priors_beta(delta_norm_test): 55 | assert delta_norm_test.a_priors_beta == [1, 0.02] 56 | 57 | 58 | def test_b_priors_beta(delta_norm_test): 59 | assert delta_norm_test.b_priors_beta == [0.5, 0.5] 60 | 61 | 62 | def test_m_priors(delta_norm_test): 63 | assert delta_norm_test.m_priors == [1, 2] 64 | 65 | 66 | def test_a_priors_ig(delta_norm_test): 67 | assert delta_norm_test.a_priors_ig == [0, 0] 68 | 69 | 70 | def test_b_priors_ig(delta_norm_test): 71 | assert delta_norm_test.b_priors_ig == [0, 0] 72 | 73 | 74 | def test_w_priors(delta_norm_test): 75 | assert delta_norm_test.w_priors == [0.01, 0.02] 76 | 77 | 78 | def test_probabs_of_being_best(delta_norm_test): 79 | pbbs = delta_norm_test.probabs_of_being_best(sim_count=20000, seed=152) 80 | assert pbbs == {"A": 0.02235, "B": 0.97765} 81 | 82 | 83 | def test_expected_loss(delta_norm_test): 84 | loss = delta_norm_test.expected_loss(sim_count=20000, seed=152) 85 | assert loss == {"A": 0.005, "B": 2.46e-05} 86 | 87 | 88 | def test_credible_intervals_95(delta_norm_test): 89 | ci = delta_norm_test.credible_intervals(sim_count=20000, seed=152) 90 | assert ci == { 91 | "A": [0.0011935, 0.0070944], 92 | "B": [0.0051651, 0.0125917], 93 | } 94 | 95 | 96 | def test_credible_intervals_99(delta_norm_test): 97 | ci = delta_norm_test.credible_intervals(sim_count=20000, seed=152, interval_alpha=0.99) 98 | assert ci == { 99 | "A": [0.0006048, 0.0087352], 100 | "B": [0.0043509, 0.0142946], 101 | } 102 | 103 | 104 | def test_evaluate(delta_norm_test): 105 | eval_report = delta_norm_test.evaluate(sim_count=20000, seed=152) 106 | assert eval_report == [ 107 | { 108 | "variant": "A", 109 | "totals": 31500, 110 | "non_zeros": 10, 111 | "sum_values": 102.02561, 112 | "avg_values": 0.00324, 113 | "avg_non_zero_values": 10.20256, 114 | "posterior_mean": 0.00356, 115 | "credible_interval": [0.0011935, 0.0070944], 116 | "prob_being_best": 0.02235, 117 | "expected_loss": 0.005, 118 | }, 119 | { 120 | "variant": "B", 121 | "totals": 32000, 122 | "non_zeros": 40, 123 | "sum_values": 273.02, 124 | "avg_values": 0.00853, 125 | "avg_non_zero_values": 6.8255, 126 | "posterior_mean": 0.00853, 127 | "credible_interval": [0.0051651, 0.0125917], 128 | "prob_being_best": 0.97765, 129 | "expected_loss": 2.46e-05, 130 | }, 131 | ] 132 | 133 | 134 | def test_wrong_inputs(): 135 | dn_test = DeltaNormalDataTest() 136 | with pytest.raises(ValueError): 137 | dn_test.add_variant_data(10, [1, 2, 3]) 138 | with pytest.raises(ValueError): 
139 | dn_test.add_variant_data("A", [1, 2, 3], a_prior_beta=-1) 140 | with pytest.raises(ValueError): 141 | dn_test.add_variant_data_agg("A", 2, 3, 6, 21) 142 | with pytest.raises(ValueError): 143 | dn_test.add_variant_data_agg("A", 1, -7, 6, 21) 144 | with pytest.raises(ValueError): 145 | dn_test.add_variant_data("A", []) 146 | with pytest.raises(ValueError): 147 | dn_test.add_variant_data("A", [0, 0, 0]) 148 | with pytest.raises(ValueError): 149 | dn_test.add_variant_data("C", [0, 10.7, -1], a_prior_ig=-1) 150 | -------------------------------------------------------------------------------- /tests/test_discrete.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.experiments import DiscreteDataTest 4 | 5 | 6 | @pytest.fixture 7 | def discrete_test(): 8 | disc = DiscreteDataTest(states=[1, 2, 3, 4, 5, 6]) 9 | disc.add_variant_data("A", [6, 5, 4, 4, 4, 2, 5, 4, 2, 1, 2, 5, 4, 6, 2, 3, 6, 2, 3, 6]) 10 | disc.add_variant_data("B", [4, 6, 3, 6, 4, 6, 6, 1, 4, 1]) 11 | disc.add_variant_data_agg("C", [10, 10, 10, 10, 10, 10], prior=[100, 100, 100, 100, 100, 100]) 12 | disc.add_variant_data_agg("D", [1, 2, 3, 8, 10, 7]) 13 | disc.add_variant_data_agg("D", [1, 2, 3, 8, 10, 6], replace=False) 14 | disc.add_variant_data_agg("D", [1, 2, 3, 8, 10, 6], replace=True) 15 | disc.delete_variant("D") 16 | return disc 17 | 18 | 19 | def test_variants(discrete_test): 20 | assert discrete_test.variant_names == ["A", "B", "C"] 21 | 22 | 23 | def test_states(discrete_test): 24 | assert discrete_test.states == [1, 2, 3, 4, 5, 6] 25 | 26 | 27 | def test_concentrations(discrete_test): 28 | assert discrete_test.concentrations == [ 29 | [1, 5, 2, 5, 3, 4], 30 | [2, 0, 1, 3, 0, 4], 31 | [10, 10, 10, 10, 10, 10], 32 | ] 33 | 34 | 35 | def test_probabs_of_being_best(discrete_test): 36 | pbbs = discrete_test.probabs_of_being_best(sim_count=20000, seed=52) 37 | assert pbbs == {"A": 0.35595, "B": 0.59325, "C": 0.0508} 38 | 39 | 40 | def test_expected_loss(discrete_test): 41 | loss = discrete_test.expected_loss(sim_count=20000, seed=52) 42 | assert loss == {"A": 0.3053921, "B": 0.1560257, "C": 0.5328904} 43 | 44 | 45 | def test_credible_intervals_95(discrete_test): 46 | ci = discrete_test.credible_intervals(sim_count=20000, seed=52) 47 | assert ci == { 48 | "A": [3.122705, 4.3265574], 49 | "B": [2.9826238, 4.7094185], 50 | "C": [3.3681015, 3.6302274], 51 | } 52 | 53 | 54 | def test_credible_intervals_99(discrete_test): 55 | ci = discrete_test.credible_intervals(sim_count=20000, seed=52, interval_alpha=0.99) 56 | assert ci == { 57 | "A": [2.9260719, 4.5245231], 58 | "B": [2.7013326, 4.9277036], 59 | "C": [3.3281699, 3.6751105], 60 | } 61 | 62 | 63 | def test_evaluate(discrete_test): 64 | eval_report = discrete_test.evaluate(sim_count=20000, seed=52) 65 | assert eval_report == [ 66 | { 67 | "variant": "A", 68 | "concentration": {1: 1.0, 2: 5.0, 3: 2.0, 4: 5.0, 5: 3.0, 6: 4.0}, 69 | "average_value": 3.8, 70 | "posterior_mean": 3.73077, 71 | "credible_interval": [3.122705, 4.3265574], 72 | "prob_being_best": 0.35595, 73 | "expected_loss": 0.3053921, 74 | }, 75 | { 76 | "variant": "B", 77 | "concentration": {1: 2.0, 2: 0.0, 3: 1.0, 4: 3.0, 5: 0.0, 6: 4.0}, 78 | "average_value": 4.1, 79 | "posterior_mean": 3.875, 80 | "credible_interval": [2.9826238, 4.7094185], 81 | "prob_being_best": 0.59325, 82 | "expected_loss": 0.1560257, 83 | }, 84 | { 85 | "variant": "C", 86 | "concentration": {1: 10, 2: 10, 3: 10, 4: 10, 5: 10, 6: 10}, 87 | "average_value": 
3.5, 88 | "posterior_mean": 3.5, 89 | "credible_interval": [3.3681015, 3.6302274], 90 | "prob_being_best": 0.0508, 91 | "expected_loss": 0.5328904, 92 | }, 93 | ] 94 | 95 | 96 | def test_non_numerical_states_error(): 97 | with pytest.raises(ValueError): 98 | DiscreteDataTest(states=[1, 2.0, "3"]) 99 | 100 | 101 | def test_non_string_variant_error(discrete_test): 102 | with pytest.raises(ValueError): 103 | discrete_test.add_variant_data_agg(1, [1, 2, 3, 8, 10, 7]) 104 | 105 | 106 | def test_length_mismatch_input_error(discrete_test): 107 | with pytest.raises(ValueError): 108 | discrete_test.add_variant_data_agg("D", [1, 2, 3, 8, 10]) 109 | 110 | 111 | def test_empty_data_error(discrete_test): 112 | with pytest.raises(ValueError): 113 | discrete_test.add_variant_data("D", []) 114 | 115 | 116 | def test_non_existing_state_error(discrete_test): 117 | with pytest.raises(ValueError): 118 | discrete_test.add_variant_data("D", [1, 2, 3, 5, 21]) 119 | -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from bayesian_testing.metrics import ( 5 | eval_bernoulli_agg, 6 | eval_normal_agg, 7 | eval_delta_lognormal_agg, 8 | eval_delta_normal_agg, 9 | eval_numerical_dirichlet_agg, 10 | eval_poisson_agg, 11 | eval_exponential_agg, 12 | ) 13 | 14 | PBB_BERNOULLI_AGG_INPUTS = [ 15 | { 16 | "input": { 17 | "totals": [31500, 32000, 31000], 18 | "successes": [1580, 1700, 1550], 19 | "sim_count": 20000, 20 | "seed": 52, 21 | "min_is_best": False, 22 | "interval_alpha": 0.95, 23 | }, 24 | "expected_output": ( 25 | [0.04185, 0.92235, 0.0358], 26 | [0.0030138, 6.06e-05, 0.0031649], 27 | [[0.0477826, 0.0526302], [0.0506933, 0.0555936], [0.0476604, 0.0524757]], 28 | ), 29 | }, 30 | { 31 | "input": { 32 | "totals": [31500, 32000, 31000], 33 | "successes": [1580, 1700, 1550], 34 | "sim_count": 20000, 35 | "seed": 52, 36 | "min_is_best": True, 37 | "interval_alpha": 0.99, 38 | }, 39 | "expected_output": ( 40 | [0.4594, 0.00925, 0.53135], 41 | [0.000781, 0.0037342, 0.0006299], 42 | [[0.0470873, 0.0534391], [0.0499116, 0.056421], [0.0469394, 0.0532695]], 43 | ), 44 | }, 45 | { 46 | "input": { 47 | "totals": [100, 200], 48 | "successes": [80, 160], 49 | "sim_count": 10000, 50 | "seed": 52, 51 | "min_is_best": False, 52 | "interval_alpha": 0.5, 53 | }, 54 | "expected_output": ( 55 | [0.4899, 0.5101], 56 | [0.0204051, 0.0182965], 57 | [[0.7713375, 0.8248972], [0.7810789, 0.8179153]], 58 | ), 59 | }, 60 | { 61 | "input": { 62 | "totals": [100, 100], 63 | "successes": [0, 0], 64 | "sim_count": 20000, 65 | "seed": 52, 66 | "min_is_best": False, 67 | "interval_alpha": 0.95, 68 | }, 69 | "expected_output": ( 70 | [0.5008, 0.4992], 71 | [0.0030829, 0.0031614], 72 | [[4.8e-06, 0.0252857], [4.8e-06, 0.0243717]], 73 | ), 74 | }, 75 | { 76 | "input": { 77 | "totals": [100], 78 | "successes": [77], 79 | "sim_count": 20000, 80 | "seed": 52, 81 | "min_is_best": False, 82 | "interval_alpha": 0.95, 83 | }, 84 | "expected_output": ([1], [0], [[0.6810233, 0.8442006]]), 85 | }, 86 | { 87 | "input": { 88 | "totals": [], 89 | "successes": [], 90 | "sim_count": 20000, 91 | "seed": 52, 92 | "min_is_best": False, 93 | "interval_alpha": 0.95, 94 | }, 95 | "expected_output": ([], [], []), 96 | }, 97 | ] 98 | 99 | PBB_NORMAL_AGG_INPUTS = [ 100 | { 101 | "input": { 102 | "totals": [31000, 30000, 32000], 103 | "sums": [33669.629254438274, 32451.58924937506, 
34745.69678322253], 104 | "sums_2": [659657.6891070933, 95284.82070196551, 260327.13931832163], 105 | "sim_count": 20000, 106 | "seed": 52, 107 | "interval_alpha": 0.95, 108 | }, 109 | "expected_output": ( 110 | [0.43605, 0.19685, 0.3671], 111 | [0.0133512, 0.0179947, 0.0137618], 112 | [[1.0366696, 1.13634], [1.0652914, 1.0977888], [1.0574217, 1.1141581]], 113 | ), 114 | }, 115 | { 116 | "input": { 117 | "totals": [10000, 10000], 118 | "sums": [11446.345516947431, 10708.892428298526], 119 | "sums_2": [214614.35949718487, 31368.55305547222], 120 | "sim_count": 20000, 121 | "seed": 52, 122 | "interval_alpha": 0.99, 123 | }, 124 | "expected_output": ( 125 | [0.94445, 0.05555], 126 | [0.0011338, 0.0753121], 127 | [[1.0278553, 1.2601174], [1.0337017, 1.1071861]], 128 | ), 129 | }, 130 | { 131 | "input": { 132 | "totals": [10, 20, 30, 40], 133 | "sums": [0, 0, 0, 0], 134 | "sums_2": [0, 0, 0, 0], 135 | "sim_count": 20000, 136 | "seed": 52, 137 | "interval_alpha": 0.95, 138 | }, 139 | "expected_output": ( 140 | [0.40785, 0.25105, 0.1928, 0.1483], 141 | [0.0058965, 0.0065083, 0.0066249, 0.0067183], 142 | [ 143 | [-0.021071, 0.0232855], 144 | [-0.0101753, 0.0108701], 145 | [-0.0064358, 0.0070877], 146 | [-0.004795, 0.0052896], 147 | ], 148 | ), 149 | }, 150 | { 151 | "input": { 152 | "totals": [100], 153 | "sums": [0], 154 | "sums_2": [0], 155 | "sim_count": 10000, 156 | "seed": 52, 157 | "interval_alpha": 0.95, 158 | }, 159 | "expected_output": ([1], [0], [[-0.0019355, 0.0020896]]), 160 | }, 161 | { 162 | "input": { 163 | "totals": [10000, 10000], 164 | "sums": [11446.35, 11446.35], 165 | "sums_2": [214614.36, 214614.36], 166 | "sim_count": 20000, 167 | "seed": 52, 168 | "interval_alpha": 0.95, 169 | }, 170 | "expected_output": ( 171 | [0.5024, 0.4976], 172 | [0.0250157, 0.0256253], 173 | [[1.0577297, 1.2331092], [1.0545188, 1.2327107]], 174 | ), 175 | }, 176 | { 177 | "input": { 178 | "totals": [], 179 | "sums": [], 180 | "sums_2": [], 181 | "sim_count": 10000, 182 | "seed": 52, 183 | "interval_alpha": 0.95, 184 | }, 185 | "expected_output": ([], [], []), 186 | }, 187 | ] 188 | 189 | PBB_DELTA_LOGNORMAL_AGG_INPUTS = [ 190 | { 191 | "input": { 192 | "totals": [31500, 32000, 31000], 193 | "successes": [1580, 1700, 1550], 194 | "sum_logs": [3831.806394737816, 4211.72986767986, 4055.965234848171], 195 | "sum_logs_2": [11029.923165846496, 12259.51868396913, 12357.911862914], 196 | "sim_count": 20000, 197 | "seed": 52, 198 | "interval_alpha": 0.95, 199 | }, 200 | "expected_output": ( 201 | [0.00015, 0.03345, 0.9664], 202 | [0.2209593, 0.1205541, 0.0008458], 203 | [[0.9065769, 1.0655643], [1.0046391, 1.1707248], [1.1085257, 1.3061752]], 204 | ), 205 | }, 206 | { 207 | "input": { 208 | "totals": [31000, 31000], 209 | "successes": [1550, 1550], 210 | "sum_logs": [4055.965234848171, 4055.965234848171], 211 | "sum_logs_2": [12357.911862914, 12357.911862914], 212 | "sim_count": 10000, 213 | "seed": 52, 214 | "interval_alpha": 0.9, 215 | }, 216 | "expected_output": ( 217 | [0.5013, 0.4987], 218 | [0.028189, 0.0287233], 219 | [[1.1227657, 1.2882371], [1.1210866, 1.2895949]], 220 | ), 221 | }, 222 | { 223 | "input": { 224 | "totals": [10, 20, 30, 40], 225 | "successes": [0, 0, 0, 0], 226 | "sum_logs": [0, 0, 0, 0], 227 | "sum_logs_2": [0, 0, 0, 0], 228 | "sim_count": 10000, 229 | "seed": 52, 230 | "interval_alpha": 0.5, 231 | }, 232 | "expected_output": ( 233 | [0.25, 0.25, 0.25, 0.25], 234 | [np.nan, np.nan, np.nan, np.nan], 235 | [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], 236 | 
), 237 | }, 238 | { 239 | "input": { 240 | "totals": [100], 241 | "successes": [10], 242 | "sum_logs": [0], 243 | "sum_logs_2": [0], 244 | "sim_count": 10000, 245 | "seed": 52, 246 | "interval_alpha": 0.95, 247 | }, 248 | "expected_output": ([1], [0], [[0.051825, 0.1697968]]), 249 | }, 250 | { 251 | "input": { 252 | "totals": [], 253 | "successes": [], 254 | "sum_logs": [], 255 | "sum_logs_2": [], 256 | "sim_count": 10000, 257 | "seed": 52, 258 | "interval_alpha": 0.95, 259 | }, 260 | "expected_output": ([], [], []), 261 | }, 262 | ] 263 | 264 | PBB_NUMERICAL_DIRICHLET_AGG_INPUTS = [ 265 | { 266 | "input": { 267 | "states": [1, 2, 3, 4, 5, 6], 268 | "concentrations": [ 269 | [10, 10, 10, 10, 20, 10], 270 | [10, 10, 10, 10, 10, 20], 271 | [10, 10, 10, 20, 10, 10], 272 | ], 273 | "sim_count": 20000, 274 | "seed": 52, 275 | "interval_alpha": 0.9, 276 | }, 277 | "expected_output": ( 278 | [0.28205, 0.62335, 0.0946], 279 | [0.1999528, 0.0698306, 0.334045], 280 | [[3.3214796, 4.0718396], [3.4218451, 4.2243033], [3.1984494, 3.9184425]], 281 | ), 282 | }, 283 | { 284 | "input": { 285 | "states": [1, 2, 3], 286 | "concentrations": [[100, 100, 100]], 287 | "sim_count": 20000, 288 | "seed": 52, 289 | "interval_alpha": 0.9, 290 | }, 291 | "expected_output": ([1], [0], [[1.9077157, 2.0908699]]), 292 | }, 293 | { 294 | "input": { 295 | "states": [], 296 | "concentrations": [], 297 | "sim_count": 20000, 298 | "seed": 52, 299 | "interval_alpha": 0.9, 300 | }, 301 | "expected_output": ([], [], []), 302 | }, 303 | ] 304 | 305 | PBB_POISSON_AGG_INPUTS = [ 306 | { 307 | "input": { 308 | "totals": [3150, 3200, 3100], 309 | "sums": [10000, 10000, 10000], 310 | "sim_count": 20000, 311 | "seed": 52, 312 | "min_is_best": False, 313 | "interval_alpha": 0.95, 314 | }, 315 | "expected_output": ( 316 | [0.127, 0.00695, 0.86605], 317 | [0.0539495, 0.1042691, 0.0030418], 318 | [[3.1132541, 3.2375641], [3.0635577, 3.1863114], [3.1634511, 3.2890376]], 319 | ), 320 | }, 321 | { 322 | "input": { 323 | "totals": [3150, 3200, 3100], 324 | "sums": [10000, 10000, 10000], 325 | "sim_count": 20000, 326 | "seed": 52, 327 | "min_is_best": True, 328 | "interval_alpha": 0.9, 329 | }, 330 | "expected_output": ( 331 | [0.12775, 0.8656, 0.00665], 332 | [0.0532581, 0.0029385, 0.1041658], 333 | [[3.123135, 3.2276693], [3.0732817, 3.1764313], [3.1729959, 3.2788603]], 334 | ), 335 | }, 336 | { 337 | "input": { 338 | "totals": [100], 339 | "sums": [77], 340 | "sim_count": 20000, 341 | "seed": 52, 342 | "min_is_best": False, 343 | "interval_alpha": 0.75, 344 | }, 345 | "expected_output": ([1], [0], [[0.6723231, 0.8727923]]), 346 | }, 347 | { 348 | "input": { 349 | "totals": [], 350 | "sums": [], 351 | "sim_count": 20000, 352 | "seed": 52, 353 | "min_is_best": False, 354 | "interval_alpha": 0.9, 355 | }, 356 | "expected_output": ([], [], []), 357 | }, 358 | ] 359 | 360 | PBB_EXPONENTIAL_AGG_INPUTS = [ 361 | { 362 | "input": { 363 | "totals": [100, 90, 80], 364 | "sums": [1040.29884, 993.66883, 883.05801], 365 | "sim_count": 20000, 366 | "seed": 52, 367 | "min_is_best": False, 368 | "interval_alpha": 0.9, 369 | }, 370 | "expected_output": ( 371 | [0.1826, 0.4065, 0.4109], 372 | [1.5195025, 0.8380173, 0.8431285], 373 | [[8.8658129, 12.3263561], [9.3561749, 13.2588682], [9.2650625, 13.3809534]], 374 | ), 375 | }, 376 | { 377 | "input": { 378 | "totals": [1000, 1000, 1000], 379 | "sums": [2288.69431, 2471.61961, 2745.7794], 380 | "sim_count": 20000, 381 | "seed": 52, 382 | "min_is_best": True, 383 | "interval_alpha": 0.9, 384 | }, 385 | 
"expected_output": ( 386 | [0.9594, 0.0406, 0.0], 387 | [0.0017238, 0.1865276, 0.4598496], 388 | [[2.1727503, 2.4111014], [2.3482046, 2.6066663], [2.6087576, 2.8941021]], 389 | ), 390 | }, 391 | { 392 | "input": { 393 | "totals": [100], 394 | "sums": [1007.25317], 395 | "sim_count": 20000, 396 | "seed": 52, 397 | "min_is_best": True, 398 | "interval_alpha": 0.912, 399 | }, 400 | "expected_output": ([1], [0], [[8.5325723, 11.9986705]]), 401 | }, 402 | { 403 | "input": { 404 | "totals": [], 405 | "sums": [], 406 | "sim_count": 20000, 407 | "seed": 52, 408 | "min_is_best": False, 409 | "interval_alpha": 0.9, 410 | }, 411 | "expected_output": ([], [], []), 412 | }, 413 | ] 414 | 415 | PBB_DELTA_NORMAL_AGG_INPUTS = [ 416 | { 417 | "input": { 418 | "totals": [10000, 1000], 419 | "non_zeros": [1009, 111], 420 | "sums": [7026.30599, 801.53947], 421 | "sums_2": [49993.4988, 5891.6073], 422 | "sim_count": 20000, 423 | "seed": 52, 424 | "min_is_best": False, 425 | "interval_alpha": 0.9, 426 | }, 427 | "expected_output": ( 428 | [0.08285, 0.91715], 429 | [0.1045921, 0.0026141], 430 | [[0.6683901, 0.7384471], [0.6897179, 0.9275315]], 431 | ), 432 | }, 433 | { 434 | "input": { 435 | "totals": [10, 20, 30, 40], 436 | "non_zeros": [0, 0, 0, 0], 437 | "sums": [0, 0, 0, 0], 438 | "sums_2": [0, 0, 0, 0], 439 | "sim_count": 10000, 440 | "seed": 52, 441 | "min_is_best": False, 442 | "interval_alpha": 0.9, 443 | }, 444 | "expected_output": ( 445 | [0.25, 0.25, 0.25, 0.25], 446 | [np.nan, np.nan, np.nan, np.nan], 447 | [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], 448 | ), 449 | }, 450 | { 451 | "input": { 452 | "totals": [100], 453 | "non_zeros": [10], 454 | "sums": [0], 455 | "sums_2": [0], 456 | "sim_count": 10000, 457 | "seed": 52, 458 | "min_is_best": False, 459 | "interval_alpha": 0.9, 460 | }, 461 | "expected_output": ([1], [0], [[-0.0017847, 0.0020072]]), 462 | }, 463 | { 464 | "input": { 465 | "totals": [], 466 | "non_zeros": [], 467 | "sums": [], 468 | "sums_2": [], 469 | "sim_count": 10000, 470 | "seed": 52, 471 | "min_is_best": False, 472 | "interval_alpha": 0.9, 473 | }, 474 | "expected_output": ([], [], []), 475 | }, 476 | ] 477 | 478 | 479 | @pytest.mark.parametrize("inp", PBB_BERNOULLI_AGG_INPUTS) 480 | def test_eval_bernoulli_agg(inp): 481 | i = inp["input"] 482 | res = eval_bernoulli_agg( 483 | i["totals"], 484 | i["successes"], 485 | sim_count=i["sim_count"], 486 | seed=i["seed"], 487 | min_is_best=i["min_is_best"], 488 | interval_alpha=i["interval_alpha"], 489 | ) 490 | assert res == inp["expected_output"] 491 | 492 | 493 | @pytest.mark.parametrize("inp", PBB_NORMAL_AGG_INPUTS) 494 | def test_eval_normal_agg(inp): 495 | i = inp["input"] 496 | res = eval_normal_agg( 497 | i["totals"], 498 | i["sums"], 499 | i["sums_2"], 500 | sim_count=i["sim_count"], 501 | seed=i["seed"], 502 | interval_alpha=i["interval_alpha"], 503 | ) 504 | assert res == inp["expected_output"] 505 | 506 | 507 | def test_eval_normal_agg_different_runs(): 508 | # two different runs of same input without seed should be different 509 | run1 = eval_normal_agg([100, 100], [10, 10], [20, 20]) 510 | run2 = eval_normal_agg([100, 100], [10, 10], [20, 20]) 511 | assert run1 != run2 512 | 513 | 514 | @pytest.mark.parametrize("inp", PBB_DELTA_LOGNORMAL_AGG_INPUTS) 515 | def test_eval_delta_lognormal_agg(inp): 516 | i = inp["input"] 517 | res = eval_delta_lognormal_agg( 518 | i["totals"], 519 | i["successes"], 520 | i["sum_logs"], 521 | i["sum_logs_2"], 522 | sim_count=i["sim_count"], 523 | seed=i["seed"], 
524 | interval_alpha=i["interval_alpha"], 525 | ) 526 | assert res == inp["expected_output"] 527 | 528 | 529 | def test_eval_delta_lognormal_agg_different_runs(): 530 | # two runs of the same input without a seed should produce different results 531 | run1 = eval_delta_lognormal_agg([1000, 1000], [100, 100], [10, 10], [20, 20], sim_count=100000) 532 | run2 = eval_delta_lognormal_agg([1000, 1000], [100, 100], [10, 10], [20, 20], sim_count=100000) 533 | assert run1 != run2 534 | 535 | 536 | @pytest.mark.parametrize("inp", PBB_NUMERICAL_DIRICHLET_AGG_INPUTS) 537 | def test_eval_numerical_dirichlet_agg(inp): 538 | i = inp["input"] 539 | res = eval_numerical_dirichlet_agg( 540 | i["states"], i["concentrations"], sim_count=i["sim_count"], seed=i["seed"], interval_alpha=i["interval_alpha"] 541 | ) 542 | assert res == inp["expected_output"] 543 | 544 | 545 | def test_eval_numerical_dirichlet_agg_different_runs(): 546 | # two runs of the same input without a seed should produce different results 547 | run1 = eval_numerical_dirichlet_agg([1, 20], [[10, 10], [20, 20]]) 548 | run2 = eval_numerical_dirichlet_agg([1, 20], [[10, 10], [20, 20]]) 549 | assert run1 != run2 550 | 551 | 552 | @pytest.mark.parametrize("inp", PBB_POISSON_AGG_INPUTS) 553 | def test_eval_poisson_agg(inp): 554 | i = inp["input"] 555 | res = eval_poisson_agg( 556 | i["totals"], 557 | i["sums"], 558 | sim_count=i["sim_count"], 559 | seed=i["seed"], 560 | min_is_best=i["min_is_best"], 561 | interval_alpha=i["interval_alpha"], 562 | ) 563 | assert res == inp["expected_output"] 564 | 565 | 566 | @pytest.mark.parametrize("inp", PBB_EXPONENTIAL_AGG_INPUTS) 567 | def test_eval_exponential_agg(inp): 568 | i = inp["input"] 569 | res = eval_exponential_agg( 570 | i["totals"], 571 | i["sums"], 572 | sim_count=i["sim_count"], 573 | seed=i["seed"], 574 | min_is_best=i["min_is_best"], 575 | interval_alpha=i["interval_alpha"], 576 | ) 577 | assert res == inp["expected_output"] 578 | 579 | 580 | @pytest.mark.parametrize("inp", PBB_DELTA_NORMAL_AGG_INPUTS) 581 | def test_eval_delta_normal_agg(inp): 582 | i = inp["input"] 583 | res = eval_delta_normal_agg( 584 | i["totals"], 585 | i["non_zeros"], 586 | i["sums"], 587 | i["sums_2"], 588 | sim_count=i["sim_count"], 589 | seed=i["seed"], 590 | min_is_best=i["min_is_best"], 591 | interval_alpha=i["interval_alpha"], 592 | ) 593 | assert res == inp["expected_output"] 594 | -------------------------------------------------------------------------------- /tests/test_exponential.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.experiments import ExponentialDataTest 4 | 5 | 6 | @pytest.fixture 7 | def exponential_test(): 8 | expo = ExponentialDataTest() 9 | expo.add_variant_data( 10 | "A", 11 | [ 12 | 3.27, 13 | 5.62, 14 | 0.31, 15 | 3.9, 16 | 2.4, 17 | 10.49, 18 | 0.63, 19 | 2.71, 20 | 1.64, 21 | 0.43, 22 | 0.22, 23 | 0.3, 24 | 1.99, 25 | 0.69, 26 | 5.15, 27 | 1.31, 28 | 1.01, 29 | 1.26, 30 | 0.2, 31 | 1.6, 32 | ], 33 | ) 34 | expo.add_variant_data( 35 | "B", 36 | [ 37 | 0.28, 38 | 0.18, 39 | 0.13, 40 | 4.79, 41 | 1.07, 42 | 0.69, 43 | 5.75, 44 | 2.07, 45 | 9.67, 46 | 2.79, 47 | 0.18, 48 | 5.8, 49 | 12.81, 50 | 2.33, 51 | 2.28, 52 | 1.56, 53 | 4.18, 54 | 1.47, 55 | 1.67, 56 | 0.98, 57 | ], 58 | ) 59 | expo.add_variant_data_agg("C", 20, 72.27, a_prior=1, b_prior=2) 60 | expo.add_variant_data_agg("D", 100, 200) 61 | expo.add_variant_data_agg("D", 100, 220, replace=False) 62 | expo.add_variant_data_agg("D", 10, 20, replace=True) 63 | expo.delete_variant("D") 64 | return expo 65 | 
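# A minimal, self-contained sketch of the ExponentialDataTest workflow that the
# fixture above exercises (values are illustrative; the gamma priors mirror the
# ones used for variant "C"). The leading underscore keeps pytest from
# collecting it as a test.
def _example_exponential_workflow():
    demo = ExponentialDataTest()
    demo.add_variant_data("A", [3.27, 5.62, 0.31, 3.9, 2.4])  # raw observations
    demo.add_variant_data_agg("B", 20, 72.27, a_prior=1, b_prior=2)  # aggregated: 20 observations summing to 72.27
    return demo.evaluate(sim_count=20000, seed=52)  # list of per-variant summary dicts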
66 | 67 | def test_variants(exponential_test): 68 | assert exponential_test.variant_names == ["A", "B", "C"] 69 | 70 | 71 | def test_totals(exponential_test): 72 | assert exponential_test.totals == [20, 20, 20] 73 | 74 | 75 | def test_sum_values(exponential_test): 76 | assert exponential_test.sum_values == [45.13, 60.68, 72.27] 77 | 78 | 79 | def test_a_priors(exponential_test): 80 | assert exponential_test.a_priors == [0.1, 0.1, 1] 81 | 82 | 83 | def test_b_priors(exponential_test): 84 | assert exponential_test.b_priors == [0.1, 0.1, 2] 85 | 86 | 87 | def test_probabs_of_being_best(exponential_test): 88 | pbbs = exponential_test.probabs_of_being_best(sim_count=20000, seed=52) 89 | assert pbbs == {"A": 0.0414, "B": 0.29885, "C": 0.65975} 90 | 91 | 92 | def test_expected_loss(exponential_test): 93 | loss = exponential_test.expected_loss(sim_count=20000, seed=52) 94 | assert loss == {"A": 1.5907038, "B": 0.7596064, "C": 0.2414208} 95 | 96 | 97 | def test_credible_intervals_95(exponential_test): 98 | ci = exponential_test.credible_intervals(sim_count=20000, seed=52) 99 | assert ci == { 100 | "A": [1.5151401, 3.6571069], 101 | "B": [2.0455239, 4.9692854], 102 | "C": [2.4059958, 5.6846722], 103 | } 104 | 105 | 106 | def test_credible_intervals_99(exponential_test): 107 | ci = exponential_test.credible_intervals(sim_count=20000, seed=52, interval_alpha=0.99) 108 | assert ci == { 109 | "A": [1.3525642, 4.3405547], 110 | "B": [1.8175504, 5.9001709], 111 | "C": [2.124576, 6.7291228], 112 | } 113 | 114 | 115 | def test_evaluate(exponential_test): 116 | eval_report = exponential_test.evaluate(sim_count=20000, seed=52) 117 | assert eval_report == [ 118 | { 119 | "variant": "A", 120 | "totals": 20, 121 | "sum_values": 45.13, 122 | "observed_average": 2.2565, 123 | "posterior_mean": 2.25025, 124 | "credible_interval": [1.5151401, 3.6571069], 125 | "prob_being_best": 0.0414, 126 | "expected_loss": 1.5907038, 127 | }, 128 | { 129 | "variant": "B", 130 | "totals": 20, 131 | "sum_values": 60.68, 132 | "observed_average": 3.034, 133 | "posterior_mean": 3.02388, 134 | "credible_interval": [2.0455239, 4.9692854], 135 | "prob_being_best": 0.29885, 136 | "expected_loss": 0.7596064, 137 | }, 138 | { 139 | "variant": "C", 140 | "totals": 20, 141 | "sum_values": 72.27, 142 | "observed_average": 3.6135, 143 | "posterior_mean": 3.53667, 144 | "credible_interval": [2.4059958, 5.6846722], 145 | "prob_being_best": 0.65975, 146 | "expected_loss": 0.2414208, 147 | }, 148 | ] 149 | 150 | 151 | def test_wrong_inputs(): 152 | exp_test = ExponentialDataTest() 153 | with pytest.raises(ValueError): 154 | exp_test.add_variant_data(10, [1, 2, 3]) 155 | with pytest.raises(ValueError): 156 | exp_test.add_variant_data("A", [1, 2, 3], a_prior=-1) 157 | with pytest.raises(ValueError): 158 | exp_test.add_variant_data_agg("A", -1, 7) 159 | with pytest.raises(ValueError): 160 | exp_test.add_variant_data_agg("A", 1, -7) 161 | with pytest.raises(ValueError): 162 | exp_test.add_variant_data("A", []) 163 | with pytest.raises(ValueError): 164 | exp_test.add_variant_data("A", [1, 2, -3]) 165 | -------------------------------------------------------------------------------- /tests/test_normal.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.experiments import NormalDataTest 4 | 5 | 6 | @pytest.fixture 7 | def norm_test(): 8 | norm = NormalDataTest() 9 | norm.add_variant_data( 10 | "A", 11 | [ 12 | 11.8, 13 | 12.2, 14 | 12.4, 15 | 9.5, 16 | 2.2, 17 | 3.3, 18 | 
16.2, 19 | 4.9, 20 | 12.4, 21 | 6.8, 22 | 8.7, 23 | 9.8, 24 | 5.4, 25 | 9.0, 26 | 15.0, 27 | 12.3, 28 | 9.6, 29 | 12.5, 30 | 9.1, 31 | 10.2, 32 | ], 33 | m_prior=9, 34 | ) 35 | norm.add_variant_data( 36 | "B", 37 | [ 38 | 10.6, 39 | 5.1, 40 | 9.4, 41 | 11.2, 42 | 2.0, 43 | 13.4, 44 | 14.1, 45 | 15.4, 46 | 16.3, 47 | 11.7, 48 | 7.3, 49 | 6.8, 50 | 8.2, 51 | 16.2, 52 | 10.8, 53 | 7.1, 54 | 12.2, 55 | 11.2, 56 | ], 57 | w_prior=0.03, 58 | ) 59 | norm.add_variant_data( 60 | "C", 61 | [ 62 | 25.3, 63 | 10.3, 64 | 24.7, 65 | -8.1, 66 | 8.4, 67 | 10.3, 68 | 14.8, 69 | 13.4, 70 | 11.5, 71 | -4.7, 72 | 5.3, 73 | 7.4, 74 | 17.2, 75 | 15.4, 76 | 13.0, 77 | 12.9, 78 | 19.2, 79 | 11.6, 80 | 0.4, 81 | 5.7, 82 | 23.5, 83 | 15.2, 84 | ], 85 | b_prior_ig=2, 86 | ) 87 | norm.add_variant_data_agg("A", 20, 193.3, 2127.71, replace=False) 88 | norm.add_variant_data("D", [0, 10.7, 0, 8, 0, 0, 0, 0, 0, 11.22]) 89 | norm.add_variant_data("D", [0, 10.7, 0, 8, 0, 0, 0, 0, 0, 11.22], replace=False) 90 | norm.add_variant_data("D", [0, 10.7, 0, 8, 0, 0, 0, 0, 0, 11.22], replace=True) 91 | norm.delete_variant("D") 92 | return norm 93 | 94 | 95 | def test_variants(norm_test): 96 | assert norm_test.variant_names == ["A", "B", "C"] 97 | 98 | 99 | def test_totals(norm_test): 100 | assert norm_test.totals == [40, 18, 22] 101 | 102 | 103 | def test_sum_values(norm_test): 104 | assert norm_test.sum_values == [386.6, 188.99999999999997, 252.69999999999996] 105 | 106 | 107 | def test_sum_values_2(norm_test): 108 | assert norm_test.sum_values_2 == [4255.42, 2244.8200000000006, 4421.87] 109 | 110 | 111 | def test_m_priors(norm_test): 112 | assert norm_test.m_priors == [9, 1, 1] 113 | 114 | 115 | def test_a_priors_ig(norm_test): 116 | assert norm_test.a_priors_ig == [0, 0, 0] 117 | 118 | 119 | def test_b_priors_ig(norm_test): 120 | assert norm_test.b_priors_ig == [0, 0, 2] 121 | 122 | 123 | def test_w_priors(norm_test): 124 | assert norm_test.w_priors == [0.01, 0.03, 0.01] 125 | 126 | 127 | def test_probabs_of_being_best(norm_test): 128 | pbbs = norm_test.probabs_of_being_best(sim_count=20000, seed=52) 129 | assert pbbs == {"A": 0.05105, "B": 0.27935, "C": 0.6696} 130 | 131 | 132 | def test_expected_loss(norm_test): 133 | loss = norm_test.expected_loss(sim_count=20000, seed=52) 134 | assert loss == {"A": 2.2696341, "B": 1.4580033, "C": 0.4464154} 135 | 136 | 137 | def test_credible_intervals_95(norm_test): 138 | ci = norm_test.credible_intervals(sim_count=20000, seed=52) 139 | assert ci == { 140 | "A": [8.5300072, 10.8231841], 141 | "B": [8.5577171, 12.3448628], 142 | "C": [7.8915125, 15.1179586], 143 | } 144 | 145 | 146 | def test_credible_intervals_99(norm_test): 147 | ci = norm_test.credible_intervals(sim_count=20000, seed=52, interval_alpha=0.99) 148 | assert ci == { 149 | "A": [8.1196181, 11.2023581], 150 | "B": [7.8792145, 13.0964176], 151 | "C": [6.5669908, 16.5226358], 152 | } 153 | 154 | 155 | def test_evaluate(norm_test): 156 | eval_report = norm_test.evaluate(sim_count=20000, seed=52) 157 | assert eval_report == [ 158 | { 159 | "variant": "A", 160 | "totals": 40, 161 | "sum_values": 386.6, 162 | "avg_values": 9.665, 163 | "posterior_mean": 9.66483, 164 | "credible_interval": [8.5300072, 10.8231841], 165 | "prob_being_best": 0.05105, 166 | "expected_loss": 2.2696341, 167 | }, 168 | { 169 | "variant": "B", 170 | "totals": 18, 171 | "sum_values": 189.0, 172 | "avg_values": 10.5, 173 | "posterior_mean": 10.48419, 174 | "credible_interval": [8.5577171, 12.3448628], 175 | "prob_being_best": 0.27935, 176 | "expected_loss": 
1.4580033, 177 | }, 178 | { 179 | "variant": "C", 180 | "totals": 22, 181 | "sum_values": 252.7, 182 | "avg_values": 11.48636, 183 | "posterior_mean": 11.4816, 184 | "credible_interval": [7.8915125, 15.1179586], 185 | "prob_being_best": 0.6696, 186 | "expected_loss": 0.4464154, 187 | }, 188 | ] 189 | -------------------------------------------------------------------------------- /tests/test_poisson.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.experiments import PoissonDataTest 4 | 5 | 6 | @pytest.fixture 7 | def poisson_test(): 8 | pois = PoissonDataTest() 9 | pois.add_variant_data("A", [5, 5, 7, 1, 3, 3, 1, 1, 2, 0, 1, 3, 4, 2, 5]) 10 | pois.add_variant_data("B", [2, 4, 3, 4, 6, 1, 3, 6, 4, 0, 3, 1, 2, 1]) 11 | pois.add_variant_data_agg("C", 15, 49, a_prior=1, b_prior=2) 12 | pois.add_variant_data_agg("D", 10, 10) 13 | pois.add_variant_data_agg("D", 20, 20, replace=False) 14 | pois.add_variant_data_agg("D", 20, 20, replace=True) 15 | pois.delete_variant("D") 16 | return pois 17 | 18 | 19 | def test_variants(poisson_test): 20 | assert poisson_test.variant_names == ["A", "B", "C"] 21 | 22 | 23 | def test_totals(poisson_test): 24 | assert poisson_test.totals == [15, 14, 15] 25 | 26 | 27 | def test_sum_values(poisson_test): 28 | assert poisson_test.sum_values == [43, 40, 49] 29 | 30 | 31 | def test_a_priors(poisson_test): 32 | assert poisson_test.a_priors == [0.1, 0.1, 1] 33 | 34 | 35 | def test_b_priors(poisson_test): 36 | assert poisson_test.b_priors == [0.1, 0.1, 2] 37 | 38 | 39 | def test_probabs_of_being_best(poisson_test): 40 | pbbs = poisson_test.probabs_of_being_best(sim_count=20000, seed=52) 41 | assert pbbs == {"A": 0.30945, "B": 0.29665, "C": 0.3939} 42 | 43 | 44 | def test_expected_loss(poisson_test): 45 | loss = poisson_test.expected_loss(sim_count=20000, seed=52) 46 | assert loss == {"A": 0.3936672, "B": 0.4144949, "C": 0.3109256} 47 | 48 | 49 | def test_credible_intervals_95(poisson_test): 50 | ci = poisson_test.credible_intervals(sim_count=20000, seed=52) 51 | assert ci == { 52 | "A": [2.0742056, 3.7731115], 53 | "B": [2.0264899, 3.7822918], 54 | "C": [2.1895805, 3.8084984], 55 | } 56 | 57 | 58 | def test_credible_intervals_99(poisson_test): 59 | ci = poisson_test.credible_intervals(sim_count=20000, seed=52, interval_alpha=0.99) 60 | assert ci == { 61 | "A": [1.8569798, 4.0897961], 62 | "B": [1.8082962, 4.1242607], 63 | "C": [1.9771075, 4.1434489], 64 | } 65 | 66 | 67 | def test_evaluate(poisson_test): 68 | eval_report = poisson_test.evaluate(sim_count=20000, seed=52) 69 | assert eval_report == [ 70 | { 71 | "variant": "A", 72 | "totals": 15, 73 | "sum_values": 43, 74 | "observed_average": 2.86667, 75 | "posterior_mean": 2.8543, 76 | "credible_interval": [2.0742056, 3.7731115], 77 | "prob_being_best": 0.30945, 78 | "expected_loss": 0.3936672, 79 | }, 80 | { 81 | "variant": "B", 82 | "totals": 14, 83 | "sum_values": 40, 84 | "observed_average": 2.85714, 85 | "posterior_mean": 2.84397, 86 | "credible_interval": [2.0264899, 3.7822918], 87 | "prob_being_best": 0.29665, 88 | "expected_loss": 0.4144949, 89 | }, 90 | { 91 | "variant": "C", 92 | "totals": 15, 93 | "sum_values": 49, 94 | "observed_average": 3.26667, 95 | "posterior_mean": 2.94118, 96 | "credible_interval": [2.1895805, 3.8084984], 97 | "prob_being_best": 0.3939, 98 | "expected_loss": 0.3109256, 99 | }, 100 | ] 101 | 102 | 103 | def test_wrong_inputs(): 104 | pois_test = PoissonDataTest() 105 | with pytest.raises(ValueError): 106 | 
pois_test.add_variant_data(10, [1, 2, 3]) 107 | with pytest.raises(ValueError): 108 | pois_test.add_variant_data("A", [1, 2, 3], a_prior=-1) 109 | with pytest.raises(ValueError): 110 | pois_test.add_variant_data_agg("A", -1, 7) 111 | with pytest.raises(ValueError): 112 | pois_test.add_variant_data_agg("A", 1, -7) 113 | with pytest.raises(ValueError): 114 | pois_test.add_variant_data("A", []) 115 | with pytest.raises(ValueError): 116 | pois_test.add_variant_data("A", [1, 2, -3]) 117 | -------------------------------------------------------------------------------- /tests/test_posteriors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from bayesian_testing.metrics.posteriors import ( 5 | beta_posteriors_all, 6 | lognormal_posteriors, 7 | dirichlet_posteriors, 8 | pois_gamma_posteriors_all, 9 | exp_gamma_posteriors_all, 10 | ) 11 | 12 | BETA_POSTERIORS_ALL_INPUTS = [ 13 | { 14 | "totals": [10, 20, 30], 15 | "successes": [8, 16, 24], 16 | "sim_count": 10, 17 | "a_priors_beta": [0.5, 0.5, 0.5], 18 | "b_priors_beta": [0.5, 0.5, 0.5], 19 | }, 20 | { 21 | "totals": [20, 30], 22 | "successes": [16, 24], 23 | "sim_count": 20, 24 | "a_priors_beta": [0.5, 0.5], 25 | "b_priors_beta": [0.5, 0.5], 26 | }, 27 | ] 28 | 29 | LOGNORMAL_POSTERIORS_INPUTS = [ 30 | { 31 | "totals": 1580, 32 | "sum_logs": 3831.806394737816, 33 | "sum_logs_2": 11029.923165846496, 34 | "sim_count": 10000, 35 | }, 36 | { 37 | "totals": 1580, 38 | "sum_logs": 4055.965234848171, 39 | "sum_logs_2": 12357.911862914, 40 | "sim_count": 100, 41 | }, 42 | { 43 | "totals": 0, 44 | "sum_logs": 0, 45 | "sum_logs_2": 0, 46 | "sim_count": 100, 47 | }, 48 | ] 49 | 50 | DIRICHLET_POSTERIORS_INPUTS = [ 51 | { 52 | "concentration": [1, 2, 3], 53 | "prior": [1, 1, 1], 54 | "sim_count": 10000, 55 | }, 56 | { 57 | "concentration": [100, 200], 58 | "prior": [1 / 2, 1 / 2], 59 | "sim_count": 100, 60 | }, 61 | ] 62 | 63 | GAMMA_POSTERIORS_ALL_INPUTS = [ 64 | { 65 | "totals": [10, 20, 30], 66 | "sums": [80, 161, 260], 67 | "sim_count": 10, 68 | "a_priors_gamma": [0.5, 0.5, 0.5], 69 | "b_priors_gamma": [0.5, 0.5, 0.5], 70 | }, 71 | { 72 | "totals": [20, 30], 73 | "sums": [160, 240], 74 | "sim_count": 20, 75 | "a_priors_gamma": [0.5, 0.5], 76 | "b_priors_gamma": [0.5, 0.5], 77 | }, 78 | ] 79 | 80 | 81 | @pytest.mark.parametrize("inp", BETA_POSTERIORS_ALL_INPUTS) 82 | def test_beta_posteriors_all(inp): 83 | all_pos = beta_posteriors_all( 84 | inp["totals"], 85 | inp["successes"], 86 | inp["sim_count"], 87 | inp["a_priors_beta"], 88 | inp["b_priors_beta"], 89 | ) 90 | all_pos_shape = np.array(all_pos).shape 91 | assert all_pos_shape == (len(inp["totals"]), inp["sim_count"]) 92 | 93 | 94 | @pytest.mark.parametrize("inp", LOGNORMAL_POSTERIORS_INPUTS) 95 | def test_lognormal_posteriors(inp): 96 | all_pos = lognormal_posteriors( 97 | inp["totals"], 98 | inp["sum_logs"], 99 | inp["sum_logs_2"], 100 | inp["sim_count"], 101 | ) 102 | assert len(all_pos) == inp["sim_count"] 103 | 104 | 105 | @pytest.mark.parametrize("inp", DIRICHLET_POSTERIORS_INPUTS) 106 | def test_dirichlet_posteriors(inp): 107 | all_pos = dirichlet_posteriors( 108 | inp["concentration"], 109 | inp["prior"], 110 | inp["sim_count"], 111 | ) 112 | assert all_pos.shape == (inp["sim_count"], len(inp["concentration"])) 113 | 114 | 115 | @pytest.mark.parametrize("inp", GAMMA_POSTERIORS_ALL_INPUTS) 116 | def test_pois_gamma_posteriors_all(inp): 117 | all_pos = pois_gamma_posteriors_all( 118 | inp["totals"], 119 | 
inp["sums"], 120 | inp["sim_count"], 121 | inp["a_priors_gamma"], 122 | inp["b_priors_gamma"], 123 | ) 124 | all_pos_shape = np.array(all_pos).shape 125 | assert all_pos_shape == (len(inp["totals"]), inp["sim_count"]) 126 | 127 | 128 | @pytest.mark.parametrize("inp", GAMMA_POSTERIORS_ALL_INPUTS) 129 | def test_exp_gamma_posteriors_all(inp): 130 | all_pos = exp_gamma_posteriors_all( 131 | inp["totals"], 132 | inp["sums"], 133 | inp["sim_count"], 134 | inp["a_priors_gamma"], 135 | inp["b_priors_gamma"], 136 | ) 137 | all_pos_shape = np.array(all_pos).shape 138 | assert all_pos_shape == (len(inp["totals"]), inp["sim_count"]) 139 | -------------------------------------------------------------------------------- /tests/test_validators.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bayesian_testing.metrics.evaluation import validate_bernoulli_input 4 | from bayesian_testing.utilities.common import check_list_lengths 5 | 6 | 7 | def test_validate_bernoulli_input(): 8 | validate_bernoulli_input([1, 2, 3], [1, 1, 1]) 9 | validate_bernoulli_input([1, 2], [1, 1]) 10 | validate_bernoulli_input([1], [1]) 11 | 12 | 13 | def test_validate_bernoulli_input_error(): 14 | with pytest.raises(ValueError): 15 | validate_bernoulli_input([1, 2], [1]) 16 | 17 | 18 | def test_check_list_lengths(): 19 | check_list_lengths([[1, 2, 3], [1, 1, 1], [2, 2, 2], [7, 7, 7]]) 20 | check_list_lengths([[], [], []]) 21 | 22 | 23 | def test_check_list_lengths_error(): 24 | with pytest.raises(ValueError): 25 | check_list_lengths([[1, 2, 3], [1, 1, 1], [2, 2, 2], [7, 7]]) 26 | --------------------------------------------------------------------------------