├── .github └── workflows │ ├── ci.yml │ ├── docs.yml │ └── release.yml ├── .gitignore ├── .markdownlint.yaml ├── LICENSE ├── README.md ├── docs ├── CNAME ├── api │ ├── aggr.md │ ├── config.md │ ├── datasets.md │ ├── experiment.md │ ├── index.md │ ├── metrics │ │ ├── base.md │ │ ├── index.md │ │ ├── mean.md │ │ ├── proportion.md │ │ └── resampling.md │ ├── multiplicity.md │ └── utils.md ├── assets │ ├── tea-cup-black.svg │ ├── tea-cup-white-on-black.svg │ └── tea-cup-white.svg ├── custom-metrics.md ├── data-backends.md ├── index.md ├── javascripts │ └── override-copy.js ├── multiple-testing.md ├── power-analysis.md ├── simulated-experiments.md ├── stylesheets │ └── extra.css └── user-guide.md ├── examples ├── README.md ├── custom-metrics.py ├── data-backends.py ├── multiple-testing.py ├── power-analysis.py ├── simulated-experiments.py └── user-guide.py ├── mkdocs.yml ├── pyproject.toml ├── src ├── _internal │ ├── __init__.py │ ├── create_examples.py │ ├── external_links.py │ └── strip_doctest_artifacts.py └── tea_tasting │ ├── __init__.py │ ├── aggr.py │ ├── config.py │ ├── datasets.py │ ├── experiment.py │ ├── metrics │ ├── __init__.py │ ├── base.py │ ├── mean.py │ ├── proportion.py │ └── resampling.py │ ├── multiplicity.py │ ├── utils.py │ └── version.py └── tests ├── __init__.py ├── metrics ├── __init__.py ├── test_base.py ├── test_mean.py ├── test_proportion.py └── test_resampling.py ├── test_aggr.py ├── test_config.py ├── test_datasets.py ├── test_experiment.py ├── test_multiplicity.py ├── test_utils.py └── test_version.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | paths: 5 | - '**.py' 6 | - '.github/workflows/ci.yml' 7 | - 'pyproject.toml' 8 | push: 9 | branches: [main] 10 | paths: 11 | - '**.py' 12 | - '.github/workflows/ci.yml' 13 | - 'pyproject.toml' 14 | workflow_dispatch: 15 | jobs: 16 | doctest: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [ubuntu-latest] 21 | python-version: ["3.10"] 22 | steps: 23 | - name: checkout 24 | uses: actions/checkout@v4 25 | - name: set up uv 26 | uses: astral-sh/setup-uv@v6 27 | with: 28 | activate-environment: true 29 | cache-suffix: "${{ matrix.python-version }}-test" 30 | enable-cache: true 31 | python-version: ${{ matrix.python-version }} 32 | - name: install dependencies 33 | run: uv sync --group test 34 | - name: doctest with pytest 35 | run: | 36 | pytest \ 37 | --doctest-continue-on-failure \ 38 | --doctest-glob=*.md \ 39 | --doctest-modules \ 40 | --ignore=examples/ \ 41 | --ignore=tests/ \ 42 | --ignore-glob=src/_* \ 43 | test-lowest: 44 | strategy: 45 | matrix: 46 | os: [ubuntu-latest] 47 | python-version: ["3.10"] 48 | runs-on: ${{ matrix.os }} 49 | steps: 50 | - name: checkout 51 | uses: actions/checkout@v4 52 | - name: set up uv 53 | uses: astral-sh/setup-uv@v6 54 | with: 55 | activate-environment: true 56 | cache-suffix: "${{ matrix.python-version }}-test-lowest" 57 | enable-cache: true 58 | python-version: ${{ matrix.python-version }} 59 | - name: install dependencies 60 | run: uv sync --group test --resolution lowest-direct 61 | - name: test-lowest with pytest 62 | run: pytest 63 | test: 64 | strategy: 65 | matrix: 66 | os: [ubuntu-latest, macos-13, windows-latest] 67 | python-version: ["3.10", "3.11", "3.12", "3.13"] 68 | runs-on: ${{ matrix.os }} 69 | steps: 70 | - name: checkout 71 | uses: actions/checkout@v4 72 | - name: set up uv 73 | uses: astral-sh/setup-uv@v6 74 | with: 75 | 
activate-environment: true 76 | cache-suffix: "${{ matrix.python-version }}-test" 77 | enable-cache: true 78 | python-version: ${{ matrix.python-version }} 79 | - name: install dependencies 80 | run: uv sync --group test 81 | - name: test with pytest 82 | run: coverage run -m pytest 83 | - name: convert coverage report 84 | run: coverage xml 85 | - name: upload coverage reports to codecov 86 | uses: codecov/codecov-action@v5 87 | with: 88 | files: ./coverage.xml 89 | env: 90 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 91 | lint: 92 | runs-on: ubuntu-latest 93 | strategy: 94 | matrix: 95 | python-version: ["3.10", "3.11", "3.12", "3.13"] 96 | steps: 97 | - name: checkout 98 | uses: actions/checkout@v4 99 | - name: set up uv 100 | uses: astral-sh/setup-uv@v6 101 | with: 102 | activate-environment: true 103 | cache-suffix: "${{ matrix.python-version }}-lint" 104 | enable-cache: true 105 | python-version: ${{ matrix.python-version }} 106 | - name: install dependencies 107 | run: uv sync --group lint --group test 108 | - name: check with ruff 109 | run: ruff check . 110 | - name: check with pyright 111 | run: pyright 112 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | release: 4 | types: [published] 5 | workflow_dispatch: 6 | permissions: 7 | contents: write 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout 13 | uses: actions/checkout@v4 14 | - name: set up uv 15 | uses: astral-sh/setup-uv@v6 16 | with: 17 | activate-environment: true 18 | python-version: 3.12 19 | - name: install dependencies 20 | run: uv sync --group docs 21 | - name: build and publish docs 22 | run: mkdocs gh-deploy --force 23 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | publish: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | id-token: write 10 | steps: 11 | - name: checkout 12 | uses: actions/checkout@v4 13 | - name: set up uv 14 | uses: astral-sh/setup-uv@v6 15 | with: 16 | activate-environment: true 17 | python-version: 3.12 18 | - name: install dependencies 19 | run: uv sync --no-dev 20 | - name: build 21 | run: uv build 22 | - name: publish 23 | run: uv publish 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | # VSCode 165 | .vscode/ 166 | 167 | # Version file 168 | src/tea_tasting/_version.txt 169 | 170 | # uv lockfile 171 | uv.lock 172 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | MD007: 2 | indent: 4 3 | 4 | MD013: false 5 | MD046: false 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Evgeny Ivanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tea-tasting: statistical analysis of A/B tests 2 | 3 | [![CI](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml) 4 | [![Docs](https://github.com/e10v/tea-tasting/actions/workflows/docs.yml/badge.svg)](https://tea-tasting.e10v.me/) 5 | [![Coverage](https://codecov.io/github/e10v/tea-tasting/coverage.svg?branch=main)](https://codecov.io/gh/e10v/tea-tasting) 6 | [![License](https://img.shields.io/github/license/e10v/tea-tasting)](https://github.com/e10v/tea-tasting/blob/main/LICENSE) 7 | [![Package Status](https://img.shields.io/pypi/status/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 8 | [![Version](https://img.shields.io/pypi/v/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 9 | [![PyPI Python Versions](https://img.shields.io/pypi/pyversions/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 10 | 11 | tea-tasting is a Python package for the statistical analysis of A/B tests featuring: 12 | 13 | - Student's t-test, Z-test, bootstrap, and quantile metrics out of the box. 14 | - Extensible API that lets you define and use statistical tests of your choice. 15 | - [Delta method](https://alexdeng.github.io/public/files/kdd2018-dm.pdf) for ratio metrics. 16 | - Variance reduction using [CUPED](https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf)/[CUPAC](https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/), which can be combined with the Delta method for ratio metrics. 
17 | - Confidence intervals for both absolute and percentage changes. 18 | - Checks for sample-ratio mismatches. 19 | - Power analysis. 20 | - Multiple hypothesis testing (family-wise error rate and false discovery rate). 21 | - Simulated experiments, including A/A tests. 22 | 23 | tea-tasting calculates statistics directly within data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). This approach eliminates the need to import granular data into a Python environment. 24 | 25 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. 26 | 27 | ## Installation 28 | 29 | ```bash 30 | uv pip install tea-tasting 31 | ``` 32 | 33 | ## Basic example 34 | 35 | ```pycon 36 | >>> import tea_tasting as tt 37 | 38 | >>> data = tt.make_users_data(seed=42) 39 | >>> experiment = tt.Experiment( 40 | ... sessions_per_user=tt.Mean("sessions"), 41 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 42 | ... orders_per_user=tt.Mean("orders"), 43 | ... revenue_per_user=tt.Mean("revenue"), 44 | ... ) 45 | >>> result = experiment.analyze(data) 46 | >>> result 47 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 48 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 49 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 50 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 51 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 52 | 53 | ``` 54 | 55 | Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). Additionally, see the guides on more specific topics: 56 | 57 | - [Data backends](https://tea-tasting.e10v.me/data-backends/). 58 | - [Power analysis](https://tea-tasting.e10v.me/power-analysis/). 59 | - [Multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/). 60 | - [Custom metrics](https://tea-tasting.e10v.me/custom-metrics/). 61 | - [Simulated experiments](https://tea-tasting.e10v.me/simulated-experiments/). 62 | 63 | ## Examples 64 | 65 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run in your local environment, or you can run them as WASM notebooks in the online playground. 66 | 67 | ### Run in a local environment 68 | 69 | To run the examples in your local environment, clone the repository and change the directory: 70 | 71 | ```bash 72 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting 73 | ``` 74 | 75 | Install marimo, tea-tasting, and other packages used in the examples: 76 | 77 | ```bash 78 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb] 79 | ``` 80 | 81 | Launch the notebook server: 82 | 83 | ```bash 84 | uv run marimo edit examples 85 | ``` 86 | 87 | Now you can choose and run the example notebooks. 88 | 89 | ### Run in the online playground 90 | 91 | To run the examples as WASM notebooks in the online playground, open the following links: 92 | 93 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true). 94 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true). 
95 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true). 96 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true). 97 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true). 98 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true). 99 | 100 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular: 101 | 102 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html). 103 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules). 104 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis. 105 | 106 | ## Package name 107 | 108 | The package name "tea-tasting" is a play on words that refers to two subjects: 109 | 110 | - [Lady tasting tea](https://en.wikipedia.org/wiki/Lady_tasting_tea) is a famous experiment devised by Ronald Fisher. In this experiment, Fisher developed the null hypothesis significance testing framework to analyze a lady's claim that she could discern whether the tea or the milk was added first to the cup. 111 | - "tea-tasting" phonetically resembles "t-testing", referencing Student's t-test, a statistical method developed by William Gosset.
112 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | tea-tasting.e10v.me 2 | -------------------------------------------------------------------------------- /docs/api/aggr.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.aggr 2 | -------------------------------------------------------------------------------- /docs/api/config.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.config 2 | -------------------------------------------------------------------------------- /docs/api/datasets.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.datasets 2 | options: 3 | members_order: source 4 | -------------------------------------------------------------------------------- /docs/api/experiment.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.experiment 2 | -------------------------------------------------------------------------------- /docs/api/index.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting 2 | -------------------------------------------------------------------------------- /docs/api/metrics/base.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.base 2 | -------------------------------------------------------------------------------- /docs/api/metrics/index.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics 2 | -------------------------------------------------------------------------------- /docs/api/metrics/mean.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.mean 2 | -------------------------------------------------------------------------------- /docs/api/metrics/proportion.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.proportion 2 | -------------------------------------------------------------------------------- /docs/api/metrics/resampling.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.resampling 2 | -------------------------------------------------------------------------------- /docs/api/multiplicity.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.multiplicity 2 | -------------------------------------------------------------------------------- /docs/api/utils.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.utils 2 | options: 3 | group_by_category: false 4 | members_order: source 5 | -------------------------------------------------------------------------------- /docs/assets/tea-cup-black.svg: -------------------------------------------------------------------------------- [SVG markup omitted from this export] -------------------------------------------------------------------------------- /docs/assets/tea-cup-white-on-black.svg: -------------------------------------------------------------------------------- [SVG markup omitted from this export] --------------------------------------------------------------------------------
/docs/assets/tea-cup-white.svg: -------------------------------------------------------------------------------- [SVG markup omitted from this export] -------------------------------------------------------------------------------- /docs/custom-metrics.md: -------------------------------------------------------------------------------- 1 | # Custom metrics 2 | 3 | ## Intro 4 | 5 | tea-tasting supports Student's t-test, Z-test, and [some other statistical tests](api/metrics/index.md) out of the box. However, you might want to analyze an experiment using other statistical criteria. In this case, you can define a custom metric with a statistical test of your choice. 6 | 7 | In tea-tasting, there are two types of metrics: 8 | 9 | - Metrics that require only aggregated statistics for the analysis. 10 | - Metrics that require granular data for the analysis. 11 | 12 | This guide explains how to define a custom metric for each type. 13 | 14 | First, let's import all the required modules and prepare the data: 15 | 16 | ```pycon 17 | >>> from typing import Literal, NamedTuple 18 | >>> import numpy as np 19 | >>> import pyarrow as pa 20 | >>> import pyarrow.compute as pc 21 | >>> import scipy.stats 22 | >>> import tea_tasting as tt 23 | >>> import tea_tasting.aggr 24 | >>> import tea_tasting.config 25 | >>> import tea_tasting.metrics 26 | >>> import tea_tasting.utils 27 | 28 | >>> data = tt.make_users_data(seed=42) 29 | >>> data = data.append_column( 30 | ... "has_order", 31 | ... pc.greater(data["orders"], 0).cast(pa.int64()), 32 | ... ) 33 | >>> data 34 | pyarrow.Table 35 | user: int64 36 | variant: int64 37 | sessions: int64 38 | orders: int64 39 | revenue: double 40 | has_order: int64 41 | ---- 42 | user: [[0,1,2,3,4,...,3995,3996,3997,3998,3999]] 43 | variant: [[1,0,1,1,0,...,0,0,0,0,0]] 44 | sessions: [[2,2,2,2,1,...,2,2,3,1,5]] 45 | orders: [[1,1,1,1,1,...,0,0,0,0,2]] 46 | revenue: [[9.17,6.43,7.94,15.93,7.14,...,0,0,0,0,17.16]] 47 | has_order: [[1,1,1,1,1,...,0,0,0,0,1]] 48 | 49 | ``` 50 | 51 | This guide uses PyArrow as the data backend, but the approach is valid for other backends as well. See the [guide on data backends](data-backends.md) for more details. 52 | 53 | ## Metrics based on aggregated statistics 54 | 55 | Let's define a metric that performs a proportion test, [G-test](https://en.wikipedia.org/wiki/G-test) or [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test), on a binary column (with values `0` or `1`). 56 | 57 | The first step is defining a result class. It should be a named tuple or a dictionary. 58 | 59 | ```pycon 60 | >>> class ProportionResult(NamedTuple): 61 | ... control: float 62 | ... treatment: float 63 | ... effect_size: float 64 | ... rel_effect_size: float 65 | ... pvalue: float 66 | ... statistic: float 67 | ... 68 | 69 | ``` 70 | 71 | The second step is defining the metric class itself. A metric based on aggregated statistics should be a subclass of [`MetricBaseAggregated`](api/metrics/base.md#tea_tasting.metrics.base.MetricBaseAggregated). `MetricBaseAggregated` is a generic class with the result class as a type variable. 72 | 73 | The metric should have the following methods and properties defined: 74 | 75 | - Method `__init__` checks and saves metric parameters. 76 | - Property `aggr_cols` returns columns to be aggregated for the analysis, for each type of statistic. 77 | - Method `analyze_aggregates` analyzes the metric using aggregated statistics.
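In outline, the three members fit together as in the following structural sketch (`MyMetric` and `MyResult` are illustrative names only; the full, working implementation is defined and discussed next):

```python
from typing import NamedTuple

import tea_tasting.aggr
import tea_tasting.metrics


class MyResult(NamedTuple):  # illustrative result class
    control: float
    treatment: float


class MyMetric(tea_tasting.metrics.MetricBaseAggregated[MyResult]):  # illustrative metric
    def __init__(self, column: str) -> None:
        # Check and save the metric parameters.
        self.column = column

    @property
    def aggr_cols(self) -> tea_tasting.metrics.AggrCols:
        # Declare the aggregated statistics tea-tasting should query per variant.
        return tea_tasting.metrics.AggrCols(has_count=True, mean_cols=(self.column,))

    def analyze_aggregates(
        self,
        control: tea_tasting.aggr.Aggregates,
        treatment: tea_tasting.aggr.Aggregates,
    ) -> MyResult:
        # Perform the statistical test using only the aggregated statistics.
        return MyResult(
            control=control.mean(self.column),
            treatment=treatment.mean(self.column),
        )
```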
78 | 79 | Let's define the metric and discuss each method in detail: 80 | 81 | ```pycon 82 | >>> class Proportion(tea_tasting.metrics.MetricBaseAggregated[ProportionResult]): 83 | ... def __init__( 84 | ... self, 85 | ... column: str, 86 | ... *, 87 | ... correction: bool = True, 88 | ... method: Literal["g-test", "pearson"] = "g-test", 89 | ... ) -> None: 90 | ... self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 91 | ... self.correction = tea_tasting.utils.auto_check(correction, "correction") 92 | ... self.method = tea_tasting.utils.check_scalar( 93 | ... method, "method", typ=str, in_={"g-test", "pearson"}) 94 | ... @property 95 | ... def aggr_cols(self) -> tea_tasting.metrics.AggrCols: 96 | ... return tea_tasting.metrics.AggrCols( 97 | ... has_count=True, 98 | ... mean_cols=(self.column,), 99 | ... ) 100 | ... def analyze_aggregates( 101 | ... self, 102 | ... control: tea_tasting.aggr.Aggregates, 103 | ... treatment: tea_tasting.aggr.Aggregates, 104 | ... ) -> ProportionResult: 105 | ... observed = np.empty(shape=(2, 2), dtype=np.int64) 106 | ... observed[0, 0] = round(control.count() * control.mean(self.column)) 107 | ... observed[1, 0] = control.count() - observed[0, 0] 108 | ... observed[0, 1] = round(treatment.count() * treatment.mean(self.column)) 109 | ... observed[1, 1] = treatment.count() - observed[0, 1] 110 | ... res = scipy.stats.chi2_contingency( 111 | ... observed=observed, 112 | ... correction=self.correction, 113 | ... lambda_=int(self.method == "pearson"), 114 | ... ) 115 | ... return ProportionResult( 116 | ... control=control.mean(self.column), 117 | ... treatment=treatment.mean(self.column), 118 | ... effect_size=treatment.mean(self.column) - control.mean(self.column), 119 | ... rel_effect_size=treatment.mean(self.column)/control.mean(self.column) - 1, 120 | ... pvalue=res.pvalue, 121 | ... statistic=res.statistic, 122 | ... ) 123 | ... 124 | 125 | ``` 126 | 127 | Method `__init__` saves metric parameters to be used in the analysis. You can use utility functions [`check_scalar`](api/utils.md#tea_tasting.utils.check_scalar) and [`auto_check`](api/utils.md#tea_tasting.utils.auto_check) to check parameter values. 128 | 129 | Property `aggr_cols` returns an instance of [`AggrCols`](api/metrics/base.md#tea_tasting.metrics.base.AggrCols). Analysis of proportion requires the number of rows (`has_count=True`) and the average value for the column of interest (`mean_cols=(self.column,)`) for each variant. 130 | 131 | Method `analyze_aggregates` accepts two parameters: `control` and `treatment` data as instances of class [`Aggregates`](api/aggr.md#tea_tasting.aggr.Aggregates). They contain values for statistics and columns specified in `aggr_cols`. 132 | 133 | Method `analyze_aggregates` returns an instance of `ProportionResult`, defined earlier, with the analysis result. 134 | 135 | Now we can analyze the proportion of users who created at least one order during the experiment. For comparison, let's also add a metric that performs a Z-test on the same column. 136 | 137 | ```pycon 138 | >>> experiment_prop = tt.Experiment( 139 | ... prop_users_with_orders=Proportion("has_order"), 140 | ... mean_users_with_orders=tt.Mean("has_order", use_t=False), 141 | ...
) 142 | >>> experiment_prop.analyze(data) 143 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 144 | prop_users_with_orders 0.345 0.384 11% [-, -] 0.0117 145 | mean_users_with_orders 0.345 0.384 11% [2.5%, 21%] 0.0106 146 | 147 | ``` 148 | 149 | ## Metrics based on granular data 150 | 151 | Now let's define a metric that performs the Mann-Whitney U test. While it's possible to use the aggregated sum of ranks for the test, this example uses granular data for analysis. 152 | 153 | The result class: 154 | 155 | ```pycon 156 | >>> class MannWhitneyUResult(NamedTuple): 157 | ... pvalue: float 158 | ... statistic: float 159 | ... 160 | 161 | ``` 162 | 163 | A metric that analyzes granular data should be a subclass of [`MetricBaseGranular`](api/metrics/base.md#tea_tasting.metrics.base.MetricBaseGranular). `MetricBaseGranular` is a generic class with the result class as a type variable. 164 | 165 | The metric should have the following methods and properties defined: 166 | 167 | - Method `__init__` checks and saves metric parameters. 168 | - Property `cols` returns columns to be fetched for an analysis. 169 | - Method `analyze_granular` analyzes the metric using granular data. 170 | 171 | ```pycon 172 | >>> class MannWhitneyU(tea_tasting.metrics.MetricBaseGranular[MannWhitneyUResult]): 173 | ... def __init__( 174 | ... self, 175 | ... column: str, 176 | ... *, 177 | ... correction: bool = True, 178 | ... alternative: Literal["two-sided", "less", "greater"] | None = None, 179 | ... ) -> None: 180 | ... self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 181 | ... self.correction = tea_tasting.utils.auto_check(correction, "correction") 182 | ... self.alternative = ( 183 | ... tea_tasting.utils.auto_check(alternative, "alternative") 184 | ... if alternative is not None 185 | ... else tea_tasting.config.get_config("alternative") 186 | ... ) 187 | ... @property 188 | ... def cols(self) -> tuple[str]: 189 | ... return (self.column,) 190 | ... def analyze_granular( 191 | ... self, 192 | ... control: pa.Table, 193 | ... treatment: pa.Table, 194 | ... ) -> MannWhitneyUResult: 195 | ... res = scipy.stats.mannwhitneyu( 196 | ... treatment[self.column].combine_chunks().to_numpy(zero_copy_only=False), 197 | ... control[self.column].combine_chunks().to_numpy(zero_copy_only=False), 198 | ... use_continuity=self.correction, 199 | ... alternative=self.alternative, 200 | ... ) 201 | ... return MannWhitneyUResult( 202 | ... pvalue=res.pvalue, 203 | ... statistic=res.statistic, 204 | ... ) 205 | ... 206 | 207 | ``` 208 | 209 | Property `cols` should return a sequence of strings. 210 | 211 | Method `analyze_granular` accepts two parameters: control and treatment data as PyArrow Tables. Even with a [data backend](data-backends.md) different from PyArrow, tea-tasting retrieves the data and transforms it into a PyArrow Table. 212 | 213 | Method `analyze_granular` returns an instance of `MannWhitneyUResult`, defined earlier, with the analysis result. 214 | 215 | Now we can perform the Mann-Whitney U test: 216 | 217 | ```pycon 218 | >>> experiment_mwu = tt.Experiment( 219 | ... mwu_orders=MannWhitneyU("orders"), 220 | ... mwu_revenue=MannWhitneyU("revenue"), 221 | ...
) 222 | >>> result_mwu = experiment_mwu.analyze(data) 223 | >>> result_mwu.with_keys(("metric", "pvalue", "statistic")) 224 | metric pvalue statistic 225 | mwu_orders 0.0263 2069092 226 | mwu_revenue 0.0300 2068060 227 | 228 | ``` 229 | 230 | ## Analyzing two types of metrics together 231 | 232 | It's also possible to analyze two types of metrics in one experiment: 233 | 234 | ```pycon 235 | >>> experiment = tt.Experiment( 236 | ... prop_users_with_orders=Proportion("has_order"), 237 | ... mean_users_with_orders=tt.Mean("has_order"), 238 | ... mwu_orders=MannWhitneyU("orders"), 239 | ... mwu_revenue=MannWhitneyU("revenue"), 240 | ... ) 241 | >>> experiment.analyze(data) 242 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 243 | prop_users_with_orders 0.345 0.384 11% [-, -] 0.0117 244 | mean_users_with_orders 0.345 0.384 11% [2.5%, 21%] 0.0106 245 | mwu_orders - - - [-, -] 0.0263 246 | mwu_revenue - - - [-, -] 0.0300 247 | 248 | ``` 249 | 250 | In this case, tea-tasting performs two queries on the experimental data: 251 | 252 | - With aggregated statistics required for analysis of metrics of type `MetricBaseAggregated`. 253 | - With detailed data with columns required for analysis of metrics of type `MetricBaseGranular`. 254 | 255 | ## Recommendations 256 | 257 | Follow these recommendations when defining custom metrics: 258 | 259 | - Use parameter and attribute names consistent with the ones that are already defined in tea-tasting. For example, use `pvalue` instead of `p_value` or `correction` instead of `use_continuity`. 260 | - End confidence interval boundary names with `"_ci_lower"` and `"_ci_upper"`. 261 | - During initialization, save parameter values in metric attributes using the same names. For example, use `self.correction = correction` instead of `self.use_continuity = correction`. 262 | - Use global settings as default values for standard parameters, such as `alternative` or `confidence_level`. See the [reference](api/config.md#tea_tasting.config.config_context) for the full list of standard parameters. You can also define and use your own global parameters. 263 | -------------------------------------------------------------------------------- /docs/data-backends.md: -------------------------------------------------------------------------------- 1 | # Data backends 2 | 3 | ## Intro 4 | 5 | tea-tasting supports a wide range of data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). Ibis is a DataFrame API to various data backends. 6 | 7 | Many statistical tests, such as the Student's t-test or the Z-test, require only aggregated data for analysis. For these tests, tea-tasting retrieves only aggregated statistics like mean and variance instead of downloading all detailed data. 8 | 9 | For example, if the raw experimental data are stored in ClickHouse, it's faster and more efficient to calculate counts, averages, variances, and covariances directly in ClickHouse rather than fetching granular data and performing aggregations in a Python environment. 10 | 11 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. Narwhals is a compatibility layer between dataframe libraries. 12 | 13 | This guide: 14 | 15 | - Shows how to use tea-tasting with a data backend of your choice for the analysis of an experiment. 
16 | - Explains some internals of how tea-tasting uses Ibis to work with data backends. 17 | 18 | ## Demo database 19 | 20 | /// admonition | Note 21 | 22 | This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. Install these packages in addition to tea-tasting to reproduce the examples: 23 | 24 | ```bash 25 | uv pip install ibis-framework[duckdb] polars 26 | ``` 27 | 28 | /// 29 | 30 | First, let's prepare a demo database: 31 | 32 | ```pycon 33 | >>> import ibis 34 | >>> import polars as pl 35 | >>> import tea_tasting as tt 36 | 37 | >>> users_data = tt.make_users_data(seed=42) 38 | >>> con = ibis.connect("duckdb://") 39 | >>> con.create_table("users_data", users_data) 40 | DatabaseTable: memory.main.users_data 41 | user int64 42 | variant int64 43 | sessions int64 44 | orders int64 45 | revenue float64 46 | 47 | ``` 48 | 49 | In the example above: 50 | 51 | - Function `tt.make_users_data` returns a PyArrow Table with example experimental data. 52 | - Function `ibis.duckdb.connect` creates a DuckDB in-process database using Ibis API. 53 | - Method `con.create_table` creates and populates a table in the database based on the PyArrow Table. 54 | 55 | See the [Ibis documentation on how to create connections](https://ibis-project.org/reference/connection) to other data backends. 56 | 57 | ## Querying experimental data 58 | 59 | Method `con.create_table` in the example above returns an Ibis Table which already can be used in the analysis of the experiment. But let's see how to use an SQL query to create an Ibis Table: 60 | 61 | ```pycon 62 | >>> data = con.sql("select * from users_data") 63 | >>> data 64 | SQLQueryResult 65 | query: 66 | select * from users_data 67 | schema: 68 | user int64 69 | variant int64 70 | sessions int64 71 | orders int64 72 | revenue float64 73 | 74 | ``` 75 | 76 | It's a very simple query. In the real world, you might need to use joins, aggregations, and CTEs to get the data. You can define any SQL query supported by your data backend and use it to create Ibis Table. 77 | 78 | Keep in mind that tea-tasting assumes that: 79 | 80 | - Data is grouped by randomization units, such as individual users. 81 | - There is a column indicating the variant of the A/B test (typically labeled as A, B, etc.). 82 | - All necessary columns for metric calculations (like the number of orders, revenue, etc.) are included in the table. 83 | 84 | Ibis Table is a lazy object. It doesn't fetch the data when created. You can use Ibis DataFrame API to query the table and fetch the result: 85 | 86 | ```pycon 87 | >>> ibis.options.interactive = True 88 | >>> print(data.head(5)) 89 | ┏━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ 90 | ┃ user ┃ variant ┃ sessions ┃ orders ┃ revenue ┃ 91 | ┡━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ 92 | │ int64 │ int64 │ int64 │ int64 │ float64 │ 93 | ├───────┼─────────┼──────────┼────────┼─────────┤ 94 | │ 0 │ 1 │ 2 │ 1 │ 9.17 │ 95 | │ 1 │ 0 │ 2 │ 1 │ 6.43 │ 96 | │ 2 │ 1 │ 2 │ 1 │ 7.94 │ 97 | │ 3 │ 1 │ 2 │ 1 │ 15.93 │ 98 | │ 4 │ 0 │ 1 │ 1 │ 7.14 │ 99 | └───────┴─────────┴──────────┴────────┴─────────┘ 100 | 101 | >>> ibis.options.interactive = False 102 | 103 | ``` 104 | 105 | ## Ibis example 106 | 107 | To better understand what Ibis does, let's consider the example with grouping and aggregation by variants: 108 | 109 | ```pycon 110 | >>> aggr_data = data.group_by("variant").aggregate( 111 | ... 
sessions_per_user=data.sessions.mean(), 112 | ... orders_per_session=data.orders.mean() / data.sessions.mean(), 113 | ... orders_per_user=data.orders.mean(), 114 | ... revenue_per_user=data.revenue.mean(), 115 | ... ) 116 | >>> aggr_data 117 | r0 := SQLQueryResult 118 | query: 119 | select * from users_data 120 | schema: 121 | user int64 122 | variant int64 123 | sessions int64 124 | orders int64 125 | revenue float64 126 | 127 | Aggregate[r0] 128 | groups: 129 | variant: r0.variant 130 | metrics: 131 | sessions_per_user: Mean(r0.sessions) 132 | orders_per_session: Mean(r0.orders) / Mean(r0.sessions) 133 | orders_per_user: Mean(r0.orders) 134 | revenue_per_user: Mean(r0.revenue) 135 | 136 | ``` 137 | 138 | `aggr_data` is another Ibis Table defined as a query over the previously defined `data`. Let's fetch the result: 139 | 140 | ```pycon 141 | >>> ibis.options.interactive = True 142 | >>> print(aggr_data) # doctest: +SKIP 143 | ┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓ 144 | ┃ variant ┃ sessions_per_user ┃ orders_per_session ┃ orders_per_user ┃ revenue_per_user ┃ 145 | ┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩ 146 | │ int64 │ float64 │ float64 │ float64 │ float64 │ 147 | ├─────────┼───────────────────┼────────────────────┼─────────────────┼──────────────────┤ 148 | │ 0 │ 1.996045 │ 0.265726 │ 0.530400 │ 5.241028 │ 149 | │ 1 │ 1.982802 │ 0.289031 │ 0.573091 │ 5.730111 │ 150 | └─────────┴───────────────────┴────────────────────┴─────────────────┴──────────────────┘ 151 | 152 | >>> ibis.options.interactive = False 153 | 154 | ``` 155 | 156 | Internally, Ibis compiles a Table to an SQL query supported by the backend: 157 | 158 | ```pycon 159 | >>> print(aggr_data.compile(pretty=True)) 160 | SELECT 161 | "t0"."variant", 162 | AVG("t0"."sessions") AS "sessions_per_user", 163 | AVG("t0"."orders") / AVG("t0"."sessions") AS "orders_per_session", 164 | AVG("t0"."orders") AS "orders_per_user", 165 | AVG("t0"."revenue") AS "revenue_per_user" 166 | FROM ( 167 | SELECT 168 | * 169 | FROM users_data 170 | ) AS "t0" 171 | GROUP BY 172 | 1 173 | 174 | ``` 175 | 176 | See [Ibis documentation](https://ibis-project.org/tutorials/getting_started) for more details. 177 | 178 | ## Experiment analysis 179 | 180 | The example above shows how to query the metric averages. But for statistical inference, it's not enough. For example, Student's t-test and Z-test also require number of rows and variance. Additionally, analysis of ratio metrics and variance reduction with CUPED requires covariances. 181 | 182 | Querying all the required statistics manually can be a daunting and error-prone task. But don't worry—tea-tasting does this work for you. You just need to specify the metrics: 183 | 184 | ```pycon 185 | >>> experiment = tt.Experiment( 186 | ... sessions_per_user=tt.Mean("sessions"), 187 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 188 | ... orders_per_user=tt.Mean("orders"), 189 | ... revenue_per_user=tt.Mean("revenue"), 190 | ... 
) 191 | >>> result = experiment.analyze(data) 192 | >>> result 193 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 194 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 195 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 196 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 197 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 198 | 199 | ``` 200 | 201 | In the example above, tea-tasting fetches all the required statistics with a single query and then uses them to analyze the experiment. 202 | 203 | Some statistical methods, like bootstrap, require granular data for analysis. In this case, tea-tasting fetches the detailed data as well. 204 | 205 | ## Example with CUPED 206 | 207 | An example of a slightly more complicated analysis using variance reduction with CUPED: 208 | 209 | ```pycon 210 | >>> users_data_cuped = tt.make_users_data(seed=42, covariates=True) 211 | >>> con.create_table("users_data_cuped", users_data_cuped) 212 | DatabaseTable: memory.main.users_data_cuped 213 | user int64 214 | variant int64 215 | sessions int64 216 | orders int64 217 | revenue float64 218 | sessions_covariate int64 219 | orders_covariate int64 220 | revenue_covariate float64 221 | 222 | >>> data_cuped = con.sql("select * from users_data_cuped") 223 | >>> experiment_cuped = tt.Experiment( 224 | ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 225 | ... orders_per_session=tt.RatioOfMeans( 226 | ... numer="orders", 227 | ... denom="sessions", 228 | ... numer_covariate="orders_covariate", 229 | ... denom_covariate="sessions_covariate", 230 | ... ), 231 | ... orders_per_user=tt.Mean("orders", "orders_covariate"), 232 | ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 233 | ... ) 234 | >>> result_cuped = experiment_cuped.analyze(data_cuped) 235 | >>> result_cuped 236 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 237 | sessions_per_user 2.00 1.98 -0.68% [-3.2%, 1.9%] 0.603 238 | orders_per_session 0.262 0.293 12% [4.2%, 21%] 0.00229 239 | orders_per_user 0.523 0.581 11% [2.9%, 20%] 0.00733 240 | revenue_per_user 5.12 5.85 14% [3.8%, 26%] 0.00674 241 | 242 | ``` 243 | 244 | ## Polars example 245 | 246 | Here’s an example of how to analyze data using a Polars DataFrame: 247 | 248 | ```pycon 249 | >>> data_polars = pl.from_arrow(users_data) 250 | >>> experiment.analyze(data_polars) 251 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 252 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 253 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 254 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 255 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 256 | 257 | ``` 258 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # tea-tasting: statistical analysis of A/B tests 2 | 3 | [![CI](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml) 4 | [![Docs](https://github.com/e10v/tea-tasting/actions/workflows/docs.yml/badge.svg)](https://tea-tasting.e10v.me/) 5 | [![Coverage](https://codecov.io/github/e10v/tea-tasting/coverage.svg?branch=main)](https://codecov.io/gh/e10v/tea-tasting) 6 | [![License](https://img.shields.io/github/license/e10v/tea-tasting)](https://github.com/e10v/tea-tasting/blob/main/LICENSE) 7 | [![Package 
Status](https://img.shields.io/pypi/status/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 8 | [![Version](https://img.shields.io/pypi/v/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 9 | [![PyPI Python Versions](https://img.shields.io/pypi/pyversions/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 10 | 11 | tea-tasting is a Python package for the statistical analysis of A/B tests featuring: 12 | 13 | - Student's t-test, Z-test, bootstrap, and quantile metrics out of the box. 14 | - Extensible API that lets you define and use statistical tests of your choice. 15 | - [Delta method](https://alexdeng.github.io/public/files/kdd2018-dm.pdf) for ratio metrics. 16 | - Variance reduction using [CUPED](https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf)/[CUPAC](https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/), which can be combined with the Delta method for ratio metrics. 17 | - Confidence intervals for both absolute and percentage changes. 18 | - Checks for sample-ratio mismatches. 19 | - Power analysis. 20 | - Multiple hypothesis testing (family-wise error rate and false discovery rate). 21 | - Simulated experiments, including A/A tests. 22 | 23 | tea-tasting calculates statistics directly within data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). This approach eliminates the need to import granular data into a Python environment. 24 | 25 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. 26 | 27 | ## Installation 28 | 29 | ```bash 30 | uv pip install tea-tasting 31 | ``` 32 | 33 | ## Basic example 34 | 35 | ```pycon 36 | >>> import tea_tasting as tt 37 | 38 | >>> data = tt.make_users_data(seed=42) 39 | >>> experiment = tt.Experiment( 40 | ... sessions_per_user=tt.Mean("sessions"), 41 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 42 | ... orders_per_user=tt.Mean("orders"), 43 | ... revenue_per_user=tt.Mean("revenue"), 44 | ... ) 45 | >>> result = experiment.analyze(data) 46 | >>> result 47 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 48 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 49 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 50 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 51 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 52 | 53 | ``` 54 | 55 | Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). Additionally, see the guides on more specific topics: 56 | 57 | - [Data backends](https://tea-tasting.e10v.me/data-backends/). 58 | - [Power analysis](https://tea-tasting.e10v.me/power-analysis/). 59 | - [Multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/). 60 | - [Custom metrics](https://tea-tasting.e10v.me/custom-metrics/). 61 | - [Simulated experiments](https://tea-tasting.e10v.me/simulated-experiments/). 62 | 63 | ## Examples 64 | 65 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run in your local environment, or you can run them as WASM notebooks in the online playground. 
66 | 67 | ### Run in a local environment 68 | 69 | To run the examples in your local environment, clone the repository and change the directory: 70 | 71 | ```bash 72 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting 73 | ``` 74 | 75 | Install marimo, tea-tasting, and other packages used in the examples: 76 | 77 | ```bash 78 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb] 79 | ``` 80 | 81 | Launch the notebook server: 82 | 83 | ```bash 84 | uv run marimo edit examples 85 | ``` 86 | 87 | Now you can choose and run the example notebooks. 88 | 89 | ### Run in the online playground 90 | 91 | To run the examples as WASM notebooks in the online playground, open the following links: 92 | 93 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true). 94 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true). 95 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true). 96 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true). 97 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true). 98 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true). 99 | 100 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular: 101 | 102 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html). 103 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules). 104 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis. 105 | 106 | ## Package name 107 | 108 | The package name "tea-tasting" is a play on words that refers to two subjects: 109 | 110 | - [Lady tasting tea](https://en.wikipedia.org/wiki/Lady_tasting_tea) is a famous experiment devised by Ronald Fisher. In this experiment, Fisher developed the null hypothesis significance testing framework to analyze a lady's claim that she could discern whether the tea or the milk was added first to the cup. 111 | - "tea-tasting" phonetically resembles "t-testing", referencing Student's t-test, a statistical method developed by William Gosset.
112 | -------------------------------------------------------------------------------- /docs/javascripts/override-copy.js: -------------------------------------------------------------------------------- 1 | function attachCustomCopy() { 2 | document.querySelectorAll("button.md-clipboard").forEach((button) => { 3 | button.removeEventListener("click", handleCopy); 4 | }); 5 | 6 | document.querySelectorAll("button.md-clipboard").forEach((button) => { 7 | button.addEventListener("click", handleCopy); 8 | }); 9 | } 10 | 11 | function handleCopy(event) { 12 | event.preventDefault(); 13 | const button = event.currentTarget; 14 | const codeBlock = document.querySelector(button.getAttribute('data-clipboard-target')); 15 | const codeBlockClone = codeBlock.cloneNode(true); 16 | codeBlockClone.querySelectorAll('.go').forEach(span => { 17 | const prev = span.previousSibling; 18 | if (prev && prev.nodeType === Node.TEXT_NODE) { 19 | prev.textContent = prev.textContent.replace(/[\r\n]+$/, ''); 20 | } 21 | }); 22 | codeBlockClone.querySelectorAll('.gp, .go').forEach(span => span.remove()); 23 | navigator.clipboard.writeText(codeBlockClone.textContent || codeBlockClone.innerText); 24 | } 25 | 26 | document$.subscribe(() => { 27 | attachCustomCopy(); 28 | }); 29 | -------------------------------------------------------------------------------- /docs/multiple-testing.md: -------------------------------------------------------------------------------- 1 | # Multiple testing 2 | 3 | ## Multiple hypothesis testing problem 4 | 5 | /// admonition | Note 6 | 7 | This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. Install Polars in addition to tea-tasting to reproduce the examples: 8 | 9 | ```bash 10 | uv pip install polars 11 | ``` 12 | 13 | /// 14 | 15 | The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test. 16 | 17 | tea-tasting provides the following methods for multiple testing correction: 18 | 19 | - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate) (FDR) controlling procedures: 20 | - Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses. 21 | - Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses. 22 | - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate) (FWER) controlling procedures: 23 | - Hochberg's step-up procedure, assuming non-negative correlation between hypotheses. 24 | - Holm's step-down procedure, assuming arbitrary dependence between hypotheses. 25 | 26 | As an example, consider an experiment with three variants, a control and two treatments: 27 | 28 | ```pycon 29 | >>> import polars as pl 30 | >>> import tea_tasting as tt 31 | 32 | >>> data = pl.concat(( 33 | ... tt.make_users_data( 34 | ... seed=42, 35 | ... orders_uplift=0.10, 36 | ... revenue_uplift=0.15, 37 | ... return_type="polars", 38 | ... ), 39 | ... tt.make_users_data( 40 | ... seed=21, 41 | ... orders_uplift=0.15, 42 | ... revenue_uplift=0.20, 43 | ... return_type="polars", 44 | ... ) 45 | ... .filter(pl.col("variant").eq(1)) 46 | ... .with_columns(variant=pl.lit(2, pl.Int64)), 47 | ... 
)) 48 | >>> data 49 | shape: (6_046, 5) 50 | ┌──────┬─────────┬──────────┬────────┬─────────┐ 51 | │ user ┆ variant ┆ sessions ┆ orders ┆ revenue │ 52 | │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ 53 | │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ f64 │ 54 | ╞══════╪═════════╪══════════╪════════╪═════════╡ 55 | │ 0 ┆ 1 ┆ 2 ┆ 1 ┆ 9.58 │ 56 | │ 1 ┆ 0 ┆ 2 ┆ 1 ┆ 6.43 │ 57 | │ 2 ┆ 1 ┆ 2 ┆ 1 ┆ 8.3 │ 58 | │ 3 ┆ 1 ┆ 2 ┆ 1 ┆ 16.65 │ 59 | │ 4 ┆ 0 ┆ 1 ┆ 1 ┆ 7.14 │ 60 | │ … ┆ … ┆ … ┆ … ┆ … │ 61 | │ 3989 ┆ 2 ┆ 4 ┆ 4 ┆ 34.93 │ 62 | │ 3991 ┆ 2 ┆ 1 ┆ 0 ┆ 0.0 │ 63 | │ 3992 ┆ 2 ┆ 3 ┆ 3 ┆ 27.96 │ 64 | │ 3994 ┆ 2 ┆ 2 ┆ 1 ┆ 17.22 │ 65 | │ 3998 ┆ 2 ┆ 3 ┆ 0 ┆ 0.0 │ 66 | └──────┴─────────┴──────────┴────────┴─────────┘ 67 | 68 | ``` 69 | 70 | Let's calculate the experiment results: 71 | 72 | ```pycon 73 | >>> experiment = tt.Experiment( 74 | ... sessions_per_user=tt.Mean("sessions"), 75 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 76 | ... orders_per_user=tt.Mean("orders"), 77 | ... revenue_per_user=tt.Mean("revenue"), 78 | ... ) 79 | >>> results = experiment.analyze(data, control=0, all_variants=True) 80 | >>> results 81 | variants metric control treatment rel_effect_size rel_effect_size_ci pvalue 82 | (0, 1) sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 83 | (0, 1) orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 84 | (0, 1) orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 85 | (0, 1) revenue_per_user 5.24 5.99 14% [2.1%, 28%] 0.0211 86 | (0, 2) sessions_per_user 2.00 2.02 0.98% [-2.1%, 4.1%] 0.532 87 | (0, 2) orders_per_session 0.266 0.295 11% [1.2%, 22%] 0.0273 88 | (0, 2) orders_per_user 0.530 0.594 12% [1.7%, 23%] 0.0213 89 | (0, 2) revenue_per_user 5.24 6.25 19% [6.6%, 33%] 0.00218 90 | 91 | ``` 92 | 93 | Suppose only the two metrics `orders_per_user` and `revenue_per_user` are considered as success metrics, while the other two metrics `sessions_per_user` and `orders_per_session` are second-order diagnostic metrics. 94 | 95 | ```pycon 96 | >>> metrics = {"orders_per_user", "revenue_per_user"} 97 | 98 | ``` 99 | 100 | With two treatment variants and two success metrics, there are four hypotheses in total, which increases the probability of false positives (also called "false discoveries"). It's recommended to adjust the p-values or the significance level (alpha) in this case. Let's explore the correction methods provided by tea-tasting. 101 | 102 | ## False discovery rate 103 | 104 | False discovery rate (FDR) is the expected value of the proportion of false discoveries among the discoveries (rejections of the null hypothesis). To control for FDR, use the [`adjust_fdr`](api/multiplicity.md#tea_tasting.multiplicity.adjust_fdr) method: 105 | 106 | ```pycon 107 | >>> adjusted_results_fdr = tt.adjust_fdr(results, metrics) 108 | >>> adjusted_results_fdr 109 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 110 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118 111 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0284 112 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0284 113 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00872 114 | 115 | ``` 116 | 117 | The method adjusts p-values and saves them as `pvalue_adj`. Compare these values to the desired significance level alpha to determine if the null hypotheses can be rejected. 118 | 119 | The method also adjusts the significance level alpha and saves it as `alpha_adj`. 
Compare the unadjusted p-values (`pvalue`) to `alpha_adj` to determine if the null hypotheses can be rejected: 120 | 121 | ```pycon 122 | >>> adjusted_results_fdr.with_keys(( 123 | ... "comparison", 124 | ... "metric", 125 | ... "control", 126 | ... "treatment", 127 | ... "rel_effect_size", 128 | ... "pvalue", 129 | ... "alpha_adj", 130 | ... )) 131 | comparison metric control treatment rel_effect_size pvalue alpha_adj 132 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.0500 133 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0375 134 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0375 135 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.0375 136 | 137 | ``` 138 | 139 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs the Benjamini-Hochberg procedure. To perform the Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`: 140 | 141 | ```pycon 142 | >>> tt.adjust_fdr(results, metrics, arbitrary_dependence=True) 143 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 144 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.245 145 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0592 146 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0592 147 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.0182 148 | 149 | ``` 150 | 151 | ## Family-wise error rate 152 | 153 | Family-wise error rate (FWER) is the probability of making at least one type I error. To control for FWER, use the [`adjust_fwer`](api/multiplicity.md#tea_tasting.multiplicity.adjust_fwer) method: 154 | 155 | ```pycon 156 | >>> tt.adjust_fwer(results, metrics) 157 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 158 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118 159 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0422 160 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0422 161 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00869 162 | 163 | ``` 164 | 165 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs Hochberg's step-up procedure with the Šidák correction, which is slightly more powerful than the Bonferroni correction. 166 | 167 | To perform Holm's step-down procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`. In this case, it's recommended to use the Bonferroni correction, since the Šidák correction assumes non-negative correlation between hypotheses: 168 | 169 | ```pycon 170 | >>> tt.adjust_fwer( 171 | ... results, 172 | ... metrics, 173 | ... arbitrary_dependence=True, 174 | ... method="bonferroni", 175 | ... ) 176 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 177 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118 178 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0634 179 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0634 180 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00872 181 | 182 | ``` 183 | 184 | ## Other inputs 185 | 186 | In the examples above, the methods `adjust_fdr` and `adjust_fwer` received results from a *single experiment* with *more than two variants*.
They can also accept the results from *multiple experiments* with *two variants* in each: 187 | 188 | ```pycon 189 | >>> data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15) 190 | >>> data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20) 191 | >>> result1 = experiment.analyze(data1) 192 | >>> result2 = experiment.analyze(data2) 193 | >>> tt.adjust_fdr( 194 | ... {"Experiment 1": result1, "Experiment 2": result2}, 195 | ... metrics, 196 | ... ) 197 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 198 | Experiment 1 orders_per_user 0.530 0.573 8.0% 0.118 0.118 199 | Experiment 1 revenue_per_user 5.24 5.99 14% 0.0211 0.0282 200 | Experiment 2 orders_per_user 0.514 0.594 16% 0.00427 0.00853 201 | Experiment 2 revenue_per_user 5.10 6.25 22% 6.27e-04 0.00251 202 | 203 | ``` 204 | 205 | The methods `adjust_fdr` and `adjust_fwer` can also accept the result of *a single experiment with two variants*: 206 | 207 | ```pycon 208 | >>> tt.adjust_fwer(result2, metrics) 209 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 210 | - orders_per_user 0.514 0.594 16% 0.00427 0.00427 211 | - revenue_per_user 5.10 6.25 22% 6.27e-04 0.00125 212 | 213 | ``` 214 | -------------------------------------------------------------------------------- /docs/power-analysis.md: -------------------------------------------------------------------------------- 1 | # Power analysis 2 | 3 | In tea-tasting, you can analyze the statistical power for `Mean` and `RatioOfMeans` metrics. There are three possible options: 4 | 5 | - Calculate the effect size, given statistical power and the total number of observations. 6 | - Calculate the total number of observations, given statistical power and the effect size. 7 | - Calculate statistical power, given the effect size and the total number of observations. 8 | 9 | In this example, tea-tasting calculates statistical power given the relative effect size and the number of observations: 10 | 11 | ```pycon 12 | >>> import tea_tasting as tt 13 | 14 | >>> data = tt.make_users_data( 15 | ... seed=42, 16 | ... sessions_uplift=0, 17 | ... orders_uplift=0, 18 | ... revenue_uplift=0, 19 | ... covariates=True, 20 | ... ) 21 | >>> orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1) 22 | >>> orders_per_session.solve_power(data, "power") 23 | power effect_size rel_effect_size n_obs 24 | 52% 0.0261 10% 4000 25 | 26 | ``` 27 | 28 | Besides `alternative`, `equal_var`, `use_t`, and covariates (CUPED), the following metric parameters affect the result: 29 | 30 | - `alpha`: Significance level. 31 | - `ratio`: Ratio of the number of observations in the treatment relative to the control. 32 | - `power`: Statistical power. 33 | - `effect_size` and `rel_effect_size`: Absolute and relative effect size. Only one of them can be defined. 34 | - `n_obs`: Number of observations in the control and in the treatment together. If the number of observations is not set explicitly, it's inferred from the dataset. 35 | 36 | You can change the default values of `alpha`, `ratio`, `power`, and `n_obs` using the [global settings](user-guide.md#global-settings). 37 | 38 | tea-tasting can analyze power for several values of parameters `effect_size`, `rel_effect_size`, or `n_obs`. 
Example: 39 | 40 | ```pycon 41 | >>> orders_per_user = tt.Mean("orders", alpha=0.1, power=0.7, n_obs=(10_000, 20_000)) 42 | >>> orders_per_user.solve_power(data, "rel_effect_size") 43 | power effect_size rel_effect_size n_obs 44 | 70% 0.0367 7.1% 10000 45 | 70% 0.0260 5.0% 20000 46 | 47 | ``` 48 | 49 | You can analyze power for all metrics in the experiment. Example: 50 | 51 | ```pycon 52 | >>> with tt.config_context(n_obs=(10_000, 20_000)): 53 | ... experiment = tt.Experiment( 54 | ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 55 | ... orders_per_session=tt.RatioOfMeans( 56 | ... numer="orders", 57 | ... denom="sessions", 58 | ... numer_covariate="orders_covariate", 59 | ... denom_covariate="sessions_covariate", 60 | ... ), 61 | ... orders_per_user=tt.Mean("orders", "orders_covariate"), 62 | ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 63 | ... ) 64 | ... 65 | >>> power_result = experiment.solve_power(data) 66 | >>> power_result 67 | metric power effect_size rel_effect_size n_obs 68 | sessions_per_user 80% 0.0458 2.3% 10000 69 | sessions_per_user 80% 0.0324 1.6% 20000 70 | orders_per_session 80% 0.0177 6.8% 10000 71 | orders_per_session 80% 0.0125 4.8% 20000 72 | orders_per_user 80% 0.0374 7.2% 10000 73 | orders_per_user 80% 0.0264 5.1% 20000 74 | revenue_per_user 80% 0.488 9.2% 10000 75 | revenue_per_user 80% 0.345 6.5% 20000 76 | 77 | ``` 78 | 79 | In the example above, tea-tasting calculates both the relative and absolute effect size for all metrics for two possible sample size values, `10_000` and `20_000`. 80 | 81 | The `solve_power` methods of a [metric](api/metrics/mean.md#tea_tasting.metrics.mean.Mean.solve_power) and of an [experiment](api/experiment.md#tea_tasting.experiment.Experiment.solve_power) return instances of [`MetricPowerResults`](api/metrics/base.md#tea_tasting.metrics.base.MetricPowerResults) and [`ExperimentPowerResult`](api/experiment.md#tea_tasting.experiment.ExperimentPowerResult) respectively. These result classes provide serialization methods similar to those of the experiment result: `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. They are also rendered as HTML tables in IPython and Jupyter, and as tables in marimo notebooks. 82 | -------------------------------------------------------------------------------- /docs/simulated-experiments.md: -------------------------------------------------------------------------------- 1 | # Simulated experiments 2 | 3 | ## Intro 4 | 5 | In tea-tasting, you can run multiple simulated A/A or A/B tests. In each simulation, tea-tasting splits the data into control and treatment groups and can optionally modify the treatment data. A simulation without changing the treatment data is called an A/A test. 6 | 7 | A/A tests are useful for identifying potential issues before conducting the actual A/B test. Treatment simulations are great for power analysis—especially when you need a specific uplift distribution or when an analytical formula doesn’t exist. 8 | 9 | /// admonition | Note 10 | 11 | This guide uses [Polars](https://github.com/pola-rs/polars) and [tqdm](https://github.com/tqdm/tqdm). Install these packages in addition to tea-tasting to reproduce the examples: 12 | 13 | ```bash 14 | uv pip install polars tqdm 15 | ``` 16 | 17 | /// 18 | 19 | ## Running A/A tests 20 | 21 | First, let's prepare the data without any uplift and drop the `"variant"` column.
22 | 23 | ```pycon 24 | >>> import polars as pl 25 | >>> import tea_tasting as tt 26 | 27 | >>> data = ( 28 | ... tt.make_users_data(seed=42, orders_uplift=0, revenue_uplift=0) 29 | ... .drop_columns("variant") 30 | ... ) 31 | >>> data 32 | pyarrow.Table 33 | user: int64 34 | sessions: int64 35 | orders: int64 36 | revenue: double 37 | ---- 38 | user: [[0,1,2,3,4,...,3995,3996,3997,3998,3999]] 39 | sessions: [[2,2,2,2,1,...,2,2,3,1,5]] 40 | orders: [[1,1,1,0,1,...,0,1,1,0,4]] 41 | revenue: [[19.06,12.09,8.84,0,9.9,...,0,4.8,9.63,0,12.7]] 42 | 43 | ``` 44 | 45 | To run A/A tests, first define the metrics for the experiment, then call the [`simulate`](api/experiment.md#tea_tasting.experiment.Experiment.simulate) method, providing the data and the number of simulations as arguments. 46 | 47 | ```pycon 48 | >>> experiment = tt.Experiment( 49 | ... sessions_per_user=tt.Mean("sessions"), 50 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 51 | ... orders_per_user=tt.Mean("orders"), 52 | ... revenue_per_user=tt.Mean("revenue"), 53 | ... n_users=tt.SampleRatio(), 54 | ... ) 55 | >>> results = experiment.simulate(data, 100, seed=42) 56 | >>> results_data = results.to_polars() 57 | >>> results_data.select( 58 | ... "metric", 59 | ... "control", 60 | ... "treatment", 61 | ... "rel_effect_size", 62 | ... "rel_effect_size_ci_lower", 63 | ... "rel_effect_size_ci_upper", 64 | ... "pvalue", 65 | ... ) # doctest: +SKIP 66 | shape: (500, 7) 67 | ┌────────────────────┬──────────┬───────────┬─────────────────┬────────────────────┬────────────────────┬──────────┐ 68 | │ metric ┆ control ┆ treatment ┆ rel_effect_size ┆ rel_effect_size_ci ┆ rel_effect_size_ci ┆ pvalue │ 69 | │ --- ┆ --- ┆ --- ┆ --- ┆ _lower ┆ _upper ┆ --- │ 70 | │ str ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ --- ┆ f64 │ 71 | │ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ │ 72 | ╞════════════════════╪══════════╪═══════════╪═════════════════╪════════════════════╪════════════════════╪══════════╡ 73 | │ sessions_per_user ┆ 1.98004 ┆ 1.998998 ┆ 0.009575 ┆ -0.021272 ┆ 0.041393 ┆ 0.547091 │ 74 | │ orders_per_session ┆ 0.263105 ┆ 0.258647 ┆ -0.016945 ┆ -0.108177 ┆ 0.083621 ┆ 0.730827 │ 75 | │ orders_per_user ┆ 0.520958 ┆ 0.517034 ┆ -0.007532 ┆ -0.102993 ┆ 0.098087 ┆ 0.883462 │ 76 | │ revenue_per_user ┆ 5.446662 ┆ 5.14521 ┆ -0.055346 ┆ -0.162811 ┆ 0.065914 ┆ 0.356327 │ 77 | │ n_users ┆ 2004.0 ┆ 1996.0 ┆ null ┆ null ┆ null ┆ 0.91187 │ 78 | │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ 79 | │ sessions_per_user ┆ 1.993624 ┆ 1.985212 ┆ -0.00422 ┆ -0.034685 ┆ 0.027207 ┆ 0.78959 │ 80 | │ orders_per_session ┆ 0.269373 ┆ 0.251991 ┆ -0.064527 ┆ -0.151401 ┆ 0.03124 ┆ 0.179445 │ 81 | │ orders_per_user ┆ 0.537028 ┆ 0.500255 ┆ -0.068475 ┆ -0.158141 ┆ 0.030742 ┆ 0.169217 │ 82 | │ revenue_per_user ┆ 5.511967 ┆ 5.071928 ┆ -0.079833 ┆ -0.184806 ┆ 0.038656 ┆ 0.177868 │ 83 | │ n_users ┆ 2039.0 ┆ 1961.0 ┆ null ┆ null ┆ null ┆ 0.223423 │ 84 | └────────────────────┴──────────┴───────────┴─────────────────┴────────────────────┴────────────────────┴──────────┘ 85 | 86 | ``` 87 | 88 | The `simulate` method accepts data in the same formats as the `analyze` method. Internally, however, it converts the data to a PyArrow Table before running the simulations. 89 | 90 | The method returns an instance of the [`SimulationResults`](api/experiment.md#tea_tasting.experiment.SimulationResults) class, which contains the results of all simulations for all metrics. 
The resulting object provides serialization methods similar to those of the experiment result, including `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. 91 | 92 | For instance, we can now calculate the proportion of rejected null hypotheses, using various significance levels (`alpha`). In A/A tests, it estimates the type I error rate. 93 | 94 | ```pycon 95 | >>> def null_rejected( 96 | ... results_data: pl.DataFrame, 97 | ... alphas: tuple[float, ...] = (0.01, 0.02, 0.05), 98 | ... ) -> pl.DataFrame: 99 | ... return results_data.group_by("metric", maintain_order=True).agg( 100 | ... pl.col("pvalue").le(alpha).mean().alias(f"null_rejected_{alpha}") 101 | ... for alpha in alphas 102 | ... ) 103 | ... 104 | >>> null_rejected(results_data) 105 | shape: (5, 4) 106 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐ 107 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │ 108 | │ --- ┆ --- ┆ --- ┆ --- │ 109 | │ str ┆ f64 ┆ f64 ┆ f64 │ 110 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡ 111 | │ sessions_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │ 112 | │ orders_per_session ┆ 0.02 ┆ 0.02 ┆ 0.06 │ 113 | │ orders_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │ 114 | │ revenue_per_user ┆ 0.02 ┆ 0.03 ┆ 0.06 │ 115 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │ 116 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘ 117 | 118 | ``` 119 | 120 | 100 simulations, as in the example above, produce a very rough estimate. In practice, a larger number of simulations, such as the default `10_000`, is recommended. 121 | 122 | ## Simulating experiments with treatment 123 | 124 | To simulate experiments with treatment, define a treatment function that takes data in the form of a PyArrow Table and returns a PyArrow Table with the modified data: 125 | 126 | ```pycon 127 | >>> import pyarrow as pa 128 | >>> import pyarrow.compute as pc 129 | 130 | >>> def treat(data: pa.Table) -> pa.Table: 131 | ... return ( 132 | ... data.drop_columns(["orders", "revenue"]) 133 | ... .append_column("orders", pc.multiply(data["orders"], pa.scalar(1.1))) 134 | ... .append_column("revenue", pc.multiply(data["revenue"], pa.scalar(1.1))) 135 | ... ) 136 | ... 137 | >>> results_treat = experiment.simulate(data, 100, seed=42, treat=treat) 138 | >>> null_rejected(results_treat.to_polars()) 139 | shape: (5, 4) 140 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐ 141 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │ 142 | │ --- ┆ --- ┆ --- ┆ --- │ 143 | │ str ┆ f64 ┆ f64 ┆ f64 │ 144 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡ 145 | │ sessions_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │ 146 | │ orders_per_session ┆ 0.23 ┆ 0.31 ┆ 0.42 │ 147 | │ orders_per_user ┆ 0.21 ┆ 0.29 ┆ 0.4 │ 148 | │ revenue_per_user ┆ 0.11 ┆ 0.16 ┆ 0.31 │ 149 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │ 150 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘ 151 | 152 | ``` 153 | 154 | In the example above, we've defined a function that increases the number of orders and the revenue by 10%. For these metrics, the proportion of rejected null hypotheses is an estimate of statistical power. 155 | 156 | ## Using a function instead of static data 157 | 158 | You can use a function instead of static data to generate input dynamically.
The function should take an instance of `numpy.random.Generator` as a parameter named `seed` and return experimental data in any format supported by tea-tasting. 159 | 160 | As an example, let's use the `make_users_data` function. 161 | 162 | ```pycon 163 | >>> results_data_gen = experiment.simulate(tt.make_users_data, 100, seed=42) 164 | >>> null_rejected(results_data_gen.to_polars()) 165 | shape: (5, 4) 166 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐ 167 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │ 168 | │ --- ┆ --- ┆ --- ┆ --- │ 169 | │ str ┆ f64 ┆ f64 ┆ f64 │ 170 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡ 171 | │ sessions_per_user ┆ 0.01 ┆ 0.01 ┆ 0.06 │ 172 | │ orders_per_session ┆ 0.27 ┆ 0.36 ┆ 0.54 │ 173 | │ orders_per_user ┆ 0.24 ┆ 0.32 ┆ 0.49 │ 174 | │ revenue_per_user ┆ 0.17 ┆ 0.26 ┆ 0.39 │ 175 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │ 176 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘ 177 | 178 | ``` 179 | 180 | On each iteration, tea-tasting calls `make_users_data` with a new `seed` and uses the returned data for the analysis of the experiment. The data returned by `make_users_data` already contains the `"variant"` column, so tea-tasting reuses that split. By default, `make_users_data` also adds the treatment uplift, and you can see it in the proportion of rejected null hypotheses. 181 | 182 | ## Tracking progress 183 | 184 | To track the progress of simulations with [`tqdm`](https://github.com/tqdm/tqdm) or [`marimo.status.progress_bar`](https://docs.marimo.io/api/status/#progress-bar), use the `progress` parameter. 185 | 186 | ```pycon 187 | >>> import tqdm 188 | 189 | >>> results_progress = experiment.simulate( 190 | ... data, 191 | ... 100, 192 | ... seed=42, 193 | ... progress=tqdm.tqdm, 194 | ... ) # doctest: +SKIP 195 | 100%|██████████████████████████████████████| 100/100 [00:01<00:00, 64.47it/s] 196 | 197 | ``` 198 | 199 | ## Parallel execution 200 | 201 | /// admonition | Note 202 | 203 | The code below won't work in the [marimo online playground](https://docs.marimo.io/guides/publishing/playground/) as it relies on the `multiprocessing` module which is currently [not supported](https://docs.marimo.io/guides/wasm/#limitations) by WASM notebooks. [WASM notebooks](https://docs.marimo.io/guides/wasm/) are the marimo notebooks that run entirely in the browser. 204 | 205 | /// 206 | 207 | To speed up simulations and run them in parallel, use the `map_` parameter with an alternative mapping function. 208 | 209 | ```pycon 210 | >>> import concurrent.futures 211 | 212 | >>> with concurrent.futures.ProcessPoolExecutor() as executor: 213 | ... results_parallel = experiment.simulate( 214 | ... data, 215 | ... 100, 216 | ... seed=42, 217 | ... treat=treat, 218 | ... map_=executor.map, 219 | ... progress=tqdm.tqdm, 220 | ... ) # doctest: +SKIP 221 | ... 222 | 100%|█████████████████████████████████████| 100/100 [00:00<00:00, 251.60it/s] 223 | 224 | ``` 225 | 226 | As an alternative to [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor), you can use the `map`, `imap`, or `imap_unordered` methods of [`multiprocessing.pool.Pool`](https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool). 
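As an example, here's a minimal sketch of the same simulation with `multiprocessing.pool.Pool` (the `results_pool` name is illustrative): ```pycon >>> import multiprocessing >>> with multiprocessing.Pool() as pool: ... results_pool = experiment.simulate( ... data, ... 100, ... seed=42, ... treat=treat, ... map_=pool.imap_unordered, ... ) # doctest: +SKIP ... ```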
227 | 228 | It's also possible to run simulations on a distributed [Dask](https://distributed.dask.org/en/stable/api.html#distributed.Client.map) or [Ray](https://docs.ray.io/en/latest/ray-core/api/doc/ray.util.ActorPool.map.html#ray.util.ActorPool.map) cluster. 229 | -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-code-font: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; 3 | --md-text-font: system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", "Noto Sans", "Liberation Sans", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; 4 | } 5 | .md-typeset code { 6 | font-size: .875em; 7 | } 8 | .md-typeset ol li,.md-typeset ul li { 9 | margin-bottom: .25em 10 | } 11 | div.highlight span.gp { /* gp: Generic.Prompt */ 12 | user-select: none; 13 | -webkit-user-select: none; /* Chrome/Safari */ 14 | -moz-user-select: none; /* Firefox */ 15 | -ms-user-select: none; /* IE10+ */ 16 | } 17 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run them in your local environment, or you can run them as WASM notebooks in the online playground. 4 | 5 | ## Run in a local environment 6 | 7 | To run the examples in your local environment, clone the repository and change the directory: 8 | 9 | ```bash 10 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting 11 | ``` 12 | 13 | Install marimo, tea-tasting, and other packages used in the examples: 14 | 15 | ```bash 16 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb] 17 | ``` 18 | 19 | Launch the notebook server: 20 | 21 | ```bash 22 | uv run marimo edit examples 23 | ``` 24 | 25 | Now you can choose and run the example notebooks. 26 | 27 | ## Run in the online playground 28 | 29 | To run the examples as WASM notebooks in the online playground, open the following links: 30 | 31 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true). 32 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true). 33 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true). 34 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true). 35 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true). 36 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true). 37 | 38 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations.
In particular: 39 | 40 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html). 41 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules). 42 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis. 43 | -------------------------------------------------------------------------------- /examples/custom-metrics.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "tea-tasting", 6 | # ] 7 | # [tool.marimo.display] 8 | # cell_output = "below" 9 | # /// 10 | 11 | import marimo 12 | 13 | __generated_with = "0.13.6" 14 | app = marimo.App() 15 | 16 | 17 | @app.cell(hide_code=True) 18 | def _(mo): 19 | mo.md( 20 | r""" 21 | # Custom metrics 22 | 23 | ## Intro 24 | 25 | tea-tasting supports Student's t-test, Z-test, and [some other statistical tests](https://tea-tasting.e10v.me/api/metrics/index/) out of the box. However, you might want to analyze an experiment using other statistical criteria. In this case, you can define a custom metric with a statistical test of your choice. 26 | 27 | In tea-tasting, there are two types of metrics: 28 | 29 | - Metrics that require only aggregated statistics for the analysis. 30 | - Metrics that require granular data for the analysis. 31 | 32 | This guide explains how to define a custom metric for each type. 33 | 34 | First, let's import all the required modules and prepare the data: 35 | """ 36 | ) 37 | return 38 | 39 | 40 | @app.cell 41 | def _(): 42 | from typing import Literal, NamedTuple 43 | import numpy as np 44 | import pyarrow as pa 45 | import pyarrow.compute as pc 46 | import scipy.stats 47 | import tea_tasting as tt 48 | import tea_tasting.aggr 49 | import tea_tasting.config 50 | import tea_tasting.metrics 51 | import tea_tasting.utils 52 | 53 | data = tt.make_users_data(seed=42) 54 | data = data.append_column( 55 | "has_order", 56 | pc.greater(data["orders"], 0).cast(pa.int64()), 57 | ) 58 | data 59 | return Literal, NamedTuple, data, np, pa, scipy, tea_tasting, tt 60 | 61 | 62 | @app.cell(hide_code=True) 63 | def _(mo): 64 | mo.md( 65 | r""" 66 | This guide uses PyArrow as the data backend, but it's valid for other backends as well. See the [guide on data backends](https://tea-tasting.e10v.me/data-backends/) for more details. 67 | 68 | ## Metrics based on aggregated statistics 69 | 70 | Let's define a metric that performs a proportion test, [G-test](https://en.wikipedia.org/wiki/G-test) or [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test), on a binary column (with values `0` or `1`). 71 | 72 | The first step is defining a result class. It should be a named tuple or a dictionary. 73 | """ 74 | ) 75 | return 76 | 77 | 78 | @app.cell 79 | def _(NamedTuple): 80 | class ProportionResult(NamedTuple): 81 | control: float 82 | treatment: float 83 | effect_size: float 84 | rel_effect_size: float 85 | pvalue: float 86 | statistic: float 87 | return (ProportionResult,) 88 | 89 | 90 | @app.cell(hide_code=True) 91 | def _(mo): 92 | mo.md( 93 | r""" 94 | The second step is defining the metric class itself.
A metric based on aggregated statistics should be a subclass of [`MetricBaseAggregated`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricBaseAggregated). `MetricBaseAggregated` is a generic class with the result class as a type variable. 95 | 96 | The metric should have the following methods and properties defined: 97 | 98 | - Method `__init__` checks and saves metric parameters. 99 | - Property `aggr_cols` returns the columns to be aggregated for the analysis, for each type of statistic. 100 | - Method `analyze_aggregates` analyzes the metric using aggregated statistics. 101 | 102 | Let's define the metric and discuss each method in detail: 103 | """ 104 | ) 105 | return 106 | 107 | 108 | @app.cell 109 | def _(Literal, ProportionResult, np, scipy, tea_tasting): 110 | class Proportion(tea_tasting.metrics.MetricBaseAggregated[ProportionResult]): 111 | def __init__( 112 | self, 113 | column: str, 114 | *, 115 | correction: bool = True, 116 | method: Literal["g-test", "pearson"] = "g-test", 117 | ) -> None: 118 | self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 119 | self.correction = tea_tasting.utils.auto_check(correction, "correction") 120 | self.method = tea_tasting.utils.check_scalar( 121 | method, "method", typ=str, in_={"g-test", "pearson"}) 122 | @property 123 | def aggr_cols(self) -> tea_tasting.metrics.AggrCols: 124 | return tea_tasting.metrics.AggrCols( 125 | has_count=True, 126 | mean_cols=(self.column,), 127 | ) 128 | def analyze_aggregates( 129 | self, 130 | control: tea_tasting.aggr.Aggregates, 131 | treatment: tea_tasting.aggr.Aggregates, 132 | ) -> ProportionResult: 133 | observed = np.empty(shape=(2, 2), dtype=np.int64) 134 | observed[0, 0] = round(control.count() * control.mean(self.column)) 135 | observed[1, 0] = control.count() - observed[0, 0] 136 | observed[0, 1] = round(treatment.count() * treatment.mean(self.column)) 137 | observed[1, 1] = treatment.count() - observed[0, 1] 138 | res = scipy.stats.chi2_contingency( 139 | observed=observed, 140 | correction=self.correction, 141 | lambda_=int(self.method == "pearson"), 142 | ) 143 | return ProportionResult( 144 | control=control.mean(self.column), 145 | treatment=treatment.mean(self.column), 146 | effect_size=treatment.mean(self.column) - control.mean(self.column), 147 | rel_effect_size=treatment.mean(self.column)/control.mean(self.column) - 1, 148 | pvalue=res.pvalue, 149 | statistic=res.statistic, 150 | ) 151 | return (Proportion,) 152 | 153 | 154 | @app.cell(hide_code=True) 155 | def _(mo): 156 | mo.md( 157 | r""" 158 | Method `__init__` saves metric parameters to be used in the analysis. You can use utility functions [`check_scalar`](https://tea-tasting.e10v.me/api/utils/#tea_tasting.utils.check_scalar) and [`auto_check`](https://tea-tasting.e10v.me/api/utils/#tea_tasting.utils.auto_check) to check parameter values. 159 | 160 | Property `aggr_cols` returns an instance of [`AggrCols`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.AggrCols). The analysis of a proportion requires the number of rows (`has_count=True`) and the average value for the column of interest (`mean_cols=(self.column,)`) for each variant. 161 | 162 | Method `analyze_aggregates` accepts two parameters: `control` and `treatment` data as instances of class [`Aggregates`](https://tea-tasting.e10v.me/api/aggr/#tea_tasting.aggr.Aggregates). They contain values for statistics and columns specified in `aggr_cols`.
163 | 164 | Method `analyze_aggregates` returns an instance of `ProportionResult`, defined earlier, with the analysis result. 165 | 166 | Now we can analyze the proportion of users who created at least one order during the experiment. For comparison, let's also add a metric that performs a Z-test on the same column. 167 | """ 168 | ) 169 | return 170 | 171 | 172 | @app.cell 173 | def _(Proportion, data, tt): 174 | experiment_prop = tt.Experiment( 175 | prop_users_with_orders=Proportion("has_order"), 176 | mean_users_with_orders=tt.Mean("has_order", use_t=False), 177 | ) 178 | experiment_prop.analyze(data) 179 | return 180 | 181 | 182 | @app.cell(hide_code=True) 183 | def _(mo): 184 | mo.md( 185 | r""" 186 | ## Metrics based on granular data 187 | 188 | Now let's define a metric that performs the Mann-Whitney U test. While it's possible to use the aggregated sum of ranks for the test, this example uses granular data for analysis. 189 | 190 | The result class: 191 | """ 192 | ) 193 | return 194 | 195 | 196 | @app.cell 197 | def _(NamedTuple): 198 | class MannWhitneyUResult(NamedTuple): 199 | pvalue: float 200 | statistic: float 201 | return (MannWhitneyUResult,) 202 | 203 | 204 | @app.cell(hide_code=True) 205 | def _(mo): 206 | mo.md( 207 | r""" 208 | A metric that analyzes granular data should be a subclass of [`MetricBaseGranular`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricBaseGranular). `MetricBaseGranular` is a generic class with the result class as a type variable. 209 | 210 | The metric should have the following methods and properties defined: 211 | 212 | - Method `__init__` checks and saves metric parameters. 213 | - Property `cols` returns the columns to be fetched for the analysis. 214 | - Method `analyze_granular` analyzes the metric using granular data. 215 | """ 216 | ) 217 | return 218 | 219 | 220 | @app.cell 221 | def _(Literal, MannWhitneyUResult, pa, scipy, tea_tasting): 222 | class MannWhitneyU(tea_tasting.metrics.MetricBaseGranular[MannWhitneyUResult]): 223 | def __init__( 224 | self, 225 | column: str, 226 | *, 227 | correction: bool = True, 228 | alternative: Literal["two-sided", "less", "greater"] | None = None, 229 | ) -> None: 230 | self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 231 | self.correction = tea_tasting.utils.auto_check(correction, "correction") 232 | self.alternative = ( 233 | tea_tasting.utils.auto_check(alternative, "alternative") 234 | if alternative is not None 235 | else tea_tasting.config.get_config("alternative") 236 | ) 237 | @property 238 | def cols(self) -> tuple[str]: 239 | return (self.column,) 240 | def analyze_granular( 241 | self, 242 | control: pa.Table, 243 | treatment: pa.Table, 244 | ) -> MannWhitneyUResult: 245 | res = scipy.stats.mannwhitneyu( 246 | treatment[self.column].combine_chunks().to_numpy(zero_copy_only=False), 247 | control[self.column].combine_chunks().to_numpy(zero_copy_only=False), 248 | use_continuity=self.correction, 249 | alternative=self.alternative, 250 | ) 251 | return MannWhitneyUResult( 252 | pvalue=res.pvalue, 253 | statistic=res.statistic, 254 | ) 255 | return (MannWhitneyU,) 256 | 257 | 258 | @app.cell(hide_code=True) 259 | def _(mo): 260 | mo.md( 261 | r""" 262 | Property `cols` should return a sequence of strings. 263 | 264 | Method `analyze_granular` accepts two parameters: control and treatment data as PyArrow Tables.
Even with a [data backend](https://tea-tasting.e10v.me/data-backends/) different from PyArrow, tea-tasting will retrieve the data and transform it into a PyArrow Table. 265 | 266 | Method `analyze_granular` returns an instance of `MannWhitneyUResult`, defined earlier, with the analysis result. 267 | 268 | Now we can perform the Mann-Whitney U test: 269 | """ 270 | ) 271 | return 272 | 273 | 274 | @app.cell 275 | def _(MannWhitneyU, data, tt): 276 | experiment_mwu = tt.Experiment( 277 | mwu_orders=MannWhitneyU("orders"), 278 | mwu_revenue=MannWhitneyU("revenue"), 279 | ) 280 | result_mwu = experiment_mwu.analyze(data) 281 | result_mwu.with_keys(("metric", "pvalue", "statistic")) 282 | return 283 | 284 | 285 | @app.cell(hide_code=True) 286 | def _(mo): 287 | mo.md( 288 | r""" 289 | ## Analyzing two types of metrics together 290 | 291 | It's also possible to analyze two types of metrics in one experiment: 292 | """ 293 | ) 294 | return 295 | 296 | 297 | @app.cell 298 | def _(MannWhitneyU, Proportion, data, tt): 299 | experiment = tt.Experiment( 300 | prop_users_with_orders=Proportion("has_order"), 301 | mean_users_with_orders=tt.Mean("has_order"), 302 | mwu_orders=MannWhitneyU("orders"), 303 | mwu_revenue=MannWhitneyU("revenue"), 304 | ) 305 | experiment.analyze(data) 306 | return 307 | 308 | 309 | @app.cell(hide_code=True) 310 | def _(mo): 311 | mo.md( 312 | r""" 313 | In this case, tea-tasting performs two queries on the experimental data: 314 | 315 | - One with the aggregated statistics required for the analysis of metrics of type `MetricBaseAggregated`. 316 | - One with the detailed data and the columns required for the analysis of metrics of type `MetricBaseGranular`. 317 | 318 | ## Recommendations 319 | 320 | Follow these recommendations when defining custom metrics: 321 | 322 | - Use parameter and attribute names consistent with the ones that are already defined in tea-tasting. For example, use `pvalue` instead of `p_value` or `correction` instead of `use_continuity`. 323 | - End confidence interval boundary names with `"_ci_lower"` and `"_ci_upper"`. 324 | - During initialization, save parameter values in metric attributes using the same names. For example, use `self.correction = correction` instead of `self.use_continuity = correction`. 325 | - Use global settings as default values for standard parameters, such as `alternative` or `confidence_level`. See the [reference](https://tea-tasting.e10v.me/api/config/#tea_tasting.config.config_context) for the full list of standard parameters. You can also define and use your own global parameters.
326 | """ 327 | ) 328 | return 329 | 330 | 331 | @app.cell(hide_code=True) 332 | def _(): 333 | import marimo as mo 334 | return (mo,) 335 | 336 | 337 | if __name__ == "__main__": 338 | app.run() 339 | -------------------------------------------------------------------------------- /examples/data-backends.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "ibis-framework[duckdb]", 5 | # "marimo", 6 | # "polars", 7 | # "tea-tasting", 8 | # ] 9 | # [tool.marimo.display] 10 | # cell_output = "below" 11 | # /// 12 | 13 | import marimo 14 | 15 | __generated_with = "0.13.6" 16 | app = marimo.App() 17 | 18 | 19 | @app.cell(hide_code=True) 20 | def _(mo): 21 | mo.md( 22 | r""" 23 | # Data backends 24 | 25 | ## Intro 26 | 27 | tea-tasting supports a wide range of data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). Ibis is a DataFrame API to various data backends. 28 | 29 | Many statistical tests, such as the Student's t-test or the Z-test, require only aggregated data for analysis. For these tests, tea-tasting retrieves only aggregated statistics like mean and variance instead of downloading all detailed data. 30 | 31 | For example, if the raw experimental data are stored in ClickHouse, it's faster and more efficient to calculate counts, averages, variances, and covariances directly in ClickHouse rather than fetching granular data and performing aggregations in a Python environment. 32 | 33 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. Narwhals is a compatibility layer between dataframe libraries. 34 | 35 | This guide: 36 | 37 | - Shows how to use tea-tasting with a data backend of your choice for the analysis of an experiment. 38 | - Explains some internals of how tea-tasting uses Ibis to work with data backends. 39 | 40 | ## Demo database 41 | 42 | /// admonition | Note 43 | 44 | This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. Install these packages in addition to tea-tasting to reproduce the examples: 45 | 46 | ```bash 47 | uv pip install ibis-framework[duckdb] polars 48 | ``` 49 | 50 | /// 51 | 52 | First, let's prepare a demo database: 53 | """ 54 | ) 55 | return 56 | 57 | 58 | @app.cell 59 | def _(): 60 | import ibis 61 | import polars as pl 62 | import tea_tasting as tt 63 | 64 | users_data = tt.make_users_data(seed=42) 65 | con = ibis.connect("duckdb://") 66 | con.create_table("users_data", users_data) 67 | return con, ibis, pl, tt, users_data 68 | 69 | 70 | @app.cell(hide_code=True) 71 | def _(mo): 72 | mo.md( 73 | r""" 74 | In the example above: 75 | 76 | - Function `tt.make_users_data` returns a PyArrow Table with example experimental data. 77 | - Function `ibis.connect` creates a DuckDB in-process database using the Ibis API. 78 | - Method `con.create_table` creates and populates a table in the database based on the PyArrow Table. 79 | 80 | See the [Ibis documentation on how to create connections](https://ibis-project.org/reference/connection) to other data backends.
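For instance, a connection to a PostgreSQL backend might look like the following sketch (the connection parameters are illustrative placeholders, and the `ibis-framework[postgres]` extra is required): ```python con_pg = ibis.postgres.connect( host="localhost", # placeholder connection details, not a real database user="username", password="password", database="mydb", ) ```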
81 | 82 | ## Querying experimental data 83 | 84 | Method `con.create_table` in the example above returns an Ibis Table which can already be used in the analysis of the experiment. But let's see how to use an SQL query to create an Ibis Table: 85 | """ 86 | ) 87 | return 88 | 89 | 90 | @app.cell 91 | def _(con): 92 | data = con.sql("select * from users_data") 93 | data 94 | return (data,) 95 | 96 | 97 | @app.cell(hide_code=True) 98 | def _(mo): 99 | mo.md( 100 | r""" 101 | It's a very simple query. In the real world, you might need to use joins, aggregations, and CTEs to get the data. You can define any SQL query supported by your data backend and use it to create an Ibis Table. 102 | 103 | Keep in mind that tea-tasting assumes that: 104 | 105 | - Data is grouped by randomization units, such as individual users. 106 | - There is a column indicating the variant of the A/B test (typically labeled as A, B, etc.). 107 | - All necessary columns for metric calculations (like the number of orders, revenue, etc.) are included in the table. 108 | 109 | An Ibis Table is a lazy object. It doesn't fetch the data when created. You can use the Ibis DataFrame API to query the table and fetch the result: 110 | """ 111 | ) 112 | return 113 | 114 | 115 | @app.cell 116 | def _(data, ibis): 117 | ibis.options.interactive = True 118 | print(data.head(5)) 119 | 120 | ibis.options.interactive = False 121 | return 122 | 123 | 124 | @app.cell(hide_code=True) 125 | def _(mo): 126 | mo.md( 127 | r""" 128 | ## Ibis example 129 | 130 | To better understand what Ibis does, let's consider the example with grouping and aggregation by variants: 131 | """ 132 | ) 133 | return 134 | 135 | 136 | @app.cell 137 | def _(data): 138 | aggr_data = data.group_by("variant").aggregate( 139 | sessions_per_user=data.sessions.mean(), 140 | orders_per_session=data.orders.mean() / data.sessions.mean(), 141 | orders_per_user=data.orders.mean(), 142 | revenue_per_user=data.revenue.mean(), 143 | ) 144 | aggr_data 145 | return (aggr_data,) 146 | 147 | 148 | @app.cell(hide_code=True) 149 | def _(mo): 150 | mo.md( 151 | r""" 152 | `aggr_data` is another Ibis Table defined as a query over the previously defined `data`. Let's fetch the result: 153 | """ 154 | ) 155 | return 156 | 157 | 158 | @app.cell 159 | def _(aggr_data, ibis): 160 | ibis.options.interactive = True 161 | print(aggr_data) 162 | 163 | ibis.options.interactive = False 164 | return 165 | 166 | 167 | @app.cell(hide_code=True) 168 | def _(mo): 169 | mo.md( 170 | r""" 171 | Internally, Ibis compiles a Table to an SQL query supported by the backend: 172 | """ 173 | ) 174 | return 175 | 176 | 177 | @app.cell 178 | def _(aggr_data): 179 | print(aggr_data.compile(pretty=True)) 180 | return 181 | 182 | 183 | @app.cell(hide_code=True) 184 | def _(mo): 185 | mo.md( 186 | r""" 187 | See the [Ibis documentation](https://ibis-project.org/tutorials/getting_started) for more details. 188 | 189 | ## Experiment analysis 190 | 191 | The example above shows how to query the metric averages. But for statistical inference, it's not enough. For example, Student's t-test and Z-test also require the number of rows and the variance. Additionally, analysis of ratio metrics and variance reduction with CUPED requires covariances. 192 | 193 | Querying all the required statistics manually can be a daunting and error-prone task. But don't worry—tea-tasting does this work for you.
You just need to specify the metrics: 194 | """ 195 | ) 196 | return 197 | 198 | 199 | @app.cell 200 | def _(data, tt): 201 | experiment = tt.Experiment( 202 | sessions_per_user=tt.Mean("sessions"), 203 | orders_per_session=tt.RatioOfMeans("orders", "sessions"), 204 | orders_per_user=tt.Mean("orders"), 205 | revenue_per_user=tt.Mean("revenue"), 206 | ) 207 | result = experiment.analyze(data) 208 | result 209 | return (experiment,) 210 | 211 | 212 | @app.cell(hide_code=True) 213 | def _(mo): 214 | mo.md( 215 | r""" 216 | In the example above, tea-tasting fetches all the required statistics with a single query and then uses them to analyze the experiment. 217 | 218 | Some statistical methods, like bootstrap, require granular data for analysis. In this case, tea-tasting fetches the detailed data as well. 219 | 220 | ## Example with CUPED 221 | 222 | An example of a slightly more complicated analysis using variance reduction with CUPED: 223 | """ 224 | ) 225 | return 226 | 227 | 228 | @app.cell 229 | def _(con, tt): 230 | users_data_cuped = tt.make_users_data(seed=42, covariates=True) 231 | con.create_table("users_data_cuped", users_data_cuped) 232 | 233 | data_cuped = con.sql("select * from users_data_cuped") 234 | experiment_cuped = tt.Experiment( 235 | sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 236 | orders_per_session=tt.RatioOfMeans( 237 | numer="orders", 238 | denom="sessions", 239 | numer_covariate="orders_covariate", 240 | denom_covariate="sessions_covariate", 241 | ), 242 | orders_per_user=tt.Mean("orders", "orders_covariate"), 243 | revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 244 | ) 245 | result_cuped = experiment_cuped.analyze(data_cuped) 246 | result_cuped 247 | return 248 | 249 | 250 | @app.cell(hide_code=True) 251 | def _(mo): 252 | mo.md( 253 | r""" 254 | ## Polars example 255 | 256 | Here’s an example of how to analyze data using a Polars DataFrame: 257 | """ 258 | ) 259 | return 260 | 261 | 262 | @app.cell 263 | def _(experiment, pl, users_data): 264 | data_polars = pl.from_arrow(users_data) 265 | experiment.analyze(data_polars) 266 | return 267 | 268 | 269 | @app.cell(hide_code=True) 270 | def _(mo): 271 | mo.md( 272 | r""" 273 | 274 | """ 275 | ) 276 | return 277 | 278 | 279 | @app.cell(hide_code=True) 280 | def _(): 281 | import marimo as mo 282 | return (mo,) 283 | 284 | 285 | if __name__ == "__main__": 286 | app.run() 287 | -------------------------------------------------------------------------------- /examples/multiple-testing.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "polars", 6 | # "tea-tasting", 7 | # ] 8 | # [tool.marimo.display] 9 | # cell_output = "below" 10 | # /// 11 | 12 | import marimo 13 | 14 | __generated_with = "0.13.6" 15 | app = marimo.App() 16 | 17 | 18 | @app.cell(hide_code=True) 19 | def _(mo): 20 | mo.md( 21 | r""" 22 | # Multiple testing 23 | 24 | ## Multiple hypothesis testing problem 25 | 26 | /// admonition | Note 27 | 28 | This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. Install Polars in addition to tea-tasting to reproduce the examples: 29 | 30 | ```bash 31 | uv pip install polars 32 | ``` 33 | 34 | /// 35 | 36 | The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test. 
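For instance, if four independent hypotheses are each tested at a significance level of 0.05 and all null hypotheses are true, the probability of at least one false positive is 1 - (1 - 0.05)^4 ≈ 18.5%.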
37 | 38 | tea-tasting provides the following methods for multiple testing correction: 39 | 40 | - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate) (FDR) controlling procedures: 41 | - Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses. 42 | - Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses. 43 | - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate) (FWER) controlling procedures: 44 | - Hochberg's step-up procedure, assuming non-negative correlation between hypotheses. 45 | - Holm's step-down procedure, assuming arbitrary dependence between hypotheses. 46 | 47 | As an example, consider an experiment with three variants, a control and two treatments: 48 | """ 49 | ) 50 | return 51 | 52 | 53 | @app.cell 54 | def _(): 55 | import polars as pl 56 | import tea_tasting as tt 57 | 58 | data = pl.concat(( 59 | tt.make_users_data( 60 | seed=42, 61 | orders_uplift=0.10, 62 | revenue_uplift=0.15, 63 | return_type="polars", 64 | ), 65 | tt.make_users_data( 66 | seed=21, 67 | orders_uplift=0.15, 68 | revenue_uplift=0.20, 69 | return_type="polars", 70 | ) 71 | .filter(pl.col("variant").eq(1)) 72 | .with_columns(variant=pl.lit(2, pl.Int64)), 73 | )) 74 | data 75 | return data, tt 76 | 77 | 78 | @app.cell(hide_code=True) 79 | def _(mo): 80 | mo.md( 81 | r""" 82 | Let's calculate the experiment results: 83 | """ 84 | ) 85 | return 86 | 87 | 88 | @app.cell 89 | def _(data, tt): 90 | experiment = tt.Experiment( 91 | sessions_per_user=tt.Mean("sessions"), 92 | orders_per_session=tt.RatioOfMeans("orders", "sessions"), 93 | orders_per_user=tt.Mean("orders"), 94 | revenue_per_user=tt.Mean("revenue"), 95 | ) 96 | results = experiment.analyze(data, control=0, all_variants=True) 97 | results 98 | return experiment, results 99 | 100 | 101 | @app.cell(hide_code=True) 102 | def _(mo): 103 | mo.md( 104 | r""" 105 | Suppose only the two metrics `orders_per_user` and `revenue_per_user` are considered as success metrics, while the other two metrics `sessions_per_user` and `orders_per_session` are second-order diagnostic metrics. 106 | """ 107 | ) 108 | return 109 | 110 | 111 | @app.cell 112 | def _(): 113 | metrics = {"orders_per_user", "revenue_per_user"} 114 | return (metrics,) 115 | 116 | 117 | @app.cell(hide_code=True) 118 | def _(mo): 119 | mo.md( 120 | r""" 121 | With two treatment variants and two success metrics, there are four hypotheses in total, which increases the probability of false positives (also called "false discoveries"). It's recommended to adjust the p-values or the significance level (alpha) in this case. Let's explore the correction methods provided by tea-tasting. 122 | 123 | ## False discovery rate 124 | 125 | False discovery rate (FDR) is the expected value of the proportion of false discoveries among the discoveries (rejections of the null hypothesis). To control for FDR, use the [`adjust_fdr`](https://tea-tasting.e10v.me/api/multiplicity/#tea_tasting.multiplicity.adjust_fdr) method: 126 | """ 127 | ) 128 | return 129 | 130 | 131 | @app.cell 132 | def _(metrics, results, tt): 133 | adjusted_results_fdr = tt.adjust_fdr(results, metrics) 134 | adjusted_results_fdr 135 | return (adjusted_results_fdr,) 136 | 137 | 138 | @app.cell(hide_code=True) 139 | def _(mo): 140 | mo.md( 141 | r""" 142 | The method adjusts p-values and saves them as `pvalue_adj`. Compare these values to the desired significance level alpha to determine if the null hypotheses can be rejected. 
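For instance, here's a minimal sketch of collecting the rejected hypotheses, assuming the default `alpha=0.05` and assuming that the adjusted results provide `to_dicts`, like other tea-tasting result objects: ```python alpha = 0.05 rejected = [ (r["comparison"], r["metric"]) for r in adjusted_results_fdr.to_dicts() # assumed serialization method if r["pvalue_adj"] <= alpha ] ```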
143 | 144 | The method also adjusts the significance level alpha and saves it as `alpha_adj`. Compare the unadjusted p-values (`pvalue`) to `alpha_adj` to determine if the null hypotheses can be rejected: 145 | """ 146 | ) 147 | return 148 | 149 | 150 | @app.cell 151 | def _(adjusted_results_fdr): 152 | adjusted_results_fdr.with_keys(( 153 | "comparison", 154 | "metric", 155 | "control", 156 | "treatment", 157 | "rel_effect_size", 158 | "pvalue", 159 | "alpha_adj", 160 | )) 161 | return 162 | 163 | 164 | @app.cell(hide_code=True) 165 | def _(mo): 166 | mo.md( 167 | r""" 168 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs the Benjamini-Hochberg procedure. To perform the Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`: 169 | """ 170 | ) 171 | return 172 | 173 | 174 | @app.cell 175 | def _(metrics, results, tt): 176 | tt.adjust_fdr(results, metrics, arbitrary_dependence=True) 177 | return 178 | 179 | 180 | @app.cell(hide_code=True) 181 | def _(mo): 182 | mo.md( 183 | r""" 184 | ## Family-wise error rate 185 | 186 | Family-wise error rate (FWER) is the probability of making at least one type I error. To control for FWER, use the [`adjust_fwer`](https://tea-tasting.e10v.me/api/multiplicity/#tea_tasting.multiplicity.adjust_fwer) method: 187 | """ 188 | ) 189 | return 190 | 191 | 192 | @app.cell 193 | def _(metrics, results, tt): 194 | tt.adjust_fwer(results, metrics) 195 | return 196 | 197 | 198 | @app.cell(hide_code=True) 199 | def _(mo): 200 | mo.md( 201 | r""" 202 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs Hochberg's step-up procedure with the Šidák correction, which is slightly more powerful than the Bonferroni correction. 203 | 204 | To perform Holm's step-down procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`. In this case, it's recommended to use the Bonferroni correction, since the Šidák correction assumes non-negative correlation between hypotheses: 205 | """ 206 | ) 207 | return 208 | 209 | 210 | @app.cell 211 | def _(metrics, results, tt): 212 | tt.adjust_fwer( 213 | results, 214 | metrics, 215 | arbitrary_dependence=True, 216 | method="bonferroni", 217 | ) 218 | return 219 | 220 | 221 | @app.cell(hide_code=True) 222 | def _(mo): 223 | mo.md( 224 | r""" 225 | ## Other inputs 226 | 227 | In the examples above, the methods `adjust_fdr` and `adjust_fwer` received results from a *single experiment* with *more than two variants*.
They can also accept the results from *multiple experiments* with *two variants* in each: 228 | """ 229 | ) 230 | return 231 | 232 | 233 | @app.cell 234 | def _(experiment, metrics, tt): 235 | data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15) 236 | data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20) 237 | result1 = experiment.analyze(data1) 238 | result2 = experiment.analyze(data2) 239 | tt.adjust_fdr( 240 | {"Experiment 1": result1, "Experiment 2": result2}, 241 | metrics, 242 | ) 243 | return (result2,) 244 | 245 | 246 | @app.cell(hide_code=True) 247 | def _(mo): 248 | mo.md( 249 | r""" 250 | The methods `adjust_fdr` and `adjust_fwer` can also accept the result of *a single experiment with two variants*: 251 | """ 252 | ) 253 | return 254 | 255 | 256 | @app.cell 257 | def _(metrics, result2, tt): 258 | tt.adjust_fwer(result2, metrics) 259 | return 260 | 261 | 262 | @app.cell(hide_code=True) 263 | def _(mo): 264 | mo.md( 265 | r""" 266 | 267 | """ 268 | ) 269 | return 270 | 271 | 272 | @app.cell(hide_code=True) 273 | def _(): 274 | import marimo as mo 275 | return (mo,) 276 | 277 | 278 | if __name__ == "__main__": 279 | app.run() 280 | -------------------------------------------------------------------------------- /examples/power-analysis.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "tea-tasting", 6 | # ] 7 | # [tool.marimo.display] 8 | # cell_output = "below" 9 | # /// 10 | 11 | import marimo 12 | 13 | __generated_with = "0.13.6" 14 | app = marimo.App() 15 | 16 | 17 | @app.cell(hide_code=True) 18 | def _(mo): 19 | mo.md( 20 | r""" 21 | # Power analysis 22 | 23 | In tea-tasting, you can analyze the statistical power for `Mean` and `RatioOfMeans` metrics. There are three possible options: 24 | 25 | - Calculate the effect size, given statistical power and the total number of observations. 26 | - Calculate the total number of observations, given statistical power and the effect size. 27 | - Calculate statistical power, given the effect size and the total number of observations. 28 | 29 | In this example, tea-tasting calculates statistical power given the relative effect size and the number of observations: 30 | """ 31 | ) 32 | return 33 | 34 | 35 | @app.cell 36 | def _(): 37 | import tea_tasting as tt 38 | 39 | data = tt.make_users_data( 40 | seed=42, 41 | sessions_uplift=0, 42 | orders_uplift=0, 43 | revenue_uplift=0, 44 | covariates=True, 45 | ) 46 | orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1) 47 | orders_per_session.solve_power(data, "power") 48 | return data, tt 49 | 50 | 51 | @app.cell(hide_code=True) 52 | def _(mo): 53 | mo.md( 54 | r""" 55 | Besides `alternative`, `equal_var`, `use_t`, and covariates (CUPED), the following metric parameters affect the result: 56 | 57 | - `alpha`: Significance level. 58 | - `ratio`: Ratio of the number of observations in the treatment relative to the control. 59 | - `power`: Statistical power. 60 | - `effect_size` and `rel_effect_size`: Absolute and relative effect size. Only one of them can be defined. 61 | - `n_obs`: Number of observations in the control and in the treatment together. If the number of observations is not set explicitly, it's inferred from the dataset. 
62 | 63 | You can change the default values of `alpha`, `ratio`, `power`, and `n_obs` using the [global settings](https://tea-tasting.e10v.me/user-guide/#global-settings). 64 | 65 | tea-tasting can analyze power for several values of parameters `effect_size`, `rel_effect_size`, or `n_obs`. Example: 66 | """ 67 | ) 68 | return 69 | 70 | 71 | @app.cell 72 | def _(data, tt): 73 | orders_per_user = tt.Mean("orders", alpha=0.1, power=0.7, n_obs=(10_000, 20_000)) 74 | orders_per_user.solve_power(data, "rel_effect_size") 75 | return 76 | 77 | 78 | @app.cell(hide_code=True) 79 | def _(mo): 80 | mo.md( 81 | r""" 82 | You can analyze power for all metrics in the experiment. Example: 83 | """ 84 | ) 85 | return 86 | 87 | 88 | @app.cell 89 | def _(data, tt): 90 | with tt.config_context(n_obs=(10_000, 20_000)): 91 | experiment = tt.Experiment( 92 | sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 93 | orders_per_session=tt.RatioOfMeans( 94 | numer="orders", 95 | denom="sessions", 96 | numer_covariate="orders_covariate", 97 | denom_covariate="sessions_covariate", 98 | ), 99 | orders_per_user=tt.Mean("orders", "orders_covariate"), 100 | revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 101 | ) 102 | 103 | power_result = experiment.solve_power(data) 104 | power_result 105 | return 106 | 107 | 108 | @app.cell(hide_code=True) 109 | def _(mo): 110 | mo.md( 111 | r""" 112 | In the example above, tea-tasting calculates both the relative and absolute effect size for all metrics for two possible sample size values, `10_000` and `20_000`. 113 | 114 | The `solve_power` methods of a [metric](https://tea-tasting.e10v.me/api/metrics/mean/#tea_tasting.metrics.mean.Mean.solve_power) and of an [experiment](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.Experiment.solve_power) return instances of [`MetricPowerResults`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricPowerResults) and [`ExperimentPowerResult`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.ExperimentPowerResult) respectively. These result classes provide serialization methods similar to those of the experiment result: `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. They are also rendered as HTML tables in IPython and Jupyter, and as tables in marimo notebooks. 115 | """ 116 | ) 117 | return 118 | 119 | 120 | @app.cell(hide_code=True) 121 | def _(): 122 | import marimo as mo 123 | return (mo,) 124 | 125 | 126 | if __name__ == "__main__": 127 | app.run() 128 | -------------------------------------------------------------------------------- /examples/simulated-experiments.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "polars", 6 | # "tea-tasting", 7 | # ] 8 | # [tool.marimo.display] 9 | # cell_output = "below" 10 | # /// 11 | 12 | import marimo 13 | 14 | __generated_with = "0.13.6" 15 | app = marimo.App() 16 | 17 | 18 | @app.cell(hide_code=True) 19 | def _(mo): 20 | mo.md( 21 | r""" 22 | # Simulated experiments 23 | 24 | ## Intro 25 | 26 | In tea-tasting, you can run multiple simulated A/A or A/B tests. In each simulation, tea-tasting splits the data into control and treatment groups and can optionally modify the treatment data. A simulation without changing the treatment data is called an A/A test.
27 | 
28 | A/A tests are useful for identifying potential issues before conducting the actual A/B test. Treatment simulations are great for power analysis, especially when you need a specific uplift distribution or when an analytical formula doesn’t exist. 
29 | 
30 | /// admonition | Note 
31 | 
32 | This guide uses [Polars](https://github.com/pola-rs/polars) and [marimo](https://github.com/marimo-team/marimo). Install these packages in addition to tea-tasting to reproduce the examples: 
33 | 
34 | ```bash 
35 | uv pip install polars marimo 
36 | ``` 
37 | 
38 | /// 
39 | 
40 | ## Running A/A tests 
41 | 
42 | First, let's prepare the data without any uplift and drop the `"variant"` column. 
43 |         """ 
44 |     ) 
45 |     return 
46 | 
47 | 
48 | @app.cell 
49 | def _(): 
50 |     import polars as pl 
51 |     import tea_tasting as tt 
52 | 
53 |     data = ( 
54 |         tt.make_users_data(seed=42, orders_uplift=0, revenue_uplift=0) 
55 |         .drop_columns("variant") 
56 |     ) 
57 |     data 
58 |     return data, pl, tt 
59 | 
60 | 
61 | @app.cell(hide_code=True) 
62 | def _(mo): 
63 |     mo.md( 
64 |         r""" 
65 | To run A/A tests, first define the metrics for the experiment, then call the [`simulate`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.Experiment.simulate) method, providing the data and the number of simulations as arguments. 
66 |         """ 
67 |     ) 
68 |     return 
69 | 
70 | 
71 | @app.cell 
72 | def _(data, tt): 
73 |     experiment = tt.Experiment( 
74 |         sessions_per_user=tt.Mean("sessions"), 
75 |         orders_per_session=tt.RatioOfMeans("orders", "sessions"), 
76 |         orders_per_user=tt.Mean("orders"), 
77 |         revenue_per_user=tt.Mean("revenue"), 
78 |         n_users=tt.SampleRatio(), 
79 |     ) 
80 |     results = experiment.simulate(data, 100, seed=42) 
81 |     results_data = results.to_polars() 
82 |     results_data.select( 
83 |         "metric", 
84 |         "control", 
85 |         "treatment", 
86 |         "rel_effect_size", 
87 |         "rel_effect_size_ci_lower", 
88 |         "rel_effect_size_ci_upper", 
89 |         "pvalue", 
90 |     ) 
91 |     return experiment, results_data 
92 | 
93 | 
94 | @app.cell(hide_code=True) 
95 | def _(mo): 
96 |     mo.md( 
97 |         r""" 
98 | The `simulate` method accepts data in the same formats as the `analyze` method. Internally, however, it converts the data to a PyArrow Table before running the simulations. 
99 | 
100 | The method returns an instance of the [`SimulationResults`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.SimulationResults) class, which contains the results of all simulations for all metrics. The resulting object provides serialization methods similar to those of the experiment result, including `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. 
101 | 
102 | For instance, we can now calculate the proportion of rejected null hypotheses, using various significance levels (`alpha`). In A/A tests, this proportion estimates the type I error rate. 
103 |         """ 
104 |     ) 
105 |     return 
106 | 
107 | 
108 | @app.cell 
109 | def _(pl, results_data): 
110 |     def null_rejected( 
111 |         results_data: pl.DataFrame, 
112 |         alphas: tuple[float, ...] = (0.01, 0.02, 0.05), 
113 |     ) -> pl.DataFrame: 
114 |         return results_data.group_by("metric", maintain_order=True).agg( 
115 |             pl.col("pvalue").le(alpha).mean().alias(f"null_rejected_{alpha}") 
116 |             for alpha in alphas 
117 |         ) 
118 | 
119 |     null_rejected(results_data) 
120 |     return (null_rejected,) 
121 | 
122 | 
123 | @app.cell(hide_code=True) 
124 | def _(mo): 
125 |     mo.md( 
126 |         r""" 
127 | 100 simulations, as in the example above, produce a very rough estimate. In practice, a larger number of simulations, such as the default `10_000`, is recommended. 
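
As a rough guide, the standard error of an estimated rejection rate is `sqrt(p * (1 - p) / n_sim)`. With `n_sim = 100` simulations and a true rate of `p = 0.05`, that is about `0.022`, the same order of magnitude as the rate itself; with `n_sim = 10_000`, it drops to about `0.002`.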
128 | 129 | ## Simulating experiments with treatment 130 | 131 | To simulate experiments with treatment, define a treatment function that takes data in the form of a PyArrow Table and returns a PyArrow Table with the modified data: 132 | """ 133 | ) 134 | return 135 | 136 | 137 | @app.cell 138 | def _(data, experiment, null_rejected): 139 | import pyarrow as pa 140 | import pyarrow.compute as pc 141 | 142 | def treat(data: pa.Table) -> pa.Table: 143 | return ( 144 | data.drop_columns(["orders", "revenue"]) 145 | .append_column("orders", pc.multiply(data["orders"], pa.scalar(1.1))) 146 | .append_column("revenue", pc.multiply(data["revenue"], pa.scalar(1.1))) 147 | ) 148 | 149 | results_treat = experiment.simulate(data, 100, seed=42, treat=treat) 150 | null_rejected(results_treat.to_polars()) 151 | return (treat,) 152 | 153 | 154 | @app.cell(hide_code=True) 155 | def _(mo): 156 | mo.md( 157 | r""" 158 | In the example above, we've defined a function that increases the number of orders and the revenue by 10%. For these metrics, the proportion of rejected null hypotheses is an estimate of statistical power. 159 | 160 | ## Using a function instead of static data 161 | 162 | You can use a function instead of static data to generate input dynamically. The function should take an instance of `numpy.random.Generator` as a parameter named `seed` and return experimental data in any format supported by tea-tasting. 163 | 164 | As an example, let's use the `make_users_data` function. 165 | """ 166 | ) 167 | return 168 | 169 | 170 | @app.cell 171 | def _(experiment, null_rejected, tt): 172 | results_data_gen = experiment.simulate(tt.make_users_data, 100, seed=42) 173 | null_rejected(results_data_gen.to_polars()) 174 | return 175 | 176 | 177 | @app.cell(hide_code=True) 178 | def _(mo): 179 | mo.md( 180 | r""" 181 | On each iteration, tea-tasting calls `make_users_data` with a new `seed` and uses the returned data for the analysis of the experiment. The data returned by `make_users_data` already contains the `"variant"` column, so tea-tasting reuses that split. By default, `make_users_data` also adds the treatment uplift, and you can see it in the proportion of rejected null hypotheses. 182 | 183 | ## Tracking progress 184 | 185 | To track the progress of simulations with [`tqdm`](https://github.com/tqdm/tqdm) or [`marimo.status.progress_bar`](https://docs.marimo.io/api/status/#progress-bar), use the `progress` parameter. 186 | """ 187 | ) 188 | return 189 | 190 | 191 | @app.cell 192 | def _(data, experiment, mo): 193 | results_progress = experiment.simulate( 194 | data, 195 | 100, 196 | seed=42, 197 | progress=mo.status.progress_bar, 198 | ) 199 | return 200 | 201 | 202 | @app.cell(hide_code=True) 203 | def _(mo): 204 | mo.md( 205 | r""" 206 | ## Parallel execution 207 | 208 | /// admonition | Note 209 | 210 | The code below won't work in the [marimo online playground](https://docs.marimo.io/guides/publishing/playground/) as it relies on the `multiprocessing` module which is currently [not supported](https://docs.marimo.io/guides/wasm/#limitations) by WASM notebooks. [WASM notebooks](https://docs.marimo.io/guides/wasm/) are the marimo notebooks that run entirely in the browser. 211 | 212 | /// 213 | 214 | To speed up simulations and run them in parallel, use the `map_` parameter with an alternative mapping function. 
215 | """ 216 | ) 217 | return 218 | 219 | 220 | @app.cell 221 | def _(data, experiment, mo, treat): 222 | import concurrent.futures 223 | 224 | with concurrent.futures.ProcessPoolExecutor() as executor: 225 | results_parallel = experiment.simulate( 226 | data, 227 | 100, 228 | seed=42, 229 | treat=treat, 230 | map_=executor.map, 231 | progress=mo.status.progress_bar, 232 | ) 233 | return 234 | 235 | 236 | @app.cell(hide_code=True) 237 | def _(mo): 238 | mo.md( 239 | r""" 240 | As an alternative to [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor), you can use the `map`, `imap`, or `imap_unordered` methods of [`multiprocessing.pool.Pool`](https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool). 241 | 242 | It's also possible to run simulations on a distributed [Dask](https://distributed.dask.org/en/stable/api.html#distributed.Client.map) or [Ray](https://docs.ray.io/en/latest/ray-core/api/doc/ray.util.ActorPool.map.html#ray.util.ActorPool.map) cluster. 243 | """ 244 | ) 245 | return 246 | 247 | 248 | @app.cell(hide_code=True) 249 | def _(): 250 | import marimo as mo 251 | return (mo,) 252 | 253 | 254 | if __name__ == "__main__": 255 | app.run() 256 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "tea-tasting: statistical analysis of A/B tests" 2 | site_url: https://tea-tasting.e10v.me/ 3 | site_description: A Python package for the statistical analysis of A/B tests 4 | site_author: Evgeny Ivanov 5 | copyright: © Evgeny Ivanov
The logo is designed by Freepik 6 | repo_name: e10v/tea-tasting 7 | repo_url: https://github.com/e10v/tea-tasting 8 | 9 | nav: 10 | - Overview: index.md 11 | - User guide: user-guide.md 12 | - Data backends: data-backends.md 13 | - Power analysis: power-analysis.md 14 | - Multiple testing: multiple-testing.md 15 | - Custom metrics: custom-metrics.md 16 | - Simulated experiments: simulated-experiments.md 17 | - API reference: 18 | - API reference: api/index.md 19 | - Metrics: 20 | - Metrics: api/metrics/index.md 21 | - Base: api/metrics/base.md 22 | - Mean: api/metrics/mean.md 23 | - Proportion: api/metrics/proportion.md 24 | - Resampling: api/metrics/resampling.md 25 | - Experiment: api/experiment.md 26 | - Multiplicity: api/multiplicity.md 27 | - Datasets: api/datasets.md 28 | - Global configuration: api/config.md 29 | - Aggregates: api/aggr.md 30 | - Utilities: api/utils.md 31 | 32 | theme: 33 | name: material 34 | palette: 35 | - media: "(prefers-color-scheme)" 36 | toggle: 37 | icon: material/brightness-auto 38 | name: Switch to light mode 39 | - media: "(prefers-color-scheme: light)" 40 | scheme: default 41 | primary: deep orange 42 | accent: deep orange 43 | toggle: 44 | icon: material/brightness-7 45 | name: Switch to dark mode 46 | - media: "(prefers-color-scheme: dark)" 47 | scheme: slate 48 | primary: deep orange 49 | accent: deep orange 50 | toggle: 51 | icon: material/brightness-4 52 | name: Switch to system preference 53 | logo: assets/tea-cup-white.svg 54 | favicon: assets/tea-cup-white-on-black.svg 55 | icon: 56 | repo: fontawesome/brands/github 57 | features: 58 | - content.code.copy 59 | - navigation.indexes 60 | - navigation.instant 61 | - navigation.instant.progress 62 | - navigation.top 63 | - navigation.tracking 64 | - search.highlight 65 | - search.suggest 66 | - toc.follow 67 | 68 | plugins: 69 | - mkdocstrings: 70 | default_handler: python 71 | handlers: 72 | python: 73 | options: 74 | filters: ["!^_"] 75 | heading_level: 1 76 | inherited_members: true 77 | merge_init_into_class: true 78 | show_overloads: false 79 | show_root_heading: true 80 | - search 81 | 82 | markdown_extensions: 83 | - _internal.external_links 84 | - _internal.strip_doctest_artifacts 85 | - pymdownx.blocks.admonition 86 | - pymdownx.superfences 87 | - toc: 88 | permalink: "#" 89 | 90 | extra_css: 91 | - stylesheets/extra.css 92 | 93 | extra_javascript: 94 | - javascripts/override-copy.js 95 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tea-tasting" 3 | dynamic = ["version"] 4 | description = "A Python package for the statistical analysis of A/B tests." 
5 | authors = [ 6 | {name = "Evgeny Ivanov", email = "ivanov.evgeny.n@gmail.com"}, 7 | ] 8 | dependencies = [ 9 | "ibis-framework>=9", 10 | "narwhals>=1.4", 11 | "numpy>=1.25", 12 | "pyarrow>=16", 13 | "scipy>=1.11", 14 | ] 15 | requires-python = ">=3.10" 16 | readme = "README.md" 17 | license = {text = "MIT"} 18 | classifiers = [ 19 | "Development Status :: 5 - Production/Stable", 20 | "Intended Audience :: Developers", 21 | "Intended Audience :: Information Technology", 22 | "Intended Audience :: Science/Research", 23 | "License :: OSI Approved", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | "Topic :: Scientific/Engineering", 33 | "Topic :: Scientific/Engineering :: Information Analysis", 34 | "Topic :: Scientific/Engineering :: Mathematics", 35 | "Typing :: Typed", 36 | ] 37 | 38 | [project.urls] 39 | homepage = "https://tea-tasting.e10v.me" 40 | documentation = "https://tea-tasting.e10v.me/user-guide" 41 | source = "https://github.com/e10v/tea-tasting" 42 | "release notes" = "https://github.com/e10v/tea-tasting/releases" 43 | 44 | 45 | [dependency-groups] 46 | docs = ["mkdocs-material", "mkdocstrings[python]"] 47 | lint = ["markdown", "marimo", "pyright", "ruff"] 48 | test = [ 49 | "coverage[toml]>=7", 50 | "ibis-framework[duckdb,sqlite]", 51 | "marimo>=0.10", 52 | "pandas>=2", 53 | "polars>=1", 54 | "pytest>=8", 55 | "tqdm>=4", 56 | ] 57 | 58 | 59 | [build-system] 60 | requires = ["pdm-backend"] 61 | build-backend = "pdm.backend" 62 | 63 | 64 | [tool.pdm.build] 65 | excludes = ["src/_*/**/*"] 66 | package-dir = "src" 67 | 68 | [tool.pdm.scripts] 69 | all.composite = ["doctest", "test", "cover", "lint", "type"] 70 | all.keep_going = true 71 | cover = "coverage report -m" 72 | docserv = "mkdocs serve -w docs -w src -w mkdocs.yml" 73 | doctest.cmd = [ 74 | "pytest", 75 | "--doctest-continue-on-failure", 76 | "--doctest-glob=*.md", 77 | "--doctest-modules", 78 | "--ignore=examples/", 79 | "--ignore=tests/", 80 | "--ignore-glob=src/_*", 81 | ] 82 | lint = "ruff check ." 
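# "test" collects coverage data that the "cover" script (above) reports on; both run as part of the "all" composite.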
83 | test = "coverage run -m pytest" 84 | type = "pyright" 85 | 86 | [tool.pdm.version] 87 | source = "scm" 88 | write_to = "tea_tasting/_version.txt" 89 | 90 | 91 | [tool.coverage.run] 92 | source = ["src/tea_tasting"] 93 | [tool.coverage.report] 94 | exclude_lines = ["if TYPE_CHECKING:", "pragma: no cover", "@overload", "@abc.abstractmethod"] 95 | 96 | 97 | [tool.ruff] 98 | extend-exclude = ["examples"] 99 | src = ["src"] 100 | 101 | [tool.ruff.lint] 102 | select = [ 103 | "A", "ANN", "ARG", "B", "BLE", "C4", "C90", "COM", "D", "DOC", "E", "ERA", 104 | "F", "FA", "FBT", "FIX", "FLY", "FURB", "I", "ICN", "INP", "INT", "ISC", 105 | "N", "NPY", "PD", "PERF", "PGH", "PIE", "PL", "PT", "Q", "RET", "RSE", 106 | "RUF", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TC", "TD", "TID", "TRY", 107 | "UP", "W", 108 | ] 109 | ignore = ["ANN401", "PGH003", "SLF001", "TRY003"] 110 | 111 | [tool.ruff.lint.per-file-ignores] 112 | "*/__init__.py" = ["F401"] 113 | "tests/*" = [ 114 | "ANN201", "D", "FBT003", "PLR2004", "PT001", "S101", 115 | ] 116 | 117 | [tool.ruff.lint.isort] 118 | force-sort-within-sections = true 119 | lines-after-imports = 2 120 | 121 | [tool.ruff.lint.pydocstyle] 122 | convention = "google" 123 | 124 | [tool.ruff.lint.pylint] 125 | max-args = 8 126 | 127 | 128 | [tool.pyright] 129 | exclude = ["examples", "**/node_modules", "**/__pycache__", "**/.*"] 130 | typeCheckingMode = "strict" 131 | reportMissingTypeStubs = false 132 | reportPrivateUsage = false 133 | reportUnknownArgumentType = false 134 | reportUnknownMemberType = false 135 | reportUnknownParameterType = false 136 | reportUnknownVariableType = false 137 | -------------------------------------------------------------------------------- /src/_internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/src/_internal/__init__.py -------------------------------------------------------------------------------- /src/_internal/create_examples.py: -------------------------------------------------------------------------------- 1 | """Convert guides to examples as marimo notebooks.""" 2 | # pyright: reportPrivateImportUsage=false 3 | 4 | from __future__ import annotations 5 | 6 | import re 7 | import textwrap 8 | 9 | import marimo._ast.cell 10 | import marimo._convert.utils 11 | 12 | 13 | GUIDES: dict[str, tuple[str, ...]] = { 14 | "user-guide": ("polars",), 15 | "data-backends": ("ibis-framework[duckdb]", "polars"), 16 | "power-analysis": (), 17 | "multiple-testing": ("polars",), 18 | "custom-metrics": (), 19 | "simulated-experiments": ("polars",), 20 | } 21 | 22 | HIDE_CODE = marimo._ast.cell.CellConfig(hide_code=True) 23 | SHOW_CODE = marimo._ast.cell.CellConfig(hide_code=False) 24 | 25 | RE_LINK = re.compile(r"\[([^\]]+)\]\((?!#)([^)]+)\)") 26 | RE_DOCTEST = re.compile(r"\s+# doctest:.*") 27 | 28 | 29 | def convert_guide(name: str, deps: tuple[str, ...]) -> None: 30 | with open(f"docs/{name}.md") as f: 31 | guide_text = f.read() 32 | 33 | sources = [] 34 | cell_configs = [] 35 | for text in guide_text.split("```pycon"): 36 | if len(sources) == 0: 37 | md = text 38 | else: 39 | end_of_code = text.find("```") 40 | md = text[end_of_code + 3:] 41 | sources.append(convert_code(text[:end_of_code])) 42 | cell_configs.append(SHOW_CODE) 43 | 44 | sources.append(marimo._convert.utils.markdown_to_marimo(convert_md(md))) 45 | cell_configs.append(HIDE_CODE) 46 | 47 | sources.append("import marimo as mo") 48 | 
cell_configs.append(HIDE_CODE) 49 | 50 | code = marimo._convert.utils.generate_from_sources( 51 | sources=sources, 52 | cell_configs=cell_configs, 53 | header_comments=create_header_comments(deps), 54 | ) 55 | with open(f"examples/{name}.py", "w") as f: 56 | f.write(code) 57 | 58 | 59 | def convert_code(code: str) -> str: 60 | lines = [] 61 | for line in code.split("\n"): 62 | if line == ">>> import tqdm": 63 | pass 64 | elif line.startswith((">>>", "...")): 65 | lines.append(RE_DOCTEST.sub("", line[4:])) 66 | elif line == "": 67 | lines.append("") 68 | return "\n".join(lines).strip().replace("tqdm.tqdm", "mo.status.progress_bar") 69 | 70 | 71 | def convert_md(md: str) -> str: 72 | return ( 73 | RE_LINK.sub(update_link, md.strip()) 74 | .replace( 75 | "[tqdm](https://github.com/tqdm/tqdm)", 76 | "[marimo](https://github.com/marimo-team/marimo)", 77 | ) 78 | .replace(" tqdm", " marimo") 79 | ) 80 | 81 | 82 | def update_link(match: re.Match[str]) -> str: 83 | label = match.group(1) 84 | url = match.group(2).replace(".md", "/") 85 | root = "" if url.startswith("http") else "https://tea-tasting.e10v.me/" 86 | return f"[{label}]({root}{url})" 87 | 88 | 89 | def create_header_comments(deps: tuple[str, ...]) -> str: 90 | dependencies = "\n".join( 91 | f'# "{dep}",' 92 | for dep in sorted((*deps, "marimo", "tea-tasting")) 93 | ) 94 | return textwrap.dedent(""" 95 | # /// script 96 | # requires-python = ">=3.10" 97 | # dependencies = [ 98 | {dependencies} 99 | # ] 100 | # [tool.marimo.display] 101 | # cell_output = "below" 102 | # /// 103 | """).format(dependencies=dependencies) 104 | 105 | 106 | if __name__ == "__main__": 107 | for name, deps in GUIDES.items(): 108 | convert_guide(name, deps) 109 | -------------------------------------------------------------------------------- /src/_internal/external_links.py: -------------------------------------------------------------------------------- 1 | """Markdown extension that adds target="_blank" and rel="noopener" to external links.""" 2 | # ruff: noqa: N802 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | import urllib.parse 7 | 8 | import markdown 9 | import markdown.extensions 10 | import markdown.treeprocessors 11 | 12 | 13 | if TYPE_CHECKING: 14 | import xml.etree.ElementTree as ET 15 | 16 | 17 | class ExternalLinksTreeprocessor(markdown.treeprocessors.Treeprocessor): 18 | def run(self, root: ET.Element) -> None: 19 | for a in root.iter("a"): 20 | url = urllib.parse.urlparse(a.get("href", "")) 21 | if ( 22 | url.scheme in {"http", "https"} and 23 | url.hostname is not None and 24 | not url.hostname.startswith(("tea-tasting.e10v.me", "127.0.0.1")) 25 | ): 26 | a.set("target", "_blank") 27 | a.set("rel", "noopener") 28 | 29 | class ExternalLinksExtension(markdown.extensions.Extension): 30 | def extendMarkdown(self, md: markdown.Markdown) -> None: 31 | md.treeprocessors.register( 32 | ExternalLinksTreeprocessor(md), 33 | "external_links", 34 | -1000, 35 | ) 36 | 37 | def makeExtension(**kwargs: dict[str, object]) -> ExternalLinksExtension: 38 | return ExternalLinksExtension(**kwargs) 39 | -------------------------------------------------------------------------------- /src/_internal/strip_doctest_artifacts.py: -------------------------------------------------------------------------------- 1 | """Markdown extension that strips doctest artifacts.""" 2 | # ruff: noqa: N802 3 | from __future__ import annotations 4 | 5 | import re 6 | 7 | import markdown 8 | import markdown.extensions 9 | import markdown.preprocessors 
10 | 
11 | 
12 | RE_DOCTEST = re.compile(r"<BLANKLINE>|\s+# doctest:.*") 
13 | 
14 | class StripDoctestArtifactsPreprocessor(markdown.preprocessors.Preprocessor): 
15 |     def run(self, lines: list[str]) -> list[str]: 
16 |         return [RE_DOCTEST.sub("", line) for line in lines] 
17 | 
18 | class StripDoctestArtifactsExtension(markdown.extensions.Extension): 
19 |     def extendMarkdown(self, md: markdown.Markdown) -> None: 
20 |         md.preprocessors.register( 
21 |             StripDoctestArtifactsPreprocessor(md), 
22 |             "strip_doctest_artifacts", 
23 |             175, 
24 |         ) 
25 | 
26 | def makeExtension(**kwargs: dict[str, object]) -> StripDoctestArtifactsExtension: 
27 |     return StripDoctestArtifactsExtension(**kwargs) 
28 | 
--------------------------------------------------------------------------------
/src/tea_tasting/__init__.py:
--------------------------------------------------------------------------------
1 | """A Python package for the statistical analysis of A/B tests. 
2 | 
3 | All classes and functions for the analysis of the experiments can be imported 
4 | from the root `tea_tasting` module. 
5 | 
6 | There are functions and classes for advanced use cases such as defining custom metrics. 
7 | They can be imported from submodules of `tea_tasting`. 
8 | 
9 | For convenience, the API reference is provided by submodules: 
10 | 
11 | - `tea_tasting.metrics`: Built-in metrics. 
12 | - `tea_tasting.experiment`: Experiment and experiment result. 
13 | - `tea_tasting.multiplicity`: Multiple hypothesis testing. 
14 | - `tea_tasting.datasets`: Example datasets. 
15 | - `tea_tasting.config`: Global configuration. 
16 | - `tea_tasting.aggr`: Module for working with aggregated statistics. 
17 | - `tea_tasting.utils`: Useful functions and classes. 
18 | """ 
19 | # pyright: reportUnusedImport=false 
20 | 
21 | from tea_tasting.config import config_context, get_config, set_config 
22 | from tea_tasting.datasets import make_sessions_data, make_users_data 
23 | from tea_tasting.experiment import Experiment 
24 | from tea_tasting.metrics import Bootstrap, Mean, Quantile, RatioOfMeans, SampleRatio 
25 | from tea_tasting.multiplicity import adjust_fdr, adjust_fwer 
26 | from tea_tasting.version import __version__ 
27 | 
--------------------------------------------------------------------------------
/src/tea_tasting/config.py:
--------------------------------------------------------------------------------
1 | """Global configuration.""" 
2 | # ruff: noqa: PLR0913 
3 | 
4 | from __future__ import annotations 
5 | 
6 | import contextlib 
7 | import contextvars 
8 | from typing import TYPE_CHECKING, overload 
9 | 
10 | import tea_tasting.utils 
11 | 
12 | 
13 | if TYPE_CHECKING: 
14 |     from collections.abc import Iterator, Sequence 
15 |     from typing import Literal 
16 | 
17 | 
18 | _DEFAULT_CONFIG: dict[str, object] = { 
19 |     "alpha": 0.05, 
20 |     "alternative": "two-sided", 
21 |     "confidence_level": 0.95, 
22 |     "equal_var": False, 
23 |     "n_obs": None, 
24 |     "n_resamples": 10_000, 
25 |     "power": 0.8, 
26 |     "ratio": 1, 
27 |     "use_t": True, 
28 | } 
29 | 
30 | _config_var: contextvars.ContextVar[dict[str, object]] = contextvars.ContextVar( 
31 |     "tea_tasting.config", 
32 |     default=_DEFAULT_CONFIG.copy(),  # noqa: B039 
33 | ) 
34 | 
35 | 
36 | @overload 
37 | def get_config(option: Literal["alpha"]) -> float: 
38 |     ... 
39 | 
40 | @overload 
41 | def get_config(option: Literal["alternative"]) -> str: 
42 |     ... 
43 | 
44 | @overload 
45 | def get_config(option: Literal["confidence_level"]) -> float: 
46 |     ... 
47 | 
48 | @overload 
49 | def get_config(option: Literal["equal_var"]) -> bool: 
50 |     ...
51 | 
52 | @overload 
53 | def get_config(option: Literal["n_obs"]) -> int | Sequence[int] | None: 
54 |     ... 
55 | 
56 | @overload 
57 | def get_config(option: Literal["n_resamples"]) -> int: 
58 |     ... 
59 | 
60 | @overload 
61 | def get_config(option: Literal["power"]) -> float: 
62 |     ... 
63 | 
64 | @overload 
65 | def get_config(option: Literal["ratio"]) -> float | int: 
66 |     ... 
67 | 
68 | @overload 
69 | def get_config(option: Literal["use_t"]) -> bool: 
70 |     ... 
71 | 
72 | @overload 
73 | def get_config(option: str) -> object: 
74 |     ... 
75 | 
76 | @overload 
77 | def get_config(option: None = None) -> dict[str, object]: 
78 |     ... 
79 | 
80 | def get_config(option: str | None = None) -> object: 
81 |     """Retrieve the current settings of the global configuration. 
82 | 
83 |     Args: 
84 |         option: The option name. 
85 | 
86 |     Returns: 
87 |         The specified option value if its name is provided, 
88 |         or a dictionary containing all options otherwise. 
89 | 
90 |     Examples: 
91 |         ```pycon 
92 |         >>> import tea_tasting as tt 
93 | 
94 |         >>> tt.get_config("equal_var") 
95 |         False 
96 | 
97 |         ``` 
98 |     """ 
99 |     config = _config_var.get() 
100 |     return config[option] if option is not None else config.copy() 
101 | 
102 | 
103 | def _set_config(**params: object) -> contextvars.Token[dict[str, object]]: 
104 |     config = _config_var.get().copy() 
105 |     for name, value in params.items(): 
106 |         if value is not None: 
107 |             config[name] = tea_tasting.utils.auto_check(value, name) 
108 |     return _config_var.set(config) 
109 | 
110 | 
111 | def set_config( 
112 |     *, 
113 |     alpha: float | None = None, 
114 |     alternative: Literal["two-sided", "greater", "less"] | None = None, 
115 |     confidence_level: float | None = None, 
116 |     equal_var: bool | None = None, 
117 |     n_obs: int | Sequence[int] | None = None, 
118 |     n_resamples: int | None = None, 
119 |     power: float | None = None, 
120 |     ratio: float | int | None = None, 
121 |     use_t: bool | None = None, 
122 |     **kwargs: object, 
123 | ) -> None: 
124 |     """Update the global configuration with specified settings. 
125 | 
126 |     Args: 
127 |         alpha: Significance level. Default is 0.05. 
128 |         alternative: Alternative hypothesis: 
129 | 
130 |             - `"two-sided"`: the means are unequal, 
131 |             - `"greater"`: the mean in the treatment variant is greater than the mean 
132 |                 in the control variant, 
133 |             - `"less"`: the mean in the treatment variant is less than the mean 
134 |                 in the control variant. 
135 | 
136 |             Default is `"two-sided"`. 
137 | 
138 |         confidence_level: Confidence level for the confidence interval. 
139 |             Default is `0.95`. 
140 |         equal_var: Defines whether equal variance is assumed. If `True`, 
141 |             pooled variance is used for the calculation of the standard error 
142 |             of the difference between two means. Default is `False`. 
143 |         n_obs: Number of observations in the control and in the treatment together. 
144 |             Default is `None`. 
145 |         n_resamples: The number of resamples performed to form the bootstrap 
146 |             distribution of a statistic. Default is `10_000`. 
147 |         power: Statistical power. Default is 0.8. 
148 |         ratio: Ratio of the number of observations in the treatment 
149 |             relative to the control. Default is 1. 
150 |         use_t: Defines whether to use the Student's t-distribution (`True`) or 
151 |             the Normal distribution (`False`) by default. Default is `True`. 
152 |         **kwargs: User-defined global parameters. 
153 | 
154 |     Examples: 
155 |         ```pycon 
156 |         >>> import tea_tasting as tt 
157 | 
158 |         >>> tt.set_config(equal_var=True, use_t=False) 
159 |         >>> experiment = tt.Experiment( 
160 |         ...     sessions_per_user=tt.Mean("sessions"), 
161 |         ...
orders_per_session=tt.RatioOfMeans("orders", "sessions"), 162 | ... orders_per_user=tt.Mean("orders"), 163 | ... revenue_per_user=tt.Mean("revenue"), 164 | ... ) 165 | >>> tt.set_config(equal_var=False, use_t=True) 166 | >>> experiment.metrics["orders_per_user"] 167 | Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) 168 | 169 | ``` 170 | """ # noqa: E501 171 | _set_config(**{k: v for k, v in locals().items() if k != "kwargs"}, **kwargs) 172 | 173 | 174 | @contextlib.contextmanager 175 | def config_context( 176 | *, 177 | alpha: float | None = None, 178 | alternative: Literal["two-sided", "greater", "less"] | None = None, 179 | confidence_level: float | None = None, 180 | equal_var: bool | None = None, 181 | n_obs: int | Sequence[int] | None = None, 182 | n_resamples: int | None = None, 183 | power: float | None = None, 184 | ratio: float | int | None = None, 185 | use_t: bool | None = None, 186 | **kwargs: object, 187 | ) -> Iterator[object]: 188 | """A context manager that temporarily modifies the global configuration. 189 | 190 | Args: 191 | alpha: Significance level. Default is 0.05. 192 | alternative: Alternative hypothesis: 193 | 194 | - `"two-sided"`: the means are unequal, 195 | - `"greater"`: the mean in the treatment variant is greater than the mean 196 | in the control variant, 197 | - `"less"`: the mean in the treatment variant is less than the mean 198 | in the control variant. 199 | 200 | Default is `"two-sided"`. 201 | 202 | confidence_level: Confidence level for the confidence interval. 203 | Default is `0.95`. 204 | equal_var: Defines whether equal variance is assumed. If `True`, 205 | pooled variance is used for the calculation of the standard error 206 | of the difference between two means. Default is `False`. 207 | n_obs: Number of observations in the control and in the treatment together. 208 | Default is `None`. 209 | n_resamples: The number of resamples performed to form the bootstrap 210 | distribution of a statistic. Default is `10_000`. 211 | power: Statistical power. Default is 0.8. 212 | ratio: Ratio of the number of observations in the treatment 213 | relative to the control. Default is 1. 214 | use_t: Defines whether to use the Student's t-distribution (`True`) or 215 | the Normal distribution (`False`) by default. Default is `True`. 216 | **kwargs: User-defined global parameters. 217 | 218 | Examples: 219 | ```pycon 220 | >>> import tea_tasting as tt 221 | 222 | >>> with tt.config_context(equal_var=True, use_t=False): 223 | ... experiment = tt.Experiment( 224 | ... sessions_per_user=tt.Mean("sessions"), 225 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 226 | ... orders_per_user=tt.Mean("orders"), 227 | ... revenue_per_user=tt.Mean("revenue"), 228 | ... 
) 229 | >>> experiment.metrics["orders_per_user"] 230 | Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) 231 | 232 | ``` 233 | """ # noqa: E501 234 | token = _set_config( 235 | **{k: v for k, v in locals().items() if k != "kwargs"}, 236 | **kwargs, 237 | ) 238 | try: 239 | yield 240 | finally: 241 | _config_var.reset(token) 242 | -------------------------------------------------------------------------------- /src/tea_tasting/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """This module provides built-in metrics used to analyze experimental data. 2 | 3 | All metric classes can be imported from `tea_tasting.metrics` module. 4 | For convenience, the API reference is provided by submodules of `tea_tasting.metrics`: 5 | 6 | - `tea_tasting.metrics.base`: Base classes for metrics. 7 | - `tea_tasting.metrics.mean`: Metrics for the analysis of means. 8 | - `tea_tasting.metrics.proportion`: Metrics for the analysis of proportions. 9 | - `tea_tasting.metrics.resampling`: Metrics analyzed using resampling methods. 10 | """ 11 | # pyright: reportUnusedImport=false 12 | 13 | from tea_tasting.metrics.base import ( 14 | AggrCols, 15 | MetricBase, 16 | MetricBaseAggregated, 17 | MetricBaseGranular, 18 | MetricPowerResults, 19 | MetricResult, 20 | PowerBase, 21 | PowerBaseAggregated, 22 | aggregate_by_variants, 23 | read_granular, 24 | ) 25 | from tea_tasting.metrics.mean import Mean, RatioOfMeans 26 | from tea_tasting.metrics.proportion import SampleRatio 27 | from tea_tasting.metrics.resampling import Bootstrap, Quantile 28 | -------------------------------------------------------------------------------- /src/tea_tasting/metrics/base.py: -------------------------------------------------------------------------------- 1 | """Base classes for metrics.""" 2 | 3 | from __future__ import annotations 4 | 5 | import abc 6 | from collections import UserList 7 | from typing import ( 8 | TYPE_CHECKING, 9 | Generic, 10 | NamedTuple, 11 | TypeAlias, 12 | TypeVar, 13 | Union, 14 | overload, 15 | ) 16 | 17 | import ibis 18 | import ibis.expr.types 19 | import narwhals as nw 20 | import pyarrow as pa 21 | import pyarrow.compute as pc 22 | 23 | import tea_tasting.aggr 24 | import tea_tasting.utils 25 | 26 | 27 | if TYPE_CHECKING: 28 | from collections.abc import Sequence 29 | from typing import Literal 30 | 31 | import narwhals.typing # noqa: TC004 32 | 33 | 34 | # The | operator doesn't work for NamedTuple, but Union works. 
35 | MetricResult: TypeAlias = Union[NamedTuple, dict[str, object]]  # noqa: UP007 
36 | MetricPowerResult: TypeAlias = Union[NamedTuple, dict[str, object]]  # noqa: UP007 
37 | 
38 | R = TypeVar("R", bound=MetricResult) 
39 | P = TypeVar("P", bound=MetricPowerResult) 
40 | 
41 | 
42 | class MetricPowerResults(tea_tasting.utils.DictsReprMixin, UserList[P]): 
43 |     """Power analysis results.""" 
44 |     default_keys = ("power", "effect_size", "rel_effect_size", "n_obs") 
45 | 
46 |     @tea_tasting.utils._cache_method 
47 |     def to_dicts(self) -> tuple[dict[str, object], ...]: 
48 |         """Convert the results to a sequence of dictionaries.""" 
49 |         return tuple((v if isinstance(v, dict) else v._asdict()) for v in self) 
50 | 
51 | S = TypeVar("S", bound=MetricPowerResults)  # type: ignore 
52 | 
53 | 
54 | class MetricBase(abc.ABC, Generic[R], tea_tasting.utils.ReprMixin): 
55 |     """Base class for metrics.""" 
56 |     @abc.abstractmethod 
57 |     def analyze( 
58 |         self, 
59 |         data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 
60 |         control: object, 
61 |         treatment: object, 
62 |         variant: str, 
63 |     ) -> R: 
64 |         """Analyze a metric in an experiment. 
65 | 
66 |         Args: 
67 |             data: Experimental data. 
68 |             control: Control variant. 
69 |             treatment: Treatment variant. 
70 |             variant: Variant column name. 
71 | 
72 |         Returns: 
73 |             Analysis result. 
74 |         """ 
75 | 
76 | 
77 | class PowerBase(abc.ABC, Generic[S], tea_tasting.utils.ReprMixin): 
78 |     """Base class for the analysis of power.""" 
79 |     @abc.abstractmethod 
80 |     def solve_power( 
81 |         self, 
82 |         data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 
83 |         parameter: Literal[ 
84 |             "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size", 
85 |     ) -> S: 
86 |         """Solve for a parameter of the power of a test. 
87 | 
88 |         Args: 
89 |             data: Sample data. 
90 |             parameter: Parameter name. 
91 | 
92 |         Returns: 
93 |             Power analysis result. 
94 |         """ 
95 | 
96 | 
97 | class AggrCols(NamedTuple): 
98 |     """Columns to be aggregated for a metric analysis. 
99 | 
100 |     Attributes: 
101 |         has_count: If `True`, include the sample size. 
102 |         mean_cols: Column names for calculation of sample means. 
103 |         var_cols: Column names for calculation of sample variances. 
104 |         cov_cols: Pairs of column names for calculation of sample covariances. 
105 |     """ 
106 |     has_count: bool = False 
107 |     mean_cols: Sequence[str] = () 
108 |     var_cols: Sequence[str] = () 
109 |     cov_cols: Sequence[tuple[str, str]] = () 
110 | 
111 |     def __or__(self, other: AggrCols) -> AggrCols: 
112 |         """Merge two aggregation column specifications. 
113 | 
114 |         Args: 
115 |             other: Second object. 
116 | 
117 |         Returns: 
118 |             Merged column specifications. 
119 |         """ 
120 |         return AggrCols( 
121 |             has_count=self.has_count or other.has_count, 
122 |             mean_cols=tuple({*self.mean_cols, *other.mean_cols}), 
123 |             var_cols=tuple({*self.var_cols, *other.var_cols}), 
124 |             cov_cols=tuple({ 
125 |                 tea_tasting.aggr._sorted_tuple(*cols) 
126 |                 for cols in tuple({*self.cov_cols, *other.cov_cols}) 
127 |             }), 
128 |         ) 
129 | 
130 |     def __len__(self) -> int: 
131 |         """Total length of all object attributes. 
132 | 
133 |         has_count contributes 1 to the total if True, and 0 otherwise. 
134 | """ 135 | return ( 136 | int(self.has_count) 137 | + len(self.mean_cols) 138 | + len(self.var_cols) 139 | + len(self.cov_cols) 140 | ) 141 | 142 | 143 | class _HasAggrCols(abc.ABC): 144 | @property 145 | @abc.abstractmethod 146 | def aggr_cols(self) -> AggrCols: 147 | """Columns to be aggregated for an analysis.""" 148 | 149 | 150 | class MetricBaseAggregated(MetricBase[R], _HasAggrCols): 151 | """Base class for metrics, which are analyzed using aggregated statistics.""" 152 | @overload 153 | def analyze( 154 | self, 155 | data: dict[object, tea_tasting.aggr.Aggregates], 156 | control: object, 157 | treatment: object, 158 | variant: str | None = None, 159 | ) -> R: 160 | ... 161 | 162 | @overload 163 | def analyze( 164 | self, 165 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 166 | control: object, 167 | treatment: object, 168 | variant: str, 169 | ) -> R: 170 | ... 171 | 172 | def analyze( 173 | self, 174 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[ 175 | object, tea_tasting.aggr.Aggregates], 176 | control: object, 177 | treatment: object, 178 | variant: str | None = None, 179 | ) -> R: 180 | """Analyze a metric in an experiment. 181 | 182 | Args: 183 | data: Experimental data. 184 | control: Control variant. 185 | treatment: Treatment variant. 186 | variant: Variant column name. 187 | 188 | Returns: 189 | Analysis result. 190 | """ 191 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None) 192 | aggr = aggregate_by_variants( 193 | data, 194 | aggr_cols=self.aggr_cols, 195 | variant=variant, 196 | ) 197 | return self.analyze_aggregates( 198 | control=aggr[control], 199 | treatment=aggr[treatment], 200 | ) 201 | 202 | @abc.abstractmethod 203 | def analyze_aggregates( 204 | self, 205 | control: tea_tasting.aggr.Aggregates, 206 | treatment: tea_tasting.aggr.Aggregates, 207 | ) -> R: 208 | """Analyze metric in an experiment using aggregated statistics. 209 | 210 | Args: 211 | control: Control data. 212 | treatment: Treatment data. 213 | 214 | Returns: 215 | Analysis result. 216 | """ 217 | 218 | 219 | class PowerBaseAggregated(PowerBase[S], _HasAggrCols): 220 | """Base class for the analysis of power using aggregated statistics.""" 221 | def solve_power( 222 | self, 223 | data: ( 224 | narwhals.typing.IntoFrame | 225 | ibis.expr.types.Table | 226 | tea_tasting.aggr.Aggregates 227 | ), 228 | parameter: Literal[ 229 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size", 230 | ) -> S: 231 | """Solve for a parameter of the power of a test. 232 | 233 | Args: 234 | data: Sample data. 235 | parameter: Parameter name. 236 | 237 | Returns: 238 | Power analysis result. 239 | """ 240 | tea_tasting.utils.check_scalar( 241 | parameter, 242 | "parameter", 243 | in_={"power", "effect_size", "rel_effect_size", "n_obs"}, 244 | ) 245 | if not isinstance(data, tea_tasting.aggr.Aggregates): 246 | data = tea_tasting.aggr.read_aggregates( 247 | data=data, 248 | group_col=None, 249 | **self.aggr_cols._asdict(), 250 | ) 251 | return self.solve_power_from_aggregates(data=data, parameter=parameter) 252 | 253 | @abc.abstractmethod 254 | def solve_power_from_aggregates( 255 | self, 256 | data: tea_tasting.aggr.Aggregates, 257 | parameter: Literal[ 258 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size", 259 | ) -> S: 260 | """Solve for a parameter of the power of a test. 261 | 262 | Args: 263 | data: Sample data. 264 | parameter: Parameter name. 265 | 266 | Returns: 267 | Power analysis result. 
268 | """ 269 | 270 | 271 | def aggregate_by_variants( 272 | data: ( 273 | narwhals.typing.IntoFrame | 274 | ibis.expr.types.Table | 275 | dict[object, tea_tasting.aggr.Aggregates] 276 | ), 277 | aggr_cols: AggrCols, 278 | variant: str | None = None, 279 | ) -> dict[object, tea_tasting.aggr.Aggregates]: 280 | """Aggregate experimental data by variants. 281 | 282 | Args: 283 | data: Experimental data. 284 | aggr_cols: Columns to be aggregated. 285 | variant: Variant column name. 286 | 287 | Returns: 288 | Experimental data as a dictionary of Aggregates. 289 | """ 290 | if isinstance(data, dict): 291 | return data 292 | 293 | if variant is None: 294 | raise ValueError("The variant parameter is required but was not provided.") 295 | 296 | return tea_tasting.aggr.read_aggregates( 297 | data=data, 298 | group_col=variant, 299 | **aggr_cols._asdict(), 300 | ) 301 | 302 | 303 | class _HasCols(abc.ABC): 304 | @property 305 | @abc.abstractmethod 306 | def cols(self) -> Sequence[str]: 307 | """Columns to be fetched for an analysis.""" 308 | 309 | 310 | class MetricBaseGranular(MetricBase[R], _HasCols): 311 | """Base class for metrics, which are analyzed using granular data.""" 312 | @overload 313 | def analyze( 314 | self, 315 | data: dict[object, pa.Table], 316 | control: object, 317 | treatment: object, 318 | variant: str | None = None, 319 | ) -> R: 320 | ... 321 | 322 | @overload 323 | def analyze( 324 | self, 325 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 326 | control: object, 327 | treatment: object, 328 | variant: str, 329 | ) -> R: 330 | ... 331 | 332 | def analyze( 333 | self, 334 | data: ( 335 | narwhals.typing.IntoFrame | 336 | ibis.expr.types.Table | 337 | dict[object, pa.Table] 338 | ), 339 | control: object, 340 | treatment: object, 341 | variant: str | None = None, 342 | ) -> R: 343 | """Analyze a metric in an experiment. 344 | 345 | Args: 346 | data: Experimental data. 347 | control: Control variant. 348 | treatment: Treatment variant. 349 | variant: Variant column name. 350 | 351 | Returns: 352 | Analysis result. 353 | """ 354 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None) 355 | dfs = read_granular( 356 | data, 357 | cols=self.cols, 358 | variant=variant, 359 | ) 360 | return self.analyze_granular( 361 | control=dfs[control], 362 | treatment=dfs[treatment], 363 | ) 364 | 365 | @abc.abstractmethod 366 | def analyze_granular( 367 | self, 368 | control: pa.Table, 369 | treatment: pa.Table, 370 | ) -> R: 371 | """Analyze metric in an experiment using granular data. 372 | 373 | Args: 374 | control: Control data. 375 | treatment: Treatment data. 376 | 377 | Returns: 378 | Analysis result. 379 | """ 380 | 381 | 382 | @overload 383 | def read_granular( 384 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 385 | cols: Sequence[str] = (), 386 | variant: None = None, 387 | ) -> pa.Table: 388 | ... 389 | 390 | @overload 391 | def read_granular( 392 | data: dict[object, pa.Table], 393 | cols: Sequence[str] = (), 394 | variant: None = None, 395 | ) -> dict[object, pa.Table]: 396 | ... 397 | 398 | @overload 399 | def read_granular( 400 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[object, pa.Table], 401 | cols: Sequence[str], 402 | variant: str, 403 | ) -> dict[object, pa.Table]: 404 | ... 
405 | 406 | def read_granular( 407 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[object, pa.Table], 408 | cols: Sequence[str] = (), 409 | variant: str | None = None, 410 | ) -> pa.Table | dict[object, pa.Table]: 411 | """Read granular experimental data. 412 | 413 | Args: 414 | data: Experimental data. 415 | cols: Columns to read. 416 | variant: Variant column name. 417 | 418 | Returns: 419 | Experimental data as a dictionary of PyArrow Tables. 420 | """ 421 | if isinstance(data, dict): 422 | return data 423 | 424 | variant_cols = () if variant is None else (variant,) 425 | if isinstance(data, ibis.expr.types.Table): 426 | if len(cols) + len(variant_cols) > 0: 427 | data = data.select(*cols, *variant_cols) 428 | table = data.to_pyarrow() 429 | else: 430 | data = nw.from_native(data) 431 | if isinstance(data, nw.LazyFrame): 432 | data = data.collect() 433 | if len(cols) + len(variant_cols) > 0: 434 | data = data.select(*cols, *variant_cols) 435 | table = data.to_arrow() 436 | 437 | if variant is None: 438 | return table 439 | 440 | variant_array = table[variant] 441 | if len(cols) > 0: 442 | table = table.select(cols) 443 | return { 444 | var: table.filter(pc.equal(variant_array, pa.scalar(var))) # type: ignore 445 | for var in variant_array.unique().to_pylist() 446 | } 447 | -------------------------------------------------------------------------------- /src/tea_tasting/metrics/proportion.py: -------------------------------------------------------------------------------- 1 | """Metrics for the analysis of proportions.""" 2 | 3 | from __future__ import annotations 4 | 5 | import math 6 | from typing import TYPE_CHECKING, NamedTuple 7 | 8 | import scipy.stats 9 | 10 | import tea_tasting.aggr 11 | import tea_tasting.metrics 12 | from tea_tasting.metrics.base import AggrCols, MetricBaseAggregated 13 | import tea_tasting.utils 14 | 15 | 16 | if TYPE_CHECKING: 17 | from typing import Literal 18 | 19 | import ibis.expr.types 20 | import narwhals.typing 21 | 22 | 23 | _MAX_EXACT_THRESHOLD = 1000 24 | 25 | 26 | class SampleRatioResult(NamedTuple): 27 | """Result of the sample ratio mismatch check. 28 | 29 | Attributes: 30 | control: Number of observations in control. 31 | treatment: Number of observations in treatment. 32 | pvalue: P-value 33 | """ 34 | control: float 35 | treatment: float 36 | pvalue: float 37 | 38 | 39 | class SampleRatio(MetricBaseAggregated[SampleRatioResult]): # noqa: D101 40 | def __init__( 41 | self, 42 | ratio: float | int | dict[object, float | int] = 1, 43 | *, 44 | method: Literal["auto", "binom", "norm"] = "auto", 45 | correction: bool = True, 46 | ) -> None: 47 | """Metric for sample ratio mismatch check. 48 | 49 | Args: 50 | ratio: Expected ratio of the number of observations in the treatment 51 | relative to the control. 52 | method: Statistical test used for calculation of p-value: 53 | 54 | - `"auto"`: Apply exact binomial test if the total number 55 | of observations is < 1000; or normal approximation otherwise. 56 | - `"binom"`: Apply exact binomial test. 57 | - `"norm"`: Apply normal approximation of the binomial distribution. 58 | 59 | correction: If `True`, add continuity correction. 60 | Only for normal approximation. 61 | 62 | Examples: 63 | ```pycon 64 | >>> import tea_tasting as tt 65 | 66 | >>> experiment = tt.Experiment( 67 | ... sample_ratio=tt.SampleRatio(), 68 | ... 
) 69 | >>> data = tt.make_users_data(seed=42) 70 | >>> result = experiment.analyze(data) 71 | >>> result.with_keys(("metric", "control", "treatment", "pvalue")) 72 | metric control treatment pvalue 73 | sample_ratio 2023 1977 0.477 74 | 75 | ``` 76 | 77 | Different expected ratio: 78 | 79 | ```pycon 80 | >>> experiment = tt.Experiment( 81 | ... sample_ratio=tt.SampleRatio(0.5), 82 | ... ) 83 | >>> data = tt.make_users_data(seed=42) 84 | >>> result = experiment.analyze(data) 85 | >>> result.with_keys(("metric", "control", "treatment", "pvalue")) 86 | metric control treatment pvalue 87 | sample_ratio 2023 1977 3.26e-103 88 | 89 | ``` 90 | """ 91 | if isinstance(ratio, dict): 92 | for val in ratio.values(): 93 | tea_tasting.utils.auto_check(val, "ratio") 94 | else: 95 | tea_tasting.utils.auto_check(ratio, "ratio") 96 | self.ratio = ratio 97 | 98 | self.method = tea_tasting.utils.check_scalar( 99 | method, "method", typ=str, in_={"auto", "binom", "norm"}) 100 | self.correction = tea_tasting.utils.auto_check(correction, "correction") 101 | 102 | 103 | @property 104 | def aggr_cols(self) -> AggrCols: 105 | """Columns to be aggregated for a metric analysis.""" 106 | return AggrCols(has_count=True) 107 | 108 | 109 | def analyze( 110 | self, 111 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[ 112 | object, tea_tasting.aggr.Aggregates], 113 | control: object, 114 | treatment: object, 115 | variant: str | None = None, 116 | ) -> SampleRatioResult: 117 | """Perform a sample ratio mismatch check. 118 | 119 | Args: 120 | data: Experimental data. 121 | control: Control variant. 122 | treatment: Treatment variant. 123 | variant: Variant column name. 124 | 125 | Returns: 126 | Analysis result. 127 | """ 128 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None) 129 | aggr = tea_tasting.metrics.aggregate_by_variants( 130 | data, 131 | aggr_cols=self.aggr_cols, 132 | variant=variant, 133 | ) 134 | 135 | k = aggr[treatment].count() 136 | n = k + aggr[control].count() 137 | 138 | r = ( 139 | self.ratio 140 | if isinstance(self.ratio, float | int) 141 | else self.ratio[treatment] / self.ratio[control] 142 | ) 143 | p = r / (1 + r) 144 | 145 | if ( 146 | self.method == "binom" or 147 | (self.method == "auto" and n < _MAX_EXACT_THRESHOLD) 148 | ): 149 | pvalue = scipy.stats.binomtest(k=int(k), n=int(n), p=p).pvalue 150 | else: # norm 151 | d = k - n*p 152 | if self.correction and d != 0: 153 | d = min(d + 0.5, 0) if d < 0 else max(d - 0.5, 0) 154 | z = d / math.sqrt(n * p * (1 - p)) 155 | pvalue = 2 * scipy.stats.norm.sf(abs(z)) 156 | 157 | return SampleRatioResult( 158 | control=n - k, 159 | treatment=k, 160 | pvalue=pvalue, # type: ignore 161 | ) 162 | 163 | 164 | def analyze_aggregates( 165 | self, 166 | control: tea_tasting.aggr.Aggregates, 167 | treatment: tea_tasting.aggr.Aggregates, 168 | ) -> SampleRatioResult: 169 | """Stub method for compatibility with the base class.""" 170 | raise NotImplementedError 171 | -------------------------------------------------------------------------------- /src/tea_tasting/version.py: -------------------------------------------------------------------------------- 1 | """Package version.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib.metadata 6 | import importlib.resources 7 | 8 | 9 | try: 10 | __version__ = importlib.metadata.version(__package__ or "tea-tasting") 11 | except importlib.metadata.PackageNotFoundError: 12 | __version__ = ( 13 | importlib.resources.files("tea_tasting") 14 | .joinpath("_version.txt") 15 | 
.read_text() 16 | .strip() 17 | ) 18 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/tests/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/metrics/test_base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, NamedTuple 4 | import unittest.mock 5 | 6 | import ibis 7 | import polars as pl 8 | import pyarrow as pa 9 | import pyarrow.compute as pc 10 | import pytest 11 | 12 | import tea_tasting.aggr 13 | import tea_tasting.datasets 14 | import tea_tasting.metrics.base 15 | 16 | 17 | if TYPE_CHECKING: 18 | from typing import Any, Literal 19 | 20 | import ibis.expr.types # noqa: TC004 21 | import pandas as pd 22 | 23 | 24 | Frame = ibis.expr.types.Table | pa.Table | pd.DataFrame | pl.LazyFrame 25 | 26 | 27 | def test_aggr_cols_or(): 28 | aggr_cols0 = tea_tasting.metrics.base.AggrCols( 29 | has_count=False, 30 | mean_cols=("a", "b"), 31 | var_cols=("b", "c"), 32 | cov_cols=(("a", "b"), ("c", "b")), 33 | ) 34 | 35 | aggr_cols1 = tea_tasting.metrics.base.AggrCols( 36 | has_count=True, 37 | mean_cols=("b", "c"), 38 | var_cols=("c", "d"), 39 | cov_cols=(("b", "c"), ("d", "c")), 40 | ) 41 | 42 | aggr_cols = aggr_cols0 | aggr_cols1 43 | 44 | assert isinstance(aggr_cols, tea_tasting.metrics.base.AggrCols) 45 | assert aggr_cols.has_count is True 46 | assert set(aggr_cols.mean_cols) == {"a", "b", "c"} 47 | assert len(aggr_cols.mean_cols) == 3 48 | assert set(aggr_cols.var_cols) == {"b", "c", "d"} 49 | assert len(aggr_cols.var_cols) == 3 50 | assert set(aggr_cols.cov_cols) == {("a", "b"), ("b", "c"), ("c", "d")} 51 | assert len(aggr_cols.cov_cols) == 3 52 | 53 | 54 | def test_aggr_cols_len(): 55 | assert len(tea_tasting.metrics.base.AggrCols( 56 | has_count=False, 57 | mean_cols=("a", "b"), 58 | var_cols=("b", "c"), 59 | cov_cols=(("a", "b"), ("c", "b")), 60 | )) == 6 61 | assert len(tea_tasting.metrics.base.AggrCols( 62 | has_count=True, 63 | mean_cols=("b", "c"), 64 | var_cols=("c", "d"), 65 | cov_cols=(("b", "c"), ("d", "c")), 66 | )) == 7 67 | 68 | 69 | @pytest.fixture 70 | def data_arrow() -> pa.Table: 71 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 72 | 73 | @pytest.fixture 74 | def data_pandas(data_arrow: pa.Table) -> pd.DataFrame: 75 | return data_arrow.to_pandas() 76 | 77 | @pytest.fixture 78 | def data_polars(data_arrow: pa.Table) -> pl.DataFrame: 79 | return pl.from_arrow(data_arrow) # type: ignore 80 | 81 | @pytest.fixture 82 | def data_polars_lazy(data_polars: pl.DataFrame) -> pl.LazyFrame: 83 | return data_polars.lazy() 84 | 85 | @pytest.fixture 86 | def data_duckdb(data_arrow: pa.Table) -> ibis.expr.types.Table: 87 | return ibis.connect("duckdb://").create_table("data", data_arrow) 88 | 89 | @pytest.fixture 90 | def data_sqlite(data_arrow: pa.Table) -> ibis.expr.types.Table: 91 | return ibis.connect("sqlite://").create_table("data", data_arrow) 92 | 93 | @pytest.fixture(params=[ 
94 | "data_arrow", "data_pandas", 95 | "data_polars", "data_polars_lazy", 96 | "data_duckdb", "data_sqlite", 97 | ]) 98 | def data(request: pytest.FixtureRequest) -> Frame: 99 | return request.getfixturevalue(request.param) 100 | 101 | 102 | @pytest.fixture 103 | def aggr_cols() -> tea_tasting.metrics.base.AggrCols: 104 | return tea_tasting.metrics.base.AggrCols( 105 | has_count=True, 106 | mean_cols=("sessions", "orders"), 107 | var_cols=("orders", "revenue"), 108 | cov_cols=(("sessions", "revenue"),), 109 | ) 110 | 111 | @pytest.fixture 112 | def correct_aggrs( 113 | data_arrow: pa.Table, 114 | aggr_cols: tea_tasting.metrics.base.AggrCols, 115 | ) -> dict[object, tea_tasting.aggr.Aggregates]: 116 | return tea_tasting.aggr.read_aggregates( 117 | data_arrow, 118 | group_col="variant", 119 | **aggr_cols._asdict(), 120 | ) 121 | 122 | @pytest.fixture 123 | def correct_aggr( 124 | data_arrow: pa.Table, 125 | aggr_cols: tea_tasting.metrics.base.AggrCols, 126 | ) -> tea_tasting.aggr.Aggregates: 127 | return tea_tasting.aggr.read_aggregates( 128 | data_arrow, 129 | group_col=None, 130 | **aggr_cols._asdict(), 131 | ) 132 | 133 | @pytest.fixture 134 | def cols() -> tuple[str, ...]: 135 | return ("sessions", "orders", "revenue") 136 | 137 | @pytest.fixture 138 | def correct_gran( 139 | data_arrow: pa.Table, 140 | cols: tuple[str, ...], 141 | ) -> dict[object, pa.Table]: 142 | variant_col = data_arrow["variant"] 143 | table = data_arrow.select(cols) 144 | return { 145 | var: table.filter(pc.equal(variant_col, pa.scalar(var))) # type: ignore 146 | for var in variant_col.unique().to_pylist() 147 | } 148 | 149 | @pytest.fixture 150 | def aggr_metric( 151 | aggr_cols: tea_tasting.metrics.base.AggrCols, 152 | ) -> tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]]: 153 | class AggrMetric(tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]]): 154 | @property 155 | def aggr_cols(self) -> tea_tasting.metrics.base.AggrCols: 156 | return aggr_cols 157 | 158 | def analyze_aggregates( 159 | self, 160 | control: tea_tasting.aggr.Aggregates, # noqa: ARG002 161 | treatment: tea_tasting.aggr.Aggregates, # noqa: ARG002 162 | ) -> dict[str, object]: 163 | return {} 164 | 165 | return AggrMetric() 166 | 167 | @pytest.fixture 168 | def aggr_power( 169 | aggr_cols: tea_tasting.metrics.base.AggrCols, 170 | ) -> tea_tasting.metrics.base.PowerBaseAggregated[ 171 | tea_tasting.metrics.base.MetricPowerResults[dict[str, object]] 172 | ]: 173 | class AggrPower( 174 | tea_tasting.metrics.base.PowerBaseAggregated[ 175 | tea_tasting.metrics.base.MetricPowerResults[dict[str, object]] 176 | ], 177 | ): 178 | @property 179 | def aggr_cols(self) -> tea_tasting.metrics.base.AggrCols: 180 | return aggr_cols 181 | 182 | def solve_power_from_aggregates( 183 | self, 184 | data: tea_tasting.aggr.Aggregates, # noqa: ARG002 185 | parameter: Literal[ # noqa: ARG002 186 | "power", 187 | "effect_size", 188 | "rel_effect_size", 189 | "n_obs", 190 | ] = "power", 191 | ) -> tea_tasting.metrics.base.MetricPowerResults[dict[str, object]]: 192 | return tea_tasting.metrics.base.MetricPowerResults() 193 | return AggrPower() 194 | 195 | @pytest.fixture 196 | def gran_metric( 197 | cols: tuple[str, ...], 198 | ) -> tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]]: 199 | class GranMetric(tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]]): 200 | @property 201 | def cols(self) -> tuple[str, ...]: 202 | return cols 203 | 204 | def analyze_granular( 205 | self, 206 | control: pa.Table, # noqa: ARG002 207 | 
treatment: pa.Table, # noqa: ARG002 208 | ) -> dict[str, object]: 209 | return {} 210 | 211 | return GranMetric() 212 | 213 | 214 | def _compare_aggrs( 215 | left: tea_tasting.aggr.Aggregates, 216 | right: tea_tasting.aggr.Aggregates, 217 | ) -> None: 218 | assert left.count_ == right.count_ 219 | assert left.mean_ == pytest.approx(right.mean_) 220 | assert left.var_ == pytest.approx(right.var_) 221 | assert left.cov_ == pytest.approx(right.cov_) 222 | 223 | 224 | def test_metric_power_results_to_dicts(): 225 | result0 = { 226 | "power": 0.8, 227 | "effect_size": 1, 228 | "rel_effect_size": 0.05, 229 | "n_obs": 10_000, 230 | } 231 | result1 = { 232 | "power": 0.9, 233 | "effect_size": 2, 234 | "rel_effect_size": 0.1, 235 | "n_obs": 20_000, 236 | } 237 | 238 | results = tea_tasting.metrics.base.MetricPowerResults[dict[str, float | int]]( # type: ignore 239 | [result0, result1]) 240 | assert results.to_dicts() == (result0, result1) 241 | 242 | class PowerResult(NamedTuple): 243 | power: float 244 | effect_size: float 245 | rel_effect_size: float 246 | n_obs: float 247 | results = tea_tasting.metrics.base.MetricPowerResults[PowerResult]([ 248 | PowerResult(**result0), 249 | PowerResult(**result1), 250 | ]) 251 | assert results.to_dicts() == (result0, result1) 252 | 253 | 254 | def test_metric_base_aggregated_analyze_frame( 255 | aggr_metric: tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]], 256 | data_arrow: pa.Table, 257 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 258 | ): 259 | aggr_metric.analyze_aggregates = unittest.mock.MagicMock() 260 | aggr_metric.analyze(data_arrow, control=0, treatment=1, variant="variant") 261 | aggr_metric.analyze_aggregates.assert_called_once() 262 | kwargs = aggr_metric.analyze_aggregates.call_args.kwargs 263 | _compare_aggrs(kwargs["control"], correct_aggrs[0]) 264 | _compare_aggrs(kwargs["treatment"], correct_aggrs[1]) 265 | 266 | def test_metric_base_aggregated_analyze_aggrs( 267 | aggr_metric: tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]], 268 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 269 | ): 270 | aggr_metric.analyze_aggregates = unittest.mock.MagicMock() 271 | aggr_metric.analyze(correct_aggrs, control=0, treatment=1) 272 | aggr_metric.analyze_aggregates.assert_called_once() 273 | kwargs = aggr_metric.analyze_aggregates.call_args.kwargs 274 | _compare_aggrs(kwargs["control"], correct_aggrs[0]) 275 | _compare_aggrs(kwargs["treatment"], correct_aggrs[1]) 276 | 277 | 278 | def test_power_base_aggregated_analyze_frame( 279 | aggr_power: tea_tasting.metrics.base.PowerBaseAggregated[Any], 280 | data_arrow: pa.Table, 281 | correct_aggr: tea_tasting.aggr.Aggregates, 282 | ): 283 | aggr_power.solve_power_from_aggregates = unittest.mock.MagicMock() 284 | aggr_power.solve_power(data_arrow, "effect_size") 285 | aggr_power.solve_power_from_aggregates.assert_called_once() 286 | kwargs = aggr_power.solve_power_from_aggregates.call_args.kwargs 287 | _compare_aggrs(kwargs["data"], correct_aggr) 288 | assert kwargs["parameter"] == "effect_size" 289 | 290 | def test_power_base_aggregated_analyze_aggr( 291 | aggr_power: tea_tasting.metrics.base.PowerBaseAggregated[Any], 292 | correct_aggr: tea_tasting.aggr.Aggregates, 293 | ): 294 | aggr_power.solve_power_from_aggregates = unittest.mock.MagicMock() 295 | aggr_power.solve_power(correct_aggr, "rel_effect_size") 296 | aggr_power.solve_power_from_aggregates.assert_called_once() 297 | kwargs = aggr_power.solve_power_from_aggregates.call_args.kwargs 298 | 
_compare_aggrs(kwargs["data"], correct_aggr) 299 | assert kwargs["parameter"] == "rel_effect_size" 300 | 301 | 302 | def test_aggregate_by_variants_frame( 303 | data_arrow: pa.Table, 304 | aggr_cols: tea_tasting.metrics.base.AggrCols, 305 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 306 | ): 307 | aggrs = tea_tasting.metrics.base.aggregate_by_variants( 308 | data_arrow, 309 | aggr_cols=aggr_cols, 310 | variant="variant", 311 | ) 312 | _compare_aggrs(aggrs[0], correct_aggrs[0]) 313 | _compare_aggrs(aggrs[1], correct_aggrs[1]) 314 | 315 | def test_aggregate_by_variants_aggrs( 316 | aggr_cols: tea_tasting.metrics.base.AggrCols, 317 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 318 | ): 319 | aggrs = tea_tasting.metrics.base.aggregate_by_variants( 320 | correct_aggrs, 321 | aggr_cols=aggr_cols, 322 | variant="variant", 323 | ) 324 | _compare_aggrs(aggrs[0], correct_aggrs[0]) 325 | _compare_aggrs(aggrs[1], correct_aggrs[1]) 326 | 327 | def test_aggregate_by_variants_raises( 328 | data_arrow: pa.Table, 329 | aggr_cols: tea_tasting.metrics.base.AggrCols, 330 | ): 331 | with pytest.raises(ValueError, match="variant"): 332 | tea_tasting.metrics.base.aggregate_by_variants(data_arrow, aggr_cols=aggr_cols) 333 | 334 | 335 | def test_metric_base_granular_frame( 336 | gran_metric: tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]], 337 | data_arrow: pa.Table, 338 | correct_gran: dict[object, pa.Table], 339 | ): 340 | gran_metric.analyze_granular = unittest.mock.MagicMock() 341 | gran_metric.analyze(data_arrow, control=0, treatment=1, variant="variant") 342 | gran_metric.analyze_granular.assert_called_once() 343 | kwargs = gran_metric.analyze_granular.call_args.kwargs 344 | assert kwargs["control"].equals(correct_gran[0]) 345 | assert kwargs["treatment"].equals(correct_gran[1]) 346 | 347 | def test_metric_base_granular_gran( 348 | gran_metric: tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]], 349 | correct_gran: dict[object, pa.Table], 350 | ): 351 | gran_metric.analyze_granular = unittest.mock.MagicMock() 352 | gran_metric.analyze(correct_gran, control=0, treatment=1) 353 | gran_metric.analyze_granular.assert_called_once() 354 | kwargs = gran_metric.analyze_granular.call_args.kwargs 355 | assert kwargs["control"].equals(correct_gran[0]) 356 | assert kwargs["treatment"].equals(correct_gran[1]) 357 | 358 | 359 | def test_read_granular_frame( 360 | data: Frame, 361 | cols: tuple[str, ...], 362 | correct_gran: dict[object, pa.Table], 363 | ): 364 | gran = tea_tasting.metrics.base.read_granular( 365 | data, 366 | cols=cols, 367 | variant="variant", 368 | ) 369 | assert gran[0].equals(correct_gran[0]) 370 | assert gran[1].equals(correct_gran[1]) 371 | 372 | def test_read_granular_dict( 373 | cols: tuple[str, ...], 374 | correct_gran: dict[object, pa.Table], 375 | ): 376 | gran = tea_tasting.metrics.base.read_granular( 377 | correct_gran, 378 | cols=cols, 379 | variant="variant", 380 | ) 381 | assert gran[0].equals(correct_gran[0]) 382 | assert gran[1].equals(correct_gran[1]) 383 | 384 | def test_read_granular_none( 385 | data: Frame, 386 | cols: tuple[str, ...], 387 | data_arrow: pa.Table, 388 | ): 389 | gran = tea_tasting.metrics.base.read_granular(data, cols=cols) 390 | assert gran.equals(data_arrow.select(cols)) 391 | -------------------------------------------------------------------------------- /tests/metrics/test_proportion.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 
| 3 | from typing import TYPE_CHECKING, NamedTuple 4 | import unittest.mock 5 | 6 | import pytest 7 | 8 | import tea_tasting.aggr 9 | import tea_tasting.datasets 10 | import tea_tasting.metrics.base 11 | import tea_tasting.metrics.proportion 12 | 13 | 14 | if TYPE_CHECKING: 15 | import pyarrow as pa 16 | 17 | 18 | @pytest.fixture 19 | def data_arrow() -> pa.Table: 20 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 21 | 22 | @pytest.fixture 23 | def data_aggr(data_arrow: pa.Table) -> dict[object, tea_tasting.aggr.Aggregates]: 24 | return tea_tasting.aggr.read_aggregates( 25 | data_arrow, 26 | group_col="variant", 27 | has_count=True, 28 | mean_cols=(), 29 | var_cols=(), 30 | cov_cols=(), 31 | ) 32 | 33 | 34 | def test_sample_ratio_init_default(): 35 | metric = tea_tasting.metrics.proportion.SampleRatio() 36 | assert metric.ratio == 1 37 | assert metric.method == "auto" 38 | assert metric.correction is True 39 | 40 | def test_sample_ratio_init_custom(): 41 | metric = tea_tasting.metrics.proportion.SampleRatio( 42 | {0: 0.5, 1: 0.5}, 43 | method="norm", 44 | correction=False, 45 | ) 46 | assert metric.ratio == {0: 0.5, 1: 0.5} 47 | assert metric.method == "norm" 48 | assert metric.correction is False 49 | 50 | 51 | def test_sample_ratio_aggr_cols(): 52 | metric = tea_tasting.metrics.proportion.SampleRatio() 53 | assert metric.aggr_cols == tea_tasting.metrics.base.AggrCols(has_count=True) 54 | 55 | 56 | def test_sample_ratio_analyze_frame(data_arrow: pa.Table): 57 | metric = tea_tasting.metrics.proportion.SampleRatio() 58 | result = metric.analyze(data_arrow, 0, 1, variant="variant") 59 | assert isinstance(result, tea_tasting.metrics.proportion.SampleRatioResult) 60 | 61 | def test_sample_ratio_analyze_auto(): 62 | metric = tea_tasting.metrics.proportion.SampleRatio() 63 | with unittest.mock.patch("scipy.stats.binomtest") as mock: 64 | mock.return_value = NamedTuple("Result", (("pvalue", float),))(pvalue=0.1) 65 | data = tea_tasting.datasets.make_users_data( 66 | seed=42, 67 | n_users=tea_tasting.metrics.proportion._MAX_EXACT_THRESHOLD - 1, 68 | ) 69 | metric.analyze(data, 0, 1, variant="variant") 70 | mock.assert_called_once() 71 | with unittest.mock.patch("scipy.stats.norm.sf") as mock: 72 | mock.return_value = 0.1 73 | data = tea_tasting.datasets.make_users_data( 74 | seed=42, 75 | n_users=tea_tasting.metrics.proportion._MAX_EXACT_THRESHOLD, 76 | ) 77 | metric.analyze(data, 0, 1, variant="variant") 78 | mock.assert_called_once() 79 | 80 | def test_sample_ratio_analyze_binom( 81 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 82 | ): 83 | metric = tea_tasting.metrics.proportion.SampleRatio(method="binom") 84 | result = metric.analyze(data_aggr, 0, 1, variant="variant") 85 | assert result.control == 53 86 | assert result.treatment == 47 87 | assert result.pvalue == pytest.approx(0.6172994135892521) 88 | 89 | def test_sample_ratio_analyze_norm_corr( 90 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 91 | ): 92 | metric = tea_tasting.metrics.proportion.SampleRatio(method="norm", correction=True) 93 | result = metric.analyze(data_aggr, 0, 1, variant="variant") 94 | assert result.control == 53 95 | assert result.treatment == 47 96 | assert result.pvalue == pytest.approx(0.6170750774519738) 97 | 98 | def test_sample_ratio_analyze_norm_no_corr( 99 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 100 | ): 101 | metric = tea_tasting.metrics.proportion.SampleRatio(method="norm", correction=False) 102 | result = metric.analyze(data_aggr, 0, 1, 
variant="variant") 103 | assert result.control == 53 104 | assert result.treatment == 47 105 | assert result.pvalue == pytest.approx(0.5485062355001472) 106 | 107 | def test_sample_ratio_analyze_aggregates( 108 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 109 | ): 110 | metric = tea_tasting.metrics.proportion.SampleRatio() 111 | with pytest.raises(NotImplementedError): 112 | metric.analyze_aggregates(data_aggr[0], data_aggr[1]) 113 | -------------------------------------------------------------------------------- /tests/metrics/test_resampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | import tea_tasting.config 9 | import tea_tasting.datasets 10 | import tea_tasting.metrics.base 11 | import tea_tasting.metrics.resampling 12 | 13 | 14 | if TYPE_CHECKING: 15 | import numpy.typing as npt 16 | import pyarrow as pa 17 | 18 | 19 | @pytest.fixture 20 | def data_arrow() -> pa.Table: 21 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 22 | 23 | @pytest.fixture 24 | def data_gran(data_arrow: pa.Table) -> dict[object, pa.Table]: 25 | return tea_tasting.metrics.base.read_granular( 26 | data_arrow, 27 | ("sessions", "orders", "revenue"), 28 | variant="variant", 29 | ) 30 | 31 | 32 | def test_bootstrap_init_default(): 33 | metric = tea_tasting.metrics.resampling.Bootstrap("a", np.mean) 34 | assert metric.columns == "a" 35 | assert metric.statistic == np.mean 36 | assert metric.alternative == tea_tasting.config.get_config("alternative") 37 | assert metric.confidence_level == tea_tasting.config.get_config("confidence_level") 38 | assert metric.n_resamples == tea_tasting.config.get_config("n_resamples") 39 | assert metric.method == "bca" 40 | assert metric.batch is None 41 | assert metric.random_state is None 42 | 43 | def test_bootstrap_init_custom(): 44 | metric = tea_tasting.metrics.resampling.Bootstrap( 45 | ("a", "b"), 46 | np.mean, 47 | alternative="greater", 48 | confidence_level=0.9, 49 | n_resamples=1000, 50 | method="basic", 51 | batch=100, 52 | random_state=42, 53 | ) 54 | assert metric.columns == ("a", "b") 55 | assert metric.statistic == np.mean 56 | assert metric.alternative == "greater" 57 | assert metric.confidence_level == 0.9 58 | assert metric.n_resamples == 1000 59 | assert metric.method == "basic" 60 | assert metric.batch == 100 61 | assert metric.random_state == 42 62 | 63 | 64 | def test_bootstrap_cols(): 65 | metric = tea_tasting.metrics.resampling.Bootstrap("a", np.mean) 66 | assert metric.cols == ("a",) 67 | 68 | metric = tea_tasting.metrics.resampling.Bootstrap(("a", "b"), np.mean) 69 | assert metric.cols == ("a", "b") 70 | 71 | 72 | def test_bootstrap_analyze_frame(data_arrow: pa.Table): 73 | metric = tea_tasting.metrics.resampling.Bootstrap("sessions", np.mean) 74 | result = metric.analyze(data_arrow, 0, 1, variant="variant") 75 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 76 | 77 | 78 | def test_bootstrap_analyze_default(data_gran: dict[object, pa.Table]): 79 | metric = tea_tasting.metrics.resampling.Bootstrap( 80 | "revenue", 81 | np.mean, 82 | n_resamples=100, 83 | random_state=42, 84 | ) 85 | result = metric.analyze(data_gran, 0, 1) 86 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 87 | assert result.control == pytest.approx(5.029811320754717) 88 | assert result.treatment == pytest.approx(5.43) 89 | assert 
result.effect_size == pytest.approx(0.4001886792452831) 90 | assert result.effect_size_ci_lower == pytest.approx(-3.269396309565539) 91 | assert result.effect_size_ci_upper == pytest.approx(7.219843380442667) 92 | assert result.rel_effect_size == pytest.approx(0.07956335809137971) 93 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.5658493834599828) 94 | assert result.rel_effect_size_ci_upper == pytest.approx(1.8185473860534842) 95 | 96 | def test_bootstrap_analyze_multiple_columns(data_gran: dict[object, pa.Table]): 97 | def ratio_of_means( 98 | sample: npt.NDArray[np.number], 99 | axis: int, 100 | ) -> npt.NDArray[np.number]: 101 | stat = np.mean(sample, axis=axis) # type: ignore 102 | return stat[0] / stat[1] 103 | 104 | metric = tea_tasting.metrics.resampling.Bootstrap( 105 | ("orders", "sessions"), 106 | ratio_of_means, 107 | n_resamples=100, 108 | random_state=42, 109 | ) 110 | result = metric.analyze(data_gran, 0, 1) 111 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 112 | assert result.control == pytest.approx(0.2857142857142857) 113 | assert result.treatment == pytest.approx(0.20224719101123595) 114 | assert result.effect_size == pytest.approx(-0.08346709470304975) 115 | assert result.effect_size_ci_lower == pytest.approx(-0.24780839493679777) 116 | assert result.effect_size_ci_upper == pytest.approx(0.07730723504025493) 117 | assert result.rel_effect_size == pytest.approx(-0.2921348314606741) 118 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.6424902672606227) 119 | assert result.rel_effect_size_ci_upper == pytest.approx(0.4374404130492657) 120 | 121 | def test_bootstrap_analyze_division_by_zero(data_gran: dict[object, pa.Table]): 122 | metric = tea_tasting.metrics.resampling.Bootstrap( 123 | "orders", 124 | np.median, 125 | n_resamples=100, 126 | random_state=42, 127 | method="basic", 128 | ) 129 | result = metric.analyze(data_gran, 0, 1) 130 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 131 | assert result.control == 0 132 | assert result.treatment == 0 133 | assert result.effect_size == 0 134 | assert result.effect_size_ci_lower == 0 135 | assert result.effect_size_ci_upper == 0 136 | assert np.isnan(result.rel_effect_size) 137 | assert np.isnan(result.rel_effect_size_ci_lower) 138 | assert np.isnan(result.rel_effect_size_ci_upper) 139 | 140 | def test_quantile(data_gran: dict[object, pa.Table]): 141 | metric = tea_tasting.metrics.resampling.Quantile( 142 | "revenue", 143 | q=0.8, 144 | alternative="greater", 145 | confidence_level=0.9, 146 | n_resamples=100, 147 | random_state=42, 148 | ) 149 | assert metric.column == "revenue" 150 | assert metric.q == 0.8 151 | result = metric.analyze(data_gran, 0, 1) 152 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 153 | assert result.control == pytest.approx(11.972000000000001) 154 | assert result.treatment == pytest.approx(6.2820000000000045) 155 | assert result.effect_size == pytest.approx(-5.689999999999997) 156 | assert result.effect_size_ci_lower == pytest.approx(-10.875800000000003) 157 | assert result.effect_size_ci_upper == float("inf") 158 | assert result.rel_effect_size == pytest.approx(-0.47527564316739024) 159 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.8743329817472134) 160 | assert result.rel_effect_size_ci_upper == float("inf") 161 | -------------------------------------------------------------------------------- /tests/test_aggr.py: 
-------------------------------------------------------------------------------- 1 | # pyright: reportAttributeAccessIssue=false 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | import ibis 7 | import numpy as np 8 | import pandas as pd 9 | import polars as pl 10 | import pyarrow as pa 11 | import pyarrow.compute as pc 12 | import pytest 13 | 14 | import tea_tasting.aggr 15 | import tea_tasting.datasets 16 | 17 | 18 | if TYPE_CHECKING: 19 | import ibis.expr.types  # noqa: TC004 20 | 21 | 22 | Frame = ibis.expr.types.Table | pa.Table | pd.DataFrame | pl.LazyFrame 23 | 24 | 25 | COUNT = 100 26 | MEAN = {"x": 5.0, "y": 4} 27 | VAR = {"x": 3.0, "y": 2} 28 | COV = {("x", "y"): 1.0} 29 | 30 | @pytest.fixture 31 | def aggr() -> tea_tasting.aggr.Aggregates: 32 | return tea_tasting.aggr.Aggregates( 33 | count_=COUNT, 34 | mean_=MEAN, 35 | var_=VAR, 36 | cov_=COV,  # type: ignore 37 | ) 38 | 39 | 40 | @pytest.fixture 41 | def data_arrow() -> pa.Table: 42 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 43 | 44 | @pytest.fixture 45 | def data_pandas(data_arrow: pa.Table) -> pd.DataFrame: 46 | return data_arrow.to_pandas() 47 | 48 | @pytest.fixture 49 | def data_polars(data_arrow: pa.Table) -> pl.DataFrame: 50 | return pl.from_arrow(data_arrow)  # type: ignore 51 | 52 | @pytest.fixture 53 | def data_polars_lazy(data_polars: pl.DataFrame) -> pl.LazyFrame: 54 | return data_polars.lazy() 55 | 56 | @pytest.fixture 57 | def data_duckdb(data_arrow: pa.Table) -> ibis.expr.types.Table: 58 | return ibis.connect("duckdb://").create_table("data", data_arrow) 59 | 60 | @pytest.fixture 61 | def data_sqlite(data_arrow: pa.Table) -> ibis.expr.types.Table: 62 | return ibis.connect("sqlite://").create_table("data", data_arrow) 63 | 64 | @pytest.fixture(params=[ 65 | "data_arrow", "data_pandas", 66 | "data_polars", "data_polars_lazy", 67 | "data_duckdb", "data_sqlite", 68 | ]) 69 | def data(request: pytest.FixtureRequest) -> Frame: 70 | return request.getfixturevalue(request.param) 71 | 72 | 73 | @pytest.fixture 74 | def correct_aggr(data_arrow: pa.Table) -> tea_tasting.aggr.Aggregates: 75 | return tea_tasting.aggr.Aggregates( 76 | count_=data_arrow.num_rows, 77 | mean_={ 78 | "sessions": pc.mean(data_arrow["sessions"]).as_py(), 79 | "orders": pc.mean(data_arrow["orders"]).as_py(), 80 | }, 81 | var_={ 82 | "sessions": pc.variance(data_arrow["sessions"], ddof=1).as_py(), 83 | "orders": pc.variance(data_arrow["orders"], ddof=1).as_py(), 84 | }, 85 | cov_={ 86 | ("orders", "sessions"): np.cov( 87 | data_arrow["sessions"].combine_chunks().to_numpy(zero_copy_only=False), 88 | data_arrow["orders"].combine_chunks().to_numpy(zero_copy_only=False), 89 | ddof=1, 90 | )[0, 1], 91 | }, 92 | ) 93 | 94 | @pytest.fixture 95 | def correct_aggrs(data_arrow: pa.Table) -> dict[int, tea_tasting.aggr.Aggregates]: 96 | variant_col = data_arrow["variant"] 97 | aggrs = {} 98 | for var in variant_col.unique().to_pylist(): 99 | var_data = data_arrow.filter(pc.equal(variant_col, pa.scalar(var))) 100 | aggrs |= {var: tea_tasting.aggr.Aggregates( 101 | count_=var_data.num_rows, 102 | mean_={ 103 | "sessions": pc.mean(var_data["sessions"]).as_py(), 104 | "orders": pc.mean(var_data["orders"]).as_py(), 105 | }, 106 | var_={ 107 | "sessions": pc.variance(var_data["sessions"], ddof=1).as_py(), 108 | "orders": pc.variance(var_data["orders"], ddof=1).as_py(), 109 | }, 110 | cov_={ 111 | ("orders", "sessions"): np.cov( 112 | var_data["sessions"].combine_chunks().to_numpy(zero_copy_only=False), 113 |
var_data["orders"].combine_chunks().to_numpy(zero_copy_only=False), 114 | ddof=1, 115 | )[0, 1], 116 | }, 117 | )} 118 | return aggrs 119 | 120 | 121 | def test_aggregates_init(aggr: tea_tasting.aggr.Aggregates): 122 | assert aggr.count_ == COUNT 123 | assert aggr.mean_ == MEAN 124 | assert aggr.var_ == VAR 125 | assert aggr.cov_ == COV 126 | 127 | def test_aggregates_calls(aggr: tea_tasting.aggr.Aggregates): 128 | assert aggr.count() == COUNT 129 | assert aggr.mean("x") == MEAN["x"] 130 | assert aggr.mean("y") == MEAN["y"] 131 | assert aggr.var("x") == VAR["x"] 132 | assert aggr.mean("y") == MEAN["y"] 133 | assert aggr.cov("x", "y") == COV["x", "y"] 134 | 135 | def test_aggregates_count_raises(): 136 | aggr = tea_tasting.aggr.Aggregates(count_=None, mean_={}, var_={}, cov_={}) 137 | with pytest.raises(RuntimeError): 138 | aggr.count() 139 | 140 | def test_aggregates_none(aggr: tea_tasting.aggr.Aggregates): 141 | assert aggr.mean(None) == 1 142 | assert aggr.var(None) == 0 143 | assert aggr.cov(None, "y") == 0 144 | assert aggr.cov("x", None) == 0 145 | 146 | def test_aggregates_ratio_var(aggr: tea_tasting.aggr.Aggregates): 147 | assert aggr.ratio_var("x", "y") == pytest.approx(0.2265625) 148 | 149 | def test_aggregates_ratio_cov(): 150 | aggr = tea_tasting.aggr.Aggregates( 151 | count_=None, 152 | mean_={"a": 8, "b": 7, "c": 6, "d": 5}, 153 | var_={}, 154 | cov_={("a", "c"): 4, ("a", "d"): 3, ("b", "c"): 2, ("b", "d"): 1}, 155 | ) 156 | assert aggr.ratio_cov("a", "b", "c", "d") == pytest.approx(-0.0146938775510204) 157 | 158 | def test_aggregates_add( 159 | correct_aggr: tea_tasting.aggr.Aggregates, 160 | correct_aggrs: dict[int, tea_tasting.aggr.Aggregates], 161 | ): 162 | aggrs_add = correct_aggrs[0] + correct_aggrs[1] 163 | assert aggrs_add.count_ == pytest.approx(correct_aggr.count_) 164 | assert aggrs_add.mean_ == pytest.approx(correct_aggr.mean_) 165 | assert aggrs_add.var_ == pytest.approx(correct_aggr.var_) 166 | assert aggrs_add.cov_ == pytest.approx(correct_aggr.cov_) 167 | 168 | 169 | def test_read_aggregates_groups( 170 | data: Frame, 171 | correct_aggrs: dict[int, tea_tasting.aggr.Aggregates], 172 | ): 173 | aggrs = tea_tasting.aggr.read_aggregates( 174 | data, 175 | group_col="variant", 176 | has_count=True, 177 | mean_cols=("sessions", "orders"), 178 | var_cols=("sessions", "orders"), 179 | cov_cols=(("sessions", "orders"),), 180 | ) 181 | for i in (0, 1): 182 | assert aggrs[i].count_ == pytest.approx(correct_aggrs[i].count_) 183 | assert aggrs[i].mean_ == pytest.approx(correct_aggrs[i].mean_) 184 | assert aggrs[i].var_ == pytest.approx(correct_aggrs[i].var_) 185 | assert aggrs[i].cov_ == pytest.approx(correct_aggrs[i].cov_) 186 | 187 | def test_read_aggregates_no_groups( 188 | data: Frame, 189 | correct_aggr: tea_tasting.aggr.Aggregates, 190 | ): 191 | aggr = tea_tasting.aggr.read_aggregates( 192 | data, 193 | group_col=None, 194 | has_count=True, 195 | mean_cols=("sessions", "orders"), 196 | var_cols=("sessions", "orders"), 197 | cov_cols=(("sessions", "orders"),), 198 | ) 199 | assert aggr.count_ == pytest.approx(correct_aggr.count_) 200 | assert aggr.mean_ == pytest.approx(correct_aggr.mean_) 201 | assert aggr.var_ == pytest.approx(correct_aggr.var_) 202 | assert aggr.cov_ == pytest.approx(correct_aggr.cov_) 203 | 204 | def test_read_aggregates_no_count(data_arrow: pa.Table): 205 | aggr = tea_tasting.aggr.read_aggregates( 206 | data_arrow, 207 | group_col=None, 208 | has_count=False, 209 | mean_cols=("sessions", "orders"), 210 | var_cols=(), 211 | cov_cols=(), 212 | ) 
213 | assert aggr.count_ is None 214 | assert aggr.var_ == {} 215 | assert aggr.cov_ == {} 216 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pytest 6 | 7 | import tea_tasting.config 8 | 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Iterator 12 | 13 | 14 | @pytest.fixture 15 | def reset_config() -> Iterator[None]: 16 | try: 17 | yield 18 | finally: 19 | tea_tasting.config._config_var.set(tea_tasting.config._DEFAULT_CONFIG.copy()) 20 | 21 | 22 | @pytest.mark.usefixtures("reset_config") 23 | def test_get_config(): 24 | config = tea_tasting.config.get_config() 25 | assert config == tea_tasting.config._config_var.get() 26 | config["equal_var"] = not config["equal_var"] 27 | assert config != tea_tasting.config._config_var.get() 28 | 29 | assert ( 30 | tea_tasting.config.get_config("equal_var") == 31 | tea_tasting.config._config_var.get()["equal_var"] 32 | ) 33 | 34 | 35 | @pytest.mark.usefixtures("reset_config") 36 | def test_set_config(): 37 | tea_tasting.config.set_config(equal_var=True) 38 | assert tea_tasting.config._config_var.get()["equal_var"] is True 39 | 40 | tea_tasting.config.set_config(equal_var=False) 41 | assert tea_tasting.config._config_var.get()["equal_var"] is False 42 | 43 | 44 | @pytest.mark.usefixtures("reset_config") 45 | def test_config_context(): 46 | old_equal_var = tea_tasting.config._config_var.get()["equal_var"] 47 | 48 | with tea_tasting.config.config_context(equal_var=not old_equal_var): 49 | assert tea_tasting.config._config_var.get()["equal_var"] is not old_equal_var 50 | 51 | assert tea_tasting.config._config_var.get()["equal_var"] is old_equal_var 52 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | # pyright: reportAttributeAccessIssue=false 2 | from __future__ import annotations 3 | 4 | import pandas as pd 5 | import polars as pl 6 | import pyarrow as pa 7 | import pyarrow.compute as pc 8 | 9 | import tea_tasting.datasets 10 | 11 | 12 | def test_make_users_data_default(): 13 | n_users = 100 14 | data = tea_tasting.datasets.make_users_data(seed=42, n_users=n_users) 15 | assert isinstance(data, pa.Table) 16 | assert data.column_names == ["user", "variant", "sessions", "orders", "revenue"] 17 | assert data.num_rows == n_users 18 | assert pc.count_distinct(data["user"]).as_py() == n_users 19 | assert pc.count_distinct(data["variant"]).as_py() == 2 20 | assert pc.min(data["sessions"]).as_py() > 0 21 | assert pc.min(data["orders"]).as_py() >= 0 22 | assert pc.min(data["revenue"]).as_py() >= 0 23 | assert pc.max(pc.subtract(data["orders"], data["sessions"])).as_py() <= 0 24 | assert int(pc.min(pc.equal( 25 | pc.greater(data["revenue"], 0), 26 | pc.greater(data["orders"], 0), 27 | )).as_py()) == 1 28 | 29 | def test_make_users_data_pandas(): 30 | n_users = 100 31 | data = tea_tasting.datasets.make_users_data( 32 | seed=42, n_users=n_users, return_type="pandas") 33 | assert isinstance(data, pd.DataFrame) 34 | assert data.columns.to_list() == [ 35 | "user", "variant", "sessions", "orders", "revenue"] 36 | assert data.shape[0] == n_users 37 | 38 | def test_make_users_data_polars(): 39 | n_users = 100 40 | data = tea_tasting.datasets.make_users_data( 41 | seed=42, n_users=n_users,
return_type="polars") 42 | assert isinstance(data, pl.DataFrame) 43 | assert data.columns == [ 44 | "user", "variant", "sessions", "orders", "revenue"] 45 | assert data.shape[0] == n_users 46 | 47 | 48 | def test_make_users_data_covariates(): 49 | n_users = 100 50 | data = tea_tasting.datasets.make_users_data( 51 | seed=42, covariates=True, n_users=n_users) 52 | assert isinstance(data, pa.Table) 53 | assert data.column_names == [ 54 | "user", "variant", "sessions", "orders", "revenue", 55 | "sessions_covariate", "orders_covariate", "revenue_covariate", 56 | ] 57 | assert pc.min(data["sessions_covariate"]).as_py() >= 0 58 | assert pc.min(data["orders_covariate"]).as_py() >= 0 59 | assert pc.min(data["revenue_covariate"]).as_py() >= 0 60 | assert pc.min(pc.subtract( 61 | data["orders_covariate"], 62 | data["sessions_covariate"], 63 | )).as_py() <= 0 64 | assert int(pc.min(pc.equal( 65 | pc.greater_equal(data["revenue_covariate"], 0), 66 | pc.greater_equal(data["orders_covariate"], 0), 67 | )).as_py()) == 1 68 | 69 | 70 | def test_make_sessions_data_default(): 71 | n_users = 100 72 | data = tea_tasting.datasets.make_sessions_data(seed=42, n_users=n_users) 73 | assert isinstance(data, pa.Table) 74 | assert data.column_names == ["user", "variant", "sessions", "orders", "revenue"] 75 | assert data.num_rows > n_users 76 | assert pc.count_distinct(data["user"]).as_py() == n_users 77 | assert pc.count_distinct(data["variant"]).as_py() == 2 78 | assert pc.min(data["sessions"]).as_py() == 1 79 | assert pc.max(data["sessions"]).as_py() == 1 80 | assert pc.min(data["orders"]).as_py() >= 0 81 | assert pc.min(data["revenue"]).as_py() >= 0 82 | assert pc.min(pc.subtract(data["orders"], data["sessions"])).as_py() <= 0 83 | assert int(pc.min(pc.equal( 84 | pc.greater_equal(data["revenue"], 0), 85 | pc.greater_equal(data["orders"], 0), 86 | )).as_py()) == 1 87 | 88 | def test_make_sessions_data_pandas(): 89 | n_users = 100 90 | data = tea_tasting.datasets.make_sessions_data( 91 | seed=42, n_users=n_users, return_type="pandas") 92 | assert isinstance(data, pd.DataFrame) 93 | assert data.columns.to_list() == [ 94 | "user", "variant", "sessions", "orders", "revenue"] 95 | assert data.shape[0] > n_users 96 | 97 | def test_make_sessions_data_polars(): 98 | n_users = 100 99 | data = tea_tasting.datasets.make_sessions_data( 100 | seed=42, n_users=n_users, return_type="polars") 101 | assert isinstance(data, pl.DataFrame) 102 | assert data.columns == [ 103 | "user", "variant", "sessions", "orders", "revenue"] 104 | assert data.shape[0] > n_users 105 | 106 | 107 | def test_make_sessions_data_covariates(): 108 | n_users = 100 109 | data = tea_tasting.datasets.make_sessions_data( 110 | seed=42, covariates=True, n_users=n_users) 111 | assert isinstance(data, pa.Table) 112 | assert data.column_names == [ 113 | "user", "variant", "sessions", "orders", "revenue", 114 | "sessions_covariate", "orders_covariate", "revenue_covariate", 115 | ] 116 | assert pc.min(data["sessions_covariate"]).as_py() >= 0 117 | assert pc.min(data["orders_covariate"]).as_py() >= 0 118 | assert pc.min(data["revenue_covariate"]).as_py() >= 0 119 | assert pc.min(pc.subtract( 120 | data["orders_covariate"], 121 | data["sessions_covariate"], 122 | )).as_py() <= 0 123 | assert int(pc.min(pc.equal( 124 | pc.greater_equal(data["revenue_covariate"], 0), 125 | pc.greater_equal(data["orders_covariate"], 0), 126 | )).as_py()) == 1 127 | -------------------------------------------------------------------------------- /tests/test_version.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import importlib.metadata 5 | import unittest.mock 6 | 7 | import tea_tasting.version 8 | 9 | 10 | def test_version(): 11 | assert isinstance(tea_tasting.version.__version__, str) 12 | 13 | with ( 14 | unittest.mock.patch( 15 | "tea_tasting.version.importlib.metadata.version") as version, 16 | unittest.mock.patch("tea_tasting.version.importlib.resources.files") as files, 17 | ): 18 | ( 19 | files.return_value 20 | .joinpath.return_value 21 | .read_text.return_value 22 | .strip.return_value 23 | ) = "version" 24 | 25 | version.side_effect = importlib.metadata.PackageNotFoundError("Not found") 26 | importlib.reload(tea_tasting.version) 27 | assert tea_tasting.version.__version__ == "version" 28 | --------------------------------------------------------------------------------