├── .github
│   └── workflows
│       ├── ci.yml
│       ├── docs.yml
│       └── release.yml
├── .gitignore
├── .markdownlint.yaml
├── LICENSE
├── README.md
├── docs
│   ├── CNAME
│   ├── api
│   │   ├── aggr.md
│   │   ├── config.md
│   │   ├── datasets.md
│   │   ├── experiment.md
│   │   ├── index.md
│   │   ├── metrics
│   │   │   ├── base.md
│   │   │   ├── index.md
│   │   │   ├── mean.md
│   │   │   ├── proportion.md
│   │   │   └── resampling.md
│   │   ├── multiplicity.md
│   │   └── utils.md
│   ├── assets
│   │   ├── tea-cup-black.svg
│   │   ├── tea-cup-white-on-black.svg
│   │   └── tea-cup-white.svg
│   ├── custom-metrics.md
│   ├── data-backends.md
│   ├── index.md
│   ├── javascripts
│   │   └── override-copy.js
│   ├── multiple-testing.md
│   ├── power-analysis.md
│   ├── simulated-experiments.md
│   ├── stylesheets
│   │   └── extra.css
│   └── user-guide.md
├── examples
│   ├── README.md
│   ├── custom-metrics.py
│   ├── data-backends.py
│   ├── multiple-testing.py
│   ├── power-analysis.py
│   ├── simulated-experiments.py
│   └── user-guide.py
├── mkdocs.yml
├── pyproject.toml
├── src
│   ├── _internal
│   │   ├── __init__.py
│   │   ├── create_examples.py
│   │   ├── external_links.py
│   │   └── strip_doctest_artifacts.py
│   └── tea_tasting
│       ├── __init__.py
│       ├── aggr.py
│       ├── config.py
│       ├── datasets.py
│       ├── experiment.py
│       ├── metrics
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── mean.py
│       │   ├── proportion.py
│       │   └── resampling.py
│       ├── multiplicity.py
│       ├── utils.py
│       └── version.py
└── tests
    ├── __init__.py
    ├── metrics
    │   ├── __init__.py
    │   ├── test_base.py
    │   ├── test_mean.py
    │   ├── test_proportion.py
    │   └── test_resampling.py
    ├── test_aggr.py
    ├── test_config.py
    ├── test_datasets.py
    ├── test_experiment.py
    ├── test_multiplicity.py
    ├── test_utils.py
    └── test_version.py
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 | on:
3 |   pull_request:
4 |     paths:
5 |     - '**.py'
6 |     - '.github/workflows/ci.yml'
7 |     - 'pyproject.toml'
8 |   push:
9 |     branches: [main]
10 |     paths:
11 |     - '**.py'
12 |     - '.github/workflows/ci.yml'
13 |     - 'pyproject.toml'
14 |   workflow_dispatch:
15 | jobs:
16 |   doctest:
17 |     runs-on: ${{ matrix.os }}
18 |     strategy:
19 |       matrix:
20 |         os: [ubuntu-latest]
21 |         python-version: ["3.10"]
22 |     steps:
23 |     - name: checkout
24 |       uses: actions/checkout@v4
25 |     - name: set up uv
26 |       uses: astral-sh/setup-uv@v6
27 |       with:
28 |         activate-environment: true
29 |         cache-suffix: "${{ matrix.python-version }}-test"
30 |         enable-cache: true
31 |         python-version: ${{ matrix.python-version }}
32 |     - name: install dependencies
33 |       run: uv sync --group test
34 |     - name: doctest with pytest
35 |       run: |
36 |         pytest \
37 |           --doctest-continue-on-failure \
38 |           --doctest-glob=*.md \
39 |           --doctest-modules \
40 |           --ignore=examples/ \
41 |           --ignore=tests/ \
42 |           --ignore-glob=src/_*
43 |   test-lowest:
44 |     strategy:
45 |       matrix:
46 |         os: [ubuntu-latest]
47 |         python-version: ["3.10"]
48 |     runs-on: ${{ matrix.os }}
49 |     steps:
50 |     - name: checkout
51 |       uses: actions/checkout@v4
52 |     - name: set up uv
53 |       uses: astral-sh/setup-uv@v6
54 |       with:
55 |         activate-environment: true
56 |         cache-suffix: "${{ matrix.python-version }}-test-lowest"
57 |         enable-cache: true
58 |         python-version: ${{ matrix.python-version }}
59 |     - name: install dependencies
60 |       run: uv sync --group test --resolution lowest-direct
61 |     - name: test-lowest with pytest
62 |       run: pytest
63 |   test:
64 |     strategy:
65 |       matrix:
66 |         os: [ubuntu-latest, macos-13, windows-latest]
67 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
68 |     runs-on: ${{ matrix.os }}
69 |     steps:
70 |     - name: checkout
71 |       uses: actions/checkout@v4
72 |     - name: set up uv
73 |       uses: astral-sh/setup-uv@v6
74 |       with:
75 |         activate-environment: true
76 |         cache-suffix: "${{ matrix.python-version }}-test"
77 |         enable-cache: true
78 |         python-version: ${{ matrix.python-version }}
79 |     - name: install dependencies
80 |       run: uv sync --group test
81 |     - name: test with pytest
82 |       run: coverage run -m pytest
83 |     - name: convert coverage report
84 |       run: coverage xml
85 |     - name: upload coverage reports to codecov
86 |       uses: codecov/codecov-action@v5
87 |       with:
88 |         files: ./coverage.xml
89 |       env:
90 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
91 |   lint:
92 |     runs-on: ubuntu-latest
93 |     strategy:
94 |       matrix:
95 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
96 |     steps:
97 |     - name: checkout
98 |       uses: actions/checkout@v4
99 |     - name: set up uv
100 |       uses: astral-sh/setup-uv@v6
101 |       with:
102 |         activate-environment: true
103 |         cache-suffix: "${{ matrix.python-version }}-lint"
104 |         enable-cache: true
105 |         python-version: ${{ matrix.python-version }}
106 |     - name: install dependencies
107 |       run: uv sync --group lint --group test
108 |     - name: check with ruff
109 |       run: ruff check .
110 |     - name: check with pyright
111 |       run: pyright
112 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: docs
2 | on:
3 |   release:
4 |     types: [published]
5 |   workflow_dispatch:
6 | permissions:
7 |   contents: write
8 | jobs:
9 |   docs:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |     - name: checkout
13 |       uses: actions/checkout@v4
14 |     - name: set up uv
15 |       uses: astral-sh/setup-uv@v6
16 |       with:
17 |         activate-environment: true
18 |         python-version: 3.12
19 |     - name: install dependencies
20 |       run: uv sync --group docs
21 |     - name: build and publish docs
22 |       run: mkdocs gh-deploy --force
23 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: release
2 | on:
3 |   release:
4 |     types: [published]
5 | jobs:
6 |   publish:
7 |     runs-on: ubuntu-latest
8 |     permissions:
9 |       id-token: write
10 |     steps:
11 |     - name: checkout
12 |       uses: actions/checkout@v4
13 |     - name: set up uv
14 |       uses: astral-sh/setup-uv@v6
15 |       with:
16 |         activate-environment: true
17 |         python-version: 3.12
18 |     - name: install dependencies
19 |       run: uv sync --no-dev
20 |     - name: build
21 |       run: uv build
22 |     - name: publish
23 |       run: uv publish
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | # VSCode
165 | .vscode/
166 |
167 | # Version file
168 | src/tea_tasting/_version.txt
169 |
170 | # uv lockfile
171 | uv.lock
172 |
--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
1 | MD007:
2 |   indent: 4
3 |
4 | MD013: false
5 | MD046: false
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Evgeny Ivanov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tea-tasting: statistical analysis of A/B tests
2 |
3 | [](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml)
4 | [](https://tea-tasting.e10v.me/)
5 | [](https://codecov.io/gh/e10v/tea-tasting)
6 | [](https://github.com/e10v/tea-tasting/blob/main/LICENSE)
7 | [](https://pypi.org/project/tea-tasting/)
8 | [](https://pypi.org/project/tea-tasting/)
9 | [](https://pypi.org/project/tea-tasting/)
10 |
11 | tea-tasting is a Python package for the statistical analysis of A/B tests featuring:
12 |
13 | - Student's t-test, Z-test, bootstrap, and quantile metrics out of the box.
14 | - Extensible API that lets you define and use statistical tests of your choice.
15 | - [Delta method](https://alexdeng.github.io/public/files/kdd2018-dm.pdf) for ratio metrics.
16 | - Variance reduction using [CUPED](https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf)/[CUPAC](https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/), which can be combined with the Delta method for ratio metrics.
17 | - Confidence intervals for both absolute and percentage changes.
18 | - Checks for sample-ratio mismatches.
19 | - Power analysis.
20 | - Multiple hypothesis testing (family-wise error rate and false discovery rate).
21 | - Simulated experiments, including A/A tests.
22 |
23 | tea-tasting calculates statistics directly within data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). This approach eliminates the need to import granular data into a Python environment.
24 |
25 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow.
26 |
27 | ## Installation
28 |
29 | ```bash
30 | uv pip install tea-tasting
31 | ```
32 |
33 | ## Basic example
34 |
35 | ```pycon
36 | >>> import tea_tasting as tt
37 |
38 | >>> data = tt.make_users_data(seed=42)
39 | >>> experiment = tt.Experiment(
40 | ... sessions_per_user=tt.Mean("sessions"),
41 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
42 | ... orders_per_user=tt.Mean("orders"),
43 | ... revenue_per_user=tt.Mean("revenue"),
44 | ... )
45 | >>> result = experiment.analyze(data)
46 | >>> result
47 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
48 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674
49 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762
50 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118
51 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123
52 |
53 | ```
54 |
55 | Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). Additionally, see the guides on more specific topics:
56 |
57 | - [Data backends](https://tea-tasting.e10v.me/data-backends/).
58 | - [Power analysis](https://tea-tasting.e10v.me/power-analysis/).
59 | - [Multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/).
60 | - [Custom metrics](https://tea-tasting.e10v.me/custom-metrics/).
61 | - [Simulated experiments](https://tea-tasting.e10v.me/simulated-experiments/).
62 |
63 | ## Examples
64 |
65 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run them in your local environment, or run them as WASM notebooks in the online playground.
66 |
67 | ### Run in a local environment
68 |
69 | To run the examples in your local environment, clone the repository and change the directory:
70 |
71 | ```bash
72 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting
73 | ```
74 |
75 | Install marimo, tea-tasting, and other packages used in the examples:
76 |
77 | ```bash
78 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb]
79 | ```
80 |
81 | Launch the notebook server:
82 |
83 | ```bash
84 | uv run marimo edit examples
85 | ```
86 |
87 | Now you can choose and run the example notebooks.
88 |
89 | ### Run in the online playground
90 |
91 | To run the examples as WASM notebooks in the online playground, open the following links:
92 |
93 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true).
94 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true).
95 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true).
96 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true).
97 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true).
98 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true).
99 |
100 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular:
101 |
102 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html).
103 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules).
104 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis.
105 |
106 | ## Package name
107 |
108 | The package name "tea-tasting" is a play on words that refers to two subjects:
109 |
110 | - [Lady tasting tea](https://en.wikipedia.org/wiki/Lady_tasting_tea) is a famous experiment devised by Ronald Fisher. In this experiment, Fisher developed the null hypothesis significance testing framework to analyze a lady's claim that she could discern whether the tea or the milk was added first to the cup.
111 | - "tea-tasting" phonetically resembles "t-testing", referencing Student's t-test, a statistical method developed by William Gosset.
112 |
--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | tea-tasting.e10v.me
2 |
--------------------------------------------------------------------------------
/docs/api/aggr.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.aggr
2 |
--------------------------------------------------------------------------------
/docs/api/config.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.config
2 |
--------------------------------------------------------------------------------
/docs/api/datasets.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.datasets
2 | options:
3 | members_order: source
4 |
--------------------------------------------------------------------------------
/docs/api/experiment.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.experiment
2 |
--------------------------------------------------------------------------------
/docs/api/index.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting
2 |
--------------------------------------------------------------------------------
/docs/api/metrics/base.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.metrics.base
2 |
--------------------------------------------------------------------------------
/docs/api/metrics/index.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.metrics
2 |
--------------------------------------------------------------------------------
/docs/api/metrics/mean.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.metrics.mean
2 |
--------------------------------------------------------------------------------
/docs/api/metrics/proportion.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.metrics.proportion
2 |
--------------------------------------------------------------------------------
/docs/api/metrics/resampling.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.metrics.resampling
2 |
--------------------------------------------------------------------------------
/docs/api/multiplicity.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.multiplicity
2 |
--------------------------------------------------------------------------------
/docs/api/utils.md:
--------------------------------------------------------------------------------
1 | ::: tea_tasting.utils
2 | options:
3 | group_by_category: false
4 | members_order: source
5 |
--------------------------------------------------------------------------------
/docs/assets/tea-cup-black.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/docs/assets/tea-cup-white-on-black.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/docs/assets/tea-cup-white.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/docs/custom-metrics.md:
--------------------------------------------------------------------------------
1 | # Custom metrics
2 |
3 | ## Intro
4 |
5 | tea-tasting supports Student's t-test, Z-test, and [some other statistical tests](api/metrics/index.md) out of the box. However, you might want to analyze an experiment using other statistical criteria. In this case, you can define a custom metric with a statistical test of your choice.
6 |
7 | In tea-tasting, there are two types of metrics:
8 |
9 | - Metrics that require only aggregated statistics for the analysis.
10 | - Metrics that require granular data for the analysis.
11 |
12 | This guide explains how to define a custom metric for each type.
13 |
14 | First, let's import all the required modules and prepare the data:
15 |
16 | ```pycon
17 | >>> from typing import Literal, NamedTuple
18 | >>> import numpy as np
19 | >>> import pyarrow as pa
20 | >>> import pyarrow.compute as pc
21 | >>> import scipy.stats
22 | >>> import tea_tasting as tt
23 | >>> import tea_tasting.aggr
24 | >>> import tea_tasting.config
25 | >>> import tea_tasting.metrics
26 | >>> import tea_tasting.utils
27 |
28 | >>> data = tt.make_users_data(seed=42)
29 | >>> data = data.append_column(
30 | ... "has_order",
31 | ... pc.greater(data["orders"], 0).cast(pa.int64()),
32 | ... )
33 | >>> data
34 | pyarrow.Table
35 | user: int64
36 | variant: int64
37 | sessions: int64
38 | orders: int64
39 | revenue: double
40 | has_order: int64
41 | ----
42 | user: [[0,1,2,3,4,...,3995,3996,3997,3998,3999]]
43 | variant: [[1,0,1,1,0,...,0,0,0,0,0]]
44 | sessions: [[2,2,2,2,1,...,2,2,3,1,5]]
45 | orders: [[1,1,1,1,1,...,0,0,0,0,2]]
46 | revenue: [[9.17,6.43,7.94,15.93,7.14,...,0,0,0,0,17.16]]
47 | has_order: [[1,1,1,1,1,...,0,0,0,0,1]]
48 |
49 | ```
50 |
51 | This guide uses PyArrow as the data backend, but it's valid for other backends as well. See the [guide on data backends](data-backends.md) for more details.
52 |
53 | ## Metrics based on aggregated statistics
54 |
55 | Let's define a metric that performs a proportion test, [G-test](https://en.wikipedia.org/wiki/G-test) or [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test), on a binary column (with values `0` or `1`).
56 |
57 | The first step is defining a result class. It should be a named tuple or a dictionary.
58 |
59 | ```pycon
60 | >>> class ProportionResult(NamedTuple):
61 | ... control: float
62 | ... treatment: float
63 | ... effect_size: float
64 | ... rel_effect_size: float
65 | ... pvalue: float
66 | ... statistic: float
67 | ...
68 |
69 | ```
70 |
71 | The second step is defining the metric class itself. A metric based on aggregated statistics should be a subclass of [`MetricBaseAggregated`](api/metrics/base.md#tea_tasting.metrics.base.MetricBaseAggregated). `MetricBaseAggregated` is a generic class with the result class as a type variable.
72 |
73 | The metric should have the following methods and properties defined:
74 |
75 | - Method `__init__` checks and saves metric parameters.
76 | - Property `aggr_cols` returns the columns to be aggregated for the analysis, for each type of statistic.
77 | - Method `analyze_aggregates` analyzes the metric using aggregated statistics.
78 |
79 | Let's define the metric and discuss each method in detail:
80 |
81 | ```pycon
82 | >>> class Proportion(tea_tasting.metrics.MetricBaseAggregated[ProportionResult]):
83 | ... def __init__(
84 | ... self,
85 | ... column: str,
86 | ... *,
87 | ... correction: bool = True,
88 | ... method: Literal["g-test", "pearson"] = "g-test",
89 | ... ) -> None:
90 | ... self.column = tea_tasting.utils.check_scalar(column, "column", typ=str)
91 | ... self.correction = tea_tasting.utils.auto_check(correction, "correction")
92 | ... self.method = tea_tasting.utils.check_scalar(
93 | ... method, "method", typ=str, in_={"g-test", "pearson"})
94 | ... @property
95 | ... def aggr_cols(self) -> tea_tasting.metrics.AggrCols:
96 | ... return tea_tasting.metrics.AggrCols(
97 | ... has_count=True,
98 | ... mean_cols=(self.column,),
99 | ... )
100 | ... def analyze_aggregates(
101 | ... self,
102 | ... control: tea_tasting.aggr.Aggregates,
103 | ... treatment: tea_tasting.aggr.Aggregates,
104 | ... ) -> ProportionResult:
105 | ... observed = np.empty(shape=(2, 2), dtype=np.int64)
106 | ... observed[0, 0] = round(control.count() * control.mean(self.column))
107 | ... observed[1, 0] = control.count() - observed[0, 0]
108 | ... observed[0, 1] = round(treatment.count() * treatment.mean(self.column))
109 | ... observed[1, 1] = treatment.count() - observed[0, 1]
110 | ... res = scipy.stats.chi2_contingency(
111 | ... observed=observed,
112 | ... correction=self.correction,
113 | ... lambda_=int(self.method == "pearson"),
114 | ... )
115 | ... return ProportionResult(
116 | ... control=control.mean(self.column),
117 | ... treatment=treatment.mean(self.column),
118 | ... effect_size=treatment.mean(self.column) - control.mean(self.column),
119 | ... rel_effect_size=treatment.mean(self.column)/control.mean(self.column) - 1,
120 | ... pvalue=res.pvalue,
121 | ... statistic=res.statistic,
122 | ... )
123 | ...
124 |
125 | ```
126 |
127 | Method `__init__` saves metric parameters to be used in the analysis. You can use utility functions [`check_scalar`](api/utils.md#tea_tasting.utils.check_scalar) and [`auto_check`](api/utils.md#tea_tasting.utils.auto_check) to check parameter values.
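
For instance, `check_scalar` returns the value if all the checks pass and raises an error otherwise. A quick illustration with the same checks as in `Proportion.__init__` above:

```pycon
>>> tea_tasting.utils.check_scalar(
...     "pearson", "method", typ=str, in_={"g-test", "pearson"})
'pearson'

```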
128 |
129 | Property `aggr_cols` returns an instance of [`AggrCols`](api/metrics/base.md#tea_tasting.metrics.base.AggrCols). Analysis of proportion requires the number of rows (`has_count=True`) and the average value for the column of interest (`mean_cols=(self.column,)`) for each variant.
130 |
131 | Method `analyze_aggregates` accepts two parameters: `control` and `treatment` data as instances of class [`Aggregates`](api/aggr.md#tea_tasting.aggr.Aggregates). They contain values for statistics and columns specified in `aggr_cols`.
132 |
133 | Method `analyze_aggregates` returns an instance of `ProportionResult`, defined earlier, with the analysis result.
134 |
135 | Now we can analyze the proportion of users who created at least one order during the experiment. For comparison, let's also add a metric that performs a Z-test on the same column.
136 |
137 | ```pycon
138 | >>> experiment_prop = tt.Experiment(
139 | ... prop_users_with_orders=Proportion("has_order"),
140 | ... mean_users_with_orders=tt.Mean("has_order", use_t=False),
141 | ... )
142 | >>> experiment_prop.analyze(data)
143 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
144 | prop_users_with_orders 0.345 0.384 11% [-, -] 0.0117
145 | mean_users_with_orders 0.345 0.384 11% [2.5%, 21%] 0.0106
146 |
147 | ```
148 |
149 | ## Metrics based on granular data
150 |
151 | Now let's define a metric that performs the Mann-Whitney U test. While it's possible to use the aggregated sum of ranks for the test, this example uses granular data for analysis.
152 |
153 | The result class:
154 |
155 | ```pycon
156 | >>> class MannWhitneyUResult(NamedTuple):
157 | ... pvalue: float
158 | ... statistic: float
159 | ...
160 |
161 | ```
162 |
163 | A metric that analyzes granular data should be a subclass of [`MetricBaseGranular`](api/metrics/base.md#tea_tasting.metrics.base.MetricBaseGranular). `MetricBaseGranular` is a generic class with the result class as a type variable.
164 |
165 | The metric should have the following methods and properties defined:
166 |
167 | - Method `__init__` checks and saves metric parameters.
168 | - Property `cols` returns columns to be fetched for an analysis.
169 | - Method `analyze_granular` analyzes the metric using granular data.
170 |
171 | ```pycon
172 | >>> class MannWhitneyU(tea_tasting.metrics.MetricBaseGranular[MannWhitneyUResult]):
173 | ... def __init__(
174 | ... self,
175 | ... column: str,
176 | ... *,
177 | ... correction: bool = True,
178 | ... alternative: Literal["two-sided", "less", "greater"] | None = None,
179 | ... ) -> None:
180 | ... self.column = tea_tasting.utils.check_scalar(column, "column", typ=str)
181 | ... self.correction = tea_tasting.utils.auto_check(correction, "correction")
182 | ... self.alternative = (
183 | ... tea_tasting.utils.auto_check(alternative, "alternative")
184 | ... if alternative is not None
185 | ... else tea_tasting.config.get_config("alternative")
186 | ... )
187 | ... @property
188 | ... def cols(self) -> tuple[str]:
189 | ... return (self.column,)
190 | ... def analyze_granular(
191 | ... self,
192 | ... control: pa.Table,
193 | ... treatment: pa.Table,
194 | ... ) -> MannWhitneyUResult:
195 | ... res = scipy.stats.mannwhitneyu(
196 | ... treatment[self.column].combine_chunks().to_numpy(zero_copy_only=False),
197 | ... control[self.column].combine_chunks().to_numpy(zero_copy_only=False),
198 | ... use_continuity=self.correction,
199 | ... alternative=self.alternative,
200 | ... )
201 | ... return MannWhitneyUResult(
202 | ... pvalue=res.pvalue,
203 | ... statistic=res.statistic,
204 | ... )
205 | ...
206 |
207 | ```
208 |
209 | Property `cols` should return a sequence of strings.
210 |
211 | Method `analyze_granular` accepts two parameters: control and treatment data as PyArrow Tables. Even with a [data backend](data-backends.md) different from PyArrow, tea-tasting retrieves the data and transforms it into a PyArrow Table.
212 |
213 | Method `analyze_granular` returns an instance of `MannWhitneyUResult`, defined earlier, with the analysis result.
214 |
215 | Now we can perform the Mann-Whitney U test:
216 |
217 | ```pycon
218 | >>> experiment_mwu = tt.Experiment(
219 | ... mwu_orders=MannWhitneyU("orders"),
220 | ... mwu_revenue=MannWhitneyU("revenue"),
221 | ... )
222 | >>> result_mwu = experiment_mwu.analyze(data)
223 | >>> result_mwu.with_keys(("metric", "pvalue", "statistic"))
224 | metric pvalue statistic
225 | mwu_orders 0.0263 2069092
226 | mwu_revenue 0.0300 2068060
227 |
228 | ```
229 |
230 | ## Analyzing two types of metrics together
231 |
232 | It's also possible to analyze two types of metrics in one experiment:
233 |
234 | ```pycon
235 | >>> experiment = tt.Experiment(
236 | ... prop_users_with_orders=Proportion("has_order"),
237 | ... mean_users_with_orders=tt.Mean("has_order"),
238 | ... mwu_orders=MannWhitneyU("orders"),
239 | ... mwu_revenue=MannWhitneyU("revenue"),
240 | ... )
241 | >>> experiment.analyze(data)
242 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
243 | prop_users_with_orders 0.345 0.384 11% [-, -] 0.0117
244 | mean_users_with_orders 0.345 0.384 11% [2.5%, 21%] 0.0106
245 | mwu_orders - - - [-, -] 0.0263
246 | mwu_revenue - - - [-, -] 0.0300
247 |
248 | ```
249 |
250 | In this case, tea-tasting performs two queries on the experimental data:
251 |
252 | - One with the aggregated statistics required for the analysis of metrics of type `MetricBaseAggregated`.
253 | - One with the granular data and columns required for the analysis of metrics of type `MetricBaseGranular`.
254 |
255 | ## Recommendations
256 |
257 | Follow these recommendations when defining custom metrics:
258 |
259 | - Use parameter and attribute names consistent with the ones that are already defined in tea-tasting. For example, use `pvalue` instead of `p_value` or `correction` instead of `use_continuity`.
260 | - End confidence interval boundary names with `"_ci_lower"` and `"_ci_upper"`.
261 | - During initialization, save parameter values in metric attributes using the same names. For example, use `self.correction = correction` instead of `self.use_continuity = correction`.
262 | - Use global settings as default values for standard parameters, such as `alternative` or `confidence_level`. See the [reference](api/config.md#tea_tasting.config.config_context) for the full list of standard parameters. You can also define and use your own global parameters.
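263 |
264 | For example, here is a skeleton of the `__init__` method of a hypothetical metric that follows the last two recommendations. It reuses the same parameter-checking pattern as `Proportion` and `MannWhitneyU` above; the class name and parameters are illustrative only:
265 |
266 | ```pycon
267 | >>> class MyMetric:
268 | ...     def __init__(
269 | ...         self,
270 | ...         column: str,
271 | ...         *,
272 | ...         alternative: str | None = None,
273 | ...         confidence_level: float | None = None,
274 | ...     ) -> None:
275 | ...         self.column = tea_tasting.utils.check_scalar(column, "column", typ=str)
276 | ...         self.alternative = (
277 | ...             tea_tasting.utils.auto_check(alternative, "alternative")
278 | ...             if alternative is not None
279 | ...             else tea_tasting.config.get_config("alternative")
280 | ...         )
281 | ...         self.confidence_level = (
282 | ...             tea_tasting.utils.auto_check(confidence_level, "confidence_level")
283 | ...             if confidence_level is not None
284 | ...             else tea_tasting.config.get_config("confidence_level")
285 | ...         )
286 | ...
287 | >>> MyMetric("orders").alternative
288 | 'two-sided'
289 |
290 | ```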
263 |
--------------------------------------------------------------------------------
/docs/data-backends.md:
--------------------------------------------------------------------------------
1 | # Data backends
2 |
3 | ## Intro
4 |
5 | tea-tasting supports a wide range of data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). Ibis is a DataFrame API for various data backends.
6 |
7 | Many statistical tests, such as the Student's t-test or the Z-test, require only aggregated data for analysis. For these tests, tea-tasting retrieves only aggregated statistics like mean and variance instead of downloading all detailed data.
8 |
9 | For example, if the raw experimental data are stored in ClickHouse, it's faster and more efficient to calculate counts, averages, variances, and covariances directly in ClickHouse rather than fetching granular data and performing aggregations in a Python environment.
10 |
11 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. Narwhals is a compatibility layer between dataframe libraries.
12 |
13 | This guide:
14 |
15 | - Shows how to use tea-tasting with a data backend of your choice for the analysis of an experiment.
16 | - Explains some internals of how tea-tasting uses Ibis to work with data backends.
17 |
18 | ## Demo database
19 |
20 | /// admonition | Note
21 |
22 | This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. Install these packages in addition to tea-tasting to reproduce the examples:
23 |
24 | ```bash
25 | uv pip install ibis-framework[duckdb] polars
26 | ```
27 |
28 | ///
29 |
30 | First, let's prepare a demo database:
31 |
32 | ```pycon
33 | >>> import ibis
34 | >>> import polars as pl
35 | >>> import tea_tasting as tt
36 |
37 | >>> users_data = tt.make_users_data(seed=42)
38 | >>> con = ibis.connect("duckdb://")
39 | >>> con.create_table("users_data", users_data)
40 | DatabaseTable: memory.main.users_data
41 | user int64
42 | variant int64
43 | sessions int64
44 | orders int64
45 | revenue float64
46 |
47 | ```
48 |
49 | In the example above:
50 |
51 | - Function `tt.make_users_data` returns a PyArrow Table with example experimental data.
52 | - Function `ibis.connect` creates a DuckDB in-process database using the Ibis API.
53 | - Method `con.create_table` creates and populates a table in the database based on the PyArrow Table.
54 |
55 | See the [Ibis documentation on how to create connections](https://ibis-project.org/reference/connection) to other data backends.
56 |
57 | ## Querying experimental data
58 |
59 | Method `con.create_table` in the example above returns an Ibis Table that can already be used in the analysis of the experiment. But let's see how to use an SQL query to create an Ibis Table:
60 |
61 | ```pycon
62 | >>> data = con.sql("select * from users_data")
63 | >>> data
64 | SQLQueryResult
65 | query:
66 | select * from users_data
67 | schema:
68 | user int64
69 | variant int64
70 | sessions int64
71 | orders int64
72 | revenue float64
73 |
74 | ```
75 |
76 | It's a very simple query. In the real world, you might need to use joins, aggregations, and CTEs to get the data (see the sketch after the list below). You can define any SQL query supported by your data backend and use it to create an Ibis Table.
77 |
78 | Keep in mind that tea-tasting assumes that:
79 |
80 | - Data is grouped by randomization units, such as individual users.
81 | - There is a column indicating the variant of the A/B test (typically labeled as A, B, etc.).
82 | - All necessary columns for metric calculations (like the number of orders, revenue, etc.) are included in the table.
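
For illustration, here is a sketch of a query with CTEs, a join, and aggregations that produces data of this shape. The `sessions` and `orders` tables are hypothetical and not part of the demo database, so the query is not executed here:

```pycon
>>> data_joined = con.sql("""
...     with s as (
...         select user, any_value(variant) as variant, count(*) as sessions
...         from sessions
...         group by user
...     ), o as (
...         select user, count(*) as orders, sum(revenue) as revenue
...         from orders
...         group by user
...     )
...     select
...         s.user,
...         s.variant,
...         s.sessions,
...         coalesce(o.orders, 0) as orders,
...         coalesce(o.revenue, 0) as revenue
...     from s
...         left join o on s.user = o.user
... """)  # doctest: +SKIP

```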
83 |
84 | An Ibis Table is a lazy object: it doesn't fetch the data when created. You can use the Ibis DataFrame API to query the table and fetch the result:
85 |
86 | ```pycon
87 | >>> ibis.options.interactive = True
88 | >>> print(data.head(5))
89 | ┏━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓
90 | ┃ user ┃ variant ┃ sessions ┃ orders ┃ revenue ┃
91 | ┡━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩
92 | │ int64 │ int64 │ int64 │ int64 │ float64 │
93 | ├───────┼─────────┼──────────┼────────┼─────────┤
94 | │ 0 │ 1 │ 2 │ 1 │ 9.17 │
95 | │ 1 │ 0 │ 2 │ 1 │ 6.43 │
96 | │ 2 │ 1 │ 2 │ 1 │ 7.94 │
97 | │ 3 │ 1 │ 2 │ 1 │ 15.93 │
98 | │ 4 │ 0 │ 1 │ 1 │ 7.14 │
99 | └───────┴─────────┴──────────┴────────┴─────────┘
100 |
101 | >>> ibis.options.interactive = False
102 |
103 | ```
104 |
105 | ## Ibis example
106 |
107 | To better understand what Ibis does, let's consider the example with grouping and aggregation by variants:
108 |
109 | ```pycon
110 | >>> aggr_data = data.group_by("variant").aggregate(
111 | ... sessions_per_user=data.sessions.mean(),
112 | ... orders_per_session=data.orders.mean() / data.sessions.mean(),
113 | ... orders_per_user=data.orders.mean(),
114 | ... revenue_per_user=data.revenue.mean(),
115 | ... )
116 | >>> aggr_data
117 | r0 := SQLQueryResult
118 | query:
119 | select * from users_data
120 | schema:
121 | user int64
122 | variant int64
123 | sessions int64
124 | orders int64
125 | revenue float64
126 |
127 | Aggregate[r0]
128 | groups:
129 | variant: r0.variant
130 | metrics:
131 | sessions_per_user: Mean(r0.sessions)
132 | orders_per_session: Mean(r0.orders) / Mean(r0.sessions)
133 | orders_per_user: Mean(r0.orders)
134 | revenue_per_user: Mean(r0.revenue)
135 |
136 | ```
137 |
138 | `aggr_data` is another Ibis Table defined as a query over the previously defined `data`. Let's fetch the result:
139 |
140 | ```pycon
141 | >>> ibis.options.interactive = True
142 | >>> print(aggr_data) # doctest: +SKIP
143 | ┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
144 | ┃ variant ┃ sessions_per_user ┃ orders_per_session ┃ orders_per_user ┃ revenue_per_user ┃
145 | ┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
146 | │ int64 │ float64 │ float64 │ float64 │ float64 │
147 | ├─────────┼───────────────────┼────────────────────┼─────────────────┼──────────────────┤
148 | │ 0 │ 1.996045 │ 0.265726 │ 0.530400 │ 5.241028 │
149 | │ 1 │ 1.982802 │ 0.289031 │ 0.573091 │ 5.730111 │
150 | └─────────┴───────────────────┴────────────────────┴─────────────────┴──────────────────┘
151 |
152 | >>> ibis.options.interactive = False
153 |
154 | ```
155 |
156 | Internally, Ibis compiles a Table to an SQL query supported by the backend:
157 |
158 | ```pycon
159 | >>> print(aggr_data.compile(pretty=True))
160 | SELECT
161 | "t0"."variant",
162 | AVG("t0"."sessions") AS "sessions_per_user",
163 | AVG("t0"."orders") / AVG("t0"."sessions") AS "orders_per_session",
164 | AVG("t0"."orders") AS "orders_per_user",
165 | AVG("t0"."revenue") AS "revenue_per_user"
166 | FROM (
167 | SELECT
168 | *
169 | FROM users_data
170 | ) AS "t0"
171 | GROUP BY
172 | 1
173 |
174 | ```
175 |
176 | See [Ibis documentation](https://ibis-project.org/tutorials/getting_started) for more details.
177 |
178 | ## Experiment analysis
179 |
180 | The example above shows how to query the metric averages. But for statistical inference, it's not enough: Student's t-test and Z-test also require the number of rows and the variance. Additionally, the analysis of ratio metrics and variance reduction with CUPED require covariances.
181 |
182 | Querying all the required statistics manually can be a daunting and error-prone task. But don't worry—tea-tasting does this work for you. You just need to specify the metrics:
183 |
184 | ```pycon
185 | >>> experiment = tt.Experiment(
186 | ... sessions_per_user=tt.Mean("sessions"),
187 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
188 | ... orders_per_user=tt.Mean("orders"),
189 | ... revenue_per_user=tt.Mean("revenue"),
190 | ... )
191 | >>> result = experiment.analyze(data)
192 | >>> result
193 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
194 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674
195 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762
196 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118
197 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123
198 |
199 | ```
200 |
201 | In the example above, tea-tasting fetches all the required statistics with a single query and then uses them to analyze the experiment.
202 |
203 | Some statistical methods, like bootstrap, require granular data for analysis. In this case, tea-tasting fetches the detailed data as well.
204 |
205 | ## Example with CUPED
206 |
207 | An example of a slightly more complicated analysis using variance reduction with CUPED:
208 |
209 | ```pycon
210 | >>> users_data_cuped = tt.make_users_data(seed=42, covariates=True)
211 | >>> con.create_table("users_data_cuped", users_data_cuped)
212 | DatabaseTable: memory.main.users_data_cuped
213 | user int64
214 | variant int64
215 | sessions int64
216 | orders int64
217 | revenue float64
218 | sessions_covariate int64
219 | orders_covariate int64
220 | revenue_covariate float64
221 |
222 | >>> data_cuped = con.sql("select * from users_data_cuped")
223 | >>> experiment_cuped = tt.Experiment(
224 | ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"),
225 | ... orders_per_session=tt.RatioOfMeans(
226 | ... numer="orders",
227 | ... denom="sessions",
228 | ... numer_covariate="orders_covariate",
229 | ... denom_covariate="sessions_covariate",
230 | ... ),
231 | ... orders_per_user=tt.Mean("orders", "orders_covariate"),
232 | ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"),
233 | ... )
234 | >>> result_cuped = experiment_cuped.analyze(data_cuped)
235 | >>> result_cuped
236 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
237 | sessions_per_user 2.00 1.98 -0.68% [-3.2%, 1.9%] 0.603
238 | orders_per_session 0.262 0.293 12% [4.2%, 21%] 0.00229
239 | orders_per_user 0.523 0.581 11% [2.9%, 20%] 0.00733
240 | revenue_per_user 5.12 5.85 14% [3.8%, 26%] 0.00674
241 |
242 | ```
243 |
244 | ## Polars example
245 |
246 | Here’s an example of how to analyze data using a Polars DataFrame:
247 |
248 | ```pycon
249 | >>> data_polars = pl.from_arrow(users_data)
250 | >>> experiment.analyze(data_polars)
251 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
252 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674
253 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762
254 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118
255 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123
256 |
257 | ```
258 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # tea-tasting: statistical analysis of A/B tests
2 |
3 | [](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml)
4 | [](https://tea-tasting.e10v.me/)
5 | [](https://codecov.io/gh/e10v/tea-tasting)
6 | [](https://github.com/e10v/tea-tasting/blob/main/LICENSE)
7 | [](https://pypi.org/project/tea-tasting/)
8 | [](https://pypi.org/project/tea-tasting/)
9 | [](https://pypi.org/project/tea-tasting/)
10 |
11 | tea-tasting is a Python package for the statistical analysis of A/B tests featuring:
12 |
13 | - Student's t-test, Z-test, bootstrap, and quantile metrics out of the box.
14 | - Extensible API that lets you define and use statistical tests of your choice.
15 | - [Delta method](https://alexdeng.github.io/public/files/kdd2018-dm.pdf) for ratio metrics.
16 | - Variance reduction using [CUPED](https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf)/[CUPAC](https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/), which can be combined with the Delta method for ratio metrics.
17 | - Confidence intervals for both absolute and percentage changes.
18 | - Checks for sample-ratio mismatches.
19 | - Power analysis.
20 | - Multiple hypothesis testing (family-wise error rate and false discovery rate).
21 | - Simulated experiments, including A/A tests.
22 |
23 | tea-tasting calculates statistics directly within data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). This approach eliminates the need to import granular data into a Python environment.
24 |
25 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow.
26 |
27 | ## Installation
28 |
29 | ```bash
30 | uv pip install tea-tasting
31 | ```
32 |
33 | ## Basic example
34 |
35 | ```pycon
36 | >>> import tea_tasting as tt
37 |
38 | >>> data = tt.make_users_data(seed=42)
39 | >>> experiment = tt.Experiment(
40 | ... sessions_per_user=tt.Mean("sessions"),
41 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
42 | ... orders_per_user=tt.Mean("orders"),
43 | ... revenue_per_user=tt.Mean("revenue"),
44 | ... )
45 | >>> result = experiment.analyze(data)
46 | >>> result
47 | metric control treatment rel_effect_size rel_effect_size_ci pvalue
48 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674
49 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762
50 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118
51 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123
52 |
53 | ```
54 |
55 | Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). Additionally, see the guides on more specific topics:
56 |
57 | - [Data backends](https://tea-tasting.e10v.me/data-backends/).
58 | - [Power analysis](https://tea-tasting.e10v.me/power-analysis/).
59 | - [Multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/).
60 | - [Custom metrics](https://tea-tasting.e10v.me/custom-metrics/).
61 | - [Simulated experiments](https://tea-tasting.e10v.me/simulated-experiments/).
62 |
63 | ## Examples
64 |
65 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run them in your local environment, or run them as WASM notebooks in the online playground.
66 |
67 | ### Run in a local environment
68 |
69 | To run the examples in your local environment, clone the repository and change the directory:
70 |
71 | ```bash
72 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting
73 | ```
74 |
75 | Install marimo, tea-tasting, and other packages used in the examples:
76 |
77 | ```bash
78 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb]
79 | ```
80 |
81 | Launch the notebook server:
82 |
83 | ```bash
84 | uv run marimo edit examples
85 | ```
86 |
87 | Now you can choose and run the example notebooks.
88 |
89 | ### Run in the online playground
90 |
91 | To run the examples as WASM notebooks in the online playground, open the following links:
92 |
93 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true).
94 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true).
95 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true).
96 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true).
97 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true).
98 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true).
99 |
100 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular:
101 |
102 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html).
103 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules).
104 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis.
105 |
106 | ## Package name
107 |
108 | The package name "tea-tasting" is a play on words that refers to two subjects:
109 |
110 | - [Lady tasting tea](https://en.wikipedia.org/wiki/Lady_tasting_tea) is a famous experiment devised by Ronald Fisher. In this experiment, Fisher developed the null hypothesis significance testing framework to analyze a lady's claim that she could discern whether the tea or the milk was added first to the cup.
111 | - "tea-tasting" phonetically resembles "t-testing", referencing Student's t-test, a statistical method developed by William Gosset.
112 |
--------------------------------------------------------------------------------
/docs/javascripts/override-copy.js:
--------------------------------------------------------------------------------
1 | function attachCustomCopy() {
2 |   document.querySelectorAll("button.md-clipboard").forEach((button) => {
3 |     button.removeEventListener("click", handleCopy); // avoid stacking duplicate listeners on repeated calls
4 |   });
5 |
6 |   document.querySelectorAll("button.md-clipboard").forEach((button) => {
7 |     button.addEventListener("click", handleCopy);
8 |   });
9 | }
10 |
11 | function handleCopy(event) {
12 |   event.preventDefault(); // take over from the theme's default copy-to-clipboard handler
13 |   const button = event.currentTarget;
14 |   const codeBlock = document.querySelector(button.getAttribute('data-clipboard-target'));
15 |   const codeBlockClone = codeBlock.cloneNode(true);
16 |   codeBlockClone.querySelectorAll('.go').forEach(span => { // '.go' spans are doctest output lines
17 |     const prev = span.previousSibling;
18 |     if (prev && prev.nodeType === Node.TEXT_NODE) {
19 |       prev.textContent = prev.textContent.replace(/[\r\n]+$/, '');
20 |     }
21 |   });
22 |   codeBlockClone.querySelectorAll('.gp, .go').forEach(span => span.remove()); // strip prompts ('.gp') and outputs ('.go') so only runnable code is copied
23 |   navigator.clipboard.writeText(codeBlockClone.textContent || codeBlockClone.innerText);
24 | }
25 |
26 | document$.subscribe(() => { // re-attach after each page load or navigation
27 |   attachCustomCopy();
28 | });
29 |
--------------------------------------------------------------------------------
/docs/multiple-testing.md:
--------------------------------------------------------------------------------
1 | # Multiple testing
2 |
3 | ## Multiple hypothesis testing problem
4 |
5 | /// admonition | Note
6 |
7 | This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. Install Polars in addition to tea-tasting to reproduce the examples:
8 |
9 | ```bash
10 | uv pip install polars
11 | ```
12 |
13 | ///
14 |
15 | The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test.
16 |
17 | tea-tasting provides the following methods for multiple testing correction:
18 |
19 | - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate) (FDR) controlling procedures:
20 | - Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses.
21 | - Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses.
22 | - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate) (FWER) controlling procedures:
23 | - Hochberg's step-up procedure, assuming non-negative correlation between hypotheses.
24 | - Holm's step-down procedure, assuming arbitrary dependence between hypotheses.
25 |
26 | As an example, consider an experiment with three variants, a control and two treatments:
27 |
28 | ```pycon
29 | >>> import polars as pl
30 | >>> import tea_tasting as tt
31 |
32 | >>> data = pl.concat((
33 | ... tt.make_users_data(
34 | ... seed=42,
35 | ... orders_uplift=0.10,
36 | ... revenue_uplift=0.15,
37 | ... return_type="polars",
38 | ... ),
39 | ... tt.make_users_data(
40 | ... seed=21,
41 | ... orders_uplift=0.15,
42 | ... revenue_uplift=0.20,
43 | ... return_type="polars",
44 | ... )
45 | ... .filter(pl.col("variant").eq(1))
46 | ... .with_columns(variant=pl.lit(2, pl.Int64)),
47 | ... ))
48 | >>> data
49 | shape: (6_046, 5)
50 | ┌──────┬─────────┬──────────┬────────┬─────────┐
51 | │ user ┆ variant ┆ sessions ┆ orders ┆ revenue │
52 | │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
53 | │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ f64 │
54 | ╞══════╪═════════╪══════════╪════════╪═════════╡
55 | │ 0 ┆ 1 ┆ 2 ┆ 1 ┆ 9.58 │
56 | │ 1 ┆ 0 ┆ 2 ┆ 1 ┆ 6.43 │
57 | │ 2 ┆ 1 ┆ 2 ┆ 1 ┆ 8.3 │
58 | │ 3 ┆ 1 ┆ 2 ┆ 1 ┆ 16.65 │
59 | │ 4 ┆ 0 ┆ 1 ┆ 1 ┆ 7.14 │
60 | │ … ┆ … ┆ … ┆ … ┆ … │
61 | │ 3989 ┆ 2 ┆ 4 ┆ 4 ┆ 34.93 │
62 | │ 3991 ┆ 2 ┆ 1 ┆ 0 ┆ 0.0 │
63 | │ 3992 ┆ 2 ┆ 3 ┆ 3 ┆ 27.96 │
64 | │ 3994 ┆ 2 ┆ 2 ┆ 1 ┆ 17.22 │
65 | │ 3998 ┆ 2 ┆ 3 ┆ 0 ┆ 0.0 │
66 | └──────┴─────────┴──────────┴────────┴─────────┘
67 |
68 | ```
69 |
70 | Let's calculate the experiment results:
71 |
72 | ```pycon
73 | >>> experiment = tt.Experiment(
74 | ... sessions_per_user=tt.Mean("sessions"),
75 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
76 | ... orders_per_user=tt.Mean("orders"),
77 | ... revenue_per_user=tt.Mean("revenue"),
78 | ... )
79 | >>> results = experiment.analyze(data, control=0, all_variants=True)
80 | >>> results
81 | variants metric control treatment rel_effect_size rel_effect_size_ci pvalue
82 | (0, 1) sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674
83 | (0, 1) orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762
84 | (0, 1) orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118
85 | (0, 1) revenue_per_user 5.24 5.99 14% [2.1%, 28%] 0.0211
86 | (0, 2) sessions_per_user 2.00 2.02 0.98% [-2.1%, 4.1%] 0.532
87 | (0, 2) orders_per_session 0.266 0.295 11% [1.2%, 22%] 0.0273
88 | (0, 2) orders_per_user 0.530 0.594 12% [1.7%, 23%] 0.0213
89 | (0, 2) revenue_per_user 5.24 6.25 19% [6.6%, 33%] 0.00218
90 |
91 | ```
92 |
93 | Suppose only the two metrics `orders_per_user` and `revenue_per_user` are considered success metrics, while the other two, `sessions_per_user` and `orders_per_session`, are second-order diagnostic metrics.
94 |
95 | ```pycon
96 | >>> metrics = {"orders_per_user", "revenue_per_user"}
97 |
98 | ```
99 |
100 | With two treatment variants and two success metrics, there are four hypotheses in total, which increases the probability of false positives (also called "false discoveries"). It's recommended to adjust the p-values or the significance level (alpha) in this case. Let's explore the correction methods provided by tea-tasting.
101 |
102 | ## False discovery rate
103 |
104 | False discovery rate (FDR) is the expected value of the proportion of false discoveries among the discoveries (rejections of the null hypothesis). To control for FDR, use the [`adjust_fdr`](api/multiplicity.md#tea_tasting.multiplicity.adjust_fdr) method:
105 |
106 | ```pycon
107 | >>> adjusted_results_fdr = tt.adjust_fdr(results, metrics)
108 | >>> adjusted_results_fdr
109 | comparison metric control treatment rel_effect_size pvalue pvalue_adj
110 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118
111 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0284
112 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0284
113 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00872
114 |
115 | ```
116 |
117 | The method adjusts p-values and saves them as `pvalue_adj`. Compare these values to the desired significance level alpha to determine if the null hypotheses can be rejected.
118 |
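119 | For instance, you can flag the hypotheses that are rejected at a given significance level. This is a minimal sketch; it assumes that the adjusted results provide the same `to_polars` serialization method as the other result classes:
120 | 
121 | ```pycon
122 | >>> adjusted_results_fdr.to_polars().select(
123 | ...     "comparison",
124 | ...     "metric",
125 | ...     pl.col("pvalue_adj").lt(0.05).alias("null_rejected"),
126 | ... ) # doctest: +SKIP
127 | 
128 | ```
129 | 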
119 | The method also adjusts the significance level alpha and saves it as `alpha_adj`. Compare the non-adjusted p-values (`pvalue`) to `alpha_adj` to determine if the null hypotheses can be rejected:
120 |
121 | ```pycon
122 | >>> adjusted_results_fdr.with_keys((
123 | ... "comparison",
124 | ... "metric",
125 | ... "control",
126 | ... "treatment",
127 | ... "rel_effect_size",
128 | ... "pvalue",
129 | ... "alpha_adj",
130 | ... ))
131 | comparison metric control treatment rel_effect_size pvalue alpha_adj
132 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.0500
133 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0375
134 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0375
135 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.0375
136 |
137 | ```
138 |
139 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs the Benjamini-Hochberg procedure. To perform the Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`:
140 |
141 | ```pycon
142 | >>> tt.adjust_fdr(results, metrics, arbitrary_dependence=True)
143 | comparison metric control treatment rel_effect_size pvalue pvalue_adj
144 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.245
145 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0592
146 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0592
147 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.0182
148 |
149 | ```
150 |
151 | ## Family-wise error rate
152 |
153 | Family-wise error rate (FWER) is the probability of making at least one type I error. To control for FWER, use the [`adjust_fwer`](api/multiplicity.md#tea_tasting.multiplicity.adjust_fwer) method:
154 |
155 | ```pycon
156 | >>> tt.adjust_fwer(results, metrics)
157 | comparison metric control treatment rel_effect_size pvalue pvalue_adj
158 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118
159 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0422
160 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0422
161 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00869
162 |
163 | ```
164 |
165 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs Hochberg's step-up procedure with the Šidák correction, which is slightly more powerful than the Bonferroni correction.
166 |
167 | To perform Holm's step-down procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`. In this case, it's recommended to use the Bonferroni correction, since the Šidák correction assumes non-negative correlation between hypotheses:
168 |
169 | ```pycon
170 | >>> tt.adjust_fwer(
171 | ... results,
172 | ... metrics,
173 | ... arbitrary_dependence=True,
174 | ... method="bonferroni",
175 | ... )
176 | comparison metric control treatment rel_effect_size pvalue pvalue_adj
177 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118
178 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0634
179 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0634
180 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00872
181 |
182 | ```
183 |
184 | ## Other inputs
185 |
186 | In the examples above, the methods `adjust_fdr` and `adjust_fwer` received results from a *single experiment* with *more than two variants*. They can also accept the results from *multiple experiments* with *two variants* in each:
187 |
188 | ```pycon
189 | >>> data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15)
190 | >>> data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20)
191 | >>> result1 = experiment.analyze(data1)
192 | >>> result2 = experiment.analyze(data2)
193 | >>> tt.adjust_fdr(
194 | ... {"Experiment 1": result1, "Experiment 2": result2},
195 | ... metrics,
196 | ... )
197 | comparison metric control treatment rel_effect_size pvalue pvalue_adj
198 | Experiment 1 orders_per_user 0.530 0.573 8.0% 0.118 0.118
199 | Experiment 1 revenue_per_user 5.24 5.99 14% 0.0211 0.0282
200 | Experiment 2 orders_per_user 0.514 0.594 16% 0.00427 0.00853
201 | Experiment 2 revenue_per_user 5.10 6.25 22% 6.27e-04 0.00251
202 |
203 | ```
204 |
205 | The methods `adjust_fdr` and `adjust_fwer` can also accept the result of *a single experiment with two variants*:
206 |
207 | ```pycon
208 | >>> tt.adjust_fwer(result2, metrics)
209 | comparison metric control treatment rel_effect_size pvalue pvalue_adj
210 | - orders_per_user 0.514 0.594 16% 0.00427 0.00427
211 | - revenue_per_user 5.10 6.25 22% 6.27e-04 0.00125
212 |
213 | ```
214 |
--------------------------------------------------------------------------------
/docs/power-analysis.md:
--------------------------------------------------------------------------------
1 | # Power analysis
2 |
3 | In tea-tasting, you can analyze the statistical power for `Mean` and `RatioOfMeans` metrics. There are three possible options:
4 |
5 | - Calculate the effect size, given statistical power and the total number of observations.
6 | - Calculate the total number of observations, given statistical power and the effect size.
7 | - Calculate statistical power, given the effect size and the total number of observations.
8 |
9 | In this example, tea-tasting calculates statistical power given the relative effect size and the number of observations:
10 |
11 | ```pycon
12 | >>> import tea_tasting as tt
13 |
14 | >>> data = tt.make_users_data(
15 | ... seed=42,
16 | ... sessions_uplift=0,
17 | ... orders_uplift=0,
18 | ... revenue_uplift=0,
19 | ... covariates=True,
20 | ... )
21 | >>> orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1)
22 | >>> orders_per_session.solve_power(data, "power")
23 | power effect_size rel_effect_size n_obs
24 | 52% 0.0261 10% 4000
25 |
26 | ```
27 |
28 | Besides `alternative`, `equal_var`, `use_t`, and covariates (CUPED), the following metric parameters affect the result:
29 |
30 | - `alpha`: Significance level.
31 | - `ratio`: Ratio of the number of observations in the treatment relative to the control.
32 | - `power`: Statistical power.
33 | - `effect_size` and `rel_effect_size`: Absolute and relative effect size. Only one of them can be defined.
34 | - `n_obs`: Number of observations in the control and in the treatment together. If the number of observations is not set explicitly, it's inferred from the dataset.
35 |
36 | You can change the default values of `alpha`, `ratio`, `power`, and `n_obs` using the [global settings](user-guide.md#global-settings).
37 |
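38 | For example, here is a minimal sketch that overrides the default statistical power within a context (the parameter values are illustrative):
39 | 
40 | ```pycon
41 | >>> with tt.config_context(power=0.9):
42 | ...     tt.Mean("orders").solve_power(data, "rel_effect_size") # doctest: +SKIP
43 | ...
44 | 
45 | ```
46 | 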
38 | tea-tasting can analyze power for several values of parameters `effect_size`, `rel_effect_size`, or `n_obs`. Example:
39 |
40 | ```pycon
41 | >>> orders_per_user = tt.Mean("orders", alpha=0.1, power=0.7, n_obs=(10_000, 20_000))
42 | >>> orders_per_user.solve_power(data, "rel_effect_size")
43 | power effect_size rel_effect_size n_obs
44 | 70% 0.0367 7.1% 10000
45 | 70% 0.0260 5.0% 20000
46 |
47 | ```
48 |
49 | You can analyze power for all metrics in the experiment. Example:
50 |
51 | ```pycon
52 | >>> with tt.config_context(n_obs=(10_000, 20_000)):
53 | ... experiment = tt.Experiment(
54 | ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"),
55 | ... orders_per_session=tt.RatioOfMeans(
56 | ... numer="orders",
57 | ... denom="sessions",
58 | ... numer_covariate="orders_covariate",
59 | ... denom_covariate="sessions_covariate",
60 | ... ),
61 | ... orders_per_user=tt.Mean("orders", "orders_covariate"),
62 | ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"),
63 | ... )
64 | ...
65 | >>> power_result = experiment.solve_power(data)
66 | >>> power_result
67 | metric power effect_size rel_effect_size n_obs
68 | sessions_per_user 80% 0.0458 2.3% 10000
69 | sessions_per_user 80% 0.0324 1.6% 20000
70 | orders_per_session 80% 0.0177 6.8% 10000
71 | orders_per_session 80% 0.0125 4.8% 20000
72 | orders_per_user 80% 0.0374 7.2% 10000
73 | orders_per_user 80% 0.0264 5.1% 20000
74 | revenue_per_user 80% 0.488 9.2% 10000
75 | revenue_per_user 80% 0.345 6.5% 20000
76 |
77 | ```
78 |
79 | In the example above, tea-tasting calculates both the relative and absolute effect size for all metrics for two possible sample size values, `10_000` and `20_000`.
80 |
81 | The `solve_power` methods of a [metric](api/metrics/mean.md#tea_tasting.metrics.mean.Mean.solve_power) and of an [experiment](api/experiment.md#tea_tasting.experiment.Experiment.solve_power) return instances of [`MetricPowerResults`](api/metrics/base.md#tea_tasting.metrics.base.MetricPowerResults) and [`ExperimentPowerResult`](api/experiment.md#tea_tasting.experiment.ExperimentPowerResult), respectively. These result classes provide serialization methods similar to those of the experiment result: `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. They are also rendered as HTML tables in IPython and Jupyter, and as a table in marimo notebooks.
82 |
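83 | For example, here is a quick sketch of exporting the power analysis result to a Polars dataframe for further processing:
84 | 
85 | ```pycon
86 | >>> power_result.to_polars() # doctest: +SKIP
87 | 
88 | ```
89 | 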
--------------------------------------------------------------------------------
/docs/simulated-experiments.md:
--------------------------------------------------------------------------------
1 | # Simulated experiments
2 |
3 | ## Intro
4 |
5 | In tea-tasting, you can run multiple simulated A/A or A/B tests. In each simulation, tea-tasting splits the data into control and treatment groups and can optionally modify the treatment data. A simulation without changing the treatment data is called an A/A test.
6 |
7 | A/A tests are useful for identifying potential issues before conducting the actual A/B test. Treatment simulations are great for power analysis—especially when you need a specific uplift distribution or when an analytical formula doesn’t exist.
8 |
9 | /// admonition | Note
10 |
11 | This guide uses [Polars](https://github.com/pola-rs/polars) and [tqdm](https://github.com/tqdm/tqdm). Install these packages in addition to tea-tasting to reproduce the examples:
12 |
13 | ```bash
14 | uv pip install polars tqdm
15 | ```
16 |
17 | ///
18 |
19 | ## Running A/A tests
20 |
21 | First, let's prepare the data without any uplift and drop the `"variant"` column.
22 |
23 | ```pycon
24 | >>> import polars as pl
25 | >>> import tea_tasting as tt
26 |
27 | >>> data = (
28 | ... tt.make_users_data(seed=42, orders_uplift=0, revenue_uplift=0)
29 | ... .drop_columns("variant")
30 | ... )
31 | >>> data
32 | pyarrow.Table
33 | user: int64
34 | sessions: int64
35 | orders: int64
36 | revenue: double
37 | ----
38 | user: [[0,1,2,3,4,...,3995,3996,3997,3998,3999]]
39 | sessions: [[2,2,2,2,1,...,2,2,3,1,5]]
40 | orders: [[1,1,1,0,1,...,0,1,1,0,4]]
41 | revenue: [[19.06,12.09,8.84,0,9.9,...,0,4.8,9.63,0,12.7]]
42 |
43 | ```
44 |
45 | To run A/A tests, first define the metrics for the experiment, then call the [`simulate`](api/experiment.md#tea_tasting.experiment.Experiment.simulate) method, providing the data and the number of simulations as arguments.
46 |
47 | ```pycon
48 | >>> experiment = tt.Experiment(
49 | ... sessions_per_user=tt.Mean("sessions"),
50 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
51 | ... orders_per_user=tt.Mean("orders"),
52 | ... revenue_per_user=tt.Mean("revenue"),
53 | ... n_users=tt.SampleRatio(),
54 | ... )
55 | >>> results = experiment.simulate(data, 100, seed=42)
56 | >>> results_data = results.to_polars()
57 | >>> results_data.select(
58 | ... "metric",
59 | ... "control",
60 | ... "treatment",
61 | ... "rel_effect_size",
62 | ... "rel_effect_size_ci_lower",
63 | ... "rel_effect_size_ci_upper",
64 | ... "pvalue",
65 | ... ) # doctest: +SKIP
66 | shape: (500, 7)
67 | ┌────────────────────┬──────────┬───────────┬─────────────────┬────────────────────┬────────────────────┬──────────┐
68 | │ metric ┆ control ┆ treatment ┆ rel_effect_size ┆ rel_effect_size_ci ┆ rel_effect_size_ci ┆ pvalue │
69 | │ --- ┆ --- ┆ --- ┆ --- ┆ _lower ┆ _upper ┆ --- │
70 | │ str ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ --- ┆ f64 │
71 | │ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ │
72 | ╞════════════════════╪══════════╪═══════════╪═════════════════╪════════════════════╪════════════════════╪══════════╡
73 | │ sessions_per_user ┆ 1.98004 ┆ 1.998998 ┆ 0.009575 ┆ -0.021272 ┆ 0.041393 ┆ 0.547091 │
74 | │ orders_per_session ┆ 0.263105 ┆ 0.258647 ┆ -0.016945 ┆ -0.108177 ┆ 0.083621 ┆ 0.730827 │
75 | │ orders_per_user ┆ 0.520958 ┆ 0.517034 ┆ -0.007532 ┆ -0.102993 ┆ 0.098087 ┆ 0.883462 │
76 | │ revenue_per_user ┆ 5.446662 ┆ 5.14521 ┆ -0.055346 ┆ -0.162811 ┆ 0.065914 ┆ 0.356327 │
77 | │ n_users ┆ 2004.0 ┆ 1996.0 ┆ null ┆ null ┆ null ┆ 0.91187 │
78 | │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
79 | │ sessions_per_user ┆ 1.993624 ┆ 1.985212 ┆ -0.00422 ┆ -0.034685 ┆ 0.027207 ┆ 0.78959 │
80 | │ orders_per_session ┆ 0.269373 ┆ 0.251991 ┆ -0.064527 ┆ -0.151401 ┆ 0.03124 ┆ 0.179445 │
81 | │ orders_per_user ┆ 0.537028 ┆ 0.500255 ┆ -0.068475 ┆ -0.158141 ┆ 0.030742 ┆ 0.169217 │
82 | │ revenue_per_user ┆ 5.511967 ┆ 5.071928 ┆ -0.079833 ┆ -0.184806 ┆ 0.038656 ┆ 0.177868 │
83 | │ n_users ┆ 2039.0 ┆ 1961.0 ┆ null ┆ null ┆ null ┆ 0.223423 │
84 | └────────────────────┴──────────┴───────────┴─────────────────┴────────────────────┴────────────────────┴──────────┘
85 |
86 | ```
87 |
88 | The `simulate` method accepts data in the same formats as the `analyze` method. Internally, however, it converts the data to a PyArrow Table before running the simulations.
89 |
90 | The method returns an instance of the [`SimulationResults`](api/experiment.md#tea_tasting.experiment.SimulationResults) class, which contains the results of all simulations for all metrics. The resulting object provides serialization methods similar to those of the experiment result, including `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`.
91 |
92 | For instance, we can now calculate the proportion of rejected null hypotheses, using various significance levels (`alpha`). In A/A tests, it estimates the type I error rate.
93 |
94 | ```pycon
95 | >>> def null_rejected(
96 | ... results_data: pl.DataFrame,
97 | ... alphas: tuple[float, ...] = (0.01, 0.02, 0.05),
98 | ... ) -> pl.DataFrame:
99 | ... return results_data.group_by("metric", maintain_order=True).agg(
100 | ... pl.col("pvalue").le(alpha).mean().alias(f"null_rejected_{alpha}")
101 | ... for alpha in alphas
102 | ... )
103 | ...
104 | >>> null_rejected(results_data)
105 | shape: (5, 4)
106 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐
107 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │
108 | │ --- ┆ --- ┆ --- ┆ --- │
109 | │ str ┆ f64 ┆ f64 ┆ f64 │
110 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡
111 | │ sessions_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │
112 | │ orders_per_session ┆ 0.02 ┆ 0.02 ┆ 0.06 │
113 | │ orders_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │
114 | │ revenue_per_user ┆ 0.02 ┆ 0.03 ┆ 0.06 │
115 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │
116 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘
117 |
118 | ```
119 |
120 | A run of 100 simulations, as in the example above, produces only a very rough estimate. In practice, a larger number of simulations, such as the default `10_000`, is recommended.
121 |
122 | ## Simulating experiments with treatment
123 |
124 | To simulate experiments with treatment, define a treatment function that takes data in the form of a PyArrow Table and returns a PyArrow Table with the modified data:
125 |
126 | ```pycon
127 | >>> import pyarrow as pa
128 | >>> import pyarrow.compute as pc
129 |
130 | >>> def treat(data: pa.Table) -> pa.Table:
131 | ... return (
132 | ... data.drop_columns(["orders", "revenue"])
133 | ... .append_column("orders", pc.multiply(data["orders"], pa.scalar(1.1)))
134 | ... .append_column("revenue", pc.multiply(data["revenue"], pa.scalar(1.1)))
135 | ... )
136 | ...
137 | >>> results_treat = experiment.simulate(data, 100, seed=42, treat=treat)
138 | >>> null_rejected(results_treat.to_polars())
139 | shape: (5, 4)
140 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐
141 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │
142 | │ --- ┆ --- ┆ --- ┆ --- │
143 | │ str ┆ f64 ┆ f64 ┆ f64 │
144 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡
145 | │ sessions_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │
146 | │ orders_per_session ┆ 0.23 ┆ 0.31 ┆ 0.42 │
147 | │ orders_per_user ┆ 0.21 ┆ 0.29 ┆ 0.4 │
148 | │ revenue_per_user ┆ 0.11 ┆ 0.16 ┆ 0.31 │
149 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │
150 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘
151 |
152 | ```
153 |
154 | In the example above, we've defined a function that increases the number of orders and the revenue by 10%. For these metrics, the proportion of rejected null hypotheses is an estimate of statistical power.
155 |
156 | ## Using a function instead of static data
157 |
158 | You can use a function instead of static data to generate input dynamically. The function should take an instance of `numpy.random.Generator` as a parameter named `seed` and return experimental data in any format supported by tea-tasting.
159 |
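160 | For instance, here is a minimal sketch of a custom data-generating function (the uplift values are illustrative):
161 | 
162 | ```pycon
163 | >>> import numpy as np
164 | 
165 | >>> def gen_data(seed: np.random.Generator) -> pa.Table:
166 | ...     return tt.make_users_data(seed=seed, orders_uplift=0.12, revenue_uplift=0.15)
167 | ...
168 | 
169 | ```
170 | 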
160 | As an example, let's use the `make_users_data` function.
161 |
162 | ```pycon
163 | >>> results_data_gen = experiment.simulate(tt.make_users_data, 100, seed=42)
164 | >>> null_rejected(results_data_gen.to_polars())
165 | shape: (5, 4)
166 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐
167 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │
168 | │ --- ┆ --- ┆ --- ┆ --- │
169 | │ str ┆ f64 ┆ f64 ┆ f64 │
170 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡
171 | │ sessions_per_user ┆ 0.01 ┆ 0.01 ┆ 0.06 │
172 | │ orders_per_session ┆ 0.27 ┆ 0.36 ┆ 0.54 │
173 | │ orders_per_user ┆ 0.24 ┆ 0.32 ┆ 0.49 │
174 | │ revenue_per_user ┆ 0.17 ┆ 0.26 ┆ 0.39 │
175 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │
176 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘
177 |
178 | ```
179 |
180 | On each iteration, tea-tasting calls `make_users_data` with a new `seed` and uses the returned data for the analysis of the experiment. The data returned by `make_users_data` already contains the `"variant"` column, so tea-tasting reuses that split. By default, `make_users_data` also adds the treatment uplift, and you can see it in the proportion of rejected null hypotheses.
181 |
182 | ## Tracking progress
183 |
184 | To track the progress of simulations with [`tqdm`](https://github.com/tqdm/tqdm) or [`marimo.status.progress_bar`](https://docs.marimo.io/api/status/#progress-bar), use the `progress` parameter.
185 |
186 | ```pycon
187 | >>> import tqdm
188 |
189 | >>> results_progress = experiment.simulate(
190 | ... data,
191 | ... 100,
192 | ... seed=42,
193 | ... progress=tqdm.tqdm,
194 | ... ) # doctest: +SKIP
195 | 100%|██████████████████████████████████████| 100/100 [00:01<00:00, 64.47it/s]
196 |
197 | ```
198 |
199 | ## Parallel execution
200 |
201 | /// admonition | Note
202 |
203 | The code below won't work in the [marimo online playground](https://docs.marimo.io/guides/publishing/playground/) because it relies on the `multiprocessing` module, which is currently [not supported](https://docs.marimo.io/guides/wasm/#limitations) by WASM notebooks. [WASM notebooks](https://docs.marimo.io/guides/wasm/) are marimo notebooks that run entirely in the browser.
204 |
205 | ///
206 |
207 | To speed up simulations and run them in parallel, use the `map_` parameter with an alternative mapping function.
208 |
209 | ```pycon
210 | >>> import concurrent.futures
211 |
212 | >>> with concurrent.futures.ProcessPoolExecutor() as executor:
213 | ... results_parallel = experiment.simulate(
214 | ... data,
215 | ... 100,
216 | ... seed=42,
217 | ... treat=treat,
218 | ... map_=executor.map,
219 | ... progress=tqdm.tqdm,
220 | ... ) # doctest: +SKIP
221 | ...
222 | 100%|█████████████████████████████████████| 100/100 [00:00<00:00, 251.60it/s]
223 |
224 | ```
225 |
226 | As an alternative to [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor), you can use the `map`, `imap`, or `imap_unordered` methods of [`multiprocessing.pool.Pool`](https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool).
227 |
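228 | For example, a minimal sketch using `Pool.imap`:
229 | 
230 | ```pycon
231 | >>> import multiprocessing
232 | 
233 | >>> with multiprocessing.Pool() as pool:
234 | ...     results_pool = experiment.simulate(
235 | ...         data,
236 | ...         100,
237 | ...         seed=42,
238 | ...         treat=treat,
239 | ...         map_=pool.imap,
240 | ...     ) # doctest: +SKIP
241 | ...
242 | 
243 | ```
244 | 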
228 | It's also possible to run simulations on a distributed [Dask](https://distributed.dask.org/en/stable/api.html#distributed.Client.map) or [Ray](https://docs.ray.io/en/latest/ray-core/api/doc/ray.util.ActorPool.map.html#ray.util.ActorPool.map) cluster.
229 |
--------------------------------------------------------------------------------
/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | :root {
2 | --md-code-font: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
3 | --md-text-font: system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", "Noto Sans", "Liberation Sans", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
4 | }
5 | .md-typeset code {
6 | font-size: .875em;
7 | }
8 | .md-typeset ol li,.md-typeset ul li {
9 | margin-bottom: .25em
10 | }
11 | div.highlight span.gp { /* gp: Generic.Prompt */
12 | /* Make interactive prompts unselectable so copied code excludes them. */
13 | user-select: none;
14 | -webkit-user-select: none; /* Chrome/Safari */
15 | -moz-user-select: none; /* Firefox */
16 | -ms-user-select: none; /* IE10+ */
17 | }
18 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run them in your local environment, or run them as WASM notebooks in the online playground.
4 |
5 | ## Run in a local environment
6 |
7 | To run the examples in your local environment, clone the repository and change the directory:
8 |
9 | ```bash
10 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting
11 | ```
12 |
13 | Install marimo, tea-tasting, and other packages used in the examples:
14 |
15 | ```bash
16 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb]
17 | ```
18 |
19 | Launch the notebook server:
20 |
21 | ```bash
22 | uv run marimo edit examples
23 | ```
24 |
25 | Now you can choose and run the example notebooks.
26 |
27 | ## Run in the online playground
28 |
29 | To run the examples as WASM notebooks in the online playground, open the following links:
30 |
31 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true).
32 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true).
33 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true).
34 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true).
35 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true).
36 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true).
37 |
38 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular:
39 |
40 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html).
41 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules).
42 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis.
43 |
--------------------------------------------------------------------------------
/examples/custom-metrics.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.10"
3 | # dependencies = [
4 | # "marimo",
5 | # "tea-tasting",
6 | # ]
7 | # [tool.marimo.display]
8 | # cell_output = "below"
9 | # ///
10 |
11 | import marimo
12 |
13 | __generated_with = "0.13.6"
14 | app = marimo.App()
15 |
16 |
17 | @app.cell(hide_code=True)
18 | def _(mo):
19 | mo.md(
20 | r"""
21 | # Custom metrics
22 |
23 | ## Intro
24 |
25 | tea-tasting supports Student's t-test, Z-test, and [some other statistical tests](https://tea-tasting.e10v.me/api/metrics/index/) out of the box. However, you might want to analyze an experiment using other statistical criteria. In this case, you can define a custom metric with a statistical test of your choice.
26 |
27 | In tea-tasting, there are two types of metrics:
28 |
29 | - Metrics that require only aggregated statistics for the analysis.
30 | - Metrics that require granular data for the analysis.
31 |
32 | This guide explains how to define a custom metric for each type.
33 |
34 | First, let's import all the required modules and prepare the data:
35 | """
36 | )
37 | return
38 |
39 |
40 | @app.cell
41 | def _():
42 | from typing import Literal, NamedTuple
43 | import numpy as np
44 | import pyarrow as pa
45 | import pyarrow.compute as pc
46 | import scipy.stats
47 | import tea_tasting as tt
48 | import tea_tasting.aggr
49 | import tea_tasting.config
50 | import tea_tasting.metrics
51 | import tea_tasting.utils
52 |
53 | data = tt.make_users_data(seed=42)
54 | data = data.append_column(
55 | "has_order",
56 | pc.greater(data["orders"], 0).cast(pa.int64()),
57 | )
58 | data
59 | return Literal, NamedTuple, data, np, pa, scipy, tea_tasting, tt
60 |
61 |
62 | @app.cell(hide_code=True)
63 | def _(mo):
64 | mo.md(
65 | r"""
66 | This guide uses PyArrow as the data backend, but it's valid for other backends as well. See the [guide on data backends](https://tea-tasting.e10v.me/data-backends/) for more details.
67 |
68 | ## Metrics based on aggregated statistics
69 |
70 | Let's define a metric that performs a proportion test, [G-test](https://en.wikipedia.org/wiki/G-test) or [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test), on a binary column (with values `0` or `1`).
71 |
72 | The first step is defining a result class. It should be a named tuple or a dictionary.
73 | """
74 | )
75 | return
76 |
77 |
78 | @app.cell
79 | def _(NamedTuple):
80 | class ProportionResult(NamedTuple):
81 | control: float
82 | treatment: float
83 | effect_size: float
84 | rel_effect_size: float
85 | pvalue: float
86 | statistic: float
87 | return (ProportionResult,)
88 |
89 |
90 | @app.cell(hide_code=True)
91 | def _(mo):
92 | mo.md(
93 | r"""
94 | The second step is defining the metric class itself. A metric based on aggregated statistics should be a subclass of [`MetricBaseAggregated`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricBaseAggregated). `MetricBaseAggregated` is a generic class with the result class as a type variable.
95 |
96 | The metric should have the following methods and properties defined:
97 |
98 | - Method `__init__` checks and saves metric parameters.
99 | - Property `aggr_cols` returns columns to be aggregated for analysis for each type of statistic.
100 | - Method `analyze_aggregates` analyzes the metric using aggregated statistics.
101 |
102 | Let's define the metric and discuss each method in detail:
103 | """
104 | )
105 | return
106 |
107 |
108 | @app.cell
109 | def _(Literal, ProportionResult, np, scipy, tea_tasting):
110 | class Proportion(tea_tasting.metrics.MetricBaseAggregated[ProportionResult]):
111 | def __init__(
112 | self,
113 | column: str,
114 | *,
115 | correction: bool = True,
116 | method: Literal["g-test", "pearson"] = "g-test",
117 | ) -> None:
118 | self.column = tea_tasting.utils.check_scalar(column, "column", typ=str)
119 | self.correction = tea_tasting.utils.auto_check(correction, "correction")
120 | self.method = tea_tasting.utils.check_scalar(
121 | method, "method", typ=str, in_={"g-test", "pearson"})
122 | @property
123 | def aggr_cols(self) -> tea_tasting.metrics.AggrCols:
124 | return tea_tasting.metrics.AggrCols(
125 | has_count=True,
126 | mean_cols=(self.column,),
127 | )
128 | def analyze_aggregates(
129 | self,
130 | control: tea_tasting.aggr.Aggregates,
131 | treatment: tea_tasting.aggr.Aggregates,
132 | ) -> ProportionResult:
133 | observed = np.empty(shape=(2, 2), dtype=np.int64)
134 | observed[0, 0] = round(control.count() * control.mean(self.column))
135 | observed[1, 0] = control.count() - observed[0, 0]
136 | observed[0, 1] = round(treatment.count() * treatment.mean(self.column))
137 | observed[1, 1] = treatment.count() - observed[0, 1]
138 | res = scipy.stats.chi2_contingency(
139 | observed=observed,
140 | correction=self.correction,
141 | lambda_=int(self.method == "pearson"),
142 | )
143 | return ProportionResult(
144 | control=control.mean(self.column),
145 | treatment=treatment.mean(self.column),
146 | effect_size=treatment.mean(self.column) - control.mean(self.column),
147 | rel_effect_size=treatment.mean(self.column)/control.mean(self.column) - 1,
148 | pvalue=res.pvalue,
149 | statistic=res.statistic,
150 | )
151 | return (Proportion,)
152 |
153 |
154 | @app.cell(hide_code=True)
155 | def _(mo):
156 | mo.md(
157 | r"""
158 | Method `__init__` saves metric parameters to be used in the analysis. You can use utility functions [`check_scalar`](https://tea-tasting.e10v.me/api/utils/#tea_tasting.utils.check_scalar) and [`auto_check`](https://tea-tasting.e10v.me/api/utils/#tea_tasting.utils.auto_check) to check parameter values.
159 |
160 | Property `aggr_cols` returns an instance of [`AggrCols`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.AggrCols). Analysis of proportion requires the number of rows (`has_count=True`) and the average value for the column of interest (`mean_cols=(self.column,)`) for each variant.
161 |
162 | Method `analyze_aggregates` accepts two parameters: `control` and `treatment` data as instances of class [`Aggregates`](https://tea-tasting.e10v.me/api/aggr/#tea_tasting.aggr.Aggregates). They contain values for statistics and columns specified in `aggr_cols`.
163 |
164 | Method `analyze_aggregates` returns an instance of `ProportionResult`, defined earlier, with the analysis result.
165 |
166 | Now we can analyze the proportion of users who created at least one order during the experiment. For comparison, let's also add a metric that performs a Z-test on the same column.
167 | """
168 | )
169 | return
170 |
171 |
172 | @app.cell
173 | def _(Proportion, data, tt):
174 | experiment_prop = tt.Experiment(
175 | prop_users_with_orders=Proportion("has_order"),
176 | mean_users_with_orders=tt.Mean("has_order", use_t=False),
177 | )
178 | experiment_prop.analyze(data)
179 | return
180 |
181 |
182 | @app.cell(hide_code=True)
183 | def _(mo):
184 | mo.md(
185 | r"""
186 | ## Metrics based on granular data
187 |
188 | Now let's define a metric that performs the Mann-Whitney U test. While it's possible to use the aggregated sum of ranks for the test, this example uses granular data for analysis.
189 |
190 | The result class:
191 | """
192 | )
193 | return
194 |
195 |
196 | @app.cell
197 | def _(NamedTuple):
198 | class MannWhitneyUResult(NamedTuple):
199 | pvalue: float
200 | statistic: float
201 | return (MannWhitneyUResult,)
202 |
203 |
204 | @app.cell(hide_code=True)
205 | def _(mo):
206 | mo.md(
207 | r"""
208 | A metric that analyzes granular data should be a subclass of [`MetricBaseGranular`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricBaseGranular). `MetricBaseGranular` is a generic class with the result class as a type variable.
209 |
210 | The metric should have the following methods and properties defined:
211 |
212 | - Method `__init__` checks and saves metric parameters.
213 | - Property `cols` returns columns to be fetched for an analysis.
214 | - Method `analyze_granular` analyzes the metric using granular data.
215 | """
216 | )
217 | return
218 |
219 |
220 | @app.cell
221 | def _(Literal, MannWhitneyUResult, pa, scipy, tea_tasting):
222 | class MannWhitneyU(tea_tasting.metrics.MetricBaseGranular[MannWhitneyUResult]):
223 | def __init__(
224 | self,
225 | column: str,
226 | *,
227 | correction: bool = True,
228 | alternative: Literal["two-sided", "less", "greater"] | None = None,
229 | ) -> None:
230 | self.column = tea_tasting.utils.check_scalar(column, "column", typ=str)
231 | self.correction = tea_tasting.utils.auto_check(correction, "correction")
232 | self.alternative = (
233 | tea_tasting.utils.auto_check(alternative, "alternative")
234 | if alternative is not None
235 | else tea_tasting.config.get_config("alternative")
236 | )
237 | @property
238 | def cols(self) -> tuple[str]:
239 | return (self.column,)
240 | def analyze_granular(
241 | self,
242 | control: pa.Table,
243 | treatment: pa.Table,
244 | ) -> MannWhitneyUResult:
245 | res = scipy.stats.mannwhitneyu(
246 | treatment[self.column].combine_chunks().to_numpy(zero_copy_only=False),
247 | control[self.column].combine_chunks().to_numpy(zero_copy_only=False),
248 | use_continuity=self.correction,
249 | alternative=self.alternative,
250 | )
251 | return MannWhitneyUResult(
252 | pvalue=res.pvalue,
253 | statistic=res.statistic,
254 | )
255 | return (MannWhitneyU,)
256 |
257 |
258 | @app.cell(hide_code=True)
259 | def _(mo):
260 | mo.md(
261 | r"""
262 | Property `cols` should return a sequence of strings.
263 |
264 | Method `analyze_granular` accepts two parameters: control and treatment data as PyArrow Tables. Even with a [data backend](https://tea-tasting.e10v.me/data-backends/) other than PyArrow, tea-tasting retrieves the data and transforms it into a PyArrow Table.
265 | 
266 | Method `analyze_granular` returns an instance of `MannWhitneyUResult`, defined earlier, with the analysis result.
267 |
268 | Now we can perform the Mann-Whitney U test:
269 | """
270 | )
271 | return
272 |
273 |
274 | @app.cell
275 | def _(MannWhitneyU, data, tt):
276 | experiment_mwu = tt.Experiment(
277 | mwu_orders=MannWhitneyU("orders"),
278 | mwu_revenue=MannWhitneyU("revenue"),
279 | )
280 | result_mwu = experiment_mwu.analyze(data)
281 | result_mwu.with_keys(("metric", "pvalue", "statistic"))
282 | return
283 |
284 |
285 | @app.cell(hide_code=True)
286 | def _(mo):
287 | mo.md(
288 | r"""
289 | ## Analyzing two types of metrics together
290 |
291 | It's also possible to analyze two types of metrics in one experiment:
292 | """
293 | )
294 | return
295 |
296 |
297 | @app.cell
298 | def _(MannWhitneyU, Proportion, data, tt):
299 | experiment = tt.Experiment(
300 | prop_users_with_orders=Proportion("has_order"),
301 | mean_users_with_orders=tt.Mean("has_order"),
302 | mwu_orders=MannWhitneyU("orders"),
303 | mwu_revenue=MannWhitneyU("revenue"),
304 | )
305 | experiment.analyze(data)
306 | return
307 |
308 |
309 | @app.cell(hide_code=True)
310 | def _(mo):
311 | mo.md(
312 | r"""
313 | In this case, tea-tasting performs two queries on the experimental data:
314 |
315 | - One with the aggregated statistics required for the analysis of metrics of type `MetricBaseAggregated`.
316 | - One with the granular data and columns required for the analysis of metrics of type `MetricBaseGranular`.
317 |
318 | ## Recommendations
319 |
320 | Follow these recommendations when defining custom metrics:
321 |
322 | - Use parameter and attribute names consistent with the ones that are already defined in tea-tasting. For example, use `pvalue` instead of `p_value` or `correction` instead of `use_continuity`.
323 | - End confidence interval boundary names with `"_ci_lower"` and `"_ci_upper"`.
324 | - During initialization, save parameter values in metric attributes using the same names. For example, use `self.correction = correction` instead of `self.use_continuity = correction`.
325 | - Use global settings as default values for standard parameters, such as `alternative` or `confidence_level`. See the [reference](https://tea-tasting.e10v.me/api/config/#tea_tasting.config.config_context) for the full list of standard parameters. You can also define and use your own global parameters.
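326 | 
327 | For example, a result class following these conventions might look like this (an illustrative sketch):
328 | 
329 | ```python
330 | from typing import NamedTuple
331 | 
332 | class TTestResult(NamedTuple):
333 |     # Attribute names consistent with tea-tasting conventions,
334 |     # with CI boundaries ending in "_ci_lower" and "_ci_upper".
335 |     control: float
336 |     treatment: float
337 |     effect_size: float
338 |     effect_size_ci_lower: float
339 |     effect_size_ci_upper: float
340 |     rel_effect_size: float
341 |     pvalue: float
342 |     statistic: float
343 | ```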
326 | """
327 | )
328 | return
329 |
330 |
331 | @app.cell(hide_code=True)
332 | def _():
333 | import marimo as mo
334 | return (mo,)
335 |
336 |
337 | if __name__ == "__main__":
338 | app.run()
339 |
--------------------------------------------------------------------------------
/examples/data-backends.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.10"
3 | # dependencies = [
4 | # "ibis-framework[duckdb]",
5 | # "marimo",
6 | # "polars",
7 | # "tea-tasting",
8 | # ]
9 | # [tool.marimo.display]
10 | # cell_output = "below"
11 | # ///
12 |
13 | import marimo
14 |
15 | __generated_with = "0.13.6"
16 | app = marimo.App()
17 |
18 |
19 | @app.cell(hide_code=True)
20 | def _(mo):
21 | mo.md(
22 | r"""
23 | # Data backends
24 |
25 | ## Intro
26 |
27 | tea-tasting supports a wide range of data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). Ibis is a DataFrame API to various data backends.
28 |
29 | Many statistical tests, such as the Student's t-test or the Z-test, require only aggregated data for analysis. For these tests, tea-tasting retrieves only aggregated statistics like mean and variance instead of downloading all detailed data.
30 |
31 | For example, if the raw experimental data are stored in ClickHouse, it's faster and more efficient to calculate counts, averages, variances, and covariances directly in ClickHouse rather than fetching granular data and performing aggregations in a Python environment.
32 |
33 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. Narwhals is a compatibility layer between dataframe libraries.
34 |
35 | This guide:
36 |
37 | - Shows how to use tea-tasting with a data backend of your choice for the analysis of an experiment.
38 | - Explains some internals of how tea-tasting uses Ibis to work with data backends.
39 |
40 | ## Demo database
41 |
42 | /// admonition | Note
43 |
44 | This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. Install these packages in addition to tea-tasting to reproduce the examples:
45 |
46 | ```bash
47 | uv pip install ibis-framework[duckdb] polars
48 | ```
49 |
50 | ///
51 |
52 | First, let's prepare a demo database:
53 | """
54 | )
55 | return
56 |
57 |
58 | @app.cell
59 | def _():
60 | import ibis
61 | import polars as pl
62 | import tea_tasting as tt
63 |
64 | users_data = tt.make_users_data(seed=42)
65 | con = ibis.connect("duckdb://")
66 | con.create_table("users_data", users_data)
67 | return con, ibis, pl, tt, users_data
68 |
69 |
70 | @app.cell(hide_code=True)
71 | def _(mo):
72 | mo.md(
73 | r"""
74 | In the example above:
75 |
76 | - Function `tt.make_users_data` returns a PyArrow Table with example experimental data.
77 | - Function `ibis.connect("duckdb://")` creates a DuckDB in-process database using the Ibis API.
78 | - Method `con.create_table` creates and populates a table in the database based on the PyArrow Table.
79 |
80 | See the [Ibis documentation on how to create connections](https://ibis-project.org/reference/connection) to other data backends.
81 |
82 | ## Querying experimental data
83 |
84 | Method `con.create_table` in the example above returns an Ibis Table, which can already be used in the analysis of the experiment. But let's see how to use an SQL query to create an Ibis Table:
85 | """
86 | )
87 | return
88 |
89 |
90 | @app.cell
91 | def _(con):
92 | data = con.sql("select * from users_data")
93 | data
94 | return (data,)
95 |
96 |
97 | @app.cell(hide_code=True)
98 | def _(mo):
99 | mo.md(
100 | r"""
101 | It's a very simple query. In the real world, you might need to use joins, aggregations, and CTEs to get the data. You can define any SQL query supported by your data backend and use it to create an Ibis Table.
102 |
103 | Keep in mind that tea-tasting assumes that:
104 |
105 | - Data is grouped by randomization units, such as individual users.
106 | - There is a column indicating the variant of the A/B test (typically labeled as A, B, etc.).
107 | - All necessary columns for metric calculations (like the number of orders, revenue, etc.) are included in the table.
108 |
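109 | For example, a query of roughly this shape prepares such a table. This is an illustrative sketch; the `events` table and its columns are hypothetical:
110 | 
111 | ```python
112 | # Aggregate event-level rows to one row per user (hypothetical schema).
113 | data = con.sql(
114 |     "select user_id, any_value(variant) as variant, count(*) as sessions, "
115 |     "sum(orders) as orders, sum(revenue) as revenue "
116 |     "from events group by user_id"
117 | )
118 | ```
119 | 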
109 | An Ibis Table is a lazy object. It doesn't fetch the data when created. You can use the Ibis DataFrame API to query the table and fetch the result:
110 | """
111 | )
112 | return
113 |
114 |
115 | @app.cell
116 | def _(data, ibis):
117 | ibis.options.interactive = True
118 | print(data.head(5))
119 |
120 | ibis.options.interactive = False
121 | return
122 |
123 |
124 | @app.cell(hide_code=True)
125 | def _(mo):
126 | mo.md(
127 | r"""
128 | ## Ibis example
129 |
130 | To better understand what Ibis does, let's consider the example with grouping and aggregation by variants:
131 | """
132 | )
133 | return
134 |
135 |
136 | @app.cell
137 | def _(data):
138 | aggr_data = data.group_by("variant").aggregate(
139 | sessions_per_user=data.sessions.mean(),
140 | orders_per_session=data.orders.mean() / data.sessions.mean(),
141 | orders_per_user=data.orders.mean(),
142 | revenue_per_user=data.revenue.mean(),
143 | )
144 | aggr_data
145 | return (aggr_data,)
146 |
147 |
148 | @app.cell(hide_code=True)
149 | def _(mo):
150 | mo.md(
151 | r"""
152 | `aggr_data` is another Ibis Table defined as a query over the previously defined `data`. Let's fetch the result:
153 | """
154 | )
155 | return
156 |
157 |
158 | @app.cell
159 | def _(aggr_data, ibis):
160 | ibis.options.interactive = True
161 | print(aggr_data)
162 |
163 | ibis.options.interactive = False
164 | return
165 |
166 |
167 | @app.cell(hide_code=True)
168 | def _(mo):
169 | mo.md(
170 | r"""
171 | Internally, Ibis compiles a Table to an SQL query supported by the backend:
172 | """
173 | )
174 | return
175 |
176 |
177 | @app.cell
178 | def _(aggr_data):
179 | print(aggr_data.compile(pretty=True))
180 | return
181 |
182 |
183 | @app.cell(hide_code=True)
184 | def _(mo):
185 | mo.md(
186 | r"""
187 | See [Ibis documentation](https://ibis-project.org/tutorials/getting_started) for more details.
188 |
189 | ## Experiment analysis
190 |
191 | The example above shows how to query the metric averages. But for statistical inference, it's not enough. For example, Student's t-test and Z-test also require the number of rows and the variance. Additionally, the analysis of ratio metrics and variance reduction with CUPED requires covariances.
192 |
193 | Querying all the required statistics manually can be a daunting and error-prone task. But don't worry—tea-tasting does this work for you. You just need to specify the metrics:
194 | """
195 | )
196 | return
197 |
198 |
199 | @app.cell
200 | def _(data, tt):
201 | experiment = tt.Experiment(
202 | sessions_per_user=tt.Mean("sessions"),
203 | orders_per_session=tt.RatioOfMeans("orders", "sessions"),
204 | orders_per_user=tt.Mean("orders"),
205 | revenue_per_user=tt.Mean("revenue"),
206 | )
207 | result = experiment.analyze(data)
208 | result
209 | return (experiment,)
210 |
211 |
212 | @app.cell(hide_code=True)
213 | def _(mo):
214 | mo.md(
215 | r"""
216 | In the example above, tea-tasting fetches all the required statistics with a single query and then uses them to analyze the experiment.
217 |
218 | Some statistical methods, like bootstrap, require granular data for analysis. In this case, tea-tasting fetches the detailed data as well.
219 |
220 | ## Example with CUPED
221 |
222 | An example of a slightly more complicated analysis using variance reduction with CUPED:
223 | """
224 | )
225 | return
226 |
227 |
228 | @app.cell
229 | def _(con, tt):
230 | users_data_cuped = tt.make_users_data(seed=42, covariates=True)
231 | con.create_table("users_data_cuped", users_data_cuped)
232 |
233 | data_cuped = con.sql("select * from users_data_cuped")
234 | experiment_cuped = tt.Experiment(
235 | sessions_per_user=tt.Mean("sessions", "sessions_covariate"),
236 | orders_per_session=tt.RatioOfMeans(
237 | numer="orders",
238 | denom="sessions",
239 | numer_covariate="orders_covariate",
240 | denom_covariate="sessions_covariate",
241 | ),
242 | orders_per_user=tt.Mean("orders", "orders_covariate"),
243 | revenue_per_user=tt.Mean("revenue", "revenue_covariate"),
244 | )
245 | result_cuped = experiment_cuped.analyze(data_cuped)
246 | result_cuped
247 | return
248 |
249 |
250 | @app.cell(hide_code=True)
251 | def _(mo):
252 | mo.md(
253 | r"""
254 | ## Polars example
255 |
256 | Here’s an example of how to analyze data using a Polars DataFrame:
257 | """
258 | )
259 | return
260 |
261 |
262 | @app.cell
263 | def _(experiment, pl, users_data):
264 | data_polars = pl.from_arrow(users_data)
265 | experiment.analyze(data_polars)
266 | return
267 |
268 |
269 | @app.cell(hide_code=True)
270 | def _(mo):
271 | mo.md(
272 | r"""
273 |
274 | """
275 | )
276 | return
277 |
278 |
279 | @app.cell(hide_code=True)
280 | def _():
281 | import marimo as mo
282 | return (mo,)
283 |
284 |
285 | if __name__ == "__main__":
286 | app.run()
287 |
--------------------------------------------------------------------------------
/examples/multiple-testing.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.10"
3 | # dependencies = [
4 | # "marimo",
5 | # "polars",
6 | # "tea-tasting",
7 | # ]
8 | # [tool.marimo.display]
9 | # cell_output = "below"
10 | # ///
11 |
12 | import marimo
13 |
14 | __generated_with = "0.13.6"
15 | app = marimo.App()
16 |
17 |
18 | @app.cell(hide_code=True)
19 | def _(mo):
20 | mo.md(
21 | r"""
22 | # Multiple testing
23 |
24 | ## Multiple hypothesis testing problem
25 |
26 | /// admonition | Note
27 |
28 | This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. Install Polars in addition to tea-tasting to reproduce the examples:
29 |
30 | ```bash
31 | uv pip install polars
32 | ```
33 |
34 | ///
35 |
36 | The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test.
37 |
38 | tea-tasting provides the following methods for multiple testing correction:
39 |
40 | - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate) (FDR) controlling procedures:
41 | - Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses.
42 | - Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses.
43 | - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate) (FWER) controlling procedures:
44 | - Hochberg's step-up procedure, assuming non-negative correlation between hypotheses.
45 | - Holm's step-down procedure, assuming arbitrary dependence between hypotheses.
46 |
47 | As an example, consider an experiment with three variants, a control and two treatments:
48 | """
49 | )
50 | return
51 |
52 |
53 | @app.cell
54 | def _():
55 | import polars as pl
56 | import tea_tasting as tt
57 |
58 | data = pl.concat((
59 | tt.make_users_data(
60 | seed=42,
61 | orders_uplift=0.10,
62 | revenue_uplift=0.15,
63 | return_type="polars",
64 | ),
65 | tt.make_users_data(
66 | seed=21,
67 | orders_uplift=0.15,
68 | revenue_uplift=0.20,
69 | return_type="polars",
70 | )
71 | .filter(pl.col("variant").eq(1))
72 | .with_columns(variant=pl.lit(2, pl.Int64)),
73 | ))
74 | data
75 | return data, tt
76 |
77 |
78 | @app.cell(hide_code=True)
79 | def _(mo):
80 | mo.md(
81 | r"""
82 | Let's calculate the experiment results:
83 | """
84 | )
85 | return
86 |
87 |
88 | @app.cell
89 | def _(data, tt):
90 | experiment = tt.Experiment(
91 | sessions_per_user=tt.Mean("sessions"),
92 | orders_per_session=tt.RatioOfMeans("orders", "sessions"),
93 | orders_per_user=tt.Mean("orders"),
94 | revenue_per_user=tt.Mean("revenue"),
95 | )
96 | results = experiment.analyze(data, control=0, all_variants=True)
97 | results
98 | return experiment, results
99 |
100 |
101 | @app.cell(hide_code=True)
102 | def _(mo):
103 | mo.md(
104 | r"""
105 | Suppose only the two metrics `orders_per_user` and `revenue_per_user` are considered success metrics, while the other two metrics, `sessions_per_user` and `orders_per_session`, are second-order diagnostic metrics.
106 | """
107 | )
108 | return
109 |
110 |
111 | @app.cell
112 | def _():
113 | metrics = {"orders_per_user", "revenue_per_user"}
114 | return (metrics,)
115 |
116 |
117 | @app.cell(hide_code=True)
118 | def _(mo):
119 | mo.md(
120 | r"""
121 | With two treatment variants and two success metrics, there are four hypotheses in total, which increases the probability of false positives (also called "false discoveries"). It's recommended to adjust the p-values or the significance level (alpha) in this case. Let's explore the correction methods provided by tea-tasting.
122 |
123 | ## False discovery rate
124 |
125 | False discovery rate (FDR) is the expected value of the proportion of false discoveries among the discoveries (rejections of the null hypothesis). To control for FDR, use the [`adjust_fdr`](https://tea-tasting.e10v.me/api/multiplicity/#tea_tasting.multiplicity.adjust_fdr) method:
126 | """
127 | )
128 | return
129 |
130 |
131 | @app.cell
132 | def _(metrics, results, tt):
133 | adjusted_results_fdr = tt.adjust_fdr(results, metrics)
134 | adjusted_results_fdr
135 | return (adjusted_results_fdr,)
136 |
137 |
138 | @app.cell(hide_code=True)
139 | def _(mo):
140 | mo.md(
141 | r"""
142 | The method adjusts p-values and saves them as `pvalue_adj`. Compare these values to the desired significance level alpha to determine if the null hypotheses can be rejected.
143 |
144 | The method also adjusts the significance level alpha and saves it as `alpha_adj`. Compare the non-adjusted p-values (`pvalue`) to `alpha_adj` to determine if the null hypotheses can be rejected:
145 | """
146 | )
147 | return
148 |
149 |
150 | @app.cell
151 | def _(adjusted_results_fdr):
152 | adjusted_results_fdr.with_keys((
153 | "comparison",
154 | "metric",
155 | "control",
156 | "treatment",
157 | "rel_effect_size",
158 | "pvalue",
159 | "alpha_adj",
160 | ))
161 | return
162 |
163 |
164 | @app.cell(hide_code=True)
165 | def _(mo):
166 | mo.md(
167 | r"""
168 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs the Benjamini-Hochberg procedure. To perform the Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`:
169 | """
170 | )
171 | return
172 |
173 |
174 | @app.cell
175 | def _(metrics, results, tt):
176 | tt.adjust_fdr(results, metrics, arbitrary_dependence=True)
177 | return
178 |
179 |
180 | @app.cell(hide_code=True)
181 | def _(mo):
182 | mo.md(
183 | r"""
184 | ## Family-wise error rate
185 |
186 | Family-wise error rate (FWER) is the probability of making at least one type I error. To control for FWER, use the [`adjust_fwer`](https://tea-tasting.e10v.me/api/multiplicity/#tea_tasting.multiplicity.adjust_fwer) method:
187 | """
188 | )
189 | return
190 |
191 |
192 | @app.cell
193 | def _(metrics, results, tt):
194 | tt.adjust_fwer(results, metrics)
195 | return
196 |
197 |
198 | @app.cell(hide_code=True)
199 | def _(mo):
200 | mo.md(
201 | r"""
202 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs Hochberg's step-up procedure with the Šidák correction, which is slightly more powerful than the Bonferroni correction.
203 |
204 | To perform Holm's step-down procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`. In this case, it's recommended to use the Bonferroni correction, since the Šidák correction assumes non-negative correlation between hypotheses:
205 | """
206 | )
207 | return
208 |
209 |
210 | @app.cell
211 | def _(metrics, results, tt):
212 | tt.adjust_fwer(
213 | results,
214 | metrics,
215 | arbitrary_dependence=True,
216 | method="bonferroni",
217 | )
218 | return
219 |
220 |
221 | @app.cell(hide_code=True)
222 | def _(mo):
223 | mo.md(
224 | r"""
225 | ## Other inputs
226 |
227 | In the examples above, the methods `adjust_fdr` and `adjust_fwer` received results from a *single experiment* with *more than two variants*. They can also accept the results from *multiple experiments* with *two variants* in each:
228 | """
229 | )
230 | return
231 |
232 |
233 | @app.cell
234 | def _(experiment, metrics, tt):
235 | data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15)
236 | data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20)
237 | result1 = experiment.analyze(data1)
238 | result2 = experiment.analyze(data2)
239 | tt.adjust_fdr(
240 | {"Experiment 1": result1, "Experiment 2": result2},
241 | metrics,
242 | )
243 | return (result2,)
244 |
245 |
246 | @app.cell(hide_code=True)
247 | def _(mo):
248 | mo.md(
249 | r"""
250 | The methods `adjust_fdr` and `adjust_fwer` can also accept the result of *a single experiment with two variants*:
251 | """
252 | )
253 | return
254 |
255 |
256 | @app.cell
257 | def _(metrics, result2, tt):
258 | tt.adjust_fwer(result2, metrics)
259 | return
260 |
261 |
262 | @app.cell(hide_code=True)
263 | def _(mo):
264 | mo.md(
265 | r"""
266 |
267 | """
268 | )
269 | return
270 |
271 |
272 | @app.cell(hide_code=True)
273 | def _():
274 | import marimo as mo
275 | return (mo,)
276 |
277 |
278 | if __name__ == "__main__":
279 | app.run()
280 |
--------------------------------------------------------------------------------
/examples/power-analysis.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.10"
3 | # dependencies = [
4 | # "marimo",
5 | # "tea-tasting",
6 | # ]
7 | # [tool.marimo.display]
8 | # cell_output = "below"
9 | # ///
10 |
11 | import marimo
12 |
13 | __generated_with = "0.13.6"
14 | app = marimo.App()
15 |
16 |
17 | @app.cell(hide_code=True)
18 | def _(mo):
19 | mo.md(
20 | r"""
21 | # Power analysis
22 |
23 | In tea-tasting, you can analyze the statistical power for `Mean` and `RatioOfMeans` metrics. There are three possible options:
24 |
25 | - Calculate the effect size, given statistical power and the total number of observations.
26 | - Calculate the total number of observations, given statistical power and the effect size.
27 | - Calculate statistical power, given the effect size and the total number of observations.
28 |
29 | In this example, tea-tasting calculates statistical power given the relative effect size and the number of observations:
30 | """
31 | )
32 | return
33 |
34 |
35 | @app.cell
36 | def _():
37 | import tea_tasting as tt
38 |
39 | data = tt.make_users_data(
40 | seed=42,
41 | sessions_uplift=0,
42 | orders_uplift=0,
43 | revenue_uplift=0,
44 | covariates=True,
45 | )
46 | orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1)
47 | orders_per_session.solve_power(data, "power")
48 | return data, tt
49 |
50 |
51 | @app.cell(hide_code=True)
52 | def _(mo):
53 | mo.md(
54 | r"""
55 | Besides `alternative`, `equal_var`, `use_t`, and covariates (CUPED), the following metric parameters affect the result:
56 |
57 | - `alpha`: Significance level.
58 | - `ratio`: Ratio of the number of observations in the treatment relative to the control.
59 | - `power`: Statistical power.
60 | - `effect_size` and `rel_effect_size`: Absolute and relative effect size. Only one of them can be defined.
61 | - `n_obs`: Number of observations in the control and in the treatment together. If the number of observations is not set explicitly, it's inferred from the dataset.
62 |
63 | You can change the default values of `alpha`, `ratio`, `power`, and `n_obs` using the [global settings](https://tea-tasting.e10v.me/user-guide/#global-settings).
64 |
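        For instance, a minimal sketch of changing a default globally (assuming the `tt` import from the cell above):

        ```python
        # Hypothetical snippet: raise the default statistical power from 0.8 to 0.9.
        tt.set_config(power=0.9)
        ```
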
65 | tea-tasting can analyze power for several values of parameters `effect_size`, `rel_effect_size`, or `n_obs`. Example:
66 | """
67 | )
68 | return
69 |
70 |
71 | @app.cell
72 | def _(data, tt):
73 | orders_per_user = tt.Mean("orders", alpha=0.1, power=0.7, n_obs=(10_000, 20_000))
74 | orders_per_user.solve_power(data, "rel_effect_size")
75 | return
76 |
77 |
78 | @app.cell(hide_code=True)
79 | def _(mo):
80 | mo.md(
81 | r"""
82 | You can analyze power for all metrics in the experiment. Example:
83 | """
84 | )
85 | return
86 |
87 |
88 | @app.cell
89 | def _(data, tt):
90 | with tt.config_context(n_obs=(10_000, 20_000)):
91 | experiment = tt.Experiment(
92 | sessions_per_user=tt.Mean("sessions", "sessions_covariate"),
93 | orders_per_session=tt.RatioOfMeans(
94 | numer="orders",
95 | denom="sessions",
96 | numer_covariate="orders_covariate",
97 | denom_covariate="sessions_covariate",
98 | ),
99 | orders_per_user=tt.Mean("orders", "orders_covariate"),
100 | revenue_per_user=tt.Mean("revenue", "revenue_covariate"),
101 | )
102 |
103 | power_result = experiment.solve_power(data)
104 | power_result
105 | return
106 |
107 |
108 | @app.cell(hide_code=True)
109 | def _(mo):
110 | mo.md(
111 | r"""
112 | In the example above, tea-tasting calculates both the relative and absolute effect size for all metrics for two possible sample size values, `10_000` and `20_000`.
113 |
114 |         The `solve_power` methods of a [metric](https://tea-tasting.e10v.me/api/metrics/mean/#tea_tasting.metrics.mean.Mean.solve_power) and of an [experiment](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.Experiment.solve_power) return instances of [`MetricPowerResults`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricPowerResults) and [`ExperimentPowerResult`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.ExperimentPowerResult) respectively. These result classes provide serialization methods similar to those of the experiment result: `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. They are also rendered as HTML tables in IPython and Jupyter, and as a table in marimo notebooks.
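
        For example, a quick conversion of the results (a minimal sketch, using the `power_result` object from the cell above; PyArrow is a core dependency of tea-tasting):

        ```python
        # Convert the power analysis results to a PyArrow Table.
        power_result.to_arrow()
        ```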
115 | """
116 | )
117 | return
118 |
119 |
120 | @app.cell(hide_code=True)
121 | def _():
122 | import marimo as mo
123 | return (mo,)
124 |
125 |
126 | if __name__ == "__main__":
127 | app.run()
128 |
--------------------------------------------------------------------------------
/examples/simulated-experiments.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.10"
3 | # dependencies = [
4 | # "marimo",
5 | # "polars",
6 | # "tea-tasting",
7 | # ]
8 | # [tool.marimo.display]
9 | # cell_output = "below"
10 | # ///
11 |
12 | import marimo
13 |
14 | __generated_with = "0.13.6"
15 | app = marimo.App()
16 |
17 |
18 | @app.cell(hide_code=True)
19 | def _(mo):
20 | mo.md(
21 | r"""
22 | # Simulated experiments
23 |
24 | ## Intro
25 |
26 | In tea-tasting, you can run multiple simulated A/A or A/B tests. In each simulation, tea-tasting splits the data into control and treatment groups and can optionally modify the treatment data. A simulation without changing the treatment data is called an A/A test.
27 |
28 | A/A tests are useful for identifying potential issues before conducting the actual A/B test. Treatment simulations are great for power analysis—especially when you need a specific uplift distribution or when an analytical formula doesn’t exist.
29 |
30 | /// admonition | Note
31 |
32 | This guide uses [Polars](https://github.com/pola-rs/polars) and [marimo](https://github.com/marimo-team/marimo). Install these packages in addition to tea-tasting to reproduce the examples:
33 |
34 | ```bash
35 | uv pip install polars marimo
36 | ```
37 |
38 | ///
39 |
40 | ## Running A/A tests
41 |
42 | First, let's prepare the data without any uplift and drop the `"variant"` column.
43 | """
44 | )
45 | return
46 |
47 |
48 | @app.cell
49 | def _():
50 | import polars as pl
51 | import tea_tasting as tt
52 |
53 | data = (
54 | tt.make_users_data(seed=42, orders_uplift=0, revenue_uplift=0)
55 | .drop_columns("variant")
56 | )
57 | data
58 | return data, pl, tt
59 |
60 |
61 | @app.cell(hide_code=True)
62 | def _(mo):
63 | mo.md(
64 | r"""
65 | To run A/A tests, first define the metrics for the experiment, then call the [`simulate`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.Experiment.simulate) method, providing the data and the number of simulations as arguments.
66 | """
67 | )
68 | return
69 |
70 |
71 | @app.cell
72 | def _(data, tt):
73 | experiment = tt.Experiment(
74 | sessions_per_user=tt.Mean("sessions"),
75 | orders_per_session=tt.RatioOfMeans("orders", "sessions"),
76 | orders_per_user=tt.Mean("orders"),
77 | revenue_per_user=tt.Mean("revenue"),
78 | n_users=tt.SampleRatio(),
79 | )
80 | results = experiment.simulate(data, 100, seed=42)
81 | results_data = results.to_polars()
82 | results_data.select(
83 | "metric",
84 | "control",
85 | "treatment",
86 | "rel_effect_size",
87 | "rel_effect_size_ci_lower",
88 | "rel_effect_size_ci_upper",
89 | "pvalue",
90 | )
91 | return experiment, results_data
92 |
93 |
94 | @app.cell(hide_code=True)
95 | def _(mo):
96 | mo.md(
97 | r"""
98 | The `simulate` method accepts data in the same formats as the `analyze` method. Internally, however, it converts the data to a PyArrow Table before running the simulations.
99 |
100 |         The method returns an instance of the [`SimulationResults`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.SimulationResults) class, which contains the results of all simulations for all metrics. The resulting object provides serialization methods similar to those of the experiment result, including `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`.
101 |
102 |         For instance, we can now calculate the proportion of rejected null hypotheses at various significance levels (`alpha`). In A/A tests, this proportion estimates the type I error rate.
103 | """
104 | )
105 | return
106 |
107 |
108 | @app.cell
109 | def _(pl, results_data):
110 | def null_rejected(
111 | results_data: pl.DataFrame,
112 | alphas: tuple[float, ...] = (0.01, 0.02, 0.05),
113 | ) -> pl.DataFrame:
114 | return results_data.group_by("metric", maintain_order=True).agg(
115 | pl.col("pvalue").le(alpha).mean().alias(f"null_rejected_{alpha}")
116 | for alpha in alphas
117 | )
118 |
119 | null_rejected(results_data)
120 | return (null_rejected,)
121 |
122 |
123 | @app.cell(hide_code=True)
124 | def _(mo):
125 | mo.md(
126 | r"""
127 | 100 simulations, as in the example above, produce a very rough estimate. In practice, a larger number of simulations, such as the default `10_000`, is recommended.
128 |
129 | ## Simulating experiments with treatment
130 |
131 | To simulate experiments with treatment, define a treatment function that takes data in the form of a PyArrow Table and returns a PyArrow Table with the modified data:
132 | """
133 | )
134 | return
135 |
136 |
137 | @app.cell
138 | def _(data, experiment, null_rejected):
139 | import pyarrow as pa
140 | import pyarrow.compute as pc
141 |
142 | def treat(data: pa.Table) -> pa.Table:
143 | return (
144 | data.drop_columns(["orders", "revenue"])
145 | .append_column("orders", pc.multiply(data["orders"], pa.scalar(1.1)))
146 | .append_column("revenue", pc.multiply(data["revenue"], pa.scalar(1.1)))
147 | )
148 |
149 | results_treat = experiment.simulate(data, 100, seed=42, treat=treat)
150 | null_rejected(results_treat.to_polars())
151 | return (treat,)
152 |
153 |
154 | @app.cell(hide_code=True)
155 | def _(mo):
156 | mo.md(
157 | r"""
158 | In the example above, we've defined a function that increases the number of orders and the revenue by 10%. For these metrics, the proportion of rejected null hypotheses is an estimate of statistical power.
159 |
160 | ## Using a function instead of static data
161 |
162 | You can use a function instead of static data to generate input dynamically. The function should take an instance of `numpy.random.Generator` as a parameter named `seed` and return experimental data in any format supported by tea-tasting.
163 |
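        For illustration, here is a minimal sketch of such a function (the column names are assumptions matching the metrics defined above):

        ```python
        import numpy as np
        import pyarrow as pa

        def gen_data(seed: np.random.Generator) -> pa.Table:
            # Hypothetical generator: 1000 users with a random 50/50 split.
            n = 1000
            sessions = seed.poisson(2, n) + 1
            orders = seed.binomial(sessions, 0.25)
            return pa.table({
                "variant": seed.integers(0, 2, n),
                "sessions": sessions,
                "orders": orders,
                "revenue": orders * seed.exponential(10, n),
            })
        ```

        You would then pass `gen_data` to `simulate` in place of static data.
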
164 | As an example, let's use the `make_users_data` function.
165 | """
166 | )
167 | return
168 |
169 |
170 | @app.cell
171 | def _(experiment, null_rejected, tt):
172 | results_data_gen = experiment.simulate(tt.make_users_data, 100, seed=42)
173 | null_rejected(results_data_gen.to_polars())
174 | return
175 |
176 |
177 | @app.cell(hide_code=True)
178 | def _(mo):
179 | mo.md(
180 | r"""
181 | On each iteration, tea-tasting calls `make_users_data` with a new `seed` and uses the returned data for the analysis of the experiment. The data returned by `make_users_data` already contains the `"variant"` column, so tea-tasting reuses that split. By default, `make_users_data` also adds the treatment uplift, and you can see it in the proportion of rejected null hypotheses.
182 |
183 | ## Tracking progress
184 |
185 | To track the progress of simulations with [`tqdm`](https://github.com/tqdm/tqdm) or [`marimo.status.progress_bar`](https://docs.marimo.io/api/status/#progress-bar), use the `progress` parameter.
186 | """
187 | )
188 | return
189 |
190 |
191 | @app.cell
192 | def _(data, experiment, mo):
193 | results_progress = experiment.simulate(
194 | data,
195 | 100,
196 | seed=42,
197 | progress=mo.status.progress_bar,
198 | )
199 | return
200 |
201 |
202 | @app.cell(hide_code=True)
203 | def _(mo):
204 | mo.md(
205 | r"""
206 | ## Parallel execution
207 |
208 | /// admonition | Note
209 |
210 |         The code below won't work in the [marimo online playground](https://docs.marimo.io/guides/publishing/playground/) as it relies on the `multiprocessing` module, which is currently [not supported](https://docs.marimo.io/guides/wasm/#limitations) by WASM notebooks. [WASM notebooks](https://docs.marimo.io/guides/wasm/) are marimo notebooks that run entirely in the browser.
211 |
212 | ///
213 |
214 | To speed up simulations and run them in parallel, use the `map_` parameter with an alternative mapping function.
215 | """
216 | )
217 | return
218 |
219 |
220 | @app.cell
221 | def _(data, experiment, mo, treat):
222 | import concurrent.futures
223 |
224 | with concurrent.futures.ProcessPoolExecutor() as executor:
225 | results_parallel = experiment.simulate(
226 | data,
227 | 100,
228 | seed=42,
229 | treat=treat,
230 | map_=executor.map,
231 | progress=mo.status.progress_bar,
232 | )
233 | return
234 |
235 |
236 | @app.cell(hide_code=True)
237 | def _(mo):
238 | mo.md(
239 | r"""
240 | As an alternative to [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor), you can use the `map`, `imap`, or `imap_unordered` methods of [`multiprocessing.pool.Pool`](https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool).
241 |
242 | It's also possible to run simulations on a distributed [Dask](https://distributed.dask.org/en/stable/api.html#distributed.Client.map) or [Ray](https://docs.ray.io/en/latest/ray-core/api/doc/ray.util.ActorPool.map.html#ray.util.ActorPool.map) cluster.
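
        A minimal sketch with `multiprocessing.pool.Pool` (assuming the same `experiment`, `data`, and `treat` objects as above; in a plain script, run it under the `if __name__ == "__main__":` guard):

        ```python
        import multiprocessing

        with multiprocessing.Pool() as pool:
            # imap lazily maps the simulations over a pool of worker processes.
            results_pool = experiment.simulate(
                data,
                100,
                seed=42,
                treat=treat,
                map_=pool.imap,
            )
        ```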
243 | """
244 | )
245 | return
246 |
247 |
248 | @app.cell(hide_code=True)
249 | def _():
250 | import marimo as mo
251 | return (mo,)
252 |
253 |
254 | if __name__ == "__main__":
255 | app.run()
256 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: "tea-tasting: statistical analysis of A/B tests"
2 | site_url: https://tea-tasting.e10v.me/
3 | site_description: A Python package for the statistical analysis of A/B tests
4 | site_author: Evgeny Ivanov
5 | copyright: © Evgeny Ivanov. The logo is designed by Freepik
6 | repo_name: e10v/tea-tasting
7 | repo_url: https://github.com/e10v/tea-tasting
8 |
9 | nav:
10 | - Overview: index.md
11 | - User guide: user-guide.md
12 | - Data backends: data-backends.md
13 | - Power analysis: power-analysis.md
14 | - Multiple testing: multiple-testing.md
15 | - Custom metrics: custom-metrics.md
16 | - Simulated experiments: simulated-experiments.md
17 | - API reference:
18 | - API reference: api/index.md
19 | - Metrics:
20 | - Metrics: api/metrics/index.md
21 | - Base: api/metrics/base.md
22 | - Mean: api/metrics/mean.md
23 | - Proportion: api/metrics/proportion.md
24 | - Resampling: api/metrics/resampling.md
25 | - Experiment: api/experiment.md
26 | - Multiplicity: api/multiplicity.md
27 | - Datasets: api/datasets.md
28 | - Global configuration: api/config.md
29 | - Aggregates: api/aggr.md
30 | - Utilities: api/utils.md
31 |
32 | theme:
33 | name: material
34 | palette:
35 | - media: "(prefers-color-scheme)"
36 | toggle:
37 | icon: material/brightness-auto
38 | name: Switch to light mode
39 | - media: "(prefers-color-scheme: light)"
40 | scheme: default
41 | primary: deep orange
42 | accent: deep orange
43 | toggle:
44 | icon: material/brightness-7
45 | name: Switch to dark mode
46 | - media: "(prefers-color-scheme: dark)"
47 | scheme: slate
48 | primary: deep orange
49 | accent: deep orange
50 | toggle:
51 | icon: material/brightness-4
52 | name: Switch to system preference
53 | logo: assets/tea-cup-white.svg
54 | favicon: assets/tea-cup-white-on-black.svg
55 | icon:
56 | repo: fontawesome/brands/github
57 | features:
58 | - content.code.copy
59 | - navigation.indexes
60 | - navigation.instant
61 | - navigation.instant.progress
62 | - navigation.top
63 | - navigation.tracking
64 | - search.highlight
65 | - search.suggest
66 | - toc.follow
67 |
68 | plugins:
69 | - mkdocstrings:
70 | default_handler: python
71 | handlers:
72 | python:
73 | options:
74 | filters: ["!^_"]
75 | heading_level: 1
76 | inherited_members: true
77 | merge_init_into_class: true
78 | show_overloads: false
79 | show_root_heading: true
80 | - search
81 |
82 | markdown_extensions:
83 | - _internal.external_links
84 | - _internal.strip_doctest_artifacts
85 | - pymdownx.blocks.admonition
86 | - pymdownx.superfences
87 | - toc:
88 | permalink: "#"
89 |
90 | extra_css:
91 | - stylesheets/extra.css
92 |
93 | extra_javascript:
94 | - javascripts/override-copy.js
95 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "tea-tasting"
3 | dynamic = ["version"]
4 | description = "A Python package for the statistical analysis of A/B tests."
5 | authors = [
6 | {name = "Evgeny Ivanov", email = "ivanov.evgeny.n@gmail.com"},
7 | ]
8 | dependencies = [
9 | "ibis-framework>=9",
10 | "narwhals>=1.4",
11 | "numpy>=1.25",
12 | "pyarrow>=16",
13 | "scipy>=1.11",
14 | ]
15 | requires-python = ">=3.10"
16 | readme = "README.md"
17 | license = {text = "MIT"}
18 | classifiers = [
19 | "Development Status :: 5 - Production/Stable",
20 | "Intended Audience :: Developers",
21 | "Intended Audience :: Information Technology",
22 | "Intended Audience :: Science/Research",
23 | "License :: OSI Approved",
24 | "License :: OSI Approved :: MIT License",
25 | "Operating System :: OS Independent",
26 | "Programming Language :: Python",
27 | "Programming Language :: Python :: 3",
28 | "Programming Language :: Python :: 3.10",
29 | "Programming Language :: Python :: 3.11",
30 | "Programming Language :: Python :: 3.12",
31 | "Programming Language :: Python :: 3.13",
32 | "Topic :: Scientific/Engineering",
33 | "Topic :: Scientific/Engineering :: Information Analysis",
34 | "Topic :: Scientific/Engineering :: Mathematics",
35 | "Typing :: Typed",
36 | ]
37 |
38 | [project.urls]
39 | homepage = "https://tea-tasting.e10v.me"
40 | documentation = "https://tea-tasting.e10v.me/user-guide"
41 | source = "https://github.com/e10v/tea-tasting"
42 | "release notes" = "https://github.com/e10v/tea-tasting/releases"
43 |
44 |
45 | [dependency-groups]
46 | docs = ["mkdocs-material", "mkdocstrings[python]"]
47 | lint = ["markdown", "marimo", "pyright", "ruff"]
48 | test = [
49 | "coverage[toml]>=7",
50 | "ibis-framework[duckdb,sqlite]",
51 | "marimo>=0.10",
52 | "pandas>=2",
53 | "polars>=1",
54 | "pytest>=8",
55 | "tqdm>=4",
56 | ]
57 |
58 |
59 | [build-system]
60 | requires = ["pdm-backend"]
61 | build-backend = "pdm.backend"
62 |
63 |
64 | [tool.pdm.build]
65 | excludes = ["src/_*/**/*"]
66 | package-dir = "src"
67 |
68 | [tool.pdm.scripts]
69 | all.composite = ["doctest", "test", "cover", "lint", "type"]
70 | all.keep_going = true
71 | cover = "coverage report -m"
72 | docserv = "mkdocs serve -w docs -w src -w mkdocs.yml"
73 | doctest.cmd = [
74 | "pytest",
75 | "--doctest-continue-on-failure",
76 | "--doctest-glob=*.md",
77 | "--doctest-modules",
78 | "--ignore=examples/",
79 | "--ignore=tests/",
80 | "--ignore-glob=src/_*",
81 | ]
82 | lint = "ruff check ."
83 | test = "coverage run -m pytest"
84 | type = "pyright"
85 |
86 | [tool.pdm.version]
87 | source = "scm"
88 | write_to = "tea_tasting/_version.txt"
89 |
90 |
91 | [tool.coverage.run]
92 | source = ["src/tea_tasting"]
93 | [tool.coverage.report]
94 | exclude_lines = ["if TYPE_CHECKING:", "pragma: no cover", "@overload", "@abc.abstractmethod"]
95 |
96 |
97 | [tool.ruff]
98 | extend-exclude = ["examples"]
99 | src = ["src"]
100 |
101 | [tool.ruff.lint]
102 | select = [
103 | "A", "ANN", "ARG", "B", "BLE", "C4", "C90", "COM", "D", "DOC", "E", "ERA",
104 | "F", "FA", "FBT", "FIX", "FLY", "FURB", "I", "ICN", "INP", "INT", "ISC",
105 | "N", "NPY", "PD", "PERF", "PGH", "PIE", "PL", "PT", "Q", "RET", "RSE",
106 | "RUF", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TC", "TD", "TID", "TRY",
107 | "UP", "W",
108 | ]
109 | ignore = ["ANN401", "PGH003", "SLF001", "TRY003"]
110 |
111 | [tool.ruff.lint.per-file-ignores]
112 | "*/__init__.py" = ["F401"]
113 | "tests/*" = [
114 | "ANN201", "D", "FBT003", "PLR2004", "PT001", "S101",
115 | ]
116 |
117 | [tool.ruff.lint.isort]
118 | force-sort-within-sections = true
119 | lines-after-imports = 2
120 |
121 | [tool.ruff.lint.pydocstyle]
122 | convention = "google"
123 |
124 | [tool.ruff.lint.pylint]
125 | max-args = 8
126 |
127 |
128 | [tool.pyright]
129 | exclude = ["examples", "**/node_modules", "**/__pycache__", "**/.*"]
130 | typeCheckingMode = "strict"
131 | reportMissingTypeStubs = false
132 | reportPrivateUsage = false
133 | reportUnknownArgumentType = false
134 | reportUnknownMemberType = false
135 | reportUnknownParameterType = false
136 | reportUnknownVariableType = false
137 |
--------------------------------------------------------------------------------
/src/_internal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/src/_internal/__init__.py
--------------------------------------------------------------------------------
/src/_internal/create_examples.py:
--------------------------------------------------------------------------------
1 | """Convert guides to examples as marimo notebooks."""
2 | # pyright: reportPrivateImportUsage=false
3 |
4 | from __future__ import annotations
5 |
6 | import re
7 | import textwrap
8 |
9 | import marimo._ast.cell
10 | import marimo._convert.utils
11 |
12 |
13 | GUIDES: dict[str, tuple[str, ...]] = {
14 | "user-guide": ("polars",),
15 | "data-backends": ("ibis-framework[duckdb]", "polars"),
16 | "power-analysis": (),
17 | "multiple-testing": ("polars",),
18 | "custom-metrics": (),
19 | "simulated-experiments": ("polars",),
20 | }
21 |
22 | HIDE_CODE = marimo._ast.cell.CellConfig(hide_code=True)
23 | SHOW_CODE = marimo._ast.cell.CellConfig(hide_code=False)
24 |
25 | RE_LINK = re.compile(r"\[([^\]]+)\]\((?!#)([^)]+)\)")
26 | RE_DOCTEST = re.compile(r"\s+# doctest:.*")
27 |
28 |
29 | def convert_guide(name: str, deps: tuple[str, ...]) -> None:
30 | with open(f"docs/{name}.md") as f:
31 | guide_text = f.read()
32 |
33 | sources = []
34 | cell_configs = []
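    # A guide alternates markdown text and ```pycon code blocks: the chunk before
    # the first fence is pure markdown, and every later chunk starts with code
    # and ends with markdown (after the closing ``` fence).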
35 | for text in guide_text.split("```pycon"):
36 | if len(sources) == 0:
37 | md = text
38 | else:
39 | end_of_code = text.find("```")
40 | md = text[end_of_code + 3:]
41 | sources.append(convert_code(text[:end_of_code]))
42 | cell_configs.append(SHOW_CODE)
43 |
44 | sources.append(marimo._convert.utils.markdown_to_marimo(convert_md(md)))
45 | cell_configs.append(HIDE_CODE)
46 |
47 | sources.append("import marimo as mo")
48 | cell_configs.append(HIDE_CODE)
49 |
50 | code = marimo._convert.utils.generate_from_sources(
51 | sources=sources,
52 | cell_configs=cell_configs,
53 | header_comments=create_header_comments(deps),
54 | )
55 | with open(f"examples/{name}.py", "w") as f:
56 | f.write(code)
57 |
58 |
59 | def convert_code(code: str) -> str:
60 | lines = []
61 | for line in code.split("\n"):
62 | if line == ">>> import tqdm":
63 | pass
64 | elif line.startswith((">>>", "...")):
65 | lines.append(RE_DOCTEST.sub("", line[4:]))
66 | elif line == "":
67 | lines.append("")
68 | return "\n".join(lines).strip().replace("tqdm.tqdm", "mo.status.progress_bar")
69 |
70 |
71 | def convert_md(md: str) -> str:
72 | return (
73 | RE_LINK.sub(update_link, md.strip())
74 | .replace(
75 | "[tqdm](https://github.com/tqdm/tqdm)",
76 | "[marimo](https://github.com/marimo-team/marimo)",
77 | )
78 | .replace(" tqdm", " marimo")
79 | )
80 |
81 |
82 | def update_link(match: re.Match[str]) -> str:
83 | label = match.group(1)
84 | url = match.group(2).replace(".md", "/")
85 | root = "" if url.startswith("http") else "https://tea-tasting.e10v.me/"
86 | return f"[{label}]({root}{url})"
87 |
88 |
89 | def create_header_comments(deps: tuple[str, ...]) -> str:
90 | dependencies = "\n".join(
91 | f'# "{dep}",'
92 | for dep in sorted((*deps, "marimo", "tea-tasting"))
93 | )
94 | return textwrap.dedent("""
95 | # /// script
96 | # requires-python = ">=3.10"
97 | # dependencies = [
98 | {dependencies}
99 | # ]
100 | # [tool.marimo.display]
101 | # cell_output = "below"
102 | # ///
103 | """).format(dependencies=dependencies)
104 |
105 |
106 | if __name__ == "__main__":
107 | for name, deps in GUIDES.items():
108 | convert_guide(name, deps)
109 |
--------------------------------------------------------------------------------
/src/_internal/external_links.py:
--------------------------------------------------------------------------------
1 | """Markdown extension that adds target="_blank" and rel="noopener" to external links."""
2 | # ruff: noqa: N802
3 | from __future__ import annotations
4 |
5 | from typing import TYPE_CHECKING
6 | import urllib.parse
7 |
8 | import markdown
9 | import markdown.extensions
10 | import markdown.treeprocessors
11 |
12 |
13 | if TYPE_CHECKING:
14 | import xml.etree.ElementTree as ET
15 |
16 |
17 | class ExternalLinksTreeprocessor(markdown.treeprocessors.Treeprocessor):
18 | def run(self, root: ET.Element) -> None:
19 | for a in root.iter("a"):
20 | url = urllib.parse.urlparse(a.get("href", ""))
21 | if (
22 | url.scheme in {"http", "https"} and
23 | url.hostname is not None and
24 | not url.hostname.startswith(("tea-tasting.e10v.me", "127.0.0.1"))
25 | ):
26 | a.set("target", "_blank")
27 | a.set("rel", "noopener")
28 |
29 | class ExternalLinksExtension(markdown.extensions.Extension):
30 | def extendMarkdown(self, md: markdown.Markdown) -> None:
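        # A negative priority registers this processor to run after all others,
        # once every link element is already in the tree.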
31 | md.treeprocessors.register(
32 | ExternalLinksTreeprocessor(md),
33 | "external_links",
34 | -1000,
35 | )
36 |
37 | def makeExtension(**kwargs: dict[str, object]) -> ExternalLinksExtension:
38 | return ExternalLinksExtension(**kwargs)
39 |
--------------------------------------------------------------------------------
/src/_internal/strip_doctest_artifacts.py:
--------------------------------------------------------------------------------
1 | """Markdown extension that strips doctest artifacts."""
2 | # ruff: noqa: N802
3 | from __future__ import annotations
4 |
5 | import re
6 |
7 | import markdown
8 | import markdown.extensions
9 | import markdown.preprocessors
10 |
11 |
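# Doctest artifacts to strip from code blocks in the rendered docs:
# "<BLANKLINE>" output markers and trailing "# doctest: ..." directives.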
12 | RE_DOCTEST = re.compile(r"<BLANKLINE>|\s+# doctest:.*")
13 |
14 | class StripDoctestArtifactsPreprocessor(markdown.preprocessors.Preprocessor):
15 | def run(self, lines: list[str]) -> list[str]:
16 | return [RE_DOCTEST.sub("", line) for line in lines]
17 |
18 | class StripDoctestArtifactsExtension(markdown.extensions.Extension):
19 | def extendMarkdown(self, md: markdown.Markdown) -> None:
20 | md.preprocessors.register(
21 | StripDoctestArtifactsPreprocessor(md),
22 | "strip_doctest_artifacts",
23 | 175,
24 | )
25 |
26 | def makeExtension(**kwargs: dict[str, object]) -> StripDoctestArtifactsExtension:
27 | return StripDoctestArtifactsExtension(**kwargs)
28 |
--------------------------------------------------------------------------------
/src/tea_tasting/__init__.py:
--------------------------------------------------------------------------------
1 | """A Python package for the statistical analysis of A/B tests.
2 |
3 | All classes and functions for the analysis of the experiments can be imported
4 | from the root `tea_tasting` module.
5 |
6 | There are functions and classes for advanced use cases such as defining custom metrics.
7 | They can be imported from submodules of `tea_tasting`.
8 |
9 | For convenience, the API reference is provided by submodules:
10 |
11 | - `tea_tasting.metrics`: Built-in metrics.
12 | - `tea_tasting.experiment`: Experiment and experiment result.
13 | - `tea_tasting.multiplicity`: Multiple hypothesis testing.
14 | - `tea_tasting.datasets`: Example datasets.
15 | - `tea_tasting.config`: Global configuration.
16 | - `tea_tasting.aggr`: Module for working with aggregated statistics.
17 | - `tea_tasting.utils`: Useful functions and classes.
18 | """
19 | # pyright: reportUnusedImport=false
20 |
21 | from tea_tasting.config import config_context, get_config, set_config
22 | from tea_tasting.datasets import make_sessions_data, make_users_data
23 | from tea_tasting.experiment import Experiment
24 | from tea_tasting.metrics import Bootstrap, Mean, Quantile, RatioOfMeans, SampleRatio
25 | from tea_tasting.multiplicity import adjust_fdr, adjust_fwer
26 | from tea_tasting.version import __version__
27 |
--------------------------------------------------------------------------------
/src/tea_tasting/config.py:
--------------------------------------------------------------------------------
1 | """Global configuration."""
2 | # ruff: noqa: PLR0913
3 |
4 | from __future__ import annotations
5 |
6 | import contextlib
7 | import contextvars
8 | from typing import TYPE_CHECKING, overload
9 |
10 | import tea_tasting.utils
11 |
12 |
13 | if TYPE_CHECKING:
14 | from collections.abc import Iterator, Sequence
15 | from typing import Literal
16 |
17 |
18 | _DEFAULT_CONFIG: dict[str, object] = {
19 | "alpha": 0.05,
20 | "alternative": "two-sided",
21 | "confidence_level": 0.95,
22 | "equal_var": False,
23 | "n_obs": None,
24 | "n_resamples": 10_000,
25 | "power": 0.8,
26 | "ratio": 1,
27 | "use_t": True,
28 | }
29 |
30 | _config_var: contextvars.ContextVar[dict[str, object]] = contextvars.ContextVar(
31 | "tea_tasting.config",
32 | default=_DEFAULT_CONFIG.copy(), # noqa: B039
33 | )
34 |
35 |
36 | @overload
37 | def get_config(option: Literal["alpha"]) -> float:
38 | ...
39 |
40 | @overload
41 | def get_config(option: Literal["alternative"]) -> str:
42 | ...
43 |
44 | @overload
45 | def get_config(option: Literal["confidence_level"]) -> float:
46 | ...
47 |
48 | @overload
49 | def get_config(option: Literal["equal_var"]) -> bool:
50 | ...
51 |
52 | @overload
53 | def get_config(option: Literal["n_obs"]) -> int | Sequence[int] | None:
54 | ...
55 |
56 | @overload
57 | def get_config(option: Literal["n_resamples"]) -> int:
58 | ...
59 |
60 | @overload
61 | def get_config(option: Literal["power"]) -> float:
62 | ...
63 |
64 | @overload
65 | def get_config(option: Literal["ratio"]) -> float | int:
66 | ...
67 |
68 | @overload
69 | def get_config(option: Literal["use_t"]) -> bool:
70 | ...
71 |
72 | @overload
73 | def get_config(option: str) -> object:
74 | ...
75 |
76 | @overload
77 | def get_config(option: None = None) -> dict[str, object]:
78 | ...
79 |
80 | def get_config(option: str | None = None) -> object:
81 | """Retrieve the current settings of the global configuration.
82 |
83 | Args:
84 | option: The option name.
85 |
86 | Returns:
87 | The specified option value if its name is provided,
88 | or a dictionary containing all options otherwise.
89 |
90 | Examples:
91 | ```pycon
92 | >>> import tea_tasting as tt
93 |
94 | >>> tt.get_config("equal_var")
95 | False
96 |
97 | ```
98 | """
99 | config = _config_var.get()
100 | return config[option] if option is not None else config.copy()
101 |
102 |
103 | def _set_config(**params: object) -> contextvars.Token[dict[str, object]]:
104 | config = _config_var.get().copy()
105 | for name, value in params.items():
106 | if value is not None:
107 | config[name] = tea_tasting.utils.auto_check(value, name)
108 | return _config_var.set(config)
109 |
110 |
111 | def set_config(
112 | *,
113 | alpha: float | None = None,
114 | alternative: Literal["two-sided", "greater", "less"] | None = None,
115 | confidence_level: float | None = None,
116 | equal_var: bool | None = None,
117 | n_obs: int | Sequence[int] | None = None,
118 | n_resamples: int | None = None,
119 | power: float | None = None,
120 | ratio: float | int | None = None,
121 | use_t: bool | None = None,
122 | **kwargs: object,
123 | ) -> None:
124 | """Update the global configuration with specified settings.
125 |
126 | Args:
127 | alpha: Significance level. Default is 0.05.
128 | alternative: Alternative hypothesis:
129 |
130 | - `"two-sided"`: the means are unequal,
131 | - `"greater"`: the mean in the treatment variant is greater than the mean
132 | in the control variant,
133 | - `"less"`: the mean in the treatment variant is less than the mean
134 | in the control variant.
135 |
136 | Default is `"two-sided"`.
137 |
138 | confidence_level: Confidence level for the confidence interval.
139 | Default is `0.95`.
140 | equal_var: Defines whether equal variance is assumed. If `True`,
141 | pooled variance is used for the calculation of the standard error
142 | of the difference between two means. Default is `False`.
143 | n_obs: Number of observations in the control and in the treatment together.
144 | Default is `None`.
145 | n_resamples: The number of resamples performed to form the bootstrap
146 | distribution of a statistic. Default is `10_000`.
147 | power: Statistical power. Default is 0.8.
148 | ratio: Ratio of the number of observations in the treatment
149 | relative to the control. Default is 1.
150 | use_t: Defines whether to use the Student's t-distribution (`True`) or
151 | the Normal distribution (`False`) by default. Default is `True`.
152 | **kwargs: User-defined global parameters.
153 |
154 | Examples:
155 | ```pycon
156 | >>> import tea_tasting as tt
157 |
158 | >>> tt.set_config(equal_var=True, use_t=False)
159 | >>> experiment = tt.Experiment(
160 | ... sessions_per_user=tt.Mean("sessions"),
161 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
162 | ... orders_per_user=tt.Mean("orders"),
163 | ... revenue_per_user=tt.Mean("revenue"),
164 | ... )
165 | >>> tt.set_config(equal_var=False, use_t=True)
166 | >>> experiment.metrics["orders_per_user"]
167 | Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None)
168 |
169 | ```
170 | """ # noqa: E501
171 | _set_config(**{k: v for k, v in locals().items() if k != "kwargs"}, **kwargs)
172 |
173 |
174 | @contextlib.contextmanager
175 | def config_context(
176 | *,
177 | alpha: float | None = None,
178 | alternative: Literal["two-sided", "greater", "less"] | None = None,
179 | confidence_level: float | None = None,
180 | equal_var: bool | None = None,
181 | n_obs: int | Sequence[int] | None = None,
182 | n_resamples: int | None = None,
183 | power: float | None = None,
184 | ratio: float | int | None = None,
185 | use_t: bool | None = None,
186 | **kwargs: object,
187 | ) -> Iterator[object]:
188 | """A context manager that temporarily modifies the global configuration.
189 |
190 | Args:
191 | alpha: Significance level. Default is 0.05.
192 | alternative: Alternative hypothesis:
193 |
194 | - `"two-sided"`: the means are unequal,
195 | - `"greater"`: the mean in the treatment variant is greater than the mean
196 | in the control variant,
197 | - `"less"`: the mean in the treatment variant is less than the mean
198 | in the control variant.
199 |
200 | Default is `"two-sided"`.
201 |
202 | confidence_level: Confidence level for the confidence interval.
203 | Default is `0.95`.
204 | equal_var: Defines whether equal variance is assumed. If `True`,
205 | pooled variance is used for the calculation of the standard error
206 | of the difference between two means. Default is `False`.
207 | n_obs: Number of observations in the control and in the treatment together.
208 | Default is `None`.
209 | n_resamples: The number of resamples performed to form the bootstrap
210 | distribution of a statistic. Default is `10_000`.
211 | power: Statistical power. Default is 0.8.
212 | ratio: Ratio of the number of observations in the treatment
213 | relative to the control. Default is 1.
214 | use_t: Defines whether to use the Student's t-distribution (`True`) or
215 | the Normal distribution (`False`) by default. Default is `True`.
216 | **kwargs: User-defined global parameters.
217 |
218 | Examples:
219 | ```pycon
220 | >>> import tea_tasting as tt
221 |
222 | >>> with tt.config_context(equal_var=True, use_t=False):
223 | ... experiment = tt.Experiment(
224 | ... sessions_per_user=tt.Mean("sessions"),
225 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"),
226 | ... orders_per_user=tt.Mean("orders"),
227 | ... revenue_per_user=tt.Mean("revenue"),
228 | ... )
229 | >>> experiment.metrics["orders_per_user"]
230 | Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None)
231 |
232 | ```
233 | """ # noqa: E501
234 | token = _set_config(
235 | **{k: v for k, v in locals().items() if k != "kwargs"},
236 | **kwargs,
237 | )
238 | try:
239 | yield
240 | finally:
241 | _config_var.reset(token)
242 |
--------------------------------------------------------------------------------
/src/tea_tasting/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | """This module provides built-in metrics used to analyze experimental data.
2 |
3 | All metric classes can be imported from `tea_tasting.metrics` module.
4 | For convenience, the API reference is provided by submodules of `tea_tasting.metrics`:
5 |
6 | - `tea_tasting.metrics.base`: Base classes for metrics.
7 | - `tea_tasting.metrics.mean`: Metrics for the analysis of means.
8 | - `tea_tasting.metrics.proportion`: Metrics for the analysis of proportions.
9 | - `tea_tasting.metrics.resampling`: Metrics analyzed using resampling methods.
10 | """
11 | # pyright: reportUnusedImport=false
12 |
13 | from tea_tasting.metrics.base import (
14 | AggrCols,
15 | MetricBase,
16 | MetricBaseAggregated,
17 | MetricBaseGranular,
18 | MetricPowerResults,
19 | MetricResult,
20 | PowerBase,
21 | PowerBaseAggregated,
22 | aggregate_by_variants,
23 | read_granular,
24 | )
25 | from tea_tasting.metrics.mean import Mean, RatioOfMeans
26 | from tea_tasting.metrics.proportion import SampleRatio
27 | from tea_tasting.metrics.resampling import Bootstrap, Quantile
28 |
--------------------------------------------------------------------------------
/src/tea_tasting/metrics/base.py:
--------------------------------------------------------------------------------
1 | """Base classes for metrics."""
2 |
3 | from __future__ import annotations
4 |
5 | import abc
6 | from collections import UserList
7 | from typing import (
8 | TYPE_CHECKING,
9 | Generic,
10 | NamedTuple,
11 | TypeAlias,
12 | TypeVar,
13 | Union,
14 | overload,
15 | )
16 |
17 | import ibis
18 | import ibis.expr.types
19 | import narwhals as nw
20 | import pyarrow as pa
21 | import pyarrow.compute as pc
22 |
23 | import tea_tasting.aggr
24 | import tea_tasting.utils
25 |
26 |
27 | if TYPE_CHECKING:
28 | from collections.abc import Sequence
29 | from typing import Literal
30 |
31 | import narwhals.typing # noqa: TC004
32 |
33 |
34 | # The | operator doesn't work for NamedTuple, but Union works.
35 | MetricResult: TypeAlias = Union[NamedTuple, dict[str, object]] # noqa: UP007
36 | MetricPowerResult: TypeAlias = Union[NamedTuple, dict[str, object]] # noqa: UP007
37 |
38 | R = TypeVar("R", bound=MetricResult)
39 | P = TypeVar("P", bound=MetricPowerResult)
40 |
41 |
42 | class MetricPowerResults(tea_tasting.utils.DictsReprMixin, UserList[P]):
43 | """Power analysis results."""
44 | default_keys = ("power", "effect_size", "rel_effect_size", "n_obs")
45 |
46 | @tea_tasting.utils._cache_method
47 | def to_dicts(self) -> tuple[dict[str, object], ...]:
48 | """"Convert the results to a sequence of dictionaries."""
49 | return tuple((v if isinstance(v, dict) else v._asdict()) for v in self)
50 |
51 | S = TypeVar("S", bound=MetricPowerResults) # type: ignore
52 |
53 |
54 | class MetricBase(abc.ABC, Generic[R], tea_tasting.utils.ReprMixin):
55 | """Base class for metrics."""
56 | @abc.abstractmethod
57 | def analyze(
58 | self,
59 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table,
60 | control: object,
61 | treatment: object,
62 | variant: str,
63 | ) -> R:
64 | """Analyze a metric in an experiment.
65 |
66 | Args:
67 | data: Experimental data.
68 | control: Control variant.
69 | treatment: Treatment variant.
70 | variant: Variant column name.
71 |
72 | Returns:
73 | Analysis result.
74 | """
75 |
76 |
77 | class PowerBase(abc.ABC, Generic[S], tea_tasting.utils.ReprMixin):
78 | """Base class for the analysis of power."""
79 | @abc.abstractmethod
80 | def solve_power(
81 | self,
82 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table,
83 | parameter: Literal[
84 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size",
85 | ) -> S:
86 | """Solve for a parameter of the power of a test.
87 |
88 | Args:
89 | data: Sample data.
90 | parameter: Parameter name.
91 |
92 | Returns:
93 | Power analysis result.
94 | """
95 |
96 |
97 | class AggrCols(NamedTuple):
98 | """Columns to be aggregated for a metric analysis.
99 |
100 | Attributes:
101 | has_count: If `True`, include the sample size.
102 | mean_cols: Column names for calculation of sample means.
103 | var_cols: Column names for calculation of sample variances.
104 | cov_cols: Pairs of column names for calculation of sample covariances.
105 | """
106 | has_count: bool = False
107 | mean_cols: Sequence[str] = ()
108 | var_cols: Sequence[str] = ()
109 | cov_cols: Sequence[tuple[str, str]] = ()
110 |
111 | def __or__(self, other: AggrCols) -> AggrCols:
112 | """Merge two aggregation column specifications.
113 |
114 | Args:
115 |             other: Second object.
116 |
117 | Returns:
118 | Merged column specifications.
119 | """
120 | return AggrCols(
121 | has_count=self.has_count or other.has_count,
122 | mean_cols=tuple({*self.mean_cols, *other.mean_cols}),
123 | var_cols=tuple({*self.var_cols, *other.var_cols}),
124 | cov_cols=tuple({
125 | tea_tasting.aggr._sorted_tuple(*cols)
126 | for cols in tuple({*self.cov_cols, *other.cov_cols})
127 | }),
128 | )
129 |
130 | def __len__(self) -> int:
131 | """Total length of all object attributes.
132 |
133 | If has_count is True then its value is 1, or 0 otherwise.
134 | """
135 | return (
136 | int(self.has_count)
137 | + len(self.mean_cols)
138 | + len(self.var_cols)
139 | + len(self.cov_cols)
140 | )
141 |
142 |
143 | class _HasAggrCols(abc.ABC):
144 | @property
145 | @abc.abstractmethod
146 | def aggr_cols(self) -> AggrCols:
147 | """Columns to be aggregated for an analysis."""
148 |
149 |
150 | class MetricBaseAggregated(MetricBase[R], _HasAggrCols):
151 | """Base class for metrics, which are analyzed using aggregated statistics."""
152 | @overload
153 | def analyze(
154 | self,
155 | data: dict[object, tea_tasting.aggr.Aggregates],
156 | control: object,
157 | treatment: object,
158 | variant: str | None = None,
159 | ) -> R:
160 | ...
161 |
162 | @overload
163 | def analyze(
164 | self,
165 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table,
166 | control: object,
167 | treatment: object,
168 | variant: str,
169 | ) -> R:
170 | ...
171 |
172 | def analyze(
173 | self,
174 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[
175 | object, tea_tasting.aggr.Aggregates],
176 | control: object,
177 | treatment: object,
178 | variant: str | None = None,
179 | ) -> R:
180 | """Analyze a metric in an experiment.
181 |
182 | Args:
183 | data: Experimental data.
184 | control: Control variant.
185 | treatment: Treatment variant.
186 | variant: Variant column name.
187 |
188 | Returns:
189 | Analysis result.
190 | """
191 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None)
192 | aggr = aggregate_by_variants(
193 | data,
194 | aggr_cols=self.aggr_cols,
195 | variant=variant,
196 | )
197 | return self.analyze_aggregates(
198 | control=aggr[control],
199 | treatment=aggr[treatment],
200 | )
201 |
202 | @abc.abstractmethod
203 | def analyze_aggregates(
204 | self,
205 | control: tea_tasting.aggr.Aggregates,
206 | treatment: tea_tasting.aggr.Aggregates,
207 | ) -> R:
208 | """Analyze metric in an experiment using aggregated statistics.
209 |
210 | Args:
211 | control: Control data.
212 | treatment: Treatment data.
213 |
214 | Returns:
215 | Analysis result.
216 | """
217 |
218 |
219 | class PowerBaseAggregated(PowerBase[S], _HasAggrCols):
220 | """Base class for the analysis of power using aggregated statistics."""
221 | def solve_power(
222 | self,
223 | data: (
224 | narwhals.typing.IntoFrame |
225 | ibis.expr.types.Table |
226 | tea_tasting.aggr.Aggregates
227 | ),
228 | parameter: Literal[
229 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size",
230 | ) -> S:
231 | """Solve for a parameter of the power of a test.
232 |
233 | Args:
234 | data: Sample data.
235 | parameter: Parameter name.
236 |
237 | Returns:
238 | Power analysis result.
239 | """
240 | tea_tasting.utils.check_scalar(
241 | parameter,
242 | "parameter",
243 | in_={"power", "effect_size", "rel_effect_size", "n_obs"},
244 | )
245 | if not isinstance(data, tea_tasting.aggr.Aggregates):
246 | data = tea_tasting.aggr.read_aggregates(
247 | data=data,
248 | group_col=None,
249 | **self.aggr_cols._asdict(),
250 | )
251 | return self.solve_power_from_aggregates(data=data, parameter=parameter)
252 |
253 | @abc.abstractmethod
254 | def solve_power_from_aggregates(
255 | self,
256 | data: tea_tasting.aggr.Aggregates,
257 | parameter: Literal[
258 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size",
259 | ) -> S:
260 | """Solve for a parameter of the power of a test.
261 |
262 | Args:
263 | data: Sample data.
264 | parameter: Parameter name.
265 |
266 | Returns:
267 | Power analysis result.
268 | """
269 |
270 |
271 | def aggregate_by_variants(
272 | data: (
273 | narwhals.typing.IntoFrame |
274 | ibis.expr.types.Table |
275 | dict[object, tea_tasting.aggr.Aggregates]
276 | ),
277 | aggr_cols: AggrCols,
278 | variant: str | None = None,
279 | ) -> dict[object, tea_tasting.aggr.Aggregates]:
280 | """Aggregate experimental data by variants.
281 |
282 | Args:
283 | data: Experimental data.
284 | aggr_cols: Columns to be aggregated.
285 | variant: Variant column name.
286 |
287 | Returns:
288 | Experimental data as a dictionary of Aggregates.
289 | """
290 | if isinstance(data, dict):
291 | return data
292 |
293 | if variant is None:
294 | raise ValueError("The variant parameter is required but was not provided.")
295 |
296 | return tea_tasting.aggr.read_aggregates(
297 | data=data,
298 | group_col=variant,
299 | **aggr_cols._asdict(),
300 | )
301 |
302 |
303 | class _HasCols(abc.ABC):
304 | @property
305 | @abc.abstractmethod
306 | def cols(self) -> Sequence[str]:
307 | """Columns to be fetched for an analysis."""
308 |
309 |
310 | class MetricBaseGranular(MetricBase[R], _HasCols):
311 | """Base class for metrics, which are analyzed using granular data."""
312 | @overload
313 | def analyze(
314 | self,
315 | data: dict[object, pa.Table],
316 | control: object,
317 | treatment: object,
318 | variant: str | None = None,
319 | ) -> R:
320 | ...
321 |
322 | @overload
323 | def analyze(
324 | self,
325 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table,
326 | control: object,
327 | treatment: object,
328 | variant: str,
329 | ) -> R:
330 | ...
331 |
332 | def analyze(
333 | self,
334 | data: (
335 | narwhals.typing.IntoFrame |
336 | ibis.expr.types.Table |
337 | dict[object, pa.Table]
338 | ),
339 | control: object,
340 | treatment: object,
341 | variant: str | None = None,
342 | ) -> R:
343 | """Analyze a metric in an experiment.
344 |
345 | Args:
346 | data: Experimental data.
347 | control: Control variant.
348 | treatment: Treatment variant.
349 | variant: Variant column name.
350 |
351 | Returns:
352 | Analysis result.
353 | """
354 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None)
355 | dfs = read_granular(
356 | data,
357 | cols=self.cols,
358 | variant=variant,
359 | )
360 | return self.analyze_granular(
361 | control=dfs[control],
362 | treatment=dfs[treatment],
363 | )
364 |
365 | @abc.abstractmethod
366 | def analyze_granular(
367 | self,
368 | control: pa.Table,
369 | treatment: pa.Table,
370 | ) -> R:
371 | """Analyze metric in an experiment using granular data.
372 |
373 | Args:
374 | control: Control data.
375 | treatment: Treatment data.
376 |
377 | Returns:
378 | Analysis result.
379 | """
380 |
381 |
382 | @overload
383 | def read_granular(
384 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table,
385 | cols: Sequence[str] = (),
386 | variant: None = None,
387 | ) -> pa.Table:
388 | ...
389 |
390 | @overload
391 | def read_granular(
392 | data: dict[object, pa.Table],
393 | cols: Sequence[str] = (),
394 | variant: None = None,
395 | ) -> dict[object, pa.Table]:
396 | ...
397 |
398 | @overload
399 | def read_granular(
400 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[object, pa.Table],
401 | cols: Sequence[str],
402 | variant: str,
403 | ) -> dict[object, pa.Table]:
404 | ...
405 |
406 | def read_granular(
407 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[object, pa.Table],
408 | cols: Sequence[str] = (),
409 | variant: str | None = None,
410 | ) -> pa.Table | dict[object, pa.Table]:
411 | """Read granular experimental data.
412 |
413 | Args:
414 | data: Experimental data.
415 | cols: Columns to read.
416 | variant: Variant column name.
417 |
418 | Returns:
419 | Experimental data as a dictionary of PyArrow Tables.
420 | """
421 | if isinstance(data, dict):
422 | return data
423 |
424 | variant_cols = () if variant is None else (variant,)
425 | if isinstance(data, ibis.expr.types.Table):
426 | if len(cols) + len(variant_cols) > 0:
427 | data = data.select(*cols, *variant_cols)
428 | table = data.to_pyarrow()
429 | else:
430 | data = nw.from_native(data)
431 | if isinstance(data, nw.LazyFrame):
432 | data = data.collect()
433 | if len(cols) + len(variant_cols) > 0:
434 | data = data.select(*cols, *variant_cols)
435 | table = data.to_arrow()
436 |
437 | if variant is None:
438 | return table
439 |
440 | variant_array = table[variant]
441 | if len(cols) > 0:
442 | table = table.select(cols)
443 | return {
444 | var: table.filter(pc.equal(variant_array, pa.scalar(var))) # type: ignore
445 | for var in variant_array.unique().to_pylist()
446 | }
447 |
--------------------------------------------------------------------------------
/src/tea_tasting/metrics/proportion.py:
--------------------------------------------------------------------------------
1 | """Metrics for the analysis of proportions."""
2 |
3 | from __future__ import annotations
4 |
5 | import math
6 | from typing import TYPE_CHECKING, NamedTuple
7 |
8 | import scipy.stats
9 |
10 | import tea_tasting.aggr
11 | import tea_tasting.metrics
12 | from tea_tasting.metrics.base import AggrCols, MetricBaseAggregated
13 | import tea_tasting.utils
14 |
15 |
16 | if TYPE_CHECKING:
17 | from typing import Literal
18 |
19 | import ibis.expr.types
20 | import narwhals.typing
21 |
22 |
23 | _MAX_EXACT_THRESHOLD = 1000
24 |
25 |
26 | class SampleRatioResult(NamedTuple):
27 | """Result of the sample ratio mismatch check.
28 |
29 | Attributes:
30 | control: Number of observations in control.
31 | treatment: Number of observations in treatment.
32 |         pvalue: P-value.
33 | """
34 | control: float
35 | treatment: float
36 | pvalue: float
37 |
38 |
39 | class SampleRatio(MetricBaseAggregated[SampleRatioResult]): # noqa: D101
40 | def __init__(
41 | self,
42 | ratio: float | int | dict[object, float | int] = 1,
43 | *,
44 | method: Literal["auto", "binom", "norm"] = "auto",
45 | correction: bool = True,
46 | ) -> None:
47 | """Metric for sample ratio mismatch check.
48 |
49 | Args:
50 | ratio: Expected ratio of the number of observations in the treatment
51 | relative to the control.
52 | method: Statistical test used for calculation of p-value:
53 |
54 | - `"auto"`: Apply exact binomial test if the total number
55 |                 of observations is < 1000; otherwise, apply the normal approximation.
56 | - `"binom"`: Apply exact binomial test.
57 | - `"norm"`: Apply normal approximation of the binomial distribution.
58 |
59 | correction: If `True`, add continuity correction.
60 | Only for normal approximation.
61 |
62 | Examples:
63 | ```pycon
64 | >>> import tea_tasting as tt
65 |
66 | >>> experiment = tt.Experiment(
67 | ... sample_ratio=tt.SampleRatio(),
68 | ... )
69 | >>> data = tt.make_users_data(seed=42)
70 | >>> result = experiment.analyze(data)
71 | >>> result.with_keys(("metric", "control", "treatment", "pvalue"))
72 | metric control treatment pvalue
73 | sample_ratio 2023 1977 0.477
74 |
75 | ```
76 |
77 | Different expected ratio:
78 |
79 | ```pycon
80 | >>> experiment = tt.Experiment(
81 | ... sample_ratio=tt.SampleRatio(0.5),
82 | ... )
83 | >>> data = tt.make_users_data(seed=42)
84 | >>> result = experiment.analyze(data)
85 | >>> result.with_keys(("metric", "control", "treatment", "pvalue"))
86 | metric control treatment pvalue
87 | sample_ratio 2023 1977 3.26e-103
88 |
89 | ```
90 | """
91 | if isinstance(ratio, dict):
92 | for val in ratio.values():
93 | tea_tasting.utils.auto_check(val, "ratio")
94 | else:
95 | tea_tasting.utils.auto_check(ratio, "ratio")
96 | self.ratio = ratio
97 |
98 | self.method = tea_tasting.utils.check_scalar(
99 | method, "method", typ=str, in_={"auto", "binom", "norm"})
100 | self.correction = tea_tasting.utils.auto_check(correction, "correction")
101 |
102 |
103 | @property
104 | def aggr_cols(self) -> AggrCols:
105 | """Columns to be aggregated for a metric analysis."""
106 | return AggrCols(has_count=True)
107 |
108 |
109 | def analyze(
110 | self,
111 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[
112 | object, tea_tasting.aggr.Aggregates],
113 | control: object,
114 | treatment: object,
115 | variant: str | None = None,
116 | ) -> SampleRatioResult:
117 | """Perform a sample ratio mismatch check.
118 |
119 | Args:
120 | data: Experimental data.
121 | control: Control variant.
122 | treatment: Treatment variant.
123 | variant: Variant column name.
124 |
125 | Returns:
126 | Analysis result.
127 | """
128 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None)
129 | aggr = tea_tasting.metrics.aggregate_by_variants(
130 | data,
131 | aggr_cols=self.aggr_cols,
132 | variant=variant,
133 | )
134 |
135 | k = aggr[treatment].count()
136 | n = k + aggr[control].count()
137 |
138 | r = (
139 | self.ratio
140 | if isinstance(self.ratio, float | int)
141 | else self.ratio[treatment] / self.ratio[control]
142 | )
143 | p = r / (1 + r)
144 |
145 | if (
146 | self.method == "binom" or
147 | (self.method == "auto" and n < _MAX_EXACT_THRESHOLD)
148 | ):
149 | pvalue = scipy.stats.binomtest(k=int(k), n=int(n), p=p).pvalue
150 | else: # norm
151 | d = k - n*p
152 | if self.correction and d != 0:
153 | d = min(d + 0.5, 0) if d < 0 else max(d - 0.5, 0)
154 | z = d / math.sqrt(n * p * (1 - p))
155 | pvalue = 2 * scipy.stats.norm.sf(abs(z))
156 |
157 | return SampleRatioResult(
158 | control=n - k,
159 | treatment=k,
160 | pvalue=pvalue, # type: ignore
161 | )
162 |
163 |
164 | def analyze_aggregates(
165 | self,
166 | control: tea_tasting.aggr.Aggregates,
167 | treatment: tea_tasting.aggr.Aggregates,
168 | ) -> SampleRatioResult:
169 | """Stub method for compatibility with the base class."""
170 | raise NotImplementedError
171 |
--------------------------------------------------------------------------------
/src/tea_tasting/version.py:
--------------------------------------------------------------------------------
1 | """Package version."""
2 |
3 | from __future__ import annotations
4 |
5 | import importlib.metadata
6 | import importlib.resources
7 |
8 |
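# Prefer the installed package metadata; fall back to the _version.txt file
# written at build time (see [tool.pdm.version] in pyproject.toml).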
9 | try:
10 | __version__ = importlib.metadata.version(__package__ or "tea-tasting")
11 | except importlib.metadata.PackageNotFoundError:
12 | __version__ = (
13 | importlib.resources.files("tea_tasting")
14 | .joinpath("_version.txt")
15 | .read_text()
16 | .strip()
17 | )
18 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/tests/__init__.py
--------------------------------------------------------------------------------
/tests/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/tests/metrics/__init__.py
--------------------------------------------------------------------------------
/tests/metrics/test_base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING, NamedTuple
4 | import unittest.mock
5 |
6 | import ibis
7 | import polars as pl
8 | import pyarrow as pa
9 | import pyarrow.compute as pc
10 | import pytest
11 |
12 | import tea_tasting.aggr
13 | import tea_tasting.datasets
14 | import tea_tasting.metrics.base
15 |
16 |
17 | if TYPE_CHECKING:
18 | from typing import Any, Literal
19 |
20 | import ibis.expr.types # noqa: TC004
21 | import pandas as pd
22 |
23 |     # Used in annotations only, so it's evaluated solely under TYPE_CHECKING.
24 |     Frame = ibis.expr.types.Table | pa.Table | pd.DataFrame | pl.LazyFrame
25 |
26 |
27 | def test_aggr_cols_or():
28 | aggr_cols0 = tea_tasting.metrics.base.AggrCols(
29 | has_count=False,
30 | mean_cols=("a", "b"),
31 | var_cols=("b", "c"),
32 | cov_cols=(("a", "b"), ("c", "b")),
33 | )
34 |
35 | aggr_cols1 = tea_tasting.metrics.base.AggrCols(
36 | has_count=True,
37 | mean_cols=("b", "c"),
38 | var_cols=("c", "d"),
39 | cov_cols=(("b", "c"), ("d", "c")),
40 | )
41 |
42 | aggr_cols = aggr_cols0 | aggr_cols1
43 |
44 | assert isinstance(aggr_cols, tea_tasting.metrics.base.AggrCols)
45 | assert aggr_cols.has_count is True
46 | assert set(aggr_cols.mean_cols) == {"a", "b", "c"}
47 | assert len(aggr_cols.mean_cols) == 3
48 | assert set(aggr_cols.var_cols) == {"b", "c", "d"}
49 | assert len(aggr_cols.var_cols) == 3
50 | assert set(aggr_cols.cov_cols) == {("a", "b"), ("b", "c"), ("c", "d")}
51 | assert len(aggr_cols.cov_cols) == 3
52 |
53 |
54 | def test_aggr_cols_len():
55 | assert len(tea_tasting.metrics.base.AggrCols(
56 | has_count=False,
57 | mean_cols=("a", "b"),
58 | var_cols=("b", "c"),
59 | cov_cols=(("a", "b"), ("c", "b")),
60 | )) == 6
61 | assert len(tea_tasting.metrics.base.AggrCols(
62 | has_count=True,
63 | mean_cols=("b", "c"),
64 | var_cols=("c", "d"),
65 | cov_cols=(("b", "c"), ("d", "c")),
66 | )) == 7
67 |
68 |
69 | @pytest.fixture
70 | def data_arrow() -> pa.Table:
71 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42)
72 |
73 | @pytest.fixture
74 | def data_pandas(data_arrow: pa.Table) -> pd.DataFrame:
75 | return data_arrow.to_pandas()
76 |
77 | @pytest.fixture
78 | def data_polars(data_arrow: pa.Table) -> pl.DataFrame:
79 | return pl.from_arrow(data_arrow) # type: ignore
80 |
81 | @pytest.fixture
82 | def data_polars_lazy(data_polars: pl.DataFrame) -> pl.LazyFrame:
83 | return data_polars.lazy()
84 |
85 | @pytest.fixture
86 | def data_duckdb(data_arrow: pa.Table) -> ibis.expr.types.Table:
87 | return ibis.connect("duckdb://").create_table("data", data_arrow)
88 |
89 | @pytest.fixture
90 | def data_sqlite(data_arrow: pa.Table) -> ibis.expr.types.Table:
91 | return ibis.connect("sqlite://").create_table("data", data_arrow)
92 |
93 | @pytest.fixture(params=[
94 | "data_arrow", "data_pandas",
95 | "data_polars", "data_polars_lazy",
96 | "data_duckdb", "data_sqlite",
97 | ])
98 | def data(request: pytest.FixtureRequest) -> Frame:
99 | return request.getfixturevalue(request.param)
100 |
101 |
102 | @pytest.fixture
103 | def aggr_cols() -> tea_tasting.metrics.base.AggrCols:
104 | return tea_tasting.metrics.base.AggrCols(
105 | has_count=True,
106 | mean_cols=("sessions", "orders"),
107 | var_cols=("orders", "revenue"),
108 | cov_cols=(("sessions", "revenue"),),
109 | )
110 |
111 | @pytest.fixture
112 | def correct_aggrs(
113 | data_arrow: pa.Table,
114 | aggr_cols: tea_tasting.metrics.base.AggrCols,
115 | ) -> dict[object, tea_tasting.aggr.Aggregates]:
116 | return tea_tasting.aggr.read_aggregates(
117 | data_arrow,
118 | group_col="variant",
119 | **aggr_cols._asdict(),
120 | )
121 |
122 | @pytest.fixture
123 | def correct_aggr(
124 | data_arrow: pa.Table,
125 | aggr_cols: tea_tasting.metrics.base.AggrCols,
126 | ) -> tea_tasting.aggr.Aggregates:
127 | return tea_tasting.aggr.read_aggregates(
128 | data_arrow,
129 | group_col=None,
130 | **aggr_cols._asdict(),
131 | )
132 |
133 | @pytest.fixture
134 | def cols() -> tuple[str, ...]:
135 | return ("sessions", "orders", "revenue")
136 |
137 | @pytest.fixture
138 | def correct_gran(
139 | data_arrow: pa.Table,
140 | cols: tuple[str, ...],
141 | ) -> dict[object, pa.Table]:
142 | variant_col = data_arrow["variant"]
143 | table = data_arrow.select(cols)
144 | return {
145 | var: table.filter(pc.equal(variant_col, pa.scalar(var))) # type: ignore
146 | for var in variant_col.unique().to_pylist()
147 | }
148 |
149 | @pytest.fixture
150 | def aggr_metric(
151 | aggr_cols: tea_tasting.metrics.base.AggrCols,
152 | ) -> tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]]:
153 | class AggrMetric(tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]]):
154 | @property
155 | def aggr_cols(self) -> tea_tasting.metrics.base.AggrCols:
156 | return aggr_cols
157 |
158 | def analyze_aggregates(
159 | self,
160 | control: tea_tasting.aggr.Aggregates, # noqa: ARG002
161 | treatment: tea_tasting.aggr.Aggregates, # noqa: ARG002
162 | ) -> dict[str, object]:
163 | return {}
164 |
165 | return AggrMetric()
166 |
167 | @pytest.fixture
168 | def aggr_power(
169 | aggr_cols: tea_tasting.metrics.base.AggrCols,
170 | ) -> tea_tasting.metrics.base.PowerBaseAggregated[
171 | tea_tasting.metrics.base.MetricPowerResults[dict[str, object]]
172 | ]:
173 | class AggrPower(
174 | tea_tasting.metrics.base.PowerBaseAggregated[
175 | tea_tasting.metrics.base.MetricPowerResults[dict[str, object]]
176 | ],
177 | ):
178 | @property
179 | def aggr_cols(self) -> tea_tasting.metrics.base.AggrCols:
180 | return aggr_cols
181 |
182 | def solve_power_from_aggregates(
183 | self,
184 | data: tea_tasting.aggr.Aggregates, # noqa: ARG002
185 | parameter: Literal[ # noqa: ARG002
186 | "power",
187 | "effect_size",
188 | "rel_effect_size",
189 | "n_obs",
190 | ] = "power",
191 | ) -> tea_tasting.metrics.base.MetricPowerResults[dict[str, object]]:
192 | return tea_tasting.metrics.base.MetricPowerResults()
193 | return AggrPower()
194 |
195 | @pytest.fixture
196 | def gran_metric(
197 | cols: tuple[str, ...],
198 | ) -> tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]]:
199 | class GranMetric(tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]]):
200 | @property
201 | def cols(self) -> tuple[str, ...]:
202 | return cols
203 |
204 | def analyze_granular(
205 | self,
206 | control: pa.Table, # noqa: ARG002
207 | treatment: pa.Table, # noqa: ARG002
208 | ) -> dict[str, object]:
209 | return {}
210 |
211 | return GranMetric()
212 |
213 |
214 | def _compare_aggrs(
215 | left: tea_tasting.aggr.Aggregates,
216 | right: tea_tasting.aggr.Aggregates,
217 | ) -> None:
218 | assert left.count_ == right.count_
219 | assert left.mean_ == pytest.approx(right.mean_)
220 | assert left.var_ == pytest.approx(right.var_)
221 | assert left.cov_ == pytest.approx(right.cov_)
222 |
223 |
224 | def test_metric_power_results_to_dicts():
225 | result0 = {
226 | "power": 0.8,
227 | "effect_size": 1,
228 | "rel_effect_size": 0.05,
229 | "n_obs": 10_000,
230 | }
231 | result1 = {
232 | "power": 0.9,
233 | "effect_size": 2,
234 | "rel_effect_size": 0.1,
235 | "n_obs": 20_000,
236 | }
237 |
238 | results = tea_tasting.metrics.base.MetricPowerResults[dict[str, float | int]]( # type: ignore
239 | [result0, result1])
240 | assert results.to_dicts() == (result0, result1)
241 |
242 | class PowerResult(NamedTuple):
243 | power: float
244 | effect_size: float
245 | rel_effect_size: float
246 | n_obs: float
247 | results = tea_tasting.metrics.base.MetricPowerResults[PowerResult]([
248 | PowerResult(**result0),
249 | PowerResult(**result1),
250 | ])
251 | assert results.to_dicts() == (result0, result1)
252 |
253 |
254 | def test_metric_base_aggregated_analyze_frame(
255 | aggr_metric: tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]],
256 | data_arrow: pa.Table,
257 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates],
258 | ):
259 | aggr_metric.analyze_aggregates = unittest.mock.MagicMock()
260 | aggr_metric.analyze(data_arrow, control=0, treatment=1, variant="variant")
261 | aggr_metric.analyze_aggregates.assert_called_once()
262 | kwargs = aggr_metric.analyze_aggregates.call_args.kwargs
263 | _compare_aggrs(kwargs["control"], correct_aggrs[0])
264 | _compare_aggrs(kwargs["treatment"], correct_aggrs[1])
265 |
266 | def test_metric_base_aggregated_analyze_aggrs(
267 | aggr_metric: tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]],
268 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates],
269 | ):
270 | aggr_metric.analyze_aggregates = unittest.mock.MagicMock()
271 | aggr_metric.analyze(correct_aggrs, control=0, treatment=1)
272 | aggr_metric.analyze_aggregates.assert_called_once()
273 | kwargs = aggr_metric.analyze_aggregates.call_args.kwargs
274 | _compare_aggrs(kwargs["control"], correct_aggrs[0])
275 | _compare_aggrs(kwargs["treatment"], correct_aggrs[1])
276 |
277 |
278 | def test_power_base_aggregated_analyze_frame(
279 | aggr_power: tea_tasting.metrics.base.PowerBaseAggregated[Any],
280 | data_arrow: pa.Table,
281 | correct_aggr: tea_tasting.aggr.Aggregates,
282 | ):
283 | aggr_power.solve_power_from_aggregates = unittest.mock.MagicMock()
284 | aggr_power.solve_power(data_arrow, "effect_size")
285 | aggr_power.solve_power_from_aggregates.assert_called_once()
286 | kwargs = aggr_power.solve_power_from_aggregates.call_args.kwargs
287 | _compare_aggrs(kwargs["data"], correct_aggr)
288 | assert kwargs["parameter"] == "effect_size"
289 |
290 | def test_power_base_aggregated_analyze_aggr(
291 | aggr_power: tea_tasting.metrics.base.PowerBaseAggregated[Any],
292 | correct_aggr: tea_tasting.aggr.Aggregates,
293 | ):
294 | aggr_power.solve_power_from_aggregates = unittest.mock.MagicMock()
295 | aggr_power.solve_power(correct_aggr, "rel_effect_size")
296 | aggr_power.solve_power_from_aggregates.assert_called_once()
297 | kwargs = aggr_power.solve_power_from_aggregates.call_args.kwargs
298 | _compare_aggrs(kwargs["data"], correct_aggr)
299 | assert kwargs["parameter"] == "rel_effect_size"
300 |
301 |
302 | def test_aggregate_by_variants_frame(
303 | data_arrow: pa.Table,
304 | aggr_cols: tea_tasting.metrics.base.AggrCols,
305 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates],
306 | ):
307 | aggrs = tea_tasting.metrics.base.aggregate_by_variants(
308 | data_arrow,
309 | aggr_cols=aggr_cols,
310 | variant="variant",
311 | )
312 | _compare_aggrs(aggrs[0], correct_aggrs[0])
313 | _compare_aggrs(aggrs[1], correct_aggrs[1])
314 |
315 | def test_aggregate_by_variants_aggrs(
316 | aggr_cols: tea_tasting.metrics.base.AggrCols,
317 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates],
318 | ):
319 | aggrs = tea_tasting.metrics.base.aggregate_by_variants(
320 | correct_aggrs,
321 | aggr_cols=aggr_cols,
322 | variant="variant",
323 | )
324 | _compare_aggrs(aggrs[0], correct_aggrs[0])
325 | _compare_aggrs(aggrs[1], correct_aggrs[1])
326 |
327 | def test_aggregate_by_variants_raises(
328 | data_arrow: pa.Table,
329 | aggr_cols: tea_tasting.metrics.base.AggrCols,
330 | ):
331 | with pytest.raises(ValueError, match="variant"):
332 | tea_tasting.metrics.base.aggregate_by_variants(data_arrow, aggr_cols=aggr_cols)
333 |
334 |
335 | def test_metric_base_granular_frame(
336 | gran_metric: tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]],
337 | data_arrow: pa.Table,
338 | correct_gran: dict[object, pa.Table],
339 | ):
340 | gran_metric.analyze_granular = unittest.mock.MagicMock()
341 | gran_metric.analyze(data_arrow, control=0, treatment=1, variant="variant")
342 | gran_metric.analyze_granular.assert_called_once()
343 | kwargs = gran_metric.analyze_granular.call_args.kwargs
344 | assert kwargs["control"].equals(correct_gran[0])
345 | assert kwargs["treatment"].equals(correct_gran[1])
346 |
347 | def test_metric_base_granular_gran(
348 | gran_metric: tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]],
349 | correct_gran: dict[object, pa.Table],
350 | ):
351 | gran_metric.analyze_granular = unittest.mock.MagicMock()
352 | gran_metric.analyze(correct_gran, control=0, treatment=1)
353 | gran_metric.analyze_granular.assert_called_once()
354 | kwargs = gran_metric.analyze_granular.call_args.kwargs
355 | assert kwargs["control"].equals(correct_gran[0])
356 | assert kwargs["treatment"].equals(correct_gran[1])
357 |
358 |
359 | def test_read_granular_frame(
360 | data: Frame,
361 | cols: tuple[str, ...],
362 | correct_gran: dict[object, pa.Table],
363 | ):
364 | gran = tea_tasting.metrics.base.read_granular(
365 | data,
366 | cols=cols,
367 | variant="variant",
368 | )
369 | assert gran[0].equals(correct_gran[0])
370 | assert gran[1].equals(correct_gran[1])
371 |
372 | def test_read_granular_dict(
373 | cols: tuple[str, ...],
374 | correct_gran: dict[object, pa.Table],
375 | ):
376 | gran = tea_tasting.metrics.base.read_granular(
377 | correct_gran,
378 | cols=cols,
379 | variant="variant",
380 | )
381 | assert gran[0].equals(correct_gran[0])
382 | assert gran[1].equals(correct_gran[1])
383 |
384 | def test_read_granular_none(
385 | data: Frame,
386 | cols: tuple[str, ...],
387 | data_arrow: pa.Table,
388 | ):
389 | gran = tea_tasting.metrics.base.read_granular(data, cols=cols)
390 | assert gran.equals(data_arrow.select(cols))
391 |
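# Side note (not file content): together, the three tests above pin down the
# read_granular contract: a frame plus a variant column yields a dict of
# per-variant Arrow tables, an already-split dict passes through unchanged,
# and omitting the variant returns a single table restricted to the
# requested columns.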
--------------------------------------------------------------------------------
/tests/metrics/test_proportion.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING, NamedTuple
4 | import unittest.mock
5 |
6 | import pytest
7 |
8 | import tea_tasting.aggr
9 | import tea_tasting.datasets
10 | import tea_tasting.metrics.base
11 | import tea_tasting.metrics.proportion
12 |
13 |
14 | if TYPE_CHECKING:
15 | import pyarrow as pa
16 |
17 |
18 | @pytest.fixture
19 | def data_arrow() -> pa.Table:
20 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42)
21 |
22 | @pytest.fixture
23 | def data_aggr(data_arrow: pa.Table) -> dict[object, tea_tasting.aggr.Aggregates]:
24 | return tea_tasting.aggr.read_aggregates(
25 | data_arrow,
26 | group_col="variant",
27 | has_count=True,
28 | mean_cols=(),
29 | var_cols=(),
30 | cov_cols=(),
31 | )
32 |
33 |
34 | def test_sample_ratio_init_default():
35 | metric = tea_tasting.metrics.proportion.SampleRatio()
36 | assert metric.ratio == 1
37 | assert metric.method == "auto"
38 | assert metric.correction is True
39 |
40 | def test_sample_ratio_init_custom():
41 | metric = tea_tasting.metrics.proportion.SampleRatio(
42 | {0: 0.5, 1: 0.5},
43 | method="norm",
44 | correction=False,
45 | )
46 | assert metric.ratio == {0: 0.5, 1: 0.5}
47 | assert metric.method == "norm"
48 | assert metric.correction is False
49 |
50 |
51 | def test_sample_ratio_aggr_cols():
52 | metric = tea_tasting.metrics.proportion.SampleRatio()
53 | assert metric.aggr_cols == tea_tasting.metrics.base.AggrCols(has_count=True)
54 |
55 |
56 | def test_sample_ratio_analyze_frame(data_arrow: pa.Table):
57 | metric = tea_tasting.metrics.proportion.SampleRatio()
58 | result = metric.analyze(data_arrow, 0, 1, variant="variant")
59 | assert isinstance(result, tea_tasting.metrics.proportion.SampleRatioResult)
60 |
61 | def test_sample_ratio_analyze_auto():
62 | metric = tea_tasting.metrics.proportion.SampleRatio()
63 | with unittest.mock.patch("scipy.stats.binomtest") as mock:
64 | mock.return_value = NamedTuple("Result", (("pvalue", float),))(pvalue=0.1)
65 | data = tea_tasting.datasets.make_users_data(
66 | seed=42,
67 | n_users=tea_tasting.metrics.proportion._MAX_EXACT_THRESHOLD - 1,
68 | )
69 | metric.analyze(data, 0, 1, variant="variant")
70 | mock.assert_called_once()
71 | with unittest.mock.patch("scipy.stats.norm.sf") as mock:
72 | mock.return_value = 0.1
73 | data = tea_tasting.datasets.make_users_data(
74 | seed=42,
75 | n_users=tea_tasting.metrics.proportion._MAX_EXACT_THRESHOLD,
76 | )
77 | metric.analyze(data, 0, 1, variant="variant")
78 | mock.assert_called_once()
79 |
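# Side note (not file content): the test above pins down the "auto" dispatch
# rule, i.e. the exact binomial test below _MAX_EXACT_THRESHOLD total
# observations and the normal approximation at or above it. A one-line
# sketch of that rule (hypothetical helper name):
def _choose_method(n_obs: int, threshold: int) -> str:
    return "binom" if n_obs < threshold else "norm"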
80 | def test_sample_ratio_analyze_binom(
81 | data_aggr: dict[object, tea_tasting.aggr.Aggregates],
82 | ):
83 | metric = tea_tasting.metrics.proportion.SampleRatio(method="binom")
84 | result = metric.analyze(data_aggr, 0, 1, variant="variant")
85 | assert result.control == 53
86 | assert result.treatment == 47
87 | assert result.pvalue == pytest.approx(0.6172994135892521)
88 |
89 | def test_sample_ratio_analyze_norm_corr(
90 | data_aggr: dict[object, tea_tasting.aggr.Aggregates],
91 | ):
92 | metric = tea_tasting.metrics.proportion.SampleRatio(method="norm", correction=True)
93 | result = metric.analyze(data_aggr, 0, 1, variant="variant")
94 | assert result.control == 53
95 | assert result.treatment == 47
96 | assert result.pvalue == pytest.approx(0.6170750774519738)
97 |
98 | def test_sample_ratio_analyze_norm_no_corr(
99 | data_aggr: dict[object, tea_tasting.aggr.Aggregates],
100 | ):
101 | metric = tea_tasting.metrics.proportion.SampleRatio(method="norm", correction=False)
102 | result = metric.analyze(data_aggr, 0, 1, variant="variant")
103 | assert result.control == 53
104 | assert result.treatment == 47
105 | assert result.pvalue == pytest.approx(0.5485062355001472)
106 |
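# Worked check (a sketch, not file content): both expected p-values above
# are consistent with a two-sided normal approximation to the binomial at an
# expected proportion of 0.5.
import scipy.stats
z_no_corr = abs(47 - 50) / (100 * 0.5 * 0.5) ** 0.5  # = 0.6
z_corr = (abs(47 - 50) - 0.5) / (100 * 0.5 * 0.5) ** 0.5  # = 0.5
assert 2 * scipy.stats.norm.sf(z_no_corr) == pytest.approx(0.5485062355001472)
assert 2 * scipy.stats.norm.sf(z_corr) == pytest.approx(0.6170750774519738)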
107 | def test_sample_ratio_analyze_aggregates(
108 | data_aggr: dict[object, tea_tasting.aggr.Aggregates],
109 | ):
110 | metric = tea_tasting.metrics.proportion.SampleRatio()
111 | with pytest.raises(NotImplementedError):
112 | metric.analyze_aggregates(data_aggr[0], data_aggr[1])
113 |
--------------------------------------------------------------------------------
/tests/metrics/test_resampling.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 |
5 | import numpy as np
6 | import pytest
7 |
8 | import tea_tasting.config
9 | import tea_tasting.datasets
10 | import tea_tasting.metrics.base
11 | import tea_tasting.metrics.resampling
12 |
13 |
14 | if TYPE_CHECKING:
15 | import numpy.typing as npt
16 | import pyarrow as pa
17 |
18 |
19 | @pytest.fixture
20 | def data_arrow() -> pa.Table:
21 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42)
22 |
23 | @pytest.fixture
24 | def data_gran(data_arrow: pa.Table) -> dict[object, pa.Table]:
25 | return tea_tasting.metrics.base.read_granular(
26 | data_arrow,
27 | ("sessions", "orders", "revenue"),
28 | variant="variant",
29 | )
30 |
31 |
32 | def test_bootstrap_init_default():
33 | metric = tea_tasting.metrics.resampling.Bootstrap("a", np.mean)
34 | assert metric.columns == "a"
35 | assert metric.statistic == np.mean
36 | assert metric.alternative == tea_tasting.config.get_config("alternative")
37 | assert metric.confidence_level == tea_tasting.config.get_config("confidence_level")
38 | assert metric.n_resamples == tea_tasting.config.get_config("n_resamples")
39 | assert metric.method == "bca"
40 | assert metric.batch is None
41 | assert metric.random_state is None
42 |
43 | def test_bootstrap_init_custom():
44 | metric = tea_tasting.metrics.resampling.Bootstrap(
45 | ("a", "b"),
46 | np.mean,
47 | alternative="greater",
48 | confidence_level=0.9,
49 | n_resamples=1000,
50 | method="basic",
51 | batch=100,
52 | random_state=42,
53 | )
54 | assert metric.columns == ("a", "b")
55 | assert metric.statistic == np.mean
56 | assert metric.alternative == "greater"
57 | assert metric.confidence_level == 0.9
58 | assert metric.n_resamples == 1000
59 | assert metric.method == "basic"
60 | assert metric.batch == 100
61 | assert metric.random_state == 42
62 |
63 |
64 | def test_bootstrap_cols():
65 | metric = tea_tasting.metrics.resampling.Bootstrap("a", np.mean)
66 | assert metric.cols == ("a",)
67 |
68 | metric = tea_tasting.metrics.resampling.Bootstrap(("a", "b"), np.mean)
69 | assert metric.cols == ("a", "b")
70 |
71 |
72 | def test_bootstrap_analyze_frame(data_arrow: pa.Table):
73 | metric = tea_tasting.metrics.resampling.Bootstrap("sessions", np.mean)
74 | result = metric.analyze(data_arrow, 0, 1, variant="variant")
75 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult)
76 |
77 |
78 | def test_bootstrap_analyze_default(data_gran: dict[object, pa.Table]):
79 | metric = tea_tasting.metrics.resampling.Bootstrap(
80 | "revenue",
81 | np.mean,
82 | n_resamples=100,
83 | random_state=42,
84 | )
85 | result = metric.analyze(data_gran, 0, 1)
86 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult)
87 | assert result.control == pytest.approx(5.029811320754717)
88 | assert result.treatment == pytest.approx(5.43)
89 | assert result.effect_size == pytest.approx(0.4001886792452831)
90 | assert result.effect_size_ci_lower == pytest.approx(-3.269396309565539)
91 | assert result.effect_size_ci_upper == pytest.approx(7.219843380442667)
92 | assert result.rel_effect_size == pytest.approx(0.07956335809137971)
93 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.5658493834599828)
94 | assert result.rel_effect_size_ci_upper == pytest.approx(1.8185473860534842)
95 |
96 | def test_bootstrap_analyze_multiple_columns(data_gran: dict[object, pa.Table]):
97 | def ratio_of_means(
98 | sample: npt.NDArray[np.number],
99 | axis: int,
100 | ) -> npt.NDArray[np.number]:
101 | stat = np.mean(sample, axis=axis) # type: ignore
102 | return stat[0] / stat[1]
103 |
104 | metric = tea_tasting.metrics.resampling.Bootstrap(
105 | ("orders", "sessions"),
106 | ratio_of_means,
107 | n_resamples=100,
108 | random_state=42,
109 | )
110 | result = metric.analyze(data_gran, 0, 1)
111 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult)
112 | assert result.control == pytest.approx(0.2857142857142857)
113 | assert result.treatment == pytest.approx(0.20224719101123595)
114 | assert result.effect_size == pytest.approx(-0.08346709470304975)
115 | assert result.effect_size_ci_lower == pytest.approx(-0.24780839493679777)
116 | assert result.effect_size_ci_upper == pytest.approx(0.07730723504025493)
117 | assert result.rel_effect_size == pytest.approx(-0.2921348314606741)
118 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.6424902672606227)
119 | assert result.rel_effect_size_ci_upper == pytest.approx(0.4374404130492657)
120 |
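# Side note (not file content): the statistic above follows the vectorized,
# scipy-style resampling signature: `sample` stacks the resampled columns
# and the reduction runs along `axis`, so each resample yields
# mean(orders) / mean(sessions).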
121 | def test_bootstrap_analyze_division_by_zero(data_gran: dict[object, pa.Table]):
122 | metric = tea_tasting.metrics.resampling.Bootstrap(
123 | "orders",
124 | np.median,
125 | n_resamples=100,
126 | random_state=42,
127 | method="basic",
128 | )
129 | result = metric.analyze(data_gran, 0, 1)
130 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult)
131 | assert result.control == 0
132 | assert result.treatment == 0
133 | assert result.effect_size == 0
134 | assert result.effect_size_ci_lower == 0
135 | assert result.effect_size_ci_upper == 0
136 | assert np.isnan(result.rel_effect_size)
137 | assert np.isnan(result.rel_effect_size_ci_lower)
138 | assert np.isnan(result.rel_effect_size_ci_upper)
139 |
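# Side note (not file content): the NaNs above are the expected outcome for
# a zero control statistic: a relative effect of the form
# treatment / control - 1 (which the expectations in the other tests imply)
# is undefined when the control value is 0.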
140 | def test_quantile(data_gran: dict[object, pa.Table]):
141 | metric = tea_tasting.metrics.resampling.Quantile(
142 | "revenue",
143 | q=0.8,
144 | alternative="greater",
145 | confidence_level=0.9,
146 | n_resamples=100,
147 | random_state=42,
148 | )
149 | assert metric.column == "revenue"
150 | assert metric.q == 0.8
151 | result = metric.analyze(data_gran, 0, 1)
152 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult)
153 | assert result.control == pytest.approx(11.972000000000001)
154 | assert result.treatment == pytest.approx(6.2820000000000045)
155 | assert result.effect_size == pytest.approx(-5.689999999999997)
156 | assert result.effect_size_ci_lower == pytest.approx(-10.875800000000003)
157 | assert result.effect_size_ci_upper == float("inf")
158 | assert result.rel_effect_size == pytest.approx(-0.47527564316739024)
159 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.8743329817472134)
160 | assert result.rel_effect_size_ci_upper == float("inf")
161 |
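# Side note (not file content): the infinite upper bounds above follow from
# alternative="greater": the confidence interval is one-sided, so only the
# lower bound is finite.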
--------------------------------------------------------------------------------
/tests/test_aggr.py:
--------------------------------------------------------------------------------
1 | # pyright: reportAttributeAccessIssue=false
2 | from __future__ import annotations
3 |
4 | from typing import TYPE_CHECKING
5 |
6 | import ibis
7 | import numpy as np
8 | import polars as pl
9 | import pyarrow as pa
10 | import pyarrow.compute as pc
11 | import pytest
12 |
13 | import tea_tasting.aggr
14 | import tea_tasting.datasets
15 |
16 |
17 | if TYPE_CHECKING:
18 | import ibis.expr.types # noqa: TC004
19 | import pandas as pd
20 |
21 |
22 | Frame = ibis.expr.types.Table | pa.Table | pd.DataFrame | pl.LazyFrame
23 |
24 |
25 | COUNT = 100
26 | MEAN = {"x": 5.0, "y": 4}
27 | VAR = {"x": 3.0, "y": 2}
28 | COV = {("x", "y"): 1.0}
29 |
30 | @pytest.fixture
31 | def aggr() -> tea_tasting.aggr.Aggregates:
32 | return tea_tasting.aggr.Aggregates(
33 | count_=COUNT,
34 | mean_=MEAN,
35 | var_=VAR,
36 | cov_=COV, # type: ignore
37 | )
38 |
39 |
40 | @pytest.fixture
41 | def data_arrow() -> pa.Table:
42 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42)
43 |
44 | @pytest.fixture
45 | def data_pandas(data_arrow: pa.Table) -> pd.DataFrame:
46 | return data_arrow.to_pandas()
47 |
48 | @pytest.fixture
49 | def data_polars(data_arrow: pa.Table) -> pl.DataFrame:
50 | return pl.from_arrow(data_arrow) # type: ignore
51 |
52 | @pytest.fixture
53 | def data_polars_lazy(data_polars: pl.DataFrame) -> pl.LazyFrame:
54 | return data_polars.lazy()
55 |
56 | @pytest.fixture
57 | def data_duckdb(data_arrow: pa.Table) -> ibis.expr.types.Table:
58 | return ibis.connect("duckdb://").create_table("data", data_arrow)
59 |
60 | @pytest.fixture
61 | def data_sqlite(data_arrow: pa.Table) -> ibis.expr.types.Table:
62 | return ibis.connect("sqlite://").create_table("data", data_arrow)
63 |
64 | @pytest.fixture(params=[
65 | "data_arrow", "data_pandas",
66 | "data_polars", "data_polars_lazy",
67 | "data_duckdb", "data_sqlite",
68 | ])
69 | def data(request: pytest.FixtureRequest) -> Frame:
70 | return request.getfixturevalue(request.param)
71 |
72 |
73 | @pytest.fixture
74 | def correct_aggr(data_arrow: pa.Table) -> tea_tasting.aggr.Aggregates:
75 | return tea_tasting.aggr.Aggregates(
76 | count_=data_arrow.num_rows,
77 | mean_={
78 | "sessions": pc.mean(data_arrow["sessions"]).as_py(),
79 | "orders": pc.mean(data_arrow["orders"]).as_py(),
80 | },
81 | var_={
82 | "sessions": pc.variance(data_arrow["sessions"], ddof=1).as_py(),
83 | "orders": pc.variance(data_arrow["orders"], ddof=1).as_py(),
84 | },
85 | cov_={
86 | ("orders", "sessions"): np.cov(
87 | data_arrow["sessions"].combine_chunks().to_numpy(zero_copy_only=False),
88 | data_arrow["orders"].combine_chunks().to_numpy(zero_copy_only=False),
89 | ddof=1,
90 | )[0, 1],
91 | },
92 | )
93 |
94 | @pytest.fixture
95 | def correct_aggrs(data_arrow: pa.Table) -> dict[int, tea_tasting.aggr.Aggregates]:
96 | variant_col = data_arrow["variant"]
97 | aggrs = {}
98 | for var in variant_col.unique().to_pylist():
99 | var_data = data_arrow.filter(pc.equal(variant_col, pa.scalar(var)))
100 | aggrs |= {var: tea_tasting.aggr.Aggregates(
101 | count_=var_data.num_rows,
102 | mean_={
103 | "sessions": pc.mean(var_data["sessions"]).as_py(),
104 | "orders": pc.mean(var_data["orders"]).as_py(),
105 | },
106 | var_={
107 | "sessions": pc.variance(var_data["sessions"], ddof=1).as_py(),
108 | "orders": pc.variance(var_data["orders"], ddof=1).as_py(),
109 | },
110 | cov_={
111 | ("orders", "sessions"): np.cov(
112 | var_data["sessions"].combine_chunks().to_numpy(zero_copy_only=False),
113 | var_data["orders"].combine_chunks().to_numpy(zero_copy_only=False),
114 | ddof=1,
115 | )[0, 1],
116 | },
117 | )}
118 | return aggrs
119 |
120 |
121 | def test_aggregates_init(aggr: tea_tasting.aggr.Aggregates):
122 | assert aggr.count_ == COUNT
123 | assert aggr.mean_ == MEAN
124 | assert aggr.var_ == VAR
125 | assert aggr.cov_ == COV
126 |
127 | def test_aggregates_calls(aggr: tea_tasting.aggr.Aggregates):
128 | assert aggr.count() == COUNT
129 | assert aggr.mean("x") == MEAN["x"]
130 | assert aggr.mean("y") == MEAN["y"]
131 | assert aggr.var("x") == VAR["x"]
132 | assert aggr.var("y") == VAR["y"]
133 | assert aggr.cov("x", "y") == COV["x", "y"]
134 |
135 | def test_aggregates_count_raises():
136 | aggr = tea_tasting.aggr.Aggregates(count_=None, mean_={}, var_={}, cov_={})
137 | with pytest.raises(RuntimeError):
138 | aggr.count()
139 |
140 | def test_aggregates_none(aggr: tea_tasting.aggr.Aggregates):
141 | assert aggr.mean(None) == 1
142 | assert aggr.var(None) == 0
143 | assert aggr.cov(None, "y") == 0
144 | assert aggr.cov("x", None) == 0
145 |
146 | def test_aggregates_ratio_var(aggr: tea_tasting.aggr.Aggregates):
147 | assert aggr.ratio_var("x", "y") == pytest.approx(0.2265625)
148 |
149 | def test_aggregates_ratio_cov():
150 | aggr = tea_tasting.aggr.Aggregates(
151 | count_=None,
152 | mean_={"a": 8, "b": 7, "c": 6, "d": 5},
153 | var_={},
154 | cov_={("a", "c"): 4, ("a", "d"): 3, ("b", "c"): 2, ("b", "d"): 1},
155 | )
156 | assert aggr.ratio_cov("a", "b", "c", "d") == pytest.approx(-0.0146938775510204)
157 |
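# Worked check (a sketch, not file content): both expected values above are
# consistent with the usual delta-method approximations for ratios of means.
mx, my, vx, vy, cxy = 5.0, 4.0, 3.0, 2.0, 1.0
assert (vx - 2 * (mx / my) * cxy + (mx / my) ** 2 * vy) / my**2 == 0.2265625
ma, mb, mc, md = 8.0, 7.0, 6.0, 5.0
cac, cad, cbc, cbd = 4.0, 3.0, 2.0, 1.0
cov_ratio = (
    cac - (mc / md) * cad - (ma / mb) * cbc + (ma * mc) / (mb * md) * cbd
) / (mb * md)
assert cov_ratio == pytest.approx(-0.0146938775510204)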
158 | def test_aggregates_add(
159 | correct_aggr: tea_tasting.aggr.Aggregates,
160 | correct_aggrs: dict[int, tea_tasting.aggr.Aggregates],
161 | ):
162 | aggrs_add = correct_aggrs[0] + correct_aggrs[1]
163 | assert aggrs_add.count_ == pytest.approx(correct_aggr.count_)
164 | assert aggrs_add.mean_ == pytest.approx(correct_aggr.mean_)
165 | assert aggrs_add.var_ == pytest.approx(correct_aggr.var_)
166 | assert aggrs_add.cov_ == pytest.approx(correct_aggr.cov_)
167 |
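# Side note (not file content): the addition test relies on exact pooling of
# the per-group summaries; for the mean this is the familiar weighted form
# (a sketch; variances and covariances pool analogously via the combined
# sums of squares):
def _pooled_mean(n0: int, m0: float, n1: int, m1: float) -> float:
    return (n0 * m0 + n1 * m1) / (n0 + n1)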
168 |
169 | def test_read_aggregates_groups(
170 | data: Frame,
171 | correct_aggrs: dict[int, tea_tasting.aggr.Aggregates],
172 | ):
173 | aggrs = tea_tasting.aggr.read_aggregates(
174 | data,
175 | group_col="variant",
176 | has_count=True,
177 | mean_cols=("sessions", "orders"),
178 | var_cols=("sessions", "orders"),
179 | cov_cols=(("sessions", "orders"),),
180 | )
181 | for i in (0, 1):
182 | assert aggrs[i].count_ == pytest.approx(correct_aggrs[i].count_)
183 | assert aggrs[i].mean_ == pytest.approx(correct_aggrs[i].mean_)
184 | assert aggrs[i].var_ == pytest.approx(correct_aggrs[i].var_)
185 | assert aggrs[i].cov_ == pytest.approx(correct_aggrs[i].cov_)
186 |
187 | def test_read_aggregates_no_groups(
188 | data: Frame,
189 | correct_aggr: tea_tasting.aggr.Aggregates,
190 | ):
191 | aggr = tea_tasting.aggr.read_aggregates(
192 | data,
193 | group_col=None,
194 | has_count=True,
195 | mean_cols=("sessions", "orders"),
196 | var_cols=("sessions", "orders"),
197 | cov_cols=(("sessions", "orders"),),
198 | )
199 | assert aggr.count_ == pytest.approx(correct_aggr.count_)
200 | assert aggr.mean_ == pytest.approx(correct_aggr.mean_)
201 | assert aggr.var_ == pytest.approx(correct_aggr.var_)
202 | assert aggr.cov_ == pytest.approx(correct_aggr.cov_)
203 |
204 | def test_read_aggregates_no_count(data_arrow: pa.Table):
205 | aggr = tea_tasting.aggr.read_aggregates(
206 | data_arrow,
207 | group_col=None,
208 | has_count=False,
209 | mean_cols=("sessions", "orders"),
210 | var_cols=(),
211 | cov_cols=(),
212 | )
213 | assert aggr.count_ is None
214 | assert aggr.var_ == {}
215 | assert aggr.cov_ == {}
216 |
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 |
5 | import pytest
6 |
7 | import tea_tasting.config
8 |
9 |
10 | if TYPE_CHECKING:
11 | from collections.abc import Iterator
12 |
13 |
14 | @pytest.fixture
15 | def reset_config() -> Iterator[None]:
16 | try:
17 | yield
18 | finally:
19 | tea_tasting.config._config_var.set(tea_tasting.config._DEFAULT_CONFIG.copy())
20 |
21 |
22 | @pytest.mark.usefixtures("reset_config")
23 | def test_get_config():
24 | config = tea_tasting.config.get_config()
25 | assert config == tea_tasting.config._config_var.get()
26 | config["equal_var"] = not config["equal_var"]
27 | assert config != tea_tasting.config._config_var.get()
28 |
29 | assert (
30 | tea_tasting.config.get_config("equal_var") ==
31 | tea_tasting.config._config_var.get()["equal_var"]
32 | )
33 |
34 |
35 | @pytest.mark.usefixtures("reset_config")
36 | def test_set_config():
37 | tea_tasting.config.set_config(equal_var=True)
38 | assert tea_tasting.config._config_var.get()["equal_var"] is True
39 |
40 | tea_tasting.config.set_config(equal_var=False)
41 | assert tea_tasting.config._config_var.get()["equal_var"] is False
42 |
43 |
44 | @pytest.mark.usefixtures("reset_config")
45 | def test_config_context():
46 | old_equal_var = tea_tasting.config._config_var.get()["equal_var"]
47 |
48 | with tea_tasting.config.config_context(equal_var=not old_equal_var):
49 | assert tea_tasting.config._config_var.get()["equal_var"] is not old_equal_var
50 |
51 | assert tea_tasting.config._config_var.get()["equal_var"] is old_equal_var
52 |
--------------------------------------------------------------------------------
/tests/test_datasets.py:
--------------------------------------------------------------------------------
1 | # pyright: reportAttributeAccessIssue=false
2 | from __future__ import annotations
3 |
4 | import pandas as pd
5 | import polars as pl
6 | import pyarrow as pa
7 | import pyarrow.compute as pc
8 |
9 | import tea_tasting.datasets
10 |
11 |
12 | def test_make_users_data_default():
13 | n_users = 100
14 | data = tea_tasting.datasets.make_users_data(seed=42, n_users=n_users)
15 | assert isinstance(data, pa.Table)
16 | assert data.column_names == ["user", "variant", "sessions", "orders", "revenue"]
17 | assert data.num_rows == n_users
18 | assert pc.count_distinct(data["user"]).as_py() == n_users
19 | assert pc.count_distinct(data["variant"]).as_py() == 2
20 | assert pc.min(data["sessions"]).as_py() > 0
21 | assert pc.min(data["orders"]).as_py() >= 0
22 | assert pc.min(data["revenue"]).as_py() >= 0
23 | assert pc.max(pc.subtract(data["orders"], data["sessions"])).as_py() <= 0
24 | assert int(pc.min(pc.equal(
25 | pc.greater(data["revenue"], 0),
26 | pc.greater(data["orders"], 0),
27 | )).as_py()) == 1
28 |
29 | def test_make_users_data_pandas():
30 | n_users = 100
31 | data = tea_tasting.datasets.make_users_data(
32 | seed=42, n_users=n_users, return_type="pandas")
33 | assert isinstance(data, pd.DataFrame)
34 | assert data.columns.to_list() == [
35 | "user", "variant", "sessions", "orders", "revenue"]
36 | assert data.shape[0] == n_users
37 |
38 | def test_make_users_data_polars():
39 | n_users = 100
40 | data = tea_tasting.datasets.make_users_data(
41 | seed=42, n_users=n_users, return_type="polars")
42 | assert isinstance(data, pl.DataFrame)
43 | assert data.columns == [
44 | "user", "variant", "sessions", "orders", "revenue"]
45 | assert data.shape[0] == n_users
46 |
47 |
48 | def test_make_users_data_covariates():
49 | n_users = 100
50 | data = tea_tasting.datasets.make_users_data(
51 | seed=42, covariates=True, n_users=n_users)
52 | assert isinstance(data, pa.Table)
53 | assert data.column_names == [
54 | "user", "variant", "sessions", "orders", "revenue",
55 | "sessions_covariate", "orders_covariate", "revenue_covariate",
56 | ]
57 | assert pc.min(data["sessions_covariate"]).as_py() >= 0
58 | assert pc.min(data["orders_covariate"]).as_py() >= 0
59 | assert pc.min(data["revenue_covariate"]).as_py() >= 0
60 | assert pc.max(pc.subtract(
61 | data["orders_covariate"],
62 | data["sessions_covariate"],
63 | )).as_py() <= 0
64 | assert int(pc.min(pc.equal(
65 | pc.greater(data["revenue_covariate"], 0),
66 | pc.greater(data["orders_covariate"], 0),
67 | )).as_py()) == 1
68 |
69 |
70 | def test_make_sessions_data_default():
71 | n_users = 100
72 | data = tea_tasting.datasets.make_sessions_data(seed=42, n_users=n_users)
73 | assert isinstance(data, pa.Table)
74 | assert data.column_names == ["user", "variant", "sessions", "orders", "revenue"]
75 | assert data.num_rows > n_users
76 | assert pc.count_distinct(data["user"]).as_py() == n_users
77 | assert pc.count_distinct(data["variant"]).as_py() == 2
78 | assert pc.min(data["sessions"]).as_py() == 1
79 | assert pc.max(data["sessions"]).as_py() == 1
80 | assert pc.min(data["orders"]).as_py() >= 0
81 | assert pc.min(data["revenue"]).as_py() >= 0
82 | assert pc.max(pc.subtract(data["orders"], data["sessions"])).as_py() <= 0
83 | assert int(pc.min(pc.equal(
84 | pc.greater(data["revenue"], 0),
85 | pc.greater(data["orders"], 0),
86 | )).as_py()) == 1
87 |
88 | def test_make_sessions_data_pandas():
89 | n_users = 100
90 | data = tea_tasting.datasets.make_sessions_data(
91 | seed=42, n_users=n_users, return_type="pandas")
92 | assert isinstance(data, pd.DataFrame)
93 | assert data.columns.to_list() == [
94 | "user", "variant", "sessions", "orders", "revenue"]
95 | assert data.shape[0] > n_users
96 |
97 | def test_make_sessions_data_polars():
98 | n_users = 100
99 | data = tea_tasting.datasets.make_sessions_data(
100 | seed=42, n_users=n_users, return_type="polars")
101 | assert isinstance(data, pl.DataFrame)
102 | assert data.columns == [
103 | "user", "variant", "sessions", "orders", "revenue"]
104 | assert data.shape[0] > n_users
105 |
106 |
107 | def test_make_sessions_data_covariates():
108 | n_users = 100
109 | data = tea_tasting.datasets.make_sessions_data(
110 | seed=42, covariates=True, n_users=n_users)
111 | assert isinstance(data, pa.Table)
112 | assert data.column_names == [
113 | "user", "variant", "sessions", "orders", "revenue",
114 | "sessions_covariate", "orders_covariate", "revenue_covariate",
115 | ]
116 | assert pc.min(data["sessions_covariate"]).as_py() >= 0
117 | assert pc.min(data["orders_covariate"]).as_py() >= 0
118 | assert pc.min(data["revenue_covariate"]).as_py() >= 0
119 | assert pc.max(pc.subtract(
120 | data["orders_covariate"],
121 | data["sessions_covariate"],
122 | )).as_py() <= 0
123 | assert int(pc.min(pc.equal(
124 | pc.greater(data["revenue_covariate"], 0),
125 | pc.greater(data["orders_covariate"], 0),
126 | )).as_py()) == 1
127 |
--------------------------------------------------------------------------------
/tests/test_version.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import importlib
4 | import importlib.metadata
5 | import unittest.mock
6 |
7 | import tea_tasting.version
8 |
9 |
10 | def test_version():
11 | assert isinstance(tea_tasting.version.__version__, str)
12 |
13 | with (
14 | unittest.mock.patch(
15 | "tea_tasting.version.importlib.metadata.version") as version,
16 | unittest.mock.patch("tea_tasting.version.importlib.resources.files") as files,
17 | ):
18 | (
19 | files.return_value
20 | .joinpath.return_value
21 | .read_text.return_value
22 | .strip.return_value
23 | ) = "version"
24 |
25 | version.side_effect = importlib.metadata.PackageNotFoundError("Not found")
26 | importlib.reload(tea_tasting.version)
27 | assert isinstance(tea_tasting.version.__version__, str)
28 |
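# Side note (not file content): reloading the module is what re-runs the
# try/except in version.py under the mocks; without the reload, the
# __version__ computed at first import would not be affected by the patched
# importlib functions.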
--------------------------------------------------------------------------------