├── .github └── workflows │ ├── ci.yml │ ├── docs.yml │ └── release.yml ├── .gitignore ├── .markdownlint.yaml ├── LICENSE ├── README.md ├── docs ├── CNAME ├── api │ ├── aggr.md │ ├── config.md │ ├── datasets.md │ ├── experiment.md │ ├── index.md │ ├── metrics │ │ ├── base.md │ │ ├── index.md │ │ ├── mean.md │ │ ├── proportion.md │ │ └── resampling.md │ ├── multiplicity.md │ └── utils.md ├── assets │ ├── tea-cup-black.svg │ ├── tea-cup-white-on-black.svg │ └── tea-cup-white.svg ├── custom-metrics.md ├── data-backends.md ├── index.md ├── javascripts │ └── override-copy.js ├── multiple-testing.md ├── power-analysis.md ├── simulated-experiments.md ├── stylesheets │ └── extra.css └── user-guide.md ├── examples ├── README.md ├── custom-metrics.py ├── data-backends.py ├── multiple-testing.py ├── power-analysis.py ├── simulated-experiments.py └── user-guide.py ├── mkdocs.yml ├── pyproject.toml ├── src ├── _internal │ ├── __init__.py │ ├── create_examples.py │ ├── external_links.py │ └── strip_doctest_artifacts.py └── tea_tasting │ ├── __init__.py │ ├── aggr.py │ ├── config.py │ ├── datasets.py │ ├── experiment.py │ ├── metrics │ ├── __init__.py │ ├── base.py │ ├── mean.py │ ├── proportion.py │ └── resampling.py │ ├── multiplicity.py │ ├── utils.py │ └── version.py └── tests ├── __init__.py ├── metrics ├── __init__.py ├── test_base.py ├── test_mean.py ├── test_proportion.py └── test_resampling.py ├── test_aggr.py ├── test_config.py ├── test_datasets.py ├── test_experiment.py ├── test_multiplicity.py ├── test_utils.py └── test_version.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | paths: 5 | - '**.py' 6 | - '.github/workflows/ci.yml' 7 | - 'pyproject.toml' 8 | push: 9 | branches: [main] 10 | paths: 11 | - '**.py' 12 | - '.github/workflows/ci.yml' 13 | - 'pyproject.toml' 14 | workflow_dispatch: 15 | jobs: 16 | doctest: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [ubuntu-latest] 21 | python-version: ["3.10"] 22 | steps: 23 | - name: checkout 24 | uses: actions/checkout@v4 25 | - name: set up uv 26 | uses: astral-sh/setup-uv@v6 27 | with: 28 | activate-environment: true 29 | cache-suffix: "${{ matrix.python-version }}-test" 30 | enable-cache: true 31 | python-version: ${{ matrix.python-version }} 32 | - name: install dependencies 33 | run: uv sync --group test 34 | - name: doctest with pytest 35 | run: | 36 | pytest \ 37 | --doctest-continue-on-failure \ 38 | --doctest-glob=*.md \ 39 | --doctest-modules \ 40 | --ignore=examples/ \ 41 | --ignore=tests/ \ 42 | --ignore-glob=src/_* \ 43 | test-lowest: 44 | strategy: 45 | matrix: 46 | os: [ubuntu-latest] 47 | python-version: ["3.10"] 48 | runs-on: ${{ matrix.os }} 49 | steps: 50 | - name: checkout 51 | uses: actions/checkout@v4 52 | - name: set up uv 53 | uses: astral-sh/setup-uv@v6 54 | with: 55 | activate-environment: true 56 | cache-suffix: "${{ matrix.python-version }}-test-lowest" 57 | enable-cache: true 58 | python-version: ${{ matrix.python-version }} 59 | - name: install dependencies 60 | run: uv sync --group test --resolution lowest-direct 61 | - name: test-lowest with pytest 62 | run: pytest 63 | test: 64 | strategy: 65 | matrix: 66 | os: [ubuntu-latest, macos-13, windows-latest] 67 | python-version: ["3.10", "3.11", "3.12", "3.13"] 68 | runs-on: ${{ matrix.os }} 69 | steps: 70 | - name: checkout 71 | uses: actions/checkout@v4 72 | - name: set up uv 73 | uses: astral-sh/setup-uv@v6 74 | with: 75 | 
activate-environment: true 76 | cache-suffix: "${{ matrix.python-version }}-test" 77 | enable-cache: true 78 | python-version: ${{ matrix.python-version }} 79 | - name: install dependencies 80 | run: uv sync --group test 81 | - name: test with pytest 82 | run: coverage run -m pytest 83 | - name: convert coverage report 84 | run: coverage xml 85 | - name: upload coverage reports to codecov 86 | uses: codecov/codecov-action@v5 87 | with: 88 | files: ./coverage.xml 89 | env: 90 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 91 | lint: 92 | runs-on: ubuntu-latest 93 | strategy: 94 | matrix: 95 | python-version: ["3.10", "3.11", "3.12", "3.13"] 96 | steps: 97 | - name: checkout 98 | uses: actions/checkout@v4 99 | - name: set up uv 100 | uses: astral-sh/setup-uv@v6 101 | with: 102 | activate-environment: true 103 | cache-suffix: "${{ matrix.python-version }}-lint" 104 | enable-cache: true 105 | python-version: ${{ matrix.python-version }} 106 | - name: install dependencies 107 | run: uv sync --group lint --group test 108 | - name: check with ruff 109 | run: ruff check . 110 | - name: check with pyright 111 | run: pyright 112 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | release: 4 | types: [published] 5 | workflow_dispatch: 6 | permissions: 7 | contents: write 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout 13 | uses: actions/checkout@v4 14 | - name: set up uv 15 | uses: astral-sh/setup-uv@v6 16 | with: 17 | activate-environment: true 18 | python-version: 3.12 19 | - name: install dependencies 20 | run: uv sync --group docs 21 | - name: build and publish docs 22 | run: mkdocs gh-deploy --force 23 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | publish: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | id-token: write 10 | steps: 11 | - name: checkout 12 | uses: actions/checkout@v4 13 | - name: set up uv 14 | uses: astral-sh/setup-uv@v6 15 | with: 16 | activate-environment: true 17 | python-version: 3.12 18 | - name: install dependencies 19 | run: uv sync --no-dev 20 | - name: build 21 | run: uv build 22 | - name: publish 23 | run: uv publish 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | # VSCode 165 | .vscode/ 166 | 167 | # Version file 168 | src/tea_tasting/_version.txt 169 | 170 | # uv lockfile 171 | uv.lock 172 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | MD007: 2 | indent: 4 3 | 4 | MD013: false 5 | MD046: false 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Evgeny Ivanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tea-tasting: statistical analysis of A/B tests 2 | 3 | [![CI](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml) 4 | [![Docs](https://github.com/e10v/tea-tasting/actions/workflows/docs.yml/badge.svg)](https://tea-tasting.e10v.me/) 5 | [![Coverage](https://codecov.io/github/e10v/tea-tasting/coverage.svg?branch=main)](https://codecov.io/gh/e10v/tea-tasting) 6 | [![License](https://img.shields.io/github/license/e10v/tea-tasting)](https://github.com/e10v/tea-tasting/blob/main/LICENSE) 7 | [![Package Status](https://img.shields.io/pypi/status/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 8 | [![Version](https://img.shields.io/pypi/v/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 9 | [![PyPI Python Versions](https://img.shields.io/pypi/pyversions/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 10 | 11 | tea-tasting is a Python package for the statistical analysis of A/B tests featuring: 12 | 13 | - Student's t-test, Z-test, bootstrap, and quantile metrics out of the box. 14 | - Extensible API that lets you define and use statistical tests of your choice. 15 | - [Delta method](https://alexdeng.github.io/public/files/kdd2018-dm.pdf) for ratio metrics. 16 | - Variance reduction using [CUPED](https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf)/[CUPAC](https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/), which can be combined with the Delta method for ratio metrics. 
17 | - Confidence intervals for both absolute and percentage changes. 18 | - Checks for sample-ratio mismatches. 19 | - Power analysis. 20 | - Multiple hypothesis testing (family-wise error rate and false discovery rate). 21 | - Simulated experiments, including A/A tests. 22 | 23 | tea-tasting calculates statistics directly within data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). This approach eliminates the need to import granular data into a Python environment. 24 | 25 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. 26 | 27 | ## Installation 28 | 29 | ```bash 30 | uv pip install tea-tasting 31 | ``` 32 | 33 | ## Basic example 34 | 35 | ```pycon 36 | >>> import tea_tasting as tt 37 | 38 | >>> data = tt.make_users_data(seed=42) 39 | >>> experiment = tt.Experiment( 40 | ... sessions_per_user=tt.Mean("sessions"), 41 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 42 | ... orders_per_user=tt.Mean("orders"), 43 | ... revenue_per_user=tt.Mean("revenue"), 44 | ... ) 45 | >>> result = experiment.analyze(data) 46 | >>> result 47 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 48 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 49 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 50 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 51 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 52 | 53 | ``` 54 | 55 | Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). Additionally, see the guides on more specific topics: 56 | 57 | - [Data backends](https://tea-tasting.e10v.me/data-backends/). 58 | - [Power analysis](https://tea-tasting.e10v.me/power-analysis/). 59 | - [Multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/). 60 | - [Custom metrics](https://tea-tasting.e10v.me/custom-metrics/). 61 | - [Simulated experiments](https://tea-tasting.e10v.me/simulated-experiments/). 62 | 63 | ## Examples 64 | 65 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run in your local environment, or you can run them as WASM notebooks in the online playground. 66 | 67 | ### Run in a local environment 68 | 69 | To run the examples in your local environment, clone the repository and change the directory: 70 | 71 | ```bash 72 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting 73 | ``` 74 | 75 | Install marimo, tea-tasting, and other packages used in the examples: 76 | 77 | ```bash 78 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb] 79 | ``` 80 | 81 | Launch the notebook server: 82 | 83 | ```bash 84 | uv run marimo edit examples 85 | ``` 86 | 87 | Now you can choose and run the example notebooks. 88 | 89 | ### Run in the online playground 90 | 91 | To run the examples as WASM notebooks in the online playground, open the following links: 92 | 93 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true). 94 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true). 
95 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true). 96 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true). 97 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true). 98 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true). 99 | 100 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular: 101 | 102 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html). 103 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules). 104 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis. 105 | 106 | ## Package name 107 | 108 | The package name "tea-tasting" is a play on words that refers to two subjects: 109 | 110 | - [Lady tasting tea](https://en.wikipedia.org/wiki/Lady_tasting_tea) is a famous experiment devised by Ronald Fisher. In this experiment, Fisher developed the null hypothesis significance testing framework to analyze a lady's claim that she could discern whether the tea or the milk was added first to the cup. 111 | - "tea-tasting" phonetically resembles "t-testing", referencing Student's t-test, a statistical method developed by William Gosset.
112 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | tea-tasting.e10v.me 2 | -------------------------------------------------------------------------------- /docs/api/aggr.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.aggr 2 | -------------------------------------------------------------------------------- /docs/api/config.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.config 2 | -------------------------------------------------------------------------------- /docs/api/datasets.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.datasets 2 | options: 3 | members_order: source 4 | -------------------------------------------------------------------------------- /docs/api/experiment.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.experiment 2 | -------------------------------------------------------------------------------- /docs/api/index.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting 2 | -------------------------------------------------------------------------------- /docs/api/metrics/base.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.base 2 | -------------------------------------------------------------------------------- /docs/api/metrics/index.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics 2 | -------------------------------------------------------------------------------- /docs/api/metrics/mean.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.mean 2 | -------------------------------------------------------------------------------- /docs/api/metrics/proportion.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.proportion 2 | -------------------------------------------------------------------------------- /docs/api/metrics/resampling.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.metrics.resampling 2 | -------------------------------------------------------------------------------- /docs/api/multiplicity.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.multiplicity 2 | -------------------------------------------------------------------------------- /docs/api/utils.md: -------------------------------------------------------------------------------- 1 | ::: tea_tasting.utils 2 | options: 3 | group_by_category: false 4 | members_order: source 5 | -------------------------------------------------------------------------------- /docs/assets/tea-cup-black.svg: -------------------------------------------------------------------------------- [SVG markup omitted from this export] -------------------------------------------------------------------------------- /docs/assets/tea-cup-white-on-black.svg: -------------------------------------------------------------------------------- [SVG markup omitted from this export] --------------------------------------------------------------------------------
/docs/assets/tea-cup-white.svg: -------------------------------------------------------------------------------- [SVG markup omitted from this export] -------------------------------------------------------------------------------- /docs/custom-metrics.md: -------------------------------------------------------------------------------- 1 | # Custom metrics 2 | 3 | ## Intro 4 | 5 | tea-tasting supports Student's t-test, Z-test, and [some other statistical tests](api/metrics/index.md) out of the box. However, you might want to analyze an experiment using other statistical criteria. In this case, you can define a custom metric with a statistical test of your choice. 6 | 7 | In tea-tasting, there are two types of metrics: 8 | 9 | - Metrics that require only aggregated statistics for the analysis. 10 | - Metrics that require granular data for the analysis. 11 | 12 | This guide explains how to define a custom metric for each type. 13 | 14 | First, let's import all the required modules and prepare the data: 15 | 16 | ```pycon 17 | >>> from typing import Literal, NamedTuple 18 | >>> import numpy as np 19 | >>> import pyarrow as pa 20 | >>> import pyarrow.compute as pc 21 | >>> import scipy.stats 22 | >>> import tea_tasting as tt 23 | >>> import tea_tasting.aggr 24 | >>> import tea_tasting.config 25 | >>> import tea_tasting.metrics 26 | >>> import tea_tasting.utils 27 | 28 | >>> data = tt.make_users_data(seed=42) 29 | >>> data = data.append_column( 30 | ... "has_order", 31 | ... pc.greater(data["orders"], 0).cast(pa.int64()), 32 | ... ) 33 | >>> data 34 | pyarrow.Table 35 | user: int64 36 | variant: int64 37 | sessions: int64 38 | orders: int64 39 | revenue: double 40 | has_order: int64 41 | ---- 42 | user: [[0,1,2,3,4,...,3995,3996,3997,3998,3999]] 43 | variant: [[1,0,1,1,0,...,0,0,0,0,0]] 44 | sessions: [[2,2,2,2,1,...,2,2,3,1,5]] 45 | orders: [[1,1,1,1,1,...,0,0,0,0,2]] 46 | revenue: [[9.17,6.43,7.94,15.93,7.14,...,0,0,0,0,17.16]] 47 | has_order: [[1,1,1,1,1,...,0,0,0,0,1]] 48 | 49 | ``` 50 | 51 | This guide uses PyArrow as the data backend, but the approach is valid for other backends as well. See the [guide on data backends](data-backends.md) for more details. 52 | 53 | ## Metrics based on aggregated statistics 54 | 55 | Let's define a metric that performs a proportion test, [G-test](https://en.wikipedia.org/wiki/G-test) or [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test), on a binary column (with values `0` or `1`). 56 | 57 | The first step is defining a result class. It should be a named tuple or a dictionary. 58 | 59 | ```pycon 60 | >>> class ProportionResult(NamedTuple): 61 | ... control: float 62 | ... treatment: float 63 | ... effect_size: float 64 | ... rel_effect_size: float 65 | ... pvalue: float 66 | ... statistic: float 67 | ... 68 | 69 | ``` 70 | 71 | The second step is defining the metric class itself. A metric based on aggregated statistics should be a subclass of [`MetricBaseAggregated`](api/metrics/base.md#tea_tasting.metrics.base.MetricBaseAggregated). `MetricBaseAggregated` is a generic class with the result class as a type variable. 72 | 73 | The metric should have the following methods and properties defined: 74 | 75 | - Method `__init__` checks and saves metric parameters. 76 | - Property `aggr_cols` returns columns to be aggregated for the analysis, for each type of statistic. 77 | - Method `analyze_aggregates` analyzes the metric using aggregated statistics.
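In outline, the three members fit together as in the following structural sketch (`MyMetric` and `MyResult` are illustrative names only; the full, working implementation is defined and discussed next):

```python
from typing import NamedTuple

import tea_tasting.aggr
import tea_tasting.metrics


class MyResult(NamedTuple):  # illustrative result class
    control: float
    treatment: float


class MyMetric(tea_tasting.metrics.MetricBaseAggregated[MyResult]):  # illustrative metric
    def __init__(self, column: str) -> None:
        # Check and save the metric parameters.
        self.column = column

    @property
    def aggr_cols(self) -> tea_tasting.metrics.AggrCols:
        # Declare the aggregated statistics tea-tasting should query per variant.
        return tea_tasting.metrics.AggrCols(has_count=True, mean_cols=(self.column,))

    def analyze_aggregates(
        self,
        control: tea_tasting.aggr.Aggregates,
        treatment: tea_tasting.aggr.Aggregates,
    ) -> MyResult:
        # Perform the statistical test using only the aggregated statistics.
        return MyResult(
            control=control.mean(self.column),
            treatment=treatment.mean(self.column),
        )
```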
78 | 79 | Let's define the metric and discuss each method in detail: 80 | 81 | ```pycon 82 | >>> class Proportion(tea_tasting.metrics.MetricBaseAggregated[ProportionResult]): 83 | ... def __init__( 84 | ... self, 85 | ... column: str, 86 | ... *, 87 | ... correction: bool = True, 88 | ... method: Literal["g-test", "pearson"] = "g-test", 89 | ... ) -> None: 90 | ... self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 91 | ... self.correction = tea_tasting.utils.auto_check(correction, "correction") 92 | ... self.method = tea_tasting.utils.check_scalar( 93 | ... method, "method", typ=str, in_={"g-test", "pearson"}) 94 | ... @property 95 | ... def aggr_cols(self) -> tea_tasting.metrics.AggrCols: 96 | ... return tea_tasting.metrics.AggrCols( 97 | ... has_count=True, 98 | ... mean_cols=(self.column,), 99 | ... ) 100 | ... def analyze_aggregates( 101 | ... self, 102 | ... control: tea_tasting.aggr.Aggregates, 103 | ... treatment: tea_tasting.aggr.Aggregates, 104 | ... ) -> ProportionResult: 105 | ... observed = np.empty(shape=(2, 2), dtype=np.int64) 106 | ... observed[0, 0] = round(control.count() * control.mean(self.column)) 107 | ... observed[1, 0] = control.count() - observed[0, 0] 108 | ... observed[0, 1] = round(treatment.count() * treatment.mean(self.column)) 109 | ... observed[1, 1] = treatment.count() - observed[0, 1] 110 | ... res = scipy.stats.chi2_contingency( 111 | ... observed=observed, 112 | ... correction=self.correction, 113 | ... lambda_=int(self.method == "pearson"), 114 | ... ) 115 | ... return ProportionResult( 116 | ... control=control.mean(self.column), 117 | ... treatment=treatment.mean(self.column), 118 | ... effect_size=treatment.mean(self.column) - control.mean(self.column), 119 | ... rel_effect_size=treatment.mean(self.column)/control.mean(self.column) - 1, 120 | ... pvalue=res.pvalue, 121 | ... statistic=res.statistic, 122 | ... ) 123 | ... 124 | 125 | ``` 126 | 127 | Method `__init__` saves metric parameters to be used in the analysis. You can use utility functions [`check_scalar`](api/utils.md#tea_tasting.utils.check_scalar) and [`auto_check`](api/utils.md#tea_tasting.utils.auto_check) to check parameter values. 128 | 129 | Property `aggr_cols` returns an instance of [`AggrCols`](api/metrics/base.md#tea_tasting.metrics.base.AggrCols). Analysis of proportion requires the number of rows (`has_count=True`) and the average value for the column of interest (`mean_cols=(self.column,)`) for each variant. 130 | 131 | Method `analyze_aggregates` accepts two parameters: `control` and `treatment` data as instances of class [`Aggregates`](api/aggr.md#tea_tasting.aggr.Aggregates). They contain values for statistics and columns specified in `aggr_cols`. 132 | 133 | Method `analyze_aggregates` returns an instance of `ProportionResult`, defined earlier, with the analysis result. 134 | 135 | Now we can analyze the proportion of users who created at least one order during the experiment. For comparison, let's also add a metric that performs a Z-test on the same column. 136 | 137 | ```pycon 138 | >>> experiment_prop = tt.Experiment( 139 | ... prop_users_with_orders=Proportion("has_order"), 140 | ... mean_users_with_orders=tt.Mean("has_order", use_t=False), 141 | ...
) 142 | >>> experiment_prop.analyze(data) 143 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 144 | prop_users_with_orders 0.345 0.384 11% [-, -] 0.0117 145 | mean_users_with_orders 0.345 0.384 11% [2.5%, 21%] 0.0106 146 | 147 | ``` 148 | 149 | ## Metrics based on granular data 150 | 151 | Now let's define a metric that performs the Mann-Whitney U test. While it's possible to use the aggregated sum of ranks for the test, this example uses granular data for analysis. 152 | 153 | The result class: 154 | 155 | ```pycon 156 | >>> class MannWhitneyUResult(NamedTuple): 157 | ... pvalue: float 158 | ... statistic: float 159 | ... 160 | 161 | ``` 162 | 163 | A metric that analyzes granular data should be a subclass of [`MetricBaseGranular`](api/metrics/base.md#tea_tasting.metrics.base.MetricBaseGranular). `MetricBaseGranular` is a generic class with the result class as a type variable. 164 | 165 | The metric should have the following methods and properties defined: 166 | 167 | - Method `__init__` checks and saves metric parameters. 168 | - Property `cols` returns columns to be fetched for an analysis. 169 | - Method `analyze_granular` analyzes the metric using granular data. 170 | 171 | ```pycon 172 | >>> class MannWhitneyU(tea_tasting.metrics.MetricBaseGranular[MannWhitneyUResult]): 173 | ... def __init__( 174 | ... self, 175 | ... column: str, 176 | ... *, 177 | ... correction: bool = True, 178 | ... alternative: Literal["two-sided", "less", "greater"] | None = None, 179 | ... ) -> None: 180 | ... self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 181 | ... self.correction = tea_tasting.utils.auto_check(correction, "correction") 182 | ... self.alternative = ( 183 | ... tea_tasting.utils.auto_check(alternative, "alternative") 184 | ... if alternative is not None 185 | ... else tea_tasting.config.get_config("alternative") 186 | ... ) 187 | ... @property 188 | ... def cols(self) -> tuple[str]: 189 | ... return (self.column,) 190 | ... def analyze_granular( 191 | ... self, 192 | ... control: pa.Table, 193 | ... treatment: pa.Table, 194 | ... ) -> MannWhitneyUResult: 195 | ... res = scipy.stats.mannwhitneyu( 196 | ... treatment[self.column].combine_chunks().to_numpy(zero_copy_only=False), 197 | ... control[self.column].combine_chunks().to_numpy(zero_copy_only=False), 198 | ... use_continuity=self.correction, 199 | ... alternative=self.alternative, 200 | ... ) 201 | ... return MannWhitneyUResult( 202 | ... pvalue=res.pvalue, 203 | ... statistic=res.statistic, 204 | ... ) 205 | ... 206 | 207 | ``` 208 | 209 | Property `cols` should return a sequence of strings. 210 | 211 | Method `analyze_granular` accepts two parameters: control and treatment data as PyArrow Tables. Even with a [data backend](data-backends.md) different from PyArrow, tea-tasting retrieves the data and transforms it into a PyArrow Table. 212 | 213 | Method `analyze_granular` returns an instance of `MannWhitneyUResult`, defined earlier, with the analysis result. 214 | 215 | Now we can perform the Mann-Whitney U test: 216 | 217 | ```pycon 218 | >>> experiment_mwu = tt.Experiment( 219 | ... mwu_orders=MannWhitneyU("orders"), 220 | ... mwu_revenue=MannWhitneyU("revenue"), 221 | ...
) 222 | >>> result_mwu = experiment_mwu.analyze(data) 223 | >>> result_mwu.with_keys(("metric", "pvalue", "statistic")) 224 | metric pvalue statistic 225 | mwu_orders 0.0263 2069092 226 | mwu_revenue 0.0300 2068060 227 | 228 | ``` 229 | 230 | ## Analyzing two types of metrics together 231 | 232 | It's also possible to analyze two types of metrics in one experiment: 233 | 234 | ```pycon 235 | >>> experiment = tt.Experiment( 236 | ... prop_users_with_orders=Proportion("has_order"), 237 | ... mean_users_with_orders=tt.Mean("has_order"), 238 | ... mwu_orders=MannWhitneyU("orders"), 239 | ... mwu_revenue=MannWhitneyU("revenue"), 240 | ... ) 241 | >>> experiment.analyze(data) 242 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 243 | prop_users_with_orders 0.345 0.384 11% [-, -] 0.0117 244 | mean_users_with_orders 0.345 0.384 11% [2.5%, 21%] 0.0106 245 | mwu_orders - - - [-, -] 0.0263 246 | mwu_revenue - - - [-, -] 0.0300 247 | 248 | ``` 249 | 250 | In this case, tea-tasting performs two queries on the experimental data: 251 | 252 | - With aggregated statistics required for analysis of metrics of type `MetricBaseAggregated`. 253 | - With detailed data with columns required for analysis of metrics of type `MetricBaseGranular`. 254 | 255 | ## Recommendations 256 | 257 | Follow these recommendations when defining custom metrics: 258 | 259 | - Use parameter and attribute names consistent with the ones that are already defined in tea-tasting. For example, use `pvalue` instead of `p_value` or `correction` instead of `use_continuity`. 260 | - End confidence interval boundary names with `"_ci_lower"` and `"_ci_upper"`. 261 | - During initialization, save parameter values in metric attributes using the same names. For example, use `self.correction = correction` instead of `self.use_continuity = correction`. 262 | - Use global settings as default values for standard parameters, such as `alternative` or `confidence_level`. See the [reference](api/config.md#tea_tasting.config.config_context) for the full list of standard parameters. You can also define and use your own global parameters. 263 | -------------------------------------------------------------------------------- /docs/data-backends.md: -------------------------------------------------------------------------------- 1 | # Data backends 2 | 3 | ## Intro 4 | 5 | tea-tasting supports a wide range of data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). Ibis is a DataFrame API to various data backends. 6 | 7 | Many statistical tests, such as the Student's t-test or the Z-test, require only aggregated data for analysis. For these tests, tea-tasting retrieves only aggregated statistics like mean and variance instead of downloading all detailed data. 8 | 9 | For example, if the raw experimental data are stored in ClickHouse, it's faster and more efficient to calculate counts, averages, variances, and covariances directly in ClickHouse rather than fetching granular data and performing aggregations in a Python environment. 10 | 11 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. Narwhals is a compatibility layer between dataframe libraries. 12 | 13 | This guide: 14 | 15 | - Shows how to use tea-tasting with a data backend of your choice for the analysis of an experiment. 
16 | - Explains some internals of how tea-tasting uses Ibis to work with data backends. 17 | 18 | ## Demo database 19 | 20 | /// admonition | Note 21 | 22 | This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. Install these packages in addition to tea-tasting to reproduce the examples: 23 | 24 | ```bash 25 | uv pip install ibis-framework[duckdb] polars 26 | ``` 27 | 28 | /// 29 | 30 | First, let's prepare a demo database: 31 | 32 | ```pycon 33 | >>> import ibis 34 | >>> import polars as pl 35 | >>> import tea_tasting as tt 36 | 37 | >>> users_data = tt.make_users_data(seed=42) 38 | >>> con = ibis.connect("duckdb://") 39 | >>> con.create_table("users_data", users_data) 40 | DatabaseTable: memory.main.users_data 41 | user int64 42 | variant int64 43 | sessions int64 44 | orders int64 45 | revenue float64 46 | 47 | ``` 48 | 49 | In the example above: 50 | 51 | - Function `tt.make_users_data` returns a PyArrow Table with example experimental data. 52 | - Function `ibis.duckdb.connect` creates a DuckDB in-process database using Ibis API. 53 | - Method `con.create_table` creates and populates a table in the database based on the PyArrow Table. 54 | 55 | See the [Ibis documentation on how to create connections](https://ibis-project.org/reference/connection) to other data backends. 56 | 57 | ## Querying experimental data 58 | 59 | Method `con.create_table` in the example above returns an Ibis Table which already can be used in the analysis of the experiment. But let's see how to use an SQL query to create an Ibis Table: 60 | 61 | ```pycon 62 | >>> data = con.sql("select * from users_data") 63 | >>> data 64 | SQLQueryResult 65 | query: 66 | select * from users_data 67 | schema: 68 | user int64 69 | variant int64 70 | sessions int64 71 | orders int64 72 | revenue float64 73 | 74 | ``` 75 | 76 | It's a very simple query. In the real world, you might need to use joins, aggregations, and CTEs to get the data. You can define any SQL query supported by your data backend and use it to create Ibis Table. 77 | 78 | Keep in mind that tea-tasting assumes that: 79 | 80 | - Data is grouped by randomization units, such as individual users. 81 | - There is a column indicating the variant of the A/B test (typically labeled as A, B, etc.). 82 | - All necessary columns for metric calculations (like the number of orders, revenue, etc.) are included in the table. 83 | 84 | Ibis Table is a lazy object. It doesn't fetch the data when created. You can use Ibis DataFrame API to query the table and fetch the result: 85 | 86 | ```pycon 87 | >>> ibis.options.interactive = True 88 | >>> print(data.head(5)) 89 | ┏━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ 90 | ┃ user ┃ variant ┃ sessions ┃ orders ┃ revenue ┃ 91 | ┡━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ 92 | │ int64 │ int64 │ int64 │ int64 │ float64 │ 93 | ├───────┼─────────┼──────────┼────────┼─────────┤ 94 | │ 0 │ 1 │ 2 │ 1 │ 9.17 │ 95 | │ 1 │ 0 │ 2 │ 1 │ 6.43 │ 96 | │ 2 │ 1 │ 2 │ 1 │ 7.94 │ 97 | │ 3 │ 1 │ 2 │ 1 │ 15.93 │ 98 | │ 4 │ 0 │ 1 │ 1 │ 7.14 │ 99 | └───────┴─────────┴──────────┴────────┴─────────┘ 100 | 101 | >>> ibis.options.interactive = False 102 | 103 | ``` 104 | 105 | ## Ibis example 106 | 107 | To better understand what Ibis does, let's consider the example with grouping and aggregation by variants: 108 | 109 | ```pycon 110 | >>> aggr_data = data.group_by("variant").aggregate( 111 | ... 
sessions_per_user=data.sessions.mean(), 112 | ... orders_per_session=data.orders.mean() / data.sessions.mean(), 113 | ... orders_per_user=data.orders.mean(), 114 | ... revenue_per_user=data.revenue.mean(), 115 | ... ) 116 | >>> aggr_data 117 | r0 := SQLQueryResult 118 | query: 119 | select * from users_data 120 | schema: 121 | user int64 122 | variant int64 123 | sessions int64 124 | orders int64 125 | revenue float64 126 | 127 | Aggregate[r0] 128 | groups: 129 | variant: r0.variant 130 | metrics: 131 | sessions_per_user: Mean(r0.sessions) 132 | orders_per_session: Mean(r0.orders) / Mean(r0.sessions) 133 | orders_per_user: Mean(r0.orders) 134 | revenue_per_user: Mean(r0.revenue) 135 | 136 | ``` 137 | 138 | `aggr_data` is another Ibis Table defined as a query over the previously defined `data`. Let's fetch the result: 139 | 140 | ```pycon 141 | >>> ibis.options.interactive = True 142 | >>> print(aggr_data) # doctest: +SKIP 143 | ┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓ 144 | ┃ variant ┃ sessions_per_user ┃ orders_per_session ┃ orders_per_user ┃ revenue_per_user ┃ 145 | ┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩ 146 | │ int64 │ float64 │ float64 │ float64 │ float64 │ 147 | ├─────────┼───────────────────┼────────────────────┼─────────────────┼──────────────────┤ 148 | │ 0 │ 1.996045 │ 0.265726 │ 0.530400 │ 5.241028 │ 149 | │ 1 │ 1.982802 │ 0.289031 │ 0.573091 │ 5.730111 │ 150 | └─────────┴───────────────────┴────────────────────┴─────────────────┴──────────────────┘ 151 | 152 | >>> ibis.options.interactive = False 153 | 154 | ``` 155 | 156 | Internally, Ibis compiles a Table to an SQL query supported by the backend: 157 | 158 | ```pycon 159 | >>> print(aggr_data.compile(pretty=True)) 160 | SELECT 161 | "t0"."variant", 162 | AVG("t0"."sessions") AS "sessions_per_user", 163 | AVG("t0"."orders") / AVG("t0"."sessions") AS "orders_per_session", 164 | AVG("t0"."orders") AS "orders_per_user", 165 | AVG("t0"."revenue") AS "revenue_per_user" 166 | FROM ( 167 | SELECT 168 | * 169 | FROM users_data 170 | ) AS "t0" 171 | GROUP BY 172 | 1 173 | 174 | ``` 175 | 176 | See [Ibis documentation](https://ibis-project.org/tutorials/getting_started) for more details. 177 | 178 | ## Experiment analysis 179 | 180 | The example above shows how to query the metric averages. But for statistical inference, it's not enough. For example, Student's t-test and Z-test also require number of rows and variance. Additionally, analysis of ratio metrics and variance reduction with CUPED requires covariances. 181 | 182 | Querying all the required statistics manually can be a daunting and error-prone task. But don't worry—tea-tasting does this work for you. You just need to specify the metrics: 183 | 184 | ```pycon 185 | >>> experiment = tt.Experiment( 186 | ... sessions_per_user=tt.Mean("sessions"), 187 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 188 | ... orders_per_user=tt.Mean("orders"), 189 | ... revenue_per_user=tt.Mean("revenue"), 190 | ... 
) 191 | >>> result = experiment.analyze(data) 192 | >>> result 193 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 194 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 195 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 196 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 197 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 198 | 199 | ``` 200 | 201 | In the example above, tea-tasting fetches all the required statistics with a single query and then uses them to analyze the experiment. 202 | 203 | Some statistical methods, like bootstrap, require granular data for analysis. In this case, tea-tasting fetches the detailed data as well. 204 | 205 | ## Example with CUPED 206 | 207 | An example of a slightly more complicated analysis using variance reduction with CUPED: 208 | 209 | ```pycon 210 | >>> users_data_cuped = tt.make_users_data(seed=42, covariates=True) 211 | >>> con.create_table("users_data_cuped", users_data_cuped) 212 | DatabaseTable: memory.main.users_data_cuped 213 | user int64 214 | variant int64 215 | sessions int64 216 | orders int64 217 | revenue float64 218 | sessions_covariate int64 219 | orders_covariate int64 220 | revenue_covariate float64 221 | 222 | >>> data_cuped = con.sql("select * from users_data_cuped") 223 | >>> experiment_cuped = tt.Experiment( 224 | ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 225 | ... orders_per_session=tt.RatioOfMeans( 226 | ... numer="orders", 227 | ... denom="sessions", 228 | ... numer_covariate="orders_covariate", 229 | ... denom_covariate="sessions_covariate", 230 | ... ), 231 | ... orders_per_user=tt.Mean("orders", "orders_covariate"), 232 | ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 233 | ... ) 234 | >>> result_cuped = experiment_cuped.analyze(data_cuped) 235 | >>> result_cuped 236 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 237 | sessions_per_user 2.00 1.98 -0.68% [-3.2%, 1.9%] 0.603 238 | orders_per_session 0.262 0.293 12% [4.2%, 21%] 0.00229 239 | orders_per_user 0.523 0.581 11% [2.9%, 20%] 0.00733 240 | revenue_per_user 5.12 5.85 14% [3.8%, 26%] 0.00674 241 | 242 | ``` 243 | 244 | ## Polars example 245 | 246 | Here’s an example of how to analyze data using a Polars DataFrame: 247 | 248 | ```pycon 249 | >>> data_polars = pl.from_arrow(users_data) 250 | >>> experiment.analyze(data_polars) 251 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 252 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 253 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 254 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 255 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 256 | 257 | ``` 258 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # tea-tasting: statistical analysis of A/B tests 2 | 3 | [![CI](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/e10v/tea-tasting/actions/workflows/ci.yml) 4 | [![Docs](https://github.com/e10v/tea-tasting/actions/workflows/docs.yml/badge.svg)](https://tea-tasting.e10v.me/) 5 | [![Coverage](https://codecov.io/github/e10v/tea-tasting/coverage.svg?branch=main)](https://codecov.io/gh/e10v/tea-tasting) 6 | [![License](https://img.shields.io/github/license/e10v/tea-tasting)](https://github.com/e10v/tea-tasting/blob/main/LICENSE) 7 | [![Package 
Status](https://img.shields.io/pypi/status/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 8 | [![Version](https://img.shields.io/pypi/v/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 9 | [![PyPI Python Versions](https://img.shields.io/pypi/pyversions/tea-tasting.svg)](https://pypi.org/project/tea-tasting/) 10 | 11 | tea-tasting is a Python package for the statistical analysis of A/B tests featuring: 12 | 13 | - Student's t-test, Z-test, bootstrap, and quantile metrics out of the box. 14 | - Extensible API that lets you define and use statistical tests of your choice. 15 | - [Delta method](https://alexdeng.github.io/public/files/kdd2018-dm.pdf) for ratio metrics. 16 | - Variance reduction using [CUPED](https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf)/[CUPAC](https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/), which can be combined with the Delta method for ratio metrics. 17 | - Confidence intervals for both absolute and percentage changes. 18 | - Checks for sample-ratio mismatches. 19 | - Power analysis. 20 | - Multiple hypothesis testing (family-wise error rate and false discovery rate). 21 | - Simulated experiments, including A/A tests. 22 | 23 | tea-tasting calculates statistics directly within data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). This approach eliminates the need to import granular data into a Python environment. 24 | 25 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. 26 | 27 | ## Installation 28 | 29 | ```bash 30 | uv pip install tea-tasting 31 | ``` 32 | 33 | ## Basic example 34 | 35 | ```pycon 36 | >>> import tea_tasting as tt 37 | 38 | >>> data = tt.make_users_data(seed=42) 39 | >>> experiment = tt.Experiment( 40 | ... sessions_per_user=tt.Mean("sessions"), 41 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 42 | ... orders_per_user=tt.Mean("orders"), 43 | ... revenue_per_user=tt.Mean("revenue"), 44 | ... ) 45 | >>> result = experiment.analyze(data) 46 | >>> result 47 | metric control treatment rel_effect_size rel_effect_size_ci pvalue 48 | sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 49 | orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 50 | orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 51 | revenue_per_user 5.24 5.73 9.3% [-2.4%, 22%] 0.123 52 | 53 | ``` 54 | 55 | Learn more in the detailed [user guide](https://tea-tasting.e10v.me/user-guide/). Additionally, see the guides on more specific topics: 56 | 57 | - [Data backends](https://tea-tasting.e10v.me/data-backends/). 58 | - [Power analysis](https://tea-tasting.e10v.me/power-analysis/). 59 | - [Multiple hypothesis testing](https://tea-tasting.e10v.me/multiple-testing/). 60 | - [Custom metrics](https://tea-tasting.e10v.me/custom-metrics/). 61 | - [Simulated experiments](https://tea-tasting.e10v.me/simulated-experiments/). 62 | 63 | ## Examples 64 | 65 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run in your local environment, or you can run them as WASM notebooks in the online playground. 
66 | 67 | ### Run in a local environment 68 | 69 | To run the examples in your local environment, clone the repository and change the directory: 70 | 71 | ```bash 72 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting 73 | ``` 74 | 75 | Install marimo, tea-tasting, and other packages used in the examples: 76 | 77 | ```bash 78 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb] 79 | ``` 80 | 81 | Launch the notebook server: 82 | 83 | ```bash 84 | uv run marimo edit examples 85 | ``` 86 | 87 | Now you can choose and run the example notebooks. 88 | 89 | ### Run in the online playground 90 | 91 | To run the examples as WASM notebooks in the online playground, open the following links: 92 | 93 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true). 94 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true). 95 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true). 96 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true). 97 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true). 98 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true). 99 | 100 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations. In particular: 101 | 102 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html). 103 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules). 104 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis. 105 | 106 | ## Package name 107 | 108 | The package name "tea-tasting" is a play on words that refers to two subjects: 109 | 110 | - [Lady tasting tea](https://en.wikipedia.org/wiki/Lady_tasting_tea) is a famous experiment devised by Ronald Fisher. In this experiment, Fisher developed the null hypothesis significance testing framework to analyze a lady's claim that she could discern whether the tea or the milk was added first to the cup. 111 | - "tea-tasting" phonetically resembles "t-testing", referencing Student's t-test, a statistical method developed by William Gosset.
112 | -------------------------------------------------------------------------------- /docs/javascripts/override-copy.js: -------------------------------------------------------------------------------- 1 | function attachCustomCopy() { 2 | document.querySelectorAll("button.md-clipboard").forEach((button) => { 3 | button.removeEventListener("click", handleCopy); 4 | }); 5 | 6 | document.querySelectorAll("button.md-clipboard").forEach((button) => { 7 | button.addEventListener("click", handleCopy); 8 | }); 9 | } 10 | 11 | function handleCopy(event) { 12 | event.preventDefault(); 13 | const button = event.currentTarget; 14 | const codeBlock = document.querySelector(button.getAttribute('data-clipboard-target')); 15 | const codeBlockClone = codeBlock.cloneNode(true); 16 | codeBlockClone.querySelectorAll('.go').forEach(span => { 17 | const prev = span.previousSibling; 18 | if (prev && prev.nodeType === Node.TEXT_NODE) { 19 | prev.textContent = prev.textContent.replace(/[\r\n]+$/, ''); 20 | } 21 | }); 22 | codeBlockClone.querySelectorAll('.gp, .go').forEach(span => span.remove()); 23 | navigator.clipboard.writeText(codeBlockClone.textContent || codeBlockClone.innerText); 24 | } 25 | 26 | document$.subscribe(() => { 27 | attachCustomCopy(); 28 | }); 29 | -------------------------------------------------------------------------------- /docs/multiple-testing.md: -------------------------------------------------------------------------------- 1 | # Multiple testing 2 | 3 | ## Multiple hypothesis testing problem 4 | 5 | /// admonition | Note 6 | 7 | This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. Install Polars in addition to tea-tasting to reproduce the examples: 8 | 9 | ```bash 10 | uv pip install polars 11 | ``` 12 | 13 | /// 14 | 15 | The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test. 16 | 17 | tea-tasting provides the following methods for multiple testing correction: 18 | 19 | - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate) (FDR) controlling procedures: 20 | - Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses. 21 | - Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses. 22 | - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate) (FWER) controlling procedures: 23 | - Hochberg's step-up procedure, assuming non-negative correlation between hypotheses. 24 | - Holm's step-down procedure, assuming arbitrary dependence between hypotheses. 25 | 26 | As an example, consider an experiment with three variants, a control and two treatments: 27 | 28 | ```pycon 29 | >>> import polars as pl 30 | >>> import tea_tasting as tt 31 | 32 | >>> data = pl.concat(( 33 | ... tt.make_users_data( 34 | ... seed=42, 35 | ... orders_uplift=0.10, 36 | ... revenue_uplift=0.15, 37 | ... return_type="polars", 38 | ... ), 39 | ... tt.make_users_data( 40 | ... seed=21, 41 | ... orders_uplift=0.15, 42 | ... revenue_uplift=0.20, 43 | ... return_type="polars", 44 | ... ) 45 | ... .filter(pl.col("variant").eq(1)) 46 | ... .with_columns(variant=pl.lit(2, pl.Int64)), 47 | ... 
)) 48 | >>> data 49 | shape: (6_046, 5) 50 | ┌──────┬─────────┬──────────┬────────┬─────────┐ 51 | │ user ┆ variant ┆ sessions ┆ orders ┆ revenue │ 52 | │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ 53 | │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ f64 │ 54 | ╞══════╪═════════╪══════════╪════════╪═════════╡ 55 | │ 0 ┆ 1 ┆ 2 ┆ 1 ┆ 9.58 │ 56 | │ 1 ┆ 0 ┆ 2 ┆ 1 ┆ 6.43 │ 57 | │ 2 ┆ 1 ┆ 2 ┆ 1 ┆ 8.3 │ 58 | │ 3 ┆ 1 ┆ 2 ┆ 1 ┆ 16.65 │ 59 | │ 4 ┆ 0 ┆ 1 ┆ 1 ┆ 7.14 │ 60 | │ … ┆ … ┆ … ┆ … ┆ … │ 61 | │ 3989 ┆ 2 ┆ 4 ┆ 4 ┆ 34.93 │ 62 | │ 3991 ┆ 2 ┆ 1 ┆ 0 ┆ 0.0 │ 63 | │ 3992 ┆ 2 ┆ 3 ┆ 3 ┆ 27.96 │ 64 | │ 3994 ┆ 2 ┆ 2 ┆ 1 ┆ 17.22 │ 65 | │ 3998 ┆ 2 ┆ 3 ┆ 0 ┆ 0.0 │ 66 | └──────┴─────────┴──────────┴────────┴─────────┘ 67 | 68 | ``` 69 | 70 | Let's calculate the experiment results: 71 | 72 | ```pycon 73 | >>> experiment = tt.Experiment( 74 | ... sessions_per_user=tt.Mean("sessions"), 75 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 76 | ... orders_per_user=tt.Mean("orders"), 77 | ... revenue_per_user=tt.Mean("revenue"), 78 | ... ) 79 | >>> results = experiment.analyze(data, control=0, all_variants=True) 80 | >>> results 81 | variants metric control treatment rel_effect_size rel_effect_size_ci pvalue 82 | (0, 1) sessions_per_user 2.00 1.98 -0.66% [-3.7%, 2.5%] 0.674 83 | (0, 1) orders_per_session 0.266 0.289 8.8% [-0.89%, 19%] 0.0762 84 | (0, 1) orders_per_user 0.530 0.573 8.0% [-2.0%, 19%] 0.118 85 | (0, 1) revenue_per_user 5.24 5.99 14% [2.1%, 28%] 0.0211 86 | (0, 2) sessions_per_user 2.00 2.02 0.98% [-2.1%, 4.1%] 0.532 87 | (0, 2) orders_per_session 0.266 0.295 11% [1.2%, 22%] 0.0273 88 | (0, 2) orders_per_user 0.530 0.594 12% [1.7%, 23%] 0.0213 89 | (0, 2) revenue_per_user 5.24 6.25 19% [6.6%, 33%] 0.00218 90 | 91 | ``` 92 | 93 | Suppose only the two metrics `orders_per_user` and `revenue_per_user` are considered as success metrics, while the other two metrics `sessions_per_user` and `orders_per_session` are second-order diagnostic metrics. 94 | 95 | ```pycon 96 | >>> metrics = {"orders_per_user", "revenue_per_user"} 97 | 98 | ``` 99 | 100 | With two treatment variants and two success metrics, there are four hypotheses in total, which increases the probability of false positives (also called "false discoveries"). It's recommended to adjust the p-values or the significance level (alpha) in this case. Let's explore the correction methods provided by tea-tasting. 101 | 102 | ## False discovery rate 103 | 104 | False discovery rate (FDR) is the expected value of the proportion of false discoveries among the discoveries (rejections of the null hypothesis). To control for FDR, use the [`adjust_fdr`](api/multiplicity.md#tea_tasting.multiplicity.adjust_fdr) method: 105 | 106 | ```pycon 107 | >>> adjusted_results_fdr = tt.adjust_fdr(results, metrics) 108 | >>> adjusted_results_fdr 109 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 110 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118 111 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0284 112 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0284 113 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00872 114 | 115 | ``` 116 | 117 | The method adjusts p-values and saves them as `pvalue_adj`. Compare these values to the desired significance level alpha to determine if the null hypotheses can be rejected. 118 | 119 | The method also adjusts the significance level alpha and saves it as `alpha_adj`. 
Compare the unadjusted p-values (`pvalue`) to `alpha_adj` to determine if the null hypotheses can be rejected: 120 | 121 | ```pycon 122 | >>> adjusted_results_fdr.with_keys(( 123 | ... "comparison", 124 | ... "metric", 125 | ... "control", 126 | ... "treatment", 127 | ... "rel_effect_size", 128 | ... "pvalue", 129 | ... "alpha_adj", 130 | ... )) 131 | comparison metric control treatment rel_effect_size pvalue alpha_adj 132 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.0500 133 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0375 134 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0375 135 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.0375 136 | 137 | ``` 138 | 139 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs the Benjamini-Hochberg procedure. To perform the Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`: 140 | 141 | ```pycon 142 | >>> tt.adjust_fdr(results, metrics, arbitrary_dependence=True) 143 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 144 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.245 145 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0592 146 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0592 147 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.0182 148 | 149 | ``` 150 | 151 | ## Family-wise error rate 152 | 153 | Family-wise error rate (FWER) is the probability of making at least one type I error. To control for FWER, use the [`adjust_fwer`](api/multiplicity.md#tea_tasting.multiplicity.adjust_fwer) method: 154 | 155 | ```pycon 156 | >>> tt.adjust_fwer(results, metrics) 157 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 158 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118 159 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0422 160 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0422 161 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00869 162 | 163 | ``` 164 | 165 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs Hochberg's step-up procedure with the Šidák correction, which is slightly more powerful than the Bonferroni correction. 166 | 167 | To perform Holm's step-down procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`. In this case, it's recommended to use the Bonferroni correction, since the Šidák correction assumes non-negative correlation between hypotheses: 168 | 169 | ```pycon 170 | >>> tt.adjust_fwer( 171 | ... results, 172 | ... metrics, 173 | ... arbitrary_dependence=True, 174 | ... method="bonferroni", 175 | ... ) 176 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 177 | (0, 1) orders_per_user 0.530 0.573 8.0% 0.118 0.118 178 | (0, 1) revenue_per_user 5.24 5.99 14% 0.0211 0.0634 179 | (0, 2) orders_per_user 0.530 0.594 12% 0.0213 0.0634 180 | (0, 2) revenue_per_user 5.24 6.25 19% 0.00218 0.00872 181 | 182 | ``` 183 | 184 | ## Other inputs 185 | 186 | In the examples above, the methods `adjust_fdr` and `adjust_fwer` received results from a *single experiment* with *more than two variants*.
They can also accept the results from *multiple experiments* with *two variants* in each: 187 | 188 | ```pycon 189 | >>> data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15) 190 | >>> data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20) 191 | >>> result1 = experiment.analyze(data1) 192 | >>> result2 = experiment.analyze(data2) 193 | >>> tt.adjust_fdr( 194 | ... {"Experiment 1": result1, "Experiment 2": result2}, 195 | ... metrics, 196 | ... ) 197 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 198 | Experiment 1 orders_per_user 0.530 0.573 8.0% 0.118 0.118 199 | Experiment 1 revenue_per_user 5.24 5.99 14% 0.0211 0.0282 200 | Experiment 2 orders_per_user 0.514 0.594 16% 0.00427 0.00853 201 | Experiment 2 revenue_per_user 5.10 6.25 22% 6.27e-04 0.00251 202 | 203 | ``` 204 | 205 | The methods `adjust_fdr` and `adjust_fwer` can also accept the result of *a single experiment with two variants*: 206 | 207 | ```pycon 208 | >>> tt.adjust_fwer(result2, metrics) 209 | comparison metric control treatment rel_effect_size pvalue pvalue_adj 210 | - orders_per_user 0.514 0.594 16% 0.00427 0.00427 211 | - revenue_per_user 5.10 6.25 22% 6.27e-04 0.00125 212 | 213 | ``` 214 | -------------------------------------------------------------------------------- /docs/power-analysis.md: -------------------------------------------------------------------------------- 1 | # Power analysis 2 | 3 | In tea-tasting, you can analyze the statistical power for `Mean` and `RatioOfMeans` metrics. There are three possible options: 4 | 5 | - Calculate the effect size, given statistical power and the total number of observations. 6 | - Calculate the total number of observations, given statistical power and the effect size. 7 | - Calculate statistical power, given the effect size and the total number of observations. 8 | 9 | In this example, tea-tasting calculates statistical power given the relative effect size and the number of observations: 10 | 11 | ```pycon 12 | >>> import tea_tasting as tt 13 | 14 | >>> data = tt.make_users_data( 15 | ... seed=42, 16 | ... sessions_uplift=0, 17 | ... orders_uplift=0, 18 | ... revenue_uplift=0, 19 | ... covariates=True, 20 | ... ) 21 | >>> orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1) 22 | >>> orders_per_session.solve_power(data, "power") 23 | power effect_size rel_effect_size n_obs 24 | 52% 0.0261 10% 4000 25 | 26 | ``` 27 | 28 | Besides `alternative`, `equal_var`, `use_t`, and covariates (CUPED), the following metric parameters affect the result: 29 | 30 | - `alpha`: Significance level. 31 | - `ratio`: Ratio of the number of observations in the treatment relative to the control. 32 | - `power`: Statistical power. 33 | - `effect_size` and `rel_effect_size`: Absolute and relative effect size. Only one of them can be defined. 34 | - `n_obs`: Number of observations in the control and in the treatment together. If the number of observations is not set explicitly, it's inferred from the dataset. 35 | 36 | You can change the default values of `alpha`, `ratio`, `power`, and `n_obs` using the [global settings](user-guide.md#global-settings). 37 | 38 | tea-tasting can analyze power for several values of parameters `effect_size`, `rel_effect_size`, or `n_obs`. 
Example: 39 | 40 | ```pycon 41 | >>> orders_per_user = tt.Mean("orders", alpha=0.1, power=0.7, n_obs=(10_000, 20_000)) 42 | >>> orders_per_user.solve_power(data, "rel_effect_size") 43 | power effect_size rel_effect_size n_obs 44 | 70% 0.0367 7.1% 10000 45 | 70% 0.0260 5.0% 20000 46 | 47 | ``` 48 | 49 | You can analyze power for all metrics in the experiment. Example: 50 | 51 | ```pycon 52 | >>> with tt.config_context(n_obs=(10_000, 20_000)): 53 | ... experiment = tt.Experiment( 54 | ... sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 55 | ... orders_per_session=tt.RatioOfMeans( 56 | ... numer="orders", 57 | ... denom="sessions", 58 | ... numer_covariate="orders_covariate", 59 | ... denom_covariate="sessions_covariate", 60 | ... ), 61 | ... orders_per_user=tt.Mean("orders", "orders_covariate"), 62 | ... revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 63 | ... ) 64 | ... 65 | >>> power_result = experiment.solve_power(data) 66 | >>> power_result 67 | metric power effect_size rel_effect_size n_obs 68 | sessions_per_user 80% 0.0458 2.3% 10000 69 | sessions_per_user 80% 0.0324 1.6% 20000 70 | orders_per_session 80% 0.0177 6.8% 10000 71 | orders_per_session 80% 0.0125 4.8% 20000 72 | orders_per_user 80% 0.0374 7.2% 10000 73 | orders_per_user 80% 0.0264 5.1% 20000 74 | revenue_per_user 80% 0.488 9.2% 10000 75 | revenue_per_user 80% 0.345 6.5% 20000 76 | 77 | ``` 78 | 79 | In the example above, tea-tasting calculates both the relative and absolute effect size for all metrics for two possible sample size values, `10_000` and `20_000`. 80 | 81 | The `solve_power` methods of a [metric](api/metrics/mean.md#tea_tasting.metrics.mean.Mean.solve_power) and of an [experiment](api/experiment.md#tea_tasting.experiment.Experiment.solve_power) return instances of [`MetricPowerResults`](api/metrics/base.md#tea_tasting.metrics.base.MetricPowerResults) and [`ExperimentPowerResult`](api/experiment.md#tea_tasting.experiment.ExperimentPowerResult) respectively. These result classes provide serialization methods similar to those of the experiment result: `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. They are also rendered as HTML tables in IPython and Jupyter, and as tables in marimo notebooks. 82 | -------------------------------------------------------------------------------- /docs/simulated-experiments.md: -------------------------------------------------------------------------------- 1 | # Simulated experiments 2 | 3 | ## Intro 4 | 5 | In tea-tasting, you can run multiple simulated A/A or A/B tests. In each simulation, tea-tasting splits the data into control and treatment groups and can optionally modify the treatment data. A simulation without changing the treatment data is called an A/A test. 6 | 7 | A/A tests are useful for identifying potential issues before conducting the actual A/B test. Treatment simulations are great for power analysis—especially when you need a specific uplift distribution or when an analytical formula doesn’t exist. 8 | 9 | /// admonition | Note 10 | 11 | This guide uses [Polars](https://github.com/pola-rs/polars) and [tqdm](https://github.com/tqdm/tqdm). Install these packages in addition to tea-tasting to reproduce the examples: 12 | 13 | ```bash 14 | uv pip install polars tqdm 15 | ``` 16 | 17 | /// 18 | 19 | ## Running A/A tests 20 | 21 | First, let's prepare the data without any uplift and drop the `"variant"` column.
22 | 23 | ```pycon 24 | >>> import polars as pl 25 | >>> import tea_tasting as tt 26 | 27 | >>> data = ( 28 | ... tt.make_users_data(seed=42, orders_uplift=0, revenue_uplift=0) 29 | ... .drop_columns("variant") 30 | ... ) 31 | >>> data 32 | pyarrow.Table 33 | user: int64 34 | sessions: int64 35 | orders: int64 36 | revenue: double 37 | ---- 38 | user: [[0,1,2,3,4,...,3995,3996,3997,3998,3999]] 39 | sessions: [[2,2,2,2,1,...,2,2,3,1,5]] 40 | orders: [[1,1,1,0,1,...,0,1,1,0,4]] 41 | revenue: [[19.06,12.09,8.84,0,9.9,...,0,4.8,9.63,0,12.7]] 42 | 43 | ``` 44 | 45 | To run A/A tests, first define the metrics for the experiment, then call the [`simulate`](api/experiment.md#tea_tasting.experiment.Experiment.simulate) method, providing the data and the number of simulations as arguments. 46 | 47 | ```pycon 48 | >>> experiment = tt.Experiment( 49 | ... sessions_per_user=tt.Mean("sessions"), 50 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 51 | ... orders_per_user=tt.Mean("orders"), 52 | ... revenue_per_user=tt.Mean("revenue"), 53 | ... n_users=tt.SampleRatio(), 54 | ... ) 55 | >>> results = experiment.simulate(data, 100, seed=42) 56 | >>> results_data = results.to_polars() 57 | >>> results_data.select( 58 | ... "metric", 59 | ... "control", 60 | ... "treatment", 61 | ... "rel_effect_size", 62 | ... "rel_effect_size_ci_lower", 63 | ... "rel_effect_size_ci_upper", 64 | ... "pvalue", 65 | ... ) # doctest: +SKIP 66 | shape: (500, 7) 67 | ┌────────────────────┬──────────┬───────────┬─────────────────┬────────────────────┬────────────────────┬──────────┐ 68 | │ metric ┆ control ┆ treatment ┆ rel_effect_size ┆ rel_effect_size_ci ┆ rel_effect_size_ci ┆ pvalue │ 69 | │ --- ┆ --- ┆ --- ┆ --- ┆ _lower ┆ _upper ┆ --- │ 70 | │ str ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ --- ┆ f64 │ 71 | │ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ │ 72 | ╞════════════════════╪══════════╪═══════════╪═════════════════╪════════════════════╪════════════════════╪══════════╡ 73 | │ sessions_per_user ┆ 1.98004 ┆ 1.998998 ┆ 0.009575 ┆ -0.021272 ┆ 0.041393 ┆ 0.547091 │ 74 | │ orders_per_session ┆ 0.263105 ┆ 0.258647 ┆ -0.016945 ┆ -0.108177 ┆ 0.083621 ┆ 0.730827 │ 75 | │ orders_per_user ┆ 0.520958 ┆ 0.517034 ┆ -0.007532 ┆ -0.102993 ┆ 0.098087 ┆ 0.883462 │ 76 | │ revenue_per_user ┆ 5.446662 ┆ 5.14521 ┆ -0.055346 ┆ -0.162811 ┆ 0.065914 ┆ 0.356327 │ 77 | │ n_users ┆ 2004.0 ┆ 1996.0 ┆ null ┆ null ┆ null ┆ 0.91187 │ 78 | │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ 79 | │ sessions_per_user ┆ 1.993624 ┆ 1.985212 ┆ -0.00422 ┆ -0.034685 ┆ 0.027207 ┆ 0.78959 │ 80 | │ orders_per_session ┆ 0.269373 ┆ 0.251991 ┆ -0.064527 ┆ -0.151401 ┆ 0.03124 ┆ 0.179445 │ 81 | │ orders_per_user ┆ 0.537028 ┆ 0.500255 ┆ -0.068475 ┆ -0.158141 ┆ 0.030742 ┆ 0.169217 │ 82 | │ revenue_per_user ┆ 5.511967 ┆ 5.071928 ┆ -0.079833 ┆ -0.184806 ┆ 0.038656 ┆ 0.177868 │ 83 | │ n_users ┆ 2039.0 ┆ 1961.0 ┆ null ┆ null ┆ null ┆ 0.223423 │ 84 | └────────────────────┴──────────┴───────────┴─────────────────┴────────────────────┴────────────────────┴──────────┘ 85 | 86 | ``` 87 | 88 | The `simulate` method accepts data in the same formats as the `analyze` method. Internally, however, it converts the data to a PyArrow Table before running the simulations. 89 | 90 | The method returns an instance of the [`SimulationResults`](api/experiment.md#tea_tasting.experiment.SimulationResults) class, which contains the results of all simulations for all metrics. 
The resulting object provides serialization methods similar to those of the experiment result, including `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. 91 | 92 | For instance, we can now calculate the proportion of rejected null hypotheses, using various significance levels (`alpha`). In A/A tests, it estimates the type I error rate. 93 | 94 | ```pycon 95 | >>> def null_rejected( 96 | ... results_data: pl.DataFrame, 97 | ... alphas: tuple[float, ...] = (0.01, 0.02, 0.05), 98 | ... ) -> pl.DataFrame: 99 | ... return results_data.group_by("metric", maintain_order=True).agg( 100 | ... pl.col("pvalue").le(alpha).mean().alias(f"null_rejected_{alpha}") 101 | ... for alpha in alphas 102 | ... ) 103 | ... 104 | >>> null_rejected(results_data) 105 | shape: (5, 4) 106 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐ 107 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │ 108 | │ --- ┆ --- ┆ --- ┆ --- │ 109 | │ str ┆ f64 ┆ f64 ┆ f64 │ 110 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡ 111 | │ sessions_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │ 112 | │ orders_per_session ┆ 0.02 ┆ 0.02 ┆ 0.06 │ 113 | │ orders_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │ 114 | │ revenue_per_user ┆ 0.02 ┆ 0.03 ┆ 0.06 │ 115 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │ 116 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘ 117 | 118 | ``` 119 | 120 | 100 simulations, as in the example above, produce a very rough estimate. In practice, a larger number of simulations, such as the default `10_000`, is recommended. 121 | 122 | ## Simulating experiments with treatment 123 | 124 | To simulate experiments with treatment, define a treatment function that takes data in the form of a PyArrow Table and returns a PyArrow Table with the modified data: 125 | 126 | ```pycon 127 | >>> import pyarrow as pa 128 | >>> import pyarrow.compute as pc 129 | 130 | >>> def treat(data: pa.Table) -> pa.Table: 131 | ... return ( 132 | ... data.drop_columns(["orders", "revenue"]) 133 | ... .append_column("orders", pc.multiply(data["orders"], pa.scalar(1.1))) 134 | ... .append_column("revenue", pc.multiply(data["revenue"], pa.scalar(1.1))) 135 | ... ) 136 | ... 137 | >>> results_treat = experiment.simulate(data, 100, seed=42, treat=treat) 138 | >>> null_rejected(results_treat.to_polars()) 139 | shape: (5, 4) 140 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐ 141 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │ 142 | │ --- ┆ --- ┆ --- ┆ --- │ 143 | │ str ┆ f64 ┆ f64 ┆ f64 │ 144 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡ 145 | │ sessions_per_user ┆ 0.01 ┆ 0.02 ┆ 0.05 │ 146 | │ orders_per_session ┆ 0.23 ┆ 0.31 ┆ 0.42 │ 147 | │ orders_per_user ┆ 0.21 ┆ 0.29 ┆ 0.4 │ 148 | │ revenue_per_user ┆ 0.11 ┆ 0.16 ┆ 0.31 │ 149 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │ 150 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘ 151 | 152 | ``` 153 | 154 | In the example above, we've defined a function that increases the number of orders and the revenue by 10%. For these metrics, the proportion of rejected null hypotheses is an estimate of statistical power. 155 | 156 | ## Using a function instead of static data 157 | 158 | You can use a function instead of static data to generate input dynamically.
The function should take an instance of `numpy.random.Generator` as a parameter named `seed` and return experimental data in any format supported by tea-tasting. 159 | 160 | As an example, let's use the `make_users_data` function. 161 | 162 | ```pycon 163 | >>> results_data_gen = experiment.simulate(tt.make_users_data, 100, seed=42) 164 | >>> null_rejected(results_data_gen.to_polars()) 165 | shape: (5, 4) 166 | ┌────────────────────┬────────────────────┬────────────────────┬────────────────────┐ 167 | │ metric ┆ null_rejected_0.01 ┆ null_rejected_0.02 ┆ null_rejected_0.05 │ 168 | │ --- ┆ --- ┆ --- ┆ --- │ 169 | │ str ┆ f64 ┆ f64 ┆ f64 │ 170 | ╞════════════════════╪════════════════════╪════════════════════╪════════════════════╡ 171 | │ sessions_per_user ┆ 0.01 ┆ 0.01 ┆ 0.06 │ 172 | │ orders_per_session ┆ 0.27 ┆ 0.36 ┆ 0.54 │ 173 | │ orders_per_user ┆ 0.24 ┆ 0.32 ┆ 0.49 │ 174 | │ revenue_per_user ┆ 0.17 ┆ 0.26 ┆ 0.39 │ 175 | │ n_users ┆ 0.01 ┆ 0.01 ┆ 0.04 │ 176 | └────────────────────┴────────────────────┴────────────────────┴────────────────────┘ 177 | 178 | ``` 179 | 180 | On each iteration, tea-tasting calls `make_users_data` with a new `seed` and uses the returned data for the analysis of the experiment. The data returned by `make_users_data` already contains the `"variant"` column, so tea-tasting reuses that split. By default, `make_users_data` also adds the treatment uplift, and you can see it in the proportion of rejected null hypotheses. 181 | 182 | ## Tracking progress 183 | 184 | To track the progress of simulations with [`tqdm`](https://github.com/tqdm/tqdm) or [`marimo.status.progress_bar`](https://docs.marimo.io/api/status/#progress-bar), use the `progress` parameter. 185 | 186 | ```pycon 187 | >>> import tqdm 188 | 189 | >>> results_progress = experiment.simulate( 190 | ... data, 191 | ... 100, 192 | ... seed=42, 193 | ... progress=tqdm.tqdm, 194 | ... ) # doctest: +SKIP 195 | 100%|██████████████████████████████████████| 100/100 [00:01<00:00, 64.47it/s] 196 | 197 | ``` 198 | 199 | ## Parallel execution 200 | 201 | /// admonition | Note 202 | 203 | The code below won't work in the [marimo online playground](https://docs.marimo.io/guides/publishing/playground/) as it relies on the `multiprocessing` module which is currently [not supported](https://docs.marimo.io/guides/wasm/#limitations) by WASM notebooks. [WASM notebooks](https://docs.marimo.io/guides/wasm/) are the marimo notebooks that run entirely in the browser. 204 | 205 | /// 206 | 207 | To speed up simulations and run them in parallel, use the `map_` parameter with an alternative mapping function. 208 | 209 | ```pycon 210 | >>> import concurrent.futures 211 | 212 | >>> with concurrent.futures.ProcessPoolExecutor() as executor: 213 | ... results_parallel = experiment.simulate( 214 | ... data, 215 | ... 100, 216 | ... seed=42, 217 | ... treat=treat, 218 | ... map_=executor.map, 219 | ... progress=tqdm.tqdm, 220 | ... ) # doctest: +SKIP 221 | ... 222 | 100%|█████████████████████████████████████| 100/100 [00:00<00:00, 251.60it/s] 223 | 224 | ``` 225 | 226 | As an alternative to [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor), you can use the `map`, `imap`, or `imap_unordered` methods of [`multiprocessing.pool.Pool`](https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool). 
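As an example, here's a minimal sketch of the same simulation with `multiprocessing.pool.Pool` (the `results_pool` name is illustrative): ```pycon >>> import multiprocessing >>> with multiprocessing.Pool() as pool: ... results_pool = experiment.simulate( ... data, ... 100, ... seed=42, ... treat=treat, ... map_=pool.imap_unordered, ... ) # doctest: +SKIP ... ```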
227 | 228 | It's also possible to run simulations on a distributed [Dask](https://distributed.dask.org/en/stable/api.html#distributed.Client.map) or [Ray](https://docs.ray.io/en/latest/ray-core/api/doc/ray.util.ActorPool.map.html#ray.util.ActorPool.map) cluster. 229 | -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-code-font: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; 3 | --md-text-font: system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", "Noto Sans", "Liberation Sans", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; 4 | } 5 | .md-typeset code { 6 | font-size: .875em; 7 | } 8 | .md-typeset ol li,.md-typeset ul li { 9 | margin-bottom: .25em 10 | } 11 | div.highlight span.gp { /* gp: Generic.Prompt */ 12 | user-select: none; 13 | -webkit-user-select: none; /* Chrome/Safari */ 14 | -moz-user-select: none; /* Firefox */ 15 | -ms-user-select: none; /* IE10+ */ 16 | } 17 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | The tea-tasting repository includes [examples](https://github.com/e10v/tea-tasting/tree/main/examples) as copies of the guides in the [marimo](https://github.com/marimo-team/marimo) notebook format. You can either download them from GitHub and run them in your local environment, or you can run them as WASM notebooks in the online playground. 4 | 5 | ## Run in a local environment 6 | 7 | To run the examples in your local environment, clone the repository and change the directory: 8 | 9 | ```bash 10 | git clone git@github.com:e10v/tea-tasting.git && cd tea-tasting 11 | ``` 12 | 13 | Install marimo, tea-tasting, and other packages used in the examples: 14 | 15 | ```bash 16 | uv venv && uv pip install marimo tea-tasting polars ibis-framework[duckdb] 17 | ``` 18 | 19 | Launch the notebook server: 20 | 21 | ```bash 22 | uv run marimo edit examples 23 | ``` 24 | 25 | Now you can choose and run the example notebooks. 26 | 27 | ## Run in the online playground 28 | 29 | To run the examples as WASM notebooks in the online playground, open the following links: 30 | 31 | - [User guide](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fuser-guide.py&embed=true). 32 | - [Data backends](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fdata-backends.py&embed=true). 33 | - [Power analysis](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fpower-analysis.py&embed=true). 34 | - [Multiple hypothesis testing](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fmultiple-testing.py&embed=true). 35 | - [Custom metrics](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fcustom-metrics.py&embed=true). 36 | - [Simulated experiments](https://marimo.app/gh/e10v/tea-tasting/main?entrypoint=examples%2Fsimulated-experiments.py&embed=true). 37 | 38 | [WASM notebooks](https://docs.marimo.io/guides/wasm/) run entirely in the browser on [Pyodide](https://github.com/pyodide/pyodide) and thus have some limitations.
In particular: 39 | 40 | - Tables and dataframes render less attractively because Pyodide doesn't always include the latest [package versions](https://pyodide.org/en/stable/usage/packages-in-pyodide.html). 41 | - You can't simulate experiments [in parallel](https://tea-tasting.e10v.me/simulated-experiments/#parallel-execution) because Pyodide currently [doesn't support multiprocessing](https://pyodide.org/en/stable/usage/wasm-constraints.html#included-but-not-working-modules). 42 | - Other unpredictable issues may arise, such as the inability to use DuckDB with Ibis. 43 | -------------------------------------------------------------------------------- /examples/custom-metrics.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "tea-tasting", 6 | # ] 7 | # [tool.marimo.display] 8 | # cell_output = "below" 9 | # /// 10 | 11 | import marimo 12 | 13 | __generated_with = "0.13.6" 14 | app = marimo.App() 15 | 16 | 17 | @app.cell(hide_code=True) 18 | def _(mo): 19 | mo.md( 20 | r""" 21 | # Custom metrics 22 | 23 | ## Intro 24 | 25 | tea-tasting supports Student's t-test, Z-test, and [some other statistical tests](https://tea-tasting.e10v.me/api/metrics/index/) out of the box. However, you might want to analyze an experiment using other statistical criteria. In this case, you can define a custom metric with a statistical test of your choice. 26 | 27 | In tea-tasting, there are two types of metrics: 28 | 29 | - Metrics that require only aggregated statistics for the analysis. 30 | - Metrics that require granular data for the analysis. 31 | 32 | This guide explains how to define a custom metric for each type. 33 | 34 | First, let's import all the required modules and prepare the data: 35 | """ 36 | ) 37 | return 38 | 39 | 40 | @app.cell 41 | def _(): 42 | from typing import Literal, NamedTuple 43 | import numpy as np 44 | import pyarrow as pa 45 | import pyarrow.compute as pc 46 | import scipy.stats 47 | import tea_tasting as tt 48 | import tea_tasting.aggr 49 | import tea_tasting.config 50 | import tea_tasting.metrics 51 | import tea_tasting.utils 52 | 53 | data = tt.make_users_data(seed=42) 54 | data = data.append_column( 55 | "has_order", 56 | pc.greater(data["orders"], 0).cast(pa.int64()), 57 | ) 58 | data 59 | return Literal, NamedTuple, data, np, pa, scipy, tea_tasting, tt 60 | 61 | 62 | @app.cell(hide_code=True) 63 | def _(mo): 64 | mo.md( 65 | r""" 66 | This guide uses PyArrow as the data backend, but it's valid for other backends as well. See the [guide on data backends](https://tea-tasting.e10v.me/data-backends/) for more details. 67 | 68 | ## Metrics based on aggregated statistics 69 | 70 | Let's define a metric that performs a proportion test, [G-test](https://en.wikipedia.org/wiki/G-test) or [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test), on a binary column (with values `0` or `1`). 71 | 72 | The first step is defining a result class. It should be a named tuple or a dictionary. 73 | """ 74 | ) 75 | return 76 | 77 | 78 | @app.cell 79 | def _(NamedTuple): 80 | class ProportionResult(NamedTuple): 81 | control: float 82 | treatment: float 83 | effect_size: float 84 | rel_effect_size: float 85 | pvalue: float 86 | statistic: float 87 | return (ProportionResult,) 88 | 89 | 90 | @app.cell(hide_code=True) 91 | def _(mo): 92 | mo.md( 93 | r""" 94 | The second step is defining the metric class itself.
A metric based on aggregated statistics should be a subclass of [`MetricBaseAggregated`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricBaseAggregated). `MetricBaseAggregated` is a generic class with the result class as a type variable. 95 | 96 | The metric should have the following methods and properties defined: 97 | 98 | - Method `__init__` checks and saves metric parameters. 99 | - Property `aggr_cols` returns the columns to be aggregated for the analysis, for each type of statistic. 100 | - Method `analyze_aggregates` analyzes the metric using aggregated statistics. 101 | 102 | Let's define the metric and discuss each method in detail: 103 | """ 104 | ) 105 | return 106 | 107 | 108 | @app.cell 109 | def _(Literal, ProportionResult, np, scipy, tea_tasting): 110 | class Proportion(tea_tasting.metrics.MetricBaseAggregated[ProportionResult]): 111 | def __init__( 112 | self, 113 | column: str, 114 | *, 115 | correction: bool = True, 116 | method: Literal["g-test", "pearson"] = "g-test", 117 | ) -> None: 118 | self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 119 | self.correction = tea_tasting.utils.auto_check(correction, "correction") 120 | self.method = tea_tasting.utils.check_scalar( 121 | method, "method", typ=str, in_={"g-test", "pearson"}) 122 | @property 123 | def aggr_cols(self) -> tea_tasting.metrics.AggrCols: 124 | return tea_tasting.metrics.AggrCols( 125 | has_count=True, 126 | mean_cols=(self.column,), 127 | ) 128 | def analyze_aggregates( 129 | self, 130 | control: tea_tasting.aggr.Aggregates, 131 | treatment: tea_tasting.aggr.Aggregates, 132 | ) -> ProportionResult: 133 | observed = np.empty(shape=(2, 2), dtype=np.int64) 134 | observed[0, 0] = round(control.count() * control.mean(self.column)) 135 | observed[1, 0] = control.count() - observed[0, 0] 136 | observed[0, 1] = round(treatment.count() * treatment.mean(self.column)) 137 | observed[1, 1] = treatment.count() - observed[0, 1] 138 | res = scipy.stats.chi2_contingency( 139 | observed=observed, 140 | correction=self.correction, 141 | lambda_=int(self.method == "pearson"), 142 | ) 143 | return ProportionResult( 144 | control=control.mean(self.column), 145 | treatment=treatment.mean(self.column), 146 | effect_size=treatment.mean(self.column) - control.mean(self.column), 147 | rel_effect_size=treatment.mean(self.column)/control.mean(self.column) - 1, 148 | pvalue=res.pvalue, 149 | statistic=res.statistic, 150 | ) 151 | return (Proportion,) 152 | 153 | 154 | @app.cell(hide_code=True) 155 | def _(mo): 156 | mo.md( 157 | r""" 158 | Method `__init__` saves metric parameters to be used in the analysis. You can use utility functions [`check_scalar`](https://tea-tasting.e10v.me/api/utils/#tea_tasting.utils.check_scalar) and [`auto_check`](https://tea-tasting.e10v.me/api/utils/#tea_tasting.utils.auto_check) to check parameter values. 159 | 160 | Property `aggr_cols` returns an instance of [`AggrCols`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.AggrCols). The analysis of a proportion requires the number of rows (`has_count=True`) and the average value for the column of interest (`mean_cols=(self.column,)`) for each variant. 161 | 162 | Method `analyze_aggregates` accepts two parameters: `control` and `treatment` data as instances of class [`Aggregates`](https://tea-tasting.e10v.me/api/aggr/#tea_tasting.aggr.Aggregates). They contain values for statistics and columns specified in `aggr_cols`.
163 | 164 | Method `analyze_aggregates` returns an instance of `ProportionResult`, defined earlier, with the analysis result. 165 | 166 | Now we can analyze the proportion of users who created at least one order during the experiment. For comparison, let's also add a metric that performs a Z-test on the same column. 167 | """ 168 | ) 169 | return 170 | 171 | 172 | @app.cell 173 | def _(Proportion, data, tt): 174 | experiment_prop = tt.Experiment( 175 | prop_users_with_orders=Proportion("has_order"), 176 | mean_users_with_orders=tt.Mean("has_order", use_t=False), 177 | ) 178 | experiment_prop.analyze(data) 179 | return 180 | 181 | 182 | @app.cell(hide_code=True) 183 | def _(mo): 184 | mo.md( 185 | r""" 186 | ## Metrics based on granular data 187 | 188 | Now let's define a metric that performs the Mann-Whitney U test. While it's possible to use the aggregated sum of ranks for the test, this example uses granular data for analysis. 189 | 190 | The result class: 191 | """ 192 | ) 193 | return 194 | 195 | 196 | @app.cell 197 | def _(NamedTuple): 198 | class MannWhitneyUResult(NamedTuple): 199 | pvalue: float 200 | statistic: float 201 | return (MannWhitneyUResult,) 202 | 203 | 204 | @app.cell(hide_code=True) 205 | def _(mo): 206 | mo.md( 207 | r""" 208 | A metric that analyzes granular data should be a subclass of [`MetricBaseGranular`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricBaseGranular). `MetricBaseGranular` is a generic class with the result class as a type variable. 209 | 210 | The metric should have the following methods and properties defined: 211 | 212 | - Method `__init__` checks and saves metric parameters. 213 | - Property `cols` returns the columns to be fetched for the analysis. 214 | - Method `analyze_granular` analyzes the metric using granular data. 215 | """ 216 | ) 217 | return 218 | 219 | 220 | @app.cell 221 | def _(Literal, MannWhitneyUResult, pa, scipy, tea_tasting): 222 | class MannWhitneyU(tea_tasting.metrics.MetricBaseGranular[MannWhitneyUResult]): 223 | def __init__( 224 | self, 225 | column: str, 226 | *, 227 | correction: bool = True, 228 | alternative: Literal["two-sided", "less", "greater"] | None = None, 229 | ) -> None: 230 | self.column = tea_tasting.utils.check_scalar(column, "column", typ=str) 231 | self.correction = tea_tasting.utils.auto_check(correction, "correction") 232 | self.alternative = ( 233 | tea_tasting.utils.auto_check(alternative, "alternative") 234 | if alternative is not None 235 | else tea_tasting.config.get_config("alternative") 236 | ) 237 | @property 238 | def cols(self) -> tuple[str]: 239 | return (self.column,) 240 | def analyze_granular( 241 | self, 242 | control: pa.Table, 243 | treatment: pa.Table, 244 | ) -> MannWhitneyUResult: 245 | res = scipy.stats.mannwhitneyu( 246 | treatment[self.column].combine_chunks().to_numpy(zero_copy_only=False), 247 | control[self.column].combine_chunks().to_numpy(zero_copy_only=False), 248 | use_continuity=self.correction, 249 | alternative=self.alternative, 250 | ) 251 | return MannWhitneyUResult( 252 | pvalue=res.pvalue, 253 | statistic=res.statistic, 254 | ) 255 | return (MannWhitneyU,) 256 | 257 | 258 | @app.cell(hide_code=True) 259 | def _(mo): 260 | mo.md( 261 | r""" 262 | Property `cols` should return a sequence of strings. 263 | 264 | Method `analyze_granular` accepts two parameters: control and treatment data as PyArrow Tables.
Even with a [data backend](https://tea-tasting.e10v.me/data-backends/) different from PyArrow, tea-tasting will retrieve the data and transform it into a PyArrow Table. 265 | 266 | Method `analyze_granular` returns an instance of `MannWhitneyUResult`, defined earlier, with the analysis result. 267 | 268 | Now we can perform the Mann-Whitney U test: 269 | """ 270 | ) 271 | return 272 | 273 | 274 | @app.cell 275 | def _(MannWhitneyU, data, tt): 276 | experiment_mwu = tt.Experiment( 277 | mwu_orders=MannWhitneyU("orders"), 278 | mwu_revenue=MannWhitneyU("revenue"), 279 | ) 280 | result_mwu = experiment_mwu.analyze(data) 281 | result_mwu.with_keys(("metric", "pvalue", "statistic")) 282 | return 283 | 284 | 285 | @app.cell(hide_code=True) 286 | def _(mo): 287 | mo.md( 288 | r""" 289 | ## Analyzing two types of metrics together 290 | 291 | It's also possible to analyze two types of metrics in one experiment: 292 | """ 293 | ) 294 | return 295 | 296 | 297 | @app.cell 298 | def _(MannWhitneyU, Proportion, data, tt): 299 | experiment = tt.Experiment( 300 | prop_users_with_orders=Proportion("has_order"), 301 | mean_users_with_orders=tt.Mean("has_order"), 302 | mwu_orders=MannWhitneyU("orders"), 303 | mwu_revenue=MannWhitneyU("revenue"), 304 | ) 305 | experiment.analyze(data) 306 | return 307 | 308 | 309 | @app.cell(hide_code=True) 310 | def _(mo): 311 | mo.md( 312 | r""" 313 | In this case, tea-tasting performs two queries on the experimental data: 314 | 315 | - One with the aggregated statistics required for the analysis of metrics of type `MetricBaseAggregated`. 316 | - One with the detailed data and the columns required for the analysis of metrics of type `MetricBaseGranular`. 317 | 318 | ## Recommendations 319 | 320 | Follow these recommendations when defining custom metrics: 321 | 322 | - Use parameter and attribute names consistent with the ones that are already defined in tea-tasting. For example, use `pvalue` instead of `p_value` or `correction` instead of `use_continuity`. 323 | - End confidence interval boundary names with `"_ci_lower"` and `"_ci_upper"`. 324 | - During initialization, save parameter values in metric attributes using the same names. For example, use `self.correction = correction` instead of `self.use_continuity = correction`. 325 | - Use global settings as default values for standard parameters, such as `alternative` or `confidence_level`. See the [reference](https://tea-tasting.e10v.me/api/config/#tea_tasting.config.config_context) for the full list of standard parameters. You can also define and use your own global parameters.
326 | """ 327 | ) 328 | return 329 | 330 | 331 | @app.cell(hide_code=True) 332 | def _(): 333 | import marimo as mo 334 | return (mo,) 335 | 336 | 337 | if __name__ == "__main__": 338 | app.run() 339 | -------------------------------------------------------------------------------- /examples/data-backends.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "ibis-framework[duckdb]", 5 | # "marimo", 6 | # "polars", 7 | # "tea-tasting", 8 | # ] 9 | # [tool.marimo.display] 10 | # cell_output = "below" 11 | # /// 12 | 13 | import marimo 14 | 15 | __generated_with = "0.13.6" 16 | app = marimo.App() 17 | 18 | 19 | @app.cell(hide_code=True) 20 | def _(mo): 21 | mo.md( 22 | r""" 23 | # Data backends 24 | 25 | ## Intro 26 | 27 | tea-tasting supports a wide range of data backends such as BigQuery, ClickHouse, DuckDB, PostgreSQL, Snowflake, Spark, and many other backends supported by [Ibis](https://github.com/ibis-project/ibis). Ibis is a DataFrame API to various data backends. 28 | 29 | Many statistical tests, such as the Student's t-test or the Z-test, require only aggregated data for analysis. For these tests, tea-tasting retrieves only aggregated statistics like mean and variance instead of downloading all detailed data. 30 | 31 | For example, if the raw experimental data are stored in ClickHouse, it's faster and more efficient to calculate counts, averages, variances, and covariances directly in ClickHouse rather than fetching granular data and performing aggregations in a Python environment. 32 | 33 | tea-tasting also accepts dataframes supported by [Narwhals](https://github.com/narwhals-dev/narwhals): cuDF, Dask, Modin, pandas, Polars, PyArrow. Narwhals is a compatibility layer between dataframe libraries. 34 | 35 | This guide: 36 | 37 | - Shows how to use tea-tasting with a data backend of your choice for the analysis of an experiment. 38 | - Explains some internals of how tea-tasting uses Ibis to work with data backends. 39 | 40 | ## Demo database 41 | 42 | /// admonition | Note 43 | 44 | This guide uses [DuckDB](https://github.com/duckdb/duckdb), an in-process analytical database, and [Polars](https://github.com/pola-rs/polars) as example data backends. Install these packages in addition to tea-tasting to reproduce the examples: 45 | 46 | ```bash 47 | uv pip install ibis-framework[duckdb] polars 48 | ``` 49 | 50 | /// 51 | 52 | First, let's prepare a demo database: 53 | """ 54 | ) 55 | return 56 | 57 | 58 | @app.cell 59 | def _(): 60 | import ibis 61 | import polars as pl 62 | import tea_tasting as tt 63 | 64 | users_data = tt.make_users_data(seed=42) 65 | con = ibis.connect("duckdb://") 66 | con.create_table("users_data", users_data) 67 | return con, ibis, pl, tt, users_data 68 | 69 | 70 | @app.cell(hide_code=True) 71 | def _(mo): 72 | mo.md( 73 | r""" 74 | In the example above: 75 | 76 | - Function `tt.make_users_data` returns a PyArrow Table with example experimental data. 77 | - Function `ibis.connect` creates a DuckDB in-process database using the Ibis API. 78 | - Method `con.create_table` creates and populates a table in the database based on the PyArrow Table. 79 | 80 | See the [Ibis documentation on how to create connections](https://ibis-project.org/reference/connection) to other data backends.
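For instance, a connection to a PostgreSQL backend might look like the following sketch (the connection parameters are illustrative placeholders, and the `ibis-framework[postgres]` extra is required): ```python con_pg = ibis.postgres.connect( host="localhost", # placeholder connection details, not a real database user="username", password="password", database="mydb", ) ```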
81 | 82 | ## Querying experimental data 83 | 84 | Method `con.create_table` in the example above returns an Ibis Table which can already be used in the analysis of the experiment. But let's see how to use an SQL query to create an Ibis Table: 85 | """ 86 | ) 87 | return 88 | 89 | 90 | @app.cell 91 | def _(con): 92 | data = con.sql("select * from users_data") 93 | data 94 | return (data,) 95 | 96 | 97 | @app.cell(hide_code=True) 98 | def _(mo): 99 | mo.md( 100 | r""" 101 | It's a very simple query. In the real world, you might need to use joins, aggregations, and CTEs to get the data. You can define any SQL query supported by your data backend and use it to create an Ibis Table. 102 | 103 | Keep in mind that tea-tasting assumes that: 104 | 105 | - Data is grouped by randomization units, such as individual users. 106 | - There is a column indicating the variant of the A/B test (typically labeled as A, B, etc.). 107 | - All necessary columns for metric calculations (like the number of orders, revenue, etc.) are included in the table. 108 | 109 | An Ibis Table is a lazy object. It doesn't fetch the data when created. You can use the Ibis DataFrame API to query the table and fetch the result: 110 | """ 111 | ) 112 | return 113 | 114 | 115 | @app.cell 116 | def _(data, ibis): 117 | ibis.options.interactive = True 118 | print(data.head(5)) 119 | 120 | ibis.options.interactive = False 121 | return 122 | 123 | 124 | @app.cell(hide_code=True) 125 | def _(mo): 126 | mo.md( 127 | r""" 128 | ## Ibis example 129 | 130 | To better understand what Ibis does, let's consider the example with grouping and aggregation by variants: 131 | """ 132 | ) 133 | return 134 | 135 | 136 | @app.cell 137 | def _(data): 138 | aggr_data = data.group_by("variant").aggregate( 139 | sessions_per_user=data.sessions.mean(), 140 | orders_per_session=data.orders.mean() / data.sessions.mean(), 141 | orders_per_user=data.orders.mean(), 142 | revenue_per_user=data.revenue.mean(), 143 | ) 144 | aggr_data 145 | return (aggr_data,) 146 | 147 | 148 | @app.cell(hide_code=True) 149 | def _(mo): 150 | mo.md( 151 | r""" 152 | `aggr_data` is another Ibis Table defined as a query over the previously defined `data`. Let's fetch the result: 153 | """ 154 | ) 155 | return 156 | 157 | 158 | @app.cell 159 | def _(aggr_data, ibis): 160 | ibis.options.interactive = True 161 | print(aggr_data) 162 | 163 | ibis.options.interactive = False 164 | return 165 | 166 | 167 | @app.cell(hide_code=True) 168 | def _(mo): 169 | mo.md( 170 | r""" 171 | Internally, Ibis compiles a Table to an SQL query supported by the backend: 172 | """ 173 | ) 174 | return 175 | 176 | 177 | @app.cell 178 | def _(aggr_data): 179 | print(aggr_data.compile(pretty=True)) 180 | return 181 | 182 | 183 | @app.cell(hide_code=True) 184 | def _(mo): 185 | mo.md( 186 | r""" 187 | See the [Ibis documentation](https://ibis-project.org/tutorials/getting_started) for more details. 188 | 189 | ## Experiment analysis 190 | 191 | The example above shows how to query the metric averages. But for statistical inference, it's not enough. For example, Student's t-test and Z-test also require the number of rows and the variance. Additionally, analysis of ratio metrics and variance reduction with CUPED requires covariances. 192 | 193 | Querying all the required statistics manually can be a daunting and error-prone task. But don't worry—tea-tasting does this work for you.
You just need to specify the metrics: 194 | """ 195 | ) 196 | return 197 | 198 | 199 | @app.cell 200 | def _(data, tt): 201 | experiment = tt.Experiment( 202 | sessions_per_user=tt.Mean("sessions"), 203 | orders_per_session=tt.RatioOfMeans("orders", "sessions"), 204 | orders_per_user=tt.Mean("orders"), 205 | revenue_per_user=tt.Mean("revenue"), 206 | ) 207 | result = experiment.analyze(data) 208 | result 209 | return (experiment,) 210 | 211 | 212 | @app.cell(hide_code=True) 213 | def _(mo): 214 | mo.md( 215 | r""" 216 | In the example above, tea-tasting fetches all the required statistics with a single query and then uses them to analyze the experiment. 217 | 218 | Some statistical methods, like bootstrap, require granular data for analysis. In this case, tea-tasting fetches the detailed data as well. 219 | 220 | ## Example with CUPED 221 | 222 | An example of a slightly more complicated analysis using variance reduction with CUPED: 223 | """ 224 | ) 225 | return 226 | 227 | 228 | @app.cell 229 | def _(con, tt): 230 | users_data_cuped = tt.make_users_data(seed=42, covariates=True) 231 | con.create_table("users_data_cuped", users_data_cuped) 232 | 233 | data_cuped = con.sql("select * from users_data_cuped") 234 | experiment_cuped = tt.Experiment( 235 | sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 236 | orders_per_session=tt.RatioOfMeans( 237 | numer="orders", 238 | denom="sessions", 239 | numer_covariate="orders_covariate", 240 | denom_covariate="sessions_covariate", 241 | ), 242 | orders_per_user=tt.Mean("orders", "orders_covariate"), 243 | revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 244 | ) 245 | result_cuped = experiment_cuped.analyze(data_cuped) 246 | result_cuped 247 | return 248 | 249 | 250 | @app.cell(hide_code=True) 251 | def _(mo): 252 | mo.md( 253 | r""" 254 | ## Polars example 255 | 256 | Here’s an example of how to analyze data using a Polars DataFrame: 257 | """ 258 | ) 259 | return 260 | 261 | 262 | @app.cell 263 | def _(experiment, pl, users_data): 264 | data_polars = pl.from_arrow(users_data) 265 | experiment.analyze(data_polars) 266 | return 267 | 268 | 269 | @app.cell(hide_code=True) 270 | def _(mo): 271 | mo.md( 272 | r""" 273 | 274 | """ 275 | ) 276 | return 277 | 278 | 279 | @app.cell(hide_code=True) 280 | def _(): 281 | import marimo as mo 282 | return (mo,) 283 | 284 | 285 | if __name__ == "__main__": 286 | app.run() 287 | -------------------------------------------------------------------------------- /examples/multiple-testing.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "polars", 6 | # "tea-tasting", 7 | # ] 8 | # [tool.marimo.display] 9 | # cell_output = "below" 10 | # /// 11 | 12 | import marimo 13 | 14 | __generated_with = "0.13.6" 15 | app = marimo.App() 16 | 17 | 18 | @app.cell(hide_code=True) 19 | def _(mo): 20 | mo.md( 21 | r""" 22 | # Multiple testing 23 | 24 | ## Multiple hypothesis testing problem 25 | 26 | /// admonition | Note 27 | 28 | This guide uses [Polars](https://github.com/pola-rs/polars) as an example data backend. Install Polars in addition to tea-tasting to reproduce the examples: 29 | 30 | ```bash 31 | uv pip install polars 32 | ``` 33 | 34 | /// 35 | 36 | The [multiple hypothesis testing problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) arises when there is more than one success metric or more than one treatment variant in an A/B test. 
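For instance, if four independent hypotheses are each tested at a significance level of 0.05 and all null hypotheses are true, the probability of at least one false positive is 1 - (1 - 0.05)^4 ≈ 18.5%.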
37 | 38 | tea-tasting provides the following methods for multiple testing correction: 39 | 40 | - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate) (FDR) controlling procedures: 41 | - Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses. 42 | - Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses. 43 | - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate) (FWER) controlling procedures: 44 | - Hochberg's step-up procedure, assuming non-negative correlation between hypotheses. 45 | - Holm's step-down procedure, assuming arbitrary dependence between hypotheses. 46 | 47 | As an example, consider an experiment with three variants, a control and two treatments: 48 | """ 49 | ) 50 | return 51 | 52 | 53 | @app.cell 54 | def _(): 55 | import polars as pl 56 | import tea_tasting as tt 57 | 58 | data = pl.concat(( 59 | tt.make_users_data( 60 | seed=42, 61 | orders_uplift=0.10, 62 | revenue_uplift=0.15, 63 | return_type="polars", 64 | ), 65 | tt.make_users_data( 66 | seed=21, 67 | orders_uplift=0.15, 68 | revenue_uplift=0.20, 69 | return_type="polars", 70 | ) 71 | .filter(pl.col("variant").eq(1)) 72 | .with_columns(variant=pl.lit(2, pl.Int64)), 73 | )) 74 | data 75 | return data, tt 76 | 77 | 78 | @app.cell(hide_code=True) 79 | def _(mo): 80 | mo.md( 81 | r""" 82 | Let's calculate the experiment results: 83 | """ 84 | ) 85 | return 86 | 87 | 88 | @app.cell 89 | def _(data, tt): 90 | experiment = tt.Experiment( 91 | sessions_per_user=tt.Mean("sessions"), 92 | orders_per_session=tt.RatioOfMeans("orders", "sessions"), 93 | orders_per_user=tt.Mean("orders"), 94 | revenue_per_user=tt.Mean("revenue"), 95 | ) 96 | results = experiment.analyze(data, control=0, all_variants=True) 97 | results 98 | return experiment, results 99 | 100 | 101 | @app.cell(hide_code=True) 102 | def _(mo): 103 | mo.md( 104 | r""" 105 | Suppose only the two metrics `orders_per_user` and `revenue_per_user` are considered as success metrics, while the other two metrics `sessions_per_user` and `orders_per_session` are second-order diagnostic metrics. 106 | """ 107 | ) 108 | return 109 | 110 | 111 | @app.cell 112 | def _(): 113 | metrics = {"orders_per_user", "revenue_per_user"} 114 | return (metrics,) 115 | 116 | 117 | @app.cell(hide_code=True) 118 | def _(mo): 119 | mo.md( 120 | r""" 121 | With two treatment variants and two success metrics, there are four hypotheses in total, which increases the probability of false positives (also called "false discoveries"). It's recommended to adjust the p-values or the significance level (alpha) in this case. Let's explore the correction methods provided by tea-tasting. 122 | 123 | ## False discovery rate 124 | 125 | False discovery rate (FDR) is the expected value of the proportion of false discoveries among the discoveries (rejections of the null hypothesis). To control for FDR, use the [`adjust_fdr`](https://tea-tasting.e10v.me/api/multiplicity/#tea_tasting.multiplicity.adjust_fdr) method: 126 | """ 127 | ) 128 | return 129 | 130 | 131 | @app.cell 132 | def _(metrics, results, tt): 133 | adjusted_results_fdr = tt.adjust_fdr(results, metrics) 134 | adjusted_results_fdr 135 | return (adjusted_results_fdr,) 136 | 137 | 138 | @app.cell(hide_code=True) 139 | def _(mo): 140 | mo.md( 141 | r""" 142 | The method adjusts p-values and saves them as `pvalue_adj`. Compare these values to the desired significance level alpha to determine if the null hypotheses can be rejected. 
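For instance, here's a minimal sketch of collecting the rejected hypotheses, assuming the default `alpha=0.05` and assuming that the adjusted results provide `to_dicts`, like other tea-tasting result objects: ```python alpha = 0.05 rejected = [ (r["comparison"], r["metric"]) for r in adjusted_results_fdr.to_dicts() # assumed serialization method if r["pvalue_adj"] <= alpha ] ```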
143 | 144 | The method also adjusts the significance level alpha and saves it as `alpha_adj`. Compare the unadjusted p-values (`pvalue`) to `alpha_adj` to determine if the null hypotheses can be rejected: 145 | """ 146 | ) 147 | return 148 | 149 | 150 | @app.cell 151 | def _(adjusted_results_fdr): 152 | adjusted_results_fdr.with_keys(( 153 | "comparison", 154 | "metric", 155 | "control", 156 | "treatment", 157 | "rel_effect_size", 158 | "pvalue", 159 | "alpha_adj", 160 | )) 161 | return 162 | 163 | 164 | @app.cell(hide_code=True) 165 | def _(mo): 166 | mo.md( 167 | r""" 168 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs the Benjamini-Hochberg procedure. To perform the Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`: 169 | """ 170 | ) 171 | return 172 | 173 | 174 | @app.cell 175 | def _(metrics, results, tt): 176 | tt.adjust_fdr(results, metrics, arbitrary_dependence=True) 177 | return 178 | 179 | 180 | @app.cell(hide_code=True) 181 | def _(mo): 182 | mo.md( 183 | r""" 184 | ## Family-wise error rate 185 | 186 | Family-wise error rate (FWER) is the probability of making at least one type I error. To control for FWER, use the [`adjust_fwer`](https://tea-tasting.e10v.me/api/multiplicity/#tea_tasting.multiplicity.adjust_fwer) method: 187 | """ 188 | ) 189 | return 190 | 191 | 192 | @app.cell 193 | def _(metrics, results, tt): 194 | tt.adjust_fwer(results, metrics) 195 | return 196 | 197 | 198 | @app.cell(hide_code=True) 199 | def _(mo): 200 | mo.md( 201 | r""" 202 | By default, tea-tasting assumes non-negative correlation between hypotheses and performs Hochberg's step-up procedure with the Šidák correction, which is slightly more powerful than the Bonferroni correction. 203 | 204 | To perform Holm's step-down procedure, assuming arbitrary dependence between hypotheses, set the `arbitrary_dependence` parameter to `True`. In this case, it's recommended to use the Bonferroni correction, since the Šidák correction assumes non-negative correlation between hypotheses: 205 | """ 206 | ) 207 | return 208 | 209 | 210 | @app.cell 211 | def _(metrics, results, tt): 212 | tt.adjust_fwer( 213 | results, 214 | metrics, 215 | arbitrary_dependence=True, 216 | method="bonferroni", 217 | ) 218 | return 219 | 220 | 221 | @app.cell(hide_code=True) 222 | def _(mo): 223 | mo.md( 224 | r""" 225 | ## Other inputs 226 | 227 | In the examples above, the methods `adjust_fdr` and `adjust_fwer` received results from a *single experiment* with *more than two variants*.
They can also accept the results from *multiple experiments* with *two variants* in each: 228 | """ 229 | ) 230 | return 231 | 232 | 233 | @app.cell 234 | def _(experiment, metrics, tt): 235 | data1 = tt.make_users_data(seed=42, orders_uplift=0.10, revenue_uplift=0.15) 236 | data2 = tt.make_users_data(seed=21, orders_uplift=0.15, revenue_uplift=0.20) 237 | result1 = experiment.analyze(data1) 238 | result2 = experiment.analyze(data2) 239 | tt.adjust_fdr( 240 | {"Experiment 1": result1, "Experiment 2": result2}, 241 | metrics, 242 | ) 243 | return (result2,) 244 | 245 | 246 | @app.cell(hide_code=True) 247 | def _(mo): 248 | mo.md( 249 | r""" 250 | The methods `adjust_fdr` and `adjust_fwer` can also accept the result of *a single experiment with two variants*: 251 | """ 252 | ) 253 | return 254 | 255 | 256 | @app.cell 257 | def _(metrics, result2, tt): 258 | tt.adjust_fwer(result2, metrics) 259 | return 260 | 261 | 262 | @app.cell(hide_code=True) 263 | def _(mo): 264 | mo.md( 265 | r""" 266 | 267 | """ 268 | ) 269 | return 270 | 271 | 272 | @app.cell(hide_code=True) 273 | def _(): 274 | import marimo as mo 275 | return (mo,) 276 | 277 | 278 | if __name__ == "__main__": 279 | app.run() 280 | -------------------------------------------------------------------------------- /examples/power-analysis.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "tea-tasting", 6 | # ] 7 | # [tool.marimo.display] 8 | # cell_output = "below" 9 | # /// 10 | 11 | import marimo 12 | 13 | __generated_with = "0.13.6" 14 | app = marimo.App() 15 | 16 | 17 | @app.cell(hide_code=True) 18 | def _(mo): 19 | mo.md( 20 | r""" 21 | # Power analysis 22 | 23 | In tea-tasting, you can analyze the statistical power for `Mean` and `RatioOfMeans` metrics. There are three possible options: 24 | 25 | - Calculate the effect size, given statistical power and the total number of observations. 26 | - Calculate the total number of observations, given statistical power and the effect size. 27 | - Calculate statistical power, given the effect size and the total number of observations. 28 | 29 | In this example, tea-tasting calculates statistical power given the relative effect size and the number of observations: 30 | """ 31 | ) 32 | return 33 | 34 | 35 | @app.cell 36 | def _(): 37 | import tea_tasting as tt 38 | 39 | data = tt.make_users_data( 40 | seed=42, 41 | sessions_uplift=0, 42 | orders_uplift=0, 43 | revenue_uplift=0, 44 | covariates=True, 45 | ) 46 | orders_per_session = tt.RatioOfMeans("orders", "sessions", rel_effect_size=0.1) 47 | orders_per_session.solve_power(data, "power") 48 | return data, tt 49 | 50 | 51 | @app.cell(hide_code=True) 52 | def _(mo): 53 | mo.md( 54 | r""" 55 | Besides `alternative`, `equal_var`, `use_t`, and covariates (CUPED), the following metric parameters affect the result: 56 | 57 | - `alpha`: Significance level. 58 | - `ratio`: Ratio of the number of observations in the treatment relative to the control. 59 | - `power`: Statistical power. 60 | - `effect_size` and `rel_effect_size`: Absolute and relative effect size. Only one of them can be defined. 61 | - `n_obs`: Number of observations in the control and in the treatment together. If the number of observations is not set explicitly, it's inferred from the dataset. 
62 | 63 | You can change the default values of `alpha`, `ratio`, `power`, and `n_obs` using the [global settings](https://tea-tasting.e10v.me/user-guide/#global-settings). 64 | 65 | tea-tasting can analyze power for several values of parameters `effect_size`, `rel_effect_size`, or `n_obs`. Example: 66 | """ 67 | ) 68 | return 69 | 70 | 71 | @app.cell 72 | def _(data, tt): 73 | orders_per_user = tt.Mean("orders", alpha=0.1, power=0.7, n_obs=(10_000, 20_000)) 74 | orders_per_user.solve_power(data, "rel_effect_size") 75 | return 76 | 77 | 78 | @app.cell(hide_code=True) 79 | def _(mo): 80 | mo.md( 81 | r""" 82 | You can analyze power for all metrics in the experiment. Example: 83 | """ 84 | ) 85 | return 86 | 87 | 88 | @app.cell 89 | def _(data, tt): 90 | with tt.config_context(n_obs=(10_000, 20_000)): 91 | experiment = tt.Experiment( 92 | sessions_per_user=tt.Mean("sessions", "sessions_covariate"), 93 | orders_per_session=tt.RatioOfMeans( 94 | numer="orders", 95 | denom="sessions", 96 | numer_covariate="orders_covariate", 97 | denom_covariate="sessions_covariate", 98 | ), 99 | orders_per_user=tt.Mean("orders", "orders_covariate"), 100 | revenue_per_user=tt.Mean("revenue", "revenue_covariate"), 101 | ) 102 | 103 | power_result = experiment.solve_power(data) 104 | power_result 105 | return 106 | 107 | 108 | @app.cell(hide_code=True) 109 | def _(mo): 110 | mo.md( 111 | r""" 112 | In the example above, tea-tasting calculates both the relative and absolute effect size for all metrics for two possible sample size values, `10_000` and `20_000`. 113 | 114 | The `solve_power` methods of a [metric](https://tea-tasting.e10v.me/api/metrics/mean/#tea_tasting.metrics.mean.Mean.solve_power) and of an [experiment](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.Experiment.solve_power) return instances of [`MetricPowerResults`](https://tea-tasting.e10v.me/api/metrics/base/#tea_tasting.metrics.base.MetricPowerResults) and [`ExperimentPowerResult`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.ExperimentPowerResult) respectively. These result classes provide serialization methods similar to those of the experiment result: `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. They are also rendered as HTML tables in IPython and Jupyter, and as tables in marimo notebooks. 115 | """ 116 | ) 117 | return 118 | 119 | 120 | @app.cell(hide_code=True) 121 | def _(): 122 | import marimo as mo 123 | return (mo,) 124 | 125 | 126 | if __name__ == "__main__": 127 | app.run() 128 | -------------------------------------------------------------------------------- /examples/simulated-experiments.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.10" 3 | # dependencies = [ 4 | # "marimo", 5 | # "polars", 6 | # "tea-tasting", 7 | # ] 8 | # [tool.marimo.display] 9 | # cell_output = "below" 10 | # /// 11 | 12 | import marimo 13 | 14 | __generated_with = "0.13.6" 15 | app = marimo.App() 16 | 17 | 18 | @app.cell(hide_code=True) 19 | def _(mo): 20 | mo.md( 21 | r""" 22 | # Simulated experiments 23 | 24 | ## Intro 25 | 26 | In tea-tasting, you can run multiple simulated A/A or A/B tests. In each simulation, tea-tasting splits the data into control and treatment groups and can optionally modify the treatment data. A simulation without changing the treatment data is called an A/A test.
27 | 
28 | A/A tests are useful for identifying potential issues before conducting the actual A/B test. Treatment simulations are great for power analysis, especially when you need a specific uplift distribution or when an analytical formula doesn’t exist. 
29 | 
30 | /// admonition | Note 
31 | 
32 | This guide uses [Polars](https://github.com/pola-rs/polars) and [marimo](https://github.com/marimo-team/marimo). Install these packages in addition to tea-tasting to reproduce the examples: 
33 | 
34 | ```bash 
35 | uv pip install polars marimo 
36 | ``` 
37 | 
38 | /// 
39 | 
40 | ## Running A/A tests 
41 | 
42 | First, let's prepare the data without any uplift and drop the `"variant"` column. 
43 |         """ 
44 |     ) 
45 |     return 
46 | 
47 | 
48 | @app.cell 
49 | def _(): 
50 |     import polars as pl 
51 |     import tea_tasting as tt 
52 | 
53 |     data = ( 
54 |         tt.make_users_data(seed=42, orders_uplift=0, revenue_uplift=0) 
55 |         .drop_columns("variant") 
56 |     ) 
57 |     data 
58 |     return data, pl, tt 
59 | 
60 | 
61 | @app.cell(hide_code=True) 
62 | def _(mo): 
63 |     mo.md( 
64 |         r""" 
65 | To run A/A tests, first define the metrics for the experiment, then call the [`simulate`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.Experiment.simulate) method, providing the data and the number of simulations as arguments. 
66 |         """ 
67 |     ) 
68 |     return 
69 | 
70 | 
71 | @app.cell 
72 | def _(data, tt): 
73 |     experiment = tt.Experiment( 
74 |         sessions_per_user=tt.Mean("sessions"), 
75 |         orders_per_session=tt.RatioOfMeans("orders", "sessions"), 
76 |         orders_per_user=tt.Mean("orders"), 
77 |         revenue_per_user=tt.Mean("revenue"), 
78 |         n_users=tt.SampleRatio(), 
79 |     ) 
80 |     results = experiment.simulate(data, 100, seed=42) 
81 |     results_data = results.to_polars() 
82 |     results_data.select( 
83 |         "metric", 
84 |         "control", 
85 |         "treatment", 
86 |         "rel_effect_size", 
87 |         "rel_effect_size_ci_lower", 
88 |         "rel_effect_size_ci_upper", 
89 |         "pvalue", 
90 |     ) 
91 |     return experiment, results_data 
92 | 
93 | 
94 | @app.cell(hide_code=True) 
95 | def _(mo): 
96 |     mo.md( 
97 |         r""" 
98 | The `simulate` method accepts data in the same formats as the `analyze` method. Internally, however, it converts the data to a PyArrow Table before running the simulations. 
99 | 
100 | The method returns an instance of the [`SimulationResults`](https://tea-tasting.e10v.me/api/experiment/#tea_tasting.experiment.SimulationResults) class, which contains the results of all simulations for all metrics. The resulting object provides serialization methods similar to those of the experiment result, including `to_dicts`, `to_arrow`, `to_pandas`, `to_polars`, `to_pretty_dicts`, `to_string`, `to_html`. 
101 | 
102 | For instance, we can now calculate the proportion of rejected null hypotheses, using various significance levels (`alpha`). In A/A tests, this proportion estimates the type I error rate. 
103 |         """ 
104 |     ) 
105 |     return 
106 | 
107 | 
108 | @app.cell 
109 | def _(pl, results_data): 
110 |     def null_rejected( 
111 |         results_data: pl.DataFrame, 
112 |         alphas: tuple[float, ...] = (0.01, 0.02, 0.05), 
113 |     ) -> pl.DataFrame: 
114 |         return results_data.group_by("metric", maintain_order=True).agg( 
115 |             pl.col("pvalue").le(alpha).mean().alias(f"null_rejected_{alpha}") 
116 |             for alpha in alphas 
117 |         ) 
118 | 
119 |     null_rejected(results_data) 
120 |     return (null_rejected,) 
121 | 
122 | 
123 | @app.cell(hide_code=True) 
124 | def _(mo): 
125 |     mo.md( 
126 |         r""" 
127 | 100 simulations, as in the example above, produce a very rough estimate. In practice, a larger number of simulations, such as the default `10_000`, is recommended. 
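
As a rough guide, the standard error of an estimated rejection rate is `sqrt(p * (1 - p) / n_sim)`. With `n_sim = 100` simulations and a true rate of `p = 0.05`, that is about `0.022`, the same order of magnitude as the rate itself; with `n_sim = 10_000`, it drops to about `0.002`.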
128 | 129 | ## Simulating experiments with treatment 130 | 131 | To simulate experiments with treatment, define a treatment function that takes data in the form of a PyArrow Table and returns a PyArrow Table with the modified data: 132 | """ 133 | ) 134 | return 135 | 136 | 137 | @app.cell 138 | def _(data, experiment, null_rejected): 139 | import pyarrow as pa 140 | import pyarrow.compute as pc 141 | 142 | def treat(data: pa.Table) -> pa.Table: 143 | return ( 144 | data.drop_columns(["orders", "revenue"]) 145 | .append_column("orders", pc.multiply(data["orders"], pa.scalar(1.1))) 146 | .append_column("revenue", pc.multiply(data["revenue"], pa.scalar(1.1))) 147 | ) 148 | 149 | results_treat = experiment.simulate(data, 100, seed=42, treat=treat) 150 | null_rejected(results_treat.to_polars()) 151 | return (treat,) 152 | 153 | 154 | @app.cell(hide_code=True) 155 | def _(mo): 156 | mo.md( 157 | r""" 158 | In the example above, we've defined a function that increases the number of orders and the revenue by 10%. For these metrics, the proportion of rejected null hypotheses is an estimate of statistical power. 159 | 160 | ## Using a function instead of static data 161 | 162 | You can use a function instead of static data to generate input dynamically. The function should take an instance of `numpy.random.Generator` as a parameter named `seed` and return experimental data in any format supported by tea-tasting. 163 | 164 | As an example, let's use the `make_users_data` function. 165 | """ 166 | ) 167 | return 168 | 169 | 170 | @app.cell 171 | def _(experiment, null_rejected, tt): 172 | results_data_gen = experiment.simulate(tt.make_users_data, 100, seed=42) 173 | null_rejected(results_data_gen.to_polars()) 174 | return 175 | 176 | 177 | @app.cell(hide_code=True) 178 | def _(mo): 179 | mo.md( 180 | r""" 181 | On each iteration, tea-tasting calls `make_users_data` with a new `seed` and uses the returned data for the analysis of the experiment. The data returned by `make_users_data` already contains the `"variant"` column, so tea-tasting reuses that split. By default, `make_users_data` also adds the treatment uplift, and you can see it in the proportion of rejected null hypotheses. 182 | 183 | ## Tracking progress 184 | 185 | To track the progress of simulations with [`tqdm`](https://github.com/tqdm/tqdm) or [`marimo.status.progress_bar`](https://docs.marimo.io/api/status/#progress-bar), use the `progress` parameter. 186 | """ 187 | ) 188 | return 189 | 190 | 191 | @app.cell 192 | def _(data, experiment, mo): 193 | results_progress = experiment.simulate( 194 | data, 195 | 100, 196 | seed=42, 197 | progress=mo.status.progress_bar, 198 | ) 199 | return 200 | 201 | 202 | @app.cell(hide_code=True) 203 | def _(mo): 204 | mo.md( 205 | r""" 206 | ## Parallel execution 207 | 208 | /// admonition | Note 209 | 210 | The code below won't work in the [marimo online playground](https://docs.marimo.io/guides/publishing/playground/) as it relies on the `multiprocessing` module which is currently [not supported](https://docs.marimo.io/guides/wasm/#limitations) by WASM notebooks. [WASM notebooks](https://docs.marimo.io/guides/wasm/) are the marimo notebooks that run entirely in the browser. 211 | 212 | /// 213 | 214 | To speed up simulations and run them in parallel, use the `map_` parameter with an alternative mapping function. 
215 | """ 216 | ) 217 | return 218 | 219 | 220 | @app.cell 221 | def _(data, experiment, mo, treat): 222 | import concurrent.futures 223 | 224 | with concurrent.futures.ProcessPoolExecutor() as executor: 225 | results_parallel = experiment.simulate( 226 | data, 227 | 100, 228 | seed=42, 229 | treat=treat, 230 | map_=executor.map, 231 | progress=mo.status.progress_bar, 232 | ) 233 | return 234 | 235 | 236 | @app.cell(hide_code=True) 237 | def _(mo): 238 | mo.md( 239 | r""" 240 | As an alternative to [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor), you can use the `map`, `imap`, or `imap_unordered` methods of [`multiprocessing.pool.Pool`](https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool). 241 | 242 | It's also possible to run simulations on a distributed [Dask](https://distributed.dask.org/en/stable/api.html#distributed.Client.map) or [Ray](https://docs.ray.io/en/latest/ray-core/api/doc/ray.util.ActorPool.map.html#ray.util.ActorPool.map) cluster. 243 | """ 244 | ) 245 | return 246 | 247 | 248 | @app.cell(hide_code=True) 249 | def _(): 250 | import marimo as mo 251 | return (mo,) 252 | 253 | 254 | if __name__ == "__main__": 255 | app.run() 256 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "tea-tasting: statistical analysis of A/B tests" 2 | site_url: https://tea-tasting.e10v.me/ 3 | site_description: A Python package for the statistical analysis of A/B tests 4 | site_author: Evgeny Ivanov 5 | copyright: © Evgeny Ivanov
The logo is designed by Freepik 6 | repo_name: e10v/tea-tasting 7 | repo_url: https://github.com/e10v/tea-tasting 8 | 9 | nav: 10 | - Overview: index.md 11 | - User guide: user-guide.md 12 | - Data backends: data-backends.md 13 | - Power analysis: power-analysis.md 14 | - Multiple testing: multiple-testing.md 15 | - Custom metrics: custom-metrics.md 16 | - Simulated experiments: simulated-experiments.md 17 | - API reference: 18 | - API reference: api/index.md 19 | - Metrics: 20 | - Metrics: api/metrics/index.md 21 | - Base: api/metrics/base.md 22 | - Mean: api/metrics/mean.md 23 | - Proportion: api/metrics/proportion.md 24 | - Resampling: api/metrics/resampling.md 25 | - Experiment: api/experiment.md 26 | - Multiplicity: api/multiplicity.md 27 | - Datasets: api/datasets.md 28 | - Global configuration: api/config.md 29 | - Aggregates: api/aggr.md 30 | - Utilities: api/utils.md 31 | 32 | theme: 33 | name: material 34 | palette: 35 | - media: "(prefers-color-scheme)" 36 | toggle: 37 | icon: material/brightness-auto 38 | name: Switch to light mode 39 | - media: "(prefers-color-scheme: light)" 40 | scheme: default 41 | primary: deep orange 42 | accent: deep orange 43 | toggle: 44 | icon: material/brightness-7 45 | name: Switch to dark mode 46 | - media: "(prefers-color-scheme: dark)" 47 | scheme: slate 48 | primary: deep orange 49 | accent: deep orange 50 | toggle: 51 | icon: material/brightness-4 52 | name: Switch to system preference 53 | logo: assets/tea-cup-white.svg 54 | favicon: assets/tea-cup-white-on-black.svg 55 | icon: 56 | repo: fontawesome/brands/github 57 | features: 58 | - content.code.copy 59 | - navigation.indexes 60 | - navigation.instant 61 | - navigation.instant.progress 62 | - navigation.top 63 | - navigation.tracking 64 | - search.highlight 65 | - search.suggest 66 | - toc.follow 67 | 68 | plugins: 69 | - mkdocstrings: 70 | default_handler: python 71 | handlers: 72 | python: 73 | options: 74 | filters: ["!^_"] 75 | heading_level: 1 76 | inherited_members: true 77 | merge_init_into_class: true 78 | show_overloads: false 79 | show_root_heading: true 80 | - search 81 | 82 | markdown_extensions: 83 | - _internal.external_links 84 | - _internal.strip_doctest_artifacts 85 | - pymdownx.blocks.admonition 86 | - pymdownx.superfences 87 | - toc: 88 | permalink: "#" 89 | 90 | extra_css: 91 | - stylesheets/extra.css 92 | 93 | extra_javascript: 94 | - javascripts/override-copy.js 95 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tea-tasting" 3 | dynamic = ["version"] 4 | description = "A Python package for the statistical analysis of A/B tests." 
5 | authors = [ 6 | {name = "Evgeny Ivanov", email = "ivanov.evgeny.n@gmail.com"}, 7 | ] 8 | dependencies = [ 9 | "ibis-framework>=9", 10 | "narwhals>=1.4", 11 | "numpy>=1.25", 12 | "pyarrow>=16", 13 | "scipy>=1.11", 14 | ] 15 | requires-python = ">=3.10" 16 | readme = "README.md" 17 | license = {text = "MIT"} 18 | classifiers = [ 19 | "Development Status :: 5 - Production/Stable", 20 | "Intended Audience :: Developers", 21 | "Intended Audience :: Information Technology", 22 | "Intended Audience :: Science/Research", 23 | "License :: OSI Approved", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | "Programming Language :: Python", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | "Topic :: Scientific/Engineering", 33 | "Topic :: Scientific/Engineering :: Information Analysis", 34 | "Topic :: Scientific/Engineering :: Mathematics", 35 | "Typing :: Typed", 36 | ] 37 | 38 | [project.urls] 39 | homepage = "https://tea-tasting.e10v.me" 40 | documentation = "https://tea-tasting.e10v.me/user-guide" 41 | source = "https://github.com/e10v/tea-tasting" 42 | "release notes" = "https://github.com/e10v/tea-tasting/releases" 43 | 44 | 45 | [dependency-groups] 46 | docs = ["mkdocs-material", "mkdocstrings[python]"] 47 | lint = ["markdown", "marimo", "pyright", "ruff"] 48 | test = [ 49 | "coverage[toml]>=7", 50 | "ibis-framework[duckdb,sqlite]", 51 | "marimo>=0.10", 52 | "pandas>=2", 53 | "polars>=1", 54 | "pytest>=8", 55 | "tqdm>=4", 56 | ] 57 | 58 | 59 | [build-system] 60 | requires = ["pdm-backend"] 61 | build-backend = "pdm.backend" 62 | 63 | 64 | [tool.pdm.build] 65 | excludes = ["src/_*/**/*"] 66 | package-dir = "src" 67 | 68 | [tool.pdm.scripts] 69 | all.composite = ["doctest", "test", "cover", "lint", "type"] 70 | all.keep_going = true 71 | cover = "coverage report -m" 72 | docserv = "mkdocs serve -w docs -w src -w mkdocs.yml" 73 | doctest.cmd = [ 74 | "pytest", 75 | "--doctest-continue-on-failure", 76 | "--doctest-glob=*.md", 77 | "--doctest-modules", 78 | "--ignore=examples/", 79 | "--ignore=tests/", 80 | "--ignore-glob=src/_*", 81 | ] 82 | lint = "ruff check ." 
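# "test" collects coverage data that the "cover" script (above) reports on; both run as part of the "all" composite.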
83 | test = "coverage run -m pytest" 84 | type = "pyright" 85 | 86 | [tool.pdm.version] 87 | source = "scm" 88 | write_to = "tea_tasting/_version.txt" 89 | 90 | 91 | [tool.coverage.run] 92 | source = ["src/tea_tasting"] 93 | [tool.coverage.report] 94 | exclude_lines = ["if TYPE_CHECKING:", "pragma: no cover", "@overload", "@abc.abstractmethod"] 95 | 96 | 97 | [tool.ruff] 98 | extend-exclude = ["examples"] 99 | src = ["src"] 100 | 101 | [tool.ruff.lint] 102 | select = [ 103 | "A", "ANN", "ARG", "B", "BLE", "C4", "C90", "COM", "D", "DOC", "E", "ERA", 104 | "F", "FA", "FBT", "FIX", "FLY", "FURB", "I", "ICN", "INP", "INT", "ISC", 105 | "N", "NPY", "PD", "PERF", "PGH", "PIE", "PL", "PT", "Q", "RET", "RSE", 106 | "RUF", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TC", "TD", "TID", "TRY", 107 | "UP", "W", 108 | ] 109 | ignore = ["ANN401", "PGH003", "SLF001", "TRY003"] 110 | 111 | [tool.ruff.lint.per-file-ignores] 112 | "*/__init__.py" = ["F401"] 113 | "tests/*" = [ 114 | "ANN201", "D", "FBT003", "PLR2004", "PT001", "S101", 115 | ] 116 | 117 | [tool.ruff.lint.isort] 118 | force-sort-within-sections = true 119 | lines-after-imports = 2 120 | 121 | [tool.ruff.lint.pydocstyle] 122 | convention = "google" 123 | 124 | [tool.ruff.lint.pylint] 125 | max-args = 8 126 | 127 | 128 | [tool.pyright] 129 | exclude = ["examples", "**/node_modules", "**/__pycache__", "**/.*"] 130 | typeCheckingMode = "strict" 131 | reportMissingTypeStubs = false 132 | reportPrivateUsage = false 133 | reportUnknownArgumentType = false 134 | reportUnknownMemberType = false 135 | reportUnknownParameterType = false 136 | reportUnknownVariableType = false 137 | -------------------------------------------------------------------------------- /src/_internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/src/_internal/__init__.py -------------------------------------------------------------------------------- /src/_internal/create_examples.py: -------------------------------------------------------------------------------- 1 | """Convert guides to examples as marimo notebooks.""" 2 | # pyright: reportPrivateImportUsage=false 3 | 4 | from __future__ import annotations 5 | 6 | import re 7 | import textwrap 8 | 9 | import marimo._ast.cell 10 | import marimo._convert.utils 11 | 12 | 13 | GUIDES: dict[str, tuple[str, ...]] = { 14 | "user-guide": ("polars",), 15 | "data-backends": ("ibis-framework[duckdb]", "polars"), 16 | "power-analysis": (), 17 | "multiple-testing": ("polars",), 18 | "custom-metrics": (), 19 | "simulated-experiments": ("polars",), 20 | } 21 | 22 | HIDE_CODE = marimo._ast.cell.CellConfig(hide_code=True) 23 | SHOW_CODE = marimo._ast.cell.CellConfig(hide_code=False) 24 | 25 | RE_LINK = re.compile(r"\[([^\]]+)\]\((?!#)([^)]+)\)") 26 | RE_DOCTEST = re.compile(r"\s+# doctest:.*") 27 | 28 | 29 | def convert_guide(name: str, deps: tuple[str, ...]) -> None: 30 | with open(f"docs/{name}.md") as f: 31 | guide_text = f.read() 32 | 33 | sources = [] 34 | cell_configs = [] 35 | for text in guide_text.split("```pycon"): 36 | if len(sources) == 0: 37 | md = text 38 | else: 39 | end_of_code = text.find("```") 40 | md = text[end_of_code + 3:] 41 | sources.append(convert_code(text[:end_of_code])) 42 | cell_configs.append(SHOW_CODE) 43 | 44 | sources.append(marimo._convert.utils.markdown_to_marimo(convert_md(md))) 45 | cell_configs.append(HIDE_CODE) 46 | 47 | sources.append("import marimo as mo") 48 | 
cell_configs.append(HIDE_CODE) 49 | 50 | code = marimo._convert.utils.generate_from_sources( 51 | sources=sources, 52 | cell_configs=cell_configs, 53 | header_comments=create_header_comments(deps), 54 | ) 55 | with open(f"examples/{name}.py", "w") as f: 56 | f.write(code) 57 | 58 | 59 | def convert_code(code: str) -> str: 60 | lines = [] 61 | for line in code.split("\n"): 62 | if line == ">>> import tqdm": 63 | pass 64 | elif line.startswith((">>>", "...")): 65 | lines.append(RE_DOCTEST.sub("", line[4:])) 66 | elif line == "": 67 | lines.append("") 68 | return "\n".join(lines).strip().replace("tqdm.tqdm", "mo.status.progress_bar") 69 | 70 | 71 | def convert_md(md: str) -> str: 72 | return ( 73 | RE_LINK.sub(update_link, md.strip()) 74 | .replace( 75 | "[tqdm](https://github.com/tqdm/tqdm)", 76 | "[marimo](https://github.com/marimo-team/marimo)", 77 | ) 78 | .replace(" tqdm", " marimo") 79 | ) 80 | 81 | 82 | def update_link(match: re.Match[str]) -> str: 83 | label = match.group(1) 84 | url = match.group(2).replace(".md", "/") 85 | root = "" if url.startswith("http") else "https://tea-tasting.e10v.me/" 86 | return f"[{label}]({root}{url})" 87 | 88 | 89 | def create_header_comments(deps: tuple[str, ...]) -> str: 90 | dependencies = "\n".join( 91 | f'# "{dep}",' 92 | for dep in sorted((*deps, "marimo", "tea-tasting")) 93 | ) 94 | return textwrap.dedent(""" 95 | # /// script 96 | # requires-python = ">=3.10" 97 | # dependencies = [ 98 | {dependencies} 99 | # ] 100 | # [tool.marimo.display] 101 | # cell_output = "below" 102 | # /// 103 | """).format(dependencies=dependencies) 104 | 105 | 106 | if __name__ == "__main__": 107 | for name, deps in GUIDES.items(): 108 | convert_guide(name, deps) 109 | -------------------------------------------------------------------------------- /src/_internal/external_links.py: -------------------------------------------------------------------------------- 1 | """Markdown extension that adds target="_blank" and rel="noopener" to external links.""" 2 | # ruff: noqa: N802 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | import urllib.parse 7 | 8 | import markdown 9 | import markdown.extensions 10 | import markdown.treeprocessors 11 | 12 | 13 | if TYPE_CHECKING: 14 | import xml.etree.ElementTree as ET 15 | 16 | 17 | class ExternalLinksTreeprocessor(markdown.treeprocessors.Treeprocessor): 18 | def run(self, root: ET.Element) -> None: 19 | for a in root.iter("a"): 20 | url = urllib.parse.urlparse(a.get("href", "")) 21 | if ( 22 | url.scheme in {"http", "https"} and 23 | url.hostname is not None and 24 | not url.hostname.startswith(("tea-tasting.e10v.me", "127.0.0.1")) 25 | ): 26 | a.set("target", "_blank") 27 | a.set("rel", "noopener") 28 | 29 | class ExternalLinksExtension(markdown.extensions.Extension): 30 | def extendMarkdown(self, md: markdown.Markdown) -> None: 31 | md.treeprocessors.register( 32 | ExternalLinksTreeprocessor(md), 33 | "external_links", 34 | -1000, 35 | ) 36 | 37 | def makeExtension(**kwargs: dict[str, object]) -> ExternalLinksExtension: 38 | return ExternalLinksExtension(**kwargs) 39 | -------------------------------------------------------------------------------- /src/_internal/strip_doctest_artifacts.py: -------------------------------------------------------------------------------- 1 | """Markdown extension that strips doctest artifacts.""" 2 | # ruff: noqa: N802 3 | from __future__ import annotations 4 | 5 | import re 6 | 7 | import markdown 8 | import markdown.extensions 9 | import markdown.preprocessors 
10 | 
11 | 
12 | RE_DOCTEST = re.compile(r"<BLANKLINE>|\s+# doctest:.*") 
13 | 
14 | class StripDoctestArtifactsPreprocessor(markdown.preprocessors.Preprocessor): 
15 |     def run(self, lines: list[str]) -> list[str]: 
16 |         return [RE_DOCTEST.sub("", line) for line in lines] 
17 | 
18 | class StripDoctestArtifactsExtension(markdown.extensions.Extension): 
19 |     def extendMarkdown(self, md: markdown.Markdown) -> None: 
20 |         md.preprocessors.register( 
21 |             StripDoctestArtifactsPreprocessor(md), 
22 |             "strip_doctest_artifacts", 
23 |             175, 
24 |         ) 
25 | 
26 | def makeExtension(**kwargs: dict[str, object]) -> StripDoctestArtifactsExtension: 
27 |     return StripDoctestArtifactsExtension(**kwargs) 
28 | 
--------------------------------------------------------------------------------
/src/tea_tasting/__init__.py:
--------------------------------------------------------------------------------
1 | """A Python package for the statistical analysis of A/B tests. 
2 | 
3 | All classes and functions for the analysis of the experiments can be imported 
4 | from the root `tea_tasting` module. 
5 | 
6 | There are functions and classes for advanced use cases such as defining custom metrics. 
7 | They can be imported from submodules of `tea_tasting`. 
8 | 
9 | For convenience, the API reference is provided by submodules: 
10 | 
11 | - `tea_tasting.metrics`: Built-in metrics. 
12 | - `tea_tasting.experiment`: Experiment and experiment result. 
13 | - `tea_tasting.multiplicity`: Multiple hypothesis testing. 
14 | - `tea_tasting.datasets`: Example datasets. 
15 | - `tea_tasting.config`: Global configuration. 
16 | - `tea_tasting.aggr`: Module for working with aggregated statistics. 
17 | - `tea_tasting.utils`: Useful functions and classes. 
18 | """ 
19 | # pyright: reportUnusedImport=false 
20 | 
21 | from tea_tasting.config import config_context, get_config, set_config 
22 | from tea_tasting.datasets import make_sessions_data, make_users_data 
23 | from tea_tasting.experiment import Experiment 
24 | from tea_tasting.metrics import Bootstrap, Mean, Quantile, RatioOfMeans, SampleRatio 
25 | from tea_tasting.multiplicity import adjust_fdr, adjust_fwer 
26 | from tea_tasting.version import __version__ 
27 | 
--------------------------------------------------------------------------------
/src/tea_tasting/config.py:
--------------------------------------------------------------------------------
1 | """Global configuration.""" 
2 | # ruff: noqa: PLR0913 
3 | 
4 | from __future__ import annotations 
5 | 
6 | import contextlib 
7 | import contextvars 
8 | from typing import TYPE_CHECKING, overload 
9 | 
10 | import tea_tasting.utils 
11 | 
12 | 
13 | if TYPE_CHECKING: 
14 |     from collections.abc import Iterator, Sequence 
15 |     from typing import Literal 
16 | 
17 | 
18 | _DEFAULT_CONFIG: dict[str, object] = { 
19 |     "alpha": 0.05, 
20 |     "alternative": "two-sided", 
21 |     "confidence_level": 0.95, 
22 |     "equal_var": False, 
23 |     "n_obs": None, 
24 |     "n_resamples": 10_000, 
25 |     "power": 0.8, 
26 |     "ratio": 1, 
27 |     "use_t": True, 
28 | } 
29 | 
30 | _config_var: contextvars.ContextVar[dict[str, object]] = contextvars.ContextVar( 
31 |     "tea_tasting.config", 
32 |     default=_DEFAULT_CONFIG.copy(),  # noqa: B039 
33 | ) 
34 | 
35 | 
36 | @overload 
37 | def get_config(option: Literal["alpha"]) -> float: 
38 |     ... 
39 | 
40 | @overload 
41 | def get_config(option: Literal["alternative"]) -> str: 
42 |     ... 
43 | 
44 | @overload 
45 | def get_config(option: Literal["confidence_level"]) -> float: 
46 |     ... 
47 | 
48 | @overload 
49 | def get_config(option: Literal["equal_var"]) -> bool: 
50 |     ...
51 | 
52 | @overload 
53 | def get_config(option: Literal["n_obs"]) -> int | Sequence[int] | None: 
54 |     ... 
55 | 
56 | @overload 
57 | def get_config(option: Literal["n_resamples"]) -> int: 
58 |     ... 
59 | 
60 | @overload 
61 | def get_config(option: Literal["power"]) -> float: 
62 |     ... 
63 | 
64 | @overload 
65 | def get_config(option: Literal["ratio"]) -> float | int: 
66 |     ... 
67 | 
68 | @overload 
69 | def get_config(option: Literal["use_t"]) -> bool: 
70 |     ... 
71 | 
72 | @overload 
73 | def get_config(option: str) -> object: 
74 |     ... 
75 | 
76 | @overload 
77 | def get_config(option: None = None) -> dict[str, object]: 
78 |     ... 
79 | 
80 | def get_config(option: str | None = None) -> object: 
81 |     """Retrieve the current settings of the global configuration. 
82 | 
83 |     Args: 
84 |         option: The option name. 
85 | 
86 |     Returns: 
87 |         The specified option value if its name is provided, 
88 |         or a dictionary containing all options otherwise. 
89 | 
90 |     Examples: 
91 |         ```pycon 
92 |         >>> import tea_tasting as tt 
93 | 
94 |         >>> tt.get_config("equal_var") 
95 |         False 
96 | 
97 |         ``` 
98 |     """ 
99 |     config = _config_var.get() 
100 |     return config[option] if option is not None else config.copy() 
101 | 
102 | 
103 | def _set_config(**params: object) -> contextvars.Token[dict[str, object]]: 
104 |     config = _config_var.get().copy() 
105 |     for name, value in params.items(): 
106 |         if value is not None: 
107 |             config[name] = tea_tasting.utils.auto_check(value, name) 
108 |     return _config_var.set(config) 
109 | 
110 | 
111 | def set_config( 
112 |     *, 
113 |     alpha: float | None = None, 
114 |     alternative: Literal["two-sided", "greater", "less"] | None = None, 
115 |     confidence_level: float | None = None, 
116 |     equal_var: bool | None = None, 
117 |     n_obs: int | Sequence[int] | None = None, 
118 |     n_resamples: int | None = None, 
119 |     power: float | None = None, 
120 |     ratio: float | int | None = None, 
121 |     use_t: bool | None = None, 
122 |     **kwargs: object, 
123 | ) -> None: 
124 |     """Update the global configuration with specified settings. 
125 | 
126 |     Args: 
127 |         alpha: Significance level. Default is 0.05. 
128 |         alternative: Alternative hypothesis: 
129 | 
130 |             - `"two-sided"`: the means are unequal, 
131 |             - `"greater"`: the mean in the treatment variant is greater than the mean 
132 |                 in the control variant, 
133 |             - `"less"`: the mean in the treatment variant is less than the mean 
134 |                 in the control variant. 
135 | 
136 |             Default is `"two-sided"`. 
137 | 
138 |         confidence_level: Confidence level for the confidence interval. 
139 |             Default is `0.95`. 
140 |         equal_var: Defines whether equal variance is assumed. If `True`, 
141 |             pooled variance is used for the calculation of the standard error 
142 |             of the difference between two means. Default is `False`. 
143 |         n_obs: Number of observations in the control and in the treatment together. 
144 |             Default is `None`. 
145 |         n_resamples: The number of resamples performed to form the bootstrap 
146 |             distribution of a statistic. Default is `10_000`. 
147 |         power: Statistical power. Default is 0.8. 
148 |         ratio: Ratio of the number of observations in the treatment 
149 |             relative to the control. Default is 1. 
150 |         use_t: Defines whether to use the Student's t-distribution (`True`) or 
151 |             the Normal distribution (`False`) by default. Default is `True`. 
152 |         **kwargs: User-defined global parameters. 
153 | 
154 |     Examples: 
155 |         ```pycon 
156 |         >>> import tea_tasting as tt 
157 | 
158 |         >>> tt.set_config(equal_var=True, use_t=False) 
159 |         >>> experiment = tt.Experiment( 
160 |         ...     sessions_per_user=tt.Mean("sessions"), 
161 |         ...
orders_per_session=tt.RatioOfMeans("orders", "sessions"), 162 | ... orders_per_user=tt.Mean("orders"), 163 | ... revenue_per_user=tt.Mean("revenue"), 164 | ... ) 165 | >>> tt.set_config(equal_var=False, use_t=True) 166 | >>> experiment.metrics["orders_per_user"] 167 | Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) 168 | 169 | ``` 170 | """ # noqa: E501 171 | _set_config(**{k: v for k, v in locals().items() if k != "kwargs"}, **kwargs) 172 | 173 | 174 | @contextlib.contextmanager 175 | def config_context( 176 | *, 177 | alpha: float | None = None, 178 | alternative: Literal["two-sided", "greater", "less"] | None = None, 179 | confidence_level: float | None = None, 180 | equal_var: bool | None = None, 181 | n_obs: int | Sequence[int] | None = None, 182 | n_resamples: int | None = None, 183 | power: float | None = None, 184 | ratio: float | int | None = None, 185 | use_t: bool | None = None, 186 | **kwargs: object, 187 | ) -> Iterator[object]: 188 | """A context manager that temporarily modifies the global configuration. 189 | 190 | Args: 191 | alpha: Significance level. Default is 0.05. 192 | alternative: Alternative hypothesis: 193 | 194 | - `"two-sided"`: the means are unequal, 195 | - `"greater"`: the mean in the treatment variant is greater than the mean 196 | in the control variant, 197 | - `"less"`: the mean in the treatment variant is less than the mean 198 | in the control variant. 199 | 200 | Default is `"two-sided"`. 201 | 202 | confidence_level: Confidence level for the confidence interval. 203 | Default is `0.95`. 204 | equal_var: Defines whether equal variance is assumed. If `True`, 205 | pooled variance is used for the calculation of the standard error 206 | of the difference between two means. Default is `False`. 207 | n_obs: Number of observations in the control and in the treatment together. 208 | Default is `None`. 209 | n_resamples: The number of resamples performed to form the bootstrap 210 | distribution of a statistic. Default is `10_000`. 211 | power: Statistical power. Default is 0.8. 212 | ratio: Ratio of the number of observations in the treatment 213 | relative to the control. Default is 1. 214 | use_t: Defines whether to use the Student's t-distribution (`True`) or 215 | the Normal distribution (`False`) by default. Default is `True`. 216 | **kwargs: User-defined global parameters. 217 | 218 | Examples: 219 | ```pycon 220 | >>> import tea_tasting as tt 221 | 222 | >>> with tt.config_context(equal_var=True, use_t=False): 223 | ... experiment = tt.Experiment( 224 | ... sessions_per_user=tt.Mean("sessions"), 225 | ... orders_per_session=tt.RatioOfMeans("orders", "sessions"), 226 | ... orders_per_user=tt.Mean("orders"), 227 | ... revenue_per_user=tt.Mean("revenue"), 228 | ... 
) 229 | >>> experiment.metrics["orders_per_user"] 230 | Mean(value='orders', covariate=None, alternative='two-sided', confidence_level=0.95, equal_var=True, use_t=False, alpha=0.05, ratio=1, power=0.8, effect_size=None, rel_effect_size=None, n_obs=None) 231 | 232 | ``` 233 | """ # noqa: E501 234 | token = _set_config( 235 | **{k: v for k, v in locals().items() if k != "kwargs"}, 236 | **kwargs, 237 | ) 238 | try: 239 | yield 240 | finally: 241 | _config_var.reset(token) 242 | -------------------------------------------------------------------------------- /src/tea_tasting/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """This module provides built-in metrics used to analyze experimental data. 2 | 3 | All metric classes can be imported from `tea_tasting.metrics` module. 4 | For convenience, the API reference is provided by submodules of `tea_tasting.metrics`: 5 | 6 | - `tea_tasting.metrics.base`: Base classes for metrics. 7 | - `tea_tasting.metrics.mean`: Metrics for the analysis of means. 8 | - `tea_tasting.metrics.proportion`: Metrics for the analysis of proportions. 9 | - `tea_tasting.metrics.resampling`: Metrics analyzed using resampling methods. 10 | """ 11 | # pyright: reportUnusedImport=false 12 | 13 | from tea_tasting.metrics.base import ( 14 | AggrCols, 15 | MetricBase, 16 | MetricBaseAggregated, 17 | MetricBaseGranular, 18 | MetricPowerResults, 19 | MetricResult, 20 | PowerBase, 21 | PowerBaseAggregated, 22 | aggregate_by_variants, 23 | read_granular, 24 | ) 25 | from tea_tasting.metrics.mean import Mean, RatioOfMeans 26 | from tea_tasting.metrics.proportion import SampleRatio 27 | from tea_tasting.metrics.resampling import Bootstrap, Quantile 28 | -------------------------------------------------------------------------------- /src/tea_tasting/metrics/base.py: -------------------------------------------------------------------------------- 1 | """Base classes for metrics.""" 2 | 3 | from __future__ import annotations 4 | 5 | import abc 6 | from collections import UserList 7 | from typing import ( 8 | TYPE_CHECKING, 9 | Generic, 10 | NamedTuple, 11 | TypeAlias, 12 | TypeVar, 13 | Union, 14 | overload, 15 | ) 16 | 17 | import ibis 18 | import ibis.expr.types 19 | import narwhals as nw 20 | import pyarrow as pa 21 | import pyarrow.compute as pc 22 | 23 | import tea_tasting.aggr 24 | import tea_tasting.utils 25 | 26 | 27 | if TYPE_CHECKING: 28 | from collections.abc import Sequence 29 | from typing import Literal 30 | 31 | import narwhals.typing # noqa: TC004 32 | 33 | 34 | # The | operator doesn't work for NamedTuple, but Union works. 
35 | MetricResult: TypeAlias = Union[NamedTuple, dict[str, object]]  # noqa: UP007 
36 | MetricPowerResult: TypeAlias = Union[NamedTuple, dict[str, object]]  # noqa: UP007 
37 | 
38 | R = TypeVar("R", bound=MetricResult) 
39 | P = TypeVar("P", bound=MetricPowerResult) 
40 | 
41 | 
42 | class MetricPowerResults(tea_tasting.utils.DictsReprMixin, UserList[P]): 
43 |     """Power analysis results.""" 
44 |     default_keys = ("power", "effect_size", "rel_effect_size", "n_obs") 
45 | 
46 |     @tea_tasting.utils._cache_method 
47 |     def to_dicts(self) -> tuple[dict[str, object], ...]: 
48 |         """Convert the results to a sequence of dictionaries.""" 
49 |         return tuple((v if isinstance(v, dict) else v._asdict()) for v in self) 
50 | 
51 | S = TypeVar("S", bound=MetricPowerResults)  # type: ignore 
52 | 
53 | 
54 | class MetricBase(abc.ABC, Generic[R], tea_tasting.utils.ReprMixin): 
55 |     """Base class for metrics.""" 
56 |     @abc.abstractmethod 
57 |     def analyze( 
58 |         self, 
59 |         data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 
60 |         control: object, 
61 |         treatment: object, 
62 |         variant: str, 
63 |     ) -> R: 
64 |         """Analyze a metric in an experiment. 
65 | 
66 |         Args: 
67 |             data: Experimental data. 
68 |             control: Control variant. 
69 |             treatment: Treatment variant. 
70 |             variant: Variant column name. 
71 | 
72 |         Returns: 
73 |             Analysis result. 
74 |         """ 
75 | 
76 | 
77 | class PowerBase(abc.ABC, Generic[S], tea_tasting.utils.ReprMixin): 
78 |     """Base class for the analysis of power.""" 
79 |     @abc.abstractmethod 
80 |     def solve_power( 
81 |         self, 
82 |         data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 
83 |         parameter: Literal[ 
84 |             "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size", 
85 |     ) -> S: 
86 |         """Solve for a parameter of the power of a test. 
87 | 
88 |         Args: 
89 |             data: Sample data. 
90 |             parameter: Parameter name. 
91 | 
92 |         Returns: 
93 |             Power analysis result. 
94 |         """ 
95 | 
96 | 
97 | class AggrCols(NamedTuple): 
98 |     """Columns to be aggregated for a metric analysis. 
99 | 
100 |     Attributes: 
101 |         has_count: If `True`, include the sample size. 
102 |         mean_cols: Column names for calculation of sample means. 
103 |         var_cols: Column names for calculation of sample variances. 
104 |         cov_cols: Pairs of column names for calculation of sample covariances. 
105 |     """ 
106 |     has_count: bool = False 
107 |     mean_cols: Sequence[str] = () 
108 |     var_cols: Sequence[str] = () 
109 |     cov_cols: Sequence[tuple[str, str]] = () 
110 | 
111 |     def __or__(self, other: AggrCols) -> AggrCols: 
112 |         """Merge two aggregation column specifications. 
113 | 
114 |         Args: 
115 |             other: Second object. 
116 | 
117 |         Returns: 
118 |             Merged column specifications. 
119 |         """ 
120 |         return AggrCols( 
121 |             has_count=self.has_count or other.has_count, 
122 |             mean_cols=tuple({*self.mean_cols, *other.mean_cols}), 
123 |             var_cols=tuple({*self.var_cols, *other.var_cols}), 
124 |             cov_cols=tuple({ 
125 |                 tea_tasting.aggr._sorted_tuple(*cols) 
126 |                 for cols in tuple({*self.cov_cols, *other.cov_cols}) 
127 |             }), 
128 |         ) 
129 | 
130 |     def __len__(self) -> int: 
131 |         """Total length of all object attributes. 
132 | 
133 |         has_count contributes 1 to the total if True, and 0 otherwise. 
134 | """ 135 | return ( 136 | int(self.has_count) 137 | + len(self.mean_cols) 138 | + len(self.var_cols) 139 | + len(self.cov_cols) 140 | ) 141 | 142 | 143 | class _HasAggrCols(abc.ABC): 144 | @property 145 | @abc.abstractmethod 146 | def aggr_cols(self) -> AggrCols: 147 | """Columns to be aggregated for an analysis.""" 148 | 149 | 150 | class MetricBaseAggregated(MetricBase[R], _HasAggrCols): 151 | """Base class for metrics, which are analyzed using aggregated statistics.""" 152 | @overload 153 | def analyze( 154 | self, 155 | data: dict[object, tea_tasting.aggr.Aggregates], 156 | control: object, 157 | treatment: object, 158 | variant: str | None = None, 159 | ) -> R: 160 | ... 161 | 162 | @overload 163 | def analyze( 164 | self, 165 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 166 | control: object, 167 | treatment: object, 168 | variant: str, 169 | ) -> R: 170 | ... 171 | 172 | def analyze( 173 | self, 174 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[ 175 | object, tea_tasting.aggr.Aggregates], 176 | control: object, 177 | treatment: object, 178 | variant: str | None = None, 179 | ) -> R: 180 | """Analyze a metric in an experiment. 181 | 182 | Args: 183 | data: Experimental data. 184 | control: Control variant. 185 | treatment: Treatment variant. 186 | variant: Variant column name. 187 | 188 | Returns: 189 | Analysis result. 190 | """ 191 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None) 192 | aggr = aggregate_by_variants( 193 | data, 194 | aggr_cols=self.aggr_cols, 195 | variant=variant, 196 | ) 197 | return self.analyze_aggregates( 198 | control=aggr[control], 199 | treatment=aggr[treatment], 200 | ) 201 | 202 | @abc.abstractmethod 203 | def analyze_aggregates( 204 | self, 205 | control: tea_tasting.aggr.Aggregates, 206 | treatment: tea_tasting.aggr.Aggregates, 207 | ) -> R: 208 | """Analyze metric in an experiment using aggregated statistics. 209 | 210 | Args: 211 | control: Control data. 212 | treatment: Treatment data. 213 | 214 | Returns: 215 | Analysis result. 216 | """ 217 | 218 | 219 | class PowerBaseAggregated(PowerBase[S], _HasAggrCols): 220 | """Base class for the analysis of power using aggregated statistics.""" 221 | def solve_power( 222 | self, 223 | data: ( 224 | narwhals.typing.IntoFrame | 225 | ibis.expr.types.Table | 226 | tea_tasting.aggr.Aggregates 227 | ), 228 | parameter: Literal[ 229 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size", 230 | ) -> S: 231 | """Solve for a parameter of the power of a test. 232 | 233 | Args: 234 | data: Sample data. 235 | parameter: Parameter name. 236 | 237 | Returns: 238 | Power analysis result. 239 | """ 240 | tea_tasting.utils.check_scalar( 241 | parameter, 242 | "parameter", 243 | in_={"power", "effect_size", "rel_effect_size", "n_obs"}, 244 | ) 245 | if not isinstance(data, tea_tasting.aggr.Aggregates): 246 | data = tea_tasting.aggr.read_aggregates( 247 | data=data, 248 | group_col=None, 249 | **self.aggr_cols._asdict(), 250 | ) 251 | return self.solve_power_from_aggregates(data=data, parameter=parameter) 252 | 253 | @abc.abstractmethod 254 | def solve_power_from_aggregates( 255 | self, 256 | data: tea_tasting.aggr.Aggregates, 257 | parameter: Literal[ 258 | "power", "effect_size", "rel_effect_size", "n_obs"] = "rel_effect_size", 259 | ) -> S: 260 | """Solve for a parameter of the power of a test. 261 | 262 | Args: 263 | data: Sample data. 264 | parameter: Parameter name. 265 | 266 | Returns: 267 | Power analysis result. 
268 | """ 269 | 270 | 271 | def aggregate_by_variants( 272 | data: ( 273 | narwhals.typing.IntoFrame | 274 | ibis.expr.types.Table | 275 | dict[object, tea_tasting.aggr.Aggregates] 276 | ), 277 | aggr_cols: AggrCols, 278 | variant: str | None = None, 279 | ) -> dict[object, tea_tasting.aggr.Aggregates]: 280 | """Aggregate experimental data by variants. 281 | 282 | Args: 283 | data: Experimental data. 284 | aggr_cols: Columns to be aggregated. 285 | variant: Variant column name. 286 | 287 | Returns: 288 | Experimental data as a dictionary of Aggregates. 289 | """ 290 | if isinstance(data, dict): 291 | return data 292 | 293 | if variant is None: 294 | raise ValueError("The variant parameter is required but was not provided.") 295 | 296 | return tea_tasting.aggr.read_aggregates( 297 | data=data, 298 | group_col=variant, 299 | **aggr_cols._asdict(), 300 | ) 301 | 302 | 303 | class _HasCols(abc.ABC): 304 | @property 305 | @abc.abstractmethod 306 | def cols(self) -> Sequence[str]: 307 | """Columns to be fetched for an analysis.""" 308 | 309 | 310 | class MetricBaseGranular(MetricBase[R], _HasCols): 311 | """Base class for metrics, which are analyzed using granular data.""" 312 | @overload 313 | def analyze( 314 | self, 315 | data: dict[object, pa.Table], 316 | control: object, 317 | treatment: object, 318 | variant: str | None = None, 319 | ) -> R: 320 | ... 321 | 322 | @overload 323 | def analyze( 324 | self, 325 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 326 | control: object, 327 | treatment: object, 328 | variant: str, 329 | ) -> R: 330 | ... 331 | 332 | def analyze( 333 | self, 334 | data: ( 335 | narwhals.typing.IntoFrame | 336 | ibis.expr.types.Table | 337 | dict[object, pa.Table] 338 | ), 339 | control: object, 340 | treatment: object, 341 | variant: str | None = None, 342 | ) -> R: 343 | """Analyze a metric in an experiment. 344 | 345 | Args: 346 | data: Experimental data. 347 | control: Control variant. 348 | treatment: Treatment variant. 349 | variant: Variant column name. 350 | 351 | Returns: 352 | Analysis result. 353 | """ 354 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None) 355 | dfs = read_granular( 356 | data, 357 | cols=self.cols, 358 | variant=variant, 359 | ) 360 | return self.analyze_granular( 361 | control=dfs[control], 362 | treatment=dfs[treatment], 363 | ) 364 | 365 | @abc.abstractmethod 366 | def analyze_granular( 367 | self, 368 | control: pa.Table, 369 | treatment: pa.Table, 370 | ) -> R: 371 | """Analyze metric in an experiment using granular data. 372 | 373 | Args: 374 | control: Control data. 375 | treatment: Treatment data. 376 | 377 | Returns: 378 | Analysis result. 379 | """ 380 | 381 | 382 | @overload 383 | def read_granular( 384 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table, 385 | cols: Sequence[str] = (), 386 | variant: None = None, 387 | ) -> pa.Table: 388 | ... 389 | 390 | @overload 391 | def read_granular( 392 | data: dict[object, pa.Table], 393 | cols: Sequence[str] = (), 394 | variant: None = None, 395 | ) -> dict[object, pa.Table]: 396 | ... 397 | 398 | @overload 399 | def read_granular( 400 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[object, pa.Table], 401 | cols: Sequence[str], 402 | variant: str, 403 | ) -> dict[object, pa.Table]: 404 | ... 
405 | 406 | def read_granular( 407 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[object, pa.Table], 408 | cols: Sequence[str] = (), 409 | variant: str | None = None, 410 | ) -> pa.Table | dict[object, pa.Table]: 411 | """Read granular experimental data. 412 | 413 | Args: 414 | data: Experimental data. 415 | cols: Columns to read. 416 | variant: Variant column name. 417 | 418 | Returns: 419 | Experimental data as a dictionary of PyArrow Tables. 420 | """ 421 | if isinstance(data, dict): 422 | return data 423 | 424 | variant_cols = () if variant is None else (variant,) 425 | if isinstance(data, ibis.expr.types.Table): 426 | if len(cols) + len(variant_cols) > 0: 427 | data = data.select(*cols, *variant_cols) 428 | table = data.to_pyarrow() 429 | else: 430 | data = nw.from_native(data) 431 | if isinstance(data, nw.LazyFrame): 432 | data = data.collect() 433 | if len(cols) + len(variant_cols) > 0: 434 | data = data.select(*cols, *variant_cols) 435 | table = data.to_arrow() 436 | 437 | if variant is None: 438 | return table 439 | 440 | variant_array = table[variant] 441 | if len(cols) > 0: 442 | table = table.select(cols) 443 | return { 444 | var: table.filter(pc.equal(variant_array, pa.scalar(var))) # type: ignore 445 | for var in variant_array.unique().to_pylist() 446 | } 447 | -------------------------------------------------------------------------------- /src/tea_tasting/metrics/proportion.py: -------------------------------------------------------------------------------- 1 | """Metrics for the analysis of proportions.""" 2 | 3 | from __future__ import annotations 4 | 5 | import math 6 | from typing import TYPE_CHECKING, NamedTuple 7 | 8 | import scipy.stats 9 | 10 | import tea_tasting.aggr 11 | import tea_tasting.metrics 12 | from tea_tasting.metrics.base import AggrCols, MetricBaseAggregated 13 | import tea_tasting.utils 14 | 15 | 16 | if TYPE_CHECKING: 17 | from typing import Literal 18 | 19 | import ibis.expr.types 20 | import narwhals.typing 21 | 22 | 23 | _MAX_EXACT_THRESHOLD = 1000 24 | 25 | 26 | class SampleRatioResult(NamedTuple): 27 | """Result of the sample ratio mismatch check. 28 | 29 | Attributes: 30 | control: Number of observations in control. 31 | treatment: Number of observations in treatment. 32 | pvalue: P-value 33 | """ 34 | control: float 35 | treatment: float 36 | pvalue: float 37 | 38 | 39 | class SampleRatio(MetricBaseAggregated[SampleRatioResult]): # noqa: D101 40 | def __init__( 41 | self, 42 | ratio: float | int | dict[object, float | int] = 1, 43 | *, 44 | method: Literal["auto", "binom", "norm"] = "auto", 45 | correction: bool = True, 46 | ) -> None: 47 | """Metric for sample ratio mismatch check. 48 | 49 | Args: 50 | ratio: Expected ratio of the number of observations in the treatment 51 | relative to the control. 52 | method: Statistical test used for calculation of p-value: 53 | 54 | - `"auto"`: Apply exact binomial test if the total number 55 | of observations is < 1000; or normal approximation otherwise. 56 | - `"binom"`: Apply exact binomial test. 57 | - `"norm"`: Apply normal approximation of the binomial distribution. 58 | 59 | correction: If `True`, add continuity correction. 60 | Only for normal approximation. 61 | 62 | Examples: 63 | ```pycon 64 | >>> import tea_tasting as tt 65 | 66 | >>> experiment = tt.Experiment( 67 | ... sample_ratio=tt.SampleRatio(), 68 | ... 
) 69 | >>> data = tt.make_users_data(seed=42) 70 | >>> result = experiment.analyze(data) 71 | >>> result.with_keys(("metric", "control", "treatment", "pvalue")) 72 | metric control treatment pvalue 73 | sample_ratio 2023 1977 0.477 74 | 75 | ``` 76 | 77 | Different expected ratio: 78 | 79 | ```pycon 80 | >>> experiment = tt.Experiment( 81 | ... sample_ratio=tt.SampleRatio(0.5), 82 | ... ) 83 | >>> data = tt.make_users_data(seed=42) 84 | >>> result = experiment.analyze(data) 85 | >>> result.with_keys(("metric", "control", "treatment", "pvalue")) 86 | metric control treatment pvalue 87 | sample_ratio 2023 1977 3.26e-103 88 | 89 | ``` 90 | """ 91 | if isinstance(ratio, dict): 92 | for val in ratio.values(): 93 | tea_tasting.utils.auto_check(val, "ratio") 94 | else: 95 | tea_tasting.utils.auto_check(ratio, "ratio") 96 | self.ratio = ratio 97 | 98 | self.method = tea_tasting.utils.check_scalar( 99 | method, "method", typ=str, in_={"auto", "binom", "norm"}) 100 | self.correction = tea_tasting.utils.auto_check(correction, "correction") 101 | 102 | 103 | @property 104 | def aggr_cols(self) -> AggrCols: 105 | """Columns to be aggregated for a metric analysis.""" 106 | return AggrCols(has_count=True) 107 | 108 | 109 | def analyze( 110 | self, 111 | data: narwhals.typing.IntoFrame | ibis.expr.types.Table | dict[ 112 | object, tea_tasting.aggr.Aggregates], 113 | control: object, 114 | treatment: object, 115 | variant: str | None = None, 116 | ) -> SampleRatioResult: 117 | """Perform a sample ratio mismatch check. 118 | 119 | Args: 120 | data: Experimental data. 121 | control: Control variant. 122 | treatment: Treatment variant. 123 | variant: Variant column name. 124 | 125 | Returns: 126 | Analysis result. 127 | """ 128 | tea_tasting.utils.check_scalar(variant, "variant", typ=str | None) 129 | aggr = tea_tasting.metrics.aggregate_by_variants( 130 | data, 131 | aggr_cols=self.aggr_cols, 132 | variant=variant, 133 | ) 134 | 135 | k = aggr[treatment].count() 136 | n = k + aggr[control].count() 137 | 138 | r = ( 139 | self.ratio 140 | if isinstance(self.ratio, float | int) 141 | else self.ratio[treatment] / self.ratio[control] 142 | ) 143 | p = r / (1 + r) 144 | 145 | if ( 146 | self.method == "binom" or 147 | (self.method == "auto" and n < _MAX_EXACT_THRESHOLD) 148 | ): 149 | pvalue = scipy.stats.binomtest(k=int(k), n=int(n), p=p).pvalue 150 | else: # norm 151 | d = k - n*p 152 | if self.correction and d != 0: 153 | d = min(d + 0.5, 0) if d < 0 else max(d - 0.5, 0) 154 | z = d / math.sqrt(n * p * (1 - p)) 155 | pvalue = 2 * scipy.stats.norm.sf(abs(z)) 156 | 157 | return SampleRatioResult( 158 | control=n - k, 159 | treatment=k, 160 | pvalue=pvalue, # type: ignore 161 | ) 162 | 163 | 164 | def analyze_aggregates( 165 | self, 166 | control: tea_tasting.aggr.Aggregates, 167 | treatment: tea_tasting.aggr.Aggregates, 168 | ) -> SampleRatioResult: 169 | """Stub method for compatibility with the base class.""" 170 | raise NotImplementedError 171 | -------------------------------------------------------------------------------- /src/tea_tasting/version.py: -------------------------------------------------------------------------------- 1 | """Package version.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib.metadata 6 | import importlib.resources 7 | 8 | 9 | try: 10 | __version__ = importlib.metadata.version(__package__ or "tea-tasting") 11 | except importlib.metadata.PackageNotFoundError: 12 | __version__ = ( 13 | importlib.resources.files("tea_tasting") 14 | .joinpath("_version.txt") 15 | 
.read_text() 16 | .strip() 17 | ) 18 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/tests/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e10v/tea-tasting/70b129d8dad04654b4aceb56c8ee2e348b0636b0/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/metrics/test_base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, NamedTuple 4 | import unittest.mock 5 | 6 | import ibis 7 | import polars as pl 8 | import pyarrow as pa 9 | import pyarrow.compute as pc 10 | import pytest 11 | 12 | import tea_tasting.aggr 13 | import tea_tasting.datasets 14 | import tea_tasting.metrics.base 15 | 16 | 17 | if TYPE_CHECKING: 18 | from typing import Any, Literal 19 | 20 | import ibis.expr.types # noqa: TC004 21 | import pandas as pd 22 | 23 | 24 | Frame = ibis.expr.types.Table | pa.Table | pd.DataFrame | pl.LazyFrame 25 | 26 | 27 | def test_aggr_cols_or(): 28 | aggr_cols0 = tea_tasting.metrics.base.AggrCols( 29 | has_count=False, 30 | mean_cols=("a", "b"), 31 | var_cols=("b", "c"), 32 | cov_cols=(("a", "b"), ("c", "b")), 33 | ) 34 | 35 | aggr_cols1 = tea_tasting.metrics.base.AggrCols( 36 | has_count=True, 37 | mean_cols=("b", "c"), 38 | var_cols=("c", "d"), 39 | cov_cols=(("b", "c"), ("d", "c")), 40 | ) 41 | 42 | aggr_cols = aggr_cols0 | aggr_cols1 43 | 44 | assert isinstance(aggr_cols, tea_tasting.metrics.base.AggrCols) 45 | assert aggr_cols.has_count is True 46 | assert set(aggr_cols.mean_cols) == {"a", "b", "c"} 47 | assert len(aggr_cols.mean_cols) == 3 48 | assert set(aggr_cols.var_cols) == {"b", "c", "d"} 49 | assert len(aggr_cols.var_cols) == 3 50 | assert set(aggr_cols.cov_cols) == {("a", "b"), ("b", "c"), ("c", "d")} 51 | assert len(aggr_cols.cov_cols) == 3 52 | 53 | 54 | def test_aggr_cols_len(): 55 | assert len(tea_tasting.metrics.base.AggrCols( 56 | has_count=False, 57 | mean_cols=("a", "b"), 58 | var_cols=("b", "c"), 59 | cov_cols=(("a", "b"), ("c", "b")), 60 | )) == 6 61 | assert len(tea_tasting.metrics.base.AggrCols( 62 | has_count=True, 63 | mean_cols=("b", "c"), 64 | var_cols=("c", "d"), 65 | cov_cols=(("b", "c"), ("d", "c")), 66 | )) == 7 67 | 68 | 69 | @pytest.fixture 70 | def data_arrow() -> pa.Table: 71 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 72 | 73 | @pytest.fixture 74 | def data_pandas(data_arrow: pa.Table) -> pd.DataFrame: 75 | return data_arrow.to_pandas() 76 | 77 | @pytest.fixture 78 | def data_polars(data_arrow: pa.Table) -> pl.DataFrame: 79 | return pl.from_arrow(data_arrow) # type: ignore 80 | 81 | @pytest.fixture 82 | def data_polars_lazy(data_polars: pl.DataFrame) -> pl.LazyFrame: 83 | return data_polars.lazy() 84 | 85 | @pytest.fixture 86 | def data_duckdb(data_arrow: pa.Table) -> ibis.expr.types.Table: 87 | return ibis.connect("duckdb://").create_table("data", data_arrow) 88 | 89 | @pytest.fixture 90 | def data_sqlite(data_arrow: pa.Table) -> ibis.expr.types.Table: 91 | return ibis.connect("sqlite://").create_table("data", data_arrow) 92 | 93 | @pytest.fixture(params=[ 
94 | "data_arrow", "data_pandas", 95 | "data_polars", "data_polars_lazy", 96 | "data_duckdb", "data_sqlite", 97 | ]) 98 | def data(request: pytest.FixtureRequest) -> Frame: 99 | return request.getfixturevalue(request.param) 100 | 101 | 102 | @pytest.fixture 103 | def aggr_cols() -> tea_tasting.metrics.base.AggrCols: 104 | return tea_tasting.metrics.base.AggrCols( 105 | has_count=True, 106 | mean_cols=("sessions", "orders"), 107 | var_cols=("orders", "revenue"), 108 | cov_cols=(("sessions", "revenue"),), 109 | ) 110 | 111 | @pytest.fixture 112 | def correct_aggrs( 113 | data_arrow: pa.Table, 114 | aggr_cols: tea_tasting.metrics.base.AggrCols, 115 | ) -> dict[object, tea_tasting.aggr.Aggregates]: 116 | return tea_tasting.aggr.read_aggregates( 117 | data_arrow, 118 | group_col="variant", 119 | **aggr_cols._asdict(), 120 | ) 121 | 122 | @pytest.fixture 123 | def correct_aggr( 124 | data_arrow: pa.Table, 125 | aggr_cols: tea_tasting.metrics.base.AggrCols, 126 | ) -> tea_tasting.aggr.Aggregates: 127 | return tea_tasting.aggr.read_aggregates( 128 | data_arrow, 129 | group_col=None, 130 | **aggr_cols._asdict(), 131 | ) 132 | 133 | @pytest.fixture 134 | def cols() -> tuple[str, ...]: 135 | return ("sessions", "orders", "revenue") 136 | 137 | @pytest.fixture 138 | def correct_gran( 139 | data_arrow: pa.Table, 140 | cols: tuple[str, ...], 141 | ) -> dict[object, pa.Table]: 142 | variant_col = data_arrow["variant"] 143 | table = data_arrow.select(cols) 144 | return { 145 | var: table.filter(pc.equal(variant_col, pa.scalar(var))) # type: ignore 146 | for var in variant_col.unique().to_pylist() 147 | } 148 | 149 | @pytest.fixture 150 | def aggr_metric( 151 | aggr_cols: tea_tasting.metrics.base.AggrCols, 152 | ) -> tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]]: 153 | class AggrMetric(tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]]): 154 | @property 155 | def aggr_cols(self) -> tea_tasting.metrics.base.AggrCols: 156 | return aggr_cols 157 | 158 | def analyze_aggregates( 159 | self, 160 | control: tea_tasting.aggr.Aggregates, # noqa: ARG002 161 | treatment: tea_tasting.aggr.Aggregates, # noqa: ARG002 162 | ) -> dict[str, object]: 163 | return {} 164 | 165 | return AggrMetric() 166 | 167 | @pytest.fixture 168 | def aggr_power( 169 | aggr_cols: tea_tasting.metrics.base.AggrCols, 170 | ) -> tea_tasting.metrics.base.PowerBaseAggregated[ 171 | tea_tasting.metrics.base.MetricPowerResults[dict[str, object]] 172 | ]: 173 | class AggrPower( 174 | tea_tasting.metrics.base.PowerBaseAggregated[ 175 | tea_tasting.metrics.base.MetricPowerResults[dict[str, object]] 176 | ], 177 | ): 178 | @property 179 | def aggr_cols(self) -> tea_tasting.metrics.base.AggrCols: 180 | return aggr_cols 181 | 182 | def solve_power_from_aggregates( 183 | self, 184 | data: tea_tasting.aggr.Aggregates, # noqa: ARG002 185 | parameter: Literal[ # noqa: ARG002 186 | "power", 187 | "effect_size", 188 | "rel_effect_size", 189 | "n_obs", 190 | ] = "power", 191 | ) -> tea_tasting.metrics.base.MetricPowerResults[dict[str, object]]: 192 | return tea_tasting.metrics.base.MetricPowerResults() 193 | return AggrPower() 194 | 195 | @pytest.fixture 196 | def gran_metric( 197 | cols: tuple[str, ...], 198 | ) -> tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]]: 199 | class GranMetric(tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]]): 200 | @property 201 | def cols(self) -> tuple[str, ...]: 202 | return cols 203 | 204 | def analyze_granular( 205 | self, 206 | control: pa.Table, # noqa: ARG002 207 | 
treatment: pa.Table, # noqa: ARG002 208 | ) -> dict[str, object]: 209 | return {} 210 | 211 | return GranMetric() 212 | 213 | 214 | def _compare_aggrs( 215 | left: tea_tasting.aggr.Aggregates, 216 | right: tea_tasting.aggr.Aggregates, 217 | ) -> None: 218 | assert left.count_ == right.count_ 219 | assert left.mean_ == pytest.approx(right.mean_) 220 | assert left.var_ == pytest.approx(right.var_) 221 | assert left.cov_ == pytest.approx(right.cov_) 222 | 223 | 224 | def test_metric_power_results_to_dicts(): 225 | result0 = { 226 | "power": 0.8, 227 | "effect_size": 1, 228 | "rel_effect_size": 0.05, 229 | "n_obs": 10_000, 230 | } 231 | result1 = { 232 | "power": 0.9, 233 | "effect_size": 2, 234 | "rel_effect_size": 0.1, 235 | "n_obs": 20_000, 236 | } 237 | 238 | results = tea_tasting.metrics.base.MetricPowerResults[dict[str, float | int]]( # type: ignore 239 | [result0, result1]) 240 | assert results.to_dicts() == (result0, result1) 241 | 242 | class PowerResult(NamedTuple): 243 | power: float 244 | effect_size: float 245 | rel_effect_size: float 246 | n_obs: float 247 | results = tea_tasting.metrics.base.MetricPowerResults[PowerResult]([ 248 | PowerResult(**result0), 249 | PowerResult(**result1), 250 | ]) 251 | assert results.to_dicts() == (result0, result1) 252 | 253 | 254 | def test_metric_base_aggregated_analyze_frame( 255 | aggr_metric: tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]], 256 | data_arrow: pa.Table, 257 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 258 | ): 259 | aggr_metric.analyze_aggregates = unittest.mock.MagicMock() 260 | aggr_metric.analyze(data_arrow, control=0, treatment=1, variant="variant") 261 | aggr_metric.analyze_aggregates.assert_called_once() 262 | kwargs = aggr_metric.analyze_aggregates.call_args.kwargs 263 | _compare_aggrs(kwargs["control"], correct_aggrs[0]) 264 | _compare_aggrs(kwargs["treatment"], correct_aggrs[1]) 265 | 266 | def test_metric_base_aggregated_analyze_aggrs( 267 | aggr_metric: tea_tasting.metrics.base.MetricBaseAggregated[dict[str, object]], 268 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 269 | ): 270 | aggr_metric.analyze_aggregates = unittest.mock.MagicMock() 271 | aggr_metric.analyze(correct_aggrs, control=0, treatment=1) 272 | aggr_metric.analyze_aggregates.assert_called_once() 273 | kwargs = aggr_metric.analyze_aggregates.call_args.kwargs 274 | _compare_aggrs(kwargs["control"], correct_aggrs[0]) 275 | _compare_aggrs(kwargs["treatment"], correct_aggrs[1]) 276 | 277 | 278 | def test_power_base_aggregated_analyze_frame( 279 | aggr_power: tea_tasting.metrics.base.PowerBaseAggregated[Any], 280 | data_arrow: pa.Table, 281 | correct_aggr: tea_tasting.aggr.Aggregates, 282 | ): 283 | aggr_power.solve_power_from_aggregates = unittest.mock.MagicMock() 284 | aggr_power.solve_power(data_arrow, "effect_size") 285 | aggr_power.solve_power_from_aggregates.assert_called_once() 286 | kwargs = aggr_power.solve_power_from_aggregates.call_args.kwargs 287 | _compare_aggrs(kwargs["data"], correct_aggr) 288 | assert kwargs["parameter"] == "effect_size" 289 | 290 | def test_power_base_aggregated_analyze_aggr( 291 | aggr_power: tea_tasting.metrics.base.PowerBaseAggregated[Any], 292 | correct_aggr: tea_tasting.aggr.Aggregates, 293 | ): 294 | aggr_power.solve_power_from_aggregates = unittest.mock.MagicMock() 295 | aggr_power.solve_power(correct_aggr, "rel_effect_size") 296 | aggr_power.solve_power_from_aggregates.assert_called_once() 297 | kwargs = aggr_power.solve_power_from_aggregates.call_args.kwargs 298 | 
_compare_aggrs(kwargs["data"], correct_aggr) 299 | assert kwargs["parameter"] == "rel_effect_size" 300 | 301 | 302 | def test_aggregate_by_variants_frame( 303 | data_arrow: pa.Table, 304 | aggr_cols: tea_tasting.metrics.base.AggrCols, 305 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 306 | ): 307 | aggrs = tea_tasting.metrics.base.aggregate_by_variants( 308 | data_arrow, 309 | aggr_cols=aggr_cols, 310 | variant="variant", 311 | ) 312 | _compare_aggrs(aggrs[0], correct_aggrs[0]) 313 | _compare_aggrs(aggrs[1], correct_aggrs[1]) 314 | 315 | def test_aggregate_by_variants_aggrs( 316 | aggr_cols: tea_tasting.metrics.base.AggrCols, 317 | correct_aggrs: dict[object, tea_tasting.aggr.Aggregates], 318 | ): 319 | aggrs = tea_tasting.metrics.base.aggregate_by_variants( 320 | correct_aggrs, 321 | aggr_cols=aggr_cols, 322 | variant="variant", 323 | ) 324 | _compare_aggrs(aggrs[0], correct_aggrs[0]) 325 | _compare_aggrs(aggrs[1], correct_aggrs[1]) 326 | 327 | def test_aggregate_by_variants_raises( 328 | data_arrow: pa.Table, 329 | aggr_cols: tea_tasting.metrics.base.AggrCols, 330 | ): 331 | with pytest.raises(ValueError, match="variant"): 332 | tea_tasting.metrics.base.aggregate_by_variants(data_arrow, aggr_cols=aggr_cols) 333 | 334 | 335 | def test_metric_base_granular_frame( 336 | gran_metric: tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]], 337 | data_arrow: pa.Table, 338 | correct_gran: dict[object, pa.Table], 339 | ): 340 | gran_metric.analyze_granular = unittest.mock.MagicMock() 341 | gran_metric.analyze(data_arrow, control=0, treatment=1, variant="variant") 342 | gran_metric.analyze_granular.assert_called_once() 343 | kwargs = gran_metric.analyze_granular.call_args.kwargs 344 | assert kwargs["control"].equals(correct_gran[0]) 345 | assert kwargs["treatment"].equals(correct_gran[1]) 346 | 347 | def test_metric_base_granular_gran( 348 | gran_metric: tea_tasting.metrics.base.MetricBaseGranular[dict[str, object]], 349 | correct_gran: dict[object, pa.Table], 350 | ): 351 | gran_metric.analyze_granular = unittest.mock.MagicMock() 352 | gran_metric.analyze(correct_gran, control=0, treatment=1) 353 | gran_metric.analyze_granular.assert_called_once() 354 | kwargs = gran_metric.analyze_granular.call_args.kwargs 355 | assert kwargs["control"].equals(correct_gran[0]) 356 | assert kwargs["treatment"].equals(correct_gran[1]) 357 | 358 | 359 | def test_read_granular_frame( 360 | data: Frame, 361 | cols: tuple[str, ...], 362 | correct_gran: dict[object, pa.Table], 363 | ): 364 | gran = tea_tasting.metrics.base.read_granular( 365 | data, 366 | cols=cols, 367 | variant="variant", 368 | ) 369 | assert gran[0].equals(correct_gran[0]) 370 | assert gran[1].equals(correct_gran[1]) 371 | 372 | def test_read_granular_dict( 373 | cols: tuple[str, ...], 374 | correct_gran: dict[object, pa.Table], 375 | ): 376 | gran = tea_tasting.metrics.base.read_granular( 377 | correct_gran, 378 | cols=cols, 379 | variant="variant", 380 | ) 381 | assert gran[0].equals(correct_gran[0]) 382 | assert gran[1].equals(correct_gran[1]) 383 | 384 | def test_read_granular_none( 385 | data: Frame, 386 | cols: tuple[str, ...], 387 | data_arrow: pa.Table, 388 | ): 389 | gran = tea_tasting.metrics.base.read_granular(data, cols=cols) 390 | assert gran.equals(data_arrow.select(cols)) 391 | -------------------------------------------------------------------------------- /tests/metrics/test_proportion.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 
| 3 | from typing import TYPE_CHECKING, NamedTuple 4 | import unittest.mock 5 | 6 | import pytest 7 | 8 | import tea_tasting.aggr 9 | import tea_tasting.datasets 10 | import tea_tasting.metrics.base 11 | import tea_tasting.metrics.proportion 12 | 13 | 14 | if TYPE_CHECKING: 15 | import pyarrow as pa 16 | 17 | 18 | @pytest.fixture 19 | def data_arrow() -> pa.Table: 20 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 21 | 22 | @pytest.fixture 23 | def data_aggr(data_arrow: pa.Table) -> dict[object, tea_tasting.aggr.Aggregates]: 24 | return tea_tasting.aggr.read_aggregates( 25 | data_arrow, 26 | group_col="variant", 27 | has_count=True, 28 | mean_cols=(), 29 | var_cols=(), 30 | cov_cols=(), 31 | ) 32 | 33 | 34 | def test_sample_ratio_init_default(): 35 | metric = tea_tasting.metrics.proportion.SampleRatio() 36 | assert metric.ratio == 1 37 | assert metric.method == "auto" 38 | assert metric.correction is True 39 | 40 | def test_sample_ratio_init_custom(): 41 | metric = tea_tasting.metrics.proportion.SampleRatio( 42 | {0: 0.5, 1: 0.5}, 43 | method="norm", 44 | correction=False, 45 | ) 46 | assert metric.ratio == {0: 0.5, 1: 0.5} 47 | assert metric.method == "norm" 48 | assert metric.correction is False 49 | 50 | 51 | def test_sample_ratio_aggr_cols(): 52 | metric = tea_tasting.metrics.proportion.SampleRatio() 53 | assert metric.aggr_cols == tea_tasting.metrics.base.AggrCols(has_count=True) 54 | 55 | 56 | def test_sample_ratio_analyze_frame(data_arrow: pa.Table): 57 | metric = tea_tasting.metrics.proportion.SampleRatio() 58 | result = metric.analyze(data_arrow, 0, 1, variant="variant") 59 | assert isinstance(result, tea_tasting.metrics.proportion.SampleRatioResult) 60 | 61 | def test_sample_ratio_analyze_auto(): 62 | metric = tea_tasting.metrics.proportion.SampleRatio() 63 | with unittest.mock.patch("scipy.stats.binomtest") as mock: 64 | mock.return_value = NamedTuple("Result", (("pvalue", float),))(pvalue=0.1) 65 | data = tea_tasting.datasets.make_users_data( 66 | seed=42, 67 | n_users=tea_tasting.metrics.proportion._MAX_EXACT_THRESHOLD - 1, 68 | ) 69 | metric.analyze(data, 0, 1, variant="variant") 70 | mock.assert_called_once() 71 | with unittest.mock.patch("scipy.stats.norm.sf") as mock: 72 | mock.return_value = 0.1 73 | data = tea_tasting.datasets.make_users_data( 74 | seed=42, 75 | n_users=tea_tasting.metrics.proportion._MAX_EXACT_THRESHOLD, 76 | ) 77 | metric.analyze(data, 0, 1, variant="variant") 78 | mock.assert_called_once() 79 | 80 | def test_sample_ratio_analyze_binom( 81 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 82 | ): 83 | metric = tea_tasting.metrics.proportion.SampleRatio(method="binom") 84 | result = metric.analyze(data_aggr, 0, 1, variant="variant") 85 | assert result.control == 53 86 | assert result.treatment == 47 87 | assert result.pvalue == pytest.approx(0.6172994135892521) 88 | 89 | def test_sample_ratio_analyze_norm_corr( 90 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 91 | ): 92 | metric = tea_tasting.metrics.proportion.SampleRatio(method="norm", correction=True) 93 | result = metric.analyze(data_aggr, 0, 1, variant="variant") 94 | assert result.control == 53 95 | assert result.treatment == 47 96 | assert result.pvalue == pytest.approx(0.6170750774519738) 97 | 98 | def test_sample_ratio_analyze_norm_no_corr( 99 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 100 | ): 101 | metric = tea_tasting.metrics.proportion.SampleRatio(method="norm", correction=False) 102 | result = metric.analyze(data_aggr, 0, 1, 
variant="variant") 103 | assert result.control == 53 104 | assert result.treatment == 47 105 | assert result.pvalue == pytest.approx(0.5485062355001472) 106 | 107 | def test_sample_ratio_analyze_aggregates( 108 | data_aggr: dict[object, tea_tasting.aggr.Aggregates], 109 | ): 110 | metric = tea_tasting.metrics.proportion.SampleRatio() 111 | with pytest.raises(NotImplementedError): 112 | metric.analyze_aggregates(data_aggr[0], data_aggr[1]) 113 | -------------------------------------------------------------------------------- /tests/metrics/test_resampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | import tea_tasting.config 9 | import tea_tasting.datasets 10 | import tea_tasting.metrics.base 11 | import tea_tasting.metrics.resampling 12 | 13 | 14 | if TYPE_CHECKING: 15 | import numpy.typing as npt 16 | import pyarrow as pa 17 | 18 | 19 | @pytest.fixture 20 | def data_arrow() -> pa.Table: 21 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 22 | 23 | @pytest.fixture 24 | def data_gran(data_arrow: pa.Table) -> dict[object, pa.Table]: 25 | return tea_tasting.metrics.base.read_granular( 26 | data_arrow, 27 | ("sessions", "orders", "revenue"), 28 | variant="variant", 29 | ) 30 | 31 | 32 | def test_bootstrap_init_default(): 33 | metric = tea_tasting.metrics.resampling.Bootstrap("a", np.mean) 34 | assert metric.columns == "a" 35 | assert metric.statistic == np.mean 36 | assert metric.alternative == tea_tasting.config.get_config("alternative") 37 | assert metric.confidence_level == tea_tasting.config.get_config("confidence_level") 38 | assert metric.n_resamples == tea_tasting.config.get_config("n_resamples") 39 | assert metric.method == "bca" 40 | assert metric.batch is None 41 | assert metric.random_state is None 42 | 43 | def test_bootstrap_init_custom(): 44 | metric = tea_tasting.metrics.resampling.Bootstrap( 45 | ("a", "b"), 46 | np.mean, 47 | alternative="greater", 48 | confidence_level=0.9, 49 | n_resamples=1000, 50 | method="basic", 51 | batch=100, 52 | random_state=42, 53 | ) 54 | assert metric.columns == ("a", "b") 55 | assert metric.statistic == np.mean 56 | assert metric.alternative == "greater" 57 | assert metric.confidence_level == 0.9 58 | assert metric.n_resamples == 1000 59 | assert metric.method == "basic" 60 | assert metric.batch == 100 61 | assert metric.random_state == 42 62 | 63 | 64 | def test_bootstrap_cols(): 65 | metric = tea_tasting.metrics.resampling.Bootstrap("a", np.mean) 66 | assert metric.cols == ("a",) 67 | 68 | metric = tea_tasting.metrics.resampling.Bootstrap(("a", "b"), np.mean) 69 | assert metric.cols == ("a", "b") 70 | 71 | 72 | def test_bootstrap_analyze_frame(data_arrow: pa.Table): 73 | metric = tea_tasting.metrics.resampling.Bootstrap("sessions", np.mean) 74 | result = metric.analyze(data_arrow, 0, 1, variant="variant") 75 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 76 | 77 | 78 | def test_bootstrap_analyze_default(data_gran: dict[object, pa.Table]): 79 | metric = tea_tasting.metrics.resampling.Bootstrap( 80 | "revenue", 81 | np.mean, 82 | n_resamples=100, 83 | random_state=42, 84 | ) 85 | result = metric.analyze(data_gran, 0, 1) 86 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 87 | assert result.control == pytest.approx(5.029811320754717) 88 | assert result.treatment == pytest.approx(5.43) 89 | assert 
result.effect_size == pytest.approx(0.4001886792452831) 90 | assert result.effect_size_ci_lower == pytest.approx(-3.269396309565539) 91 | assert result.effect_size_ci_upper == pytest.approx(7.219843380442667) 92 | assert result.rel_effect_size == pytest.approx(0.07956335809137971) 93 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.5658493834599828) 94 | assert result.rel_effect_size_ci_upper == pytest.approx(1.8185473860534842) 95 | 96 | def test_bootstrap_analyze_multiple_columns(data_gran: dict[object, pa.Table]): 97 | def ratio_of_means( 98 | sample: npt.NDArray[np.number], 99 | axis: int, 100 | ) -> npt.NDArray[np.number]: 101 | stat = np.mean(sample, axis=axis) # type: ignore 102 | return stat[0] / stat[1] 103 | 104 | metric = tea_tasting.metrics.resampling.Bootstrap( 105 | ("orders", "sessions"), 106 | ratio_of_means, 107 | n_resamples=100, 108 | random_state=42, 109 | ) 110 | result = metric.analyze(data_gran, 0, 1) 111 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 112 | assert result.control == pytest.approx(0.2857142857142857) 113 | assert result.treatment == pytest.approx(0.20224719101123595) 114 | assert result.effect_size == pytest.approx(-0.08346709470304975) 115 | assert result.effect_size_ci_lower == pytest.approx(-0.24780839493679777) 116 | assert result.effect_size_ci_upper == pytest.approx(0.07730723504025493) 117 | assert result.rel_effect_size == pytest.approx(-0.2921348314606741) 118 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.6424902672606227) 119 | assert result.rel_effect_size_ci_upper == pytest.approx(0.4374404130492657) 120 | 121 | def test_bootstrap_analyze_division_by_zero(data_gran: dict[object, pa.Table]): 122 | metric = tea_tasting.metrics.resampling.Bootstrap( 123 | "orders", 124 | np.median, 125 | n_resamples=100, 126 | random_state=42, 127 | method="basic", 128 | ) 129 | result = metric.analyze(data_gran, 0, 1) 130 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 131 | assert result.control == 0 132 | assert result.treatment == 0 133 | assert result.effect_size == 0 134 | assert result.effect_size_ci_lower == 0 135 | assert result.effect_size_ci_upper == 0 136 | assert np.isnan(result.rel_effect_size) 137 | assert np.isnan(result.rel_effect_size_ci_lower) 138 | assert np.isnan(result.rel_effect_size_ci_upper) 139 | 140 | def test_quantile(data_gran: dict[object, pa.Table]): 141 | metric = tea_tasting.metrics.resampling.Quantile( 142 | "revenue", 143 | q=0.8, 144 | alternative="greater", 145 | confidence_level=0.9, 146 | n_resamples=100, 147 | random_state=42, 148 | ) 149 | assert metric.column == "revenue" 150 | assert metric.q == 0.8 151 | result = metric.analyze(data_gran, 0, 1) 152 | assert isinstance(result, tea_tasting.metrics.resampling.BootstrapResult) 153 | assert result.control == pytest.approx(11.972000000000001) 154 | assert result.treatment == pytest.approx(6.2820000000000045) 155 | assert result.effect_size == pytest.approx(-5.689999999999997) 156 | assert result.effect_size_ci_lower == pytest.approx(-10.875800000000003) 157 | assert result.effect_size_ci_upper == float("inf") 158 | assert result.rel_effect_size == pytest.approx(-0.47527564316739024) 159 | assert result.rel_effect_size_ci_lower == pytest.approx(-0.8743329817472134) 160 | assert result.rel_effect_size_ci_upper == float("inf") 161 | -------------------------------------------------------------------------------- /tests/test_aggr.py: 
-------------------------------------------------------------------------------- 1 | # pyright: reportAttributeAccessIssue=false 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | import ibis 7 | import numpy as np 8 | import pandas as pd 9 | import polars as pl 10 | import pyarrow as pa 11 | import pyarrow.compute as pc 12 | import pytest 13 | 14 | import tea_tasting.aggr 15 | import tea_tasting.datasets 16 | 17 | 18 | if TYPE_CHECKING: 19 | import ibis.expr.types  # noqa: TC004 20 | 21 | 22 | Frame = ibis.expr.types.Table | pa.Table | pd.DataFrame | pl.LazyFrame 23 | 24 | 25 | COUNT = 100 26 | MEAN = {"x": 5.0, "y": 4} 27 | VAR = {"x": 3.0, "y": 2} 28 | COV = {("x", "y"): 1.0} 29 | 30 | @pytest.fixture 31 | def aggr() -> tea_tasting.aggr.Aggregates: 32 | return tea_tasting.aggr.Aggregates( 33 | count_=COUNT, 34 | mean_=MEAN, 35 | var_=VAR, 36 | cov_=COV,  # type: ignore 37 | ) 38 | 39 | 40 | @pytest.fixture 41 | def data_arrow() -> pa.Table: 42 | return tea_tasting.datasets.make_users_data(n_users=100, seed=42) 43 | 44 | @pytest.fixture 45 | def data_pandas(data_arrow: pa.Table) -> pd.DataFrame: 46 | return data_arrow.to_pandas() 47 | 48 | @pytest.fixture 49 | def data_polars(data_arrow: pa.Table) -> pl.DataFrame: 50 | return pl.from_arrow(data_arrow)  # type: ignore 51 | 52 | @pytest.fixture 53 | def data_polars_lazy(data_polars: pl.DataFrame) -> pl.LazyFrame: 54 | return data_polars.lazy() 55 | 56 | @pytest.fixture 57 | def data_duckdb(data_arrow: pa.Table) -> ibis.expr.types.Table: 58 | return ibis.connect("duckdb://").create_table("data", data_arrow) 59 | 60 | @pytest.fixture 61 | def data_sqlite(data_arrow: pa.Table) -> ibis.expr.types.Table: 62 | return ibis.connect("sqlite://").create_table("data", data_arrow) 63 | 64 | @pytest.fixture(params=[ 65 | "data_arrow", "data_pandas", 66 | "data_polars", "data_polars_lazy", 67 | "data_duckdb", "data_sqlite", 68 | ]) 69 | def data(request: pytest.FixtureRequest) -> Frame: 70 | return request.getfixturevalue(request.param) 71 | 72 | 73 | @pytest.fixture 74 | def correct_aggr(data_arrow: pa.Table) -> tea_tasting.aggr.Aggregates: 75 | return tea_tasting.aggr.Aggregates( 76 | count_=data_arrow.num_rows, 77 | mean_={ 78 | "sessions": pc.mean(data_arrow["sessions"]).as_py(), 79 | "orders": pc.mean(data_arrow["orders"]).as_py(), 80 | }, 81 | var_={ 82 | "sessions": pc.variance(data_arrow["sessions"], ddof=1).as_py(), 83 | "orders": pc.variance(data_arrow["orders"], ddof=1).as_py(), 84 | }, 85 | cov_={ 86 | ("orders", "sessions"): np.cov( 87 | data_arrow["sessions"].combine_chunks().to_numpy(zero_copy_only=False), 88 | data_arrow["orders"].combine_chunks().to_numpy(zero_copy_only=False), 89 | ddof=1, 90 | )[0, 1], 91 | }, 92 | ) 93 | 94 | @pytest.fixture 95 | def correct_aggrs(data_arrow: pa.Table) -> dict[int, tea_tasting.aggr.Aggregates]: 96 | variant_col = data_arrow["variant"] 97 | aggrs = {} 98 | for var in variant_col.unique().to_pylist(): 99 | var_data = data_arrow.filter(pc.equal(variant_col, pa.scalar(var))) 100 | aggrs |= {var: tea_tasting.aggr.Aggregates( 101 | count_=var_data.num_rows, 102 | mean_={ 103 | "sessions": pc.mean(var_data["sessions"]).as_py(), 104 | "orders": pc.mean(var_data["orders"]).as_py(), 105 | }, 106 | var_={ 107 | "sessions": pc.variance(var_data["sessions"], ddof=1).as_py(), 108 | "orders": pc.variance(var_data["orders"], ddof=1).as_py(), 109 | }, 110 | cov_={ 111 | ("orders", "sessions"): np.cov( 112 | var_data["sessions"].combine_chunks().to_numpy(zero_copy_only=False), 113 |
var_data["orders"].combine_chunks().to_numpy(zero_copy_only=False), 114 | ddof=1, 115 | )[0, 1], 116 | }, 117 | )} 118 | return aggrs 119 | 120 | 121 | def test_aggregates_init(aggr: tea_tasting.aggr.Aggregates): 122 | assert aggr.count_ == COUNT 123 | assert aggr.mean_ == MEAN 124 | assert aggr.var_ == VAR 125 | assert aggr.cov_ == COV 126 | 127 | def test_aggregates_calls(aggr: tea_tasting.aggr.Aggregates): 128 | assert aggr.count() == COUNT 129 | assert aggr.mean("x") == MEAN["x"] 130 | assert aggr.mean("y") == MEAN["y"] 131 | assert aggr.var("x") == VAR["x"] 132 | assert aggr.mean("y") == MEAN["y"] 133 | assert aggr.cov("x", "y") == COV["x", "y"] 134 | 135 | def test_aggregates_count_raises(): 136 | aggr = tea_tasting.aggr.Aggregates(count_=None, mean_={}, var_={}, cov_={}) 137 | with pytest.raises(RuntimeError): 138 | aggr.count() 139 | 140 | def test_aggregates_none(aggr: tea_tasting.aggr.Aggregates): 141 | assert aggr.mean(None) == 1 142 | assert aggr.var(None) == 0 143 | assert aggr.cov(None, "y") == 0 144 | assert aggr.cov("x", None) == 0 145 | 146 | def test_aggregates_ratio_var(aggr: tea_tasting.aggr.Aggregates): 147 | assert aggr.ratio_var("x", "y") == pytest.approx(0.2265625) 148 | 149 | def test_aggregates_ratio_cov(): 150 | aggr = tea_tasting.aggr.Aggregates( 151 | count_=None, 152 | mean_={"a": 8, "b": 7, "c": 6, "d": 5}, 153 | var_={}, 154 | cov_={("a", "c"): 4, ("a", "d"): 3, ("b", "c"): 2, ("b", "d"): 1}, 155 | ) 156 | assert aggr.ratio_cov("a", "b", "c", "d") == pytest.approx(-0.0146938775510204) 157 | 158 | def test_aggregates_add( 159 | correct_aggr: tea_tasting.aggr.Aggregates, 160 | correct_aggrs: dict[int, tea_tasting.aggr.Aggregates], 161 | ): 162 | aggrs_add = correct_aggrs[0] + correct_aggrs[1] 163 | assert aggrs_add.count_ == pytest.approx(correct_aggr.count_) 164 | assert aggrs_add.mean_ == pytest.approx(correct_aggr.mean_) 165 | assert aggrs_add.var_ == pytest.approx(correct_aggr.var_) 166 | assert aggrs_add.cov_ == pytest.approx(correct_aggr.cov_) 167 | 168 | 169 | def test_read_aggregates_groups( 170 | data: Frame, 171 | correct_aggrs: dict[int, tea_tasting.aggr.Aggregates], 172 | ): 173 | aggrs = tea_tasting.aggr.read_aggregates( 174 | data, 175 | group_col="variant", 176 | has_count=True, 177 | mean_cols=("sessions", "orders"), 178 | var_cols=("sessions", "orders"), 179 | cov_cols=(("sessions", "orders"),), 180 | ) 181 | for i in (0, 1): 182 | assert aggrs[i].count_ == pytest.approx(correct_aggrs[i].count_) 183 | assert aggrs[i].mean_ == pytest.approx(correct_aggrs[i].mean_) 184 | assert aggrs[i].var_ == pytest.approx(correct_aggrs[i].var_) 185 | assert aggrs[i].cov_ == pytest.approx(correct_aggrs[i].cov_) 186 | 187 | def test_read_aggregates_no_groups( 188 | data: Frame, 189 | correct_aggr: tea_tasting.aggr.Aggregates, 190 | ): 191 | aggr = tea_tasting.aggr.read_aggregates( 192 | data, 193 | group_col=None, 194 | has_count=True, 195 | mean_cols=("sessions", "orders"), 196 | var_cols=("sessions", "orders"), 197 | cov_cols=(("sessions", "orders"),), 198 | ) 199 | assert aggr.count_ == pytest.approx(correct_aggr.count_) 200 | assert aggr.mean_ == pytest.approx(correct_aggr.mean_) 201 | assert aggr.var_ == pytest.approx(correct_aggr.var_) 202 | assert aggr.cov_ == pytest.approx(correct_aggr.cov_) 203 | 204 | def test_read_aggregates_no_count(data_arrow: pa.Table): 205 | aggr = tea_tasting.aggr.read_aggregates( 206 | data_arrow, 207 | group_col=None, 208 | has_count=False, 209 | mean_cols=("sessions", "orders"), 210 | var_cols=(), 211 | cov_cols=(), 212 | ) 
213 | assert aggr.count_ is None 214 | assert aggr.var_ == {} 215 | assert aggr.cov_ == {} 216 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pytest 6 | 7 | import tea_tasting.config 8 | 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Iterator 12 | 13 | 14 | @pytest.fixture 15 | def reset_config() -> Iterator[None]: 16 | try: 17 | yield 18 | finally: 19 | tea_tasting.config._config_var.set(tea_tasting.config._DEFAULT_CONFIG.copy()) 20 | 21 | 22 | @pytest.mark.usefixtures("reset_config") 23 | def test_get_config(): 24 | config = tea_tasting.config.get_config() 25 | assert config == tea_tasting.config._config_var.get() 26 | config["equal_var"] = not config["equal_var"] 27 | assert config != tea_tasting.config._config_var.get() 28 | 29 | assert ( 30 | tea_tasting.config.get_config("equal_var") == 31 | tea_tasting.config._config_var.get()["equal_var"] 32 | ) 33 | 34 | 35 | @pytest.mark.usefixtures("reset_config") 36 | def test_set_config(): 37 | tea_tasting.config.set_config(equal_var=True) 38 | assert tea_tasting.config._config_var.get()["equal_var"] is True 39 | 40 | tea_tasting.config.set_config(equal_var=False) 41 | assert tea_tasting.config._config_var.get()["equal_var"] is False 42 | 43 | 44 | @pytest.mark.usefixtures("reset_config") 45 | def test_config_context(): 46 | old_equal_var = tea_tasting.config._config_var.get()["equal_var"] 47 | 48 | with tea_tasting.config.config_context(equal_var=not old_equal_var): 49 | assert tea_tasting.config._config_var.get()["equal_var"] is not old_equal_var 50 | 51 | assert tea_tasting.config._config_var.get()["equal_var"] is old_equal_var 52 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | # pyright: reportAttributeAccessIssue=false 2 | from __future__ import annotations 3 | 4 | import pandas as pd 5 | import polars as pl 6 | import pyarrow as pa 7 | import pyarrow.compute as pc 8 | 9 | import tea_tasting.datasets 10 | 11 | 12 | def test_make_users_data_default(): 13 | n_users = 100 14 | data = tea_tasting.datasets.make_users_data(seed=42, n_users=n_users) 15 | assert isinstance(data, pa.Table) 16 | assert data.column_names == ["user", "variant", "sessions", "orders", "revenue"] 17 | assert data.num_rows == n_users 18 | assert pc.count_distinct(data["user"]).as_py() == n_users 19 | assert pc.count_distinct(data["variant"]).as_py() == 2 20 | assert pc.min(data["sessions"]).as_py() > 0 21 | assert pc.min(data["orders"]).as_py() >= 0 22 | assert pc.min(data["revenue"]).as_py() >= 0 23 | assert pc.max(pc.subtract(data["orders"], data["sessions"])).as_py() <= 0 24 | assert int(pc.min(pc.equal( 25 | pc.greater(data["revenue"], 0), 26 | pc.greater(data["orders"], 0), 27 | )).as_py()) == 1 28 | 29 | def test_make_users_data_pandas(): 30 | n_users = 100 31 | data = tea_tasting.datasets.make_users_data( 32 | seed=42, n_users=n_users, return_type="pandas") 33 | assert isinstance(data, pd.DataFrame) 34 | assert data.columns.to_list() == [ 35 | "user", "variant", "sessions", "orders", "revenue"] 36 | assert data.shape[0] == n_users 37 | 38 | def test_make_users_data_polars(): 39 | n_users = 100 40 | data = tea_tasting.datasets.make_users_data( 41 | seed=42, n_users=n_users,
return_type="polars") 42 | assert isinstance(data, pl.DataFrame) 43 | assert data.columns == [ 44 | "user", "variant", "sessions", "orders", "revenue"] 45 | assert data.shape[0] == n_users 46 | 47 | 48 | def test_make_users_data_covariates(): 49 | n_users = 100 50 | data = tea_tasting.datasets.make_users_data( 51 | seed=42, covariates=True, n_users=n_users) 52 | assert isinstance(data, pa.Table) 53 | assert data.column_names == [ 54 | "user", "variant", "sessions", "orders", "revenue", 55 | "sessions_covariate", "orders_covariate", "revenue_covariate", 56 | ] 57 | assert pc.min(data["sessions_covariate"]).as_py() >= 0 58 | assert pc.min(data["orders_covariate"]).as_py() >= 0 59 | assert pc.min(data["revenue_covariate"]).as_py() >= 0 60 | assert pc.min(pc.subtract( 61 | data["orders_covariate"], 62 | data["sessions_covariate"], 63 | )).as_py() <= 0 64 | assert int(pc.min(pc.equal( 65 | pc.greater_equal(data["revenue_covariate"], 0), 66 | pc.greater_equal(data["orders_covariate"], 0), 67 | )).as_py()) == 1 68 | 69 | 70 | def test_make_sessions_data_default(): 71 | n_users = 100 72 | data = tea_tasting.datasets.make_sessions_data(seed=42, n_users=n_users) 73 | assert isinstance(data, pa.Table) 74 | assert data.column_names == ["user", "variant", "sessions", "orders", "revenue"] 75 | assert data.num_rows > n_users 76 | assert pc.count_distinct(data["user"]).as_py() == n_users 77 | assert pc.count_distinct(data["variant"]).as_py() == 2 78 | assert pc.min(data["sessions"]).as_py() == 1 79 | assert pc.max(data["sessions"]).as_py() == 1 80 | assert pc.min(data["orders"]).as_py() >= 0 81 | assert pc.min(data["revenue"]).as_py() >= 0 82 | assert pc.min(pc.subtract(data["orders"], data["sessions"])).as_py() <= 0 83 | assert int(pc.min(pc.equal( 84 | pc.greater_equal(data["revenue"], 0), 85 | pc.greater_equal(data["orders"], 0), 86 | )).as_py()) == 1 87 | 88 | def test_make_sessions_data_pandas(): 89 | n_users = 100 90 | data = tea_tasting.datasets.make_sessions_data( 91 | seed=42, n_users=n_users, return_type="pandas") 92 | assert isinstance(data, pd.DataFrame) 93 | assert data.columns.to_list() == [ 94 | "user", "variant", "sessions", "orders", "revenue"] 95 | assert data.shape[0] > n_users 96 | 97 | def test_make_sessions_data_polars(): 98 | n_users = 100 99 | data = tea_tasting.datasets.make_sessions_data( 100 | seed=42, n_users=n_users, return_type="polars") 101 | assert isinstance(data, pl.DataFrame) 102 | assert data.columns == [ 103 | "user", "variant", "sessions", "orders", "revenue"] 104 | assert data.shape[0] > n_users 105 | 106 | 107 | def test_make_sessions_data_covariates(): 108 | n_users = 100 109 | data = tea_tasting.datasets.make_sessions_data( 110 | seed=42, covariates=True, n_users=n_users) 111 | assert isinstance(data, pa.Table) 112 | assert data.column_names == [ 113 | "user", "variant", "sessions", "orders", "revenue", 114 | "sessions_covariate", "orders_covariate", "revenue_covariate", 115 | ] 116 | assert pc.min(data["sessions_covariate"]).as_py() >= 0 117 | assert pc.min(data["orders_covariate"]).as_py() >= 0 118 | assert pc.min(data["revenue_covariate"]).as_py() >= 0 119 | assert pc.min(pc.subtract( 120 | data["orders_covariate"], 121 | data["sessions_covariate"], 122 | )).as_py() <= 0 123 | assert int(pc.min(pc.equal( 124 | pc.greater_equal(data["revenue_covariate"], 0), 125 | pc.greater_equal(data["orders_covariate"], 0), 126 | )).as_py()) == 1 127 | -------------------------------------------------------------------------------- /tests/test_version.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import importlib.metadata 5 | import unittest.mock 6 | 7 | import tea_tasting.version 8 | 9 | 10 | def test_version(): 11 | assert isinstance(tea_tasting.version.__version__, str) 12 | 13 | with ( 14 | unittest.mock.patch( 15 | "tea_tasting.version.importlib.metadata.version") as version, 16 | unittest.mock.patch("tea_tasting.version.importlib.resources.files") as files, 17 | ): 18 | ( 19 | files.return_value 20 | .joinpath.return_value 21 | .read_text.return_value 22 | .strip.return_value 23 | ) = "version" 24 | 25 | version.side_effect = importlib.metadata.PackageNotFoundError("Not found") 26 | importlib.reload(tea_tasting.version) 27 | assert tea_tasting.version.__version__ == "version" 28 | --------------------------------------------------------------------------------