├── .github └── workflows │ └── deploy.yml ├── .gitignore ├── LICENSE ├── README.md ├── animation.ipynb ├── bayeshist ├── __init__.py ├── bayeshist.py └── plotting.py ├── demo.ipynb ├── doc ├── bayesian-histogram-comp.png └── samples.png └── setup.py /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Upload release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | 19 | - name: Auto-bump version 20 | run: | 21 | # from refs/tags/v1.2.3 get 1.2.3 22 | VERSION=$(echo $GITHUB_REF | sed 's#.*/v##') 23 | PLACEHOLDER="__version__\s*=\s*[\"'](.+)[\"']" 24 | VERSION_FILE="bayeshist/__init__.py" 25 | # ensure the placeholder is there. If grep doesn't find the placeholder 26 | # it exits with exit code 1 and github actions aborts the build. 27 | VERSION_LINE=$(grep -E "$PLACEHOLDER" "$VERSION_FILE") 28 | sed -i "s/$VERSION_LINE/__version__ = \"${VERSION}\"/g" "$VERSION_FILE" 29 | shell: bash 30 | 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install setuptools wheel twine 35 | 36 | - name: Build sdist 37 | run: | 38 | python setup.py sdist 39 | 40 | - name: Publish to PyPI 41 | uses: pypa/gh-action-pypi-publish@release/v1 42 | with: 43 | user: __token__ 44 | password: ${{ secrets.PYPI_API_TOKEN }} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dion Häfner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bayesian histograms 2 | 3 | **Bayesian histograms** are a nifty tool for data mining if: 4 | 5 | - you want to know how the *event rate* (probability) of a binary **rare event** depends on a parameter; 6 | - you have millions or even **billions of data points**, but few positive samples; 7 | - you suspect the event rate depends **highly non-linearly** on the parameter; 8 | - you don't know whether you have *enough data*, so you need **uncertainty information**. 9 | 10 | Thanks to an adaptive bin pruning algorithm, you don't even have to choose the number of bins, and you should get good results out of the box. 11 | 12 | This is how they look in practice ([see full example below](#usage-example)): 13 | 14 |

15 | ![Bayesian histograms in action](doc/bayesian-histogram-comp.png) 16 |
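The shaded bands in these plots are credible intervals of a per-bin posterior. In a nutshell, every bin gets a Beta posterior over its event rate built from the counts it contains, which is exactly what `bayesian_histogram` returns (the symbols $k$, $m$, $\alpha_0$, $\beta_0$ below are just notation for this README, not part of the package API):

$$
p_{\mathrm{bin}} \sim \mathrm{Beta}\bigl(\alpha_0 + k,\ \beta_0 + m\bigr),
$$

where $k$ and $m$ are the numbers of positive and negative samples falling into the bin, and $(\alpha_0, \beta_0)$ are the prior parameters (see `prior_params` in the API reference below). The pruning step then merges neighboring bins whose posteriors are statistically indistinguishable.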

17 | 18 | ## Installation 19 | 20 | ```bash 21 | $ pip install bayeshist 22 | ``` 23 | 24 | ## Usage example 25 | 26 | Assume you have binary samples of a rare event like this: 27 | 28 |

29 | ![Binary samples of a rare event](doc/samples.png) 30 |
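If you don't have data at hand and just want to try the package, you can generate synthetic samples along these lines (a minimal sketch, not part of the package; the sample size and the event-rate curve are arbitrary choices, and `X`, `y` are the arrays used in the snippet below):

```python
import numpy as np

rng = np.random.default_rng(42)

# one continuous feature
X = rng.standard_normal(100_000)

# a rare event whose rate depends non-linearly on X (illustrative choice)
p_true = np.clip(10 ** (-3 + X), 0.0, 1.0)

# one Bernoulli draw per sample
y = rng.random(X.shape) < p_true
```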

31 | 32 | Compute and plot a Bayesian histogram: 33 | 34 | ```python 35 | >>> from bayeshist import bayesian_histogram, plot_bayesian_histogram 36 | 37 | # compute Bayesian histogram from samples 38 | >>> bin_edges, beta_dist = bayesian_histogram(X, y, bins=100, pruning_method="bayes") 39 | 40 | # beta_dist is a frozen `scipy.stats.beta` distribution, so we can get the 41 | # predicted mean event rate for each histogram bin like this: 42 | >>> bin_mean_pred = beta_dist.mean() 43 | 44 | # plot it up 45 | >>> plot_bayesian_histogram(bin_edges, beta_dist) 46 | ``` 47 | 48 | The result is something like this: 49 | 50 |

51 | ![Bayesian histogram of the event rate](doc/bayesian-histogram-comp.png) 52 |
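Because `beta_dist` is a plain (frozen) `scipy.stats` distribution, you can also query the per-bin posterior directly, for example to tabulate credible intervals (a small sketch; the 1%/99% quantiles mirror the default shading of `plot_bayesian_histogram`):

```python
# posterior summary per bin
bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])
rate_median = beta_dist.median()
rate_low, rate_high = beta_dist.ppf(0.01), beta_dist.ppf(0.99)

for center, lo, med, hi in zip(bin_centers, rate_low, rate_median, rate_high):
    print(f"x ~ {center:+.2f}: event rate {med:.2e} (98% CI: {lo:.2e} to {hi:.2e})")
```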

53 | 54 | See also [demo.ipynb](demo.ipynb) for a full walkthrough of this example. 55 | 56 | ## But how do they work? 57 | 58 | [Here's the blog post.](https://dionhaefner.github.io/2021/09/bayesian-histograms-for-rare-event-classification/) 59 | 60 | ## API reference 61 | 62 | ### `bayesian_histogram` 63 | 64 | ```python 65 | 66 | def bayesian_histogram( 67 | x: np.ndarray, 68 | y: np.ndarray, 69 | bins: Union[int, Iterable] = 100, 70 | x_range: Optional[Tuple[float, float]] = None, 71 | prior_params: Optional[Tuple[float, float]] = None, 72 | pruning_method: Optional[Literal["bayes", "fisher"]] = "bayes", 73 | pruning_threshold: Optional[float] = None, 74 | max_bin_size: Optional[float] = None, 75 | ) -> Tuple[np.ndarray, FrozenDistType]: 76 | """Compute Bayesian histogram for data x, binary target y. 77 | 78 | The output is a Beta distribution over the event rate for each bin. 79 | 80 | Parameters: 81 | 82 | x: 83 | 1-dim array of data. 84 | 85 | y: 86 | 1-dim array of binary labels (0 or 1). 87 | 88 | bins: 89 | int giving the number of equally spaced initial bins, 90 | or array giving initial bin edges. (default: 100) 91 | 92 | x_range: 93 | Range spanned by binning. Not used if `bins` is an array. 94 | (default: [min(x), max(x)]) 95 | 96 | prior_params: 97 | Parameters to use in Beta prior. First value relates to positive, 98 | second value to negative samples. [0.5, 0.5] represents the Jeffreys prior, [1, 1] a flat 99 | prior. The default is a weakly informative prior based on the global event rate. 100 | (default: `[1, num_neg / num_pos]`) 101 | 102 | pruning_method: 103 | Method to use to decide whether neighboring bins should be merged or not. 104 | Valid values are "bayes" (Bayes factor), "fisher" (exact Fisher test), or None 105 | (no pruning). (default: "bayes") 106 | 107 | pruning_threshold: 108 | Threshold to use in significance test specified by `pruning_method`. 109 | (default: 2 for "bayes", 0.2 for "fisher") 110 | 111 | max_bin_size: 112 | Maximum size (in units of x) above which bins will not be merged 113 | (except empty bins). (default: unlimited size) 114 | 115 | Returns: 116 | 117 | bin_edges: Coordinates of bin edges 118 | beta_dist: n-dimensional Beta distribution (n = number of bins) 119 | 120 | Example: 121 | 122 | >>> x = np.random.randn(1000) 123 | >>> p = 10 ** (-2 + x) 124 | >>> y = np.random.rand(1000) < p 125 | >>> bins, beta_dist = bayesian_histogram(x, y) 126 | >>> plt.plot(0.5 * (bins[1:] + bins[:-1]), beta_dist.mean()) 127 | 128 | """ 129 | ``` 130 | 131 | ### `plot_bayesian_histogram` 132 | 133 | ```python 134 | def plot_bayesian_histogram( 135 | bin_edges: np.ndarray, 136 | data_dist: FrozenDistType, 137 | color: Union[str, Iterable[float], None] = None, 138 | label: Optional[str] = None, 139 | ax: Any = None, 140 | ci: Optional[Tuple[float, float]] = (0.01, 0.99) 141 | ) -> None: 142 | """Plot a Bayesian histogram as horizontal lines with credible intervals. 143 | 144 | Parameters: 145 | 146 | bin_edges: 147 | Coordinates of bin edges 148 | 149 | data_dist: 150 | n-dimensional Beta distribution (n = number of bins) 151 | 152 | color: 153 | Color to use (default: use next in current color cycle) 154 | 155 | label: 156 | Legend label (default: no label) 157 | 158 | ax: 159 | Matplotlib axis to use (default: current axis) 160 | 161 | ci: 162 | Credible interval used for shading; use `None` to disable shading.
163 | 164 | Example: 165 | 166 | >>> x = np.random.randn(1000) 167 | >>> p = 10 ** (-2 + x) 168 | >>> y = np.random.rand() < p 169 | >>> bins, beta_dist = bayesian_histogram(x, y) 170 | >>> plot_bayesian_histogram(bins, beta_dist) 171 | 172 | """ 173 | ``` 174 | 175 | ## Questions? 176 | 177 | [Feel free to open an issue.](https://github.com/dionhaefner/bayesian-histograms/issues) 178 | -------------------------------------------------------------------------------- /animation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "presidential-syndrome", 6 | "metadata": {}, 7 | "source": [ 8 | "# Create an animation of histogram pruning\n", 9 | "\n", 10 | "(messy code warning)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "editorial-samoa", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from functools import partial\n", 21 | "from tqdm.notebook import tqdm\n", 22 | "from matplotlib.animation import FuncAnimation\n", 23 | "from matplotlib.collections import PatchCollection\n", 24 | "from matplotlib.patches import Rectangle\n", 25 | "from IPython.display import display\n", 26 | "\n", 27 | "from bayeshist.bayeshist import _prune_histogram, _bayes_factor_test\n", 28 | "\n", 29 | "\n", 30 | "plt.rcParams[\"animation.html\"] = \"html5\"\n", 31 | "\n", 32 | "bin_edges = np.linspace(-4, 4, 40)\n", 33 | "neg_samples, _ = np.histogram(test_x[test_y == 0], bins=bin_edges)\n", 34 | "pos_samples, _ = np.histogram(test_x[test_y == 1], bins=bin_edges)\n", 35 | "\n", 36 | "pruning_threshold = 2\n", 37 | "prior_params = (1, 1000)\n", 38 | "test = partial(_bayes_factor_test, threshold=pruning_threshold)\n", 39 | "pruner = _prune_histogram(bin_edges, pos_samples, neg_samples, test, prior_params, yield_steps=True)\n", 40 | "\n", 41 | "states = [state for state in pruner if not isinstance(state, tuple)]\n", 42 | "\n", 43 | "fig = plt.figure(figsize=(9, 6))\n", 44 | "ylim = 1e2 * max(pos_samples.max(), neg_samples.max())\n", 45 | "\n", 46 | "pbar = tqdm()\n", 47 | "speedup_after = 3\n", 48 | "num_steps = 5\n", 49 | "frame_cutoff = 10 * speedup_after * num_steps\n", 50 | "\n", 51 | "\n", 52 | "def animate(frameno):\n", 53 | " pbar.update(1)\n", 54 | " \n", 55 | " if frameno < frame_cutoff:\n", 56 | " frameno = frameno // 10\n", 57 | " else:\n", 58 | " frameno = frameno - (frame_cutoff - frame_cutoff // 10)\n", 59 | " \n", 60 | " state_idx, step = frameno // num_steps, frameno % num_steps\n", 61 | " \n", 62 | " fig.clear()\n", 63 | " ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])\n", 64 | " \n", 65 | " axprob = fig.add_axes([0.1, 0.1, 0.8, 0.8], frameon=False)\n", 66 | " axprob.grid(False)\n", 67 | " axprob.yaxis.tick_right()\n", 68 | " axprob.yaxis.set_label_position(\"right\")\n", 69 | " axprob.set(\n", 70 | " xlim=(bin_edges[0], bin_edges[-1]),\n", 71 | " xticks=[],\n", 72 | " ylim=(1e-4, 1e1),\n", 73 | " yscale=\"log\"\n", 74 | " )\n", 75 | " axprob.set_ylabel(\"Event rate\", y=0.25)\n", 76 | " axprob.set_yticks([1e-4, 1e-3, 1e-2])\n", 77 | " axprob.tick_params(which=\"both\", right=False)\n", 78 | " \n", 79 | " is_final_state = state_idx >= len(states)\n", 80 | " \n", 81 | " if is_final_state:\n", 82 | " state = states[-1]\n", 83 | " step = 0\n", 84 | " else:\n", 85 | " state = states[state_idx]\n", 86 | " \n", 87 | " i = state[\"i\"]\n", 88 | " bins = state[\"bins\"]\n", 89 | " bin_centers = 0.5 * (bins[1:] + bins[:-1])\n", 90 | "\n", 91 | " if 
is_final_state:\n", 92 | " ax.hist(test_x[test_y == 0], log=True, alpha=0.6, bins=bins, facecolor=\"C0\", label=\"y = 0\")\n", 93 | " ax.hist(test_x[test_y == 1], log=True, alpha=0.6, bins=bins, facecolor=\"C1\", label=\"y = 1\")\n", 94 | " fig.text(0.5, 0.85, \"Final Bayesian histogram\", ha=\"center\", weight=\"bold\")\n", 95 | " else:\n", 96 | " ax.hist(test_x[test_y == 0], log=True, alpha=0.4, bins=bins, facecolor=\"C0\")\n", 97 | " ax.hist(test_x[test_y == 1], log=True, alpha=0.4, bins=bins, facecolor=\"C1\")\n", 98 | "\n", 99 | " ax.hist(test_x[test_y == 0], log=True, alpha=0.8, bins=bins[i:i+3], facecolor=\"C0\", label=\"y = 0\")\n", 100 | " ax.hist(test_x[test_y == 1], log=True, alpha=0.8, bins=bins[i:i+3], facecolor=\"C1\", label=\"y = 1\")\n", 101 | " \n", 102 | " event_dist = scipy.stats.beta(state[\"pos_samples\"] + prior_params[0], state[\"neg_samples\"] + prior_params[1])\n", 103 | " ci_low, ci_high = event_dist.ppf(0.01), event_dist.ppf(0.99)\n", 104 | "\n", 105 | " # background boxes\n", 106 | " errorboxes = [\n", 107 | " Rectangle((x1, y1), x2 - x1, y2 - y1)\n", 108 | " for x1, x2, y1, y2\n", 109 | " in zip(bins[:-1], bins[1:], ci_low, ci_high)\n", 110 | " ]\n", 111 | "\n", 112 | " pc = PatchCollection(errorboxes, facecolor=\"0.2\", alpha=0.2)\n", 113 | " axprob.add_collection(pc)\n", 114 | "\n", 115 | " # median indicator\n", 116 | " axprob.hlines(event_dist.median(), bins[:-1], bins[1:], colors=\"0.2\", label=\"p(y = 1)\")\n", 117 | "\n", 118 | " # box edges\n", 119 | " ax.hlines(ci_low, bins[:-1], bins[1:], colors=\"0.2\", alpha=0.8, linewidth=1)\n", 120 | " ax.hlines(ci_high, bins[:-1], bins[1:], colors=\"0.2\", alpha=0.8, linewidth=1)\n", 121 | " \n", 122 | " fig.legend(loc=\"upper center\", ncol=3, frameon=False)\n", 123 | "\n", 124 | " if step > 0:\n", 125 | " axdist1 = fig.add_axes([0.16, 0.72, 0.2, 0.1])\n", 126 | " axdist1.axis(\"off\") \n", 127 | " dist_x = np.logspace(-5, 0, 100)\n", 128 | "\n", 129 | " with np.errstate(divide='ignore'):\n", 130 | " alpha_1, beta_1 = state[\"samples_1\"]\n", 131 | " axdist1.plot(dist_x, scipy.stats.beta(alpha_1 + prior_params[0], beta_1 + prior_params[1]).pdf(dist_x), c=\"0.2\", label=\"original\")\n", 132 | "\n", 133 | " alpha_2, beta_2 = state[\"samples_2\"]\n", 134 | " axdist1.plot(dist_x, scipy.stats.beta(alpha_2 + prior_params[0], beta_2 + prior_params[1]).pdf(dist_x), c=\"0.2\")\n", 135 | "\n", 136 | " alpha_comb, beta_comb = alpha_1 + alpha_2, beta_1 + beta_2\n", 137 | " axdist1.plot(dist_x, scipy.stats.beta(alpha_comb + prior_params[0], beta_comb + prior_params[1]).pdf(dist_x), c=\"coral\", label=\"merged\")\n", 138 | "\n", 139 | " axdist1.text(0.5, -0.05, \"p(y = 1)\", transform=axdist1.transAxes, va=\"top\", ha=\"center\", color=\"0.2\")\n", 140 | " axdist1.set_xscale(\"log\")\n", 141 | " axdist1.set_title(\"Event rate distributions\", weight=\"bold\")\n", 142 | " axdist1.legend(loc=\"upper right\", frameon=False, labelcolor=\"linecolor\", handlelength=0)\n", 143 | "\n", 144 | " if step > 1:\n", 145 | " p_1 = scipy.stats.betabinom(alpha_1 + beta_1, alpha_1 + prior_params[0], beta_1 + prior_params[1]).logpmf(alpha_1)\n", 146 | " ax.text(bin_centers[i], beta_1, f\"{p_1:.1f}\", ha=\"center\", va=\"bottom\", fontsize=9)\n", 147 | "\n", 148 | " p_2 = scipy.stats.betabinom(alpha_2 + beta_2, alpha_2 + prior_params[0], beta_2 + prior_params[1]).logpmf(alpha_2)\n", 149 | " ax.text(bin_centers[i+1], beta_2, f\"{p_2:.1f}\", ha=\"center\", fontsize=9)\n", 150 | "\n", 151 | " p_c1 = scipy.stats.betabinom(alpha_1 + beta_1, 
alpha_comb + prior_params[0], beta_comb + prior_params[1]).logpmf(alpha_1)\n", 152 | " ax.text(bin_centers[i], beta_1 * 2, f\"{p_c1:.1f}\", color=\"coral\", ha=\"center\", fontsize=9)\n", 153 | "\n", 154 | " p_c2 = scipy.stats.betabinom(alpha_2 + beta_2, alpha_comb + prior_params[0], beta_comb + prior_params[1]).logpmf(alpha_2)\n", 155 | " ax.text(bin_centers[i+1], beta_2 * 2, f\"{p_c2:.1f}\", color=\"coral\", ha=\"center\", fontsize=9)\n", 156 | "\n", 157 | " ax.text(bins[i+1], max(beta_1, beta_2) * 4, \"Data log likelihood\", ha=\"center\", weight=\"bold\")\n", 158 | "\n", 159 | " if step > 2:\n", 160 | " compsign = \"$>$\" if state[\"test_value\"] > pruning_threshold else \"$\\\\ngtr$\" \n", 161 | " fig.text(0.8, 0.85, \"Log likelihood $\\\\Delta$\", ha=\"center\", va=\"top\", weight=\"bold\")\n", 162 | " fig.text(0.8, 0.82, f\"{np.log(state['test_value']):.2f} {compsign} log({pruning_threshold})\", ha=\"center\", va=\"top\")\n", 163 | "\n", 164 | " if step > 3:\n", 165 | " merge_text = \"merge\" if state[\"reverse_split\"] else \"don't merge\"\n", 166 | " ax.annotate(\"\", xy=(0.8, 0.78), xytext=(0.8, 0.7), arrowprops=dict(arrowstyle=\"<-\", color=\"black\"), xycoords=\"figure fraction\", textcoords=\"figure fraction\")\n", 167 | " fig.text(0.8, 0.7, f\"{merge_text}\", ha=\"center\", va=\"top\", weight=\"bold\")\n", 168 | "\n", 169 | " ax.set(\n", 170 | " xlabel=\"x\",\n", 171 | " ylabel=\"Count\",\n", 172 | " xlim=(bin_edges[0], bin_edges[-1]),\n", 173 | " ylim=(0.5, ylim),\n", 174 | " )\n", 175 | " \n", 176 | "animate(100)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "loved-steps", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "num_frames = frame_cutoff - frame_cutoff // 10 + len(states) * num_steps + 20\n", 187 | "\n", 188 | "with tqdm(total=num_frames) as pbar:\n", 189 | " anim = FuncAnimation(fig, animate, frames=num_frames, interval=100)\n", 190 | " display(anim)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "brown-stephen", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "anim.save(\"bayes-pruning.mp4\")" 201 | ] 202 | } 203 | ], 204 | "metadata": { 205 | "kernelspec": { 206 | "display_name": "Python 3", 207 | "language": "python", 208 | "name": "python3" 209 | }, 210 | "language_info": { 211 | "codemirror_mode": { 212 | "name": "ipython", 213 | "version": 3 214 | }, 215 | "file_extension": ".py", 216 | "mimetype": "text/x-python", 217 | "name": "python", 218 | "nbconvert_exporter": "python", 219 | "pygments_lexer": "ipython3", 220 | "version": "3.9.6" 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 5 225 | } 226 | -------------------------------------------------------------------------------- /bayeshist/__init__.py: -------------------------------------------------------------------------------- 1 | from .bayeshist import bayesian_histogram 2 | from .plotting import plot_bayesian_histogram 3 | 4 | __version__ = "v0.0+dev" 5 | __all__ = ["bayesian_histogram", "plot_bayesian_histogram"] 6 | -------------------------------------------------------------------------------- /bayeshist/bayeshist.py: -------------------------------------------------------------------------------- 1 | """bayeshist.py 2 | 3 | Bayesian histograms for binary targets. 
4 | """ 5 | 6 | from typing import Iterable, Literal, Optional, Tuple, Union 7 | from functools import partial 8 | 9 | import numpy as np 10 | 11 | from scipy.stats import beta as beta_dist, fisher_exact 12 | from scipy.special import betaln 13 | 14 | FrozenDistType = type(beta_dist(0, 0)) 15 | 16 | 17 | def _bayes_factor_test(ps1, ns1, ps2, ns2, prior_p, prior_n, threshold=2): 18 | """Tests whether two binomial datasets come from the same distribution. 19 | 20 | Computes the Bayes Factor of hypotheses: 21 | 22 | H1: Samples are drawn with p_i ~ Beta(alpha_i, beta_i), i={1,2} 23 | H0: Both samples are drawn with p ~ Beta(alpha_1 + alpha_2, beta_1 + beta_2) 24 | 25 | The Bayes Factor gives the relative increase in data likelihood 26 | after the split (higher values -> splitting is more favorable). 27 | """ 28 | # alpha and beta coefficients for distribution of p(y=1) 29 | alpha_1 = ps1 + prior_p 30 | beta_1 = ns1 + prior_n 31 | 32 | alpha_2 = ps2 + prior_p 33 | beta_2 = ns2 + prior_n 34 | 35 | alpha_tot = ps1 + ps2 + prior_p 36 | beta_tot = ns1 + ns2 + prior_n 37 | 38 | # we could use scipy.state.betabinom here, but betaln is faster 39 | def betabinom_logp(ps, ns, alpha, beta): 40 | # this omits choose(n, k), drops out in Bayes factor 41 | return betaln(ps + alpha, ns + beta) - betaln(alpha, beta) 42 | 43 | bayes_factor = np.exp( 44 | -betabinom_logp(ps1, ns1, alpha_tot, beta_tot) 45 | - betabinom_logp(ps2, ns2, alpha_tot, beta_tot) 46 | + betabinom_logp(ps1, ns1, alpha_1, beta_1) 47 | + betabinom_logp(ps2, ns2, alpha_2, beta_2) 48 | ) 49 | 50 | return bayes_factor > threshold, bayes_factor 51 | 52 | 53 | def _fisher_test(ps1, ns1, ps2, ns2, *args, threshold=0.05): 54 | """Tests whether two binomial datasets come from the same distribution. 55 | 56 | Uses an exact Fisher test. Prior parameters are unused. 57 | """ 58 | _, pvalue = fisher_exact([[ps1, ps2], [ns1, ns2]]) 59 | return pvalue < threshold, pvalue 60 | 61 | 62 | def _prune_histogram(bin_edges, pos_samples, neg_samples, test, prior_params, max_bin_size=None, yield_steps=False): 63 | """Perform histogram pruning. 64 | 65 | This iteratively merges neighboring bins until all neighbor pairs pass 66 | the given statistical test. 
67 | """ 68 | if max_bin_size is None: 69 | max_bin_size = float("inf") 70 | 71 | while True: 72 | new_bins = [] 73 | new_pos_samples = [] 74 | new_neg_samples = [] 75 | 76 | num_bins = len(bin_edges) - 1 77 | splits_reversed = 0 78 | 79 | i = 0 80 | 81 | while True: 82 | if i == num_bins: 83 | break 84 | 85 | elif i == num_bins - 1: 86 | # only 1 bin left, nothing to compare to 87 | new_bins.append(bin_edges[i]) 88 | new_pos_samples.append(pos_samples[i]) 89 | new_neg_samples.append(neg_samples[i]) 90 | break 91 | 92 | is_significant, test_value = test( 93 | pos_samples[i], 94 | neg_samples[i], 95 | pos_samples[i + 1], 96 | neg_samples[i + 1], 97 | *prior_params 98 | ) 99 | 100 | reverse_split = ( 101 | not is_significant 102 | # ensure that we stay below max_bin_size 103 | and (bin_edges[i + 1] - bin_edges[i] < max_bin_size) 104 | # but always merge empty bins 105 | or (neg_samples[i] == pos_samples[i] == 0) 106 | or (neg_samples[i + 1] == pos_samples[i + 1] == 0) 107 | ) 108 | 109 | if yield_steps: 110 | yield dict( 111 | i=i - splits_reversed, 112 | samples_1=(pos_samples[i], neg_samples[i]), 113 | samples_2=(pos_samples[i+1], neg_samples[i+1]), 114 | test_value=test_value, 115 | is_significant=is_significant, 116 | reverse_split=reverse_split, 117 | bins=np.concatenate((new_bins, bin_edges[i:])), 118 | pos_samples=np.concatenate((new_pos_samples, pos_samples[i:])), 119 | neg_samples=np.concatenate((new_neg_samples, neg_samples[i:])), 120 | ) 121 | 122 | if reverse_split: 123 | splits_reversed += 1 124 | new_bins.append(bin_edges[i]) 125 | new_pos_samples.append(pos_samples[i] + pos_samples[i + 1]) 126 | new_neg_samples.append(neg_samples[i] + neg_samples[i + 1]) 127 | i += 2 128 | else: 129 | # keep everything and proceed with next pair 130 | new_bins.append(bin_edges[i]) 131 | new_pos_samples.append(pos_samples[i]) 132 | new_neg_samples.append(neg_samples[i]) 133 | i += 1 134 | 135 | new_bins.append(bin_edges[-1]) 136 | 137 | assert len(new_bins) == len(bin_edges) - splits_reversed 138 | 139 | bin_edges = new_bins 140 | pos_samples = new_pos_samples 141 | neg_samples = new_neg_samples 142 | 143 | if not splits_reversed: 144 | # no changes made -> we are done 145 | break 146 | 147 | bin_edges = np.array(bin_edges) 148 | pos_samples = np.array(pos_samples) 149 | neg_samples = np.array(neg_samples) 150 | 151 | yield bin_edges, pos_samples, neg_samples 152 | 153 | 154 | def bayesian_histogram( 155 | x: np.ndarray, 156 | y: np.ndarray, 157 | bins: Union[int, Iterable] = 100, 158 | x_range: Optional[Tuple[float, float]] = None, 159 | prior_params: Optional[Tuple[float, float]] = None, 160 | pruning_method: Optional[Literal["bayes", "fisher"]] = "bayes", 161 | pruning_threshold: Optional[float] = None, 162 | max_bin_size: Optional[float] = None, 163 | ) -> Tuple[np.ndarray, FrozenDistType]: 164 | """Compute Bayesian histogram for data x, binary target y. 165 | 166 | The output is a Beta distribution over the event rate for each bin. 167 | 168 | Parameters: 169 | 170 | x: 171 | 1-dim array of data. 172 | 173 | y: 174 | 1-dim array of binary labels (0 or 1). 175 | 176 | bins: 177 | int giving the number of equally spaced intial bins, 178 | or array giving initial bin edges. (default: 100) 179 | 180 | x_range: 181 | Range spanned by binning. Not used if `bins` is an array. 182 | (default: [min(x), max(x)]) 183 | 184 | prior_params: 185 | Parameters to use in Beta prior. First value relates to positive, 186 | second value to negative samples. 
[0.5, 0.5] represents Jeffrey's prior, [1, 1] a flat 187 | prior. The default is a weakly informative prior based on the global event rate. 188 | (default: `[1, num_neg / num_pos]`) 189 | 190 | pruning_method: 191 | Method to use to decide whether neighboring bins should be merged or not. 192 | Valid values are "bayes" (Bayes factor), "fisher" (exact Fisher test), or None 193 | (no pruning). (default: "bayes") 194 | 195 | pruning_threshold: 196 | Threshold to use in significance test specified by `pruning_method`. 197 | (default: 2 for "bayes", 0.2 for "fisher") 198 | 199 | max_bin_size: 200 | Maximum size (in units of x) above which bins will not be merged 201 | (except empty bins). (default: unlimited size) 202 | 203 | Returns: 204 | 205 | bin_edges: Coordinates of bin edges 206 | beta_dist: n-dimensional Beta distribution (n = number of bins) 207 | 208 | Example: 209 | 210 | >>> x = np.random.randn(1000) 211 | >>> p = 10 ** (-2 + x) 212 | >>> y = np.random.rand() < p 213 | >>> bins, beta_dist = bayesian_histogram(x, y) 214 | >>> plt.plot(0.5 * (bins[1:] + bins[:-1]), beta_dist.mean()) 215 | 216 | """ 217 | x = np.asarray(x) 218 | y = np.asarray(y) 219 | 220 | if not np.all(np.isin(np.unique(y), [0, 1])): 221 | raise ValueError("Binary targets y can only have values 0 and 1") 222 | 223 | if x_range is None: 224 | x_range = (np.min(x), np.max(x)) 225 | 226 | if pruning_method == "bayes": 227 | if pruning_threshold is None: 228 | # default bayes factor threshold 229 | pruning_threshold = 2 230 | 231 | test = partial(_bayes_factor_test, threshold=pruning_threshold) 232 | 233 | elif pruning_method == "fisher": 234 | if pruning_threshold is None: 235 | # default p-value threshold 236 | pruning_threshold = 0.2 237 | 238 | test = partial(_fisher_test, threshold=pruning_threshold) 239 | 240 | elif pruning_method is not None: 241 | raise ValueError('pruning_method must be "bayes", "fisher", or None.') 242 | 243 | if np.isscalar(bins): 244 | bin_edges = np.linspace(*x_range, bins + 1) 245 | else: 246 | bin_edges = np.asarray(bins) 247 | 248 | neg_samples, _ = np.histogram(x[y == 0], bins=bin_edges) 249 | pos_samples, _ = np.histogram(x[y == 1], bins=bin_edges) 250 | 251 | if prior_params is None: 252 | # default prior is weakly informative, using global event rate 253 | num_pos_samples = np.sum(pos_samples) 254 | num_neg_samples = np.sum(neg_samples) 255 | 256 | if num_pos_samples > num_neg_samples: 257 | prior_params = (num_pos_samples / num_neg_samples, 1) 258 | else: 259 | prior_params = (1, num_neg_samples / num_pos_samples) 260 | 261 | if pruning_method is not None: 262 | pruner = _prune_histogram( 263 | bin_edges, pos_samples, neg_samples, test, prior_params, max_bin_size=max_bin_size 264 | ) 265 | bin_edges, pos_samples, neg_samples = next(iter(pruner)) 266 | 267 | return bin_edges, beta_dist( 268 | pos_samples + prior_params[0], neg_samples + prior_params[1] 269 | ) 270 | -------------------------------------------------------------------------------- /bayeshist/plotting.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Iterable, Optional, Any, Tuple 2 | import numpy as np 3 | 4 | from .bayeshist import FrozenDistType 5 | 6 | 7 | def plot_bayesian_histogram( 8 | bin_edges: np.ndarray, 9 | data_dist: FrozenDistType, 10 | color: Union[str, Iterable[float], None] = None, 11 | label: Optional[str] = None, 12 | ax: Any = None, 13 | ci: Optional[Tuple[float, float]] = (0.01, 0.99) 14 | ) -> None: 15 | """Plot a Bayesian histogram 
as horizontal lines with credible intervals. 16 | 17 | Parameters: 18 | 19 | bin_edges: 20 | Coordinates of bin edges 21 | 22 | data_dist: 23 | n-dimensional Beta distribution (n = number of bins) 24 | 25 | color: 26 | Color to use (default: use next in current color cycle) 27 | 28 | label: 29 | Legend label (default: no label) 30 | 31 | ax: 32 | Matplotlib axis to use (default: current axis) 33 | 34 | ci: 35 | Credible interval used for shading, use `None` to disable shading. 36 | 37 | Example: 38 | 39 | >>> x = np.random.randn(1000) 40 | >>> p = 10 ** (-2 + x) 41 | >>> y = np.random.rand() < p 42 | >>> bins, beta_dist = bayesian_histogram(x, y) 43 | >>> plot_bayesian_histogram(bins, beta_dist) 44 | 45 | """ 46 | import matplotlib.pyplot as plt 47 | from matplotlib.collections import PatchCollection 48 | from matplotlib.patches import Rectangle 49 | 50 | if ax is None: 51 | ax = plt.gca() 52 | 53 | if color is None: 54 | # advance color cycle 55 | dummy, = ax.plot([], []) 56 | color = dummy.get_color() 57 | 58 | if ci is not None: 59 | ci_low, ci_high = data_dist.ppf(ci[0]), data_dist.ppf(ci[1]) 60 | 61 | # background boxes 62 | errorboxes = [ 63 | Rectangle((x1, y1), x2 - x1, y2 - y1) 64 | for x1, x2, y1, y2 65 | in zip(bin_edges[:-1], bin_edges[1:], ci_low, ci_high) 66 | ] 67 | 68 | pc = PatchCollection(errorboxes, facecolor=color, alpha=0.2) 69 | ax.add_collection(pc) 70 | 71 | # box edges 72 | ax.hlines(ci_low, bin_edges[:-1], bin_edges[1:], colors=color, alpha=0.8, linewidth=1) 73 | ax.hlines(ci_high, bin_edges[:-1], bin_edges[1:], colors=color, alpha=0.8, linewidth=1) 74 | 75 | # median indicator 76 | ax.hlines(data_dist.median(), bin_edges[:-1], bin_edges[1:], colors=color, label=label) 77 | -------------------------------------------------------------------------------- /doc/bayesian-histogram-comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dionhaefner/bayesian-histograms/f0caff585d2837ae759e798d1d486a4411ff2fdf/doc/bayesian-histogram-comp.png -------------------------------------------------------------------------------- /doc/samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dionhaefner/bayesian-histograms/f0caff585d2837ae759e798d1d486a4411ff2fdf/doc/samples.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | from codecs import open 6 | import os 7 | import re 8 | 9 | here = os.path.abspath(os.path.dirname(__file__)) 10 | 11 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f: 12 | long_description = f.read() 13 | 14 | 15 | # read version from __init__.py 16 | version_file = os.path.join(here, "bayeshist", "__init__.py") 17 | version_pattern = re.compile(r"__version__\s*=\s*[\"'](.+)[\"']") 18 | 19 | with open(version_file, encoding="utf-8") as f: 20 | for line in f: 21 | match = version_pattern.match(line) 22 | if match: 23 | version = match.group(1) 24 | break 25 | else: 26 | raise RuntimeError("Could not determine version from __init__.py") 27 | 28 | 29 | setup( 30 | name="bayeshist", 31 | license="MIT", 32 | version=version, 33 | description="Bayesian histograms for estimation of binary event rates", 34 | long_description=long_description, 35 | long_description_content_type="text/markdown", 36 | author="Dion 
Häfner", 37 | author_email="mail@dionhaefner.de", 38 | url="https://github.com/dionhaefner/bayeshist", 39 | packages=["bayeshist"], 40 | install_requires=[ 41 | "numpy", 42 | "scipy", 43 | ], 44 | python_requires=">=3.8", 45 | zip_safe=False, 46 | ) 47 | --------------------------------------------------------------------------------