├── LICENSE ├── README.md ├── __init__.py ├── ndtest.py ├── requirements.txt └── setup.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 syrte 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ndtest 2 | Multi-dimensional statistical tests with python 3 | 4 | - 2D Kolmogorov–Smirnov test (KS test) 5 | - 1D and 2D energy distance statistics test 6 | 7 | ## Citation 8 | The code has been cited in ~40 papers 9 | according to [Google Scholar](https://scholar.google.com/scholar?q=%22syrte%2Fndtest%22) :star_struck: 10 | 11 | If you used this *code* in your paper, I would be grateful for a simple footnote mentioning the author and URL of this repo, 12 | such as 13 | ``` 14 | We perform the 2D Kolmogorov–Smirnov test ... 15 | using the public code \textsc{ndtest}\footenote{Written by Zhaozhou Li, \url{https://github.com/syrte/ndtest}}. 16 | ``` 17 | As to references for the *algorithms*, please check the docstrings. 18 | 19 | 20 | ## Installation 21 | 22 | ```bash 23 | pip install git+https://github.com/syrte/ndtest 24 | ``` 25 | 26 | ## Usage example 27 | ``` 28 | import numpy as np 29 | import ndtest 30 | 31 | # generate mock samples for the test 32 | np.random.seed(42) 33 | x1, y1 = np.random.randn(2, 100) 34 | x2, y2 = np.random.randn(2, 100) # same distribution as (x1, y1) 35 | x3, y3 = np.random.randn(2, 100) * 1.5 + 0.5 # different distribution from (x1, y1) 36 | 37 | # 2D KS 38 | P, D = ndtest.ks2d2s(x1, y1, x2, y2, extra=True) 39 | print(f"{P=:.3g}, {D=:.3g}") 40 | # P=0.219, D=0.17 41 | 42 | P, D = ndtest.ks2d2s(x1, y1, x3, y3, extra=True) 43 | print(f"{P=:.3g}, {D=:.3g}") 44 | # P=2.36e-05, D=0.385 # very small P, as expected. 45 | ``` 46 | 47 | See the docstring for the detailed usage and explanation. 48 | 49 | 50 | ## Star History 51 | 52 | [![Star History Chart](https://api.star-history.com/svg?repos=syrte/ndtest&type=Date)](https://star-history.com/#syrte/ndtest&Date) 53 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .ndtest import * 2 | -------------------------------------------------------------------------------- /ndtest.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | from numpy import random 4 | from scipy.spatial.distance import pdist, cdist 5 | from scipy.stats import kstwobign, pearsonr 6 | from scipy.stats import genextreme 7 | 8 | __all__ = ['ks2d2s', 'estat', 'estat2d'] 9 | 10 | 11 | def ks2d2s(x1, y1, x2, y2, nboot=None, extra=False): 12 | '''Two-dimensional Kolmogorov-Smirnov test on two samples. 13 | Parameters 14 | ---------- 15 | x1, y1 : ndarray, shape (n1, ) 16 | Data of sample 1. 17 | x2, y2 : ndarray, shape (n2, ) 18 | Data of sample 2. Size of two samples can be different. 19 | nboot : None or int 20 | Number of bootstrap resample to estimate the p-value. A large number is expected. 21 | If None, an approximate analytic estimate will be used. 22 | extra: bool, optional 23 | If True, KS statistic is also returned. Default is False. 24 | 25 | Returns 26 | ------- 27 | p : float 28 | Two-tailed p-value. 29 | D : float, optional 30 | KS statistic, returned if keyword `extra` is True. 31 | 32 | Notes 33 | ----- 34 | This is the two-sided K-S test. Small p-values means that the two samples are significantly different. 35 | Note that the p-value is only an approximation as the analytic distribution is unkonwn. The approximation 36 | is accurate enough when N > ~20 and p-value < ~0.20 or so. When p-value > 0.20, the value may not be accurate, 37 | but it certainly implies that the two samples are not significantly different. (cf. Press 2007) 38 | 39 | References 40 | ---------- 41 | Peacock, J.A. 1983, Two-Dimensional Goodness-of-Fit Testing in Astronomy, MNRAS, 202, 615-627 42 | Fasano, G. and Franceschini, A. 1987, A Multidimensional Version of the Kolmogorov-Smirnov Test, MNRAS, 225, 155-170 43 | Press, W.H. et al. 2007, Numerical Recipes, section 14.8 44 | 45 | ''' 46 | assert (len(x1) == len(y1)) and (len(x2) == len(y2)) 47 | n1, n2 = len(x1), len(x2) 48 | D = avgmaxdist(x1, y1, x2, y2) 49 | 50 | if nboot is None: 51 | sqen = np.sqrt(n1 * n2 / (n1 + n2)) 52 | r1 = pearsonr(x1, y1)[0] 53 | r2 = pearsonr(x2, y2)[0] 54 | r = np.sqrt(1 - 0.5 * (r1**2 + r2**2)) 55 | d = D * sqen / (1 + r * (0.25 - 0.75 / sqen)) 56 | p = kstwobign.sf(d) 57 | else: 58 | n = n1 + n2 59 | x = np.concatenate([x1, x2]) 60 | y = np.concatenate([y1, y2]) 61 | d = np.empty(nboot, 'f') 62 | for i in range(nboot): 63 | idx = random.choice(n, n, replace=True) 64 | ix1, ix2 = idx[:n1], idx[n1:] 65 | #ix1 = random.choice(n, n1, replace=True) 66 | #ix2 = random.choice(n, n2, replace=True) 67 | d[i] = avgmaxdist(x[ix1], y[ix1], x[ix2], y[ix2]) 68 | p = np.sum(d > D).astype('f') / nboot 69 | if extra: 70 | return p, D 71 | else: 72 | return p 73 | 74 | 75 | def avgmaxdist(x1, y1, x2, y2): 76 | D1 = maxdist(x1, y1, x2, y2) 77 | D2 = maxdist(x2, y2, x1, y1) 78 | return (D1 + D2) / 2 79 | 80 | 81 | def maxdist(x1, y1, x2, y2): 82 | n1 = len(x1) 83 | D1 = np.empty((n1, 4)) 84 | for i in range(n1): 85 | a1, b1, c1, d1 = quadct(x1[i], y1[i], x1, y1) 86 | a2, b2, c2, d2 = quadct(x1[i], y1[i], x2, y2) 87 | D1[i] = [a1 - a2, b1 - b2, c1 - c2, d1 - d2] 88 | 89 | # re-assign the point to maximize difference, 90 | # the discrepancy is significant for N < ~50 91 | D1[:, 0] -= 1 / n1 92 | 93 | dmin, dmax = -D1.min(), D1.max() + 1 / n1 94 | return max(dmin, dmax) 95 | 96 | 97 | def quadct(x, y, xx, yy): 98 | n = len(xx) 99 | ix1, ix2 = xx <= x, yy <= y 100 | a = np.sum(ix1 & ix2) / n 101 | b = np.sum(ix1 & ~ix2) / n 102 | c = np.sum(~ix1 & ix2) / n 103 | d = 1 - a - b - c 104 | return a, b, c, d 105 | 106 | 107 | def estat2d(x1, y1, x2, y2, **kwds): 108 | return estat(np.c_[x1, y1], np.c_[x2, y2], **kwds) 109 | 110 | 111 | def estat(x, y, nboot=1000, replace=False, method='log', fitting=False): 112 | ''' 113 | Energy distance statistics test. 114 | Reference 115 | --------- 116 | Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free 117 | multivariate goodness-of-fit tests, two-sample comparison and unfolding. 118 | Nuc Instr and Meth in Phys Res A 537: 626-636 119 | Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics 120 | based on distances. J Stat Planning & Infer 143: 1249-1272 121 | Brian Lau, multdist, https://github.com/brian-lau/multdist 122 | 123 | ''' 124 | n, N = len(x), len(x) + len(y) 125 | stack = np.vstack([x, y]) 126 | stack = (stack - stack.mean(0)) / stack.std(0) 127 | if replace: 128 | rand = lambda x: random.randint(x, size=x) 129 | else: 130 | rand = random.permutation 131 | 132 | en = energy(stack[:n], stack[n:], method) 133 | en_boot = np.zeros(nboot, 'f') 134 | for i in range(nboot): 135 | idx = rand(N) 136 | en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method) 137 | 138 | if fitting: 139 | param = genextreme.fit(en_boot) 140 | p = genextreme.sf(en, *param) 141 | return p, en, param 142 | else: 143 | p = (en_boot >= en).sum() / nboot 144 | return p, en, en_boot 145 | 146 | 147 | def energy(x, y, method='log'): 148 | dx, dy, dxy = pdist(x), pdist(y), cdist(x, y) 149 | n, m = len(x), len(y) 150 | if method == 'log': 151 | dx, dy, dxy = np.log(dx), np.log(dy), np.log(dxy) 152 | elif method == 'gaussian': 153 | raise NotImplementedError 154 | elif method == 'linear': 155 | pass 156 | else: 157 | raise ValueError 158 | z = dxy.sum() / (n * m) - dx.sum() / n**2 - dy.sum() / m**2 159 | # z = ((n*m)/(n+m)) * z # ref. SR 160 | return z 161 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open('README.md', 'r') as fp: 4 | long_description = fp.read() 5 | 6 | setup( 7 | name='ndtest', 8 | version='0.1', 9 | description='Multi-dimensional statistical tests with python, including the 2D Kolmogorov–Smirnov test and energy distance statistics.', 10 | long_description=long_description, 11 | long_description_content_type='text/markdown', 12 | url='https://github.com/syrte/ndtest/', 13 | keywords=['statistics', 'statistical test', 'Python'], 14 | author='Zhaozhou Li', 15 | author_email='lizz.astro@gmail.com', 16 | py_modules=['ndtest'], 17 | install_requires=['numpy', 'scipy'], 18 | classifiers=[ 19 | 'Development Status :: 5 - Production/Stable', 20 | 'Topic :: Scientific/Engineering', 21 | 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 22 | 'Programming Language :: Python :: 2', 23 | 'Programming Language :: Python :: 3', 24 | ], 25 | ) 26 | --------------------------------------------------------------------------------