├── .gitignore ├── README.rst ├── LICENSE.rst └── bb.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Bayesian Blocks 2 | ======================================= 3 | 4 | This program computes Bayesian Blocks, including a visualisation, 5 | from a sequence of event times. 6 | 7 | Usage 8 | ------ 9 | 10 | $ ./bb.py mydatafile 11 | 12 | * Will create a plot "mydatafilebblocks.pdf" 13 | * And a data file "mydatafilebblocks.txt", containing three columns: 14 | 15 | * start time 16 | * stop time 17 | * rate 18 | 19 | License 20 | -------- 21 | 22 | The implementation is adopted from astroML, under BSD-3 clause license. 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2013, Jacob Vanderplas 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
# (LICENSE.rst, continued)
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# --------------------------------------------------------------------------------
# /bb.py:
# --------------------------------------------------------------------------------
#!/usr/bin/python
"""
Bayesian Block implementation
=============================

Dynamic programming algorithm for finding the optimal adaptive-width histogram.

Based on Scargle et al 2012 [1]_

References
----------
.. [1] http://adsabs.harvard.edu/abs/2012arXiv1207.5578S
"""
import sys
import warnings

import numpy as np
# TODO: implement other fitness functions from appendix B of Scargle 2012


class FitnessFunc(object):
    """Base class for fitness functions

    Each fitness function class has the following:
    - fitness(...) : compute fitness function.
       Arguments accepted by fitness must be among [T_k, N_k, a_k, b_k, c_k]
    - prior(N, Ntot) : compute prior on N given a total number of points Ntot

    Parameters
    ----------
    p0 : float
        False alarm probability used by `p0_prior`.
    gamma : float or None
        If not None, `prior` uses `gamma_prior` and `p0` is ignored.
    """
    def __init__(self, p0=0.05, gamma=None):
        self.p0 = p0
        self.gamma = gamma

    def validate_input(self, t, x, sigma):
        """Check that input is valid (the base class accepts anything)"""
        pass

    def fitness(self, **kwargs):
        """Compute the block fitness; must be overridden by subclasses"""
        raise NotImplementedError()

    def prior(self, N, Ntot):
        """Return the penalty subtracted from the fitness of each block

        `bayesian_blocks` calls this as ``prior(R + 1, Ntot)``, where
        ``R + 1`` is the number of cells considered so far.
        """
        if self.gamma is None:
            return self.p0_prior(N, Ntot)
        else:
            return self.gamma_prior(N, Ntot)

    def p0_prior(self, N, Ntot):
        """Prior calibrated on the false alarm probability p0"""
        # eq. 21 from Scargle 2012
        return 4 - np.log(73.53 * self.p0 * (N ** -0.478))

    def gamma_prior(self, N, Ntot):
        """Basic prior, parametrized by gamma (eq. 3 in Scargle 2012)"""
        if self.gamma == 1:
            return 0
        else:
            return (np.log(1 - self.gamma)
                    - np.log(1 - self.gamma ** (Ntot + 1))
                    + N * np.log(self.gamma))

    # the args property returns the names of the arguments accepted by
    # the method fitness().  This allows more efficient computation below,
    # because only the quantities that are actually needed get computed.
    @property
    def args(self):
        try:
            # Python 2
            code = self.fitness.func_code
        except AttributeError:
            code = self.fitness.__code__
        # co_varnames lists the parameters first and then the local
        # variables; co_argcount bounds the slice to real parameters only.
        return code.co_varnames[1:code.co_argcount]


class Events(FitnessFunc):
    """Fitness for binned or unbinned events

    The prior is inherited unchanged from `FitnessFunc`.

    Parameters
    ----------
    p0 : float
        False alarm probability, used to compute the prior on N
        (see eq. 21 of Scargle 2012).  Default is p0 = 0.05.
    gamma : float or None
        If specified, then use this gamma to compute the general prior form,
        p ~ gamma^N.  If gamma is specified, p0 is ignored.
    """
    def fitness(self, N_k, T_k):
        # eq. 19 from Scargle 2012
        return N_k * (np.log(N_k) - np.log(T_k))


class RegularEvents(FitnessFunc):
    """Fitness for regular events

    This is for data which has a fundamental "tick" length, so that all
    measured values are multiples of this tick length.  In each tick, there
    are either zero or one counts.

    Parameters
    ----------
    dt : float
        tick rate for data
    p0 : float
        False alarm probability, used for the prior when gamma is None
    gamma : float
        specifies the prior on the number of bins: p ~ gamma^N
    """
    def __init__(self, dt, p0=0.05, gamma=None):
        FitnessFunc.__init__(self, p0=p0, gamma=gamma)
        self.dt = dt

    def validate_input(self, t, x, sigma):
        """Raise ValueError unless x contains only the values 0 and 1"""
        unique_x = np.unique(x)
        if list(unique_x) not in ([0], [1], [0, 1]):
            raise ValueError("Regular events must have only 0 and 1 in x")

    def fitness(self, T_k, N_k):
        # Eq. 75 of Scargle 2012
        M_k = T_k / self.dt
        N_over_M = N_k * 1. / M_k

        eps = 1E-8
        if np.any(N_over_M > 1 + eps):
            warnings.warn('regular events: N/M > 1.  '
                          'Is the time step correct?')

        one_m_NM = 1 - N_over_M
        # replace non-positive ratios by 1 so that their log term is zero
        # instead of -inf / nan
        N_over_M[N_over_M <= 0] = 1
        one_m_NM[one_m_NM <= 0] = 1

        return N_k * np.log(N_over_M) + (M_k - N_k) * np.log(one_m_NM)


class PointMeasures(FitnessFunc):
    """Fitness for point measures

    Parameters
    ----------
    p0 : float or None
        If gamma is None and p0 is not None, the p0-based prior is used.
    gamma : float
        specifies the prior on the number of bins: p ~ gamma^N
        if neither gamma nor p0 is specified, then a prior based on
        simulations will be used (see sec 3.3 of Scargle 2012)
    """
    def __init__(self, p0=None, gamma=None):
        FitnessFunc.__init__(self, p0=p0, gamma=gamma)

    def fitness(self, a_k, b_k):
        # eq. 41 from Scargle 2012
        return (b_k * b_k) / (4 * a_k)

    def prior(self, N, Ntot):
        """Return the gamma prior, the p0 prior or the simulation prior"""
        if self.gamma is not None:
            return self.gamma_prior(N, Ntot)
        elif self.p0 is not None:
            return self.p0_prior(N, Ntot)
        else:
            # eq. at end of sec 3.3 in Scargle 2012
            return 1.32 + 0.577 * np.log10(N)


def bayesian_blocks(t, x=None, sigma=None,
                    fitness='events', **kwargs):
    """Bayesian Blocks Implementation

    This is a flexible implementation of the Bayesian Blocks algorithm
    described in Scargle 2012 [1]_

    Parameters
    ----------
    t : array_like
        data times (one dimensional, length N)
    x : array_like (optional)
        data values
    sigma : array_like or float (optional)
        data errors
    fitness : str or object
        the fitness function to use.
        If a string, the following options are supported:

        - 'events' : binned or unbinned event data
            extra arguments are `p0`, which gives the false alarm probability
            to compute the prior, or `gamma` which gives the slope of the
            prior on the number of bins.
        - 'regular_events' : non-overlapping events measured at multiples
            of a fundamental tick rate, `dt`, which must be specified as an
            additional argument.  The prior can be specified through `gamma`,
            which gives the slope of the prior on the number of bins.
        - 'measures' : fitness for a measured sequence with Gaussian errors
            The prior can be specified using `gamma`, which gives the slope
            of the prior on the number of bins.  If `gamma` is not specified,
            then a simulation-derived prior will be used.

        Alternatively, the fitness can be a user-specified object of
        type derived from the FitnessFunc class.

    Returns
    -------
    edges : ndarray
        array containing the (N+1) bin edges

    Raises
    ------
    ValueError
        If the inputs are inconsistent (wrong shapes, non-integer counts,
        repeated times together with `x`, unknown fitness, empty `t`, ...).

    Examples
    --------
    Event data:

    >>> t = np.random.normal(size=100)
    >>> bins = bayesian_blocks(t, fitness='events', p0=0.01)

    Event data with repeats:

    >>> t = np.random.normal(size=100)
    >>> t[80:] = t[:20]
    >>> bins = bayesian_blocks(t, fitness='events', p0=0.01)

    Regular event data:

    >>> dt = 0.01
    >>> t = dt * np.arange(1000)
    >>> x = np.zeros(len(t))
    >>> x[np.random.randint(0, len(t), len(t) // 10)] = 1
    >>> bins = bayesian_blocks(t, x, fitness='regular_events', dt=dt,
    ...                        gamma=0.9)

    Measured point data with errors:

    >>> t = 100 * np.random.random(100)
    >>> x = np.exp(-0.5 * (t - 50) ** 2)
    >>> sigma = 0.1
    >>> x_obs = np.random.normal(x, sigma)
    >>> bins = bayesian_blocks(t, x_obs, sigma, fitness='measures')

    References
    ----------
    .. [1] Scargle, J `et al.` (2012)
           http://adsabs.harvard.edu/abs/2012arXiv1207.5578S

    See Also
    --------
    astroML.plotting.hist : histogram plotting function which can make use
                            of bayesian blocks.
    """
    # validate array input
    t = np.asarray(t, dtype=float)
    if x is not None:
        x = np.asarray(x)
    if sigma is not None:
        sigma = np.asarray(sigma)

    # verify the fitness function
    if fitness == 'events':
        if x is not None and np.any(x % 1 > 0):
            raise ValueError("x must be integer counts for fitness='events'")
        fitfunc = Events(**kwargs)
    elif fitness == 'regular_events':
        if x is not None and (np.any(x % 1 > 0) or np.any(x > 1)):
            raise ValueError("x must be 0 or 1 for fitness='regular_events'")
        fitfunc = RegularEvents(**kwargs)
    elif fitness == 'measures':
        if x is None:
            raise ValueError("x must be specified for fitness='measures'")
        fitfunc = PointMeasures(**kwargs)
    else:
        if not (hasattr(fitness, 'args') and
                hasattr(fitness, 'fitness') and
                hasattr(fitness, 'prior')):
            raise ValueError("fitness not understood")
        fitfunc = fitness

    # explicit checks instead of `assert`, which is stripped under -O
    if t.ndim != 1:
        raise ValueError("t must be one dimensional")
    if t.size == 0:
        raise ValueError("t must contain at least one data point")

    # find unique values of t (np.unique also sorts them)
    unq_t, unq_ind, unq_inv = np.unique(t, return_index=True,
                                        return_inverse=True)

    # if x is not specified, x will be counts at each time
    if x is None:
        if sigma is not None:
            raise ValueError("If sigma is specified, x must be specified")

        if len(unq_t) == len(t):
            x = np.ones_like(t)
        else:
            x = np.bincount(unq_inv)

        t = unq_t

    # if x is specified, then we need to sort t and x together
    else:
        if len(t) != len(x):
            raise ValueError("Size of t and x does not match")

        if len(unq_t) != len(t):
            raise ValueError("Repeated values in t not supported when "
                             "x is specified")

        # verify the given sigma value
        if sigma is not None:
            if sigma.shape not in [(), (1,), (t.size,)]:
                raise ValueError('sigma does not match the shape of x')
            if sigma.shape == (t.size,):
                # per-point errors must follow the same reordering as x,
                # otherwise they get paired with the wrong measurements
                sigma = sigma[unq_ind]

        t = unq_t
        x = x[unq_ind]

    N = t.size
    if sigma is None:
        sigma = 1

    # validate the input (user-supplied fitness objects are only required
    # to provide args / fitness / prior, so validate_input is optional)
    validate = getattr(fitfunc, 'validate_input', None)
    if validate is not None:
        validate(t, x, sigma)

    # names of the quantities the fitness function wants; looked up once
    # because `args` is a property that inspects the code object
    needed = fitfunc.args

    # compute values needed for computation, below
    if 'a_k' in needed:
        ak_raw = np.ones_like(x) / sigma / sigma
    if 'b_k' in needed:
        bk_raw = x / sigma / sigma
    if 'c_k' in needed:
        ck_raw = x * x / sigma / sigma

    # create length-(N + 1) array of cell edges
    edges = np.concatenate([t[:1],
                            0.5 * (t[1:] + t[:-1]),
                            t[-1:]])
    block_length = t[-1] - edges

    # arrays to store the best configuration
    best = np.zeros(N, dtype=float)
    last = np.zeros(N, dtype=int)

    #-----------------------------------------------------------------
    # Start with first data cell; add one cell at each iteration
    #-----------------------------------------------------------------
    for R in range(N):
        # Compute fit_vec : fitness of putative last block (end at R)
        kwds = {}

        # T_k: width/duration of each block
        if 'T_k' in needed:
            kwds['T_k'] = block_length[:R + 1] - block_length[R + 1]

        # N_k: number of elements in each block
        # (reversed cumulative sum: entry i sums cells i..R)
        if 'N_k' in needed:
            kwds['N_k'] = np.cumsum(x[:R + 1][::-1])[::-1]

        # a_k: eq. 31
        if 'a_k' in needed:
            kwds['a_k'] = 0.5 * np.cumsum(ak_raw[:R + 1][::-1])[::-1]

        # b_k: eq. 32
        if 'b_k' in needed:
            kwds['b_k'] = - np.cumsum(bk_raw[:R + 1][::-1])[::-1]

        # c_k: eq. 33
        if 'c_k' in needed:
            kwds['c_k'] = 0.5 * np.cumsum(ck_raw[:R + 1][::-1])[::-1]

        # evaluate fitness function
        fit_vec = fitfunc.fitness(**kwds)

        # total fitness when the last block starts at cell i: fitness of
        # that block, minus the prior, plus the best fitness up to i - 1
        A_R = fit_vec - fitfunc.prior(R + 1, N)
        A_R[1:] += best[:R]

        i_max = np.argmax(A_R)
        last[R] = i_max
        best[R] = A_R[i_max]

    #-----------------------------------------------------------------
    # Now find changepoints by iteratively peeling off the last block
    #-----------------------------------------------------------------
    change_points = np.zeros(N, dtype=int)
    i_cp = N
    ind = N
    while True:
        i_cp -= 1
        change_points[i_cp] = ind
        if ind == 0:
            break
        ind = last[ind - 1]
    change_points = change_points[i_cp:]

    return edges[change_points]


def bbhist(x):
    """Compute the Bayesian Blocks histogram of a sequence of event times

    Parameters
    ----------
    x : array_like
        event times (one dimensional)

    Returns
    -------
    edges : ndarray
        block edges as returned by `bayesian_blocks`
    values : ndarray
        event rate in each block: number of events divided by block width
    """
    x = np.asarray(x, dtype=float)
    edges = bayesian_blocks(x)
    # np.histogram treats the last bin as closed on the right, so the
    # latest event (which coincides with the last edge) is counted too
    counts, _ = np.histogram(x, bins=edges)
    values = counts / np.diff(edges)
    return edges, values


def main(argv=None):
    """Command line entry point: ``bb.py datafile``

    Reads event times from `datafile`, then writes `datafile` + 'bblocks.txt'
    (columns: start time, stop time, rate) and `datafile` + 'bblocks.pdf'.

    Returns
    -------
    status : int
        0 on success, 1 if no data file was given
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'bb.py'
        sys.stderr.write("Usage: %s datafile\n" % prog)
        return 1

    filename = argv[1]
    data = np.loadtxt(filename)
    edges, values = bbhist(data)
    # column_stack builds a real 2D array; a bare zip() is a lazy iterator
    # on Python 3, which np.savetxt cannot write
    np.savetxt(filename + 'bblocks.txt',
               np.column_stack([edges[:-1], edges[1:], values]))

    # imported here so the module can be used without matplotlib installed
    import matplotlib.pyplot as plt
    # positional left edges + align='edge': works with both old and new
    # matplotlib (the `left` keyword was removed, and bars are centred
    # by default in newer versions)
    plt.bar(edges[:-1], values, width=np.diff(edges), align='edge')
    plt.savefig(filename + 'bblocks.pdf', bbox_inches='tight')
    plt.close()
    return 0


if __name__ == '__main__':
    sys.exit(main())