├── setup.cfg ├── pybursts ├── __init__.py ├── .DS_Store └── pybursts.py ├── .gitignore ├── CHANGELOG.md ├── setup.py ├── LICENSE.txt └── README.md /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /pybursts/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["pybursts"] 2 | from pybursts import * 3 | -------------------------------------------------------------------------------- /pybursts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/romain-fontugne/pybursts/HEAD/pybursts/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | 4 | build/ 5 | develop-eggs/ 6 | dist/ 7 | eggs/ 8 | lib/ 9 | lib64/ 10 | parts/ 11 | sdist/ 12 | var/ 13 | *.egg-info/ 14 | .installed.cfg 15 | *.egg 16 | 17 | pip-log.txt 18 | pip-delete-this-directory.txt 19 | 20 | MANIFEST 21 | .DS_Store 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 0.1.1 2 | * Update readme [view commit](http://github.com/rpoddighe/pybursts/commit/92e695f30ab8faf7375d81030f1124b73b903fa5) 3 | * Tidy up module imports [view commit](http://github.com/rpoddighe/pybursts/commit/c665e5ffee63d3087eae99bc6781773ea4d64aef) 4 | * Add .gitignore [view commit](http://github.com/rpoddighe/pybursts/commit/d8ed0480afe89193e4f56c008a7edf4922571855) 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name = 'pybursts', 5 | packages = ['pybursts'], # this must be the same as the name above 6 | version = '0.1.1', 7 | description = 'A Python port from the \'burst detection\' algorithm by Kleinberg, originally implemented in R', 8 | author = 'Renzo Poddighe', 9 | author_email = 'poddighe.renzo@gmail.com', 10 | url = 'https://github.com/rpoddighe/pybursts', # use the URL to the github repo 11 | download_url = 'https://github.com/rpoddighe/pybursts/tarball/0.1.1', # I'll explain this in a second 12 | keywords = ['burst detection', 'data mining', 'text mining'], # arbitrary keywords 13 | classifiers = [], 14 | ) 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Renzo Poddighe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pybursts 2 | 3 | ## Changelog 4 | 5 | ### 0.1.2 6 | * Update readme [view commit](http://github.com/rpoddighe/pybursts/commit/92e695f30ab8faf7375d81030f1124b73b903fa5) 7 | * Tidy up module imports [view commit](http://github.com/rpoddighe/pybursts/commit/c665e5ffee63d3087eae99bc6781773ea4d64aef) 8 | * Add .gitignore [view commit](http://github.com/rpoddighe/pybursts/commit/d8ed0480afe89193e4f56c008a7edf4922571855) 9 | 10 | ## Description 11 | This is a Python port of the [R implementation](http://cran.r-project.org/web/packages/bursts/index.html) of Kleinberg's algorithm (described in ['Bursty and Hierarchical Structure in Streams'](http://www.cs.cornell.edu/home/kleinber/bhs.pdf)). The algorithm models activity bursts in a time series as an infinite hidden Markov model. 12 | 13 | ## Installation 14 | 15 | ```shell 16 | pip install pybursts 17 | ``` 18 | 19 | or 20 | 21 | ```shell 22 | easy_install pybursts 23 | ``` 24 | 25 | ## Dependencies 26 | * [NumPy](http://www.numpy.org/) 27 | 28 | 29 | ## Usage 30 | ```python 31 | 32 | import pybursts 33 | 34 | offsets = [4, 17, 23, 27, 33, 35, 37, 76, 77, 82, 84, 88, 90, 92] 35 | print pybursts.kleinberg(offsets, s=2, gamma=0.1) 36 | 37 | ``` 38 | 39 | ## Input 40 | 41 | * *offsets*: a list of time offsets (numeric) 42 | * *s*: the base of the exponential distribution that is used for modeling the event frequencies 43 | * *gamma*: coefficient for the transition costs between states 44 | 45 | This version also alows to set n and T in order to have a fixed cost function (not dependent of the given offsets). Which is needed if you want to compare bursts for different inputs. 46 | 47 | ## Output 48 | 49 | An array of intervals in which a burst of activity was detected. The first column denotes the level within the hierarchy; the second column the start value of the interval; the third column the end value. The first row is always the top-level activity (the complete interval from start to finish). 50 | 51 | ## References 52 | 53 | * [CRAN - Package bursts](http://cran.r-project.org/web/packages/bursts/index.html) 54 | * [J. Kleinberg. Bursty and Hierarchical Structure in Streams. Proc. 8th ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining, 2002.](http://www.cs.cornell.edu/home/kleinber/bhs.pdf) 55 | -------------------------------------------------------------------------------- /pybursts/pybursts.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import math 4 | 5 | def kleinberg(offsets, s=2, gamma=1, n=None, T=None, k=None): 6 | """Kleinberg's algorithm (described in 'Bursty and Hierarchical Structure 7 | in Streams'). The algorithm models activity bursts in a time series as an 8 | infinite hidden Markov model. 9 | 10 | Input: 11 | offsets: a list of time offsets (numeric) 12 | s: the base of the exponential distribution that is used for modeling 13 | the event frequencies 14 | gamma: coefficient for the transition costs between states 15 | n, T: to have a fixed cost function (not dependent of the given offsets). 16 | Which is needed if you want to compare bursts for different inputs. 17 | k: maximum burst level""" 18 | 19 | if s <= 1: 20 | raise ValueError("s must be greater than 1!") 21 | if gamma <= 0: 22 | raise ValueError("gamma must be positive!") 23 | if not n is None and n <= 0: 24 | raise ValueError("n must be positive!") 25 | if not T is None and T <= 0: 26 | raise ValueError("T must be positive!") 27 | if len(offsets) < 1: 28 | raise ValueError("offsets must be non-empty!") 29 | 30 | offsets = np.array(offsets, dtype=object) 31 | 32 | if offsets.size == 1: 33 | bursts = np.array([0, offsets[0], offsets[0]], ndmin=2, dtype=object) 34 | return bursts 35 | 36 | offsets = np.sort(offsets) 37 | gaps = np.diff(offsets) 38 | 39 | if not np.all(gaps): 40 | raise ValueError("Input cannot contain events with zero time between!") 41 | 42 | if T is None: 43 | T = np.sum(gaps) 44 | 45 | if n is None: 46 | n = np.size(gaps) 47 | 48 | g_hat = T / n 49 | gamma_log_n = gamma * math.log(n) 50 | 51 | if k is None: 52 | k = int(math.ceil(float(1 + math.log(T, s) + math.log(1 / np.amin(gaps), s)))) 53 | 54 | def tau(i, j): 55 | if i >= j: 56 | return 0 57 | else: 58 | return (j - i) * gamma_log_n 59 | 60 | alpha_function = np.vectorize(lambda x: s ** x / g_hat) 61 | alpha = alpha_function(np.arange(k)) 62 | 63 | def f(j, x): 64 | return alpha[j] * math.exp(-alpha[j] * x) 65 | 66 | C = np.repeat(float("inf"), k) 67 | C[0] = 0 68 | 69 | q = np.empty((k, 0)) 70 | for t in range(np.size(gaps)): 71 | C_prime = np.repeat(float("inf"), k) 72 | q_prime = np.empty((k, t+1)) 73 | q_prime.fill(np.nan) 74 | 75 | for j in range(k): 76 | cost_function = np.vectorize(lambda x: C[x] + tau(x, j)) 77 | cost = cost_function(np.arange(0, k)) 78 | 79 | el = np.argmin(cost) 80 | 81 | if f(j, gaps[t]) > 0: 82 | C_prime[j] = cost[el] - math.log(f(j, gaps[t])) 83 | 84 | if t > 0: 85 | q_prime[j,:t] = q[el,:] 86 | 87 | q_prime[j, t] = j + 1 88 | 89 | C = C_prime 90 | q = q_prime 91 | 92 | j = np.argmin(C) 93 | q = q[j,:] 94 | 95 | prev_q = 0 96 | 97 | N = 0 98 | for t in range(np.size(gaps)): 99 | if q[t] > prev_q: 100 | N = N + q[t] - prev_q 101 | prev_q = q[t] 102 | 103 | bursts = np.array([np.repeat(np.nan, N), np.repeat(offsets[0],N),np.repeat(offsets[0], N)], ndmin=2, dtype=object).transpose() 104 | 105 | burst_counter = -1 106 | prev_q = 0 107 | stack = np.zeros(int(N), dtype=int) 108 | stack_counter = -1 109 | for t in range(np.size(gaps)): 110 | if q[t] > prev_q: 111 | num_levels_opened = q[t] - prev_q 112 | for i in range(int(num_levels_opened)): 113 | burst_counter += 1 114 | bursts[burst_counter, 0] = prev_q + i 115 | bursts[burst_counter, 1] = offsets[t] 116 | stack_counter += 1 117 | stack[stack_counter] = int(burst_counter) 118 | elif q[t] < prev_q: 119 | num_levels_closed = prev_q - q[t] 120 | for i in range(int(num_levels_closed)): 121 | bursts[stack[stack_counter], 2] = offsets[t] 122 | stack_counter -= 1 123 | prev_q = q[t] 124 | 125 | while stack_counter >= 0: 126 | bursts[stack[stack_counter], 2] = offsets[np.size(gaps)] 127 | stack_counter -= 1 128 | 129 | return bursts 130 | 131 | --------------------------------------------------------------------------------