├── .gitignore ├── README.md ├── pyadwin ├── __init__.py ├── ad_win_list.py ├── ad_win_list_node.py └── adwin.py ├── setup.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | .gitignore 94 | .idea/ 95 | pyAdwin.iml 96 | src/ 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyAdwin 2 | 3 | ADWIN is an adaptive sliding window algorithm for detecting change and keeping updated statistics from a data stream, and use it as a black-box in place or counters in learning and mining algorithms initially not designed for drifting data. [Reference: The Adwin Software](http://adaptive-mining.sourceforge.net/?page_id=20) 4 | 5 | Implementation based on C ++ implementations in this repository [Adwin C++](https://github.com/abifet/adwin) 6 | 7 | ## Prerequisites 8 | 9 | * Python > 2.7 10 | 11 | ## Install 12 | 13 | * Clone this repository in your project folder 14 | 15 | ``` 16 | git clone https://github.com/rsdevigo/pyAdwin.git 17 | ``` 18 | 19 | * Access the pyAdwin folder and execute the setup.py 20 | 21 | ``` 22 | cd pyAdwin && sudo python setup.py install 23 | ``` 24 | 25 | ## Example 26 | 27 | ``` 28 | from pyadwin import Adwin 29 | 30 | #Delta's standard of 0.01 , but Adwin builder can receive a floating point as the delta parameter. 31 | adwin = Adwin() 32 | 33 | data_stream = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.7] 34 | 35 | for data in data_stream: 36 | if (adwin.update(data)) 37 | print "Change has been detected in data: "+str(data) 38 | print adwin.getEstimation() # Prints the next value of the estimated form of data_stream 39 | ``` 40 | -------------------------------------------------------------------------------- /pyadwin/__init__.py: -------------------------------------------------------------------------------- 1 | from ad_win_list_node import AdWinListNode 2 | from ad_win_list import AdWinList 3 | from adwin import Adwin 4 | 5 | 6 | -------------------------------------------------------------------------------- /pyadwin/ad_win_list.py: -------------------------------------------------------------------------------- 1 | from pyadwin import AdWinListNode 2 | 3 | 4 | class AdWinList(object): 5 | def __init__(self, M): 6 | self.head = None 7 | self.tail = None 8 | self.count = 0 9 | self.M = M 10 | self.addToHead() 11 | 12 | def addToTail(self): 13 | temp = AdWinListNode(self.M) 14 | if self.tail is not None: 15 | temp.prev = self.tail 16 | self.tail.next = temp 17 | self.tail = temp 18 | if self.head is None: 19 | self.head = self.tail 20 | self.count += 1 21 | 22 | def removeFromHead(self): 23 | temp = self.head 24 | self.head = self.head.next 25 | if self.head is not None: 26 | self.head.prev = None 27 | else: 28 | self.tail = None 29 | self.count -= 1 30 | 31 | def addToHead(self): 32 | temp = AdWinListNode(self.M) 33 | if self.head is not None: 34 | temp.next = self.head 35 | self.head.prev = temp 36 | self.head = temp 37 | if self.tail is None: 38 | self.tail = self.head 39 | self.count += 1 40 | 41 | def removeFromTail(self): 42 | temp = self.tail 43 | self.tail = self.tail.prev 44 | if self.tail is None: 45 | self.head = None 46 | else: 47 | self.tail.next = None 48 | self.count -= 1 49 | -------------------------------------------------------------------------------- /pyadwin/ad_win_list_node.py: -------------------------------------------------------------------------------- 1 | class AdWinListNode(object): 2 | def __init__(self, M): 3 | self.M = M 4 | self.size = 0 5 | self.next = None 6 | self.prev = None 7 | self.sum = [] 8 | self.variance = [] 9 | for i in range(self.M + 1): 10 | self.sum.append(0.0) 11 | self.variance.append(0.0) 12 | 13 | def addBack(self, value, var): 14 | 15 | self.sum[self.size] = value 16 | self.variance[self.size] = var 17 | self.size += 1 18 | 19 | def dropFront(self, n=1): 20 | for k in range(n, self.M + 1): 21 | self.sum[k - n] = self.sum[k] 22 | self.variance[k - n] = self.variance[k] 23 | for k in range(1, n + 1): 24 | self.sum[self.M - k + 1] = 0.0 25 | self.variance[self.M - k + 1] = 0.0 26 | 27 | self.size -= n 28 | -------------------------------------------------------------------------------- /pyadwin/adwin.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from pyadwin import AdWinList 4 | 5 | 6 | class Adwin(object): 7 | def __init__(self, delta=0.01): 8 | self.MINTCLOCK = 1.0 9 | self.MINLENGTHWINDOW = 16 10 | self.DELTA = delta 11 | self.MAXBUCKETS = 5 12 | self.bucketList = AdWinList(self.MAXBUCKETS) 13 | self.mintTime = 0.0 14 | self.mintClock = self.MINTCLOCK 15 | self.mdblError = 0.0 16 | self.mdblWidth = 0.0 17 | self.lastBucketRow = 0 18 | self.sum = 0.0 19 | self.W = 0.0 20 | self.var = 0.0 21 | self.bucketNumber = 0 22 | 23 | def getEstimation(self): 24 | if self.W > 0: 25 | return self.sum / float(self.W) 26 | else: 27 | return 0 28 | 29 | def update(self, value): 30 | self.insertElement(value) 31 | self.compressBuckets() 32 | return self.checkDrift() 33 | 34 | def printInfo(self): 35 | it = self.bucketList.tail 36 | if it is None: 37 | print "It None" 38 | 39 | i = self.lastBucketRow 40 | 41 | while True: 42 | for k in range(it.size - 1, -1, -1): 43 | print str(i) + " [" + str(it.sum[k]) + " de " + str(self.bucketSize(i)) + "],", 44 | 45 | print 46 | it = it.prev 47 | i -= 1 48 | if it is None: 49 | break 50 | 51 | def length(self): 52 | return self.W 53 | 54 | def insertElement(self, value): 55 | self.W += 1 56 | self.bucketList.head.addBack(float(value), 0.0) 57 | self.bucketNumber += 1 58 | 59 | if self.W > 1: 60 | self.var += (self.W - 1) * (value - self.sum / (self.W - 1)) * (value - self.sum / (self.W - 1)) / self.W 61 | 62 | self.sum += value 63 | 64 | def compressBuckets(self): 65 | i = 0 66 | cont = 0 67 | 68 | cursor = self.bucketList.head 69 | nextNode = None 70 | 71 | while True: 72 | k = cursor.size 73 | if k == self.MAXBUCKETS + 1: 74 | nextNode = cursor.next 75 | if nextNode is None: 76 | self.bucketList.addToTail() 77 | nextNode = cursor.next 78 | self.lastBucketRow += 1 79 | n1 = self.bucketSize(i) 80 | n2 = self.bucketSize(i) 81 | u1 = cursor.sum[0] / n1 82 | u2 = cursor.sum[1] / n2 83 | incVariance = n1 * n2 * (u1 - u2) * (u1 - u2) / (n1 + n2) 84 | nextNode.addBack(cursor.sum[0] + cursor.sum[1], cursor.variance[0] + cursor.variance[1] + incVariance) 85 | self.bucketNumber -= 1 86 | cursor.dropFront(2) 87 | if nextNode.size <= self.MAXBUCKETS: 88 | break 89 | else: 90 | break 91 | cursor = cursor.next 92 | i += 1 93 | if cursor is None: 94 | break 95 | 96 | def checkDrift(self): 97 | change = False 98 | quit = False 99 | it = None 100 | 101 | self.mintTime += 1 102 | 103 | if self.mintTime % self.mintClock == 0 and self.W > self.MINLENGTHWINDOW: 104 | blnTalla = True 105 | 106 | while blnTalla: 107 | blnTalla = False 108 | quit = False 109 | n0 = 0.0 110 | n1 = float(self.W) 111 | u0 = 0.0 112 | u1 = float(self.sum) 113 | it = self.bucketList.tail 114 | i = self.lastBucketRow 115 | 116 | while True: 117 | for k in range(it.size): 118 | if i == 0 and k == it.size - 1: 119 | quit = True 120 | break 121 | n0 += self.bucketSize(i) 122 | n1 -= self.bucketSize(i) 123 | u0 += it.sum[k] 124 | u1 -= it.sum[k] 125 | mintMinWinLength = 5 126 | if n0 >= mintMinWinLength and n1 >= mintMinWinLength and self.cutExpression(n0, n1, u0, u1): 127 | blnTalla = True 128 | change = True 129 | if self.W > 0: 130 | self.deleteElement() 131 | quit = True 132 | break 133 | it = it.prev 134 | i -= 1 135 | if quit or it is None: 136 | break 137 | return change 138 | 139 | def deleteElement(self): 140 | node = self.bucketList.tail 141 | n1 = self.bucketSize(self.lastBucketRow) 142 | self.W -= n1 143 | self.sum -= node.sum[0] 144 | u1 = node.sum[0] / n1 145 | incVariance = float(node.variance[0] + n1 * self.W * (u1 - self.sum / self.W) * (u1 - self.sum / self.W)) / ( 146 | float(n1 + self.W)) 147 | self.var -= incVariance 148 | node.dropFront() 149 | self.bucketNumber -= 1 150 | if node.size == 0: 151 | self.bucketList.removeFromTail() 152 | self.lastBucketRow -= 1 153 | 154 | def cutExpression(self, N0, N1, u0, u1): 155 | n0 = float(N0) 156 | n1 = float(N1) 157 | n = float(self.W) 158 | diff = float(u0 / n0) - float(u1 / n1) 159 | 160 | v = self.var / self.W 161 | dd = math.log(2.0 * math.log(n) / self.DELTA) 162 | 163 | mintMinWinLength = 5 164 | m = (float(1 / ((n0 - mintMinWinLength + 1)))) + (float(1 / ((n1 - mintMinWinLength + 1)))) 165 | eps = math.sqrt(2 * m * v * dd) + float(2 / 3 * dd * m) 166 | 167 | if math.fabs(diff) > eps: 168 | return True 169 | else: 170 | return False 171 | 172 | def bucketSize(self, Row): 173 | return int(math.pow(2, Row)) 174 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='pyadwin', 4 | version='0.1', 5 | description='ADWIN is an adaptive sliding window algorithm for detecting change and keeping updated statistics from a data stream, and use it as a black-box in place or counters in learning and mining algorithms initially not designed for drifting data.', 6 | url='', 7 | author='Rodrigo Sanches Devigo', 8 | author_email='rsdevigo@gmail.com', 9 | license='MIT', 10 | packages=['pyadwin'], 11 | zip_safe=False) 12 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from pyadwin import Adwin 2 | 3 | # Delta's standard of 0.01 , but Adwin builder can receive a floating point as the delta parameter. 4 | adwin = Adwin(0.01) 5 | 6 | data_stream = [1] * 30 7 | data_stream.extend([2] * 15) 8 | 9 | for data in data_stream: 10 | if (adwin.update(data)): 11 | print "Change has been detected in data: " + str(data) 12 | print adwin.getEstimation() # Prints the next value of the estimated form of data_stream 13 | --------------------------------------------------------------------------------