├── .gitignore ├── README.md ├── classifiers ├── __init__.py └── detector_classifier.py ├── data └── elecNormNew.csv ├── doc ├── doc_drift_detection.md └── doc_drift_detection.pdf ├── drift_detector ├── DDM.py ├── __init__.py ├── adwin.py ├── adwin_list.py ├── adwin_list_node.py └── stream_volatility │ ├── __init__.py │ ├── buffer.py │ ├── reservoir.py │ └── volatility_detector.py ├── evluation ├── __init__.py ├── metrics.py └── prequential.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | classifer/*.pyc 3 | drift_detector/*.pyc 4 | evaluation/*.pyc 5 | .idea/ 6 | .ipynb_checkpoints/ 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Project of IoT(Internet of Things) data stream mining course 3 | 4 | > Objectives 5 | 6 | * Implementation of three drift detection algorithms 7 | * Adwin 8 | * DDM 9 | * Stream Volatility 10 | 11 | For more documentation, please find at doc folder. 
12 | -------------------------------------------------------------------------------- /classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeyegorov/drift-detection/a738e07f69db257e6fcf0a2dbfb596620fe1149c/classifiers/__init__.py -------------------------------------------------------------------------------- /classifiers/detector_classifier.py: -------------------------------------------------------------------------------- 1 | """ Detector Classifier, a wrapper to combine the classifier and drift detector """ 2 | 3 | # Authors: Wenjun Bai 4 | # Shu Shang 5 | # Duyen Phuc Nguyen 6 | # License: BSD 3 clause 7 | 8 | import numpy as np 9 | 10 | from sklearn import clone 11 | from sklearn.metrics import accuracy_score 12 | 13 | 14 | class DetectorClassifier(): 15 | """ 16 | A detector classifier is a classifier combined with a drift detector. 17 | This class serves as wrapper to combine a classifier and a drift detector together. 18 | """ 19 | def __init__(self, clf, drift_detector): 20 | """ 21 | Initialize a detector classifier. 22 | 23 | Parameters 24 | ---------- 25 | clf: a classifier, like Naive Bayes classifier 26 | drift_detector: a drift detector, like adwin, DDM 27 | """ 28 | self.classes = None 29 | self.clf = clf 30 | self.drift_detector = drift_detector 31 | self.num_change_detected = 0 32 | 33 | def fit(self, X, y): 34 | """Fit drift detector classifier according to X, y 35 | 36 | Parameters 37 | ---------- 38 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 39 | Training vectors, where n_samples is the number of samples and 40 | n_features is number of features. 41 | y : array-like, shape = [n_samples] 42 | Target values. 
43 | 44 | Returns 45 | ------- 46 | self : object 47 | return self 48 | """ 49 | self.clf.fit(X, y) 50 | self.classes = np.unique(y) 51 | return self 52 | 53 | def partial_fit(self, X, y): 54 | """Incremental fit on a batch of samples. 55 | This method is expected to be called several times consecutively 56 | on different chunks of a dataset so as to implement out-of-core 57 | or online learning. 58 | 59 | This is especially useful when the whole dataset is too big to fit in 60 | memory at once. 61 | 62 | This method has some performance and numerical stability overhead, 63 | hence it is better to call partial_fit on chunks of data that are 64 | as large as possible (as long as fitting in the memory budget) to 65 | hide the overhead. 66 | 67 | Parameters 68 | ---------- 69 | X : array-like, shape(n_samples, n_features) 70 | Training vectors, when n_samples is the number of samples 71 | and n_features is the number of features. 72 | y : array-like, shape(n_samples, ) 73 | Target values. 74 | 75 | Returns 76 | ------- 77 | self : object 78 | return self. 79 | """ 80 | pre_y = self.clf.predict(X) 81 | if self.drift_detector.set_input(accuracy_score(pre_y, y)): 82 | self.num_change_detected += 1 83 | self.clf = clone(self.clf) 84 | # print("change detected...") 85 | # self.clf.fit(X, y) 86 | self.clf.partial_fit(X, y, classes=self.classes) 87 | else: 88 | self.clf.partial_fit(X, y) 89 | return self 90 | 91 | def predict(self, X): 92 | """ 93 | Perform prediction on an array of test vectors X. 
94 | 95 | Parameters 96 | ---------- 97 | X : array-like, shape = [n_samples, n_features] 98 | 99 | Returns 100 | ------- 101 | array, shape = [n_samples] 102 | Predicted target values for X 103 | """ 104 | return self.clf.predict(X) 105 | 106 | def get_detector_name(self): 107 | return self.drift_detector.__class__.__name__ 108 | -------------------------------------------------------------------------------- /doc/doc_drift_detection.md: -------------------------------------------------------------------------------- 1 | # Adaptive Sliding Window (ADWIN) 2 | 3 | Adaptive sliding window algorithm is automatically maintaining the window of variable size when date change with rigorous guarantees of it performance. 4 | 5 | > Algorithm Setting 6 | 7 | **Input**: 8 | 9 | - Confidence value $\delta$ belong to (0, 1) 10 | - Sequence of real vaues $x_1$, $x_2$, $x_2$, . . . , $x_t$ , . . . 11 | - The value of $x_t$ is available only at time $t$ 12 | 13 | **Assume**: 14 | 15 | - $x_t$ is always in [0, 1] 16 | - an Interval [a, b] such that $a<=$$x_t$$<=b$ with probability 1 17 | - Nothing else is known about the sequence of distributions $D_t$ 18 | - $\mu_t$ and $\sigma_t^2$ are unknown for all $t$ 19 | 20 | 21 | The adapting sliding window algorithm called ADWIN suitable for data streams with sudden drift. The algorithm keeps a sliding window *W* with the most recently read examples. The main idea of ADWIN is as follows: whenever two “large enough” subwindows of *W* exhibit “distinct enough” averages, one can conclude that the corresponding expected values are different, and the older portion of the window is dropped. This involves answering a statistical hypothesis: “Has the average $\mu_t$ remained constant in *W* with confidence $\delta$ ”? The pseudo-code of ADWIN is listed below. 
22 | 23 | > **ADWIN0:** ADAPTIVE WINDOWING ALGORITHM 24 | > 25 | 26 | > 1: Initialize Window $W$ 27 | > 2: **for** each $t > 0$ 28 | > 3:     **do** {$x_t$} $\bigcup$ *W* $\to$ *W* (i.e., add $x_t$ to the head of *W*) 29 | > 4:         **repeat** Drop elements from the tail of *W* 30 | > 5:             **until** | $\hat{\mu}_{W_0}$ - $\hat{\mu}_{W_1}$ | < ${\epsilon}_{cut}$ holds 31 | > 6:                 for every spilt of $W$ into $W$ = $W_0 W_1$ 32 | > 7:         output $\hat{\mu}_{W}$ 33 | 34 | 35 | The key part of the algorithm lies in the definition of ${\epsilon}_{cut}$ and the test it is used for. The different statistical tests can be used for this purpose, but propose only one specific implementation. Let $n$ denote the size of $W$, and $n_0$ and $n_1$ the sizes of $W_0$ and $W_1$ consequently, so that $n = n_0 +n_1$. Let $\hat{\mu}_{W_0}$ and $\hat{\mu}_{W_1}$ be the averages of the values in $W_0$ and $W_1$ , and $\hat{\mu}_{W_0}$ and $\hat{\mu}_{W_1}$ their expected values. The value of ${\epsilon}_{cut}$ is proposed as follows: 36 | $$ 37 | {\epsilon}_{cut} = \sqrt{\frac{1}{2m}\frac{4}{\delta'}} 38 | $$ 39 | where 40 | $$ 41 | m = \frac{1}{\frac{1}{n_0}+\frac{1}{n_1}}, and , \delta '=\frac{\delta}{n} 42 | $$ 43 | The statistical test in line 6 of the pseudo-code checks if the observed average in both subwindows differs by more than threshold ${\epsilon}_{cut}$. The threshold is calculated using the Hoeffding bound, thus gives formal guarantees of the base classifiers performance. The phrase “holds for every split of $W$ into $ W = W_0 W_1$” means that we need to check all pairs of subwindows W0 and W1 created by splitting $W$ into two. The verification of all subwindows is very costly due to the number of possible split points. That is why the authors proposed an improvement to the algorithm that allows to find a good cut point quickly. 
The originally proposed ADWIN algorithms are also lossless learners, thus the window size $W$ can grow infinitely if no drift occurs. This can be easily improved by adding a parameter that would limit the windows maximal size. In its original form, proposed by Bifet, ADWIN works only for 1-dimensional data, e.g., the running error. For this method to be used for n-dimensional raw data, a separate window should be maintained for each dimension. Such a modified model, although costly, reflects the fact that the importance of each feature may change at different pace. 44 | 45 | **References:** 46 | 47 | > - A. Bifet, R. Gavalda. (2007). "Learning from Time-Changing Data with Adaptive Windowing". Proceedings of the 2007 SIAM International Conference on Data Mining 443-448. 48 | > - A. Bifet, J. Read, B.Pfahringer.G. Holmes, I. Zliobaite. (2013). "CD-MOA: Change Detection Framework for Massive Online Analysis". Springer Berlin Heidelberg 8207(9): 443-448. 49 | 50 | 51 | # Drift Detection Method (DDM) 52 | 53 | Drift Detection Method (DDM) model the number of classification errors with a Binomial distribution. The idea of this method is that in each iteration an online classifier predicts the decision class of an example. That prediction can be either *true* or *false*, thus for a set of examples the error is a random variable from Bernoulli trials. 54 | 55 | Let us denote $p_{i}$ as the probability of a *false* prediction and $s_{i}$ as its standard deviation calculated as given by Equation (1): 56 | $$ 57 | s_{i}=\sqrt{\frac{p_{i}(1-p{i})}{i}} \tag{1} 58 | $$ 59 | For a sufficiently large number of examples (*n* > 30), the Binomial distribution is closely approximated by a Normal distribution with the same mean and variance. For each example in the data stream the error rate is tracked updating two registers: $p_{min}$ and $s_{min}$. 
These values are used to calculate a *warning level* condition presented in Equation 2 and an *alarm level* condition presented in Equation 3. Each time a warning level is reached, examples are remembered in a separate window. If afterwards the error rate falls below the warning threshold, the warning is treated as a false alarm and the separate window is dropped. However, if the alarm level is reached, the previously taught base learner is dropped and a new one is created, but only from the examples stored in the separate warning” window. 60 | $$ p_{i} + s_{i} \geq p_{min} + \alpha s_{min} \tag{2} 61 | $$ 62 | 63 | $$ 64 | p_{i} + s_{i} \geq p_{min} + \beta s_{min} \tag{3} 65 | $$ 66 | 67 | The value $\alpha$ and $\beta$ in the above conditions decide about the confidence levels at which the warning and alarm signals are triggered. 68 | 69 | 70 | > Algorithm Setting 71 | 72 | **Input**: 73 | 74 | - $S$: a data stream of examples 75 | - $C$: classifier 76 | 77 | **Output**: *W* : a window with examples selected to train classifier $C$ 78 | 79 | >DDM: Drift Detection Method 80 | > 81 | 1: Initialize $(i, p_{i}, s_{i}, ps_{min}, p_{min}, s_{min});$ 82 | 2: $newDrift \leftarrow false;$ 83 | 3: $W \leftarrow \emptyset;$ 84 | 4: $W^{'} \leftarrow \emptyset;$ 85 | 5: **for all** examples $x_{i} \in S$ **do** 86 | 6:      **if** prediction $C(x_{i})$ is incorrect **then** 87 | 7:         $p_{i} \leftarrow p_{i} +(1.0-p_{i})/i; $ 88 | 8:     **else** 89 | 9:         $p_{i} \leftarrow p_{i}-(p_{i})/i;$ 90 | 10:     compute $s_{i}$ using (1); 91 | 11:     $i \leftarrow i+1;$ 92 | 12:      **if** $ i>30$ (approximated normal distribution) **then** 93 | 13:          **if** $p_{i} +p_{s} \leq ps_{min}$ **then** 94 | 14:              $p_{min} \leftarrow p_{i};$ 95 | 15:             $s_{min} \leftarrow s_{i};$ 96 | 16:             $ps_{min} \leftarrow p_{i}+s_{i};$ 97 | 17:         **if** drift detected (3) **then** 98 | 18:             Initalize 
($i,p_{i},s_{i},ps_{min},p_{min},s_{min});$
125 | 126 | **Volatility Detection**: Let $C_{1} = (c_{1}, c_{2},...,c_{k})$ and $C_{2} = (c_{k+1}, c_{k+2},...,c_{t})$ represent a sample of cut points detected from a stream. $p_{i}$ represent the distance intervals (periods) between two consecutive cut points $c_{i}$ and $c_{i-1}$. We are able to derive volatility windows $P_{1}= (p_{1}, p_{2},...,p_{k})$ and $P_{2}= (p_{k+1}, p_{k+2},...,p_{t-1})$ with sample variance of $\sigma_{1}$ and $\sigma_{2}$. The volatility detection problem can be expressed as testing $\frac{\sigma_{1}}{\sigma_{2}}\lessgtr 1.0\pm\beta$, where $\beta$ is a user-expressed tolerance threshold. If the test holds true we say that there is a shift in volatility between the two samples. 127 | 128 | > Algorithm Setting 129 | 130 | **Input** 131 | 132 | - A sequence of real values $p_{1}, p_{2},...,p_{t}$ representing the distance intervals between cut points discovered by drift detection techniques. 133 | 134 | **Output** 135 | 136 | - Shift points of stream volatility 137 | 138 | > Stream Volatility Detector 139 | > 140 | > 1: Initialize buffer $B$ and Reservoir $R$; 141 | > 2: Boolean: volatilityShift $\leftarrow$ false 142 | > 3: **for each** $t > 0$ **do** 143 | > 4:         $j$ $\leftarrow$ addToBuffer$(x_{t}, B)$; 144 | > 5:         addToReservoir$(j, R)$; 145 | > 6:         RelativeVariance $\leftarrow \frac{\sigma_{B}}{\sigma_{R}}$; 146 | > 7:         **if** Relative Variance $\lessgtr 1.0\pm\beta$ **then** 147 | > 8:                  volatilityShift $\leftarrow$ True; 148 | > 9:         **end** 149 | > 10: **end** 150 | > 11: **Funtion** addToBuffer(item $k$, Buffer $B$) 151 | > 12:         add $k$ as tail of $B$; 152 | > 13:         return head of $B$; 153 | > 14: **end** 154 | > 15: **Function** addToReservoir(iterm $k$, Reservoir $R$) 155 | > 16:         rPos $\leftarrow random()$; 156 | > 17:         R[rPos] $\leftarrow k$; 157 | > 18: **end** 158 | 159 | There are two main components in the volatility detector: a buffer and a 
reservoir. The buffer is a sliding window that keeps the most recent samples of drift intervals acquired from a drift detection technique. The reservoir is a pool that stores previous samples which ideally represent the overall state of the stream. 160 | 161 | The progression of the volatility detector when inputs come in is as follows: First, when input $x_{t}$ arrives at position $t$, it is first moved into the buffer where a sliding window keeps recent samples. As the sliding window slides, the oldest entry in the buffer is dropped from the buffer and moved into the reservoir, then the reservoir stores the dropped entry by randomly replacing one of its stored samples. Lastly, the detector then compares the samples in the buffer to the samples in the reservoir to analyze for differences. The primary difference between the buffer and the reservoir is that the buffer always keeps the most recent contents of the stream whereas the reservoir keeps an overall view of the stream. For a change in relative volatility to be detected, we use the Relative Variance measure. Relative Variance at a particular point is calculated as $\frac{\sigma_{B}}{\sigma_{R}}$ , where $\sigma_{B}$ is the variance calculated using the samples in the buffer and $\sigma_{R}$ is the variance calculated using the samples in the reservoir. 162 | 163 | **Reference** 164 | > - Huang, D.T.J., Koh, Y.S., Dobbie, G., Pears, R.: Detecting volatility shift in data streams. In: 2014 IEEE International Conference on Data Mining (ICDM), pp. 
863–868 (2014) 165 | -------------------------------------------------------------------------------- /doc/doc_drift_detection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeyegorov/drift-detection/a738e07f69db257e6fcf0a2dbfb596620fe1149c/doc/doc_drift_detection.pdf -------------------------------------------------------------------------------- /drift_detector/DDM.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ Drift detection method based in DDM method of Joao Gama SBIA 2004. """ 3 | 4 | # Authors: Wenjun Bai 5 | # Shu Shang 6 | # Duyen Phuc Nguyen 7 | # License: BSD 3 clause 8 | 9 | import sys 10 | import math 11 | 12 | 13 | class DDM: 14 | """ 15 | The drift detection method (DDM) controls the number of errors 16 | produced by the learning model during prediction. It compares 17 | the statistics of two windows: the first contains all the data, 18 | and the second contains only the data from the beginning until 19 | the number of errors increases. 20 | Their method doesn't store these windows in memory. 21 | It keeps only statistics and a window of recent errors data.". 22 | 23 | References 24 | --------- 25 | Gama, J., Medas, P., Castillo, G., Rodrigues, P.: 26 | "Learning with drift detection". In: Bazzan, A.L.C., Labidi, 27 | S. (eds.) SBIA 2004. LNCS (LNAI), vol. 3171, pp. 286–295. Springer, Heidelberg (2004) 28 | """ 29 | 30 | def __init__(self): 31 | self.m_n = 1 32 | self.m_p = 1 33 | self.m_s = 0 34 | self.m_psmin = sys.float_info.max 35 | self.m_pmin = sys.float_info.max 36 | self.m_smin = sys.float_info.max 37 | self.change_detected = False 38 | self.is_initialized = True 39 | self.estimation = 0.0 40 | self.is_warning_zone = False 41 | 42 | def set_input(self, prediction): 43 | """ 44 | The number of errors in a sample of n examples is modelled by a binomial distribution. 
45 | For each point t in the sequence that is being sampled, the error rate is the probability 46 | of mis-classifying p(t), with standard deviation s(t). 47 | DDM checks two conditions: 48 | 1) p(t) + s(t) > p(min) + 2 * s(min) for the warning level 49 | 2) p(t) + s(t) > p(min) + 3 * s(min) for the drift level 50 | 51 | Parameters 52 | ---------- 53 | prediction : new element, it monitors the error rate 54 | 55 | Returns 56 | ------- 57 | change_detected : boolean 58 | True if a change was detected. 59 | """ 60 | if self.change_detected is True or self.is_initialized is False: 61 | self.reset() 62 | self.is_initialized = True 63 | 64 | self.m_p += (prediction - self.m_p) / float(self.m_n) 65 | self.m_s = math.sqrt(self.m_p * (1 - self.m_p) / float(self.m_n)) 66 | 67 | self.m_n += 1 68 | self.estimation = self.m_p 69 | self.change_detected = False 70 | 71 | if self.m_n < 30: 72 | return False 73 | 74 | if self.m_p + self.m_s <= self.m_psmin: 75 | self.m_pmin = self.m_p; 76 | self.m_smin = self.m_s; 77 | self.m_psmin = self.m_p + self.m_s; 78 | 79 | if self.m_p + self.m_s > self.m_pmin + 3 * self.m_smin: 80 | self.change_detected = True 81 | elif self.m_p + self.m_s > self.m_pmin + 2 * self.m_smin: 82 | self.is_warning_zone = True 83 | else: 84 | self.is_warning_zone = False 85 | 86 | return self.change_detected 87 | 88 | def reset(self): 89 | """reset the DDM drift detector""" 90 | self.m_n = 1 91 | self.m_p = 1 92 | self.m_s = 0 93 | self.m_psmin = sys.float_info.max 94 | self.m_pmin = sys.float_info.max 95 | self.m_smin = sys.float_info.max 96 | -------------------------------------------------------------------------------- /drift_detector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeyegorov/drift-detection/a738e07f69db257e6fcf0a2dbfb596620fe1149c/drift_detector/__init__.py -------------------------------------------------------------------------------- 
/drift_detector/adwin.py: -------------------------------------------------------------------------------- 1 | """ADaptive sliding WINdow Algorithm (ADWIN).""" 2 | 3 | # Authors: Wenjun Bai 4 | # Shu Shang 5 | # Duyen Phuc Nguyen 6 | # License: BSD 3 clause 7 | 8 | import math 9 | 10 | from drift_detector.adwin_list import AdwinList 11 | 12 | 13 | class Adwin(object): 14 | """The Adwin algorithm is a change detector and estimator. 15 | It keeps a sliding (variable-length) window with the most 16 | recently read example,with the property that the window 17 | has the maximal length statistically consistent with the 18 | hypothesis that "there has been no change in the average 19 | value inside the window". 20 | 21 | References 22 | ---------- 23 | A. Bifet, R. Gavalda. (2007). "Learning from Time-Changing 24 | Data with Adaptive Windowing". Proceedings of the 2007 SIAM 25 | International Conference on Data Mining 443-448. 26 | http://www.lsi.upc.edu/~abifet/Timevarying.pdf 27 | 28 | A. Bifet, J. Read, B.Pfahringer.G. Holmes, I. Zliobaite. 29 | (2013). "CD-MOA: Change Detection Framework for Massive Online 30 | Analysis". Springer Berlin Heidelberg 8207(9): 443-448. 31 | https://sites.google.com/site/zliobaitefiles/cdMOA-CR.pdf?attredirects=0 32 | """ 33 | 34 | def __init__(self, delta=0.01): 35 | """Init the buckets 36 | 37 | Parameters 38 | ---------- 39 | delta : float 40 | confidence value. 
41 | """ 42 | 43 | self.mint_clock = 1.0 44 | self.min_window_length = 16 45 | self.delta = delta 46 | self.max_number_of_buckets = 5 47 | self.bucket_list = AdwinList(self.max_number_of_buckets) 48 | self.mint_time = 0.0 49 | self.min_clock = self.mint_clock 50 | self.mdbl_error = 0.0 51 | self.mdbl_width = 0.0 52 | self.last_bucket_row = 0 53 | self.sum = 0.0 54 | self.width = 0.0 55 | self.variance = 0.0 56 | self.bucket_number = 0 57 | 58 | def get_estimation(self): 59 | """Get the estimation value""" 60 | if self.width > 0: 61 | return self.sum / float(self.width) 62 | else: 63 | return 0 64 | 65 | def set_input(self, value): 66 | """Add new element and reduce the window 67 | 68 | Parameters 69 | ---------- 70 | value : new element 71 | 72 | Returns 73 | ------- 74 | boolean: the return value of the method check_drift(), true if a drift was detected. 75 | """ 76 | self.insert_element(value) 77 | self.compress_buckets() 78 | return self.check_drift() 79 | 80 | def length(self): 81 | """Get the length of window""" 82 | return self.width 83 | 84 | def insert_element(self, value): 85 | """insert new bucket""" 86 | self.width += 1 87 | self.bucket_list.head.insert_bucket(float(value), 0.0) 88 | self.bucket_number += 1 89 | if self.width > 1: 90 | self.variance += (self.width - 1) * (value - self.sum / (self.width - 1)) \ 91 | * (value - self.sum / (self.width - 1)) / self.width 92 | self.sum += value 93 | 94 | def compress_buckets(self): 95 | """ 96 | Merge buckets. 97 | Find the number of buckets in a row, if the row is full, then merge the two buckets. 
98 | """ 99 | i = 0 100 | cont = 0 101 | cursor = self.bucket_list.head 102 | next_node = None 103 | while True: 104 | k = cursor.size 105 | if k == self.max_number_of_buckets + 1: 106 | next_node = cursor.next 107 | if next_node is None: 108 | self.bucket_list.add_to_tail() 109 | next_node = cursor.next 110 | self.last_bucket_row += 1 111 | n1 = self.bucket_size(i) 112 | n2 = self.bucket_size(i) 113 | u1 = cursor.sum[0] / n1 114 | u2 = cursor.sum[1] / n2 115 | internal_variance = n1 * n2 * (u1 - u2) * (u1 - u2) / (n1 + n2) 116 | next_node.insert_bucket(cursor.sum[0] + cursor.sum[1], 117 | cursor.variance[0] + cursor.variance[1] + internal_variance) 118 | self.bucket_number -= 1 119 | cursor.drop_bucket(2) 120 | if next_node.size <= self.max_number_of_buckets: 121 | break 122 | else: 123 | break 124 | cursor = cursor.next 125 | i += 1 126 | if cursor is None: 127 | break 128 | 129 | def check_drift(self): 130 | """ 131 | Reduce the window, detecting if there is a drift. 132 | 133 | Returns 134 | ------- 135 | change : boolean value 136 | Result of whether the window has changed. 
137 | """ 138 | 139 | change = False 140 | exit = False 141 | cursor = None 142 | self.mint_time += 1 143 | if self.mint_time % self.min_clock == 0 and self.width > self.min_window_length: 144 | reduce_width = True 145 | while reduce_width: 146 | reduce_width = False 147 | exit = False 148 | n0 = 0.0 149 | n1 = float(self.width) 150 | u0 = 0.0 151 | u1 = float(self.sum) 152 | cursor = self.bucket_list.tail 153 | i = self.last_bucket_row 154 | while True: 155 | for k in range(cursor.size): 156 | if i == 0 and k == cursor.size - 1: 157 | exit = True 158 | break 159 | n0 += self.bucket_size(i) 160 | n1 -= self.bucket_size(i) 161 | u0 += cursor.sum[k] 162 | u1 -= cursor.sum[k] 163 | min_length_of_subwindow = 5 164 | if n0 >= min_length_of_subwindow and n1 >= min_length_of_subwindow and self.cut_expression(n0, 165 | n1, 166 | u0, 167 | u1): 168 | reduce_width = True 169 | change = True 170 | if self.width > 0: 171 | self.delete_element() 172 | exit = True 173 | break 174 | cursor = cursor.prev 175 | i -= 1 176 | if exit or cursor is None: 177 | break 178 | return change 179 | 180 | def delete_element(self): 181 | """delete the bucket at the tail of window""" 182 | node = self.bucket_list.tail 183 | n1 = self.bucket_size(self.last_bucket_row) 184 | self.width -= n1 185 | self.sum -= node.sum[0] 186 | u1 = node.sum[0] / n1 187 | incVariance = float( 188 | node.variance[0] + n1 * self.width * (u1 - self.sum / self.width) * (u1 - self.sum / self.width)) / ( 189 | float(n1 + self.width)) 190 | self.variance -= incVariance 191 | node.drop_bucket() 192 | self.bucket_number -= 1 193 | if node.size == 0: 194 | self.bucket_list.remove_from_tail() 195 | self.last_bucket_row -= 1 196 | 197 | def cut_expression(self, n0_, n1_, u0, u1): 198 | """Expression calculation""" 199 | n0 = float(n0_) 200 | n1 = float(n1_) 201 | n = float(self.width) 202 | diff = float(u0 / n0) - float(u1 / n1) 203 | v = self.variance / self.width 204 | dd = math.log(2.0 * math.log(n) / self.delta) 205 | 
min_length_of_subwindow = 5 206 | m = (float(1 / (n0 - min_length_of_subwindow + 1))) + (float(1 / (n1 - min_length_of_subwindow + 1))) 207 | eps = math.sqrt(2 * m * v * dd) + float(2 / 3 * dd * m) 208 | if math.fabs(diff) > eps: 209 | return True 210 | else: 211 | return False 212 | 213 | def bucket_size(self, Row): 214 | return int(math.pow(2, Row)) 215 | -------------------------------------------------------------------------------- /drift_detector/adwin_list.py: -------------------------------------------------------------------------------- 1 | """Implementation of an adwin list""" 2 | 3 | # Authors: Wenjun Bai 4 | # Shu Shang 5 | # Duyen Phuc Nguyen 6 | # License: BSD 3 clause 7 | 8 | 9 | from drift_detector.adwin_list_node import AdwinListNode 10 | 11 | 12 | class AdwinList(object): 13 | def __init__(self, max_number_bucket): 14 | """Init a adwin list with a given parameter max_number_buckets 15 | 16 | Parameters 17 | ---------- 18 | max_number_bucket : max number of elements in the bucket 19 | """ 20 | self.head = None 21 | self.tail = None 22 | self.count = 0 23 | self.max_number_bucket = max_number_bucket 24 | self.add_to_head() 25 | 26 | def add_to_tail(self): 27 | """add a node at the tail of adwin list, used in the initialization of an AdwinList""" 28 | temp = AdwinListNode(self.max_number_bucket) 29 | if self.tail is not None: 30 | temp.prev = self.tail 31 | self.tail.next = temp 32 | self.tail = temp 33 | if self.head is None: 34 | self.head = self.tail 35 | self.count += 1 36 | 37 | def add_to_head(self): 38 | """Add a node to the head of an AdwinList""" 39 | temp = AdwinListNode(self.max_number_bucket) 40 | if self.head is not None: 41 | temp.next = self.head 42 | self.head.prev = temp 43 | self.head = temp 44 | if self.tail is None: 45 | self.tail = self.head 46 | self.count += 1 47 | 48 | def remove_from_head(self): 49 | """Remove the head node of an AdwinList""" 50 | temp = self.head 51 | self.head = self.head.next 52 | if self.head is not 
# NOTE(review): this chunk opens mid-way through AdwinList.remove_from_head /
# remove_from_tail (drift_detector/adwin_list.py); that class starts outside
# the visible region, so its fragment is left to the file that owns it.


# ==================== drift_detector/adwin_list_node.py =====================
"""Node implementation of the ADWIN bucket-list data structure."""


class AdwinListNode(object):
    """One row (node) of the ADWIN bucket list.

    A node stores up to ``max_number_of_buckets`` buckets; a bucket is a
    (sum, variance) pair kept at the same index of two parallel lists.
    """

    def __init__(self, max_number_of_buckets):
        """Create an empty node.

        Parameters
        ----------
        max_number_of_buckets : int
            Maximum number of buckets this node can hold.
        """
        self.max_number_of_buckets = max_number_of_buckets
        self.size = 0          # number of buckets currently stored
        self.next = None
        self.prev = None
        # one spare slot (index max_number_of_buckets) so a bucket can be
        # inserted before the caller compresses the overflow
        self.sum = [0.0] * (self.max_number_of_buckets + 1)
        self.variance = [0.0] * (self.max_number_of_buckets + 1)

    def insert_bucket(self, value, variance):
        """Append a bucket at the end.

        Parameters
        ----------
        value : float
            Total (sum) of the new bucket.
        variance : float
            Variance of the new bucket.
        """
        self.sum[self.size] = value
        self.variance[self.size] = variance
        self.size += 1

    def drop_bucket(self, n=1):
        """Drop the ``n`` oldest buckets, shifting the rest toward index 0.

        Parameters
        ----------
        n : int
            Number of buckets to drop (default 1).
        """
        for k in range(n, self.max_number_of_buckets + 1):
            self.sum[k - n] = self.sum[k]
            self.variance[k - n] = self.variance[k]
        for k in range(1, n + 1):
            self.sum[self.max_number_of_buckets - k + 1] = 0.0
            self.variance[self.max_number_of_buckets - k + 1] = 0.0
        self.size -= n


# ================ drift_detector/stream_volatility/buffer.py ================
"""Buffer as a component of the volatility detector."""

import math

import numpy as np


def calculate_stddev(times, mean):
    """Standard deviation of the strictly positive entries of ``times``.

    Zero entries are treated as empty slots and ignored (the buffers are
    zero-initialized); returns 0 when no positive entry exists.
    """
    count = 0
    total = 0.0  # renamed from `sum`, which shadowed the builtin
    for d in times:
        if d > 0:
            count += 1
            total += (d - mean) ** 2
    return math.sqrt(total / count) if count else 0


class Buffer:
    """Fixed-size sliding window over the most recent drift intervals."""

    def __init__(self, size):
        """Initialize a zero-filled buffer of the given size.

        Parameters
        ----------
        size : int
            Capacity of the sliding window.
        """
        self.size = size
        self.buffer = [0.0] * size
        self.sliding_index = 0   # next write position
        self.is_full = False
        self.total = 0           # running sum of the stored values

    def add(self, value):
        """Insert ``value``; return the overwritten value once full, else -1.

        BUGFIX(review): the original cleared the whole window on wrap-around
        and then *appended* past ``size``, so the "removed" value was always
        0.0 and ``total`` (hence mean / stddev) no longer matched the stored
        values. The window is now a proper circular buffer.
        """
        if self.sliding_index == self.size:
            self.is_full = True
            self.sliding_index = 0
        removed = self.buffer[self.sliding_index]
        self.total -= removed
        self.buffer[self.sliding_index] = value
        self.total += value
        self.sliding_index += 1
        return removed if self.is_full else -1

    def get_mean(self):
        """Mean of the values added so far (0 for an empty buffer)."""
        if self.is_full:
            return self.total / self.size
        if self.sliding_index == 0:
            # guard: the original raised ZeroDivisionError on an empty buffer
            return 0
        return self.total / self.sliding_index

    def get_stddev(self):
        """Standard deviation of the window; a tiny epsilon instead of 0 so
        callers can divide by it safely."""
        stddev = calculate_stddev(self.buffer, self.get_mean())
        return stddev if stddev != 0 else 0.00000000001

    def clear(self):
        """Reset the window to its initial empty state."""
        self.buffer = [0.0] * self.size
        self.sliding_index = 0
        self.is_full = False
        self.total = 0


# ============== drift_detector/stream_volatility/reservoir.py ===============
"""Reservoir as a component of the volatility detector."""
# BUGFIX(review): the original `from buffer import calculate_stddev` is a
# Python-2 implicit relative import and fails on Python 3; the helper is in
# scope here (drift_detector.stream_volatility.buffer.calculate_stddev).


class Reservoir:
    """Fixed-size random-replacement pool of older drift intervals."""

    def __init__(self, size):
        """Initialize a zero-filled reservoir of the given size.

        Parameters
        ----------
        size : int
            Capacity of the reservoir.
        """
        self.size = size
        self.elements = [0.0] * size
        self.element_total = 0   # running sum of the stored elements
        self.e_index = 0         # elements inserted so far (capped at size)
        self.rand = np.random

    def add_element(self, input_value):
        """Store ``input_value``; once full, replace a uniformly random victim.

        As the sliding window slides, the entry dropped from the buffer is
        moved here, keeping a sample that represents the stream's history.

        Parameters
        ----------
        input_value : real value
            Must be numeric — it is folded into the running total.
        """
        if self.e_index < self.size:
            self.elements[self.e_index] = input_value
            self.element_total += input_value
            self.e_index += 1
        else:
            index_remove = int(self.rand.rand() * self.e_index)
            self.element_total -= self.elements[index_remove]
            self.elements[index_remove] = input_value
            self.element_total += input_value

    def get_reservoir_mean(self):
        """Mean of the stored elements (0 for an empty reservoir)."""
        if self.e_index == 0:
            # guard: the original divided by zero on an empty reservoir
            return 0
        return self.element_total / self.e_index

    def get_stddev(self):
        """Standard deviation of the stored elements; epsilon instead of 0."""
        stddev = calculate_stddev(self.elements, self.get_reservoir_mean())
        return stddev if stddev != 0 else 0.00000000001

    def get_count(self):
        """Number of elements currently stored (tracked by ``e_index``)."""
        return self.e_index

    def check_full(self):
        """True when the reservoir holds ``size`` elements."""
        return self.e_index == self.size

    def clear(self):
        """Reset the reservoir to its initial empty state."""
        self.elements = [0.0] * self.size
        self.element_total = 0
        self.e_index = 0

    def check_is_clear(self):
        """True when no element has been stored since the last clear()."""
        return self.e_index == 0


# ========= drift_detector/stream_volatility/volatility_detector.py ==========
# coding=utf-8
"""Relative Stream Volatility Detector."""


class VolatilityDetector:
    """Detector of changes in the *rate* of drifts (stream volatility).

    Stream volatility is the rate of the changes reported by a drift detector
    such as ADWIN. This detector wraps a drift detector and raises an alarm
    when the rate of detected drifts itself shifts.

    Two components are used: a sliding-window ``Buffer`` of the most recent
    drift intervals, and a ``Reservoir`` pool of older samples that
    represents the overall state of the stream.

    References
    ----------
    Huang, D.T.J., Koh, Y.S., Dobbie, G., Pears, R.: Detecting volatility
    shift in data streams. In: ICDM 2014, pp. 863-868 (2014)
    """

    def __init__(self, drift_detector, size):
        """Initialize the volatility detector.

        Parameters
        ----------
        drift_detector : drift detector object
            Its ``set_input`` output is monitored for volatility shifts.
        size : int
            Capacity of both the reservoir and the buffer.
        """
        self.drift_detector = drift_detector
        self.sample = 0                  # number of inputs seen so far
        self.reservoir = Reservoir(size)
        self.buffer = Buffer(size)
        self.confidence = 0.05           # width of the relative-variance band
        self.timestamp = 0               # samples since the last drift
        self.vol_drift_found = False
        self.drift_found = False
        self.pre_drift_point = -1        # sample index of the last drift
        self.rolling_index = 0
        self.recent_interval = [0.0] * (size * 2 + 1)

    def set_input(self, input_value):
        """Feed one stream value; return True on a volatility shift.

        Parameters
        ----------
        input_value : real value
            Forwarded to the wrapped drift detector.

        Returns
        -------
        bool
            True when the ratio of recent to historical interval stddev
            leaves the band [1 - confidence, 1 + confidence].
        """
        self.sample += 1
        self.drift_found = self.drift_detector.set_input(input_value)
        if self.drift_found:
            self.timestamp += 1
            if self.buffer.is_full:
                # the interval evicted from the sliding window migrates into
                # the reservoir of historical samples
                evicted = self.buffer.add(self.timestamp)
                self.reservoir.add_element(evicted)
            else:
                self.buffer.add(self.timestamp)
            self.recent_interval[self.rolling_index] = self.timestamp
            self.rolling_index += 1
            if self.rolling_index == self.reservoir.size * 2:
                self.rolling_index = 0
            self.timestamp = 0
            self.pre_drift_point = self.sample
            if self.buffer.is_full and self.reservoir.check_full():
                # recent variability relative to historical variability
                relative_var = self.buffer.get_stddev() / self.reservoir.get_stddev()
                if relative_var > (1.0 + self.confidence) or relative_var < (1.0 - self.confidence):
                    self.buffer.clear()
                    self.vol_drift_found = True
                else:
                    self.vol_drift_found = False
            # NOTE(review): when buffer/reservoir are not yet both full, the
            # previous vol_drift_found value is kept — original behavior.
        else:
            self.timestamp += 1
            self.vol_drift_found = False

        return self.vol_drift_found
# ============================ evluation/metrics.py ==========================
# NOTE(review): the package directory is spelled "evluation" (sic) across the
# whole project; renaming it would break every import, so the name is kept.
import numpy as np


def Hamming_loss(Ytest, Ypred):
    """Hamming loss aka Hamming distance: 1 - Hamming score."""
    return 1. - Hamming_score(Ytest, Ypred)


def Hamming_score(Ytest, Ypred):
    """Fraction of correctly predicted labels over all instances and labels."""
    N_test, L = Ytest.shape
    return np.sum((Ytest == Ypred) * 1.) / N_test / L


def Hamming_matches(Ytest, Ypred):
    """Per-label fraction of matching predictions."""
    N_test, L = Ytest.shape
    return np.sum((Ytest == Ypred) * 1., axis=0) / N_test


def Hamming_losses(Ytest, Ypred):
    """Per-label Hamming loss."""
    return 1. - Hamming_matches(Ytest, Ypred)


def Log_loss(Ytest, Ydist):
    """Normalized log loss of predicted distributions ``Ydist``.

    sklearn is imported lazily so this module works without it installed.
    NOTE(review): the ``eps`` argument was deprecated and later removed in
    recent scikit-learn releases — confirm against the installed version.
    """
    from sklearn.metrics import log_loss
    return log_loss(Ytest, Ydist, eps=1e-15, normalize=True)


def J_index(Ytest, Ypred):
    """Mean Jaccard index over instances (1.0 for two all-zero rows)."""
    N_test, L = Ytest.shape
    s = 0.0
    for i in range(N_test):
        inter = np.sum((Ytest[i, :] * Ypred[i, :]) > 0) * 1.
        union = np.sum((Ytest[i, :] + Ypred[i, :]) > 0) * 1.
        if union > 0:
            s = s + (inter / union)
        elif np.sum(Ytest[i, :]) == 0:
            # both rows empty: count as a perfect match
            s = s + 1.
    return s * 1. / N_test


def Exact_match(Ytest, Ypred):
    """Fraction of instances whose full label vector is predicted exactly."""
    N_test, L = Ytest.shape
    return np.sum(np.sum((Ytest == Ypred) * 1, axis=1) == L) * 1. / N_test


def printEvalHeader():
    # NOTE(review): header spacing reconstructed to line up with printEval's
    # "%-20s" format; the original literal was garbled in transit.
    print("Algorithm            Jacc.  Hamm.  Exact  Time ")


def printEval(Ytest, Ypred, name="Method", time=0.0):
    """Print one evaluation row: name, J index, Hamming loss, exact match, time."""
    print("%-20s %.3f  %.3f  %.3f  %0.1f" % (
        name, J_index(Ytest, Ypred), Hamming_loss(Ytest, Ypred),
        Exact_match(Ytest, Ypred), time))


def Edit_distance(Ytest, Ypred):
    """Average edit distance over instances."""
    N_test, L = Ytest.shape
    s = 0.
    for i in range(N_test):
        s = s + edit_distance(Ytest[i, :], Ypred[i, :])
    return s * 1. / N_test


def h_loss(ytest, ypred):
    """Raw Hamming distance in bits (not divided by L); used by edit_distance."""
    return np.sum(ytest != ypred)


def Hamming_distances(Ytest, Ypred):
    """Per-label Hamming distance; probably only meaningful for sequential data."""
    N_test, L = Ytest.shape
    return np.sum((Ytest != Ypred) * 1., axis=0) / N_test


def Edit_distances(Ytest, Ypred):
    """Edit distance of each label prefix, normalized by prefix length."""
    N_test, L = Ytest.shape
    d = np.zeros(L)
    for j in range(L):
        d[j] = Edit_distance(Ytest[:, 0:j + 1], Ypred[:, 0:j + 1])
    return d / np.arange(1, L + 1)


def edit_distance(y, p):
    """Levenshtein distance between sequences ``y`` and ``p``.

    Iterative two-row variant (cf. the Wikipedia article); the h_loss check
    short-circuits the identical-sequence case.
    """
    if h_loss(y, p) == 0:
        return 0
    elif len(y) == 0:
        return len(p)
    elif len(p) == 0:
        return len(y)
    v0 = list(range(len(p) + 1))
    v1 = [0] * (len(p) + 1)
    for i in range(len(y)):
        v1[0] = i + 1
        for j in range(len(p)):
            cost = 0 if y[i] == p[j] else 1
            v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
        v0[:] = v1
    return v1[len(p)]


# ========================== evluation/prequential.py ========================
# BUGFIX(review): `from time import clock` — time.clock was removed in
# Python 3.8; perf_counter is the documented replacement for timing.
# The unused, Unix-only `import resource as rs` was dropped; psutil is now
# imported lazily inside prequential_evaluation.
from time import perf_counter
import os


def exact(yt, yp):
    """0/1 error function: 1 where the prediction equals the truth."""
    return (yp == yt) * 1


def get_errors(Y, P, J=J_index):
    """Per-instance score of predictions ``P`` against truth ``Y`` using metric ``J``."""
    N, L = Y.shape
    E = np.zeros((N))
    for i in range(N):
        E[i] = J(Y[i, :].reshape(1, -1), P[i, :].reshape(1, -1))
    return E


def prequential_evaluation(X, Y, H, N_train):
    """Prequential (test-then-train) evaluation.

    Parameters
    ----------
    X : array-like, instances
    Y : array-like (T x L), labels
    H : list of classifiers [h_1, ..., h_M]
    N_train : int
        Number of instances used for the initial batch fit.

    Returns
    -------
    E_pred : predictions per model and test instance
    E_time : per-step wall-clock time per model
    E_usage : per-step memory usage (% of total) per model
    """
    import psutil  # lazy: keeps the module importable without psutil

    M = len(H)
    T, L = Y.shape

    # split off an initial batch ...
    Y_init = Y[0:N_train]
    X_init = X[0:N_train]
    # ... and use the remainder for both incremental training and evaluation
    Y = Y[N_train:]
    X = X[N_train:]

    E_pred = np.zeros((M, T - N_train, L))
    E_time = np.zeros((M, T - N_train))
    E_usage = np.zeros((M, T - N_train))

    for m in range(M):
        H[m].fit(X_init, Y_init)

    process = psutil.Process(os.getpid())  # hoisted out of the per-step loop
    for t in range(0, T - N_train):
        for m in range(M):
            start_time = perf_counter()
            E_pred[m, t, :] = H[m].predict(X[t, :].reshape(1, -1))
            H[m].partial_fit(X[t, :].reshape(1, -1), Y[t, :].reshape(1, -1))
            E_time[m, t] += (perf_counter() - start_time)
            E_usage[m, t] = process.memory_percent()

    return E_pred, E_time, E_usage


# ============================ test.py (header) ==============================
# NOTE(review): the module docstring of test.py begins here and continues in
# the next chunk. It claims the "Power Supply" dataset, but the script loads
# data/elecNormNew.csv (the Electricity dataset) — description should match.
# ================================= test.py =================================
"""Test script for the three drift-detection algorithms implemented in this
project (Adwin, DDM, Stream Volatility).

The test is based on Prequential Evaluation and monitors 3 performance
indicators: accuracy (exact match), running time, and memory usage.

NOTE(review): the original header described the "Power Supply" dataset, but
the script loads data/elecNormNew.csv — the Electricity (elecNormNew)
dataset: 45312 instances, 8 features, binary class {"UP", "DOWN"}.
"""

# Authors: Wenjun Bai
#          Shu Shang
#          Duyen Phuc Nguyen
# License: BSD 3 clause

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.naive_bayes import GaussianNB
from classifiers.detector_classifier import DetectorClassifier
from drift_detector.adwin import Adwin
from drift_detector.DDM import DDM
from drift_detector.stream_volatility.volatility_detector import VolatilityDetector
from evluation.metrics import Exact_match
from evluation.prequential import prequential_evaluation, get_errors

# reproducible runs: the volatility detector's reservoir replacement is random
np.random.seed(0)

print('Load data')

df = pd.read_csv("data/elecNormNew.csv")
df['class'] = df['class'].map({'UP': 0, 'DOWN': 1})
L = 8            # number of feature columns; the remaining column is the label
N_train = 1000   # size of the initial batch used for the first fit

labels = df.columns.values.tolist()[L:]
data = df.values
T = len(data)
Y = data[:, L:]
X = data[:, 0:L]

print("Experimentation")

h = [DetectorClassifier(GaussianNB(), Adwin()),
     DetectorClassifier(GaussianNB(), VolatilityDetector(drift_detector=Adwin(), size=32)),
     DetectorClassifier(GaussianNB(), DDM()),
     GaussianNB()]
E_pred, E_time, E_usage = prequential_evaluation(X, Y, h, N_train)

print("Evaluation")

E = np.zeros((len(h), T - N_train))
for m in range(len(h)):
    E[m] = get_errors(Y[N_train:], E_pred[m], J=Exact_match)

print("Plot Results")
print("---------------------------------------")
w = 200  # window of the moving-average smoothing applied to each curve
fig, axes = plt.subplots(nrows=3, ncols=1)
fig.tight_layout()
for m in range(len(h)):
    # choose the curve label once instead of re-testing the class six times
    is_dc = h[m].__class__.__name__ == 'DetectorClassifier'
    label = h[m].get_detector_name() if is_dc else h[m].__class__.__name__

    acc = np.mean(E[m, :])
    print(h[m].__class__.__name__)
    if is_dc:
        print(h[m].get_detector_name())
    # BUGFIX(review): acc is already a scalar mean; the original wrapped it
    # in a redundant np.mean
    print("Exact Match %3.2f" % acc)
    if is_dc:
        print("Number of detected drifts: %d" % h[m].num_change_detected)
    print("---------------------------------------")

    kernel = np.ones((w,)) / w
    curves = (np.convolve(E[m, :], kernel, 'same'),
              np.convolve(E_time[m, :], kernel, 'same'),
              np.convolve(E_usage[m, :], kernel, 'same'))
    for row, curve in enumerate(curves, start=1):
        plt.subplot(3, 1, row)
        plt.plot(np.arange(len(curve)), curve, '-', label=label)

plt.subplot(3, 1, 1)
plt.xlabel('Instance(samples)')
plt.ylabel('Accuracy(exact match)')
plt.title('Performance(acc)')
plt.legend(loc='best')
plt.subplot(3, 1, 2)
plt.xlabel('Instance(samples)')
plt.ylabel('Running time(ms)')
plt.title('Performance(Running time)')
plt.legend(loc='best')
plt.subplot(3, 1, 3)
plt.xlabel('Instance(samples)')
plt.ylabel('Memory usage (%MEM)')
plt.title('Performance(Memory usage)')
plt.legend(loc='best')
plt.show()