├── .gitignore ├── README.md ├── classifiers ├── __init__.py └── detector_classifier.py ├── data └── elecNormNew.csv ├── doc ├── doc_drift_detection.md └── doc_drift_detection.pdf ├── drift_detector ├── DDM.py ├── __init__.py ├── adwin.py ├── adwin_list.py ├── adwin_list_node.py └── stream_volatility │ ├── __init__.py │ ├── buffer.py │ ├── reservoir.py │ └── volatility_detector.py ├── evluation ├── __init__.py ├── metrics.py └── prequential.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | classifer/*.pyc 3 | drift_detector/*.pyc 4 | evaluation/*.pyc 5 | .idea/ 6 | .ipynb_checkpoints/ 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Project of IoT(Internet of Things) data stream mining course 3 | 4 | > Objectives 5 | 6 | * Implementation of three drift detection algorithms 7 | * Adwin 8 | * DDM 9 | * Stream Volatility 10 | 11 | For more documentation, please find at doc folder. 
12 | -------------------------------------------------------------------------------- /classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeyegorov/drift-detection/a738e07f69db257e6fcf0a2dbfb596620fe1149c/classifiers/__init__.py -------------------------------------------------------------------------------- /classifiers/detector_classifier.py: -------------------------------------------------------------------------------- 1 | """ Detector Classifier, a wrapper to combine the classifier and drift detector """ 2 | 3 | # Authors: Wenjun Bai 4 | # Shu Shang 5 | # Duyen Phuc Nguyen 6 | # License: BSD 3 clause 7 | 8 | import numpy as np 9 | 10 | from sklearn import clone 11 | from sklearn.metrics import accuracy_score 12 | 13 | 14 | class DetectorClassifier(): 15 | """ 16 | A detector classifier is a classifier combined with a drift detector. 17 | This class serves as wrapper to combine a classifier and a drift detector together. 18 | """ 19 | def __init__(self, clf, drift_detector): 20 | """ 21 | Initialize a detector classifier. 22 | 23 | Parameters 24 | ---------- 25 | clf: a classifier, like Naive Bayes classifier 26 | drift_detector: a drift detector, like adwin, DDM 27 | """ 28 | self.classes = None 29 | self.clf = clf 30 | self.drift_detector = drift_detector 31 | self.num_change_detected = 0 32 | 33 | def fit(self, X, y): 34 | """Fit drift detector classifier according to X, y 35 | 36 | Parameters 37 | ---------- 38 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 39 | Training vectors, where n_samples is the number of samples and 40 | n_features is number of features. 41 | y : array-like, shape = [n_samples] 42 | Target values. 
43 | 44 | Returns 45 | ------- 46 | self : object 47 | return self 48 | """ 49 | self.clf.fit(X, y) 50 | self.classes = np.unique(y) 51 | return self 52 | 53 | def partial_fit(self, X, y): 54 | """Incremental fit on a batch of samples. 55 | This method is expected to be called several times consecutively 56 | on different chunks of a dataset so as to implement out-of-core 57 | or online learning. 58 | 59 | This is especially useful when the whole dataset is too big to fit in 60 | memory at once. 61 | 62 | This method has some performance and numerical stability overhead, 63 | hence it is better to call partial_fit on chunks of data that are 64 | as large as possible (as long as fitting in the memory budget) to 65 | hide the overhead. 66 | 67 | Parameters 68 | ---------- 69 | X : array-like, shape(n_samples, n_features) 70 | Training vectors, when n_samples is the number of samples 71 | and n_features is the number of features. 72 | y : array-like, shape(n_samples, ) 73 | Target values. 74 | 75 | Returns 76 | ------- 77 | self : object 78 | return self. 79 | """ 80 | pre_y = self.clf.predict(X) 81 | if self.drift_detector.set_input(accuracy_score(pre_y, y)): 82 | self.num_change_detected += 1 83 | self.clf = clone(self.clf) 84 | # print("change detected...") 85 | # self.clf.fit(X, y) 86 | self.clf.partial_fit(X, y, classes=self.classes) 87 | else: 88 | self.clf.partial_fit(X, y) 89 | return self 90 | 91 | def predict(self, X): 92 | """ 93 | Perform prediction on an array of test vectors X. 
94 | 95 | Parameters 96 | ---------- 97 | X : array-like, shape = [n_samples, n_features] 98 | 99 | Returns 100 | ------- 101 | array, shape = [n_samples] 102 | Predicted target values for X 103 | """ 104 | return self.clf.predict(X) 105 | 106 | def get_detector_name(self): 107 | return self.drift_detector.__class__.__name__ 108 | -------------------------------------------------------------------------------- /doc/doc_drift_detection.md: -------------------------------------------------------------------------------- 1 | # Adaptive Sliding Window (ADWIN) 2 | 3 | Adaptive sliding window algorithm is automatically maintaining the window of variable size when date change with rigorous guarantees of it performance. 4 | 5 | > Algorithm Setting 6 | 7 | **Input**: 8 | 9 | - Confidence value $\delta$ belong to (0, 1) 10 | - Sequence of real vaues $x_1$, $x_2$, $x_2$, . . . , $x_t$ , . . . 11 | - The value of $x_t$ is available only at time $t$ 12 | 13 | **Assume**: 14 | 15 | - $x_t$ is always in [0, 1] 16 | - an Interval [a, b] such that $a<=$$x_t$$<=b$ with probability 1 17 | - Nothing else is known about the sequence of distributions $D_t$ 18 | - $\mu_t$ and $\sigma_t^2$ are unknown for all $t$ 19 | 20 | 21 | The adapting sliding window algorithm called ADWIN suitable for data streams with sudden drift. The algorithm keeps a sliding window *W* with the most recently read examples. The main idea of ADWIN is as follows: whenever two “large enough” subwindows of *W* exhibit “distinct enough” averages, one can conclude that the corresponding expected values are different, and the older portion of the window is dropped. This involves answering a statistical hypothesis: “Has the average $\mu_t$ remained constant in *W* with confidence $\delta$ ”? The pseudo-code of ADWIN is listed below. 
22 | 23 | > **ADWIN0:** ADAPTIVE WINDOWING ALGORITHM 24 | > 25 | 26 | > 1: Initialize Window $W$ 27 | > 2: **for** each $t > 0$ 28 | > 3:     **do** {$x_t$} $\bigcup$ *W* $\to$ *W* (i.e., add $x_t$ to the head of *W*) 29 | > 4:         **repeat** Drop elements from the tail of *W* 30 | > 5:             **until** | $\hat{\mu}_{W_0}$ - $\hat{\mu}_{W_1}$ | < ${\epsilon}_{cut}$ holds 31 | > 6:                 for every spilt of $W$ into $W$ = $W_0 W_1$ 32 | > 7:         output $\hat{\mu}_{W}$ 33 | 34 | 35 | The key part of the algorithm lies in the definition of ${\epsilon}_{cut}$ and the test it is used for. The different statistical tests can be used for this purpose, but propose only one specific implementation. Let $n$ denote the size of $W$, and $n_0$ and $n_1$ the sizes of $W_0$ and $W_1$ consequently, so that $n = n_0 +n_1$. Let $\hat{\mu}_{W_0}$ and $\hat{\mu}_{W_1}$ be the averages of the values in $W_0$ and $W_1$ , and $\hat{\mu}_{W_0}$ and $\hat{\mu}_{W_1}$ their expected values. The value of ${\epsilon}_{cut}$ is proposed as follows: 36 | $$ 37 | {\epsilon}_{cut} = \sqrt{\frac{1}{2m}\frac{4}{\delta'}} 38 | $$ 39 | where 40 | $$ 41 | m = \frac{1}{\frac{1}{n_0}+\frac{1}{n_1}}, and , \delta '=\frac{\delta}{n} 42 | $$ 43 | The statistical test in line 6 of the pseudo-code checks if the observed average in both subwindows differs by more than threshold ${\epsilon}_{cut}$. The threshold is calculated using the Hoeffding bound, thus gives formal guarantees of the base classifiers performance. The phrase “holds for every split of $W$ into $ W = W_0 W_1$” means that we need to check all pairs of subwindows W0 and W1 created by splitting $W$ into two. The verification of all subwindows is very costly due to the number of possible split points. That is why the authors proposed an improvement to the algorithm that allows to find a good cut point quickly. 
The originally proposed ADWIN algorithms are also lossless learners, thus the window size $W$ can grow infinitely if no drift occurs. This can be easily improved by adding a parameter that would limit the windows maximal size. In its original form, proposed by Bifet, ADWIN works only for 1-dimensional data, e.g., the running error. For this method to be used for n-dimensional raw data, a separate window should be maintained for each dimension. Such a modified model, although costly, reflects the fact that the importance of each feature may change at different pace. 44 | 45 | **References:** 46 | 47 | > - A. Bifet, R. Gavalda. (2007). "Learning from Time-Changing Data with Adaptive Windowing". Proceedings of the 2007 SIAM International Conference on Data Mining 443-448. 48 | > - A. Bifet, J. Read, B.Pfahringer.G. Holmes, I. Zliobaite. (2013). "CD-MOA: Change Detection Framework for Massive Online Analysis". Springer Berlin Heidelberg 8207(9): 443-448. 49 | 50 | 51 | # Drift Detection Method (DDM) 52 | 53 | Drift Detection Method (DDM) model the number of classification errors with a Binomial distribution. The idea of this method is that in each iteration an online classifier predicts the decision class of an example. That prediction can be either *true* or *false*, thus for a set of examples the error is a random variable from Bernoulli trials. 54 | 55 | Let us denote $p_{i}$ as the probability of a *false* prediction and $s_{i}$ as its standard deviation calculated as given by Equation (1): 56 | $$ 57 | s_{i}=\sqrt{\frac{p_{i}(1-p{i})}{i}} \tag{1} 58 | $$ 59 | For a sufficiently large number of examples (*n* > 30), the Binomial distribution is closely approximated by a Normal distribution with the same mean and variance. For each example in the data stream the error rate is tracked updating two registers: $p_{min}$ and $s_{min}$. 
These values are used to calculate a *warning level* condition presented in Equation 2 and an *alarm level* condition presented in Equation 3. Each time a warning level is reached, examples are remembered in a separate window. If afterwards the error rate falls below the warning threshold, the warning is treated as a false alarm and the separate window is dropped. However, if the alarm level is reached, the previously taught base learner is dropped and a new one is created, but only from the examples stored in the separate warning” window. 60 | $$ p_{i} + s_{i} \geq p_{min} + \alpha s_{min} \tag{2} 61 | $$ 62 | 63 | $$ 64 | p_{i} + s_{i} \geq p_{min} + \beta s_{min} \tag{3} 65 | $$ 66 | 67 | The value $\alpha$ and $\beta$ in the above conditions decide about the confidence levels at which the warning and alarm signals are triggered. 68 | 69 | 70 | > Algorithm Setting 71 | 72 | **Input**: 73 | 74 | - $S$: a data stream of examples 75 | - $C$: classifier 76 | 77 | **Output**: *W* : a window with examples selected to train classifier $C$ 78 | 79 | >DDM: Drift Detection Method 80 | > 81 | 1: Initialize $(i, p_{i}, s_{i}, ps_{min}, p_{min}, s_{min});$ 82 | 2: $newDrift \leftarrow false;$ 83 | 3: $W \leftarrow \emptyset;$ 84 | 4: $W^{'} \leftarrow \emptyset;$ 85 | 5: **for all** examples $x_{i} \in S$ **do** 86 | 6:      **if** prediction $C(x_{i})$ is incorrect **then** 87 | 7:         $p_{i} \leftarrow p_{i} +(1.0-p_{i})/i; $ 88 | 8:     **else** 89 | 9:         $p_{i} \leftarrow p_{i}-(p_{i})/i;$ 90 | 10:     compute $s_{i}$ using (1); 91 | 11:     $i \leftarrow i+1;$ 92 | 12:      **if** $ i>30$ (approximated normal distribution) **then** 93 | 13:          **if** $p_{i} +p_{s} \leq ps_{min}$ **then** 94 | 14:              $p_{min} \leftarrow p_{i};$ 95 | 15:             $s_{min} \leftarrow s_{i};$ 96 | 16:             $ps_{min} \leftarrow p_{i}+s_{i};$ 97 | 17:         **if** drift detected (3) **then** 98 | 18:             Initalize 
($i,p_{i},s_{i},ps_{min},p_{min},s_{min});$
125 | 126 | **Volatility Detection**: Let $C_{1} = (c_{1}, c_{2},...,c_{k})$ and $C_{2} = (c_{k+1}, c_{k+2},...,c_{t})$ represent a sample of cut points detected from a stream. $p_{i}$ represent the distance intervals (periods) between two consecutive cut points $c_{i}$ and $c_{i-1}$. We are able to derive volatility windows $P_{1}= (p_{1}, p_{2},...,p_{k})$ and $P_{2}= (p_{k+1}, p_{k+2},...,p_{t-1})$ with sample variance of $\sigma_{1}$ and $\sigma_{2}$. The volatility detection problem can be expressed as testing $\frac{\sigma_{1}}{\sigma_{2}}\lessgtr 1.0\pm\beta$, where $\beta$ is a user-expressed tolerance threshold. If the test holds true we say that there is a shift in volatility between the two samples. 127 | 128 | > Algorithm Setting 129 | 130 | **Input** 131 | 132 | - A sequence of real values $p_{1}, p_{2},...,p_{t}$ representing the distance intervals between cut points discovered by drift detection techniques. 133 | 134 | **Output** 135 | 136 | - Shift points of stream volatility 137 | 138 | > Stream Volatility Detector 139 | > 140 | > 1: Initialize buffer $B$ and Reservoir $R$; 141 | > 2: Boolean: volatilityShift $\leftarrow$ false 142 | > 3: **for each** $t > 0$ **do** 143 | > 4:         $j$ $\leftarrow$ addToBuffer$(x_{t}, B)$; 144 | > 5:         addToReservoir$(j, R)$; 145 | > 6:         RelativeVariance $\leftarrow \frac{\sigma_{B}}{\sigma_{R}}$; 146 | > 7:         **if** Relative Variance $\lessgtr 1.0\pm\beta$ **then** 147 | > 8:                  volatilityShift $\leftarrow$ True; 148 | > 9:         **end** 149 | > 10: **end** 150 | > 11: **Funtion** addToBuffer(item $k$, Buffer $B$) 151 | > 12:         add $k$ as tail of $B$; 152 | > 13:         return head of $B$; 153 | > 14: **end** 154 | > 15: **Function** addToReservoir(iterm $k$, Reservoir $R$) 155 | > 16:         rPos $\leftarrow random()$; 156 | > 17:         R[rPos] $\leftarrow k$; 157 | > 18: **end** 158 | 159 | There are two main components in the volatility detector: a buffer and a 
reservoir. The buffer is a sliding window that keeps the most recent samples of drift intervals acquired from a drift detection technique. The reservoir is a pool that stores previous samples which ideally represent the overall state of the stream. 160 | 161 | The progression of the volatility detector when inputs come in is as follows: First, when input $x_{t}$ arrives at position $t$, it is first moved into the buffer where a sliding window keeps recent samples. As the sliding window slides, the oldest entry in the buffer is dropped from the buffer and moved into the reservoir, then the reservoir stores the dropped entry by randomly replacing one of its stored samples. Lastly, the detector then compares the samples in the buffer to the samples in the reservoir to analyze for differences. The primary difference between the buffer and the reservoir is that the buffer always keeps the most recent contents of the stream whereas the reservoir keeps an overall view of the stream. For a change in relative volatility to be detected, we use the Relative Variance measure. Relative Variance at a particular point is calculated as $\frac{\sigma_{B}}{\sigma_{R}}$ , where $\sigma_{B}$ is the variance calculated using the samples in the buffer and $\sigma_{R}$ is the variance calculated using the samples in the reservoir. 162 | 163 | **Reference** 164 | > - Huang, D.T.J., Koh, Y.S., Dobbie, G., Pears, R.: Detecting volatility shift in data streams. In: 2014 IEEE International Conference on Data Mining (ICDM), pp. 
863–868 (2014) 165 | -------------------------------------------------------------------------------- /doc/doc_drift_detection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeyegorov/drift-detection/a738e07f69db257e6fcf0a2dbfb596620fe1149c/doc/doc_drift_detection.pdf -------------------------------------------------------------------------------- /drift_detector/DDM.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ Drift detection method based in DDM method of Joao Gama SBIA 2004. """ 3 | 4 | # Authors: Wenjun Bai 5 | # Shu Shang 6 | # Duyen Phuc Nguyen 7 | # License: BSD 3 clause 8 | 9 | import sys 10 | import math 11 | 12 | 13 | class DDM: 14 | """ 15 | The drift detection method (DDM) controls the number of errors 16 | produced by the learning model during prediction. It compares 17 | the statistics of two windows: the first contains all the data, 18 | and the second contains only the data from the beginning until 19 | the number of errors increases. 20 | Their method doesn't store these windows in memory. 21 | It keeps only statistics and a window of recent errors data.". 22 | 23 | References 24 | --------- 25 | Gama, J., Medas, P., Castillo, G., Rodrigues, P.: 26 | "Learning with drift detection". In: Bazzan, A.L.C., Labidi, 27 | S. (eds.) SBIA 2004. LNCS (LNAI), vol. 3171, pp. 286–295. Springer, Heidelberg (2004) 28 | """ 29 | 30 | def __init__(self): 31 | self.m_n = 1 32 | self.m_p = 1 33 | self.m_s = 0 34 | self.m_psmin = sys.float_info.max 35 | self.m_pmin = sys.float_info.max 36 | self.m_smin = sys.float_info.max 37 | self.change_detected = False 38 | self.is_initialized = True 39 | self.estimation = 0.0 40 | self.is_warning_zone = False 41 | 42 | def set_input(self, prediction): 43 | """ 44 | The number of errors in a sample of n examples is modelled by a binomial distribution. 
45 | For each point t in the sequence that is being sampled, the error rate is the probability 46 | of mis-classifying p(t), with standard deviation s(t). 47 | DDM checks two conditions: 48 | 1) p(t) + s(t) > p(min) + 2 * s(min) for the warning level 49 | 2) p(t) + s(t) > p(min) + 3 * s(min) for the drift level 50 | 51 | Parameters 52 | ---------- 53 | prediction : new element, it monitors the error rate 54 | 55 | Returns 56 | ------- 57 | change_detected : boolean 58 | True if a change was detected. 59 | """ 60 | if self.change_detected is True or self.is_initialized is False: 61 | self.reset() 62 | self.is_initialized = True 63 | 64 | self.m_p += (prediction - self.m_p) / float(self.m_n) 65 | self.m_s = math.sqrt(self.m_p * (1 - self.m_p) / float(self.m_n)) 66 | 67 | self.m_n += 1 68 | self.estimation = self.m_p 69 | self.change_detected = False 70 | 71 | if self.m_n < 30: 72 | return False 73 | 74 | if self.m_p + self.m_s <= self.m_psmin: 75 | self.m_pmin = self.m_p; 76 | self.m_smin = self.m_s; 77 | self.m_psmin = self.m_p + self.m_s; 78 | 79 | if self.m_p + self.m_s > self.m_pmin + 3 * self.m_smin: 80 | self.change_detected = True 81 | elif self.m_p + self.m_s > self.m_pmin + 2 * self.m_smin: 82 | self.is_warning_zone = True 83 | else: 84 | self.is_warning_zone = False 85 | 86 | return self.change_detected 87 | 88 | def reset(self): 89 | """reset the DDM drift detector""" 90 | self.m_n = 1 91 | self.m_p = 1 92 | self.m_s = 0 93 | self.m_psmin = sys.float_info.max 94 | self.m_pmin = sys.float_info.max 95 | self.m_smin = sys.float_info.max 96 | -------------------------------------------------------------------------------- /drift_detector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeyegorov/drift-detection/a738e07f69db257e6fcf0a2dbfb596620fe1149c/drift_detector/__init__.py -------------------------------------------------------------------------------- 
/drift_detector/adwin.py: -------------------------------------------------------------------------------- 1 | """ADaptive sliding WINdow Algorithm (ADWIN).""" 2 | 3 | # Authors: Wenjun Bai 4 | # Shu Shang 5 | # Duyen Phuc Nguyen 6 | # License: BSD 3 clause 7 | 8 | import math 9 | 10 | from drift_detector.adwin_list import AdwinList 11 | 12 | 13 | class Adwin(object): 14 | """The Adwin algorithm is a change detector and estimator. 15 | It keeps a sliding (variable-length) window with the most 16 | recently read example,with the property that the window 17 | has the maximal length statistically consistent with the 18 | hypothesis that "there has been no change in the average 19 | value inside the window". 20 | 21 | References 22 | ---------- 23 | A. Bifet, R. Gavalda. (2007). "Learning from Time-Changing 24 | Data with Adaptive Windowing". Proceedings of the 2007 SIAM 25 | International Conference on Data Mining 443-448. 26 | http://www.lsi.upc.edu/~abifet/Timevarying.pdf 27 | 28 | A. Bifet, J. Read, B.Pfahringer.G. Holmes, I. Zliobaite. 29 | (2013). "CD-MOA: Change Detection Framework for Massive Online 30 | Analysis". Springer Berlin Heidelberg 8207(9): 443-448. 31 | https://sites.google.com/site/zliobaitefiles/cdMOA-CR.pdf?attredirects=0 32 | """ 33 | 34 | def __init__(self, delta=0.01): 35 | """Init the buckets 36 | 37 | Parameters 38 | ---------- 39 | delta : float 40 | confidence value. 
41 | """ 42 | 43 | self.mint_clock = 1.0 44 | self.min_window_length = 16 45 | self.delta = delta 46 | self.max_number_of_buckets = 5 47 | self.bucket_list = AdwinList(self.max_number_of_buckets) 48 | self.mint_time = 0.0 49 | self.min_clock = self.mint_clock 50 | self.mdbl_error = 0.0 51 | self.mdbl_width = 0.0 52 | self.last_bucket_row = 0 53 | self.sum = 0.0 54 | self.width = 0.0 55 | self.variance = 0.0 56 | self.bucket_number = 0 57 | 58 | def get_estimation(self): 59 | """Get the estimation value""" 60 | if self.width > 0: 61 | return self.sum / float(self.width) 62 | else: 63 | return 0 64 | 65 | def set_input(self, value): 66 | """Add new element and reduce the window 67 | 68 | Parameters 69 | ---------- 70 | value : new element 71 | 72 | Returns 73 | ------- 74 | boolean: the return value of the method check_drift(), true if a drift was detected. 75 | """ 76 | self.insert_element(value) 77 | self.compress_buckets() 78 | return self.check_drift() 79 | 80 | def length(self): 81 | """Get the length of window""" 82 | return self.width 83 | 84 | def insert_element(self, value): 85 | """insert new bucket""" 86 | self.width += 1 87 | self.bucket_list.head.insert_bucket(float(value), 0.0) 88 | self.bucket_number += 1 89 | if self.width > 1: 90 | self.variance += (self.width - 1) * (value - self.sum / (self.width - 1)) \ 91 | * (value - self.sum / (self.width - 1)) / self.width 92 | self.sum += value 93 | 94 | def compress_buckets(self): 95 | """ 96 | Merge buckets. 97 | Find the number of buckets in a row, if the row is full, then merge the two buckets. 
98 | """ 99 | i = 0 100 | cont = 0 101 | cursor = self.bucket_list.head 102 | next_node = None 103 | while True: 104 | k = cursor.size 105 | if k == self.max_number_of_buckets + 1: 106 | next_node = cursor.next 107 | if next_node is None: 108 | self.bucket_list.add_to_tail() 109 | next_node = cursor.next 110 | self.last_bucket_row += 1 111 | n1 = self.bucket_size(i) 112 | n2 = self.bucket_size(i) 113 | u1 = cursor.sum[0] / n1 114 | u2 = cursor.sum[1] / n2 115 | internal_variance = n1 * n2 * (u1 - u2) * (u1 - u2) / (n1 + n2) 116 | next_node.insert_bucket(cursor.sum[0] + cursor.sum[1], 117 | cursor.variance[0] + cursor.variance[1] + internal_variance) 118 | self.bucket_number -= 1 119 | cursor.drop_bucket(2) 120 | if next_node.size <= self.max_number_of_buckets: 121 | break 122 | else: 123 | break 124 | cursor = cursor.next 125 | i += 1 126 | if cursor is None: 127 | break 128 | 129 | def check_drift(self): 130 | """ 131 | Reduce the window, detecting if there is a drift. 132 | 133 | Returns 134 | ------- 135 | change : boolean value 136 | Result of whether the window has changed. 
137 | """ 138 | 139 | change = False 140 | exit = False 141 | cursor = None 142 | self.mint_time += 1 143 | if self.mint_time % self.min_clock == 0 and self.width > self.min_window_length: 144 | reduce_width = True 145 | while reduce_width: 146 | reduce_width = False 147 | exit = False 148 | n0 = 0.0 149 | n1 = float(self.width) 150 | u0 = 0.0 151 | u1 = float(self.sum) 152 | cursor = self.bucket_list.tail 153 | i = self.last_bucket_row 154 | while True: 155 | for k in range(cursor.size): 156 | if i == 0 and k == cursor.size - 1: 157 | exit = True 158 | break 159 | n0 += self.bucket_size(i) 160 | n1 -= self.bucket_size(i) 161 | u0 += cursor.sum[k] 162 | u1 -= cursor.sum[k] 163 | min_length_of_subwindow = 5 164 | if n0 >= min_length_of_subwindow and n1 >= min_length_of_subwindow and self.cut_expression(n0, 165 | n1, 166 | u0, 167 | u1): 168 | reduce_width = True 169 | change = True 170 | if self.width > 0: 171 | self.delete_element() 172 | exit = True 173 | break 174 | cursor = cursor.prev 175 | i -= 1 176 | if exit or cursor is None: 177 | break 178 | return change 179 | 180 | def delete_element(self): 181 | """delete the bucket at the tail of window""" 182 | node = self.bucket_list.tail 183 | n1 = self.bucket_size(self.last_bucket_row) 184 | self.width -= n1 185 | self.sum -= node.sum[0] 186 | u1 = node.sum[0] / n1 187 | incVariance = float( 188 | node.variance[0] + n1 * self.width * (u1 - self.sum / self.width) * (u1 - self.sum / self.width)) / ( 189 | float(n1 + self.width)) 190 | self.variance -= incVariance 191 | node.drop_bucket() 192 | self.bucket_number -= 1 193 | if node.size == 0: 194 | self.bucket_list.remove_from_tail() 195 | self.last_bucket_row -= 1 196 | 197 | def cut_expression(self, n0_, n1_, u0, u1): 198 | """Expression calculation""" 199 | n0 = float(n0_) 200 | n1 = float(n1_) 201 | n = float(self.width) 202 | diff = float(u0 / n0) - float(u1 / n1) 203 | v = self.variance / self.width 204 | dd = math.log(2.0 * math.log(n) / self.delta) 205 | 
min_length_of_subwindow = 5 206 | m = (float(1 / (n0 - min_length_of_subwindow + 1))) + (float(1 / (n1 - min_length_of_subwindow + 1))) 207 | eps = math.sqrt(2 * m * v * dd) + float(2 / 3 * dd * m) 208 | if math.fabs(diff) > eps: 209 | return True 210 | else: 211 | return False 212 | 213 | def bucket_size(self, Row): 214 | return int(math.pow(2, Row)) 215 | -------------------------------------------------------------------------------- /drift_detector/adwin_list.py: -------------------------------------------------------------------------------- 1 | """Implementation of an adwin list""" 2 | 3 | # Authors: Wenjun Bai 4 | # Shu Shang 5 | # Duyen Phuc Nguyen 6 | # License: BSD 3 clause 7 | 8 | 9 | from drift_detector.adwin_list_node import AdwinListNode 10 | 11 | 12 | class AdwinList(object): 13 | def __init__(self, max_number_bucket): 14 | """Init a adwin list with a given parameter max_number_buckets 15 | 16 | Parameters 17 | ---------- 18 | max_number_bucket : max number of elements in the bucket 19 | """ 20 | self.head = None 21 | self.tail = None 22 | self.count = 0 23 | self.max_number_bucket = max_number_bucket 24 | self.add_to_head() 25 | 26 | def add_to_tail(self): 27 | """add a node at the tail of adwin list, used in the initialization of an AdwinList""" 28 | temp = AdwinListNode(self.max_number_bucket) 29 | if self.tail is not None: 30 | temp.prev = self.tail 31 | self.tail.next = temp 32 | self.tail = temp 33 | if self.head is None: 34 | self.head = self.tail 35 | self.count += 1 36 | 37 | def add_to_head(self): 38 | """Add a node to the head of an AdwinList""" 39 | temp = AdwinListNode(self.max_number_bucket) 40 | if self.head is not None: 41 | temp.next = self.head 42 | self.head.prev = temp 43 | self.head = temp 44 | if self.tail is None: 45 | self.tail = self.head 46 | self.count += 1 47 | 48 | def remove_from_head(self): 49 | """Remove the head node of an AdwinList""" 50 | temp = self.head 51 | self.head = self.head.next 52 | if self.head is not 
# NOTE(review): this chunk opens mid-way through AdwinList.remove_from_head /
# remove_from_tail (drift_detector/adwin_list.py); that class starts outside
# the visible region, so its fragment is left to the file that owns it.


# ==================== drift_detector/adwin_list_node.py =====================
"""Node implementation of the ADWIN bucket-list data structure."""


class AdwinListNode(object):
    """One row (node) of the ADWIN bucket list.

    A node stores up to ``max_number_of_buckets`` buckets; a bucket is a
    (sum, variance) pair kept at the same index of two parallel lists.
    """

    def __init__(self, max_number_of_buckets):
        """Create an empty node.

        Parameters
        ----------
        max_number_of_buckets : int
            Maximum number of buckets this node can hold.
        """
        self.max_number_of_buckets = max_number_of_buckets
        self.size = 0          # number of buckets currently stored
        self.next = None
        self.prev = None
        # one spare slot (index max_number_of_buckets) so a bucket can be
        # inserted before the caller compresses the overflow
        self.sum = [0.0] * (self.max_number_of_buckets + 1)
        self.variance = [0.0] * (self.max_number_of_buckets + 1)

    def insert_bucket(self, value, variance):
        """Append a bucket at the end.

        Parameters
        ----------
        value : float
            Total (sum) of the new bucket.
        variance : float
            Variance of the new bucket.
        """
        self.sum[self.size] = value
        self.variance[self.size] = variance
        self.size += 1

    def drop_bucket(self, n=1):
        """Drop the ``n`` oldest buckets, shifting the rest toward index 0.

        Parameters
        ----------
        n : int
            Number of buckets to drop (default 1).
        """
        for k in range(n, self.max_number_of_buckets + 1):
            self.sum[k - n] = self.sum[k]
            self.variance[k - n] = self.variance[k]
        for k in range(1, n + 1):
            self.sum[self.max_number_of_buckets - k + 1] = 0.0
            self.variance[self.max_number_of_buckets - k + 1] = 0.0
        self.size -= n


# ================ drift_detector/stream_volatility/buffer.py ================
"""Buffer as a component of the volatility detector."""

import math

import numpy as np


def calculate_stddev(times, mean):
    """Standard deviation of the strictly positive entries of ``times``.

    Zero entries are treated as empty slots and ignored (the buffers are
    zero-initialized); returns 0 when no positive entry exists.
    """
    count = 0
    total = 0.0  # renamed from `sum`, which shadowed the builtin
    for d in times:
        if d > 0:
            count += 1
            total += (d - mean) ** 2
    return math.sqrt(total / count) if count else 0


class Buffer:
    """Fixed-size sliding window over the most recent drift intervals."""

    def __init__(self, size):
        """Initialize a zero-filled buffer of the given size.

        Parameters
        ----------
        size : int
            Capacity of the sliding window.
        """
        self.size = size
        self.buffer = [0.0] * size
        self.sliding_index = 0   # next write position
        self.is_full = False
        self.total = 0           # running sum of the stored values

    def add(self, value):
        """Insert ``value``; return the overwritten value once full, else -1.

        BUGFIX(review): the original cleared the whole window on wrap-around
        and then *appended* past ``size``, so the "removed" value was always
        0.0 and ``total`` (hence mean / stddev) no longer matched the stored
        values. The window is now a proper circular buffer.
        """
        if self.sliding_index == self.size:
            self.is_full = True
            self.sliding_index = 0
        removed = self.buffer[self.sliding_index]
        self.total -= removed
        self.buffer[self.sliding_index] = value
        self.total += value
        self.sliding_index += 1
        return removed if self.is_full else -1

    def get_mean(self):
        """Mean of the values added so far (0 for an empty buffer)."""
        if self.is_full:
            return self.total / self.size
        if self.sliding_index == 0:
            # guard: the original raised ZeroDivisionError on an empty buffer
            return 0
        return self.total / self.sliding_index

    def get_stddev(self):
        """Standard deviation of the window; a tiny epsilon instead of 0 so
        callers can divide by it safely."""
        stddev = calculate_stddev(self.buffer, self.get_mean())
        return stddev if stddev != 0 else 0.00000000001

    def clear(self):
        """Reset the window to its initial empty state."""
        self.buffer = [0.0] * self.size
        self.sliding_index = 0
        self.is_full = False
        self.total = 0


# ============== drift_detector/stream_volatility/reservoir.py ===============
"""Reservoir as a component of the volatility detector."""
# BUGFIX(review): the original `from buffer import calculate_stddev` is a
# Python-2 implicit relative import and fails on Python 3; the helper is in
# scope here (drift_detector.stream_volatility.buffer.calculate_stddev).


class Reservoir:
    """Fixed-size random-replacement pool of older drift intervals."""

    def __init__(self, size):
        """Initialize a zero-filled reservoir of the given size.

        Parameters
        ----------
        size : int
            Capacity of the reservoir.
        """
        self.size = size
        self.elements = [0.0] * size
        self.element_total = 0   # running sum of the stored elements
        self.e_index = 0         # elements inserted so far (capped at size)
        self.rand = np.random

    def add_element(self, input_value):
        """Store ``input_value``; once full, replace a uniformly random victim.

        As the sliding window slides, the entry dropped from the buffer is
        moved here, keeping a sample that represents the stream's history.

        Parameters
        ----------
        input_value : real value
            Must be numeric — it is folded into the running total.
        """
        if self.e_index < self.size:
            self.elements[self.e_index] = input_value
            self.element_total += input_value
            self.e_index += 1
        else:
            index_remove = int(self.rand.rand() * self.e_index)
            self.element_total -= self.elements[index_remove]
            self.elements[index_remove] = input_value
            self.element_total += input_value

    def get_reservoir_mean(self):
        """Mean of the stored elements (0 for an empty reservoir)."""
        if self.e_index == 0:
            # guard: the original divided by zero on an empty reservoir
            return 0
        return self.element_total / self.e_index

    def get_stddev(self):
        """Standard deviation of the stored elements; epsilon instead of 0."""
        stddev = calculate_stddev(self.elements, self.get_reservoir_mean())
        return stddev if stddev != 0 else 0.00000000001

    def get_count(self):
        """Number of elements currently stored (tracked by ``e_index``)."""
        return self.e_index

    def check_full(self):
        """True when the reservoir holds ``size`` elements."""
        return self.e_index == self.size

    def clear(self):
        """Reset the reservoir to its initial empty state."""
        self.elements = [0.0] * self.size
        self.element_total = 0
        self.e_index = 0

    def check_is_clear(self):
        """True when no element has been stored since the last clear()."""
        return self.e_index == 0


# ========= drift_detector/stream_volatility/volatility_detector.py ==========
# coding=utf-8
"""Relative Stream Volatility Detector."""


class VolatilityDetector:
    """Detector of changes in the *rate* of drifts (stream volatility).

    Stream volatility is the rate of the changes reported by a drift detector
    such as ADWIN. This detector wraps a drift detector and raises an alarm
    when the rate of detected drifts itself shifts.

    Two components are used: a sliding-window ``Buffer`` of the most recent
    drift intervals, and a ``Reservoir`` pool of older samples that
    represents the overall state of the stream.

    References
    ----------
    Huang, D.T.J., Koh, Y.S., Dobbie, G., Pears, R.: Detecting volatility
    shift in data streams. In: ICDM 2014, pp. 863-868 (2014)
    """

    def __init__(self, drift_detector, size):
        """Initialize the volatility detector.

        Parameters
        ----------
        drift_detector : drift detector object
            Its ``set_input`` output is monitored for volatility shifts.
        size : int
            Capacity of both the reservoir and the buffer.
        """
        self.drift_detector = drift_detector
        self.sample = 0                  # number of inputs seen so far
        self.reservoir = Reservoir(size)
        self.buffer = Buffer(size)
        self.confidence = 0.05           # width of the relative-variance band
        self.timestamp = 0               # samples since the last drift
        self.vol_drift_found = False
        self.drift_found = False
        self.pre_drift_point = -1        # sample index of the last drift
        self.rolling_index = 0
        self.recent_interval = [0.0] * (size * 2 + 1)

    def set_input(self, input_value):
        """Feed one stream value; return True on a volatility shift.

        Parameters
        ----------
        input_value : real value
            Forwarded to the wrapped drift detector.

        Returns
        -------
        bool
            True when the ratio of recent to historical interval stddev
            leaves the band [1 - confidence, 1 + confidence].
        """
        self.sample += 1
        self.drift_found = self.drift_detector.set_input(input_value)
        if self.drift_found:
            self.timestamp += 1
            if self.buffer.is_full:
                # the interval evicted from the sliding window migrates into
                # the reservoir of historical samples
                evicted = self.buffer.add(self.timestamp)
                self.reservoir.add_element(evicted)
            else:
                self.buffer.add(self.timestamp)
            self.recent_interval[self.rolling_index] = self.timestamp
            self.rolling_index += 1
            if self.rolling_index == self.reservoir.size * 2:
                self.rolling_index = 0
            self.timestamp = 0
            self.pre_drift_point = self.sample
            if self.buffer.is_full and self.reservoir.check_full():
                # recent variability relative to historical variability
                relative_var = self.buffer.get_stddev() / self.reservoir.get_stddev()
                if relative_var > (1.0 + self.confidence) or relative_var < (1.0 - self.confidence):
                    self.buffer.clear()
                    self.vol_drift_found = True
                else:
                    self.vol_drift_found = False
            # NOTE(review): when buffer/reservoir are not yet both full, the
            # previous vol_drift_found value is kept — original behavior.
        else:
            self.timestamp += 1
            self.vol_drift_found = False

        return self.vol_drift_found
# ============================ evluation/metrics.py ==========================
# NOTE(review): the package directory is spelled "evluation" (sic) across the
# whole project; renaming it would break every import, so the name is kept.
import numpy as np


def Hamming_loss(Ytest, Ypred):
    """Hamming loss aka Hamming distance: 1 - Hamming score."""
    return 1. - Hamming_score(Ytest, Ypred)


def Hamming_score(Ytest, Ypred):
    """Fraction of correctly predicted labels over all instances and labels."""
    N_test, L = Ytest.shape
    return np.sum((Ytest == Ypred) * 1.) / N_test / L


def Hamming_matches(Ytest, Ypred):
    """Per-label fraction of matching predictions."""
    N_test, L = Ytest.shape
    return np.sum((Ytest == Ypred) * 1., axis=0) / N_test


def Hamming_losses(Ytest, Ypred):
    """Per-label Hamming loss."""
    return 1. - Hamming_matches(Ytest, Ypred)


def Log_loss(Ytest, Ydist):
    """Normalized log loss of predicted distributions ``Ydist``.

    sklearn is imported lazily so this module works without it installed.
    NOTE(review): the ``eps`` argument was deprecated and later removed in
    recent scikit-learn releases — confirm against the installed version.
    """
    from sklearn.metrics import log_loss
    return log_loss(Ytest, Ydist, eps=1e-15, normalize=True)


def J_index(Ytest, Ypred):
    """Mean Jaccard index over instances (1.0 for two all-zero rows)."""
    N_test, L = Ytest.shape
    s = 0.0
    for i in range(N_test):
        inter = np.sum((Ytest[i, :] * Ypred[i, :]) > 0) * 1.
        union = np.sum((Ytest[i, :] + Ypred[i, :]) > 0) * 1.
        if union > 0:
            s = s + (inter / union)
        elif np.sum(Ytest[i, :]) == 0:
            # both rows empty: count as a perfect match
            s = s + 1.
    return s * 1. / N_test


def Exact_match(Ytest, Ypred):
    """Fraction of instances whose full label vector is predicted exactly."""
    N_test, L = Ytest.shape
    return np.sum(np.sum((Ytest == Ypred) * 1, axis=1) == L) * 1. / N_test


def printEvalHeader():
    # NOTE(review): header spacing reconstructed to line up with printEval's
    # "%-20s" format; the original literal was garbled in transit.
    print("Algorithm            Jacc.  Hamm.  Exact  Time ")


def printEval(Ytest, Ypred, name="Method", time=0.0):
    """Print one evaluation row: name, J index, Hamming loss, exact match, time."""
    print("%-20s %.3f  %.3f  %.3f  %0.1f" % (
        name, J_index(Ytest, Ypred), Hamming_loss(Ytest, Ypred),
        Exact_match(Ytest, Ypred), time))


def Edit_distance(Ytest, Ypred):
    """Average edit distance over instances."""
    N_test, L = Ytest.shape
    s = 0.
    for i in range(N_test):
        s = s + edit_distance(Ytest[i, :], Ypred[i, :])
    return s * 1. / N_test


def h_loss(ytest, ypred):
    """Raw Hamming distance in bits (not divided by L); used by edit_distance."""
    return np.sum(ytest != ypred)


def Hamming_distances(Ytest, Ypred):
    """Per-label Hamming distance; probably only meaningful for sequential data."""
    N_test, L = Ytest.shape
    return np.sum((Ytest != Ypred) * 1., axis=0) / N_test


def Edit_distances(Ytest, Ypred):
    """Edit distance of each label prefix, normalized by prefix length."""
    N_test, L = Ytest.shape
    d = np.zeros(L)
    for j in range(L):
        d[j] = Edit_distance(Ytest[:, 0:j + 1], Ypred[:, 0:j + 1])
    return d / np.arange(1, L + 1)


def edit_distance(y, p):
    """Levenshtein distance between sequences ``y`` and ``p``.

    Iterative two-row variant (cf. the Wikipedia article); the h_loss check
    short-circuits the identical-sequence case.
    """
    if h_loss(y, p) == 0:
        return 0
    elif len(y) == 0:
        return len(p)
    elif len(p) == 0:
        return len(y)
    v0 = list(range(len(p) + 1))
    v1 = [0] * (len(p) + 1)
    for i in range(len(y)):
        v1[0] = i + 1
        for j in range(len(p)):
            cost = 0 if y[i] == p[j] else 1
            v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
        v0[:] = v1
    return v1[len(p)]


# ========================== evluation/prequential.py ========================
# BUGFIX(review): `from time import clock` — time.clock was removed in
# Python 3.8; perf_counter is the documented replacement for timing.
# The unused, Unix-only `import resource as rs` was dropped; psutil is now
# imported lazily inside prequential_evaluation.
from time import perf_counter
import os


def exact(yt, yp):
    """0/1 error function: 1 where the prediction equals the truth."""
    return (yp == yt) * 1


def get_errors(Y, P, J=J_index):
    """Per-instance score of predictions ``P`` against truth ``Y`` using metric ``J``."""
    N, L = Y.shape
    E = np.zeros((N))
    for i in range(N):
        E[i] = J(Y[i, :].reshape(1, -1), P[i, :].reshape(1, -1))
    return E


def prequential_evaluation(X, Y, H, N_train):
    """Prequential (test-then-train) evaluation.

    Parameters
    ----------
    X : array-like, instances
    Y : array-like (T x L), labels
    H : list of classifiers [h_1, ..., h_M]
    N_train : int
        Number of instances used for the initial batch fit.

    Returns
    -------
    E_pred : predictions per model and test instance
    E_time : per-step wall-clock time per model
    E_usage : per-step memory usage (% of total) per model
    """
    import psutil  # lazy: keeps the module importable without psutil

    M = len(H)
    T, L = Y.shape

    # split off an initial batch ...
    Y_init = Y[0:N_train]
    X_init = X[0:N_train]
    # ... and use the remainder for both incremental training and evaluation
    Y = Y[N_train:]
    X = X[N_train:]

    E_pred = np.zeros((M, T - N_train, L))
    E_time = np.zeros((M, T - N_train))
    E_usage = np.zeros((M, T - N_train))

    for m in range(M):
        H[m].fit(X_init, Y_init)

    process = psutil.Process(os.getpid())  # hoisted out of the per-step loop
    for t in range(0, T - N_train):
        for m in range(M):
            start_time = perf_counter()
            E_pred[m, t, :] = H[m].predict(X[t, :].reshape(1, -1))
            H[m].partial_fit(X[t, :].reshape(1, -1), Y[t, :].reshape(1, -1))
            E_time[m, t] += (perf_counter() - start_time)
            E_usage[m, t] = process.memory_percent()

    return E_pred, E_time, E_usage


# ============================ test.py (header) ==============================
# NOTE(review): the module docstring of test.py begins here and continues in
# the next chunk. It claims the "Power Supply" dataset, but the script loads
# data/elecNormNew.csv (the Electricity dataset) — description should match.
# ================================= test.py =================================
"""Test script for the three drift-detection algorithms implemented in this
project (Adwin, DDM, Stream Volatility).

The test is based on Prequential Evaluation and monitors 3 performance
indicators: accuracy (exact match), running time, and memory usage.

NOTE(review): the original header described the "Power Supply" dataset, but
the script loads data/elecNormNew.csv — the Electricity (elecNormNew)
dataset: 45312 instances, 8 features, binary class {"UP", "DOWN"}.
"""

# Authors: Wenjun Bai
#          Shu Shang
#          Duyen Phuc Nguyen
# License: BSD 3 clause

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.naive_bayes import GaussianNB
from classifiers.detector_classifier import DetectorClassifier
from drift_detector.adwin import Adwin
from drift_detector.DDM import DDM
from drift_detector.stream_volatility.volatility_detector import VolatilityDetector
from evluation.metrics import Exact_match
from evluation.prequential import prequential_evaluation, get_errors

# reproducible runs: the volatility detector's reservoir replacement is random
np.random.seed(0)

print('Load data')

df = pd.read_csv("data/elecNormNew.csv")
df['class'] = df['class'].map({'UP': 0, 'DOWN': 1})
L = 8            # number of feature columns; the remaining column is the label
N_train = 1000   # size of the initial batch used for the first fit

labels = df.columns.values.tolist()[L:]
data = df.values
T = len(data)
Y = data[:, L:]
X = data[:, 0:L]

print("Experimentation")

h = [DetectorClassifier(GaussianNB(), Adwin()),
     DetectorClassifier(GaussianNB(), VolatilityDetector(drift_detector=Adwin(), size=32)),
     DetectorClassifier(GaussianNB(), DDM()),
     GaussianNB()]
E_pred, E_time, E_usage = prequential_evaluation(X, Y, h, N_train)

print("Evaluation")

E = np.zeros((len(h), T - N_train))
for m in range(len(h)):
    E[m] = get_errors(Y[N_train:], E_pred[m], J=Exact_match)

print("Plot Results")
print("---------------------------------------")
w = 200  # window of the moving-average smoothing applied to each curve
fig, axes = plt.subplots(nrows=3, ncols=1)
fig.tight_layout()
for m in range(len(h)):
    # choose the curve label once instead of re-testing the class six times
    is_dc = h[m].__class__.__name__ == 'DetectorClassifier'
    label = h[m].get_detector_name() if is_dc else h[m].__class__.__name__

    acc = np.mean(E[m, :])
    print(h[m].__class__.__name__)
    if is_dc:
        print(h[m].get_detector_name())
    # BUGFIX(review): acc is already a scalar mean; the original wrapped it
    # in a redundant np.mean
    print("Exact Match %3.2f" % acc)
    if is_dc:
        print("Number of detected drifts: %d" % h[m].num_change_detected)
    print("---------------------------------------")

    kernel = np.ones((w,)) / w
    curves = (np.convolve(E[m, :], kernel, 'same'),
              np.convolve(E_time[m, :], kernel, 'same'),
              np.convolve(E_usage[m, :], kernel, 'same'))
    for row, curve in enumerate(curves, start=1):
        plt.subplot(3, 1, row)
        plt.plot(np.arange(len(curve)), curve, '-', label=label)

plt.subplot(3, 1, 1)
plt.xlabel('Instance(samples)')
plt.ylabel('Accuracy(exact match)')
plt.title('Performance(acc)')
plt.legend(loc='best')
plt.subplot(3, 1, 2)
plt.xlabel('Instance(samples)')
plt.ylabel('Running time(ms)')
plt.title('Performance(Running time)')
plt.legend(loc='best')
plt.subplot(3, 1, 3)
plt.xlabel('Instance(samples)')
plt.ylabel('Memory usage (%MEM)')
plt.title('Performance(Memory usage)')
plt.legend(loc='best')
plt.show()