├── .gitignore
├── README.md
├── pom.xml
└── src
├── main
├── java
│ └── org
│ │ └── apache
│ │ └── mahout
│ │ └── anomalydetection
│ │ ├── EKGAnomalyDetection.java
│ │ └── TimeSeriesAnomalyDetection.java
└── resources
│ └── a02.dat
└── test
└── java
└── org
└── apache
└── mahout
└── anomalydetection
└── EKGAnomalyDetectionTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | pom.xml.tag
3 | pom.xml.releaseBackup
4 | pom.xml.next
5 | release.properties
6 | *.iml
7 | .idea/
8 | dependency-reduced-pom.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Time series anomaly detection in Mahout
2 |
3 | ## Introduction
4 |
5 | This repository contains a new class for time series anomaly detection in Mahout and a corresponding example based on Ted Dunning's previous work on EKG data.
6 |
7 | You can find the new class under ```src/main/java/org/apache/mahout/anomalydetection/TimeSeriesAnomalyDetection.java```.
8 |
9 | The TimeSeriesAnomalyDetection class embeds the t-digest algorithm in order to spot anomalies and guides the user through the process of anomaly detection.
10 |
11 | The EKGAnomalyDetection class implements a time series anomaly detection scenario by applying the newly introduced TimeSeriesAnomalyDetection class.
12 |
13 | The example is provided under ```src/main/java/org/apache/mahout/anomalydetection/EKGAnomalyDetection.java```.
14 |
15 |
16 | ## How to run the example
17 |
18 | In order to run the example:
19 |
20 | 1. Assure maven is installed in your system ([https://maven.apache.org/](https://maven.apache.org/))
21 | 2. Execute: ```mvn clean install```
22 | 3. Execute the following command:
23 |
24 | ```mvn -q exec:java -Dexec.mainClass=org.apache.mahout.anomalydetection.EKGAnomalyDetection```
25 |
26 | In order to test it run: ```mvn test```
27 |
28 | ## References
29 | For further information:
30 |
31 | ### Anomaly detection
32 |
33 | * [Practical Machine Learning: A New Look At Anomaly Detection by Ted Dunning and Ellen Friedman](http://info.mapr.com/resources_ebook_anewlook_anomalydetection.html?cid=blog)
34 | * [A talk about anomaly detection](http://berlinbuzzwords.de/session/deep-learning-high-performance-time-series-databases)
35 | * [Related to this example on anomaly detection on EKG data](https://github.com/tdunning/anomaly-detection)
36 |
37 | ### t-digest algorithm
38 |
39 | * [The original implementation of and documentation for t-digest](https://github.com/tdunning/t-digest)
40 |
41 |
42 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | org.apache.mahout.anomalydetection
8 | AnomalyDetector
9 | 1.0-SNAPSHOT
10 |
11 | jar
12 | Anomaly Detection Demo
13 |
14 | Anomaly Detection in Mahout with t-digest and example code.
15 |
16 | https://github.com/pollo/anomaly_detection
17 |
18 |
19 |
20 | The Apache Software License, Version 2.0
21 | http://www.apache.org/licenses/LICENSE-2.0.txt
22 | repo
23 |
24 |
25 |
26 |
27 | scm:git:https://github.com/pollo/anomaly_detection.git
28 | scm:git:https://github.com/pollo/anomaly_detection.git
29 |
30 | HEAD
31 | https://github.com/pollo/anomaly_detection
32 |
33 |
34 |
35 |
36 | pollo
37 | Matteo Poletti
38 | pole.matteo@gmail.com
39 |
40 | developer
41 |
42 |
43 |
44 | dbernau
45 | Daniel Bernau
46 | dan.bernau@gmail.com
47 |
48 | developer
49 |
50 |
51 |
52 | nodo
53 | Andrea Nodari
54 | andrea.nodari91@gmail.com
55 |
56 | developer
57 |
58 |
59 |
60 |
61 | UTF-8
62 |
63 |
64 |
65 |
66 | org.apache.mahout
67 | mahout-mr
68 | 0.10.0
69 |
70 |
71 | junit
72 | junit
73 | 4.12
74 |
75 |
76 | com.tdunning
77 | t-digest
78 | 3.1
79 |
80 |
81 |
82 |
83 |
84 |
85 | org.apache.maven.plugins
86 | maven-compiler-plugin
87 | 3.3
88 |
89 | true
90 | 1.7
91 | 1.7
92 | 1.7
93 |
94 |
95 |
96 | org.apache.maven.plugins
97 | maven-source-plugin
98 | 2.4
99 |
100 |
101 | attach-sources
102 |
103 | jar
104 |
105 |
106 |
107 |
108 |
109 | org.apache.maven.plugins
110 | maven-surefire-plugin
111 | 2.18.1
112 |
113 |
114 | org.apache.maven.plugins
115 | maven-javadoc-plugin
116 | 2.10.3
117 |
118 |
119 | attach-javadocs
120 |
121 | jar
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
/src/main/java/org/apache/mahout/anomalydetection/EKGAnomalyDetection.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.mahout.anomalydetection;
19 |
20 | import com.google.common.io.Resources;
21 | import org.apache.mahout.clustering.streaming.cluster.BallKMeans;
22 | import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
23 | import org.apache.mahout.math.DenseMatrix;
24 | import org.apache.mahout.math.DenseVector;
25 | import org.apache.mahout.math.Matrix;
26 | import org.apache.mahout.math.Vector;
27 | import org.apache.mahout.math.WeightedVector;
28 | import org.apache.mahout.math.function.Functions;
29 | import org.apache.mahout.math.neighborhood.BruteSearch;
30 | import org.apache.mahout.math.neighborhood.UpdatableSearcher;
31 | import org.apache.mahout.math.random.WeightedThing;
32 |
33 | import java.io.DataInputStream;
34 | import java.io.File;
35 | import java.io.FileInputStream;
36 | import java.io.FileNotFoundException;
37 | import java.io.IOException;
38 | import java.net.URL;
39 | import java.util.ArrayList;
40 | import java.util.Formatter;
41 | import java.util.List;
42 |
43 | /**
44 | * Example of anomaly detection using AnomalyDetection. The code is taken from Ted Dunning's EKG
45 | * anomaly detection example (https://github.com/tdunning/anomaly-detection) and adapted to the
46 | * AnomalyDetection pattern.
47 | *
48 | * According EKG Data can be found at physionet.org/physiobank/database/#ecg-databases.
49 | *
50 | * Read EKG data, extract windows and apply k-means clustering. Afterwards, build a reconstructed
51 | * signal to identify out the error.
52 | */
53 | public class EKGAnomalyDetection extends TimeSeriesAnomalyDetection {
54 | // the fraction of returned anomalies
55 | public static final double ANOMALY_FRACTION = 10.0 / 100;
56 | // Window Size for EKG Data Example
57 | private final int WINDOW = 32;
58 | // distance between starting points of two adjacent windows
59 | private static final int STEP = 2;
60 | // number of constructed windows used for clustering
61 | private static final int SAMPLES = 200000;
62 | // according to Ted Dunning's description, 100 roughly represents a
63 | // compression ratio for small to mid-size data sets
64 | private static final double COMPRESSION = 100;
65 |
66 | private Vector window;
67 | private double t0;
68 | private double t1;
69 |
70 | UpdatableSearcher clustering;
71 |
72 | /**
73 | * Read EKG trace and extract scaled data points.
74 | *
75 | * @param in Input Data File
76 | * @param scale Scaling Factor for Data Points
77 | * @return Vector of Data Points
78 | * @throws IOException
79 | */
80 | public Matrix read16b(File in, double scale) throws IOException {
81 | DataInputStream input = new DataInputStream(new FileInputStream(in));
82 |
83 | int rows = (int) (in.length() / 2);
84 |
85 | DenseMatrix data = new DenseMatrix(rows, 1);
86 | for (int i = 0; i < rows; i++) {
87 | data.setQuick(i, 0, input.readShort() * scale);
88 | }
89 | return data;
90 | }
91 |
92 | @Override
93 | /**
94 | * Build a model based on extracted EKG data points by using k-means clustering.
95 | *
96 | * @param trace EKG Data Points.
97 | */
98 | public void buildModel(Matrix data) {
99 | Vector trace = data.viewColumn(0);
100 |
101 | // initialize variable for timing output
102 | this.t0 = System.nanoTime() / 1e9;
103 |
104 | // list of windowed data
105 | List r = new ArrayList<>();
106 |
107 | // create windows according to SAMPLES and STEP
108 | for (int i = 0; i < SAMPLES; i++) {
109 | int offset = i * STEP;
110 | WeightedVector row = new WeightedVector(new DenseVector(WINDOW), 1, i);
111 | row.assign(trace.viewPart(offset, WINDOW));
112 | row.assign(this.window, Functions.MULT);
113 | // normalizing the data
114 | row.assign(Functions.mult(1 / row.norm(2)));
115 | r.add(row);
116 | }
117 | // time for windowing data
118 | this.t1 = System.nanoTime() / 1e9;
119 | System.out.printf("Windowed data in %.2f s\n", this.t1 - this.t0);
120 |
121 | // clustering the data by applying k-means
122 | this.t0 = System.nanoTime() / 1e9;
123 | BallKMeans km = new BallKMeans(new BruteSearch(new EuclideanDistanceMeasure()), 400, 10);
124 | this.clustering = km.cluster(r);
125 | this.t1 = System.nanoTime() / 1e9;
126 | System.out.printf("Clustered in %.2f s\n", this.t1 - this.t0);
127 |
128 | // Output clustering results. One line per cluster
129 | // centroid, each with WINDOW values
130 | this.t0 = System.nanoTime() / 1e9;
131 | try (Formatter out = new Formatter("dict.tsv")) {
132 | for (Vector v : this.clustering) {
133 | String separator = "";
134 | for (Vector.Element element : v.all()) {
135 | out.format("%s%.3f", separator, element.get());
136 | separator = "\t";
137 | }
138 | out.format("\n");
139 | }
140 | } catch (FileNotFoundException e) {
141 | e.printStackTrace();
142 | }
143 | }
144 |
145 | @Override
146 | /**
147 | *
148 | * Reconstruct the Signal by matching original windows against the nearest cluster centroid as provided by k-means output.
149 | *
150 | * @param trace Data EKG Data Points.
151 | * @return The reconstructed Signal.
152 | */
153 | public Matrix reconstructSignal(Matrix data) {
154 | Vector trace = data.viewColumn(0);
155 | // Reconstruct the Signal. Each window can be looked at independently
156 | // due to the windowing.
157 | // This works because the window before and after the current one will
158 | // independently approximate the portion of the signal
159 | // left over after subtracting this window.
160 |
161 | Matrix reconstructedSignal = new DenseMatrix(trace.size(), 1);
162 |
163 | try (Formatter out = new Formatter("trace.tsv")) {
164 | // First Column = original, Second Column = reconstructed
165 | Matrix rx = new DenseMatrix(WINDOW / 2, 2);
166 | // Memorize window order
167 | Vector previous = new DenseVector(WINDOW);
168 | Vector current = new DenseVector(WINDOW);
169 |
170 | for (int i = 0; i + WINDOW < trace.size(); i += WINDOW / 2) {
171 | // copy chunk of data to temporary window storage and multiply
172 | // by window
173 | WeightedVector row = new WeightedVector(
174 | new DenseVector(WINDOW), 1, i);
175 | row.assign(trace.viewPart(i, WINDOW));
176 | // applying the window to the original data
177 | row.assign(this.window, Functions.MULT);
178 |
179 | // scale data
180 | double scale = row.norm(2);
181 | row.assign(Functions.mult(1 / scale));
182 |
183 | // find the closest centroid according to scaled data
184 | WeightedThing cluster = this.clustering.search(row, 1)
185 | .get(0);
186 | current.assign(cluster.getValue());
187 | // scale data back to original
188 | current.assign(Functions.mult(scale));
189 |
190 | // Produce results of half a window at a time. The reconstructed
191 | // Signal is the sum of the 2nd half of the previous window and
192 | // the 1st half of the current window
193 | rx.viewColumn(0).assign(trace.viewPart(i, WINDOW / 2));
194 | rx.viewColumn(1).assign(
195 | previous.viewPart(WINDOW / 2, WINDOW / 2));
196 | rx.viewColumn(1).assign(current.viewPart(0, WINDOW / 2),
197 | Functions.PLUS);
198 | previous.assign(current);
199 |
200 | for (int j = 0; j < WINDOW / 2; j++) {
201 | out.format("%.3f\t%.3f\t%d\n", rx.get(j, 0), rx.get(j, 1),
202 | ((WeightedVector) cluster.getValue()).getIndex());
203 | reconstructedSignal.setQuick(i + j, 0, rx.get(j, 1));
204 | }
205 | }
206 | } catch (FileNotFoundException e) {
207 | e.printStackTrace();
208 | }
209 | // Time for Signal reconstruction
210 | this.t1 = System.nanoTime() / 1e9;
211 | System.out.printf("Output in %.2f s\n", this.t1 - this.t0);
212 |
213 | return reconstructedSignal;
214 | }
215 |
216 | @Override
217 | /**
218 | * Returns the error as the difference of the two numbers contained in the two vectors
219 | */
220 | protected double computeError(Vector actualPoint,
221 | Vector reconstructedPoint) {
222 | return actualPoint.getQuick(0) - reconstructedPoint.getQuick(0);
223 | }
224 |
225 | public void run() throws IOException {
226 |
227 | // read the EKG data
228 | URL x = Resources.getResource("a02.dat");
229 | this.t0 = System.nanoTime() / 1e9;
230 | Matrix trace = this.read16b(new File(x.getPath()), 1.0 / 200);
231 | this.t1 = System.nanoTime() / 1e9;
232 | System.out
233 | .printf("Read test data from %s in %.2f s\n", x, this.t1 - t0);
234 |
235 | // set up the window vector
236 | this.window = new DenseVector(WINDOW);
237 | for (int i = 0; i < WINDOW; i++) {
238 | double w = Math.sin(Math.PI * i / (WINDOW - 1.0));
239 | this.window.set(i, w * w);
240 | }
241 |
242 | this.buildModel(trace);
243 |
244 | List anomalies;
245 | try {
246 | anomalies = this.detectAnomalies(trace, ANOMALY_FRACTION,
247 | COMPRESSION);
248 | } catch (IllegalArgumentException e) {
249 | System.err.println("Error occurred while detecting anomalies");
250 | throw e;
251 | }
252 |
253 | // output anomalies
254 | try (Formatter out = new Formatter("anomalies.tsv")) {
255 | for (Anomaly a : anomalies) {
256 | out.format("%.3f\t%.3f\t%d\n", a.getData().getQuick(0), a.getError(),
257 | a.getIndex());
258 | }
259 | }
260 | }
261 |
262 | public static void main(String[] args) throws IOException {
263 | new EKGAnomalyDetection().run();
264 | }
265 | }
266 |
--------------------------------------------------------------------------------
/src/main/java/org/apache/mahout/anomalydetection/TimeSeriesAnomalyDetection.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the License); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an AS IS BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.mahout.anomalydetection;
19 |
20 | import org.apache.mahout.math.DenseVector;
21 | import org.apache.mahout.math.Matrix;
22 | import org.apache.mahout.math.Vector;
23 |
24 | import com.tdunning.math.stats.TDigest;
25 |
26 | import java.util.ArrayList;
27 | import java.util.List;
28 |
29 | /**
30 | * Abstract base class for time series anomaly detection. The process is modeled in three steps: - building a
31 | * model for the data - using the model to build a representation of the data - apply t-digest to
32 | * detect when the representation differs from the data.
33 | *
34 | * The time series is represented as a Matrix, where each row correspond to a point of the time series. Each
35 | * point of the time series may be represented by multiple features and is thus stored as a Vector.
36 | *
37 | * The class should be extended implementing the methods buildModel and reconstructSignal. The
38 | * method detectAnomalies can then be used to retrieve the anomalies found with the application of
39 | * the t-digest algorithm.
40 | *
41 | * The concept is taken from the book Practical Machine Learning:A New Look At Anomaly Detection by
42 | * Ted Dunning and Ellen Friedman
43 | */
44 | public abstract class TimeSeriesAnomalyDetection {
45 | /**
46 | * Build a model for the data. The user dependent implementation should store the model into the
47 | * state of the class. The model will then be used into the reconstructSignal method to build a
48 | * representation of the data.
49 | *
50 | * @param data Data used to build (or fit) the model
51 | */
52 | public abstract void buildModel(org.apache.mahout.math.Matrix data);
53 |
54 | /**
55 | * Builds and returns the closest representation (reconstructed signal) of the data which the
56 | * model constructed in buildModel can provide.
57 | *
58 | * @param data Data for reconstruction
59 | * @return The reconstructed signal
60 | */
61 | public abstract org.apache.mahout.math.Matrix reconstructSignal(
62 | org.apache.mahout.math.Matrix data);
63 |
64 | /**
65 | * Used by detectAnomalies to compute the error between the feature vector of an actual point
66 | * of the time series and the feature vector of a point in the reconstructed time series.
67 | *
68 | * Computes the error vector as difference between the two vectors and return s
69 | *
70 | * @param actualPoint Feature vector of an actual point
71 | * @param reconstructedPoint Feature vector of a reconstructed point
72 | * @return The error between the two points.
73 | */
74 | protected double computeError(org.apache.mahout.math.Vector actualPoint,
75 | org.apache.mahout.math.Vector reconstructedPoint) {
76 | Vector error = actualPoint.minus(reconstructedPoint);
77 | return error.norm(2);
78 | }
79 |
80 | /**
81 | * Detects and returns the anomalies.
82 | *
83 | * First a reconstruction of the data is obtained using the reconstructSignal method. Then for
84 | * each point in the time series the reconstructed signal is compared to the actual data. The error
85 | * between them is computed using the method computeError which by default returns the difference
86 | * vector norm but may be overridden by the user.
87 | *
88 | * Then the t-digest algorithm is used to detect when the reconstructed signal differs too much from the
89 | * actual data (depending on the quantile parameter).
90 | *
91 | * @param data Data used for anomaly detection
92 | * @param anomalyFraction Fraction of data point reported as anomalies
93 | * @param compression Parameter used from the t-digest algorithm to set the data compression
94 | * @return List of Anomaly, each Anomaly identifies an anomalous data point reporting the data
95 | * value, the difference from the reconstructed signal (error) and the position in the sequence.
96 | * @throws IllegalArgumentException Thrown when size of the reconstructed signal differs from the
97 | * size of the data
98 | */
99 | public List detectAnomalies(
100 | Matrix data,
101 | double anomalyFraction,
102 | double compression) throws IllegalArgumentException {
103 | // reconstruct signal starting from data
104 | Matrix reconstructedSignal = this.reconstructSignal(data);
105 |
106 | // check length reconstructed signal = actual data
107 | if (data.numRows() != reconstructedSignal.numRows()) {
108 | throw new IllegalArgumentException("The size of reconstructedSignal differs from the data size");
109 | }
110 |
111 | // run t-digest to compute threshold corresponding to the quantile
112 | TDigest digest = TDigest.createDigest(compression);
113 |
114 | Vector delta = new DenseVector(data.numRows());
115 | // for each point in the time series add computed error to the TDigest
116 | for (int i=0; i anomalies = new ArrayList<>();
126 | for (int i = 0; i < data.numRows(); i++) {
127 | double element = delta.getQuick(i);
128 | if (Math.abs(element) > threshold) {
129 | // insert data, error and index into return Map
130 | anomalies.add(new Anomaly(data.viewRow(i),
131 | element,
132 | i));
133 | }
134 | }
135 |
136 | return anomalies;
137 | }
138 | }
139 |
140 | class Anomaly {
141 | private Vector data;
142 | private double error;
143 | private int index;
144 |
145 | public Anomaly(Vector data,
146 | double error,
147 | int index) {
148 | this.data = data;
149 | this.error = error;
150 | this.index = index;
151 | }
152 |
153 | public Vector getData() {
154 | return data;
155 | }
156 |
157 | public double getError() {
158 | return error;
159 | }
160 |
161 | public int getIndex() {
162 | return index;
163 | }
164 | }
165 |
--------------------------------------------------------------------------------
/src/main/resources/a02.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pollo/anomaly_detection/52a13c0213dc60163a9073018548795e5aa1f024/src/main/resources/a02.dat
--------------------------------------------------------------------------------
/src/test/java/org/apache/mahout/anomalydetection/EKGAnomalyDetectionTest.java:
--------------------------------------------------------------------------------
1 | package org.apache.mahout.anomalydetection;
2 |
3 | import java.io.File;
4 | import java.util.Scanner;
5 | import org.junit.Test;
6 |
7 | import static org.junit.Assert.assertTrue;
8 |
9 | public class EKGAnomalyDetectionTest {
10 |
11 | @Test
12 | public void testNumberFoundAnomalies() throws Exception {
13 | new EKGAnomalyDetection().run();
14 | Scanner traceFile = new Scanner(new File("trace.tsv"));
15 | int points_number = 0;
16 | while (traceFile.hasNextLine()) {
17 | points_number += 1;
18 | traceFile.nextLine();
19 | }
20 | Scanner anomaliesFile = new Scanner(new File("anomalies.tsv"));
21 | int anomalies_number = 0;
22 | while (anomaliesFile.hasNextLine()) {
23 | anomalies_number += 1;
24 | anomaliesFile.nextLine();
25 | }
26 |
27 | double threshold = 0.001;
28 | assertTrue("The number of found anomalies is not the fraction expected",
29 | Math.abs((double)anomalies_number/points_number - EKGAnomalyDetection.ANOMALY_FRACTION) < threshold);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------