├── .gitignore ├── README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── apache │ │ └── mahout │ │ └── anomalydetection │ │ ├── EKGAnomalyDetection.java │ │ └── TimeSeriesAnomalyDetection.java └── resources │ └── a02.dat └── test └── java └── org └── apache └── mahout └── anomalydetection └── EKGAnomalyDetectionTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | pom.xml.tag 3 | pom.xml.releaseBackup 4 | pom.xml.next 5 | release.properties 6 | *.iml 7 | .idea/ 8 | dependency-reduced-pom.xml -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time series anomaly detection in Mahout 2 | 3 | ## Introduction 4 | 5 | This repository contains a new class for time series anomaly detection in Mahout and a corresponding example based on Ted Dunning's previous work on EKG data. 6 | 7 | You can find the new class under ```src/main/java/org/apache/mahout/anomalydetection/TimeSeriesAnomalyDetection.java```. 8 | 9 | The TimeSeriesAnomalyDetection class embeds the t-digest algorithm in order to spot anomalies and guides the user through the process of anomaly detection. 10 | 11 | The EKAnomalyDetection class implements a time series anomaly detection scenario by applying the newly introduced TimeSeriesAnomalyDetection class. 12 | 13 | The example is provided under ```src/main/java/org/apache/mahout/anomalydetection/EKGAnomalyDetection.java```. 14 | 15 | 16 | ## How to run the example 17 | 18 | In order to run the example: 19 | 20 | 1. Assure maven is installed in your system ([https://maven.apache.org/](https://maven.apache.org/)) 21 | 2. Execute: ```mvn clean install``` 22 | 3. 
Execute the following command: 23 | 24 | ```mvn -q exec:java -Dexec.mainClass=org.apache.mahout.anomalydetection.EKGAnomalyDetection``` 25 | 26 | In order to test it run: ```mvn test``` 27 | 28 | ## References 29 | For further information: 30 | 31 | ### Anomaly detection 32 | 33 | * [Practical Machine Learning: A New Look At Anomaly Detection by Ted Dunning and Ellen Friedman](http://info.mapr.com/resources_ebook_anewlook_anomalydetection.html?cid=blog) 34 | * [A talk about anomaly detection](http://berlinbuzzwords.de/session/deep-learning-high-performance-time-series-databases) 35 | * [Related to this example on anomaly detection on EKG data](https://github.com/tdunning/anomaly-detection) 36 | 37 | ### t-digest algorithm 38 | 39 | * [The original implementation of and documentation for t-digest](https://github.com/tdunning/t-digest) 40 | 41 | 42 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.apache.mahout.anomalydetection 8 | AnomalyDetector 9 | 1.0-SNAPSHOT 10 | 11 | jar 12 | Anomaly Detection Demo 13 | 14 | Anomaly Detection in Mahout with t-digest and example code. 
15 | 16 | https://github.com/pollo/anomaly_detection 17 | 18 | 19 | 20 | The Apache Software License, Version 2.0 21 | http://www.apache.org/licenses/LICENSE-2.0.txt 22 | repo 23 | 24 | 25 | 26 | 27 | scm:git:https://github.com/pollo/anomaly_detection.git 28 | scm:git:https://github.com/pollo/anomaly_detection.git 29 | 30 | HEAD 31 | https://github.com/pollo/anomaly_detection 32 | 33 | 34 | 35 | 36 | pollo 37 | Matteo Poletti 38 | pole.matteo@gmail.com 39 | 40 | developer 41 | 42 | 43 | 44 | dbernau 45 | Daniel Bernau 46 | dan.bernau@gmail.com 47 | 48 | developer 49 | 50 | 51 | 52 | nodo 53 | Andrea Nodari 54 | andrea.nodari91@gmail.com 55 | 56 | developer 57 | 58 | 59 | 60 | 61 | UTF-8 62 | 63 | 64 | 65 | 66 | org.apache.mahout 67 | mahout-mr 68 | 0.10.0 69 | 70 | 71 | junit 72 | junit 73 | 4.12 74 | 75 | 76 | com.tdunning 77 | t-digest 78 | 3.1 79 | 80 | 81 | 82 | 83 | 84 | 85 | org.apache.maven.plugins 86 | maven-compiler-plugin 87 | 3.3 88 | 89 | true 90 | 1.7 91 | 1.7 92 | 1.7 93 | 94 | 95 | 96 | org.apache.maven.plugins 97 | maven-source-plugin 98 | 2.4 99 | 100 | 101 | attach-sources 102 | 103 | jar 104 | 105 | 106 | 107 | 108 | 109 | org.apache.maven.plugins 110 | maven-surefire-plugin 111 | 2.18.1 112 | 113 | 114 | org.apache.maven.plugins 115 | maven-javadoc-plugin 116 | 2.10.3 117 | 118 | 119 | attach-javadocs 120 | 121 | jar 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /src/main/java/org/apache/mahout/anomalydetection/EKGAnomalyDetection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.mahout.anomalydetection; 19 | 20 | import com.google.common.io.Resources; 21 | import org.apache.mahout.clustering.streaming.cluster.BallKMeans; 22 | import org.apache.mahout.common.distance.EuclideanDistanceMeasure; 23 | import org.apache.mahout.math.DenseMatrix; 24 | import org.apache.mahout.math.DenseVector; 25 | import org.apache.mahout.math.Matrix; 26 | import org.apache.mahout.math.Vector; 27 | import org.apache.mahout.math.WeightedVector; 28 | import org.apache.mahout.math.function.Functions; 29 | import org.apache.mahout.math.neighborhood.BruteSearch; 30 | import org.apache.mahout.math.neighborhood.UpdatableSearcher; 31 | import org.apache.mahout.math.random.WeightedThing; 32 | 33 | import java.io.DataInputStream; 34 | import java.io.File; 35 | import java.io.FileInputStream; 36 | import java.io.FileNotFoundException; 37 | import java.io.IOException; 38 | import java.net.URL; 39 | import java.util.ArrayList; 40 | import java.util.Formatter; 41 | import java.util.List; 42 | 43 | /** 44 | * Example of anomaly detection using AnomalyDetection. The code is taken from Ted Dunning's EKG 45 | * anomaly detection example (https://github.com/tdunning/anomaly-detection) and adapted to the 46 | * AnomalyDetection pattern. 47 | *

48 | * Corresponding EKG data can be found at physionet.org/physiobank/database/#ecg-databases. 49 | *

50 | * Read EKG data, extract windows and apply k-means clustering. Afterwards, build a reconstructed 51 | * signal to identify out the error. 52 | */ 53 | public class EKGAnomalyDetection extends TimeSeriesAnomalyDetection { 54 | // the fraction of returned anomalies 55 | public static final double ANOMALY_FRACTION = 10.0 / 100; 56 | // Window Size for EKG Data Example 57 | private final int WINDOW = 32; 58 | // distance between starting points of two adjacent windows 59 | private static final int STEP = 2; 60 | // number of constructed windows used for clustering 61 | private static final int SAMPLES = 200000; 62 | // according to Ted Dunning's description, 100 roughly represents a 63 | // compression ratio for small to mid-size data sets 64 | private static final double COMPRESSION = 100; 65 | 66 | private Vector window; 67 | private double t0; 68 | private double t1; 69 | 70 | UpdatableSearcher clustering; 71 | 72 | /** 73 | * Read EKG trace and extract scaled data points. 74 | * 75 | * @param in Input Data File 76 | * @param scale Scaling Factor for Data Points 77 | * @return Vector of Data Points 78 | * @throws IOException 79 | */ 80 | public Matrix read16b(File in, double scale) throws IOException { 81 | DataInputStream input = new DataInputStream(new FileInputStream(in)); 82 | 83 | int rows = (int) (in.length() / 2); 84 | 85 | DenseMatrix data = new DenseMatrix(rows, 1); 86 | for (int i = 0; i < rows; i++) { 87 | data.setQuick(i, 0, input.readShort() * scale); 88 | } 89 | return data; 90 | } 91 | 92 | @Override 93 | /** 94 | * Build a model based on extracted EKG data points by using k-means clustering. 95 | * 96 | * @param trace EKG Data Points. 
97 | */ 98 | public void buildModel(Matrix data) { 99 | Vector trace = data.viewColumn(0); 100 | 101 | // initialize variable for timing output 102 | this.t0 = System.nanoTime() / 1e9; 103 | 104 | // list of windowed data 105 | List r = new ArrayList<>(); 106 | 107 | // create windows according to SAMPLES and STEP 108 | for (int i = 0; i < SAMPLES; i++) { 109 | int offset = i * STEP; 110 | WeightedVector row = new WeightedVector(new DenseVector(WINDOW), 1, i); 111 | row.assign(trace.viewPart(offset, WINDOW)); 112 | row.assign(this.window, Functions.MULT); 113 | // normalizing the data 114 | row.assign(Functions.mult(1 / row.norm(2))); 115 | r.add(row); 116 | } 117 | // time for windowing data 118 | this.t1 = System.nanoTime() / 1e9; 119 | System.out.printf("Windowed data in %.2f s\n", this.t1 - this.t0); 120 | 121 | // clustering the data by applying k-means 122 | this.t0 = System.nanoTime() / 1e9; 123 | BallKMeans km = new BallKMeans(new BruteSearch(new EuclideanDistanceMeasure()), 400, 10); 124 | this.clustering = km.cluster(r); 125 | this.t1 = System.nanoTime() / 1e9; 126 | System.out.printf("Clustered in %.2f s\n", this.t1 - this.t0); 127 | 128 | // Output clustering results. One line per cluster 129 | // centroid, each with WINDOW values 130 | this.t0 = System.nanoTime() / 1e9; 131 | try (Formatter out = new Formatter("dict.tsv")) { 132 | for (Vector v : this.clustering) { 133 | String separator = ""; 134 | for (Vector.Element element : v.all()) { 135 | out.format("%s%.3f", separator, element.get()); 136 | separator = "\t"; 137 | } 138 | out.format("\n"); 139 | } 140 | } catch (FileNotFoundException e) { 141 | e.printStackTrace(); 142 | } 143 | } 144 | 145 | @Override 146 | /** 147 | * 148 | * Reconstruct the Signal by matching original windows against the nearest cluster centroid as provided by k-means output. 149 | * 150 | * @param trace Data EKG Data Points. 151 | * @return The reconstructed Signal. 
152 | */ 153 | public Matrix reconstructSignal(Matrix data) { 154 | Vector trace = data.viewColumn(0); 155 | // Reconstruct the Signal. Each window can be looked at independently 156 | // due to the windowing. 157 | // This works because the window before and after the current one will 158 | // independently approximate the portion of the signal 159 | // left over after subtracting this window. 160 | 161 | Matrix reconstructedSignal = new DenseMatrix(trace.size(), 1); 162 | 163 | try (Formatter out = new Formatter("trace.tsv")) { 164 | // First Column = original, Second Column = reconstructed 165 | Matrix rx = new DenseMatrix(WINDOW / 2, 2); 166 | // Memorize window order 167 | Vector previous = new DenseVector(WINDOW); 168 | Vector current = new DenseVector(WINDOW); 169 | 170 | for (int i = 0; i + WINDOW < trace.size(); i += WINDOW / 2) { 171 | // copy chunk of data to temporary window storage and multiply 172 | // by window 173 | WeightedVector row = new WeightedVector( 174 | new DenseVector(WINDOW), 1, i); 175 | row.assign(trace.viewPart(i, WINDOW)); 176 | // applying the window to the original data 177 | row.assign(this.window, Functions.MULT); 178 | 179 | // scale data 180 | double scale = row.norm(2); 181 | row.assign(Functions.mult(1 / scale)); 182 | 183 | // find the closest centroid according to scaled data 184 | WeightedThing cluster = this.clustering.search(row, 1) 185 | .get(0); 186 | current.assign(cluster.getValue()); 187 | // scale data back to original 188 | current.assign(Functions.mult(scale)); 189 | 190 | // Produce results of half a window at a time. 
The reconstructed 191 | // Signal is the sum of the 2nd half of the previous window and 192 | // the 1st half of the current window 193 | rx.viewColumn(0).assign(trace.viewPart(i, WINDOW / 2)); 194 | rx.viewColumn(1).assign( 195 | previous.viewPart(WINDOW / 2, WINDOW / 2)); 196 | rx.viewColumn(1).assign(current.viewPart(0, WINDOW / 2), 197 | Functions.PLUS); 198 | previous.assign(current); 199 | 200 | for (int j = 0; j < WINDOW / 2; j++) { 201 | out.format("%.3f\t%.3f\t%d\n", rx.get(j, 0), rx.get(j, 1), 202 | ((WeightedVector) cluster.getValue()).getIndex()); 203 | reconstructedSignal.setQuick(i + j, 0, rx.get(j, 1)); 204 | } 205 | } 206 | } catch (FileNotFoundException e) { 207 | e.printStackTrace(); 208 | } 209 | // Time for Signal reconstruction 210 | this.t1 = System.nanoTime() / 1e9; 211 | System.out.printf("Output in %.2f s\n", this.t1 - this.t0); 212 | 213 | return reconstructedSignal; 214 | } 215 | 216 | @Override 217 | /** 218 | * Returns the error as the difference of the two numbers contained in the two vectors 219 | */ 220 | protected double computeError(Vector actualPoint, 221 | Vector reconstructedPoint) { 222 | return actualPoint.getQuick(0) - reconstructedPoint.getQuick(0); 223 | } 224 | 225 | public void run() throws IOException { 226 | 227 | // read the EKG data 228 | URL x = Resources.getResource("a02.dat"); 229 | this.t0 = System.nanoTime() / 1e9; 230 | Matrix trace = this.read16b(new File(x.getPath()), 1.0 / 200); 231 | this.t1 = System.nanoTime() / 1e9; 232 | System.out 233 | .printf("Read test data from %s in %.2f s\n", x, this.t1 - t0); 234 | 235 | // set up the window vector 236 | this.window = new DenseVector(WINDOW); 237 | for (int i = 0; i < WINDOW; i++) { 238 | double w = Math.sin(Math.PI * i / (WINDOW - 1.0)); 239 | this.window.set(i, w * w); 240 | } 241 | 242 | this.buildModel(trace); 243 | 244 | List anomalies; 245 | try { 246 | anomalies = this.detectAnomalies(trace, ANOMALY_FRACTION, 247 | COMPRESSION); 248 | } catch 
(IllegalArgumentException e) { 249 | System.err.println("Error occurred while detecting anomalies"); 250 | throw e; 251 | } 252 | 253 | // output anomalies 254 | try (Formatter out = new Formatter("anomalies.tsv")) { 255 | for (Anomaly a : anomalies) { 256 | out.format("%.3f\t%.3f\t%d\n", a.getData().getQuick(0), a.getError(), 257 | a.getIndex()); 258 | } 259 | } 260 | } 261 | 262 | public static void main(String[] args) throws IOException { 263 | new EKGAnomalyDetection().run(); 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /src/main/java/org/apache/mahout/anomalydetection/TimeSeriesAnomalyDetection.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the License); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an AS IS BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.mahout.anomalydetection; 19 | 20 | import org.apache.mahout.math.DenseVector; 21 | import org.apache.mahout.math.Matrix; 22 | import org.apache.mahout.math.Vector; 23 | 24 | import com.tdunning.math.stats.TDigest; 25 | 26 | import java.util.ArrayList; 27 | import java.util.List; 28 | 29 | /** 30 | * Abstract base class for time series anomaly detection. 
The process is modeled in three steps: - building a 31 | * model for the data - using the model to build a representation of the data - applying t-digest to 32 | * detect when the representation differs from the data. 33 | *

34 | * The time series is represented as a Matrix, where each row corresponds to a point of the time series. Each 35 | * point of the time series may be represented by multiple features and is thus stored as a Vector. 36 | *

37 | * The class should be extended by implementing the methods buildModel and reconstructSignal. The 38 | * method detectAnomalies can then be used to retrieve the anomalies found with the application of 39 | * the t-digest algorithm. 40 | *

41 | * The concept is taken from the book Practical Machine Learning: A New Look At Anomaly Detection by 42 | * Ted Dunning and Ellen Friedman 43 | */ 44 | public abstract class TimeSeriesAnomalyDetection { 45 | /** 46 | * Build a model for the data. The user-dependent implementation should store the model into the 47 | * state of the class. The model will then be used in the reconstructSignal method to build a 48 | * representation of the data. 49 | * 50 | * @param data Data used to build (or fit) the model 51 | */ 52 | public abstract void buildModel(org.apache.mahout.math.Matrix data); 53 | 54 | /** 55 | * Builds and returns the closest representation (reconstructed signal) of the data which the 56 | * model constructed in buildModel can provide. 57 | * 58 | * @param data Data for reconstruction 59 | * @return The reconstructed signal 60 | */ 61 | public abstract org.apache.mahout.math.Matrix reconstructSignal( 62 | org.apache.mahout.math.Matrix data); 63 | 64 | /** 65 | * Used by detectAnomalies to compute the error between the feature vector of an actual point 66 | * of the time series and the feature vector of a point in the reconstructed time series. 67 | *

68 | * Computes the error vector as the difference between the two vectors and returns its 2-norm 69 | * 70 | * @param actualPoint Feature vector of an actual point 71 | * @param reconstructedPoint Feature vector of a reconstructed point 72 | * @return The error between the two points. 73 | */ 74 | protected double computeError(org.apache.mahout.math.Vector actualPoint, 75 | org.apache.mahout.math.Vector reconstructedPoint) { 76 | Vector error = actualPoint.minus(reconstructedPoint); 77 | return error.norm(2); 78 | } 79 | 80 | /** 81 | * Detects and returns the anomalies. 82 | *

83 | * First a reconstruction of the data is obtained using the reconstructSignal method. Then for 84 | * each point in the time series the reconstructed signal is compared to the actual data. The error 85 | * between them is computed using the method computeError which by default returns the difference 86 | * vector norm but may be overridden by the user. 87 | *

88 | * Then the t-digest algorithm is used to detect when the reconstructed signal differs too much from the 89 | * actual data (depending on the quantile parameter). 90 | * 91 | * @param data Data used for anomaly detection 92 | * @param anomalyFraction Fraction of data point reported as anomalies 93 | * @param compression Parameter used from the t-digest algorithm to set the data compression 94 | * @return List of Anomaly, each Anomaly identifies an anomalous data point reporting the data 95 | * value, the difference from the reconstructed signal (error) and the position in the sequence. 96 | * @throws IllegalArgumentException Thrown when size of the reconstructed signal differs from the 97 | * size of the data 98 | */ 99 | public List detectAnomalies( 100 | Matrix data, 101 | double anomalyFraction, 102 | double compression) throws IllegalArgumentException { 103 | // reconstruct signal starting from data 104 | Matrix reconstructedSignal = this.reconstructSignal(data); 105 | 106 | // check length reconstructed signal = actual data 107 | if (data.numRows() != reconstructedSignal.numRows()) { 108 | throw new IllegalArgumentException("The size of reconstructedSignal differs from the data size"); 109 | } 110 | 111 | // run t-digest to compute threshold corresponding to the quantile 112 | TDigest digest = TDigest.createDigest(compression); 113 | 114 | Vector delta = new DenseVector(data.numRows()); 115 | // for each point in the time series add computed error to the TDigest 116 | for (int i=0; i anomalies = new ArrayList<>(); 126 | for (int i = 0; i < data.numRows(); i++) { 127 | double element = delta.getQuick(i); 128 | if (Math.abs(element) > threshold) { 129 | // insert data, error and index into return Map 130 | anomalies.add(new Anomaly(data.viewRow(i), 131 | element, 132 | i)); 133 | } 134 | } 135 | 136 | return anomalies; 137 | } 138 | } 139 | 140 | class Anomaly { 141 | private Vector data; 142 | private double error; 143 | private int index; 144 | 145 | 
public Anomaly(Vector data, 146 | double error, 147 | int index) { 148 | this.data = data; 149 | this.error = error; 150 | this.index = index; 151 | } 152 | 153 | public Vector getData() { 154 | return data; 155 | } 156 | 157 | public double getError() { 158 | return error; 159 | } 160 | 161 | public int getIndex() { 162 | return index; 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/main/resources/a02.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pollo/anomaly_detection/52a13c0213dc60163a9073018548795e5aa1f024/src/main/resources/a02.dat -------------------------------------------------------------------------------- /src/test/java/org/apache/mahout/anomalydetection/EKGAnomalyDetectionTest.java: -------------------------------------------------------------------------------- 1 | package org.apache.mahout.anomalydetection; 2 | 3 | import java.io.File; 4 | import java.util.Scanner; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertTrue; 8 | 9 | public class EKGAnomalyDetectionTest { 10 | 11 | @Test 12 | public void testNumberFoundAnomalies() throws Exception { 13 | new EKGAnomalyDetection().run(); 14 | Scanner traceFile = new Scanner(new File("trace.tsv")); 15 | int points_number = 0; 16 | while (traceFile.hasNextLine()) { 17 | points_number += 1; 18 | traceFile.nextLine(); 19 | } 20 | Scanner anomaliesFile = new Scanner(new File("anomalies.tsv")); 21 | int anomalies_number = 0; 22 | while (anomaliesFile.hasNextLine()) { 23 | anomalies_number += 1; 24 | anomaliesFile.nextLine(); 25 | } 26 | 27 | double threshold = 0.001; 28 | assertTrue("The number of found anomalies is not the fraction expected", 29 | Math.abs((double)anomalies_number/points_number - EKGAnomalyDetection.ANOMALY_FRACTION) < threshold); 30 | } 31 | } 32 | --------------------------------------------------------------------------------