├── .gitignore ├── LICENSE.md ├── README.md ├── saxpy.py └── tests └── test_saxpy.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Nathan Hoffman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #saxpy.py 2 | 3 | An implementation of Symbolic Aggregate approXimation in python. 
4 | 5 | Based on the paper *A Symbolic Representation of Time Series, with Implications for Streaming Algorithms*. 6 | 7 | ************ 8 | *General use:* 9 | -------------- 10 | ``` 11 | from saxpy import SAX 12 | 13 | s = SAX(wordSize, alphabetSize, epsilon) 14 | ``` 15 | You can optionally specify word size, alphabet size and epsilon. 16 | 17 | If you want to compare x1 and x2 (lists of values): 18 | 19 | ``` 20 | (x1String, x1Indices) = s.to_letter_rep(x1) 21 | (x2String, x2Indices) = s.to_letter_rep(x2) 22 | 23 | x1x2ComparisonScore = s.compare_strings(x1String, x2String) 24 | ``` 25 | 26 | If you want to use the sliding window functionality: 27 | 28 | (say you want to break x3 into a lot of subsequences) 29 | 30 | You can optionally specify the number of subsequences and how much each subsequence 31 | overlaps with the previous subsequence: 32 | ``` 33 | (x3Strings, x3Indices) = s.sliding_window(x3, numSubsequences, overlappingFraction) 34 | ``` 35 | 36 | Then if you wanted to compare each subsequence to another string (say x2): 37 | 38 | ``` 39 | x3x2ComparisonScores = s.batch_compare(x3Strings,x2String) 40 | ``` 41 | 42 | ***** 43 | *Missing data:* 44 | 45 | Missing data is supported. Input values of `numpy.nan` will not affect normalization and will be converted to the `-` character during alphabetization. When comparing two strings, any position where either string contains the `-` character contributes 0 to the distance. For example, comparing `aa-a` to `a-aa` will result in a distance of 0. As another example, comparing `abc` to `d-f` will be the same as comparing `ac` to `df`. 
#!/usr/bin/env python
"""
An implementation of Symbolic Aggregate approXimation (SAX): a series of
numeric data is normalized, reduced with Piecewise Aggregate Approximation
and mapped to a string of letters that can be compared with other such
strings through a lookup table.
"""

import os
import numpy as np
import math


class DictionarySizeIsNotSupported(Exception):
    """Raised by SAX.__init__ when alphabetSize is below 3 or above 20."""


class StringsAreDifferentLength(Exception):
    """Raised by SAX.compare_strings when the two strings differ in length."""


class OverlapSpecifiedIsNotSmallerThanWindowSize(Exception):
    """Raised by SAX.sliding_window when the window would advance by < 1."""


class SAX(object):
    """
    This class is for computing common things with the Symbolic
    Aggregate approXimation method.  In short, this translates
    a series of data to a string, which can then be compared with other
    such strings using a lookup table.

    Missing values (numpy.nan) are ignored during normalization, are
    written as the '-' character during alphabetization and contribute
    nothing to the distance computed by compare_strings.
    """

    def __init__(self, wordSize = 8, alphabetSize = 7, epsilon = 1e-6):
        """
        wordSize     -- number of letters produced for each series
        alphabetSize -- number of distinct letters, between 3 and 20 inclusive
        epsilon      -- standard deviation below which a series is treated
                        as constant by normalize()

        Raises DictionarySizeIsNotSupported for an unsupported alphabetSize.
        """
        if alphabetSize < 3 or alphabetSize > 20:
            raise DictionarySizeIsNotSupported()
        self.aOffset = ord('a')
        self.wordSize = wordSize
        self.alphabetSize = alphabetSize
        self.eps = epsilon
        # Breakpoints keyed by alphabet size (as a string); an alphabet of
        # k letters has k - 1 breakpoints.
        self.breakpoints = {'3' : [-0.43, 0.43],
                            '4' : [-0.67, 0, 0.67],
                            '5' : [-0.84, -0.25, 0.25, 0.84],
                            '6' : [-0.97, -0.43, 0, 0.43, 0.97],
                            '7' : [-1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
                            '8' : [-1.15, -0.67, -0.32, 0, 0.32, 0.67, 1.15],
                            '9' : [-1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
                            '10': [-1.28, -0.84, -0.52, -0.25, 0, 0.25, 0.52, 0.84, 1.28],
                            '11': [-1.34, -0.91, -0.6, -0.35, -0.11, 0.11, 0.35, 0.6, 0.91, 1.34],
                            '12': [-1.38, -0.97, -0.67, -0.43, -0.21, 0, 0.21, 0.43, 0.67, 0.97, 1.38],
                            '13': [-1.43, -1.02, -0.74, -0.5, -0.29, -0.1, 0.1, 0.29, 0.5, 0.74, 1.02, 1.43],
                            '14': [-1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
                            '15': [-1.5, -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.5],
                            '16': [-1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
                            '17': [-1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
                            '18': [-1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
                            '19': [-1.62, -1.25, -1, -0.8, -0.63, -0.48, -0.34, -0.2, -0.07, 0.07, 0.2, 0.34, 0.48, 0.63, 0.8, 1, 1.25, 1.62],
                            '20': [-1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
                            }
        self.beta = self.breakpoints[str(self.alphabetSize)]
        self.build_letter_compare_dict()
        self.scalingFactor = 1


    def to_letter_rep(self, x):
        """
        Function takes a series of data, x, and transforms it to a string
        representation.  Returns (string, indices) where indices are the
        (start, end) ranges of x behind each letter.

        Side effect: self.scalingFactor is set to sqrt(len(x) / wordSize),
        which compare_strings uses afterwards.
        """
        (paaX, indices) = self.to_PAA(self.normalize(x))
        self.scalingFactor = np.sqrt((len(x) * 1.0) / (self.wordSize * 1.0))
        return (self.alphabetize(paaX), indices)

    def normalize(self, x):
        """
        Function will normalize an array (give it a mean of 0, and a
        standard deviation of 1) unless its standard deviation is below
        epsilon, in which case it returns an array of zeros the length
        of the original array.  NaN entries are ignored when computing the
        mean and standard deviation and stay NaN in the result.
        """
        X = np.asanyarray(x)
        std = np.nanstd(X)
        if std < self.eps:
            # Constant series: zeros everywhere, keeping NaN placeholders.
            # Returned as an ndarray so both branches yield the same type.
            return np.where(np.isnan(X), np.nan, 0.0)
        return (X - np.nanmean(X)) / std

    def to_PAA(self, x):
        """
        Function performs Piecewise Aggregate Approximation on data set,
        reducing the dimension of the dataset x to w discrete levels.
        Returns the reduced dimension data set, as well as the indices
        corresponding to the original data for each reduced dimension.

        A frame containing NaN averages to NaN.  Empty input yields an
        empty approximation and an empty index list.
        """
        n = len(x)
        if n == 0:
            # With no data the frame width is 0 and the frame start would
            # never advance, so return early instead of looping forever.
            return (np.array([]), [])
        stepFloat = n / float(self.wordSize)
        step = int(math.ceil(stepFloat))
        approximation = []
        indices = []
        i = 0
        frameStart = 0
        while frameStart <= n - step:
            frameEnd = frameStart + step
            approximation.append(np.mean(np.array(x[frameStart:frameEnd])))
            indices.append((frameStart, frameEnd))
            i += 1
            # Frames start at multiples of the fractional step, so
            # neighbouring frames can overlap when n is not a multiple
            # of wordSize.
            frameStart = int(i * stepFloat)
        return (np.array(approximation), indices)

    def alphabetize(self, paaX):
        """
        Converts the Piecewise Aggregate Approximation of x to a series of
        letters.  Each value maps to the letter of the first breakpoint it
        is strictly below (the last letter if there is none); NaN maps
        to '-'.
        """
        values = np.asarray(paaX, dtype=float)
        # side='right' gives the index of the first breakpoint strictly
        # greater than the value.
        bins = np.searchsorted(self.beta, values, side='right')
        return ''.join(
            '-' if np.isnan(value) else chr(self.aOffset + int(letterIndex))
            for (value, letterIndex) in zip(values, bins))

    def compare_strings(self, sA, sB):
        """
        Compares two strings based on individual letter distance.
        Requires that both strings are the same length, otherwise
        StringsAreDifferentLength is raised.  Positions where either
        string holds '-' (missing data) add nothing to the distance.
        The result is multiplied by self.scalingFactor.
        """
        if len(sA) != len(sB):
            raise StringsAreDifferentLength()
        mindist = 0.0
        for (letterA, letterB) in zip(sA, sB):
            # Compare by value; identity ('is') on strings is unreliable.
            if letterA != '-' and letterB != '-':
                mindist += self.compare_letters(letterA, letterB)**2
        return self.scalingFactor * np.sqrt(mindist)

    def compare_letters(self, la, lb):
        """
        Compare two letters based on letter distance, return the distance
        between them.  Raises KeyError for letters outside the alphabet.
        """
        return self.compareDict[la+lb]

    def build_letter_compare_dict(self):
        """
        Builds up the lookup table to determine numeric distance between
        two letters given an alphabet size.  Entries for both 'ab' and 'ba'
        will be created and will have identical values.  Identical and
        adjacent letters are at distance 0.
        """
        letters = [chr(self.aOffset + k) for k in range(self.alphabetSize)]
        self.compareDict = {}
        for (i, first) in enumerate(letters):
            for (j, second) in enumerate(letters):
                if abs(i - j) <= 1:
                    distance = 0
                else:
                    # Gap between the breakpoint below the higher letter
                    # and the breakpoint above the lower letter.
                    distance = self.beta[max(i, j) - 1] - self.beta[min(i, j)]
                self.compareDict[first + second] = distance

    def sliding_window(self, x, numSubsequences = None, overlappingFraction = None):
        """
        Breaks x into overlapping windows and converts each one to its
        string representation.

        numSubsequences     -- len(x) is divided by this to get the window
                               size (default 20)
        overlappingFraction -- fraction of a window shared with the previous
                               one (default 0.9); 0 means no overlap

        Returns (strings, windowIndices) with one (start, end) pair per
        window.  Sets self.windowSize and, through to_letter_rep,
        self.scalingFactor.  Raises
        OverlapSpecifiedIsNotSmallerThanWindowSize if the window would
        advance by less than one element.
        """
        if not numSubsequences:
            numSubsequences = 20
        self.windowSize = int(len(x)/numSubsequences)
        # Only a missing value selects the default; an explicit 0 is a
        # legitimate request for non-overlapping windows.
        if overlappingFraction is None:
            overlappingFraction = 0.9
        overlap = self.windowSize*overlappingFraction
        moveSize = int(self.windowSize - overlap)
        if moveSize < 1:
            raise OverlapSpecifiedIsNotSmallerThanWindowSize()
        n = len(x)
        windowIndices = []
        stringRep = []
        ptr = 0
        while ptr < n-self.windowSize+1:
            windowEnd = ptr + self.windowSize
            (thisStringRep, indices) = self.to_letter_rep(x[ptr:windowEnd])
            stringRep.append(thisStringRep)
            windowIndices.append((ptr, windowEnd))
            ptr += moveSize
        return (stringRep, windowIndices)

    def batch_compare(self, xStrings, refString):
        """
        Returns the compare_strings distance between refString and every
        string in xStrings, in order.
        """
        return [self.compare_strings(x, refString) for x in xStrings]

    def set_scaling_factor(self, scalingFactor):
        """
        Overrides the scaling factor used by compare_strings.  Needed when
        the strings being compared were not generated through this object,
        since the factor (square root of the input length over the word
        size) is otherwise taken from the last to_letter_rep call.
        """
        self.scalingFactor = scalingFactor

    def set_window_size(self, windowSize):
        """Sets self.windowSize."""
        self.windowSize = windowSize
class TestSAX(object):
    """Tests for SAX using 6 letter words and a 5 letter alphabet."""

    def setUp(self):
        # Every test gets a fresh SAX: word size 6, alphabet size 5,
        # epsilon 1e-6.
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        """A short series maps one value to one letter."""
        word, _ = self.sax.to_letter_rep([7, 1, 4, 4, 4, 4])
        assert word == 'eacccc'

    def test_to_letter_rep_missing(self):
        """A NaN in a short series becomes '-' and leaves the rest alone."""
        word, _ = self.sax.to_letter_rep([7, 1, 4, 4, np.nan, 4])
        assert word == 'eacc-c'

    def test_long_to_letter_rep(self):
        """A series longer than the word size is reduced to six letters."""
        series = [1] * 17 + [6] * 4 + [10, 100]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbbbce'

    def test_long_to_letter_rep_missing(self):
        """A NaN inside a frame turns that frame's letter into '-'."""
        series = [1] * 14 + [np.nan] + [1] * 2 + [6] * 4 + [10, 100]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbb-ce'

    def test_compare_strings(self):
        """A near-identical string scores lower than a distant one."""
        reference = 'aaabbc'
        near_score = self.sax.compare_strings(reference, 'aabbbc')
        far_score = self.sax.compare_strings(reference, 'ccddbc')
        assert near_score < far_score

    def test_compare_strings_missing(self):
        """Positions holding '-' add nothing to the distance."""
        score = self.sax.compare_strings('a-b-c-', 'b-c-d-')
        assert score == 0

    def test_normalize_missing(self):
        """A trailing NaN stays NaN and does not change the other values."""
        complete = [1, 0, 0, 0, 0, 1]
        with_gap = self.sax.normalize(complete + [np.nan])
        without_gap = self.sax.normalize(complete)
        assert np.array_equal(with_gap[:-1], without_gap)
        assert np.isnan(with_gap[-1])

    def test_normalize_under_epsilon(self):
        """A series with spread below epsilon normalizes to zeros."""
        flattened = self.sax.normalize([1e-7, 2e-7, 1.5e-7])
        assert np.array_equal(flattened, [0, 0, 0])