# ---- MK_Motif_Discovery.py ----
# #!/usr/bin/env python2.7
import random

import numpy as np


class Motif_Discovery(object):
    """Find the closest pair of equal-length subsequences (the "motif") in a series.

    ``MK_Motif`` follows the MK reference-point scheme: distances from a few
    randomly chosen reference windows to every window give cheap lower bounds
    that prune most candidate pairs before their true distance is computed.
    """

    def __init__(self):
        super(Motif_Discovery, self).__init__()

    def distance(self, x, y):
        """Return the Euclidean distance between ``x`` and ``y``.

        Only the first ``len(x)`` values of ``y`` are used, so ``y`` may be
        longer than ``x``.

        Raises:
            IndexError: if ``y`` is shorter than ``x``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        if len(y) < len(x):
            raise IndexError("y must be at least as long as x")
        return float(np.sqrt(np.sum((x - y[:len(x)]) ** 2)))

    def MK_Motif(self, D, R, tempLen=30, min_separation=1):
        """Return the start indices of the two most similar subsequences of ``D``.

        Args:
            D: the time series (any 1-D sequence of numbers).
            R: number of random reference windows; capped at 1000.
            tempLen: length of the subsequences (template) to compare.
            min_separation: minimum difference between the two start indices
                for a pair to count.  The default of 1 only excludes pairing a
                window with itself; pass ``tempLen`` to exclude every
                overlapping ("trivial") match.

        Returns:
            ``[L1, L2]``, the start indices of the closest pair found.  The
            order of the two indices is not specified.

        Raises:
            ValueError: if ``tempLen`` or ``R`` is not positive, if ``D`` holds
                fewer than two windows of length ``tempLen``, or if no pair
                satisfies ``min_separation``.

        Reference windows are drawn with ``random.randint``; seed the
        ``random`` module for a reproducible choice.  The pruning relies on the
        triangle inequality, so it is only valid while ``distance`` is a metric.
        """
        D = np.asarray(D, dtype=float)
        R = min(R, 1000)
        if tempLen < 1:
            raise ValueError("tempLen must be a positive integer")
        if R < 1:
            raise ValueError("R must be at least 1")
        # A series of length n holds n - tempLen + 1 windows; the last one
        # starts at n - tempLen and must be included.
        n_windows = len(D) - tempLen + 1
        if n_windows < 2:
            raise ValueError(
                "D must contain at least two subsequences of length tempLen")
        min_separation = max(1, int(min_separation))

        best_so_far = float("inf")
        L1 = L2 = None

        # Phase 1: distances from each random reference window to every window.
        Dist = np.empty((R, n_windows))
        for i in range(R):
            r = random.randint(0, n_windows - 1)
            ref = D[r:r + tempLen]
            for j in range(n_windows):
                d = self.distance(ref, D[j:j + tempLen])
                Dist[i, j] = d
                if d < best_so_far and abs(r - j) >= min_separation:
                    best_so_far = d
                    L1, L2 = r, j

        # References ordered by decreasing spread of their distances: the most
        # discriminating reference is tried first when pruning.
        ref_order = (-Dist.std(axis=1)).argsort()
        # Windows ordered by distance to the best reference, so windows that
        # are close to each other end up near each other in this ordering.
        order = Dist[ref_order[0]].argsort()

        # Phase 2: compare windows that are `offset` apart in the ordering.
        # Once every pair at some offset is pruned by the first reference, all
        # larger offsets are pruned as well and the search can stop.
        offset = 0
        abandon = False
        while not abandon:
            offset += 1
            abandon = True
            for j in range(n_windows - offset):
                a = order[j]
                b = order[j + offset]
                reject = False
                for i in range(R):
                    z = ref_order[i]
                    lower_bound = abs(Dist[z, a] - Dist[z, b])
                    if lower_bound > best_so_far:
                        reject = True
                        break
                    elif i == 0:
                        # The first reference could not rule this pair out,
                        # so the next offset may still hold a better pair.
                        abandon = False
                if reject or abs(a - b) < min_separation:
                    continue
                check_d = self.distance(D[a:a + tempLen], D[b:b + tempLen])
                if check_d < best_so_far:
                    best_so_far = check_d
                    L1, L2 = a, b

        if L1 is None:
            raise ValueError("no pair of subsequences satisfies min_separation")
        return [int(L1), int(L2)]


# ---- Motif_Matching.py ----
# #!/usr/bin/env python2.7
import csv
import time


def zNormalize(my_time_series):
    """Return ``my_time_series`` shifted to zero mean and scaled to unit std.

    The population standard deviation (``np.std`` default) is used.  A
    constant series has zero spread and cannot be scaled; it is returned as
    all zeros instead of NaN.
    """
    data = np.asarray(my_time_series, dtype=float)
    spread = np.std(data)
    if spread == 0:
        return np.zeros_like(data)
    return (data - np.mean(data)) / spread
def find_matches(filePathT, Q, sample_rate=1, normalize=True):
    """Find where the query ``Q`` best matches a time series stored in a CSV file.

    Adapted from Mueen and Keogh 2012.  The file is streamed once; the current
    window is kept in a ring buffer together with running sums for its mean
    and standard deviation.

    Args:
        filePathT: path to a CSV file whose first column holds one number per
            row.  Blank rows are skipped and do not count towards indices.
        Q: the query (motif) to match, a sequence of numbers.
        sample_rate: use every n-th value of the series.  The query must have
            been collected at the same sampling rate.
        normalize: if true, compare the z-normalized query with each
            z-normalized window (squared Euclidean distance); otherwise
            compare raw values (sum of absolute differences).

    Returns:
        The 0-based index, in the unsampled series, of the first value of the
        best-matching window.  For an empty file, prints ``empty list`` and
        returns 0.

    Raises:
        ValueError: if ``Q`` is empty, ``sample_rate`` is not positive, or no
            window could be compared (series shorter than the query, or every
            window had zero variance while ``normalize`` is true).
    """
    m = len(Q)
    if m == 0:
        raise ValueError("Q must contain at least one value")
    if sample_rate < 1:
        raise ValueError("sample_rate must be a positive integer")
    if normalize:
        Q = zNormalize(Q)

    window = [0.0] * m      # ring buffer holding the current subsequence
    ex = 0.0                # running sum of the window, for the mean
    ex2 = 0.0               # running sum of squares, for the std deviation
    bestSoFar = float("inf")
    nn = None
    count = 0               # values read from the file so far (1-based)
    sampled = 0             # values actually kept after down-sampling

    with open(filePathT) as tsFile:
        for row in csv.reader(tsFile):
            if not row:
                continue
            value = float(row[0])
            count += 1
            if count % sample_rate != 0:
                continue

            sampled += 1
            i = sampled % m             # slot of the newest value
            window[i] = value
            ex += value
            ex2 += value ** 2
            if sampled < m:
                continue                # buffer not full yet: nothing to compare

            oldest = (i + 1) % m        # the window starts right after the newest slot
            u = ex / m
            sdev = abs(ex2 / m - u ** 2) ** 0.5

            # A zero-variance window cannot be z-normalized; skip it.
            if not normalize or sdev > 0:
                dist = 0.0
                j = 0
                # Early abandon: stop once this window cannot beat the best.
                while j < m and dist < bestSoFar:
                    x = window[(oldest + j) % m]
                    if normalize:
                        dist += (Q[j] - (x - u) / sdev) ** 2
                    else:
                        dist += abs(Q[j] - x)
                    j += 1
                if dist < bestSoFar:
                    bestSoFar = dist
                    # `count` is the 1-based position of the window's last
                    # value; step back over the (m - 1) sampling intervals.
                    nn = count - (m - 1) * sample_rate - 1

            # Drop the oldest value; its slot is overwritten by the next sample.
            ex -= window[oldest]
            ex2 -= window[oldest] ** 2

    if count == 0:
        print("empty list")
        return 0
    print("end of list")
    if nn is None:
        raise ValueError("no window of the series could be compared with Q")
    return nn  # closest match spot in time series T