├── Models
│   ├── DeepAE.py
│   ├── RobustDeepAutoencoder.py
│   ├── __init__.py
│   ├── anomaly_detection.py
│   └── rPCA.py
├── Notebook
│   ├── ...
│   ├── Examples_and_Tests-Autoencoder.ipynb
│   └── Examples_and_Tests-rPCA.ipynb
├── README.md
└── data
    ├── ...
    └── sunspots.txt

--------------------------------------------------------------------------------
/Models/DeepAE.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np

def batches(l, n):
    """Yield successive n-sized batches of indexes from range(l); the last batch holds the leftover indexes."""
    for i in range(0, l, n):
        yield range(i, min(l, i + n))


class Deep_Autoencoder(object):
    def __init__(self, sess, input_dim_list=[7,64,64,7], transfer_function=tf.nn.relu, learning_rate=0.001):
        """input_dim_list must include the original data dimension"""
        assert len(input_dim_list) >= 2, "Need at least an input layer and one hidden layer"
        self.W_list = []
        self.encoding_b_list = []
        self.decoding_b_list = []
        self.dim_list = input_dim_list
        self.transfer = transfer_function
        self.learning_rate = learning_rate  # fixed: was hard-coded to 0.001, ignoring the argument

        ## Encoder parameters
        for i in range(len(input_dim_list)-1):
            init_max_value = 4*np.sqrt(6. / (self.dim_list[i] + self.dim_list[i+1]))
            self.W_list.append(tf.Variable(tf.random_uniform([self.dim_list[i], self.dim_list[i+1]],
                                                             np.negative(init_max_value), init_max_value)))
            self.encoding_b_list.append(tf.Variable(tf.random_uniform([self.dim_list[i+1]], -0.1, 0.1)))
        ## Decoder parameters (biases only; the weights are tied to the encoder)
        for i in range(len(input_dim_list)-2, -1, -1):
            self.decoding_b_list.append(tf.Variable(tf.random_uniform([self.dim_list[i]], -0.1, 0.1)))
        ## Placeholder for input
        self.input_x = tf.placeholder(tf.float32, [None, self.dim_list[0]])
        ## Encoding graph:
        last_layer = self.input_x
        for weight, bias in zip(self.W_list, self.encoding_b_list):
            hidden = self.transfer(tf.matmul(last_layer, weight) + bias)
            last_layer = hidden
        self.hidden = hidden
        ## Decoding graph (tied weights, transposed):
        for weight, bias in zip(reversed(self.W_list), self.decoding_b_list):
            hidden = self.transfer(tf.matmul(last_layer, tf.transpose(weight)) + bias)
            last_layer = hidden
        self.recon = last_layer

        self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.recon, self.input_x), 2.0))
        self.train_step = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
        sess.run(tf.global_variables_initializer())

    def fit(self, X, sess, iteration=100, batch_size=12, init=False, verbose=False):
        assert X.shape[1] == self.dim_list[0]

        if init:
            sess.run(tf.global_variables_initializer())
        sample_size = X.shape[0]

        for i in range(iteration):
            for one_batch in batches(sample_size, batch_size):
                e, _ = sess.run((self.cost, self.train_step), feed_dict={self.input_x: X[one_batch]})

            if verbose and i % 20 == 0:
                print("    iteration :", i, ", cost:", e)

    def transform(self, X, sess):
        return self.hidden.eval(session=sess, feed_dict={self.input_x: X})

    def getRecon(self, X, sess):
        return self.recon.eval(session=sess, feed_dict={self.input_x: X})
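A minimal usage sketch for Deep_Autoencoder, kept commented out following the convention
of the example block in RobustDeepAutoencoder.py. The synthetic data and iteration counts
below are illustrative assumptions, not values taken from the repository:

# if __name__ == "__main__":
#     X = np.random.rand(120, 7).astype('float32')   # 120 samples, 7 features (matches input_dim_list[0])
#     with tf.Session() as sess:
#         ae = Deep_Autoencoder(sess, input_dim_list=[7, 64, 64, 7], learning_rate=0.001)
#         ae.fit(X, sess, iteration=100, batch_size=12, verbose=True)
#         codes = ae.transform(X, sess)              # hidden representation, shape (120, 7)
#         recon = ae.getRecon(X, sess)               # reconstruction, shape (120, 7)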
--------------------------------------------------------------------------------
/Models/RobustDeepAutoencoder.py:
--------------------------------------------------------------------------------
import numpy as np
import numpy.linalg as nplin
import tensorflow as tf
from .DeepAE import *


def shrink(epsilon, x):
    """
    Elementwise soft-thresholding (shrinkage) operator.

    @Original Author: Prof. Randy
    @Modified by: Chong Zhou

    Args:
        epsilon: the shrinkage parameter (either a scalar or a vector)
        x: the vector to shrink on

    Returns:
        The shrunk vector
    """
    output = np.array(x * 0.)

    for i in range(len(x)):
        if x[i] > epsilon:
            output[i] = x[i] - epsilon
        elif x[i] < -epsilon:
            output[i] = x[i] + epsilon
        else:
            output[i] = 0
    return output


class RDAE(object):
    """
    @author: Chong Zhou
    2.0 version.
    complete: 10/17/2016
    version changes: moved the implementation from Theano to TensorFlow.
    3.0
    complete: 2/12/2018
    changes: deleted unused parameters, moved the shrink function to another file.
    Description:
        X = L + S
        L is a non-linear low-rank component (the autoencoder reconstruction) and S is a sparse matrix.
        argmin ||L - Decoder(Encoder(L))|| + ||S||_1
        Uses alternating projection to train the model.

    @Modified by: Daniel Legorreta
    Adapted for experiments with time series.
    Date of modification: August 2018
    """
    def __init__(self, sess, layers_sizes, lambda_=1.0, error=1.0e-7, transfer_function=tf.nn.sigmoid, learning_rate=0.001):
        """
        sess: a TensorFlow tf.Session object
        layers_sizes: a list with the deep autoencoder layer sizes, including the input layer
        lambda_: tunes the weight of the l1 penalty on S
        error: convergence criterion used to stop the outer training loop early
        """
        self.lambda_ = lambda_
        self.layers_sizes = layers_sizes
        self.transfer_function = transfer_function
        self.error = error
        self.errors = []
        self.learning_rate = learning_rate
        self.AE = Deep_Autoencoder(sess=sess, input_dim_list=self.layers_sizes,
                                   transfer_function=self.transfer_function, learning_rate=self.learning_rate)

    def fit(self, X, sess, inner_iteration=80,
            iteration=20, batch_size=12, verbose=False):
        ## The first layer must be the input layer, so the sizes must match.
        assert X.shape[1] == self.layers_sizes[0]
        self.X = X

        ## initialize L, S and mu (for the shrinkage operator)
        self.L = np.zeros(X.shape)
        self.S = np.zeros(X.shape)

        mu = (X.size) / (4.0 * nplin.norm(self.X, 1))
        self.shrink = self.lambda_ / mu
        print("shrink parameter:", self.shrink)
        LS0 = self.L + self.S

        XFnorm = nplin.norm(self.X, 'fro')
        if verbose:
            print("X shape: ", X.shape)
            print("L shape: ", self.L.shape)
            print("S shape: ", self.S.shape)
            print("mu: ", mu)
            print("XFnorm: ", XFnorm)

        for it in range(iteration):
            if verbose:
                print("Out iteration: ", it)
            ## alternating projection: first project onto L
            self.L = self.X - self.S
            ## use L to train the autoencoder
            self.AE.fit(X=self.L, sess=sess,
                        iteration=inner_iteration,
                        batch_size=batch_size,
                        verbose=verbose)
            ## get the optimized L
            self.L = self.AE.getRecon(X=self.L, sess=sess)
            ## alternating projection: now project onto S
            self.S = shrink(self.shrink, (self.X - self.L).reshape(X.size, order='C')).reshape(X.shape, order='C')

            ## break criterion 1: L and S are close enough to X
            c1 = nplin.norm(self.X - self.L - self.S, 'fro') / XFnorm
            ## break criterion 2: L and S no longer change
            c2 = np.min([mu, np.sqrt(mu)]) * nplin.norm(LS0 - self.L - self.S) / XFnorm

            if verbose:
                print("c1: ", c1)
                print("c2: ", c2)

            if c1 < self.error and c2 < self.error:
                print("early break")
                break
            ## save L + S for the c2 check in the next iteration
            LS0 = self.L + self.S

        return self.L, self.S

    def transform(self, X, sess):
        L = X - self.S
        return self.AE.transform(X=L, sess=sess)

    def getRecon(self, X, sess):
        # Note: returns the reconstruction of the L estimated during fit, not of the X passed in.
        return self.AE.getRecon(self.L, sess=sess)

# if __name__ == "__main__":
#     x = np.load(r"/home/czhou2/Documents/train_x_small.pkl")
#     sess = tf.Session()
#     rae = RDAE(sess=sess, lambda_=2000, layers_sizes=[784, 400], learning_rate=0.01)
#
#     L, S = rae.fit(x, sess=sess, batch_size=40, inner_iteration=50,
#                    iteration=5, verbose=True)
#
#     recon_rae = rae.getRecon(x, sess=sess)
#
#     sess.close()
#     print("cost errors, not used for now:", rae.errors)
#     from collections import Counter
#     print("number of zero values in S:", Counter(S.reshape(S.size))[0])

--------------------------------------------------------------------------------
/Models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Models/anomaly_detection.py:
--------------------------------------------------------------------------------
# Anomaly/outlier detection over time series
# @DLegorreta

import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from .rPCA import RPCA as rpca
from .RobustDeepAutoencoder import *
import tensorflow as tf
from statsmodels import robust
from sklearn.preprocessing import MinMaxScaler

def check_representation(frequency, X):
    """
    Validates that len(X) is divisible by frequency.
    """
    if type(frequency) == float:
        raise ValueError("Expected int")
    if X.ndim > 1:
        raise ValueError("Expected 1D array, got 2D array instead")
    if X.shape[0] % frequency != 0:
        raise ValueError("Expected a 1D array whose length is divisible by frequency")
    return X

def autodiff(X):
    """Applies the augmented Dickey-Fuller test and differences the series if it is non-stationary."""
    adf = adfuller(X)
    if adf[1] > 0.05:
        flag = True
        X = X.diff().fillna(0)
        return X, flag
    else:
        flag = False
        return X, flag

def scale(Val_scale, X):
    """Standardizes the time series (zero mean, unit variance) when Val_scale is True."""
    if Val_scale:
        global_mean = np.mean(X)
        global_sdt = np.std(X)
        X = (X - global_mean) / global_sdt
        return global_mean, global_sdt, X
    else:
        global_mean = 0
        global_sdt = 1
        return global_mean, global_sdt, X


def Mad_Outliers_DF(df, column='S_transform'):
    """Flags the rows whose MAD-normalized deviation from the median exceeds 1.4826."""
    L = df.copy()
    mad = robust.mad(L[column].replace(0, np.nan).dropna().apply(np.abs))
    median = L[column].replace(0, np.nan).dropna().apply(np.abs).median(skipna=True)

    # Create an empty column for the outlier info
    L['MAD_Outlier'] = None
    # Iterate over rows and flag a point when its deviation exceeds the threshold
    for idx, row in L.iterrows():
        if (row[column] != 0) and ((np.abs(np.abs(row[column]) - median)) / mad) > 1.4826:
            L.loc[idx, 'MAD_Outlier'] = 1
        else:
            L.loc[idx, 'MAD_Outlier'] = 0
    return L


class AnomalyDetection_RPCA(object):
    """
    Time series anomaly detection using Robust Principal Component Pursuit.

    Inspired by the Netflix Surus project: https://github.com/Netflix/Surus
    """

    def __init__(self, frequency=7, autodiff=True, forcediff=False, scale=True, Lpenalty=-1, Spenalty=-1, verbose=False):
        self.frequency = frequency
        self.autodiff = autodiff
        self.forcediff = forcediff
        self.scale = scale
        self.Lpenalty = Lpenalty
        self.Spenalty = Spenalty
        self.verbose = verbose
        self.usediff = False

    def fit(self, X):

        X = check_representation(self.frequency, X)
        self.X = pd.Series(X)
        X_orig = X.copy()

        if self.forcediff == True:
            X = X.diff().fillna(0)
            self.usediff = True

        elif self.autodiff == True:
            X, self.autodiff = autodiff(X)
            self.usediff = self.autodiff

        self.global_mean, self.global_sdt, X = scale(self.scale, X)

        ## reshape to (frequency, periods), one period per column
        M = X.values.reshape((self.frequency, -1), order='F')

        if self.verbose:
            print("..........Start Process..........")
            print("Time Series, frequency=%d and Num Periods=%d."
                  % (M.shape[0], M.shape[1]))

        Xpca, L_matrix, S_matrix, E_matrix = rpca(M, Lpenalty=self.Lpenalty, Spenalty=self.Spenalty,
                                                  verbose=self.verbose)

        self.X_original = X_orig
        self.X_transform = (Xpca.T.reshape((-1, 1)).ravel() * self.global_sdt) + self.global_mean
        self.L_transform = (L_matrix.T.reshape((-1, 1)).ravel() * self.global_sdt) + self.global_mean
        self.S_transform = (S_matrix.T.reshape((-1, 1)).ravel() * self.global_sdt)
        self.E_transform = (E_matrix.T.reshape((-1, 1)).ravel() * self.global_sdt)

        return self

    def to_frame(self, add_mad=True):
        Output = pd.DataFrame({'X_original': self.X_original,
                               'X_transform': self.X_transform,
                               'L_transform': self.L_transform,
                               'S_transform': self.S_transform,
                               'E_transform': self.E_transform})

        if add_mad:
            return Output.pipe(Mad_Outliers_DF)
        else:
            return Output

    def num_outliers(self):
        return sum(np.abs(self.S_transform) > 0)


def Function_RDAE(X, layers, lamda=2.2, learning_rate=0.001, inner=120, batch_size=12, outer=5, verbose=True):
    """Runs the RDAE decomposition in its own graph/session; returns L, S, the hidden codes, the reconstruction and the raw S."""
    X = X.copy()
    with tf.Graph().as_default():
        with tf.Session() as sess:
            rdae = RDAE(sess=sess, lambda_=lamda, layers_sizes=layers, learning_rate=learning_rate)
            L, S = rdae.fit(X=X, sess=sess, inner_iteration=inner, iteration=outer,
                            batch_size=batch_size, verbose=verbose)
            M_Transf = rdae.transform(X=X, sess=sess)
            M_Recons = rdae.getRecon(X=X, sess=sess)

    return L, S, M_Transf, M_Recons, rdae.S


Prep = MinMaxScaler()

class AnomalyDetection_AUTOENCODER(object):
    """
    Time series anomaly detection: an adaptation of the algorithm from
    "Anomaly Detection with Robust Deep Autoencoders".
    Original research: http://www.kdd.org/kdd2017/papers/view/anomaly-detection-with-robust-deep-auto-encoders
    """
    def __init__(self, frequency=7, autodiff=True, forcediff=False, scale=True, verbose=True, lamda=2.2, layers=[7,64,64,7], batch_size=12):
        self.frequency = frequency
        self.autodiff = autodiff
        self.forcediff = forcediff
        self.scale = scale
        self.layers = layers
        self.verbose = verbose
        self.lamda = lamda
        self.batch_size = batch_size
        self.usediff = False

    def fit(self, X):

        X = check_representation(self.frequency, X)
        self.X = pd.Series(X)
        X_orig = X.copy()

        if self.forcediff == True:
            X = X.diff().fillna(0)
            self.usediff = True

        elif self.autodiff == True:
            X, self.autodiff = autodiff(X)
            self.usediff = self.autodiff

        self.global_mean, self.global_sdt, X = scale(self.scale, X)
        ## last stage: squash to [0, 1] for the sigmoid autoencoder
        X = Prep.fit_transform(X.values.reshape(-1, 1))

        M = X.reshape((-1, self.frequency), order='C').copy()
        print("Time Series, frequency=%d and Num Periods=%d."
              % (M.shape[1], M.shape[0]))

        if self.verbose:
            print("..........Start Process..........")
        L_matrix, S_matrix, M_trans, M_Recons, S_Outliers = Function_RDAE(M, layers=self.layers, lamda=self.lamda,
                                                                          verbose=self.verbose, batch_size=self.batch_size)

        ## The inverse MinMax transform applies in every case; only the
        ## de-standardization below depends on self.scale.
        M = Prep.inverse_transform(M.reshape((-1, 1)))
        L = Prep.inverse_transform(L_matrix.reshape((-1, 1)))
        M_trans = Prep.inverse_transform(M_trans.reshape((-1, 1)))

        self.X_original = X_orig
        self.S_transform = S_matrix.reshape((-1, 1)).ravel()
        self.M_Recons = M_Recons.reshape((-1, 1)).ravel()
        self.S_Outliers = S_Outliers.reshape((-1, 1)).ravel()

        if self.scale == True:
            self.X_transform = (M.reshape((-1, 1)).ravel() * self.global_sdt) + self.global_mean
            self.L_transform = (L.reshape((-1, 1)).ravel() * self.global_sdt) + self.global_mean
            self.M_trans = (M_trans.reshape((-1, 1)).ravel() * self.global_sdt) + self.global_mean
        else:
            self.X_transform = M.reshape((-1, 1)).ravel()
            self.L_transform = L.reshape((-1, 1)).ravel()
            self.M_trans = M_trans.reshape((-1, 1)).ravel()

        return self

    def to_frame(self, add_mad=True):
        Output = pd.DataFrame({'X_original': self.X_original,
                               'X_transform': self.X_transform,
                               'L_transform': self.L_transform,
                               'S_transform': self.S_transform,
                               'Trans_X': self.M_trans,
                               'Recover_X': self.M_Recons,
                               'S_Outliers': self.S_Outliers})
        if add_mad:
            Output['S_Outliers'] = Output['S_Outliers'].apply(lambda x: np.log1p(x)).copy()
            return Output.pipe(Mad_Outliers_DF, 'S_Outliers')
        else:
            return Output

    def num_outliers(self):
        # count the non-zero entries of S (fixed: abs() must wrap the values, not the boolean mask)
        return sum(np.abs(self.S_Outliers) > 0)

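A minimal usage sketch for the two detectors, kept commented out following the
example-block convention used in RobustDeepAutoencoder.py. The sunspots.txt parsing
shown here is an assumption about that file's layout; adjust it as needed:

# if __name__ == "__main__":
#     serie = pd.read_csv("data/sunspots.txt", sep="\t", header=None)[1]  # hypothetical: tab-separated, values in the 2nd column
#     serie = serie[: (len(serie) // 7) * 7]        # length must be divisible by the frequency
#
#     model = AnomalyDetection_RPCA(frequency=7, verbose=False)
#     frame = model.fit(serie).to_frame()           # adds the MAD_Outlier flag by default
#     print("RPCA outliers:", model.num_outliers())
#
#     model_ae = AnomalyDetection_AUTOENCODER(frequency=7, lamda=2.2, verbose=False)
#     frame_ae = model_ae.fit(serie).to_frame()
#     print("RDAE outliers:", model_ae.num_outliers())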
--------------------------------------------------------------------------------
/Models/rPCA.py:
--------------------------------------------------------------------------------
# Code: Stable Principal Component Pursuit
# @DLegorreta

import numpy as np
from statsmodels import robust
from numba import jit

# Note: these helpers rely on NumPy features (np.nditer, np.apply_along_axis)
# that numba cannot compile in nopython mode, so they are jitted in object mode.

@jit(forceobj=True)
def Dsoft(M, penalty):
    """Soft-thresholds the singular values,
    taking advantage of the fact that singular values are never negative.

    Parameters
    ----------
    M : numpy array

    penalty : float
        penalty scalar to penalize the singular values

    Returns
    -------
    M with each element reduced by `penalty` and floored at zero
    """
    penalty = float(penalty)
    return np.maximum(M - penalty, 0.0)

@jit(forceobj=True)
def SVT(M, penalty):
    """
    Singular Value Thresholding on a numeric matrix.

    Parameters
    ----------
    M : numpy array

    penalty : float
        penalty scalar to penalize the singular values

    Returns
    -------
    S : numpy array
        The singular-value-thresholded matrix

    Ds : numpy array
        The thresholded singular values
    """
    penalty = float(penalty)
    U, s, V = np.linalg.svd(M, full_matrices=False)
    Ds = Dsoft(s, penalty)
    S = np.dot(U * Ds, V)  # fixed: U * Ds scales the columns of U, i.e. U @ diag(Ds) @ V
    return S, Ds

@jit(forceobj=True)
def SoftThresholdScalar(x, penalty):
    """sign(x) * max(abs(x) - penalty, 0)"""
    x = np.array(x).astype('float64')
    penalty = float(penalty)
    penalized = np.abs(x) - penalty
    if penalized < 0:
        return 0
    elif x > 0:
        return penalized
    else:
        return -penalized

@jit(forceobj=True)
def SoftThresholdVector(X, penalty):
    X = np.array(X).astype('float64')
    penalty = float(penalty)
    for x in np.nditer(X, op_flags=['readwrite']):
        x[...] = SoftThresholdScalar(x, penalty)
    return X

@jit(forceobj=True)
def SoftThresholdMatrix(X, penalty):
    X = np.array(X).astype('float64')
    penalty = float(penalty)
    return np.apply_along_axis(SoftThresholdVector, 1, X, penalty=penalty)

@jit(forceobj=True)
def median(X):
    # Not used in this code; kept from the original Netflix source.
    X = np.array(X).astype('float64')
    return np.median(X)

@jit(forceobj=True)
def mad(X):
    X = np.array(X).astype('float64')
    return robust.mad(X)

@jit(forceobj=True)
def getDynamicMu(X):
    X = np.array(X).astype('float64')
    m, n = X.shape
    E_sd = X.std()
    mu = E_sd * np.sqrt(2 * np.maximum(m, n))
    return np.maximum(mu, 0.001)

@jit(forceobj=True)
def getL(X, S, mu, L_penalty):
    mu = float(mu)
    L_penalty = float(L_penalty)
    L_penalty2 = L_penalty * mu
    C = np.subtract(X, S)
    L0, L1 = SVT(C, L_penalty2)  # fixed: threshold with the mu-scaled penalty, as in Surus
    L_nuclearnorm = np.sum(L1)
    return L0, L_penalty2 * L_nuclearnorm

@jit(forceobj=True)
def getS(X, L, mu, s_penalty):
    mu = float(mu)
    s_penalty = float(s_penalty)
    s_penalty2 = s_penalty * mu
    C = np.subtract(X, L)
    S = SoftThresholdMatrix(C, s_penalty2)
    S_l1norm = np.sum(np.abs(S))
    return S, s_penalty2 * S_l1norm

@jit(forceobj=True)
def getE(X, L, S):
    E = X - L - S
    return E, np.linalg.norm(E, 'fro')

@jit(forceobj=True)
def objective(L, S, E):
    return (0.5 * E) + L + S

def RPCA(X, Lpenalty=-1, Spenalty=-1, verbose=True):
    """
    Robust Principal Component Pursuit.

    Parameters
    ----------
    X : numpy array

    Lpenalty : float, default -1
        Scalar to penalize the nuclear norm of the low-rank matrix; -1 selects the default.

    Spenalty : float, default -1
        Scalar to penalize the remainder matrix in order to find anomalous or noisy values;
        -1 selects the default.

    verbose : bool, optional (default=True)
        Controls the verbosity of the matrix-building process.

    Returns
    -------
    X : numpy array, the original matrix
    L_matrix : numpy array, low rank
    S_matrix : numpy array, sparse
    E_matrix : numpy array, the remainder matrix of noise

    Reference
    ---------
    Stable Principal Component Pursuit
    Zihan Zhou, Xiaodong Li, John Wright, Emmanuel Candes, Yi Ma
    https://arxiv.org/pdf/1001.2363.pdf
    """
    X = np.array(X).astype('float64').copy()
    m, n = X.shape

    if Lpenalty == -1:
        Lpenalty = 1
    if Spenalty == -1:
        Spenalty = 1.6 / np.sqrt(max(n, m))

    itere = 1
    maxIter = 2000
    converged = False
    obj_prev = 0.5 * np.linalg.norm(X, 'fro')
    tol = (1e-10) * obj_prev
    diff = 2 * tol
    mu = (X.size) / (4 * np.linalg.norm(X, 1))
    print("Value obj_prev %2.10f and tol %2.10f" % (obj_prev, tol))
    L_matrix = np.zeros_like(X, dtype='float')
    S_matrix = np.zeros_like(X, dtype='float')
    E_matrix = np.zeros_like(X, dtype='float')
    while itere < maxIter and diff > tol:

        S_matrix, S_1 = getS(X, L_matrix, mu, Spenalty)
        L_matrix, L_1 = getL(X, S_matrix, mu, Lpenalty)
        E_matrix, E_1 = getE(X, L_matrix, S_matrix)
        obj = objective(L_1, S_1, E_1)
        if verbose:
            print("Objective function: %4.8f on previous iteration %d " % (obj_prev, itere - 1))
            print("Objective function: %4.8f on iteration %d " % (obj, itere))
        diff = np.abs(obj_prev - obj)
        obj_prev = obj
        mu = getDynamicMu(E_matrix)
        itere += 1
        if diff < tol:
            converged = True

    return X, L_matrix, S_matrix, E_matrix
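A minimal usage sketch for RPCA, kept commented out in the same style as the other
example blocks. The synthetic matrix below is illustrative: rows play the role of the
frequency and columns the periods, matching how AnomalyDetection_RPCA reshapes a
series before calling rpca:

# if __name__ == "__main__":
#     rng = np.random.RandomState(0)
#     M = rng.randn(7, 30)                 # 7-observation frequency, 30 periods
#     M[3, 10] += 10.0                     # inject a single anomalous spike
#     X, L, S, E = RPCA(M, verbose=False)
#     print("non-zero entries of S (candidate anomalies):", int(np.sum(np.abs(S) > 0)))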