├── README.md
├── code
│   ├── __init__.py
│   ├── loadData.py
│   ├── main.py
│   └── oddball.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
# oddball_py3
Python3 implementation of oddball

This is a Python3 implementation of the oddball algorithm.
For more detail, see: Akoglu L., McGlohon M., Faloutsos C. (2010) oddBall: Spotting Anomalies in Weighted Graphs.

## Environments
networkx (version: 2.1)
numpy
scikit_learn
You can install the dependencies directly with the following command.
```
pip install -r requirements.txt
```

## Run
The input is a weighted undirected graph given as an edge list; each line has the format 'node1 node2 weight'.
### Options:
--input: input file
--output: output file
--lof: Use LOF. 0: do not use. 1: use. Default value is 0.
--anomaly_type: Anomaly type. 1: star_or_clique. 2: heavy_vicinity. 3: dominant_edge.

You can use --help for more details.
Here is a sample.
```
python main.py --input inputFile --output outputFile --lof 0 --anomaly_type 1
```
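
For example, a small input file could look like this (the node ids and weights below are made up for illustration):
```
1 2 3
1 3 1
2 3 2
3 4 5
```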

### Miscellaneous
Hope you like this repository and have fun.
Don't forget to leave a star, lol~
--------------------------------------------------------------------------------
/code/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/code/loadData.py:
--------------------------------------------------------------------------------
'''
Data loading and preprocessing for oddball

@author:
Tao Yu (gloooryyt@gmail.com)

'''

import numpy as np
import networkx as nx


# load data: a weighted undirected graph, one edge per line as 'node1 node2 weight'
def load_data(path):
    data = np.loadtxt(path).astype('int32')
    G = nx.Graph()
    for ite in data:
        G.add_edge(ite[0], ite[1], weight=ite[2])
    return G


def get_feature(G):
    # feature dictionary in the format {node i's id: [Ni, Ei, Wi, λw,i]}
    featureDict = {}
    nodelist = list(G.nodes)
    for ite in nodelist:
        featureDict[ite] = []
        # Ni: the number of node i's neighbors
        Ni = G.degree(ite)
        featureDict[ite].append(Ni)
        # the set of node i's neighbors
        iNeighbor = list(G.neighbors(ite))
        # Ei: the number of edges in egonet i
        Ei = 0
        # Wi: the sum of edge weights in egonet i
        Wi = 0
        # λw,i: the principal eigenvalue (the eigenvalue with the largest
        # absolute value) of egonet i's weighted adjacency matrix
        Lambda_w_i = 0
        Ei += Ni
        egonet = nx.Graph()
        for nei in iNeighbor:
            Wi += G[nei][ite]['weight']
            egonet.add_edge(ite, nei, weight=G[nei][ite]['weight'])
        iNeighborLen = len(iNeighbor)
        for it1 in range(iNeighborLen):
            for it2 in range(it1 + 1, iNeighborLen):
                # if two of node i's neighbors are themselves connected,
                # that edge also belongs to egonet i
                if G.has_edge(iNeighbor[it1], iNeighbor[it2]):
                    Ei += 1
                    Wi += G[iNeighbor[it1]][iNeighbor[it2]]['weight']
                    egonet.add_edge(iNeighbor[it1], iNeighbor[it2], weight=G[iNeighbor[it1]][iNeighbor[it2]]['weight'])
        # the weighted adjacency matrix is symmetric, so eigvalsh returns its
        # real eigenvalues in ascending order
        egonet_adjacency_matrix = nx.adjacency_matrix(egonet).todense()
        eigenvalue = np.linalg.eigvalsh(egonet_adjacency_matrix)
        Lambda_w_i = max(abs(eigenvalue[0]), abs(eigenvalue[-1]))
        featureDict[ite].append(Ei)
        featureDict[ite].append(Wi)
        featureDict[ite].append(Lambda_w_i)
    return featureDict
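
# A minimal usage sketch (not part of the original module; the toy graph below
# is made up): build a small graph in memory and print each node's features.
if __name__ == '__main__':
    demo = nx.Graph()
    demo.add_edge(1, 2, weight=3)
    demo.add_edge(1, 3, weight=1)
    demo.add_edge(2, 3, weight=2)
    demo.add_edge(3, 4, weight=5)
    for node, (Ni, Ei, Wi, lam) in get_feature(demo).items():
        print(node, Ni, Ei, Wi, lam)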
--------------------------------------------------------------------------------
/code/main.py:
--------------------------------------------------------------------------------
'''
Python3 implementation of oddball

@author:
Tao Yu (gloooryyt@gmail.com)

'''

import argparse

from loadData import load_data, get_feature
from oddball import (star_or_clique, heavy_vicinity, dominant_edge,
                     star_or_clique_withLOF, heavy_vicinity_withLOF,
                     dominant_edge_withLOF)

# map (--lof, --anomaly_type) to the corresponding scoring function
SCORE_FUNCS = {
    (0, 1): star_or_clique,
    (0, 2): heavy_vicinity,
    (0, 3): dominant_edge,
    (1, 1): star_or_clique_withLOF,
    (1, 2): heavy_vicinity_withLOF,
    (1, 3): dominant_edge_withLOF,
}


def write_scores(score_dict, output_path):
    # sort nodes by score from large to small and write one 'node score' line each
    ranked = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)
    with open(output_path, 'w') as f:
        for node, score in ranked:
            f.write('{} {}\n'.format(node, score))


def main():
    parser = argparse.ArgumentParser(description='Run Oddball.')
    parser.add_argument('--input', type=str, required=True, help='Path of input file.')
    parser.add_argument('--output', type=str, required=True, help='Path of output file.')
    parser.add_argument('--lof', type=int, default=0, help='Use LOF. 0: do not use. 1: use. Default value is 0.')
    parser.add_argument('--anomaly_type', type=int, required=True, help='Anomaly type. 1: star_or_clique. 2: heavy_vicinity. 3: dominant_edge.')
    args = parser.parse_args()

    score_func = SCORE_FUNCS.get((args.lof, args.anomaly_type))
    if score_func is None:
        print('parameter error!')
        return

    # the input is a weighted undirected graph, one edge per line as 'node1 node2 weight'
    G = load_data(args.input)
    featureDict = get_feature(G)
    write_scores(score_func(featureDict), args.output)


if __name__ == "__main__":
    main()
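
# Example output (hypothetical numbers): with --anomaly_type 1 the output file
# might begin like
#   42 13.07
#   7 9.95
# i.e. one 'node score' pair per line, sorted from most to least anomalous.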
--------------------------------------------------------------------------------
/code/oddball.py:
--------------------------------------------------------------------------------
'''
Python3 implementation of oddball

@author:
Tao Yu (gloooryyt@gmail.com)

'''

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor

# feature dictionary in the format {node i's id: [Ni, Ei, Wi, λw,i]}


def star_or_clique(featureDict):
    N = []
    E = []
    for key in featureDict.keys():
        N.append(featureDict[key][0])
        E.append(featureDict[key][1])
    # E = C * N^α  =>  take log on both sides  =>  logE = logC + α*logN
    # treat it as y = b + w*x and do linear regression; the base of the log is 2
    y_train = np.log2(E).reshape(len(E), 1)
    x_train = np.log2(N).reshape(len(N), 1)
    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_[0][0]
    b = model.intercept_[0]
    C = 2 ** b
    alpha = w
    outlineScoreDict = {}
    for key in featureDict.keys():
        yi = featureDict[key][1]
        xi = featureDict[key][0]
        # out-line score: how far the observed value deviates from the fitted power law
        outlineScore = (max(yi, C * (xi ** alpha)) / min(yi, C * (xi ** alpha))) * np.log(abs(yi - C * (xi ** alpha)) + 1)
        outlineScoreDict[key] = outlineScore
    return outlineScoreDict


def heavy_vicinity(featureDict):
    W = []
    E = []
    for key in featureDict.keys():
        W.append(featureDict[key][2])
        E.append(featureDict[key][1])
    # W = C * E^β  =>  take log on both sides  =>  logW = logC + β*logE
    # treat it as y = b + w*x and do linear regression; the base of the log is 2
    y_train = np.log2(W).reshape(len(W), 1)
    x_train = np.log2(E).reshape(len(E), 1)
    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_[0][0]
    b = model.intercept_[0]
    C = 2 ** b
    beta = w
    outlineScoreDict = {}
    for key in featureDict.keys():
        yi = featureDict[key][2]
        xi = featureDict[key][1]
        outlineScore = (max(yi, C * (xi ** beta)) / min(yi, C * (xi ** beta))) * np.log(abs(yi - C * (xi ** beta)) + 1)
        outlineScoreDict[key] = outlineScore
    return outlineScoreDict


def dominant_edge(featureDict):
    Lambda_w_i = []
    W = []
    for key in featureDict.keys():
        Lambda_w_i.append(featureDict[key][3])
        W.append(featureDict[key][2])
    # λ = C * W^γ  =>  take log on both sides  =>  logλ = logC + γ*logW
    # treat it as y = b + w*x and do linear regression; the base of the log is 2
    y_train = np.log2(Lambda_w_i).reshape(len(Lambda_w_i), 1)
    x_train = np.log2(W).reshape(len(W), 1)
    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_[0][0]
    b = model.intercept_[0]
    C = 2 ** b
    gamma = w
    outlineScoreDict = {}
    for key in featureDict.keys():
        yi = featureDict[key][3]
        xi = featureDict[key][2]
        outlineScore = (max(yi, C * (xi ** gamma)) / min(yi, C * (xi ** gamma))) * np.log(abs(yi - C * (xi ** gamma)) + 1)
        outlineScoreDict[key] = outlineScore
    return outlineScoreDict
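
# Worked example with illustrative numbers (not from the paper): suppose the
# star_or_clique fit gives C = 1.0 and alpha = 1.5. A node with Ni = 4 is then
# expected to have about C * Ni**alpha = 8 egonet edges. If it actually has
# Ei = 32 (a near-clique neighborhood), its out-line score is
# (32 / 8) * log(|32 - 8| + 1) = 4 * log(25) ≈ 12.9, where log is the natural
# logarithm (np.log).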

def star_or_clique_withLOF(featureDict):
    N = []
    E = []
    for key in featureDict.keys():
        N.append(featureDict[key][0])
        E.append(featureDict[key][1])
    # E = C * N^α  =>  take log on both sides  =>  logE = logC + α*logN
    # treat it as y = b + w*x and do linear regression; the base of the log is 2
    y_train = np.log2(E).reshape(len(E), 1)
    x_train = np.log2(N).reshape(len(N), 1)  # the row order matches the iteration order of featureDict.keys()

    # prepare data for LOF: each sample is a (logN, logE) point
    xAndyForLOF = np.hstack((x_train, y_train))

    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_[0][0]
    b = model.intercept_[0]
    C = 2 ** b
    alpha = w
    print('alpha={}'.format(alpha))

    # LOF algorithm; scikit-learn returns negated factors, so flip the sign
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(xAndyForLOF)
    LOFScoreArray = -clf.negative_outlier_factor_

    # compute every node's out-line score, then normalize both scores by their
    # maxima and add them together
    outlineScoreDict = {}
    for key in featureDict.keys():
        yi = featureDict[key][1]
        xi = featureDict[key][0]
        outlineScoreDict[key] = (max(yi, C * (xi ** alpha)) / min(yi, C * (xi ** alpha))) * np.log(abs(yi - C * (xi ** alpha)) + 1)
    maxOutLine = max(outlineScoreDict.values())
    print('maxOutLine={}'.format(maxOutLine))
    maxLOFScore = LOFScoreArray.max()
    print('maxLOFScore={}'.format(maxLOFScore))

    outScoreDict = {}
    count = 0  # used to take LOF scores in sequence from LOFScoreArray
    for key in featureDict.keys():
        outScoreDict[key] = outlineScoreDict[key] / maxOutLine + LOFScoreArray[count] / maxLOFScore
        count += 1
    return outScoreDict


def heavy_vicinity_withLOF(featureDict):
    W = []
    E = []
    for key in featureDict.keys():
        W.append(featureDict[key][2])
        E.append(featureDict[key][1])
    # W = C * E^β  =>  take log on both sides  =>  logW = logC + β*logE
    # treat it as y = b + w*x and do linear regression; the base of the log is 2
    y_train = np.log2(W).reshape(len(W), 1)
    x_train = np.log2(E).reshape(len(E), 1)  # the row order matches the iteration order of featureDict.keys()

    # prepare data for LOF: each sample is a (logE, logW) point
    xAndyForLOF = np.hstack((x_train, y_train))

    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_[0][0]
    b = model.intercept_[0]
    C = 2 ** b
    beta = w
    print('beta={}'.format(beta))

    # LOF algorithm; scikit-learn returns negated factors, so flip the sign
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(xAndyForLOF)
    LOFScoreArray = -clf.negative_outlier_factor_

    # compute every node's out-line score, then normalize both scores by their
    # maxima and add them together
    outlineScoreDict = {}
    for key in featureDict.keys():
        yi = featureDict[key][2]
        xi = featureDict[key][1]
        outlineScoreDict[key] = (max(yi, C * (xi ** beta)) / min(yi, C * (xi ** beta))) * np.log(abs(yi - C * (xi ** beta)) + 1)
    maxOutLine = max(outlineScoreDict.values())
    print('maxOutLine={}'.format(maxOutLine))
    maxLOFScore = LOFScoreArray.max()
    print('maxLOFScore={}'.format(maxLOFScore))

    outScoreDict = {}
    count = 0  # used to take LOF scores in sequence from LOFScoreArray
    for key in featureDict.keys():
        outScoreDict[key] = outlineScoreDict[key] / maxOutLine + LOFScoreArray[count] / maxLOFScore
        count += 1
    return outScoreDict


def dominant_edge_withLOF(featureDict):
    Lambda_w_i = []
    W = []
    for key in featureDict.keys():
        Lambda_w_i.append(featureDict[key][3])
        W.append(featureDict[key][2])
    # λ = C * W^γ  =>  take log on both sides  =>  logλ = logC + γ*logW
    # treat it as y = b + w*x and do linear regression; the base of the log is 2
    y_train = np.log2(Lambda_w_i).reshape(len(Lambda_w_i), 1)
    x_train = np.log2(W).reshape(len(W), 1)  # the row order matches the iteration order of featureDict.keys()

    # prepare data for LOF: each sample is a (logW, logλ) point
    xAndyForLOF = np.hstack((x_train, y_train))

    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_[0][0]
    b = model.intercept_[0]
    C = 2 ** b
    gamma = w
    print('gamma={}'.format(gamma))

    # LOF algorithm; scikit-learn returns negated factors, so flip the sign
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(xAndyForLOF)
    LOFScoreArray = -clf.negative_outlier_factor_

    # compute every node's out-line score, then normalize both scores by their
    # maxima and add them together
    outlineScoreDict = {}
    for key in featureDict.keys():
        yi = featureDict[key][3]
        xi = featureDict[key][2]
        outlineScoreDict[key] = (max(yi, C * (xi ** gamma)) / min(yi, C * (xi ** gamma))) * np.log(abs(yi - C * (xi ** gamma)) + 1)
    maxOutLine = max(outlineScoreDict.values())
    print('maxOutLine={}'.format(maxOutLine))
    maxLOFScore = LOFScoreArray.max()
    print('maxLOFScore={}'.format(maxLOFScore))

    outScoreDict = {}
    count = 0  # used to take LOF scores in sequence from LOFScoreArray
    for key in featureDict.keys():
        outScoreDict[key] = outlineScoreDict[key] / maxOutLine + LOFScoreArray[count] / maxLOFScore
        count += 1
    return outScoreDict
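
# Worked example of the combined score with illustrative numbers: if a node's
# out-line score is 6.0 while maxOutLine is 12.0, and its LOF score is 2.4
# while maxLOFScore is 3.0, its combined score is 6.0/12.0 + 2.4/3.0 = 1.3
# (each normalized term is at most 1).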
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
networkx==2.1
numpy==1.17.0
scikit_learn==0.23.1
--------------------------------------------------------------------------------