├── README.md
├── data
│   └── wbc
│       ├── X.csv
│       └── y.csv
└── src
    ├── main.py
    ├── outlier_interpreter.py
    ├── prediction_strength.py
    └── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Contextual-Outlier-Interpretation
2 |
3 | This project provides an implementation of the [paper](https://www.ijcai.org/proceedings/2018/0341.pdf):
4 |
5 | > **Contextual Outlier Interpretation**<br>
6 | > Ninghao Liu, Donghwa Shin, Xia Hu<br>
7 | > IJCAI 2018
8 |
9 |
10 | ### Files in the folder
11 | - `data/`
12 | - `wbc/`: an example dataset used in outlier detection
13 | - `X.csv`: each line represents one instance;
14 | - `y.csv`: labels indicating whether each instance is an outlier (1) or not (0);
15 | - `src/`: implementations of the proposed outlier explanation method
16 | - `main.py`: runs the proposed method on the dataset
17 | - `outlier_interpreter.py`: implementation of the interpretation method
18 | - `prediction_strength.py`: estimates the number of clusters
19 | - `utils.py`: includes LOF and IsolationForest as outlier detectors
20 |
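21 | ### Usage
22 |
23 | Run the pipeline from the `src/` directory, e.g. `python main.py --dataset wbc --detector lof` (see the `argparse` options in `main.py` for all hyperparameters and their defaults). The script detects outliers with LOF or IsolationForest, then prints the outlier IDs, the per-attribute outlying degrees, and the re-estimated outlierness scores.
24 |
25 | The interpreter can also be used programmatically. Below is a minimal sketch, assuming it is run from `src/` with the same defaults as `main.py` (`ratio_nbr=0.1`, `AUG=10`); the relative data path and the `Namespace` shim are only for illustration:
26 |
27 | ```python
28 | import numpy as np
29 | from argparse import Namespace
30 |
31 | from utils import detect_lof
32 | from outlier_interpreter import OutlierInterpreter
33 |
34 | # load the WBC features (one instance per row)
35 | X = np.genfromtxt('../data/wbc/X.csv', delimiter=',', dtype=float)
36 |
37 | # LOF returns +1 for inliers and -1 for outliers; map to 0/1 labels
38 | y_pred = detect_lof(Namespace(ratio_nbr=0.1), X)
39 | labels_otlr = ((1 - y_pred) // 2).astype(int)
40 |
41 | # cluster each outlier's context, fit linear classifiers, and interpret
42 | interpreter = OutlierInterpreter(X, labels_otlr, 0.1, AUG=10)
43 | ids_outlier = np.where(labels_otlr == 1)[0]
44 | importance_attr, outlierness = interpreter.interpret_outliers(ids_outlier, 1, int_flag=1)
45 | ```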
--------------------------------------------------------------------------------
/data/wbc/X.csv:
--------------------------------------------------------------------------------
1 | 5,1,1,1,2,1,3,1,1
2 | 5,4,4,5,7,10,3,2,1
3 | 3,1,1,1,2,2,3,1,1
4 | 6,8,8,1,3,4,3,7,1
5 | 4,1,1,3,2,1,3,1,1
6 | 1,1,1,1,2,10,3,1,1
7 | 2,1,2,1,2,1,3,1,1
8 | 2,1,1,1,2,1,1,1,5
9 | 4,2,1,1,2,1,2,1,1
10 | 1,1,1,1,1,1,3,1,1
11 | 2,1,1,1,2,1,2,1,1
12 | 1,1,1,1,2,3,3,1,1
13 | 4,1,1,1,2,1,2,1,1
14 | 4,1,1,1,2,1,3,1,1
15 | 6,1,1,1,2,1,3,1,1
16 | 3,1,1,1,2,1,2,1,1
17 | 1,1,1,1,2,1,3,1,1
18 | 3,2,1,1,1,1,2,1,1
19 | 5,1,1,1,2,1,2,1,1
20 | 2,1,1,1,2,1,2,1,1
21 | 1,1,3,1,2,1,1,1,1
22 | 3,1,1,1,1,1,2,1,1
23 | 2,1,1,1,2,1,3,1,1
24 | 2,1,1,2,2,1,3,1,1
25 | 3,1,2,1,2,1,2,1,1
26 | 2,1,1,1,2,1,2,1,1
27 | 6,2,1,1,1,1,7,1,1
28 | 6,6,6,9,6,1,7,8,1
29 | 1,1,1,1,2,1,2,1,2
30 | 1,1,1,1,2,1,2,1,1
31 | 4,1,1,3,2,1,3,1,1
32 | 1,1,1,1,2,2,2,1,1
33 | 1,1,1,1,2,1,2,1,1
34 | 4,1,1,1,2,1,3,1,1
35 | 1,1,1,1,2,1,3,2,1
36 | 5,1,3,1,2,1,2,1,1
37 | 1,3,3,2,2,1,7,2,1
38 | 1,1,2,1,2,2,4,2,1
39 | 1,1,4,1,2,1,2,1,1
40 | 5,3,1,2,2,1,2,1,1
41 | 3,1,1,1,2,3,3,1,1
42 | 2,1,1,1,3,1,2,1,1
43 | 2,2,2,1,1,1,7,1,1
44 | 4,1,1,2,2,1,2,1,1
45 | 5,2,1,1,2,1,3,1,1
46 | 3,1,1,1,2,2,7,1,1
47 | 4,1,1,1,2,1,3,1,1
48 | 2,1,1,2,3,1,2,1,1
49 | 1,1,1,1,2,1,3,1,1
50 | 3,1,1,2,2,1,1,1,1
51 | 4,1,1,1,2,1,3,1,1
52 | 1,1,1,1,2,1,2,1,1
53 | 2,1,1,1,2,1,3,1,1
54 | 1,1,1,1,2,1,3,1,1
55 | 2,1,1,2,2,1,1,1,1
56 | 5,1,1,1,2,1,3,1,1
57 | 4,1,2,1,2,1,3,1,1
58 | 1,1,1,1,2,1,2,3,1
59 | 1,3,1,2,2,2,5,3,2
60 | 3,3,2,1,2,3,3,1,1
61 | 1,1,1,1,2,5,1,1,1
62 | 8,3,3,1,2,2,3,2,1
63 | 1,1,1,1,4,3,1,1,1
64 | 3,2,1,1,2,2,3,1,1
65 | 1,1,2,2,2,1,3,1,1
66 | 4,2,1,1,2,2,3,1,1
67 | 1,1,1,1,2,1,2,1,1
68 | 3,1,1,1,2,1,3,1,1
69 | 1,1,1,1,10,1,1,1,1
70 | 5,1,3,1,2,1,2,1,1
71 | 2,1,1,1,2,1,3,1,1
72 | 3,1,1,1,2,1,2,2,1
73 | 3,1,1,1,3,1,2,1,1
74 | 5,1,1,1,2,2,3,3,1
75 | 4,1,1,1,2,1,2,1,1
76 | 3,1,1,1,2,1,1,1,1
77 | 4,1,2,1,2,1,2,1,1
78 | 1,1,1,1,1,1,2,1,1
79 | 3,1,1,1,2,1,1,1,1
80 | 2,1,1,1,2,1,1,1,1
81 | 1,1,1,1,2,5,1,1,1
82 | 2,1,1,1,2,1,2,1,1
83 | 1,1,3,1,2,1,2,1,1
84 | 1,1,1,1,3,2,2,1,1
85 | 3,1,1,3,8,1,5,8,1
86 | 1,1,1,1,1,1,3,1,1
87 | 4,1,1,1,2,3,1,1,1
88 | 1,1,1,1,2,1,1,1,1
89 | 1,2,2,1,2,1,2,1,1
90 | 2,1,1,1,2,1,3,1,1
91 | 1,1,2,1,3,1,1,1,1
92 | 4,1,1,1,2,1,3,2,1
93 | 3,1,1,1,2,1,3,1,1
94 | 1,1,1,2,1,3,1,1,7
95 | 5,1,1,1,2,2,3,1,1
96 | 4,1,1,1,2,2,3,2,1
97 | 3,1,1,1,2,1,3,1,1
98 | 1,1,1,2,1,1,1,1,1
99 | 3,1,1,1,2,1,1,1,1
100 | 1,1,1,1,2,1,3,1,1
101 | 1,1,1,1,2,1,2,1,1
102 | 2,1,1,1,2,1,3,1,1
103 | 4,1,1,1,2,1,3,1,1
104 | 1,1,1,1,1,1,3,1,1
105 | 1,1,1,1,2,1,1,1,1
106 | 6,1,1,1,2,1,3,1,1
107 | 2,1,1,1,1,1,3,1,1
108 | 1,2,3,1,2,1,3,1,1
109 | 5,1,1,1,2,1,2,1,1
110 | 1,1,1,1,2,1,3,1,1
111 | 3,1,1,1,2,1,3,1,1
112 | 4,1,1,1,2,1,3,1,1
113 | 8,4,4,5,4,7,7,8,2
114 | 5,1,1,4,2,1,3,1,1
115 | 1,1,1,1,2,1,1,1,1
116 | 3,1,1,1,2,1,2,1,1
117 | 1,1,1,1,2,1,3,1,1
118 | 5,1,1,1,2,1,3,1,1
119 | 1,1,1,1,2,1,3,1,1
120 | 1,1,1,1,1,1,3,1,1
121 | 1,1,1,1,1,1,3,1,1
122 | 5,1,1,1,1,1,3,1,1
123 | 1,1,1,1,2,1,3,1,1
124 | 1,1,1,1,2,1,2,1,1
125 | 1,1,1,1,2,1,3,1,1
126 | 6,1,3,1,2,1,3,1,1
127 | 1,1,1,2,2,1,3,1,1
128 | 1,1,1,1,2,1,2,1,1
129 | 1,1,1,1,1,1,3,1,1
130 | 8,4,6,3,3,1,4,3,1
131 | 3,3,2,1,3,1,3,6,1
132 | 3,1,4,1,2,1,3,1,1
133 | 5,1,3,3,2,2,2,3,1
134 | 3,1,1,3,1,1,3,1,1
135 | 2,1,1,1,2,1,3,1,1
136 | 1,1,1,1,2,5,5,1,1
137 | 1,1,1,1,2,1,3,1,1
138 | 5,1,1,2,2,2,3,1,1
139 | 4,1,1,1,2,1,3,6,1
140 | 3,1,1,1,2,1,3,1,1
141 | 1,2,2,1,2,1,1,1,1
142 | 6,3,3,5,3,10,3,5,3
143 | 3,1,1,1,2,1,1,1,1
144 | 3,1,1,1,2,1,2,1,1
145 | 3,1,1,1,2,1,3,1,1
146 | 5,7,7,1,5,8,3,4,1
147 | 5,1,4,1,2,1,3,2,1
148 | 1,1,1,1,2,1,3,1,1
149 | 5,1,1,1,2,1,3,1,1
150 | 3,1,1,1,2,1,3,2,1
151 | 3,1,3,1,2,1,2,1,1
152 | 3,1,1,1,2,1,2,1,1
153 | 1,1,1,1,2,1,2,1,1
154 | 1,1,1,1,2,1,3,1,1
155 | 3,1,1,1,2,1,3,1,1
156 | 2,1,1,2,2,1,3,1,1
157 | 3,1,1,1,3,1,2,1,1
158 | 1,1,1,1,2,1,1,1,1
159 | 1,1,1,1,2,1,3,1,1
160 | 1,1,1,1,2,1,2,1,1
161 | 5,3,4,3,4,5,4,7,1
162 | 5,4,3,1,2,2,2,3,1
163 | 8,2,1,1,5,1,1,1,1
164 | 1,1,1,1,2,1,3,1,1
165 | 1,1,1,1,2,1,3,1,1
166 | 1,1,1,1,2,1,3,1,1
167 | 1,1,1,1,2,1,3,1,1
168 | 3,1,1,1,2,5,5,1,1
169 | 2,1,1,1,3,1,2,1,1
170 | 1,1,1,1,2,1,1,1,1
171 | 1,1,1,1,2,1,1,1,1
172 | 1,1,1,1,1,1,2,1,1
173 | 4,6,5,6,7,3,4,9,1
174 | 1,1,1,1,5,1,3,1,1
175 | 4,4,4,4,6,5,7,3,1
176 | 3,1,1,1,2,2,3,1,1
177 | 3,1,1,1,2,1,3,1,1
178 | 1,1,1,1,2,1,3,1,1
179 | 3,2,2,1,2,1,2,3,1
180 | 1,1,1,1,2,1,2,1,1
181 | 5,1,1,1,2,1,3,1,2
182 | 5,2,2,2,2,1,2,2,1
183 | 1,1,1,1,2,1,1,1,1
184 | 1,1,1,1,2,1,3,1,1
185 | 1,1,1,1,1,1,2,1,1
186 | 1,1,1,1,2,1,3,1,1
187 | 2,1,1,1,2,1,1,1,1
188 | 1,1,1,1,2,1,1,1,1
189 | 1,1,1,1,2,1,1,1,1
190 | 5,2,2,2,3,1,1,3,1
191 | 1,1,1,1,1,1,1,3,1
192 | 5,1,1,3,2,1,1,1,1
193 | 2,1,1,1,2,1,3,1,1
194 | 3,4,5,3,7,3,4,6,1
195 | 1,1,1,1,2,1,2,1,1
196 | 4,1,1,1,3,1,2,2,1
197 | 3,2,2,1,4,3,2,1,1
198 | 4,4,4,2,2,3,2,1,1
199 | 2,1,1,1,2,1,3,1,1
200 | 2,1,1,1,2,1,2,1,1
201 | 1,1,3,1,2,1,1,1,1
202 | 1,1,3,1,1,1,2,1,1
203 | 4,3,2,1,3,1,2,1,1
204 | 1,1,3,1,2,1,1,1,1
205 | 4,1,2,1,2,1,2,1,1
206 | 5,1,1,2,2,1,2,1,1
207 | 3,1,2,1,2,1,2,1,1
208 | 1,1,1,1,2,1,1,1,1
209 | 1,1,1,1,2,1,2,1,1
210 | 1,1,1,1,1,1,2,1,1
211 | 3,1,1,4,3,1,2,2,1
212 | 5,3,4,1,4,1,3,1,1
213 | 1,1,1,1,2,1,1,1,1
214 | 3,2,2,2,2,1,3,2,1
215 | 2,1,1,1,2,1,1,1,1
216 | 2,1,1,1,2,1,1,1,1
217 | 3,3,2,2,3,1,1,2,3
218 | 5,3,3,2,3,1,3,1,1
219 | 2,1,1,1,2,1,2,2,1
220 | 5,1,1,1,3,2,2,2,1
221 | 1,1,1,2,2,1,2,1,1
222 | 3,1,1,1,2,1,2,1,1
223 | 1,1,1,1,1,1,1,1,1
224 | 1,2,3,1,2,1,2,1,1
225 | 3,1,1,1,2,1,2,1,1
226 | 3,1,1,1,2,1,3,1,1
227 | 4,1,1,1,2,1,1,1,1
228 | 3,2,1,1,2,1,2,2,1
229 | 1,2,3,1,2,1,1,1,1
230 | 3,1,1,1,2,1,1,1,1
231 | 5,3,3,1,2,1,2,1,1
232 | 3,1,1,1,2,4,1,1,1
233 | 1,2,1,3,2,1,1,2,1
234 | 1,1,1,1,2,1,2,1,1
235 | 4,2,2,1,2,1,2,1,1
236 | 1,1,1,1,2,1,2,1,1
237 | 2,3,2,2,2,2,3,1,1
238 | 3,1,2,1,2,1,2,1,1
239 | 1,1,1,1,2,1,2,1,1
240 | 1,1,1,1,1,1,2,1,1
241 | 5,1,2,1,2,1,3,1,1
242 | 3,3,2,6,3,3,3,5,1
243 | 1,1,1,1,2,1,2,1,1
244 | 5,2,2,2,2,2,3,2,2
245 | 2,3,1,1,5,1,1,1,1
246 | 3,2,2,3,2,3,3,1,1
247 | 4,3,3,1,2,1,3,3,1
248 | 5,1,3,1,2,1,2,1,1
249 | 3,1,1,1,2,1,1,1,1
250 | 5,3,6,1,2,1,1,1,1
251 | 1,1,1,1,2,1,2,1,1
252 | 2,1,1,1,2,1,2,1,1
253 | 1,3,1,1,2,1,2,2,1
254 | 5,1,1,3,4,1,3,2,1
255 | 5,1,1,1,2,1,2,2,1
256 | 3,2,2,3,2,1,1,1,1
257 | 6,9,7,5,5,8,4,2,1
258 | 4,1,1,1,2,1,1,1,1
259 | 4,1,3,3,2,1,1,1,1
260 | 5,1,1,1,2,1,1,1,1
261 | 5,2,2,4,2,4,1,1,1
262 | 1,1,1,3,2,3,1,1,1
263 | 1,1,1,1,2,2,1,1,1
264 | 5,1,1,6,3,1,2,1,1
265 | 2,1,1,1,2,1,1,1,1
266 | 1,1,1,1,2,1,1,1,1
267 | 5,1,1,1,2,1,1,1,1
268 | 1,1,1,1,1,1,1,1,1
269 | 4,1,1,3,1,1,2,1,1
270 | 5,1,1,1,2,1,1,1,1
271 | 3,1,1,3,2,1,1,1,1
272 | 2,3,1,1,3,1,1,1,1
273 | 5,1,2,1,2,1,1,1,1
274 | 5,1,3,1,2,1,1,1,1
275 | 5,1,1,3,2,1,1,1,1
276 | 3,1,1,1,2,5,1,1,1
277 | 6,1,1,3,2,1,1,1,1
278 | 4,1,1,1,2,1,1,2,1
279 | 4,1,1,1,2,1,1,1,1
280 | 4,1,1,1,2,1,1,1,1
281 | 1,1,2,1,2,1,2,1,1
282 | 3,1,1,1,1,1,2,1,1
283 | 6,1,1,3,2,1,1,1,1
284 | 6,1,1,1,1,1,1,1,1
285 | 4,1,1,1,2,1,1,1,1
286 | 5,1,1,1,2,1,1,1,1
287 | 3,1,1,1,2,1,1,1,1
288 | 4,1,2,1,2,1,1,1,1
289 | 4,1,1,1,2,1,1,1,1
290 | 5,2,1,1,2,1,1,1,1
291 | 5,1,1,1,1,1,1,1,1
292 | 5,3,2,4,2,1,1,1,1
293 | 5,1,2,1,2,1,1,1,1
294 | 1,1,1,3,1,3,1,1,1
295 | 3,1,1,1,1,1,2,1,1
296 | 1,1,1,1,2,1,1,1,1
297 | 4,1,1,1,1,1,2,1,1
298 | 5,1,2,10,4,5,2,1,1
299 | 3,1,1,1,1,1,2,1,1
300 | 1,1,1,1,1,1,1,1,1
301 | 4,2,1,1,2,1,1,1,1
302 | 4,1,1,1,2,1,2,1,1
303 | 4,1,1,1,2,1,2,1,1
304 | 6,1,1,1,2,1,3,1,1
305 | 4,1,1,1,2,1,2,1,1
306 | 4,1,1,2,2,1,2,1,1
307 | 4,1,1,1,2,1,3,1,1
308 | 1,1,1,1,2,1,1,1,1
309 | 3,3,1,1,2,1,1,1,1
310 | 1,1,1,1,2,4,1,1,1
311 | 5,1,1,1,2,1,1,1,1
312 | 2,1,1,1,2,1,1,1,1
313 | 1,1,1,1,2,1,1,1,1
314 | 5,1,1,1,2,1,2,1,1
315 | 5,1,1,1,2,1,1,1,1
316 | 3,1,1,1,1,1,2,1,1
317 | 1,1,1,1,1,1,1,1,1
318 | 1,1,1,1,1,1,2,1,1
319 | 3,1,2,2,2,1,1,1,1
320 | 1,1,1,1,3,1,1,1,1
321 | 4,1,1,1,3,1,1,1,1
322 | 3,1,1,1,2,1,2,1,1
323 | 3,1,1,2,2,1,1,1,1
324 | 4,1,1,1,2,1,1,1,1
325 | 4,1,1,1,2,1,3,1,1
326 | 6,1,3,2,2,1,1,1,1
327 | 4,1,1,1,1,1,2,1,1
328 | 4,2,2,1,2,1,2,1,1
329 | 1,1,1,1,1,1,3,1,1
330 | 3,1,1,1,2,1,2,1,1
331 | 2,1,1,1,2,1,2,1,1
332 | 1,1,3,2,2,1,3,1,1
333 | 5,1,1,1,2,1,3,1,1
334 | 5,1,2,1,2,1,3,1,1
335 | 4,1,1,1,2,1,2,1,1
336 | 6,1,1,1,2,1,2,1,1
337 | 5,1,1,1,2,2,2,1,1
338 | 3,1,1,1,2,1,1,1,1
339 | 5,3,1,1,2,1,1,1,1
340 | 4,1,1,1,2,1,2,1,1
341 | 2,1,3,2,2,1,2,1,1
342 | 5,1,1,1,2,1,2,1,1
343 | 2,1,1,1,1,1,1,1,1
344 | 3,1,1,1,1,1,1,1,1
345 | 3,1,1,1,2,1,2,1,1
346 | 1,1,1,1,2,1,3,1,1
347 | 3,2,2,2,2,1,4,2,1
348 | 4,4,2,1,2,5,2,1,2
349 | 3,1,1,1,2,1,1,1,1
350 | 4,3,1,1,2,1,4,8,1
351 | 5,2,2,2,1,1,2,1,1
352 | 5,1,1,3,2,1,1,1,1
353 | 2,1,1,1,2,1,2,1,1
354 | 5,1,1,1,2,1,2,1,1
355 | 5,1,1,1,2,1,3,1,1
356 | 5,1,1,1,2,1,3,1,1
357 | 1,1,1,1,2,1,3,1,1
358 | 3,1,1,1,2,1,2,1,1
359 | 4,1,1,1,2,1,3,2,1
360 | 3,1,2,1,2,1,3,1,1
361 | 4,1,1,1,2,3,2,1,1
362 | 3,1,1,1,2,1,2,1,1
363 | 1,1,1,1,2,1,2,1,1
364 | 5,1,2,1,2,1,3,1,1
365 | 5,1,1,1,2,1,2,1,1
366 | 1,1,1,1,2,1,2,1,1
367 | 1,1,1,1,2,1,2,1,1
368 | 1,1,1,1,2,1,3,1,1
369 | 5,1,2,1,2,1,2,1,1
370 | 3,1,1,1,2,1,1,1,1
371 | 5,1,1,6,3,1,1,1,1
372 | 1,1,1,1,2,1,1,1,1
373 | 5,1,1,1,2,1,2,2,1
374 | 5,1,1,1,2,1,1,1,1
375 | 5,1,2,1,2,1,1,1,1
376 | 5,1,1,1,2,1,2,1,1
377 | 4,1,2,1,2,1,2,1,1
378 | 5,1,3,1,2,1,3,1,1
379 | 3,1,1,1,2,1,2,1,1
380 | 5,2,4,1,1,1,1,1,1
381 | 3,1,1,1,2,1,2,1,1
382 | 1,1,1,1,1,1,2,1,1
383 | 4,1,1,1,2,1,2,1,1
384 | 4,1,1,2,2,1,1,1,1
385 | 1,1,1,1,2,1,1,1,1
386 | 5,1,1,1,2,1,1,1,1
387 | 2,3,1,1,2,1,2,1,1
388 | 2,1,1,1,1,1,2,1,1
389 | 4,1,3,1,2,1,2,1,1
390 | 3,1,1,1,2,1,2,1,1
391 | 1,1,1,1,1,1,1,1,1
392 | 4,1,1,1,2,1,2,1,1
393 | 5,1,1,1,2,1,2,1,1
394 | 3,1,1,1,2,1,2,1,1
395 | 6,3,3,3,3,2,6,1,1
396 | 7,1,2,3,2,1,2,1,1
397 | 1,1,1,1,2,1,1,1,1
398 | 5,1,1,2,1,1,2,1,1
399 | 3,1,3,1,3,4,1,1,1
400 | 2,1,1,1,2,5,1,1,1
401 | 2,1,1,1,2,1,1,1,1
402 | 4,1,1,1,2,1,1,1,1
403 | 6,2,3,1,2,1,1,1,1
404 | 5,1,1,1,2,1,2,1,1
405 | 1,1,1,1,2,1,1,1,1
406 | 3,1,1,1,2,1,1,1,1
407 | 3,1,4,1,2,1,1,1,1
408 | 4,2,4,3,2,2,2,1,1
409 | 4,1,1,1,2,1,1,1,1
410 | 5,1,1,3,2,1,1,1,1
411 | 4,1,1,3,2,1,1,1,1
412 | 3,1,1,1,2,1,2,1,1
413 | 3,1,1,1,2,1,2,1,1
414 | 1,1,1,1,2,1,1,1,1
415 | 2,1,1,1,2,1,1,1,1
416 | 3,1,1,1,2,1,2,1,1
417 | 1,2,2,1,2,1,1,1,1
418 | 1,1,1,3,2,1,1,1,1
419 | 3,1,1,1,2,1,2,1,1
420 | 3,1,1,2,3,4,1,1,1
421 | 1,2,1,3,2,1,2,1,1
422 | 5,1,1,1,2,1,2,2,1
423 | 4,1,1,1,2,1,2,1,1
424 | 3,1,1,1,2,1,3,1,1
425 | 3,1,1,1,2,1,2,1,1
426 | 5,1,1,1,2,1,2,1,1
427 | 5,4,5,1,8,1,3,6,1
428 | 1,1,1,1,2,1,1,1,1
429 | 1,1,1,1,2,1,2,1,1
430 | 4,1,1,1,2,1,3,1,1
431 | 1,1,3,1,2,1,2,1,1
432 | 1,1,3,1,2,1,2,1,1
433 | 3,1,1,3,2,1,2,1,1
434 | 1,1,1,1,2,1,1,1,1
435 | 5,2,2,2,2,1,1,1,2
436 | 3,1,1,1,2,1,3,1,1
437 | 3,2,1,2,2,1,3,1,1
438 | 2,1,1,1,2,1,3,1,1
439 | 5,3,2,1,3,1,1,1,1
440 | 1,1,1,1,2,1,2,1,1
441 | 4,1,4,1,2,1,1,1,1
442 | 1,1,2,1,2,1,2,1,1
443 | 5,1,1,1,2,1,1,1,1
444 | 1,1,1,1,2,1,1,1,1
445 | 2,1,1,1,2,1,1,1,1
446 | 5,1,1,1,2,1,3,2,1
447 | 1,1,1,1,2,1,1,1,1
448 | 1,1,1,1,2,1,1,1,1
449 | 1,1,1,1,2,1,1,1,1
450 | 1,1,1,1,2,1,1,1,1
451 | 3,1,1,1,2,1,2,3,1
452 | 4,1,1,1,2,1,1,1,1
453 | 1,1,1,1,2,1,1,1,8
454 | 1,1,1,3,2,1,1,1,1
455 | 3,1,1,1,2,1,1,1,1
456 | 3,1,1,1,2,1,2,1,2
457 | 3,1,1,1,3,2,1,1,1
458 | 2,1,1,1,2,1,1,1,1
459 |
--------------------------------------------------------------------------------
/data/wbc/y.csv:
--------------------------------------------------------------------------------
1 | 0
2 | 0
3 | 0
4 | 1
5 | 0
6 | 1
7 | 0
8 | 1
9 | 0
10 | 0
11 | 0
12 | 0
13 | 0
14 | 0
15 | 0
16 | 0
17 | 0
18 | 0
19 | 0
20 | 0
21 | 0
22 | 0
23 | 0
24 | 0
25 | 0
26 | 0
27 | 1
28 | 1
29 | 0
30 | 0
31 | 0
32 | 0
33 | 0
34 | 0
35 | 0
36 | 0
37 | 1
38 | 0
39 | 0
40 | 0
41 | 0
42 | 0
43 | 1
44 | 0
45 | 0
46 | 1
47 | 0
48 | 0
49 | 0
50 | 0
51 | 0
52 | 0
53 | 0
54 | 0
55 | 0
56 | 0
57 | 0
58 | 0
59 | 1
60 | 0
61 | 0
62 | 0
63 | 0
64 | 0
65 | 0
66 | 0
67 | 0
68 | 0
69 | 1
70 | 0
71 | 0
72 | 0
73 | 0
74 | 0
75 | 0
76 | 0
77 | 0
78 | 0
79 | 0
80 | 0
81 | 0
82 | 0
83 | 0
84 | 0
85 | 1
86 | 0
87 | 0
88 | 0
89 | 0
90 | 0
91 | 0
92 | 0
93 | 0
94 | 1
95 | 0
96 | 0
97 | 0
98 | 0
99 | 0
100 | 0
101 | 0
102 | 0
103 | 0
104 | 0
105 | 0
106 | 0
107 | 0
108 | 0
109 | 0
110 | 0
111 | 0
112 | 0
113 | 0
114 | 0
115 | 0
116 | 0
117 | 0
118 | 0
119 | 0
120 | 0
121 | 0
122 | 0
123 | 0
124 | 0
125 | 0
126 | 0
127 | 0
128 | 0
129 | 0
130 | 1
131 | 1
132 | 0
133 | 0
134 | 0
135 | 0
136 | 1
137 | 0
138 | 0
139 | 1
140 | 0
141 | 0
142 | 0
143 | 0
144 | 0
145 | 0
146 | 0
147 | 0
148 | 0
149 | 0
150 | 0
151 | 0
152 | 0
153 | 0
154 | 0
155 | 0
156 | 0
157 | 0
158 | 0
159 | 0
160 | 0
161 | 0
162 | 0
163 | 1
164 | 0
165 | 0
166 | 0
167 | 0
168 | 1
169 | 0
170 | 0
171 | 0
172 | 0
173 | 0
174 | 1
175 | 0
176 | 0
177 | 0
178 | 0
179 | 0
180 | 0
181 | 0
182 | 0
183 | 0
184 | 0
185 | 0
186 | 0
187 | 0
188 | 0
189 | 0
190 | 0
191 | 0
192 | 0
193 | 0
194 | 0
195 | 0
196 | 0
197 | 0
198 | 0
199 | 0
200 | 0
201 | 0
202 | 0
203 | 0
204 | 0
205 | 0
206 | 0
207 | 0
208 | 0
209 | 0
210 | 0
211 | 0
212 | 0
213 | 0
214 | 0
215 | 0
216 | 0
217 | 0
218 | 0
219 | 0
220 | 0
221 | 0
222 | 0
223 | 0
224 | 0
225 | 0
226 | 0
227 | 0
228 | 0
229 | 0
230 | 0
231 | 0
232 | 0
233 | 0
234 | 0
235 | 0
236 | 0
237 | 0
238 | 0
239 | 0
240 | 0
241 | 0
242 | 0
243 | 0
244 | 0
245 | 1
246 | 0
247 | 0
248 | 0
249 | 0
250 | 0
251 | 0
252 | 0
253 | 0
254 | 0
255 | 0
256 | 0
257 | 1
258 | 0
259 | 0
260 | 0
261 | 0
262 | 0
263 | 0
264 | 0
265 | 0
266 | 0
267 | 0
268 | 0
269 | 0
270 | 0
271 | 0
272 | 0
273 | 0
274 | 0
275 | 0
276 | 0
277 | 0
278 | 0
279 | 0
280 | 0
281 | 0
282 | 0
283 | 0
284 | 0
285 | 0
286 | 0
287 | 0
288 | 0
289 | 0
290 | 0
291 | 0
292 | 0
293 | 0
294 | 1
295 | 0
296 | 0
297 | 0
298 | 1
299 | 0
300 | 0
301 | 0
302 | 0
303 | 0
304 | 0
305 | 0
306 | 0
307 | 0
308 | 0
309 | 0
310 | 0
311 | 0
312 | 0
313 | 0
314 | 0
315 | 0
316 | 0
317 | 0
318 | 0
319 | 0
320 | 0
321 | 0
322 | 0
323 | 0
324 | 0
325 | 0
326 | 0
327 | 0
328 | 0
329 | 0
330 | 0
331 | 0
332 | 0
333 | 0
334 | 0
335 | 0
336 | 0
337 | 0
338 | 0
339 | 0
340 | 0
341 | 0
342 | 0
343 | 0
344 | 0
345 | 0
346 | 0
347 | 0
348 | 0
349 | 0
350 | 1
351 | 0
352 | 0
353 | 0
354 | 0
355 | 0
356 | 0
357 | 0
358 | 0
359 | 0
360 | 0
361 | 0
362 | 0
363 | 0
364 | 0
365 | 0
366 | 0
367 | 0
368 | 0
369 | 0
370 | 0
371 | 0
372 | 0
373 | 0
374 | 0
375 | 0
376 | 0
377 | 0
378 | 0
379 | 0
380 | 0
381 | 0
382 | 0
383 | 0
384 | 0
385 | 0
386 | 0
387 | 0
388 | 0
389 | 0
390 | 0
391 | 0
392 | 0
393 | 0
394 | 0
395 | 0
396 | 0
397 | 0
398 | 0
399 | 0
400 | 0
401 | 0
402 | 0
403 | 0
404 | 0
405 | 0
406 | 0
407 | 0
408 | 0
409 | 0
410 | 0
411 | 0
412 | 0
413 | 0
414 | 0
415 | 0
416 | 0
417 | 0
418 | 0
419 | 0
420 | 0
421 | 0
422 | 0
423 | 0
424 | 0
425 | 0
426 | 0
427 | 0
428 | 0
429 | 0
430 | 0
431 | 0
432 | 0
433 | 0
434 | 0
435 | 0
436 | 0
437 | 0
438 | 0
439 | 0
440 | 0
441 | 0
442 | 0
443 | 0
444 | 0
445 | 0
446 | 0
447 | 0
448 | 0
449 | 0
450 | 0
451 | 0
452 | 0
453 | 1
454 | 0
455 | 0
456 | 0
457 | 0
458 | 0
459 |
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import numpy as np
4 | import argparse
5 | from utils import detect_lof, detect_isoforest
6 | from outlier_interpreter import OutlierInterpreter
7 |
8 |
9 | def main(args):
10 | # read data
11 | num_inst, dim, X = read_data(args)
12 | X = np.array(X, dtype=float)
13 |
14 | # measure degree of outlierness
15 | labels_otlr = detect_outliers(args, X)
16 |
17 | # interpret outliers
18 |     sgnf_prior = 1  # no prior knowledge of feature importance for the WBC data, so use the uniform scalar weight 1
19 |     interpreter = OutlierInterpreter(X, labels_otlr, args.ratio_nbr,
20 |                                      AUG=args.AUG, MIN_CLUSTER_SIZE=args.MIN_CLUSTER_SIZE, MAX_NUM_CLUSTER=args.MAX_NUM_CLUSTER,
21 |                                      VAL_TIMES=args.VAL_TIMES, C_SVM=args.C_SVM, THRE_PS=args.THRE_PS, DEFK=args.DEFK, RESOLUTION=args.RESOLUTION)
22 | ids_target = np.where(labels_otlr == 1)[0] # sample id of outliers
23 | importance_attr, outlierness = interpreter.interpret_outliers(ids_target, sgnf_prior, int_flag=1)
24 |
25 | print("Sample ID of outliers:", '\n', ids_target)
26 | print("Outlying degree of attributes:", '\n', importance_attr)
27 | print("Outlierness scores re-estimated:", '\n', outlierness)
28 |
29 | return ids_target, importance_attr, outlierness
30 |
31 |
32 | def read_data(args):
33 | data_name = args.dataset
34 | fn = os.path.join(os.path.dirname(os.getcwd()), "data", data_name, "X.csv")
35 | X = np.genfromtxt(fn, delimiter=',', dtype=int)
36 | num_inst = X.shape[0]
37 | dim = X.shape[1]
38 |
39 | return num_inst, dim, X
40 |
41 |
42 | def detect_outliers(args, X):
43 | # A larger outlier score means greater outlierness
44 | if args.detector == 'lof':
45 | labels_otlr = detect_lof(args, X)
46 | labels_otlr = np.array(0.5*(1-labels_otlr), dtype=int) # outlier label = 1, normal label = 0
47 | #print(labels_otlr)
48 | elif args.detector == 'isoforest':
49 | labels_otlr = detect_isoforest(args, X)
50 | labels_otlr = np.array(0.5 * (1 - labels_otlr), dtype=int) # outlier label = 1, normal label = 0
51 | #print(labels_otlr)
52 | else:
53 |         print("The specified detector type is not supported in the current implementation.")
54 | sys.exit()
55 |
56 | return labels_otlr
57 |
58 |
59 | if __name__ == "__main__":
60 | parser = argparse.ArgumentParser()
61 | parser.add_argument('--dataset', type=str, default='wbc', help='which dataset to use')
62 |     parser.add_argument('--AUG', type=float, default=10, help='value of the augmented feature appended to each instance')
63 | parser.add_argument('--detector', type=str, default='lof', help='which outlier detector to use')
64 | parser.add_argument('--ratio_nbr', type=float, default=0.1,
65 | help='controls number of neighbors to use in kneighbors queries')
66 | parser.add_argument('--MIN_CLUSTER_SIZE', type=int, default=5,
67 | help='minimum number of samples required in a cluster')
68 | parser.add_argument('--MAX_NUM_CLUSTER', type=int, default=4, help='maximum number of clusters for each context')
69 | parser.add_argument('--VAL_TIMES', type=int, default=10, help='number of iterations for computing prediction strength')
70 | parser.add_argument('--C_SVM', type=float, default=1., help='penalty parameter for svm')
71 | parser.add_argument('--DEFK', type=int, default=0,
72 | help='pre-determined number of clusters in each context (use prediction strength if 0)')
73 | parser.add_argument('--THRE_PS', type=float, default=0.85,
74 | help='threshold for deciding the best cluster value in prediction strength')
75 | parser.add_argument('--RESOLUTION', type=float, default=0.05, help='attribute resolution')
76 | args = parser.parse_args()
77 |
78 | main(args)
--------------------------------------------------------------------------------
/src/outlier_interpreter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import numpy as np
4 | from sklearn.neighbors import NearestNeighbors
5 | from sklearn.cluster import KMeans
6 | from sklearn import svm
7 | from prediction_strength import optimalK
8 |
9 | class OutlierInterpreter(object):
10 | def __init__(self, data, inds_otlr, nbrs_ratio,
11 | AUG=1.0, MIN_CLUSTER_SIZE=5, MAX_NUM_CLUSTER=4, VAL_TIMES=10, C_SVM=1.,
12 | RESOLUTION=0.05, THRE_PS=0.85, DEFK=0):
13 | '''
14 | data: Data matrix, each row represents one instance
15 | inds_otlr: A vector with each entry telling whether this instance is outlier (1) or not (0)
16 | nbrs_ratio: The ratio of normal instances as the context for each outlier
17 |         AUG: Constant value of the augmented feature appended to each instance (also used as the SVM intercept scaling)
18 | MIN_CLUSTER_SIZE: Minimum number of nodes in each cluster
19 | MAX_NUM_CLUSTER: Maximum number of clusters considered in prediction strength computation
20 | VAL_TIMES: Number of iterations for computing prediction strength
21 |         C_SVM: Penalty parameter C of the SVM (ideally tuned via validation)
22 | DEFK: Predefined number of clusters in each context. Value 0 means using Prediction Strength to estimate it.
23 | '''
24 |
25 | self.data = data
26 | self.inds_otlr = inds_otlr
27 | self.AUG = float(AUG)
28 |
29 | self.num_inst = data.shape[0]
30 | self.num_feat = data.shape[1]
31 | self.num_nbrs = int(nbrs_ratio * self.num_inst)
32 |
33 | self.MIN_CLUSTER_SIZE = MIN_CLUSTER_SIZE
34 | self.MAX_NUM_CLUSTER = MAX_NUM_CLUSTER
35 | self.VAL_TIMES = VAL_TIMES
36 | self.C_SVM = C_SVM
37 | self.RESOLUTION = RESOLUTION
38 | self.THRE_PS = THRE_PS
39 | self.DEFK = DEFK
40 |
41 | # normal instances
42 | self.data_normal = self.data[np.where(self.inds_otlr == 0)[0]]
43 |
44 | # nearest nbrs object based on normal instances
45 | self.nbrs = NearestNeighbors(n_neighbors=self.num_nbrs)
46 | self.nbrs.fit(self.data_normal)
47 |
48 |
49 | def interpret_outliers(self, ids_target, sgnf_vec, int_flag=0):
50 | """
51 | ids_target: Indices of target outliers
52 | sgnf_vec: A vector indicating the importance of each attribute, as prior knowledge
53 |         int_flag: Whether the attributes are integer-valued (synthetic samples are rounded if so)
54 |         :return: A 2D array of attribute importance scores (one row per target outlier), and a dict mapping each outlier ID to its re-estimated outlierness
55 | """
56 |
57 |         # Append a zero importance weight for the augmented feature
58 | if isinstance(sgnf_vec, int) or isinstance(sgnf_vec, float):
59 | sgnf_vec = np.hstack((np.ones(self.num_feat), 0))
60 | else:
61 | sgnf_vec = np.hstack((sgnf_vec, [0]))
62 |
63 | # Interpret each target outlier
64 | oid_devt_dict = dict() # id-score tuples
65 | score_attr_mat = []
66 | cnt = 0
67 | for i in ids_target:
68 | if cnt % 20 == 0:
69 | print(cnt)
70 | cnt += 1
71 |
72 | # Do clustering on the context, build one classifier for each cluster
73 | nums_c, clfs, cluster_attr_scale = self.cluster_context(i, int_flag)
74 |
75 | # Calculate outlierness score
76 | devt_i = self.CalculateOutlierness(i, clfs, nums_c, sgnf_vec)
77 | oid_devt_dict[i] = devt_i
78 |
79 | # Find outlying attributes
80 | score_attr = np.zeros(self.num_feat)
81 | for num_c, clf in zip(nums_c, clfs):
82 | score_attr += num_c * np.abs(clf.coef_[0]) # weighted by the normal cluster size
83 | score_attr /= float(np.sum(nums_c))
84 | score_attr /= np.sum(score_attr) # relative importance
85 | score_attr_mat.append(copy.copy(score_attr))
86 | #print(score_attr)
87 |
88 | return np.array(score_attr_mat), oid_devt_dict
89 |
90 |
91 | def cluster_context(self, id_outlier, int_flag):
92 | # find the context of the outlier
93 | dist_btwn, otlr_nbrs = self.nbrs.kneighbors([self.data[id_outlier]])
94 | dist_btwn, otlr_nbrs = dist_btwn[0], self.data_normal[otlr_nbrs[0], :]
95 | #print(self.data[id_outlier])
96 | #print(otlr_nbrs)
97 |
98 | # choose the number of clusters in the context
99 | if self.DEFK == 0:
100 | k_best = optimalK(otlr_nbrs, self.VAL_TIMES, self.MAX_NUM_CLUSTER, self.THRE_PS)
101 | else:
102 | k_best = self.DEFK
103 |         k_best = min(k_best+1, self.MAX_NUM_CLUSTER)  # empirically, a slightly larger K works better
104 | # print('Best k:', k_best)
105 |
106 |         # clustering the context
107 | kmeans = KMeans(n_clusters=k_best, random_state=0).fit(otlr_nbrs)
108 | label_nbrs = kmeans.labels_
109 |
110 | clfs = []
111 | nbrs_mean = []
112 | nums_c = []
113 | cluster_attr_scale = []
114 |
115 | # build a linear classifier for each cluster of nbrs
116 | for c in range(k_best):
117 | # indices for instances in cluster c
118 | inds_c = np.where(label_nbrs == c)[0]
119 |
120 | # the cluster cannot be too small
121 | if np.size(inds_c) < self.MIN_CLUSTER_SIZE:
122 | continue
123 | nums_c.append(len(inds_c))
124 |
125 | # instances for cluster c
126 | otlr_nbrs_c = otlr_nbrs[inds_c, :]
127 | dist_btwn_c = dist_btwn[inds_c]
128 |
129 | # distance property of cluster c
130 | cluster_attr_scale.append(np.hstack((np.max(otlr_nbrs_c, axis=0) - np.min(otlr_nbrs_c, axis=0), 0))) # scale for each attr
131 |
132 | # synthetic sampling to build two classes
133 | insts_c0 = self.SyntheticSampling(otlr_nbrs_c, self.data[id_outlier], int_flag)
134 |
135 | insts_c1 = otlr_nbrs_c
136 | clf = self.SVCInterpreter(insts_c0, insts_c1)
137 |
138 | clfs.append(clf)
139 | nbrs_mean.append(np.average(insts_c1, axis=0))
140 |
141 | return nums_c, clfs, cluster_attr_scale
142 |
143 |
144 | def SyntheticSampling(self, insts, otlr, int_flag):
145 | '''
146 | Expand the outlier into a class.
147 |
148 | insts: normal instances
149 | otlr: the outlier instance
150 |         int_flag: whether to round the synthetic samples to integers
151 |         :return: the synthetic outlier class, i.e. the outlier stacked with the
152 |                  samples generated around it
153 | '''
154 |
155 | num_c0_new = insts.shape[0] - 1
156 | coeff_c0_new = np.random.rand(num_c0_new, insts.shape[0]) # transformation matrix for synthetic sampling
157 | nbrs_local = NearestNeighbors(n_neighbors=1).fit(insts)
158 | min_dist_to_nbr = nbrs_local.kneighbors([otlr])[0][0, 0]/insts.shape[1]
159 |
160 | for r in range(coeff_c0_new.shape[0]):
161 | coeff_c0_new[r, :] /= sum(coeff_c0_new[r, :])
162 | insts_c0_new = np.dot(coeff_c0_new, insts - np.dot(np.ones((insts.shape[0], 1)), [otlr]))
163 | for r in range(insts_c0_new.shape[0]): # shrink to prevent overlap
164 | insts_c0_new[r, :] *= (0.2 * np.random.rand(1)[0] * min_dist_to_nbr)
165 | insts_c0_new += np.dot(np.ones((num_c0_new, 1)), [otlr]) # origin + shift
166 | if int_flag:
167 | insts_c0_new = np.round(insts_c0_new)
168 | insts_c0 = np.vstack((otlr, insts_c0_new))
169 |
170 | return insts_c0
171 |
172 |
173 | def SVCInterpreter(self, insts_c0, insts_c1):
174 | # classification between normal instances and outliers, where outliers have negative output
175 |
176 | clf = svm.LinearSVC(penalty='l1', C=self.C_SVM, dual=False, intercept_scaling=self.AUG)
177 | X_c = np.vstack((insts_c0, insts_c1))
178 | y_c = np.hstack((np.zeros(insts_c0.shape[0]), np.ones(insts_c1.shape[0])))
179 | clf.fit(X_c, y_c)
180 | #print(insts_c1)
181 | #print(insts_c0)
182 |
183 | return clf
184 |
185 |
186 | def CalculateOutlierness(self, id_outlier, clfs, nums_c, sgnf_vec):
187 | otlr = self.data[id_outlier]
188 |
189 | devt_overall = 0.
190 | for c in range(len(nums_c)):
191 | # distance to the boundary
192 | otlr_aug = np.hstack((otlr, self.AUG))
193 | w = np.hstack((clfs[c].coef_[0], clfs[c].intercept_[0]/self.AUG))
194 | w_a = np.hstack((clfs[c].coef_[0], 0))
195 | dist = -min(0, np.inner(otlr_aug, w))/np.linalg.norm(w_a)
196 |
197 | # rescale deviation according to attributes' importance
198 | devt = np.linalg.norm(np.multiply(dist * w_a / np.linalg.norm(w_a), sgnf_vec))
199 | if np.isnan(devt):
200 | devt = 0.
201 |
202 | # weighted by the opponent cluster size
203 | devt_overall += devt * nums_c[c]
204 |
205 | devt_overall /= sum(nums_c)
206 |
207 | return devt_overall
--------------------------------------------------------------------------------
/src/prediction_strength.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.neighbors import NearestNeighbors
5 | from sklearn.cluster import KMeans
6 | from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn
7 |
8 |
9 | def ClosestCenter(point, centroids):
10 | # Find the closest center over all centroids
11 | min_index = -1
12 | min_dist = float('inf')
13 | for i in range(len(centroids)):
14 | center = centroids[i]
15 | dist_cur = np.linalg.norm(point - center)
16 | if dist_cur < min_dist:
17 | min_index = i
18 | min_dist = dist_cur
19 |
20 | return min_index
21 |
22 |
23 | def PredictionStrength(data_test, test_labels, train_centers, c):
24 | # Compute prediction strength under c clusters
25 | pred_strength = np.zeros(c)
26 | for cc in range(c):
27 | num_cc = test_labels.tolist().count(cc)
28 | count = 0.
29 | for i in range(len(test_labels)-1):
30 | for j in range(i+1, len(test_labels)):
31 | if test_labels[i] == test_labels[j] == cc:
32 | pi = data_test[i]
33 | pj = data_test[j]
34 | if ClosestCenter(pi, train_centers) == ClosestCenter(pj, train_centers):
35 | count += 1
36 |
37 | if num_cc <= 1:
38 | pred_strength[cc] = float('inf')
39 | else:
40 | pred_strength[cc] = count/(num_cc * (num_cc-1)/2.)
41 |
42 | return min(pred_strength)
43 |
44 |
45 | def optimalK(data, num_fold, maxClusters=5, THRE_PS=0.90):
46 | # Find the best number of clusters using prediction strength
47 | num_data = data.shape[0]
48 | num_feat = data.shape[1]
49 |
50 | pred_strength_avg = np.zeros(maxClusters+1)
51 | for nf in range(num_fold):
52 | # Split into training and testing samples
53 | inds_train = np.random.choice(num_data, int(num_data*0.5), replace=False)
54 | inds_test = list(set(range(num_data)).difference(inds_train))
55 | data_train = data[inds_train]
56 | data_test = data[inds_test]
57 |
58 | pred_strength_cur = np.zeros(maxClusters+1)
59 | for c in range(1, maxClusters+1):
60 | train_cluster = KMeans(n_clusters=c).fit(data_train)
61 | test_cluster = KMeans(n_clusters=c).fit(data_test)
62 | pred_strength_cur[c] = PredictionStrength(data_test, test_cluster.labels_, train_cluster.cluster_centers_, c)
63 |
64 | pred_strength_avg += pred_strength_cur
65 |
66 | pred_strength_avg /= num_fold
67 | # print("Prediction Strength vec: ", pred_strength_avg)
68 |
69 | k_optimal = max([i for i,j in enumerate(pred_strength_avg) if j > THRE_PS])
70 |
71 | return k_optimal
72 |
73 |
74 | if __name__ == "__main__":
75 | x, y = make_blobs(24, n_features=5, centers=3)
76 | plt.scatter(x[:, 0], x[:, 1])
77 | plt.show()
78 |
79 | k = optimalK(x, 10)
80 | print('Optimal k is: ', k)
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.neighbors import LocalOutlierFactor
3 | from sklearn.ensemble import IsolationForest
4 |
5 | def detect_lof(args, X):
6 | num_inst = X.shape[0]
7 | num_nbr = int(num_inst * args.ratio_nbr)
8 | clf = LocalOutlierFactor(n_neighbors=num_nbr)
9 | y_pred = clf.fit_predict(X)
10 | outlier_scores = -clf.negative_outlier_factor_
11 |
12 | return y_pred
13 |
14 |
15 | def detect_isoforest(args, X):
16 | num_inst = X.shape[0]
17 |     clf = IsolationForest(max_samples=num_inst, random_state=0)  # the deprecated 'behaviour' argument was removed in newer scikit-learn
18 | clf.fit(X)
19 | y_pred = clf.predict(X)
20 | outlier_scores = -clf.decision_function(X)
21 |
22 | return y_pred
--------------------------------------------------------------------------------