├── README.md
├── data
│   └── wbc
│       ├── X.csv
│       └── y.csv
└── src
    ├── main.py
    ├── outlier_interpreter.py
    ├── prediction_strength.py
    └── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Contextual-Outlier-Interpretation
2 |
3 | This project provides an implementation of the [paper](https://www.ijcai.org/proceedings/2018/0341.pdf):
4 |
5 | > **Contextual Outlier Interpretation**<br>
6 | > Ninghao Liu, Donghwa Shin, Xia Hu<br>
7 | > IJCAI 2018
8 |
9 |
10 | ### Files in the folder
11 | - `data/`
12 | - `wbc/`: an example dataset used in outlier detection
13 | - `X.csv`: each line represents one instance;
14 | - `y.csv`: labels indicating whether each instance is an outlier (1) or not (0);
15 | - `src/`: implementations of the proposed outlier explanation method
16 | - `main.py`: runs the proposed method on the dataset
17 | - `outlier_interpreter.py`: implementation of the interpretation method
18 | - `prediction_strength.py`: estimates the number of clusters
19 | - `utils.py`: includes LOF and IsolationForest as outlier detectors
20 |
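21 | ### Usage
22 |
23 | Run the pipeline from the `src/` directory, e.g. `python main.py --dataset wbc --detector lof` (see the `argparse` options in `main.py` for all hyperparameters and their defaults). The script detects outliers with LOF or IsolationForest, then prints the outlier IDs, the per-attribute outlying degrees, and the re-estimated outlierness scores.
24 |
25 | The interpreter can also be used programmatically. Below is a minimal sketch, assuming it is run from `src/` with the same defaults as `main.py` (`ratio_nbr=0.1`, `AUG=10`); the relative data path and the `Namespace` shim are only for illustration:
26 |
27 | ```python
28 | import numpy as np
29 | from argparse import Namespace
30 |
31 | from utils import detect_lof
32 | from outlier_interpreter import OutlierInterpreter
33 |
34 | # load the WBC features (one instance per row)
35 | X = np.genfromtxt('../data/wbc/X.csv', delimiter=',', dtype=float)
36 |
37 | # LOF returns +1 for inliers and -1 for outliers; map to 0/1 labels
38 | y_pred = detect_lof(Namespace(ratio_nbr=0.1), X)
39 | labels_otlr = ((1 - y_pred) // 2).astype(int)
40 |
41 | # cluster each outlier's context, fit linear classifiers, and interpret
42 | interpreter = OutlierInterpreter(X, labels_otlr, 0.1, AUG=10)
43 | ids_outlier = np.where(labels_otlr == 1)[0]
44 | importance_attr, outlierness = interpreter.interpret_outliers(ids_outlier, 1, int_flag=1)
45 | ```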
--------------------------------------------------------------------------------
/data/wbc/X.csv:
--------------------------------------------------------------------------------
1 | 5,1,1,1,2,1,3,1,1
2 | 5,4,4,5,7,10,3,2,1
3 | 3,1,1,1,2,2,3,1,1
4 | 6,8,8,1,3,4,3,7,1
5 | 4,1,1,3,2,1,3,1,1
6 | 1,1,1,1,2,10,3,1,1
7 | 2,1,2,1,2,1,3,1,1
8 | 2,1,1,1,2,1,1,1,5
9 | 4,2,1,1,2,1,2,1,1
10 | 1,1,1,1,1,1,3,1,1
11 | 2,1,1,1,2,1,2,1,1
12 | 1,1,1,1,2,3,3,1,1
13 | 4,1,1,1,2,1,2,1,1
14 | 4,1,1,1,2,1,3,1,1
15 | 6,1,1,1,2,1,3,1,1
16 | 3,1,1,1,2,1,2,1,1
17 | 1,1,1,1,2,1,3,1,1
18 | 3,2,1,1,1,1,2,1,1
19 | 5,1,1,1,2,1,2,1,1
20 | 2,1,1,1,2,1,2,1,1
21 | 1,1,3,1,2,1,1,1,1
22 | 3,1,1,1,1,1,2,1,1
23 | 2,1,1,1,2,1,3,1,1
24 | 2,1,1,2,2,1,3,1,1
25 | 3,1,2,1,2,1,2,1,1
26 | 2,1,1,1,2,1,2,1,1
27 | 6,2,1,1,1,1,7,1,1
28 | 6,6,6,9,6,1,7,8,1
29 | 1,1,1,1,2,1,2,1,2
30 | 1,1,1,1,2,1,2,1,1
31 | 4,1,1,3,2,1,3,1,1
32 | 1,1,1,1,2,2,2,1,1
33 | 1,1,1,1,2,1,2,1,1
34 | 4,1,1,1,2,1,3,1,1
35 | 1,1,1,1,2,1,3,2,1
36 | 5,1,3,1,2,1,2,1,1
37 | 1,3,3,2,2,1,7,2,1
38 | 1,1,2,1,2,2,4,2,1
39 | 1,1,4,1,2,1,2,1,1
40 | 5,3,1,2,2,1,2,1,1
41 | 3,1,1,1,2,3,3,1,1
42 | 2,1,1,1,3,1,2,1,1
43 | 2,2,2,1,1,1,7,1,1
44 | 4,1,1,2,2,1,2,1,1
45 | 5,2,1,1,2,1,3,1,1
46 | 3,1,1,1,2,2,7,1,1
47 | 4,1,1,1,2,1,3,1,1
48 | 2,1,1,2,3,1,2,1,1
49 | 1,1,1,1,2,1,3,1,1
50 | 3,1,1,2,2,1,1,1,1
51 | 4,1,1,1,2,1,3,1,1
52 | 1,1,1,1,2,1,2,1,1
53 | 2,1,1,1,2,1,3,1,1
54 | 1,1,1,1,2,1,3,1,1
55 | 2,1,1,2,2,1,1,1,1
56 | 5,1,1,1,2,1,3,1,1
57 | 4,1,2,1,2,1,3,1,1
58 | 1,1,1,1,2,1,2,3,1
59 | 1,3,1,2,2,2,5,3,2
60 | 3,3,2,1,2,3,3,1,1
61 | 1,1,1,1,2,5,1,1,1
62 | 8,3,3,1,2,2,3,2,1
63 | 1,1,1,1,4,3,1,1,1
64 | 3,2,1,1,2,2,3,1,1
65 | 1,1,2,2,2,1,3,1,1
66 | 4,2,1,1,2,2,3,1,1
67 | 1,1,1,1,2,1,2,1,1
68 | 3,1,1,1,2,1,3,1,1
69 | 1,1,1,1,10,1,1,1,1
70 | 5,1,3,1,2,1,2,1,1
71 | 2,1,1,1,2,1,3,1,1
72 | 3,1,1,1,2,1,2,2,1
73 | 3,1,1,1,3,1,2,1,1
74 | 5,1,1,1,2,2,3,3,1
75 | 4,1,1,1,2,1,2,1,1
76 | 3,1,1,1,2,1,1,1,1
77 | 4,1,2,1,2,1,2,1,1
78 | 1,1,1,1,1,1,2,1,1
79 | 3,1,1,1,2,1,1,1,1
80 | 2,1,1,1,2,1,1,1,1
81 | 1,1,1,1,2,5,1,1,1
82 | 2,1,1,1,2,1,2,1,1
83 | 1,1,3,1,2,1,2,1,1
84 | 1,1,1,1,3,2,2,1,1
85 | 3,1,1,3,8,1,5,8,1
86 | 1,1,1,1,1,1,3,1,1
87 | 4,1,1,1,2,3,1,1,1
88 | 1,1,1,1,2,1,1,1,1
89 | 1,2,2,1,2,1,2,1,1
90 | 2,1,1,1,2,1,3,1,1
91 | 1,1,2,1,3,1,1,1,1
92 | 4,1,1,1,2,1,3,2,1
93 | 3,1,1,1,2,1,3,1,1
94 | 1,1,1,2,1,3,1,1,7
95 | 5,1,1,1,2,2,3,1,1
96 | 4,1,1,1,2,2,3,2,1
97 | 3,1,1,1,2,1,3,1,1
98 | 1,1,1,2,1,1,1,1,1
99 | 3,1,1,1,2,1,1,1,1
100 | 1,1,1,1,2,1,3,1,1
101 | 1,1,1,1,2,1,2,1,1
102 | 2,1,1,1,2,1,3,1,1
103 | 4,1,1,1,2,1,3,1,1
104 | 1,1,1,1,1,1,3,1,1
105 | 1,1,1,1,2,1,1,1,1
106 | 6,1,1,1,2,1,3,1,1
107 | 2,1,1,1,1,1,3,1,1
108 | 1,2,3,1,2,1,3,1,1
109 | 5,1,1,1,2,1,2,1,1
110 | 1,1,1,1,2,1,3,1,1
111 | 3,1,1,1,2,1,3,1,1
112 | 4,1,1,1,2,1,3,1,1
113 | 8,4,4,5,4,7,7,8,2
114 | 5,1,1,4,2,1,3,1,1
115 | 1,1,1,1,2,1,1,1,1
116 | 3,1,1,1,2,1,2,1,1
117 | 1,1,1,1,2,1,3,1,1
118 | 5,1,1,1,2,1,3,1,1
119 | 1,1,1,1,2,1,3,1,1
120 | 1,1,1,1,1,1,3,1,1
121 | 1,1,1,1,1,1,3,1,1
122 | 5,1,1,1,1,1,3,1,1
123 | 1,1,1,1,2,1,3,1,1
124 | 1,1,1,1,2,1,2,1,1
125 | 1,1,1,1,2,1,3,1,1
126 | 6,1,3,1,2,1,3,1,1
127 | 1,1,1,2,2,1,3,1,1
128 | 1,1,1,1,2,1,2,1,1
129 | 1,1,1,1,1,1,3,1,1
130 | 8,4,6,3,3,1,4,3,1
131 | 3,3,2,1,3,1,3,6,1
132 | 3,1,4,1,2,1,3,1,1
133 | 5,1,3,3,2,2,2,3,1
134 | 3,1,1,3,1,1,3,1,1
135 | 2,1,1,1,2,1,3,1,1
136 | 1,1,1,1,2,5,5,1,1
137 | 1,1,1,1,2,1,3,1,1
138 | 5,1,1,2,2,2,3,1,1
139 | 4,1,1,1,2,1,3,6,1
140 | 3,1,1,1,2,1,3,1,1
141 | 1,2,2,1,2,1,1,1,1
142 | 6,3,3,5,3,10,3,5,3
143 | 3,1,1,1,2,1,1,1,1
144 | 3,1,1,1,2,1,2,1,1
145 | 3,1,1,1,2,1,3,1,1
146 | 5,7,7,1,5,8,3,4,1
147 | 5,1,4,1,2,1,3,2,1
148 | 1,1,1,1,2,1,3,1,1
149 | 5,1,1,1,2,1,3,1,1
150 | 3,1,1,1,2,1,3,2,1
151 | 3,1,3,1,2,1,2,1,1
152 | 3,1,1,1,2,1,2,1,1
153 | 1,1,1,1,2,1,2,1,1
154 | 1,1,1,1,2,1,3,1,1
155 | 3,1,1,1,2,1,3,1,1
156 | 2,1,1,2,2,1,3,1,1
157 | 3,1,1,1,3,1,2,1,1
158 | 1,1,1,1,2,1,1,1,1
159 | 1,1,1,1,2,1,3,1,1
160 | 1,1,1,1,2,1,2,1,1
161 | 5,3,4,3,4,5,4,7,1
162 | 5,4,3,1,2,2,2,3,1
163 | 8,2,1,1,5,1,1,1,1
164 | 1,1,1,1,2,1,3,1,1
165 | 1,1,1,1,2,1,3,1,1
166 | 1,1,1,1,2,1,3,1,1
167 | 1,1,1,1,2,1,3,1,1
168 | 3,1,1,1,2,5,5,1,1
169 | 2,1,1,1,3,1,2,1,1
170 | 1,1,1,1,2,1,1,1,1
171 | 1,1,1,1,2,1,1,1,1
172 | 1,1,1,1,1,1,2,1,1
173 | 4,6,5,6,7,3,4,9,1
174 | 1,1,1,1,5,1,3,1,1
175 | 4,4,4,4,6,5,7,3,1
176 | 3,1,1,1,2,2,3,1,1
177 | 3,1,1,1,2,1,3,1,1
178 | 1,1,1,1,2,1,3,1,1
179 | 3,2,2,1,2,1,2,3,1
180 | 1,1,1,1,2,1,2,1,1
181 | 5,1,1,1,2,1,3,1,2
182 | 5,2,2,2,2,1,2,2,1
183 | 1,1,1,1,2,1,1,1,1
184 | 1,1,1,1,2,1,3,1,1
185 | 1,1,1,1,1,1,2,1,1
186 | 1,1,1,1,2,1,3,1,1
187 | 2,1,1,1,2,1,1,1,1
188 | 1,1,1,1,2,1,1,1,1
189 | 1,1,1,1,2,1,1,1,1
190 | 5,2,2,2,3,1,1,3,1
191 | 1,1,1,1,1,1,1,3,1
192 | 5,1,1,3,2,1,1,1,1
193 | 2,1,1,1,2,1,3,1,1
194 | 3,4,5,3,7,3,4,6,1
195 | 1,1,1,1,2,1,2,1,1
196 | 4,1,1,1,3,1,2,2,1
197 | 3,2,2,1,4,3,2,1,1
198 | 4,4,4,2,2,3,2,1,1
199 | 2,1,1,1,2,1,3,1,1
200 | 2,1,1,1,2,1,2,1,1
201 | 1,1,3,1,2,1,1,1,1
202 | 1,1,3,1,1,1,2,1,1
203 | 4,3,2,1,3,1,2,1,1
204 | 1,1,3,1,2,1,1,1,1
205 | 4,1,2,1,2,1,2,1,1
206 | 5,1,1,2,2,1,2,1,1
207 | 3,1,2,1,2,1,2,1,1
208 | 1,1,1,1,2,1,1,1,1
209 | 1,1,1,1,2,1,2,1,1
210 | 1,1,1,1,1,1,2,1,1
211 | 3,1,1,4,3,1,2,2,1
212 | 5,3,4,1,4,1,3,1,1
213 | 1,1,1,1,2,1,1,1,1
214 | 3,2,2,2,2,1,3,2,1
215 | 2,1,1,1,2,1,1,1,1
216 | 2,1,1,1,2,1,1,1,1
217 | 3,3,2,2,3,1,1,2,3
218 | 5,3,3,2,3,1,3,1,1
219 | 2,1,1,1,2,1,2,2,1
220 | 5,1,1,1,3,2,2,2,1
221 | 1,1,1,2,2,1,2,1,1
222 | 3,1,1,1,2,1,2,1,1
223 | 1,1,1,1,1,1,1,1,1
224 | 1,2,3,1,2,1,2,1,1
225 | 3,1,1,1,2,1,2,1,1
226 | 3,1,1,1,2,1,3,1,1
227 | 4,1,1,1,2,1,1,1,1
228 | 3,2,1,1,2,1,2,2,1
229 | 1,2,3,1,2,1,1,1,1
230 | 3,1,1,1,2,1,1,1,1
231 | 5,3,3,1,2,1,2,1,1
232 | 3,1,1,1,2,4,1,1,1
233 | 1,2,1,3,2,1,1,2,1
234 | 1,1,1,1,2,1,2,1,1
235 | 4,2,2,1,2,1,2,1,1
236 | 1,1,1,1,2,1,2,1,1
237 | 2,3,2,2,2,2,3,1,1
238 | 3,1,2,1,2,1,2,1,1
239 | 1,1,1,1,2,1,2,1,1
240 | 1,1,1,1,1,1,2,1,1
241 | 5,1,2,1,2,1,3,1,1
242 | 3,3,2,6,3,3,3,5,1
243 | 1,1,1,1,2,1,2,1,1
244 | 5,2,2,2,2,2,3,2,2
245 | 2,3,1,1,5,1,1,1,1
246 | 3,2,2,3,2,3,3,1,1
247 | 4,3,3,1,2,1,3,3,1
248 | 5,1,3,1,2,1,2,1,1
249 | 3,1,1,1,2,1,1,1,1
250 | 5,3,6,1,2,1,1,1,1
251 | 1,1,1,1,2,1,2,1,1
252 | 2,1,1,1,2,1,2,1,1
253 | 1,3,1,1,2,1,2,2,1
254 | 5,1,1,3,4,1,3,2,1
255 | 5,1,1,1,2,1,2,2,1
256 | 3,2,2,3,2,1,1,1,1
257 | 6,9,7,5,5,8,4,2,1
258 | 4,1,1,1,2,1,1,1,1
259 | 4,1,3,3,2,1,1,1,1
260 | 5,1,1,1,2,1,1,1,1
261 | 5,2,2,4,2,4,1,1,1
262 | 1,1,1,3,2,3,1,1,1
263 | 1,1,1,1,2,2,1,1,1
264 | 5,1,1,6,3,1,2,1,1
265 | 2,1,1,1,2,1,1,1,1
266 | 1,1,1,1,2,1,1,1,1
267 | 5,1,1,1,2,1,1,1,1
268 | 1,1,1,1,1,1,1,1,1
269 | 4,1,1,3,1,1,2,1,1
270 | 5,1,1,1,2,1,1,1,1
271 | 3,1,1,3,2,1,1,1,1
272 | 2,3,1,1,3,1,1,1,1
273 | 5,1,2,1,2,1,1,1,1
274 | 5,1,3,1,2,1,1,1,1
275 | 5,1,1,3,2,1,1,1,1
276 | 3,1,1,1,2,5,1,1,1
277 | 6,1,1,3,2,1,1,1,1
278 | 4,1,1,1,2,1,1,2,1
279 | 4,1,1,1,2,1,1,1,1
280 | 4,1,1,1,2,1,1,1,1
281 | 1,1,2,1,2,1,2,1,1
282 | 3,1,1,1,1,1,2,1,1
283 | 6,1,1,3,2,1,1,1,1
284 | 6,1,1,1,1,1,1,1,1
285 | 4,1,1,1,2,1,1,1,1
286 | 5,1,1,1,2,1,1,1,1
287 | 3,1,1,1,2,1,1,1,1
288 | 4,1,2,1,2,1,1,1,1
289 | 4,1,1,1,2,1,1,1,1
290 | 5,2,1,1,2,1,1,1,1
291 | 5,1,1,1,1,1,1,1,1
292 | 5,3,2,4,2,1,1,1,1
293 | 5,1,2,1,2,1,1,1,1
294 | 1,1,1,3,1,3,1,1,1
295 | 3,1,1,1,1,1,2,1,1
296 | 1,1,1,1,2,1,1,1,1
297 | 4,1,1,1,1,1,2,1,1
298 | 5,1,2,10,4,5,2,1,1
299 | 3,1,1,1,1,1,2,1,1
300 | 1,1,1,1,1,1,1,1,1
301 | 4,2,1,1,2,1,1,1,1
302 | 4,1,1,1,2,1,2,1,1
303 | 4,1,1,1,2,1,2,1,1
304 | 6,1,1,1,2,1,3,1,1
305 | 4,1,1,1,2,1,2,1,1
306 | 4,1,1,2,2,1,2,1,1
307 | 4,1,1,1,2,1,3,1,1
308 | 1,1,1,1,2,1,1,1,1
309 | 3,3,1,1,2,1,1,1,1
310 | 1,1,1,1,2,4,1,1,1
311 | 5,1,1,1,2,1,1,1,1
312 | 2,1,1,1,2,1,1,1,1
313 | 1,1,1,1,2,1,1,1,1
314 | 5,1,1,1,2,1,2,1,1
315 | 5,1,1,1,2,1,1,1,1
316 | 3,1,1,1,1,1,2,1,1
317 | 1,1,1,1,1,1,1,1,1
318 | 1,1,1,1,1,1,2,1,1
319 | 3,1,2,2,2,1,1,1,1
320 | 1,1,1,1,3,1,1,1,1
321 | 4,1,1,1,3,1,1,1,1
322 | 3,1,1,1,2,1,2,1,1
323 | 3,1,1,2,2,1,1,1,1
324 | 4,1,1,1,2,1,1,1,1
325 | 4,1,1,1,2,1,3,1,1
326 | 6,1,3,2,2,1,1,1,1
327 | 4,1,1,1,1,1,2,1,1
328 | 4,2,2,1,2,1,2,1,1
329 | 1,1,1,1,1,1,3,1,1
330 | 3,1,1,1,2,1,2,1,1
331 | 2,1,1,1,2,1,2,1,1
332 | 1,1,3,2,2,1,3,1,1
333 | 5,1,1,1,2,1,3,1,1
334 | 5,1,2,1,2,1,3,1,1
335 | 4,1,1,1,2,1,2,1,1
336 | 6,1,1,1,2,1,2,1,1
337 | 5,1,1,1,2,2,2,1,1
338 | 3,1,1,1,2,1,1,1,1
339 | 5,3,1,1,2,1,1,1,1
340 | 4,1,1,1,2,1,2,1,1
341 | 2,1,3,2,2,1,2,1,1
342 | 5,1,1,1,2,1,2,1,1
343 | 2,1,1,1,1,1,1,1,1
344 | 3,1,1,1,1,1,1,1,1
345 | 3,1,1,1,2,1,2,1,1
346 | 1,1,1,1,2,1,3,1,1
347 | 3,2,2,2,2,1,4,2,1
348 | 4,4,2,1,2,5,2,1,2
349 | 3,1,1,1,2,1,1,1,1
350 | 4,3,1,1,2,1,4,8,1
351 | 5,2,2,2,1,1,2,1,1
352 | 5,1,1,3,2,1,1,1,1
353 | 2,1,1,1,2,1,2,1,1
354 | 5,1,1,1,2,1,2,1,1
355 | 5,1,1,1,2,1,3,1,1
356 | 5,1,1,1,2,1,3,1,1
357 | 1,1,1,1,2,1,3,1,1
358 | 3,1,1,1,2,1,2,1,1
359 | 4,1,1,1,2,1,3,2,1
360 | 3,1,2,1,2,1,3,1,1
361 | 4,1,1,1,2,3,2,1,1
362 | 3,1,1,1,2,1,2,1,1
363 | 1,1,1,1,2,1,2,1,1
364 | 5,1,2,1,2,1,3,1,1
365 | 5,1,1,1,2,1,2,1,1
366 | 1,1,1,1,2,1,2,1,1
367 | 1,1,1,1,2,1,2,1,1
368 | 1,1,1,1,2,1,3,1,1
369 | 5,1,2,1,2,1,2,1,1
370 | 3,1,1,1,2,1,1,1,1
371 | 5,1,1,6,3,1,1,1,1
372 | 1,1,1,1,2,1,1,1,1
373 | 5,1,1,1,2,1,2,2,1
374 | 5,1,1,1,2,1,1,1,1
375 | 5,1,2,1,2,1,1,1,1
376 | 5,1,1,1,2,1,2,1,1
377 | 4,1,2,1,2,1,2,1,1
378 | 5,1,3,1,2,1,3,1,1
379 | 3,1,1,1,2,1,2,1,1
380 | 5,2,4,1,1,1,1,1,1
381 | 3,1,1,1,2,1,2,1,1
382 | 1,1,1,1,1,1,2,1,1
383 | 4,1,1,1,2,1,2,1,1
384 | 4,1,1,2,2,1,1,1,1
385 | 1,1,1,1,2,1,1,1,1
386 | 5,1,1,1,2,1,1,1,1
387 | 2,3,1,1,2,1,2,1,1
388 | 2,1,1,1,1,1,2,1,1
389 | 4,1,3,1,2,1,2,1,1
390 | 3,1,1,1,2,1,2,1,1
391 | 1,1,1,1,1,1,1,1,1
392 | 4,1,1,1,2,1,2,1,1
393 | 5,1,1,1,2,1,2,1,1
394 | 3,1,1,1,2,1,2,1,1
395 | 6,3,3,3,3,2,6,1,1
396 | 7,1,2,3,2,1,2,1,1
397 | 1,1,1,1,2,1,1,1,1
398 | 5,1,1,2,1,1,2,1,1
399 | 3,1,3,1,3,4,1,1,1
400 | 2,1,1,1,2,5,1,1,1
401 | 2,1,1,1,2,1,1,1,1
402 | 4,1,1,1,2,1,1,1,1
403 | 6,2,3,1,2,1,1,1,1
404 | 5,1,1,1,2,1,2,1,1
405 | 1,1,1,1,2,1,1,1,1
406 | 3,1,1,1,2,1,1,1,1
407 | 3,1,4,1,2,1,1,1,1
408 | 4,2,4,3,2,2,2,1,1
409 | 4,1,1,1,2,1,1,1,1
410 | 5,1,1,3,2,1,1,1,1
411 | 4,1,1,3,2,1,1,1,1
412 | 3,1,1,1,2,1,2,1,1
413 | 3,1,1,1,2,1,2,1,1
414 | 1,1,1,1,2,1,1,1,1
415 | 2,1,1,1,2,1,1,1,1
416 | 3,1,1,1,2,1,2,1,1
417 | 1,2,2,1,2,1,1,1,1
418 | 1,1,1,3,2,1,1,1,1
419 | 3,1,1,1,2,1,2,1,1
420 | 3,1,1,2,3,4,1,1,1
421 | 1,2,1,3,2,1,2,1,1
422 | 5,1,1,1,2,1,2,2,1
423 | 4,1,1,1,2,1,2,1,1
424 | 3,1,1,1,2,1,3,1,1
425 | 3,1,1,1,2,1,2,1,1
426 | 5,1,1,1,2,1,2,1,1
427 | 5,4,5,1,8,1,3,6,1
428 | 1,1,1,1,2,1,1,1,1
429 | 1,1,1,1,2,1,2,1,1
430 | 4,1,1,1,2,1,3,1,1
431 | 1,1,3,1,2,1,2,1,1
432 | 1,1,3,1,2,1,2,1,1
433 | 3,1,1,3,2,1,2,1,1
434 | 1,1,1,1,2,1,1,1,1
435 | 5,2,2,2,2,1,1,1,2
436 | 3,1,1,1,2,1,3,1,1
437 | 3,2,1,2,2,1,3,1,1
438 | 2,1,1,1,2,1,3,1,1
439 | 5,3,2,1,3,1,1,1,1
440 | 1,1,1,1,2,1,2,1,1
441 | 4,1,4,1,2,1,1,1,1
442 | 1,1,2,1,2,1,2,1,1
443 | 5,1,1,1,2,1,1,1,1
444 | 1,1,1,1,2,1,1,1,1
445 | 2,1,1,1,2,1,1,1,1
446 | 5,1,1,1,2,1,3,2,1
447 | 1,1,1,1,2,1,1,1,1
448 | 1,1,1,1,2,1,1,1,1
449 | 1,1,1,1,2,1,1,1,1
450 | 1,1,1,1,2,1,1,1,1
451 | 3,1,1,1,2,1,2,3,1
452 | 4,1,1,1,2,1,1,1,1
453 | 1,1,1,1,2,1,1,1,8
454 | 1,1,1,3,2,1,1,1,1
455 | 3,1,1,1,2,1,1,1,1
456 | 3,1,1,1,2,1,2,1,2
457 | 3,1,1,1,3,2,1,1,1
458 | 2,1,1,1,2,1,1,1,1
459 |
--------------------------------------------------------------------------------
/data/wbc/y.csv:
--------------------------------------------------------------------------------
1 | 0
2 | 0
3 | 0
4 | 1
5 | 0
6 | 1
7 | 0
8 | 1
9 | 0
10 | 0
11 | 0
12 | 0
13 | 0
14 | 0
15 | 0
16 | 0
17 | 0
18 | 0
19 | 0
20 | 0
21 | 0
22 | 0
23 | 0
24 | 0
25 | 0
26 | 0
27 | 1
28 | 1
29 | 0
30 | 0
31 | 0
32 | 0
33 | 0
34 | 0
35 | 0
36 | 0
37 | 1
38 | 0
39 | 0
40 | 0
41 | 0
42 | 0
43 | 1
44 | 0
45 | 0
46 | 1
47 | 0
48 | 0
49 | 0
50 | 0
51 | 0
52 | 0
53 | 0
54 | 0
55 | 0
56 | 0
57 | 0
58 | 0
59 | 1
60 | 0
61 | 0
62 | 0
63 | 0
64 | 0
65 | 0
66 | 0
67 | 0
68 | 0
69 | 1
70 | 0
71 | 0
72 | 0
73 | 0
74 | 0
75 | 0
76 | 0
77 | 0
78 | 0
79 | 0
80 | 0
81 | 0
82 | 0
83 | 0
84 | 0
85 | 1
86 | 0
87 | 0
88 | 0
89 | 0
90 | 0
91 | 0
92 | 0
93 | 0
94 | 1
95 | 0
96 | 0
97 | 0
98 | 0
99 | 0
100 | 0
101 | 0
102 | 0
103 | 0
104 | 0
105 | 0
106 | 0
107 | 0
108 | 0
109 | 0
110 | 0
111 | 0
112 | 0
113 | 0
114 | 0
115 | 0
116 | 0
117 | 0
118 | 0
119 | 0
120 | 0
121 | 0
122 | 0
123 | 0
124 | 0
125 | 0
126 | 0
127 | 0
128 | 0
129 | 0
130 | 1
131 | 1
132 | 0
133 | 0
134 | 0
135 | 0
136 | 1
137 | 0
138 | 0
139 | 1
140 | 0
141 | 0
142 | 0
143 | 0
144 | 0
145 | 0
146 | 0
147 | 0
148 | 0
149 | 0
150 | 0
151 | 0
152 | 0
153 | 0
154 | 0
155 | 0
156 | 0
157 | 0
158 | 0
159 | 0
160 | 0
161 | 0
162 | 0
163 | 1
164 | 0
165 | 0
166 | 0
167 | 0
168 | 1
169 | 0
170 | 0
171 | 0
172 | 0
173 | 0
174 | 1
175 | 0
176 | 0
177 | 0
178 | 0
179 | 0
180 | 0
181 | 0
182 | 0
183 | 0
184 | 0
185 | 0
186 | 0
187 | 0
188 | 0
189 | 0
190 | 0
191 | 0
192 | 0
193 | 0
194 | 0
195 | 0
196 | 0
197 | 0
198 | 0
199 | 0
200 | 0
201 | 0
202 | 0
203 | 0
204 | 0
205 | 0
206 | 0
207 | 0
208 | 0
209 | 0
210 | 0
211 | 0
212 | 0
213 | 0
214 | 0
215 | 0
216 | 0
217 | 0
218 | 0
219 | 0
220 | 0
221 | 0
222 | 0
223 | 0
224 | 0
225 | 0
226 | 0
227 | 0
228 | 0
229 | 0
230 | 0
231 | 0
232 | 0
233 | 0
234 | 0
235 | 0
236 | 0
237 | 0
238 | 0
239 | 0
240 | 0
241 | 0
242 | 0
243 | 0
244 | 0
245 | 1
246 | 0
247 | 0
248 | 0
249 | 0
250 | 0
251 | 0
252 | 0
253 | 0
254 | 0
255 | 0
256 | 0
257 | 1
258 | 0
259 | 0
260 | 0
261 | 0
262 | 0
263 | 0
264 | 0
265 | 0
266 | 0
267 | 0
268 | 0
269 | 0
270 | 0
271 | 0
272 | 0
273 | 0
274 | 0
275 | 0
276 | 0
277 | 0
278 | 0
279 | 0
280 | 0
281 | 0
282 | 0
283 | 0
284 | 0
285 | 0
286 | 0
287 | 0
288 | 0
289 | 0
290 | 0
291 | 0
292 | 0
293 | 0
294 | 1
295 | 0
296 | 0
297 | 0
298 | 1
299 | 0
300 | 0
301 | 0
302 | 0
303 | 0
304 | 0
305 | 0
306 | 0
307 | 0
308 | 0
309 | 0
310 | 0
311 | 0
312 | 0
313 | 0
314 | 0
315 | 0
316 | 0
317 | 0
318 | 0
319 | 0
320 | 0
321 | 0
322 | 0
323 | 0
324 | 0
325 | 0
326 | 0
327 | 0
328 | 0
329 | 0
330 | 0
331 | 0
332 | 0
333 | 0
334 | 0
335 | 0
336 | 0
337 | 0
338 | 0
339 | 0
340 | 0
341 | 0
342 | 0
343 | 0
344 | 0
345 | 0
346 | 0
347 | 0
348 | 0
349 | 0
350 | 1
351 | 0
352 | 0
353 | 0
354 | 0
355 | 0
356 | 0
357 | 0
358 | 0
359 | 0
360 | 0
361 | 0
362 | 0
363 | 0
364 | 0
365 | 0
366 | 0
367 | 0
368 | 0
369 | 0
370 | 0
371 | 0
372 | 0
373 | 0
374 | 0
375 | 0
376 | 0
377 | 0
378 | 0
379 | 0
380 | 0
381 | 0
382 | 0
383 | 0
384 | 0
385 | 0
386 | 0
387 | 0
388 | 0
389 | 0
390 | 0
391 | 0
392 | 0
393 | 0
394 | 0
395 | 0
396 | 0
397 | 0
398 | 0
399 | 0
400 | 0
401 | 0
402 | 0
403 | 0
404 | 0
405 | 0
406 | 0
407 | 0
408 | 0
409 | 0
410 | 0
411 | 0
412 | 0
413 | 0
414 | 0
415 | 0
416 | 0
417 | 0
418 | 0
419 | 0
420 | 0
421 | 0
422 | 0
423 | 0
424 | 0
425 | 0
426 | 0
427 | 0
428 | 0
429 | 0
430 | 0
431 | 0
432 | 0
433 | 0
434 | 0
435 | 0
436 | 0
437 | 0
438 | 0
439 | 0
440 | 0
441 | 0
442 | 0
443 | 0
444 | 0
445 | 0
446 | 0
447 | 0
448 | 0
449 | 0
450 | 0
451 | 0
452 | 0
453 | 1
454 | 0
455 | 0
456 | 0
457 | 0
458 | 0
459 |
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import numpy as np
4 | import argparse
5 | from utils import detect_lof, detect_isoforest
6 | from outlier_interpreter import OutlierInterpreter
7 |
8 |
9 | def main(args):
10 | # read data
11 | num_inst, dim, X = read_data(args)
12 | X = np.array(X, dtype=float)
13 |
14 | # measure degree of outlierness
15 | labels_otlr = detect_outliers(args, X)
16 |
17 | # interpret outliers
18 |     sgnf_prior = 1  # no prior knowledge of feature importance for the WBC data, so use the uniform scalar weight 1
19 |     interpreter = OutlierInterpreter(X, labels_otlr, args.ratio_nbr,
20 |                                      AUG=args.AUG, MIN_CLUSTER_SIZE=args.MIN_CLUSTER_SIZE, MAX_NUM_CLUSTER=args.MAX_NUM_CLUSTER,
21 |                                      VAL_TIMES=args.VAL_TIMES, C_SVM=args.C_SVM, THRE_PS=args.THRE_PS, DEFK=args.DEFK, RESOLUTION=args.RESOLUTION)
22 | ids_target = np.where(labels_otlr == 1)[0] # sample id of outliers
23 | importance_attr, outlierness = interpreter.interpret_outliers(ids_target, sgnf_prior, int_flag=1)
24 |
25 | print("Sample ID of outliers:", '\n', ids_target)
26 | print("Outlying degree of attributes:", '\n', importance_attr)
27 | print("Outlierness scores re-estimated:", '\n', outlierness)
28 |
29 | return ids_target, importance_attr, outlierness
30 |
31 |
32 | def read_data(args):
33 | data_name = args.dataset
34 | fn = os.path.join(os.path.dirname(os.getcwd()), "data", data_name, "X.csv")
35 | X = np.genfromtxt(fn, delimiter=',', dtype=int)
36 | num_inst = X.shape[0]
37 | dim = X.shape[1]
38 |
39 | return num_inst, dim, X
40 |
41 |
42 | def detect_outliers(args, X):
43 | # A larger outlier score means greater outlierness
44 | if args.detector == 'lof':
45 | labels_otlr = detect_lof(args, X)
46 | labels_otlr = np.array(0.5*(1-labels_otlr), dtype=int) # outlier label = 1, normal label = 0
47 | #print(labels_otlr)
48 | elif args.detector == 'isoforest':
49 | labels_otlr = detect_isoforest(args, X)
50 | labels_otlr = np.array(0.5 * (1 - labels_otlr), dtype=int) # outlier label = 1, normal label = 0
51 | #print(labels_otlr)
52 | else:
53 |         print("The specified detector type is not supported in the current implementation.")
54 | sys.exit()
55 |
56 | return labels_otlr
57 |
58 |
59 | if __name__ == "__main__":
60 | parser = argparse.ArgumentParser()
61 | parser.add_argument('--dataset', type=str, default='wbc', help='which dataset to use')
62 |     parser.add_argument('--AUG', type=float, default=10, help='value of the augmented feature appended to each instance')
63 | parser.add_argument('--detector', type=str, default='lof', help='which outlier detector to use')
64 | parser.add_argument('--ratio_nbr', type=float, default=0.1,
65 | help='controls number of neighbors to use in kneighbors queries')
66 | parser.add_argument('--MIN_CLUSTER_SIZE', type=int, default=5,
67 | help='minimum number of samples required in a cluster')
68 | parser.add_argument('--MAX_NUM_CLUSTER', type=int, default=4, help='maximum number of clusters for each context')
69 | parser.add_argument('--VAL_TIMES', type=int, default=10, help='number of iterations for computing prediction strength')
70 | parser.add_argument('--C_SVM', type=float, default=1., help='penalty parameter for svm')
71 | parser.add_argument('--DEFK', type=int, default=0,
72 | help='pre-determined number of clusters in each context (use prediction strength if 0)')
73 | parser.add_argument('--THRE_PS', type=float, default=0.85,
74 | help='threshold for deciding the best cluster value in prediction strength')
75 | parser.add_argument('--RESOLUTION', type=float, default=0.05, help='attribute resolution')
76 | args = parser.parse_args()
77 |
78 | main(args)
--------------------------------------------------------------------------------
/src/outlier_interpreter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import numpy as np
4 | from sklearn.neighbors import NearestNeighbors
5 | from sklearn.cluster import KMeans
6 | from sklearn import svm
7 | from prediction_strength import optimalK
8 |
9 | class OutlierInterpreter(object):
10 | def __init__(self, data, inds_otlr, nbrs_ratio,
11 | AUG=1.0, MIN_CLUSTER_SIZE=5, MAX_NUM_CLUSTER=4, VAL_TIMES=10, C_SVM=1.,
12 | RESOLUTION=0.05, THRE_PS=0.85, DEFK=0):
13 | '''
14 | data: Data matrix, each row represents one instance
15 | inds_otlr: A vector with each entry telling whether this instance is outlier (1) or not (0)
16 | nbrs_ratio: The ratio of normal instances as the context for each outlier
17 |         AUG: Constant value of the augmented feature appended to each instance (also used as the SVM intercept scaling)
18 | MIN_CLUSTER_SIZE: Minimum number of nodes in each cluster
19 | MAX_NUM_CLUSTER: Maximum number of clusters considered in prediction strength computation
20 | VAL_TIMES: Number of iterations for computing prediction strength
21 |         C_SVM: Penalty parameter C of the SVM (ideally tuned via validation)
22 | DEFK: Predefined number of clusters in each context. Value 0 means using Prediction Strength to estimate it.
23 | '''
24 |
25 | self.data = data
26 | self.inds_otlr = inds_otlr
27 | self.AUG = float(AUG)
28 |
29 | self.num_inst = data.shape[0]
30 | self.num_feat = data.shape[1]
31 | self.num_nbrs = int(nbrs_ratio * self.num_inst)
32 |
33 | self.MIN_CLUSTER_SIZE = MIN_CLUSTER_SIZE
34 | self.MAX_NUM_CLUSTER = MAX_NUM_CLUSTER
35 | self.VAL_TIMES = VAL_TIMES
36 | self.C_SVM = C_SVM
37 | self.RESOLUTION = RESOLUTION
38 | self.THRE_PS = THRE_PS
39 | self.DEFK = DEFK
40 |
41 | # normal instances
42 | self.data_normal = self.data[np.where(self.inds_otlr == 0)[0]]
43 |
44 | # nearest nbrs object based on normal instances
45 | self.nbrs = NearestNeighbors(n_neighbors=self.num_nbrs)
46 | self.nbrs.fit(self.data_normal)
47 |
48 |
49 | def interpret_outliers(self, ids_target, sgnf_vec, int_flag=0):
50 | """
51 | ids_target: Indices of target outliers
52 | sgnf_vec: A vector indicating the importance of each attribute, as prior knowledge
53 |         int_flag: Whether the attributes are integer-valued (synthetic samples are rounded if so)
54 |         :return: A 2D array of attribute importance scores (one row per target outlier), and a dict mapping each outlier ID to its re-estimated outlierness
55 | """
56 |
57 |         # Append a zero importance weight for the augmented feature
58 | if isinstance(sgnf_vec, int) or isinstance(sgnf_vec, float):
59 | sgnf_vec = np.hstack((np.ones(self.num_feat), 0))
60 | else:
61 | sgnf_vec = np.hstack((sgnf_vec, [0]))
62 |
63 | # Interpret each target outlier
64 | oid_devt_dict = dict() # id-score tuples
65 | score_attr_mat = []
66 | cnt = 0
67 | for i in ids_target:
68 | if cnt % 20 == 0:
69 | print(cnt)
70 | cnt += 1
71 |
72 | # Do clustering on the context, build one classifier for each cluster
73 | nums_c, clfs, cluster_attr_scale = self.cluster_context(i, int_flag)
74 |
75 | # Calculate outlierness score
76 | devt_i = self.CalculateOutlierness(i, clfs, nums_c, sgnf_vec)
77 | oid_devt_dict[i] = devt_i
78 |
79 | # Find outlying attributes
80 | score_attr = np.zeros(self.num_feat)
81 | for num_c, clf in zip(nums_c, clfs):
82 | score_attr += num_c * np.abs(clf.coef_[0]) # weighted by the normal cluster size
83 | score_attr /= float(np.sum(nums_c))
84 | score_attr /= np.sum(score_attr) # relative importance
85 | score_attr_mat.append(copy.copy(score_attr))
86 | #print(score_attr)
87 |
88 | return np.array(score_attr_mat), oid_devt_dict
89 |
90 |
91 | def cluster_context(self, id_outlier, int_flag):
92 | # find the context of the outlier
93 | dist_btwn, otlr_nbrs = self.nbrs.kneighbors([self.data[id_outlier]])
94 | dist_btwn, otlr_nbrs = dist_btwn[0], self.data_normal[otlr_nbrs[0], :]
95 | #print(self.data[id_outlier])
96 | #print(otlr_nbrs)
97 |
98 | # choose the number of clusters in the context
99 | if self.DEFK == 0:
100 | k_best = optimalK(otlr_nbrs, self.VAL_TIMES, self.MAX_NUM_CLUSTER, self.THRE_PS)
101 | else:
102 | k_best = self.DEFK
103 |         k_best = min(k_best+1, self.MAX_NUM_CLUSTER)  # empirically, a slightly larger K works better
104 | # print('Best k:', k_best)
105 |
106 |         # clustering the context
107 | kmeans = KMeans(n_clusters=k_best, random_state=0).fit(otlr_nbrs)
108 | label_nbrs = kmeans.labels_
109 |
110 | clfs = []
111 | nbrs_mean = []
112 | nums_c = []
113 | cluster_attr_scale = []
114 |
115 | # build a linear classifier for each cluster of nbrs
116 | for c in range(k_best):
117 | # indices for instances in cluster c
118 | inds_c = np.where(label_nbrs == c)[0]
119 |
120 | # the cluster cannot be too small
121 | if np.size(inds_c) < self.MIN_CLUSTER_SIZE:
122 | continue
123 | nums_c.append(len(inds_c))
124 |
125 | # instances for cluster c
126 | otlr_nbrs_c = otlr_nbrs[inds_c, :]
127 | dist_btwn_c = dist_btwn[inds_c]
128 |
129 | # distance property of cluster c
130 | cluster_attr_scale.append(np.hstack((np.max(otlr_nbrs_c, axis=0) - np.min(otlr_nbrs_c, axis=0), 0))) # scale for each attr
131 |
132 | # synthetic sampling to build two classes
133 | insts_c0 = self.SyntheticSampling(otlr_nbrs_c, self.data[id_outlier], int_flag)
134 |
135 | insts_c1 = otlr_nbrs_c
136 | clf = self.SVCInterpreter(insts_c0, insts_c1)
137 |
138 | clfs.append(clf)
139 | nbrs_mean.append(np.average(insts_c1, axis=0))
140 |
141 | return nums_c, clfs, cluster_attr_scale
142 |
143 |
144 | def SyntheticSampling(self, insts, otlr, int_flag):
145 | '''
146 | Expand the outlier into a class.
147 |
148 | insts: normal instances
149 | otlr: the outlier instance
150 |         int_flag: whether to round the synthetic samples to integers
151 |         :return: the synthetic outlier class, i.e. the outlier stacked with the
152 |                  samples generated around it
153 | '''
154 |
155 | num_c0_new = insts.shape[0] - 1
156 | coeff_c0_new = np.random.rand(num_c0_new, insts.shape[0]) # transformation matrix for synthetic sampling
157 | nbrs_local = NearestNeighbors(n_neighbors=1).fit(insts)
158 | min_dist_to_nbr = nbrs_local.kneighbors([otlr])[0][0, 0]/insts.shape[1]
159 |
160 | for r in range(coeff_c0_new.shape[0]):
161 | coeff_c0_new[r, :] /= sum(coeff_c0_new[r, :])
162 | insts_c0_new = np.dot(coeff_c0_new, insts - np.dot(np.ones((insts.shape[0], 1)), [otlr]))
163 | for r in range(insts_c0_new.shape[0]): # shrink to prevent overlap
164 | insts_c0_new[r, :] *= (0.2 * np.random.rand(1)[0] * min_dist_to_nbr)
165 | insts_c0_new += np.dot(np.ones((num_c0_new, 1)), [otlr]) # origin + shift
166 | if int_flag:
167 | insts_c0_new = np.round(insts_c0_new)
168 | insts_c0 = np.vstack((otlr, insts_c0_new))
169 |
170 | return insts_c0
171 |
172 |
173 | def SVCInterpreter(self, insts_c0, insts_c1):
174 | # classification between normal instances and outliers, where outliers have negative output
175 |
176 | clf = svm.LinearSVC(penalty='l1', C=self.C_SVM, dual=False, intercept_scaling=self.AUG)
177 | X_c = np.vstack((insts_c0, insts_c1))
178 | y_c = np.hstack((np.zeros(insts_c0.shape[0]), np.ones(insts_c1.shape[0])))
179 | clf.fit(X_c, y_c)
180 | #print(insts_c1)
181 | #print(insts_c0)
182 |
183 | return clf
184 |
185 |
186 | def CalculateOutlierness(self, id_outlier, clfs, nums_c, sgnf_vec):
187 | otlr = self.data[id_outlier]
188 |
189 | devt_overall = 0.
190 | for c in range(len(nums_c)):
191 | # distance to the boundary
192 | otlr_aug = np.hstack((otlr, self.AUG))
193 | w = np.hstack((clfs[c].coef_[0], clfs[c].intercept_[0]/self.AUG))
194 | w_a = np.hstack((clfs[c].coef_[0], 0))
195 | dist = -min(0, np.inner(otlr_aug, w))/np.linalg.norm(w_a)
196 |
197 | # rescale deviation according to attributes' importance
198 | devt = np.linalg.norm(np.multiply(dist * w_a / np.linalg.norm(w_a), sgnf_vec))
199 | if np.isnan(devt):
200 | devt = 0.
201 |
202 | # weighted by the opponent cluster size
203 | devt_overall += devt * nums_c[c]
204 |
205 | devt_overall /= sum(nums_c)
206 |
207 | return devt_overall
--------------------------------------------------------------------------------
/src/prediction_strength.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.neighbors import NearestNeighbors
5 | from sklearn.cluster import KMeans
6 | from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn
7 |
8 |
9 | def ClosestCenter(point, centroids):
10 | # Find the closest center over all centroids
11 | min_index = -1
12 | min_dist = float('inf')
13 | for i in range(len(centroids)):
14 | center = centroids[i]
15 | dist_cur = np.linalg.norm(point - center)
16 | if dist_cur < min_dist:
17 | min_index = i
18 | min_dist = dist_cur
19 |
20 | return min_index
21 |
22 |
23 | def PredictionStrength(data_test, test_labels, train_centers, c):
24 | # Compute prediction strength under c clusters
25 | pred_strength = np.zeros(c)
26 | for cc in range(c):
27 | num_cc = test_labels.tolist().count(cc)
28 | count = 0.
29 | for i in range(len(test_labels)-1):
30 | for j in range(i+1, len(test_labels)):
31 | if test_labels[i] == test_labels[j] == cc:
32 | pi = data_test[i]
33 | pj = data_test[j]
34 | if ClosestCenter(pi, train_centers) == ClosestCenter(pj, train_centers):
35 | count += 1
36 |
37 | if num_cc <= 1:
38 | pred_strength[cc] = float('inf')
39 | else:
40 | pred_strength[cc] = count/(num_cc * (num_cc-1)/2.)
41 |
42 | return min(pred_strength)
43 |
44 |
45 | def optimalK(data, num_fold, maxClusters=5, THRE_PS=0.90):
46 | # Find the best number of clusters using prediction strength
47 | num_data = data.shape[0]
48 | num_feat = data.shape[1]
49 |
50 | pred_strength_avg = np.zeros(maxClusters+1)
51 | for nf in range(num_fold):
52 | # Split into training and testing samples
53 | inds_train = np.random.choice(num_data, int(num_data*0.5), replace=False)
54 | inds_test = list(set(range(num_data)).difference(inds_train))
55 | data_train = data[inds_train]
56 | data_test = data[inds_test]
57 |
58 | pred_strength_cur = np.zeros(maxClusters+1)
59 | for c in range(1, maxClusters+1):
60 | train_cluster = KMeans(n_clusters=c).fit(data_train)
61 | test_cluster = KMeans(n_clusters=c).fit(data_test)
62 | pred_strength_cur[c] = PredictionStrength(data_test, test_cluster.labels_, train_cluster.cluster_centers_, c)
63 |
64 | pred_strength_avg += pred_strength_cur
65 |
66 | pred_strength_avg /= num_fold
67 | # print("Prediction Strength vec: ", pred_strength_avg)
68 |
69 | k_optimal = max([i for i,j in enumerate(pred_strength_avg) if j > THRE_PS])
70 |
71 | return k_optimal
72 |
73 |
74 | if __name__ == "__main__":
75 | x, y = make_blobs(24, n_features=5, centers=3)
76 | plt.scatter(x[:, 0], x[:, 1])
77 | plt.show()
78 |
79 | k = optimalK(x, 10)
80 | print('Optimal k is: ', k)
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.neighbors import LocalOutlierFactor
3 | from sklearn.ensemble import IsolationForest
4 |
5 | def detect_lof(args, X):
6 | num_inst = X.shape[0]
7 | num_nbr = int(num_inst * args.ratio_nbr)
8 | clf = LocalOutlierFactor(n_neighbors=num_nbr)
9 | y_pred = clf.fit_predict(X)
10 | outlier_scores = -clf.negative_outlier_factor_
11 |
12 | return y_pred
13 |
14 |
15 | def detect_isoforest(args, X):
16 | num_inst = X.shape[0]
17 |     clf = IsolationForest(max_samples=num_inst, random_state=0)  # the deprecated 'behaviour' argument was removed in newer scikit-learn
18 | clf.fit(X)
19 | y_pred = clf.predict(X)
20 | outlier_scores = -clf.decision_function(X)
21 |
22 | return y_pred
--------------------------------------------------------------------------------