├── .gitignore
├── .res
│   └── process_models
│       ├── gigantic.plg
│       ├── huge.plg
│       ├── large.plg
│       ├── medium.plg
│       ├── p2p.plg
│       ├── paper.plg
│       ├── small.plg
│       └── wide.plg
├── LICENSE
├── README.md
├── deepalign
│   ├── __init__.py
│   ├── alignments
│   │   ├── __init__.py
│   │   ├── bibs.py
│   │   ├── confnet.py
│   │   ├── core.py
│   │   └── processmining.py
│   ├── anomalydetection
│   │   ├── __init__.py
│   │   ├── binarizer.py
│   │   ├── heuristic.py
│   │   ├── result.py
│   │   └── utils.py
│   ├── dataset.py
│   ├── enums.py
│   ├── fs.py
│   ├── generation
│   │   ├── __init__.py
│   │   ├── anomaly.py
│   │   ├── attribute_generator.py
│   │   ├── event_log_generator.py
│   │   ├── example_values.py
│   │   └── utils.py
│   ├── processmining
│   │   ├── __init__.py
│   │   ├── alignments.py
│   │   ├── case.py
│   │   ├── event.py
│   │   ├── log.py
│   │   └── process_map.py
│   └── utils.py
├── environment.yml
├── notebooks
│   ├── 1. Paper Process from Sec. 4.ipynb
│   ├── 2. Dataset Generation.ipynb
│   ├── 2.A1 Generation Algorithm.ipynb
│   ├── 3. Training the Models.ipynb
│   ├── 4. Alignments.ipynb
│   ├── 5. Caching the Alignments.ipynb
│   ├── 6. Evaluation Script.ipynb
│   └── 7. Evaluation.ipynb
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 | /.vscode/
3 |
4 | /.config/
5 | /.out/
6 |
7 | /.res/real
8 | /.res/bpic
9 |
10 | *.log
11 | *.pyc
12 | .ipynb_checkpoints
13 | *.egg-info/
14 | .DS_Store
--------------------------------------------------------------------------------
/.res/process_models/gigantic.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.5, process name "Gigantic", id oeleckfp9dv07fq9d5i848o3v7; originally 425 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/huge.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.3, process name "Huge", id 5u7umlvjfguuhureg43pk0d2ru; originally 293 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/large.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.3, process name "Large", id fsncf699b9uhl0jeomrour3igu; originally 234 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/medium.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.3, process name "Medium", id ullotet162h448nn7nd2ql3nrt; originally 192 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/p2p.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.5, process name "P2P", id b71o7oepcrf8c2bc9cop5st4p7; originally 91 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/paper.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.5, process name "Process_1", id 4fim8975tn1fjt31oht2636mjm; originally 92 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/small.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.3, process name "Small", id bg5p2p9v5hv0h40ulg6st0k511; originally 118 source lines.]
--------------------------------------------------------------------------------
/.res/process_models/wide.plg:
--------------------------------------------------------------------------------
[PLG process model (XML); the markup was stripped during extraction. Surviving metadata: generated with libPlg 2.0.5, process name "Wide", id n5eu0m9j47ca8cjck0uhnc0epa; originally 203 source lines.]
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepAlign: Alignment-based Process Anomaly Correction Using Recurrent Neural Networks
2 |
3 | This repository holds an efficient implementation of the DeepAlign algorithm as proposed in the paper (reference 5 below).
4 | The code in this repository can be used to reproduce the results given in the paper.
5 |
6 | ## Setup
7 |
8 | The easiest way to set up an environment is to use Miniconda.
9 |
10 | 1. Install [Miniconda](https://conda.io/miniconda.html) (make sure to use a Python 3 version)
11 | 2. After setting up Miniconda, you can use the `conda` command in your command line (PowerShell, CMD, Bash)
12 | 3. We suggest that you set up a dedicated environment for this project by running `conda env create -f environment.yml`
13 |    - This will set up a virtual conda environment with all necessary dependencies.
14 |    - If your device has a GPU, replace `tensorflow` with `tensorflow-gpu` in `environment.yml`
15 | 4. Depending on your operating system you can activate the virtual environment with `conda activate deepalign`
16 | on Linux and macOS, and `activate deepalign` on Windows (`cmd` only).
17 | 5. If you want to make use of a GPU, you must install the CUDA Toolkit; refer to the [TensorFlow installation guide](https://www.tensorflow.org/install/install_windows).
18 | 6. If you want to quickly install the `deepalign` package, run `pip install -e .` inside the root directory.
19 | 7. Now you can start the notebook server by running `jupyter notebook notebooks`.
20 |
21 | Note: To use the graph plotting methods, you will have to install Graphviz.
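
Once the environment is set up, a minimal end-to-end run looks like the sketch below. The dataset name and the `Dataset` constructor argument are illustrative assumptions; see the notebooks for the exact workflow.

```python
from deepalign import Dataset
from deepalign.alignments import ConfNet

# Load a generated event log (hypothetical dataset name; see notebook 2).
dataset = Dataset('paper-0.3-1')

# Train the forward and backward next-event prediction networks.
confnet = ConfNet(dataset, use_case_attributes=False, use_event_attributes=False)
confnet.fit(dataset, batch_size=64, epochs=10)

# Compute alignments using bidirectional beam search with beam width k.
alignments, beams, costs = confnet.align(dataset, k=5)
```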
22 |
23 | ## Additional Material
24 |
25 | To illustrate the findings in our paper, this repository contains Jupyter notebooks.
26 |
27 | 1. [Paper Process from Sec. 4](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/1.%20Paper%20Process%20from%20Sec.%204.ipynb)
28 | - Describes the creation of the paper process used as the running example in the paper.
29 | 2. [Dataset Generation](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/2.%20Dataset%20Generation.ipynb)
30 | - Downloads the pretrained models and datasets used in the evaluation. Also includes the dataset generation script.
31 | - [2.A1 Generation Algorithm](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/2.A1%20Generation%20Algorithm.ipynb) explains how the generation algorithm works.
32 | 3. [Training the Models](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/3.%20Training%20the%20Models.ipynb)
33 |    - Demonstrates how to train your own models.
34 | 4. [Alignments](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/4.%20Alignments.ipynb)
35 | - This notebook contains all the examples from the Evaluation section of the paper and outlines how to reproduce them.
36 | 5. [Caching the Alignments](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/5.%20Caching%20the%20Alignments.ipynb)
37 | - This is a helper script to speed up the evaluation.
38 | 6. [Evaluation Script](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/6.%20Evaluation%20Script.ipynb)
39 | - This is the evaluation script used in the paper.
40 | 7. [Evaluation](https://nbviewer.jupyter.org/github/tnolle/deepalign/blob/master/notebooks/7.%20Evaluation.ipynb)
41 | - This notebook contains all tables used in the paper. It also contains some figures that didn't make it into the paper.
42 |
43 | ## References
44 |
45 | 1. [Nolle, T., Seeliger, A., Mühlhäuser, M.: Unsupervised Anomaly Detection in Noisy Business Process Event Logs Using Denoising Autoencoders, 2016](https://doi.org/10.1007/978-3-319-46307-0_28)
46 | 2. [Nolle, T., Luettgen, S., Seeliger A., Mühlhäuser, M.: Analyzing Business Process Anomalies Using Autoencoders, 2018](https://doi.org/10.1007/s10994-018-5702-8)
47 | 3. [Nolle, T., Seeliger, A., Mühlhäuser, M.: BINet: Multivariate Business Process Anomaly Detection Using Deep Learning, 2018](https://doi.org/10.1007/978-3-319-98648-7_16)
48 | 4. [Nolle, T., Luettgen, S., Seeliger, A., Mühlhäuser, M.: BINet: Multi-perspective Business Process Anomaly Classification, 2019](https://doi.org/10.1016/j.is.2019.101458)
49 | 5. [Nolle, T., Seeliger, A., Thoma, N., Mühlhäuser, M.: DeepAlign: Alignment-based Process Anomaly Correction Using Recurrent Neural Networks, 2020](https://doi.org/10.1007/978-3-030-49435-3_20)
50 |
--------------------------------------------------------------------------------
/deepalign/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | from deepalign.dataset import Dataset
18 | from deepalign.fs import generate as generate_folders
19 | from deepalign.generation import EventLogGenerator
20 |
21 | # create dirs if non-existent
22 | generate_folders()
23 |
--------------------------------------------------------------------------------
/deepalign/alignments/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import inspect
18 | import sys
19 |
20 | from deepalign.alignments.confnet import ConfNet
21 | from deepalign.alignments.processmining import *
22 |
23 | # Lookup dict for aligner abbreviations
24 | ALIGNERS = dict((ad.abbreviation, ad) for _, ad in inspect.getmembers(sys.modules[__name__], inspect.isclass)
25 | if hasattr(ad, 'abbreviation') and ad.abbreviation is not None)
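
# Example (sketch): resolve an aligner class by its abbreviation, e.g.
# ALIGNERS['optimal'] -> OptimalCostAligner, then use the common interface:
#   aligner = ALIGNERS['optimal']()
#   aligner.fit(dataset)
#   alignments, beams, costs = aligner.align(dataset)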
--------------------------------------------------------------------------------
/deepalign/alignments/bibs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 |
19 | from deepalign.utils import align
20 | from deepalign.utils import gather
21 |
22 |
23 | def top_k(p, y, k=5):
24 | positions = np.zeros_like(p, dtype=int) + np.arange(p.shape[1])[None, :, None] + 1
25 |
26 | shape = (p.shape[0], np.prod(p.shape[1:]))
27 | idx = p.reshape(shape).argsort(-1)[:, ::-1][:, :k]
28 |
29 | p_new = gather(p.reshape(shape), idx)
30 | y_new = gather(y.reshape(shape), idx)
31 | positions = gather(positions.reshape(shape), idx)
32 |
33 | return p_new, y_new, positions
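
# Illustration (assumed shapes): with p of shape (batch, timesteps, candidates)
# holding log-probabilities and y the matching candidate labels, top_k flattens
# the (timestep, candidate) axes and returns, per case, the k best entries of
# p, their labels from y, and their 1-based timestep positions.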
34 |
35 |
36 | def bibs_step(x, head_preds, head_probs, tail_preds, tail_probs, guard=None, k=5, go_backwards=False, delete_max=3):
37 | # Top-k predictions for every timestep
38 | if not go_backwards:
39 | y = head_preds.argsort(-1)[:, :, ::-1][:, :, :k]
40 | else:
41 | y = align(tail_preds.argsort(-1)[:, :, ::-1][:, :, :k], 1)
42 |
43 | mask = x == 0
44 | y[mask] = 0
45 |
46 | p_empty = np.atleast_3d(head_probs) + align(tail_probs, 1)
47 | p_empty = p_empty[:, :, 0].sum(-1) / ((~mask).sum(-1) - 1) # -1 to remove end symbol
48 |
49 | p_y = \
50 | align(head_probs, -1) + \
51 | gather(head_preds, y) + \
52 | gather(align(tail_preds, 1), y) + \
53 | align(tail_probs, 2)
54 | p_y[align(mask, 1, 1)[:, :, 0]] = -np.inf
55 |
56 | def p_remove_next(i):
57 | p_remove = \
58 | align(head_probs, -1) + \
59 | gather(head_preds, align(x, 1 + i)) + \
60 | gather(align(tail_preds, 1 + i), np.atleast_3d(x)) + \
61 | align(tail_probs, 2 + i)
62 | p_remove[align(mask, 1 + i, 1)] = -np.inf
63 | if guard is not None:
64 | for j in range(i):
65 | p_remove[align(guard, j + 1, 0)] = -np.inf
66 | return p_remove
67 |
68 | p_remove = np.concatenate([p_remove_next(i + 1) for i in range(delete_max)], -1)
69 | y_remove = \
70 | np.zeros((y.shape[0], y.shape[1], p_remove.shape[-1]), dtype=int) + \
71 | np.array([-(i + 1) for i in range(delete_max)])[None, None, :]
72 |
73 | # Combine
74 | p = np.concatenate((p_y, p_remove), -1)
75 | y = np.concatenate((y, y_remove), -1)
76 |
77 | # Mask
78 | p[mask] = -np.inf
79 | y[align(mask, 1, 1)[:, :, 0]] = 0
80 |
81 | # Insert the 'do nothing' option at the bottom right; this slot is always free, which keeps the code much simpler
82 | p[:, -1, -1] = p_empty
83 | y[:, -1, -1] = -42 # Identifier for 'do nothing'
84 |
85 | # Top-k beams
86 | beams_p, beams, positions = top_k(p, y, k=k)
87 |
88 | return beams_p, beams, positions, p, y
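
# Note (sketch): bibs_step scores, for every position, (a) inserting one of the
# top-k predicted activities and (b) deleting up to delete_max events, by
# combining the forward (head) and backward (tail) log-probabilities around the
# edit; the k most probable edits per case are returned as beams together with
# their positions.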
89 |
90 |
91 | def get_indices(indices, types, l):
92 | idx = np.zeros((len(indices), l), dtype=int)
93 | for j, (i, t) in enumerate(zip(indices, types)):
94 | normal = np.arange(l)
95 | if t == -42 or t == 0 or i == 0:
96 | idx[j, :] = normal
97 | elif t >= 0:
98 | idx[j, :i] = normal[:i]
99 | idx[j, i] = l - 1
100 | idx[j, i + 1:] = normal[i:-1]
101 | elif t < 0:
102 | idx[j, :i] = normal[:i]
103 | idx[j, i:t] = normal[i - t:]
104 | idx[j, t:] = normal[-1]
105 | return idx
106 |
107 |
108 | def build_beams(x, y, pos):
109 | idx = get_indices(pos.ravel(), y.ravel(), l=x.shape[1])
110 | y[y < 0] = 0
111 | x[:, -1] = y.ravel()
112 | return gather(x, idx)
113 |
114 |
115 | def get_delete_indices(indices, types, l):
116 | idx = np.zeros((len(indices), l), dtype=int)
117 | for j, (i, t) in enumerate(zip(indices, types)):
118 | normal = np.arange(l)
119 | if t == -42 or t >= 0 or i == 0:
120 | idx[j, :] = normal
121 | elif t < 0:
122 | idx[j, :i] = normal[:i]
123 | idx[j, i:i - t] = l - 1
124 | idx[j, i - t:] = normal[i - t:]
125 | return idx
126 |
127 |
128 | def build_alignments(inserts, deletes, y, pos, step):
129 | for j, (i, t) in enumerate(zip(pos.ravel(), y.ravel())):
130 | if t == -42 or i == 0 or t > 0:
131 | continue
132 | insert_offset = (inserts[j, :i] > 0).sum()
133 | delete_offset = (deletes[j, :i] > 0).sum()
134 | d = -t
135 | for k in range(deletes.shape[1] - i):
136 | if d == 0:
137 | break
138 | if deletes[j, i - insert_offset + delete_offset + k] == 0:
139 | deletes[j, i - insert_offset + delete_offset + k] = step
140 | d -= 1
141 |
142 | insert_idx = get_indices(pos.ravel(), y.ravel(), l=inserts.shape[1])
143 | insert_y = np.copy(y)
144 | insert_y[y == -42] = 0
145 | insert_y[y < 0] = 0
146 | insert_y[y > 0] = step
147 | inserts[:, -1] = insert_y.ravel()
148 | inserts = gather(inserts, insert_idx)
149 |
150 | return inserts, deletes
151 |
152 |
153 | def get_alignment(log, model, inserts, deletes):
154 | if np.all(log == model):
155 | log = log[log != 0]
156 | model = model[model != 0]
157 | else:
158 | log = log.tolist()
159 | model = model.tolist()
160 | inserts = inserts.tolist()
161 | deletes = deletes.tolist()
162 |
163 | end = len(log)
164 | for i in range(len(log)):
165 | if log[i] == model[i] == 0:
166 | end = i
167 | break
168 | if deletes[i] > 0:
169 | model = model[:i] + [0] + model[i:]
170 | inserts = inserts[:i] + [0] + inserts[i:]
171 | if inserts[i] > 0:
172 | log = log[:i] + [0] + log[i:]
173 | deletes = deletes[:i] + [0] + deletes[i:]
174 |
175 | log = log[:end]
176 | model = model[:end]
177 |
178 | alignment = np.vstack((log, model))
179 |
180 | return alignment
181 |
182 |
183 | def get_alignments(originals, beams, inserts, deletes):
184 | alignments = -np.ones((*beams.shape[:-1], 2, beams.shape[-1]), dtype=int)
185 | for case_index in range(originals.shape[0]):
186 | l = originals[case_index]
187 | for beam_index in range(beams.shape[1]):
188 | m = beams[case_index][beam_index]
189 | i = inserts[case_index][beam_index]
190 | d = deletes[case_index][beam_index]
191 | a = get_alignment(l, m, i, d)
192 | alignments[case_index, beam_index, :, :a.shape[1]] = a
193 | return alignments
194 |
--------------------------------------------------------------------------------
/deepalign/alignments/confnet.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | from time import time
18 |
19 | import numpy as np
20 | import tensorflow as tf
21 |
22 | from deepalign import Dataset
23 | from deepalign.alignments.bibs import bibs_step
24 | from deepalign.alignments.bibs import build_alignments
25 | from deepalign.alignments.bibs import build_beams
26 | from deepalign.alignments.bibs import get_alignments
27 | from deepalign.anomalydetection import AnomalyDetectionResult
28 | from deepalign.anomalydetection import Binarizer
29 | from deepalign.enums import AttributeType
30 | from deepalign.enums import FeatureType
31 | from deepalign.enums import Heuristic
32 | from deepalign.enums import Strategy
33 | from deepalign.utils import align
34 | from deepalign.utils import gather
35 | from deepalign.utils import log_probs
36 | from deepalign.utils import reverse
37 | from deepalign.utils import to_targets
38 |
39 |
40 | def binet_scores_fn(features, predictions):
41 | sums = [1 - np.cumsum(np.sort(p, -1), -1) for p in predictions]
42 | indices = [(np.argsort(p, -1) == features[:, :, i:i + 1]).argmax(-1) for i, p in enumerate(predictions)]
43 | scores = np.zeros(features.shape)
44 | for (i, j, k), f in np.ndenumerate(features):
45 | if f != 0 and k < len(predictions):
46 | scores[i, j, k] = sums[k][i, j][indices[k][i, j]]
47 | return scores
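
# Note (sketch): a value's score is the probability mass of all values the
# model ranked as more likely than the observed one, so unlikely values score
# close to 1 and likely values close to 0; padding (value 0) is skipped.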
48 |
49 |
50 | class BINet(tf.keras.Model):
51 | abbreviation = 'binet'
52 | name = 'BINet'
53 |
54 | def __init__(self,
55 | dataset,
56 | latent_dim=None,
57 | use_case_attributes=None,
58 | use_event_attributes=None,
59 | use_present_activity=None,
60 | use_present_attributes=None,
61 | use_attention=None):
62 | super(BINet, self).__init__()
63 |
64 | # Validate parameters
65 | if latent_dim is None:
66 | latent_dim = min(int(dataset.max_len * 10), 256)
67 | if use_event_attributes and dataset.num_attributes == 1:
68 | use_event_attributes = False
69 | use_case_attributes = False
70 | if use_present_activity and dataset.num_attributes == 1:
71 | use_present_activity = False
72 | if use_present_attributes and dataset.num_attributes == 1:
73 | use_present_attributes = False
74 |
75 | # Parameters
76 | self.latent_dim = latent_dim
77 | self.use_case_attributes = use_case_attributes
78 | self.use_event_attributes = use_event_attributes
79 | self.use_present_activity = use_present_activity
80 | self.use_present_attributes = use_present_attributes
81 | self.use_attention = use_attention
82 |
83 | # Single layers
84 | self.fc = None
85 | if self.use_case_attributes:
86 | self.fc = tf.keras.Sequential([
87 | tf.keras.layers.Dense(latent_dim // 8),
88 | tf.keras.layers.Dropout(0.5),
89 | tf.keras.layers.Dense(latent_dim, activation='linear')
90 | ])
91 |
92 | self.rnn = tf.keras.layers.GRU(latent_dim, return_sequences=True, return_state=True)
93 |
94 | # Layer lists
95 | self.fc_inputs = []
96 | self.rnn_inputs = []
97 | self.outs = []
98 |
99 | inputs = zip(dataset.attribute_dims, dataset.attribute_keys, dataset.attribute_types, dataset.feature_types)
100 | for dim, key, t, feature_type in inputs:
101 | if t == AttributeType.CATEGORICAL:
102 | voc_size = int(dim + 1) # we start at 1, 0 is padding
103 | emb_dim = np.clip(voc_size // 10, 2, 10)
104 | embed = tf.keras.layers.Embedding(input_dim=voc_size, output_dim=emb_dim, mask_zero=True)
105 | else:
106 | embed = tf.keras.layers.Dense(1, activation='linear')
107 |
108 | if feature_type == FeatureType.CASE:
109 | self.fc_inputs.append(embed)
110 | else:
111 | self.rnn_inputs.append(embed)
112 | out = tf.keras.layers.Dense(dim + 1, activation='softmax')
113 | self.outs.append(out)
114 |
115 | def call(self, inputs, training=False, return_state=False, initial_state=None):
116 | if not isinstance(inputs, list):
117 | inputs = [inputs]
118 |
119 | split = len(self.rnn_inputs)
120 |
121 | rnn_x = inputs[:split]
122 | fc_x = inputs[split:]
123 |
124 | fc_embeddings = []
125 | for x, input_layer in zip(fc_x, self.fc_inputs):
126 | if isinstance(input_layer, tf.keras.layers.Dense):
127 | x = x[:, None]
128 | x = input_layer(x)
129 | fc_embeddings.append(x)
130 |
131 | if len(fc_embeddings) > 0:
132 | if len(fc_embeddings) > 1:
133 | fc_embeddings = tf.concat(fc_embeddings, axis=-1)
134 | else:
135 | fc_embeddings = fc_embeddings[0]
136 |
137 | fc_output = None
138 | if not isinstance(fc_embeddings, list):
139 | fc_output = self.fc(fc_embeddings)
140 |
141 | rnn_embeddings = []
142 | for x, input_layer in zip(rnn_x, self.rnn_inputs):
143 | x = input_layer(x)
144 | rnn_embeddings.append(x)
145 |
146 | if len(rnn_embeddings) > 0:
147 | if len(rnn_embeddings) > 1:
148 | rnn_embeddings = tf.concat(rnn_embeddings, axis=-1)
149 | else:
150 | rnn_embeddings = rnn_embeddings[0]
151 |
152 | if initial_state is not None:
153 | rnn, h = self.rnn(rnn_embeddings, initial_state=initial_state)
154 | elif fc_output is not None:
155 | if len(fc_output.shape) == 3:
156 | fc_output = fc_output[:, 0]
157 | rnn, h = self.rnn(rnn_embeddings, initial_state=fc_output)
158 | else:
159 | rnn, h = self.rnn(rnn_embeddings)
160 |
161 | outputs = []
162 | for i, out in enumerate(self.outs):
163 | x = rnn
164 | if i > 0:
165 | if self.use_present_attributes:
166 | x = tf.concat([x, *[tf.pad(e[:, 1:x.shape[1]], [(0, 0), (0, 1), (0, 0)], 'constant', 0)
167 | for j, e in enumerate(rnn_embeddings) if i != j]], axis=-1)
168 | elif self.use_present_activity:
169 | x = tf.concat([x, tf.pad(rnn_embeddings[0][:, 1:x.shape[1]], [(0, 0), (0, 1), (0, 0)], 'constant', 0)],
170 | axis=-1)
171 | x = out(x)
172 | outputs.append(x)
173 |
174 | if return_state:
175 | return outputs, h
176 |
177 | return outputs
178 |
179 | def score(self, features, predictions):
180 | for i, prediction in enumerate(predictions):
181 | p = np.pad(prediction[:, :-1], ((0, 0), (1, 0), (0, 0)), mode='constant')
182 | p[:, 0, features[i][0, 0]] = 1
183 | predictions[i] = p
184 | return binet_scores_fn(np.dstack(features), predictions)
185 |
186 | def detect(self, dataset):
187 | if isinstance(dataset, Dataset):
188 | features = dataset.features
189 | else:
190 | features = dataset
191 | predictions = self.predict(features)
192 | if not isinstance(predictions, list):
193 | predictions = [predictions]
194 | return AnomalyDetectionResult(scores=self.score(features, predictions), predictions=predictions)
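
# Example (sketch): anomaly detection with a trained BINet, assuming `dataset`
# is a deepalign Dataset instance:
#   result = binet.detect(dataset)
#   scores = result.scores  # per-event, per-attribute anomaly scores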
195 |
196 |
197 | class ConfNet:
198 | abbreviation = 'confnet'
199 | name = 'ConfNet'
200 |
201 | def __init__(self, dataset, latent_dim=None, use_case_attributes=None, use_event_attributes=None):
202 | super(ConfNet, self).__init__()
203 |
204 | self.use_case_attributes = use_case_attributes
205 | self.use_event_attributes = use_event_attributes
206 |
207 | self.net_f = BINet(dataset=dataset,
208 | latent_dim=latent_dim,
209 | use_case_attributes=use_case_attributes,
210 | use_event_attributes=use_event_attributes)
211 | self.net_b = BINet(dataset=dataset,
212 | latent_dim=latent_dim,
213 | use_case_attributes=use_case_attributes,
214 | use_event_attributes=use_event_attributes)
215 |
216 | self.net_f.compile(tf.keras.optimizers.Adam(), 'sparse_categorical_crossentropy')
217 | self.net_b.compile(tf.keras.optimizers.Adam(), 'sparse_categorical_crossentropy')
218 |
219 | @property
220 | def identifier(self):
221 | return f'{self.abbreviation}{int(self.use_event_attributes)}{int(self.use_case_attributes)}'
222 |
223 | def predict(self, inputs_f, inputs_b):
224 | out_f = self.net_f.predict(inputs_f)
225 | out_b = self.net_b.predict(inputs_b)
226 | if not isinstance(out_f, list):
227 | out_f = [out_f]
228 | if not isinstance(out_b, list):
229 | out_b = [out_b]
230 | return out_f, out_b
231 |
232 | def fit(self, dataset, batch_size=32, **kwargs):
233 | dataset.reverse(False)
234 | h1 = self.net_f.fit(dataset.features, dataset.targets, batch_size=batch_size, **kwargs)
235 | dataset.reverse(True)
236 | h2 = self.net_b.fit(dataset.features, dataset.targets, batch_size=batch_size, **kwargs)
237 | return h1, h2
238 |
239 | def save(self, file_name):
240 | self.net_f.save_weights(file_name + '_forward.h5')
241 | self.net_b.save_weights(file_name + '_backward.h5')
242 |
243 | def load(self, file_name):
244 | self.net_f([tf.ones(i) for i in ([(1, 1)] * len(self.net_f.rnn_inputs) + [(1,)] * len(self.net_f.fc_inputs))])
245 | self.net_f.load_weights(file_name + '_forward.h5')
246 | self.net_b([tf.ones(i) for i in ([(1, 1)] * len(self.net_b.rnn_inputs) + [(1,)] * len(self.net_b.fc_inputs))])
247 | self.net_b.load_weights(file_name + '_backward.h5')
248 |
249 | def batch_align(self, dataset, batch_size=5000, detailed=False, **kwargs):
250 | alignments = []
251 | start_beams = []
252 | start_probs = []
253 | beams = []
254 | probs = []
255 | costs = []
256 |
257 | for x, y in dataset.to_tf_dataset().batch(batch_size):
258 | if not isinstance(x, tuple):
259 | x = [x]
260 | a, b, c, sb, sp, p, _, _ = self.align([_x.numpy() for _x in x], detailed=True, **kwargs)
261 |
262 | alignments.append(a)
263 | start_beams.append(sb)
264 | start_probs.append(sp)
265 | beams.append(b)
266 | probs.append(p)
267 | costs.append(c)
268 |
269 | alignments = np.concatenate(alignments)
270 | start_beams = np.concatenate(start_beams)
271 | start_probs = np.concatenate(start_probs)
272 | beams = np.concatenate(beams)
273 | probs = np.concatenate(probs)
274 | costs = np.concatenate(costs)
275 |
276 | if detailed:
277 | return alignments, start_beams, start_probs, beams, probs, costs
278 |
279 | return alignments, beams, costs
280 |
281 | def align(self, dataset, k=5, hot_start=True, steps=10, delete_max=3, detailed=False):
282 | i = 0
283 | converged = False
284 | go_backwards = False
285 | start_probs = None
286 |
287 | if isinstance(dataset, Dataset):
288 | dataset.reverse(False)
289 | x = dataset.features
290 | else:
291 | x = dataset
292 |
293 | # Prepare data
294 | x_case = [_x for _x in x if len(_x.shape) == 1]
295 | x = [np.pad(_x, ((0, 0), (0, steps + 1))) for _x in x if len(_x.shape) == 2] # Create space for inserts
296 | start_beams = np.copy(x[0])
297 | alive = np.ones(x[0].shape[0], dtype=bool)
298 | x_p = np.zeros(x[0].shape[0])
299 |
300 | # Alignments
301 | inserts = np.zeros_like(x[0])
302 | deletes = np.zeros_like(x[0])
303 |
304 | # Convergence
305 | last_beams_y = None
306 |
307 | for _ in range(steps):
308 | if converged:
309 | print('Converged')
310 | break
311 |
312 | # Keep time for progress output
313 | start_time = time()
314 |
315 | # Forwards data
316 | x_f = [_x[alive] for _x in x]
317 | y_f = [to_targets(_x) for _x in x_f]
318 | m_f = y_f[0] != 0
319 |
320 | # Backwards data
321 | reverse_mask = x[0][alive] != 0
322 | x_b = [reverse(_x[alive], reverse_mask) for _x in x]
323 | y_b = [to_targets(_x) for _x in x_b]
324 | m_b = y_b[0] != 0
325 |
326 | # RNN predictions
327 | _x_case = [_x[alive] for _x in x_case]
328 | y_pred_f, y_pred_b = self.predict(x_f + _x_case, x_b + _x_case)
329 |
330 | y_probs_f, cum_y_probs_f = log_probs(y_f[0], y_pred_f[0], m_f)
331 | y_probs_b, cum_y_probs_b = log_probs(y_b[0], y_pred_b[0], m_b)
332 |
333 | # Reverse backwards
334 | y_pred_b = [reverse(_y, reverse_mask) for _y in y_pred_b]
335 | cum_y_probs_b = reverse(cum_y_probs_b, reverse_mask)
336 |
337 | # Hot start
338 | if i == 0 and hot_start:
339 | scores = self.net_f.score(x_f, [np.copy(f) for f in y_pred_f])
340 | result = AnomalyDetectionResult(scores=scores, predictions=y_pred_f)
341 | b_f = Binarizer(result, ~m_f[:, :, None], np.dstack(x_f))
342 | detection_f = b_f.binarize(heuristic=Heuristic.LP_MEAN, strategy=Strategy.ATTRIBUTE)
343 |
344 | # Original probs
345 | if i == 0:
346 | start_probs = np.atleast_3d(cum_y_probs_f) + align(cum_y_probs_b, 1)
347 | start_probs = start_probs[:, :, 0].sum(-1) / ((~(x_f[0] == 0)).sum(-1) - 1) # -1 to remove end symbol
348 |
349 | # BiBS step
350 | beams_p, beams_y, positions, p, y = bibs_step(x_f[0],
351 | np.log(y_pred_f[0]), cum_y_probs_f,
352 | np.log(y_pred_b[0]), cum_y_probs_b,
353 | inserts[alive] > 0,
354 | k=k, go_backwards=go_backwards, delete_max=delete_max)
355 |
356 | # Beams for event attributes
357 | beams_y = [beams_y]
358 | for n, (_y_f, _y_b) in enumerate(zip(y_pred_f[1:], y_pred_b[1:])):
359 | _y = (_y_f * align(_y_b, 1)).argmax(-1)
360 | _beams_y = gather(_y, positions - 1)
361 | _beams_y[beams_y[0] < 0] = beams_y[0][beams_y[0] < 0]
362 | beams_y.append(_beams_y)
363 |
364 | # Prepare old x
365 | if i == 0:
366 | # In the first run we have to repeat the original cases to match the dimension of `num_cases * k`
367 | x = [np.repeat(_x, k, 0) for _x in x]
368 | x_case = [np.repeat(_x, k) for _x in x_case]
369 | x_f = [np.repeat(_x, k, 0) for _x in x_f]
370 | x_p = np.repeat(x_p, k, 0)
371 | inserts = np.repeat(inserts, k, 0)
372 | deletes = np.repeat(deletes, k, 0)
373 | alive = np.repeat(alive, k, 0)
374 | else:
375 | # Get top-k beams for all cases. There are `k * k` beams available.
376 | shape = (alive.sum() // k, beams_p.shape[0] // (alive.sum() // k) * k)
377 | costs = (inserts[alive] > 0).sum(-1) + (deletes[alive] > 0).sum(-1)
378 | cost_y = np.zeros_like(beams_y[0])
379 | cost_y[beams_y[0] > 0] = 1
380 | cost_y[beams_y[0] < 0] = -beams_y[0][beams_y[0] < 0]
381 | cost_y[beams_y[0] == -42] = 0
382 | costs = costs[:, None] + cost_y
383 |
384 | idx = np.lexsort((-costs.reshape(shape), beams_p.reshape(shape)), axis=-1)[:, ::-1][:, :k]
385 | x_idx = (np.zeros_like(beams_p, dtype=int) + np.arange(alive.sum())[:, None]).reshape(shape)
386 | x_idx = gather(x_idx, idx).reshape(alive.sum())
387 |
388 | beams_y = [gather(_y.reshape(shape), idx) for _y in beams_y]
389 | positions = gather(positions.reshape(shape), idx)
390 | beams_p = gather(beams_p.reshape(shape), idx)
391 | x_f = [_x[x_idx] for _x in x_f]
392 | inserts[alive] = inserts[alive][x_idx]
393 | deletes[alive] = deletes[alive][x_idx]
394 |
395 | # Update probs
396 | x_p[alive] = beams_p.ravel()
397 |
398 | # New alignments
399 | inserts[alive], deletes[alive] = build_alignments(inserts[alive], deletes[alive],
400 | beams_y[0], positions, i + 1)
401 |
402 | # Build new x
403 | for attr_i in range(len(x_f)):
404 | x[attr_i][alive] = build_beams(x_f[attr_i], np.copy(beams_y[attr_i]), positions)
405 |
406 | # Cases with all beams indicating 'do nothing' are finished
407 | finished = np.all(beams_y[0] == -42, -1)
408 | if i == 0 and hot_start:
409 | finished = np.logical_or(finished, np.all(detection_f[:, :, 0] == 0, -1))
410 | if last_beams_y is not None and beams_y[0].shape[0] == last_beams_y.shape[0]:
411 | finished = np.logical_or(finished, np.all(beams_y[0] == last_beams_y, -1))
412 | alive[alive] = np.repeat(~finished, k, 0)
413 | last_beams_y = beams_y[0]
414 |
415 | # Print progress
416 | print(
417 | f'Step {i + 1} {"←" if go_backwards else "→"} {time() - start_time}s {x[0].shape} finished={(~alive).sum() // k}')
418 |
419 | # Go the other way the next step
420 | go_backwards = not go_backwards
421 |
422 | # Converged
423 | converged = alive.sum() == 0
424 |
425 | # i++
426 | i += 1
427 |
428 | shape = (x[0].shape[0] // k, k, x[0].shape[1])
429 | beams = x[0].reshape(shape)
430 | inserts = inserts.reshape(shape)
431 | deletes = deletes.reshape(shape)
432 | costs = (inserts > 0).sum(-1) + (deletes > 0).sum(-1)
433 | probs = x_p.reshape((x[0].shape[0] // k, k))
434 |
435 | # Calculate alignments
436 | alignments = get_alignments(start_beams, beams, inserts, deletes)
437 |
438 | if detailed:
439 | return alignments, beams, costs, start_beams, start_probs, probs, inserts, deletes
440 |
441 | return alignments, beams, costs
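
# Example (sketch): end-to-end alignment with ConfNet, assuming `dataset` is a
# deepalign Dataset instance and 'confnet_paper' an arbitrary weights prefix:
#   confnet = ConfNet(dataset)
#   confnet.fit(dataset, epochs=10)
#   confnet.save('confnet_paper')  # writes *_forward.h5 and *_backward.h5
#   alignments, beams, costs = confnet.align(dataset, k=5)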
442 |
--------------------------------------------------------------------------------
/deepalign/alignments/core.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import pickle
18 |
19 | from deepalign.fs import AlignerFile
20 |
21 |
22 | class Aligner:
23 | abbreviation = None
24 | name = None
25 |
26 | def __init__(self, model=None):
27 | self.model = None
28 | if model is not None:
29 | self.load(model)
30 |
31 | def save(self, file_name=None):
32 | """Save the class instance using pickle.
33 |
34 | :param file_name: Custom file name
35 | :return: the file path
36 | """
37 | if self.model is not None:
38 | model_file = AlignerFile(file_name)
39 | with open(model_file.str_path, 'wb') as f:
40 | pickle.dump(self.model, f)
41 | return model_file
42 | else:
43 | raise RuntimeError(
44 | 'Saving not possible. No model has been trained yet.')
45 |
46 | def load(self, file_name):
47 | # load model file
48 | model_file = AlignerFile(file_name)
49 |
50 | # load model
51 | self.model = pickle.load(open(model_file.path, 'rb'))
52 |
53 | def fit(self, dataset):
54 | raise NotImplementedError()
55 |
56 | def align(self, dataset):
57 | raise NotImplementedError()
58 |
--------------------------------------------------------------------------------
/deepalign/alignments/processmining.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import subprocess
18 | from multiprocessing import Pool
19 |
20 | import editdistance
21 | import numpy as np
22 | import pm4pycvxopt
23 | from pm4py.algo.conformance.alignments import factory as aligner
24 | from pm4py.algo.discovery.alpha import factory as alpha_miner
25 | from pm4py.algo.discovery.dfg import factory as dfg_miner
26 | from pm4py.algo.discovery.heuristics import factory as heuristics_miner
27 | from pm4py.algo.discovery.inductive import factory as inductive_miner
28 | from pm4py.objects.conversion.dfg import factory as dfg_mining_factory
29 | from pm4py.objects.log.log import Event
30 | from pm4py.objects.log.log import EventLog
31 | from pm4py.objects.log.log import Trace
32 | from pm4py.objects.petri.importer import factory as pnml_importer
33 | from tqdm import tqdm
34 |
35 | from deepalign import fs
36 | from deepalign.alignments.core import Aligner
37 | from deepalign.processmining.alignments import needleman_wunsch
38 |
39 |
40 | def align_top_k_cost(obj):
41 | case, paths, k = obj
42 | costs = np.array([editdistance.eval(case, path) for path in paths])
43 | best_indices = np.argsort(costs)[:k]
44 | return [needleman_wunsch(case, paths[i])[0] for i in best_indices]
45 |
46 |
47 | class OptimalCostAligner(Aligner):
48 | abbreviation = 'optimal'
49 | name = 'OptimalCost'
50 |
51 | def __init__(self, model=None):
52 | super(OptimalCostAligner, self).__init__(model=model)
53 |
54 | def fit(self, dataset):
55 | self.model = [np.trim_zeros(f).tolist() for f in np.unique(dataset.correct_features[0], axis=0)]
56 |
57 | def align(self, dataset, k=5):
58 | cases, index, inverse = np.unique(dataset.features[0], return_index=True, return_inverse=True, axis=0)
59 | k = min(len(self.model), k)
60 |
61 | _alignments = []
62 | with Pool() as pool:
63 | for alignment in tqdm(pool.imap(align_top_k_cost, [(c, self.model, k) for c in cases]),
64 | total=len(cases), desc=dataset.dataset_name):
65 | _alignments.append(alignment)
66 |
67 | max_len = max([len(a[0]) for alignment in _alignments for a in alignment])
68 |
69 | costs = np.zeros((len(cases), k))
70 | alignments = -np.ones((len(cases), k, 2, max_len))
71 | beams = np.zeros(((len(cases)), k, max_len))
72 | for i, alignment in enumerate(_alignments):
73 | for j, a in enumerate(alignment):
74 | alignments[i, j, :, :a.shape[1]] = a
75 | beam = np.array([_a for _a in a[1] if _a != 0])
76 | beams[i, j, :beam.shape[0]] = beam
77 | costs[i, j] = (a == 0).sum()
78 |
79 | return alignments[inverse], beams[inverse], costs[inverse]
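
# Note (sketch): OptimalCostAligner memorizes the distinct correct traces seen
# in fit() and aligns each case against the k memorized traces with the lowest
# edit distance using Needleman-Wunsch; the returned cost counts the gap
# symbols (zeros) in the resulting alignment.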
80 |
81 |
82 | def pm4py_align(obj):
83 | trace, net, im, fm = obj
84 | return aligner.apply_log([trace], net, im, fm)[0]
85 |
86 |
87 | class PM4PYAligner(Aligner):
88 | miner = None
89 | parameters = None
90 | fast = pm4pycvxopt  # keep a reference so the unused import is not removed
91 |
92 | def __init__(self, model=None):
93 | super(PM4PYAligner, self).__init__(model=model)
94 |
95 | def _convert_log(self, dataset):
96 | _, index, inverse = np.unique(dataset.features[0], return_index=True, return_inverse=True, axis=0)
97 |
98 | log = EventLog()
99 | for case in dataset.event_log[index]:
100 | trace = Trace()
101 | for e in case:
102 | event = Event()
103 | event['concept:name'] = e.name
104 | trace.append(event)
105 | log.append(trace)
106 |
107 | return log, inverse
108 |
109 | def fit(self, dataset):
110 | log, inverse = self._convert_log(dataset)
111 | self.model = self.miner.apply([log[i] for i in inverse], parameters=self.parameters)
112 |
113 | def align(self, dataset):
114 | log, inverse = self._convert_log(dataset)
115 | net, im, fm = self.model
116 |
117 | _alignments = []
118 | with Pool() as pool:
119 | for a in tqdm(pool.imap(pm4py_align, [(trace, net, im, fm) for trace in log]), total=len(log)):
120 | _alignments.append(a)
121 | # _alignments = [aligner.apply_log([trace], net, im, fm)[0] for trace in tqdm(log)]
122 |
123 | _alignments = [[a for a in alignment['alignment'] if a != ('>>', None)] for alignment in _alignments]
124 | max_len = max([len(alignment) for alignment in _alignments]) + 2 # +2 for start and end symbol
125 |
126 | start_symbol = dataset.attribute_dims[0]
127 | end_symbol = dataset.attribute_dims[0] - 1
128 |
129 | encode = dict((c, i) for i, c in enumerate(dataset.encoders['name'].classes_))
130 | encode['>>'] = 0
131 |
132 | costs = np.zeros((len(_alignments), 1))
133 | alignments = -np.ones((len(_alignments), 1, 2, max_len))
134 | beams = np.zeros((len(_alignments), 1, max_len), dtype=int)
135 |
136 | for i, alignment in enumerate(_alignments):
137 | alignment = np.array(
138 | [[start_symbol, start_symbol]] +
139 | [[encode[a[0]], encode[a[1]]] for a in alignment] +
140 | [[end_symbol, end_symbol]]
141 | ).T
142 | alignments[i, :, :, :alignment.shape[1]] = alignment
143 | beam = np.array([a for a in alignment[1] if a != 0])
144 | beams[i, :, :beam.shape[0]] = beam
145 | costs[i] = (alignment == 0).sum()
146 |
147 | return alignments[inverse], beams[inverse], costs[inverse]
148 |
149 |
150 | class AlphaMinerAligner(PM4PYAligner):
151 | abbreviation = 'alpha'
152 | name = 'AlphaMiner'
153 |
154 | miner = alpha_miner
155 |
156 | def __init__(self, model=None):
157 | super(AlphaMinerAligner, self).__init__(model=model)
158 |
159 |
160 | class AlphaMinerPlusAligner(PM4PYAligner):
161 | abbreviation = 'alphaplus'
162 | name = 'AlphaMinerPlus'
163 |
164 | miner = alpha_miner
165 |
166 | def __init__(self, model=None):
167 | super(AlphaMinerPlusAligner, self).__init__(model=model)
168 |
169 | def fit(self, dataset):
170 | log, inverse = self._convert_log(dataset)
171 | self.model = self.miner.apply([log[i] for i in inverse], variant='plus')
172 |
173 |
174 | class HeuristicsMinerAligner(PM4PYAligner):
175 | abbreviation = 'hm'
176 | name = 'HeuristicsMiner'
177 |
178 | miner = heuristics_miner
179 | parameters = {'dependency_thresh': 0.99}
180 |
181 | def __init__(self, model=None, parameters=None):
182 | if parameters is not None:
183 | self.parameters = parameters
184 | super(HeuristicsMinerAligner, self).__init__(model=model)
185 |
186 |
187 | class InductiveMinerAligner(PM4PYAligner):
188 | abbreviation = 'im'
189 | name = 'InductiveMiner'
190 |
191 | miner = inductive_miner
192 | parameters = {'noiseThreshold': 0.2}
193 |
194 | def __init__(self, model=None, parameters=None):
195 | if parameters is not None:
196 | self.parameters = parameters
197 | super(InductiveMinerAligner, self).__init__(model=model)
198 |
199 |
200 | class DFGMinerAligner(PM4PYAligner):
201 | abbreviation = 'dfg'
202 | name = 'DFGMiner'
203 |
204 | miner = dfg_miner
205 |
206 | def __init__(self, model=None):
207 | super(DFGMinerAligner, self).__init__(model=model)
208 |
209 | def fit(self, dataset):
210 | log, inverse = self._convert_log(dataset)
211 | dfg = self.miner.apply([log[i] for i in inverse])
212 | self.model = dfg_mining_factory.apply(dfg)
213 |
214 |
215 | class SplitMinerAligner(PM4PYAligner):
216 | abbreviation = 'sm'
217 | name = 'SplitMiner'
218 |
219 | def fit(self, dataset):
220 | el_path = str(fs.OUT_DIR / 'xes' / (dataset.dataset_name + '.xes'))
221 | splitminer_dir = fs.RES_DIR / 'splitminer'
222 | splitminer = str(splitminer_dir / 'splitminer.jar')
223 | out_path = str(splitminer_dir / 'outputs' / dataset.dataset_name)
224 | lib = str(splitminer_dir / 'lib')
225 |
226 | subprocess.call(
227 | ['java', '-cp', f'{splitminer}:{lib}/*', 'au.edu.unimelb.services.ServiceProvider', 'SMPN', '0.0', '0.4',
228 | 'true', el_path, out_path])
229 |
230 | self.model = pnml_importer.apply(out_path + '.pnml')
231 |
--------------------------------------------------------------------------------
/deepalign/anomalydetection/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | from deepalign.anomalydetection.binarizer import Binarizer
18 | from deepalign.anomalydetection.result import AnomalyDetectionResult
19 |
--------------------------------------------------------------------------------
/deepalign/anomalydetection/binarizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 |
19 | from deepalign.anomalydetection.utils import label_collapse
20 | from deepalign.anomalydetection.utils import max_collapse
21 | from deepalign.anomalydetection.heuristic import best_heuristic
22 | from deepalign.anomalydetection.heuristic import elbow_heuristic
23 | from deepalign.anomalydetection.heuristic import lowest_plateau_heuristic
24 | from deepalign.anomalydetection.heuristic import ratio_heuristic
25 | from deepalign.enums import Base
26 | from deepalign.enums import Heuristic
27 | from deepalign.enums import Strategy
28 |
29 |
30 | class Binarizer(object):
31 | def __init__(self, result, mask, features, targets=None):
32 | self.result = result
33 | self._mask = mask
34 | self.features = features
35 | self._targets = targets
36 |
37 | # Try to fix dimensions
38 | if self._mask.shape != self.result.scores.shape:
39 | if len(self._mask) != len(self.result.scores.shape):
40 | self._mask = np.expand_dims(self._mask, axis=-1)
41 | self._mask = np.repeat(self._mask, self.result.scores.shape[-1], axis=-1)
42 |
43 | self.targets = None
44 | if self._targets is not None:
45 | self.targets = dict((a, self.apply_mask(label_collapse(self._targets, axis=a))) for a in [0, 1, 2])
46 |
47 | def apply_mask(self, a):
48 | if len(a.shape) == 1:
49 | m = self._mask[:, 0, 0]
50 | elif len(a.shape) == 2:
51 | m = self._mask[:, :, 0]
52 | else:
53 | m = self._mask
54 | return np.ma.array(a, mask=m)
55 |
56 | def get_targets(self, axis=2):
57 | return self.targets.get(axis)
58 |
59 | def correct_shape(self, tau, strategy):
60 | tau = np.asarray(tau)
61 | if strategy == Strategy.POSITION:
62 | tau = tau[:, None]
63 | if strategy == Strategy.POSITION_ATTRIBUTE:
64 | tau = tau.reshape(*self.result.scores.shape[1:])
65 | return tau
66 |
67 | def split_by_strategy(self, a, strategy):
68 | if strategy == Strategy.SINGLE:
69 | return [a]
70 | elif isinstance(a, list):
71 | if strategy == Strategy.POSITION:
72 | return [[_a[:, i:i + 1] for _a in a] for i in range(len(a[0][0]))]
73 | elif strategy == Strategy.ATTRIBUTE:
74 | return [[_a] for _a in a]
75 | elif strategy == Strategy.POSITION_ATTRIBUTE:
76 | return [[_a[:, i:i + 1]] for i in range(len(a[0][0])) for _a in a]
77 | else:
78 | if strategy == Strategy.POSITION:
79 | return [a[:, i:i + 1, :] for i in range(a.shape[1])]
80 | elif strategy == Strategy.ATTRIBUTE:
81 | return [a[:, :, i:i + 1] for i in range(a.shape[2])]
82 | elif strategy == Strategy.POSITION_ATTRIBUTE:
83 | return [a[:, i:i + 1, j:j + 1] for i in range(a.shape[1]) for j in range(a.shape[2])]
84 |
85 | def get_grid_candidate_taus(self, a, steps=20, axis=0):
86 | """G in the paper."""
87 | return np.linspace(max_collapse(a, axis=axis).min() - .001, a.max(), steps)
88 |
89 | def get_candidate_taus(self, a, axis=0):
90 | a = max_collapse(a, axis=axis).compressed()
91 |         if len(a) == 0:
92 | return np.array([0, 0, 0, 0, 0])
93 | a_min = a.min()
94 | a_max = a.max()
95 | if a_max > a_min:
96 | a = (a_max - a) / (a_max - a_min)
97 | a = 2 * (a / 2).round(2)
98 | if a_max > a_min:
99 | a = a_max - a * (a_max - a_min)
100 | a = np.sort(np.unique(a))
101 | a[0] -= .001
102 | if len(a) < 5:
103 | a = np.linspace(a_min - .001, a_max, 5)
104 | return a
105 |
106 | def get_legacy_tau(self, scores, heuristic=Heuristic.DEFAULT, strategy=Strategy.SINGLE, axis=0):
107 | if heuristic == Heuristic.DEFAULT:
108 | return np.array([0.5])
109 |
110 | if not isinstance(scores, np.ma.MaskedArray):
111 | scores = self.apply_mask(scores)
112 |
113 | alpha = None
114 | if strategy == Strategy.SINGLE:
115 | alpha = np.array([scores.mean()])
116 | elif strategy == Strategy.ATTRIBUTE:
117 | alpha = scores.mean(axis=1).mean(axis=0).data
118 | elif strategy == Strategy.POSITION:
119 | alpha = scores.mean(axis=2).mean(axis=0).data[:, None]
120 | elif strategy == Strategy.POSITION_ATTRIBUTE:
121 | alpha = scores.mean(axis=0).data
122 |
123 | taus = self.get_grid_candidate_taus(scores / alpha, axis=axis)
124 | tau = None
125 | if heuristic == Heuristic.BEST:
126 | y_true = self.get_targets(axis=axis)
127 | tau = best_heuristic(taus=taus, theta=self.legacy_binarize, y_true=y_true, alpha=alpha, scores=scores,
128 | axis=axis)
129 |
130 | if heuristic == Heuristic.RATIO:
131 | tau = ratio_heuristic(taus=taus, theta=self.legacy_binarize, scores=scores, axis=axis, alpha=alpha)
132 |
133 | if heuristic in [Heuristic.ELBOW_DOWN, Heuristic.ELBOW_UP]:
134 | tau = elbow_heuristic(taus=taus, theta=self.legacy_binarize, scores=scores, axis=axis,
135 | alpha=alpha)[heuristic]
136 |
137 | if heuristic in [Heuristic.LP_LEFT, Heuristic.LP_MEAN, Heuristic.LP_RIGHT]:
138 | tau = lowest_plateau_heuristic(taus=taus, theta=self.legacy_binarize, scores=scores, axis=axis,
139 | alpha=alpha)[heuristic]
140 |
141 | return tau * alpha
142 |
143 | def get_tau(self, scores, heuristic=Heuristic.DEFAULT, strategy=Strategy.SINGLE, axis=0, taus=None):
144 | if heuristic == Heuristic.DEFAULT:
145 | return np.array([0.5])
146 |
147 | if not isinstance(scores, np.ma.MaskedArray):
148 | scores = self.apply_mask(scores)
149 |
150 | scores = self.split_by_strategy(scores, strategy)
151 |
152 | if heuristic in [Heuristic.MEAN, Heuristic.MEDIAN]:
153 | scores = [max_collapse(s, axis=axis) for s in scores]
154 | if heuristic == Heuristic.MEAN:
155 | return self.correct_shape([np.mean(s[np.round(s, 1) > 0]) for s in scores], strategy)
156 | elif heuristic == Heuristic.MEDIAN:
157 | return self.correct_shape([np.median(s[np.round(s, 1) > 0]) for s in scores], strategy)
158 |
159 | if taus is None:
160 | taus = [self.get_candidate_taus(s, axis=axis) for s in scores]
161 | else:
162 | taus = [taus] * len(scores)
163 |
164 | tau = None
165 | if heuristic == Heuristic.BEST:
166 | y_trues = self.split_by_strategy(self.get_targets(axis=2), strategy)
167 | y_trues = [label_collapse(y, axis=axis) for y in y_trues]
168 | tau = [best_heuristic(taus=t, theta=self.threshold_binarize, y_true=y, scores=s, axis=axis)
169 | for s, t, y in zip(scores, taus, y_trues)]
170 |
171 | if heuristic == Heuristic.RATIO:
172 | tau = [ratio_heuristic(taus=t, scores=s, theta=self.threshold_binarize, axis=axis)
173 | for s, t in zip(scores, taus)]
174 |
175 | if heuristic in [Heuristic.ELBOW_DOWN, Heuristic.ELBOW_UP]:
176 | tau = [elbow_heuristic(taus=t, scores=s, theta=self.threshold_binarize, axis=axis)[heuristic]
177 | for s, t in zip(scores, taus)]
178 |
179 | if heuristic in [Heuristic.LP_LEFT, Heuristic.LP_MEAN, Heuristic.LP_RIGHT]:
180 | tau = [lowest_plateau_heuristic(taus=t, scores=s, theta=self.threshold_binarize, axis=axis)[heuristic]
181 | for s, t in zip(scores, taus)]
182 |
183 | return self.correct_shape(tau, strategy)
184 |
185 | def legacy_binarize(self, scores, tau, alpha, axis=0):
186 | # Apply the threshold function (Theta in the paper) using alpha as a scaling factor
187 | return self.threshold_binarize(tau=tau * alpha, scores=scores, axis=axis)
188 |
189 | def threshold_binarize(self, tau, scores, axis=0):
190 | # Apply the threshold function (Theta in the paper)
191 | predictions = np.array(scores.data > tau, dtype=int)
192 |
193 | # Apply mask
194 | predictions = np.ma.array(predictions, mask=scores.mask)
195 |
196 |         # For case (0) and event (1) level axes, collapse predictions over the trailing dimensions
197 | if axis in [0, 1]:
198 | predictions = label_collapse(predictions, axis=axis)
199 |
200 | return predictions
201 |
202 | def binarize(self, scores=None, tau=None, base=None, heuristic=None, strategy=None, go_backwards=False,
203 | return_parameters=False, axis=2, heuristic_axis=None):
204 |
205 | if heuristic_axis is None:
206 | heuristic_axis = axis
207 |
208 | if scores is None:
209 | if go_backwards:
210 | scores = self.result.scores_backward
211 | else:
212 | scores = self.result.scores
213 |
214 | if not isinstance(scores, np.ma.MaskedArray):
215 | scores = self.apply_mask(scores)
216 |
217 | # Get baseline threshold (tau in the paper)
218 | if tau is None or heuristic != Heuristic.MANUAL:
219 | if base == Base.LEGACY:
220 | tau = self.get_legacy_tau(scores=scores, heuristic=heuristic, strategy=strategy, axis=heuristic_axis)
221 | else:
222 | tau = self.get_tau(scores=scores, heuristic=heuristic, strategy=strategy, axis=heuristic_axis)
223 |
224 | # Apply the threshold function (Theta in the paper)
225 | predictions = self.threshold_binarize(scores=scores, tau=tau, axis=axis)
226 |
227 | if return_parameters:
228 | return predictions, tau
229 |
230 | return predictions
231 |
232 | @staticmethod
233 | def get_scores(probabilities):
234 | scores = np.zeros_like(probabilities)
235 | for i in range(scores.shape[2]):
236 | p = probabilities[:, :, i:i + 1]
237 | _p = np.copy(probabilities)
238 | _p[_p <= p] = 0
239 | scores[:, :, i] = _p.sum(axis=2)
240 | return scores
241 |
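The thresholding above (Theta in the paper) can be exercised in isolation. A minimal sketch, assuming a toy (cases, events, attributes) score tensor and an all-visible mask; the numbers are illustrative only:

import numpy as np

from deepalign.anomalydetection.binarizer import Binarizer
from deepalign.anomalydetection.result import AnomalyDetectionResult

# 2 cases, 3 events, 1 attribute; False in the mask means "visible"
scores = np.array([[[0.1], [0.8], [0.2]],
                   [[0.9], [0.4], [0.0]]])
result = AnomalyDetectionResult(scores=scores)
mask = np.zeros_like(scores, dtype=bool)

binarizer = Binarizer(result=result, mask=mask, features=None)

# Theta: scores above tau are flagged as anomalous (1), everything else stays 0
predictions = binarizer.threshold_binarize(tau=0.5, scores=binarizer.apply_mask(scores), axis=2)
print(predictions)  # the 0.8 and 0.9 entries are flagged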
--------------------------------------------------------------------------------
/deepalign/anomalydetection/heuristic.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 | from sklearn import metrics
19 |
20 | from deepalign.anomalydetection.utils import anomaly_ratio
21 | from deepalign.enums import Heuristic
22 |
23 |
24 | def best_heuristic(taus, theta, y_true, **kwargs):
25 | f1s = [metrics.f1_score(y_true.compressed(), theta(tau=tau, **kwargs).compressed()) for tau in taus]
26 | return taus[np.argmax(f1s)]
27 |
28 |
29 | def elbow_heuristic(taus, theta, **kwargs):
30 | if len(taus) < 4:
31 |         return {Heuristic.ELBOW_DOWN: taus[-1], Heuristic.ELBOW_UP: taus[-1]}  # callers index the result by heuristic
32 | r = np.array([anomaly_ratio(theta(tau=tau, **kwargs)) for tau in taus])
33 | step = taus[1:] - taus[:-1]
34 | r_prime_prime = (r[2:] - 2 * r[1:-1] + r[:-2]) / (step[1:] * step[:-1])
35 | return {
36 | Heuristic.ELBOW_DOWN: taus[np.argmax(r_prime_prime) + 1],
37 | Heuristic.ELBOW_UP: taus[np.argmin(r_prime_prime) + 1]
38 | }
39 |
40 |
41 | def lowest_plateau_heuristic(taus, theta, **kwargs):
42 | if len(taus) < 4:
43 |         return {Heuristic.LP_LEFT: taus[-1], Heuristic.LP_MEAN: taus[-1], Heuristic.LP_RIGHT: taus[-1]}  # callers index by heuristic
44 | r = np.array([anomaly_ratio(theta(tau=tau, **kwargs)) for tau in taus])
45 | r_prime = (r[1:] - r[:-1]) / (taus[1:] - taus[:-1])
46 | stable_region = r_prime > np.mean(r_prime) / 2
47 | regions = np.split(np.arange(len(stable_region)), np.where(~stable_region)[0])
48 | regions = [taus[idx[1:]] for idx in regions if len(idx) > 1]
49 | if len(regions) == 0:
50 | regions = [taus[-2:]]
51 | return {
52 | Heuristic.LP_LEFT: regions[-1].min(),
53 | Heuristic.LP_MEAN: regions[-1].mean(),
54 | Heuristic.LP_RIGHT: regions[-1].max()
55 | }
56 |
57 |
58 | def ratio_heuristic(taus, theta, nu=0.3, **kwargs):
59 | for tau in taus:
60 | r = anomaly_ratio(theta(tau=tau, **kwargs))
61 | if r < nu:
62 | return tau
63 | return 0
64 |
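The heuristics above only see the threshold function through the anomaly ratio r, so any callable theta(tau=..., **kwargs) returning a binary array works. A small sketch for ratio_heuristic with a made-up score vector:

import numpy as np

from deepalign.anomalydetection.heuristic import ratio_heuristic

scores = np.array([0.05, 0.1, 0.2, 0.6, 0.7, 0.9])

def theta(tau, scores):
    # Minimal stand-in for Binarizer.threshold_binarize
    return np.array(scores > tau, dtype=int)

# Returns the first candidate tau that flags fewer than nu = 30% of the entries
taus = np.linspace(0, 1, 20)
print(ratio_heuristic(taus, theta, scores=scores))  # ~0.74: only the 0.9 stays flagged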
--------------------------------------------------------------------------------
/deepalign/anomalydetection/result.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 |
18 | class AnomalyDetectionResult(object):
19 | def __init__(self,
20 | scores,
21 | predictions=None,
22 | attentions=None,
23 | scores_backward=None,
24 | predictions_backward=None,
25 | attentions_backward=None):
26 | self.scores_forward = scores
27 | self.scores_backward = scores_backward
28 |
29 | self.predictions = predictions
30 | self.predictions_backward = predictions_backward
31 |
32 | self.attentions = attentions
33 | self.attentions_backward = attentions_backward
34 |
35 | @property
36 | def scores(self):
37 | return self.scores_forward
38 |
39 | @staticmethod
40 | def minmax_normalize(scores):
41 | return (scores - scores.min()) / (scores.max() - scores.min())
42 |
--------------------------------------------------------------------------------
/deepalign/anomalydetection/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 |
18 | def label_collapse(a, axis=0):
19 | if a.ndim > 1 and axis < 2:
20 | a = a.any(-1)
21 | if a.ndim > 1 and axis < 1:
22 | a = a.any(-1)
23 | return a.astype(int)
24 |
25 |
26 | def max_collapse(a, axis=0):
27 | if a.ndim > 1 and axis < 2:
28 | a = a.max(-1)
29 | if a.ndim > 1 and axis < 1:
30 | a = a.max(-1)
31 | return a
32 |
33 |
34 | def anomaly_ratio(a):
35 | """r in the paper"""
36 | if a.max() == 0:
37 | return 0.
38 | elif a.min() == 1:
39 | return 1.
40 | else:
41 | return a.mean()
42 |
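In effect, label_collapse reduces a (case, event, attribute) label tensor to event level (axis=1) or case level (axis=0) with a logical any: one anomalous attribute marks the whole event, one anomalous event the whole case. For example:

import numpy as np

from deepalign.anomalydetection.utils import label_collapse

labels = np.array([[[0, 1], [0, 0]]])   # 1 case, 2 events, 2 attributes
print(label_collapse(labels, axis=2))   # attribute level: unchanged
print(label_collapse(labels, axis=1))   # event level: [[1 0]]
print(label_collapse(labels, axis=0))   # case level: [1]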
--------------------------------------------------------------------------------
/deepalign/enums.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 |
18 | class AttributeType(object):
19 | CATEGORICAL = 0
20 | NUMERICAL = 1
21 |
22 | @staticmethod
23 | def values():
24 | return ['Categorical', 'Numerical']
25 |
26 | @staticmethod
27 | def keys():
28 | return [AttributeType.CATEGORICAL, AttributeType.NUMERICAL]
29 |
30 | @staticmethod
31 | def items():
32 | return dict(zip(AttributeType.keys(), AttributeType.values()))
33 |
34 |
35 | class FeatureType(object):
36 | CONTROL_FLOW = 0
37 | EVENT = 1
38 | CASE = 2
39 |
40 | @staticmethod
41 | def values():
42 | return ['Control Flow', 'Event', 'Case']
43 |
44 | @staticmethod
45 | def keys():
46 | return [FeatureType.CONTROL_FLOW, FeatureType.EVENT, FeatureType.CASE]
47 |
48 | @staticmethod
49 | def items():
50 | return dict(zip(FeatureType.keys(), FeatureType.values()))
51 |
52 |
53 | class Axis(object):
54 | CASE = 0
55 | EVENT = 1
56 | ATTRIBUTE = 2
57 |
58 | @staticmethod
59 | def values():
60 | return ['Case', 'Event', 'Attribute']
61 |
62 | @staticmethod
63 | def keys():
64 | return [Axis.CASE, Axis.EVENT, Axis.ATTRIBUTE]
65 |
66 | @staticmethod
67 | def items():
68 | return dict(zip(Axis.keys(), Axis.values()))
69 |
70 |
71 | class Class(object):
72 | NORMAL_ATTRIBUTE = -1
73 | NORMAL = 0
74 | ANOMALY = 1
75 | INSERT = 2
76 | SKIP = 3
77 | REWORK = 4
78 | EARLY = 5
79 | LATE = 6
80 | SHIFT = 7
81 | REPLACE = 8
82 | ATTRIBUTE = 9
83 |
84 | @staticmethod
85 | def values():
86 | return ['Normal Attribute', 'Normal', 'Anomaly', 'Insert', 'Skip', 'Rework', 'Early', 'Late', 'Shift',
87 | 'Replace', 'Attribute']
88 |
89 | @staticmethod
90 | def colors():
91 | return ['#F5F5F5', '#F5F5F5', '#F44336', '#3F51B5', '#F57F17', '#388E3C', '#f06292', '#c2185b', '#795548',
92 | '#AB47BC', '#ab47bc']
93 |
94 | @staticmethod
95 | def color(key):
96 | return dict(zip(Class.keys(), Class.colors())).get(key)
97 |
98 | @staticmethod
99 | def keys():
100 |         return [Class.NORMAL_ATTRIBUTE, Class.NORMAL, Class.ANOMALY, Class.INSERT, Class.SKIP, Class.REWORK,
101 |                 Class.EARLY, Class.LATE, Class.SHIFT, Class.REPLACE, Class.ATTRIBUTE]
102 |
103 | @staticmethod
104 | def items():
105 | return dict(zip(Class.keys(), Class.values()))
106 |
107 |
108 | class Mode(object):
109 | BINARIZE = 'binarize'
110 | CLASSIFY = 'classify'
111 |
112 | @staticmethod
113 | def values():
114 | return ['Binarize', 'Classify']
115 |
116 | @staticmethod
117 | def keys():
118 | return [Mode.BINARIZE, Mode.CLASSIFY]
119 |
120 | @staticmethod
121 | def items():
122 | return dict(zip(Mode.keys(), Mode.values()))
123 |
124 |
125 | class Base(object):
126 | LEGACY = 'legacy'
127 | SCORES = 'scores'
128 |
129 | @staticmethod
130 | def values():
131 | return ['Legacy', 'Scores']
132 |
133 | @staticmethod
134 | def keys():
135 | return [Base.LEGACY, Base.SCORES]
136 |
137 | @staticmethod
138 | def items():
139 | return dict(zip(Base.keys(), Base.values()))
140 |
141 |
142 | class Heuristic(object):
143 | DEFAULT = 'default'
144 | MANUAL = 'manual'
145 | BEST = 'best'
146 | ELBOW_DOWN = 'elbow'
147 | ELBOW_UP = 'broken_elbow'
148 | LP_LEFT = 'stable_left'
149 | LP_MEAN = 'stable_mean'
150 | LP_RIGHT = 'stable_right'
151 | MEAN = 'mean'
152 | MEDIAN = 'median'
153 | RATIO = 'ratio'
154 |
155 | @staticmethod
156 | def values():
157 | return [r'$default$', r'$manual$', r'$best$', r'$elbow_\downarrow$', r'$elbow_\uparrow$',
158 | r'$lp_\leftarrow$', r'$lp_\leftrightarrow$', r'$lp_\rightarrow$', r'$\bar{S}$', r'$\tilde{S}$',
159 | r'$ratio$']
160 |
161 | @staticmethod
162 | def keys():
163 | return [Heuristic.DEFAULT, Heuristic.MANUAL, Heuristic.BEST, Heuristic.ELBOW_DOWN, Heuristic.ELBOW_UP,
164 | Heuristic.LP_LEFT, Heuristic.LP_MEAN, Heuristic.LP_RIGHT,
165 | Heuristic.MEAN, Heuristic.MEDIAN, Heuristic.RATIO]
166 |
167 | @staticmethod
168 | def items():
169 | return dict(zip(Heuristic.keys(), Heuristic.values()))
170 |
171 |
172 | class Strategy(object):
173 | DEFAULT = 'default'
174 | SINGLE = 'single'
175 | ATTRIBUTE = 'attribute'
176 | POSITION = 'position'
177 | POSITION_ATTRIBUTE = 'position_attribute'
178 |
179 | @staticmethod
180 | def values():
181 | return ['Default', r'$h$', r'$h^{(a)}$', r'$h^{(e)}$', r'$h^{(ea)}$']
182 |
183 | @staticmethod
184 | def keys():
185 | return [Strategy.DEFAULT, Strategy.SINGLE, Strategy.ATTRIBUTE, Strategy.POSITION, Strategy.POSITION_ATTRIBUTE]
186 |
187 | @staticmethod
188 | def items():
189 | return dict(zip(Strategy.keys(), Strategy.values()))
190 |
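All of these enum-style classes share the same keys()/values()/items() interface, with values() holding display labels (mostly LaTeX, presumably for plot legends). A quick illustration:

from deepalign.enums import Heuristic, Strategy

print(Heuristic.items()[Heuristic.MEAN])     # $\bar{S}$
print(Strategy.items()[Strategy.ATTRIBUTE])  # $h^{(a)}$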
--------------------------------------------------------------------------------
/deepalign/fs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | from pathlib import Path
18 |
19 | import arrow
20 |
21 | # Base
22 | ROOT_DIR = Path(__file__).parent.parent
23 |
24 | # Base directories
25 | OUT_DIR = ROOT_DIR / '.out' # For anything that is being generated
26 | RES_DIR = ROOT_DIR / '.res' # For resources shipped with the repository
27 | CACHE_DIR = OUT_DIR / '.cache' # Used to cache event logs, results, etc.
28 |
29 | # Resources
30 | PROCESS_MODEL_DIR = RES_DIR / 'process_models' # Randomly generated process models from PLG2
31 |
32 | # Output
33 | EVENTLOG_DIR = OUT_DIR / 'eventlogs' # For generated event logs
34 | MODEL_DIR = OUT_DIR / 'models' # For alignment models
35 | PLOT_DIR = OUT_DIR / 'plots'
36 |
37 | # Cache
38 | EVENTLOG_CACHE_DIR = CACHE_DIR / 'eventlogs' # For caching datasets so the event log does not always have to be loaded
39 | RESULT_DIR = CACHE_DIR / 'results' # For caching ConfNet alignments
40 | ALIGNMENTS_DIR = CACHE_DIR / 'alignments' # For caching optimal alignments
41 | CORRECTIONS_DIR = CACHE_DIR / 'corrections' # For caching the original features before applying anomalies
42 |
43 | # Extensions
44 | MODEL_EXT = '.h5'
45 |
46 | # Misc
47 | DATE_FORMAT = 'YYYYMMDD-HHmmss.SSSSSS'
48 |
49 |
50 | def generate():
51 | """Generate directories."""
52 | dirs = [
53 | ROOT_DIR,
54 | OUT_DIR,
55 | RES_DIR,
56 | CACHE_DIR,
57 | RESULT_DIR,
58 | EVENTLOG_CACHE_DIR,
59 | MODEL_DIR,
60 | ALIGNMENTS_DIR,
61 | CORRECTIONS_DIR,
62 | PROCESS_MODEL_DIR,
63 | EVENTLOG_DIR,
64 | PLOT_DIR
65 | ]
66 | for d in dirs:
67 | if not d.exists():
68 | d.mkdir()
69 |
70 |
71 | def split_eventlog_name(name):
72 | s = name.split('-')
73 | model = None
74 | p = None
75 | id = None
76 | if len(s) > 0:
77 | model = s[0]
78 | if len(s) > 1:
79 | p = float(s[1])
80 | if len(s) > 2:
81 | id = int(s[2])
82 | return model, p, id
83 |
84 |
85 | def split_model_name(name):
86 | s = name.split('_')
87 | event_log_name = None
88 | ad = None
89 | date = None
90 | if len(s) > 0:
91 | event_log_name = s[0]
92 | if len(s) > 1:
93 | ad = s[1]
94 | if len(s) > 2:
95 | date = arrow.get(s[2], DATE_FORMAT)
96 | return event_log_name, ad, date
97 |
98 |
99 | class File(object):
100 | ext = None
101 |
102 | def __init__(self, path):
103 | if not isinstance(path, Path):
104 | path = Path(path)
105 |
106 | self.path = path
107 | self.file = self.path.name
108 | self.name = self.path.stem
109 | self.str_path = str(path)
110 |
111 | def remove(self):
112 | import os
113 | if self.path.exists():
114 | os.remove(self.path)
115 |
116 |
117 | class EventLogFile(File):
118 | def __init__(self, path):
119 | if not isinstance(path, Path):
120 | path = Path(path)
121 | if '.json' not in path.suffixes:
122 | path = Path(str(path) + '.json.gz')
123 | if not path.is_absolute():
124 | path = EVENTLOG_DIR / path.name
125 |
126 | super(EventLogFile, self).__init__(path)
127 |
128 | if len(self.path.suffixes) > 1:
129 | self.name = Path(self.path.stem).stem
130 |
131 | self.model, self.p, self.id = split_eventlog_name(self.name)
132 |
133 | @property
134 | def cache_file(self):
135 | return EVENTLOG_CACHE_DIR / (self.name + '.h5')
136 |
137 |
138 | class ModelFile(File):
139 | ext = MODEL_EXT
140 |
141 | def __init__(self, path):
142 | if not isinstance(path, Path):
143 | path = Path(path)
144 | if path.suffix != self.ext:
145 | path = Path(str(path) + self.ext)
146 | if not path.is_absolute():
147 | path = MODEL_DIR / path.name
148 |
149 | super(ModelFile, self).__init__(path)
150 |
151 | self.event_log_name, self.model_name, self.date = split_model_name(self.name)
152 | self.model, self.p, self.id = split_eventlog_name(self.event_log_name)
153 |
154 | @property
155 | def result_file(self):
156 | return RESULT_DIR / (self.name + MODEL_EXT)
157 |
158 |
159 | class AlignerFile(ModelFile):
160 | ext = MODEL_EXT
161 |
162 | def __init__(self, path):
163 | if not isinstance(path, Path):
164 | path = Path(path)
165 | if path.suffix != self.ext:
166 | path = Path(str(path) + self.ext)
167 | if not path.is_absolute():
168 | path = MODEL_DIR / path.name
169 |
170 | super(AlignerFile, self).__init__(path)
171 |
172 | self.event_log_name, self.ad, self.date = split_model_name(self.name)
173 | self.model, self.p, self.id = split_eventlog_name(self.event_log_name)
174 |
175 | self.use_case_attributes = None
176 | self.use_event_attributes = None
177 | if 'confnet' in self.ad:
178 | ea, ca = int(self.ad[-2:-1]), int(self.ad[-1:])
179 | self.use_case_attributes = bool(ca)
180 | self.use_event_attributes = bool(ea)
181 | # self.ad = self.ad[:-2]
182 |
183 | @property
184 | def result_file(self):
185 | return RESULT_DIR / (self.name + MODEL_EXT)
186 |
187 |
188 | class ResultFile(File):
189 | ext = MODEL_EXT
190 |
191 | @property
192 | def model_file(self):
193 | return MODEL_DIR / (self.name + MODEL_EXT)
194 |
195 |
196 | def get_event_log_files(path=None):
197 | if path is None:
198 | path = EVENTLOG_DIR
199 | for f in path.glob('*.json*'):
200 | yield EventLogFile(f)
201 |
202 |
203 | def get_model_files(path=None):
204 | if path is None:
205 | path = MODEL_DIR
206 | for f in path.glob(f'*{MODEL_EXT}'):
207 | yield ModelFile(f)
208 |
209 |
210 | def get_aligner_files(path=None):
211 | if path is None:
212 | path = MODEL_DIR
213 | for f in path.glob(f'*{MODEL_EXT}'):
214 | yield AlignerFile(f)
215 |
216 |
217 | def get_result_files(path=None):
218 | if path is None:
219 | path = RESULT_DIR
220 | for f in path.glob(f'*{MODEL_EXT}'):
221 | yield ResultFile(f)
222 |
223 |
224 | def get_process_model_files(path=None):
225 | if path is None:
226 | path = PROCESS_MODEL_DIR
227 | for f in path.glob('*.plg'):
228 | yield f.stem
229 |
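The File subclasses encode metadata in the file name itself: event logs follow a <model>-<p>-<id> pattern (see split_eventlog_name), and model files additionally carry a detector name and a date. A sketch of the round-trip; the concrete log name is hypothetical:

from deepalign.fs import EventLogFile, split_eventlog_name

print(split_eventlog_name('p2p-0.3-1'))  # ('p2p', 0.3, 1)

f = EventLogFile('p2p-0.3-1')  # resolves to <root>/.out/eventlogs/p2p-0.3-1.json.gz
print(f.model, f.p, f.id)      # p2p 0.3 1
print(f.cache_file)            # <root>/.out/.cache/eventlogs/p2p-0.3-1.h5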
--------------------------------------------------------------------------------
/deepalign/generation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | from deepalign.generation.anomaly import *
18 | from deepalign.generation.attribute_generator import *
19 | from deepalign.generation.event_log_generator import EventLogGenerator
20 |
--------------------------------------------------------------------------------
/deepalign/generation/attribute_generator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 |
19 | DECIMALS = 2
20 |
21 |
22 | class AttributeGenerator(object):
23 | def __init__(self, name):
24 | self.name = name
25 |
26 | def random_value(self):
27 | pass
28 |
29 | def __str__(self):
30 | return self.__class__.__name__[:-9]
31 |
32 | @property
33 | def json(self):
34 | return dict(type=str(self), parameters=dict((k, v) for k, v in vars(self).items()))
35 |
36 |
37 | class CategoricalAttributeGenerator(AttributeGenerator):
38 | def __init__(self, name, values=10, domain=None, min_group=1, max_group=None, p=None):
39 | super(CategoricalAttributeGenerator, self).__init__(name=name)
40 |
41 | self.p = p
42 |
43 | if isinstance(values, int):
44 | values = list(range(1, values + 1))
45 | elif not isinstance(values, list):
46 | raise TypeError('Incompatible values type, must be list.')
47 |
48 | self.values = sorted(values)
49 |
50 | if domain is None:
51 | self.domain = self.values
52 | else:
53 | self.domain = sorted(domain)
54 |
55 | if max_group is None or max_group > len(self.domain):
56 | self.max_group = len(self.domain)
57 | else:
58 | self.max_group = max_group
59 |
60 | self.min_group = max(min_group, 1)
61 | if self.min_group >= self.max_group:
62 | self.min_group = self.max_group - 1
63 |
64 | def create(self, variance=0):
65 | values = np.random.choice(self.values, np.random.randint(self.min_group, self.max_group + 1), replace=False)
66 | p = np.random.randint(1, variance + 2, len(values))
67 | p = p / np.sum(p)
68 | return CategoricalAttributeGenerator(name=self.name, values=values.tolist(), domain=self.domain, p=p)
69 |
70 | def random_value(self):
71 | return str(np.random.choice(self.values, p=self.p))
72 |
73 | def incorrect_value(self):
74 | values = [x for x in self.domain if x not in self.values]
75 | if len(values) == 0:
76 | raise AttributeError('No incorrect values possible.')
77 | return str(np.random.choice(values))
78 |
79 |
80 | class NumericalAttributeGenerator(AttributeGenerator):
81 | def __init__(self, name):
82 | super(NumericalAttributeGenerator, self).__init__(name=name)
83 |
84 |
85 | class UniformNumericalAttributeGenerator(NumericalAttributeGenerator):
86 | def __init__(self, name, low=0, high=100):
87 | super(UniformNumericalAttributeGenerator, self).__init__(name=name)
88 | self.low = float(low)
89 | self.high = float(high)
90 |
91 | def random_value(self):
92 | return np.round(np.random.uniform(self.low, self.high), DECIMALS).astype(float)
93 |
94 | def incorrect_value(self):
95 | diff = np.abs(self.high - self.low)
96 | smaller = np.random.uniform(self.low - diff, self.low)
97 | greater = np.random.uniform(self.high, self.high + diff)
98 | return np.round(np.random.choice([smaller, greater]), DECIMALS).astype(float)
99 |
100 |
101 | class NormalNumericalAttributeGenerator(NumericalAttributeGenerator):
102 | def __init__(self, name, sigma=1.0, mu=0.0):
103 | super(NormalNumericalAttributeGenerator, self).__init__(name=name)
104 | self.sigma = float(sigma)
105 | self.mu = float(mu)
106 |
107 | def random_value(self):
108 | return np.round(np.random.normal(loc=self.mu, scale=self.sigma), DECIMALS).astype(float)
109 |
110 | def incorrect_value(self):
111 | smaller = np.random.normal(loc=self.mu - self.sigma * 10, scale=self.sigma)
112 | greater = np.random.normal(loc=self.mu + self.sigma * 10, scale=self.sigma)
113 | return np.round(np.random.choice([smaller, greater]), DECIMALS).astype(float)
114 |
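A categorical generator samples plausible values from values and implausible ones from the remainder of domain, which is how attribute anomalies obtain their incorrect values. A short sketch, with the day lists written inline (mirroring example_values):

from deepalign.generation.attribute_generator import CategoricalAttributeGenerator

working_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
week_days = working_days + ['Saturday', 'Sunday']

gen = CategoricalAttributeGenerator(name='day', values=working_days, domain=week_days)
print(gen.random_value())     # e.g. 'Tuesday'
print(gen.incorrect_value())  # 'Saturday' or 'Sunday', never a working day
print(gen.json)               # serializable description stored in the log metadata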
--------------------------------------------------------------------------------
/deepalign/generation/event_log_generator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import itertools
18 | import uuid
19 |
20 | import networkx as nx
21 | import numpy as np
22 |
23 | from deepalign.fs import PLOT_DIR
24 | from deepalign.generation import AttributeGenerator
25 | from deepalign.generation import NoneAnomaly
26 | from deepalign.processmining import ProcessMap
27 | from deepalign.processmining.log import EventLog
28 |
29 |
30 | class EventLogGenerator(object):
31 | def __init__(self, process_map=None, event_attributes=None, case_attributes=None):
32 | self.process_map = None
33 | self.likelihood_graph = None
34 | self.event_attributes = self._check_attributes(event_attributes)
35 | self.case_attributes = self._check_attributes(case_attributes)
36 |
37 | if process_map is not None:
38 | if isinstance(process_map, str):
39 | self.process_map = ProcessMap.from_plg(process_map)
40 | elif isinstance(process_map, ProcessMap):
41 | self.process_map = process_map
42 | else:
43 | raise TypeError('Only String and ProcessMap are supported.')
44 |
45 | @staticmethod
46 | def _check_attributes(attributes):
47 | if isinstance(attributes, list):
48 | if not all([isinstance(a, AttributeGenerator) for a in attributes]):
49 |                 raise TypeError('Not all attributes are of class AttributeGenerator.')
50 | else:
51 | return attributes
52 | else:
53 | return []
54 |
55 | def build_likelihood_graph(self,
56 | activity_dependency_p=0.0,
57 | attribute_dependency_p=0.0,
58 | probability_variance_max=None,
59 | seed=None):
60 |
61 | def add_attribute_dependency_between(source, target, p):
62 | attribute_values = []
63 | attribute_min_groups = []
64 | attribute_max_groups = []
65 |
66 | for attribute in self.event_attributes:
67 | attribute_min_groups.append(attribute.min_group)
68 | attribute_max_groups.append(attribute.max_group)
69 |
70 | num_values = np.random.randint(attribute.min_group, attribute.max_group + 1)
71 |
72 | values = np.random.choice(attribute.values, num_values, replace=False)
73 |
74 | attribute_values.append(values)
75 |
76 | combinations = np.array(list(itertools.product(*attribute_values)))
77 | if np.random.uniform(0, 1) >= p:
78 | random_indices = range(len(combinations))
79 | else:
80 | random_indices = np.random.choice(
81 | range(len(combinations)),
82 | np.random.randint(
83 | np.max(attribute_min_groups),
84 | np.max(attribute_max_groups)
85 | )
86 | )
87 |
88 | nodes = {source: source, target: target}
89 | for attribute_values in combinations[random_indices]:
90 | path = [source, *attribute_values, target]
91 | names = [a.name for a in self.event_attributes]
92 |
93 | for i, (s, t) in enumerate(zip(path[:-1], path[1:])):
94 | if s not in nodes:
95 | nodes[s] = uuid.uuid1()
96 | self.likelihood_graph.add_node(nodes[s], name=names[i - 1], value=s)
97 |
98 | if t not in nodes:
99 | nodes[t] = uuid.uuid1()
100 | self.likelihood_graph.add_node(nodes[t], name=names[i], value=t)
101 |
102 | self.likelihood_graph.add_edge(nodes[s], nodes[t])
103 |
104 | def add_activity_dependency_to(g, source):
105 | source_value = self.likelihood_graph.nodes[source]['value']
106 |
107 | if source_value == EventLog.end_symbol:
108 | return
109 | else:
110 | targets = []
111 | for target in g.successors(source_value):
112 | if target not in nodes:
113 | nodes[target] = []
114 |
115 | split_activity = np.random.uniform(0, 1) <= activity_dependency_p
116 | if (split_activity or not nodes[target]) and target != EventLog.end_symbol:
117 | identifier = uuid.uuid1()
118 | nodes[target].append(identifier)
119 | self.likelihood_graph.add_node(identifier, value=target, name='name')
120 | targets.append(identifier)
121 | else:
122 | targets.append(np.random.choice(nodes[target]))
123 |
124 | for target in targets:
125 | if source_value != EventLog.start_symbol:
126 | if source not in edges:
127 | edges[source] = []
128 |
129 | if target not in edges[source]:
130 | if len(self.event_attributes) > 0:
131 | add_attribute_dependency_between(source, target, attribute_dependency_p)
132 | else:
133 | self.likelihood_graph.add_edge(source, target)
134 | edges[source].append(target)
135 | else:
136 | self.likelihood_graph.add_edge(source, target)
137 |
138 | add_activity_dependency_to(g, target)
139 |
140 | # Set seed for consistency
141 | if seed is not None:
142 | np.random.seed(seed)
143 |
144 | # Init graph
145 | self.likelihood_graph = nx.DiGraph()
146 |
147 | # Init helper dictionaries
148 | nodes = {}
149 | edges = {}
150 | for node in self.process_map.graph:
151 | if node in [EventLog.start_symbol, EventLog.end_symbol]:
152 | self.likelihood_graph.add_node(node, value=node, name='name')
153 | nodes[node] = [node]
154 |
155 | # Add attribute and activity dependencies
156 | add_activity_dependency_to(self.process_map.graph, EventLog.start_symbol)
157 |
158 | # Annotate with probabilities
159 | for node in self.likelihood_graph:
160 | if node == EventLog.end_symbol:
161 | continue
162 |
163 | successors = list(self.likelihood_graph.successors(node))
164 |
165 | if probability_variance_max is not None:
166 | variance = np.random.random() * np.abs(probability_variance_max) + .0001
167 | probabilities = np.abs(np.random.normal(0, variance, len(successors)))
168 | probabilities /= np.sum(probabilities)
169 | else:
170 | probabilities = np.ones(len(successors)) / len(successors)
171 |
172 | for successor, probability in zip(successors, probabilities):
173 | self.likelihood_graph.nodes[successor]['probability'] = probability
174 | self.likelihood_graph.edges[node, successor]['probability'] = np.round(probability, 2)
175 |
176 | return self.likelihood_graph
177 |
178 | def generate(self,
179 | size,
180 | anomalies=None,
181 | anomaly_p=None,
182 | anomaly_type_p=None,
183 | activity_dependency_p=.5,
184 | attribute_dependency_p=.5,
185 | probability_variance_max=None,
186 | seed=None,
187 | show_progress='tqdm',
188 | likelihood_graph=None):
189 |
190 | def random_walk(g):
191 | node = EventLog.start_symbol
192 |
193 | # Random walk until we reach the end event
194 | path = []
195 | while node != EventLog.end_symbol:
196 | # Skip the start node
197 | if node != EventLog.start_symbol:
198 | path.append(node)
199 |
200 | # Get successors for node
201 | successors = list(g.successors(node))
202 |
203 | # Retrieve probabilities from nodes
204 | p = [g.edges[node, s]['probability'] for s in successors]
205 |
206 | # Check for and fix rounding errors
207 | if np.sum(p) != 0:
208 | p /= np.sum(p)
209 |
210 |             # Choose a random successor based on the probabilities
211 | node = np.random.choice(successors, p=p)
212 |
213 | return path
214 |
215 | if seed is not None:
216 | np.random.seed(seed)
217 |
218 | # Build the likelihood graph
219 | # TODO: Persist the likelihood graph
220 | if likelihood_graph is not None:
221 | self.likelihood_graph = likelihood_graph
222 | else:
223 | self.build_likelihood_graph(
224 | activity_dependency_p=activity_dependency_p,
225 | attribute_dependency_p=attribute_dependency_p,
226 | probability_variance_max=probability_variance_max,
227 | seed=seed
228 | )
229 |
230 | # Add metadata to anomalies
231 | activities = sorted(list(set([self.likelihood_graph.nodes[node]['value'] for node in self.likelihood_graph
232 | if self.likelihood_graph.nodes[node]['name'] == 'name'
233 | and self.likelihood_graph.nodes[node]['value'] not in
234 | [EventLog.start_symbol, EventLog.end_symbol]])))
235 | none_anomaly = NoneAnomaly()
236 | none_anomaly.activities = activities
237 | none_anomaly.graph = self.likelihood_graph
238 | none_anomaly.attributes = self.event_attributes
239 | for anomaly in anomalies:
240 | anomaly.activities = activities
241 | anomaly.graph = self.likelihood_graph
242 | anomaly.attributes = self.event_attributes
243 |
244 | # Generate the event log
245 | if show_progress == 'tqdm':
246 | from tqdm import tqdm
247 | iter = tqdm(range(size), desc='Generate event log')
248 | elif show_progress == 'tqdm_notebook':
249 | from tqdm import tqdm_notebook
250 | iter = tqdm_notebook(range(size), desc='Generate event log')
251 | else:
252 | iter = range(size)
253 |
254 | # Case attributes
255 | case_dependencies = {}
256 | if len(self.case_attributes) > 0:
257 | for variant in self.process_map.variants:
258 | key = '->'.join(variant.trace)
259 | case_dependencies[key] = []
260 | for case_attr in self.case_attributes:
261 | case_dependencies[key].append(case_attr.create(10))
262 |
263 | # Apply anomalies and add case id
264 | cases = []
265 | for case_id, path in enumerate([random_walk(self.likelihood_graph) for _ in iter], start=1):
266 | variant = '->'.join(none_anomaly.path_to_case(path).trace)
267 | case_attrs = case_dependencies[variant] if variant in case_dependencies else []
268 | if np.random.uniform(0, 1) <= anomaly_p:
269 | anomaly = np.random.choice(anomalies, p=anomaly_type_p)
270 | else:
271 | anomaly = none_anomaly
272 | case = anomaly.apply_to_path(path)
273 | case.id = case_id
274 | for case_attr in case_attrs:
275 | case.attributes[case_attr.name] = case_attr.random_value()
276 | cases.append(case)
277 |
278 | event_log = EventLog(cases=cases)
279 |
280 | event_log.attributes['generation_parameters'] = dict(
281 | size=size,
282 | attributes=[a.json for a in self.event_attributes],
283 | anomalies=[a.json for a in anomalies],
284 | anomaly_p=anomaly_p,
285 | anomaly_type_p=anomaly_type_p,
286 | activity_dependency_p=activity_dependency_p,
287 | attribute_dependency_p=attribute_dependency_p,
288 | probability_variance_max=probability_variance_max,
289 |             seed=int(seed) if seed is not None else None
290 | )
291 |
292 | return event_log
293 |
294 | def plot_likelihood_graph(self, file_name=None, figsize=None):
295 | from deepalign.utils import microsoft_colors
296 | from matplotlib import pylab as plt
297 |
298 | l = self.likelihood_graph
299 | pos = nx.drawing.nx_agraph.graphviz_layout(l, prog='dot')
300 |
301 | if figsize is None:
302 | figsize = (10, 14)
303 | fig = plt.figure(1, figsize=figsize)
304 |
305 | attribute_names = [a.name for a in self.event_attributes]
306 | attribute_colors = microsoft_colors[3:]
307 | colors = dict(zip(attribute_names, attribute_colors))
308 |
309 | color_map = []
310 | for node in l:
311 | if node in [EventLog.start_symbol, EventLog.end_symbol]:
312 | color_map.append(microsoft_colors[0])
313 | elif l.nodes[node]['name'] == 'name':
314 | color_map.append(microsoft_colors[2])
315 | else:
316 | color_map.append(colors[l.nodes[node]['name']])
317 | nx.draw(l, pos, node_color=color_map)
318 | nx.draw_networkx_labels(l, pos, labels=nx.get_node_attributes(l, 'value'))
319 | nx.draw_networkx_edge_labels(l, pos, edge_labels=nx.get_edge_attributes(l, 'probability'))
320 |
321 | if file_name is not None:
322 | # Save to disk
323 | fig.savefig(str(PLOT_DIR / file_name))
324 | plt.close()
325 | else:
326 | plt.show()
327 |
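End to end, the generator builds the likelihood graph from a process model and then random-walks it. A minimal sketch, assuming the bundled 'paper' model, no anomalies, and no attribute generators (plotting is skipped, since it needs pygraphviz):

from deepalign.generation import EventLogGenerator

generator = EventLogGenerator('paper')  # the name is resolved via ProcessMap.from_plg
event_log = generator.generate(size=10, anomalies=[], anomaly_p=0.0,
                               seed=42, show_progress=None)
print(event_log.attributes['generation_parameters']['size'])  # 10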
--------------------------------------------------------------------------------
/deepalign/generation/example_values.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | user_names = ['Roy', 'Earl', 'James', 'Charles', 'Ryan', 'Marilyn', 'Emily', 'Craig', 'Howard', 'Amanda', 'Johnny',
18 | 'Brian', 'Jack', 'Paul', 'Joe', 'Ronald', 'Donald', 'Anna', 'Steve', 'Lisa', 'Gema', 'Doretta', 'Hannah',
19 | 'Maryellen', 'Pam', 'Sherell', 'Micheline', 'Shandi', 'Hugo', 'Jamika', 'Brant', 'Rossana', 'Della',
20 | 'Velda', 'Hoyt', 'Tiffiny', 'Frances', 'Alpha', 'Jimmy', 'Junior', 'Issac', 'Evelin', 'Deloras', 'Hassie',
21 | 'Josef', 'Clayton', 'Sandra', 'Rossie', 'Vickie', 'Lourdes', 'Jin', 'Sigrid', 'Elisha', 'Sherlene',
22 | 'Lucy', 'Chan', 'Lannie', 'Alyce', 'Melany', 'Wilton', 'Seth', 'Sonia', 'Iluminada', 'Michaele', 'Ling',
23 | 'Keven', 'Roseanne', 'Sharee', 'Carmella', 'Grayce']
24 |
25 | working_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
26 | weekend_days = ['Saturday', 'Sunday']
27 | week_days = working_days + weekend_days
28 |
29 | company_names = ['Openlane', 'Yearin', 'Goodsilron', 'Condax', 'Opentech', 'Golddex', 'year-job', 'Isdom', 'Gogozoom',
30 | 'Y-corporation', 'Nam-zim', 'Donquadtech', 'Warephase', 'Donware', 'Faxquote', 'Sunnamplex',
31 | 'Lexiqvolax', 'Sumace', 'Treequote', 'Iselectrics', 'Zencorporation', 'Plusstrip', 'dambase',
32 | 'Toughzap', 'Codehow', 'Zotware', 'Statholdings', 'Conecom', 'Zathunicon', 'Labdrill', 'Ron-tech',
33 | 'Green-Plus', 'Groovestreet', 'Zoomit', 'Bioplex', 'Zumgoity', 'Scotfind', 'Dalttechnology',
34 | 'Kinnamplus', 'Konex', 'Stanredtax', 'Cancity', 'Finhigh', 'Kan-code', 'Blackzim', 'Dontechi',
35 | 'Xx-zobam', 'Fasehatice', 'Hatfan', 'Streethex', 'Inity', 'Konmatfix', 'Bioholding', 'Hottechi',
36 | 'Ganjaflex', 'Betatech', 'Domzoom', 'Ontomedia', 'Newex', 'Betasoloin', 'Mathtouch', 'Rantouch',
37 | 'Silis', 'Plussunin', 'Plexzap', 'Finjob', 'Xx-holding', 'Scottech', 'Funholding', 'Sonron',
38 | 'Singletechno', 'Rangreen', 'J-Texon', 'Rundofase', 'Doncon']
39 |
40 | countries = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla',
41 | 'Antarctica', 'Antigua And Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
42 | 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
43 | 'Bhutan', 'Bolivia', 'Bosnia And Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil',
44 | 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia',
45 | 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China',
46 | 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo',
47 | 'Congo, Democratic Republic', 'Cook Islands', 'Costa Rica', 'Cote D\'Ivoire', 'Croatia', 'Cuba', 'Cyprus',
48 | 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
49 | 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)',
50 | 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia',
51 | 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece',
52 | 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana',
53 | 'Haiti', 'Heard Island & Mcdonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong',
54 | 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic Of', 'Iraq', 'Ireland', 'Isle Of Man',
55 | 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Korea',
56 | 'Kuwait', 'Kyrgyzstan', 'Lao People\'s Democratic Republic', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia',
57 | 'Libyan Arab Jamahiriya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia', 'Madagascar',
58 | 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania',
59 | 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States Of', 'Moldova', 'Monaco', 'Mongolia',
60 | 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands',
61 | 'Netherlands Antilles', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue',
62 | 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau',
63 | 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines',
64 | 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Reunion', 'Romania', 'Russian Federation',
65 | 'Rwanda', 'Saint Barthelemy', 'Saint Helena', 'Saint Kitts And Nevis', 'Saint Lucia', 'Saint Martin',
66 | 'Saint Pierre And Miquelon', 'Saint Vincent And Grenadines', 'Samoa', 'San Marino',
67 | 'Sao Tome And Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore',
68 | 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia And Sandwich Isl.',
69 | 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Svalbard And Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland',
70 | 'Syrian Arab Republic', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau',
71 | 'Tonga', 'Trinidad And Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks And Caicos Islands', 'Tuvalu',
72 | 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States',
73 | 'United States Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Viet Nam',
74 | 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis And Futuna', 'Western Sahara', 'Yemen',
75 | 'Zambia', 'Zimbabwe']
76 |
77 | countries_iso = ['AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR', 'AM', 'AW', 'AU', 'AT', 'AZ', 'BS',
78 | 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ', 'BM', 'BT', 'BO', 'BA', 'BW', 'BV', 'BR', 'IO', 'BN', 'BG',
79 | 'BF', 'BI', 'KH', 'CM', 'CA', 'CV', 'KY', 'CF', 'TD', 'CL', 'CN', 'CX', 'CC', 'CO', 'KM', 'CG', 'CD',
80 | 'CK', 'CR', 'CI', 'HR', 'CU', 'CY', 'CZ', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE',
81 | 'ET', 'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE', 'GH', 'GI', 'GR', 'GL',
82 | 'GD', 'GP', 'GU', 'GT', 'GG', 'GN', 'GW', 'GY', 'HT', 'HM', 'VA', 'HN', 'HK', 'HU', 'IS', 'IN', 'ID',
83 | 'IR', 'IQ', 'IE', 'IM', 'IL', 'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'KI', 'KR', 'KW', 'KG', 'LA',
84 | 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'LU', 'MO', 'MK', 'MG', 'MW', 'MY', 'MV', 'ML', 'MT', 'MH',
85 | 'MQ', 'MR', 'MU', 'YT', 'MX', 'FM', 'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA', 'NR', 'NP',
86 | 'NL', 'AN', 'NC', 'NZ', 'NI', 'NE', 'NG', 'NU', 'NF', 'MP', 'NO', 'OM', 'PK', 'PW', 'PS', 'PA', 'PG',
87 | 'PY', 'PE', 'PH', 'PN', 'PL', 'PT', 'PR', 'QA', 'RE', 'RO', 'RU', 'RW', 'BL', 'SH', 'KN', 'LC', 'MF',
88 | 'PM', 'VC', 'WS', 'SM', 'ST', 'SA', 'SN', 'RS', 'SC', 'SL', 'SG', 'SK', 'SI', 'SB', 'SO', 'ZA', 'GS',
89 | 'ES', 'LK', 'SD', 'SR', 'SJ', 'SZ', 'SE', 'CH', 'SY', 'TW', 'TJ', 'TZ', 'TH', 'TL', 'TG', 'TK', 'TO',
90 | 'TT', 'TN', 'TR', 'TM', 'TC', 'TV', 'UG', 'UA', 'AE', 'GB', 'US', 'UM', 'UY', 'UZ', 'VU', 'VE', 'VN',
91 | 'VG', 'VI', 'WF', 'EH', 'YE', 'ZM', 'ZW']
92 |
--------------------------------------------------------------------------------
/deepalign/generation/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import itertools
18 | from typing import Iterable
19 |
20 | import numpy as np
21 |
22 | from deepalign import EventLogGenerator
23 | from deepalign.generation import AttributeAnomaly
24 | from deepalign.generation import CategoricalAttributeGenerator
25 | from deepalign.generation.example_values import company_names
26 | from deepalign.generation.example_values import countries
27 | from deepalign.generation.example_values import user_names
28 | from deepalign.generation.example_values import week_days
29 | from deepalign.generation.example_values import working_days
30 | from deepalign.processmining import ProcessMap
31 |
32 |
33 | def generate_for_process_model(process_model, size=5000, anomalies=None, anomaly_p=0.3, num_attr=0,
34 | activity_dependency_ps=.25, attribute_dependency_ps=.75, p_var=5, seed=0, postfix='',
35 | show_progress='tqdm'):
36 |     if show_progress == 'tqdm_notebook':
37 |         from tqdm import tqdm_notebook as tqdm
38 |     else:
39 |         from tqdm import tqdm  # fallback for any other value; tqdm is used unconditionally below
40 |
41 | if not isinstance(anomaly_p, Iterable):
42 | anomaly_p = [anomaly_p]
43 |
44 | if not isinstance(num_attr, Iterable):
45 | num_attr = [num_attr]
46 |
47 | if not isinstance(activity_dependency_ps, Iterable):
48 | activity_dependency_ps = [activity_dependency_ps]
49 |
50 | if not isinstance(attribute_dependency_ps, Iterable):
51 | attribute_dependency_ps = [attribute_dependency_ps]
52 |
53 | if not isinstance(p_var, Iterable):
54 | p_var = [p_var]
55 |
56 | process_map = ProcessMap.from_plg(process_model)
57 |
58 | event_attributes = [
59 | CategoricalAttributeGenerator(name='user', values=user_names, min_group=1, max_group=5),
60 | CategoricalAttributeGenerator(name='day', values=working_days, domain=week_days, min_group=2, max_group=5),
61 | CategoricalAttributeGenerator(name='country', values=countries, min_group=1, max_group=5),
62 | CategoricalAttributeGenerator(name='company', values=company_names, min_group=1, max_group=5)
63 | ]
64 |
65 | case_attributes = [
66 | CategoricalAttributeGenerator(name='decision', values=20, min_group=2, max_group=5),
67 | CategoricalAttributeGenerator(name='topic', values=10, min_group=2, max_group=8),
68 | CategoricalAttributeGenerator(name='quality', values=50, min_group=8, max_group=20),
69 | CategoricalAttributeGenerator(name='system', values=5, min_group=1, max_group=5)
70 | ]
71 |
72 | if anomalies is None:
73 | anomalies = []
74 |
75 | parameters = list(itertools.product(anomaly_p,
76 | num_attr,
77 | activity_dependency_ps,
78 | attribute_dependency_ps,
79 | p_var))
80 |
81 | np.random.seed(seed)
82 | seeds = np.random.randint(0, 10000, size=len(parameters))
83 |
84 | for seed, params in tqdm(zip(seeds, parameters), desc=process_model, total=len(seeds)):
85 | anom_p, num_attr, act_dep_p, attr_dep_p, p_var = params
86 |
87 | _anomalies = anomalies
88 | if num_attr == 0:
89 | _anomalies = [a for a in anomalies if not isinstance(a, AttributeAnomaly)]
90 |
91 | # Save event log
92 | generator = EventLogGenerator(process_map,
93 | event_attributes=event_attributes[:num_attr],
94 | case_attributes=case_attributes[:num_attr])
95 | event_log = generator.generate(size=size,
96 | anomalies=_anomalies,
97 | anomaly_p=anom_p,
98 | activity_dependency_p=act_dep_p,
99 | attribute_dependency_p=attr_dep_p,
100 | probability_variance_max=p_var,
101 | seed=seed,
102 | show_progress=show_progress)
103 |
104 | generator.plot_likelihood_graph(f'graph_{process_model}{postfix}-{anom_p}-{num_attr}.pdf', figsize=(20, 50))
105 |
106 | event_log.save(process_model + postfix, anom_p, num_attr)
107 |
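generate_for_process_model sweeps the Cartesian product of all parameter lists and writes one event log per combination. A hedged sketch; anomaly_p is kept at 0.0 because no anomaly objects are supplied here, and the likelihood-graph plot requires Graphviz/pygraphviz to be installed:

from deepalign.generation.utils import generate_for_process_model

# 1 x 2 = 2 parameter combinations, hence two generated logs
generate_for_process_model('paper', size=1000, anomaly_p=0.0, num_attr=[0, 2], seed=1)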
--------------------------------------------------------------------------------
/deepalign/processmining/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | from deepalign.processmining.case import Case
18 | from deepalign.processmining.event import Event
19 | from deepalign.processmining.log import EventLog
20 | from deepalign.processmining.process_map import ProcessMap
21 |
--------------------------------------------------------------------------------
/deepalign/processmining/alignments.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 |
19 |
20 | class Penalties:
21 | MATCH = 1
22 | MISMATCH = -np.inf
23 | GAP = -1
24 |
25 |
26 | PADDING_VAL = 0
27 |
28 |
29 | def align(a, b):
30 | a = np.trim_zeros(a)
31 | m = len(a) + 1
32 |
33 | beam = np.trim_zeros(b)
34 |     n = len(beam) + 1  # Technically, all beams should have the same length, but better safe than sorry :)
35 |
36 | # Initialize alignment and direction matrices
37 | alignment_matrix = np.zeros((m, n))
38 | direction_matrix = np.zeros((m, n, 3)) # 0 = Diagonal, 1 = Vertical, 2 = Horizontal
39 |
40 |     # Fill first column with gap penalty
41 | alignment_matrix[:, 0] = np.arange(m) * Penalties.GAP
42 | direction_matrix[:, 0, :] = np.repeat([[0, 0, 1]], m, axis=0)
43 | direction_matrix[0, :, :] = np.repeat([[0, 1, 0]], n, axis=0)
44 |
45 |     # Fill first row with gap penalty
46 | alignment_matrix[0, :] = np.arange(n) * Penalties.GAP
47 | direction_matrix[0, 0, :] = [1, 0, 0]
48 |
49 | # Build matrices
50 | for i in range(1, m):
51 | for j in range(1, n):
52 | penalty = Penalties.MATCH if a[i - 1] == beam[j - 1] else Penalties.MISMATCH
53 |
54 | d = alignment_matrix[i - 1][j - 1] + penalty # Diagonal / Top-Left - Match / Mismatch
55 | v = alignment_matrix[i - 1][j] + Penalties.GAP # Vertical / Top - Gap
56 | h = alignment_matrix[i][j - 1] + Penalties.GAP # Horizontal / Left - Gap
57 |
58 | candidates = np.array([d, v, h])
59 | max_val = np.amax(candidates)
60 |
61 | alignment_matrix[i][j] = max_val
62 | direction_matrix[i][j] = (candidates == max_val) * 1
63 |
64 | # Init stack with lower-right corner and empty alignments
65 | stack = [(m - 1, n - 1, [], [])]
66 | alignments = []
67 |     while len(stack) > 0:  # trace back every alternative path pushed onto the stack below
68 | i, j, alignment_a, alignment_b = stack.pop()
69 |
70 | # Trace back path
71 | while i > 0 or j > 0:
72 | c_dir = direction_matrix[i][j]
73 |
74 | old_i, old_j = i, j
75 | already_moved = False
76 |
77 | # Diagonal - Match / Mismatch
78 | if c_dir[0] > 0:
79 | alignment_a.append(a[i - 1])
80 | alignment_b.append(beam[j - 1])
81 | i -= 1
82 | j -= 1
83 | already_moved = True
84 |
85 | # Vertical - Gap in Beam
86 | if c_dir[1] > 0:
87 | if not already_moved:
88 | alignment_a.append(a[i - 1])
89 | alignment_b.append(PADDING_VAL)
90 | i -= 1
91 | already_moved = True
92 | else:
93 | alignment_a_ = alignment_a[:-1].copy()
94 | alignment_b_ = alignment_b[:-1].copy()
95 | alignment_a_.append(a[old_i - 1])
96 | alignment_b_.append(PADDING_VAL)
97 | stack.append((old_i - 1, old_j, alignment_a_, alignment_b_))
98 |
99 | # Horizontal - Gap in Original
100 | if c_dir[2] > 0:
101 | if not already_moved:
102 | alignment_a.append(PADDING_VAL)
103 | alignment_b.append(beam[j - 1])
104 | j -= 1
105 | already_moved = True
106 | else:
107 | alignment_a_ = alignment_a[:-1].copy()
108 | alignment_b_ = alignment_b[:-1].copy()
109 | alignment_a_.append(PADDING_VAL)
110 | alignment_b_.append(beam[old_j - 1])
111 | stack.append((old_i, old_j - 1, alignment_a_, alignment_b_))
112 |
113 | alignment_a.reverse()
114 | alignment_b.reverse()
115 |
116 | alignments.append(np.array([alignment_a, alignment_b]))
117 |
118 | return alignments
119 |
120 |
121 | def needleman_wunsch(a, b):
122 | if isinstance(a[0], list) and isinstance(b[0], list):
123 | return [align(_a, _b) for _a, _b in zip(a, b)]
124 | else:
125 | return align(a, b)
126 |
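A minimal usage sketch of the aligner above. Since Penalties.MISMATCH is -inf, mismatches are effectively forbidden and every difference is expressed as a gap (PADDING_VAL); the zero-padded activity ids here are illustrative:

    import numpy as np
    from deepalign.processmining.alignments import needleman_wunsch

    a = np.array([1, 2, 3, 4, 0, 0])  # trailing zeros are padding and get trimmed
    b = np.array([1, 3, 4, 0, 0, 0])

    # Returns a list of optimal alignments; each is a 2 x L array with
    # gaps encoded as PADDING_VAL (0).
    for alignment in needleman_wunsch(a, b):
        print(alignment)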
--------------------------------------------------------------------------------
/deepalign/processmining/case.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 | import pandas as pd
19 |
20 | from deepalign.processmining.event import Event
21 |
22 |
23 | class Case(object):
24 | def __init__(self, id=None, events=None, **kwargs):
25 | self.id = id
26 | if events is None:
27 | self.events = []
28 | else:
29 | self.events = events
30 | self.attributes = dict(kwargs)
31 |
32 | def __eq__(self, other):
33 | if len(self) != len(other):
34 | return False
35 | return all([a == b for a, b in zip(self, other)]) and self.attributes == other.attributes
36 |
37 | def __iter__(self):
38 | return iter(self.events)
39 |
40 | def __str__(self):
41 | s = [f'Case {self.id}: #events = {self.num_events}']
42 |
43 | attributes = [f'{key} = {value}' for key, value in self.attributes.items()]
44 | if len(attributes) > 0:
45 |             s[0] += f', {", ".join(attributes)}'
46 |
47 | s.append('-' * len(s[0]))
48 |
49 | for i, event in enumerate(self.events):
50 | _s = f'Event {i + 1}: name = {event.name}, timestamp = {event.timestamp}'
51 |
52 | attributes = [f'{key} = {value}' for key, value in event.attributes.items()]
53 | if len(attributes) > 0:
54 | _s += f', {", ".join(attributes)}'
55 |
56 | s.append(_s)
57 |
58 | s.append('')
59 |
60 | return '\n'.join(s)
61 |
62 | def __getitem__(self, indices):
63 | return np.asarray(self.events)[indices]
64 |
65 | def __setitem__(self, index, value):
66 | self.events[index] = value
67 |
68 | def __len__(self):
69 | return len(self.events)
70 |
71 |     def index(self, event):
72 |         return self.events.index(event)
73 |
74 | def add_event(self, event):
75 | self.events.append(event)
76 |
77 | @property
78 | def num_events(self):
79 | return len(self.events)
80 |
81 | @property
82 | def trace(self):
83 | return [str(event.name) for event in self.events]
84 |
85 | @property
86 | def attribute_names(self):
87 | return sorted(set().union(*(event.attributes.keys() for event in self.events)))
88 |
89 | @property
90 | def json(self):
91 | """Return the case object as a json compatible python dictionary."""
92 | return dict(id=self.id, events=[event.json for event in self.events], attributes=self.attributes)
93 |
94 | @staticmethod
95 | def clone(trace):
96 | events = [Event.clone(event) for event in trace.events]
97 | return Case(id=trace.id, events=events, **dict(trace.attributes))
98 |
99 | def dataframe(self):
100 | return pd.DataFrame(
101 | [[self.id, e.name, *[e.attributes[key] for key in self.attribute_names]] for e in self.events],
102 | columns=['id', 'name', *self.attribute_names]
103 | )
104 |
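A small sketch of how Case and Event compose (the attribute names and values here are illustrative):

    from deepalign.processmining import Case, Event

    case = Case(id=1, events=[Event(name='A', user='alice'),
                              Event(name='B', user='bob')])

    print(case.trace)            # ['A', 'B']
    print(case.attribute_names)  # ['user']
    print(case.dataframe())      # one row per event: id, name, user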
--------------------------------------------------------------------------------
/deepalign/processmining/event.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 |
18 | class Event(object):
19 | def __init__(self, name, timestamp=None, timestamp_end=None, **kwargs):
20 | self.name = name
21 | self.timestamp = timestamp
22 | self.timestamp_end = timestamp_end
23 | self.attributes = dict(kwargs)
24 |
25 | def __str__(self):
26 | _s = f'Event: name = {self.name}, timestamp = {self.timestamp}'
27 |
28 | attributes = [f'{key} = {value}' for key, value in self.attributes.items()]
29 | if len(attributes) > 0:
30 | _s += f', {", ".join(attributes)}'
31 |
32 | return _s
33 |
34 | def __eq__(self, other):
35 | if not isinstance(other, Event):
36 | return False
37 | return self.name == other.name and self.attributes == other.attributes
38 |
39 | @property
40 | def json(self):
41 | """Return the event object as a json compatible python dictionary."""
42 | return dict(name=self.name, timestamp=self.timestamp, timestamp_end=self.timestamp_end,
43 | attributes=self.attributes)
44 |
45 | @staticmethod
46 | def clone(event):
47 | return Event(name=event.name, timestamp=event.timestamp, timestamp_end=event.timestamp_end,
48 | **dict(event.attributes))
49 |
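Note that Event.__eq__ compares only the name and the attribute dictionary; timestamps are not compared. A quick sketch:

    from deepalign.processmining import Event

    a = Event(name='A', timestamp='2020-01-01')
    b = Event.clone(a)
    b.timestamp = None

    # Clones are independent copies; equality ignores the timestamps.
    assert a == b and a is not b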
--------------------------------------------------------------------------------
/deepalign/processmining/process_map.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import pickle
18 | import networkx as nx
19 | import numpy as np
20 | import os
21 | import untangle
22 | from matplotlib import pyplot as plt
23 |
24 | from deepalign.fs import PLOT_DIR
25 | from deepalign.fs import PROCESS_MODEL_DIR
26 | from deepalign.processmining import Case
27 | from deepalign.processmining import Event
28 | from deepalign.processmining.log import EventLog
29 | from deepalign.utils import microsoft_colors
30 |
31 |
32 | class ProcessMap(object):
33 | def __init__(self, graph=None):
34 | self.graph = graph
35 | self.start_event = EventLog.start_symbol
36 | self.end_event = EventLog.end_symbol
37 |
38 | self._variants = None
39 | self._variant_probabilities = None
40 |
41 | def load(self, file):
42 | """
43 | Load from a pickle file
44 |
45 |         :param file: path to the pickle file
46 | :return:
47 | """
48 | with open(file, 'rb') as f:
49 | self.graph = pickle.load(f)
50 |
51 | def save(self, file):
52 | """
53 | Save to a pickle file
54 |
55 |         :param file: path to the pickle file
56 | :return:
57 | """
58 | with open(file, 'wb') as f:
59 | pickle.dump(self.graph, f)
60 |
61 | def _check_edge(self, edge):
62 | """
63 |         Returns whether the edge is contained in the process map.
64 |         True = edge exists in the model (normal)
65 |         False = edge is not in the model (anomalous)
66 |
67 | :param edge: edge
68 | :return: boolean
69 | """
70 | return edge in self.graph.edges()
71 |
72 | def _check_edges(self, edges):
73 | """
74 |         Returns for a list of given edges whether each edge is contained in the process map. Cf. _check_edge()
75 |
76 | :param edges: list of edges
77 | :return: list of booleans
78 | """
79 | return np.array([self._check_edge(e) for e in edges])
80 |
81 | def _check_trace(self, trace):
82 | """
83 |         Returns a list of booleans representing whether each transition within the trace exists in the model.
84 |         True = transition exists in the model (normal)
85 |         False = transition is not in the model (anomalous)
86 |
87 | :param trace: Trace object
88 | :return: list of booleans
89 | """
90 |
91 | # zip(...) generates the edges from the traces
92 | return self._check_edges(zip(trace[:-1], trace[1:]))
93 |
94 | def check_traces(self, traces):
95 | """
96 |         Returns a list of booleans for each trace. See _check_trace().
97 |
98 | :param traces: list of traces
99 | :return: list of list of booleans
100 | """
101 | return np.array([self._check_trace(s) for s in traces])
102 |
103 | def _get_variants(self):
104 | # variants
105 | variants = sorted(nx.all_simple_paths(self.graph, source=self.start_event, target=self.end_event))
106 | traces = [Case(id=i + 1, events=[Event(name=e) for e in v[1:-1]]) for i, v in enumerate(variants)]
107 |
108 | # probabilities
109 | def get_num_successors(x):
110 | return len([edge[1] for edge in self.graph.edges() if edge[0] == x])
111 |
112 |         probabilities = [np.prod([1 / max(1, get_num_successors(node)) for node in path]) for path in variants]
113 |
114 | # set globally
115 | self._variants = EventLog(cases=traces)
116 | self._variant_probabilities = probabilities
117 |
118 | return self._variants, self._variant_probabilities
119 |
120 | @property
121 | def activities(self):
122 | return sorted(n for n in self.graph if n != EventLog.start_symbol and n != EventLog.end_symbol)
123 |
124 | @property
125 | def variants(self):
126 | if self._variants is None:
127 | self._get_variants()
128 | return self._variants
129 |
130 | @property
131 | def variant_probabilities(self):
132 | if self._variant_probabilities is None:
133 | self._get_variants()
134 | return self._variant_probabilities
135 |
136 | @staticmethod
137 | def from_plg(file_path):
138 | """Load a process model from a plg file (the format PLG2 uses).
139 |
140 | Gates will be ignored in the resulting process map.
141 |
142 | :param file_path: path to plg file
143 | :return: ProcessMap object
144 | """
145 |
146 | if not file_path.endswith('.plg'):
147 | file_path += '.plg'
148 | if not os.path.isabs(file_path):
149 | file_path = os.path.join(PROCESS_MODEL_DIR, file_path)
150 |
151 | with open(file_path) as f:
152 | file_content = untangle.parse(f.read())
153 |
154 | start_event = int(file_content.process.elements.startEvent['id'])
155 | end_event = int(file_content.process.elements.endEvent['id'])
156 |
157 | id_activity = dict((int(task['id']), str(task['name'])) for task in file_content.process.elements.task)
158 | id_activity[start_event] = EventLog.start_symbol
159 | id_activity[end_event] = EventLog.end_symbol
160 |
161 | activities = id_activity.keys()
162 |
163 | gateways = [int(g['id']) for g in file_content.process.elements.gateway]
164 | gateway_followers = dict((id_, []) for id_ in gateways)
165 | followers = dict((id_, []) for id_ in activities)
166 |
167 | for sf in file_content.process.elements.sequenceFlow:
168 | source = int(sf['sourceRef'])
169 | target = int(sf['targetRef'])
170 | if source in gateways:
171 | gateway_followers[source].append(target)
172 |
173 | for sf in file_content.process.elements.sequenceFlow:
174 | source = int(sf['sourceRef'])
175 | target = int(sf['targetRef'])
176 | if source in activities and target in activities:
177 | followers[source].append(target)
178 | elif source in activities and target in gateways:
179 | followers[source] = gateway_followers.get(target)
180 |
181 | graph = nx.DiGraph()
182 | graph.add_nodes_from([id_activity.get(activity) for activity in activities])
183 | for source, targets in followers.items():
184 | for target in targets:
185 | graph.add_edge(id_activity.get(source), id_activity.get(target))
186 |
187 | return ProcessMap(graph)
188 |
189 | def plot_process_map(self, name=None, figsize=None):
190 | g = self.graph
191 |
192 | # Draw
193 | pos = nx.drawing.nx_agraph.graphviz_layout(self.graph, prog='dot')
194 |
195 | # Set figure size
196 | if figsize is None:
197 | figsize = (8, 8)
198 | fig = plt.figure(3, figsize=figsize)
199 |
200 | color_map = []
201 | for node in g:
202 | if node in [EventLog.start_symbol, EventLog.end_symbol]:
203 | color_map.append(microsoft_colors[0])
204 | else:
205 | color_map.append(microsoft_colors[2])
206 |
207 | nx.draw(g, pos, node_color=color_map, with_labels=True)
208 |
209 | if name is not None:
210 | # Save to disk
211 | plt.tight_layout()
212 | fig.savefig(str(PLOT_DIR / name))
213 | plt.close()
214 | else:
215 | plt.show()
216 |
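A sketch of the ProcessMap API as used across the repository; 'paper' assumes the bundled model in .res/process_models, and the example trace is illustrative:

    from deepalign.processmining import ProcessMap

    # 'paper' resolves to .res/process_models/paper.plg via PROCESS_MODEL_DIR.
    process_map = ProcessMap.from_plg('paper')

    print(process_map.activities)                 # sorted activity names
    print(process_map.variant_probabilities[:3])  # one probability per variant

    # True marks transitions that exist in the map; False marks missing ones.
    flags = process_map.check_traces([process_map.activities[:3]])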
--------------------------------------------------------------------------------
/deepalign/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Timo Nolle
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | # ==============================================================================
16 |
17 | import numpy as np
18 | import seaborn as sns
19 |
20 | from deepalign import fs
21 | from deepalign.enums import Axis
22 | from deepalign.enums import Heuristic
23 | from deepalign.enums import Strategy
24 |
25 | microsoft_colors = sns.color_palette(['#01b8aa', '#374649', '#fd625e', '#f2c80f', '#5f6b6d',
26 | '#8ad4eb', '#fe9666', '#a66999', '#3599b8', '#dfbfbf'])
27 |
28 |
29 | def reverse(x, m=None):
30 | if m is None:
31 | m = x != 0
32 | rx = np.copy(x)
33 | for i, j in enumerate(m):
34 | rx[i, j] = rx[i, j][::-1]
35 | return rx
36 |
37 |
38 | def log_probs(y_true, y_pred, m=1):
39 | log_probs = np.log(gather(y_pred, np.atleast_3d(y_true))[:, :, 0]) * m
40 | cum_sum = np.cumsum(log_probs, axis=-1) * m
41 | return log_probs, cum_sum
42 |
43 |
44 | def top_k_acc(x, y, k=1):
45 | match = np.any(np.all(x == y[:, None, :], axis=-1)[:, :k], axis=-1)
46 | correct = np.where(match)[0]
47 | incorrect = np.where(~match)[0]
48 | return correct.shape[0] / (correct.shape[0] + incorrect.shape[0])
49 |
50 |
51 | def gather(x, query):
52 | base = lambda i, n, v: tuple([v if i == j else 1 for j in range(n)])
53 | idx = np.zeros_like(query, dtype=int)
54 | for i, dim in enumerate(x.shape[:-1]):
55 |         idx += np.arange(x.shape[i]).reshape(base(i, x.ndim, x.shape[i])) * np.prod(x.shape[i + 1:]).astype(int)
56 | idx += query
57 | return x.ravel()[idx].reshape(query.shape)
58 |
59 |
60 | def align(x, s, constant_values=0):
61 | if x.ndim != 3:
62 | x = np.atleast_3d(x)
63 | if s > 0:
64 | x = np.pad(x[:, s:], ((0, 0), (0, s), (0, 0)), 'constant', constant_values=constant_values)
65 | else:
66 | x = np.pad(x[:, :s], ((0, 0), (-s, 0), (0, 0)), 'constant', constant_values=constant_values)
67 | return x
68 |
69 |
70 | def to_targets(x):
71 | return align(x, 1)[:, :, 0] * (x != 0).astype(int)
72 |
73 |
74 | def download(url, to):
75 | from tqdm import tqdm
76 | import requests
77 |
78 | r = requests.get(url, stream=True)
79 | total_size = int(r.headers.get('content-length', 0))
80 |     block_size = 1024  # 1 KiB
81 |     t = tqdm(total=total_size, unit='iB', unit_scale=True)
82 | with open(to, 'wb') as f:
83 | for data in r.iter_content(block_size):
84 | t.update(len(data))
85 | f.write(data)
86 | t.close()
87 | if total_size != 0 and t.n != total_size:
88 | print(f"error: could not download {url}")
89 |
90 |
91 | def download_pretrained_models():
92 | import os
93 | from io import BytesIO
94 | from zipfile import ZipFile
95 | import requests
96 | from deepalign.fs import OUT_DIR
97 |
98 | url = 'https://github.com/tnolle/deepalign/releases/download/2.0.0/'
99 | models = 'pretrained-models.zip'
100 | evaluation = 'evaluation.zip'
101 |
102 | download(url + models, OUT_DIR / 'pretrained-models.zip')
103 | download(url + evaluation, OUT_DIR / 'evaluation.zip')
104 |
105 | print('Extracting pretrained-models.zip')
106 | file = ZipFile(str(OUT_DIR / 'pretrained-models.zip'))
107 | file.extractall(OUT_DIR)
108 | os.remove(OUT_DIR / 'pretrained-models.zip')
109 | print('Done')
110 |
111 | print('Extracting evaluation.zip')
112 | file = ZipFile(str(OUT_DIR / 'evaluation.zip'))
113 | file.extractall(OUT_DIR)
114 | os.remove(OUT_DIR / 'evaluation.zip')
115 | print('Done')
116 |
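A sketch of the tensor helpers above on toy data; gather indexes along the last axis, and align shifts sequences with zero padding:

    import numpy as np
    from deepalign.utils import align, gather

    x = np.arange(24).reshape(2, 3, 4)

    # gather picks, per (batch, step) position, the queried index on the last
    # axis; an all-zero query therefore recovers x[:, :, 0].
    query = np.zeros((2, 3, 1), dtype=int)
    print(gather(x, query)[:, :, 0])

    # align shifts every sequence one step to the left and zero-pads the end.
    print(align(x, 1)[0])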
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: deepalign
2 | channels:
3 | - defaults
4 | dependencies:
5 | - python=3.6
6 | - h5py==2.10.0
7 | - jupyter
8 | - lxml
9 | - matplotlib
10 | - networkx
11 | - numpy
12 | - openpyxl
13 | - pandas
14 | - pip
15 | - pygraphviz
16 | - scikit-learn
17 | - scipy
18 | - seaborn
19 | - xlrd
20 | - pip:
21 | - arrow
22 | - editdistance
23 | - ipywidgets
24 | - pm4py==1.2.9
25 | - pm4pycvxopt==0.0.8
26 | - pydot
27 | - tensorflow==2.0.0
28 | - tqdm
29 | - untangle
30 |
--------------------------------------------------------------------------------
/notebooks/2. Dataset Generation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from tqdm.notebook import tqdm\n",
10 | "\n",
11 | "from deepalign import fs\n",
12 | "from deepalign.fs import get_process_model_files\n",
13 | "from deepalign.generation.anomaly import *\n",
14 | "from deepalign.generation.utils import generate_for_process_model\n",
15 | "from deepalign.utils import download_pretrained_models"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "# The Datasets Used in the Evaluation Section"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "To start quickly, you can download all model and dataset files directly from the GitHub Release."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "name": "stderr",
39 | "output_type": "stream",
40 | "text": [
41 | "100%|██████████| 1.25G/1.25G [00:41<00:00, 30.2MiB/s] \n",
42 | "100%|██████████| 933M/933M [00:27<00:00, 33.8MiB/s] \n"
43 | ]
44 | },
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "Extracting pretrained-models.zip\n",
50 | "Done\n",
51 | "Extracting evaluation.zip\n",
52 | "Done\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "download_pretrained_models()"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "However, below code was used to generate the datasets used in the evaluation of the paper."
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "Define the anomalies."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "anomalies = [\n",
81 | " SkipSequenceAnomaly(max_sequence_size=2),\n",
82 | " ReworkAnomaly(max_distance=5, max_sequence_size=3),\n",
83 | " EarlyAnomaly(max_distance=5, max_sequence_size=2),\n",
84 | " LateAnomaly(max_distance=5, max_sequence_size=2),\n",
85 | " InsertAnomaly(max_inserts=2),\n",
86 | " AttributeAnomaly(max_events=3, max_attributes=2)\n",
87 | "]"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "Now, we can generate the datasets used in the paper. Using these parameters and `seed=1337` will produce the same datasets."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "process_models = [m for m in get_process_model_files() if 'paper' not in m] # Ignore the paper process model\n",
104 | "for process_model in tqdm(process_models, desc='Generate'):\n",
105 | " generate_for_process_model(process_model, \n",
106 | " size=10, \n",
107 | " anomalies=anomalies, \n",
108 | " num_attr=[1, 2, 3, 4], \n",
109 | " anomaly_p=np.arange(0.1, 1.0, 0.1),\n",
110 | " seed=1337, \n",
111 | " show_progress='tqdm_notebook')"
112 | ]
113 | }
114 | ],
115 | "metadata": {
116 | "kernelspec": {
117 | "display_name": "Python 3",
118 | "language": "python",
119 | "name": "python3"
120 | },
121 | "language_info": {
122 | "codemirror_mode": {
123 | "name": "ipython",
124 | "version": 3
125 | },
126 | "file_extension": ".py",
127 | "mimetype": "text/x-python",
128 | "name": "python",
129 | "nbconvert_exporter": "python",
130 | "pygments_lexer": "ipython3",
131 | "version": "3.6.9"
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 4
136 | }
137 |
--------------------------------------------------------------------------------
/notebooks/3. Training the Models.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Training the Neural Networks"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "The pretrained neural networks are available as part of the download from notebook [2. Dataset Generation](2.%20Dataset%20Generation.ipynb). If you haven't downloaded the models from the release yet, we suggest that you do now."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "In case you want to train your own models, here is the code to do so."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import arrow\n",
31 | "import tensorflow as tf\n",
32 | "from tqdm.notebook import tqdm\n",
33 | "\n",
34 | "from deepalign import Dataset\n",
35 | "from deepalign import fs\n",
36 | "from deepalign.alignments import ConfNet"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "We can get all dataset filenames using this helper method."
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "datasets = sorted([f.name for f in fs.get_event_log_files() if 'paper' in f.name])"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "Now we can train a ConfNet (name of the RNN model) model for each of the datasets using the following for loop. It will create a version of ConfNet with no attributes `(0, 0)`, only case attributes `(0, 1)`, only event attributes `(1, 0)`, and both `(1, 1)`."
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "for dataset_name in datasets:\n",
69 | " for ea, ca in [(0, 0), (0, 1), (1, 0), (1, 1)]:\n",
70 | " start_time = arrow.now()\n",
71 | " dataset = Dataset(dataset_name, use_case_attributes=ca, use_event_attributes=ea)\n",
72 | " if ca and dataset.num_case_attributes == 0:\n",
73 | " continue\n",
74 | " confnet = ConfNet(dataset, use_case_attributes=ca, use_event_attributes=ea)\n",
75 | " confnet.fit(dataset, batch_size=100, epochs=1, validation_split=0.1,\n",
76 | " callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)])\n",
77 | " confnet.save(\n",
78 | " str(fs.MODEL_DIR / f'{dataset_name}_{confnet.identifier}_{start_time.format(fs.DATE_FORMAT)}'))"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "# Creating the Baseline Models"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "from deepalign.alignments.processmining import OptimalCostAligner\n",
95 | "from deepalign.alignments.processmining import HeuristicsMinerAligner\n",
96 | "from deepalign.alignments.processmining import InductiveMinerAligner"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "datasets = sorted([f.name for f in fs.get_event_log_files() if 'paper' in f.name])"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 7,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "aligners = [OptimalCostAligner, HeuristicsMinerAligner, InductiveMinerAligner]"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 8,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "application/vnd.jupyter.widget-view+json": {
125 | "model_id": "18479ec4a6d848489482d97d28960ebc",
126 | "version_major": 2,
127 | "version_minor": 0
128 | },
129 | "text/plain": [
130 | "HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))"
131 | ]
132 | },
133 | "metadata": {},
134 | "output_type": "display_data"
135 | },
136 | {
137 | "data": {
138 | "application/vnd.jupyter.widget-view+json": {
139 | "model_id": "0ed94f0e49de410c9b29d2d524e3e4b7",
140 | "version_major": 2,
141 | "version_minor": 0
142 | },
143 | "text/plain": [
144 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
145 | ]
146 | },
147 | "metadata": {},
148 | "output_type": "display_data"
149 | },
150 | {
151 | "name": "stderr",
152 | "output_type": "stream",
153 | "text": [
154 | "100%|██████████| 5000/5000 [00:01<00:00, 4931.97it/s]\n",
155 | "100%|██████████| 5000/5000 [00:00<00:00, 5677.68it/s]\n",
156 | "100%|██████████| 5000/5000 [00:00<00:00, 6142.05it/s]\n",
157 | "100%|██████████| 5000/5000 [00:00<00:00, 6058.73it/s]"
158 | ]
159 | },
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "\n"
165 | ]
166 | },
167 | {
168 | "name": "stderr",
169 | "output_type": "stream",
170 | "text": [
171 | "\n"
172 | ]
173 | },
174 | {
175 | "data": {
176 | "application/vnd.jupyter.widget-view+json": {
177 | "model_id": "dd554791370a40c9b8abb26a5bf431f4",
178 | "version_major": 2,
179 | "version_minor": 0
180 | },
181 | "text/plain": [
182 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
183 | ]
184 | },
185 | "metadata": {},
186 | "output_type": "display_data"
187 | },
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "\n"
193 | ]
194 | },
195 | {
196 | "data": {
197 | "application/vnd.jupyter.widget-view+json": {
198 | "model_id": "9fd8df847861451d9a9220ad35380bb0",
199 | "version_major": 2,
200 | "version_minor": 0
201 | },
202 | "text/plain": [
203 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
204 | ]
205 | },
206 | "metadata": {},
207 | "output_type": "display_data"
208 | },
209 | {
210 | "name": "stdout",
211 | "output_type": "stream",
212 | "text": [
213 | "\n",
214 | "\n"
215 | ]
216 | }
217 | ],
218 | "source": [
219 | "for aligner_class in tqdm(aligners):\n",
220 | " for dataset_name in tqdm(datasets):\n",
221 | " dataset = Dataset(dataset_name)\n",
222 | " aligner = aligner_class()\n",
223 | " aligner.fit(dataset)\n",
224 | " file_name = f'{dataset_name}_{aligner.abbreviation}'\n",
225 | " aligner.save(file_name)"
226 | ]
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "Python 3",
232 | "language": "python",
233 | "name": "python3"
234 | },
235 | "language_info": {
236 | "codemirror_mode": {
237 | "name": "ipython",
238 | "version": 3
239 | },
240 | "file_extension": ".py",
241 | "mimetype": "text/x-python",
242 | "name": "python",
243 | "nbconvert_exporter": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.6.10"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 2
250 | }
251 |
--------------------------------------------------------------------------------
/notebooks/5. Caching the Alignments.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import h5py\n",
10 | "from tqdm.notebook import tqdm\n",
11 | "\n",
12 | "from deepalign import Dataset\n",
13 | "from deepalign import fs\n",
14 | "from deepalign.alignments import ALIGNERS\n",
15 | "from deepalign.alignments.confnet import ConfNet"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "To speed up the evaluation, we are caching all results. You will have received these cache files with the download of the GitHub release. In case you want to run your own experiments, this is the code."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "def get_aligner(model_file, dataset):\n",
32 | " if 'confnet' in model_file.ad:\n",
33 | " aligner = ALIGNERS[model_file.ad[:-2]](dataset,\n",
34 | " use_case_attributes=model_file.use_case_attributes,\n",
35 | " use_event_attributes=model_file.use_event_attributes)\n",
36 | " else:\n",
37 | " aligner = ALIGNERS[model_file.ad]()\n",
38 | " aligner.load(str(fs.MODEL_DIR / model_file.name))\n",
39 | " return aligner"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {
46 | "scrolled": false
47 | },
48 | "outputs": [
49 | {
50 | "data": {
51 | "application/vnd.jupyter.widget-view+json": {
52 | "model_id": "625e950bc3ee40128874579f92a6e5f5",
53 | "version_major": 2,
54 | "version_minor": 0
55 | },
56 | "text/plain": [
57 | "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))"
58 | ]
59 | },
60 | "metadata": {},
61 | "output_type": "display_data"
62 | },
63 | {
64 | "name": "stderr",
65 | "output_type": "stream",
66 | "text": [
67 | "100%|██████████| 651/651 [00:17<00:00, 37.05it/s]\n",
68 | "paper-0.3-4: 100%|██████████| 651/651 [00:04<00:00, 156.82it/s]\n"
69 | ]
70 | },
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "synthetic = ['paper', 'p2p', 'small', 'medium', 'large', 'huge', 'gigantic', 'wide']\n",
81 | "\n",
82 | "models = sorted(list(set([f.name.replace('_forward', '').replace('_backward', '')\n",
83 | " for f in fs.get_aligner_files()])))\n",
84 | "\n",
85 | "models = [m for m in models if not (fs.RESULT_DIR / (fs.ModelFile(m).name + '.h5')).exists()]\n",
86 | "\n",
87 | "for model in tqdm(models):\n",
88 | " model_file = fs.AlignerFile(model)\n",
89 | " dataset = Dataset(model_file.event_log_name,\n",
90 | " use_case_attributes=model_file.use_case_attributes,\n",
91 | " use_event_attributes=model_file.use_event_attributes)\n",
92 | " aligner = get_aligner(model_file, dataset)\n",
93 | "\n",
94 | " if isinstance(aligner, ConfNet):\n",
95 | " alignments, beams, costs = aligner.batch_align(dataset, batch_size=5000)\n",
96 | " else:\n",
97 | " try:\n",
98 | " alignments, beams, costs = aligner.align(dataset)\n",
99 | " except Exception as e:\n",
100 | " print(e)\n",
101 | " continue\n",
102 | "\n",
103 | " with h5py.File(str(fs.RESULT_DIR / (model_file.name + '.h5')), 'w') as file:\n",
104 | " file.create_dataset('alignments', data=alignments, compression=\"gzip\", compression_opts=9)\n",
105 | " file.create_dataset('beams', data=beams, compression=\"gzip\", compression_opts=9)\n",
106 | " file.create_dataset('costs', data=costs, compression=\"gzip\", compression_opts=9)"
107 | ]
108 | }
109 | ],
110 | "metadata": {
111 | "kernelspec": {
112 | "display_name": "Python 3",
113 | "language": "python",
114 | "name": "python3"
115 | },
116 | "language_info": {
117 | "codemirror_mode": {
118 | "name": "ipython",
119 | "version": 3
120 | },
121 | "file_extension": ".py",
122 | "mimetype": "text/x-python",
123 | "name": "python",
124 | "nbconvert_exporter": "python",
125 | "pygments_lexer": "ipython3",
126 | "version": "3.6.10"
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 2
131 | }
132 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | arrow
2 | editdistance
3 | h5py==2.10.0
4 | ipywidgets
5 | jupyter
6 | lxml
7 | matplotlib
8 | networkx
9 | numpy
10 | openpyxl
11 | pandas
12 | pip
13 | pm4py==1.2.9
14 | pm4pycvxopt==0.0.8
15 | pydot
16 | pygraphviz
17 | scikit-learn
18 | scipy
19 | seaborn
20 | tensorflow==2.0.0
21 | tqdm
22 | untangle
23 | xlrd
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | setup(name='deepalign',
5 | version='1.0.0',
6 | description='DeepAlign: Alignment-based Process Anomaly Correction',
7 | long_description='',
8 | author='Timo Nolle',
9 | author_email='timonolle@gmail.com',
10 | url='https://github.com/tnolle/deepalign',
11 | license='GPL-3.0',
12 | requires=[],
13 | packages=find_packages())
14 |
--------------------------------------------------------------------------------