├── .github
│   └── workflows
│       └── repo2docker.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── binder
│   └── Dockerfile
├── data
│   ├── L_heur_1.csv
│   ├── Lfull.xes
│   ├── README.md
│   ├── helpdesk.csv
│   ├── patients.csv
│   ├── sepsis.bpmn
│   └── sepsis.csv
├── install.R
├── python
│   ├── README.md
│   ├── lecture1-eventlogs.ipynb
│   ├── lecture2-discovery.ipynb
│   ├── lecture3-conformance.ipynb
│   └── lecture4-prediction.ipynb
├── r
│   ├── README.md
│   ├── lecture1-eventlogs.ipynb
│   └── lecture2-discovery.ipynb
└── requirements.txt
/.github/workflows/repo2docker.yml:
--------------------------------------------------------------------------------
1 | name: Binder
2 | on: [push]
3 | 
4 | jobs:
5 |   Build:
6 |     runs-on: ubuntu-latest
7 |     steps:
8 | 
9 |       - name: checkout files in repo
10 |         uses: actions/checkout@master
11 | 
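12 |       # build the repository image with repo2docker and push it to the GitHub Container Registry (ghcr.io)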
13 |       - name: update jupyter dependencies with repo2docker
14 |         uses: jupyterhub/repo2docker-action@1835b83b93ae93043d30c85210766d957df68f3f
15 |         with:
16 |           DOCKER_USERNAME: ${{ github.actor }}
17 |           DOCKER_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
18 |           BINDER_CACHE: true
19 |           DOCKER_REGISTRY: "ghcr.io"
20 |           IMAGE_NAME: "fmannhardt/course-applied-processmining"
21 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | .vscode
3 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM rocker/binder:4.2.1
2 |
3 | ARG NB_USER
4 | ARG NB_UID
5 |
6 | COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
7 |
8 | ## Run an install.R script, if it exists.
9 | RUN if [ -f install.R ]; then R --quiet -f install.R; fi
10 |
11 | USER root
12 | RUN python3 -m pip install --no-cache-dir -r requirements.txt \
13 |     && rm -rf /tmp/* /var/tmp/* \
14 |     && find /usr/local/lib -follow -type f -name '*.pyc' -delete \
15 |     && find /usr/local/lib -follow -type f -name '*.js.map' -delete
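16 | ## GraphViz is required by PM4Py to render process maps and models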
17 | RUN apt-get update && apt-get -y install graphviz && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
18 | USER ${NB_USER}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Felix Mannhardt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Applied Process Mining
2 |
3 | The notebooks in this repository are part of a course on Applied Process Mining given by Dr. Felix Mannhardt ([@fmannhardt](https://twitter.com/fmannhardt)) of the [Process Analytics group](https://pa.win.tue.nl/) at Eindhoven University of Technology. In total, there are currently *4* lectures with associated hands-on notebooks in this repository. The collection of notebooks is a *living document* and subject to change. Each lecture is accompanied by a notebook in both R and Python, using the Process Mining frameworks bupaR and PM4Py, respectively.
4 |
5 |
6 | ## Table of Contents
7 |
8 | ### Block 1 - 'Event Logs and Process Visualization'
9 |
10 | * Lecture Notebooks
11 | * [R](r/lecture1-eventlogs.ipynb) [](https://mybinder.org/v2/gh/fmannhardt/course-applied-processmining/HEAD?urlpath=lab%2Ftree%2Fr%2Flecture1-eventlogs.ipynb)
12 | * [Python](python/lecture1-eventlogs.ipynb) [](https://mybinder.org/v2/gh/fmannhardt/course-applied-processmining/HEAD?urlpath=lab%2Ftree%2Fpython%2Flecture1-eventlogs.ipynb)
13 |
14 | ### Block 2 - 'Process Discovery'
15 |
16 | * Lecture Notebooks
17 | * [R](r/lecture2-discovery.ipynb) [](https://mybinder.org/v2/gh/fmannhardt/course-applied-processmining/HEAD?urlpath=lab%2Ftree%2Fr%2Flecture2-discovery.ipynb)
18 | * [Python](python/lecture2-discovery.ipynb) [](https://mybinder.org/v2/gh/fmannhardt/course-applied-processmining/HEAD?urlpath=lab%2Ftree%2Fpython%2Flecture2-discovery.ipynb)
19 |
20 | ### Block 3 - 'Conformance Checking'
21 |
22 | * Lecture Notebooks
23 |     * 🚧 (there is currently no conformance checking functionality in R)
24 | * [Python](python/lecture3-conformance.ipynb) [](https://mybinder.org/v2/gh/fmannhardt/course-applied-processmining/HEAD?urlpath=lab%2Ftree%2Fpython%2Flecture3-conformance.ipynb)
25 |
26 | ### Block 4 - 'Predictive Process Mining'
27 |
28 | * Lecture Notebooks
29 | * 🚧 (R version is under construction)
30 | * [Python](python/lecture4-prediction.ipynb) [](https://mybinder.org/v2/gh/fmannhardt/course-applied-processmining/HEAD?urlpath=lab%2Ftree%2Fpython%2Flecture4-prediction.ipynb)
31 |
32 | ## Installation \& Usage
33 |
34 | ### Using MyBinder
35 |
36 | Simply click on the `launch binder` links for either the R or the Python notebook. You may also use the Google Colab service by clicking on the Google Colab links; however, you may need to prepare the Google Colab environment so that the respective packages are installed.
37 |
38 | ### Run locally
39 |
40 | #### Docker
41 |
42 | Build a Docker image with the provided Dockerfile:
43 |
44 | ```
45 | docker build -t fmannhardt/course-applied-processmining .
46 | ```
47 |
48 | Then start the Docker container, which runs Jupyter on [localhost:8888](http://localhost:8888?token=processmining):
49 |
50 | ```
51 | docker run --rm -ti -e JUPYTER_TOKEN=processmining -p 8888:8888 fmannhardt/course-applied-processmining
52 | ```
53 |
54 | or use the Jupyter Lab interface:
55 |
56 | ```
57 | docker run --rm -ti -e JUPYTER_TOKEN=processmining -p 8888:8888 fmannhardt/course-applied-processmining sh -c "jupyter lab --ip 0.0.0.0 --no-browser"
58 | ```
59 |
60 | #### Jupyter
61 |
62 | You should be able to run the Jupyter notebooks directly in a Jupyter environment. Please make sure the following requirements are installed:
63 |
64 | **Python**
65 |
66 | ```
67 | pip install -r requirements.txt
68 | ```
69 |
70 | Make sure to install GraphViz for the visualizations. On Windows with Chocolatey this should work:
71 | ```
72 | choco install graphviz
73 | ```
74 | Consult the [PM4Py documentation](https://pm4py.fit.fraunhofer.de/install) for further details.
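75 | 
76 | To verify that PM4Py and GraphViz work together, you can load one of the event logs bundled in this repository and render its directly-follows graph, mirroring the calls used in the lecture notebooks. A minimal sketch, assuming it is run from the repository root:
77 | 
78 | ```
79 | import pandas as pd
80 | import pm4py
81 | 
82 | # load the sepsis event log shipped in the data directory
83 | sepsis = pd.read_csv("data/sepsis.csv", sep=";")
84 | sepsis_log = pm4py.format_dataframe(sepsis, case_id='case_id', activity_key='activity', timestamp_key='timestamp')
85 | 
86 | # if GraphViz is installed correctly, this renders a directly-follows graph
87 | dfg, sa, ea = pm4py.discover_directly_follows_graph(sepsis_log)
88 | pm4py.view_dfg(dfg, sa, ea)
89 | ```
90 | 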
91 | 
92 | **R**
93 | 
94 | Install the Jupyter kernel for R:
95 | ```
96 | install.packages(c("IRkernel"))
97 | ```
98 | 
99 | and install the necessary packages:
100 | ```
101 | R --quiet -f install.R
102 | ```
103 | 
104 | Depending on your system configuration, it can be tricky to make the `IRkernel` known to Jupyter. Please follow the instructions on their [GitHub page](https://github.com/IRkernel/IRkernel).
105 | As a hint, you may need to open the R console from an Anaconda console and run `IRkernel::installspec()` in case you are using a conda environment.
106 | 
--------------------------------------------------------------------------------
/binder/Dockerfile:
--------------------------------------------------------------------------------
1 | ### DO NOT EDIT THIS FILE! This Is Automatically Generated And Will Be Overwritten ###
2 | FROM ghcr.io/fmannhardt/course-applied-processmining:db7255c186c9
--------------------------------------------------------------------------------
/data/L_heur_1.csv:
--------------------------------------------------------------------------------
1 | CASE_concept_name;activity_id;lifecycle_id;resource_id;timestamp;activity_instance_id;.order
2 | Case2.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;1;1
3 | Case2.0;b;complete;UNDEFINED;2010-11-04T19:58:15Z;2;2
4 | Case2.0;c;complete;UNDEFINED;2010-11-04T19:59:15Z;3;3
5 | Case2.0;e;complete;UNDEFINED;2010-11-04T20:00:15Z;4;4
6 | Case8.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;5;5
7 | Case8.0;d;complete;UNDEFINED;2010-11-04T19:58:15Z;6;6
8 | Case8.0;d;complete;UNDEFINED;2010-11-04T19:59:15Z;7;7
9 | Case8.0;d;complete;UNDEFINED;2010-11-04T20:00:15Z;8;8
10 | Case8.0;e;complete;UNDEFINED;2010-11-04T20:01:15Z;9;9
11 | Case7.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;10;10
12 | Case7.0;d;complete;UNDEFINED;2010-11-04T19:58:15Z;11;11
13 | Case7.0;d;complete;UNDEFINED;2010-11-04T19:59:15Z;12;12
14 | Case7.0;e;complete;UNDEFINED;2010-11-04T20:00:15Z;13;13
15 | Case7.1;a;complete;UNDEFINED;2010-11-04T19:57:15Z;14;14
16 | Case7.1;d;complete;UNDEFINED;2010-11-04T19:58:15Z;15;15
17 | Case7.1;d;complete;UNDEFINED;2010-11-04T19:59:15Z;16;16
18 | Case7.1;e;complete;UNDEFINED;2010-11-04T20:00:15Z;17;17
19 | Case2.7;a;complete;UNDEFINED;2010-11-04T19:57:15Z;18;18
20 | Case2.7;b;complete;UNDEFINED;2010-11-04T19:58:15Z;19;19
21 | Case2.7;c;complete;UNDEFINED;2010-11-04T19:59:15Z;20;20
22 | Case2.7;e;complete;UNDEFINED;2010-11-04T20:00:15Z;21;21
23 | Case2.8;a;complete;UNDEFINED;2010-11-04T19:57:15Z;22;22
24 | Case2.8;b;complete;UNDEFINED;2010-11-04T19:58:15Z;23;23
25 | Case2.8;c;complete;UNDEFINED;2010-11-04T19:59:15Z;24;24
26 | Case2.8;e;complete;UNDEFINED;2010-11-04T20:00:15Z;25;25
27 | Case2.5;a;complete;UNDEFINED;2010-11-04T19:57:15Z;26;26
28 | Case2.5;b;complete;UNDEFINED;2010-11-04T19:58:15Z;27;27
29 | Case2.5;c;complete;UNDEFINED;2010-11-04T19:59:15Z;28;28
30 | Case2.5;e;complete;UNDEFINED;2010-11-04T20:00:15Z;29;29
31 | Case2.6;a;complete;UNDEFINED;2010-11-04T19:57:15Z;30;30
32 | Case2.6;b;complete;UNDEFINED;2010-11-04T19:58:15Z;31;31
33 | Case2.6;c;complete;UNDEFINED;2010-11-04T19:59:15Z;32;32
34 | Case2.6;e;complete;UNDEFINED;2010-11-04T20:00:15Z;33;33
35 | Case2.3;a;complete;UNDEFINED;2010-11-04T19:57:15Z;34;34
36 | Case2.3;b;complete;UNDEFINED;2010-11-04T19:58:15Z;35;35
37 | Case2.3;c;complete;UNDEFINED;2010-11-04T19:59:15Z;36;36
38 | Case2.3;e;complete;UNDEFINED;2010-11-04T20:00:15Z;37;37
39 | Case2.4;a;complete;UNDEFINED;2010-11-04T19:57:15Z;38;38
40 | Case2.4;b;complete;UNDEFINED;2010-11-04T19:58:15Z;39;39
41 | Case2.4;c;complete;UNDEFINED;2010-11-04T19:59:15Z;40;40
42 | Case2.4;e;complete;UNDEFINED;2010-11-04T20:00:15Z;41;41
43 | Case2.1;a;complete;UNDEFINED;2010-11-04T19:57:15Z;42;42
44 | Case2.1;b;complete;UNDEFINED;2010-11-04T19:58:15Z;43;43
45 | Case2.1;c;complete;UNDEFINED;2010-11-04T19:59:15Z;44;44
46 | Case2.1;e;complete;UNDEFINED;2010-11-04T20:00:15Z;45;45
47 | Case2.2;a;complete;UNDEFINED;2010-11-04T19:57:15Z;46;46
48 | Case2.2;b;complete;UNDEFINED;2010-11-04T19:58:15Z;47;47
49 | Case2.2;c;complete;UNDEFINED;2010-11-04T19:59:15Z;48;48
50 | Case2.2;e;complete;UNDEFINED;2010-11-04T20:00:15Z;49;49
51 | Case2.9;a;complete;UNDEFINED;2010-11-04T19:57:15Z;50;50
52 | Case2.9;b;complete;UNDEFINED;2010-11-04T19:58:15Z;51;51
53 | Case2.9;c;complete;UNDEFINED;2010-11-04T19:59:15Z;52;52
54 | Case2.9;e;complete;UNDEFINED;2010-11-04T20:00:15Z;53;53
55 | Case6.9;a;complete;UNDEFINED;2010-11-04T19:57:15Z;54;54
56 | Case6.9;d;complete;UNDEFINED;2010-11-04T19:58:15Z;55;55
57 | Case6.9;e;complete;UNDEFINED;2010-11-04T19:59:15Z;56;56
58 | Case6.7;a;complete;UNDEFINED;2010-11-04T19:57:15Z;57;57
59 | Case6.7;d;complete;UNDEFINED;2010-11-04T19:58:15Z;58;58
60 | Case6.7;e;complete;UNDEFINED;2010-11-04T19:59:15Z;59;59
61 | Case6.8;a;complete;UNDEFINED;2010-11-04T19:57:15Z;60;60
62 | Case6.8;d;complete;UNDEFINED;2010-11-04T19:58:15Z;61;61
63 | Case6.8;e;complete;UNDEFINED;2010-11-04T19:59:15Z;62;62
64 | Case6.5;a;complete;UNDEFINED;2010-11-04T19:57:15Z;63;63
65 | Case6.5;d;complete;UNDEFINED;2010-11-04T19:58:15Z;64;64
66 | Case6.5;e;complete;UNDEFINED;2010-11-04T19:59:15Z;65;65
67 | Case6.6;a;complete;UNDEFINED;2010-11-04T19:57:15Z;66;66
68 | Case6.6;d;complete;UNDEFINED;2010-11-04T19:58:15Z;67;67
69 | Case6.6;e;complete;UNDEFINED;2010-11-04T19:59:15Z;68;68
70 | Case3.8;a;complete;UNDEFINED;2010-11-04T19:57:15Z;69;69
71 | Case3.8;c;complete;UNDEFINED;2010-11-04T19:58:15Z;70;70
72 | Case3.8;b;complete;UNDEFINED;2010-11-04T19:59:15Z;71;71
73 | Case3.8;e;complete;UNDEFINED;2010-11-04T20:00:15Z;72;72
74 | Case3.9;a;complete;UNDEFINED;2010-11-04T19:57:15Z;73;73
75 | Case3.9;c;complete;UNDEFINED;2010-11-04T19:58:15Z;74;74
76 | Case3.9;b;complete;UNDEFINED;2010-11-04T19:59:15Z;75;75
77 | Case3.9;e;complete;UNDEFINED;2010-11-04T20:00:15Z;76;76
78 | Case5.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;77;77
79 | Case5.0;c;complete;UNDEFINED;2010-11-04T19:58:15Z;78;78
80 | Case5.0;e;complete;UNDEFINED;2010-11-04T19:59:15Z;79;79
81 | Case3.1;a;complete;UNDEFINED;2010-11-04T19:57:15Z;80;80
82 | Case3.1;c;complete;UNDEFINED;2010-11-04T19:58:15Z;81;81
83 | Case3.1;b;complete;UNDEFINED;2010-11-04T19:59:15Z;82;82
84 | Case3.1;e;complete;UNDEFINED;2010-11-04T20:00:15Z;83;83
85 | Case3.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;84;84
86 | Case3.0;c;complete;UNDEFINED;2010-11-04T19:58:15Z;85;85
87 | Case3.0;b;complete;UNDEFINED;2010-11-04T19:59:15Z;86;86
88 | Case3.0;e;complete;UNDEFINED;2010-11-04T20:00:15Z;87;87
89 | Case3.3;a;complete;UNDEFINED;2010-11-04T19:57:15Z;88;88
90 | Case3.3;c;complete;UNDEFINED;2010-11-04T19:58:15Z;89;89
91 | Case3.3;b;complete;UNDEFINED;2010-11-04T19:59:15Z;90;90
92 | Case3.3;e;complete;UNDEFINED;2010-11-04T20:00:15Z;91;91
93 | Case3.2;a;complete;UNDEFINED;2010-11-04T19:57:15Z;92;92
94 | Case3.2;c;complete;UNDEFINED;2010-11-04T19:58:15Z;93;93
95 | Case3.2;b;complete;UNDEFINED;2010-11-04T19:59:15Z;94;94
96 | Case3.2;e;complete;UNDEFINED;2010-11-04T20:00:15Z;95;95
97 | Case1.3;a;complete;UNDEFINED;2010-11-04T19:57:15Z;96;96
98 | Case1.3;e;complete;UNDEFINED;2010-11-04T19:58:15Z;97;97
99 | Case3.5;a;complete;UNDEFINED;2010-11-04T19:57:15Z;98;98
100 | Case3.5;c;complete;UNDEFINED;2010-11-04T19:58:15Z;99;99
101 | Case3.5;b;complete;UNDEFINED;2010-11-04T19:59:15Z;100;100
102 | Case3.5;e;complete;UNDEFINED;2010-11-04T20:00:15Z;101;101
103 | Case4.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;102;102
104 | Case4.0;b;complete;UNDEFINED;2010-11-04T19:58:15Z;103;103
105 | Case4.0;e;complete;UNDEFINED;2010-11-04T19:59:15Z;104;104
106 | Case1.2;a;complete;UNDEFINED;2010-11-04T19:57:15Z;105;105
107 | Case1.2;e;complete;UNDEFINED;2010-11-04T19:58:15Z;106;106
108 | Case3.4;a;complete;UNDEFINED;2010-11-04T19:57:15Z;107;107
109 | Case3.4;c;complete;UNDEFINED;2010-11-04T19:58:15Z;108;108
110 | Case3.4;b;complete;UNDEFINED;2010-11-04T19:59:15Z;109;109
111 | Case3.4;e;complete;UNDEFINED;2010-11-04T20:00:15Z;110;110
112 | Case3.7;a;complete;UNDEFINED;2010-11-04T19:57:15Z;111;111
113 | Case3.7;c;complete;UNDEFINED;2010-11-04T19:58:15Z;112;112
114 | Case3.7;b;complete;UNDEFINED;2010-11-04T19:59:15Z;113;113
115 | Case3.7;e;complete;UNDEFINED;2010-11-04T20:00:15Z;114;114
116 | Case1.4;a;complete;UNDEFINED;2010-11-04T19:57:15Z;115;115
117 | Case1.4;e;complete;UNDEFINED;2010-11-04T19:58:15Z;116;116
118 | Case3.6;a;complete;UNDEFINED;2010-11-04T19:57:15Z;117;117
119 | Case3.6;c;complete;UNDEFINED;2010-11-04T19:58:15Z;118;118
120 | Case3.6;b;complete;UNDEFINED;2010-11-04T19:59:15Z;119;119
121 | Case3.6;e;complete;UNDEFINED;2010-11-04T20:00:15Z;120;120
122 | Case6.4;a;complete;UNDEFINED;2010-11-04T19:57:15Z;121;121
123 | Case6.4;d;complete;UNDEFINED;2010-11-04T19:58:15Z;122;122
124 | Case6.4;e;complete;UNDEFINED;2010-11-04T19:59:15Z;123;123
125 | Case6.3;a;complete;UNDEFINED;2010-11-04T19:57:15Z;124;124
126 | Case6.3;d;complete;UNDEFINED;2010-11-04T19:58:15Z;125;125
127 | Case6.3;e;complete;UNDEFINED;2010-11-04T19:59:15Z;126;126
128 | Case6.2;a;complete;UNDEFINED;2010-11-04T19:57:15Z;127;127
129 | Case6.2;d;complete;UNDEFINED;2010-11-04T19:58:15Z;128;128
130 | Case6.2;e;complete;UNDEFINED;2010-11-04T19:59:15Z;129;129
131 | Case1.1;a;complete;UNDEFINED;2010-11-04T19:57:15Z;130;130
132 | Case1.1;e;complete;UNDEFINED;2010-11-04T19:58:15Z;131;131
133 | Case6.1;a;complete;UNDEFINED;2010-11-04T19:57:15Z;132;132
134 | Case6.1;d;complete;UNDEFINED;2010-11-04T19:58:15Z;133;133
135 | Case6.1;e;complete;UNDEFINED;2010-11-04T19:59:15Z;134;134
136 | Case1.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;135;135
137 | Case1.0;e;complete;UNDEFINED;2010-11-04T19:58:15Z;136;136
138 | Case6.0;a;complete;UNDEFINED;2010-11-04T19:57:15Z;137;137
139 | Case6.0;d;complete;UNDEFINED;2010-11-04T19:58:15Z;138;138
140 | Case6.0;e;complete;UNDEFINED;2010-11-04T19:59:15Z;139;139
141 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | # Applied Process Mining Module
2 |
3 | Selected datasets that are used in the notebooks.
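4 | 
5 | The CSV event logs are `;`-separated. A minimal sketch for loading one of them with pandas, assuming the repository root as working directory:
6 | 
7 | ```
8 | import pandas as pd
9 | 
10 | # read the synthetic patients event log and parse its timestamps
11 | patients = pd.read_csv("data/patients.csv", sep=";")
12 | patients["time"] = pd.to_datetime(patients["time"])
13 | ```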
--------------------------------------------------------------------------------
/data/sepsis.bpmn:
--------------------------------------------------------------------------------
1 | [sepsis.bpmn: BPMN 2.0 XML model of the Sepsis process, loaded in the notebooks via pm4py.read_bpmn; the XML markup was not preserved in this dump, only sequence-flow identifiers (e.g., Flow_00xkme2) remain]
--------------------------------------------------------------------------------
/install.R:
--------------------------------------------------------------------------------
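1 | # Install the R packages used by the R lecture notebooks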
2 | install.packages(c("bupaR", "processanimateR", "xesreadR", "heuristicsmineR", "petrinetR", "R.utils"))
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | # Applied Process Mining Module
2 |
3 | **Course under construction** 🚧
4 |
5 | Python notebooks; see the course overview on the [landing page](../).
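6 | 
7 | To run these notebooks locally, install the Python dependencies from the repository root and start Jupyter; a minimal sketch:
8 | 
9 | ```
10 | pip install -r requirements.txt
11 | jupyter lab
12 | ```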
--------------------------------------------------------------------------------
/python/lecture1-eventlogs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook is part of a course on Applied Process Mining. The collection of notebooks is a *living document* and subject to change. "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Lecture 1 - 'Event Logs and Process Discovery' (Python / PM4Py)"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Setup\n",
22 | "\n",
23 | "
\n",
24 | "\n",
25 | "In this notebook, we are using several libraries:\n",
26 | "\n",
27 | "* [PM4Py](https://pm4py.fit.fraunhofer.de/)\n",
28 | "* [pandas](https://pandas.pydata.org/)\n",
29 | "* [plotnine](https://plotnine.readthedocs.io/en/stable/)\n",
30 | "\n",
31 | "Often used dependencies are imported:"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import pandas as pd\n",
41 | "import pm4py\n",
42 | "import plotnine\n",
43 | "from plotnine import ggplot, geom_point, aes, theme_bw, coord_flip, scale_y_discrete, theme, element_text, geom_bin2d, ylab, scale_x_datetime"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Event Logs\n",
51 | "\n",
52 | "This part introduces event logs and their unique properties that provide the basis for any Process Mining method. We use the same event logs as provided by `bupaR`. However, we need to load them from the CSV files in the `data` directory of this repository. In this lecture we are going to make use of the following datasets:\n",
53 | "\n",
54 | "* Patients, a synthetically generated example event log in a hospital setting.\n",
55 | "* Sepsis, a real-life event log taken from a Dutch hospital. The event log is publicly available here: https://doi.org/10.4121/uuid:915d2bfb-7e84-49ad-a286-dc35f063a460 and has been used in many Process Mining related publications."
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "### Import Patients Data"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "patients = pd.read_csv(\"../data/patients.csv\", sep=';')\n",
72 | "patients['time'] = pd.to_datetime(patients['time'])\n",
73 | "num_rows = len(patients)\n",
74 | "print(\"Number of rows: {}\".format(num_rows))"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Import Sepsis Data"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "sepsis = pd.read_csv(\"../data/sepsis.csv\", sep=';')\n",
91 | "num_rows = len(sepsis)\n",
92 | "sepsis['timestamp'] = pd.to_datetime(sepsis['timestamp'])\n",
93 | "print(\"Number of rows: {}\".format(num_rows))"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### Exploring Event Data\n",
101 | "\n",
102 | "Let us first explore the event data without any prior knowledge about event log structure or properties. We use standard Pandas and plotnine features to do so. Regarding the choice for plotnine, any other plotting library such as Matplotlib could also be used and it is simply used to deviate as little as possible from the exploration performed with ggplot2 in the R version of this lecture. "
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "patients.head()"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "The most important ingredient of an event log is the timestamps column `time`. This allows us to establish a sequence of events."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "patients_sample = patients[patients['time'] < '2017-01-31']\n",
128 | "(ggplot(patients_sample, aes('time', 0))\n",
129 | " + geom_point() \n",
130 | " + theme_bw()\n",
131 | " + ylab(\"Event\")\n",
132 | " + scale_x_datetime(date_breaks = \"1 days\"))"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "We also need to have information on the kind of actions or `activities` performed:"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "patients.drop_duplicates(subset='handling')[[\"handling\"]]"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "Let us have a look at what other data is available:"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "patients.drop_duplicates(subset='patient')[[\"patient\"]].head()"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "Maybe the patient identifier could be a good candidate for defining a process `case` since this is an 'entity' that we would like to follow. When counting the events that occurred per individual patient it seems that there is a similar number of events for each patient, which is generally a good indicator for a process case identifier:"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "patients.groupby(['patient'])[\"patient\"].agg(['count']).head()"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "Let use decide that we want to look at the process by following the patient identifier as `case identifier`:"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "patients_sample = patients[patients['time'] < '2017-01-31']\n",
197 | "(ggplot(patients_sample, aes('time', 'patient', color = 'handling'))\n",
198 | " + geom_point() \n",
199 | " + theme_bw()\n",
200 | " + scale_x_datetime(date_breaks = \"7 days\"))"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "The scatterplot above is known as `Dotted Chart` in the process mining community and provides an 'at a glance' overview on the events and their temporal relation when grouped by a case. It seems that each of the sequences of events (also known as `traces`) start with the `Registration` event. Let us have a look at the event data sorted by patient identifier and by time:"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "patients.sort_values(['patient', 'time']).head(14)"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "An individual process execution (e.g., for patient 1) consists of several activities that are done in a sequence. However, we have more information available than simply the sequence of events. For each occurrence of an activity we have two events: a `start` event and a `complete` event as captured in the column `registration_type`. These event refer to the lifecycle of an activity and allow us to capture the `duration` of an activity. Much more complex lifecycles of activities are possible, a general model is described here: http://bupar.net/creating_eventlogs.html#Transactional_life_cycle"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "### Further resources\n",
231 | "\n",
232 | "* [XES Standard](http://xes-standard.org/)\n",
233 | "* [Importing CSV event logs](https://pm4py.fit.fraunhofer.de/documentation#item-import-csv)\n",
234 | "* [Importing XES event logs](https://pm4py.fit.fraunhofer.de/documentation#item-impoort-xes)"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "#### Reflection Questions\n",
242 | "\n",
243 | "* What could be the reason a column `.order` is included in this dataset?\n",
244 | "* How could the column `employee` be used?\n",
245 | "* What is the use of the column `handling_id` and in which situation is it required?"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "## Basic Process Visualization"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "### Set of Traces \n",
260 | "\n",
261 | "Exploring traces as a set visualization is currently not implemented in PM4Py. \n",
262 | "**Challenge** implement a visualization similar to that in bupaR with Python and open a pull request. Here are some reference implementations of a 'trace explorer':\n",
263 | "\n",
264 | "* http://bupar.net/trace_explorer.html\n",
265 | "* https://fmannhardt.de/blog/software/prom/explorer (ProM)\n"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "# implement a view of the event log as a set of traces "
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "### Dotted Chart\n",
282 | "\n",
283 | "The `Dotted Chart` adds the timing aspect of the individual traces and visualized all of them at-a-glance. It can be configured in many different ways and provides a good insight into time-related aspects of the process behavior. PM4Py provides a basic Dotted Chart visualization:"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "patients_log = pm4py.format_dataframe(patients, case_id='patient', activity_key='handling', timestamp_key='time')\n",
293 | "pm4py.view_dotted_chart(pm4py.filter_time_range(patients_log, \"1970-01-01 00:00:00\", \"2017-01-31 00:00:00\", mode='events'))"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "Alternatively, to allow for more customization of the visuals, the same view can be simply reproduced using plotnine and allows for some more flexibility in choosing the perspectives:"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "# Necessary pre-processing on the data frame\n",
310 | "patients_sorted = patients.sort_values(['time'])\n",
311 | "# Creating categories for the case identifier\n",
312 | "patients_sorted['patient'] = pd.Categorical(patients_sorted['patient'], \n",
313 | " categories = patients_sorted['patient'].drop_duplicates().tolist()[::-1], ordered= True)"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "#### Absolute Time Dimension"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "(ggplot(patients_sorted[patients_sorted['time'] < '2017-01-31'], \n",
330 | " aes('time', 'patient', color = 'handling'))\n",
331 | " + geom_point()\n",
332 | " + theme_bw()\n",
333 | " + scale_y_discrete(labels = \"\")\n",
334 | " + theme(axis_text_x=element_text(rotation=45, hjust=1)))"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "#### Relative Time Dimension"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "We meed to make the time relative and add a new column `time_relative` for that purpose:"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "patients_sorted['time_relative'] = patients_sorted['time'].sub( patients_sorted.groupby('patient')['time'].transform('first'))"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "(ggplot(patients_sorted, aes('time_relative', 'patient', color = 'handling'))\n",
367 | " + geom_point()\n",
368 | " + theme_bw()\n",
369 | " + scale_y_discrete(labels = \"\")\n",
370 | " + theme(axis_text_x=element_text(rotation=45, hjust=1)))"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "We still need to sort by the overall duration to replicate the `bupaR`"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "patients_sorted['duration'] = patients_sorted.groupby('patient')['time_relative'].transform('max')"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "patients_sorted_duration = patients_sorted.sort_values(['duration'])\n",
396 | "patients_sorted_duration['patient'] = pd.Categorical(patients_sorted_duration['patient'], categories = patients_sorted_duration['patient'].drop_duplicates().tolist()[::-1], ordered= True)\n",
397 | "\n",
398 | "(ggplot(patients_sorted_duration, aes('time_relative', 'patient', color = 'handling'))\n",
399 | " + geom_point()\n",
400 | " + theme_bw()\n",
401 | " + scale_y_discrete(labels = \"\")\n",
402 | " + theme(axis_text_x=element_text(rotation=45, hjust=1)))"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "Check out other basic process visualization options using PM4Py:\n",
410 | "\n",
411 | "* [Basic Process Statistics](https://pm4py.fit.fraunhofer.de/documentation#statistics)"
412 | ]
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {},
417 | "source": [
418 | "## Process Map Visualization"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "metadata": {},
424 | "source": [
425 | "Again, there is no built-in precedence matrix visualization in PM4Py, but it can be replicated easily:"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "patients_sorted['antecedent'] = patients_sorted.groupby([\"patient\"])['handling'].shift(1).fillna(\"Start\")\n",
435 | "patients_sorted['consequent'] = patients_sorted['handling']"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": null,
441 | "metadata": {},
442 | "outputs": [],
443 | "source": [
444 | "(ggplot(patients_sorted, aes('consequent', 'antecedent', ))\n",
445 | " + geom_bin2d() \n",
446 | " + theme_bw()\n",
447 | " + theme(axis_text_x=element_text(rotation=45, hjust=1)))"
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {},
453 | "source": [
454 | "### Directly-follows Graph / Process Map"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "The process map or directly-follows graph visualization in PM4Py cannot deal yet with `activity instances`, so we need to only focus on the `complete` events."
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {},
468 | "outputs": [],
469 | "source": [
470 | "patients_log = patients_log[patients_log['registration_type'] == 'complete']"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "dfg, sa, ea = pm4py.discover_directly_follows_graph(patients_log)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": null,
485 | "metadata": {},
486 | "outputs": [],
487 | "source": [
488 | "pm4py.view_dfg(dfg, sa, ea)"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "from pm4py.algo.discovery.dfg import algorithm as dfg_discovery\n",
498 | "from pm4py.algo.discovery.dfg import algorithm as dfg_discovery\n",
499 | "from pm4py.visualization.dfg import visualizer as dfg_visualization\n",
500 | "\n",
501 | "dfg = dfg_discovery.apply(patients_log)\n",
502 | "\n",
503 | "dfg = dfg_discovery.apply(patients_log, variant=dfg_discovery.Variants.PERFORMANCE)\n",
504 | "gviz = dfg_visualization.apply(dfg, log=patients_log, variant=dfg_visualization.Variants.PERFORMANCE)\n",
505 | "dfg_visualization.view(gviz)"
506 | ]
507 | },
508 | {
509 | "cell_type": "markdown",
510 | "metadata": {},
511 | "source": [
512 | "### Further Perspectives and Animation\n",
513 | "\n",
514 | "No such feature in PM4Py yet."
515 | ]
516 | },
517 | {
518 | "cell_type": "markdown",
519 | "metadata": {},
520 | "source": [
521 | "## Real-life Processes"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": null,
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "sepsis"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
539 | "sepsis_sorted = sepsis.sort_values(['timestamp'])\n",
540 | "sepsis_sorted['timestamp'] = pd.to_datetime(sepsis_sorted['timestamp'])"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {},
547 | "outputs": [],
548 | "source": [
549 | "sepsis_sorted['antecedent'] = sepsis_sorted.groupby([\"case_id\"])['activity'].shift(1).fillna(\"Start\")\n",
550 | "sepsis_sorted['consequent'] = sepsis_sorted['activity']"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": null,
556 | "metadata": {},
557 | "outputs": [],
558 | "source": [
559 | "(ggplot(sepsis_sorted, aes('consequent', 'antecedent', ))\n",
560 | " + geom_bin2d() \n",
561 | " + theme_bw()\n",
562 | " + theme(axis_text_x=element_text(rotation=45, hjust=1)))"
563 | ]
564 | }
565 | ],
566 | "metadata": {
567 | "kernelspec": {
568 | "display_name": "Python 3 (ipykernel)",
569 | "language": "python",
570 | "name": "python3"
571 | },
572 | "language_info": {
573 | "codemirror_mode": {
574 | "name": "ipython",
575 | "version": 3
576 | },
577 | "file_extension": ".py",
578 | "mimetype": "text/x-python",
579 | "name": "python",
580 | "nbconvert_exporter": "python",
581 | "pygments_lexer": "ipython3",
582 | "version": "3.8.13"
583 | },
584 | "vscode": {
585 | "interpreter": {
586 | "hash": "697fc57cee903094f2e79d714169e862761fd41795cb1acde2e554f7b56adc7f"
587 | }
588 | }
589 | },
590 | "nbformat": 4,
591 | "nbformat_minor": 4
592 | }
593 |
--------------------------------------------------------------------------------
/python/lecture2-discovery.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook is part of a course on Applied Process Mining. The collection of notebooks is a *living document* and subject to change. \n",
8 | "\n",
9 | "# Lecture 2 - 'Process Discovery with the Heuristics Miner' (Python / PM4Py)"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Setup\n",
17 | "\n",
18 | "
\n",
19 | "\n",
20 | "In this notebook, we are using several libraries:\n",
21 | "\n",
22 | "* [PM4Py](https://pm4py.fit.fraunhofer.de/)\n",
23 | "* [pandas](https://pandas.pydata.org/)\n",
24 | "* [plotnine](https://plotnine.readthedocs.io/en/stable/)\n",
25 | "\n",
26 | "Often used dependencies are imported:"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "import pandas as pd\n",
36 | "import pm4py\n",
37 | "import plotnine\n",
38 | "from plotnine import ggplot, geom_point, aes, theme_bw, coord_flip, scale_y_discrete, theme, element_text, geom_bin2d"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Process Discovery"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Process Models\n",
53 | "\n",
54 | "A common industry standard for Process Model is [BPMN](https://www.bpmn.org/), which can be created by web-based tools like [BPMN.io](https://bpmn.io/) and loaded and used by PM4Py."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "import pm4py\n",
64 | "import os\n",
65 | "\n",
66 | "bpmn_graph = pm4py.read_bpmn(os.path.join(\"..\", \"data\", \"sepsis.bpmn\"))"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "pm4py.vis.view_bpmn(bpmn_graph)"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "### Quality Dimensions\n",
83 | "\n",
84 | "The quality dimensions `fitness`, `precision`, `simplicity`, and `generalisation` are best illustrated by using a small example event log.\n",
85 | "We are using an example event log in XES format that is used in the book `Process Mining - Data Science in Action` by Wil van der Aalst, which is downloaded and stored in the `../data` directory with the code below:"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "from pm4py.objects.log.importer.xes import importer as xes_importer\n",
95 | "example_log = xes_importer.apply('../data/Lfull.xes')"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "Let us have a look at the event log in tabular form. The mapping of the activity labels to actual activities is:\n",
103 | "\n",
104 | "* a = register request, \n",
105 | "* b = examine thoroughly, \n",
106 | "* c = examine casually, \n",
107 | "* d = check ticket, \n",
108 | "* e = decide, \n",
109 | "* f = reinitiate request, \n",
110 | "* g = pay compensation, and \n",
111 | "* h = reject request."
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "pm4py.convert_to_dataframe(example_log)"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "Now let us discover a process map as we have seen in Lecture 1:"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "pm4py.view_dfg(*pm4py.discover_directly_follows_graph(example_log))"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "Not really very insightful the directly-follows based process map visualization."
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "### Heuristics Miner 🚧"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "L_heur_1 = pd.read_csv(\"../data/L_heur_1.csv\", sep=';')\n",
160 | "num_rows = len(L_heur_1)\n",
161 | "print(\"Number of rows: {}\".format(num_rows))\n",
162 | "L_heur_1_log = pm4py.format_dataframe(L_heur_1, case_id='CASE_concept_name', activity_key='activity_id', timestamp_key='timestamp')"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "dfg, sa, ea = pm4py.discover_directly_follows_graph(L_heur_1_log)\n",
172 | "pm4py.view_dfg(dfg, sa, ea)"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Dependency Graph 🚧"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "In PM4Py, there is no option to review the intermediate data structures used for the Heuristics Miner such as the dependency graph. This would need to be implemented on top of Numpy. Feel free to provide a pull request :-)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Causal net"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "The Causal net formalism is called `Heuristics Net` in PM4Py along with the definitions used in the original paper that proposed the Heuristics Miner."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner\n",
210 | "\n",
211 | "# due to bug here: https://github.com/pm4py/pm4py-core/issues/220\n",
212 | "# we need to convert this to a event log object\n",
213 | "\n",
214 | "L_heur_1_log = pm4py.convert_to_event_log(L_heur_1_log)\n",
215 | "L_heur_1_map = pm4py.discover_heuristics_net(L_heur_1_log, dependency_threshold = 0.8, and_threshold = 0.65, loop_two_threshold = 0.5)\n",
216 | "pm4py.view_heuristics_net(L_heur_1_map)"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "#### Convert to BPMN"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "from pm4py.objects.conversion.wf_net.variants import to_bpmn\n",
233 | "\n",
234 | "# we need to have an intermediate step to get a Petri net\n",
235 | "L_heur_1_petrinet, im, fm = pm4py.convert_to_petri_net(L_heur_1_map)\n",
236 | "L_heur_1_bpmn = to_bpmn.apply(L_heur_1_petrinet, im, fm)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "pm4py.view_bpmn(L_heur_1_bpmn)"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "from pm4py.objects.petri_net.exporter import exporter as pnml_exporter\n",
255 | "pnml_exporter.apply(L_heur_1_petrinet, im, \"petri_final.pnml\", final_marking=fm)"
256 | ]
257 | }
258 | ],
259 | "metadata": {
260 | "kernelspec": {
261 | "display_name": "Python 3 (ipykernel)",
262 | "language": "python",
263 | "name": "python3"
264 | },
265 | "language_info": {
266 | "codemirror_mode": {
267 | "name": "ipython",
268 | "version": 3
269 | },
270 | "file_extension": ".py",
271 | "mimetype": "text/x-python",
272 | "name": "python",
273 | "nbconvert_exporter": "python",
274 | "pygments_lexer": "ipython3",
275 | "version": "3.8.13"
276 | },
277 | "vscode": {
278 | "interpreter": {
279 | "hash": "28aff1567d8aae5536826c1be921f2ff2e204808293d43dc67bdcb73bd29110e"
280 | }
281 | }
282 | },
283 | "nbformat": 4,
284 | "nbformat_minor": 4
285 | }
286 |
--------------------------------------------------------------------------------
/python/lecture3-conformance.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Applied Process Mining Module\n",
8 | "\n",
9 | "This notebook is part of an Applied Process Mining module. The collection of notebooks is a *living document* and subject to change. \n",
10 | "\n",
11 | "# Lecture 3 - 'Conformance Checking' (Python / PM4Py)"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Setup\n",
19 | "\n",
20 | "
\n",
21 | "\n",
22 | "In this notebook, we are using the [PM4Py library](https://pm4py.fit.fraunhofer.de/) in combination with several standard Python data science libraries:\n",
23 | "\n",
24 | "* [pandas](https://pandas.pydata.org/)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "## Perform the commented out commands to install the dependencies\n",
34 | "# %pip install pandas\n",
35 | "# %pip install matplotlib\n",
36 | "# we use the old PM4Py API that was considerably changed in 2.3.0\n",
37 | "# %pip install pm4py~=2.2.32"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import pandas as pd\n",
47 | "import pm4py\n",
48 | "import os"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "## Conformance Checking"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "### Process Models\n",
63 | "\n",
64 | "A common industry standard for Process Model is [BPMN](https://www.bpmn.org/), which can be created by web-based tools like [BPMN.io](https://bpmn.io/) and loaded and used by PM4Py. We load a manually drawn BPMN model that described the expected process behaviour of the `Sepsis Process`. This is a process that describes several logistical and diagnostic activities performed for patients in a hospital that are suspected to have a life-threatening sepsis condition. An event log that was extracted from the information system of a hospital is publicly available and serves as ouor running example."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "sepsis_bpmn = pm4py.read_bpmn(os.path.join(\"..\", \"data\", \"sepsis.bpmn\"))"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "pm4py.vis.view_bpmn(sepsis_bpmn)"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "### Alignments\n",
90 | "\n"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "sepsis = pd.read_csv(\"../data/sepsis.csv\", sep=';')\n",
100 | "num_rows = len(sepsis)\n",
101 | "print(\"Number of rows: {}\".format(num_rows))"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "sepsis.head()"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "#sepsis['timestamp'] = pd.to_datetime(sepsis['timestamp'])\n",
120 | "sepsis_log = pm4py.format_dataframe(sepsis, case_id='case_id', activity_key='activity', timestamp_key='timestamp')\n",
121 | "sepsis_log = pm4py.convert_to_event_log(sepsis_log)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Now let us discover a process map as we have seen in Lecture 1:"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "from pm4py.algo.discovery.dfg import algorithm as dfg_discovery\n",
138 | "from pm4py.algo.discovery.dfg import algorithm as dfg_discovery\n",
139 | "from pm4py.visualization.dfg import visualizer as dfg_visualization\n",
140 | "\n",
141 | "dfg = dfg_discovery.apply(sepsis_log)\n",
142 | "\n",
143 | "dfg = dfg_discovery.apply(sepsis_log, variant=dfg_discovery.Variants.PERFORMANCE)\n",
144 | "gviz = dfg_visualization.apply(dfg, log=sepsis_log, variant=dfg_visualization.Variants.PERFORMANCE)\n",
145 | "dfg_visualization.view(gviz)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "This is clearly not usable at all. In Lecture 2, we have seen how to use Process Discovery methods to overcome this problem to a certain degree.\n",
153 | "However, when having a normative process model such as the handmade BPMN model of the Sepsis Process there is another way to get value from the event log: by means of conformance checking with alignments.\n",
154 | "\n",
155 | "In PM4Py, we can convert the BPMN model to a Petri net and then compute an optimal alignment between the traces contained in the event log and our model. This will give us, for each trace, a mapping from the events to the path through the model (or run) that is most similar to what has been recorded in the event log. (**Warning**: This is computationally expensive and may take a while)\n"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "from pm4py.objects.conversion.bpmn import converter as bpmn_converter\n",
165 | "\n",
166 | "sepsis_net, sepsis_im, sepsis_fm = bpmn_converter.apply(sepsis_bpmn)\n",
167 | "\n",
168 | "sepsis_alignment = pm4py.conformance_diagnostics_alignments(sepsis_log, sepsis_net, sepsis_im, sepsis_fm) "
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "This is how an *alignment* of an individual trace looks like:"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "sepsis_alignment[0]"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": [
191 | "This can be used to project statistics on top of the Petri net representation:"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "from pm4py.visualization.petri_net import visualizer as pn_vis\n",
201 | "\n",
202 | "pn_vis.apply(sepsis_net, sepsis_im, sepsis_fm, sepsis_log, variant = pn_vis.Variants.ALIGNMENTS)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "The statistics can be read as (number of skips or move on model, number of correct or synchronous events)"
210 | ]
211 | },
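212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "As an illustration, the following sketch counts the move types in the first alignment. It assumes the alignment structure returned by PM4Py 2.2.x, where each aligned step is a pair of labels and `'>>'` marks the skip symbol; the counter labels are ours."
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "from collections import Counter\n",
226 | "\n",
227 | "# count synchronous moves, moves on log only, and moves on model only (skips)\n",
228 | "move_counts = Counter()\n",
229 | "for log_label, model_label in sepsis_alignment[0]['alignment']:\n",
230 | "    if log_label == '>>':\n",
231 | "        move_counts['move on model (skip)'] += 1\n",
232 | "    elif model_label == '>>':\n",
233 | "        move_counts['move on log'] += 1\n",
234 | "    else:\n",
235 | "        move_counts['synchronous'] += 1\n",
236 | "move_counts"
237 | ]
238 | },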
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "An average fitness calculation of the model for the full log can be obtained from the alignment result:"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness\n",
226 | "log_fitness = replay_fitness.evaluate(sepsis_alignment, variant=replay_fitness.Variants.ALIGNMENT_BASED)\n",
227 | "print(log_fitness) "
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "PM4Py also provides a method to compute the *precision* based on the idea of escaping edges and alignments:"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "from pm4py.algo.evaluation.precision import algorithm as precision_evaluator\n",
244 | "prec = precision_evaluator.apply(sepsis_log, sepsis_net, sepsis_im, sepsis_fm, variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "prec"
254 | ]
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "Python 3 (ipykernel)",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.8.13"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 4
278 | }
279 |
--------------------------------------------------------------------------------
/python/lecture4-prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Applied Process Mining Module\n",
8 | "\n",
9 | "This notebook is part of an Applied Process Mining module. The collection of notebooks is a *living document* and subject to change. \n",
10 | "\n",
11 | "# Lecture 4 - 'Predictive Process Mining' (Python / PM4Py)"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Setup\n",
19 | "\n",
20 | "
\n",
21 | "\n",
22 | "In this notebook, we are using the [PM4Py library](https://pm4py.fit.fraunhofer.de/) in combination with several standard Python data science libraries:\n",
23 | "\n",
24 | "* [pandas](https://pandas.pydata.org/)\n",
25 | "* [PyTorch](https://pytorch.org/)"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "## Perform the commented out commands to install the dependencies\n",
35 | "# %pip install pandas\n",
36 | "# %pip install matplotlib\n",
37 | "# %pip install pm4py\n",
38 | "# %pip install torch\n",
39 | "# %pip install tqdm"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "import numpy as np\n",
49 | "import pandas as pd\n",
50 | "import pm4py\n",
51 | "import os\n",
52 | "import torch\n",
53 | "import torch.nn as nn\n",
54 | "from tqdm.autonotebook import tqdm"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "# Predictive Process Mining"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {
67 | "tags": []
68 | },
69 | "source": [
70 | "## Event Log \n",
71 | "\n",
72 | "We are using the Sepsis event log as an example."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "from urllib.request import urlretrieve\n",
82 | "import os\n",
83 | "\n",
84 | "# download from 4tu.nl\n",
85 | "urlretrieve('https://data.4tu.nl/ndownloader/files/24061976', 'sepsis.xes.gz')\n",
86 | "sepsis_log = pm4py.read_xes('sepsis.xes.gz')\n",
87 | "os.unlink('sepsis.xes.gz') # clean up"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "len(sepsis_log)"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {
102 | "tags": []
103 | },
104 | "source": [
105 | "## Prefix Extraction\n",
106 | "\n",
107 | "Many different prediction tasks are possible based on an event log. Often, the assumption is made that only a prefix of a trace is known and that a prediction on some future state of the process instance represented by that trace should be made.\n",
108 | "\n",
109 | "The first step is to generate suitable prefixes of the traces contained in the event log to be used as the training samples. As a *simple example*, we may be interested in predicting whether the patient in the process returns ot the emergency room indicated by the event *Return ER* as the last event. Since the event *Return ER* is part of the event log, we need to remove that event and remember in which trace it occurred. "
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "sepsis_returns = [len(list(filter(lambda e: e[\"concept:name\"] == \"Return ER\" ,trace))) > 0 for trace in sepsis_log]"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "# check if this worked\n",
128 | "print(sepsis_log[3][-1])\n",
129 | "print(sepsis_returns[3])\n",
130 | "\n",
131 | "print(sepsis_log[0][-1])\n",
132 | "print(sepsis_returns[0])"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "At the same time, we may be interested in how well we can predict whether a patient returns for different sizes of the prefix, e.g., we can generate a new event log keeping only prefixes of each trace with at most size 10 (*10-prefix*).\n",
140 | "\n",
141 | "**Note that this is just a simple example with 10 chosen as arbitrary prefix length and in the general case you generate not only prefixes of a specific size but of variables or all sizes. Also, some traces are less than 10 events long in which case we would use the full trace for the prediction, which would not be very useful in practice.**"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "# remove Return ER event\n",
151 | "sepsis_log = pm4py.filter_event_attribute_values(sepsis_log, \"concept:name\", \"Return ER\", level = \"event\", retain=False)\n",
152 | "\n",
153 | "from pm4py.objects.log.obj import EventLog, Trace\n",
154 | "# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object\n",
155 | "sepsis_prefixes = EventLog([Trace(trace[0:10], attributes = trace.attributes) for trace in sepsis_log])"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# check the trace length\n",
165 | "print([len(trace) for trace in sepsis_log][0:15])\n",
166 | "print([len(trace) for trace in sepsis_prefixes][0:15])"
167 | ]
168 | },
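169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "As announced above, here is a minimal sketch of generating prefixes of *all* lengths per trace instead of a single fixed length. The name `sepsis_all_prefixes` is ours; note that this grows the training set considerably."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "# one prefix per length 1..n for each trace, replicating the trace attributes\n",
183 | "sepsis_all_prefixes = EventLog([Trace(trace[0:k], attributes = trace.attributes)\n",
184 | "                                for trace in sepsis_log\n",
185 | "                                for k in range(1, len(trace) + 1)])\n",
186 | "len(sepsis_all_prefixes)"
187 | ]
188 | },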
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## Prefix Encoding\n",
174 | "\n",
175 | "For training a prediction model, the traces or sequences of events need to be often transformed to a vector representation. We show how to compute three basic encodings+ using the built-in PM4Py [feature selection and processing](https://pm4py.fit.fraunhofer.de/documentation#decision-trees) functionality.\n",
176 | "\n",
177 | "Of course, more complex encodings such as representing each trace as a sequence of features are possible, e.g., for sequential models such as LSTMs. This is left as exercise. \n",
178 | "\n",
179 | "### Feature Selection \\& Engineering\n",
180 | "\n",
181 | "Before we do prefix encoding, we need to select which features we will use for the prediction. In this example we will only use the \"activity\" of the events as feature. Depending on your prediction problem, you might want to include additional trace/event attributes.\n",
182 | "\n",
183 | "Additionally, you can also derive new trace-level features (e.g., day of week, time since case start) or log-based features (e.g., workload of resources, number of active cases at a certain time). This is left as exercise."
184 | ]
185 | },
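186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "A minimal sketch of deriving a trace-level time feature, assuming the standard XES column names used by PM4Py; the names `df_cases` and `weekday_case_start` are ours."
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "df_cases = pm4py.convert_to_dataframe(sepsis_log)\n",
200 | "# weekday (0 = Monday) of the first event of each case as a derived trace-level feature\n",
201 | "weekday_case_start = (df_cases\n",
202 | "    .groupby('case:concept:name', sort=False)['time:timestamp']\n",
203 | "    .min()\n",
204 | "    .dt.dayofweek)\n",
205 | "weekday_case_start.head()"
206 | ]
207 | },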
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "### Encoding as Set of Events"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "from pm4py.algo.transformation.log_to_features import algorithm as log_to_features\n",
200 | "\n",
201 | "# log_to_feature provides a flexible interface to compute features on an event and trace level\n",
202 | "# see the documentation for more information: https://pm4py.fit.fraunhofer.de/documentation#item-7-0-2 \n",
203 | "data, feature_names = log_to_features.apply(sepsis_prefixes, parameters={\"str_ev_attr\": [\"concept:name\"]})"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "The standard encoding of the `concept:name` attribute (i.e., the event label) is a one-hot encoded vector. Let us have a look at the encoding. The index of the number corresponds to the index in the feature label vector."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "from pm4py.objects.log.util.log import project_traces\n",
220 | "def project_nth(log, index):\n",
221 | " print(str(project_traces(log)[index]))"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "project_nth(sepsis_prefixes, 0)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "print(feature_names)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "print(data[0])"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "The overall data shape is:"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "np.asarray(data).shape"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "So, PM4Py gives us a *one-hot encoding* of the so called *set abstraction* of the event log. This means there are 16 distinct activities in the event log and the feature vector simply encodes whether that activity is present or not in the data. \n",
272 | "\n",
273 | "Let us have a look at the distribution of these feature vectors:"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "# look at the unique vectors and their occurrence frequency/count\n",
283 | "dist_features = np.unique(data, return_counts= True, axis = 0)\n",
284 | "print(dist_features)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "What is the most common feature vector?"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "# argmax give use the index of the most frequent vector\n",
301 | "dist_features[0][np.argmax(dist_features[1])]"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "Makes sense, almost all activities actually are bound to occur in this process. There are only few choices.\n",
309 | "So, this encoding is likely not the most useful one but a very simple one."
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "### Encoding as Bi-Grams / Succession Relation"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "data_2gram, feature_names = log_to_features.apply(sepsis_prefixes, \n",
326 | " parameters={\"str_ev_attr\": [], \n",
327 | " \"str_tr_attr\": [], \n",
328 | " \"num_ev_attr\": [], \n",
329 | " \"num_tr_attr\": [], \n",
330 | " \"str_evsucc_attr\": [\"concept:name\"]})\n",
331 | "feature_names"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "Each feature represents the succession relation (or bigram) between any two activities of the event log. We transform the features into a tensor."
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "data_2gram = np.asarray(data_2gram)"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "Let us, again, have a look at the encoding of the first trace."
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "project_nth(sepsis_log, 0)"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "print(data_2gram[0])"
373 | ]
374 | },
375 | {
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "### Encoding as Bag of Words / Multiset of Events"
380 | ]
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "metadata": {},
385 | "source": [
386 | "Another option would be to use the encoding known as [bag-of-words model](https://en.wikipedia.org/wiki/Bag-of-words_model) in Natural Language Processing, which is constructing a multiset of the one-hot encoded events. So, the frequency with which each activity occurs is reflected. This encoding is not provided in PM4Py but can be easily computed with Pandas and Numpy."
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "We first need to transform the PM4Py event log to a Pandas data frame."
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "sepsis_df = pm4py.convert_to_dataframe(sepsis_prefixes)\n",
403 | "sepsis_df.head(25)"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "We build a bag of words representation by grouping our data and then counting the number of events refering to the individual activities."
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "# concept:name refers to the activity\n",
420 | "# case:concept:name refers to the case identifier\n",
421 | "sepsis_case_act = sepsis_df.loc[:,[\"case:concept:name\", \"concept:name\"]]\n",
422 | "sepsis_case_act"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "# Count the occurrence of activities in a trace (no sorting to keep order of traces stable!)\n",
432 | "sepsis_act_count = sepsis_case_act.groupby([\"case:concept:name\", \"concept:name\"], sort=False).size()\n",
433 | "sepsis_act_count"
434 | ]
435 | },
436 | {
437 | "cell_type": "markdown",
438 | "metadata": {},
439 | "source": [
440 | "We have the count of each activity for each trace and still need to convert this to a tensor format such that we have one feature vector (columns) per case (row)."
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": null,
446 | "metadata": {},
447 | "outputs": [],
448 | "source": [
449 | "sepsis_bag = np.asarray(sepsis_act_count.unstack(fill_value=0))\n",
450 | "sepsis_bag"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {},
457 | "outputs": [],
458 | "source": [
459 | "sepsis_bag.shape"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "Let us, again, have a look at the encoding of the first trace."
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "project_nth(sepsis_log, 0)\n",
476 | "print(sepsis_bag[0])"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": null,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "project_nth(sepsis_log, 1)\n",
486 | "print(sepsis_bag[1])"
487 | ]
488 | },
489 | {
490 | "cell_type": "markdown",
491 | "metadata": {},
492 | "source": [
493 | "This already gives us much more information to work with."
494 | ]
495 | },
496 | {
497 | "cell_type": "markdown",
498 | "metadata": {},
499 | "source": [
500 | "## Prediction\n",
501 | "\n",
502 | "Let us try to build a basic prediction model based on this information. In this example, we aim to predict the binary outcome whether the event `Return ER` occurred or not. \n",
503 | "\n",
504 | "**Disclaimer: here *basic* means that the model and encoding is not expected to be of any quality. Also note that the prediction task, while useful, may not be feasible based on the prefix encoding that we chose. Treat the following code as an example and starting point only!**"
505 | ]
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "metadata": {},
510 | "source": [
511 | "### Data Preparation"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "#### Target Variable\n",
519 | "\n",
520 | "Let us look at the distribution of the target variable."
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": null,
526 | "metadata": {},
527 | "outputs": [],
528 | "source": [
529 | "np.unique(sepsis_returns, return_counts=True)"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": null,
535 | "metadata": {},
536 | "outputs": [],
537 | "source": [
538 | "# For future processing we need 0 and 1 instead of True and False\n",
539 | "sepsis_returns = np.asarray(sepsis_returns).astype(int)\n",
540 | "sepsis_returns.shape"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | "#### Data Scaling & Loading\n",
548 | "\n",
549 | "This often helps prediction models to perform better.\n",
550 | "\n",
551 | "**Important:** make sure to not compute the scaling with the test set included since there is a risk of data leakage otherwise. In other words, the test set should be separated before any pre-processing, which may use a property of the dataset, is applied. Of course, the test set is scaled as well but with the scaler *trained* only on the training set. "
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": null,
557 | "metadata": {},
558 | "outputs": [],
559 | "source": [
560 | "from sklearn.preprocessing import FunctionTransformer, MinMaxScaler\n",
561 | "\n",
562 | "scaler_x = MinMaxScaler()\n",
563 | "data_scaled = scaler_x.fit_transform(sepsis_bag)\n",
564 | "\n",
565 | "scaler_y = FunctionTransformer() # for binary values scaling does not make sense at all but we keep it for symetry and apply the \"NoOp\" scaler\n",
566 | "target_scaled = scaler_y.fit_transform(sepsis_returns.reshape(-1, 1))"
567 | ]
568 | },
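569 | {
570 | "cell_type": "markdown",
571 | "metadata": {},
572 | "source": [
573 | "As announced above, a minimal sketch of separating a test set *before* fitting the scaler, using scikit-learn's `train_test_split`. The variable names are ours, and the rest of this notebook keeps working on the full data for simplicity."
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "from sklearn.model_selection import train_test_split\n",
583 | "\n",
584 | "# split first, then fit the scaler only on the training portion\n",
585 | "x_train, x_test, y_train, y_test = train_test_split(sepsis_bag, sepsis_returns, test_size=0.2, random_state=42)\n",
586 | "\n",
587 | "scaler_train = MinMaxScaler().fit(x_train)\n",
588 | "x_train_scaled = scaler_train.transform(x_train)\n",
589 | "x_test_scaled = scaler_train.transform(x_test) # scaled with the scaler fitted on the training set\n",
590 | "\n",
591 | "print(x_train_scaled.shape, x_test_scaled.shape)"
592 | ]
593 | },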
569 | {
570 | "cell_type": "markdown",
571 | "metadata": {},
572 | "source": [
573 | "### Model Definition\n",
574 | "\n",
575 | "Let's define a simple network and try to overfit. We make use of PyTorch to build a simple Neural Network. \n",
576 | "\n",
577 | "**Disclaimer: Again, this is just a simple example and not in anyway meant as a recommendation for a model.**"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": null,
583 | "metadata": {},
584 | "outputs": [],
585 | "source": [
586 | "class NeuralNetworkBinaryOutcome(nn.Module):\n",
587 | " def __init__(self):\n",
588 | " super(NeuralNetworkBinaryOutcome, self).__init__()\n",
589 | " self.linear_relu_stack = nn.Sequential( \n",
590 | " torch.nn.Linear(x.shape[1], 64),\n",
591 | " nn.BatchNorm1d(num_features=64),\n",
592 | " nn.LeakyReLU(), \n",
593 | " torch.nn.Linear(64, 128),\n",
594 | " nn.BatchNorm1d(num_features=128), \n",
595 | " torch.nn.Linear(128, 1),\n",
596 | " nn.Sigmoid()\n",
597 | " )\n",
598 | "\n",
599 | " def forward(self, x):\n",
600 | " logits = self.linear_relu_stack(x)\n",
601 | " return logits"
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {},
607 | "source": [
608 | "We use a standard training loop in PyTorch:"
609 | ]
610 | },
611 | {
612 | "cell_type": "code",
613 | "execution_count": null,
614 | "metadata": {},
615 | "outputs": [],
616 | "source": [
617 | "def train(dataloader, model, \n",
618 | " loss_fn, measure_fn, \n",
619 | " optimizer, device, epochs): \n",
620 | " \n",
621 | " losses = []\n",
622 | " size = len(dataloader.dataset)\n",
623 | " \n",
624 | " loop = tqdm(range(epochs))\n",
625 | " \n",
626 | " for epoch in loop:\n",
627 | " \n",
628 | " for batch, (X, y) in enumerate(dataloader):\n",
629 | " X, y = X.to(device), y.to(device)\n",
630 | "\n",
631 | " optimizer.zero_grad()\n",
632 | "\n",
633 | " # Compute prediction error\n",
634 | " pred = model(X)\n",
635 | " \n",
636 | " loss = loss_fn(pred, y)\n",
637 | " measure = measure_fn(pred, y)\n",
638 | "\n",
639 | " # Backpropagation\n",
640 | " loss.backward()\n",
641 | " optimizer.step()\n",
642 | " \n",
643 | " losses.append([loss.item(), measure.item()])\n",
644 | " \n",
645 | " loop.set_description('Epoch {}/{}'.format(epoch + 1, epochs))\n",
646 | " loop.set_postfix(loss=loss.item(), measure=measure.item())\n",
647 | " \n",
648 | " return losses"
649 | ]
650 | },
651 | {
652 | "cell_type": "markdown",
653 | "metadata": {},
654 | "source": [
655 | "And can use the following function to get all evaluation results:"
656 | ]
657 | },
658 | {
659 | "cell_type": "code",
660 | "execution_count": null,
661 | "metadata": {},
662 | "outputs": [],
663 | "source": [
664 | "def evaluate_all(dataloader, model, device): \n",
665 | " size = len(dataloader.dataset)\n",
666 | " num_batches = len(dataloader)\n",
667 | " \n",
668 | " model.eval()\n",
669 | " \n",
670 | " result = []\n",
671 | " original = []\n",
672 | "\n",
673 | " with torch.no_grad(): \n",
674 | " for X, y in tqdm(dataloader): \n",
675 | " X, y = X.to(device), y.to(device) \n",
676 | " pred = model(X) \n",
677 | " \n",
678 | " result.extend(pred.flatten().numpy())\n",
679 | " original.extend(y.flatten().numpy())\n",
680 | " \n",
681 | " return np.asarray(result), np.asarray(original)"
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "metadata": {},
687 | "source": [
688 | "### Training\n",
689 | "\n",
690 | "Prepare the data for the PyTorch data loading mechanism."
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": null,
696 | "metadata": {},
697 | "outputs": [],
698 | "source": [
699 | "from torch.utils.data import TensorDataset, DataLoader\n",
700 | "\n",
701 | "# We need float32 data\n",
702 | "x = torch.from_numpy(data_scaled.astype('float32'))\n",
703 | "y = torch.from_numpy(target_scaled.astype('float32'))\n",
704 | "\n",
705 | "# Always check the shapes\n",
706 | "print(x.shape)\n",
707 | "print(y.shape)\n",
708 | "\n",
709 | "ds = TensorDataset(x, y)\n",
710 | "train_dataloader = DataLoader(ds, batch_size=64, shuffle=True)"
711 | ]
712 | },
713 | {
714 | "cell_type": "markdown",
715 | "metadata": {},
716 | "source": [
717 | "Let us check a random single sample from our data loader (always a good idea!) "
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {},
724 | "outputs": [],
725 | "source": [
726 | "inputs, classes = next(iter(train_dataloader))\n",
727 | "print(inputs[0])\n",
728 | "print(classes[0])"
729 | ]
730 | },
731 | {
732 | "cell_type": "markdown",
733 | "metadata": {},
734 | "source": [
735 | "We train the model using cross entropy as loss function accuracy as easier to interpret measure to report."
736 | ]
737 | },
738 | {
739 | "cell_type": "code",
740 | "execution_count": null,
741 | "metadata": {
742 | "tags": []
743 | },
744 | "outputs": [],
745 | "source": [
746 | "## if you want ot use a GPU you need to tweak the requirements.txt to include the GPU-enabled PyTorch\n",
747 | "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
748 | "print('Using {} device'.format(device))\n",
749 | "\n",
750 | "# fix a seed to get reproducible results\n",
751 | "torch.manual_seed(42)\n",
752 | "\n",
753 | "model = NeuralNetworkBinaryOutcome().to(device)\n",
754 | "print(model)\n",
755 | "\n",
756 | "def get_accuracy(y_prob, y_true): \n",
757 | " y_true = y_true.flatten()\n",
758 | " y_prob = y_prob.flatten()\n",
759 | " assert y_true.ndim == 1 and y_true.size() == y_prob.size()\n",
760 | " y_prob = y_prob > 0.5\n",
761 | " return (y_true == y_prob).sum() / y_true.size(0)\n",
762 | "measure_fn = get_accuracy\n",
763 | "\n",
764 | "results = train(train_dataloader, model, \n",
765 | " nn.BCELoss(), # crossentropy for binary target \n",
766 | " get_accuracy, \n",
767 | " torch.optim.Adam(model.parameters()), \n",
768 | " device, 100)"
769 | ]
770 | },
771 | {
772 | "cell_type": "code",
773 | "execution_count": null,
774 | "metadata": {},
775 | "outputs": [],
776 | "source": [
777 | "%matplotlib inline\n",
778 | "\n",
779 | "results_data = pd.DataFrame(results)\n",
780 | "results_data.columns = ['loss', 'measure']\n",
781 | "ax = results_data.plot(subplots=True);"
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": null,
787 | "metadata": {},
788 | "outputs": [],
789 | "source": [
790 | "print(\"Accuracy: \" + str(results[len(results)-1][1]))\n",
791 | "\n",
792 | "true_returns = np.unique(sepsis_returns, return_counts=True)[1][0]\n",
793 | "true_not_returns = np.unique(sepsis_returns, return_counts=True)[1][1]\n",
794 | "\n",
795 | "print(\"Accuracy (never returns)\" + str(true_returns / len(sepsis_returns)))\n",
796 | "print(\"Accuracy (always returns)\" + str(true_not_returns / len(sepsis_returns)))"
797 | ]
798 | },
799 | {
800 | "cell_type": "markdown",
801 | "metadata": {},
802 | "source": [
803 | "## Brief Evaluation"
804 | ]
805 | },
806 | {
807 | "cell_type": "markdown",
808 | "metadata": {},
809 | "source": [
810 | "Ok, that is a bit better compared to simply always saying that the patient does not return. But the accuracy on the training set (**not even considering a test set!**) is still varying a lot and the variation of the log and accuracy over the epochs trained does not look good either. So, let us still have a look at the individual predictions and their score depending on the ground truth."
811 | ]
812 | },
813 | {
814 | "cell_type": "code",
815 | "execution_count": null,
816 | "metadata": {},
817 | "outputs": [],
818 | "source": [
819 | "test_dataloader = DataLoader(ds, batch_size=256, shuffle=False)\n",
820 | "result, original = evaluate_all(test_dataloader, model, device)"
821 | ]
822 | },
823 | {
824 | "cell_type": "code",
825 | "execution_count": null,
826 | "metadata": {},
827 | "outputs": [],
828 | "source": [
829 | "pd_pos = pd.DataFrame({'Returns': result[original == 1]})\n",
830 | "pd_neg = pd.DataFrame({'Does not return': result[original == 0]})\n",
831 | "pd.concat([pd_pos, pd_neg],axis=1).boxplot().set_ylabel('Score')"
832 | ]
833 | },
834 | {
835 | "cell_type": "markdown",
836 | "metadata": {},
837 | "source": [
838 | "There seems to be some separation but likely the prediction model will give us many false positives when used to identify returning patients in practice.\n",
839 | "\n",
840 | "**Of course, you should now compute the usual measures for classification tasks and the threshold for making a decision: recall, precision, confusion matrices, area under the curve and many other ways to deeply evaluate a prediction model. Always consider what would be the use case of your prediction.**"
841 | ]
842 | },
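843 | {
844 | "cell_type": "markdown",
845 | "metadata": {},
846 | "source": [
847 | "A minimal sketch of such an evaluation with scikit-learn, using the scores in `result`, the ground truth in `original`, and an assumed decision threshold of 0.5. Remember that these numbers are computed on the training data and are therefore optimistic."
848 | ]
849 | },
850 | {
851 | "cell_type": "code",
852 | "execution_count": null,
853 | "metadata": {},
854 | "outputs": [],
855 | "source": [
856 | "from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score\n",
857 | "\n",
858 | "# turn the scores into hard decisions with a 0.5 threshold\n",
859 | "predicted = (result > 0.5).astype(int)\n",
860 | "\n",
861 | "print(confusion_matrix(original, predicted))\n",
862 | "print(classification_report(original, predicted))\n",
863 | "print(\"AUC: \" + str(roc_auc_score(original, result)))"
864 | ]
865 | },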
843 | {
844 | "cell_type": "markdown",
845 | "metadata": {},
846 | "source": [
847 | "Why is this so bad? Let us have a look at the data distribution we put in:"
848 | ]
849 | },
850 | {
851 | "cell_type": "code",
852 | "execution_count": null,
853 | "metadata": {},
854 | "outputs": [],
855 | "source": [
856 | "# count the unique vectors\n",
857 | "dist_bags = np.unique(sepsis_bag, return_counts=True, axis=0)\n",
858 | "\n",
859 | "# sort them with numpy\n",
860 | "unique_vectors = dist_bags[0][np.argsort(-dist_bags[1])]\n",
861 | "count_vectors = dist_bags[1][np.argsort(-dist_bags[1])]\n",
862 | "\n",
863 | "pd.DataFrame({'Occurrence of unique sample vectors': count_vectors}).boxplot().set_ylabel('Frequency')"
864 | ]
865 | },
866 | {
867 | "cell_type": "markdown",
868 | "metadata": {},
869 | "source": [
870 | "Many of the traces result in the exact same sample. Let us check what is the \"return status\" for the most common sample that represents more than 175 traces."
871 | ]
872 | },
873 | {
874 | "cell_type": "code",
875 | "execution_count": null,
876 | "metadata": {},
877 | "outputs": [],
878 | "source": [
879 | "# most frequently used vector\n",
880 | "unique_vectors[0]"
881 | ]
882 | },
883 | {
884 | "cell_type": "code",
885 | "execution_count": null,
886 | "metadata": {},
887 | "outputs": [],
888 | "source": [
889 | "# find the sample indicies for this vector\n",
890 | "sample_indicies = np.where((sepsis_bag == unique_vectors[0]).all(axis=1)) \n",
891 | "sample_durations = target_scaled[sample_indicies]"
892 | ]
893 | },
894 | {
895 | "cell_type": "code",
896 | "execution_count": null,
897 | "metadata": {},
898 | "outputs": [],
899 | "source": [
900 | "np.unique(sample_durations, return_counts=True)"
901 | ]
902 | },
903 | {
904 | "cell_type": "markdown",
905 | "metadata": {},
906 | "source": [
907 | "It is clear that, without additional information, there is nothing the prediction model can learn to represent this division for the exact same feature values. We can look at further examples, but it seems we simply cannot reliably predict whether a patient will return from the bag-of-words / multiset of events model in the Sepsis event log.\n",
908 | "\n",
909 | "This was just an example on how to use a predictive model with an event log to predict a binary process characteristic based on events contained in the event log."
910 | ]
911 | }
912 | ],
913 | "metadata": {
914 | "kernelspec": {
915 | "display_name": "Python 3 (ipykernel)",
916 | "language": "python",
917 | "name": "python3"
918 | },
919 | "language_info": {
920 | "codemirror_mode": {
921 | "name": "ipython",
922 | "version": 3
923 | },
924 | "file_extension": ".py",
925 | "mimetype": "text/x-python",
926 | "name": "python",
927 | "nbconvert_exporter": "python",
928 | "pygments_lexer": "ipython3",
929 | "version": "3.8.13"
930 | },
931 | "vscode": {
932 | "interpreter": {
933 | "hash": "986d19a5f30747571c3041dc5ec04d8eebcc8ad808bb611da7ac1a06db10f3a6"
934 | }
935 | }
936 | },
937 | "nbformat": 4,
938 | "nbformat_minor": 4
939 | }
940 |
--------------------------------------------------------------------------------
/r/README.md:
--------------------------------------------------------------------------------
1 | # Applied Process Mining Module
2 |
3 | **Course under construction** 🚧
4 |
5 | R notebooks, see the course overview on the [landing page](../).
--------------------------------------------------------------------------------
/r/lecture1-eventlogs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "tags": []
7 | },
8 | "source": [
9 | "This notebook is part of a course on Applied Process Mining. The collection of notebooks is a *living document* and subject to change. \n",
10 | "\n",
11 | "# Lecture 1 - 'Event Logs and Process Visualization' (R / bupaR)"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Setup\n",
19 | "\n",
20 | "
\n",
21 | "\n",
22 | "In this notebook, we are going to need the `tidyverse` and the `bupaR` packages. If you run this notebook in the recommended Docker environment then there is no need to install any packages. Otherwise, you may need to install the requirements that are commented out below:"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "## Perform the commented out commands below in a separate R session\n",
32 | "# install.packages(\"tidyverse\")\n",
33 | "# install.packages(\"bupaR\")\n",
34 | "# install.packages(\"processmapR\")\n",
35 | "# install.packages(\"processanimateR\")"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "We are setting up some convenicence options for the notebook and import dependencies:"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# for larger and readable plots\n",
52 | "options(jupyter.plot_scale=1.25)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# the initial execution of these may give you warnings that we can safely ignore\n",
62 | "suppressPackageStartupMessages (library(tidyverse)) \n",
63 | "suppressPackageStartupMessages (library(bupaR))\n",
64 | "library(processmapR)\n",
65 | "library(processanimateR)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "## Event Logs\n",
73 | "\n",
74 | "This part introduces event logs and their unique properties that provide the basis for any Process Mining method. Together with `bupaR` several event logs are distributed that can be loaded without further processing. \n",
75 | "In this lecture we are going to make use of the following datasets:\n",
76 | "\n",
77 | "* Patients, a synthetically generated example event log in a hospital setting.\n",
78 | "* Sepsis, a real-life event log taken from a Dutch hospital. The event log is publicly available here: https://doi.org/10.4121/uuid:915d2bfb-7e84-49ad-a286-dc35f063a460 and has been used in many Process Mining related publications."
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "### Exploring Event Data\n",
86 | "\n",
87 | "Let us first explore the event data without any prior knowledge about event log structure or properties. We convert the `patients` event log below to a standard `tibble` (https://tibble.tidyverse.org/) and inspect the first rows."
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "patients %>%\n",
97 | " as_tibble() %>%\n",
98 | " head()"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "The most important ingredient of an event log is the timestamps column `time`. This allows us to establish a sequence of events."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "patients %>% \n",
115 | " filter(time < '2017-01-31') %>% \n",
116 | " ggplot(aes(time, \"Event\")) + \n",
117 | " geom_point() + \n",
118 | " theme_bw()"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "We also need to have information on the kind of actions or `activities` performed:"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "patients %>%\n",
135 | " as_tibble() %>% \n",
136 | " distinct(handling)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "Let us have a look at what other data is available:"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "patients %>%\n",
153 | " as_tibble() %>% \n",
154 | " distinct(patient) %>% \n",
155 | " head()"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Maybe the patient identifier could be a good candidate for defining a process `case` since this is an 'entity' that we would like to follow. When counting the events that occurred per individual patient it seems that there is a similar number of events for each patient, which is generally a good indicator for a process case identifier:"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "patients %>%\n",
172 | " as_tibble() %>% \n",
173 | " count(patient) %>% \n",
174 | " head()"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "Let use decide that we want to look at the process by following the patient identifier as `case identifier`:"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "patients %>% \n",
191 | " filter(time < '2017-01-31') %>% \n",
192 | " ggplot(aes(time, patient, color = handling)) + \n",
193 | " geom_point() + \n",
194 | " theme_bw()"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "The scatterplot above is known as `Dotted Chart` in the process mining community and provides an 'at a glance' overview on the events and their temporal relation when grouped by a case. It seems that each of the sequences of events (also known as `traces`) start with the `Registration` event. Let us have a look at the event data sorted by patient identifier and by time:"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "patients %>% \n",
211 | " as_tibble() %>% \n",
212 | " arrange(patient, time) %>% \n",
213 | " head(14)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "An individual process execution (e.g., for patient 1) consists of several activities that are done in a sequence. However, we have more information available than simply the sequence of events. For each occurrence of an activity we have two events: a `start` event and a `complete` event as captured in the column `registration_type`. These event refer to the lifecycle of an activity and allow us to capture the `duration` of an activity. Much more complex lifecycles of activities are possible, a general model is described here: http://bupar.net/creating_eventlogs.html#Transactional_life_cycle"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 | "### Further resources"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "* [XES Standard](http://xes-standard.org/)\n",
235 | "* [Creating event logs from CSV files in bupaR](http://bupar.net/creating_eventlogs.html)\n",
236 | "* [Changing the case, activity notiions in bupaR](http://bupar.net/mapping.html)"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "### Reflection Questions\n",
244 | "\n",
245 | "* What could be the reason a column `.order` is included in this dataset?\n",
246 | "* How could the column `employee` be used?\n",
247 | "* What is the use of the column `handling_id` and in which situation is it required?"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "## Basic Process Visualization"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "There are several generic visualizations that can be used to get a basic understanding of the process behavior."
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "### Set of Traces"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "Since a process, in our basic definition, is a set of event sequences or traces, we can simply visualize the set of distinct trace variants. Here we only consider the `trace variant` which means that we only consider the order of activities executed disregarding any other aspect (timing, lifecycles)."
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "patients %>% \n",
285 | " trace_explorer(coverage = 1.0, abbreviate = T, type = ) # abbreviated here due to poor Jupyter notebook output scaling"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "### Dotted Chart"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "The `Dotted Chart` adds the timing aspect of the individual traces and visualized all of them at-a-glance. It can be configured in many different ways and provides a good insight into time-related aspects of the process behavior."
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "patients %>%\n",
309 | " filter(time < '2017-01-31') %>% \n",
310 | " dotted_chart(add_end_events = T)"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "patients %>% \n",
320 | " dotted_chart(\"relative\", add_end_events = T)"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "We can also use `plotly` to get an interactive visualization:"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "patients %>% \n",
337 | " dotted_chart(\"relative\", add_end_events = T, plotly = TRUE)"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "sepsis %>% \n",
347 | " dotted_chart(\"relative_day\",\n",
348 | " sort = \"start_day\", \n",
349 | " units = \"hours\")"
350 | ]
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | "Check out other process visualization options using bupaR:\n",
357 | "\n",
358 | "* [Further Dotted Charts](http://bupar.net/dotted_chart.html)\n",
359 | "* [Exploring Time, Resources, Structuredness](http://bupar.net/exploring.html)"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "## Process Map Visualization"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "patients %>% \n",
376 | " process_matrix() %>% \n",
377 | " plot()"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "patients %>% \n",
387 | " process_map()"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "patients %>% \n",
397 | " process_map(type = performance(units = \"hours\"))"
398 | ]
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {},
403 | "source": [
404 | "## Real-life Processes"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "sepsis %>% \n",
414 | " precedence_matrix() %>% \n",
415 | " plot()"
416 | ]
417 | }
418 | ],
419 | "metadata": {
420 | "kernelspec": {
421 | "display_name": "R",
422 | "language": "R",
423 | "name": "ir"
424 | },
425 | "language_info": {
426 | "codemirror_mode": "r",
427 | "file_extension": ".r",
428 | "mimetype": "text/x-r-source",
429 | "name": "R",
430 | "pygments_lexer": "r",
431 | "version": "4.2.2"
432 | },
433 | "vscode": {
434 | "interpreter": {
435 | "hash": "28aff1567d8aae5536826c1be921f2ff2e204808293d43dc67bdcb73bd29110e"
436 | }
437 | }
438 | },
439 | "nbformat": 4,
440 | "nbformat_minor": 4
441 | }
442 |
--------------------------------------------------------------------------------
/r/lecture2-discovery.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "tags": []
7 | },
8 | "source": [
9 | "This notebook is part of a course on Applied Process Mining. The collection of notebooks is a *living document* and subject to change.\n",
10 | "\n",
11 | "# Lecture 2 - 'Process Discovery with the Heuristics Miner' (R / bupaR)"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Setup\n",
19 | "\n",
20 | "
\n",
21 | "\n",
22 | "In this notebook, we are going to need the `tidyverse` and the `bupaR` packages."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "## Perform the commented out commands below in a separate R session\n",
32 | "# install.packages(\"tidyverse\")\n",
33 | "# install.packages(\"bupaR\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# for larger and readable plots\n",
43 | "options(jupyter.plot_scale=1.25)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# the initial execution of these may give you warnings that we can safely ignore\n",
53 | "suppressPackageStartupMessages(library(tidyverse)) \n",
54 | "suppressPackageStartupMessages(library(bupaR))\n",
55 | "library(xesreadR)\n",
56 | "library(processanimateR)\n",
57 | "suppressPackageStartupMessages(library(heuristicsmineR))"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## Process Discovery"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### Process Models 🚧\n",
72 | "\n",
73 | "bupaR does not provide an option to load BPMN models yet. Please have a look at the PM4Py instructions."
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "### Quality Dimensions\n",
81 | "\n",
82 | "The quality dimensions `fitness`, `precision`, `simplicity`, and `generalisation` are best illustrated by using a small example event log.\n",
83 | "We are using an example event log in XES format that is used in the book `Process Mining - Data Science in Action` by Wil van der Aalst, which is downloaded and stored in the `../data` directory with the code below:"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# ignore the warnings, the package needs to be updated and no 'activity instance identifier' is required in this example\n",
93 | "example_log <- xesreadR::read_xes(\"../data/Lfull.xes\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "Let us have a look at the event log in tabular form. The mapping of the activity labels to actual activities is:\n",
101 | "\n",
102 | "* a = register request, \n",
103 | "* b = examine thoroughly, \n",
104 | "* c = examine casually, \n",
105 | "* d = check ticket, \n",
106 | "* e = decide, \n",
107 | "* f = reinitiate request, \n",
108 | "* g = pay compensation, and \n",
109 | "* h = reject request."
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "example_log %>% head(10)"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "Now let us discover a process map as we have seen in Lecture 1:"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {
132 | "tags": []
133 | },
134 | "outputs": [],
135 | "source": [
136 | "example_log %>% process_map()"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "Not really very insightful the directly-follows based process map visualization."
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "### Heuristics Miner"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "#### L_heur_1 Event Log\n",
158 | "We are using an example event log that is suited to introduce the Heuristics Miner algorithm. This event log is already included with the `heuristicsmineR` package in bupaR."
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "L_heur_1 %>% head(9)"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "The naive process map drawing reveals some weird behaviour between the activities `b` and `c`. There seems to be a loop between both activities even though they never occur more than once in each trace."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "L_heur_1 %>%\n",
184 | " as_tibble() %>%\n",
185 | " mutate(activity_id = as.character(activity_id)) %>%\n",
186 | " mutate(activity_id = if_else(activity_id == \"b\" | activity_id == \"c\", \"cb\", activity_id)) %>%\n",
187 | " simple_eventlog(case_id = \"CASE_concept_name\", activity_id = \"activity_id\", timestamp = \"timestamp\") %>%\n",
188 | " process_map()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "L_heur_1 %>% process_map()"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "#### Dependency Graphs"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "L_heur_1 %>% precedence_matrix(type = \"absolute\") %>% plot()"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "Based on the precedence matrix, we can follows the formula for the dependency relation:"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "mat_pre <- L_heur_1 %>% precedence_matrix(type = \"absolute\") %>% as.matrix()\n",
230 | "mat_pre"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "Since, we want to compute how often activities follow each other in either direction, we need the transposed matrix:"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "t_mat_pre <- t(mat_pre)\n",
247 | "t_mat_pre"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "And, then it is basic math:"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "(mat_pre - t_mat_pre) / (mat_pre + t_mat_pre + 1)"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "Of course, this has already been implemented in the `heuristicsmineR` package. There are also some more details of the algorithm that deal with the detection of loops as well as making sure that all activities are connected to each other. Please consult the original [Heuristics Miner paper](https://is.ieis.tue.nl/staff/aweijters/WP334_FHMv3.pdf) and the documentation of `heuristicsmineR` for more details."
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "L_heur_1 %>% dependency_matrix(threshold = 0) %>% plot()"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "A dependency graph can be "
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "L_heur_1 %>% \n",
296 | " dependency_matrix(threshold = 0.8) %>% \n",
297 | " render_dependency_matrix()"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "Have a look at the parameters (via `?dependency_matrix`) and try to change some of them to see what happens."
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "L_heur_1 %>% \n",
314 | " dependency_matrix(threshold = 0.9) %>% \n",
315 | " render_dependency_matrix()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "sepsis %>% precedence_matrix() %>% plot"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "sepsis %>% \n",
334 | " dependency_matrix(threshold = 0.7) %>% \n",
335 | " render_dependency_matrix()"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "sepsis %>% \n",
345 | " dependency_matrix(threshold = 0.9) %>% \n",
346 | " render_dependency_matrix()"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {},
352 | "source": [
353 | "### Causal nets"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "L_heur_1 %>% \n",
363 | " causal_net(threshold = 0.8) %>%\n",
364 | " render_causal_net()"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "metadata": {},
371 | "outputs": [],
372 | "source": [
373 | "sepsis %>%\n",
374 | " act_unite(Release = c(\"Release A\", \"Release B\", \"Release C\", \"Release D\", \"Release E\")) %>%\n",
375 | " causal_net(all_connected = TRUE) %>%\n",
376 | " render_causal_net()"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "example_log %>% \n",
386 | " causal_net() %>% \n",
387 | " render_causal_net()"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "#### Visualise / Convert as BPMN 🚧"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "In bupaR there is currently no support for BPMN visualizations. However, it is possible to convert the Causal net into a Petri net. For simple process models, the mapping between BPMN and Petri nets is easy to understand. Thus, we are using Petri nets here."
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "L_heur_1 %>% \n",
411 | " causal_net() %>%\n",
412 | " as.petrinet() %>%\n",
413 | " petrinetR::render_PN()"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {},
420 | "outputs": [],
421 | "source": [
422 | "example_log %>% \n",
423 | " causal_net() %>%\n",
424 | " as.petrinet() %>%\n",
425 | " petrinetR::render_PN()"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "**TODO** we could use the discovered Petri net with PM4Py to do further processing 🚧"
433 | ]
434 | }
435 | ],
436 | "metadata": {
437 | "kernelspec": {
438 | "display_name": "R",
439 | "language": "R",
440 | "name": "ir"
441 | },
442 | "language_info": {
443 | "codemirror_mode": "r",
444 | "file_extension": ".r",
445 | "mimetype": "text/x-r-source",
446 | "name": "R",
447 | "pygments_lexer": "r",
448 | "version": "4.2.2"
449 | }
450 | },
451 | "nbformat": 4,
452 | "nbformat_minor": 4
453 | }
454 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # we use the old PM4Py API that was considerably changed in 2.3.0
2 | pm4py~=2.2.32
3 | numpy
4 | pandas
5 | networkx
6 | deprecation
7 | matplotlib
8 | scipy
9 | scikit-learn
10 | lxml
11 | graphviz
12 | plotnine
13 | tqdm
14 | jupyterlab
15 | ipywidgets
16 | # CUDA enabled PyTorch is very large
17 | --extra-index-url https://download.pytorch.org/whl/cpu
18 | torch
--------------------------------------------------------------------------------