├── .gitignore ├── CODEOWNERS ├── LICENSE ├── README.md ├── demo ├── README.md ├── log_examples │ ├── DomesticDeclarations.xes │ ├── nurse_workflow.csv │ ├── physician_workflow.csv │ ├── remote_monitoring.csv │ └── remote_monitoring_eng.csv ├── profit_examples.py ├── profit_examples_eng.ipynb └── profit_examples_rus.ipynb ├── meta ├── CHANGES.rst ├── cat_logo.jpg ├── cycles_joining.png ├── logo.png ├── logo_2.png ├── logo_3.png ├── pm_general.png └── process.png ├── profit ├── README.md ├── __init__.py ├── graph.py ├── log.py ├── observer_abc.py ├── process_map.py ├── renderer.py ├── transition_matrix.py ├── util_agg.py └── util_pm.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Editors 4 | .vscode/ 5 | .idea/ 6 | 7 | # Vagrant 8 | .vagrant/ 9 | 10 | # Mac/OSX 11 | .DS_Store 12 | 13 | # Windows 14 | Thumbs.db 15 | 16 | # Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | 59 | 60 | .nox/ 61 | 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | 103 | # pyenv 104 | .python-version 105 | 106 | # celery beat schedule file 107 | celerybeat-schedule 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | 134 | 135 | .dmypy.json 136 | dmypy.json 137 | 138 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @Siella 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Liubov Elkhovskaya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation 
files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Table of contents 4 | * [General info](#general-info) 5 | * [Setup](#setup) 6 | * [Features](#features) 7 | * [Status](#status) 8 | * [Publications](#publications) 9 | * [References](#references) 10 | 11 | ## General info 12 | > Process mining module for Python. 13 | 14 | *Process mining* is an emerging discipline that has been developing for the last two decades. It is a promising approach for the analysis and improving of intraorganizational workflows. Process mining has several types of techniques: *process discovery*, *conformance checking*, and *process enhancement*. With process discovery techniques, one can automatically construct a process model from routinely recorded data, an *event log*. Conformance checking aims to evaluate model compliance with data. 
After analysis of the real process executions, its enhancement can be proposed. 15 | 16 | There is plenty of commercial and open-source software for process mining, but how to use knowledge such tools extract from a log? In most cases, it could be performed only by visual assessment of a process model or via its statistics, and it would be desirable to extract knowledge *from* discovered processes. The derived data or formal structure (model) describing the process could be used, e.g., in modelling. This is the main motivation for the ProFIT development which provides such an opportunity, to look behind process discovery results. Future work encompasses all three steps in process mining. 17 | 18 | See short demo-video below about ProFIT! (In Russian) [Here](https://youtu.be/JQ9eBVE8OlU) is another video with application cases. 19 | 20 | 21 | https://user-images.githubusercontent.com/36673448/142000636-30833b62-b6cf-4976-beb5-ad4fd2f09c0f.mp4 22 | 23 | 24 | 25 | ## Setup 26 | You can clone this repository with the `git clone` command on a local machine and add a directory with the package to PATH. To start work with ProFIT, you should import `ProcessMap` from the `profit` module. 27 | 28 | See the details of how to use it in [demo section](https://github.com/Siella/ProFIT/blob/master/demo). 29 | 30 | **Required packages**: 31 | * `Pandas` 32 | * `Graphviz` 33 | * `PM4Py` 34 | 35 | (See [requirements](https://github.com/Siella/ProFIT/blob/master/requirements.txt)) 36 | 37 | ## Features 38 | Process model discovered by ProFIT is a directly-follows graph (see figure below) with activities represented in nodes and their precedence relations as edges. The green node indicates the beginning of the process and shows the total number of cases present in the log, and the red node is related to the end of the process. 
The internal nodes and edges of the graph show the absolute frequencies of events and transitions, respectively: the more absolute value is, the darker or thicker element is. 39 | ![Process model example](/meta/process.png) 40 | 41 | The discovery algorithm includes the basics of *Fuzzy Miner*, and as known, its properties do not guarantee a reachable graph which is desired to see the complete behaviors of the process. So, we perform depth-first search on the graph two times to check whether each node is a descendant of the initial state (“start”) and an ancestor of the terminal state (“end”) of the process model. This way, the model adjusted represents an executable process. 42 | 43 | One can change process model details by tuning activities and paths rates: from the simplest to complex and fullness one. To achieve more automated way of complexity control, we defined the problem of discovering an optimal process model. In this optimization problem, an objective function includes complexity and loss terms. Regularization factor controls the trade-off between human comprehension of the process model and algorithm performance of log behaviours capturing. Thus, one can discover a process model by defining appropriate balance between its complexity and completeness. 44 | 45 | We also introduced an approach for process model aggregation and abstraction (and possible simplification) via *meta-states* search. The idea originated from the healthcare domain, where a patient is involved in the processes. Still, it is broadly considered as an extension of a process discovery technique. Cycle nodes comprise a meta-state, if probability of cycle occurrence in the log exceeds specified threshold. 
46 | 47 | **List of main features ready**: 48 | - [x] Model complexity / completeness control via `set_rates()` method (`activities`: int [0,100], `paths`: int [0,100]); 49 | - [x] Optimized process model discovering via `set_params()` method (`optimize`: bool); 50 | - [x] Process model simplification by nodes aggregation via `set_params()` method (`aggregate`: bool). 51 | 52 | **To-do list**: 53 | - [ ] Consider length-2(*k*)-relationship in log; 54 | - [ ] Perform unit-tests; 55 | - [x] Use results in predictive modeling. 56 | 57 | ## Status 58 | This project is an ongoing research work, but you can try it already! 59 | 60 | 61 | 62 | ## Publications 63 | 1. Elkhovskaya L., Kovalchuk S. (2021) Feature Engineering with Process Mining Technique for Patient State Predictions. In: Paszynski M., Kranzlmüller D., Krzhizhanovskaya V.V., Dongarra J.J., Sloot P.M. (eds) Computational Science – ICCS 2021. ICCS 2021. Lecture Notes in Computer Science, vol 12744. Springer, Cham. https://doi.org/10.1007/978-3-030-77967-2_48. 64 | 65 | ## References 66 | 1. Van der Aalst, W. M. (2016). Process mining: Data science in action. Springer, Berlin, Heidelberg. 67 | 2. Ferreira, D. R. (2017). A primer on process mining. Springer, Cham. 68 | 3. Günther, C. W., & Van Der Aalst, W. M. (2007, September). Fuzzy mining–adaptive process simplification based on multi-perspective metrics. In International conference on business process management (pp. 328-343). Springer, Berlin, Heidelberg. 69 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## Data description 2 | ProFIT usage is demonstrated with two event logs: 3 | 1. [**Remote monitoring**](https://github.com/Siella/ProFIT/blob/master/demo/log_examples/remote_monitoring_eng.csv): data provided by [PMT Online](https://pmtonline.ru/) within a collaborative project. 
It contains events triggered by in-home blood pressure measurements made by patients suffering from arterial hypertension. There are several clinical and non-clinical events: red zone (exceeding critical levels), yellow zone (exceeding target levels), notifications about measurement missing, etc. 4 | 2. [**Reimbursement process**](https://github.com/Siella/ProFIT/blob/master/demo/log_examples/DomesticDeclarations.xes): data from BPI Challenge 2020 collected from the reimbursement process at Eindhoven University of Technology for 2017-2018. See data description on the [BPI Challenge website](https://icpmconference.org/2020/bpi-challenge/). 5 | 6 | Additionally, we upload two event logs for physician and nurse workflows in [Almazov center](http://www.almazovcentre.ru/?lang=en) in Saint Petersburg, one of the leading cardiological centers in Russia with which we collaborate. We used data from a hospital access control system and a healthcare information system to compose an event log of staff activities and logistics. These sophisticated processes comprise labs, procedures, branch communications, etc. Data need to be processed thoroughly, so we do not use these cases for demo now. 7 | 8 | ## Code examples 9 | 10 | Init ProcessMap and fit data via `set_log` method. 11 | 12 | ```python 13 | declarations = "../ProFIT/demo/log_examples/DomesticDeclarations.xes" 14 | pm = ProcessMap() 15 | pm.set_log(FILE_PATH = declarations) 16 | pm.update() # may be called after series of settings 17 | ``` 18 | 19 | Model adjustment. 
20 | ```python 21 | pm.set_rates(80, 5) # activity and path rates (should set optimize=False for this setting) 22 | # below are default parameters 23 | pm.set_params(optimize=True, # option to discover an optimal process model 24 | lambd=0.5, # regularization factor for model complexity and completeness (increasing lambda results in a simpler model) 25 | step=10, # step size for grid search of an optimal model 26 | verbose=False, # print the progress of optimization 27 | aggregate=False, # option to aggregate nodes into meta-states (if there are) 28 | agg_type='outer', # type of aggregation (possible are 'inner' and 'outer') 29 | heuristic='all', # heuristic to use for element relations redirecting 30 | pre_traverse=False, # establish order of activities traversing a directed graph 31 | ordered=False, # whether the order of meta-state activities are strict 32 | cycle_rel=0.5, # significance threshold for cycles to compose meta-states 33 | colored=True, # black and white or colored process visualization 34 | render_format='png') # saving format (should be supported by Graphviz) 35 | pm.update() 36 | ``` 37 | 38 | Visualize a result (for jupyter-notebook `show_only=False`). 
39 | ```python 40 | pm.render(show_only=True, save_path=None) 41 | ``` 42 | -------------------------------------------------------------------------------- /demo/log_examples/nurse_workflow.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/demo/log_examples/nurse_workflow.csv -------------------------------------------------------------------------------- /demo/log_examples/physician_workflow.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/demo/log_examples/physician_workflow.csv -------------------------------------------------------------------------------- /demo/log_examples/remote_monitoring.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/demo/log_examples/remote_monitoring.csv -------------------------------------------------------------------------------- /demo/profit_examples.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import pandas as pd 3 | 4 | # Package import 5 | PATH = os.getcwd()[:os.getcwd().rfind('\\')] # path to ProFIT directory 6 | sys.path.append(PATH) 7 | sys.path.append(PATH+'\\profit') 8 | 9 | from profit import ProcessMap 10 | 11 | monitoring = PATH + "/demo/log_examples/remote_monitoring_eng.csv" 12 | declarations = PATH + "/demo/log_examples/DomesticDeclarations.xes" 13 | 14 | def main(): 15 | if __name__== "__main__" : 16 | # Log example in csv 17 | df_monitoring = pd.read_csv(monitoring, encoding='cp1251') 18 | print(df_monitoring.head()) 19 | 20 | # Init ProcessMap and set log in CSV/TXT/XES/pandas.DataFrame 21 | pm = ProcessMap() 22 | pm.set_log(FILE_PATH = monitoring, 23 | # data 
= df_monitoring, 24 | encoding='cp1251') 25 | pm.set_rates(80, 5) # activity and path rates (should set optimize=False for this setting) 26 | pm.set_params(optimize=False, aggregate=False) # option to discover an optimal process model and its elements aggregation 27 | pm.update() # update method have to be called after each setting (or series)! 28 | pm.render(show_only=True, save_path=None) # pass a path to a directory where the result will be saved 29 | 30 | main() -------------------------------------------------------------------------------- /demo/profit_examples_eng.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Contributors:\n", 15 | "- *Liubov Elkhovskaya* lelkhovskaya@itmo.ru\n", 16 | "- *Alexander Kshenin* adkshenin@itmo.ru\n", 17 | "- *Marina Balakhontceva* mbalakhontceva@itmo.ru\n", 18 | "- *Sergey Kovalchuk* kovalchuk@itmo.ru" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "ProFIT is for automatic process model construction from data. Input is an ordered (by a *timestamp*) event log (table data in CSV/TXT or xml data in XES) including *case id* and *activity* attributes at least. Process model is shown as a directly-follows graph, where the green node is the beginning of the process and red one is the end." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Package location" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "*For demo launch from repository in jupyter-notebook.*" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Repository link for download: https://github.com/Siella/ProFIT." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import os\n", 63 | "import configparser\n", 64 | "\n", 65 | "PATH = os.getcwd()[:os.getcwd().rfind('\\\\')] # path to ProFIT directory\n", 66 | "config = configparser.ConfigParser()\n", 67 | "config.add_section(\"packageLocation\")\n", 68 | "config.set(\"packageLocation\", \"workingDir\", PATH)\n", 69 | "config.set(\"packageLocation\", \"packageDir\", PATH+'\\\\profit')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Import" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import sys\n", 86 | "sys.path.append(config[\"packageLocation\"][\"workingDir\"])\n", 87 | "sys.path.append(config[\"packageLocation\"][\"packageDir\"])" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from profit import ProcessMap" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## How to use" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "To start use ProFIT, one should create a variable and declare it as a `ProcessMap` object. 
Next, a path to log data in CSV/TXT/XES or as `pandas.DataFrame` should be passed via `set_log` method." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "monitoring = PATH + \"/demo/log_examples/remote_monitoring_eng.csv\"\n", 120 | "declarations = PATH + \"/demo/log_examples/DomesticDeclarations.xes\"" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/html": [ 131 | "
\n", 132 | "\n", 145 | "\n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
case_idtasktimestamp
04082New med. program (operator)2018-01-10 00:00:03
14082Red zone (operator)2018-01-10 22:09:21
24173New med. program (operator)2018-01-12 00:00:04
34176New med. program (operator)2018-01-12 00:00:04
44082Red zone (doctor FD)2018-01-12 02:44:28
\n", 187 | "
" 188 | ], 189 | "text/plain": [ 190 | " case_id task timestamp\n", 191 | "0 4082 New med. program (operator) 2018-01-10 00:00:03\n", 192 | "1 4082 Red zone (operator) 2018-01-10 22:09:21\n", 193 | "2 4173 New med. program (operator) 2018-01-12 00:00:04\n", 194 | "3 4176 New med. program (operator) 2018-01-12 00:00:04\n", 195 | "4 4082 Red zone (doctor FD) 2018-01-12 02:44:28" 196 | ] 197 | }, 198 | "execution_count": 5, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "import pandas as pd\n", 205 | "\n", 206 | "# log demo\n", 207 | "df_monitoring = pd.read_csv(monitoring, encoding='cp1251')\n", 208 | "df_monitoring.head()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "pm = ProcessMap()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 7, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "pm.set_log(FILE_PATH = monitoring, \n", 227 | "# data = df_monitoring,\n", 228 | " encoding = 'utf-8')\n", 229 | "\n", 230 | "pm.update()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "`update` method have to be called after each setting (or series)!" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "`render` method returns a process model as a directly-follows graph in DOT language, which is supported and can be visualized by Graphviz. To show and/or save a model in formats supported by Graphviz, one needs to call this method and pass a path to a directory where the result will be saved." 
245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 8, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "image/svg+xml": [ 255 | "\r\n", 256 | "\r\n", 258 | "\r\n", 260 | "\r\n", 261 | "\r\n", 263 | "\r\n", 264 | "%3\r\n", 265 | "\r\n", 266 | "\r\n", 267 | "Red zone (operator)\r\n", 268 | "\r\n", 269 | "Red zone (operator) (6719)\r\n", 270 | "\r\n", 271 | "\r\n", 272 | "Red zone (operator)->Red zone (operator)\r\n", 273 | "\r\n", 274 | "\r\n", 275 | "490\r\n", 276 | "\r\n", 277 | "\r\n", 278 | "Red zone (doctor FD)\r\n", 279 | "\r\n", 280 | "Red zone (doctor FD) (6683)\r\n", 281 | "\r\n", 282 | "\r\n", 283 | "Red zone (operator)->Red zone (doctor FD)\r\n", 284 | "\r\n", 285 | "\r\n", 286 | "2158\r\n", 287 | "\r\n", 288 | "\r\n", 289 | "Red zone (doctor)\r\n", 290 | "\r\n", 291 | "Red zone (doctor) (5275)\r\n", 292 | "\r\n", 293 | "\r\n", 294 | "Red zone (operator)->Red zone (doctor)\r\n", 295 | "\r\n", 296 | "\r\n", 297 | "3672\r\n", 298 | "\r\n", 299 | "\r\n", 300 | "Meas. missing (doctor)\r\n", 301 | "\r\n", 302 | "Meas. missing (doctor) (1674)\r\n", 303 | "\r\n", 304 | "\r\n", 305 | "Notification (operator)\r\n", 306 | "\r\n", 307 | "Notification (operator) (4462)\r\n", 308 | "\r\n", 309 | "\r\n", 310 | "Meas. 
missing (doctor)->Notification (operator)\r\n", 311 | "\r\n", 312 | "\r\n", 313 | "1058\r\n", 314 | "\r\n", 315 | "\r\n", 316 | "Yellow zone (doctor FD)\r\n", 317 | "\r\n", 318 | "Yellow zone (doctor FD) (3493)\r\n", 319 | "\r\n", 320 | "\r\n", 321 | "Yellow zone (doctor FD)->Red zone (operator)\r\n", 322 | "\r\n", 323 | "\r\n", 324 | "642\r\n", 325 | "\r\n", 326 | "\r\n", 327 | "Yellow zone (doctor FD)->Notification (operator)\r\n", 328 | "\r\n", 329 | "\r\n", 330 | "872\r\n", 331 | "\r\n", 332 | "\r\n", 333 | "Yellow zone (doctor)\r\n", 334 | "\r\n", 335 | "Yellow zone (doctor) (3359)\r\n", 336 | "\r\n", 337 | "\r\n", 338 | "Yellow zone (doctor FD)->Yellow zone (doctor)\r\n", 339 | "\r\n", 340 | "\r\n", 341 | "1073\r\n", 342 | "\r\n", 343 | "\r\n", 344 | "Red zone (doctor FD)->Red zone (operator)\r\n", 345 | "\r\n", 346 | "\r\n", 347 | "2244\r\n", 348 | "\r\n", 349 | "\r\n", 350 | "Red zone (doctor FD)->Red zone (doctor FD)\r\n", 351 | "\r\n", 352 | "\r\n", 353 | "1429\r\n", 354 | "\r\n", 355 | "\r\n", 356 | "Red zone (doctor FD)->Notification (operator)\r\n", 357 | "\r\n", 358 | "\r\n", 359 | "536\r\n", 360 | "\r\n", 361 | "\r\n", 362 | "Red zone (doctor FD)->Red zone (doctor)\r\n", 363 | "\r\n", 364 | "\r\n", 365 | "674\r\n", 366 | "\r\n", 367 | "\r\n", 368 | "Red zone (doctor FD)->Yellow zone (doctor)\r\n", 369 | "\r\n", 370 | "\r\n", 371 | "976\r\n", 372 | "\r\n", 373 | "\r\n", 374 | "Notification (operator)->Notification (operator)\r\n", 375 | "\r\n", 376 | "\r\n", 377 | "983\r\n", 378 | "\r\n", 379 | "\r\n", 380 | "Notification (operator)->Yellow zone (doctor)\r\n", 381 | "\r\n", 382 | "\r\n", 383 | "633\r\n", 384 | "\r\n", 385 | "\r\n", 386 | "Meas. missing (operator)\r\n", 387 | "\r\n", 388 | "Meas. missing (operator) (2336)\r\n", 389 | "\r\n", 390 | "\r\n", 391 | "Notification (operator)->Meas. 
missing (operator)\r\n", 392 | "\r\n", 393 | "\r\n", 394 | "1632\r\n", 395 | "\r\n", 396 | "\r\n", 397 | "end\r\n", 398 | "\r\n", 399 | "\r\n", 400 | "\r\n", 401 | "\r\n", 402 | "Notification (operator)->end\r\n", 403 | "\r\n", 404 | "\r\n", 405 | "131\r\n", 406 | "\r\n", 407 | "\r\n", 408 | "Red zone (doctor)->Red zone (operator)\r\n", 409 | "\r\n", 410 | "\r\n", 411 | "2269\r\n", 412 | "\r\n", 413 | "\r\n", 414 | "Red zone (doctor)->Red zone (doctor FD)\r\n", 415 | "\r\n", 416 | "\r\n", 417 | "2304\r\n", 418 | "\r\n", 419 | "\r\n", 420 | "Yellow zone (doctor)->Yellow zone (doctor FD)\r\n", 421 | "\r\n", 422 | "\r\n", 423 | "2415\r\n", 424 | "\r\n", 425 | "\r\n", 426 | "New med. program (operator)\r\n", 427 | "\r\n", 428 | "New med. program (operator) (253)\r\n", 429 | "\r\n", 430 | "\r\n", 431 | "New med. program (operator)->Red zone (operator)\r\n", 432 | "\r\n", 433 | "\r\n", 434 | "171\r\n", 435 | "\r\n", 436 | "\r\n", 437 | "Meas. missing (operator)->Meas. missing (doctor)\r\n", 438 | "\r\n", 439 | "\r\n", 440 | "1466\r\n", 441 | "\r\n", 442 | "\r\n", 443 | "start\r\n", 444 | "\r\n", 445 | "272\r\n", 446 | "\r\n", 447 | "\r\n", 448 | "start->New med. 
program (operator)\r\n", 449 | "\r\n", 450 | "\r\n", 451 | "241\r\n", 452 | "\r\n", 453 | "\r\n", 454 | "\r\n" 455 | ], 456 | "text/plain": [ 457 | "" 458 | ] 459 | }, 460 | "execution_count": 8, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "# without saving\n", 467 | "pm.render()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "An user can tune several parameters via `set_rates` and `set_params` methods:\n", 475 | "- activity and path rates which regulate process model details;\n", 476 | "- an option to discover an optimal process model based on mixed-paradigm process mining;\n", 477 | "- a regularization factor to control the balance between model complexity and completeness;\n", 478 | "- an option to meta-states search and cycle aggregation in the model;\n", 479 | "- types of aggregation;\n", 480 | "- etc. (see docstrings for `ProcessMap`)\n", 481 | "\n", 482 | "As a default setting, the algorithm tries to find an optimal process model. So, when changing rates, one should set `optimize=False` to turn off its autotune. A general optimization problem is as follows:" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "$$\\mathcal{Q}(p, X^l) = (1-\\lambda)\\cdot F + \\lambda\\cdot C_{\\mathcal{J}} \\longrightarrow \\min_{\\theta},$$" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "where $\\mathcal{Q}$ is a quality function, $p$ is an algorithm for process model discovery from data, $X^l$ is a log sample, $\\lambda$ is a regularization factor, $F$ is a loss function, $C_\\mathcal{J}$ is a complexity function, $\\theta$ is a vector of rate parameters." 
497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 9, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "application/vnd.jupyter.widget-view+json": { 507 | "model_id": "1be2781e29ed4145bd5620113d4b8936", 508 | "version_major": 2, 509 | "version_minor": 0 510 | }, 511 | "text/plain": [ 512 | "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=10500.0), HTML(va…" 513 | ] 514 | }, 515 | "metadata": {}, 516 | "output_type": "display_data" 517 | }, 518 | { 519 | "name": "stdout", 520 | "output_type": "stream", 521 | "text": [ 522 | "\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "pm2 = ProcessMap()\n", 528 | "pm2.set_log(FILE_PATH=declarations)\n", 529 | "pm2.update()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 10, 535 | "metadata": { 536 | "scrolled": false 537 | }, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "image/svg+xml": [ 542 | "\r\n", 543 | "\r\n", 545 | "\r\n", 547 | "\r\n", 548 | "\r\n", 550 | "\r\n", 551 | "%3\r\n", 552 | "\r\n", 553 | "\r\n", 554 | "Declaration APPROVED by ADMINISTRATION\r\n", 555 | "\r\n", 556 | "Declaration APPROVED by ADMINISTRATION (8202)\r\n", 557 | "\r\n", 558 | "\r\n", 559 | "Declaration FINAL_APPROVED by SUPERVISOR\r\n", 560 | "\r\n", 561 | "Declaration FINAL_APPROVED by SUPERVISOR (10131)\r\n", 562 | "\r\n", 563 | "\r\n", 564 | "Declaration APPROVED by ADMINISTRATION->Declaration FINAL_APPROVED by SUPERVISOR\r\n", 565 | "\r\n", 566 | "\r\n", 567 | "5133\r\n", 568 | "\r\n", 569 | "\r\n", 570 | "Payment Handled\r\n", 571 | "\r\n", 572 | "Payment Handled (10044)\r\n", 573 | "\r\n", 574 | "\r\n", 575 | "end\r\n", 576 | "\r\n", 577 | "\r\n", 578 | "\r\n", 579 | "\r\n", 580 | "Payment Handled->end\r\n", 581 | "\r\n", 582 | "\r\n", 583 | "10043\r\n", 584 | "\r\n", 585 | "\r\n", 586 | "Declaration SUBMITTED by EMPLOYEE\r\n", 587 | "\r\n", 588 | "Declaration SUBMITTED by EMPLOYEE (11531)\r\n", 589 | "\r\n", 
590 | "\r\n", 591 | "Declaration SUBMITTED by EMPLOYEE->Declaration APPROVED by ADMINISTRATION\r\n", 592 | "\r\n", 593 | "\r\n", 594 | "8202\r\n", 595 | "\r\n", 596 | "\r\n", 597 | "Request Payment\r\n", 598 | "\r\n", 599 | "Request Payment (10040)\r\n", 600 | "\r\n", 601 | "\r\n", 602 | "Request Payment->Payment Handled\r\n", 603 | "\r\n", 604 | "\r\n", 605 | "10031\r\n", 606 | "\r\n", 607 | "\r\n", 608 | "Declaration FINAL_APPROVED by SUPERVISOR->Request Payment\r\n", 609 | "\r\n", 610 | "\r\n", 611 | "10032\r\n", 612 | "\r\n", 613 | "\r\n", 614 | "start\r\n", 615 | "\r\n", 616 | "10500\r\n", 617 | "\r\n", 618 | "\r\n", 619 | "start->Declaration SUBMITTED by EMPLOYEE\r\n", 620 | "\r\n", 621 | "\r\n", 622 | "10365\r\n", 623 | "\r\n", 624 | "\r\n", 625 | "\r\n" 626 | ], 627 | "text/plain": [ 628 | "" 629 | ] 630 | }, 631 | "execution_count": 10, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "pm2.render()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 11, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "pm.set_rates(activity_rate=80, path_rate=15)\n", 647 | "pm.set_params(optimize=False, aggregate=False)\n", 648 | "\n", 649 | "pm.update()" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 12, 655 | "metadata": {}, 656 | "outputs": [ 657 | { 658 | "data": { 659 | "image/svg+xml": [ 660 | "\r\n", 661 | "\r\n", 663 | "\r\n", 665 | "\r\n", 666 | "\r\n", 668 | "\r\n", 669 | "%3\r\n", 670 | "\r\n", 671 | "\r\n", 672 | "Program change (operator)\r\n", 673 | "\r\n", 674 | "Program change (operator) (85)\r\n", 675 | "\r\n", 676 | "\r\n", 677 | "Yellow zone "Therapy control" (operator)\r\n", 678 | "\r\n", 679 | "Yellow zone "Therapy control" (operator) (1120)\r\n", 680 | "\r\n", 681 | "\r\n", 682 | "Program change (operator)->Yellow zone "Therapy control" (operator)\r\n", 683 | "\r\n", 684 | "\r\n", 685 | "40\r\n", 686 | "\r\n", 687 | "\r\n", 688 
| "Red zone (operator)\r\n", 689 | "\r\n", 690 | "Red zone (operator) (6719)\r\n", 691 | "\r\n", 692 | "\r\n", 693 | "Red zone (doctor FD)\r\n", 694 | "\r\n", 695 | "Red zone (doctor FD) (6683)\r\n", 696 | "\r\n", 697 | "\r\n", 698 | "Red zone (operator)->Red zone (doctor FD)\r\n", 699 | "\r\n", 700 | "\r\n", 701 | "2158\r\n", 702 | "\r\n", 703 | "\r\n", 704 | "Red zone (doctor)\r\n", 705 | "\r\n", 706 | "Red zone (doctor) (5275)\r\n", 707 | "\r\n", 708 | "\r\n", 709 | "Red zone (operator)->Red zone (doctor)\r\n", 710 | "\r\n", 711 | "\r\n", 712 | "3672\r\n", 713 | "\r\n", 714 | "\r\n", 715 | "Meas. missing (doctor)\r\n", 716 | "\r\n", 717 | "Meas. missing (doctor) (1674)\r\n", 718 | "\r\n", 719 | "\r\n", 720 | "Notification (operator)\r\n", 721 | "\r\n", 722 | "Notification (operator) (4462)\r\n", 723 | "\r\n", 724 | "\r\n", 725 | "Meas. missing (doctor)->Notification (operator)\r\n", 726 | "\r\n", 727 | "\r\n", 728 | "1058\r\n", 729 | "\r\n", 730 | "\r\n", 731 | "Yellow zone "Therapy control" (operator)->Red zone (operator)\r\n", 732 | "\r\n", 733 | "\r\n", 734 | "157\r\n", 735 | "\r\n", 736 | "\r\n", 737 | "Yellow zone "Therapy control" (operator)->Notification (operator)\r\n", 738 | "\r\n", 739 | "\r\n", 740 | "117\r\n", 741 | "\r\n", 742 | "\r\n", 743 | "Yellow zone (doctor FD)\r\n", 744 | "\r\n", 745 | "Yellow zone (doctor FD) (3493)\r\n", 746 | "\r\n", 747 | "\r\n", 748 | "Yellow zone (doctor FD)->Program change (operator)\r\n", 749 | "\r\n", 750 | "\r\n", 751 | "29\r\n", 752 | "\r\n", 753 | "\r\n", 754 | "Yellow zone (doctor FD)->Red zone (operator)\r\n", 755 | "\r\n", 756 | "\r\n", 757 | "642\r\n", 758 | "\r\n", 759 | "\r\n", 760 | "Yellow zone (doctor FD)->Notification (operator)\r\n", 761 | "\r\n", 762 | "\r\n", 763 | "872\r\n", 764 | "\r\n", 765 | "\r\n", 766 | "Yellow zone (doctor)\r\n", 767 | "\r\n", 768 | "Yellow zone (doctor) (3359)\r\n", 769 | "\r\n", 770 | "\r\n", 771 | "Yellow zone (doctor FD)->Yellow zone (doctor)\r\n", 772 | "\r\n", 773 | 
"\r\n", 774 | "1073\r\n", 775 | "\r\n", 776 | "\r\n", 777 | "Red zone (doctor FD)->Red zone (operator)\r\n", 778 | "\r\n", 779 | "\r\n", 780 | "2244\r\n", 781 | "\r\n", 782 | "\r\n", 783 | "Red zone (doctor FD)->Yellow zone "Therapy control" (operator)\r\n", 784 | "\r\n", 785 | "\r\n", 786 | "205\r\n", 787 | "\r\n", 788 | "\r\n", 789 | "Red zone (doctor FD)->Red zone (doctor FD)\r\n", 790 | "\r\n", 791 | "\r\n", 792 | "1429\r\n", 793 | "\r\n", 794 | "\r\n", 795 | "Red zone (doctor FD)->Notification (operator)\r\n", 796 | "\r\n", 797 | "\r\n", 798 | "536\r\n", 799 | "\r\n", 800 | "\r\n", 801 | "Red zone (doctor FD)->Yellow zone (doctor)\r\n", 802 | "\r\n", 803 | "\r\n", 804 | "976\r\n", 805 | "\r\n", 806 | "\r\n", 807 | "Meas. missing (operator)\r\n", 808 | "\r\n", 809 | "Meas. missing (operator) (2336)\r\n", 810 | "\r\n", 811 | "\r\n", 812 | "Notification (operator)->Meas. missing (operator)\r\n", 813 | "\r\n", 814 | "\r\n", 815 | "1632\r\n", 816 | "\r\n", 817 | "\r\n", 818 | "end\r\n", 819 | "\r\n", 820 | "\r\n", 821 | "\r\n", 822 | "\r\n", 823 | "Notification (operator)->end\r\n", 824 | "\r\n", 825 | "\r\n", 826 | "131\r\n", 827 | "\r\n", 828 | "\r\n", 829 | "Red zone (doctor)->Red zone (operator)\r\n", 830 | "\r\n", 831 | "\r\n", 832 | "2269\r\n", 833 | "\r\n", 834 | "\r\n", 835 | "Red zone (doctor)->Red zone (doctor FD)\r\n", 836 | "\r\n", 837 | "\r\n", 838 | "2304\r\n", 839 | "\r\n", 840 | "\r\n", 841 | "Yellow zone (doctor)->Yellow zone (doctor FD)\r\n", 842 | "\r\n", 843 | "\r\n", 844 | "2415\r\n", 845 | "\r\n", 846 | "\r\n", 847 | "New med. program (operator)\r\n", 848 | "\r\n", 849 | "New med. program (operator) (253)\r\n", 850 | "\r\n", 851 | "\r\n", 852 | "New med. program (operator)->Red zone (operator)\r\n", 853 | "\r\n", 854 | "\r\n", 855 | "171\r\n", 856 | "\r\n", 857 | "\r\n", 858 | "Meas. missing (operator)->Meas. 
missing (doctor)\r\n", 859 | "\r\n", 860 | "\r\n", 861 | "1466\r\n", 862 | "\r\n", 863 | "\r\n", 864 | "start\r\n", 865 | "\r\n", 866 | "272\r\n", 867 | "\r\n", 868 | "\r\n", 869 | "start->New med. program (operator)\r\n", 870 | "\r\n", 871 | "\r\n", 872 | "241\r\n", 873 | "\r\n", 874 | "\r\n", 875 | "\r\n" 876 | ], 877 | "text/plain": [ 878 | "" 879 | ] 880 | }, 881 | "execution_count": 12, 882 | "metadata": {}, 883 | "output_type": "execute_result" 884 | } 885 | ], 886 | "source": [ 887 | "pm.render()" 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "metadata": {}, 893 | "source": [ 894 | "### Meta-states discovering" 895 | ] 896 | }, 897 | { 898 | "cell_type": "markdown", 899 | "metadata": {}, 900 | "source": [ 901 | "We define a meta-state as a significant cycle in the model, i.e., frequent cyclic behaviour in the log." 902 | ] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "metadata": {}, 907 | "source": [ 908 | "" 909 | ] 910 | }, 911 | { 912 | "cell_type": "markdown", 913 | "metadata": {}, 914 | "source": [ 915 | "Example of process model reconstruction: (a) initial model; (b) cycle folding with outer joining; (c) cycle folding with inner joining and \"to all\" redirecting heuristic; (d) cycle folding with inner joining and \"to frequent\" redirecting heuristic" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 13, 921 | "metadata": {}, 922 | "outputs": [], 923 | "source": [ 924 | "pm.set_rates(activity_rate=80, path_rate=5)\n", 925 | "pm.set_params(optimize=False, \n", 926 | " aggregate=True,\n", 927 | " heuristic='all',\n", 928 | " agg_type='inner')\n", 929 | "\n", 930 | "pm.update()" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": 14, 936 | "metadata": {}, 937 | "outputs": [ 938 | { 939 | "data": { 940 | "image/svg+xml": [ 941 | "\r\n", 942 | "\r\n", 944 | "\r\n", 946 | "\r\n", 947 | "\r\n", 949 | "\r\n", 950 | "%3\r\n", 951 | "\r\n", 952 | "\r\n", 953 | "('Red zone (doctor FD)', 
'Red zone (operator)', 'Red zone (doctor)')\r\n", 954 | "\r\n", 955 | "Red zone (doctor FD) (7062)\r\n", 956 | "Red zone (operator) (6456)\r\n", 957 | "Red zone (doctor) (5850)\r\n", 958 | "(2091)\r\n", 959 | "\r\n", 960 | "\r\n", 961 | "('Yellow zone (doctor FD)', 'Yellow zone (doctor)')\r\n", 962 | "\r\n", 963 | "Yellow zone (doctor FD) (3972)\r\n", 964 | "Yellow zone (doctor) (3874)\r\n", 965 | "(1541)\r\n", 966 | "\r\n", 967 | "\r\n", 968 | "('Red zone (doctor FD)', 'Red zone (operator)', 'Red zone (doctor)')->('Yellow zone (doctor FD)', 'Yellow zone (doctor)')\r\n", 969 | "\r\n", 970 | "\r\n", 971 | "276\r\n", 972 | "\r\n", 973 | "\r\n", 974 | "('Meas. missing (doctor)', 'Notification (operator)', 'Meas. missing (operator)')\r\n", 975 | "\r\n", 976 | "Meas. missing (doctor) (2324)\r\n", 977 | "Notification (operator) (4982)\r\n", 978 | "Meas. missing (operator) (2980)\r\n", 979 | "(1603)\r\n", 980 | "\r\n", 981 | "\r\n", 982 | "end\r\n", 983 | "\r\n", 984 | "\r\n", 985 | "\r\n", 986 | "\r\n", 987 | "('Meas. missing (doctor)', 'Notification (operator)', 'Meas. missing (operator)')->end\r\n", 988 | "\r\n", 989 | "\r\n", 990 | "120\r\n", 991 | "\r\n", 992 | "\r\n", 993 | "('Red zone (operator)', 'Red zone (doctor FD)')\r\n", 994 | "\r\n", 995 | "Red zone (operator) (6069)\r\n", 996 | "Red zone (doctor FD) (6675)\r\n", 997 | "(1704)\r\n", 998 | "\r\n", 999 | "\r\n", 1000 | "('Red zone (operator)', 'Red zone (doctor FD)')->('Yellow zone (doctor FD)', 'Yellow zone (doctor)')\r\n", 1001 | "\r\n", 1002 | "\r\n", 1003 | "274\r\n", 1004 | "\r\n", 1005 | "\r\n", 1006 | "('Yellow zone (doctor FD)', 'Yellow zone (doctor)')->('Red zone (doctor FD)', 'Red zone (operator)', 'Red zone (doctor)')\r\n", 1007 | "\r\n", 1008 | "\r\n", 1009 | "189\r\n", 1010 | "\r\n", 1011 | "\r\n", 1012 | "('Yellow zone (doctor FD)', 'Yellow zone (doctor)')->('Meas. missing (doctor)', 'Notification (operator)', 'Meas. 
missing (operator)')\r\n", 1013 | "\r\n", 1014 | "\r\n", 1015 | "74\r\n", 1016 | "\r\n", 1017 | "\r\n", 1018 | "('Yellow zone (doctor FD)', 'Yellow zone (doctor)')->('Yellow zone (doctor FD)', 'Yellow zone (doctor)')\r\n", 1019 | "\r\n", 1020 | "\r\n", 1021 | "535\r\n", 1022 | "\r\n", 1023 | "\r\n", 1024 | "('Red zone (operator)', 'Red zone (doctor)')\r\n", 1025 | "\r\n", 1026 | "Red zone (operator) (5494)\r\n", 1027 | "Red zone (doctor) (4888)\r\n", 1028 | "(1129)\r\n", 1029 | "\r\n", 1030 | "\r\n", 1031 | "('Red zone (operator)', 'Red zone (doctor)')->('Red zone (operator)', 'Red zone (doctor FD)')\r\n", 1032 | "\r\n", 1033 | "\r\n", 1034 | "316\r\n", 1035 | "\r\n", 1036 | "\r\n", 1037 | "New med. program (operator)\r\n", 1038 | "\r\n", 1039 | "New med. program (operator) (253)\r\n", 1040 | "\r\n", 1041 | "\r\n", 1042 | "New med. program (operator)->('Red zone (doctor FD)', 'Red zone (operator)', 'Red zone (doctor)')\r\n", 1043 | "\r\n", 1044 | "\r\n", 1045 | "181\r\n", 1046 | "\r\n", 1047 | "\r\n", 1048 | "New med. program (operator)->('Red zone (operator)', 'Red zone (doctor FD)')\r\n", 1049 | "\r\n", 1050 | "\r\n", 1051 | "130\r\n", 1052 | "\r\n", 1053 | "\r\n", 1054 | "New med. program (operator)->('Red zone (operator)', 'Red zone (doctor)')\r\n", 1055 | "\r\n", 1056 | "\r\n", 1057 | "189\r\n", 1058 | "\r\n", 1059 | "\r\n", 1060 | "start\r\n", 1061 | "\r\n", 1062 | "272\r\n", 1063 | "\r\n", 1064 | "\r\n", 1065 | "start->New med. 
program (operator)\r\n", 1066 | "\r\n", 1067 | "\r\n", 1068 | "241\r\n", 1069 | "\r\n", 1070 | "\r\n", 1071 | "\r\n" 1072 | ], 1073 | "text/plain": [ 1074 | "" 1075 | ] 1076 | }, 1077 | "execution_count": 14, 1078 | "metadata": {}, 1079 | "output_type": "execute_result" 1080 | } 1081 | ], 1082 | "source": [ 1083 | "pm.render()" 1084 | ] 1085 | } 1086 | ], 1087 | "metadata": { 1088 | "kernelspec": { 1089 | "display_name": "Python 3", 1090 | "language": "python", 1091 | "name": "python3" 1092 | }, 1093 | "language_info": { 1094 | "codemirror_mode": { 1095 | "name": "ipython", 1096 | "version": 3 1097 | }, 1098 | "file_extension": ".py", 1099 | "mimetype": "text/x-python", 1100 | "name": "python", 1101 | "nbconvert_exporter": "python", 1102 | "pygments_lexer": "ipython3", 1103 | "version": "3.8.5" 1104 | }, 1105 | "pycharm": { 1106 | "stem_cell": { 1107 | "cell_type": "raw", 1108 | "metadata": { 1109 | "collapsed": false 1110 | }, 1111 | "source": [] 1112 | } 1113 | } 1114 | }, 1115 | "nbformat": 4, 1116 | "nbformat_minor": 2 1117 | } 1118 | -------------------------------------------------------------------------------- /demo/profit_examples_rus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Contributors:\n", 15 | "- *Liubov Elkhovskaya* lelkhovskaya@itmo.ru\n", 16 | "- *Alexander Kshenin* adkshenin@itmo.ru\n", 17 | "- *Marina Balakhontceva* mbalakhontceva@itmo.ru\n", 18 | "- *Sergey Kovalchuk* kovalchuk@itmo.ru" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "ProFIT автоматически строит модели бизнес-процессов по данным. Входные данные — журнал событий, содержащий записи об идентификаторах случаев и выполненных действиях, упорядоченные по времени регистрации в системе. 
Модель процесса представлена в виде ориентированного графа, где зелёная вершина — начало процесса, а красная — конец." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Package location" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "*Для запуска демо из репозитория в jupyter-notebook.*" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Ссылка на проект: https://github.com/Siella/ProFIT." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import os\n", 63 | "import configparser\n", 64 | "\n", 65 | "PATH = os.getcwd()[:os.getcwd().rfind('\\\\')] # путь до директории ProFIT\n", 66 | "config = configparser.ConfigParser()\n", 67 | "config.add_section(\"packageLocation\")\n", 68 | "config.set(\"packageLocation\", \"workingDir\", PATH)\n", 69 | "config.set(\"packageLocation\", \"packageDir\", PATH+'\\\\profit')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Import" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import sys\n", 86 | "sys.path.append(config[\"packageLocation\"][\"workingDir\"])\n", 87 | "sys.path.append(config[\"packageLocation\"][\"packageDir\"])" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from profit import ProcessMap" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## How to use" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Чтобы начать использовать ProFIT, достаточно объявить и 
присвоить переменной экземпляр класса ProcessMap, а затем передать путь к логу в формате CSV/TXT/XES (либо сами данные в виде pandas.DataFrame) через метод set_log." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "monitoring = PATH + \"/demo/log_examples/remote_monitoring.csv\"\n", 120 | "declarations = PATH + \"/demo/log_examples/DomesticDeclarations.xes\"" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/html": [ 131 | "
\n", 132 | "\n", 145 | "\n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
case_idtasktimestamp
04082нов.прогр_оператор2018-01-10 00:00:03
14082КЗ_оператор2018-01-10 22:09:21
24173нов.прогр_оператор2018-01-12 00:00:04
34176нов.прогр_оператор2018-01-12 00:00:04
44082КЗ_врачФД2018-01-12 02:44:28
\n", 187 | "
" 188 | ], 189 | "text/plain": [ 190 | " case_id task timestamp\n", 191 | "0 4082 нов.прогр_оператор 2018-01-10 00:00:03\n", 192 | "1 4082 КЗ_оператор 2018-01-10 22:09:21\n", 193 | "2 4173 нов.прогр_оператор 2018-01-12 00:00:04\n", 194 | "3 4176 нов.прогр_оператор 2018-01-12 00:00:04\n", 195 | "4 4082 КЗ_врачФД 2018-01-12 02:44:28" 196 | ] 197 | }, 198 | "execution_count": 5, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "import pandas as pd\n", 205 | "\n", 206 | "# log demo\n", 207 | "df_monitoring = pd.read_csv(monitoring, encoding='cp1251')\n", 208 | "df_monitoring.head()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "pm = ProcessMap()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 7, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "pm.set_log(FILE_PATH = monitoring, \n", 227 | "# data = df_monitoring,\n", 228 | " encoding = 'cp1251')\n", 229 | "\n", 230 | "pm.update()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "После каждой настройки необходимо вызвать метод update!" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Метод render возвращает модель процесса в виде ориентированного графа на DOT языке (поддерживается и визуализируется пакетом утилит Graphviz). Чтобы показать и сохранить модель в формате, поддерживаемом Graphviz, нужно вызвать данный метод и указать путь к директории, где будет сохранён результат." 
245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 8, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "image/svg+xml": [ 255 | "\r\n", 256 | "\r\n", 258 | "\r\n", 260 | "\r\n", 261 | "\r\n", 263 | "\r\n", 264 | "%3\r\n", 265 | "\r\n", 266 | "\r\n", 267 | "КЗ_оператор\r\n", 268 | "\r\n", 269 | "КЗ_оператор (6719)\r\n", 270 | "\r\n", 271 | "\r\n", 272 | "КЗ_оператор->КЗ_оператор\r\n", 273 | "\r\n", 274 | "\r\n", 275 | "490\r\n", 276 | "\r\n", 277 | "\r\n", 278 | "КЗ_врач\r\n", 279 | "\r\n", 280 | "КЗ_врач (5275)\r\n", 281 | "\r\n", 282 | "\r\n", 283 | "КЗ_оператор->КЗ_врач\r\n", 284 | "\r\n", 285 | "\r\n", 286 | "3672\r\n", 287 | "\r\n", 288 | "\r\n", 289 | "КЗ_врачФД\r\n", 290 | "\r\n", 291 | "КЗ_врачФД (6683)\r\n", 292 | "\r\n", 293 | "\r\n", 294 | "КЗ_оператор->КЗ_врачФД\r\n", 295 | "\r\n", 296 | "\r\n", 297 | "2158\r\n", 298 | "\r\n", 299 | "\r\n", 300 | "КЗ_врач->КЗ_оператор\r\n", 301 | "\r\n", 302 | "\r\n", 303 | "2269\r\n", 304 | "\r\n", 305 | "\r\n", 306 | "КЗ_врач->КЗ_врачФД\r\n", 307 | "\r\n", 308 | "\r\n", 309 | "2304\r\n", 310 | "\r\n", 311 | "\r\n", 312 | "оповещ_оператор\r\n", 313 | "\r\n", 314 | "оповещ_оператор (4462)\r\n", 315 | "\r\n", 316 | "\r\n", 317 | "оповещ_оператор->оповещ_оператор\r\n", 318 | "\r\n", 319 | "\r\n", 320 | "983\r\n", 321 | "\r\n", 322 | "\r\n", 323 | "отс.изм_оператор\r\n", 324 | "\r\n", 325 | "отс.изм_оператор (2336)\r\n", 326 | "\r\n", 327 | "\r\n", 328 | "оповещ_оператор->отс.изм_оператор\r\n", 329 | "\r\n", 330 | "\r\n", 331 | "1632\r\n", 332 | "\r\n", 333 | "\r\n", 334 | "ЖЗ_врач\r\n", 335 | "\r\n", 336 | "ЖЗ_врач (3359)\r\n", 337 | "\r\n", 338 | "\r\n", 339 | "оповещ_оператор->ЖЗ_врач\r\n", 340 | "\r\n", 341 | "\r\n", 342 | "633\r\n", 343 | "\r\n", 344 | "\r\n", 345 | "end\r\n", 346 | "\r\n", 347 | "\r\n", 348 | "\r\n", 349 | "\r\n", 350 | "оповещ_оператор->end\r\n", 351 | "\r\n", 352 | "\r\n", 353 | "131\r\n", 354 | "\r\n", 355 | "\r\n", 356 | "отс.изм_врач\r\n", 357 | 
"\r\n", 358 | "отс.изм_врач (1674)\r\n", 359 | "\r\n", 360 | "\r\n", 361 | "отс.изм_оператор->отс.изм_врач\r\n", 362 | "\r\n", 363 | "\r\n", 364 | "1466\r\n", 365 | "\r\n", 366 | "\r\n", 367 | "ЖЗ_врачФД\r\n", 368 | "\r\n", 369 | "ЖЗ_врачФД (3493)\r\n", 370 | "\r\n", 371 | "\r\n", 372 | "ЖЗ_врачФД->КЗ_оператор\r\n", 373 | "\r\n", 374 | "\r\n", 375 | "642\r\n", 376 | "\r\n", 377 | "\r\n", 378 | "ЖЗ_врачФД->оповещ_оператор\r\n", 379 | "\r\n", 380 | "\r\n", 381 | "872\r\n", 382 | "\r\n", 383 | "\r\n", 384 | "ЖЗ_врачФД->ЖЗ_врач\r\n", 385 | "\r\n", 386 | "\r\n", 387 | "1073\r\n", 388 | "\r\n", 389 | "\r\n", 390 | "отс.изм_врач->оповещ_оператор\r\n", 391 | "\r\n", 392 | "\r\n", 393 | "1058\r\n", 394 | "\r\n", 395 | "\r\n", 396 | "нов.прогр_оператор\r\n", 397 | "\r\n", 398 | "нов.прогр_оператор (253)\r\n", 399 | "\r\n", 400 | "\r\n", 401 | "нов.прогр_оператор->КЗ_оператор\r\n", 402 | "\r\n", 403 | "\r\n", 404 | "171\r\n", 405 | "\r\n", 406 | "\r\n", 407 | "КЗ_врачФД->КЗ_оператор\r\n", 408 | "\r\n", 409 | "\r\n", 410 | "2244\r\n", 411 | "\r\n", 412 | "\r\n", 413 | "КЗ_врачФД->КЗ_врач\r\n", 414 | "\r\n", 415 | "\r\n", 416 | "674\r\n", 417 | "\r\n", 418 | "\r\n", 419 | "КЗ_врачФД->оповещ_оператор\r\n", 420 | "\r\n", 421 | "\r\n", 422 | "536\r\n", 423 | "\r\n", 424 | "\r\n", 425 | "КЗ_врачФД->КЗ_врачФД\r\n", 426 | "\r\n", 427 | "\r\n", 428 | "1429\r\n", 429 | "\r\n", 430 | "\r\n", 431 | "КЗ_врачФД->ЖЗ_врач\r\n", 432 | "\r\n", 433 | "\r\n", 434 | "976\r\n", 435 | "\r\n", 436 | "\r\n", 437 | "ЖЗ_врач->ЖЗ_врачФД\r\n", 438 | "\r\n", 439 | "\r\n", 440 | "2415\r\n", 441 | "\r\n", 442 | "\r\n", 443 | "start\r\n", 444 | "\r\n", 445 | "272\r\n", 446 | "\r\n", 447 | "\r\n", 448 | "start->нов.прогр_оператор\r\n", 449 | "\r\n", 450 | "\r\n", 451 | "241\r\n", 452 | "\r\n", 453 | "\r\n", 454 | "\r\n" 455 | ], 456 | "text/plain": [ 457 | "" 458 | ] 459 | }, 460 | "execution_count": 8, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "# 
without saving\n", 467 | "pm.render()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "С помощью методов set_rates и set_params пользователь может настроить несколько входных параметров: \n", 475 | "- уровни отображения событий и переходов, регулирующие детализацию модели процесса; \n", 476 | "- опцию построения оптимальной модели процесса, основанной на комбинированной оценке сложности и точности модели;\n", 477 | "- параметр оптимизации, регулирующий простоту восприятия и полноту модели;\n", 478 | "- опцию выделения мета-состояний путём агрегации циклов в модели; \n", 479 | "- способы агрегации и проч. (см. документацию)\n", 480 | "\n", 481 | "По умолчанию (без настроек) ищется оптимальная модель процесса, поэтому при изменении уровней отображений необходимо выставить optimize=False, чтобы отключить их автонастройку. Задача оптимизации выглядит следующим образом:" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "$$\\mathcal{Q}(p, X^l) = (1-\\lambda)\\cdot F + \\lambda\\cdot C_{\\mathcal{J}} \\longrightarrow \\min_{\\theta},$$" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "где $\\mathcal{Q}$ — функционал качества, $p$ — алгоритм извлечения модели процесса из данных, $X^l$ — подвыборка лога, $\\lambda$ — коэффициент регуляризации, $F$ — фитнес-функция, $C_\\mathcal{J}$ — функция сложности, $\\theta$ — уровни отображений." 
496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 9, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "application/vnd.jupyter.widget-view+json": { 506 | "model_id": "280b1ca6124a40a281d25d622886a336", 507 | "version_major": 2, 508 | "version_minor": 0 509 | }, 510 | "text/plain": [ 511 | "HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=10500.0), HTML(va…" 512 | ] 513 | }, 514 | "metadata": {}, 515 | "output_type": "display_data" 516 | }, 517 | { 518 | "name": "stdout", 519 | "output_type": "stream", 520 | "text": [ 521 | "\n" 522 | ] 523 | } 524 | ], 525 | "source": [ 526 | "pm2 = ProcessMap()\n", 527 | "pm2.set_log(FILE_PATH=declarations)\n", 528 | "pm2.update()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 10, 534 | "metadata": { 535 | "scrolled": false 536 | }, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "image/svg+xml": [ 541 | "\r\n", 542 | "\r\n", 544 | "\r\n", 546 | "\r\n", 547 | "\r\n", 549 | "\r\n", 550 | "%3\r\n", 551 | "\r\n", 552 | "\r\n", 553 | "Declaration APPROVED by ADMINISTRATION\r\n", 554 | "\r\n", 555 | "Declaration APPROVED by ADMINISTRATION (8202)\r\n", 556 | "\r\n", 557 | "\r\n", 558 | "Declaration FINAL_APPROVED by SUPERVISOR\r\n", 559 | "\r\n", 560 | "Declaration FINAL_APPROVED by SUPERVISOR (10131)\r\n", 561 | "\r\n", 562 | "\r\n", 563 | "Declaration APPROVED by ADMINISTRATION->Declaration FINAL_APPROVED by SUPERVISOR\r\n", 564 | "\r\n", 565 | "\r\n", 566 | "5133\r\n", 567 | "\r\n", 568 | "\r\n", 569 | "Payment Handled\r\n", 570 | "\r\n", 571 | "Payment Handled (10044)\r\n", 572 | "\r\n", 573 | "\r\n", 574 | "end\r\n", 575 | "\r\n", 576 | "\r\n", 577 | "\r\n", 578 | "\r\n", 579 | "Payment Handled->end\r\n", 580 | "\r\n", 581 | "\r\n", 582 | "10043\r\n", 583 | "\r\n", 584 | "\r\n", 585 | "Declaration SUBMITTED by EMPLOYEE\r\n", 586 | "\r\n", 587 | "Declaration SUBMITTED by EMPLOYEE (11531)\r\n", 588 | "\r\n", 
589 | "\r\n", 590 | "Declaration SUBMITTED by EMPLOYEE->Declaration APPROVED by ADMINISTRATION\r\n", 591 | "\r\n", 592 | "\r\n", 593 | "8202\r\n", 594 | "\r\n", 595 | "\r\n", 596 | "Request Payment\r\n", 597 | "\r\n", 598 | "Request Payment (10040)\r\n", 599 | "\r\n", 600 | "\r\n", 601 | "Declaration FINAL_APPROVED by SUPERVISOR->Request Payment\r\n", 602 | "\r\n", 603 | "\r\n", 604 | "10032\r\n", 605 | "\r\n", 606 | "\r\n", 607 | "Request Payment->Payment Handled\r\n", 608 | "\r\n", 609 | "\r\n", 610 | "10031\r\n", 611 | "\r\n", 612 | "\r\n", 613 | "start\r\n", 614 | "\r\n", 615 | "10500\r\n", 616 | "\r\n", 617 | "\r\n", 618 | "start->Declaration SUBMITTED by EMPLOYEE\r\n", 619 | "\r\n", 620 | "\r\n", 621 | "10365\r\n", 622 | "\r\n", 623 | "\r\n", 624 | "\r\n" 625 | ], 626 | "text/plain": [ 627 | "" 628 | ] 629 | }, 630 | "execution_count": 10, 631 | "metadata": {}, 632 | "output_type": "execute_result" 633 | } 634 | ], 635 | "source": [ 636 | "pm2.render()" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 11, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "pm.set_rates(activity_rate=80, path_rate=15)\n", 646 | "pm.set_params(optimize=False, aggregate=False)\n", 647 | "\n", 648 | "pm.update()" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 12, 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "data": { 658 | "image/svg+xml": [ 659 | "\r\n", 660 | "\r\n", 662 | "\r\n", 664 | "\r\n", 665 | "\r\n", 667 | "\r\n", 668 | "%3\r\n", 669 | "\r\n", 670 | "\r\n", 671 | "КЗ_оператор\r\n", 672 | "\r\n", 673 | "КЗ_оператор (6719)\r\n", 674 | "\r\n", 675 | "\r\n", 676 | "КЗ_врач\r\n", 677 | "\r\n", 678 | "КЗ_врач (5275)\r\n", 679 | "\r\n", 680 | "\r\n", 681 | "КЗ_оператор->КЗ_врач\r\n", 682 | "\r\n", 683 | "\r\n", 684 | "3672\r\n", 685 | "\r\n", 686 | "\r\n", 687 | "КЗ_врачФД\r\n", 688 | "\r\n", 689 | "КЗ_врачФД (6683)\r\n", 690 | "\r\n", 691 | "\r\n", 692 | "КЗ_оператор->КЗ_врачФД\r\n", 693 | 
"\r\n", 694 | "\r\n", 695 | "2158\r\n", 696 | "\r\n", 697 | "\r\n", 698 | "КЗ_врач->КЗ_оператор\r\n", 699 | "\r\n", 700 | "\r\n", 701 | "2269\r\n", 702 | "\r\n", 703 | "\r\n", 704 | "КЗ_врач->КЗ_врачФД\r\n", 705 | "\r\n", 706 | "\r\n", 707 | "2304\r\n", 708 | "\r\n", 709 | "\r\n", 710 | "оповещ_оператор\r\n", 711 | "\r\n", 712 | "оповещ_оператор (4462)\r\n", 713 | "\r\n", 714 | "\r\n", 715 | "отс.изм_оператор\r\n", 716 | "\r\n", 717 | "отс.изм_оператор (2336)\r\n", 718 | "\r\n", 719 | "\r\n", 720 | "оповещ_оператор->отс.изм_оператор\r\n", 721 | "\r\n", 722 | "\r\n", 723 | "1632\r\n", 724 | "\r\n", 725 | "\r\n", 726 | "end\r\n", 727 | "\r\n", 728 | "\r\n", 729 | "\r\n", 730 | "\r\n", 731 | "оповещ_оператор->end\r\n", 732 | "\r\n", 733 | "\r\n", 734 | "131\r\n", 735 | "\r\n", 736 | "\r\n", 737 | "отс.изм_врач\r\n", 738 | "\r\n", 739 | "отс.изм_врач (1674)\r\n", 740 | "\r\n", 741 | "\r\n", 742 | "отс.изм_оператор->отс.изм_врач\r\n", 743 | "\r\n", 744 | "\r\n", 745 | "1466\r\n", 746 | "\r\n", 747 | "\r\n", 748 | "ЖЗ_врачФД\r\n", 749 | "\r\n", 750 | "ЖЗ_врачФД (3493)\r\n", 751 | "\r\n", 752 | "\r\n", 753 | "ЖЗ_врачФД->КЗ_оператор\r\n", 754 | "\r\n", 755 | "\r\n", 756 | "642\r\n", 757 | "\r\n", 758 | "\r\n", 759 | "ЖЗ_врачФД->оповещ_оператор\r\n", 760 | "\r\n", 761 | "\r\n", 762 | "872\r\n", 763 | "\r\n", 764 | "\r\n", 765 | "изм.прогр_оператор\r\n", 766 | "\r\n", 767 | "изм.прогр_оператор (85)\r\n", 768 | "\r\n", 769 | "\r\n", 770 | "ЖЗ_врачФД->изм.прогр_оператор\r\n", 771 | "\r\n", 772 | "\r\n", 773 | "29\r\n", 774 | "\r\n", 775 | "\r\n", 776 | "ЖЗ_врач\r\n", 777 | "\r\n", 778 | "ЖЗ_врач (3359)\r\n", 779 | "\r\n", 780 | "\r\n", 781 | "ЖЗ_врачФД->ЖЗ_врач\r\n", 782 | "\r\n", 783 | "\r\n", 784 | "1073\r\n", 785 | "\r\n", 786 | "\r\n", 787 | "отс.изм_врач->оповещ_оператор\r\n", 788 | "\r\n", 789 | "\r\n", 790 | "1058\r\n", 791 | "\r\n", 792 | "\r\n", 793 | "нов.прогр_оператор\r\n", 794 | "\r\n", 795 | "нов.прогр_оператор (253)\r\n", 796 | "\r\n", 797 | "\r\n", 798 | 
"нов.прогр_оператор->КЗ_оператор\r\n", 799 | "\r\n", 800 | "\r\n", 801 | "171\r\n", 802 | "\r\n", 803 | "\r\n", 804 | "ЖЗ(«КТ»)_оператор\r\n", 805 | "\r\n", 806 | "ЖЗ(«КТ»)_оператор (1120)\r\n", 807 | "\r\n", 808 | "\r\n", 809 | "ЖЗ(«КТ»)_оператор->КЗ_оператор\r\n", 810 | "\r\n", 811 | "\r\n", 812 | "157\r\n", 813 | "\r\n", 814 | "\r\n", 815 | "ЖЗ(«КТ»)_оператор->оповещ_оператор\r\n", 816 | "\r\n", 817 | "\r\n", 818 | "117\r\n", 819 | "\r\n", 820 | "\r\n", 821 | "изм.прогр_оператор->ЖЗ(«КТ»)_оператор\r\n", 822 | "\r\n", 823 | "\r\n", 824 | "40\r\n", 825 | "\r\n", 826 | "\r\n", 827 | "КЗ_врачФД->КЗ_оператор\r\n", 828 | "\r\n", 829 | "\r\n", 830 | "2244\r\n", 831 | "\r\n", 832 | "\r\n", 833 | "КЗ_врачФД->оповещ_оператор\r\n", 834 | "\r\n", 835 | "\r\n", 836 | "536\r\n", 837 | "\r\n", 838 | "\r\n", 839 | "КЗ_врачФД->ЖЗ(«КТ»)_оператор\r\n", 840 | "\r\n", 841 | "\r\n", 842 | "205\r\n", 843 | "\r\n", 844 | "\r\n", 845 | "КЗ_врачФД->КЗ_врачФД\r\n", 846 | "\r\n", 847 | "\r\n", 848 | "1429\r\n", 849 | "\r\n", 850 | "\r\n", 851 | "КЗ_врачФД->ЖЗ_врач\r\n", 852 | "\r\n", 853 | "\r\n", 854 | "976\r\n", 855 | "\r\n", 856 | "\r\n", 857 | "ЖЗ_врач->ЖЗ_врачФД\r\n", 858 | "\r\n", 859 | "\r\n", 860 | "2415\r\n", 861 | "\r\n", 862 | "\r\n", 863 | "start\r\n", 864 | "\r\n", 865 | "272\r\n", 866 | "\r\n", 867 | "\r\n", 868 | "start->нов.прогр_оператор\r\n", 869 | "\r\n", 870 | "\r\n", 871 | "241\r\n", 872 | "\r\n", 873 | "\r\n", 874 | "\r\n" 875 | ], 876 | "text/plain": [ 877 | "" 878 | ] 879 | }, 880 | "execution_count": 12, 881 | "metadata": {}, 882 | "output_type": "execute_result" 883 | } 884 | ], 885 | "source": [ 886 | "pm.render()" 887 | ] 888 | }, 889 | { 890 | "cell_type": "markdown", 891 | "metadata": {}, 892 | "source": [ 893 | "### Meta-states discovering" 894 | ] 895 | }, 896 | { 897 | "cell_type": "markdown", 898 | "metadata": {}, 899 | "source": [ 900 | "Под мета-состояниями понимаем значимые циклы, то есть которые часто встречаются в логе." 
901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": {}, 906 | "source": [ 907 | "" 908 | ] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "metadata": {}, 913 | "source": [ 914 | "Пример перестроения модели процесса (a) изначальная модель; (b) свёртка циклов типа outer joining; \n", 915 | "(c) свёртка циклов типа inner joining и эвристикой all; (d) свёртка циклов типа inner joining и эвристикой frequent" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 13, 921 | "metadata": {}, 922 | "outputs": [], 923 | "source": [ 924 | "pm.set_rates(activity_rate=80, path_rate=5)\n", 925 | "pm.set_params(optimize=False, \n", 926 | " aggregate=True,\n", 927 | " heuristic='all',\n", 928 | " agg_type='inner')\n", 929 | "\n", 930 | "pm.update()" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": 14, 936 | "metadata": {}, 937 | "outputs": [ 938 | { 939 | "data": { 940 | "image/svg+xml": [ 941 | "\r\n", 942 | "\r\n", 944 | "\r\n", 946 | "\r\n", 947 | "\r\n", 949 | "\r\n", 950 | "%3\r\n", 951 | "\r\n", 952 | "\r\n", 953 | "('оповещ_оператор', 'отс.изм_оператор', 'отс.изм_врач')\r\n", 954 | "\r\n", 955 | "оповещ_оператор (4982)\r\n", 956 | "отс.изм_оператор (2980)\r\n", 957 | "отс.изм_врач (2324)\r\n", 958 | "(1603)\r\n", 959 | "\r\n", 960 | "\r\n", 961 | "end\r\n", 962 | "\r\n", 963 | "\r\n", 964 | "\r\n", 965 | "\r\n", 966 | "('оповещ_оператор', 'отс.изм_оператор', 'отс.изм_врач')->end\r\n", 967 | "\r\n", 968 | "\r\n", 969 | "120\r\n", 970 | "\r\n", 971 | "\r\n", 972 | "('КЗ_оператор', 'КЗ_врач')\r\n", 973 | "\r\n", 974 | "КЗ_оператор (5494)\r\n", 975 | "КЗ_врач (4888)\r\n", 976 | "(1129)\r\n", 977 | "\r\n", 978 | "\r\n", 979 | "('КЗ_оператор', 'КЗ_врачФД')\r\n", 980 | "\r\n", 981 | "КЗ_оператор (6069)\r\n", 982 | "КЗ_врачФД (6675)\r\n", 983 | "(1704)\r\n", 984 | "\r\n", 985 | "\r\n", 986 | "('КЗ_оператор', 'КЗ_врач')->('КЗ_оператор', 'КЗ_врачФД')\r\n", 987 | "\r\n", 988 | "\r\n", 989 | "316\r\n", 990 | 
"\r\n", 991 | "\r\n", 992 | "('ЖЗ_врачФД', 'ЖЗ_врач')\r\n", 993 | "\r\n", 994 | "ЖЗ_врачФД (3972)\r\n", 995 | "ЖЗ_врач (3874)\r\n", 996 | "(1541)\r\n", 997 | "\r\n", 998 | "\r\n", 999 | "('ЖЗ_врачФД', 'ЖЗ_врач')->('оповещ_оператор', 'отс.изм_оператор', 'отс.изм_врач')\r\n", 1000 | "\r\n", 1001 | "\r\n", 1002 | "74\r\n", 1003 | "\r\n", 1004 | "\r\n", 1005 | "('ЖЗ_врачФД', 'ЖЗ_врач')->('ЖЗ_врачФД', 'ЖЗ_врач')\r\n", 1006 | "\r\n", 1007 | "\r\n", 1008 | "535\r\n", 1009 | "\r\n", 1010 | "\r\n", 1011 | "('КЗ_врачФД', 'КЗ_оператор', 'КЗ_врач')\r\n", 1012 | "\r\n", 1013 | "КЗ_врачФД (7062)\r\n", 1014 | "КЗ_оператор (6456)\r\n", 1015 | "КЗ_врач (5850)\r\n", 1016 | "(2091)\r\n", 1017 | "\r\n", 1018 | "\r\n", 1019 | "('ЖЗ_врачФД', 'ЖЗ_врач')->('КЗ_врачФД', 'КЗ_оператор', 'КЗ_врач')\r\n", 1020 | "\r\n", 1021 | "\r\n", 1022 | "189\r\n", 1023 | "\r\n", 1024 | "\r\n", 1025 | "('КЗ_врачФД', 'КЗ_оператор', 'КЗ_врач')->('ЖЗ_врачФД', 'ЖЗ_врач')\r\n", 1026 | "\r\n", 1027 | "\r\n", 1028 | "276\r\n", 1029 | "\r\n", 1030 | "\r\n", 1031 | "нов.прогр_оператор\r\n", 1032 | "\r\n", 1033 | "нов.прогр_оператор (253)\r\n", 1034 | "\r\n", 1035 | "\r\n", 1036 | "нов.прогр_оператор->('КЗ_оператор', 'КЗ_врач')\r\n", 1037 | "\r\n", 1038 | "\r\n", 1039 | "189\r\n", 1040 | "\r\n", 1041 | "\r\n", 1042 | "нов.прогр_оператор->('КЗ_врачФД', 'КЗ_оператор', 'КЗ_врач')\r\n", 1043 | "\r\n", 1044 | "\r\n", 1045 | "181\r\n", 1046 | "\r\n", 1047 | "\r\n", 1048 | "нов.прогр_оператор->('КЗ_оператор', 'КЗ_врачФД')\r\n", 1049 | "\r\n", 1050 | "\r\n", 1051 | "130\r\n", 1052 | "\r\n", 1053 | "\r\n", 1054 | "('КЗ_оператор', 'КЗ_врачФД')->('ЖЗ_врачФД', 'ЖЗ_врач')\r\n", 1055 | "\r\n", 1056 | "\r\n", 1057 | "274\r\n", 1058 | "\r\n", 1059 | "\r\n", 1060 | "start\r\n", 1061 | "\r\n", 1062 | "272\r\n", 1063 | "\r\n", 1064 | "\r\n", 1065 | "start->нов.прогр_оператор\r\n", 1066 | "\r\n", 1067 | "\r\n", 1068 | "241\r\n", 1069 | "\r\n", 1070 | "\r\n", 1071 | "\r\n" 1072 | ], 1073 | "text/plain": [ 1074 | "" 1075 | ] 1076 | }, 
1077 | "execution_count": 14, 1078 | "metadata": {}, 1079 | "output_type": "execute_result" 1080 | } 1081 | ], 1082 | "source": [ 1083 | "pm.render()" 1084 | ] 1085 | } 1086 | ], 1087 | "metadata": { 1088 | "kernelspec": { 1089 | "display_name": "Python 3", 1090 | "language": "python", 1091 | "name": "python3" 1092 | }, 1093 | "language_info": { 1094 | "codemirror_mode": { 1095 | "name": "ipython", 1096 | "version": 3 1097 | }, 1098 | "file_extension": ".py", 1099 | "mimetype": "text/x-python", 1100 | "name": "python", 1101 | "nbconvert_exporter": "python", 1102 | "pygments_lexer": "ipython3", 1103 | "version": "3.8.5" 1104 | }, 1105 | "pycharm": { 1106 | "stem_cell": { 1107 | "cell_type": "raw", 1108 | "metadata": { 1109 | "collapsed": false 1110 | }, 1111 | "source": [] 1112 | } 1113 | } 1114 | }, 1115 | "nbformat": 4, 1116 | "nbformat_minor": 2 1117 | } 1118 | -------------------------------------------------------------------------------- /meta/CHANGES.rst: -------------------------------------------------------------------------------- 1 | 0.1.0 2 | ~~~~~ 3 | 4 | 2020-03-03 5 | 6 | * Initial release. 7 | 8 | 0.1.1 9 | ~~~~~ 10 | 11 | 2020-10-04 12 | 13 | * Fixed aggregation bugs. Now, log is not changed when aggregation is performed. 
-------------------------------------------------------------------------------- /meta/cat_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/cat_logo.jpg -------------------------------------------------------------------------------- /meta/cycles_joining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/cycles_joining.png -------------------------------------------------------------------------------- /meta/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/logo.png -------------------------------------------------------------------------------- /meta/logo_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/logo_2.png -------------------------------------------------------------------------------- /meta/logo_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/logo_3.png -------------------------------------------------------------------------------- /meta/pm_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/pm_general.png -------------------------------------------------------------------------------- /meta/process.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/itmo-escience/ProFIT/9146c674e424bddd5ee24c13beef8beddef5da62/meta/process.png -------------------------------------------------------------------------------- /profit/README.md: -------------------------------------------------------------------------------- 1 | # Functionality 2 | The main functionality is presented below. For more details see docstrings in an appropriate module. 3 | * Class `ProcessMap` 4 | - `.set_log(self, FILE_PATH, cols=(0,1), *args, **kwargs)`: Set Log attribute of the class. 5 | - `.set_rates(self, activity_rate, path_rate)`: Set Rates attribute of the class. 6 | - `.set_params(self, **kwargs)`: Set Params attribute of the class. 7 | - `.update(self)`: Update "observers" and rates if settings were changed. 8 | - `.get_log(self)`: Return flat log. 9 | - `.get_rates(self)`: Return activities and paths rates. 10 | - `.get_params(self)`: Return parameters of process model discovering. 11 | - `.get_T(self)`: Return transition matrix. 12 | - `.get_graph(self)`: Return process model structure as a set of edges. 13 | - `.render(self, save_path=None)`: Return a graph object that can be rendered with the Graphviz installation. 14 | 15 | * Class `Graph` 16 | - `.update(self, log, activity_rate, path_rate, T)`: Update nodes and edges attributes. 17 | - `.optimize(self, log, T, lambd, step, verbose=False)`: Find optimal rates for the process model. 18 | - `.aggregate(self, log, activity_rate, path_rate, pre_traverse=False, ordered=False)`: Aggregate cycle nodes into meta state. 19 | - `.cycles_search(self, pre_traverse=False)`: Perform DFS for cycles search in a graph. 20 | - `.cycles_replay(self, log, cycles=[], ordered=False)`: Replay log and count occurrences of cycles found in the process model. 21 | - `.find_states(self, log, ordered=False, pre_traverse=False)`: Define meta states in the model. 22 | - `.fitness(self, log, T=None)`: Return the value of a cost function that includes only loss term. 
23 | 24 | * Class `Renderer` 25 | - `.update(self, TM, G, colored=True)`: Update graph object and its representation. 26 | - `.show(self)`: Return graph in DOT language. 27 | - `.save(self, save_path=None)`: Render and save graph in PNG (GV) format in the working directory or in *save_path*. 28 | -------------------------------------------------------------------------------- /profit/__init__.py: -------------------------------------------------------------------------------- 1 | from profit.process_map import ProcessMap -------------------------------------------------------------------------------- /profit/graph.py: -------------------------------------------------------------------------------- 1 | from log import Log 2 | from transition_matrix import TransitionMatrix 3 | from observer_abc import Observer 4 | from util_pm import * 5 | from util_agg import * 6 | import sys 7 | import math 8 | 9 | class Graph(Observer): 10 | """Class to represent process model as a graph structure.""" 11 | 12 | def __init__(self): 13 | """Graph object as a set of nodes (default None) and 14 | a set of edges (default None). 15 | """ 16 | self.nodes = None 17 | self.edges = None 18 | 19 | def update(self, log, activity_rate, path_rate, T, S_node=None): 20 | """Update nodes and edges attributes performing node 21 | and edge filtering according to activity and path rates, 22 | respectively. 23 | 24 | Parameters 25 | ---------- 26 | log: Log 27 | Ordered records of events 28 | activity_rate: float 29 | The inverse value to node significance threshold: the 30 | more it is, the more activities are observed in the model 31 | path_rate: float 32 | The inverse value to edge significance threshold: the 33 | more it is, the more transitions are observed in the model 34 | T: TransitionMatrix / dict 35 | A matrix describing the transitions of a Markov chain 36 | (Note: dictionary is passed when aggregation is performed) 37 | S_node: dict 38 | Node significance. 
Used only for aggregation type 'inner' 39 | (default None) 40 | 41 | See Also 42 | --------- 43 | Log 44 | TransitionMatrix 45 | 46 | References 47 | ---------- 48 | .. [1] Ferreira, D. R. (2017). A primer on process mining. Springer, Cham. 49 | .. [2] Günther, C. W., & Van Der Aalst, W. M. (2007, September). Fuzzy 50 | mining–adaptive process simplification based on multi-perspective 51 | metrics. In International conference on business process management 52 | (pp. 328-343). Springer, Berlin, Heidelberg. 53 | """ 54 | # 1. Node filtering 55 | S = S_node if S_node else node_significance(log) 56 | S_norm = dict_normalization(S, nested=False) 57 | activities = [a for a in S_norm if S_norm[a] >= (1 - activity_rate / 100)] 58 | 59 | # 2. Edge filtering 60 | T = T if type(T)==dict else transit_matrix(log, T.T) 61 | # Significance matrix of outcoming edges 62 | S_out = edge_sig(T, source=activities+['start'], \ 63 | target=activities+['end'], type_='out') 64 | # Significance matrix of incoming edges (inverse outcoming) 65 | S_in = edge_sig(T, source=activities+['end'], \ 66 | target=activities+['start'], type_='in') 67 | # Self-loops case significance 68 | S_loop = {a_i: T[a_i][a_j][1] / len(log.cases) for a_i in T \ 69 | for a_j in T[a_i] if (a_i == a_j) & (a_i in activities)} 70 | # Evaluate the relative significance of conflicting relations 71 | rS = rel_sig(S_out, S_in) 72 | # Normalization 73 | S_out_norm = dict_normalization(S_out, nested=True) 74 | S_in_norm = dict_normalization(S_in, nested=True) 75 | S_loop_norm = dict_normalization(S_loop) 76 | # Early algorithm stop 77 | if path_rate == 100: 78 | transitions = [(a_i, a_j) for a_i in T for a_j in T[a_i] \ 79 | if (a_i in activities + ['start', 'end']) \ 80 | & (a_j in activities + ['start', 'end'])] 81 | else: 82 | co = 1 - path_rate / 100 # cut-off threshold 83 | transitions = list(conflict_resolution(rS)) # initial set of transitions to preserve 84 | transitions = edge_filtering(S_in_norm, transitions, co=co, 
type_='in') 85 | transitions = edge_filtering(S_out_norm, transitions, co=co, type_='out') 86 | for a_i in S_loop_norm: 87 | if (S_loop_norm[a_i] - 0.01 >= co) | (co == 0): 88 | transitions.append((a_i, a_i)) 89 | 90 | # 3. Check graph connectivity 91 | I = incidence_matrix(transitions) # Filtered incidence matrix 92 | check_feasibility(activities, transitions, T, I, S_norm, S_out_norm) 93 | 94 | activitiesDict = {a: (sum([v[0] for v in T[a].values()]), 95 | int(S[a] * len(log.cases))) for a in activities} 96 | transitionsDict = dict() 97 | for t in transitions: 98 | try: transitionsDict[tuple(t)] = T[t[0]][t[1]] 99 | except: transitionsDict[tuple(t)] = (0, 0) # "imaginary" edges 100 | 101 | self.nodes = activitiesDict 102 | self.edges = transitionsDict 103 | 104 | def optimize(self, log, T, lambd, step, verbose=False): 105 | """Find optimal rates for the process model in terms of 106 | completeness and comprehension via quality function 107 | optimization. 108 | 109 | Parameters 110 | ---------- 111 | log: Log 112 | Ordered records of events 113 | T: TransitionMatrix 114 | A matrix describing the transitions of a Markov chain 115 | lambd: float 116 | Regularization term coefficient: the more it is, the 117 | more penalty for the model complexity is 118 | step: int / float / list 119 | Step value or list of grid points for the search space 120 | 121 | Returns 122 | ======= 123 | dict: optimal activities and paths rates 124 | 125 | See Also 126 | --------- 127 | Log 128 | TransitionMatrix 129 | """ 130 | transitions_cnt = len([1 for i in log.flat_log 131 | for j in log.flat_log[i]]) \ 132 | + len(log.flat_log.keys()) 133 | ADS = ADS_matrix(log, T.T) 134 | N = len(log.activities) 135 | M = len([1 for a in T.T for b in T.T[a] if (a != 'start') & (b != 'end')]) 136 | 137 | def Q(theta1, theta2, lambd): 138 | """Quality (cost) function (losses + regularization term). 
139 | The losses are defined by fitness function (see fitness) 140 | and the regularization term is the average degree of a 141 | directed graph. 142 | """ 143 | self.update(log, theta1, theta2, T) 144 | n, m = len(self.nodes)+2, len(self.edges) 145 | losses = self.fitness(log, T.T, ADS) 146 | # Calculate average degree 147 | compl = m / n 148 | 149 | return losses, compl 150 | 151 | Q_val = dict() 152 | per_done = 0 153 | if type(step) in [int, float]: 154 | per_step = 100 / (100 // step + 1) ** 2 155 | grid = range(0, 101, step) 156 | else: 157 | per_step = 100 / len(step) 158 | grid = step 159 | 160 | for a in grid: 161 | for p in grid: 162 | Q_val[(a,p)] = Q(a, p, lambd) 163 | if not verbose: continue 164 | per_done += per_step 165 | sys.stdout.write("\rOptimization ..... {0:.2f}%".\ 166 | format(per_done)) 167 | sys.stdout.flush() 168 | max_loss = Q(0, 0, lambd)[0] 169 | max_compl = Q(100, 100, lambd)[1] 170 | for theta in Q_val: 171 | Q_val[theta] = (1 - lambd) * Q_val[theta][0] / max_loss + \ 172 | lambd * Q_val[theta][1] / max_compl 173 | Q_opt = min(Q_val, key=lambda theta: Q_val[theta]) 174 | self.update(log, Q_opt[0], Q_opt[1], T) 175 | 176 | return {'activities': Q_opt[0], 'paths': Q_opt[1]} 177 | 178 | def aggregate(self, log, activity_rate, path_rate, agg_type='outer', 179 | heuristic='all', pre_traverse=False, ordered=False, cycle_rel=0.5): 180 | """Aggregate cycle nodes into meta state, if it is 181 | significant one. Note: the log is not changed. 
182 | 183 | See also 184 | -------- 185 | find_states 186 | reconstruct_log 187 | redirect_edges 188 | """ 189 | SC = self.find_states(log, pre_traverse, ordered, cycle_rel) 190 | log_agg = Log() 191 | log_agg.flat_log = reconstruct_log(log, SC, ordered) 192 | log_agg.activities = log.activities.union(set(SC)) 193 | log_agg.cases = log.cases 194 | T = TransitionMatrix() 195 | T.update(log_agg.flat_log) 196 | if agg_type not in ['outer', 'inner']: 197 | raise ValueError('Invalid aggregation type') 198 | if heuristic not in ['all', 'frequent']: 199 | raise ValueError('Invalid heuristic') 200 | if agg_type == 'inner': 201 | self.update(log_agg, 100, 0, T) 202 | nodes = self.nodes 203 | S = node_significance_filtered(log_agg, T.T, nodes, SC, heuristic) 204 | T_ = transit_matrix(log_agg, T.T) 205 | T1 = T_filtered(log_agg, T_, nodes, SC, heuristic=heuristic) 206 | log_agg.flat_log, log_agg.activities = filter_connections(log_agg, SC) 207 | self.update(log_agg, activity_rate, path_rate, T1, S) 208 | self.nodes = add_frq(self.nodes, nodes, SC, T.T, heuristic) 209 | else: 210 | self.update(log_agg, activity_rate, path_rate, T) 211 | 212 | def find_nodes_order(self): 213 | """Perform traverse of a process model from start node. 214 | Return list of nodes ordered by their closeness to start. 215 | """ 216 | G = incidence_matrix(self.edges) 217 | nodes = ['start', 'end'] + list(self.nodes) 218 | ordered_nodes = [] 219 | visited = dict.fromkeys(nodes, False) 220 | 221 | def preorder_traversal(start_node): 222 | """ Define the order of nodes traverse starting 223 | from the initial node ('start') of a process model. 
224 | """ 225 | visited[start_node] = True 226 | ordered_nodes.append(start_node) 227 | try: successors = G[start_node] 228 | except: successors = [] 229 | for successor in successors: 230 | if not visited[successor]: 231 | preorder_traversal(successor) 232 | 233 | preorder_traversal('start') 234 | return ordered_nodes 235 | 236 | def find_cycles(self, log, pre_traverse=False, ordered=False): 237 | """Search cycles in log and count their occurrences. 238 | 239 | Parameters 240 | ---------- 241 | log: Log 242 | Ordered records of events to replay 243 | pre_traverse: bool 244 | If True, performs graph traversal from 'start' node to define 245 | the order of activities in the cycles (default False) 246 | ordered: bool 247 | If True, the order of cycle activities is fixed strictly (default False) 248 | Returns 249 | ======= 250 | dict: with cycle (tuple) as a key and its occurrence 251 | frequency in the log as a value 252 | """ 253 | def check_edges(bad_edges_inds, s_ind, f_ind): 254 | for ind in bad_edges_inds: 255 | if ind >= f_ind: 256 | return True 257 | elif s_ind <= ind: 258 | return False 259 | return True 260 | 261 | cycles = dict() 262 | for case_log in log.flat_log.values(): 263 | bad_edges = [i for i, e in enumerate(zip(case_log, case_log[1:])) 264 | if e not in self.edges] 265 | 266 | case_cycles = set() 267 | for node in self.nodes: 268 | case_indices = [i for i, e in enumerate(case_log) if e == node] 269 | 270 | for s_i, f_i in zip(case_indices, case_indices[1:]): 271 | cycle = case_log[s_i:f_i] 272 | 273 | if f_i - s_i == len(set(cycle)) and check_edges(bad_edges, s_i, f_i): 274 | 275 | if cycle not in cycles: 276 | cycles[cycle] = [1, 0] 277 | else: 278 | cycles[cycle][0] += 1 279 | 280 | if cycle not in case_cycles: 281 | cycles[cycle][1] += 1 282 | case_cycles.add(cycle) 283 | 284 | if pre_traverse: 285 | ordered_nodes = self.find_nodes_order() 286 | 287 | if not ordered: 288 | sum_cycles = dict() 289 | left = set() 290 | for cycle in cycles: 291 | 
if cycle not in left: 292 | cycle_seq = [cycle[i:len(cycle)] + cycle[0:i] 293 | for i in range(len(cycle))] 294 | if pre_traverse: 295 | cycle_seq = {c: ordered_nodes.index(c[0]) for c in cycle_seq} 296 | cycle = min(cycle_seq, key=cycle_seq.get) 297 | 298 | sum_cycles[cycle] = [sum(cycles[c][i] for c in cycle_seq if c in cycles) 299 | for i in range(2)] 300 | for c in cycle_seq: 301 | left.add(c) 302 | 303 | cycles = sum_cycles 304 | 305 | return cycles 306 | 307 | def find_states(self, log, pre_traverse=False, ordered=False, cycle_rel=0.5): 308 | """Define meta states, i.e. significant cycles, in the model. 309 | A cycle found in the model is significant, if it occurs more 310 | than in cycle_rel of cases in the log. 311 | 312 | Parameters 313 | ---------- 314 | log: Log 315 | Ordered records of events to replay 316 | pre_traverse: bool 317 | If True, performs graph traversal from 'start' node to define 318 | the order of activities in the cycles (default False) 319 | ordered: bool 320 | If True, the order of cycle activities is fixed strictly (default False) 321 | cycle_rel: float 322 | Significance level for meta states (default 0.5) 323 | Returns 324 | ======= 325 | list: of significant cycles (meta states) 326 | 327 | See also 328 | -------- 329 | find_cycles 330 | """ 331 | cycles = self.find_cycles(log, pre_traverse, ordered) 332 | 333 | case_cnt = len(log.cases) 334 | return [c for c, (abs_freq, case_freq) in cycles.items() 335 | if len(c) > 1 and case_freq / case_cnt >= cycle_rel] 336 | 337 | def fitness(self, log, T=None, ADS=None): 338 | """Return the value of a cost function that includes 339 | only loss term. 340 | """ 341 | if T is None: 342 | TM = TransitionMatrix() 343 | TM.update(log) 344 | T = TM.T 345 | if ADS is None: 346 | ADS = ADS_matrix(log, T) 347 | 348 | case_cnt = len(log.cases) 349 | eps = 10 ** (-len(str(case_cnt))) 350 | 351 | def loss(a_i, a_j): 352 | """Perform the loss function for log replay. 
353 | The biggest penalty is for the absence of 354 | transition in the model, if this transition 355 | always presences in the log. 356 | 357 | See also 358 | -------- 359 | ADS_matrix 360 | """ 361 | loss = 0 362 | if ADS[a_i][a_j] == 'A': 363 | loss = 1 364 | elif ADS[a_i][a_j] == 'S': 365 | loss = T[a_i][a_j][1] / case_cnt 366 | else: 367 | loss = eps 368 | return loss 369 | 370 | edges = self.edges 371 | edges1 = [] 372 | for e in edges: 373 | if (type(e[0]) == tuple) & (type(e[1]) == tuple): 374 | for e_i in e[0]: 375 | for e_j in e[1]: 376 | edges1.append((e_i,e_j)) 377 | edges1 += [(e[0][i], e[0][i+1]) for i in range(len(e[0]) - 1)] 378 | edges1 += [(e[1][i], e[1][i+1]) for i in range(len(e[1]) - 1)] 379 | edges1 += [(e[0][-1], e[0][0]), (e[1][-1], e[1][0])] 380 | elif type(e[0]) == tuple: 381 | for e_i in e[0]: 382 | edges1.append((e_i,e[1])) 383 | edges1 += [(e[0][i], e[0][i+1]) for i in range(len(e[0]) - 1)] 384 | edges1 += [(e[0][-1], e[0][0])] 385 | elif type(e[1]) == tuple: 386 | for e_j in e[1]: 387 | edges1.append((e[0], e_j)) 388 | edges1 += [(e[1][i], e[1][i+1]) for i in range(len(e[1]) - 1)] 389 | edges1 += [(e[1][-1], e[1][0])] 390 | else: 391 | edges1.append(e) 392 | edges1 = set(edges1) 393 | 394 | losses = 0 395 | for log_trace in log.flat_log.values(): 396 | losses += loss('start', log_trace[0]) 397 | for a_i, a_j in zip(log_trace, log_trace[1:]): 398 | if (a_i, a_j) not in edges1: 399 | losses += loss(a_i, a_j) 400 | losses += loss(log_trace[-1], 'end') 401 | for edge in edges1: 402 | losses += loss(edge[0], edge[1]) 403 | return losses 404 | -------------------------------------------------------------------------------- /profit/log.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pm4py 3 | 4 | class Log(object): 5 | """Perform event log object from a log-file. 
6 | 7 | Attributes 8 | ---------- 9 | flat_log: dict 10 | Event log as a dictionary where the key is a case id 11 | and the value is a sequence of events 12 | cases: set 13 | Set of cases in the log 14 | activities: set 15 | Set of activities in the log 16 | 17 | Examples 18 | -------- 19 | >>> log = Log("../PATH/LOG-FILE.csv", encoding='cp1251') 20 | """ 21 | def __init__(self): 22 | """Class Constructor.""" 23 | self.flat_log = dict() 24 | self.cases = set() 25 | self.activities = set() 26 | 27 | def read_xes(self, FILE_PATH): 28 | """Read XES file into DataFrame.""" 29 | log = pm4py.read_xes(FILE_PATH) 30 | df = pd.DataFrame([], columns=['ID', 'Activity', 'TimeStamp']) 31 | trace_id, activity, timestamp = [], [], [] 32 | for i,events in enumerate(log): 33 | for e in events: 34 | trace_id.append(i) 35 | activity.append(e['concept:name']) 36 | timestamp.append(e['time:timestamp']) 37 | df['ID'] = trace_id 38 | df['Activity'] = activity 39 | df['TimeStamp'] = timestamp 40 | df = df.sort_values('TimeStamp').drop('TimeStamp', axis=1) 41 | 42 | return df 43 | 44 | def update(self, data=None, FILE_PATH='', cols=(0,1), *args, **kwargs): 45 | """Update attributes via file reading. 
46 | 47 | Parameters 48 | ---------- 49 | data: DataFrame 50 | Log-file as DataFrame 51 | FILE_PATH: str 52 | Path to the CSV/TXT/XES log-file (alternative to data) 53 | cols: tuple 54 | Columns in the log-file to use as case id and activity 55 | attributes, respectively (default (0,1)) 56 | """ 57 | if FILE_PATH: 58 | if FILE_PATH[-4:] == ".xes": 59 | log = self.read_xes(FILE_PATH) 60 | else: 61 | log = pd.read_csv(FILE_PATH, usecols=cols, *args, **kwargs) 62 | else: 63 | log = data.iloc[:, list(cols)] 64 | log.columns = ['case_id', 'activity'] 65 | log.set_index('case_id', drop=True, inplace=True) 66 | self.flat_log = dict(log.activity.groupby(level=0).agg(tuple)) 67 | self.cases = set(log.index) 68 | self.activities = set(log.activity) 69 | -------------------------------------------------------------------------------- /profit/observer_abc.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | class Observer(ABC): 4 | @abstractmethod 5 | def update(self): 6 | pass 7 | -------------------------------------------------------------------------------- /profit/process_map.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from log import Log 3 | from transition_matrix import TransitionMatrix 4 | from graph import Graph 5 | from renderer import Renderer 6 | 7 | class ProcessMap: 8 | """Class to perform a process model from event log. 
9 | 10 | Attributes 11 | ---------- 12 | Log: Log 13 | Ordered records of events 14 | Rates: dict 15 | Dictionary of values for the process model 16 | simplification 17 | Params: dict 18 | Dictionary of parameters to regulate the 19 | way of process model discovering and 20 | representation 21 | _Observers: dict 22 | Dictionary of "observers" that react to 23 | parameters/rates/data change 24 | 25 | See Also 26 | --------- 27 | Log 28 | Graph 29 | Updater 30 | Renderer 31 | TransitionMatrix 32 | 33 | Exampless 34 | -------- 35 | >>> pm = ProcessMap() 36 | >>> pm.set_log("../PATH/LOG-FILE.csv", encoding='cp1251') 37 | >>> pm.set_rates(100, 0) 38 | >>> pm.set_params(optimize=False, aggregate=False) 39 | >>> pm.update() 40 | """ 41 | 42 | def __init__(self): 43 | """Represent process model with default options. 44 | 45 | Settings 46 | ---------- 47 | activities: float 48 | The inverse value to node significance threshold: the 49 | more it is, the more activities are observed in the model 50 | (default 100) 51 | paths: float 52 | The inverse value to edge significance threshold: the 53 | more it is, the more transitions are observed in the model 54 | (default 100) 55 | optimize: bool 56 | Find optimal rates for the process model (default True) 57 | aggregate: bool 58 | Aggregate activities into meta-states (default False) 59 | lambd: float 60 | Regularization factor of completeness and comprehension 61 | of the process model (default 0.5) 62 | step: int / float / list 63 | Step value or list of grid points for the search space 64 | (default 10) 65 | pre_traverse: bool 66 | If True, performs graph traversal from 'start' node 67 | to define the order of activities in meta states 68 | (default False) 69 | ordered: bool 70 | If True, the order of meta states activities is fixed 71 | strictly (default False) 72 | colored: bool 73 | Whether represent graph elements in color or in black 74 | and white (default True) 75 | verbose: bool 76 | If True, show optimization progress 
bar (default False) 77 | render_format: string 78 | Graphviz output format. 79 | """ 80 | self.Log = Log() 81 | self.Rates = {'activities': 100, 'paths': 0} 82 | self.Params = {'optimize': True, 83 | 'lambd': 0.5, 84 | 'step': 10, 85 | 'verbose': False, 86 | 'aggregate': False, 87 | 'agg_type': 'outer', 88 | 'heuristic': 'all', 89 | 'pre_traverse': False, 90 | 'ordered' : False, 91 | 'cycle_rel': 0.5, 92 | 'colored': True, 93 | 'render_format': 'png'} 94 | self._Observers = {'T': TransitionMatrix(), 95 | 'Graph': Graph(), 96 | 'Renderer': Renderer()} 97 | 98 | def set_log(self, data=None, FILE_PATH='', cols=(0, 1), *args, **kwargs): 99 | """Set Log attribute of the class.""" 100 | self.Log.update(data, FILE_PATH, cols=cols, *args, **kwargs) 101 | 102 | def set_rates(self, activity_rate, path_rate): 103 | """Set Rates attribute of the class.""" 104 | if (activity_rate < 0) | (activity_rate > 100): 105 | raise ValueError('Activity rate is out of range') 106 | if (path_rate < 0) | (path_rate > 100): 107 | raise ValueError('Path rate is out of range') 108 | self.Rates = {'activities': activity_rate, 109 | 'paths': path_rate} 110 | 111 | def set_params(self, **kwargs): 112 | """Set Params attribute of the class.""" 113 | 114 | def change_param(p): 115 | try: 116 | self.Params[p] = kwargs[p] 117 | except: 118 | print(str(IOError) + ': No such parameter \'{}\'.'.format(p)) 119 | 120 | for p in kwargs: 121 | change_param(p) 122 | 123 | def update(self): 124 | """Update "observers" and rates if settings were changed.""" 125 | self._Observers['T'].update(self.Log.flat_log) 126 | 127 | if self.Params['optimize']: 128 | self.Rates = self._Observers['Graph'].optimize(self.Log, 129 | self._Observers['T'], 130 | self.Params['lambd'], 131 | self.Params['step'], 132 | self.Params['verbose']) 133 | else: 134 | self._Observers['Graph'].update(self.Log, 135 | self.Rates['activities'], 136 | self.Rates['paths'], 137 | self._Observers['T']) 138 | if self.Params['aggregate']: 139 | 
self._Observers['Graph'].aggregate(self.Log, 140 | self.Rates['activities'], 141 | self.Rates['paths'], 142 | self.Params['agg_type'], 143 | self.Params['heuristic'], 144 | self.Params['pre_traverse'], 145 | self.Params['ordered'], 146 | self.Params['cycle_rel']) 147 | 148 | self._Observers['Renderer'].update(self._Observers['T'], 149 | self._Observers['Graph'], 150 | self.Params['colored'], 151 | self.Params['render_format']) 152 | 153 | def get_log(self): 154 | """Return flat log (see Log).""" 155 | return self.Log.flat_log 156 | 157 | def get_rates(self): 158 | """Return activities and paths rates.""" 159 | return self.Rates 160 | 161 | def get_params(self): 162 | """Return parameters of process model discovering.""" 163 | return self.Params 164 | 165 | def get_T(self): 166 | """Return transition matrix (see TransitionMatrix).""" 167 | return self._Observers['T'].T 168 | 169 | def get_graph(self): 170 | """Return process model structure as a set of edges (see Graph).""" 171 | return self._Observers['Graph'].edges 172 | 173 | def render(self, show_only=False, save_path=None, gv_format_save=False): 174 | """Return a graph object that can be rendered with the Graphviz 175 | installation (see Renderer).""" 176 | if show_only: 177 | self._Observers['Renderer'].show() 178 | if save_path: 179 | self._Observers['Renderer'].save(save_path, gv_format_save=gv_format_save) 180 | return self._Observers['Renderer'].GV 181 | 182 | -------------------------------------------------------------------------------- /profit/renderer.py: -------------------------------------------------------------------------------- 1 | color_map = {range(0,10) : "#1d2559", range(10,20) : "#203078", 2 | range(20,30) : "#1f3b98", range(30,40) : "#1946ba", 3 | range(40,50) : "#5661c6", range(50,60) : "#7d7fd2", 4 | range(60,70) : "#a09dde", range(70,80) : "#c0bde9", 5 | range(80,90) : "#e0ddf4", range(90,101) : "#ffffff"} 6 | from observer_abc import Observer 7 | import graphviz as gv 8 | import os 9 
| 10 | DECORATE = False 11 | 12 | def _decorate_label(label, sep='_', max_len=15): 13 | """Text wrapping to the next line by sep.""" 14 | if not DECORATE: return label 15 | new_label = '' 16 | beg = 0 17 | while label: 18 | id_sep = label.find(sep, beg) 19 | if (len(label[:id_sep]) > max_len) & (id_sep != -1): 20 | new_label += label[:id_sep] + '\n' 21 | label = label[id_sep+1:] 22 | beg = 0 23 | elif (id_sep == -1): 24 | new_label += label 25 | label = '' 26 | else: 27 | beg += id_sep + 1 28 | return new_label 29 | 30 | class Renderer(Observer): 31 | """Class to represent the visualization of a process model.""" 32 | 33 | def __init__(self): 34 | """GV attribute (default None) represents dot format (directed) graph 35 | object that can be rendered with the Graphviz installation.""" 36 | self.GV = None 37 | 38 | def update(self, TM, G, colored=True, render_format='png'): 39 | """Update graph object (GV attribute) and its representation: elements 40 | count, node color, edge thickness, etc. 41 | 42 | Parameters 43 | ---------- 44 | TM: TransitionMatrix 45 | A matrix describing the transitions of a Markov chain 46 | G: Graph 47 | Graph structure of the model 48 | colored: bool 49 | Whether represent graph elements in color or in black 50 | and white (default True) 51 | 52 | References 53 | ---------- 54 | .. [1] Ferreira, D. R. (2017). A primer on process mining. Springer, Cham. 55 | """ 56 | T, nodes, edges = TM.T, G.nodes, G.edges 57 | G = gv.Digraph(strict=False, format=render_format) 58 | G.attr('edge', fontname='Sans Not-Rotated 14') 59 | G.attr('node', shape='box', style='filled', fontname='Sans Not-Rotated 14') 60 | 61 | # 1. 
Node color and shape 62 | F = dict() # Activities absolute frequencies 63 | for a, a_freq in nodes.items(): 64 | if type(a_freq[-1]) == dict: 65 | vals = [v for v in a_freq[-1].values()] 66 | F[a] = sum(vals) / len(vals) 67 | else: 68 | F[a] = a_freq[0] 69 | case_cnt = sum([v[0] for v in T['start'].values()]) 70 | x_max, x_min = max(F.values()), min(F.values()) 71 | for a, a_freq in nodes.items(): 72 | color = int((x_max - F[a]) / (x_max - x_min + 1e-6) * 100.) 73 | fill, font = "#ffffff", 'black' 74 | if colored: 75 | for interval in color_map: 76 | if color in interval: 77 | fill = color_map[interval] 78 | break 79 | else: fill = 'gray' + str(color) 80 | if color < 50: 81 | font = 'white' 82 | if type(a) == tuple: 83 | if type(a_freq[-1]) == dict: 84 | add_counts = [' ('+str(a_freq[-1][c])+')' for c in a] 85 | else: add_counts = [''] * len(a) 86 | node_label = str(a[0]) + add_counts[0] 87 | for i in range(1, len(a)): 88 | node_label += '\n' + str(a[i]) + add_counts[i] 89 | node_label += '\n(' + str(a_freq[0]) + ')' 90 | G.node(str(a), label=node_label, fillcolor=fill, fontcolor=font, shape='octagon') 91 | else: 92 | node_label = _decorate_label(str(a)) + '\n(' + str(F[a]) + ')' 93 | G.node(str(a), label=node_label, fillcolor=fill, fontcolor=font) 94 | G.node("start", shape="circle", label=str(case_cnt), 95 | fillcolor="#95d600" if colored else "#ffffff", margin='0.05') 96 | G.node("end", shape="doublecircle", label='', 97 | fillcolor="#ea4126" if colored else "#ffffff") 98 | 99 | # 2. 
Edge thickness and style 100 | values = [freq[0] for freq in edges.values()] 101 | if values: t_min, t_max = min(values), max(values) 102 | for e, freq in edges.items(): 103 | if freq == (0, 0): 104 | G.edge(str(e[0]), str(e[1]), style='dotted') 105 | continue 106 | if (e[0] == 'start') | (e[1] == 'end'): 107 | G.edge(str(e[0]), str(e[1]), label=str(freq[0]), style='dashed') 108 | else: 109 | y = 1.0 + (5.0 - 1.0) * (freq[0] - t_min) / (t_max - t_min + 1e-6) 110 | G.edge(str(e[0]), str(e[1]), label=str(freq[0]), penwidth=str(y)) 111 | 112 | self.GV = G 113 | 114 | def show(self): 115 | """Show graph without saving.""" 116 | self.GV.view('tmp_view') 117 | os.system("pause") 118 | for fname in os.listdir(): 119 | if fname.startswith("tmp_view"): 120 | os.remove(fname) 121 | return 122 | 123 | def save(self, save_path=None, gv_format_save=False): 124 | """Render and save graph in PNG (GV) format in the working directory, 125 | if no path to specific directory was indicated in save_path. 126 | """ 127 | if save_path is None: 128 | save_path = os.path.dirname(os.path.abspath(__file__)) 129 | if os.path.isdir(save_path): 130 | save_name = input("Enter file name: ") 131 | save_path = save_path + save_name 132 | self.GV.render(save_path, view=False) 133 | if not gv_format_save: 134 | os.remove(save_path) 135 | -------------------------------------------------------------------------------- /profit/transition_matrix.py: -------------------------------------------------------------------------------- 1 | from observer_abc import Observer 2 | 3 | class TransitionMatrix(Observer): 4 | """Class to represent a transition matrix that 5 | describes the transitions of a Markov chain. 6 | """ 7 | 8 | def __init__(self): 9 | """"Transition matrix is represented in the T attribute 10 | (default empty dictionary). 11 | """ 12 | self.T = dict() 13 | 14 | def update(self, log): 15 | """Transition matrix as dictionary indicating relations 16 | between activities, i.e. 
from observer_abc import Observer

class TransitionMatrix(Observer):
    """Class to represent a transition matrix that
    describes the transitions of a Markov chain.
    """

    def __init__(self):
        """Transition matrix is represented in the T attribute
        (default empty dictionary).
        """
        # T[a_i][a_j] = (absolute frequency, case frequency)
        # NOTE: fixed the original quadruple-quote docstring typo.
        self.T = dict()

    def update(self, log):
        """Transition matrix as dictionary indicating relations
        between activities, i.e. their following each other in
        the log, and their absolute and case frequencies.

        Parameters
        ----------
        log: dict
            Mapping of case id -> sequence of activities
            (presumably a flat log — confirm against caller)
        """
        T = dict()
        for trace in log.values():
            # Pairs already counted for this case: the case frequency
            # of a transition grows at most once per trace.
            seen = set()
            for a_i, a_j in zip(trace, trace[1:]):
                row = T.setdefault(a_i, dict())
                abs_freq, case_freq = row.get(a_j, (0, 0))
                abs_freq += 1
                if (a_i, a_j) not in seen:
                    case_freq += 1
                    seen.add((a_i, a_j))
                row[a_j] = (abs_freq, case_freq)

        self.T = T
5 | """ 6 | meta_states.sort(key=len, reverse=True) 7 | states_events = {v for state in meta_states for v in state} 8 | states_seq = {s: [s[i:len(s)]+s[0:i] for i in range(len(s))] \ 9 | for s in meta_states} 10 | new_log = dict() 11 | for case, case_log in log.flat_log.items(): 12 | case_log1 = [] 13 | aggregated = False 14 | i = 0 15 | while i < len(case_log): 16 | if case_log[i] in states_events: 17 | for s in meta_states: 18 | try: tmp = case_log[i:i + len(s) + 1] 19 | except: continue 20 | if tmp[0] == tmp[-1]: 21 | if ordered: cond = (tmp[:-1] == s) 22 | else: cond = (tmp[:-1] in states_seq[s]) 23 | if cond: 24 | case_log1.append(s) 25 | i += len(s) - 1 26 | aggregated = True 27 | break 28 | if not aggregated: 29 | case_log1.append(case_log[i]) 30 | i += 1 31 | aggregated = False 32 | new_log[case] = tuple(case_log1) 33 | 34 | return new_log 35 | 36 | def check_dict_key(d, key, set_val): 37 | if key not in d: 38 | d[key] = set_val 39 | 40 | def dict_event_states(meta_states, nodes): 41 | event_states = dict() 42 | for state in meta_states: 43 | for event in state: 44 | check_dict_key(event_states, event, dict()) 45 | event_states[event][state] = nodes[state][0] 46 | return event_states 47 | 48 | def node_significance_filtered(log, T, nodes, meta_states, heuristic='all'): 49 | """Return node significance, i.e. 
def node_significance_filtered(log, T, nodes, meta_states, heuristic='all'):
    """Return node significance, i.e. activities case frequencies.

    Activities that belong to meta states are not counted themselves:
    their case frequency is attributed to the state(s) they occur in —
    either to all such states (heuristic='all') or only to the most
    frequent one (heuristic='frequent').

    Parameters
    ----------
    log: Log
        Log object with ``flat_log``, ``activities`` and ``cases``
    T: dict
        Transition matrix (not used in this function; kept for a
        signature uniform with T_filtered)
    nodes: dict
        Node frequencies used to rank meta states
    meta_states: list
        Meta states as tuples of activities
    heuristic: str
        'all' or 'frequent' (default 'all')
    """
    event_states = dict_event_states(meta_states, nodes)
    caseF = dict()  # case frequency per meta state / single activity
    # if heuristic in ['all','frequent']:
    for a in log.activities:
        if a in event_states:
            for case_log in log.flat_log.values():
                if heuristic == 'all':
                    for state in event_states[a]:
                        if state in case_log:
                            check_dict_key(caseF, state, 0)
                            # NOTE(review): a state with k member
                            # activities is incremented once per member
                            # per case, i.e. up to k times per case; the
                            # commented 1/len(state) weight would undo
                            # that — confirm which is intended.
                            caseF[state] += 1 # 1 / len(state)
                else:
                    # Attribute the case only to the state where the
                    # activity is most frequent.
                    state = max(event_states[a], key=event_states[a].get)
                    if state in case_log:
                        check_dict_key(caseF, state, 0)
                        caseF[state] += 1 # 1 / len(state)
        else:
            # Ordinary activity: plain per-case presence count.
            check_dict_key(caseF, a, 0)
            for case_log in log.flat_log.values():
                if a in case_log: caseF[a] += 1
    # Activities (node) significance
    S = {a: caseF[a] / len(log.cases) for a in caseF}
    return S
def T_filtered(log, T, nodes, meta_states, heuristic='all'):
    """Redirect edges and its frequencies.

    Builds a copy of the transition matrix T in which transitions that
    touch single activities belonging to meta states are re-targeted to
    the meta states themselves: with heuristic='all' an edge is
    redirected to every state containing the activity, with 'frequent'
    only to the most frequent one. Returns the redirected matrix;
    T itself is not modified (rows are copied into new dicts).
    """
    # Row-wise copy of T (cells are immutable tuples, safe to share).
    Tf = dict()
    for a in T:
        Tf[a] = dict()
        for x in T[a]:
            Tf[a][x] = T[a][x]
    event_states = dict_event_states(meta_states, nodes)
    # to_add: guard flags so a redirected edge's case frequency is
    # incremented at most once while its flag stays True.
    # NOTE(review): the flags are not reset between cases, so the case
    # frequency of an added edge grows at most once over the whole
    # log — confirm this is intended.
    # to_dec: transitions observed directly adjacent to a meta state,
    # candidates for the double-count correction at the end.
    to_add, to_dec = dict(), dict()

    def check_add(a_i, a_j, reverse=False):
        # Bump absolute frequency of (a_i, a_j); bump case frequency
        # only while the to_add guard flag is still True.
        if reverse: a_i, a_j = a_j, a_i
        check_dict_key(Tf[a_i], a_j, (0, 0))
        abs_frq, cse_frq = Tf[a_i][a_j]
        Tf[a_i][a_j] = (abs_frq + 1, cse_frq)
        check_dict_key(to_add, a_i, dict())
        check_dict_key(to_add[a_i], a_j, True)
        if to_add[a_i][a_j]:
            abs_frq, cse_frq = Tf[a_i][a_j]
            Tf[a_i][a_j] = (abs_frq, cse_frq + 1)
            to_add[a_i][a_j] = False

    def apply_heuristic_all(a_i, a_j, reverse=False):
        # Redirect the edge to every meta state containing a_j,
        # unless a_i already is one of those states.
        if a_i not in event_states[a_j]:
            for state in event_states[a_j]:
                check_add(a_i, state, reverse=reverse)

    def apply_heuristic_frequent(a_i, a_j, reverse=False):
        # Redirect only to the most frequent state containing a_j.
        state = max(event_states[a_j], key=event_states[a_j].get)
        if a_i != state:
            check_add(a_i, state, reverse=reverse)

    def apply_heuristic(a_i, a_j, reverse=False):
        # Dispatch on the chosen redirection heuristic.
        if heuristic == 'all':
            apply_heuristic_all(a_i, a_j, reverse=reverse)
        elif heuristic == 'frequent':
            apply_heuristic_frequent(a_i, a_j, reverse=reverse)

    for case_log in log.flat_log.values():
        case_log = ['start'] + list(case_log) + ['end']
        # Slide a window of three consecutive elements; a_k is the
        # middle element whose neighbours may need redirection.
        for a_i, a_k, a_j in zip(case_log, case_log[1:], case_log[2:]):
            if (a_k in event_states) & (a_i not in event_states):
                apply_heuristic(a_i, a_k, reverse=False)
            if (a_k in event_states) & (a_j not in event_states):
                apply_heuristic(a_j, a_k, reverse=True)
            if a_k in meta_states:
                # Mark transitions adjacent to an already-aggregated
                # meta state (both directions).
                check_dict_key(to_dec, a_i, dict())
                check_dict_key(to_dec[a_i], a_k, False)
                to_dec[a_i][a_k] = True
                check_dict_key(to_dec, a_k, dict())
                check_dict_key(to_dec[a_k], a_i, False)
                to_dec[a_k][a_i] = True

    # Correction pass: where an edge was both added via redirection
    # (guard already consumed) and seen next to the meta state itself,
    # take one case count back to avoid double counting.
    for i in to_add:
        for j in to_add[i]:
            if i in to_dec and j in to_dec[i]:
                if (not to_add[i][j]) & (to_dec[i][j]):
                    abs_frq, cse_frq = Tf[i][j]
                    Tf[i][j] = (abs_frq, cse_frq - 1)
                    to_add[i][j] = True
    for i in to_dec:
        for j in to_dec[i]:
            to_dec[i][j] = False
    return Tf
def incidence_matrix(edges, excpt=[]):
    """Return an incidence matrix as dict where 1 indicates
    a relationship between two nodes in a directed graph.

    Parameters
    ----------
    edges: list
        Set of graph's edges
    excpt: list
        Set of nodes to exclude from matrix construction
        (default [])
    """
    matrix = dict()
    for src, dst in edges:
        # Skip any edge touching an excluded node.
        if src in excpt or dst in excpt:
            continue
        matrix.setdefault(src, dict())[dst] = 1
    return matrix
def dict_normalization(dict_, nested=False):
    """Return normalized along rows matrix as a dictionary.

    Parameters
    ----------
    dict_: dict
        Dictionary (array or matrix) to normalize
    nested: bool
        Indicate object dimension: if True the 2-dimensional
        object is passed, else 1-dimensional (default False)
    """
    normalized = dict()
    if not nested:
        if dict_.values():
            lo, hi = min(dict_.values()), max(dict_.values())
            span = hi - lo
            if span == 0:
                # Degenerate range: everything maps to 1.
                normalized = {k: 1 for k in dict_}
            else:
                normalized = {k: (v - lo) / span for k, v in dict_.items()}
    else:
        for row_key, row in dict_.items():
            # Empty rows are dropped from the result entirely.
            if not row:
                continue
            lo, hi = min(row.values()), max(row.values())
            span = hi - lo
            if span == 0:
                # Degenerate row: spread weight uniformly.
                normalized[row_key] = {k: 1 / len(row) for k in row}
            else:
                normalized[row_key] = {k: (v - lo) / span
                                       for k, v in row.items()}
    return normalized
def node_significance(log):
    """Return node significance, i.e. activities case frequencies
    normalized by the total number of cases.
    """
    caseF = {a: sum(a in case_log for case_log in log.flat_log.values())
             for a in log.activities}
    # Activities (node) significance
    S = {a: caseF[a] / len(log.cases) for a in caseF}
    return S

def transit_matrix(log, T):
    """Return transition matrix with 'start' and 'end' nodes.

    The first (last) activity of each case yields a transition from
    'start' (to 'end'); for these artificial transitions the absolute
    and case frequencies coincide. NOTE: T is extended in place and
    also returned.
    """
    process_start, process_end = dict(), dict()
    for case_log in log.flat_log.values():
        first, last = case_log[0], case_log[-1]
        process_start[first] = process_start.get(first, 0) + 1
        process_end[last] = process_end.get(last, 0) + 1
    T['start'] = {s: (cnt, cnt) for s, cnt in process_start.items()}
    for e, cnt in process_end.items():
        if e not in T:
            T[e] = dict()
        T[e]['end'] = (cnt, cnt)
    return T

def ADS_matrix(log, T):
    """Return a matrix that represents whether events in the log
    actually always (A), never (N), or sometimes (S) followed each
    other.
    """
    case_cnt = len(log.cases)
    T = transit_matrix(log, T)
    activities = log.activities
    ADS = dict()
    for v1 in list(activities) + ['start']:
        ADS[v1] = dict()
        for v2 in list(activities) + ['end']:
            # BUG FIX: narrowed the original bare `except:` — only a
            # missing transition should be treated as "never".
            try:
                f_rel = T[v1][v2][1]
            except KeyError:
                f_rel = -1
            if f_rel == case_cnt:
                ADS[v1][v2] = 'A' # always
            elif f_rel > 0:
                ADS[v1][v2] = 'S' # sometimes
            elif f_rel == -1:
                ADS[v1][v2] = 'N' # never
            # NOTE(review): a transition with case frequency 0 leaves
            # ADS[v1][v2] unset — confirm 0 can never occur here.
    return ADS
def edge_sig(T, source=[], target=[], type_='out'):
    """Return edge significance, i.e. transitions case frequencies.

    Parameters
    ----------
    T: dict
        Transition matrix as a dictionary
    source: list
        Nodes that have outcoming edges
        (default [])
    target: list
        Nodes that have incoming edges
        (default [])
    type_: str
        Determine type of edges (in- or outcoming)
        to filtrate (default 'out')
    """
    # Total number of cases = sum of 'start' transition frequencies.
    case_cnt = sum(v[0] for v in T['start'].values())
    S = dict()
    for a_i in source:
        S[a_i] = dict()
        # Outgoing mode scans a_i's own row; incoming mode scans all rows.
        candidates = T[a_i] if type_ == 'out' else T
        for a_j in candidates:
            if a_i == a_j or a_j not in target:
                continue
            if type_ == 'out':
                S[a_i][a_j] = T[a_i][a_j][1] / case_cnt
            elif a_i in T[a_j]:
                S[a_i][a_j] = T[a_j][a_i][1] / case_cnt
    return S

def rel_sig(S_out, S_in):
    """Return relative significance of conflicting relations."""
    rS = dict()
    for A, out_row in S_out.items():
        rS[A] = dict()
        for B, sig_AB in out_row.items():
            if A not in S_in or B not in S_in[A]:
                continue
            sigAX = sum(out_row.values())
            sigXB = sum(S_in[B].values())
            rS[A][B] = .5 * sig_AB / sigAX + .5 * sig_AB / sigXB
    return rS
def conflict_resolution(rS, pth=0.3, rth=2*0.3/3):
    """Determine the most significant behavior in the process. Return
    a set of preserved edges.

    Parameters
    ----------
    rS: dict
        Relative significance matrix
    pth: float
        Preserve threshold (default 0.3)
    rth: float
        Ratio threshold (default 0.2)

    References
    ----------
    .. [1] Günther, C. W., & Van Der Aalst, W. M. (2007, September). Fuzzy
        mining–adaptive process simplification based on multi-perspective
        metrics. In International conference on business process management
        (pp. 328-343). Springer, Berlin, Heidelberg.
    """
    preserved = set()  # transitions in conflict to preserve
    for A in rS:
        for B in rS[A]:
            forward, backward = rS[A][B], rS[B][A]
            if forward >= pth and backward >= pth:
                # Both directions pass the preserve threshold:
                # keep the 2-loop.
                preserved.add((A, B))
                preserved.add((B, A))
            elif abs(forward - backward) >= rth:
                # Keep only the dominant direction.
                preserved.add((A, B) if forward >= backward else (B, A))
    return preserved

def edge_filtering(S, edge_list, co=0, type_='out'):
    """Process model simplification. Return filtrated set of edges.

    Parameters
    ----------
    S: dict
        Edge significance matrix
    edge_list: list
        Set of edges to filtrate
    co: float
        Cut-off threshold for edge filtration
        (default 0)
    type_: str
        Determine type of edges (in- or outcoming)
        to filtrate (default 'out')
    """
    edges = list(edge_list)
    for node in S:
        ranked = sorted(S[node], key=S[node].get, reverse=True)
        for rank, other in enumerate(ranked):
            # Significances are ranked descending, so once one falls
            # below the cut-off the rest do too; the top-ranked edge
            # is always kept.
            if S[node][other] < co and rank > 0:
                break
            edge = (other, node) if type_ != 'out' else (node, other)
            if edge not in edges:
                edges.append(edge)
    return edges
def check_feasibility(nodes, edges, T, I, S, S_out):
    """Perform two DFS types to check conditions.

    Repairs the filtered model until (1) every node can reach 'end'
    and (2) every node is reachable from 'start', adding the most
    significant missing edges where a condition fails. Both ``edges``
    and the incidence matrix ``I`` are modified in place.
    """
    # 1. All nodes are end ancestors
    def isAncestor(start, node):
        # DFS from `node`; reads/writes `marked` and `end_ancestor`
        # from the enclosing while-loop below (closure state).
        marked[node] = True
        try: successors = I[node]
        except: successors = []
        if 'end' in successors:
            end_ancestor[start] = True
        if not end_ancestor[start]:
            for successor in successors:
                if not marked[successor]:
                    isAncestor(start, successor)

    # 2. All nodes are start descendants
    def isDescendant(node):
        # DFS marking everything reachable from `node`; uses the
        # `start_descendant` dict from the loop below.
        start_descendant[node] = True
        try: successors = I[node]
        except: successors = []
        for successor in successors:
            if successor != 'end' and start_descendant[successor] == False:
                isDescendant(successor)

    # Find extra edges if condition fails.
    def make_connected(edges, state, check_cond='desc'):
        # `state` maps node -> condition flag; nodes with False form
        # the disconnected component to repair this round.
        component_nodes = [k for k, v in state.items() if v == False]
        directed_nodes = [k for k, v in state.items() if v == True]
        source = directed_nodes if check_cond == 'desc' else component_nodes
        target = component_nodes if check_cond == 'desc' else directed_nodes
        # Candidate repair edges present in the original transitions,
        # weighted by their outgoing significance.
        extra_edges = dict()
        for node in source:
            for a in T[node]:
                if a in target:
                    extra_edges[(node, a)] = S_out[node][a]
        if len(extra_edges) == 0:
            # No real transition bridges the component: connect its
            # most significant node directly to 'start' or 'end'.
            S_comp = {k: v for k, v in S.items() if k in component_nodes}
            if check_cond == 'desc':
                edges.append(('start', max(S_comp, key=S_comp.get)))
                if 'start' not in I:
                    I['start'] = dict()
                I['start'][max(S_comp, key=S_comp.get)] = 1
            else:
                edges.append((max(S_comp, key=S_comp.get), 'end'))
                if max(S_comp, key=S_comp.get) not in I:
                    I[max(S_comp, key=S_comp.get)] = dict()
                I[max(S_comp, key=S_comp.get)]['end'] = 1
        else:
            # Re-add the most significant bridging transition.
            extra_edge = max(extra_edges, key=extra_edges.get)
            edges.append((extra_edge[0], extra_edge[1]))
            if extra_edge[0] not in I:
                I[extra_edge[0]] = dict()
            I[extra_edge[0]][extra_edge[1]] = 1

    # Repeat repair rounds until every node reaches 'end'.
    while True:
        end_ancestor = dict.fromkeys(nodes, False)
        for v in nodes:
            marked = dict.fromkeys(nodes, False)
            isAncestor(v, v)
        if all(end_ancestor.values()): break
        else: make_connected(edges, end_ancestor, 'anc')

    # Repeat repair rounds until every node is reached from 'start'.
    while True:
        start_descendant = dict.fromkeys(nodes, False)
        isDescendant('start')
        if all(start_descendant.values()): break
        else: make_connected(edges, start_descendant, 'desc')