├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── install │ └── installation.rst ├── logo.png ├── make.bat ├── pipeline.png ├── requirements.txt └── sphinxext │ ├── doi_role.py │ └── github_link.py ├── examples ├── README.rst ├── feature_extracion.py ├── plot_download_api.py ├── plot_pipeline_evaluation.py ├── plot_rt_pipeline.py └── plot_unbiased_split.py ├── pyproject.toml ├── src └── skpm │ ├── __init__.py │ ├── base.py │ ├── config.py │ ├── event_logs │ ├── __init__.py │ ├── base.py │ ├── bpi.py │ ├── download.py │ ├── extract.py │ ├── parser.py │ └── split.py │ ├── feature_extraction │ ├── __init__.py │ ├── case │ │ ├── __init__.py │ │ ├── _helpers.py │ │ ├── time.py │ │ └── variant.py │ ├── event │ │ ├── __init__.py │ │ ├── inter_case.py │ │ ├── resource.py │ │ └── time.py │ ├── targets.py │ └── time.py │ ├── sequence_encoding │ ├── __init__.py │ ├── aggregation.py │ ├── bucketing.py │ └── index.py │ ├── utils │ ├── __init__.py │ ├── graph.py │ ├── helpers.py │ └── validation.py │ └── warnings.py └── tests ├── __init__.py ├── event_logs ├── __init__.py ├── test_bpi.py ├── test_download_extract.py └── test_parser.py ├── feature_extraction ├── __init__.py ├── case │ └── test_variant.py ├── event │ ├── test_resource.py │ ├── test_time.py │ └── test_wip.py └── test_targets.py ├── sequence_encoding ├── __init__.py ├── test_aggregation.py ├── test_bucketing.py └── test_index.py └── utils ├── __init__.py ├── test_graph.py └── test_validation.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python: ['3.10.16'] 16 | os: [ubuntu-latest] #, windows-latest] 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | 21 | - name: Set up Python ${{ matrix.python }} 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: ${{ matrix.python }} 25 | cache: 'pip' 26 | 27 | - name: Install Poetry 28 | uses: snok/install-poetry@v1 29 | with: 30 | virtualenvs-create: true 31 | virtualenvs-in-project: true 32 | 33 | - name: Install Python dependencies 34 | run: | 35 | poetry install 36 | 37 | - run: | 38 | source $VENV 39 | pytest --version 40 | 41 | - name: pytest 42 | run: poetry run pytest --cov=skpm tests 43 | 44 | - name: Statistics 45 | if: success() 46 | run: | 47 | poetry run coverage report 48 | poetry run coverage xml 49 | 50 | 51 | - name: Upload coverage to Codecov 52 | uses: codecov/codecov-action@v3 53 | if: always() 54 | # see: https://github.com/actions/toolkit/issues/399 55 | continue-on-error: true 56 | with: 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | file: coverage.xml 59 | # flags: cpu 60 | name: Coverage 61 | fail_ci_if_error: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom user defined 2 | .pytest_cache/ 3 | .mypy_cache/ 4 | data/ 5 | htmlcov/ 6 | notebooks/ 7 | 8 | .coverage 9 | notes.md 10 | skpm-venv/ 11 | poetry.lock 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | dist/ 16 | 17 | # Sphinx documentation 18 | docs/_build/ 19 | docs/auto_examples/ 20 | docs/gen_modules/ 21 | 
docs/sg_execution_times.rst 22 | 23 | # Jupyter Notebook 24 | .ipynb_checkpoints 25 | __pypackages__/ 26 | 27 | # VsCode 28 | .vscode/ 29 | 30 | # files 31 | *.log 32 | *.csv 33 | *.parquet -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10.16" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 4 | 5 | ## v0.0.1 (25/01/2024) 6 | 7 | - First release of `skpm`! -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant homepage](http://contributor-covenant.org/version/1/4), version 1.4. 44 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every little bit 4 | helps, and credit will always be given. 5 | 6 | ## Types of Contributions 7 | 8 | ### Report Bugs 9 | 10 | If you are reporting a bug, please include: 11 | 12 | * Your operating system name and version. 13 | * Any details about your local setup that might be helpful in troubleshooting. 14 | * Detailed steps to reproduce the bug. 15 | 16 | ### Fix Bugs 17 | 18 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 19 | wanted" is open to whoever wants to implement it. 20 | 21 | ### Implement Features 22 | 23 | Look through the GitHub issues for features. Anything tagged with "enhancement" 24 | and "help wanted" is open to whoever wants to implement it. 25 | 26 | ### Write Documentation 27 | 28 | You can never have enough documentation! Please feel free to contribute to any 29 | part of the documentation, such as the official docs, docstrings, or even 30 | on the web in blog posts, articles, and such. 31 | 32 | ### Submit Feedback 33 | 34 | If you are proposing a feature: 35 | 36 | * Explain in detail how it would work. 37 | * Keep the scope as narrow as possible, to make it easier to implement. 38 | * Remember that this is a volunteer-driven project, and that contributions 39 | are welcome :) 40 | 41 | ## Get Started! 42 | 43 | Ready to contribute? Here's how to set up `skpm` for local development. 44 | 45 | 1. Download a copy of `skpm` locally. 46 | 2. Install `skpm` using `poetry`: 47 | 48 | ```console 49 | $ poetry install 50 | ``` 51 | 52 | 3. Use `git` (or similar) to create a branch for local development and make your changes: 53 | 54 | ```console 55 | $ git checkout -b name-of-your-bugfix-or-feature 56 | ``` 57 | 58 | 4. When you're done making changes, check that your changes conform to any code formatting requirements and pass any tests. 59 | 60 | 5. Commit your changes and open a pull request. 
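Before opening the pull request, you can run the same checks as the CI workflow locally. A minimal sketch, assuming the `poetry install` from step 2 already pulled in the development dependencies (`pytest`, `pytest-cov`, and `black`):

```console
$ poetry run pytest --cov=skpm tests   # same command the GitHub Actions workflow runs
$ poetry run black src tests           # apply the formatting configured in pyproject.toml
```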
61 | 62 | ## Pull Request Guidelines 63 | 64 | Before you submit a pull request, check that it meets these guidelines: 65 | 66 | 1. The pull request should include additional tests if appropriate. 67 | 2. If the pull request adds functionality, the docs should be updated. 68 | 3. The pull request should work for all currently supported operating systems and versions of Python. 69 | 70 | ## Code of Conduct 71 | 72 | Please note that the `skpm` project is released with a 73 | Code of Conduct. By contributing to this project you agree to abide by its terms. 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024, Rafael Oyamada 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SkPM: a Scikit-learn Extension for Process Mining 2 | 3 |
<p align="center">
  <img src="docs/logo.png" width="400" alt="SkPM logo">
</p>
9 | 10 | [![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/) 11 | [![Read the Docs](https://img.shields.io/readthedocs/skpm)](https://skpm.readthedocs.io/en/latest/) 12 | [![Codecov](https://img.shields.io/codecov/c/github/raseidi/skpm)](https://codecov.io/gh/raseidi/skpm) 13 | 14 |
15 | 16 | ## Overview 17 | 18 | SkPM is an open-source extension of the widely used [Scikit-learn](https://scikit-learn.org/) library, designed to meet the specific needs of Process Mining applications. It aims to provide a **standard**, **reproducible**, and **easily accessible** set of tools for PM research and practical applications. 19 | 20 | ## Available examples 21 | 22 | - **NEW** [**ICPM/ML4PM 2024 Tutorial**](https://colab.research.google.com/drive/1s6TxG14bKbh2zlOENLGGd9dy_1BLEBiO?usp=sharing): A notebook highlighting all the available features in SkPM! 23 | - [**Predictive Monitoring**](https://skpm.readthedocs.io/en/latest/auto_examples/plot_rt_pipeline.html#): Build end-to-end applications of traditional process mining tasks, such as remaining time and next activity prediction! 24 | - [**Event Log Preprocessing**](https://skpm.readthedocs.io/en/latest/auto_examples/feature_extracion.html): Several feature extraction and trace encoding techniques implemented! 25 | - [**Download Public Event Logs**](https://skpm.readthedocs.io/en/latest/auto_examples/plot_rt_pipeline.html#download-the-example-dataset): Download well-known event logs (e.g., BPI Challenges) from the 4tu repository! 26 | - [**Unbiased Event Log Split**](https://skpm.readthedocs.io/en/latest/auto_examples/plot_unbiased_split.html): Temporal and unbiased split of event logs for train/validation. 27 | 28 |
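For instance, downloading a public event log takes only a couple of lines. The sketch below uses the `BPI13ClosedProblems` class and the `dataframe` attribute shown in the download tutorial linked above:

```python
from skpm.event_logs import BPI13ClosedProblems

log = BPI13ClosedProblems()  # fetched from the 4TU repository and cached locally
df = log.dataframe           # pandas DataFrame with XES-style columns
```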
<p align="center">
  <img src="docs/pipeline.png" alt="Example of an SkPM pipeline">
</p>
31 | 32 | ## Installation 33 | 34 | **Soon available on PyPI**. 35 | 36 | To install SkPM, you can clone the repository and install the required dependencies using `pip`: 37 | 38 | ```bash 39 | git clone https://github.com/raseidi/skpm.git 40 | cd skpm 41 | pip install . 42 | ``` 43 | 44 | ## Usage 45 | 46 | Below is an example of how to use SkPM to build a pipeline for remaining time prediction. 47 | 48 | ```python 49 | # skpm modules 50 | from skpm.encoding import Aggregation 51 | from skpm.event_feature_extraction import ( 52 | TimestampExtractor, 53 | ResourcePoolExtractor, 54 | ) 55 | 56 | # sklearn modules 57 | from sklearn.ensemble import RandomForestRegressor 58 | from sklearn.pipeline import Pipeline 59 | from sklearn.compose import ColumnTransformer 60 | from sklearn.preprocessing import StandardScaler 61 | 62 | # Example pipeline for remaining time prediction 63 | preprocessor = ColumnTransformer( 64 | transformers=[ 65 | ('timestamp', TimestampExtractor(), 'timestamp_column'), 66 | ('activity', OneHotEncoder(), 'activity_column'), 67 | ('resource', ResourcePoolExtractor(), 'resource_column'), 68 | ] 69 | ) 70 | 71 | pipeline = Pipeline(steps=[ 72 | ('preprocessor', preprocessor), 73 | ('aggregator', TraceAggregator()), 74 | ('standardization', StandardScaler()), 75 | ('regressor', RandomForestRegressor()) 76 | ]) 77 | 78 | # Fit the pipeline to your event log data 79 | pipeline.fit(X_train, y_train) 80 | 81 | # Make predictions on new cases 82 | predictions = pipeline.predict(X_test) 83 | ``` 84 | 85 | ## Documentation 86 | 87 | Detailed documentation and examples can be found [here](https://skpm.readthedocs.io/en/latest/). 88 | 89 | ## Roadmap, next steps, and help needed! 90 | 91 | - Improving documentation by including examples. 92 | - Implementing new applications and writing tutorials. 93 | - Adding new methods (feature extraction, trace encoding, and models). 94 | - Writing unit tests! 95 | 96 | ## Contributing 97 | 98 | We welcome contributions from the community! 99 | 100 | Check the [sklearn guidelines](https://scikit-learn.org/1.5/developers/contributing.html#reading-the-existing-code-base) to understand the `fit`, `predict`, and `transform` APIs! 101 | 102 | Check [our guidelines](CONTRIBUTING.md) as well to see how to open an issue or a PR. In summary: 103 | 104 | 1. Fork the repository. 105 | 2. Create a feature branch (`git checkout -b feature-branch`). 106 | 3. Commit your changes (`git commit -m 'feat: add new feature'`). 107 | 4. Push to the branch (`git push origin feature-branch`). 108 | 5. Open a pull request. 109 | 110 | ## License 111 | 112 | This project was created by Rafael Oyamada and is licensed under the [CC BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). Feel free to use, modify, and distribute the code with attribution. 113 | 114 | ## Credits 115 | 116 | `skpm` was created with [`cookiecutter`](https://cookiecutter.readthedocs.io/en/latest/) and the `py-pkgs-cookiecutter` [template](https://github.com/py-pkgs/py-pkgs-cookiecutter). 
117 | 118 | ## Citation 119 | 120 | ```bibtex 121 | @inproceedings{OyamadaTJC23, 122 | author = {Rafael Seidi Oyamada and 123 | Gabriel Marques Tavares and 124 | Sylvio Barbon Junior and 125 | Paolo Ceravolo}, 126 | editor = {Felix Mannhardt and 127 | Nour Assy}, 128 | title = {A Scikit-learn Extension Dedicated to Process Mining Purposes}, 129 | booktitle = {Proceedings of the Demonstration Track co-located with the International 130 | Conference on Cooperative Information Systems 2023, CoopIS 2023, Groningen, 131 | The Netherlands, October 30 - November 3, 2023}, 132 | series = {{CEUR} Workshop Proceedings}, 133 | publisher = {CEUR-WS.org}, 134 | } 135 | ``` -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import os 7 | import sys 8 | from sphinx_gallery.sorting import FileNameSortKey 9 | from pathlib import Path 10 | 11 | 12 | # find project 13 | cwd = os.getcwd() 14 | parent = os.path.dirname(cwd) 15 | sys.path.append(parent) 16 | sys.path.insert(0, os.path.abspath("sphinxext")) 17 | 18 | from github_link import make_linkcode_resolve 19 | 20 | # -- Project information ----------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 22 | 23 | 24 | project = "skpm" 25 | copyright = "2024, Rafael Oyamada" 26 | author = "Rafael Oyamada" 27 | release = "0.0.1" 28 | 29 | # -- General configuration --------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 31 | 32 | extensions = [ 33 | "sphinx.ext.autodoc", 34 | "sphinx.ext.autosummary", 35 | "sphinx_gallery.gen_gallery", 36 | "sphinx.ext.napoleon", 37 | "sphinx.ext.intersphinx", 38 | "sphinx.ext.linkcode", 39 | "doi_role", 40 | "sphinx.ext.viewcode", 41 | "autoapi.extension", 42 | ] 43 | 44 | autoapi_dirs = ["../src"] 45 | 46 | templates_path = ["_templates"] 47 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 48 | source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'} 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 53 | 54 | html_theme = "sphinx_rtd_theme" 55 | # html_static_path = ["_static"] 56 | 57 | 58 | sg_examples_dir = "../examples" 59 | sg_gallery_dir = "auto_examples" 60 | sphinx_gallery_conf = { 61 | # path to your example scripts 62 | "examples_dirs": [sg_examples_dir], 63 | # path to where to save gallery generated output 64 | "gallery_dirs": [sg_gallery_dir], 65 | # specify that examples should be ordered according to filename 66 | "within_subsection_order": FileNameSortKey, 67 | # directory where function granular galleries are stored 68 | "backreferences_dir": "gen_modules/backreferences", 69 | # Modules for which function level galleries are created. In 70 | # this case sphinx_gallery and numpy in a tuple of strings. 71 | "doc_module": ("skpm"), 72 | # "filename_pattern": "/*.py", 73 | } 74 | 75 | # configuration for intersphinx: refer to the Python standard library. 76 | intersphinx_mapping = { 77 | "python": ( 78 | "https://docs.python.org/{.major}".format(sys.version_info), 79 | None, 80 | ), 81 | "matplotlib": ("https://matplotlib.org/", None), 82 | } 83 | 84 | linkcode_resolve = make_linkcode_resolve( 85 | "skpm", 86 | ( 87 | "https://github.com/raseidi/" 88 | "skpm/blob/{revision}/" 89 | "{package}/{path}#L{lineno}" 90 | ), 91 | ) 92 | 93 | autosummary_generate = True 94 | root_doc = "index" 95 | 96 | exclude_patterns = [ 97 | "_build", 98 | "templates", 99 | "includes", 100 | "**/sg_execution_times.rst", 101 | ] -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. skpm documentation master file, created by 2 | sphinx-quickstart on Thu Jan 25 16:27:22 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to SkPM's documentation! 
7 | ================================ 8 | 9 | **SkPM** *(Scikit-learn for Process Mining)* is a library built upon `Scikit-learn `_ to easily write and train machine learning pipelines tailored for process mining tasks. 10 | 11 | This work is still in progress! So far, we have focused more on feature extraction and encoding techniques to be easily used along with Scikit-learn's `pipelines `_ and `grid search-related functionalities `_. 12 | 13 | 14 | .. toctree:: 15 | :maxdepth: 1 16 | :caption: Install SkPM 17 | 18 | install/installation 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Tutorials 23 | 24 | auto_examples/index 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | :caption: API Reference -------------------------------------------------------------------------------- /docs/install/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | SkPM is available from Python 3.10 to 3.12. 5 | 6 | Installation via PyPi 7 | --------------------- 8 | 9 | SkPM will be available on PyPi soon! 10 | 11 | .. .. code-block:: none 12 | 13 | .. pip install skpm 14 | 15 | Installation from source 16 | ------------------------ 17 | 18 | To install SkPM via GitHub, you can clone the repository and install it using pip: 19 | 20 | .. code-block:: none 21 | 22 | git clone 23 | cd skpm 24 | pip install . 25 | 26 | Alternatively, you can install it using poetry: 27 | 28 | .. code-block:: none 29 | 30 | python3.10 -m venv skpm-venv 31 | source skpm-venv/bin/activate 32 | pip install -U pip setuptools poetry 33 | poetry install 34 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/docs/logo.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/docs/pipeline.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-autoapi==3.0.0 2 | sphinx-rtd-theme==2.0.0 3 | sphinx-gallery==0.17.1 4 | matplotlib==3.8 -------------------------------------------------------------------------------- /docs/sphinxext/doi_role.py: -------------------------------------------------------------------------------- 1 | """ 2 | doilinks 3 | ~~~~~~~~ 4 | Extension to add links to DOIs. With this extension you can use e.g. 5 | :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will 6 | create a link to a DOI resolver 7 | (``https://doi.org/10.1016/S0022-2836(05)80360-2``). 8 | The link caption will be the raw DOI. 9 | You can also give an explicit caption, e.g. 10 | :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`. 11 | 12 | :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by 13 | the Sphinx team. 14 | :license: BSD. 15 | """ 16 | 17 | from docutils import nodes, utils 18 | from sphinx.util.nodes import split_explicit_title 19 | 20 | 21 | def reference_role(typ, rawtext, text, lineno, inliner, options={}, content=[]): 22 | text = utils.unescape(text) 23 | has_explicit_title, title, part = split_explicit_title(text) 24 | if typ in ["arXiv", "arxiv"]: 25 | full_url = "https://arxiv.org/abs/" + part 26 | if not has_explicit_title: 27 | title = "arXiv:" + part 28 | pnode = nodes.reference(title, title, internal=False, refuri=full_url) 29 | return [pnode], [] 30 | if typ in ["doi", "DOI"]: 31 | full_url = "https://doi.org/" + part 32 | if not has_explicit_title: 33 | title = "DOI:" + part 34 | pnode = nodes.reference(title, title, internal=False, refuri=full_url) 35 | return [pnode], [] 36 | 37 | 38 | def setup_link_role(app): 39 | app.add_role("arxiv", reference_role, override=True) 40 | app.add_role("arXiv", reference_role, override=True) 41 | app.add_role("doi", reference_role, override=True) 42 | app.add_role("DOI", reference_role, override=True) 43 | 44 | 45 | def setup(app): 46 | app.connect("builder-inited", setup_link_role) 47 | return {"version": "0.1", "parallel_read_safe": True} 48 | -------------------------------------------------------------------------------- /docs/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import subprocess 4 | import sys 5 | from functools import partial 6 | from operator import attrgetter 7 | 8 | REVISION_CMD = "git rev-parse --short HEAD" 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except (subprocess.CalledProcessError, OSError): 15 | print("Failed to execute git to get revision") 16 | return None 17 | return revision.decode("utf-8") 18 | 19 | 20 | 
def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='https://hg.python.org/cpython/file/' 30 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... revision='xxxx') 32 | 'https://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ("py", "pyx"): 38 | return 39 | if not info.get("module") or not info.get("fullname"): 40 | return 41 | 42 | class_name = info["fullname"].split(".")[0] 43 | module = __import__(info["module"], fromlist=[class_name]) 44 | obj = attrgetter(info["fullname"])(module) 45 | 46 | # Unwrap the object to get the correct source 47 | # file in case that is wrapped by a decorator 48 | obj = inspect.unwrap(obj) 49 | 50 | try: 51 | fn = inspect.getsourcefile(obj) 52 | except Exception: 53 | fn = None 54 | if not fn: 55 | try: 56 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 57 | except Exception: 58 | fn = None 59 | if not fn: 60 | return 61 | 62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 63 | try: 64 | lineno = inspect.getsourcelines(obj)[1] 65 | except Exception: 66 | lineno = "" 67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 68 | 69 | 70 | def make_linkcode_resolve(package, url_fmt): 71 | """Returns a linkcode_resolve function for the given URL format 72 | 73 | revision is a git commit reference (hash or name) 74 | 75 | package is the name of the root module of the package 76 | 77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 78 | 'blob/{revision}/{package}/' 79 | '{path}#L{lineno}') 80 | """ 81 | revision = _get_git_revision() 82 | return partial( 83 | _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt 84 | ) 85 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | Examples 4 | ======== 5 | 6 | This is the gallery of examples that showcase how SkPM can be used. Some 7 | examples demonstrate the use of the api in general and some 8 | demonstrate specific applications in tutorial form. -------------------------------------------------------------------------------- /examples/feature_extracion.py: -------------------------------------------------------------------------------- 1 | """ 2 | Event Feature Extraction 3 | ======================== 4 | 5 | In this tutorial, we introduce a few feature extraction techniques 6 | available in our library. Currently, we provide two modules for 7 | feature extraction: :mod:`skpm.case_feature_extraction` and 8 | :mod:`skpm.event_feature_extraction`. The former is still 9 | uder development so we will focus on the latter. 10 | """ 11 | 12 | # %% 13 | # Event features 14 | # -------------- 15 | # The :mod:`skpm.event_feature_extraction` module provides 16 | # a set of function to extract relevant features proposed in the 17 | # literature. In this example, we show how to extract features from 18 | # timestamps, resources, and an the inter-case perspective. 
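
# %%
# All extractors in SkPM refer to event log columns through
# :class:`skpm.config.EventLogConfig` (imported as ``elc`` throughout this
# tutorial), which defaults to the XES naming convention. The snippet below is
# only an illustrative check of those defaults; it is not a required step.

from skpm.config import EventLogConfig as elc

print(elc.case_id, elc.activity, elc.timestamp, elc.resource)

# %%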
19 | # 20 | # Time-related features 21 | # --------------------- 22 | # The :class:`skpm.event_feature_extraction.TimestampExtractor` class 23 | # allows us to extract several features, such as the execution time of 24 | # each event, the accumulated time throughout the case, and the weekday. 25 | # Let's see how it works. 26 | 27 | # %% 28 | import pandas as pd 29 | from skpm.config import EventLogConfig as elc 30 | from skpm.feature_extraction import TimestampExtractor 31 | from skpm.event_logs import split, BPI17 32 | 33 | # download the dataset 34 | log = BPI17() 35 | 36 | # select the columns of interest 37 | df = log.dataframe[[elc.case_id, elc.activity, elc.timestamp, elc.resource]].copy() 38 | 39 | # split the data into train and test 40 | train, _ = split.unbiased(df, **log.unbiased_split_params) 41 | 42 | # sphinx_gallery_start_ignore 43 | del log 44 | del df 45 | # sphinx_gallery_end_ignore 46 | 47 | # extract the features 48 | te = TimestampExtractor().fit(train) 49 | train[te.get_feature_names_out()] = te.transform(train) 50 | 51 | # first event as an example 52 | train.iloc[0, :].T 53 | 54 | # %% 55 | # In the literature, features like the weekday are usually extracted 56 | # as a categorical features, but we currently implement it as a 57 | # numerical by normalizing the values between `[-0.5, 0.5]`. 58 | # In the future, we intend to provide a parameter to choose between 59 | # the two options. 60 | # 61 | # Resource-related features 62 | # ------------------------- 63 | # The resource pool extractor is a feature extractor that identifies 64 | # resource roles based on the correlation between activities and resources. 65 | # You can easily use this function as shown below: 66 | 67 | from skpm.feature_extraction import ResourcePoolExtractor 68 | 69 | re = ResourcePoolExtractor().fit(train) 70 | # re.get_feature_names_out() 71 | train["resource_role"] = re.transform(train) 72 | 73 | train.loc[0, [elc.case_id, elc.activity, elc.resource, "resource_role"]].T 74 | 75 | # %% 76 | # From the machine learning perspective, it can be seen as a nice way 77 | # to encode the resource information and reduce the dimensionality of the 78 | # data. In this example, we grouped 133 resource labels into 5 roles: 79 | 80 | import matplotlib.pyplot as plt 81 | plt.style.use("ggplot") 82 | 83 | features = train[[elc.resource, "resource_role"]].nunique().index.values 84 | values = train[[elc.resource, "resource_role"]].nunique().values 85 | 86 | fig, ax = plt.subplots() 87 | ax.bar(features, values, edgecolor="black") 88 | 89 | # %% 90 | # Inter-case features 91 | # ------------------- 92 | # Inter-case features refer to features that are computed based on the 93 | # relationship between different cases. It aims to quantify and module 94 | # the resource sharing between cases, for instance. In the current version 95 | # of our library, we only have a simple example of such feature: the number of 96 | # cases in progress simultaneously. This feature is commonly called 97 | # work in progress. 
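# (Roughly speaking, for a time instant t,
# ``WIP(t) = |{cases c : start(c) <= t <= end(c)}|``,
# i.e., the number of cases that have started but not yet finished at t.)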
98 | # 99 | # Let's see how it works: 100 | 101 | from skpm.feature_extraction import WorkInProgress 102 | 103 | wip = WorkInProgress() 104 | wip.fit(train) 105 | train["wip"] = wip.transform(train) 106 | 107 | # visualizing it 108 | train = ( 109 | train 110 | .set_index(elc.timestamp) 111 | .resample("D")[["wip"]] 112 | .mean() 113 | .reset_index() 114 | ) 115 | plt.figure(figsize=(10, 5)) 116 | plt.plot(pd.to_datetime(train[elc.timestamp]), train["wip"]) 117 | plt.title("Average daily \nWork in Progress (WIP) over time") 118 | 119 | # %% 120 | # In this tutorial, we showed how to extract features from timestamps, 121 | # resources, and the inter-case perspective. We hope you find it useful 122 | # for your projects. If you have any questions or suggestions, please 123 | # open an issue on our GitHub repository or 124 | # `contact me `_ directly. -------------------------------------------------------------------------------- /examples/plot_download_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloading event logs via API 3 | ============================== 4 | 5 | This example demonstrates how we can easily download well-known process mining event logs 6 | from the 4TU.Centre for Research Data using the `skpm.event_logs` module. 7 | 8 | The `skpm.event_logs` module provides a set of event logs, such as the Sepsis and BPI 2012. 9 | """ 10 | 11 | # %% 12 | # The API overview 13 | # ---------------- 14 | # Implementing each event log as a class is a design choice that allows us to 15 | # easily manipulate each of them according to their specific characteristics. 16 | # One of the main challenges in process mining is the completely different 17 | # nature of datasets, since 18 | # each of them is composed of very particular business rules. 19 | # 20 | # For instance, an unbiased split of event logs was proposed in [1]. Roughly 21 | # speaking, each event log is splitted based on specific temporal 22 | # characteristics, which is hard coded within each specific event log. You can 23 | # check this feature in :ref:`Unbiased split 24 | # `. 25 | # 26 | # Now, let us see how to easily download event logs below. 27 | # 28 | # Downloading the BPI 2013 event log 29 | # ---------------------------------- 30 | # The BPI 2013 event log is a well-known event log that contains data about 31 | # closed problems from the Volvo IT Belgium. We can easily download it as 32 | # follows: 33 | 34 | from skpm.event_logs import BPI13ClosedProblems 35 | 36 | bpi13 = BPI13ClosedProblems() # this will automatically download it 37 | bpi13 38 | 39 | # %% 40 | # Notice, the `__repr__`method returns a brief overview of the event log. 41 | # In order to acess the dataframe, just call the `dataframe` attribute. 42 | 43 | bpi13.dataframe.head() 44 | 45 | # %% 46 | # In this tutorial, we showed how to user our API to automatically 47 | # download event logs from the `4TU Repository `_. 48 | # We hope you find it useful 49 | # for your projects. If you have any questions or suggestions, please 50 | # open an issue on our GitHub repository or 51 | # `contact me `_ directly. 52 | # 53 | # References 54 | # ---------- 55 | # [1] Hans Weytjens, Jochen De Weerdt. Creating Unbiased Public Benchmark Datasets with Data Leakage Prevention for Predictive Process Monitoring, 2021. 
doi: 10.1007/978-3-030-94343-1_2 -------------------------------------------------------------------------------- /examples/plot_pipeline_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline selection 3 | ================== 4 | 5 | In this tutorial, we will learn how to choose a suitable pipeline for 6 | a PPM task. We will compare two approaches for preparing data before 7 | training Gradient Boosting and Random Forest regressors. The first 8 | approach uses more detailed steps, including timestamp features, 9 | one-hot encoding, and resource pool extraction. The second approach is 10 | simpler and relies only on one-hot encoding. We will train each type 11 | of model with each approach to see how they perform. 12 | """ 13 | 14 | # %% 15 | # Let us first import the necessary libraries and set the random seed 16 | # for reproducibility. 17 | 18 | import numpy as np 19 | import pandas as pd 20 | 21 | from sklearn.pipeline import Pipeline 22 | from sklearn.compose import ColumnTransformer 23 | from sklearn.metrics import root_mean_squared_error 24 | from sklearn.preprocessing import StandardScaler, OneHotEncoder 25 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 26 | 27 | from skpm.sequence_encoding import Aggregation 28 | from skpm.config import EventLogConfig as elc 29 | from skpm.event_logs import BPI20PrepaidTravelCosts, split 30 | from skpm.feature_extraction.targets import remaining_time 31 | from skpm.feature_extraction import TimestampExtractor, ResourcePoolExtractor 32 | 33 | # Set random state for reproducible results 34 | RANDOM_STATE = 44 35 | np.random.seed(RANDOM_STATE) 36 | 37 | # %% 38 | # Below we load one of the BPI20 event logs, select relevant columns 39 | # for this example, extract the remaining time to use as the target, 40 | # and split the data into train and test sets.1 41 | 42 | # Load event log data 43 | log = BPI20PrepaidTravelCosts() 44 | 45 | # Select basic columns 46 | df = log.dataframe[[elc.case_id, elc.activity, elc.resource, elc.timestamp]].copy() 47 | df[elc.timestamp] = pd.to_datetime(df[elc.timestamp], utc=True) 48 | 49 | # Compute remaining time in seconds 50 | df["remaining_time"] = remaining_time(df, time_unit="seconds") 51 | 52 | # Split into train/test sets using provided split method 53 | train, test = split.unbiased(df, **log.unbiased_split_params) 54 | 55 | # Separate features and targets for train and test 56 | X_train = train.drop(columns=["remaining_time"]) 57 | y_train = train["remaining_time"] 58 | X_test = test.drop(columns=["remaining_time"]) 59 | y_test = test["remaining_time"] 60 | 61 | # %% 62 | # Defining an advanced and a simple preprocessing pipeline 63 | # -------------------------------------------------------- 64 | # We will define two pipelines for preprocessing the data before 65 | # training the models. 
66 | 67 | # Advanced preprocessing pipeline 68 | data_prep_advanced = Pipeline([ 69 | ("preprocessing", ColumnTransformer( 70 | transformers=[ 71 | ("timestamp_features", TimestampExtractor(), [elc.timestamp, elc.case_id]), 72 | ("activity_encode", OneHotEncoder(sparse_output=False), [elc.activity]), 73 | ("resource_pool", ResourcePoolExtractor(), [elc.case_id, elc.activity, elc.resource]), 74 | ("case_id_pass", "passthrough", [elc.case_id]), 75 | ])), 76 | ("encode_agg", Aggregation(method="mean", prefix_len=6)), 77 | ("scaling", StandardScaler()), 78 | ]) 79 | 80 | data_prep_simple = Pipeline([ 81 | ("preprocessing", ColumnTransformer( 82 | transformers=[ 83 | ("activity_encode", OneHotEncoder(sparse_output=False), [elc.activity]), 84 | ("case_id_pass", "passthrough", [elc.case_id]), 85 | ])), 86 | ("encode_agg", Aggregation(method="mean", prefix_len=6)), 87 | ("scaling", StandardScaler()), 88 | ]) 89 | 90 | # %% 91 | # Training the models 92 | # ------------------- 93 | # We will train two Gradient Boosting and two Random Forest models 94 | # using the advanced and simple preprocessing pipelines. We will then 95 | # evaluate the models using the root mean squared error (RMSE) metric. 96 | 97 | # Gradient Boosting pipelines 98 | gb_pipe_advanced = Pipeline([ 99 | ("preprocessing", data_prep_advanced), 100 | ("regressor", GradientBoostingRegressor(random_state=RANDOM_STATE)) 101 | ]) 102 | 103 | gb_pipe_simple = Pipeline([ 104 | ("preprocessing", data_prep_simple), 105 | ("regressor", GradientBoostingRegressor(random_state=RANDOM_STATE)) 106 | ]) 107 | 108 | # Random Forest pipelines 109 | rf_pipe_advanced = Pipeline([ 110 | ("preprocessing", data_prep_advanced), 111 | ("regressor", RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE)) 112 | ]) 113 | 114 | rf_pipe_simple = Pipeline([ 115 | ("preprocessing", data_prep_simple), 116 | ("regressor", RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE)) 117 | ]) 118 | 119 | # %% 120 | # Fit all models: 121 | 122 | # Fit all models 123 | gb_pipe_advanced.fit(X_train, y_train) 124 | gb_pipe_simple.fit(X_train, y_train) 125 | rf_pipe_advanced.fit(X_train, y_train) 126 | rf_pipe_simple.fit(X_train, y_train) 127 | 128 | # Print scores 129 | print("GB-advanced score:", root_mean_squared_error(y_test, gb_pipe_advanced.predict(X_test))) 130 | print("GB-simple score:", root_mean_squared_error(y_test, gb_pipe_simple.predict(X_test))) 131 | print("RF-advanced score:", root_mean_squared_error(y_test, rf_pipe_advanced.predict(X_test))) 132 | print("RF-simple score:", root_mean_squared_error(y_test, rf_pipe_simple.predict(X_test))) 133 | 134 | scores = pd.DataFrame({ 135 | "model": ["GB1", "GB2", "RF1", "RF2"], 136 | "score": [ 137 | root_mean_squared_error(y_test, gb_pipe_advanced.predict(X_test)), 138 | root_mean_squared_error(y_test, gb_pipe_simple.predict(X_test)), 139 | root_mean_squared_error(y_test, rf_pipe_advanced.predict(X_test)), 140 | root_mean_squared_error(y_test, rf_pipe_simple.predict(X_test)) 141 | ] 142 | }) 143 | 144 | 145 | # %% 146 | # Visualizing the results 147 | # ----------------------- 148 | # In this step, we will look at the RMSE scores to understand how well each model performed. 149 | # At first glance, GB1 appears weaker than RF2, which might lead us to believe that Random Forest 150 | # is the better choice. However, this comparison is not fair, because each model was trained using 151 | # a different preprocessing pipeline. 
To make a fair comparison, we need to examine models that 152 | # use the same data preparation steps. 153 | # 154 | # When we compare models trained with the same preprocessing pipeline, we see that the Gradient 155 | # Boosting model actually scores better than both Random Forest pipelines. This shows how important 156 | # it is to evaluate models under consistent preprocessing conditions to accurately judge their 157 | # performance. 158 | 159 | import matplotlib.pyplot as plt 160 | plt.style.use("ggplot") 161 | 162 | scores.plot( 163 | kind="barh", 164 | x="model", 165 | y="score", 166 | color="steelblue", 167 | legend=False, 168 | figsize=(8, 4) 169 | ) 170 | plt.ylabel("") 171 | plt.xlabel("RMSE") 172 | plt.xscale("log") 173 | plt.tight_layout() -------------------------------------------------------------------------------- /examples/plot_rt_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Remaining Time Prediction Pipeline 3 | ================================== 4 | 5 | This example demonstrates how to build a pipeline for remaining time prediction 6 | using the BPI 2013 Closed Problems dataset. 7 | 8 | The pipeline consists of the following steps: 9 | 1. Preprocessing: Extracts features from the event log. 10 | 2. Encoding: Aggregates the extracted features. 11 | 3. Regression: Fits a regression model to predict the remaining time. 12 | 13 | The pipeline is evaluated using the R^2 score. We conclude by showing a trick 14 | to improve the performance of the regression model by transforming the target 15 | using `sklearn.compose.TransformedTargetRegressor`. 16 | """ 17 | 18 | # %% 19 | # Required imports 20 | # ------------------ 21 | # We start by importing the required modules and classes. 22 | from sklearn.compose import TransformedTargetRegressor 23 | import numpy as np 24 | import pandas as pd 25 | 26 | from sklearn.preprocessing import StandardScaler, OneHotEncoder 27 | from sklearn.pipeline import Pipeline, FunctionTransformer 28 | from sklearn.compose import ColumnTransformer 29 | from sklearn.ensemble import RandomForestRegressor 30 | 31 | from skpm.sequence_encoding import Aggregation 32 | from skpm.config import EventLogConfig as elc 33 | from skpm.feature_extraction import ( 34 | TimestampExtractor, 35 | ResourcePoolExtractor, 36 | ) 37 | from skpm.feature_extraction.targets import remaining_time 38 | from skpm.event_logs import BPI13ClosedProblems 39 | 40 | # %% 41 | # Download the example dataset 42 | # ---------------------------- 43 | # We can automatically download event logs using SkPM. 44 | # In this example, let's use the :class:`~skpm.event_logs.BPI13ClosedProblems`. 45 | log = BPI13ClosedProblems() 46 | log # Note: this is a TUEventLog object, not a dataframe 47 | 48 | # %% 49 | # Subsequently, let's access the `pd.DataFrame` and 50 | # extract the target variable `remaining_time` using the 51 | # :func:`~skpm.event_feature_extraction.targets.remaining_time` function. 52 | log = log.dataframe.copy() 53 | log = log[[elc.case_id, elc.activity, elc.resource, elc.timestamp]] 54 | 55 | # extract the target variable 56 | log.loc[:, "remaining_time"] = remaining_time(log, time_unit="seconds") 57 | 58 | # In order to keep this example simple, we are skipping the train-test split. 59 | X_train = log.drop(columns=["remaining_time"]) 60 | y_train = log["remaining_time"] 61 | 62 | log.head() 63 | 64 | # %% 65 | # Build the pipeline 66 | # ------------------ 67 | # We build the pipeline by creating a sequence of steps. 
68 | # The pipeline consists of the following steps: 69 | # 70 | # 1. **Preprocessing**: Extracts features from the event log. 71 | # 72 | # 2. **Encoding and normalizing**: Aggregates the extracted features and 73 | # applies the StandardScaler. 74 | # 75 | # 3. **Regression**: Fits a regression model to predict the remaining time. 76 | # 77 | # We create a `ColumnTransformer` to apply different transformations to 78 | # different columns. More specifically, we apply the following transformations: 79 | # 80 | # - `TimestampExtractor` to extract timestamp features. 81 | # 82 | # - `OneHotEncoder` to encode the activity column. 83 | # 84 | # - `ResourcePoolExtractor` to extract resource pool of each activity. 85 | transformers = ColumnTransformer( 86 | transformers=[ 87 | ( 88 | "timestamp_features", 89 | TimestampExtractor(), 90 | [elc.timestamp, elc.case_id], 91 | ), 92 | (elc.activity, OneHotEncoder(sparse_output=False), [elc.activity]), 93 | ( 94 | elc.resource, 95 | ResourcePoolExtractor(), 96 | [elc.case_id, elc.activity, elc.resource], 97 | ), 98 | (elc.case_id, "passthrough", [elc.case_id]), 99 | ] 100 | ) 101 | 102 | # %% 103 | # Integrating the preprocessing transformers with the full pipeline. 104 | # The pipeline will transformer/extractu features, encode the traces, 105 | # normalize the features, and fit a regression model to predict the remaining 106 | # time. 107 | pipe = Pipeline( 108 | [ 109 | ("preprocessing", transformers), 110 | ("encoding", Aggregation(method="mean")), 111 | ("scaling", StandardScaler()), 112 | ("regressor", RandomForestRegressor()), 113 | ] 114 | ) 115 | 116 | print(pipe.fit(X_train, y_train).score(X_train, y_train)) 117 | 118 | # %% 119 | # We can leverage the `TransformedTargetRegressor` class to improve the 120 | # performance of the regression model. This class allows us to transform the 121 | # target variable using a transformer before fitting the model. In this example, 122 | # we use the `FunctionTransformer` class to apply the `log1p` transformation to 123 | # the target. The pipeline will output the target in the original scale since 124 | # we set the `inverse_func` parameter to `np.expm1`. 125 | # 126 | # Such trick allows us to enhance the predictive performance of the model. 127 | 128 | # sphinx_gallery_start_ignore 129 | import warnings 130 | warnings.filterwarnings("ignore") 131 | # sphinx_gallery_end_ignore 132 | y_trans = FunctionTransformer(np.log1p, inverse_func=np.expm1) 133 | regr = TransformedTargetRegressor(regressor=pipe, transformer=y_trans) 134 | 135 | print(regr.fit(X_train, y_train).score(X_train, y_train)) 136 | 137 | # %% 138 | # In this tutorial, we showed how to run an end-to-end predictive 139 | # process monitoring pipleine. We hope you find it useful 140 | # for your projects. If you have any questions or suggestions, please 141 | # open an issue on our GitHub repository or 142 | # `contact me `_ directly. -------------------------------------------------------------------------------- /examples/plot_unbiased_split.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Unbiased Split of Event Logs 4 | ============================ 5 | 6 | In this tutorial we provide an overview of how the unbiased split of event 7 | logs [1] works and how to use it in the `skpm` package. 
8 | """ 9 | 10 | # %% 11 | # The `biased` split problem 12 | # -------------------------- 13 | # In machine learning, standardizing how datasets are split is a common, often 14 | # essential, practice to ensure fair and reproducible results. However, in the 15 | # field of Process Mining, machine learning applications have not consistently 16 | # adopted this practice. Weytjens and De Weerdt's work [1] proposes the first 17 | # significant effort to address this gap. 18 | # 19 | # More specifically, their paper tackles three key challenges: 20 | # 21 | # 1. **Inconsistent Dataset Split**: Different datasets and preprocessing 22 | # methods make it hard to compare research outcomes. Certain preprocessing 23 | # choices can even lead to biased results due to the use of domain knowledge 24 | # that may not be accessible to all researchers. 25 | # 26 | # 2. **Data Leakage**: Training and test sets often overlap, with events from 27 | # the same case appearing in both, which leads to overfitted performance 28 | # measures and inaccurate predictions. 29 | # 30 | # 3. **Test Set Bias**: The test sets frequently suffer from bias due to 31 | # unequal distributions of case durations and active cases, especially at the 32 | # start and end of the dataset. This skews evaluation results, making them 33 | # less reflective of real-world performance. 34 | # 35 | # The `SkPM` package adapted the available code from the authors' GitHub [2]. 36 | # 37 | # Unbised Split API 38 | # ----------------- 39 | # Only a few datasets are currently supported by the unbiased split method. 40 | # The usage is really simple and can be seen in the following example: 41 | 42 | from skpm.event_logs import split, BPI20RequestForPayment 43 | 44 | bpi20 = BPI20RequestForPayment() 45 | 46 | train, test = split.unbiased(bpi20, **bpi20.unbiased_split_params) 47 | train.shape, test.shape 48 | 49 | # %% 50 | # The hyperparameters for the unbiased split are hardcoded in the original 51 | # implementation. However, they are 52 | # derived based on an data-driven analysis. In the future, we may consider 53 | # to implement this generic approach in order to extend the unbiased split 54 | # to other datasets. The hardcoded hyperparameters are: 55 | # 56 | # - `start_date`: the start date of the event log. 57 | # - `end_date`: the end date of the event log. 58 | # - `max_days`: the maximum duration of cases. 59 | # 60 | bpi20.unbiased_split_params 61 | 62 | # %% 63 | # For datasets without hardcoded hyperparameters, an exception will be raised: 64 | 65 | from skpm.event_logs import Sepsis 66 | 67 | sepsis = Sepsis() 68 | try: 69 | _ = split.unbiased(sepsis, **sepsis.unbiased_split_params) 70 | except Exception as e: 71 | print(e) 72 | 73 | # %% 74 | # The unbiased split is available for the following datasets: 75 | # 76 | # - :class:`~skpm.event_logs.BPI12` 77 | # - :class:`~skpm.event_logs.BPI17` 78 | # - :class:`~skpm.event_logs.BPI19` 79 | # - :class:`~skpm.event_logs.BPI20PrepaidTravelCosts` 80 | # - :class:`~skpm.event_logs.BPI20TravelPermitData` 81 | # - :class:`~skpm.event_logs.BPI20RequestForPayment` 82 | # 83 | # %% 84 | # In this tutorial, we showed to use the unbiased split API. 85 | # We hope you find it useful 86 | # for your projects. If you have any questions or suggestions, please 87 | # open an issue on our GitHub repository or 88 | # `contact me `_ directly. 89 | # 90 | # References 91 | # ---------- 92 | # [1] Hans Weytjens, Jochen De Weerdt. 
Creating Unbiased Public Benchmark 93 | # Datasets with Data Leakage Prevention for Predictive Process Monitoring, 94 | # 2021. doi: 10.1007/978-3-030-94343-1_2 95 | # [2] https://github.com/hansweytjens/predictive-process-monitoring-benchmarks -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "skpm" 3 | version = "0.0.1" 4 | description = "A process mining library built upon scikit-learn!" 5 | authors = ["Rafael Oyamada"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.10" 11 | scikit-learn = ">=1.6.1" 12 | pandas = ">=2.2.0" 13 | pyarrow = ">=16.0.0" 14 | polars = ">=0.20.16" 15 | lxml = "5.1.0" 16 | 17 | [tool.poetry.group.dev.dependencies] 18 | pytest = ">=7.4.4" 19 | pytest-cov = ">=4.1.0" 20 | sphinx-autoapi = ">=3.0.0" 21 | sphinx-rtd-theme = ">=2.0.0" 22 | sphinx-gallery = ">=0.17.1" 23 | black = ">=24.1.1" 24 | codecov = ">=2.1.13" 25 | mypy = ">=1.8.0" 26 | coverage = ">=7.4.1" 27 | matplotlib = ">=3.8" 28 | 29 | [build-system] 30 | requires = ["poetry-core>=1.0.0"] 31 | build-backend = "poetry.core.masonry.api" 32 | 33 | [tool.black] 34 | line-length = 80 -------------------------------------------------------------------------------- /src/skpm/__init__.py: -------------------------------------------------------------------------------- 1 | # read version from installed package 2 | from importlib.metadata import version 3 | 4 | __version__ = version("skpm") 5 | 6 | 7 | from sklearn import set_config 8 | 9 | set_config(transform_output="pandas") 10 | -------------------------------------------------------------------------------- /src/skpm/base.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from pandas import DataFrame 3 | from sklearn.base import BaseEstimator 4 | from sklearn.utils.validation import validate_data 5 | 6 | from .config import EventLogConfig as elc 7 | from .utils.validation import ensure_list, validate_columns 8 | 9 | 10 | class BaseProcessEstimator(BaseEstimator): 11 | """Base class for all process estimators in skpm. 12 | 13 | This class implements a common interface for all process, 14 | aiming at standardizing the validation and transformation 15 | of event logs. 16 | 17 | For instance, all event logs must have a `case_id` column. 18 | """ 19 | 20 | def _validate_log( 21 | self, 22 | X: DataFrame, 23 | y: DataFrame = None, 24 | reset: bool = True, 25 | copy: bool = True, 26 | ): 27 | """ 28 | Validate and preprocess the input event log DataFrame. 29 | 30 | Parameters 31 | ---------- 32 | X : DataFrame 33 | The input DataFrame representing the event log. 34 | y : DataFrame, default=None 35 | The target DataFrame associated with the event log. 36 | reset : bool, default=True 37 | Whether to reset the index of the DataFrame after validation. 38 | copy : bool, default=True 39 | Whether to create a copy of the DataFrame before validation. 40 | 41 | Returns 42 | ------- 43 | DataFrame 44 | The preprocessed and validated event log DataFrame. 45 | 46 | Raises 47 | ------ 48 | ValueError 49 | If the input is not a DataFrame or if the case ID column is missing. 
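
        Notes
        -----
        Polars DataFrames are also accepted: the input is converted to pandas
        for validation and converted back to Polars before being returned.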
50 | """ 51 | is_polars = False 52 | if isinstance(X, pl.DataFrame): # For Polars DataFrame 53 | X = X.to_pandas() 54 | is_polars = True 55 | 56 | self._validate_params() 57 | 58 | # TODO: the validation of a dataframe might be done 59 | # through the `pd.api.extensions`. 60 | # This would decrease the dependency between data validation 61 | # and sklearn estimators. 62 | # See: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-pandas 63 | data = X.copy() if copy else X 64 | 65 | # despite the bottlenecks, event logs are better handled as dataframes 66 | assert isinstance(data, DataFrame), "Input must be a dataframe." 67 | cols = ensure_list(data.columns) 68 | 69 | self._case_id = self._ensure_case_id(data.columns) 70 | 71 | validate_data( 72 | self, 73 | X=X.drop(columns=self._case_id, axis=1), 74 | y=y, 75 | reset=reset, 76 | ) 77 | 78 | cols = validate_columns( 79 | input_columns=data.columns, 80 | required=[self._case_id] + list(self.feature_names_in_), 81 | ) 82 | 83 | if is_polars: # For Polars DataFrame 84 | data = pl.from_pandas(data) 85 | return data[cols] 86 | 87 | def _ensure_case_id(self, columns: list[str]): 88 | """ 89 | Ensure that the case ID column is present in the list of columns. 90 | 91 | Parameters 92 | ---------- 93 | columns : list[str] 94 | The list of column names to check for the presence of the case ID. 95 | 96 | Returns 97 | ------- 98 | bool 99 | True if the case ID column is found, False otherwise. 100 | """ 101 | for col in columns: 102 | if col.endswith(elc.case_id): 103 | return col 104 | raise ValueError(f"Case ID column not found.") 105 | -------------------------------------------------------------------------------- /src/skpm/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | EndOfTrace = str 4 | 5 | @dataclass 6 | class EventLogConfig: 7 | case_id: str = "case:concept:name" 8 | activity: str = "concept:name" 9 | resource: str = "org:resource" 10 | timestamp: str = "time:timestamp" 11 | 12 | default_file_format: str = ".parquet" 13 | 14 | EOT: EndOfTrace = "EOT" 15 | 16 | def update(self, **kwargs): 17 | for key, value in kwargs.items(): 18 | setattr(self, key, value) -------------------------------------------------------------------------------- /src/skpm/event_logs/__init__.py: -------------------------------------------------------------------------------- 1 | from .bpi import ( 2 | BPI12, 3 | BPI13ClosedProblems, 4 | BPI13Incidents, 5 | BPI13OpenProblems, 6 | BPI17, 7 | BPI19, 8 | BPI20PrepaidTravelCosts, 9 | BPI20TravelPermitData, 10 | BPI20RequestForPayment, 11 | BPI20DomesticDeclarations, 12 | BPI20InternationalDeclarations, 13 | Sepsis, 14 | ) 15 | 16 | from .parser import read_xes 17 | 18 | __all__ = [ 19 | "BPI12", 20 | "BPI13ClosedProblems", 21 | "BPI13Incidents", 22 | "BPI13OpenProblems", 23 | "BPI17", 24 | "BPI19", 25 | "BPI20PrepaidTravelCosts", 26 | "BPI20TravelPermitData", 27 | "BPI20RequestForPayment", 28 | "BPI20DomesticDeclarations", 29 | "BPI20InternationalDeclarations", 30 | "Sepsis", 31 | "read_xes", 32 | ] 33 | -------------------------------------------------------------------------------- /src/skpm/event_logs/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | 5 | from skpm.config import EventLogConfig as elc 6 | from skpm.event_logs.parser import read_xes 7 | from skpm.event_logs.download import download_url 8 | from 
skpm.event_logs.extract import extract_gz 9 | 10 | 11 | class BasePreprocessing: 12 | def preprocess(self): 13 | """ 14 | Preprocess the event log by converting the timestamp column to 15 | datetime format. 16 | """ 17 | self._dataframe[elc.timestamp] = pd.to_datetime( 18 | self._dataframe[elc.timestamp], utc=True, format="mixed" 19 | ) 20 | 21 | 22 | class TUEventLog(BasePreprocessing): 23 | """ 24 | Base class for event logs from the 4TU repository. 25 | 26 | It provides the basic structure for downloading, preprocessing, and 27 | splitting 28 | Furthermore, it provides the basic structure for caching the logs. 29 | 30 | Event logs from the 4tu repository [1] are downloaded as .xes.gz files 31 | and then converted to parquet files. The parquet files are then used to 32 | load the event logs. 33 | By default, we keep the .xes files in the raw folder 34 | 35 | Parameters 36 | ---------- 37 | root_folder : str, optional 38 | Path where the event log will be stored. Defaults to "./data". 39 | save_as_pandas : bool, optional 40 | Whether to save the event log as a Pandas DataFrame. Defaults to 41 | True. 42 | train_set : bool, optional 43 | Whether the event log is for the training set. Defaults to True. 44 | file_path : str, optional 45 | Path to the event log file. If None, the file will be downloaded. 46 | Defaults to None. 47 | 48 | References: 49 | ----------- 50 | [1] 4TU Research Data: https://data.4tu.nl/ 51 | """ 52 | 53 | url: str = None 54 | md5: str = None 55 | file_name: str = None 56 | meta_data: str = None # TODO: download DATA.xml from the 4TU repository 57 | 58 | _unbiased_split_params: dict = None 59 | 60 | def __init__( 61 | self, 62 | root_folder: str = "./data", 63 | save_as_pandas: bool = True, 64 | train_set: bool = True, 65 | file_path: str = None, 66 | ) -> None: 67 | super().__init__() 68 | self.root_folder = root_folder 69 | self.save_as_pandas = save_as_pandas 70 | self.train_set = train_set 71 | 72 | if file_path is None: 73 | self._file_path = os.path.join( 74 | self.root_folder, 75 | self.__class__.__name__, 76 | self.file_name.replace(".gz", "").replace( 77 | ".xes", elc.default_file_format 78 | ), 79 | ) 80 | else: 81 | self._file_path = file_path 82 | 83 | if not os.path.exists(self.file_path): 84 | self.download() 85 | 86 | self._dataframe = self.read_log() 87 | self.preprocess() 88 | 89 | @property 90 | def dataframe(self) -> pd.DataFrame: 91 | """ 92 | pd.DataFrame: DataFrame containing the event log data. 93 | """ 94 | return self._dataframe 95 | 96 | @property 97 | def file_path(self) -> str: 98 | """ 99 | str: Path to the event log file. 100 | """ 101 | return self._file_path 102 | 103 | @file_path.setter 104 | def file_path(self, value): 105 | self._file_path = value 106 | 107 | @property 108 | def unbiased_split_params(self) -> dict: 109 | """ 110 | dict: Parameters for the unbiased split of the event log. 111 | """ 112 | if self._unbiased_split_params is None: 113 | raise ValueError( 114 | f"Unbiased split not available for {self.__class__.__name__}." 115 | ) 116 | return self._unbiased_split_params 117 | 118 | def __len__(self): 119 | """ 120 | Get the number of events in the event log. 121 | 122 | Returns 123 | ------- 124 | int 125 | Number of events in the event log. 126 | """ 127 | return len(self._dataframe) 128 | 129 | def download(self) -> None: 130 | """Generic method to download the event log from the 4TU Repository. 131 | 132 | It downloads the event log from the url, uncompresses 133 | it, and stores it. 
It can be overwritten by the 134 | subclasses if needed. 135 | """ 136 | destination_folder = os.path.join("data", self.__class__.__name__) 137 | print(f"Downloading {destination_folder}") 138 | path = download_url( 139 | url=self.url, folder=destination_folder, file_name=self.file_name 140 | ) 141 | if path.endswith(".xes"): 142 | self.file_path = path 143 | return 144 | 145 | if path.endswith(".gz"): 146 | self.file_path = extract_gz( 147 | path=path, folder=os.path.dirname(destination_folder) 148 | ) 149 | # TODO: elif other formats 150 | os.remove(path) 151 | 152 | def read_log(self) -> pd.DataFrame: 153 | """ 154 | Read the event log from the file. 155 | 156 | Returns 157 | ------- 158 | pd.DataFrame 159 | DataFrame containing the event log data. 160 | """ 161 | if self.file_path.endswith(".xes"): 162 | log = read_xes(self.file_path) 163 | 164 | if self.save_as_pandas: 165 | new_file_path = self.file_path.replace( 166 | ".xes", elc.default_file_format 167 | ) 168 | if elc.default_file_format == ".parquet": 169 | log.to_parquet(new_file_path) 170 | else: 171 | raise ValueError("File format not implemented.") 172 | os.remove(self.file_path) 173 | self.file_path = new_file_path 174 | 175 | elif self.file_path.endswith(elc.default_file_format): 176 | log = pd.read_parquet(self.file_path) 177 | else: 178 | raise ValueError("File format not implemented.") 179 | 180 | return log 181 | 182 | def __repr__(self) -> str: 183 | """ 184 | Return a string representation of the TUEventLog object. 185 | 186 | Returns 187 | ------- 188 | str 189 | String representation of the TUEventLog object. 190 | """ 191 | head = f"{self.__class__.__name__} Event Log" 192 | body = [f"Number of cases: {self._dataframe[elc.case_id].nunique()}"] 193 | body.append(f"Number of events: {self.__len__()}") 194 | if self.file_path is not None: 195 | body.append( 196 | f"Event log location: {os.path.normpath(self.file_path)}" 197 | ) 198 | body += "".splitlines() 199 | lines = [head] + [" " * 4 + line for line in body] 200 | return "\n".join(lines) 201 | -------------------------------------------------------------------------------- /src/skpm/event_logs/bpi.py: -------------------------------------------------------------------------------- 1 | from skpm.event_logs.base import TUEventLog 2 | 3 | 4 | class BPI12(TUEventLog): 5 | """:doi:`BPI Challenge 2012 event log 6 | <10.4121/uuid:3926db30-f712-4394-aebc-75976070e91f>`. 7 | 8 | This dataset is from the Business Process Intelligence (BPI) Challenge 9 | 2012 and contains event logs from a real-life financial institution. The 10 | event log records the execution of various activities related to a loan 11 | application process. Each event in the log represents a step in handling a 12 | loan request, with relevant information about the case, timestamp, and 13 | resource involved. 14 | 15 | 16 | Parameters 17 | ---------- 18 | root_folder (str, optional): Path where the event log will be stored. 19 | Defaults to "data/". 20 | save_as_pandas (bool, optional): Whether to save the event log as a 21 | pandas parquet file. 22 | Defaults to True. 23 | train_set (bool, optional): Whether to use the train set or the test 24 | set. 25 | If True, use the train set. If False, use the test set. Defaults 26 | to True. 
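    Notes
    -----
    BPI12 also ships the hardcoded parameters used for the unbiased
    train/test split of Weytjens and De Weerdt, exposed as
    ``unbiased_split_params``. A minimal sketch of how they are meant to be
    used (variable names here are illustrative):

    >>> from skpm.event_logs import split, BPI12
    >>> bpi_12 = BPI12()
    >>> train, test = split.unbiased(bpi_12, **bpi_12.unbiased_split_params)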
27 | 28 | Examples 29 | -------- 30 | >>> bpi_12 = BPI12() 31 | >>> bpi_12.download() # Manually download the event log 32 | >>> event_log = bpi_12.dataframe() # Access the event log DataFrame 33 | """ 34 | 35 | url: str = ( 36 | "https://data.4tu.nl/file/533f66a4-8911-4ac7-8612-1235d65d1f37/3276db7f-8bee-4f2b-88ee-92dbffb5a893" 37 | ) 38 | md5: str = "74c7ba9aba85bfcb181a22c9d565e5b5" 39 | file_name: str = "BPI_Challenge_2012.xes.gz" 40 | 41 | _unbiased_split_params: dict = { 42 | "start_date": None, 43 | "end_date": "2012-02", 44 | "max_days": 32.28, 45 | } 46 | 47 | 48 | class BPI13ClosedProblems(TUEventLog): 49 | """:doi:`BPI Challenge 2013 Closed problems event log 50 | `. 51 | 52 | The BPI 2013 Closed Problems log consists of 1487 cases and 6660 events. 53 | It originates from the problem management process of Volvo IT Belgium, 54 | focusing on cases where problems were diagnosed and resolved to enhance IT 55 | service quality. 56 | 57 | 58 | 59 | Parameters 60 | ---------- 61 | root_folder : str, optional 62 | Path where the event log will be stored. Defaults to "data/". 63 | save_as_pandas : bool, optional 64 | Whether to save the event log as a pandas parquet file. Defaults to 65 | True. 66 | train_set : bool, optional 67 | Whether to use the train set or the test set. If True, use the train 68 | set. If False, use the test set. Defaults to True. 69 | 70 | Examples 71 | -------- 72 | >>> bpi_13_incidents = BPI13Incidents() 73 | >>> bpi_13_incidents.download() # Manually download the event log 74 | >>> event_log = bpi_13_incidents.dataframe() # Access the event log DataFrame 75 | """ 76 | 77 | url: str = ( 78 | "https://data.4tu.nl/file/1987a2a6-9f5b-4b14-8d26-ab7056b17929/8b99119d-9525-452e-bc8f-236ac76fa9c9" 79 | ) 80 | md5: str = "4f9c35942f42cb90d911ee4936bbad87" 81 | file_name: str = "BPI_Challenge_2013_closed_problems.xes.gz" 82 | 83 | 84 | class BPI13Incidents(TUEventLog): 85 | """:doi:`BPI Challenge 2013 Incidents 86 | `. 87 | 88 | The BPI 2013 Incidents log contains 7554 cases and 65533 events. 89 | It is part of the incident management process at Volvo IT Belgium, 90 | aimed at restoring normal service operations for customers as quickly as 91 | possible, while maintaining high levels of service quality and 92 | availability. 93 | 94 | Parameters 95 | ---------- 96 | root_folder : str, optional 97 | Path where the event log will be stored. Defaults to "data/". 98 | save_as_pandas : bool, optional 99 | Whether to save the event log as a pandas parquet file. Defaults to True. 100 | train_set : bool, optional 101 | Whether to use the train set or the test set. If True, use the train set. If False, use the test set. Defaults to True. 102 | 103 | Examples 104 | -------- 105 | >>> bpi_13_open_problems = BPI13OpenProblems() 106 | >>> bpi_13_open_problems.download() # Manually download the event log 107 | >>> event_log = bpi_13_open_problems.dataframe() # Access the event log DataFrame 108 | """ 109 | 110 | url: str = ( 111 | "https://data.4tu.nl/file/0fc5c579-e544-4fab-9143-fab1f5192432/aa51ffbb-25fd-4b5a-b0b8-9aba659b7e8c" 112 | ) 113 | md5: str = "d4809bd55e3e1c15b017ab4e58228297" 114 | file_name: str = "BPI_Challenge_2013_incidents.xes.gz" 115 | 116 | 117 | class BPI13OpenProblems(TUEventLog): 118 | """:doi:`BPI Challenge 2013 open problems 119 | `. 120 | 121 | The BPI 2013 Open Problems log contains 819 cases and 2351 events. 
122 | It originates from the problem management process of Volvo IT Belgium, 123 | focusing on unresolved problems that are still open and require further 124 | diagnosis and action to improve IT service quality. 125 | 126 | 127 | Parameters 128 | ---------- 129 | root_folder : str, optional 130 | Path where the event log will be stored. Defaults to "data/". 131 | save_as_pandas : bool, optional 132 | Whether to save the event log as a pandas parquet file. Defaults to 133 | True. 134 | train_set : bool, optional 135 | Whether to use the train set or the test set. If True, use the train 136 | set. If False, use the test set. Defaults to True. 137 | 138 | Examples 139 | -------- 140 | >>> bpi_13_open_problems = BPI13OpenProblems() 141 | >>> bpi_13_open_problems.download() # Manually download the event log 142 | >>> event_log = bpi_13_open_problems.dataframe() # Access the event log DataFrame 143 | """ 144 | 145 | url: str = ( 146 | "https://data.4tu.nl/file/7aafbf5b-97ae-48ba-bd0a-4d973a68cd35/0647ad1a-fa73-4376-bdb4-1b253576c3a1" 147 | ) 148 | md5: str = "9663e544a2292edf1fe369747736e7b4" 149 | file_name: str = "BPI_Challenge_2013_open_problems.xes.gz" 150 | 151 | 152 | class BPI17(TUEventLog): 153 | """:doi:`BPI Challenge 2017 154 | `. 155 | 156 | The BPI 2017 event log originates from a loan application process at a 157 | Dutch financial institution. The data encompasses all loan applications 158 | submitted through an online system. This event log follows the same 159 | company and process as the BPI Challenge 2012. A notable feature of 160 | the new system is its ability to handle multiple offers for a single loan 161 | application, and these offers are tracked by their IDs within the event 162 | log. 163 | 164 | 165 | Parameters 166 | ---------- 167 | root_folder : str, optional 168 | Path where the event log will be stored. Defaults to "./data". 169 | save_as_pandas : bool, optional 170 | Whether to save the event log as a pandas parquet file. Defaults to 171 | True. 172 | train_set : bool, optional 173 | Whether to use the train set or the test set. If True, use the train 174 | set. If False, use the test set. Defaults to True. 175 | file_path : str, optional 176 | Path to the file containing the event log. If provided, the event log 177 | will be loaded from this file. Defaults to None. 178 | 179 | Examples 180 | -------- 181 | >>> bpi_17 = BPI17() 182 | >>> bpi_17.download() # Manually download the event log 183 | >>> event_log = bpi_17.dataframe() # Access the event log DataFrame 184 | """ 185 | 186 | url: str = ( 187 | "https://data.4tu.nl/file/34c3f44b-3101-4ea9-8281-e38905c68b8d/f3aec4f7-d52c-4217-82f4-57d719a8298c" 188 | ) 189 | md5: str = "10b37a2f78e870d78406198403ff13d2" 190 | file_name: str = "BPI Challenge 2017.xes.gz" 191 | 192 | _unbiased_split_params: dict = { 193 | "start_date": None, 194 | "end_date": "2017-01", 195 | "max_days": 47.81, 196 | } 197 | 198 | 199 | class BPI19(TUEventLog): 200 | """:doi:`BPI Challenge 2019 201 | `. 202 | 203 | 204 | The BPI 2019 event log comes from a large multinational company in the 205 | coatings and paints industry, based in the Netherlands. It focuses on the 206 | purchase order handling process across 60 subsidiaries. Each purchase 207 | order contains one or more line items, with four types of matching flows: 208 | 3-way matching with goods receipt, 3-way matching without goods receipt, 209 | 2-way matching, and consignment. 
The log records 76,349 purchase documents, 210 | covering 251,734 items, with a total of 1,595,923 events. These events 211 | span 42 activities performed by 627 users, including both batch and normal 212 | users. The data is fully anonymized and structured in an IEEE-XES 213 | compliant format. 214 | 215 | 216 | Parameters 217 | ---------- 218 | root_folder : str, optional 219 | Path where the event log will be stored. Defaults to "data/". 220 | save_as_pandas : bool, optional 221 | Whether to save the event log as a pandas parquet file. Defaults to True. 222 | train_set : bool, optional 223 | Whether to use the train set or the test set. If True, use the train set. If False, use the test set. Defaults to True. 224 | 225 | Examples 226 | -------- 227 | >>> bpi_19 = BPI19() 228 | >>> bpi_19.download() # Manually download the event log 229 | >>> event_log = bpi_19.dataframe() # Access the event log DataFrame 230 | """ 231 | 232 | url: str = ( 233 | "https://data.4tu.nl/file/35ed7122-966a-484e-a0e1-749b64e3366d/864493d1-3a58-47f6-ad6f-27f95f995828" 234 | ) 235 | md5: str = "4eb909242351193a61e1c15b9c3cc814" 236 | file_name: str = "BPI_Challenge_2019.xes" 237 | 238 | _unbiased_split_params: dict = { 239 | "start_date": "2018-01", 240 | "end_date": "2019-02", 241 | "max_days": 143.33, 242 | } 243 | 244 | 245 | class BPI20PrepaidTravelCosts(TUEventLog): 246 | """:doi:`BPI2020 Prepaid Travel Costs 247 | `. 248 | 249 | 250 | The BPI 2020 Prepaid Travel Costs event log records two years of travel 251 | expense claims for a university. In 2017, the data covers two departments, 252 | while in 2018, it extends to the entire university. The dataset includes 253 | various declarations and requests, such as domestic and international 254 | travel declarations, pre-paid travel costs, and payment requests. The 255 | process begins with submission by an employee, followed by approval from 256 | the travel administration, budget owner, and supervisor. For international 257 | trips, prior permission from the supervisor is mandatory, while domestic 258 | trips do not require prior approval. Reimbursement claims can be filed 259 | either upon payment of costs or within two months after the trip. 260 | 261 | 262 | Parameters 263 | ---------- 264 | root_folder : str, optional 265 | Path where the event log will be stored. Defaults to "data/". 266 | save_as_pandas : bool, optional 267 | Whether to save the event log as a pandas parquet file. Defaults to 268 | True. 269 | train_set : bool, optional 270 | Whether to use the train set or the test set. If True, use the train 271 | set. If False, use the test set. Defaults to True. 272 | 273 | Examples 274 | -------- 275 | >>> bpi_20 = BPI20PrepaidTravelCosts() 276 | >>> bpi_20.download() # Manually download the event log 277 | >>> event_log = bpi_20.dataframe() # Access the event log DataFrame 278 | """ 279 | 280 | url: str = ( 281 | "https://data.4tu.nl/file/fb84cf2d-166f-4de2-87be-62ee317077e5/612068f6-14d0-4a82-b118-1b51db52e73a" 282 | ) 283 | md5: str = "b6ab8ee749e2954f09a4fef030960598" 284 | file_name: str = "PrepaidTravelCost.xes.gz" 285 | 286 | _unbiased_split_params: dict = { 287 | "start_date": None, 288 | "end_date": "2019-01", 289 | "max_days": 114.26, 290 | } 291 | 292 | 293 | class BPI20TravelPermitData(TUEventLog): 294 | """:doi:`BPI2020 Travel Permit Data 295 | `. 296 | 297 | The BPI 2020 Travel Permit event log contains 7,065 cases and 86,581 298 | events, covering two years of travel expense claims at a university. 
In 299 | 2017, data was gathered from two departments, expanding to the entire 300 | university in 2018. The log tracks the full process of travel permits, 301 | including related prepaid travel cost declarations and travel declarations. 302 | The process begins with the submission of a travel permit request by an 303 | employee, followed by approval from the travel administration, budget 304 | owner, and supervisor. For international trips, prior approval is required 305 | before making any travel arrangements, while domestic trips do not need 306 | prior approval. Reimbursement claims for costs can be submitted either 307 | upon payment or within two months after the trip. 308 | 309 | 310 | Parameters 311 | ---------- 312 | root_folder : str, optional 313 | Path where the event log will be stored. Defaults to "data/". 314 | save_as_pandas : bool, optional 315 | Whether to save the event log as a pandas parquet file. Defaults to 316 | True. 317 | train_set : bool, optional 318 | Whether to use the train set or the test set. If True, use the train 319 | set. If False, use the test set. Defaults to True. 320 | 321 | Examples 322 | -------- 323 | >>> bpi_20 = BPI20TravelPermitData() 324 | >>> bpi_20.download() # Manually download the event log 325 | >>> event_log = bpi_20.dataframe() # Access the event log DataFrame 326 | """ 327 | 328 | url: str = ( 329 | "https://data.4tu.nl/file/db35afac-2133-40f3-a565-2dc77a9329a3/12b48cc1-18a8-4089-ae01-7078fc5e8f90" 330 | ) 331 | md5: str = "b6e9ff00d946f6ad4c91eb6fb550aee4" 332 | file_name: str = "PermitLog.xes.gz" 333 | 334 | _unbiased_split_params: dict = { 335 | "start_date": None, 336 | "end_date": "2019-10", 337 | "max_days": 258.81, 338 | } 339 | 340 | 341 | class BPI20RequestForPayment(TUEventLog): 342 | """:doi:`BPI2020 Request For Payment 343 | `. 344 | 345 | 346 | The BPI 2020 Request for Payment event log contains 6,886 cases and 36,796 347 | events, primarily focusing on requests for payment that are not related to 348 | travel. However, some events may mistakenly be linked to travel, which is 349 | considered an unwanted deviation. The dataset covers two years of events, 350 | with data collected from two departments in 2017 and the entire university 351 | in 2018. The process for requests follows a similar flow to other 352 | declarations: submission by an employee, approval by the travel 353 | administration, and further approvals by the budget owner and supervisor 354 | if necessary. 355 | 356 | 357 | Parameters 358 | ---------- 359 | root_folder : str, optional 360 | Path where the event log will be stored. Defaults to "data/". 361 | save_as_pandas : bool, optional 362 | Whether to save the event log as a pandas parquet file. Defaults to 363 | True. 364 | train_set : bool, optional 365 | Whether to use the train set or the test set. If True, use the train 366 | set. If False, use the test set. Defaults to True. 
367 | 368 | Examples 369 | -------- 370 | >>> bpi_20 = BPI20RequestForPayment() 371 | >>> bpi_20.download() # Manually download the event log 372 | >>> event_log = bpi_20.dataframe() # Access the event log DataFrame 373 | """ 374 | 375 | url: str = ( 376 | "https://data.4tu.nl/file/a6f651a7-5ce0-4bc6-8be1-a7747effa1cc/7b1f2e56-e4a8-43ee-9a09-6e64f45a1a98" 377 | ) 378 | md5: str = "2eb4dd20e70b8de4e32cc3c239bde7f2" 379 | file_name: str = "RequestForPayment.xes.gz" 380 | 381 | _unbiased_split_params: dict = { 382 | "start_date": None, 383 | "end_date": "2018-12", 384 | "max_days": 28.86, 385 | } 386 | 387 | 388 | class BPI20DomesticDeclarations(TUEventLog): 389 | """:doi:`BPI2020 Domestic Declarations 390 | `. 391 | 392 | 393 | The BPI 2020 Domestic Declarations event log contains 10,500 cases and 394 | 56,437 events. The dataset focuses on domestic travel expense claims over 395 | a two-year period. In 2017, data was collected from two departments, while 396 | in 2018, it covered the entire university. Domestic declarations do not 397 | require prior permission; employees can complete these trips and later 398 | request reimbursement for the incurred costs. The process follows a 399 | similar approval flow: after submission by the employee, the request is 400 | reviewed by the travel administration and further approved by the budget 401 | owner and supervisor, if necessary. 402 | 403 | 404 | Parameters 405 | ---------- 406 | root_folder : str, optional 407 | Path where the event log will be stored. Defaults to "data/". 408 | save_as_pandas : bool, optional 409 | Whether to save the event log as a pandas parquet file. Defaults to 410 | True. 411 | train_set : bool, optional 412 | Whether to use the train set or the test set. If True, use the train 413 | set. If False, use the test set. Defaults to True. 414 | 415 | Examples 416 | -------- 417 | >>> bpi_20 = BPI20DomesticDeclarations() 418 | >>> bpi_20.download() # Manually download the event log 419 | >>> event_log = bpi_20.dataframe() # Access the event log DataFrame 420 | """ 421 | 422 | url: str = ( 423 | "https://data.4tu.nl/file/6a0a26d2-82d0-4018-b1cd-89afb0e8627f/6eeb0328-f991-48c7-95f2-35033504036e" 424 | ) 425 | md5: str = "6a78c39491498363ce4788e0e8ca75ef" 426 | file_name: str = "DomesticDeclarations.xes.gz" 427 | 428 | 429 | class BPI20InternationalDeclarations(TUEventLog): 430 | """:doi:`BPI2020 International Declarations 431 | `. 432 | 433 | The BPI 2020 International Declarations event log contains 6,449 cases and 434 | 72,151 events, covering two years of travel expense claims at a university. 435 | In 2017, the data was collected from two departments, expanding to the 436 | entire university in 2018. Unlike domestic trips, international trips 437 | require prior approval from the supervisor, which is obtained by 438 | submitting a travel permit. Once the permit is approved, the employee can 439 | proceed with travel arrangements. After the trip or payment of related 440 | expenses (e.g., flights or conference fees), a reimbursement claim is 441 | filed, which can be submitted either upon payment or within two months 442 | after the trip. 443 | 444 | 445 | Parameters 446 | ---------- 447 | root_folder : str, optional 448 | Path where the event log will be stored. Defaults to "data/". 449 | save_as_pandas : bool, optional 450 | Whether to save the event log as a pandas parquet file. Defaults to 451 | True. 452 | train_set : bool, optional 453 | Whether to use the train set or the test set. If True, use the train 454 | set. 
If False, use the test set. Defaults to True. 455 | 456 | Examples 457 | -------- 458 | >>> bpi_20 = BPI20InternationalDeclarations() 459 | >>> bpi_20.download() # Manually download the event log 460 | >>> event_log = bpi_20.dataframe() # Access the event log DataFrame 461 | 462 | """ 463 | 464 | url: str = ( 465 | "https://data.4tu.nl/file/91fd1fa8-4df4-4b1a-9a3f-0116c412378f/d45ee7dc-952c-4885-b950-4579a91ef426" 466 | ) 467 | md5: str = "1ec65e046f70bb399cc6d2c154cd615a" 468 | file_name: str = "InternationalDeclarations.xes.gz" 469 | 470 | 471 | class Sepsis(TUEventLog): 472 | """:doi:`Sepsis 473 | `. 474 | 475 | 476 | The Sepsis event log contains real-life hospital data regarding sepsis 477 | cases, a life-threatening condition often caused by infection. Each case 478 | in the log represents a patient's pathway through the hospital. The 479 | dataset includes around 1000 cases and approximately 15,000 events, 480 | covering 16 different activities. Additionally, 39 data attributes are 481 | recorded, such as the responsible group for each activity, test results, 482 | and information from checklists. All events and attribute values have been 483 | anonymized. While the timestamps of events have been randomized, the 484 | intervals between events within a trace remain unchanged. 485 | 486 | 487 | Parameters 488 | ---------- 489 | root_folder : str, optional 490 | Path where the event log will be stored. Defaults to "data/". 491 | save_as_pandas : bool, optional 492 | Whether to save the event log as a pandas parquet file. Defaults to 493 | True. 494 | train_set : bool, optional 495 | Whether to use the train set or the test set. If True, use the train 496 | set. If False, use the test set. Defaults to True. 497 | 498 | Examples 499 | -------- 500 | >>> sepsis = Sepsis() 501 | >>> sepsis.download() # Manually download the event log 502 | >>> event_log = sepsis.dataframe() # Access the event log DataFrame 503 | 504 | """ 505 | 506 | url: str = ( 507 | "https://data.4tu.nl/file/33632f3c-5c48-40cf-8d8f-2db57f5a6ce7/643dccf2-985a-459e-835c-a82bce1c0339" 508 | ) 509 | 510 | md5: str = "b5671166ac71eb20680d3c74616c43d2" 511 | file_name: str = "Sepsis Cases - Event Log.xes.gz" 512 | -------------------------------------------------------------------------------- /src/skpm/event_logs/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as t 3 | from urllib import request 4 | 5 | 6 | def download_url( 7 | url: str, folder: t.Optional[str] = None, file_name: t.Optional[str] = None 8 | ) -> str: 9 | """Download a file from a `url` and place it in `folder`. 10 | 11 | Args: 12 | url (str): URL to download file from 13 | folder (str, optional): Folder to download file to. 14 | If None, use the current working directory. Defaults to None. 15 | file_name (str, optional): Name to save the file under. 16 | If None, use the basename of the URL. Defaults to None. 17 | 18 | Returns: 19 | folder (str): Path to downloaded file 20 | """ 21 | if folder is None: 22 | folder = os.getcwd() 23 | 24 | if file_name is None: 25 | # TODO: maybe get the file_name from the request? 
26 | # response.info().get_file_name() 27 | file_name = os.path.basename(url) 28 | path = os.path.join(folder, file_name) 29 | 30 | if os.path.exists(path): 31 | return path 32 | 33 | # try: 34 | os.makedirs(os.path.expanduser(os.path.normpath(folder)), exist_ok=True) 35 | # except OSError as e: 36 | # raise e 37 | 38 | _urlretrieve(url=url, destination=path) 39 | return path 40 | 41 | 42 | def _save_response_content( 43 | content: t.Iterator[bytes], 44 | destination: str, 45 | ) -> None: 46 | """ 47 | Save the content received from an HTTP response to a file. 48 | 49 | Parameters 50 | ---------- 51 | content : Iterator[bytes] 52 | Iterator yielding binary data chunks from the HTTP response. 53 | destination : str 54 | Path to the file where the content will be saved. 55 | 56 | Returns 57 | ------- 58 | None 59 | """ 60 | with open(destination, "wb") as fh: 61 | for chunk in content: 62 | # filter out keep-alive new chunks 63 | # if not chunk: 64 | # continue 65 | 66 | fh.write(chunk) 67 | 68 | 69 | def _urlretrieve( 70 | url: str, destination: str, chunk_size: int = 1024 * 32 71 | ) -> None: 72 | """ 73 | Retrieve a URL and save its contents to a file. 74 | 75 | Parameters 76 | ---------- 77 | url : str 78 | The URL of the resource to retrieve. 79 | destination : str 80 | Path to the file where the content will be saved. 81 | chunk_size : int, optional 82 | Size of the chunks to read from the response at a time, in bytes. 83 | Defaults to 32KB. 84 | 85 | Returns 86 | ------- 87 | None 88 | """ 89 | with request.urlopen(request.Request(url)) as response: 90 | _save_response_content( 91 | iter(lambda: response.read(chunk_size), b""), 92 | destination, 93 | ) 94 | -------------------------------------------------------------------------------- /src/skpm/event_logs/extract.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os.path as osp 3 | import zipfile 4 | 5 | 6 | def extract_gz(path: str, folder: str): 7 | r"""Extracts a gz archive to a specific folder. 8 | 9 | Args: 10 | path (str): The path to the tar archive. 11 | folder (str): The folder. 12 | log (bool, optional): If :obj:`False`, will not print anything to the 13 | console. (default: :obj:`True`) 14 | """ 15 | path = osp.abspath(path) 16 | file_path = osp.join(folder, ".".join(path.split(".")[:-1])) 17 | with gzip.open(path, "r") as r: 18 | with open(file_path, "wb") as w: 19 | w.write(r.read()) 20 | 21 | return file_path 22 | 23 | 24 | # def extract_zip(path: str, folder: str): 25 | # r"""Extracts a zip archive to a specific folder. 26 | 27 | # Args: 28 | # path (str): The path to the tar archive. 29 | # folder (str): The folder. 30 | # log (bool, optional): If :obj:`False`, will not print anything to the 31 | # console. (default: :obj:`True`) 32 | # """ 33 | # with zipfile.ZipFile(path, "r") as f: 34 | # f.extractall(folder) 35 | 36 | 37 | # commenting out the following functions because 38 | # they are not used in the codebase. Maybe in the future. 39 | 40 | # import bz2 41 | # import tarfile 42 | 43 | # def extract_tar(path: str, folder: str, mode: str = 'r:gz'): 44 | # r"""Extracts a tar archive to a specific folder. 45 | 46 | # Args: 47 | # path (str): The path to the tar archive. 48 | # folder (str): The folder. 49 | # mode (str, optional): The compression mode. (default: :obj:`"r:gz"`) 50 | # log (bool, optional): If :obj:`False`, will not print anything to the 51 | # console. 
(default: :obj:`True`) 52 | # """ 53 | # with tarfile.open(path, mode) as f: 54 | # f.extractall(folder) 55 | 56 | 57 | # def extract_bz2(path: str, folder: str): 58 | # r"""Extracts a bz2 archive to a specific folder. 59 | 60 | # Args: 61 | # path (str): The path to the tar archive. 62 | # folder (str): The folder. 63 | # log (bool, optional): If :obj:`False`, will not print anything to the 64 | # console. (default: :obj:`True`) 65 | # """ 66 | # path = osp.abspath(path) 67 | # with bz2.open(path, 'r') as r: 68 | # with open(osp.join(folder, '.'.join(path.split('.')[:-1])), 'wb') as w: 69 | # w.write(r.read()) 70 | -------------------------------------------------------------------------------- /src/skpm/event_logs/parser.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | from itertools import chain 3 | from joblib import Parallel, delayed 4 | from typing import Generator 5 | import pandas as pd 6 | 7 | 8 | class Event(dict): 9 | pass 10 | 11 | 12 | class TagXES: 13 | # attributes 14 | STRING: str = "string" 15 | DATE: str = "date" 16 | FLOAT: str = "float" 17 | BOOLEAN: str = "boolean" 18 | INT: str = "int" 19 | 20 | # elements 21 | EVENT: str = "event" 22 | TRACE: str = "trace" 23 | 24 | _DTYPES: tuple = (STRING, DATE, FLOAT, BOOLEAN, INT) 25 | 26 | @classmethod 27 | def is_attribute(cls, element: etree._Element) -> bool: 28 | """element is an attribute if it ends with one of the dtypes.""" 29 | return element.tag.endswith(cls._DTYPES) 30 | 31 | @classmethod 32 | def is_valid(cls, element: etree._Element) -> bool: 33 | return element.tag.endswith( 34 | tuple(v for v in vars(cls).values() if not v.startswith("_")) 35 | ) 36 | 37 | @classmethod 38 | def get_dtypes(cls) -> tuple: 39 | return cls._DTYPES 40 | 41 | 42 | tag = TagXES 43 | 44 | 45 | def extract_case_attributes(trace: etree._Element, ns: dict) -> Event: 46 | """ 47 | Extracts case-level attributes from the trace. 48 | 49 | Using findall for case attributes is faster than using iter since 50 | cases has fewer attributes than events. 51 | 52 | Args: 53 | trace (etree._Element): The trace element. 54 | ns (dict): Namespace mapping for XML parsing. 55 | 56 | Returns: 57 | Event: A dictionary of case-level attributes. 58 | """ 59 | case_attrs = Event() 60 | for attr in tag.get_dtypes(): 61 | # Find all attributes of the given type in the trace 62 | attrs = trace.findall(attr, ns) 63 | # Update case_attrs with the found attributes 64 | case_attrs.update( 65 | {f'case:{e.get("key")}': e.get("value") for e in attrs} 66 | ) 67 | return case_attrs 68 | 69 | 70 | def extract_event_attributes(event: etree._Element) -> Event: 71 | """ 72 | Extracts attributes from an event element. 73 | 74 | Using iter is slightly faster than findall for events since 75 | there many events and event attributes in a trace. 76 | 77 | Args: 78 | event (etree._Element): The event element. 79 | 80 | Returns: 81 | Event: A dictionary of event attributes. 82 | """ 83 | event_attrs = Event() 84 | for e_attr in event.iter(): 85 | if tag.is_attribute(e_attr): 86 | event_attrs[e_attr.get("key")] = e_attr.get("value") 87 | return event_attrs 88 | 89 | 90 | def parse_trace(trace: etree._Element, ns: dict) -> list[Event]: 91 | """Parses a list of XML elements representing a trace. 92 | 93 | Args: 94 | trace (list[etree._Element]): List of XML elements representing a trace from a XES file. 95 | 96 | Returns: 97 | list[Event]: The respective events from the trace. 
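    Note:
        ``ns`` is the namespace mapping taken from the XES root element
        (``tree.nsmap``). When ``trace`` arrives as serialized bytes, as in
        the parallel code path, it is parsed back into an element before the
        attributes are extracted.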
98 | """ 99 | 100 | if isinstance(trace, bytes): 101 | trace = etree.fromstring(trace) 102 | 103 | case_attrs = extract_case_attributes(trace, ns) 104 | 105 | # Parse each event 106 | parsed_events = [] 107 | events = trace.findall(tag.EVENT, ns) 108 | for event in events: 109 | event_attrs = extract_event_attributes(event) 110 | 111 | # Add case-level attributes to event attributes 112 | event_attrs.update(case_attrs) 113 | parsed_events.append(event_attrs) 114 | 115 | # Clear the event to free memory 116 | event.clear() 117 | 118 | trace.clear() 119 | return parsed_events 120 | 121 | 122 | def lazy_serialize( 123 | elements: list[etree._Element], 124 | ) -> Generator[bytes, None, None]: 125 | """Lazy serialization of a list of XML elements. Used for parallel processing.""" 126 | for element in elements: 127 | yield etree.tostring(element) 128 | 129 | 130 | def read_xes( 131 | filepath: str, n_jobs: int = None 132 | ) -> pd.DataFrame: 133 | """Reads an event log from a XES file. 134 | 135 | Rough overview: 136 | This function reads an event log from a XES file. It uses the lxml library to 137 | parse the XML file. The function is parallelized using the joblib library. 138 | 139 | 1. For each trace, the function `_parse_trace` is called. 140 | 2. Extract the case attributes and all the events from the trace. 141 | 3. For each event, extract the event attributes. 142 | 4. Update the event attributes with the case attributes. 143 | 5. Append the event to the final list of events corresponding to the trace. 144 | 6. Return trace and repeat. 145 | 146 | Args: 147 | filepath (str): Filepath to the XES file. 148 | n_jobs (int, optional): Number of CPU cores to use. If None, only one core 149 | is used. Defaults to None. 150 | 151 | Returns: 152 | list[Event]: an event log as a list of Event objects. 
153 | """ 154 | tree = etree.parse(filepath).getroot() 155 | ns = tree.nsmap 156 | 157 | traces = tree.findall(tag.TRACE, ns) 158 | 159 | if n_jobs in [1, None]: 160 | log = [] 161 | for trace in traces: 162 | log.extend(parse_trace(trace, ns)) 163 | else: 164 | from functools import partial 165 | 166 | parse_trace_partial = partial(parse_trace, ns=ns) 167 | 168 | traces = lazy_serialize(traces) 169 | log = Parallel(n_jobs=n_jobs)( 170 | delayed(parse_trace_partial)(trace) for trace in traces 171 | ) 172 | log = list(chain(*log)) 173 | 174 | log = pd.DataFrame(log) 175 | 176 | return log 177 | -------------------------------------------------------------------------------- /src/skpm/event_logs/split.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from skpm.config import EventLogConfig as elc 3 | from skpm.event_logs.base import TUEventLog 4 | 5 | 6 | def _bounded_dataset( 7 | dataset: pd.DataFrame, start_date, end_date: int 8 | ) -> pd.DataFrame: 9 | grouped = dataset.groupby(elc.case_id, as_index=False)[elc.timestamp].agg( 10 | ["min", "max"] 11 | ) 12 | 13 | start_date = ( 14 | pd.Period(start_date) 15 | if start_date 16 | else dataset[elc.timestamp].min().to_period("M") 17 | ) 18 | end_date = ( 19 | pd.Period(end_date) 20 | if end_date 21 | else dataset[elc.timestamp].max().to_period("M") 22 | ) 23 | bounded_cases = grouped[ 24 | (grouped["min"].dt.to_period("M") >= start_date) 25 | & (grouped["max"].dt.to_period("M") <= end_date) 26 | ][elc.case_id].values 27 | dataset = dataset[dataset[elc.case_id].isin(bounded_cases)] 28 | return dataset 29 | 30 | 31 | def _unbiased(dataset: pd.DataFrame, max_days: int) -> pd.DataFrame: 32 | grouped = ( 33 | dataset.groupby(elc.case_id, as_index=False)[elc.timestamp] 34 | .agg(["min", "max"]) 35 | .assign( 36 | duration=lambda x: (x["max"] - x["min"]).dt.total_seconds() 37 | / (24 * 60 * 60) 38 | ) 39 | ) 40 | 41 | # condition 1: cases are shorter than max_duration 42 | condition_1 = grouped["duration"] <= max_days * 1.00000000001 43 | # condition 2: drop cases starting after the dataset's last timestamp - the max_duration 44 | latest_start = dataset[elc.timestamp].max() - pd.Timedelta( 45 | max_days, unit="D" 46 | ) 47 | condition_2 = grouped["min"] <= latest_start 48 | 49 | unbiased_cases = grouped[condition_1 & condition_2][elc.case_id].values 50 | dataset = dataset[dataset[elc.case_id].isin(unbiased_cases)] 51 | return dataset 52 | 53 | 54 | def unbiased( 55 | dataset: pd.DataFrame | TUEventLog, 56 | start_date: str | pd.Period | None, 57 | end_date: str | pd.Period | None, 58 | max_days: int, 59 | test_len: float = 0.2, 60 | ) -> tuple[pd.DataFrame, pd.DataFrame]: 61 | """ 62 | Unbiased split of event log into training and test set [1]. 63 | 64 | Code adapted from [2]. 65 | 66 | Parameters 67 | ---------- 68 | dataset: pd.DataFrame 69 | Event log. 70 | 71 | start_date: str 72 | Start date of the event log. 73 | 74 | end_date: str 75 | End date of the event log. 76 | 77 | max_days: int 78 | Maximum duration of cases. 79 | 80 | test_len: float, default=0.2 81 | Proportion of cases to be used for the test set. 
82 | 83 | Returns 84 | ------- 85 | - df_train: pd.DataFrame, training set 86 | - df_test: pd.DataFrame, test set 87 | 88 | Example 89 | ------- 90 | >>> from skpm.event_logs import BPI12 91 | >>> from skpm.event_logs import split 92 | >>> bpi12 = BPI12() 93 | >>> df_train, df_test = split.unbiased(bpi12, **bpi12.unbiased_split_params) 94 | >>> df_train.shape, df_test.shape 95 | ((117546, 7), (55952, 7)) 96 | 97 | References: 98 | ----------- 99 | [1] Hans Weytjens, Jochen De Weerdt. Creating Unbiased Public Benchmark Datasets with Data Leakage Prevention for Predictive Process Monitoring, 2021. doi: 10.1007/978-3-030-94343-1_2 100 | [2] https://github.com/hansweytjens/predictive-process-monitoring-benchmarks 101 | """ 102 | if isinstance(dataset, TUEventLog): 103 | dataset = dataset.dataframe 104 | 105 | dataset = dataset.copy() 106 | 107 | dataset[elc.timestamp] = pd.to_datetime( 108 | dataset[elc.timestamp], utc=True 109 | ).dt.tz_localize(None) 110 | 111 | # bounding the event log 112 | if start_date or end_date: 113 | dataset = _bounded_dataset(dataset, start_date, end_date) 114 | 115 | # drop longest cases and debiasing end of dataset 116 | dataset = _unbiased(dataset, max_days) 117 | 118 | # preliminaries 119 | grouped = dataset.groupby(elc.case_id, as_index=False)[elc.timestamp].agg( 120 | ["min", "max"] 121 | ) 122 | 123 | ### TEST SET ### 124 | first_test_case_nr = int(len(grouped) * (1 - test_len)) 125 | first_test_start_time = ( 126 | grouped["min"].sort_values().values[first_test_case_nr] 127 | ) 128 | # retain cases that end after first_test_start time 129 | test_case_nrs = grouped.loc[ 130 | grouped["max"].values >= first_test_start_time, elc.case_id 131 | ] 132 | df_test = dataset[dataset[elc.case_id].isin(test_case_nrs)].reset_index( 133 | drop=True 134 | ) 135 | 136 | #### TRAINING SET ### 137 | df_train = dataset[~dataset[elc.case_id].isin(test_case_nrs)].reset_index( 138 | drop=True 139 | ) 140 | 141 | return df_train, df_test 142 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from skpm.feature_extraction.time import TimestampExtractor 2 | from skpm.feature_extraction.event.resource import ResourcePoolExtractor 3 | from skpm.feature_extraction.event.inter_case import WorkInProgress 4 | 5 | __all__ = ["TimestampExtractor", "ResourcePoolExtractor"] -------------------------------------------------------------------------------- /src/skpm/feature_extraction/case/__init__.py: -------------------------------------------------------------------------------- 1 | from .variant import VariantExtractor 2 | 3 | __all__ = [ 4 | "VariantExtractor", 5 | ] 6 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/case/_helpers.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from sklearn.pipeline import Pipeline 3 | 4 | 5 | def ensure_not_pipeline(fit_method): 6 | def wrapper(estimator, *args, **kwargs): 7 | in_pipeline = False 8 | for frame_info in inspect.stack(): 9 | frame = frame_info.frame 10 | if "self" in frame.f_locals: 11 | caller_self = frame.f_locals["self"] 12 | if isinstance(caller_self, Pipeline): 13 | in_pipeline = True 14 | break 15 | if in_pipeline: 16 | class_name = estimator.__class__.__name__ 17 | raise ValueError( 18 | f"{class_name} is a case-wise feature extractor and cannot be used in a 
pipeline." 19 | ) 20 | 21 | return fit_method(estimator, *args, **kwargs) 22 | 23 | return wrapper 24 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/case/time.py: -------------------------------------------------------------------------------- 1 | from skpm.config import EventLogConfig as elc 2 | 3 | class TimestampCaseLevel: 4 | """ 5 | Extracts time-related features at the case level. 6 | 7 | Notes 8 | ----- 9 | Separating the implementation for event-level and case-level features improves performance. 10 | The case-level implementation is slower due to the use of `groupby`. 11 | """ 12 | 13 | TIME_UNIT_MULTIPLIER = { 14 | "s": 1, 15 | "m": 60, 16 | "h": 60 * 60, 17 | "d": 60 * 60 * 24, 18 | "w": 60 * 60 * 24 * 7, 19 | } 20 | 21 | @classmethod 22 | def accumulated_time(cls, case, ix_list, time_unit="s"): 23 | """Calculate the accumulated time from the start of each case in seconds.""" 24 | return ( 25 | case[elc.timestamp] 26 | .apply(lambda x: x - x.min()) 27 | .loc[ix_list] 28 | .dt.total_seconds() 29 | / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1) 30 | ) 31 | 32 | @classmethod 33 | def execution_time(cls, case, ix_list, time_unit="s"): 34 | """Calculate the execution time of each event in seconds. 35 | 36 | **NOTE**: This should be used as a target feature, since the _next_ step is 37 | needed to calculate the execution time of each event.""" 38 | return ( 39 | case[elc.timestamp] 40 | .diff(-1) 41 | .dt.total_seconds() 42 | .fillna(0) 43 | .loc[ix_list] 44 | .abs() # to avoid negative numbers caused by diff-1 45 | / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1) 46 | ) 47 | 48 | @classmethod 49 | def remaining_time(cls, case, ix_list, time_unit="s"): 50 | """Calculate the remaining time until the end of each case in seconds. 
51 | 52 | **NOTE**: This should be used as a target feature, since the _last_ step 53 | is needed to calculate the remaining time of each event.""" 54 | 55 | return ( 56 | case[elc.timestamp] 57 | .apply(lambda x: x.max() - x) 58 | .loc[ix_list] 59 | .dt.total_seconds() 60 | / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1) 61 | ) -------------------------------------------------------------------------------- /src/skpm/feature_extraction/case/variant.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | from sklearn.preprocessing import LabelEncoder 3 | 4 | from skpm.feature_extraction.case._helpers import ensure_not_pipeline 5 | 6 | 7 | class VariantExtractor(TransformerMixin, BaseEstimator): 8 | """Extract trace variants from an event log.""" 9 | 10 | def __init__(self, strategy="default"): 11 | self.strategy = strategy 12 | 13 | @ensure_not_pipeline 14 | def fit(self, X, y=None): 15 | if self.strategy != "default": 16 | raise NotImplementedError("Only the default strategy is supported.") 17 | 18 | self.variants = ( 19 | X.groupby("case:concept:name", as_index=False)["concept:name"] 20 | .apply(tuple) 21 | .rename(columns={"concept:name": "variant"}) 22 | ) 23 | 24 | self._le = LabelEncoder() 25 | self.variants["variant"] = self._le.fit_transform( 26 | self.variants["variant"] 27 | ) 28 | return self 29 | 30 | def transform(self, X): 31 | """Get trace variants.""" 32 | return self.variants 33 | 34 | def inverse_transform(self, X): 35 | """Get trace variants.""" 36 | return self._le.inverse_transform(X) 37 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/event/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Notes on inter-case features: 3 | Inter-case features are features that can be leveraged by cases in parallel. For instance, the availability of a resource at a time window `t_0` can be represented as a binary variable. 4 | This brings up an observation, not an issue I believe, that `fit` methods in this module will just return self, and all the logic should be within `transform`. This is due to the temporal splits, expected for temporal process data. 5 | We cannot `fit` on the train set and `transform` on the test 6 | set, i.e. define the bins based on `freq`, since in a temporal 7 | split the test set will have unkown bins. TODO: further explore 8 | if this is an issue. 9 | For TimestampExtractor, if we have on the training set a trace [t_0, ..., t_n] whereas on the test set we have the remaining trace, i.e., [t_{n+1}, ..., t_m], the `accumulated_time` feature should take this info into consideration. 10 | """ 11 | 12 | from .resource import ResourcePoolExtractor 13 | from .inter_case import WorkInProgress 14 | 15 | __all__ = ["ResourcePoolExtractor", "WorkInProgress"] 16 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/event/inter_case.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.base import ( 3 | BaseEstimator, 4 | TransformerMixin, 5 | ) 6 | from skpm.config import EventLogConfig as elc 7 | 8 | 9 | class WorkInProgress(TransformerMixin, BaseEstimator): 10 | """Work in Progress (WIP) feature extractor. 11 | 12 | This transformer calculates the number of cases (work) in progress within 13 | specified time windows. 
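    Here, a case counts as "in progress" in a given window if at least one of
    its events falls inside that window; each event is then assigned the case
    count of the window it belongs to.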
14 | 15 | Parameters: 16 | ----------- 17 | window_size : str, default='1D' 18 | Frequency of the time windows to count the number of cases in progress. 19 | It follows the Pandas offset aliases convention. For more details, see 20 | https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases 21 | 22 | Returns: 23 | -------- 24 | ndarray 25 | WIP feature array of shape (n_samples, 1) 26 | 27 | Examples: 28 | --------- 29 | >>> import pandas as pd 30 | >>> from sklearn.pipeline import Pipeline 31 | >>> from skpm.event_feature_extraction import WorkInProgress 32 | >>> from skpm.config import EventLogConfig as elc 33 | >>> # Assuming X is your dataframe containing event data with columns 'timestamp' and 'case_id' 34 | >>> X = pd.DataFrame({elc.timestamp: pd.date_range(start='2024-01-01', end='2024-01-10', freq='D'), 35 | ... elc.case_id: [1, 1, 2, 3, 4, 4, 4, 5, 6, 6]}) 36 | >>> wip_transformer = WorkInProgress(window_size='2D') # Calculate WIP over 2-day windows 37 | >>> wip_transformer.fit_transform(X) 38 | array([2., 1., 1., 2., 2., 1., 1., 2., 2., 2.]) 39 | """ 40 | 41 | # see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases 42 | def __init__( 43 | self, 44 | window_size="1D", 45 | ) -> None: 46 | self.window_size = window_size 47 | 48 | def get_feature_names_out(self): 49 | return ["wip"] 50 | 51 | def fit( 52 | self, 53 | X: pd.DataFrame, 54 | y=None, 55 | ): 56 | assert isinstance(X, pd.DataFrame), "Input must be a dataframe." 57 | 58 | return self 59 | 60 | def transform(self, X: pd.DataFrame): 61 | """Transform the input DataFrame to calculate the Work in Progress (WIP) feature. 62 | 63 | This method calculates the number of cases in progress within specified time windows based on the input event data. 64 | 65 | Parameters: 66 | ----------- 67 | X : pd.DataFrame 68 | Input DataFrame containing event data with columns 'timestamp' and 'case_id'. 69 | 70 | Returns: 71 | -------- 72 | ndarray 73 | WIP feature array of shape (n_samples, 1), where each value represents the number of cases in progress at each time step. 74 | 75 | Notes: 76 | ------ 77 | 1. The method performs the following steps: 78 | a. Groups the event data by time windows specified by the 'window_size' parameter. 79 | b. Counts the number of unique cases within each time window. 80 | c. Maps the counts to the corresponding time windows. 81 | d. Fills any missing values with the number of NaN values (representing time windows with no events). 
82 | """ 83 | self._grouped_wip = X.groupby( 84 | pd.Grouper(key=elc.timestamp, freq=self.window_size) 85 | )[elc.case_id].nunique() 86 | self._bins = pd.cut( 87 | X[elc.timestamp], 88 | bins=self._grouped_wip.index, 89 | labels=self._grouped_wip.index[:-1], 90 | ) 91 | wip = self._bins.map(self._grouped_wip) 92 | wip = wip.fillna(self._bins.isna().sum()).values 93 | 94 | return wip 95 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/event/resource.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | from pandas import DataFrame 5 | from scipy.sparse.csgraph import connected_components 6 | from sklearn.base import ( 7 | BaseEstimator, 8 | TransformerMixin, 9 | check_is_fitted, 10 | ) 11 | 12 | from skpm.utils import validate_columns 13 | from skpm.config import EventLogConfig as elc 14 | from skpm.warnings import ConceptDriftWarning 15 | 16 | 17 | class ResourcePoolExtractor(TransformerMixin, BaseEstimator): 18 | """ 19 | Extracts resource roles based on resource-activity correlations. 20 | 21 | This class identifies resource roles within a process based on correlations 22 | between resources and activities in event logs. It computes a correlation 23 | matrix between resources and activities and then identifies subgraphs 24 | representing roles based on a user-defined threshold. 25 | This approach proposed in [1], and code adapted from [2]. 26 | 27 | Todo: 28 | ------ 29 | implement other distance metrics. 30 | 31 | 32 | Parameters: 33 | ----------- 34 | threshold : float, default=0.7 35 | The correlation threshold for identifying resource roles. 36 | Resources with correlation coefficients above this threshold 37 | are considered to belong to the same role. 38 | 39 | References: 40 | ----------- 41 | - [1] Minseok Song, Wil M.P. van der Aalst. "Towards comprehensive support for organizational mining," Decision Support Systems (2008). 42 | - [2] Code adapted from https://github.com/AdaptiveBProcess/GenerativeLSTM 43 | 44 | Notes 45 | ----- 46 | - distance metrics: (dis)similarity between two vectors (variables). It must 47 | satisfy the following mathematical properties: d(x,x) = 0, d(x,y) >= 0, 48 | d(x,y) = d(y,x), d(x,z) <= d(x,y) + d(y,z) 49 | - correlation coeficients: statistical relationships between vectors (variables) 50 | that quantify how much they are related. 51 | 52 | The original paper mentions Pearson correlation as a distance metric. For 53 | academic purposes, it's crucial to grasp the distinction since correlation 54 | does not satisfy the triangular inequality. Yet, there are instances where 55 | I think correlation can be informally employed as a 'similarity' measure. 56 | In the context of organizational mining, I believe statistical relationships 57 | and similarity ultimately serve the same purpose. 
58 | 59 | Examples: 60 | --------- 61 | >>> from skpm.event_feature_extraction.resource import ResourcePoolExtractor 62 | >>> import pandas as pd 63 | >>> # Assuming X is your dataframe containing event data with columns 'activity' and 'resource' 64 | >>> X = pd.DataFrame({'activity': ['A', 'B', 'A', 'B'], 'resource': ['R1', 'R2', 'R1', 'R2']}) 65 | >>> # Initialize and fit the extractor 66 | >>> extractor = ResourcePoolExtractor(threshold=0.7) 67 | >>> extractor.fit(X) 68 | >>> # Transform the data to extract resource roles 69 | >>> resource_roles = extractor.transform(X) 70 | >>> print(resource_roles) 71 | [0 1 0 1] 72 | """ 73 | 74 | def __init__(self, threshold=0.7): 75 | """ 76 | Initialize the ResourcePoolExtractor. 77 | 78 | Parameters: 79 | ----------- 80 | threshold : float, default=0.7 81 | The correlation threshold for identifying resource roles. 82 | """ 83 | # the original implementation uses 0.7 as threshold but in the argparser they set 0.85 84 | self.threshold = threshold 85 | 86 | def get_feature_names_out(self): 87 | """Return the feature names. 88 | 89 | Returns: 90 | -------- 91 | feature_names : list 92 | List containing the feature names. 93 | """ 94 | return ["resource_roles"] 95 | 96 | def fit(self, X: DataFrame, y=None): 97 | """Fit the ResourcePoolExtractor. 98 | 99 | Parameters: 100 | ----------- 101 | X : DataFrame, shape (n_samples, n_features) 102 | The input data containing activity and resource columns. 103 | 104 | Returns: 105 | -------- 106 | self : object 107 | Returns self. 108 | """ 109 | X = self._validate_data(X) 110 | 111 | # defining vocabs for activities and resources 112 | self.atoi_, self.itoa_ = self._define_vocabs(X[elc.activity].unique()) 113 | self.rtoi_, self.itor_ = self._define_vocabs(X[elc.resource].unique()) 114 | 115 | X[elc.activity] = X[elc.activity].map(self.atoi_) 116 | X[elc.resource] = X[elc.resource].map(self.rtoi_) 117 | 118 | # building a pairwise frequency matrix 119 | freq_matrix = ( 120 | X.groupby([elc.activity, elc.resource]).value_counts().to_dict() 121 | ) 122 | 123 | # building an activity profile for each resource 124 | 125 | # matrix profile: rows = resources, columns = activities 126 | # the unown labels are generating a row of zeros, and this is throwing a warning when calculating the correlation matrix: TODO 127 | # https://stackoverflow.com/questions/45897003/python-numpy-corrcoef-runtimewarning-invalid-value-encountered-in-true-divide 128 | profiles = np.zeros((len(self.rtoi_), len(self.atoi_)), dtype=int) 129 | for pair_ar, freq in freq_matrix.items(): 130 | # pair_ar = (activity, resource); order defined by groupby 131 | profiles[pair_ar[1], pair_ar[0]] = freq 132 | 133 | # correlation matrix 134 | with warnings.catch_warnings(): 135 | warnings.simplefilter("ignore") 136 | corr = np.corrcoef( 137 | profiles 138 | ) # TODO: include similarity/correlation metric parameter 139 | 140 | np.fill_diagonal( 141 | corr, 0 142 | ) # the original paper does not consider self-relationship 143 | 144 | # subgraphs as roles 145 | n_components, labels = connected_components( 146 | corr > self.threshold, directed=False 147 | ) 148 | 149 | sub_graphs = list() 150 | for i in range(n_components): 151 | sub_graphs.append(set(np.where(labels == i)[0])) 152 | 153 | # role definition 154 | self.resource_to_roles_ = dict() 155 | for role_ix, role in enumerate(sub_graphs): 156 | for user_id in role: 157 | self.resource_to_roles_[user_id] = role_ix 158 | 159 | return self 160 | 161 | def transform(self, X: DataFrame, y=None): 162 | 
"""Transform the input data to extract resource roles. 163 | 164 | Parameters: 165 | ----------- 166 | X : DataFrame, shape (n_samples, n_features) 167 | The input data containing activity and resource columns. 168 | 169 | Returns: 170 | -------- 171 | resource_roles : numpy.ndarray, shape (n_samples,) 172 | An array containing the resource roles for each sample. 173 | """ 174 | check_is_fitted(self, "resource_to_roles_") 175 | X = self._validate_data(X) 176 | resource_roles = X[elc.resource].map(self.resource_to_roles_).values 177 | return resource_roles 178 | 179 | def _validate_data(self, X: DataFrame): 180 | """Validate the input data. 181 | 182 | Parameters: 183 | ----------- 184 | X : DataFrame, shape (n_samples, n_features) 185 | The input data containing activity and resource columns. 186 | 187 | Returns: 188 | -------- 189 | x : DataFrame 190 | The validated input data. 191 | """ 192 | assert isinstance(X, DataFrame), "Input must be a dataframe." 193 | x = X.copy() 194 | x.reset_index(drop=True, inplace=True) 195 | columns = validate_columns( 196 | input_columns=x.columns, required=[elc.activity, elc.resource] 197 | ) 198 | x = x[columns] 199 | 200 | if x[elc.activity].isnull().any(): 201 | raise ValueError("Activity column contains null values.") 202 | if x[elc.resource].isnull().any(): 203 | raise ValueError("Resource column contains null values.") 204 | 205 | # i.e. if fitted, check unkown labels 206 | if hasattr(self, "resource_to_roles_"): 207 | x[elc.resource] = self._check_unknown( 208 | x[elc.resource].values, self.rtoi_.keys(), elc.resource 209 | ) 210 | x[elc.activity] = self._check_unknown( 211 | x[elc.activity].values, self.atoi_.keys(), elc.activity 212 | ) 213 | 214 | x[elc.activity] = x[elc.activity].map(self.atoi_) 215 | x[elc.resource] = x[elc.resource].map(self.rtoi_) 216 | 217 | return x 218 | 219 | def _check_unknown(self, input: np.ndarray, vocab: np.ndarray, name: str): 220 | """Check for unknown labels in the input data. 221 | 222 | Parameters: 223 | ----------- 224 | input : numpy.ndarray 225 | The input data containing labels. 226 | vocab : numpy.ndarray 227 | The vocabulary of known labels. 228 | name : str 229 | The name of the label (e.g., 'activity' or 'resource'). 230 | 231 | Returns: 232 | -------- 233 | input : numpy.ndarray 234 | The input data with unknown labels replaced by 'UNK'. 235 | """ 236 | unkown = set(input) - set(vocab) 237 | if unkown: 238 | warnings.warn( 239 | message=( 240 | f"The label '{name}' contains values unseen during fitting. These values will be set to 'UNK': {unkown}" 241 | ), 242 | category=ConceptDriftWarning, 243 | stacklevel=2, 244 | ) 245 | 246 | input = np.array(["UNK" if x in unkown else x for x in input]) 247 | # input = input.replace(unkown, "UNK") 248 | return input 249 | 250 | def _define_vocabs(self, unique_labels: np.ndarray): 251 | """Define vocabularies for unique labels. 252 | 253 | Parameters: 254 | ----------- 255 | unique_labels : numpy.ndarray 256 | An array containing unique labels. 257 | 258 | Returns: 259 | -------- 260 | stoi : dict 261 | A dictionary mapping labels to indices. 262 | itos : dict 263 | A dictionary mapping indices to labels. 
264 | """ 265 | stoi, itos = {"UNK": 0}, {0: "UNK"} 266 | stoi.update({label: i + 1 for i, label in enumerate(unique_labels)}) 267 | itos.update({i + 1: label for i, label in enumerate(unique_labels)}) 268 | return stoi, itos 269 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/event/time.py: -------------------------------------------------------------------------------- 1 | from skpm.config import EventLogConfig as elc 2 | 3 | class TimestampEventLevel: 4 | """ 5 | Provides methods to extract time-related features from the event level. 6 | 7 | Implementing event-level and case-level seperately makes code faster since here we do not need to group by case_id. 8 | 9 | """ 10 | TIME_UNIT_MULTIPLIER = { 11 | "s": 1, 12 | "m": 60, 13 | "h": 60 * 60, 14 | "d": 60 * 60 * 24, 15 | "w": 60 * 60 * 24 * 7, 16 | } 17 | 18 | @classmethod 19 | def sec_of_min(cls, X): 20 | """Second of minute encoded as value between [-0.5, 0.5]""" 21 | return X.dt.second / 59.0 - 0.5 22 | 23 | @classmethod 24 | def min_of_hour(cls, X): 25 | """Minute of hour encoded as value between [-0.5, 0.5]""" 26 | 27 | return X.dt.minute / 59.0 - 0.5 28 | 29 | @classmethod 30 | def hour_of_day(cls, X): 31 | """Hour of day encoded as value between [-0.5, 0.5]""" 32 | 33 | return X.dt.hour / 23.0 - 0.5 34 | 35 | @classmethod 36 | def day_of_week(cls, X): 37 | """Hour of day encoded as value between [-0.5, 0.5]""" 38 | 39 | return X.dt.dayofweek / 6.0 - 0.5 40 | 41 | @classmethod 42 | def day_of_month(cls, X): 43 | """Day of month encoded as value between [-0.5, 0.5]""" 44 | return (X.dt.day - 1) / 30.0 - 0.5 45 | 46 | @classmethod 47 | def day_of_year(cls, X): 48 | """Day of year encoded as value between [-0.5, 0.5]""" 49 | 50 | return (X.dt.dayofyear - 1) / 365.0 - 0.5 51 | 52 | @classmethod 53 | def week_of_year(cls, X): 54 | """Week of year encoded as value between [-0.5, 0.5]""" 55 | return (X.dt.isocalendar().week - 1) / 52.0 - 0.5 56 | 57 | @classmethod 58 | def month_of_year(cls, X): 59 | """Month of year encoded as value between [-0.5, 0.5]""" 60 | return (X.dt.month - 1) / 11.0 - 0.5 61 | 62 | @classmethod 63 | def secs_within_day(cls, X): 64 | """Extract the number of seconds elapsed within each day from the timestamps encoded as value between [-0.5, 0.5].""" 65 | return ( 66 | (X.dt.hour * 3600 + X.dt.minute * 60 + X.dt.second) / 86400 67 | ) - 0.5 68 | 69 | @classmethod 70 | def secs_since_sunday(cls, X): 71 | """Extract the number of seconds elapsed since the last Sunday from the timestamps encoded as value between [-0.5, 0.5].""" 72 | return ( 73 | (X.dt.hour * 3600 + X.dt.minute * 60 + X.dt.second) / 604800 74 | ) - 0.5 75 | 76 | @classmethod 77 | def numerical_timestamp(cls, X, time_unit="s"): 78 | """Numerical representation of the timestamp.""" 79 | return X.astype("int64") // 10**9 / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1) 80 | -------------------------------------------------------------------------------- /src/skpm/feature_extraction/targets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from skpm.config import EventLogConfig as elc 3 | 4 | 5 | def next_activity(log: pd.DataFrame): 6 | """Returns the next activity of each trace. 7 | 8 | Parameters 9 | ---------- 10 | log : pd.DataFrame 11 | An event log. 12 | 13 | Returns 14 | ------- 15 | pd.DataFrame 16 | A dataframe with the next activity of each trace. 
17 | """ 18 | return ( 19 | log.groupby(elc.case_id, observed=True, as_index=True)[elc.activity] 20 | .shift(-1, fill_value=elc.EOT) 21 | .values 22 | ) 23 | 24 | 25 | def remaining_time(log: pd.DataFrame, time_unit="s"): 26 | """Returns the remaining time of each trace. 27 | 28 | Parameters 29 | ---------- 30 | log : pd.DataFrame 31 | An event log. 32 | 33 | Returns 34 | ------- 35 | pd.DataFrame 36 | A dataframe with the remaining time of each trace. 37 | """ 38 | from skpm.feature_extraction import TimestampExtractor 39 | 40 | return TimestampExtractor( 41 | case_features=None, 42 | event_features=None, 43 | targets="remaining_time", 44 | time_unit=time_unit 45 | ).set_output(transform="default").fit_transform(log) -------------------------------------------------------------------------------- /src/skpm/feature_extraction/time.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Optional, Union 3 | 4 | import pandas as pd 5 | from sklearn.base import ( 6 | BaseEstimator, 7 | ClassNamePrefixFeaturesOutMixin, 8 | TransformerMixin, 9 | check_is_fitted, 10 | ) 11 | 12 | from skpm.config import EventLogConfig as elc 13 | from skpm.feature_extraction.case.time import TimestampCaseLevel 14 | from skpm.feature_extraction.event.time import TimestampEventLevel 15 | from skpm.utils import validate_columns, validate_methods_from_class 16 | 17 | def _to_list(x): 18 | if x == "all" or x is None: 19 | return x 20 | return [x] if not isinstance(x, list) else x 21 | 22 | class TimestampExtractor( 23 | ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator 24 | ): 25 | """Extracts features from a timestamp column. 26 | 27 | This class extracts various features and targets from a timestamp column in a DataFrame. 28 | 29 | The current targets are: `execution_time` and `remaining_time`. All the remaining attributes 30 | are considered as features. 31 | 32 | Parameters: 33 | ----------- 34 | case_features (Union[list, str], optional): List of case features to extract. Defaults to "all". 35 | event_features (Union[list, str], optional): List of event features to extract. Defaults to "all". 36 | targets (Union[list, str], optional): List of target features to extract. Defaults to None. 37 | time_unit (str, optional): Time unit for the features. Defaults to "s". 38 | 39 | 40 | Attributes: 41 | ----------- 42 | _n_features_out: int 43 | Number of features extracted. 44 | _n_targets_out: int 45 | Number of targets extracted. 46 | 47 | Methods: 48 | -------- 49 | fit(X, y=None): 50 | Fit the transformer to the input data. 51 | transform(X, y=None): 52 | Transform the input data to calculate timestamp features. 53 | get_feature_names_out(): 54 | Get the names of the features extracted. 55 | inverse_transform(X): 56 | Inverse transform the input data. 57 | 58 | Notes: 59 | ------ 60 | - This class requires a DataFrame with columns for case IDs and timestamps. 61 | - Validation of columns and timestamps is performed in the `fit` method. 62 | - Lowest scale is seconds. Nanoseconds, milliseconds, etc. are disregarded. 
63 | 64 | Examples: 65 | --------- 66 | >>> from skpm.feature_extraction import TimestampExtractor 67 | >>> import pandas as pd 68 | >>> # Assuming X is your dataframe containing event data with columns 'case_id' and 'timestamp' 69 | >>> X = pd.DataFrame({'case_id': [1, 1, 2, 2], 'timestamp': ['2023-01-01 10:30:00', '2023-01-01 11:00:00', '2023-01-01 09:00:00', '2023-01-01 09:30:00']}) 70 | >>> feature_extractor = TimestampExtractor() 71 | >>> feature_extractor.fit(X) 72 | >>> feature_extractor.transform(X) 73 | """ 74 | available_targets = ["execution_time", "remaining_time"] 75 | 76 | def __init__( 77 | self, 78 | case_features: Union[str, list, None] = "all", 79 | event_features: Union[str, list, None] = "all", 80 | targets: Optional[Union[str, list]] = None, 81 | time_unit: str = "s", 82 | ): 83 | # TODO: feature time unit (secs, hours, days, etc) 84 | # TODO: subset of features rather than all 85 | # TODO: param for event-level and case-level 86 | 87 | self.case_features = _to_list(case_features) 88 | self.event_features = _to_list(event_features) 89 | self.targets = _to_list(targets) 90 | self.time_unit = time_unit 91 | 92 | def fit( 93 | self, 94 | X: pd.DataFrame, 95 | y=None, 96 | ): 97 | """Fit the transformer to the input data. 98 | 99 | This method checks if the input is a DataFrame, validates the required columns, 100 | and resolves the desired features. 101 | 102 | Parameters: 103 | ----------- 104 | X : DataFrame 105 | Input DataFrame containing columns for case IDs and timestamps. 106 | y : None 107 | Ignored. 108 | 109 | Returns: 110 | -------- 111 | self : TimestampExtractor 112 | Fitted transformer instance. 113 | """ 114 | _ = self._validate_data(X) 115 | 116 | self.event_features = validate_methods_from_class( 117 | class_obj=TimestampEventLevel, 118 | methods=self.event_features 119 | ) 120 | self.case_features = validate_methods_from_class( 121 | class_obj=TimestampCaseLevel, 122 | methods=self.case_features, 123 | ) 124 | self.targets = validate_methods_from_class( 125 | class_obj=TimestampCaseLevel, 126 | methods=self.targets, 127 | ) 128 | 129 | self._n_features_out = len(self.event_features) + len( 130 | self.case_features 131 | ) 132 | self._n_targets_out = len(self.targets) 133 | 134 | if self._n_features_out + self._n_targets_out == 0: 135 | raise ValueError("No features selected. Please select at least one feature, either from the event level or the case level.") 136 | 137 | return self 138 | 139 | def get_feature_names_out(self): 140 | return [ 141 | f[0] for f in self.case_features + self.event_features + self.targets 142 | ] 143 | 144 | def transform(self, X: pd.DataFrame, y=None): 145 | """Transform the input data to calculate timestamp features. 146 | 147 | Parameters: 148 | ----------- 149 | X : DataFrame 150 | Input DataFrame containing columns for case IDs and timestamps. 151 | y : None 152 | Ignored. 153 | 154 | Returns: 155 | -------- 156 | X_tr : DataFrame 157 | Transformed data containing the calculated timestamp features (one column per selected feature and target).
158 | """ 159 | # Check if fit had been called 160 | check_is_fitted(self, "_n_features_out") 161 | 162 | # data validation 163 | X = self._validate_data(X) 164 | 165 | # case-level features 166 | self.group_ = X.groupby( 167 | elc.case_id, as_index=False, group_keys=False, observed=True 168 | ) 169 | for feature_name, feature_fn in self.case_features: 170 | X[feature_name] = feature_fn( 171 | case=self.group_, 172 | ix_list=X.index.values, 173 | time_unit=self.time_unit, 174 | ) 175 | 176 | # for event-level features 177 | for feature_name, feature_fn in self.event_features: 178 | sig = inspect.signature(feature_fn) 179 | if "time_unit" in sig.parameters: 180 | X[feature_name] = feature_fn(X[elc.timestamp], time_unit=self.time_unit) 181 | else: 182 | X[feature_name] = feature_fn(X[elc.timestamp]) 183 | 184 | # targets 185 | for feature_name, feature_fn in self.targets: 186 | X[feature_name] = feature_fn( 187 | case=self.group_, 188 | ix_list=X.index.values, 189 | time_unit=self.time_unit, 190 | ) 191 | output_columns = [ 192 | feature[0] 193 | for feature in self.case_features + self.event_features + self.targets 194 | ] 195 | return X.loc[:, output_columns].values 196 | 197 | def _validate_data(self, X: pd.DataFrame): 198 | """ 199 | Validates the input DataFrame and timestamp column. 200 | 201 | Parameters: 202 | ----------- 203 | X : DataFrame 204 | Input DataFrame containing columns for case IDs and timestamps. 205 | 206 | Returns: 207 | -------- 208 | X : DataFrame 209 | Validated DataFrame after processing. 210 | """ 211 | assert isinstance(X, pd.DataFrame), "Input must be a dataframe." 212 | x = X.copy() 213 | x.reset_index(drop=True, inplace=True) 214 | # x.columns = self._validate_columns(x.columns) 215 | valid_cols = validate_columns( 216 | input_columns=x.columns, required=[elc.case_id, elc.timestamp] 217 | ) 218 | x = x[valid_cols] 219 | 220 | # check if it is a datetime column 221 | x[elc.timestamp] = self._validate_timestamp_format(x) 222 | 223 | return x 224 | 225 | def _validate_timestamp_format( 226 | self, x: pd.DataFrame, timestamp_format: str = "%Y-%m-%d %H:%M:%S" 227 | ): 228 | """ 229 | Validates the format of the timestamp column. 230 | 231 | Parameters: 232 | ----------- 233 | x : DataFrame 234 | DataFrame containing columns for case IDs and timestamps. 235 | timestamp_format : str, optional 236 | Expected format of the timestamp, by default "%Y-%m-%d %H:%M:%S". 237 | 238 | Returns: 239 | -------- 240 | x[elc.timestamp] : Series 241 | Series containing the validated timestamps. 242 | """ 243 | if not x[elc.timestamp].dtype == "datetime64[ns]": 244 | # pd = check_pandas_support( 245 | # "'pandas' not found. Please install it to use this method." 246 | # ) 247 | try: 248 | # for now, since we are only employing the BPI event logs, 249 | # we are assuming that the datetime format is '%Y-%m-%d %H:%M:%S'. 250 | # TODO: validate alternative datetime formats. 251 | # '%Y-%m-%d %H:%M:%S' format should be mandatory 252 | x[elc.timestamp] = pd.to_datetime( 253 | x[elc.timestamp], format=timestamp_format 254 | ) 255 | except: 256 | raise ValueError( 257 | f"Column '{elc.timestamp}' is not a valid datetime column." 
258 | ) 259 | 260 | # TODO: ensure datetime format 261 | # try: 262 | # # Attempt to parse the datetime string with the specified format 263 | # datetime_obj = datetime.strptime(x, '%Y-%m-%d %H:%M:%S') 264 | # print(f"'{x}' is a valid datetime with the correct format: {datetime_obj}") 265 | # except ValueError: 266 | # print(f"'{x}' is not in the correct format '%Y-%m-%d %H:%M:%S'") 267 | # pass 268 | return x[elc.timestamp] 269 | 270 | -------------------------------------------------------------------------------- /src/skpm/sequence_encoding/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregation import Aggregation 2 | from .index import Indexing 3 | from .bucketing import Bucketing 4 | 5 | __all__ = ["Aggregation", "Indexing", "Bucketing"] -------------------------------------------------------------------------------- /src/skpm/sequence_encoding/aggregation.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Union 2 | 3 | import pandas as pd 4 | import polars as pl 5 | from sklearn.base import OneToOneFeatureMixin, TransformerMixin 6 | from sklearn.utils._param_validation import StrOptions 7 | from sklearn.utils.validation import check_is_fitted 8 | 9 | from skpm.base import BaseProcessEstimator 10 | from skpm.config import EventLogConfig as elc 11 | 12 | def handle_aggregation_method(method): 13 | """Handle the aggregation method. 14 | 15 | Parameters 16 | ---------- 17 | method : str 18 | The aggregation method to be handled. 19 | 20 | Returns 21 | ------- 22 | str, callable 23 | The aggregation method that pandas or polars can use. 24 | """ 25 | if method == "norm": 26 | from numpy import linalg 27 | return linalg.norm 28 | return method 29 | 30 | class Aggregation(OneToOneFeatureMixin, TransformerMixin, BaseProcessEstimator): 31 | """Sequence Encoding Transformer. 32 | 33 | This module implements a method for encoding sequences by 34 | aggregating features. It adapts the approach from a 35 | research paper [1] that abstracts event sequences by 36 | disregarding their order and using aggregation functions. 37 | Common aggregation functions include frequency-based 38 | methods for categorical features and general statistics 39 | (average, sum, etc.) for numeric attributes. 40 | 41 | In our implementation, we assume that categorical 42 | features are already encoded, and we 43 | apply aggregation methods accordingly: frequency 44 | aggregation for integer (categorical) features and 45 | general statistical measures for float (numerical) 46 | features. This design choice allows flexibility in 47 | aggregating user-engineered features, not limited to 48 | one-hot encoding as described in the original 49 | paper [1]. 50 | 51 | 52 | Parameters 53 | ---------- 54 | method : str, default="mean" 55 | The method to aggregate features. 56 | Possible values: "sum", "mean", "median", "norm". 57 | prefix_len : int, default=None 58 | The length of the rolling prefix window used for aggregation. If None, all events of the trace seen so far are aggregated (full prefix). 59 | engine : str, default="pandas" 60 | The DataFrame engine to use. Supported engines are "pandas" and "polars". 61 | 62 | References 63 | ---------- 64 | [1] Outcome-Oriented Predictive Process Monitoring: Review and Benchmark, Teinemaa, I., Dumas, M., Maggi, F. M., & La Rosa, M. (2019).
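    In addition to the pandas example below, a brief sketch of the polars engine combined with a
    non-default aggregation method is shown here. It is illustrative only (random integer features
    standing in for already-encoded attributes); as implemented here, `transform` returns a pandas
    DataFrame with one encoded row per event regardless of the engine:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from skpm.sequence_encoding import Aggregation
    >>> from skpm.config import EventLogConfig as elc
    >>> df = pd.DataFrame({
    ...     elc.case_id: np.repeat([0, 1], 5),
    ...     elc.activity: np.random.randint(0, 3, 10),
    ...     elc.resource: np.random.randint(0, 2, 10),
    ... })
    >>> agg = Aggregation(method="norm", prefix_len=3, engine="polars")
    >>> encoded = agg.fit_transform(df)  # rolling L2 norm over a 3-event window per case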
65 | 66 | Examples 67 | -------- 68 | >>> import numpy as np 69 | >>> import pandas as pd 70 | >>> from skpm.sequence_encoding import Aggregation 71 | >>> from skpm.config import EventLogConfig as elc 72 | >>> df = pd.DataFrame({ 73 | ... elc.timestamp: np.arange(10), 74 | ... elc.activity: np.random.randint(0, 10, 10), 75 | ... elc.resource: np.random.randint(0, 3, 10), 76 | ... elc.case_id: np.random.randint(0, 3, 10), 77 | ... }).sort_values(by=[elc.case_id, elc.timestamp]) 78 | >>> df = pd.get_dummies(df, columns=[elc.activity,elc.resource], dtype=int) 79 | >>> df = df.drop(elc.timestamp, axis=1) 80 | >>> Aggregation().fit_transform(df) 81 | """ 82 | 83 | _case_id = elc.case_id 84 | _parameter_constraints = { 85 | "method": [ 86 | StrOptions({"sum", "mean", "median", "norm"}), 87 | ], 88 | "engine": [ 89 | StrOptions({"pandas", "polars"}), 90 | ], 91 | } 92 | 93 | def __init__( 94 | self, 95 | method: str = "mean", 96 | prefix_len: int = None, 97 | # n_jobs=1, 98 | engine: Literal[ 99 | "pandas", "polars" 100 | ] = "pandas", # Default to Pandas DataFrame 101 | ) -> None: 102 | self.method = method 103 | self.prefix_len = prefix_len 104 | self.engine = engine 105 | 106 | def validate_engine_with_df(self, X, y=None): 107 | if ( 108 | self.engine == "pandas" 109 | and not isinstance(X, pd.DataFrame) 110 | ): 111 | X = pd.DataFrame(X) 112 | y = pd.DataFrame(y) if y is not None else None 113 | elif ( 114 | self.engine == "polars" 115 | and not isinstance(X, pl.DataFrame) 116 | ): 117 | X = pl.DataFrame(X) 118 | y = pl.DataFrame(y) if y is not None else None 119 | return X, y 120 | 121 | def fit(self, X, y=None): 122 | """Fit transformer. 123 | 124 | Checks if the input is a dataframe and if it 125 | contains the required case id column, and 126 | resolves the prefix length. 127 | 128 | Parameters 129 | ---------- 130 | X : {DataFrame} of shape (n_samples, n_features+1) 131 | The data must contain `n_features` plus a column with case ids. 132 | y : None. 133 | Ignored. 134 | 135 | Returns 136 | ------- 137 | self : object 138 | Fitted aggregator. 139 | 140 | """ 141 | X = self._validate_log(X) 142 | 143 | if self.prefix_len is None: 144 | self.prefix_len = len(X) 145 | 146 | return self 147 | 148 | def transform(self, X: Union[pd.DataFrame, pl.DataFrame], y=None): 149 | """Performs the aggregation of event features from a trace. 150 | 151 | Parameters 152 | ---------- 153 | X : {DataFrame} of shape (n_samples, n_features+1) 154 | An event log. It must contain n_features + 1 columns, 155 | representing the case id and the event features. 156 | 157 | Returns 158 | ------- 159 | X : {DataFrame} of shape (n_samples, n_features) 160 | The aggregated event log.
161 | """ 162 | check_is_fitted(self, "n_features_in_") 163 | X = self._validate_log(X) 164 | 165 | X, y = self.validate_engine_with_df(X, y) 166 | self._method_fn = handle_aggregation_method(self.method) 167 | if self.engine == "pandas": # If using Pandas DataFrame 168 | if isinstance(X, pl.DataFrame): 169 | X = X.to_pandas() 170 | return self._transform_pandas(X) 171 | 172 | else: 173 | if isinstance(X, pd.DataFrame): 174 | X = pl.DataFrame(X) 175 | X = self._transform_polars(X) 176 | return X.to_pandas() 177 | 178 | def _transform_pandas(self, X: pd.DataFrame): 179 | """Transforms Pandas DataFrame.""" 180 | group = X.groupby(self._case_id) 181 | 182 | X = ( 183 | group.rolling(window=self.prefix_len, min_periods=1) 184 | .agg(self._method_fn) 185 | .reset_index(drop=True) 186 | ) 187 | return X 188 | 189 | def _transform_polars(self, X: pl.DataFrame): 190 | """Transforms Polars DataFrame.""" 191 | 192 | def _make_rolling_expr(col_name: str, method_fn: Union[str, callable]) -> pl.Expr: 193 | expr = pl.col(col_name) 194 | 195 | if isinstance(method_fn, str): 196 | builtin = f"rolling_{method_fn}" 197 | fn = getattr(expr, builtin) 198 | return fn(window_size=self.prefix_len, min_samples=1) 199 | else: 200 | expr = pl.col(col_name).cast(pl.Float32) 201 | return expr.rolling_map( 202 | function=method_fn, 203 | window_size=self.prefix_len, 204 | min_samples=1 205 | ) 206 | 207 | X = X.with_columns([ 208 | _make_rolling_expr(c, self._method_fn).over(self._case_id) 209 | for c in X.columns 210 | if c != self._case_id 211 | ]) 212 | 213 | return X.drop(self._case_id) 214 | -------------------------------------------------------------------------------- /src/skpm/sequence_encoding/bucketing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import TransformerMixin 3 | from skpm.config import EventLogConfig as elc 4 | from skpm.base import BaseProcessEstimator 5 | 6 | 7 | class Bucketing(TransformerMixin, BaseProcessEstimator): 8 | """ 9 | Event Bucketing Transformer inherits from :class:`sklearn.base.TransformerMixin` and :class:`skpm.base.BaseProcessEstimator`. 10 | 11 | This class implements a method for bucketing traces based on different strategies. 12 | 13 | Parameters 14 | ---------- 15 | method : str, optional 16 | The method used for bucketing traces. Possible values are "single", "prefix", or "clustering". 17 | Default is "single". 18 | 19 | - "single": Assigns all events to a single bucket. 20 | - "prefix": Groups events based on the order in which they occur within each case, assigning sequential buckets. 21 | - "clustering": Not implemented yet, but intended to assign buckets based on clustering of event features. 22 | 23 | Methods 24 | ------- 25 | fit(X, y=None) 26 | Fit the transformer. 27 | 28 | transform(X, y=None) 29 | Transform input data by bucketing traces. 30 | 31 | get_feature_names_out() 32 | Get the names of the output features. 33 | """ 34 | 35 | def __init__(self, method="single"): 36 | """ 37 | Initialize Bucketing Transformer. 38 | 39 | Parameters 40 | ---------- 41 | method : str, optional 42 | The method used for bucketing traces. Possible values are "single", "prefix", or "clustering". 43 | Default is "single". 44 | """ 45 | assert method in [ 46 | "single", 47 | "prefix", 48 | "clustering", 49 | ], f"Invalid method: {method}" 50 | 51 | self.method = method 52 | 53 | def fit(self, X, y=None): 54 | """ 55 | Fit the transformer. 
56 | 57 | Parameters 58 | ---------- 59 | X : array-like or DataFrame 60 | The input data. 61 | 62 | Returns 63 | ------- 64 | self : Bucketing 65 | Returns the instance itself. 66 | """ 67 | return self 68 | 69 | def transform(self, X, y=None): 70 | """ 71 | Transform input data by bucketing traces. 72 | 73 | Parameters 74 | ---------- 75 | X : array-like or DataFrame 76 | The input data. 77 | 78 | Returns 79 | ------- 80 | bucket_labels : array 81 | An array containing the bucket labels assigned to each event. 82 | """ 83 | if self.method == "single": 84 | # For the single method, assign all events to a single bucket. 85 | bucket_labels = np.array(["b1"] * len(X)) 86 | elif self.method == "prefix": 87 | # For the prefix method, group events by case ID and assign sequential buckets. 88 | bucket_labels = ( 89 | X.groupby(elc.case_id) 90 | .cumcount() 91 | .apply(lambda x: f"b{x + 1}") 92 | .values 93 | ) 94 | elif self.method == "clustering": 95 | # Clustering method is not implemented yet. 96 | raise NotImplementedError( 97 | "Clustering method is not implemented yet" 98 | ) 99 | 100 | return bucket_labels 101 | 102 | def get_feature_names_out(self): 103 | """ 104 | Get the names of the output features. 105 | 106 | Returns 107 | ------- 108 | feature_names : list 109 | A list containing the name of the output feature. 110 | """ 111 | return ["bucket"] 112 | -------------------------------------------------------------------------------- /src/skpm/sequence_encoding/index.py: -------------------------------------------------------------------------------- 1 | from skpm.config import EventLogConfig as elc 2 | from sklearn.base import TransformerMixin, _fit_context 3 | from skpm.base import BaseProcessEstimator 4 | from sklearn.utils._param_validation import Interval, _IterablesNotString, Options 5 | from numbers import Integral, Real 6 | import numpy as np 7 | import pandas as pd 8 | from typing import Union 9 | 10 | class Indexing(TransformerMixin, BaseProcessEstimator): 11 | _parameter_constraints = { 12 | "n": [Interval(type=Integral, left=1, right=None, closed="left"), None], 13 | "attributes": [str, list, None], 14 | "fill_value": [Real, None], 15 | } 16 | def __init__(self, n: int = 2, attributes: Union[str, list] = None, fill_value: int = None): 17 | self.n = n 18 | self.attributes = attributes 19 | self.fill_value = fill_value 20 | 21 | @_fit_context(prefer_skip_nested_validation=True) 22 | def fit(self, X: pd.DataFrame, y=None): 23 | if isinstance(self.attributes, str): 24 | self.attributes = [self.attributes] 25 | 26 | if self.attributes is None: 27 | self.attributes = X.columns.difference([elc.case_id]).tolist() 28 | return self 29 | 30 | def transform(self, X: pd.DataFrame, y=None): 31 | group = X.groupby(elc.case_id) 32 | 33 | out_df = pd.DataFrame() 34 | lags = range(self.n) 35 | for col in self.attributes: 36 | lagged_cols = [f"{col}_lag_{lag}" for lag in lags] 37 | out_df[lagged_cols] = group[col].shift(lags, fill_value=self.fill_value) 38 | 39 | return out_df -------------------------------------------------------------------------------- /src/skpm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .validation import ( 2 | validate_methods_from_class, 3 | validate_columns, 4 | ensure_list, 5 | ) 6 | 7 | __all__ = ["validate_methods_from_class", "validate_columns", "ensure_list"] 8 | -------------------------------------------------------------------------------- /src/skpm/utils/graph.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | __all__ = ["frequency_matrix", "node_degree"] 4 | 5 | 6 | def frequency_matrix( 7 | traces: list, set_of_states: set 8 | ) -> tuple[np.ndarray, dict, dict]: 9 | """ 10 | Returns a transition frequency matrix. 11 | 12 | This function takes a list of traces, where each trace 13 | is an ordered sequence of states, and computes a transition 14 | frequency matrix. 15 | 16 | States can be any hashable object, but they must be comparable. 17 | For instance, a state can be a string, an integer, or a tuple. 18 | 19 | Parameters 20 | ---------- 21 | traces : list of list of states 22 | A list of traces, where each trace is a list of states. 23 | set_of_states : set of states 24 | A set of all possible states. 25 | 26 | Returns 27 | ------- 28 | freq_matrix : numpy.ndarray 29 | A transition frequency matrix. 30 | 31 | stoi : dict 32 | A dictionary mapping states to indices. 33 | 34 | itos : dict 35 | A dictionary mapping indices to states. 36 | 37 | Examples 38 | -------- 39 | >>> traces = [[1, 2, 3], [1, 2, 3, 4]] 40 | >>> set_of_states = {1, 2, 3, 4} 41 | >>> frequency_matrix(traces, set_of_states) 42 | (array([[0, 2, 0, 0], 43 | [0, 0, 2, 0], 44 | [0, 0, 0, 1], 45 | [0, 0, 0, 0]]), 46 | {1: 0, 2: 1, 3: 2, 4: 3}, 47 | {0: 1, 1: 2, 2: 3, 3: 4}) 48 | 49 | >>> traces = [["a", "b", "c"], ["a", "b", "c", "d"]] 50 | >>> set_of_states = {"a", "b", "c", "d"} 51 | >>> frequency_matrix(traces, set_of_states) 52 | (array([[0, 0, 2, 0], 53 | [2, 0, 0, 0], 54 | [0, 0, 0, 1], 55 | [0, 0, 0, 0]]), 56 | {'b': 0, 'a': 1, 'c': 2, 'd': 3}, 57 | {0: 'b', 1: 'a', 2: 'c', 3: 'd'}) 58 | 59 | >>> traces = [[("a", "b"), ("b", "c")], [("a", "b"), ("b", "c"), ("c", "d")]] 60 | >>> set_of_states = {("a", "b"), ("b", "c"), ("c", "d")} 61 | >>> frequency_matrix(traces, set_of_states) 62 | (array([[0, 0, 0], 63 | [1, 0, 0], 64 | [0, 2, 0]]), 65 | {('c', 'd'): 0, ('b', 'c'): 1, ('a', 'b'): 2}, 66 | {0: ('c', 'd'), 1: ('b', 'c'), 2: ('a', 'b')}) 67 | 68 | """ 69 | stoi = {value: ix for ix, value in enumerate(set_of_states)} 70 | itos = {ix: value for value, ix in stoi.items()} 71 | freq_matrix = np.zeros((len(stoi), len(stoi)), dtype=np.int32) 72 | 73 | for transition in traces: 74 | for origin, destiny in zip(transition, transition[1:]): 75 | freq_matrix[stoi[origin], stoi[destiny]] += 1 76 | 77 | return freq_matrix, stoi, itos 78 | 79 | 80 | def node_degree(frequency_matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray]: 81 | """ 82 | Returns the in-degree and out-degree of each node. 83 | 84 | Parameters 85 | ---------- 86 | frequency_matrix : numpy.ndarray 87 | A graph as a transition frequency matrix. 88 | 89 | Returns 90 | ------- 91 | in_degree : numpy.ndarray 92 | An array with the in-degree of each node. 93 | 94 | out_degree : numpy.ndarray 95 | An array with the out-degree of each node. 96 | """ 97 | in_degree = frequency_matrix.sum(axis=0) 98 | out_degree = frequency_matrix.sum(axis=1) 99 | 100 | return in_degree, out_degree 101 | 102 | 103 | def density(graph): 104 | """ 105 | Returns the density of a graph. 106 | 107 | Parameters 108 | ---------- 109 | graph : numpy.ndarray 110 | A graph as a transition frequency matrix. 111 | 112 | Returns 113 | ------- 114 | density : float 115 | The density of the graph. 
116 | """ 117 | n_nodes = graph.shape[0] 118 | n_edges = graph.sum(axis=None) 119 | max_edges = n_nodes * (n_nodes - 1) 120 | density = n_edges / max_edges 121 | return density 122 | 123 | 124 | def nodes_in_cycles(frequency_matrix, max_cycle_length): 125 | """ 126 | Returns a list of whether each node is in a cycle. 127 | 128 | Notice: this function actually returns self-loops, not cycles. 129 | By definition, a cycle is a path that starts and ends at the same node 130 | and visits each node at most once. A self-loop is an edge that connects 131 | a node to itself. A self-loop is a cycle of length 1. 132 | 133 | 134 | Parameters 135 | ---------- 136 | frequency_matrix : numpy.ndarray 137 | A graph as a transition frequency matrix. 138 | 139 | max_cycle_length: int 140 | The maximum length of a cycle to be counted. 141 | 142 | Returns 143 | ------- 144 | in_cycle : list of bool 145 | A list of whether each node is in a cycle. 146 | 147 | """ 148 | frequency_matrix = np.array(frequency_matrix) 149 | num_nodes = frequency_matrix.shape[0] 150 | in_cycle = [ 151 | False 152 | ] * num_nodes # Initialize list to store whether each node is in a cycle 153 | 154 | for n in range(2, max_cycle_length + 1): 155 | matrix_power = np.linalg.matrix_power(frequency_matrix, n) 156 | for i in range(num_nodes): 157 | if matrix_power[i, i] > 0: 158 | in_cycle[i] = ( 159 | True # Mark node i as in a cycle if diagonal entry is non-zero 160 | ) 161 | 162 | return in_cycle 163 | -------------------------------------------------------------------------------- /src/skpm/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | # def flatten_list(l: list): 5 | # return [item for sublist in l for item in sublist] 6 | 7 | 8 | def infer_column_types(df, int_as_cat=False) -> tuple: 9 | """Infer column types from a dataframe.""" 10 | if isinstance(df, pl.DataFrame): # For Polars DataFrame 11 | df = df.to_pandas() 12 | cat_cols = ["object", "category"] 13 | if int_as_cat: 14 | cat_cols.append("int") 15 | cat = df.select_dtypes(include=cat_cols).columns.tolist() 16 | num = df.select_dtypes(include=["number"]).columns.tolist() 17 | time = df.select_dtypes( 18 | include=[ 19 | "datetime", 20 | "datetime64", 21 | "datetimetz", 22 | "datetime64[ns]", 23 | "timedelta", 24 | "timedelta64", 25 | ] 26 | ).columns.tolist() 27 | 28 | # remove the int columns from num if int_as_cat is True 29 | if int_as_cat: 30 | num = list(set(num) - set(cat)) 31 | 32 | return cat, num, time 33 | -------------------------------------------------------------------------------- /src/skpm/utils/validation.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Iterable, Union, Any 3 | 4 | 5 | def validate_methods_from_class( 6 | class_obj: Any, methods: Union[str, list[str]] = "all" 7 | ) -> list[tuple[str, callable]]: 8 | """Validate methods from a class. 9 | 10 | Args: 11 | class_obj (Any): a class object cotaining class methods. 12 | methods (Union[str, list[str]]), {"all", str, list[str]}: a list of methods to validate. 13 | 14 | Returns: 15 | list[tuple[str, callable]]: a list of tuples 16 | containing the name of the methods and the callable. 
17 | """ 18 | available_methods = inspect.getmembers( 19 | class_obj, predicate=inspect.ismethod 20 | ) 21 | out_methods = [] 22 | if methods == "all": 23 | out_methods = available_methods 24 | else: 25 | if not isinstance(methods, (tuple, list)): 26 | methods = [methods] 27 | for f in available_methods: 28 | if f[0] in methods and not f[0].startswith("_"): 29 | out_methods.append(f) 30 | 31 | return out_methods 32 | 33 | 34 | def validate_columns(input_columns: Iterable, required: list) -> list: 35 | """Validate required columns. 36 | 37 | This method checks if the input columns 38 | contain the required columns. 39 | 40 | Args: 41 | input_columns (Iterable): Input columns. 42 | required (list): Required columns. 43 | 44 | Raises: 45 | ValueError: If the input is missing any 46 | of the required columns. 47 | 48 | Returns: 49 | list: the input columns 50 | """ 51 | diff = set(required) - set(input_columns) 52 | if diff: 53 | raise ValueError(f"Input is missing the following columns: {diff}.") 54 | return required 55 | 56 | 57 | def ensure_list(input: Any) -> list: 58 | """Ensure input is a list. 59 | 60 | Args: 61 | input (Any): Input to be converted to a list. 62 | 63 | Returns: 64 | list: Input as a list. 65 | """ 66 | if not isinstance(input, list): 67 | if isinstance(input, (str, int)): 68 | input = [input] 69 | else: 70 | input = list(input) 71 | return input 72 | -------------------------------------------------------------------------------- /src/skpm/warnings.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | class ConceptDriftWarning(UserWarning): 4 | pass -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/__init__.py -------------------------------------------------------------------------------- /tests/event_logs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/event_logs/__init__.py -------------------------------------------------------------------------------- /tests/event_logs/test_bpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import pandas as pd 4 | from skpm.event_logs import BPI13ClosedProblems, BPI19 5 | 6 | """Shapes to validate the logs:""" 7 | # BPI12 8 | # (262200, 7) 9 | # (117546, 7) (55952, 7) 10 | # ======================================== 11 | # BPI13ClosedProblems 12 | # (6660, 12) 13 | # Unbiased split parameters not supported. 14 | # ======================================== 15 | # BPI13Incidents 16 | # (65533, 12) 17 | # Unbiased split parameters not supported. 18 | # ======================================== 19 | # BPI13OpenProblems 20 | # (2351, 11) 21 | # Unbiased split parameters not supported. 
22 | # ======================================== 23 | # BPI17 24 | # (1202267, 19) 25 | # (805809, 19) (294882, 19) 26 | # ======================================== 27 | # BPI19 28 | # (1595923, 21) 29 | # (538293, 21) (538880, 21) 30 | # ======================================== 31 | # BPI20PrepaidTravelCosts 32 | # (18246, 22) 33 | # (10809, 22) (4928, 22) 34 | # ======================================== 35 | # BPI20TravelPermitData 36 | # (86581, 173) 37 | # (51878, 173) (32390, 173) 38 | # ======================================== 39 | # BPI20RequestForPayment 40 | # (36796, 14) 41 | # (23295, 14) (7053, 14) 42 | # ======================================== 43 | # BPI20DomesticDeclarations 44 | # (56437, 10) 45 | # Unbiased split parameters not supported. 46 | # ======================================== 47 | # BPI20InternationalDeclarations 48 | # (72151, 23) 49 | # Unbiased split parameters not supported. 50 | # ======================================== 51 | # Sepsis 52 | # (15214, 32) 53 | # Unbiased split parameters not supported. 54 | # ======================================== 55 | 56 | from tempfile import TemporaryDirectory 57 | 58 | 59 | def test_bpi(): 60 | with TemporaryDirectory() as tmpdirname: 61 | bpi = BPI13ClosedProblems(root_folder=tmpdirname) 62 | 63 | assert isinstance(bpi.dataframe, pd.DataFrame) 64 | assert isinstance(bpi.__repr__(), str) 65 | assert isinstance(len(bpi.dataframe), int) 66 | 67 | # covering pytest when the file already exists 68 | bpi = BPI13ClosedProblems(bpi.file_path) 69 | -------------------------------------------------------------------------------- /tests/event_logs/test_download_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | from skpm.event_logs.extract import extract_gz 3 | from skpm.event_logs.download import download_url 4 | 5 | 6 | def _download(test_folder: str): 7 | url = "https://data.4tu.nl/file/1987a2a6-9f5b-4b14-8d26-ab7056b17929/8b99119d-9525-452e-bc8f-236ac76fa9c9" 8 | file_name = "BPI_Challenge_2013_closed_problems.xes.gz" 9 | output_fold_download = download_url( 10 | url, folder=test_folder, file_name=file_name 11 | ) 12 | exists = os.path.exists(output_fold_download) 13 | assert exists 14 | 15 | output_fold_extract = extract_gz( 16 | path=output_fold_download, folder=os.path.dirname(output_fold_download) 17 | ) 18 | extracted_exists = os.path.exists(output_fold_download.replace(".gz", "")) 19 | assert extracted_exists 20 | 21 | duplicated = download_url(url, folder=test_folder, file_name=file_name) 22 | assert duplicated == output_fold_download 23 | 24 | no_file_name = download_url(url, folder=".", file_name=None) 25 | assert os.path.isfile(no_file_name) 26 | os.remove(no_file_name) 27 | 28 | if exists: 29 | base_output_fold_download = os.path.abspath( 30 | os.path.dirname(output_fold_download) 31 | ) 32 | if base_output_fold_download != os.getcwd(): 33 | import shutil 34 | 35 | shutil.rmtree(base_output_fold_download) 36 | else: 37 | os.remove(output_fold_download) 38 | os.remove(output_fold_extract) 39 | 40 | 41 | def test_download_extract(): 42 | _download(test_folder="test_download_skpm") 43 | _download(test_folder=None) 44 | _download(test_folder=".") -------------------------------------------------------------------------------- /tests/event_logs/test_parser.py: -------------------------------------------------------------------------------- 1 | # from skpm.event_logs.parser import read_xes 2 | # from skpm.event_logs import ( 3 | # BPI12, 4 | # BPI13ClosedProblems, 5 
| # BPI13Incidents, 6 | # BPI17, 7 | # BPI19, 8 | # BPI20, 9 | # ) 10 | 11 | 12 | def test_rerad_xes(): 13 | assert True 14 | 15 | 16 | # """ToDo 17 | 18 | # I gotta learn how to cache files on GitHub Actions. 19 | # """ 20 | 21 | # logs = ( 22 | # BPI12, 23 | # BPI13ClosedProblems, 24 | # BPI13Incidents, 25 | # BPI17, 26 | # BPI19, 27 | # # BPI20, 28 | # ) 29 | 30 | # shapes = { 31 | # "BPI12": (262200, 7), 32 | # "BPI13ClosedProblems": (6660, 12), 33 | # "BPI13Incidents": (65533, 12), 34 | # "BPI17": (1202267, 19), 35 | # "BPI19": (1595923, 21), 36 | # } 37 | 38 | # for l in logs: 39 | # df = l() 40 | # assert df.log.shape == shapes[l.__name__] 41 | -------------------------------------------------------------------------------- /tests/feature_extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/feature_extraction/__init__.py -------------------------------------------------------------------------------- /tests/feature_extraction/case/test_variant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import pytest 5 | from skpm.feature_extraction.case import VariantExtractor 6 | from skpm.config import EventLogConfig as elc 7 | 8 | 9 | def test_variants(): 10 | n_cases = 100 11 | np.random.seed(42) 12 | dummy_data = pd.DataFrame( 13 | { 14 | elc.case_id: np.random.randint(0, n_cases, 1000), 15 | elc.activity: np.random.randint(0, 10, 1000), 16 | } 17 | ) 18 | 19 | rp = VariantExtractor() 20 | rp.fit(dummy_data) 21 | df = rp.transform(dummy_data) 22 | assert df.variant.nunique() == n_cases 23 | 24 | inv_t = rp.inverse_transform(df.variant) 25 | assert inv_t.shape == (n_cases,) 26 | assert isinstance(inv_t[0], tuple) -------------------------------------------------------------------------------- /tests/feature_extraction/event/test_resource.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import pytest 5 | from skpm.feature_extraction.event import ResourcePoolExtractor 6 | from skpm.config import EventLogConfig as elc 7 | 8 | 9 | def test_resource(): 10 | dummy_data = pd.DataFrame( 11 | { 12 | elc.activity: np.random.randint(0, 10, 1000), 13 | elc.resource: np.random.randint(0, 3, 1000), 14 | } 15 | ) 16 | 17 | dummy_data_test = pd.DataFrame( 18 | { 19 | elc.activity: np.random.randint(0, 10, 100), 20 | elc.resource: np.random.randint(0, 3, 100), 21 | } 22 | ) 23 | 24 | rp = ResourcePoolExtractor() 25 | rp.fit(dummy_data) 26 | out = rp.transform(dummy_data) 27 | assert isinstance(out, pd.DataFrame) 28 | assert out.shape[1] == 1 29 | assert out.columns.tolist() == ["resource_roles"] 30 | 31 | test_out = rp.transform(dummy_data_test) 32 | assert test_out.shape[0] == dummy_data_test.shape[0] 33 | 34 | with pytest.raises(Exception): 35 | dummy_data_test[elc.resource] = dummy_data_test[elc.resource].replace( 36 | 2, np.nan 37 | ) 38 | rp.transform(dummy_data_test[[elc.activity, elc.resource]]) 39 | 40 | with pytest.warns(): 41 | dummy_data_test[elc.resource] = dummy_data_test[elc.resource].fillna( 42 | 100 43 | ) 44 | test_out = rp.transform(dummy_data_test) 45 | 46 | with pytest.raises(Exception): 47 | dummy_data_test[elc.activity] = dummy_data_test[elc.activity].replace( 48 | 2, np.nan 49 | ) 50 | rp.transform(dummy_data_test[[elc.activity, elc.resource]]) 51 | 
-------------------------------------------------------------------------------- /tests/feature_extraction/event/test_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import datetime as dt 4 | 5 | import pytest 6 | from skpm.feature_extraction import TimestampExtractor 7 | from skpm.config import EventLogConfig as elc 8 | 9 | @pytest.fixture(name="dummy_data") 10 | def fixture_dummy_pd(): 11 | return pd.DataFrame( 12 | { 13 | elc.case_id: np.repeat(np.arange(0, 10), 100), 14 | elc.activity: np.random.randint(0, 10, 1000), 15 | elc.timestamp: pd.date_range( 16 | start="1/1/2020", periods=1000, 17 | ), 18 | } 19 | ) 20 | 21 | def test_time(dummy_data): 22 | # test TimeStampExtractor 23 | t = TimestampExtractor() 24 | t.fit(dummy_data) 25 | out = t.transform(dummy_data) 26 | assert out.shape[1] == t._n_features_out 27 | assert isinstance(out, pd.DataFrame) 28 | 29 | t = TimestampExtractor(case_features="execution_time", event_features=None) 30 | t.fit(dummy_data) 31 | out = t.transform(dummy_data) 32 | assert out.shape[1] == 1 33 | assert isinstance(out, pd.DataFrame) 34 | 35 | t = TimestampExtractor(case_features="execution_time", event_features=["month_of_year", "day_of_week"]) 36 | t.fit(dummy_data) 37 | out = t.transform(dummy_data) 38 | assert out.shape[1] == 1 + 2 39 | assert isinstance(out, pd.DataFrame) 40 | 41 | with pytest.raises(Exception): 42 | t = TimestampExtractor(case_features=None, event_features=None) 43 | t.fit(dummy_data) 44 | out = t.transform(dummy_data) 45 | 46 | dummy_data = pd.DataFrame( 47 | { 48 | elc.case_id: [1, 1, 1, 2, 2, 2], 49 | elc.timestamp: ["aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", ""], 50 | } 51 | ) 52 | t = TimestampExtractor() 53 | with pytest.raises(Exception): 54 | t.fit(dummy_data[[elc.case_id, elc.timestamp]]) 55 | -------------------------------------------------------------------------------- /tests/feature_extraction/event/test_wip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from skpm.feature_extraction.event import WorkInProgress 5 | from skpm.config import EventLogConfig as elc 6 | 7 | 8 | def test_wip(): 9 | # Test with random data 10 | dummy_log = pd.DataFrame( 11 | { 12 | elc.case_id: np.random.randint(1, 10, 100), 13 | elc.timestamp: pd.date_range("2021-01-01", periods=100, freq="6h"), 14 | elc.activity: np.random.choice(["a", "b", "c"], 100), 15 | } 16 | ).sort_values(elc.timestamp) 17 | 18 | # Test fit_transform with default window_size 19 | wip = WorkInProgress() 20 | wip_values = wip.fit_transform(dummy_log) 21 | assert isinstance(wip_values, pd.DataFrame) 22 | assert wip_values.shape == (len(dummy_log), 1) 23 | 24 | # Test fit_transform with different window_size 25 | wip = WorkInProgress(window_size="2D") 26 | wip_values = wip.fit_transform(dummy_log) 27 | assert isinstance(wip_values, pd.DataFrame) 28 | assert wip_values.shape == (len(dummy_log), 1) 29 | 30 | # Test set_output with transform="pandas" 31 | wip_df = WorkInProgress().fit(dummy_log).transform(dummy_log) 32 | assert isinstance(wip_df, pd.DataFrame) 33 | 34 | # Test with empty dataframe 35 | empty_log = pd.DataFrame(columns=[elc.case_id, elc.timestamp, elc.activity]) 36 | wip_empty = WorkInProgress() 37 | wip_empty.fit(empty_log) 38 | with pytest.raises(TypeError): 39 | wip_empty_values = wip_empty.transform(empty_log) 40 | assert isinstance(wip_empty_values, np.ndarray) 
41 | assert len(wip_empty_values) == 0 42 | -------------------------------------------------------------------------------- /tests/feature_extraction/test_targets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import pytest 5 | from skpm.feature_extraction.targets import next_activity, remaining_time 6 | from skpm.config import EventLogConfig as elc 7 | 8 | @pytest.fixture(name="dummy_data") 9 | def fixture_dummy_pd(): 10 | return pd.DataFrame( 11 | { 12 | elc.case_id: np.repeat(np.arange(0, 10), 100), 13 | elc.activity: np.random.randint(0, 10, 1000), 14 | elc.timestamp: pd.date_range( 15 | start="1/1/2020", periods=1000, 16 | ), 17 | } 18 | ) 19 | 20 | def test_next_activity(dummy_data): 21 | # test next_activity 22 | out = next_activity(dummy_data) 23 | assert len(out) == len(dummy_data) 24 | assert isinstance(out, np.ndarray) 25 | assert out.dtype == object 26 | 27 | def test_remaining_time(dummy_data): 28 | out = remaining_time(dummy_data) 29 | assert len(out) == len(dummy_data) 30 | assert out.dtype == float 31 | -------------------------------------------------------------------------------- /tests/sequence_encoding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/sequence_encoding/__init__.py -------------------------------------------------------------------------------- /tests/sequence_encoding/test_aggregation.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import pytest 3 | import numpy as np 4 | import pandas as pd 5 | from skpm.sequence_encoding import Aggregation 6 | from skpm.config import EventLogConfig as elc 7 | 8 | 9 | @pytest.fixture(name="pd_df") 10 | def fixture_dummy_pd(): 11 | return pd.DataFrame( 12 | { 13 | elc.case_id: np.repeat(np.arange(0, 10), 100), 14 | elc.activity: np.random.randint(0, 10, 1000), 15 | elc.resource: np.random.randint(0, 3, 1000), 16 | } 17 | ) 18 | 19 | def test_aggregation(pd_df): 20 | # Test default aggregation 21 | rp = Aggregation() 22 | rp.fit(pd_df) 23 | out = rp.transform(pd_df) 24 | assert isinstance(out, pd.DataFrame) 25 | assert out.shape[0] == pd_df.shape[0] 26 | 27 | # Test aggregation with different numerical method 28 | rp = Aggregation(method="sum") 29 | rp.fit(pd_df) 30 | out = rp.transform(pd_df) 31 | assert isinstance(out, pd.DataFrame) 32 | assert out.shape[0] == pd_df.shape[0] 33 | 34 | # Test aggregation with invalid input data 35 | with pytest.raises(Exception): 36 | rp.transform(pd_df[[elc.activity, elc.resource]]) 37 | 38 | 39 | def test_aggregation_with_window(pd_df): 40 | # Test aggregation with different numerical method 41 | rp = Aggregation(prefix_len=3) 42 | rp.fit(pd_df) 43 | out = rp.transform(pd_df) 44 | assert isinstance(out, pd.DataFrame) 45 | assert out.shape[0] == pd_df.shape[0] 46 | 47 | # Test window aggregation with window size larger than len(data) must work 48 | rp = Aggregation(prefix_len=len(pd_df) + 1) 49 | rp.fit(pd_df) 50 | out = rp.transform(pd_df) 51 | assert isinstance(out, pd.DataFrame) 52 | assert out.shape[0] == pd_df.shape[0] 53 | 54 | # Test window aggregation with invalid window size 55 | with pytest.raises(Exception): 56 | rp = Aggregation(prefix_len=0) 57 | rp.fit(pd_df) 58 | out = rp.transform(pd_df) 59 | 60 | 61 | def test_aggregation_with_polars(pd_df): 62 | pl_df = pl.DataFrame(pd_df) 63 
| 64 | rp = Aggregation(engine="polars") 65 | rp.fit(pl_df) 66 | out = rp.transform(pl_df) 67 | assert isinstance(out, pd.DataFrame) 68 | out = pl.DataFrame(out) 69 | assert out.height == pl_df.height 70 | 71 | 72 | def test_aggregation_output(pd_df): 73 | pl_df = pl.DataFrame(pd_df) 74 | 75 | pd_agg = Aggregation(method="sum") 76 | pl_agg = Aggregation(method="sum", engine="polars") 77 | 78 | pd_agg = pd_agg.fit_transform(pd_df) 79 | pl_agg = pl_agg.fit_transform(pl_df) 80 | 81 | pd_agg = pd_agg.astype(pl_agg.dtypes) 82 | assert isinstance(pl_agg, pd.DataFrame) 83 | assert pd_agg.equals(pl_agg) 84 | 85 | pd_agg = Aggregation(prefix_len=3) 86 | pd_agg = pd_agg.fit_transform(pd_df) 87 | pl_agg = Aggregation(prefix_len=3, engine="polars") 88 | pl_agg = pl_agg.fit_transform(pl_df) 89 | pl_agg = pl_agg.astype(pd_agg.dtypes) 90 | assert isinstance(pl_agg, pd.DataFrame) 91 | assert pd_agg.equals(pl_agg) 92 | 93 | 94 | def test_invalid_input(pd_df): 95 | # invalid arguments 96 | with pytest.raises(Exception): 97 | agg = Aggregation(method="abc") 98 | agg.fit_transform(pd_df) 99 | 100 | # invalid arguments 101 | from sklearn.utils._param_validation import InvalidParameterError 102 | 103 | with pytest.raises(InvalidParameterError): 104 | agg = Aggregation(engine="abc") 105 | agg.fit_transform(pd_df) 106 | 107 | # invalid input data 108 | with pytest.raises(AssertionError): 109 | agg = Aggregation() 110 | agg.fit(pd_df.values) 111 | 112 | # invalid input data 113 | with pytest.raises(AssertionError): 114 | agg = Aggregation().fit(pd_df) 115 | agg.transform(pd_df.values) 116 | 117 | def test_methods(pd_df): 118 | methods = Aggregation._parameter_constraints["method"][0].options 119 | for method in methods: 120 | out_pd = Aggregation(method=method).fit_transform(pd_df) 121 | out_pl = Aggregation(method=method, engine="polars").fit_transform(pd_df) 122 | pd.testing.assert_frame_equal(out_pd, out_pl, check_dtype=False) 123 | 124 | # pandas engine 125 | assert isinstance(out_pd, pd.DataFrame) 126 | assert out_pd.shape[0] == pd_df.shape[0] 127 | 128 | # polars engine 129 | assert isinstance(out_pl, pd.DataFrame) 130 | assert out_pl.shape[0] == pd_df.shape[0] -------------------------------------------------------------------------------- /tests/sequence_encoding/test_bucketing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from skpm.sequence_encoding import Bucketing 5 | from skpm.config import EventLogConfig as elc 6 | 7 | 8 | @pytest.fixture(name="dummy_log") 9 | def get_dummy_log(): 10 | return pd.DataFrame( 11 | { 12 | elc.case_id: np.random.randint(1, 10, 100), 13 | elc.timestamp: pd.date_range("2021-01-01", periods=100, freq="6h"), 14 | elc.activity: np.random.choice(["a", "b", "c"], 100), 15 | } 16 | ).sort_values(elc.timestamp) 17 | 18 | 19 | def test_single(dummy_log): 20 | bucketing = Bucketing(method="single") 21 | bucketing.fit(dummy_log) 22 | bucketing_values = bucketing.transform(dummy_log) 23 | assert isinstance(bucketing_values, pd.DataFrame) 24 | assert bucketing_values.shape == (len(dummy_log), 1) 25 | assert np.unique(bucketing_values) == "b1" 26 | 27 | bucketing = Bucketing().fit(dummy_log).transform(dummy_log) 28 | assert isinstance(bucketing, pd.DataFrame) 29 | 30 | 31 | def test_prefix(dummy_log): 32 | bucketing = Bucketing(method="prefix") 33 | bucketing.fit(dummy_log) 34 | bucketing_values = bucketing.transform(dummy_log) 35 | assert isinstance(bucketing_values, pd.DataFrame) 
36 | assert bucketing_values.shape == (len(dummy_log), 1) 37 | assert isinstance(len(np.unique(bucketing_values)), int) 38 | 39 | 40 | def test_clustering_not_implemented(dummy_log): 41 | with pytest.raises(NotImplementedError): 42 | Bucketing(method="clustering").fit(dummy_log).transform(dummy_log) 43 | 44 | 45 | def test_invalid_method(dummy_log): 46 | with pytest.raises(AssertionError): 47 | Bucketing(method="invalid_method").fit(dummy_log) 48 | 49 | 50 | def test_output_feature_names(): 51 | bucketing = Bucketing(method="single") 52 | feature_names = bucketing.get_feature_names_out() 53 | assert isinstance(feature_names, list) 54 | assert len(feature_names) == 1 55 | assert feature_names[0] == "bucket" 56 | -------------------------------------------------------------------------------- /tests/sequence_encoding/test_index.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import pytest 3 | import numpy as np 4 | import pandas as pd 5 | from skpm.sequence_encoding import Indexing 6 | from skpm.config import EventLogConfig as elc 7 | 8 | 9 | @pytest.fixture(name="pd_df") 10 | def fixture_dummy_pd(): 11 | return pd.DataFrame( 12 | { 13 | elc.case_id: np.repeat(np.arange(0, 10), 100), 14 | elc.activity: np.random.randint(0, 10, 1000), 15 | elc.resource: np.random.randint(0, 3, 1000), 16 | } 17 | ) 18 | 19 | def test_indexing(pd_df): 20 | # Test default Indexing 21 | rp = Indexing(n=2, attributes=[elc.activity, elc.resource], fill_value=0) 22 | rp.fit(pd_df) 23 | out = rp.transform(pd_df) 24 | assert isinstance(out, pd.DataFrame) 25 | assert out.shape[0] == pd_df.shape[0] 26 | 27 | with pytest.raises(Exception): 28 | rp.transform(pd_df[[elc.activity, elc.resource]]) 29 | 30 | rp = Indexing(n=2, attributes=elc.activity, fill_value=0) 31 | rp.fit(pd_df) 32 | 33 | rp = Indexing(n=2, attributes=None, fill_value=0) 34 | rp.fit(pd_df) -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from skpm.utils.graph import ( 4 | frequency_matrix, 5 | node_degree, 6 | density, 7 | nodes_in_cycles, 8 | ) 9 | 10 | 11 | @pytest.fixture 12 | def example_traces(): 13 | return [[1, 2, 3], [1, 2, 3, 4]] 14 | 15 | 16 | @pytest.fixture 17 | def example_set_of_states(): 18 | return {1, 2, 3, 4} 19 | 20 | 21 | @pytest.fixture 22 | def example_frequency_matrix(): 23 | return np.array([[0, 2, 0, 0], [0, 0, 2, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) 24 | 25 | 26 | def test_frequency_matrix( 27 | example_traces, example_set_of_states, example_frequency_matrix 28 | ): 29 | freq_matrix, stoi, itos = frequency_matrix( 30 | example_traces, example_set_of_states 31 | ) 32 | assert np.array_equal(freq_matrix, example_frequency_matrix) 33 | assert stoi == {1: 0, 2: 1, 3: 2, 4: 3} 34 | assert itos == {0: 1, 1: 2, 2: 3, 3: 4} 35 | 36 | 37 | @pytest.fixture 38 | def example_frequency_matrix_node_degree(): 39 | return np.array([[0, 2, 0, 0], [0, 0, 2, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) 40 | 41 | 42 | def test_node_degree(example_frequency_matrix_node_degree): 43 | in_degree, out_degree = 
node_degree(example_frequency_matrix_node_degree) 44 | assert np.array_equal(in_degree, np.array([0, 2, 2, 1])) 45 | assert np.array_equal(out_degree, np.array([2, 2, 1, 0])) 46 | 47 | 48 | def test_density(): 49 | graph = np.array([[0, 2, 0, 0], [0, 0, 2, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) 50 | assert density(graph) == 0.4166666666666667 51 | 52 | 53 | def test_nodes_in_cycles(): 54 | graph = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]]) 55 | assert nodes_in_cycles(graph, max_cycle_length=3) == [True, True, True] 56 | -------------------------------------------------------------------------------- /tests/utils/test_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import numpy as np 4 | from skpm.event_logs.download import download_url 5 | from skpm.event_logs.extract import extract_gz 6 | from skpm.utils import validation as v 7 | 8 | 9 | def test_validation(): 10 | with pytest.raises(Exception): 11 | v.validate_columns(input_columns=[1, 2, 3], required=[4]) 12 | 13 | out = v.ensure_list("exception") 14 | assert isinstance(out, list) 15 | 16 | out = v.ensure_list({1, 2, 3}) 17 | assert isinstance(out, list) 18 | --------------------------------------------------------------------------------