├── .github
│   └── workflows
│       └── tests.yml
├── .gitignore
├── .readthedocs.yaml
├── CHANGELOG.md
├── CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── index.rst
│   ├── install
│   │   └── installation.rst
│   ├── logo.png
│   ├── make.bat
│   ├── pipeline.png
│   ├── requirements.txt
│   └── sphinxext
│       ├── doi_role.py
│       └── github_link.py
├── examples
│   ├── README.rst
│   ├── feature_extracion.py
│   ├── plot_download_api.py
│   ├── plot_pipeline_evaluation.py
│   ├── plot_rt_pipeline.py
│   └── plot_unbiased_split.py
├── pyproject.toml
├── src
│   └── skpm
│       ├── __init__.py
│       ├── base.py
│       ├── config.py
│       ├── event_logs
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── bpi.py
│       │   ├── download.py
│       │   ├── extract.py
│       │   ├── parser.py
│       │   └── split.py
│       ├── feature_extraction
│       │   ├── __init__.py
│       │   ├── case
│       │   │   ├── __init__.py
│       │   │   ├── _helpers.py
│       │   │   ├── time.py
│       │   │   └── variant.py
│       │   ├── event
│       │   │   ├── __init__.py
│       │   │   ├── inter_case.py
│       │   │   ├── resource.py
│       │   │   └── time.py
│       │   ├── targets.py
│       │   └── time.py
│       ├── sequence_encoding
│       │   ├── __init__.py
│       │   ├── aggregation.py
│       │   ├── bucketing.py
│       │   └── index.py
│       ├── utils
│       │   ├── __init__.py
│       │   ├── graph.py
│       │   ├── helpers.py
│       │   └── validation.py
│       └── warnings.py
└── tests
    ├── __init__.py
    ├── event_logs
    │   ├── __init__.py
    │   ├── test_bpi.py
    │   ├── test_download_extract.py
    │   └── test_parser.py
    ├── feature_extraction
    │   ├── __init__.py
    │   ├── case
    │   │   └── test_variant.py
    │   ├── event
    │   │   ├── test_resource.py
    │   │   ├── test_time.py
    │   │   └── test_wip.py
    │   └── test_targets.py
    ├── sequence_encoding
    │   ├── __init__.py
    │   ├── test_aggregation.py
    │   ├── test_bucketing.py
    │   └── test_index.py
    └── utils
        ├── __init__.py
        ├── test_graph.py
        └── test_validation.py
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | jobs:
10 | test:
11 | runs-on: ${{ matrix.os }}
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | python: ['3.10.16']
16 | os: [ubuntu-latest] #, windows-latest]
17 |
18 | steps:
19 | - uses: actions/checkout@v3
20 |
21 | - name: Set up Python ${{ matrix.python }}
22 | uses: actions/setup-python@v4
23 | with:
24 | python-version: ${{ matrix.python }}
25 | cache: 'pip'
26 |
27 | - name: Install Poetry
28 | uses: snok/install-poetry@v1
29 | with:
30 | virtualenvs-create: true
31 | virtualenvs-in-project: true
32 |
33 | - name: Install Python dependencies
34 | run: |
35 | poetry install
36 |
37 | - run: |
38 | source $VENV
39 | pytest --version
40 |
41 | - name: pytest
42 | run: poetry run pytest --cov=skpm tests
43 |
44 | - name: Statistics
45 | if: success()
46 | run: |
47 | poetry run coverage report
48 | poetry run coverage xml
49 |
50 |
51 | - name: Upload coverage to Codecov
52 | uses: codecov/codecov-action@v3
53 | if: always()
54 | # see: https://github.com/actions/toolkit/issues/399
55 | continue-on-error: true
56 | with:
57 | token: ${{ secrets.CODECOV_TOKEN }}
58 | file: coverage.xml
59 | # flags: cpu
60 | name: Coverage
61 | fail_ci_if_error: false
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Custom user defined
2 | .pytest_cache/
3 | .mypy_cache/
4 | data/
5 | htmlcov/
6 | notebooks/
7 |
8 | .coverage
9 | notes.md
10 | skpm-venv/
11 | poetry.lock
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | dist/
16 |
17 | # Sphinx documentation
18 | docs/_build/
19 | docs/auto_examples/
20 | docs/gen_modules/
21 | docs/sg_execution_times.rst
22 |
23 | # Jupyter Notebook
24 | .ipynb_checkpoints
25 | __pypackages__/
26 |
27 | # VsCode
28 | .vscode/
29 |
30 | # files
31 | *.log
32 | *.csv
33 | *.parquet
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.10.16"
13 | # You can also specify other tool versions:
14 | # nodejs: "19"
15 | # rust: "1.64"
16 | # golang: "1.19"
17 |
18 | # Build documentation in the "docs/" directory with Sphinx
19 | sphinx:
20 | configuration: docs/conf.py
21 |
22 | python:
23 | install:
24 | - requirements: docs/requirements.txt
25 | - method: pip
26 | path: .
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 |
4 |
5 | ## v0.0.1 (25/01/2024)
6 |
7 | - First release of `skpm`!
--------------------------------------------------------------------------------
/CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 |
41 | ## Attribution
42 |
43 | This Code of Conduct is adapted from the [Contributor Covenant homepage](http://contributor-covenant.org/version/1/4), version 1.4.
44 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions are welcome, and they are greatly appreciated! Every little bit
4 | helps, and credit will always be given.
5 |
6 | ## Types of Contributions
7 |
8 | ### Report Bugs
9 |
10 | If you are reporting a bug, please include:
11 |
12 | * Your operating system name and version.
13 | * Any details about your local setup that might be helpful in troubleshooting.
14 | * Detailed steps to reproduce the bug.
15 |
16 | ### Fix Bugs
17 |
18 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
19 | wanted" is open to whoever wants to implement it.
20 |
21 | ### Implement Features
22 |
23 | Look through the GitHub issues for features. Anything tagged with "enhancement"
24 | and "help wanted" is open to whoever wants to implement it.
25 |
26 | ### Write Documentation
27 |
28 | You can never have enough documentation! Please feel free to contribute to any
29 | part of the documentation, such as the official docs, docstrings, or even
30 | on the web in blog posts, articles, and such.
31 |
32 | ### Submit Feedback
33 |
34 | If you are proposing a feature:
35 |
36 | * Explain in detail how it would work.
37 | * Keep the scope as narrow as possible, to make it easier to implement.
38 | * Remember that this is a volunteer-driven project, and that contributions
39 | are welcome :)
40 |
41 | ## Get Started!
42 |
43 | Ready to contribute? Here's how to set up `skpm` for local development.
44 |
45 | 1. Download a copy of `skpm` locally.
46 | 2. Install `skpm` using `poetry`:
47 |
48 | ```console
49 | $ poetry install
50 | ```
51 |
52 | 3. Use `git` (or similar) to create a branch for local development and make your changes:
53 |
54 | ```console
55 | $ git checkout -b name-of-your-bugfix-or-feature
56 | ```
57 |
58 | 4. When you're done making changes, check that your changes conform to any code formatting requirements and pass any tests.
59 |
60 | 5. Commit your changes and open a pull request.
61 |
62 | ## Pull Request Guidelines
63 |
64 | Before you submit a pull request, check that it meets these guidelines:
65 |
66 | 1. The pull request should include additional tests if appropriate.
67 | 2. If the pull request adds functionality, the docs should be updated.
68 | 3. The pull request should work for all currently supported operating systems and versions of Python.
69 |
70 | ## Code of Conduct
71 |
72 | Please note that the `skpm` project is released with a
73 | Code of Conduct. By contributing to this project you agree to abide by its terms.
74 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024, Rafael Oyamada
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SkPM: a Scikit-learn Extension for Process Mining
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | [License: CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
11 | [Documentation](https://skpm.readthedocs.io/en/latest/)
12 | [Codecov](https://codecov.io/gh/raseidi/skpm)
13 |
14 |
15 |
16 | ## Overview
17 |
18 | SkPM is an open-source extension of the widely used [Scikit-learn](https://scikit-learn.org/) library, designed to meet the specific needs of Process Mining applications. It aims to provide a **standard**, **reproducible**, and **easily accessible** set of tools for PM research and practical applications.
19 |
20 | ## Available examples
21 |
22 | - **NEW** [**ICPM/ML4PM 2024 Tutorial**](https://colab.research.google.com/drive/1s6TxG14bKbh2zlOENLGGd9dy_1BLEBiO?usp=sharing): A notebook highlighting all the available features in SkPM!
23 | - [**Predictive Monitoring**](https://skpm.readthedocs.io/en/latest/auto_examples/plot_rt_pipeline.html#): Build end-to-end applications of traditional process mining tasks, such as remaining time and next activity prediction!
24 | - [**Event Log Preprocessing**](https://skpm.readthedocs.io/en/latest/auto_examples/feature_extracion.html): Several feature extraction and trace encoding techniques implemented!
25 | - [**Download Public Event Logs**](https://skpm.readthedocs.io/en/latest/auto_examples/plot_rt_pipeline.html#download-the-example-dataset): Download well-known event logs (e.g., BPI Challenges) from the 4TU repository!
26 | - [**Unbiased Event Log Split**](https://skpm.readthedocs.io/en/latest/auto_examples/plot_unbiased_split.html): Temporal and unbiased split of event logs for train/validation.
27 |
28 |
29 |
30 |
31 |
32 | ## Installation
33 |
34 | **Soon available on PyPI**.
35 |
36 | To install SkPM, you can clone the repository and install the required dependencies using `pip`:
37 |
38 | ```bash
39 | git clone https://github.com/raseidi/skpm.git
40 | cd skpm
41 | pip install .
42 | ```
43 |
44 | ## Usage
45 |
46 | Below is an example of how to use SkPM to build a pipeline for remaining time prediction.
47 |
48 | ```python
49 | # skpm modules
50 | from skpm.sequence_encoding import Aggregation
51 | from skpm.feature_extraction import (
52 | TimestampExtractor,
53 | ResourcePoolExtractor,
54 | )
55 |
56 | # sklearn modules
57 | from sklearn.ensemble import RandomForestRegressor
58 | from sklearn.pipeline import Pipeline
59 | from sklearn.compose import ColumnTransformer
60 | from sklearn.preprocessing import StandardScaler, OneHotEncoder
61 |
62 | # Example pipeline for remaining time prediction
63 | preprocessor = ColumnTransformer(
64 | transformers=[
65 | ('timestamp', TimestampExtractor(), 'timestamp_column'),
66 | ('activity', OneHotEncoder(), 'activity_column'),
67 | ('resource', ResourcePoolExtractor(), 'resource_column'),
68 | ]
69 | )
70 |
71 | pipeline = Pipeline(steps=[
72 | ('preprocessor', preprocessor),
73 |     ('aggregator', Aggregation()),
74 | ('standardization', StandardScaler()),
75 | ('regressor', RandomForestRegressor())
76 | ])
77 |
78 | # Fit the pipeline to your event log data
79 | pipeline.fit(X_train, y_train)
80 |
81 | # Make predictions on new cases
82 | predictions = pipeline.predict(X_test)
83 | ```
84 |
85 | ## Documentation
86 |
87 | Detailed documentation and examples can be found [here](https://skpm.readthedocs.io/en/latest/).
88 |
89 | ## Roadmap, next steps, and help needed!
90 |
91 | - Improving documentation by including examples.
92 | - Implementing new applications and writing tutorials.
93 | - Adding new methods (feature extraction, trace encoding, and models).
94 | - Writing unit tests!
95 |
96 | ## Contributing
97 |
98 | We welcome contributions from the community!
99 |
100 | Check the [sklearn guidelines](https://scikit-learn.org/1.5/developers/contributing.html#reading-the-existing-code-base) to understand the `fit`, `predict`, and `transform` APIs!
101 |
102 | Check [our guidelines](CONTRIBUTING.md) as well to see how to open an issue or a PR. In summary:
103 |
104 | 1. Fork the repository.
105 | 2. Create a feature branch (`git checkout -b feature-branch`).
106 | 3. Commit your changes (`git commit -m 'feat: add new feature'`).
107 | 4. Push to the branch (`git push origin feature-branch`).
108 | 5. Open a pull request.
109 |
110 | ## License
111 |
112 | This project was created by Rafael Oyamada and is licensed under the [CC BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). Feel free to use, modify, and distribute the code with attribution.
113 |
114 | ## Credits
115 |
116 | `skpm` was created with [`cookiecutter`](https://cookiecutter.readthedocs.io/en/latest/) and the `py-pkgs-cookiecutter` [template](https://github.com/py-pkgs/py-pkgs-cookiecutter).
117 |
118 | ## Citation
119 |
120 | ```bibtex
121 | @inproceedings{OyamadaTJC23,
122 | author = {Rafael Seidi Oyamada and
123 | Gabriel Marques Tavares and
124 | Sylvio Barbon Junior and
125 | Paolo Ceravolo},
126 | editor = {Felix Mannhardt and
127 | Nour Assy},
128 | title = {A Scikit-learn Extension Dedicated to Process Mining Purposes},
129 | booktitle = {Proceedings of the Demonstration Track co-located with the International
130 | Conference on Cooperative Information Systems 2023, CoopIS 2023, Groningen,
131 | The Netherlands, October 30 - November 3, 2023},
132 | series = {{CEUR} Workshop Proceedings},
133 | publisher = {CEUR-WS.org},
134 | }
135 | ```
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | import os
7 | import sys
8 | from sphinx_gallery.sorting import FileNameSortKey
9 | from pathlib import Path
10 |
11 |
12 | # find project
13 | cwd = os.getcwd()
14 | parent = os.path.dirname(cwd)
15 | sys.path.append(parent)
16 | sys.path.insert(0, os.path.abspath("sphinxext"))
17 |
18 | from github_link import make_linkcode_resolve
19 |
20 | # -- Project information -----------------------------------------------------
21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
22 |
23 |
24 | project = "skpm"
25 | copyright = "2024, Rafael Oyamada"
26 | author = "Rafael Oyamada"
27 | release = "0.0.1"
28 |
29 | # -- General configuration ---------------------------------------------------
30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
31 |
32 | extensions = [
33 | "sphinx.ext.autodoc",
34 | "sphinx.ext.autosummary",
35 | "sphinx_gallery.gen_gallery",
36 | "sphinx.ext.napoleon",
37 | "sphinx.ext.intersphinx",
38 | "sphinx.ext.linkcode",
39 | "doi_role",
40 | "sphinx.ext.viewcode",
41 | "autoapi.extension",
42 | ]
43 |
44 | autoapi_dirs = ["../src"]
45 |
46 | templates_path = ["_templates"]
47 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
48 | source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}
49 |
50 |
51 | # -- Options for HTML output -------------------------------------------------
52 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
53 |
54 | html_theme = "sphinx_rtd_theme"
55 | # html_static_path = ["_static"]
56 |
57 |
58 | sg_examples_dir = "../examples"
59 | sg_gallery_dir = "auto_examples"
60 | sphinx_gallery_conf = {
61 | # path to your example scripts
62 | "examples_dirs": [sg_examples_dir],
63 | # path to where to save gallery generated output
64 | "gallery_dirs": [sg_gallery_dir],
65 | # specify that examples should be ordered according to filename
66 | "within_subsection_order": FileNameSortKey,
67 | # directory where function granular galleries are stored
68 | "backreferences_dir": "gen_modules/backreferences",
69 |     # Modules for which function level galleries are created. In
70 |     # this case only skpm, given as a tuple of strings.
71 |     "doc_module": ("skpm",),
72 | # "filename_pattern": "/*.py",
73 | }
74 |
75 | # configuration for intersphinx: refer to the Python standard library.
76 | intersphinx_mapping = {
77 | "python": (
78 | "https://docs.python.org/{.major}".format(sys.version_info),
79 | None,
80 | ),
81 | "matplotlib": ("https://matplotlib.org/", None),
82 | }
83 |
84 | linkcode_resolve = make_linkcode_resolve(
85 | "skpm",
86 | (
87 | "https://github.com/raseidi/"
88 | "skpm/blob/{revision}/"
89 | "{package}/{path}#L{lineno}"
90 | ),
91 | )
92 |
93 | autosummary_generate = True
94 | root_doc = "index"
95 |
96 | exclude_patterns = [
97 | "_build",
98 | "templates",
99 | "includes",
100 | "**/sg_execution_times.rst",
101 | ]
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. skpm documentation master file, created by
2 | sphinx-quickstart on Thu Jan 25 16:27:22 2024.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to SkPM's documentation!
7 | ================================
8 |
9 | **SkPM** *(Scikit-learn for Process Mining)* is a library built upon `Scikit-learn <https://scikit-learn.org/>`_ to easily write and train machine learning pipelines tailored for process mining tasks.
10 |
11 | This work is still in progress! So far, we have focused mainly on feature extraction and encoding techniques that can be easily used along with Scikit-learn's `pipelines <https://scikit-learn.org/stable/modules/compose.html#pipeline>`_ and `grid search-related functionalities <https://scikit-learn.org/stable/modules/grid_search.html>`_.
12 |
13 |
14 | .. toctree::
15 | :maxdepth: 1
16 | :caption: Install SkPM
17 |
18 | install/installation
19 |
20 | .. toctree::
21 | :maxdepth: 1
22 | :caption: Tutorials
23 |
24 | auto_examples/index
25 |
26 | .. toctree::
27 | :maxdepth: 1
28 | :caption: API Reference
--------------------------------------------------------------------------------
/docs/install/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | SkPM supports Python 3.10 to 3.12.
5 |
6 | Installation via PyPI
7 | ---------------------
8 |
9 | SkPM will be available on PyPI soon!
10 |
11 | .. .. code-block:: none
12 |
13 | .. pip install skpm
14 |
15 | Installation from source
16 | ------------------------
17 |
18 | To install SkPM via GitHub, you can clone the repository and install it using pip:
19 |
20 | .. code-block:: none
21 |
22 | git clone https://github.com/raseidi/skpm.git
23 | cd skpm
24 | pip install .
25 |
26 | Alternatively, you can install it using poetry:
27 |
28 | .. code-block:: none
29 |
30 | python3.10 -m venv skpm-venv
31 | source skpm-venv/bin/activate
32 | pip install -U pip setuptools poetry
33 | poetry install
34 |
--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/docs/logo.png
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/docs/pipeline.png
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-autoapi==3.0.0
2 | sphinx-rtd-theme==2.0.0
3 | sphinx-gallery==0.17.1
4 | matplotlib==3.8
--------------------------------------------------------------------------------
/docs/sphinxext/doi_role.py:
--------------------------------------------------------------------------------
1 | """
2 | doilinks
3 | ~~~~~~~~
4 | Extension to add links to DOIs. With this extension you can use e.g.
5 | :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will
6 | create a link to a DOI resolver
7 | (``https://doi.org/10.1016/S0022-2836(05)80360-2``).
8 | The link caption will be the raw DOI.
9 | You can also give an explicit caption, e.g.
10 | :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`.
11 |
12 | :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by
13 | the Sphinx team.
14 | :license: BSD.
15 | """
16 |
17 | from docutils import nodes, utils
18 | from sphinx.util.nodes import split_explicit_title
19 |
20 |
21 | def reference_role(typ, rawtext, text, lineno, inliner, options={}, content=[]):
22 | text = utils.unescape(text)
23 | has_explicit_title, title, part = split_explicit_title(text)
24 | if typ in ["arXiv", "arxiv"]:
25 | full_url = "https://arxiv.org/abs/" + part
26 | if not has_explicit_title:
27 | title = "arXiv:" + part
28 | pnode = nodes.reference(title, title, internal=False, refuri=full_url)
29 | return [pnode], []
30 | if typ in ["doi", "DOI"]:
31 | full_url = "https://doi.org/" + part
32 | if not has_explicit_title:
33 | title = "DOI:" + part
34 | pnode = nodes.reference(title, title, internal=False, refuri=full_url)
35 | return [pnode], []
36 |
37 |
38 | def setup_link_role(app):
39 | app.add_role("arxiv", reference_role, override=True)
40 | app.add_role("arXiv", reference_role, override=True)
41 | app.add_role("doi", reference_role, override=True)
42 | app.add_role("DOI", reference_role, override=True)
43 |
44 |
45 | def setup(app):
46 | app.connect("builder-inited", setup_link_role)
47 | return {"version": "0.1", "parallel_read_safe": True}
48 |
--------------------------------------------------------------------------------
/docs/sphinxext/github_link.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import os
3 | import subprocess
4 | import sys
5 | from functools import partial
6 | from operator import attrgetter
7 |
8 | REVISION_CMD = "git rev-parse --short HEAD"
9 |
10 |
11 | def _get_git_revision():
12 | try:
13 | revision = subprocess.check_output(REVISION_CMD.split()).strip()
14 | except (subprocess.CalledProcessError, OSError):
15 | print("Failed to execute git to get revision")
16 | return None
17 | return revision.decode("utf-8")
18 |
19 |
20 | def _linkcode_resolve(domain, info, package, url_fmt, revision):
21 | """Determine a link to online source for a class/method/function
22 |
23 | This is called by sphinx.ext.linkcode
24 |
25 | An example with a long-untouched module that everyone has
26 | >>> _linkcode_resolve('py', {'module': 'tty',
27 | ... 'fullname': 'setraw'},
28 | ... package='tty',
29 | ... url_fmt='https://hg.python.org/cpython/file/'
30 | ... '{revision}/Lib/{package}/{path}#L{lineno}',
31 | ... revision='xxxx')
32 | 'https://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18'
33 | """
34 |
35 | if revision is None:
36 | return
37 | if domain not in ("py", "pyx"):
38 | return
39 | if not info.get("module") or not info.get("fullname"):
40 | return
41 |
42 | class_name = info["fullname"].split(".")[0]
43 | module = __import__(info["module"], fromlist=[class_name])
44 | obj = attrgetter(info["fullname"])(module)
45 |
46 | # Unwrap the object to get the correct source
47 | # file in case that is wrapped by a decorator
48 | obj = inspect.unwrap(obj)
49 |
50 | try:
51 | fn = inspect.getsourcefile(obj)
52 | except Exception:
53 | fn = None
54 | if not fn:
55 | try:
56 | fn = inspect.getsourcefile(sys.modules[obj.__module__])
57 | except Exception:
58 | fn = None
59 | if not fn:
60 | return
61 |
62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__))
63 | try:
64 | lineno = inspect.getsourcelines(obj)[1]
65 | except Exception:
66 | lineno = ""
67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno)
68 |
69 |
70 | def make_linkcode_resolve(package, url_fmt):
71 | """Returns a linkcode_resolve function for the given URL format
72 |
73 | revision is a git commit reference (hash or name)
74 |
75 | package is the name of the root module of the package
76 |
77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/'
78 | 'blob/{revision}/{package}/'
79 | '{path}#L{lineno}')
80 | """
81 | revision = _get_git_revision()
82 | return partial(
83 | _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt
84 | )
85 |
--------------------------------------------------------------------------------
/examples/README.rst:
--------------------------------------------------------------------------------
1 | .. _examples:
2 |
3 | Examples
4 | ========
5 |
6 | This is the gallery of examples that showcase how SkPM can be used. Some
7 | examples demonstrate the use of the API in general and some
8 | demonstrate specific applications in tutorial form.
--------------------------------------------------------------------------------
/examples/feature_extracion.py:
--------------------------------------------------------------------------------
1 | """
2 | Event Feature Extraction
3 | ========================
4 |
5 | In this tutorial, we introduce a few feature extraction techniques
6 | available in our library. Currently, we provide two modules for
7 | feature extraction: :mod:`skpm.feature_extraction.case` and
8 | :mod:`skpm.feature_extraction.event`. The former is still
9 | under development, so we will focus on the latter.
10 | """
11 |
12 | # %%
13 | # Event features
14 | # --------------
15 | # The :mod:`skpm.feature_extraction` module provides
16 | # a set of functions to extract relevant features proposed in the
17 | # literature. In this example, we show how to extract features from
18 | # timestamps, resources, and the inter-case perspective.
19 | #
20 | # Time-related features
21 | # ---------------------
22 | # The :class:`skpm.feature_extraction.TimestampExtractor` class
23 | # allows us to extract several features, such as the execution time of
24 | # each event, the accumulated time throughout the case, and the weekday.
25 | # Let's see how it works.
26 |
27 | # %%
28 | import pandas as pd
29 | from skpm.config import EventLogConfig as elc
30 | from skpm.feature_extraction import TimestampExtractor
31 | from skpm.event_logs import split, BPI17
32 |
33 | # download the dataset
34 | log = BPI17()
35 |
36 | # select the columns of interest
37 | df = log.dataframe[[elc.case_id, elc.activity, elc.timestamp, elc.resource]].copy()
38 |
39 | # split the data into train and test
40 | train, _ = split.unbiased(df, **log.unbiased_split_params)
41 |
42 | # sphinx_gallery_start_ignore
43 | del log
44 | del df
45 | # sphinx_gallery_end_ignore
46 |
47 | # extract the features
48 | te = TimestampExtractor().fit(train)
49 | train[te.get_feature_names_out()] = te.transform(train)
50 |
51 | # first event as an example
52 | train.iloc[0, :].T
53 |
54 | # %%
55 | # In the literature, features like the weekday are usually extracted
56 | # as categorical features, but we currently implement them as
57 | # numerical ones by normalizing the values between `[-0.5, 0.5]`.
58 | # In the future, we intend to provide a parameter to choose between
59 | # the two options.
60 | #
61 | # Resource-related features
62 | # -------------------------
63 | # The resource pool extractor is a feature extractor that identifies
64 | # resource roles based on the correlation between activities and resources.
65 | # You can easily use this function as shown below:
66 |
67 | from skpm.feature_extraction import ResourcePoolExtractor
68 |
69 | re = ResourcePoolExtractor().fit(train)
70 | # re.get_feature_names_out()
71 | train["resource_role"] = re.transform(train)
72 |
73 | train.loc[0, [elc.case_id, elc.activity, elc.resource, "resource_role"]].T
74 |
75 | # %%
76 | # From the machine learning perspective, it can be seen as a nice way
77 | # to encode the resource information and reduce the dimensionality of the
78 | # data. In this example, we grouped 133 resource labels into 5 roles:
79 |
80 | import matplotlib.pyplot as plt
81 | plt.style.use("ggplot")
82 |
83 | features = train[[elc.resource, "resource_role"]].nunique().index.values
84 | values = train[[elc.resource, "resource_role"]].nunique().values
85 |
86 | fig, ax = plt.subplots()
87 | ax.bar(features, values, edgecolor="black")
88 |
89 | # %%
90 | # Inter-case features
91 | # -------------------
92 | # Inter-case features refer to features that are computed based on the
93 | # relationship between different cases. They aim to quantify and model
94 | # the resource sharing between cases, for instance. In the current version
95 | # of our library, we only have a simple example of such feature: the number of
96 | # cases in progress simultaneously. This feature is commonly called
97 | # work in progress.
98 | #
99 | # Let's see how it works:
100 |
101 | from skpm.feature_extraction import WorkInProgress
102 |
103 | wip = WorkInProgress()
104 | wip.fit(train)
105 | train["wip"] = wip.transform(train)
106 |
107 | # visualizing it
108 | train = (
109 | train
110 | .set_index(elc.timestamp)
111 | .resample("D")[["wip"]]
112 | .mean()
113 | .reset_index()
114 | )
115 | plt.figure(figsize=(10, 5))
116 | plt.plot(pd.to_datetime(train[elc.timestamp]), train["wip"])
117 | plt.title("Average daily \nWork in Progress (WIP) over time")
118 |
119 | # %%
120 | # In this tutorial, we showed how to extract features from timestamps,
121 | # resources, and the inter-case perspective. We hope you find it useful
122 | # for your projects. If you have any questions or suggestions, please
123 | # open an issue on our GitHub repository or
124 | # `contact me `_ directly.
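125 | 
126 | # %%
127 | # As a quick side note on the weekday feature discussed earlier: a categorical
128 | # version of it can be derived with plain pandas alone. The snippet below is
129 | # only a minimal sketch (not part of the SkPM API) applied to the resampled
130 | # frame from the previous step.
131 | 
132 | # illustrative only: categorical weekday names derived from the timestamps
133 | weekday_names = pd.to_datetime(train[elc.timestamp]).dt.day_name()
134 | weekday_names.value_counts()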
--------------------------------------------------------------------------------
/examples/plot_download_api.py:
--------------------------------------------------------------------------------
1 | """
2 | Downloading event logs via API
3 | ==============================
4 |
5 | This example demonstrates how we can easily download well-known process mining event logs
6 | from the 4TU.Centre for Research Data using the `skpm.event_logs` module.
7 |
8 | The `skpm.event_logs` module provides a set of event logs, such as the Sepsis and BPI 2012.
9 | """
10 |
11 | # %%
12 | # The API overview
13 | # ----------------
14 | # Implementing each event log as a class is a design choice that allows us to
15 | # easily manipulate each of them according to their specific characteristics.
16 | # One of the main challenges in process mining is the completely different
17 | # nature of datasets, since
18 | # each of them is composed of very particular business rules.
19 | #
20 | # For instance, an unbiased split of event logs was proposed in [1]. Roughly
21 | # speaking, each event log is split based on specific temporal
22 | # characteristics, which is hard coded within each specific event log. You can
23 | # check this feature in :ref:`Unbiased split
24 | # `.
25 | #
26 | # Now, let us see how to easily download event logs below.
27 | #
28 | # Downloading the BPI 2013 event log
29 | # ----------------------------------
30 | # The BPI 2013 event log is a well-known event log that contains data about
31 | # closed problems from Volvo IT Belgium. We can easily download it as
32 | # follows:
33 |
34 | from skpm.event_logs import BPI13ClosedProblems
35 |
36 | bpi13 = BPI13ClosedProblems() # this will automatically download it
37 | bpi13
38 |
39 | # %%
40 | # Notice that the `__repr__` method returns a brief overview of the event log.
41 | # In order to access the dataframe, just call the `dataframe` attribute.
42 |
43 | bpi13.dataframe.head()
44 |
45 | # %%
46 | # In this tutorial, we showed how to use our API to automatically
47 | # download event logs from the `4TU Repository <https://data.4tu.nl/>`_.
48 | # We hope you find it useful
49 | # for your projects. If you have any questions or suggestions, please
50 | # open an issue on our GitHub repository or
51 | # `contact me `_ directly.
52 | #
53 | # References
54 | # ----------
55 | # [1] Hans Weytjens, Jochen De Weerdt. Creating Unbiased Public Benchmark Datasets with Data Leakage Prevention for Predictive Process Monitoring, 2021. doi: 10.1007/978-3-030-94343-1_2
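56 | 
57 | # %%
58 | # As a small follow-up to the `dataframe` attribute shown above, here is a
59 | # minimal sketch of how the downloaded log can be summarized with plain
60 | # pandas, assuming the standard column names from
61 | # :class:`~skpm.config.EventLogConfig`.
62 | 
63 | from skpm.config import EventLogConfig as elc
64 | 
65 | # number of cases and events in the downloaded log
66 | print(bpi13.dataframe[elc.case_id].nunique(), "cases,", len(bpi13.dataframe), "events")
67 | # most frequent activities
68 | bpi13.dataframe[elc.activity].value_counts().head()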
--------------------------------------------------------------------------------
/examples/plot_pipeline_evaluation.py:
--------------------------------------------------------------------------------
1 | """
2 | Pipeline selection
3 | ==================
4 |
5 | In this tutorial, we will learn how to choose a suitable pipeline for
6 | a PPM task. We will compare two approaches for preparing data before
7 | training Gradient Boosting and Random Forest regressors. The first
8 | approach uses more detailed steps, including timestamp features,
9 | one-hot encoding, and resource pool extraction. The second approach is
10 | simpler and relies only on one-hot encoding. We will train each type
11 | of model with each approach to see how they perform.
12 | """
13 |
14 | # %%
15 | # Let us first import the necessary libraries and set the random seed
16 | # for reproducibility.
17 |
18 | import numpy as np
19 | import pandas as pd
20 |
21 | from sklearn.pipeline import Pipeline
22 | from sklearn.compose import ColumnTransformer
23 | from sklearn.metrics import root_mean_squared_error
24 | from sklearn.preprocessing import StandardScaler, OneHotEncoder
25 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
26 |
27 | from skpm.sequence_encoding import Aggregation
28 | from skpm.config import EventLogConfig as elc
29 | from skpm.event_logs import BPI20PrepaidTravelCosts, split
30 | from skpm.feature_extraction.targets import remaining_time
31 | from skpm.feature_extraction import TimestampExtractor, ResourcePoolExtractor
32 |
33 | # Set random state for reproducible results
34 | RANDOM_STATE = 44
35 | np.random.seed(RANDOM_STATE)
36 |
37 | # %%
38 | # Below we load one of the BPI20 event logs, select relevant columns
39 | # for this example, extract the remaining time to use as the target,
40 | # and split the data into train and test sets.
41 |
42 | # Load event log data
43 | log = BPI20PrepaidTravelCosts()
44 |
45 | # Select basic columns
46 | df = log.dataframe[[elc.case_id, elc.activity, elc.resource, elc.timestamp]].copy()
47 | df[elc.timestamp] = pd.to_datetime(df[elc.timestamp], utc=True)
48 |
49 | # Compute remaining time in seconds
50 | df["remaining_time"] = remaining_time(df, time_unit="seconds")
51 |
52 | # Split into train/test sets using provided split method
53 | train, test = split.unbiased(df, **log.unbiased_split_params)
54 |
55 | # Separate features and targets for train and test
56 | X_train = train.drop(columns=["remaining_time"])
57 | y_train = train["remaining_time"]
58 | X_test = test.drop(columns=["remaining_time"])
59 | y_test = test["remaining_time"]
60 |
61 | # %%
62 | # Defining an advanced and a simple preprocessing pipeline
63 | # --------------------------------------------------------
64 | # We will define two pipelines for preprocessing the data before
65 | # training the models.
66 |
67 | # Advanced preprocessing pipeline
68 | data_prep_advanced = Pipeline([
69 | ("preprocessing", ColumnTransformer(
70 | transformers=[
71 | ("timestamp_features", TimestampExtractor(), [elc.timestamp, elc.case_id]),
72 | ("activity_encode", OneHotEncoder(sparse_output=False), [elc.activity]),
73 | ("resource_pool", ResourcePoolExtractor(), [elc.case_id, elc.activity, elc.resource]),
74 | ("case_id_pass", "passthrough", [elc.case_id]),
75 | ])),
76 | ("encode_agg", Aggregation(method="mean", prefix_len=6)),
77 | ("scaling", StandardScaler()),
78 | ])
79 |
80 | data_prep_simple = Pipeline([
81 | ("preprocessing", ColumnTransformer(
82 | transformers=[
83 | ("activity_encode", OneHotEncoder(sparse_output=False), [elc.activity]),
84 | ("case_id_pass", "passthrough", [elc.case_id]),
85 | ])),
86 | ("encode_agg", Aggregation(method="mean", prefix_len=6)),
87 | ("scaling", StandardScaler()),
88 | ])
89 |
90 | # %%
91 | # Training the models
92 | # -------------------
93 | # We will train two Gradient Boosting and two Random Forest models
94 | # using the advanced and simple preprocessing pipelines. We will then
95 | # evaluate the models using the root mean squared error (RMSE) metric.
96 |
97 | # Gradient Boosting pipelines
98 | gb_pipe_advanced = Pipeline([
99 | ("preprocessing", data_prep_advanced),
100 | ("regressor", GradientBoostingRegressor(random_state=RANDOM_STATE))
101 | ])
102 |
103 | gb_pipe_simple = Pipeline([
104 | ("preprocessing", data_prep_simple),
105 | ("regressor", GradientBoostingRegressor(random_state=RANDOM_STATE))
106 | ])
107 |
108 | # Random Forest pipelines
109 | rf_pipe_advanced = Pipeline([
110 | ("preprocessing", data_prep_advanced),
111 | ("regressor", RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE))
112 | ])
113 |
114 | rf_pipe_simple = Pipeline([
115 | ("preprocessing", data_prep_simple),
116 | ("regressor", RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE))
117 | ])
118 |
119 | # %%
120 | # Fit all models:
121 |
122 | # Fit all models
123 | gb_pipe_advanced.fit(X_train, y_train)
124 | gb_pipe_simple.fit(X_train, y_train)
125 | rf_pipe_advanced.fit(X_train, y_train)
126 | rf_pipe_simple.fit(X_train, y_train)
127 |
128 | # Print scores
129 | print("GB-advanced score:", root_mean_squared_error(y_test, gb_pipe_advanced.predict(X_test)))
130 | print("GB-simple score:", root_mean_squared_error(y_test, gb_pipe_simple.predict(X_test)))
131 | print("RF-advanced score:", root_mean_squared_error(y_test, rf_pipe_advanced.predict(X_test)))
132 | print("RF-simple score:", root_mean_squared_error(y_test, rf_pipe_simple.predict(X_test)))
133 |
134 | scores = pd.DataFrame({
135 | "model": ["GB1", "GB2", "RF1", "RF2"],
136 | "score": [
137 | root_mean_squared_error(y_test, gb_pipe_advanced.predict(X_test)),
138 | root_mean_squared_error(y_test, gb_pipe_simple.predict(X_test)),
139 | root_mean_squared_error(y_test, rf_pipe_advanced.predict(X_test)),
140 | root_mean_squared_error(y_test, rf_pipe_simple.predict(X_test))
141 | ]
142 | })
143 |
144 |
145 | # %%
146 | # Visualizing the results
147 | # -----------------------
148 | # In this step, we will look at the RMSE scores to understand how well each model performed.
149 | # At first glance, GB1 appears weaker than RF2, which might lead us to believe that Random Forest
150 | # is the better choice. However, this comparison is not fair, because each model was trained using
151 | # a different preprocessing pipeline. To make a fair comparison, we need to examine models that
152 | # use the same data preparation steps.
153 | #
154 | # When we compare models trained with the same preprocessing pipeline, we see that the Gradient
155 | # Boosting model actually scores better than both Random Forest pipelines. This shows how important
156 | # it is to evaluate models under consistent preprocessing conditions to accurately judge their
157 | # performance.
158 |
159 | import matplotlib.pyplot as plt
160 | plt.style.use("ggplot")
161 |
162 | scores.plot(
163 | kind="barh",
164 | x="model",
165 | y="score",
166 | color="steelblue",
167 | legend=False,
168 | figsize=(8, 4)
169 | )
170 | plt.ylabel("")
171 | plt.xlabel("RMSE")
172 | plt.xscale("log")
173 | plt.tight_layout()
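174 | 
175 | # %%
176 | # Because both preprocessing pipelines are ordinary scikit-learn objects, any
177 | # other regressor can be dropped in under the exact same data preparation.
178 | # Below is a minimal sketch with a plain linear baseline; the `lr_pipe_simple`
179 | # name is just for illustration.
180 | 
181 | from sklearn.linear_model import LinearRegression
182 | 
183 | # linear baseline under the simple preprocessing pipeline
184 | lr_pipe_simple = Pipeline([
185 |     ("preprocessing", data_prep_simple),
186 |     ("regressor", LinearRegression()),
187 | ])
188 | lr_pipe_simple.fit(X_train, y_train)
189 | print("LR-simple score:", root_mean_squared_error(y_test, lr_pipe_simple.predict(X_test)))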
--------------------------------------------------------------------------------
/examples/plot_rt_pipeline.py:
--------------------------------------------------------------------------------
1 | """
2 | Remaining Time Prediction Pipeline
3 | ==================================
4 |
5 | This example demonstrates how to build a pipeline for remaining time prediction
6 | using the BPI 2013 Closed Problems dataset.
7 |
8 | The pipeline consists of the following steps:
9 | 1. Preprocessing: Extracts features from the event log.
10 | 2. Encoding: Aggregates the extracted features.
11 | 3. Regression: Fits a regression model to predict the remaining time.
12 |
13 | The pipeline is evaluated using the R^2 score. We conclude by showing a trick
14 | to improve the performance of the regression model by transforming the target
15 | using `sklearn.compose.TransformedTargetRegressor`.
16 | """
17 |
18 | # %%
19 | # Required imports
20 | # ------------------
21 | # We start by importing the required modules and classes.
22 | from sklearn.compose import TransformedTargetRegressor
23 | import numpy as np
24 | import pandas as pd
25 |
26 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
27 | from sklearn.pipeline import Pipeline
28 | from sklearn.compose import ColumnTransformer
29 | from sklearn.ensemble import RandomForestRegressor
30 |
31 | from skpm.sequence_encoding import Aggregation
32 | from skpm.config import EventLogConfig as elc
33 | from skpm.feature_extraction import (
34 | TimestampExtractor,
35 | ResourcePoolExtractor,
36 | )
37 | from skpm.feature_extraction.targets import remaining_time
38 | from skpm.event_logs import BPI13ClosedProblems
39 |
40 | # %%
41 | # Download the example dataset
42 | # ----------------------------
43 | # We can automatically download event logs using SkPM.
44 | # In this example, let's use the :class:`~skpm.event_logs.BPI13ClosedProblems`.
45 | log = BPI13ClosedProblems()
46 | log # Note: this is a TUEventLog object, not a dataframe
47 |
48 | # %%
49 | # Subsequently, let's access the `pd.DataFrame` and
50 | # extract the target variable `remaining_time` using the
51 | # :func:`~skpm.feature_extraction.targets.remaining_time` function.
52 | log = log.dataframe.copy()
53 | log = log[[elc.case_id, elc.activity, elc.resource, elc.timestamp]]
54 |
55 | # extract the target variable
56 | log.loc[:, "remaining_time"] = remaining_time(log, time_unit="seconds")
57 |
58 | # In order to keep this example simple, we are skipping the train-test split.
59 | X_train = log.drop(columns=["remaining_time"])
60 | y_train = log["remaining_time"]
61 |
62 | log.head()
63 |
64 | # %%
65 | # Build the pipeline
66 | # ------------------
67 | # We build the pipeline by creating a sequence of steps.
68 | # The pipeline consists of the following steps:
69 | #
70 | # 1. **Preprocessing**: Extracts features from the event log.
71 | #
72 | # 2. **Encoding and normalizing**: Aggregates the extracted features and
73 | # applies the StandardScaler.
74 | #
75 | # 3. **Regression**: Fits a regression model to predict the remaining time.
76 | #
77 | # We create a `ColumnTransformer` to apply different transformations to
78 | # different columns. More specifically, we apply the following transformations:
79 | #
80 | # - `TimestampExtractor` to extract timestamp features.
81 | #
82 | # - `OneHotEncoder` to encode the activity column.
83 | #
84 | # - `ResourcePoolExtractor` to extract the resource pool of each activity.
85 | transformers = ColumnTransformer(
86 | transformers=[
87 | (
88 | "timestamp_features",
89 | TimestampExtractor(),
90 | [elc.timestamp, elc.case_id],
91 | ),
92 | (elc.activity, OneHotEncoder(sparse_output=False), [elc.activity]),
93 | (
94 | elc.resource,
95 | ResourcePoolExtractor(),
96 | [elc.case_id, elc.activity, elc.resource],
97 | ),
98 | (elc.case_id, "passthrough", [elc.case_id]),
99 | ]
100 | )
101 |
102 | # %%
103 | # Integrating the preprocessing transformers with the full pipeline.
104 | # The pipeline will transform/extract features, encode the traces,
105 | # normalize the features, and fit a regression model to predict the remaining
106 | # time.
107 | pipe = Pipeline(
108 | [
109 | ("preprocessing", transformers),
110 | ("encoding", Aggregation(method="mean")),
111 | ("scaling", StandardScaler()),
112 | ("regressor", RandomForestRegressor()),
113 | ]
114 | )
115 |
116 | print(pipe.fit(X_train, y_train).score(X_train, y_train))
117 |
118 | # %%
119 | # We can leverage the `TransformedTargetRegressor` class to improve the
120 | # performance of the regression model. This class allows us to transform the
121 | # target variable using a transformer before fitting the model. In this example,
122 | # we use the `FunctionTransformer` class to apply the `log1p` transformation to
123 | # the target. The pipeline will output the target in the original scale since
124 | # we set the `inverse_func` parameter to `np.expm1`.
125 | #
126 | # This trick allows us to enhance the predictive performance of the model.
127 |
128 | # sphinx_gallery_start_ignore
129 | import warnings
130 | warnings.filterwarnings("ignore")
131 | # sphinx_gallery_end_ignore
132 | y_trans = FunctionTransformer(np.log1p, inverse_func=np.expm1)
133 | regr = TransformedTargetRegressor(regressor=pipe, transformer=y_trans)
134 |
135 | print(regr.fit(X_train, y_train).score(X_train, y_train))
136 |
137 | # %%
138 | # In this tutorial, we showed how to run an end-to-end predictive
139 | # process monitoring pipeline. We hope you find it useful
140 | # for your projects. If you have any questions or suggestions, please
141 | # open an issue on our GitHub repository or
142 | # `contact me `_ directly.
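143 | 
144 | # %%
145 | # As a final minimal sketch: the target was extracted in seconds, so the
146 | # pipeline predicts remaining times in seconds as well. Converting a few
147 | # predictions to hours makes the output easier to read.
148 | 
149 | predictions_in_hours = regr.predict(X_train)[:5] / 3600
150 | print(predictions_in_hours.round(2))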
--------------------------------------------------------------------------------
/examples/plot_unbiased_split.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | Unbiased Split of Event Logs
4 | ============================
5 |
6 | In this tutorial we provide an overview of how the unbiased split of event
7 | logs [1] works and how to use it in the `skpm` package.
8 | """
9 |
10 | # %%
11 | # The `biased` split problem
12 | # --------------------------
13 | # In machine learning, standardizing how datasets are split is a common, often
14 | # essential, practice to ensure fair and reproducible results. However, in the
15 | # field of Process Mining, machine learning applications have not consistently
16 | # adopted this practice. Weytjens and De Weerdt's work [1] proposes the first
17 | # significant effort to address this gap.
18 | #
19 | # More specifically, their paper tackles three key challenges:
20 | #
21 | # 1. **Inconsistent Dataset Split**: Different datasets and preprocessing
22 | # methods make it hard to compare research outcomes. Certain preprocessing
23 | # choices can even lead to biased results due to the use of domain knowledge
24 | # that may not be accessible to all researchers.
25 | #
26 | # 2. **Data Leakage**: Training and test sets often overlap, with events from
27 | # the same case appearing in both, which leads to overfitted performance
28 | # measures and inaccurate predictions.
29 | #
30 | # 3. **Test Set Bias**: The test sets frequently suffer from bias due to
31 | # unequal distributions of case durations and active cases, especially at the
32 | # start and end of the dataset. This skews evaluation results, making them
33 | # less reflective of real-world performance.
34 | #
35 | # The `SkPM` package adapted the available code from the authors' GitHub [2].
36 | #
37 | # Unbiased Split API
38 | # ------------------
39 | # Only a few datasets are currently supported by the unbiased split method.
40 | # The usage is really simple and can be seen in the following example:
41 |
42 | from skpm.event_logs import split, BPI20RequestForPayment
43 |
44 | bpi20 = BPI20RequestForPayment()
45 |
46 | train, test = split.unbiased(bpi20, **bpi20.unbiased_split_params)
47 | train.shape, test.shape
48 |
49 | # %%
50 | # The hyperparameters for the unbiased split are hardcoded in the original
51 | # implementation. However, they are
52 | # derived from a data-driven analysis. In the future, we may consider
53 | # implementing this generic approach in order to extend the unbiased split
54 | # to other datasets. The hardcoded hyperparameters are:
55 | #
56 | # - `start_date`: the start date of the event log.
57 | # - `end_date`: the end date of the event log.
58 | # - `max_days`: the maximum duration of cases.
59 | #
60 | bpi20.unbiased_split_params
61 |
62 | # %%
63 | # For datasets without hardcoded hyperparameters, an exception will be raised:
64 |
65 | from skpm.event_logs import Sepsis
66 |
67 | sepsis = Sepsis()
68 | try:
69 | _ = split.unbiased(sepsis, **sepsis.unbiased_split_params)
70 | except Exception as e:
71 | print(e)
72 |
73 | # %%
74 | # The unbiased split is available for the following datasets:
75 | #
76 | # - :class:`~skpm.event_logs.BPI12`
77 | # - :class:`~skpm.event_logs.BPI17`
78 | # - :class:`~skpm.event_logs.BPI19`
79 | # - :class:`~skpm.event_logs.BPI20PrepaidTravelCosts`
80 | # - :class:`~skpm.event_logs.BPI20TravelPermitData`
81 | # - :class:`~skpm.event_logs.BPI20RequestForPayment`
82 | #
83 | # %%
84 | # In this tutorial, we showed how to use the unbiased split API.
85 | # We hope you find it useful
86 | # for your projects. If you have any questions or suggestions, please
87 | # open an issue on our GitHub repository or
88 | # `contact me `_ directly.
89 | #
90 | # References
91 | # ----------
92 | # [1] Hans Weytjens, Jochen De Weerdt. Creating Unbiased Public Benchmark
93 | # Datasets with Data Leakage Prevention for Predictive Process Monitoring,
94 | # 2021. doi: 10.1007/978-3-030-94343-1_2
95 | # [2] https://github.com/hansweytjens/predictive-process-monitoring-benchmarks
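96 | 
97 | # %%
98 | # As a closing sketch (assuming the returned train and test frames keep the
99 | # standard timestamp column from :class:`~skpm.config.EventLogConfig`), we can
100 | # inspect the timestamp range covered by each partition of the split above.
101 | 
102 | from skpm.config import EventLogConfig as elc
103 | 
104 | print("train:", train[elc.timestamp].min(), "->", train[elc.timestamp].max())
105 | print("test: ", test[elc.timestamp].min(), "->", test[elc.timestamp].max())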
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "skpm"
3 | version = "0.0.1"
4 | description = "A process mining library built upon scikit-learn!"
5 | authors = ["Rafael Oyamada"]
6 | license = "MIT"
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = ">=3.10"
11 | scikit-learn = ">=1.6.1"
12 | pandas = ">=2.2.0"
13 | pyarrow = ">=16.0.0"
14 | polars = ">=0.20.16"
15 | lxml = "5.1.0"
16 |
17 | [tool.poetry.group.dev.dependencies]
18 | pytest = ">=7.4.4"
19 | pytest-cov = ">=4.1.0"
20 | sphinx-autoapi = ">=3.0.0"
21 | sphinx-rtd-theme = ">=2.0.0"
22 | sphinx-gallery = ">=0.17.1"
23 | black = ">=24.1.1"
24 | codecov = ">=2.1.13"
25 | mypy = ">=1.8.0"
26 | coverage = ">=7.4.1"
27 | matplotlib = ">=3.8"
28 |
29 | [build-system]
30 | requires = ["poetry-core>=1.0.0"]
31 | build-backend = "poetry.core.masonry.api"
32 |
33 | [tool.black]
34 | line-length = 80
--------------------------------------------------------------------------------
/src/skpm/__init__.py:
--------------------------------------------------------------------------------
1 | # read version from installed package
2 | from importlib.metadata import version
3 |
4 | __version__ = version("skpm")
5 |
6 |
7 | from sklearn import set_config
8 |
9 | set_config(transform_output="pandas")
10 |
--------------------------------------------------------------------------------
/src/skpm/base.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | from pandas import DataFrame
3 | from sklearn.base import BaseEstimator
4 | from sklearn.utils.validation import validate_data
5 |
6 | from .config import EventLogConfig as elc
7 | from .utils.validation import ensure_list, validate_columns
8 |
9 |
10 | class BaseProcessEstimator(BaseEstimator):
11 | """Base class for all process estimators in skpm.
12 |
13 |     This class implements a common interface for all process
14 |     estimators, aiming at standardizing the validation and
15 |     transformation of event logs.
16 |
17 | For instance, all event logs must have a `case_id` column.
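18 | 
19 |     Examples
20 |     --------
21 |     A minimal sketch of the intended usage pattern in subclasses (the
22 |     transformer below is illustrative, not part of skpm):
23 | 
24 |     >>> class MyTransformer(BaseProcessEstimator):
25 |     ...     def fit(self, X, y=None):
26 |     ...         X = self._validate_log(X)
27 |     ...         return self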
18 | """
19 |
20 | def _validate_log(
21 | self,
22 | X: DataFrame,
23 | y: DataFrame = None,
24 | reset: bool = True,
25 | copy: bool = True,
26 | ):
27 | """
28 | Validate and preprocess the input event log DataFrame.
29 |
30 | Parameters
31 | ----------
32 | X : DataFrame
33 | The input DataFrame representing the event log.
34 | y : DataFrame, default=None
35 | The target DataFrame associated with the event log.
36 | reset : bool, default=True
37 | Whether to reset the index of the DataFrame after validation.
38 | copy : bool, default=True
39 | Whether to create a copy of the DataFrame before validation.
40 |
41 | Returns
42 | -------
43 | DataFrame
44 | The preprocessed and validated event log DataFrame.
45 |
46 | Raises
47 | ------
48 | ValueError
49 | If the input is not a DataFrame or if the case ID column is missing.
50 | """
51 | is_polars = False
52 | if isinstance(X, pl.DataFrame): # For Polars DataFrame
53 | X = X.to_pandas()
54 | is_polars = True
55 |
56 | self._validate_params()
57 |
58 | # TODO: the validation of a dataframe might be done
59 | # through the `pd.api.extensions`.
60 | # This would decrease the dependency between data validation
61 | # and sklearn estimators.
62 | # See: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-pandas
63 | data = X.copy() if copy else X
64 |
65 | # despite the bottlenecks, event logs are better handled as dataframes
66 | assert isinstance(data, DataFrame), "Input must be a dataframe."
67 | cols = ensure_list(data.columns)
68 |
69 | self._case_id = self._ensure_case_id(data.columns)
70 |
71 | validate_data(
72 | self,
73 | X=X.drop(columns=self._case_id, axis=1),
74 | y=y,
75 | reset=reset,
76 | )
77 |
78 | cols = validate_columns(
79 | input_columns=data.columns,
80 | required=[self._case_id] + list(self.feature_names_in_),
81 | )
82 |
83 | if is_polars: # For Polars DataFrame
84 | data = pl.from_pandas(data)
85 | return data[cols]
86 |
87 | def _ensure_case_id(self, columns: list[str]):
88 | """
89 | Ensure that the case ID column is present in the list of columns.
90 |
91 | Parameters
92 | ----------
93 | columns : list[str]
94 | The list of column names to check for the presence of the case ID.
95 |
96 | Returns
97 | -------
 98 |         str
 99 |             The name of the case ID column; a ``ValueError`` is raised if none is found.
100 | """
101 | for col in columns:
102 | if col.endswith(elc.case_id):
103 | return col
104 | raise ValueError(f"Case ID column not found.")
105 |
--------------------------------------------------------------------------------
/src/skpm/config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | EndOfTrace = str
4 |
5 | @dataclass
6 | class EventLogConfig:
7 | case_id: str = "case:concept:name"
8 | activity: str = "concept:name"
9 | resource: str = "org:resource"
10 | timestamp: str = "time:timestamp"
11 |
12 | default_file_format: str = ".parquet"
13 |
14 | EOT: EndOfTrace = "EOT"
15 |
16 | def update(self, **kwargs):
17 | for key, value in kwargs.items():
18 | setattr(self, key, value)
--------------------------------------------------------------------------------
/src/skpm/event_logs/__init__.py:
--------------------------------------------------------------------------------
1 | from .bpi import (
2 | BPI12,
3 | BPI13ClosedProblems,
4 | BPI13Incidents,
5 | BPI13OpenProblems,
6 | BPI17,
7 | BPI19,
8 | BPI20PrepaidTravelCosts,
9 | BPI20TravelPermitData,
10 | BPI20RequestForPayment,
11 | BPI20DomesticDeclarations,
12 | BPI20InternationalDeclarations,
13 | Sepsis,
14 | )
15 |
16 | from .parser import read_xes
17 |
18 | __all__ = [
19 | "BPI12",
20 | "BPI13ClosedProblems",
21 | "BPI13Incidents",
22 | "BPI13OpenProblems",
23 | "BPI17",
24 | "BPI19",
25 | "BPI20PrepaidTravelCosts",
26 | "BPI20TravelPermitData",
27 | "BPI20RequestForPayment",
28 | "BPI20DomesticDeclarations",
29 | "BPI20InternationalDeclarations",
30 | "Sepsis",
31 | "read_xes",
32 | ]
33 |
--------------------------------------------------------------------------------
/src/skpm/event_logs/base.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pandas as pd
4 |
5 | from skpm.config import EventLogConfig as elc
6 | from skpm.event_logs.parser import read_xes
7 | from skpm.event_logs.download import download_url
8 | from skpm.event_logs.extract import extract_gz
9 |
10 |
11 | class BasePreprocessing:
12 | def preprocess(self):
13 | """
14 | Preprocess the event log by converting the timestamp column to
15 | datetime format.
16 | """
17 | self._dataframe[elc.timestamp] = pd.to_datetime(
18 | self._dataframe[elc.timestamp], utc=True, format="mixed"
19 | )
20 |
21 |
22 | class TUEventLog(BasePreprocessing):
23 | """
24 | Base class for event logs from the 4TU repository.
25 |
26 |     It provides the basic structure for downloading, preprocessing, and
27 |     splitting event logs.
28 |     Furthermore, it provides the basic structure for caching the logs.
29 |
30 | Event logs from the 4tu repository [1] are downloaded as .xes.gz files
31 | and then converted to parquet files. The parquet files are then used to
32 | load the event logs.
33 |     By default, we keep the .xes files in the raw folder.
34 |
35 | Parameters
36 | ----------
37 | root_folder : str, optional
38 | Path where the event log will be stored. Defaults to "./data".
39 | save_as_pandas : bool, optional
40 | Whether to save the event log as a Pandas DataFrame. Defaults to
41 | True.
42 | train_set : bool, optional
43 | Whether the event log is for the training set. Defaults to True.
44 | file_path : str, optional
45 | Path to the event log file. If None, the file will be downloaded.
46 | Defaults to None.
47 |
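48 |     Examples
49 |     --------
50 |     A sketch of how concrete event logs subclass this base class; the URL,
51 |     MD5, and file name below are placeholders, not a real 4TU record:
52 | 
53 |     >>> class MyLog(TUEventLog):
54 |     ...     url = "https://data.4tu.nl/file/<uuid>/<file-uuid>"
55 |     ...     md5 = "<md5-of-the-download>"
56 |     ...     file_name = "MyLog.xes.gz"
57 |     >>> log = MyLog()  # downloads, converts to parquet, and caches the log
58 |     >>> log.dataframe.head()
59 | 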
48 | References:
49 | -----------
50 | [1] 4TU Research Data: https://data.4tu.nl/
51 | """
52 |
53 | url: str = None
54 | md5: str = None
55 | file_name: str = None
56 | meta_data: str = None # TODO: download DATA.xml from the 4TU repository
57 |
58 | _unbiased_split_params: dict = None
59 |
60 | def __init__(
61 | self,
62 | root_folder: str = "./data",
63 | save_as_pandas: bool = True,
64 | train_set: bool = True,
65 | file_path: str = None,
66 | ) -> None:
67 | super().__init__()
68 | self.root_folder = root_folder
69 | self.save_as_pandas = save_as_pandas
70 | self.train_set = train_set
71 |
72 | if file_path is None:
73 | self._file_path = os.path.join(
74 | self.root_folder,
75 | self.__class__.__name__,
76 | self.file_name.replace(".gz", "").replace(
77 | ".xes", elc.default_file_format
78 | ),
79 | )
80 | else:
81 | self._file_path = file_path
82 |
83 | if not os.path.exists(self.file_path):
84 | self.download()
85 |
86 | self._dataframe = self.read_log()
87 | self.preprocess()
88 |
89 | @property
90 | def dataframe(self) -> pd.DataFrame:
91 | """
92 | pd.DataFrame: DataFrame containing the event log data.
93 | """
94 | return self._dataframe
95 |
96 | @property
97 | def file_path(self) -> str:
98 | """
99 | str: Path to the event log file.
100 | """
101 | return self._file_path
102 |
103 | @file_path.setter
104 | def file_path(self, value):
105 | self._file_path = value
106 |
107 | @property
108 | def unbiased_split_params(self) -> dict:
109 | """
110 | dict: Parameters for the unbiased split of the event log.
111 | """
112 | if self._unbiased_split_params is None:
113 | raise ValueError(
114 | f"Unbiased split not available for {self.__class__.__name__}."
115 | )
116 | return self._unbiased_split_params
117 |
118 | def __len__(self):
119 | """
120 | Get the number of events in the event log.
121 |
122 | Returns
123 | -------
124 | int
125 | Number of events in the event log.
126 | """
127 | return len(self._dataframe)
128 |
129 | def download(self) -> None:
130 | """Generic method to download the event log from the 4TU Repository.
131 |
132 | It downloads the event log from the url, uncompresses
133 | it, and stores it. It can be overwritten by the
134 | subclasses if needed.
135 | """
136 | destination_folder = os.path.join("data", self.__class__.__name__)
137 | print(f"Downloading {destination_folder}")
138 | path = download_url(
139 | url=self.url, folder=destination_folder, file_name=self.file_name
140 | )
141 | if path.endswith(".xes"):
142 | self.file_path = path
143 | return
144 |
145 | if path.endswith(".gz"):
146 | self.file_path = extract_gz(
147 | path=path, folder=os.path.dirname(destination_folder)
148 | )
149 | # TODO: elif other formats
150 | os.remove(path)
151 |
152 | def read_log(self) -> pd.DataFrame:
153 | """
154 | Read the event log from the file.
155 |
156 | Returns
157 | -------
158 | pd.DataFrame
159 | DataFrame containing the event log data.
160 | """
161 | if self.file_path.endswith(".xes"):
162 | log = read_xes(self.file_path)
163 |
164 | if self.save_as_pandas:
165 | new_file_path = self.file_path.replace(
166 | ".xes", elc.default_file_format
167 | )
168 | if elc.default_file_format == ".parquet":
169 | log.to_parquet(new_file_path)
170 | else:
171 | raise ValueError("File format not implemented.")
172 | os.remove(self.file_path)
173 | self.file_path = new_file_path
174 |
175 | elif self.file_path.endswith(elc.default_file_format):
176 | log = pd.read_parquet(self.file_path)
177 | else:
178 | raise ValueError("File format not implemented.")
179 |
180 | return log
181 |
182 | def __repr__(self) -> str:
183 | """
184 | Return a string representation of the TUEventLog object.
185 |
186 | Returns
187 | -------
188 | str
189 | String representation of the TUEventLog object.
190 | """
191 | head = f"{self.__class__.__name__} Event Log"
192 | body = [f"Number of cases: {self._dataframe[elc.case_id].nunique()}"]
193 | body.append(f"Number of events: {self.__len__()}")
194 | if self.file_path is not None:
195 | body.append(
196 | f"Event log location: {os.path.normpath(self.file_path)}"
197 | )
198 | body += "".splitlines()
199 | lines = [head] + [" " * 4 + line for line in body]
200 | return "\n".join(lines)
201 |
--------------------------------------------------------------------------------
/src/skpm/event_logs/bpi.py:
--------------------------------------------------------------------------------
1 | from skpm.event_logs.base import TUEventLog
2 |
3 |
4 | class BPI12(TUEventLog):
5 | """:doi:`BPI Challenge 2012 event log
6 | <10.4121/uuid:3926db30-f712-4394-aebc-75976070e91f>`.
7 |
8 | This dataset is from the Business Process Intelligence (BPI) Challenge
9 | 2012 and contains event logs from a real-life financial institution. The
10 | event log records the execution of various activities related to a loan
11 | application process. Each event in the log represents a step in handling a
12 | loan request, with relevant information about the case, timestamp, and
13 | resource involved.
14 |
15 |
16 | Parameters
17 | ----------
 18 |     root_folder : str, optional
 19 |         Path where the event log will be stored. Defaults to "data/".
 20 |     save_as_pandas : bool, optional
 21 |         Whether to save the event log as a pandas parquet file. Defaults to
 22 |         True.
 23 |     train_set : bool, optional
 24 |         Whether to use the train set or the test set. If True, use the train
 25 |         set. If False, use the test set. Defaults to True.
 26 |
27 |
28 | Examples
29 | --------
30 | >>> bpi_12 = BPI12()
31 | >>> bpi_12.download() # Manually download the event log
 32 |     >>> event_log = bpi_12.dataframe # Access the event log DataFrame
33 | """
34 |
35 | url: str = (
36 | "https://data.4tu.nl/file/533f66a4-8911-4ac7-8612-1235d65d1f37/3276db7f-8bee-4f2b-88ee-92dbffb5a893"
37 | )
38 | md5: str = "74c7ba9aba85bfcb181a22c9d565e5b5"
39 | file_name: str = "BPI_Challenge_2012.xes.gz"
40 |
41 | _unbiased_split_params: dict = {
42 | "start_date": None,
43 | "end_date": "2012-02",
44 | "max_days": 32.28,
45 | }
46 |
47 |
48 | class BPI13ClosedProblems(TUEventLog):
49 | """:doi:`BPI Challenge 2013 Closed problems event log
50 | `.
51 |
52 | The BPI 2013 Closed Problems log consists of 1487 cases and 6660 events.
53 | It originates from the problem management process of Volvo IT Belgium,
54 | focusing on cases where problems were diagnosed and resolved to enhance IT
55 | service quality.
56 |
57 |
58 |
59 | Parameters
60 | ----------
61 | root_folder : str, optional
62 | Path where the event log will be stored. Defaults to "data/".
63 | save_as_pandas : bool, optional
64 | Whether to save the event log as a pandas parquet file. Defaults to
65 | True.
66 | train_set : bool, optional
67 | Whether to use the train set or the test set. If True, use the train
68 | set. If False, use the test set. Defaults to True.
69 |
70 | Examples
71 | --------
 72 |     >>> bpi_13_closed = BPI13ClosedProblems()
 73 |     >>> bpi_13_closed.download() # Manually download the event log
 74 |     >>> event_log = bpi_13_closed.dataframe # Access the event log DataFrame
75 | """
76 |
77 | url: str = (
78 | "https://data.4tu.nl/file/1987a2a6-9f5b-4b14-8d26-ab7056b17929/8b99119d-9525-452e-bc8f-236ac76fa9c9"
79 | )
80 | md5: str = "4f9c35942f42cb90d911ee4936bbad87"
81 | file_name: str = "BPI_Challenge_2013_closed_problems.xes.gz"
82 |
83 |
84 | class BPI13Incidents(TUEventLog):
85 | """:doi:`BPI Challenge 2013 Incidents
86 | `.
87 |
88 | The BPI 2013 Incidents log contains 7554 cases and 65533 events.
89 | It is part of the incident management process at Volvo IT Belgium,
90 | aimed at restoring normal service operations for customers as quickly as
91 | possible, while maintaining high levels of service quality and
92 | availability.
93 |
94 | Parameters
95 | ----------
96 | root_folder : str, optional
97 | Path where the event log will be stored. Defaults to "data/".
98 | save_as_pandas : bool, optional
99 | Whether to save the event log as a pandas parquet file. Defaults to True.
100 | train_set : bool, optional
101 | Whether to use the train set or the test set. If True, use the train set. If False, use the test set. Defaults to True.
102 |
103 | Examples
104 | --------
105 |     >>> bpi_13_incidents = BPI13Incidents()
106 |     >>> bpi_13_incidents.download() # Manually download the event log
107 |     >>> event_log = bpi_13_incidents.dataframe # Access the event log DataFrame
108 | """
109 |
110 | url: str = (
111 | "https://data.4tu.nl/file/0fc5c579-e544-4fab-9143-fab1f5192432/aa51ffbb-25fd-4b5a-b0b8-9aba659b7e8c"
112 | )
113 | md5: str = "d4809bd55e3e1c15b017ab4e58228297"
114 | file_name: str = "BPI_Challenge_2013_incidents.xes.gz"
115 |
116 |
117 | class BPI13OpenProblems(TUEventLog):
118 | """:doi:`BPI Challenge 2013 open problems
119 | `.
120 |
121 | The BPI 2013 Open Problems log contains 819 cases and 2351 events.
122 | It originates from the problem management process of Volvo IT Belgium,
123 | focusing on unresolved problems that are still open and require further
124 | diagnosis and action to improve IT service quality.
125 |
126 |
127 | Parameters
128 | ----------
129 | root_folder : str, optional
130 | Path where the event log will be stored. Defaults to "data/".
131 | save_as_pandas : bool, optional
132 | Whether to save the event log as a pandas parquet file. Defaults to
133 | True.
134 | train_set : bool, optional
135 | Whether to use the train set or the test set. If True, use the train
136 | set. If False, use the test set. Defaults to True.
137 |
138 | Examples
139 | --------
140 | >>> bpi_13_open_problems = BPI13OpenProblems()
141 | >>> bpi_13_open_problems.download() # Manually download the event log
142 |     >>> event_log = bpi_13_open_problems.dataframe # Access the event log DataFrame
143 | """
144 |
145 | url: str = (
146 | "https://data.4tu.nl/file/7aafbf5b-97ae-48ba-bd0a-4d973a68cd35/0647ad1a-fa73-4376-bdb4-1b253576c3a1"
147 | )
148 | md5: str = "9663e544a2292edf1fe369747736e7b4"
149 | file_name: str = "BPI_Challenge_2013_open_problems.xes.gz"
150 |
151 |
152 | class BPI17(TUEventLog):
153 | """:doi:`BPI Challenge 2017
154 | `.
155 |
156 | The BPI 2017 event log originates from a loan application process at a
157 | Dutch financial institution. The data encompasses all loan applications
158 | submitted through an online system. This event log follows the same
159 | company and process as the BPI Challenge 2012. A notable feature of
160 | the new system is its ability to handle multiple offers for a single loan
161 | application, and these offers are tracked by their IDs within the event
162 | log.
163 |
164 |
165 | Parameters
166 | ----------
167 | root_folder : str, optional
168 | Path where the event log will be stored. Defaults to "./data".
169 | save_as_pandas : bool, optional
170 | Whether to save the event log as a pandas parquet file. Defaults to
171 | True.
172 | train_set : bool, optional
173 | Whether to use the train set or the test set. If True, use the train
174 | set. If False, use the test set. Defaults to True.
175 | file_path : str, optional
176 | Path to the file containing the event log. If provided, the event log
177 | will be loaded from this file. Defaults to None.
178 |
179 | Examples
180 | --------
181 | >>> bpi_17 = BPI17()
182 | >>> bpi_17.download() # Manually download the event log
183 |     >>> event_log = bpi_17.dataframe # Access the event log DataFrame
184 | """
185 |
186 | url: str = (
187 | "https://data.4tu.nl/file/34c3f44b-3101-4ea9-8281-e38905c68b8d/f3aec4f7-d52c-4217-82f4-57d719a8298c"
188 | )
189 | md5: str = "10b37a2f78e870d78406198403ff13d2"
190 | file_name: str = "BPI Challenge 2017.xes.gz"
191 |
192 | _unbiased_split_params: dict = {
193 | "start_date": None,
194 | "end_date": "2017-01",
195 | "max_days": 47.81,
196 | }
197 |
198 |
199 | class BPI19(TUEventLog):
200 | """:doi:`BPI Challenge 2019
201 | `.
202 |
203 |
204 | The BPI 2019 event log comes from a large multinational company in the
205 | coatings and paints industry, based in the Netherlands. It focuses on the
206 | purchase order handling process across 60 subsidiaries. Each purchase
207 | order contains one or more line items, with four types of matching flows:
208 | 3-way matching with goods receipt, 3-way matching without goods receipt,
209 | 2-way matching, and consignment. The log records 76,349 purchase documents,
210 | covering 251,734 items, with a total of 1,595,923 events. These events
211 | span 42 activities performed by 627 users, including both batch and normal
212 | users. The data is fully anonymized and structured in an IEEE-XES
213 | compliant format.
214 |
215 |
216 | Parameters
217 | ----------
218 | root_folder : str, optional
219 | Path where the event log will be stored. Defaults to "data/".
220 | save_as_pandas : bool, optional
221 | Whether to save the event log as a pandas parquet file. Defaults to True.
222 | train_set : bool, optional
223 | Whether to use the train set or the test set. If True, use the train set. If False, use the test set. Defaults to True.
224 |
225 | Examples
226 | --------
227 | >>> bpi_19 = BPI19()
228 | >>> bpi_19.download() # Manually download the event log
229 |     >>> event_log = bpi_19.dataframe # Access the event log DataFrame
230 | """
231 |
232 | url: str = (
233 | "https://data.4tu.nl/file/35ed7122-966a-484e-a0e1-749b64e3366d/864493d1-3a58-47f6-ad6f-27f95f995828"
234 | )
235 | md5: str = "4eb909242351193a61e1c15b9c3cc814"
236 | file_name: str = "BPI_Challenge_2019.xes"
237 |
238 | _unbiased_split_params: dict = {
239 | "start_date": "2018-01",
240 | "end_date": "2019-02",
241 | "max_days": 143.33,
242 | }
243 |
244 |
245 | class BPI20PrepaidTravelCosts(TUEventLog):
246 | """:doi:`BPI2020 Prepaid Travel Costs
247 | `.
248 |
249 |
250 | The BPI 2020 Prepaid Travel Costs event log records two years of travel
251 | expense claims for a university. In 2017, the data covers two departments,
252 | while in 2018, it extends to the entire university. The dataset includes
253 | various declarations and requests, such as domestic and international
254 | travel declarations, pre-paid travel costs, and payment requests. The
255 | process begins with submission by an employee, followed by approval from
256 | the travel administration, budget owner, and supervisor. For international
257 | trips, prior permission from the supervisor is mandatory, while domestic
258 | trips do not require prior approval. Reimbursement claims can be filed
259 | either upon payment of costs or within two months after the trip.
260 |
261 |
262 | Parameters
263 | ----------
264 | root_folder : str, optional
265 | Path where the event log will be stored. Defaults to "data/".
266 | save_as_pandas : bool, optional
267 | Whether to save the event log as a pandas parquet file. Defaults to
268 | True.
269 | train_set : bool, optional
270 | Whether to use the train set or the test set. If True, use the train
271 | set. If False, use the test set. Defaults to True.
272 |
273 | Examples
274 | --------
275 | >>> bpi_20 = BPI20PrepaidTravelCosts()
276 | >>> bpi_20.download() # Manually download the event log
277 |     >>> event_log = bpi_20.dataframe # Access the event log DataFrame
278 | """
279 |
280 | url: str = (
281 | "https://data.4tu.nl/file/fb84cf2d-166f-4de2-87be-62ee317077e5/612068f6-14d0-4a82-b118-1b51db52e73a"
282 | )
283 | md5: str = "b6ab8ee749e2954f09a4fef030960598"
284 | file_name: str = "PrepaidTravelCost.xes.gz"
285 |
286 | _unbiased_split_params: dict = {
287 | "start_date": None,
288 | "end_date": "2019-01",
289 | "max_days": 114.26,
290 | }
291 |
292 |
293 | class BPI20TravelPermitData(TUEventLog):
294 | """:doi:`BPI2020 Travel Permit Data
295 | `.
296 |
297 | The BPI 2020 Travel Permit event log contains 7,065 cases and 86,581
298 | events, covering two years of travel expense claims at a university. In
299 | 2017, data was gathered from two departments, expanding to the entire
300 | university in 2018. The log tracks the full process of travel permits,
301 | including related prepaid travel cost declarations and travel declarations.
302 | The process begins with the submission of a travel permit request by an
303 | employee, followed by approval from the travel administration, budget
304 | owner, and supervisor. For international trips, prior approval is required
305 | before making any travel arrangements, while domestic trips do not need
306 | prior approval. Reimbursement claims for costs can be submitted either
307 | upon payment or within two months after the trip.
308 |
309 |
310 | Parameters
311 | ----------
312 | root_folder : str, optional
313 | Path where the event log will be stored. Defaults to "data/".
314 | save_as_pandas : bool, optional
315 | Whether to save the event log as a pandas parquet file. Defaults to
316 | True.
317 | train_set : bool, optional
318 | Whether to use the train set or the test set. If True, use the train
319 | set. If False, use the test set. Defaults to True.
320 |
321 | Examples
322 | --------
323 | >>> bpi_20 = BPI20TravelPermitData()
324 | >>> bpi_20.download() # Manually download the event log
325 |     >>> event_log = bpi_20.dataframe # Access the event log DataFrame
326 | """
327 |
328 | url: str = (
329 | "https://data.4tu.nl/file/db35afac-2133-40f3-a565-2dc77a9329a3/12b48cc1-18a8-4089-ae01-7078fc5e8f90"
330 | )
331 | md5: str = "b6e9ff00d946f6ad4c91eb6fb550aee4"
332 | file_name: str = "PermitLog.xes.gz"
333 |
334 | _unbiased_split_params: dict = {
335 | "start_date": None,
336 | "end_date": "2019-10",
337 | "max_days": 258.81,
338 | }
339 |
340 |
341 | class BPI20RequestForPayment(TUEventLog):
342 | """:doi:`BPI2020 Request For Payment
343 | `.
344 |
345 |
346 | The BPI 2020 Request for Payment event log contains 6,886 cases and 36,796
347 | events, primarily focusing on requests for payment that are not related to
348 | travel. However, some events may mistakenly be linked to travel, which is
349 | considered an unwanted deviation. The dataset covers two years of events,
350 | with data collected from two departments in 2017 and the entire university
351 | in 2018. The process for requests follows a similar flow to other
352 | declarations: submission by an employee, approval by the travel
353 | administration, and further approvals by the budget owner and supervisor
354 | if necessary.
355 |
356 |
357 | Parameters
358 | ----------
359 | root_folder : str, optional
360 | Path where the event log will be stored. Defaults to "data/".
361 | save_as_pandas : bool, optional
362 | Whether to save the event log as a pandas parquet file. Defaults to
363 | True.
364 | train_set : bool, optional
365 | Whether to use the train set or the test set. If True, use the train
366 | set. If False, use the test set. Defaults to True.
367 |
368 | Examples
369 | --------
370 | >>> bpi_20 = BPI20RequestForPayment()
371 | >>> bpi_20.download() # Manually download the event log
372 |     >>> event_log = bpi_20.dataframe # Access the event log DataFrame
373 | """
374 |
375 | url: str = (
376 | "https://data.4tu.nl/file/a6f651a7-5ce0-4bc6-8be1-a7747effa1cc/7b1f2e56-e4a8-43ee-9a09-6e64f45a1a98"
377 | )
378 | md5: str = "2eb4dd20e70b8de4e32cc3c239bde7f2"
379 | file_name: str = "RequestForPayment.xes.gz"
380 |
381 | _unbiased_split_params: dict = {
382 | "start_date": None,
383 | "end_date": "2018-12",
384 | "max_days": 28.86,
385 | }
386 |
387 |
388 | class BPI20DomesticDeclarations(TUEventLog):
389 | """:doi:`BPI2020 Domestic Declarations
390 | `.
391 |
392 |
393 | The BPI 2020 Domestic Declarations event log contains 10,500 cases and
394 | 56,437 events. The dataset focuses on domestic travel expense claims over
395 | a two-year period. In 2017, data was collected from two departments, while
396 | in 2018, it covered the entire university. Domestic declarations do not
397 | require prior permission; employees can complete these trips and later
398 | request reimbursement for the incurred costs. The process follows a
399 | similar approval flow: after submission by the employee, the request is
400 | reviewed by the travel administration and further approved by the budget
401 | owner and supervisor, if necessary.
402 |
403 |
404 | Parameters
405 | ----------
406 | root_folder : str, optional
407 | Path where the event log will be stored. Defaults to "data/".
408 | save_as_pandas : bool, optional
409 | Whether to save the event log as a pandas parquet file. Defaults to
410 | True.
411 | train_set : bool, optional
412 | Whether to use the train set or the test set. If True, use the train
413 | set. If False, use the test set. Defaults to True.
414 |
415 | Examples
416 | --------
417 | >>> bpi_20 = BPI20DomesticDeclarations()
418 | >>> bpi_20.download() # Manually download the event log
419 |     >>> event_log = bpi_20.dataframe # Access the event log DataFrame
420 | """
421 |
422 | url: str = (
423 | "https://data.4tu.nl/file/6a0a26d2-82d0-4018-b1cd-89afb0e8627f/6eeb0328-f991-48c7-95f2-35033504036e"
424 | )
425 | md5: str = "6a78c39491498363ce4788e0e8ca75ef"
426 | file_name: str = "DomesticDeclarations.xes.gz"
427 |
428 |
429 | class BPI20InternationalDeclarations(TUEventLog):
430 | """:doi:`BPI2020 International Declarations
431 | `.
432 |
433 | The BPI 2020 International Declarations event log contains 6,449 cases and
434 | 72,151 events, covering two years of travel expense claims at a university.
435 | In 2017, the data was collected from two departments, expanding to the
436 | entire university in 2018. Unlike domestic trips, international trips
437 | require prior approval from the supervisor, which is obtained by
438 | submitting a travel permit. Once the permit is approved, the employee can
439 | proceed with travel arrangements. After the trip or payment of related
440 | expenses (e.g., flights or conference fees), a reimbursement claim is
441 | filed, which can be submitted either upon payment or within two months
442 | after the trip.
443 |
444 |
445 | Parameters
446 | ----------
447 | root_folder : str, optional
448 | Path where the event log will be stored. Defaults to "data/".
449 | save_as_pandas : bool, optional
450 | Whether to save the event log as a pandas parquet file. Defaults to
451 | True.
452 | train_set : bool, optional
453 | Whether to use the train set or the test set. If True, use the train
454 | set. If False, use the test set. Defaults to True.
455 |
456 | Examples
457 | --------
458 | >>> bpi_20 = BPI20InternationalDeclarations()
459 | >>> bpi_20.download() # Manually download the event log
460 |     >>> event_log = bpi_20.dataframe # Access the event log DataFrame
461 |
462 | """
463 |
464 | url: str = (
465 | "https://data.4tu.nl/file/91fd1fa8-4df4-4b1a-9a3f-0116c412378f/d45ee7dc-952c-4885-b950-4579a91ef426"
466 | )
467 | md5: str = "1ec65e046f70bb399cc6d2c154cd615a"
468 | file_name: str = "InternationalDeclarations.xes.gz"
469 |
470 |
471 | class Sepsis(TUEventLog):
472 | """:doi:`Sepsis
473 | `.
474 |
475 |
476 | The Sepsis event log contains real-life hospital data regarding sepsis
477 | cases, a life-threatening condition often caused by infection. Each case
478 | in the log represents a patient's pathway through the hospital. The
479 | dataset includes around 1000 cases and approximately 15,000 events,
480 | covering 16 different activities. Additionally, 39 data attributes are
481 | recorded, such as the responsible group for each activity, test results,
482 | and information from checklists. All events and attribute values have been
483 | anonymized. While the timestamps of events have been randomized, the
484 | intervals between events within a trace remain unchanged.
485 |
486 |
487 | Parameters
488 | ----------
489 | root_folder : str, optional
490 | Path where the event log will be stored. Defaults to "data/".
491 | save_as_pandas : bool, optional
492 | Whether to save the event log as a pandas parquet file. Defaults to
493 | True.
494 | train_set : bool, optional
495 | Whether to use the train set or the test set. If True, use the train
496 | set. If False, use the test set. Defaults to True.
497 |
498 | Examples
499 | --------
500 | >>> sepsis = Sepsis()
501 | >>> sepsis.download() # Manually download the event log
502 |     >>> event_log = sepsis.dataframe # Access the event log DataFrame
503 |
504 | """
505 |
506 | url: str = (
507 | "https://data.4tu.nl/file/33632f3c-5c48-40cf-8d8f-2db57f5a6ce7/643dccf2-985a-459e-835c-a82bce1c0339"
508 | )
509 |
510 | md5: str = "b5671166ac71eb20680d3c74616c43d2"
511 | file_name: str = "Sepsis Cases - Event Log.xes.gz"
512 |
--------------------------------------------------------------------------------
/src/skpm/event_logs/download.py:
--------------------------------------------------------------------------------
1 | import os
2 | import typing as t
3 | from urllib import request
4 |
5 |
6 | def download_url(
7 | url: str, folder: t.Optional[str] = None, file_name: t.Optional[str] = None
8 | ) -> str:
9 | """Download a file from a `url` and place it in `folder`.
10 |
11 | Args:
12 | url (str): URL to download file from
13 | folder (str, optional): Folder to download file to.
14 | If None, use the current working directory. Defaults to None.
15 | file_name (str, optional): Name to save the file under.
16 | If None, use the basename of the URL. Defaults to None.
17 |
18 | Returns:
19 |         path (str): Path to the downloaded file.
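20 | 
21 |     Example:
22 |         A usage sketch with placeholder values:
23 | 
24 |         >>> path = download_url(
25 |         ...     url="https://example.com/logs/MyLog.xes.gz",
26 |         ...     folder="./data/MyLog",
27 |         ... )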
20 | """
21 | if folder is None:
22 | folder = os.getcwd()
23 |
24 | if file_name is None:
25 | # TODO: maybe get the file_name from the request?
26 | # response.info().get_file_name()
27 | file_name = os.path.basename(url)
28 | path = os.path.join(folder, file_name)
29 |
30 | if os.path.exists(path):
31 | return path
32 |
33 | # try:
34 | os.makedirs(os.path.expanduser(os.path.normpath(folder)), exist_ok=True)
35 | # except OSError as e:
36 | # raise e
37 |
38 | _urlretrieve(url=url, destination=path)
39 | return path
40 |
41 |
42 | def _save_response_content(
43 | content: t.Iterator[bytes],
44 | destination: str,
45 | ) -> None:
46 | """
47 | Save the content received from an HTTP response to a file.
48 |
49 | Parameters
50 | ----------
51 | content : Iterator[bytes]
52 | Iterator yielding binary data chunks from the HTTP response.
53 | destination : str
54 | Path to the file where the content will be saved.
55 |
56 | Returns
57 | -------
58 | None
59 | """
60 | with open(destination, "wb") as fh:
61 | for chunk in content:
62 | # filter out keep-alive new chunks
63 | # if not chunk:
64 | # continue
65 |
66 | fh.write(chunk)
67 |
68 |
69 | def _urlretrieve(
70 | url: str, destination: str, chunk_size: int = 1024 * 32
71 | ) -> None:
72 | """
73 | Retrieve a URL and save its contents to a file.
74 |
75 | Parameters
76 | ----------
77 | url : str
78 | The URL of the resource to retrieve.
79 | destination : str
80 | Path to the file where the content will be saved.
81 | chunk_size : int, optional
82 | Size of the chunks to read from the response at a time, in bytes.
83 | Defaults to 32KB.
84 |
85 | Returns
86 | -------
87 | None
88 | """
89 | with request.urlopen(request.Request(url)) as response:
90 | _save_response_content(
91 | iter(lambda: response.read(chunk_size), b""),
92 | destination,
93 | )
94 |
--------------------------------------------------------------------------------
/src/skpm/event_logs/extract.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import os.path as osp
3 | import zipfile
4 |
5 |
6 | def extract_gz(path: str, folder: str):
7 | r"""Extracts a gz archive to a specific folder.
8 |
9 | Args:
10 |         path (str): The path to the gz archive.
11 |         folder (str): The destination folder.
12 | 
13 |     Returns:
14 |         str: Path to the extracted file.
14 | """
15 | path = osp.abspath(path)
16 | file_path = osp.join(folder, ".".join(path.split(".")[:-1]))
17 | with gzip.open(path, "r") as r:
18 | with open(file_path, "wb") as w:
19 | w.write(r.read())
20 |
21 | return file_path
22 |
23 |
24 | # def extract_zip(path: str, folder: str):
25 | # r"""Extracts a zip archive to a specific folder.
26 |
27 | # Args:
28 | # path (str): The path to the tar archive.
29 | # folder (str): The folder.
30 | # log (bool, optional): If :obj:`False`, will not print anything to the
31 | # console. (default: :obj:`True`)
32 | # """
33 | # with zipfile.ZipFile(path, "r") as f:
34 | # f.extractall(folder)
35 |
36 |
37 | # commenting out the following functions because
38 | # they are not used in the codebase. Maybe in the future.
39 |
40 | # import bz2
41 | # import tarfile
42 |
43 | # def extract_tar(path: str, folder: str, mode: str = 'r:gz'):
44 | # r"""Extracts a tar archive to a specific folder.
45 |
46 | # Args:
47 | # path (str): The path to the tar archive.
48 | # folder (str): The folder.
49 | # mode (str, optional): The compression mode. (default: :obj:`"r:gz"`)
50 | # log (bool, optional): If :obj:`False`, will not print anything to the
51 | # console. (default: :obj:`True`)
52 | # """
53 | # with tarfile.open(path, mode) as f:
54 | # f.extractall(folder)
55 |
56 |
57 | # def extract_bz2(path: str, folder: str):
58 | # r"""Extracts a bz2 archive to a specific folder.
59 |
60 | # Args:
61 | # path (str): The path to the tar archive.
62 | # folder (str): The folder.
63 | # log (bool, optional): If :obj:`False`, will not print anything to the
64 | # console. (default: :obj:`True`)
65 | # """
66 | # path = osp.abspath(path)
67 | # with bz2.open(path, 'r') as r:
68 | # with open(osp.join(folder, '.'.join(path.split('.')[:-1])), 'wb') as w:
69 | # w.write(r.read())
70 |
--------------------------------------------------------------------------------
/src/skpm/event_logs/parser.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 | from itertools import chain
3 | from joblib import Parallel, delayed
4 | from typing import Generator
5 | import pandas as pd
6 |
7 |
8 | class Event(dict):
9 | pass
10 |
11 |
12 | class TagXES:
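13 |     """Tag names used when parsing XES XML elements: attribute dtypes and structural elements."""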
13 | # attributes
14 | STRING: str = "string"
15 | DATE: str = "date"
16 | FLOAT: str = "float"
17 | BOOLEAN: str = "boolean"
18 | INT: str = "int"
19 |
20 | # elements
21 | EVENT: str = "event"
22 | TRACE: str = "trace"
23 |
24 | _DTYPES: tuple = (STRING, DATE, FLOAT, BOOLEAN, INT)
25 |
26 | @classmethod
27 | def is_attribute(cls, element: etree._Element) -> bool:
28 | """element is an attribute if it ends with one of the dtypes."""
29 | return element.tag.endswith(cls._DTYPES)
30 |
31 | @classmethod
32 | def is_valid(cls, element: etree._Element) -> bool:
33 | return element.tag.endswith(
34 | tuple(v for v in vars(cls).values() if not v.startswith("_"))
35 | )
36 |
37 | @classmethod
38 | def get_dtypes(cls) -> tuple:
39 | return cls._DTYPES
40 |
41 |
42 | tag = TagXES
43 |
44 |
45 | def extract_case_attributes(trace: etree._Element, ns: dict) -> Event:
46 | """
47 | Extracts case-level attributes from the trace.
48 |
49 | Using findall for case attributes is faster than using iter since
50 | cases has fewer attributes than events.
51 |
52 | Args:
53 | trace (etree._Element): The trace element.
54 | ns (dict): Namespace mapping for XML parsing.
55 |
56 | Returns:
57 | Event: A dictionary of case-level attributes.
58 | """
59 | case_attrs = Event()
60 | for attr in tag.get_dtypes():
61 | # Find all attributes of the given type in the trace
62 | attrs = trace.findall(attr, ns)
63 | # Update case_attrs with the found attributes
64 | case_attrs.update(
65 | {f'case:{e.get("key")}': e.get("value") for e in attrs}
66 | )
67 | return case_attrs
68 |
69 |
70 | def extract_event_attributes(event: etree._Element) -> Event:
71 | """
72 | Extracts attributes from an event element.
73 |
74 | Using iter is slightly faster than findall for events since
 75 |     there are many events and event attributes in a trace.
76 |
77 | Args:
78 | event (etree._Element): The event element.
79 |
80 | Returns:
81 | Event: A dictionary of event attributes.
82 | """
83 | event_attrs = Event()
84 | for e_attr in event.iter():
85 | if tag.is_attribute(e_attr):
86 | event_attrs[e_attr.get("key")] = e_attr.get("value")
87 | return event_attrs
88 |
89 |
90 | def parse_trace(trace: etree._Element, ns: dict) -> list[Event]:
91 | """Parses a list of XML elements representing a trace.
92 |
93 | Args:
 94 |         trace (etree._Element | bytes): XML element (or its serialized bytes) representing a trace from an XES file.
 95 |         ns (dict): Namespace mapping for XML parsing.
95 |
96 | Returns:
97 | list[Event]: The respective events from the trace.
98 | """
99 |
100 | if isinstance(trace, bytes):
101 | trace = etree.fromstring(trace)
102 |
103 | case_attrs = extract_case_attributes(trace, ns)
104 |
105 | # Parse each event
106 | parsed_events = []
107 | events = trace.findall(tag.EVENT, ns)
108 | for event in events:
109 | event_attrs = extract_event_attributes(event)
110 |
111 | # Add case-level attributes to event attributes
112 | event_attrs.update(case_attrs)
113 | parsed_events.append(event_attrs)
114 |
115 | # Clear the event to free memory
116 | event.clear()
117 |
118 | trace.clear()
119 | return parsed_events
120 |
121 |
122 | def lazy_serialize(
123 | elements: list[etree._Element],
124 | ) -> Generator[bytes, None, None]:
125 | """Lazy serialization of a list of XML elements. Used for parallel processing."""
126 | for element in elements:
127 | yield etree.tostring(element)
128 |
129 |
130 | def read_xes(
131 | filepath: str, n_jobs: int = None
132 | ) -> pd.DataFrame:
133 | """Reads an event log from a XES file.
134 |
135 | Rough overview:
136 | This function reads an event log from a XES file. It uses the lxml library to
137 | parse the XML file. The function is parallelized using the joblib library.
138 |
139 | 1. For each trace, the function `_parse_trace` is called.
140 | 2. Extract the case attributes and all the events from the trace.
141 | 3. For each event, extract the event attributes.
142 | 4. Update the event attributes with the case attributes.
143 | 5. Append the event to the final list of events corresponding to the trace.
144 | 6. Return trace and repeat.
145 |
146 | Args:
147 | filepath (str): Filepath to the XES file.
148 | n_jobs (int, optional): Number of CPU cores to use. If None, only one core
149 | is used. Defaults to None.
150 |
151 | Returns:
152 |         pd.DataFrame: the event log as a pandas DataFrame (one row per event).
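153 | 
154 |     Example:
155 |         A usage sketch (the path below is a placeholder for a local XES file):
156 | 
157 |         >>> log = read_xes("data/BPI_Challenge_2012.xes", n_jobs=2)
158 |         >>> log.head()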
153 | """
154 | tree = etree.parse(filepath).getroot()
155 | ns = tree.nsmap
156 |
157 | traces = tree.findall(tag.TRACE, ns)
158 |
159 | if n_jobs in [1, None]:
160 | log = []
161 | for trace in traces:
162 | log.extend(parse_trace(trace, ns))
163 | else:
164 | from functools import partial
165 |
166 | parse_trace_partial = partial(parse_trace, ns=ns)
167 |
168 | traces = lazy_serialize(traces)
169 | log = Parallel(n_jobs=n_jobs)(
170 | delayed(parse_trace_partial)(trace) for trace in traces
171 | )
172 | log = list(chain(*log))
173 |
174 | log = pd.DataFrame(log)
175 |
176 | return log
177 |
--------------------------------------------------------------------------------
/src/skpm/event_logs/split.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from skpm.config import EventLogConfig as elc
3 | from skpm.event_logs.base import TUEventLog
4 |
5 |
6 | def _bounded_dataset(
  7 |     dataset: pd.DataFrame, start_date: str | pd.Period | None, end_date: str | pd.Period | None
8 | ) -> pd.DataFrame:
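  9 |     """Keep only cases whose first and last events fall within
 10 |     ``[start_date, end_date]``, compared at month granularity."""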
9 | grouped = dataset.groupby(elc.case_id, as_index=False)[elc.timestamp].agg(
10 | ["min", "max"]
11 | )
12 |
13 | start_date = (
14 | pd.Period(start_date)
15 | if start_date
16 | else dataset[elc.timestamp].min().to_period("M")
17 | )
18 | end_date = (
19 | pd.Period(end_date)
20 | if end_date
21 | else dataset[elc.timestamp].max().to_period("M")
22 | )
23 | bounded_cases = grouped[
24 | (grouped["min"].dt.to_period("M") >= start_date)
25 | & (grouped["max"].dt.to_period("M") <= end_date)
26 | ][elc.case_id].values
27 | dataset = dataset[dataset[elc.case_id].isin(bounded_cases)]
28 | return dataset
29 |
30 |
 31 | def _unbiased(dataset: pd.DataFrame, max_days: float) -> pd.DataFrame:
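 32 |     """Drop cases longer than ``max_days`` as well as cases starting within
 33 |     ``max_days`` of the log's last timestamp (debiasing the end of the log)."""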
32 | grouped = (
33 | dataset.groupby(elc.case_id, as_index=False)[elc.timestamp]
34 | .agg(["min", "max"])
35 | .assign(
36 | duration=lambda x: (x["max"] - x["min"]).dt.total_seconds()
37 | / (24 * 60 * 60)
38 | )
39 | )
40 |
41 | # condition 1: cases are shorter than max_duration
42 | condition_1 = grouped["duration"] <= max_days * 1.00000000001
43 | # condition 2: drop cases starting after the dataset's last timestamp - the max_duration
44 | latest_start = dataset[elc.timestamp].max() - pd.Timedelta(
45 | max_days, unit="D"
46 | )
47 | condition_2 = grouped["min"] <= latest_start
48 |
49 | unbiased_cases = grouped[condition_1 & condition_2][elc.case_id].values
50 | dataset = dataset[dataset[elc.case_id].isin(unbiased_cases)]
51 | return dataset
52 |
53 |
54 | def unbiased(
55 | dataset: pd.DataFrame | TUEventLog,
56 | start_date: str | pd.Period | None,
57 | end_date: str | pd.Period | None,
 58 |     max_days: float,
59 | test_len: float = 0.2,
60 | ) -> tuple[pd.DataFrame, pd.DataFrame]:
61 | """
62 | Unbiased split of event log into training and test set [1].
63 |
64 | Code adapted from [2].
65 |
66 | Parameters
67 | ----------
68 | dataset: pd.DataFrame
69 | Event log.
70 |
71 | start_date: str
72 | Start date of the event log.
73 |
74 | end_date: str
75 | End date of the event log.
76 |
 77 |     max_days: float
 78 |         Maximum duration of cases, in days.
79 |
80 | test_len: float, default=0.2
81 | Proportion of cases to be used for the test set.
82 |
83 | Returns
84 | -------
85 | - df_train: pd.DataFrame, training set
86 | - df_test: pd.DataFrame, test set
87 |
88 | Example
89 | -------
90 | >>> from skpm.event_logs import BPI12
91 | >>> from skpm.event_logs import split
92 | >>> bpi12 = BPI12()
93 | >>> df_train, df_test = split.unbiased(bpi12, **bpi12.unbiased_split_params)
94 | >>> df_train.shape, df_test.shape
95 | ((117546, 7), (55952, 7))
96 |
97 | References:
98 | -----------
99 | [1] Hans Weytjens, Jochen De Weerdt. Creating Unbiased Public Benchmark Datasets with Data Leakage Prevention for Predictive Process Monitoring, 2021. doi: 10.1007/978-3-030-94343-1_2
100 | [2] https://github.com/hansweytjens/predictive-process-monitoring-benchmarks
101 | """
102 | if isinstance(dataset, TUEventLog):
103 | dataset = dataset.dataframe
104 |
105 | dataset = dataset.copy()
106 |
107 | dataset[elc.timestamp] = pd.to_datetime(
108 | dataset[elc.timestamp], utc=True
109 | ).dt.tz_localize(None)
110 |
111 | # bounding the event log
112 | if start_date or end_date:
113 | dataset = _bounded_dataset(dataset, start_date, end_date)
114 |
115 | # drop longest cases and debiasing end of dataset
116 | dataset = _unbiased(dataset, max_days)
117 |
118 | # preliminaries
119 | grouped = dataset.groupby(elc.case_id, as_index=False)[elc.timestamp].agg(
120 | ["min", "max"]
121 | )
122 |
123 | ### TEST SET ###
124 | first_test_case_nr = int(len(grouped) * (1 - test_len))
125 | first_test_start_time = (
126 | grouped["min"].sort_values().values[first_test_case_nr]
127 | )
128 | # retain cases that end after first_test_start time
129 | test_case_nrs = grouped.loc[
130 | grouped["max"].values >= first_test_start_time, elc.case_id
131 | ]
132 | df_test = dataset[dataset[elc.case_id].isin(test_case_nrs)].reset_index(
133 | drop=True
134 | )
135 |
136 | #### TRAINING SET ###
137 | df_train = dataset[~dataset[elc.case_id].isin(test_case_nrs)].reset_index(
138 | drop=True
139 | )
140 |
141 | return df_train, df_test
142 |
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/__init__.py:
--------------------------------------------------------------------------------
1 | from skpm.feature_extraction.time import TimestampExtractor
2 | from skpm.feature_extraction.event.resource import ResourcePoolExtractor
3 | from skpm.feature_extraction.event.inter_case import WorkInProgress
4 |
5 | __all__ = ["TimestampExtractor", "ResourcePoolExtractor"]
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/case/__init__.py:
--------------------------------------------------------------------------------
1 | from .variant import VariantExtractor
2 |
3 | __all__ = [
4 | "VariantExtractor",
5 | ]
6 |
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/case/_helpers.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from sklearn.pipeline import Pipeline
3 |
4 |
5 | def ensure_not_pipeline(fit_method):
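  6 |     """Decorator that raises a ``ValueError`` when the wrapped ``fit`` method
  7 |     is called from inside a scikit-learn ``Pipeline``."""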
6 | def wrapper(estimator, *args, **kwargs):
7 | in_pipeline = False
8 | for frame_info in inspect.stack():
9 | frame = frame_info.frame
10 | if "self" in frame.f_locals:
11 | caller_self = frame.f_locals["self"]
12 | if isinstance(caller_self, Pipeline):
13 | in_pipeline = True
14 | break
15 | if in_pipeline:
16 | class_name = estimator.__class__.__name__
17 | raise ValueError(
18 | f"{class_name} is a case-wise feature extractor and cannot be used in a pipeline."
19 | )
20 |
21 | return fit_method(estimator, *args, **kwargs)
22 |
23 | return wrapper
24 |
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/case/time.py:
--------------------------------------------------------------------------------
1 | from skpm.config import EventLogConfig as elc
2 |
3 | class TimestampCaseLevel:
4 | """
5 | Extracts time-related features at the case level.
6 |
7 | Notes
8 | -----
9 | Separating the implementation for event-level and case-level features improves performance.
10 | The case-level implementation is slower due to the use of `groupby`.
11 | """
12 |
13 | TIME_UNIT_MULTIPLIER = {
14 | "s": 1,
15 | "m": 60,
16 | "h": 60 * 60,
17 | "d": 60 * 60 * 24,
18 | "w": 60 * 60 * 24 * 7,
19 | }
20 |
21 | @classmethod
22 | def accumulated_time(cls, case, ix_list, time_unit="s"):
23 | """Calculate the accumulated time from the start of each case in seconds."""
24 | return (
25 | case[elc.timestamp]
26 | .apply(lambda x: x - x.min())
27 | .loc[ix_list]
28 | .dt.total_seconds()
29 | / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1)
30 | )
31 |
32 | @classmethod
33 | def execution_time(cls, case, ix_list, time_unit="s"):
34 | """Calculate the execution time of each event in seconds.
35 |
36 | **NOTE**: This should be used as a target feature, since the _next_ step is
37 | needed to calculate the execution time of each event."""
38 | return (
39 | case[elc.timestamp]
40 | .diff(-1)
41 | .dt.total_seconds()
42 | .fillna(0)
43 | .loc[ix_list]
44 | .abs() # to avoid negative numbers caused by diff-1
45 | / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1)
46 | )
47 |
48 | @classmethod
49 | def remaining_time(cls, case, ix_list, time_unit="s"):
50 | """Calculate the remaining time until the end of each case in seconds.
51 |
52 | **NOTE**: This should be used as a target feature, since the _last_ step
53 | is needed to calculate the remaining time of each event."""
54 |
55 | return (
56 | case[elc.timestamp]
57 | .apply(lambda x: x.max() - x)
58 | .loc[ix_list]
59 | .dt.total_seconds()
60 | / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1)
61 | )
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/case/variant.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin
2 | from sklearn.preprocessing import LabelEncoder
3 |
4 | from skpm.feature_extraction.case._helpers import ensure_not_pipeline
5 |
6 |
7 | class VariantExtractor(TransformerMixin, BaseEstimator):
8 | """Extract trace variants from an event log."""
9 |
10 | def __init__(self, strategy="default"):
11 | self.strategy = strategy
12 |
13 | @ensure_not_pipeline
14 | def fit(self, X, y=None):
15 | if self.strategy != "default":
16 | raise NotImplementedError("Only the default strategy is supported.")
17 |
18 | self.variants = (
19 | X.groupby("case:concept:name", as_index=False)["concept:name"]
20 | .apply(tuple)
21 | .rename(columns={"concept:name": "variant"})
22 | )
23 |
24 | self._le = LabelEncoder()
25 | self.variants["variant"] = self._le.fit_transform(
26 | self.variants["variant"]
27 | )
28 | return self
29 |
30 | def transform(self, X):
31 | """Get trace variants."""
32 | return self.variants
33 |
34 | def inverse_transform(self, X):
35 | """Get trace variants."""
36 | return self._le.inverse_transform(X)
37 |
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/event/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Notes on inter-case features:
3 | Inter-case features are features that can be leveraged by cases in parallel. For instance, the availability of a resource at a time window `t_0` can be represented as a binary variable.
  4 | This brings up an observation (not an issue, I believe): the `fit` methods in this module simply return self, and all the logic lives in `transform`. This is due to the temporal splits expected for temporal process data.
5 | We cannot `fit` on the train set and `transform` on the test
6 | set, i.e. define the bins based on `freq`, since in a temporal
  7 | split the test set will have unknown bins. TODO: further explore
8 | if this is an issue.
  9 | For TimestampExtractor, if the training set contains a trace prefix [t_0, ..., t_n] and the test set contains the remaining suffix [t_{n+1}, ..., t_m], the `accumulated_time` feature should take this information into account.
10 | """
11 |
12 | from .resource import ResourcePoolExtractor
13 | from .inter_case import WorkInProgress
14 |
15 | __all__ = ["ResourcePoolExtractor", "WorkInProgress"]
16 |
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/event/inter_case.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.base import (
3 | BaseEstimator,
4 | TransformerMixin,
5 | )
6 | from skpm.config import EventLogConfig as elc
7 |
8 |
9 | class WorkInProgress(TransformerMixin, BaseEstimator):
10 | """Work in Progress (WIP) feature extractor.
11 |
12 | This transformer calculates the number of cases (work) in progress within
13 | specified time windows.
14 |
15 | Parameters:
16 | -----------
17 | window_size : str, default='1D'
18 | Frequency of the time windows to count the number of cases in progress.
19 | It follows the Pandas offset aliases convention. For more details, see
20 | https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
21 |
22 | Returns:
23 | --------
24 | ndarray
25 | WIP feature array of shape (n_samples, 1)
26 |
27 | Examples:
28 | ---------
29 | >>> import pandas as pd
30 | >>> from sklearn.pipeline import Pipeline
 31 |     >>> from skpm.feature_extraction.event import WorkInProgress
32 | >>> from skpm.config import EventLogConfig as elc
33 | >>> # Assuming X is your dataframe containing event data with columns 'timestamp' and 'case_id'
34 | >>> X = pd.DataFrame({elc.timestamp: pd.date_range(start='2024-01-01', end='2024-01-10', freq='D'),
35 | ... elc.case_id: [1, 1, 2, 3, 4, 4, 4, 5, 6, 6]})
36 | >>> wip_transformer = WorkInProgress(window_size='2D') # Calculate WIP over 2-day windows
37 | >>> wip_transformer.fit_transform(X)
38 | array([2., 1., 1., 2., 2., 1., 1., 2., 2., 2.])
39 | """
40 |
41 | # see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
42 | def __init__(
43 | self,
44 | window_size="1D",
45 | ) -> None:
46 | self.window_size = window_size
47 |
48 | def get_feature_names_out(self):
49 | return ["wip"]
50 |
51 | def fit(
52 | self,
53 | X: pd.DataFrame,
54 | y=None,
55 | ):
56 | assert isinstance(X, pd.DataFrame), "Input must be a dataframe."
57 |
58 | return self
59 |
60 | def transform(self, X: pd.DataFrame):
61 | """Transform the input DataFrame to calculate the Work in Progress (WIP) feature.
62 |
63 | This method calculates the number of cases in progress within specified time windows based on the input event data.
64 |
65 | Parameters:
66 | -----------
67 | X : pd.DataFrame
68 | Input DataFrame containing event data with columns 'timestamp' and 'case_id'.
69 |
70 | Returns:
71 | --------
72 | ndarray
73 | WIP feature array of shape (n_samples, 1), where each value represents the number of cases in progress at each time step.
74 |
75 | Notes:
76 | ------
77 | 1. The method performs the following steps:
78 | a. Groups the event data by time windows specified by the 'window_size' parameter.
79 | b. Counts the number of unique cases within each time window.
80 | c. Maps the counts to the corresponding time windows.
81 | d. Fills any missing values with the number of NaN values (representing time windows with no events).
82 | """
83 | self._grouped_wip = X.groupby(
84 | pd.Grouper(key=elc.timestamp, freq=self.window_size)
85 | )[elc.case_id].nunique()
86 | self._bins = pd.cut(
87 | X[elc.timestamp],
88 | bins=self._grouped_wip.index,
89 | labels=self._grouped_wip.index[:-1],
90 | )
91 | wip = self._bins.map(self._grouped_wip)
92 | wip = wip.fillna(self._bins.isna().sum()).values
93 |
94 | return wip
95 |
--------------------------------------------------------------------------------
/src/skpm/feature_extraction/event/resource.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import numpy as np
4 | from pandas import DataFrame
5 | from scipy.sparse.csgraph import connected_components
  6 | from sklearn.base import (
  7 |     BaseEstimator,
  8 |     TransformerMixin,
  9 | )
 10 | from sklearn.utils.validation import check_is_fitted
11 |
12 | from skpm.utils import validate_columns
13 | from skpm.config import EventLogConfig as elc
14 | from skpm.warnings import ConceptDriftWarning
15 |
16 |
17 | class ResourcePoolExtractor(TransformerMixin, BaseEstimator):
18 | """
19 | Extracts resource roles based on resource-activity correlations.
20 |
21 | This class identifies resource roles within a process based on correlations
22 | between resources and activities in event logs. It computes a correlation
23 | matrix between resources and activities and then identifies subgraphs
24 | representing roles based on a user-defined threshold.
25 |     This approach was proposed in [1]; the code is adapted from [2].
26 |
27 | Todo:
28 | ------
29 |     Implement other distance metrics.
30 |
31 |
32 | Parameters:
33 | -----------
34 | threshold : float, default=0.7
35 | The correlation threshold for identifying resource roles.
36 | Resources with correlation coefficients above this threshold
37 | are considered to belong to the same role.
38 |
39 | References:
40 | -----------
41 | - [1] Minseok Song, Wil M.P. van der Aalst. "Towards comprehensive support for organizational mining," Decision Support Systems (2008).
42 | - [2] Code adapted from https://github.com/AdaptiveBProcess/GenerativeLSTM
43 |
44 | Notes
45 | -----
46 |     - distance metrics: a (dis)similarity between two vectors (variables). A distance
47 |     metric must satisfy the following properties: d(x,x) = 0, d(x,y) >= 0,
48 |     d(x,y) = d(y,x), and d(x,z) <= d(x,y) + d(y,z).
49 |     - correlation coefficients: statistical relationships between vectors (variables)
50 |     that quantify how strongly they are related.
51 |
52 |     The original paper refers to Pearson correlation as a distance metric. Strictly
53 |     speaking, correlation is not a distance, since it does not satisfy the triangle
54 |     inequality. Still, correlation can be employed informally as a 'similarity'
55 |     measure, and in the context of organizational mining, statistical relationships
56 |     and similarity ultimately serve the same purpose, which is why this
57 |     implementation relies on correlation.
58 |
59 | Examples:
60 | ---------
61 |     >>> from skpm.feature_extraction.event import ResourcePoolExtractor
62 | >>> import pandas as pd
63 | >>> # Assuming X is your dataframe containing event data with columns 'activity' and 'resource'
64 | >>> X = pd.DataFrame({'activity': ['A', 'B', 'A', 'B'], 'resource': ['R1', 'R2', 'R1', 'R2']})
65 | >>> # Initialize and fit the extractor
66 | >>> extractor = ResourcePoolExtractor(threshold=0.7)
67 | >>> extractor.fit(X)
68 | >>> # Transform the data to extract resource roles
69 | >>> resource_roles = extractor.transform(X)
70 | >>> print(resource_roles)
71 |     [1 2 1 2]
72 | """
73 |
74 | def __init__(self, threshold=0.7):
75 | """
76 | Initialize the ResourcePoolExtractor.
77 |
78 | Parameters:
79 | -----------
80 | threshold : float, default=0.7
81 | The correlation threshold for identifying resource roles.
82 | """
83 | # the original implementation uses 0.7 as threshold but in the argparser they set 0.85
84 | self.threshold = threshold
85 |
86 | def get_feature_names_out(self):
87 | """Return the feature names.
88 |
89 | Returns:
90 | --------
91 | feature_names : list
92 | List containing the feature names.
93 | """
94 | return ["resource_roles"]
95 |
96 | def fit(self, X: DataFrame, y=None):
97 | """Fit the ResourcePoolExtractor.
98 |
99 | Parameters:
100 | -----------
101 | X : DataFrame, shape (n_samples, n_features)
102 | The input data containing activity and resource columns.
103 |
104 | Returns:
105 | --------
106 | self : object
107 | Returns self.
108 | """
109 | X = self._validate_data(X)
110 |
111 | # defining vocabs for activities and resources
112 | self.atoi_, self.itoa_ = self._define_vocabs(X[elc.activity].unique())
113 | self.rtoi_, self.itor_ = self._define_vocabs(X[elc.resource].unique())
114 |
115 | X[elc.activity] = X[elc.activity].map(self.atoi_)
116 | X[elc.resource] = X[elc.resource].map(self.rtoi_)
117 |
118 | # building a pairwise frequency matrix
119 | freq_matrix = (
120 | X.groupby([elc.activity, elc.resource]).value_counts().to_dict()
121 | )
122 |
123 | # building an activity profile for each resource
124 |
125 | # matrix profile: rows = resources, columns = activities
127 |         # the unknown ('UNK') label produces an all-zero row, which triggers a division warning when computing the correlation matrix: TODO
127 | # https://stackoverflow.com/questions/45897003/python-numpy-corrcoef-runtimewarning-invalid-value-encountered-in-true-divide
128 | profiles = np.zeros((len(self.rtoi_), len(self.atoi_)), dtype=int)
129 | for pair_ar, freq in freq_matrix.items():
130 | # pair_ar = (activity, resource); order defined by groupby
131 | profiles[pair_ar[1], pair_ar[0]] = freq
132 |
133 | # correlation matrix
134 | with warnings.catch_warnings():
135 | warnings.simplefilter("ignore")
136 | corr = np.corrcoef(
137 | profiles
138 | ) # TODO: include similarity/correlation metric parameter
139 |
140 | np.fill_diagonal(
141 | corr, 0
142 | ) # the original paper does not consider self-relationship
143 |
144 | # subgraphs as roles
145 | n_components, labels = connected_components(
146 | corr > self.threshold, directed=False
147 | )
148 |
149 | sub_graphs = list()
150 | for i in range(n_components):
151 | sub_graphs.append(set(np.where(labels == i)[0]))
152 |
153 | # role definition
154 | self.resource_to_roles_ = dict()
155 | for role_ix, role in enumerate(sub_graphs):
156 | for user_id in role:
157 | self.resource_to_roles_[user_id] = role_ix
158 |
159 | return self
160 |
161 | def transform(self, X: DataFrame, y=None):
162 | """Transform the input data to extract resource roles.
163 |
164 | Parameters:
165 | -----------
166 | X : DataFrame, shape (n_samples, n_features)
167 | The input data containing activity and resource columns.
168 |
169 | Returns:
170 | --------
171 | resource_roles : numpy.ndarray, shape (n_samples,)
172 | An array containing the resource roles for each sample.
173 | """
174 | check_is_fitted(self, "resource_to_roles_")
175 | X = self._validate_data(X)
176 | resource_roles = X[elc.resource].map(self.resource_to_roles_).values
177 | return resource_roles
178 |
179 | def _validate_data(self, X: DataFrame):
180 | """Validate the input data.
181 |
182 | Parameters:
183 | -----------
184 | X : DataFrame, shape (n_samples, n_features)
185 | The input data containing activity and resource columns.
186 |
187 | Returns:
188 | --------
189 | x : DataFrame
190 | The validated input data.
191 | """
192 | assert isinstance(X, DataFrame), "Input must be a dataframe."
193 | x = X.copy()
194 | x.reset_index(drop=True, inplace=True)
195 | columns = validate_columns(
196 | input_columns=x.columns, required=[elc.activity, elc.resource]
197 | )
198 | x = x[columns]
199 |
200 | if x[elc.activity].isnull().any():
201 | raise ValueError("Activity column contains null values.")
202 | if x[elc.resource].isnull().any():
203 | raise ValueError("Resource column contains null values.")
204 |
205 |         # i.e., if fitted, check for unknown labels
206 | if hasattr(self, "resource_to_roles_"):
207 | x[elc.resource] = self._check_unknown(
208 | x[elc.resource].values, self.rtoi_.keys(), elc.resource
209 | )
210 | x[elc.activity] = self._check_unknown(
211 | x[elc.activity].values, self.atoi_.keys(), elc.activity
212 | )
213 |
214 | x[elc.activity] = x[elc.activity].map(self.atoi_)
215 | x[elc.resource] = x[elc.resource].map(self.rtoi_)
216 |
217 | return x
218 |
219 | def _check_unknown(self, input: np.ndarray, vocab: np.ndarray, name: str):
220 | """Check for unknown labels in the input data.
221 |
222 | Parameters:
223 | -----------
224 | input : numpy.ndarray
225 | The input data containing labels.
226 | vocab : numpy.ndarray
227 | The vocabulary of known labels.
228 | name : str
229 | The name of the label (e.g., 'activity' or 'resource').
230 |
231 | Returns:
232 | --------
233 | input : numpy.ndarray
234 | The input data with unknown labels replaced by 'UNK'.
235 | """
236 |         unknown = set(input) - set(vocab)
237 |         if unknown:
238 |             warnings.warn(
239 |                 message=(
240 |                     f"The label '{name}' contains values unseen during fitting. These values will be set to 'UNK': {unknown}"
241 |                 ),
242 |                 category=ConceptDriftWarning,
243 |                 stacklevel=2,
244 |             )
245 |
246 |         input = np.array(["UNK" if x in unknown else x for x in input])
247 |         # input = input.replace(unknown, "UNK")
248 | return input
249 |
250 | def _define_vocabs(self, unique_labels: np.ndarray):
251 | """Define vocabularies for unique labels.
252 |
253 | Parameters:
254 | -----------
255 | unique_labels : numpy.ndarray
256 | An array containing unique labels.
257 |
258 | Returns:
259 | --------
260 | stoi : dict
261 | A dictionary mapping labels to indices.
262 | itos : dict
263 | A dictionary mapping indices to labels.
264 | """
265 | stoi, itos = {"UNK": 0}, {0: "UNK"}
266 | stoi.update({label: i + 1 for i, label in enumerate(unique_labels)})
267 | itos.update({i + 1: label for i, label in enumerate(unique_labels)})
268 | return stoi, itos
269 |
--------------------------------------------------------------------------------
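A compact sketch of the role-discovery steps in ResourcePoolExtractor.fit (activity profiles, correlation matrix, thresholding, connected components), using only numpy and scipy and made-up profile counts:

    import numpy as np
    from scipy.sparse.csgraph import connected_components

    # rows = resources, columns = activities: how often each resource performed each activity
    profiles = np.array([
        [5, 0, 1],   # resource 0
        [4, 0, 2],   # resource 1, similar profile to resource 0
        [0, 6, 0],   # resource 2, different profile
    ])

    corr = np.corrcoef(profiles)   # resource-by-resource correlation
    np.fill_diagonal(corr, 0)      # self-relationships are not considered

    # resources whose correlation exceeds the threshold end up in the same role
    n_roles, labels = connected_components(corr > 0.7, directed=False)
    print(n_roles, labels)         # expected: 2 [0 0 1]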
/src/skpm/feature_extraction/event/time.py:
--------------------------------------------------------------------------------
1 | from skpm.config import EventLogConfig as elc
2 |
3 | class TimestampEventLevel:
4 | """
5 | Provides methods to extract time-related features from the event level.
6 |
7 |     Implementing event-level and case-level features separately keeps the code fast, since event-level features do not require grouping by case_id.
8 |
9 | """
10 | TIME_UNIT_MULTIPLIER = {
11 | "s": 1,
12 | "m": 60,
13 | "h": 60 * 60,
14 | "d": 60 * 60 * 24,
15 | "w": 60 * 60 * 24 * 7,
16 | }
17 |
18 | @classmethod
19 | def sec_of_min(cls, X):
20 | """Second of minute encoded as value between [-0.5, 0.5]"""
21 | return X.dt.second / 59.0 - 0.5
22 |
23 | @classmethod
24 | def min_of_hour(cls, X):
25 | """Minute of hour encoded as value between [-0.5, 0.5]"""
26 |
27 | return X.dt.minute / 59.0 - 0.5
28 |
29 | @classmethod
30 | def hour_of_day(cls, X):
31 | """Hour of day encoded as value between [-0.5, 0.5]"""
32 |
33 | return X.dt.hour / 23.0 - 0.5
34 |
35 | @classmethod
36 | def day_of_week(cls, X):
37 | """Hour of day encoded as value between [-0.5, 0.5]"""
38 |
39 | return X.dt.dayofweek / 6.0 - 0.5
40 |
41 | @classmethod
42 | def day_of_month(cls, X):
43 | """Day of month encoded as value between [-0.5, 0.5]"""
44 | return (X.dt.day - 1) / 30.0 - 0.5
45 |
46 | @classmethod
47 | def day_of_year(cls, X):
48 | """Day of year encoded as value between [-0.5, 0.5]"""
49 |
50 | return (X.dt.dayofyear - 1) / 365.0 - 0.5
51 |
52 | @classmethod
53 | def week_of_year(cls, X):
54 | """Week of year encoded as value between [-0.5, 0.5]"""
55 | return (X.dt.isocalendar().week - 1) / 52.0 - 0.5
56 |
57 | @classmethod
58 | def month_of_year(cls, X):
59 | """Month of year encoded as value between [-0.5, 0.5]"""
60 | return (X.dt.month - 1) / 11.0 - 0.5
61 |
62 | @classmethod
63 | def secs_within_day(cls, X):
64 | """Extract the number of seconds elapsed within each day from the timestamps encoded as value between [-0.5, 0.5]."""
65 | return (
66 | (X.dt.hour * 3600 + X.dt.minute * 60 + X.dt.second) / 86400
67 | ) - 0.5
68 |
69 | @classmethod
70 | def secs_since_sunday(cls, X):
71 | """Extract the number of seconds elapsed since the last Sunday from the timestamps encoded as value between [-0.5, 0.5]."""
72 | return (
73 | (X.dt.hour * 3600 + X.dt.minute * 60 + X.dt.second) / 604800
74 | ) - 0.5
75 |
76 | @classmethod
77 | def numerical_timestamp(cls, X, time_unit="s"):
78 | """Numerical representation of the timestamp."""
79 | return X.astype("int64") // 10**9 / cls.TIME_UNIT_MULTIPLIER.get(time_unit, 1)
80 |
--------------------------------------------------------------------------------
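A quick check of the event-level encodings above, calling the classmethods directly on a small datetime Series:

    import pandas as pd
    from skpm.feature_extraction.event.time import TimestampEventLevel

    ts = pd.Series(pd.to_datetime([
        "2023-01-01 00:00:00", "2023-01-01 12:00:00", "2023-01-01 23:00:00"
    ]))

    # hour 0 maps to -0.5 and hour 23 maps to 0.5
    print(TimestampEventLevel.hour_of_day(ts).tolist())   # [-0.5, ~0.022, 0.5]
    # 2023-01-01 is a Sunday (dayofweek = 6), so all three values are 6/6 - 0.5 = 0.5
    print(TimestampEventLevel.day_of_week(ts).tolist())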
/src/skpm/feature_extraction/targets.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from skpm.config import EventLogConfig as elc
3 |
4 |
5 | def next_activity(log: pd.DataFrame):
6 | """Returns the next activity of each trace.
7 |
8 | Parameters
9 | ----------
10 | log : pd.DataFrame
11 | An event log.
12 |
13 | Returns
14 | -------
15 |     np.ndarray
16 |         An array with the next activity of each event; the last event of a trace receives the end-of-trace token.
17 | """
18 | return (
19 | log.groupby(elc.case_id, observed=True, as_index=True)[elc.activity]
20 | .shift(-1, fill_value=elc.EOT)
21 | .values
22 | )
23 |
24 |
25 | def remaining_time(log: pd.DataFrame, time_unit="s"):
26 | """Returns the remaining time of each trace.
27 |
28 | Parameters
29 | ----------
30 | log : pd.DataFrame
31 | An event log.
32 |
33 | Returns
34 | -------
35 |     np.ndarray
36 |         An array with the remaining time of each event, expressed in `time_unit`.
37 | """
38 | from skpm.feature_extraction import TimestampExtractor
39 |
40 | return TimestampExtractor(
41 | case_features=None,
42 | event_features=None,
43 | targets="remaining_time",
44 | time_unit=time_unit
45 | ).set_output(transform="default").fit_transform(log)
--------------------------------------------------------------------------------
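A small usage sketch of next_activity, mirroring tests/feature_extraction/test_targets.py; the exact end-of-trace token comes from EventLogConfig:

    import pandas as pd
    from skpm.config import EventLogConfig as elc
    from skpm.feature_extraction.targets import next_activity

    log = pd.DataFrame({
        elc.case_id: [1, 1, 1, 2, 2],
        elc.activity: ["a", "b", "c", "a", "b"],
    })

    # each event is paired with the next activity of its case; the last event
    # of a case receives the configured end-of-trace token (elc.EOT)
    print(next_activity(log))   # ['b' 'c' <EOT> 'b' <EOT>]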
/src/skpm/feature_extraction/time.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Optional, Union
3 |
4 | import pandas as pd
5 | from sklearn.base import (
6 | BaseEstimator,
7 | ClassNamePrefixFeaturesOutMixin,
8 | TransformerMixin,
9 | check_is_fitted,
10 | )
11 |
12 | from skpm.config import EventLogConfig as elc
13 | from skpm.feature_extraction.case.time import TimestampCaseLevel
14 | from skpm.feature_extraction.event.time import TimestampEventLevel
15 | from skpm.utils import validate_columns, validate_methods_from_class
16 |
17 | def _to_list(x):
18 | if x == "all" or x is None:
19 | return x
20 | return [x] if not isinstance(x, list) else x
21 |
22 | class TimestampExtractor(
23 | ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator
24 | ):
25 | """Extracts features from a timestamp column.
26 |
27 | This class extracts various features and targets from a timestamp column in a DataFrame.
28 |
29 | The current targets are: `execution_time` and `remaining_time`. All the remaining attributes
30 | are considered as features.
31 |
32 | Parameters:
33 | -----------
34 | case_features (Union[list, str], optional): List of case features to extract. Defaults to "all".
35 | event_features (Union[list, str], optional): List of event features to extract. Defaults to "all".
36 | targets (Union[list, str], optional): List of target features to extract. Defaults to None.
37 | time_unit (str, optional): Time unit for the features. Defaults to "s".
38 |
39 |
40 | Attributes:
41 | -----------
42 | _n_features_out: int
43 | Number of features extracted.
44 | _n_targets_out: int
45 | Number of targets extracted.
46 |
47 | Methods:
48 | --------
49 | fit(X, y=None):
50 | Fit the transformer to the input data.
51 | transform(X, y=None):
52 | Transform the input data to calculate timestamp features.
53 | get_feature_names_out():
54 | Get the names of the features extracted.
55 | inverse_transform(X):
56 | Inverse transform the input data.
57 |
58 | Notes:
59 | ------
60 | - This class requires a DataFrame with columns for case IDs and timestamps.
61 | - Validation of columns and timestamps is performed in the `fit` method.
62 | - Lowest scale is seconds. Nanoseconds, milliseconds, etc. are disregarded.
63 |
64 | Examples:
65 | ---------
66 |     >>> from skpm.feature_extraction import TimestampExtractor
67 | >>> import pandas as pd
68 | >>> # Assuming X is your dataframe containing event data with columns 'case_id' and 'timestamp'
69 | >>> X = pd.DataFrame({'case_id': [1, 1, 2, 2], 'timestamp': ['2023-01-01 10:30:00', '2023-01-01 11:00:00', '2023-01-01 09:00:00', '2023-01-01 09:30:00']})
70 | >>> feature_extractor = TimestampExtractor()
71 | >>> feature_extractor.fit(X)
72 | >>> feature_extractor.transform(X)
73 | """
74 | available_targets = ["execution_time", "remaining_time"]
75 |
76 | def __init__(
77 | self,
78 | case_features: Union[str, list, None] = "all",
79 | event_features: Union[str, list, None] = "all",
80 | targets: Optional[Union[str, list]] = None,
81 | time_unit: str = "s",
82 | ):
83 | # TODO: feature time unit (secs, hours, days, etc)
84 | # TODO: subset of features rather than all
85 | # TODO: param for event-level and case-level
86 |
87 | self.case_features = _to_list(case_features)
88 | self.event_features = _to_list(event_features)
89 | self.targets = _to_list(targets)
90 | self.time_unit = time_unit
91 |
92 | def fit(
93 | self,
94 | X: pd.DataFrame,
95 | y=None,
96 | ):
97 | """Fit the transformer to the input data.
98 |
99 | This method checks if the input is a DataFrame, validates the required columns,
100 | and computes the desired features.
101 |
102 | Parameters:
103 | -----------
104 | X : DataFrame
105 | Input DataFrame containing columns for case IDs and timestamps.
106 | y : None
107 | Ignored.
108 |
109 | Returns:
110 | --------
111 | self : TimestampExtractor
112 | Fitted transformer instance.
113 | """
114 | _ = self._validate_data(X)
115 |
116 | self.event_features = validate_methods_from_class(
117 | class_obj=TimestampEventLevel,
118 | methods=self.event_features
119 | )
120 | self.case_features = validate_methods_from_class(
121 | class_obj=TimestampCaseLevel,
122 | methods=self.case_features,
123 | )
124 | self.targets = validate_methods_from_class(
125 | class_obj=TimestampCaseLevel,
126 | methods=self.targets,
127 | )
128 |
129 | self._n_features_out = len(self.event_features) + len(
130 | self.case_features
131 | )
132 | self._n_targets_out = len(self.targets)
133 |
134 | if self._n_features_out + self._n_targets_out == 0:
135 | raise ValueError("No features selected. Please select at least one feature, either from the event level or the case level.")
136 |
137 | return self
138 |
139 | def get_feature_names_out(self):
140 | return [
141 | f[0] for f in self.case_features + self.event_features + self.targets
142 | ]
143 |
144 | def transform(self, X: pd.DataFrame, y=None):
145 | """Transform the input data to calculate timestamp features.
146 |
147 | Parameters:
148 | -----------
149 | X : DataFrame
150 | Input DataFrame containing columns for case IDs and timestamps.
151 | y : None
152 | Ignored.
153 |
154 | Returns:
155 | --------
156 | X_tr : DataFrame
157 | Transformed DataFrame with calculated timestamp features added.
158 | """
159 | # Check if fit had been called
160 | check_is_fitted(self, "_n_features_out")
161 |
162 | # data validation
163 | X = self._validate_data(X)
164 |
165 | # case-level features
166 | self.group_ = X.groupby(
167 | elc.case_id, as_index=False, group_keys=False, observed=True
168 | )
169 | for feature_name, feature_fn in self.case_features:
170 | X[feature_name] = feature_fn(
171 | case=self.group_,
172 | ix_list=X.index.values,
173 | time_unit=self.time_unit,
174 | )
175 |
176 | # for event-level features
177 | for feature_name, feature_fn in self.event_features:
178 | sig = inspect.signature(feature_fn)
179 | if "time_unit" in sig.parameters:
180 | X[feature_name] = feature_fn(X[elc.timestamp], time_unit=self.time_unit)
181 | else:
182 | X[feature_name] = feature_fn(X[elc.timestamp])
183 |
184 | # targets
185 | for feature_name, feature_fn in self.targets:
186 | X[feature_name] = feature_fn(
187 | case=self.group_,
188 | ix_list=X.index.values,
189 | time_unit=self.time_unit,
190 | )
191 | output_columns = [
192 | feature[0]
193 | for feature in self.case_features + self.event_features + self.targets
194 | ]
195 | return X.loc[:, output_columns].values
196 |
197 | def _validate_data(self, X: pd.DataFrame):
198 | """
199 | Validates the input DataFrame and timestamp column.
200 |
201 | Parameters:
202 | -----------
203 | X : DataFrame
204 | Input DataFrame containing columns for case IDs and timestamps.
205 |
206 | Returns:
207 | --------
208 | X : DataFrame
209 | Validated DataFrame after processing.
210 | """
211 | assert isinstance(X, pd.DataFrame), "Input must be a dataframe."
212 | x = X.copy()
213 | x.reset_index(drop=True, inplace=True)
214 | # x.columns = self._validate_columns(x.columns)
215 | valid_cols = validate_columns(
216 | input_columns=x.columns, required=[elc.case_id, elc.timestamp]
217 | )
218 | x = x[valid_cols]
219 |
220 | # check if it is a datetime column
221 | x[elc.timestamp] = self._validate_timestamp_format(x)
222 |
223 | return x
224 |
225 | def _validate_timestamp_format(
226 | self, x: pd.DataFrame, timestamp_format: str = "%Y-%m-%d %H:%M:%S"
227 | ):
228 | """
229 | Validates the format of the timestamp column.
230 |
231 | Parameters:
232 | -----------
233 | x : DataFrame
234 | DataFrame containing columns for case IDs and timestamps.
235 | timestamp_format : str, optional
236 | Expected format of the timestamp, by default "%Y-%m-%d %H:%M:%S".
237 |
238 | Returns:
239 | --------
240 | x[elc.timestamp] : Series
241 | Series containing the validated timestamps.
242 | """
243 | if not x[elc.timestamp].dtype == "datetime64[ns]":
244 | # pd = check_pandas_support(
245 | # "'pandas' not found. Please install it to use this method."
246 | # )
247 | try:
248 | # for now, since we are only employing the BPI event logs,
249 | # we are assuming that the datetime format is '%Y-%m-%d %H:%M:%S'.
250 | # TODO: validate alternative datetime formats.
251 | # '%Y-%m-%d %H:%M:%S' format should be mandatory
252 | x[elc.timestamp] = pd.to_datetime(
253 | x[elc.timestamp], format=timestamp_format
254 | )
255 |             except (ValueError, TypeError) as err:
256 |                 raise ValueError(
257 |                     f"Column '{elc.timestamp}' is not a valid datetime column."
258 |                 ) from err
259 |
260 | # TODO: ensure datetime format
261 | # try:
262 | # # Attempt to parse the datetime string with the specified format
263 | # datetime_obj = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
264 | # print(f"'{x}' is a valid datetime with the correct format: {datetime_obj}")
265 | # except ValueError:
266 | # print(f"'{x}' is not in the correct format '%Y-%m-%d %H:%M:%S'")
267 | # pass
268 | return x[elc.timestamp]
269 |
270 |
--------------------------------------------------------------------------------
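A usage sketch of TimestampExtractor with explicit feature lists, mirroring tests/feature_extraction/event/test_time.py:

    import numpy as np
    import pandas as pd
    from skpm.config import EventLogConfig as elc
    from skpm.feature_extraction import TimestampExtractor

    log = pd.DataFrame({
        elc.case_id: np.repeat(np.arange(3), 4),
        elc.timestamp: pd.date_range("2020-01-01", periods=12),
    })

    t = TimestampExtractor(
        case_features="execution_time",
        event_features=["month_of_year", "day_of_week"],
    )
    out = t.fit(log).transform(log)
    print(out.shape)   # (12, 3): execution_time plus the two event-level features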
/src/skpm/sequence_encoding/__init__.py:
--------------------------------------------------------------------------------
1 | from .aggregation import Aggregation
2 | from .index import Indexing
3 | from .bucketing import Bucketing
4 |
5 | __all__ = ["Aggregation", "Indexing", "Bucketing"]
--------------------------------------------------------------------------------
/src/skpm/sequence_encoding/aggregation.py:
--------------------------------------------------------------------------------
1 | from typing import Literal, Union
2 |
3 | import pandas as pd
4 | import polars as pl
5 | from sklearn.base import OneToOneFeatureMixin, TransformerMixin
6 | from sklearn.utils._param_validation import StrOptions
7 | from sklearn.utils.validation import check_is_fitted
8 |
9 | from skpm.base import BaseProcessEstimator
10 | from skpm.config import EventLogConfig as elc
11 |
12 | def handle_aggregation_method(method):
13 | """Handle the aggregation method.
14 |
15 | Parameters
16 | ----------
17 | method : str
18 | The aggregation method to be handled.
19 |
20 | Returns
21 | -------
22 | str, callable
23 | The aggregation method that pandas or polars can use.
24 | """
25 | if method == "norm":
26 | from numpy import linalg
27 | return linalg.norm
28 | return method
29 |
30 | class Aggregation(OneToOneFeatureMixin, TransformerMixin, BaseProcessEstimator):
31 | """Sequence Encoding Transformer.
32 |
33 | This module implements a method for encoding sequences by
34 | aggregating features. It adapts the approach from a
35 | research paper [1] that abstracts event sequences by
36 | disregarding their order and using aggregation functions.
37 | Common aggregation functions include frequency-based
38 | methods for categorical features and general statistics
39 | (average, sum, etc.) for numeric attributes.
40 |
41 |     In our implementation, we assume that categorical
42 |     features are already encoded and we
43 | apply aggregation methods accordingly: frequency
44 | aggregation for integer (categorical) features and
45 | general statistical measures for float (numerical)
46 | features. This design choice allows flexibility in
47 | aggregating user-engineered features, not limited to
48 | one-hot encoding as described in the original
49 | paper [1].
50 |
51 |
52 | Parameters
53 | ----------
54 | method : str, default="mean"
55 | The method to aggregate features.
56 | Possible values: "sum", "mean".
57 | prefix_len : int, default=None
58 | The length of the prefix to consider for aggregation. If None, the length of the longest trace is used.
59 | engine : str, default="pandas"
60 | The DataFrame engine to use. Supported engines are "pandas" and "polars".
61 |
62 | References
63 | ----------
64 | [1] Outcome-Oriented Predictive Process Monitoring: Review and Benchmark, Teinemaa, I., Dumas, M., Maggi, F. M., & La Rosa, M. (2019).
65 |
66 | Examples
67 | --------
68 | >>> import numpy as np
69 | >>> import pandas as pd
70 |     >>> from skpm.sequence_encoding import Aggregation
71 | >>> from skpm.config import EventLogConfig as elc
72 | >>> df = pd.DataFrame({
73 | ... elc.timestamp: np.arange(10),
74 | ... elc.activity: np.random.randint(0, 10, 10),
75 | ... elc.resource: np.random.randint(0, 3, 10),
76 | ... elc.case_id: np.random.randint(0, 3, 10),
77 | ... }).sort_values(by=[elc.case_id, elc.timestamp])
78 | >>> df = pd.get_dummies(df, columns=[elc.activity,elc.resource], dtype=int)
79 | >>> df = df.drop(elc.timestamp, axis=1)
80 | >>> Aggregation().fit_transform(df)
81 | """
82 |
83 | _case_id = elc.case_id
84 | _parameter_constraints = {
85 | "method": [
86 | StrOptions({"sum", "mean", "median", "norm"}),
87 | ],
88 | "engine": [
89 | StrOptions({"pandas", "polars"}),
90 | ],
91 | }
92 |
93 | def __init__(
94 | self,
95 | method: str = "mean",
96 | prefix_len: int = None,
97 | # n_jobs=1,
98 | engine: Literal[
99 | "pandas", "polars"
100 | ] = "pandas", # Default to Pandas DataFrame
101 | ) -> None:
102 | self.method = method
103 | self.prefix_len = prefix_len
104 | self.engine = engine
105 |
106 | def validate_engine_with_df(self, X, y=None):
107 | if (
108 | self.engine == "pandas"
109 | and not isinstance(X, pd.DataFrame)
110 | ):
111 | X = pd.DataFrame(X)
112 | y = pd.DataFrame(y) if y is not None else None
113 | elif (
114 | self.engine == "polars"
115 | and not isinstance(X, pl.DataFrame)
116 | ):
117 | X = pl.DataFrame(X)
118 | y = pl.DataFrame(y) if y is not None else None
119 | return X, y
120 |
121 | def fit(self, X, y=None):
122 | """Fit transformer.
123 |
124 | Checks if the input is a dataframe, if it
125 | contains the required columns, validates
126 | the timestamp column, and the desired features.
127 |
128 | Parameters
129 | ----------
130 | X : {DataFrame} of shape (n_samples, n_features+1)
131 | The data must contain `n_features` plus a column with case ids.
132 | y : None.
133 | Ignored.
134 |
135 | Returns
136 | -------
137 | self : object
138 | Fitted aggregator.
139 |
140 | """
141 | X = self._validate_log(X)
142 |
143 | if self.prefix_len is None:
144 | self.prefix_len = len(X)
145 |
146 | return self
147 |
148 | def transform(self, X: Union[pd.DataFrame, pl.DataFrame], y=None):
149 | """Performs the aggregation of event features from a trace.
150 |
151 | Parameters
152 | ----------
153 | X : {DataFrame} of shape (n_samples, n_features+1)
154 | An event log. It must contain n_features + 1 columns,
155 | representing the case id and the event features.
156 |
157 | Returns
158 | -------
159 | X : {DataFrame} of shape (n_samples, n_features)
160 | The aggregated event log.
161 | """
162 | check_is_fitted(self, "n_features_in_")
163 | X = self._validate_log(X)
164 |
165 | X, y = self.validate_engine_with_df(X, y)
166 | self._method_fn = handle_aggregation_method(self.method)
167 | if self.engine == "pandas": # If using Pandas DataFrame
168 | if isinstance(X, pl.DataFrame):
169 | X = X.to_pandas()
170 | return self._transform_pandas(X)
171 |
172 | else:
173 | if isinstance(X, pd.DataFrame):
174 | X = pl.DataFrame(X)
175 | X = self._transform_polars(X)
176 | return X.to_pandas()
177 |
178 | def _transform_pandas(self, X: pd.DataFrame):
179 | """Transforms Pandas DataFrame."""
180 | group = X.groupby(self._case_id)
181 |
182 | X = (
183 | group.rolling(window=self.prefix_len, min_periods=1)
184 | .agg(self._method_fn)
185 | .reset_index(drop=True)
186 | )
187 | return X
188 |
189 | def _transform_polars(self, X: pl.DataFrame):
190 | """Transforms Polars DataFrame."""
191 |
192 | def _make_rolling_expr(col_name: str, method_fn: Union[str, callable]) -> pl.Expr:
193 | expr = pl.col(col_name)
194 |
195 | if isinstance(method_fn, str):
196 | builtin = f"rolling_{method_fn}"
197 | fn = getattr(expr, builtin)
198 | return fn(window_size=self.prefix_len, min_samples=1)
199 | else:
200 | expr = pl.col(col_name).cast(pl.Float32)
201 | return expr.rolling_map(
202 | function=method_fn,
203 | window_size=self.prefix_len,
204 | min_samples=1
205 | )
206 |
207 | X = X.with_columns([
208 | _make_rolling_expr(c, self._method_fn).over(self._case_id)
209 | for c in X.columns
210 | if c != self._case_id
211 | ])
212 |
213 | return X.drop(self._case_id)
214 |
--------------------------------------------------------------------------------
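A plain-pandas sketch of what _transform_pandas computes when prefix_len covers the whole trace (an expanding, per-case mean); the column names are hypothetical and the feature is assumed to be already encoded:

    import pandas as pd

    df = pd.DataFrame({
        "case_id": [1, 1, 1, 2, 2],
        "activity_a": [1, 0, 1, 0, 1],   # an already-encoded event feature
    })

    # expanding ("prefix") mean per case, as the pandas engine does with a window
    # spanning the whole trace
    prefix_mean = (
        df.groupby("case_id")["activity_a"]
        .rolling(window=len(df), min_periods=1)
        .mean()
        .reset_index(drop=True)
    )
    print(prefix_mean.tolist())   # [1.0, 0.5, 0.666..., 0.0, 0.5]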
/src/skpm/sequence_encoding/bucketing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.base import TransformerMixin
3 | from skpm.config import EventLogConfig as elc
4 | from skpm.base import BaseProcessEstimator
5 |
6 |
7 | class Bucketing(TransformerMixin, BaseProcessEstimator):
8 | """
9 | Event Bucketing Transformer inherits from :class:`sklearn.base.TransformerMixin` and :class:`skpm.base.BaseProcessEstimator`.
10 |
11 | This class implements a method for bucketing traces based on different strategies.
12 |
13 | Parameters
14 | ----------
15 | method : str, optional
16 | The method used for bucketing traces. Possible values are "single", "prefix", or "clustering".
17 | Default is "single".
18 |
19 | - "single": Assigns all events to a single bucket.
20 | - "prefix": Groups events based on the order in which they occur within each case, assigning sequential buckets.
21 | - "clustering": Not implemented yet, but intended to assign buckets based on clustering of event features.
22 |
23 | Methods
24 | -------
25 | fit(X, y=None)
26 | Fit the transformer.
27 |
28 | transform(X, y=None)
29 | Transform input data by bucketing traces.
30 |
31 | get_feature_names_out()
32 | Get the names of the output features.
33 | """
34 |
35 | def __init__(self, method="single"):
36 | """
37 | Initialize Bucketing Transformer.
38 |
39 | Parameters
40 | ----------
41 | method : str, optional
42 | The method used for bucketing traces. Possible values are "single", "prefix", or "clustering".
43 | Default is "single".
44 | """
45 | assert method in [
46 | "single",
47 | "prefix",
48 | "clustering",
49 | ], f"Invalid method: {method}"
50 |
51 | self.method = method
52 |
53 | def fit(self, X, y=None):
54 | """
55 | Fit the transformer.
56 |
57 | Parameters
58 | ----------
59 | X : array-like or DataFrame
60 | The input data.
61 |
62 | Returns
63 | -------
64 | self : Bucketing
65 | Returns the instance itself.
66 | """
67 | return self
68 |
69 | def transform(self, X, y=None):
70 | """
71 | Transform input data by bucketing traces.
72 |
73 | Parameters
74 | ----------
75 | X : array-like or DataFrame
76 | The input data.
77 |
78 | Returns
79 | -------
80 | bucket_labels : array
81 | An array containing the bucket labels assigned to each event.
82 | """
83 | if self.method == "single":
84 | # For the single method, assign all events to a single bucket.
85 | bucket_labels = np.array(["b1"] * len(X))
86 | elif self.method == "prefix":
87 | # For the prefix method, group events by case ID and assign sequential buckets.
88 | bucket_labels = (
89 | X.groupby(elc.case_id)
90 | .cumcount()
91 | .apply(lambda x: f"b{x + 1}")
92 | .values
93 | )
94 | elif self.method == "clustering":
95 | # Clustering method is not implemented yet.
96 | raise NotImplementedError(
97 | "Clustering method is not implemented yet"
98 | )
99 |
100 | return bucket_labels
101 |
102 | def get_feature_names_out(self):
103 | """
104 | Get the names of the output features.
105 |
106 | Returns
107 | -------
108 | feature_names : list
109 | A list containing the name of the output feature.
110 | """
111 | return ["bucket"]
112 |
--------------------------------------------------------------------------------
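A short usage sketch of the "prefix" strategy:

    import pandas as pd
    from skpm.config import EventLogConfig as elc
    from skpm.sequence_encoding import Bucketing

    log = pd.DataFrame({elc.case_id: [1, 1, 1, 2, 2]})

    # the k-th event of every case lands in bucket b<k>
    print(Bucketing(method="prefix").fit_transform(log))   # ['b1' 'b2' 'b3' 'b1' 'b2']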
/src/skpm/sequence_encoding/index.py:
--------------------------------------------------------------------------------
1 | from skpm.config import EventLogConfig as elc
2 | from sklearn.base import TransformerMixin, _fit_context
3 | from skpm.base import BaseProcessEstimator
4 | from sklearn.utils._param_validation import Interval, _IterablesNotString, Options
5 | from numbers import Integral, Real
6 | import numpy as np
7 | import pandas as pd
8 | from typing import Union
9 |
10 | class Indexing(TransformerMixin, BaseProcessEstimator):
11 | _parameter_constraints = {
12 | "n": [Interval(type=Integral, left=1, right=None, closed="left"), None],
13 | "attributes": [str, list, None],
14 | "fill_value": [Real, None],
15 | }
16 | def __init__(self, n: int = 2, attributes: Union[str, list] = None, fill_value: int = None):
17 | self.n = n
18 | self.attributes = attributes
19 | self.fill_value = fill_value
20 |
21 | @_fit_context(prefer_skip_nested_validation=True)
22 | def fit(self, X: pd.DataFrame, y=None):
23 | if isinstance(self.attributes, str):
24 | self.attributes = [self.attributes]
25 |
26 | if self.attributes is None:
27 | self.attributes = X.columns.difference([elc.case_id]).tolist()
28 | return self
29 |
30 | def transform(self, X: pd.DataFrame, y=None):
31 | group = X.groupby(elc.case_id)
32 |
33 | out_df = pd.DataFrame()
34 | lags = range(self.n)
35 | for col in self.attributes:
36 | lagged_cols = [f"{col}_lag_{lag}" for lag in lags]
37 | out_df[lagged_cols] = group[col].shift(lags, fill_value=self.fill_value)
38 |
39 | return out_df
--------------------------------------------------------------------------------
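Indexing has no docstring yet; a minimal plain-pandas sketch of the lagged ("index") encoding it aims at, with hypothetical column names and a hypothetical "PAD" fill value:

    import pandas as pd

    log = pd.DataFrame({
        "case_id": [1, 1, 1, 2, 2],
        "activity": ["a", "b", "c", "a", "b"],
    })

    # one column per lag, computed within each case (the idea behind Indexing with n=2);
    # lag 0 is the event itself, lag 1 is the previous event of the same case
    group = log.groupby("case_id")["activity"]
    lagged = pd.DataFrame({
        f"activity_lag_{lag}": group.shift(lag, fill_value="PAD") for lag in range(2)
    })
    print(lagged)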
/src/skpm/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .validation import (
2 | validate_methods_from_class,
3 | validate_columns,
4 | ensure_list,
5 | )
6 |
7 | __all__ = ["validate_methods_from_class", "validate_columns", "ensure_list"]
8 |
--------------------------------------------------------------------------------
/src/skpm/utils/graph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | __all__ = ["frequency_matrix", "node_degree"]
4 |
5 |
6 | def frequency_matrix(
7 | traces: list, set_of_states: set
8 | ) -> tuple[np.ndarray, dict, dict]:
9 | """
10 | Returns a transition frequency matrix.
11 |
12 | This function takes a list of traces, where each trace
13 | is an ordered sequence of states, and computes a transition
14 | frequency matrix.
15 |
16 | States can be any hashable object, but they must be comparable.
17 | For instance, a state can be a string, an integer, or a tuple.
18 |
19 | Parameters
20 | ----------
21 | traces : list of list of states
22 | A list of traces, where each trace is a list of states.
23 | set_of_states : set of states
24 | A set of all possible states.
25 |
26 | Returns
27 | -------
28 | freq_matrix : numpy.ndarray
29 | A transition frequency matrix.
30 |
31 | stoi : dict
32 | A dictionary mapping states to indices.
33 |
34 | itos : dict
35 | A dictionary mapping indices to states.
36 |
37 | Examples
38 | --------
39 | >>> traces = [[1, 2, 3], [1, 2, 3, 4]]
40 | >>> set_of_states = {1, 2, 3, 4}
41 | >>> frequency_matrix(traces, set_of_states)
42 | (array([[0, 2, 0, 0],
43 | [0, 0, 2, 0],
44 | [0, 0, 0, 1],
45 | [0, 0, 0, 0]]),
46 | {1: 0, 2: 1, 3: 2, 4: 3},
47 | {0: 1, 1: 2, 2: 3, 3: 4})
48 |
49 | >>> traces = [["a", "b", "c"], ["a", "b", "c", "d"]]
50 | >>> set_of_states = {"a", "b", "c", "d"}
51 | >>> frequency_matrix(traces, set_of_states)
52 | (array([[0, 0, 2, 0],
53 | [2, 0, 0, 0],
54 | [0, 0, 0, 1],
55 | [0, 0, 0, 0]]),
56 | {'b': 0, 'a': 1, 'c': 2, 'd': 3},
57 | {0: 'b', 1: 'a', 2: 'c', 3: 'd'})
58 |
59 | >>> traces = [[("a", "b"), ("b", "c")], [("a", "b"), ("b", "c"), ("c", "d")]]
60 | >>> set_of_states = {("a", "b"), ("b", "c"), ("c", "d")}
61 | >>> frequency_matrix(traces, set_of_states)
62 | (array([[0, 0, 0],
63 | [1, 0, 0],
64 | [0, 2, 0]]),
65 | {('c', 'd'): 0, ('b', 'c'): 1, ('a', 'b'): 2},
66 | {0: ('c', 'd'), 1: ('b', 'c'), 2: ('a', 'b')})
67 |
68 | """
69 | stoi = {value: ix for ix, value in enumerate(set_of_states)}
70 | itos = {ix: value for value, ix in stoi.items()}
71 | freq_matrix = np.zeros((len(stoi), len(stoi)), dtype=np.int32)
72 |
73 | for transition in traces:
74 | for origin, destiny in zip(transition, transition[1:]):
75 | freq_matrix[stoi[origin], stoi[destiny]] += 1
76 |
77 | return freq_matrix, stoi, itos
78 |
79 |
80 | def node_degree(frequency_matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
81 | """
82 | Returns the in-degree and out-degree of each node.
83 |
84 | Parameters
85 | ----------
86 | frequency_matrix : numpy.ndarray
87 | A graph as a transition frequency matrix.
88 |
89 | Returns
90 | -------
91 | in_degree : numpy.ndarray
92 | An array with the in-degree of each node.
93 |
94 | out_degree : numpy.ndarray
95 | An array with the out-degree of each node.
96 | """
97 | in_degree = frequency_matrix.sum(axis=0)
98 | out_degree = frequency_matrix.sum(axis=1)
99 |
100 | return in_degree, out_degree
101 |
102 |
103 | def density(graph):
104 | """
105 | Returns the density of a graph.
106 |
107 | Parameters
108 | ----------
109 | graph : numpy.ndarray
110 | A graph as a transition frequency matrix.
111 |
112 | Returns
113 | -------
114 | density : float
115 | The density of the graph.
116 | """
117 | n_nodes = graph.shape[0]
118 | n_edges = graph.sum(axis=None)
119 | max_edges = n_nodes * (n_nodes - 1)
120 | density = n_edges / max_edges
121 | return density
122 |
123 |
124 | def nodes_in_cycles(frequency_matrix, max_cycle_length):
125 | """
126 | Returns a list of whether each node is in a cycle.
127 |
128 |     Notice: this function detects closed walks rather than simple cycles:
129 |     a node is flagged if some power n (2 <= n <= max_cycle_length) of the
130 |     frequency matrix has a non-zero diagonal entry for it. A closed walk may
131 |     revisit nodes, whereas a cycle visits each node at most once.
132 |
133 |
134 | Parameters
135 | ----------
136 | frequency_matrix : numpy.ndarray
137 | A graph as a transition frequency matrix.
138 |
139 | max_cycle_length: int
140 | The maximum length of a cycle to be counted.
141 |
142 | Returns
143 | -------
144 | in_cycle : list of bool
145 | A list of whether each node is in a cycle.
146 |
147 | """
148 | frequency_matrix = np.array(frequency_matrix)
149 | num_nodes = frequency_matrix.shape[0]
150 | in_cycle = [
151 | False
152 | ] * num_nodes # Initialize list to store whether each node is in a cycle
153 |
154 | for n in range(2, max_cycle_length + 1):
155 | matrix_power = np.linalg.matrix_power(frequency_matrix, n)
156 | for i in range(num_nodes):
157 | if matrix_power[i, i] > 0:
158 | in_cycle[i] = (
159 | True # Mark node i as in a cycle if diagonal entry is non-zero
160 | )
161 |
162 | return in_cycle
163 |
--------------------------------------------------------------------------------
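Building on the frequency_matrix example above, node_degree and density can be read directly off the matrix:

    from skpm.utils.graph import frequency_matrix, node_degree, density

    freq, stoi, itos = frequency_matrix([[1, 2, 3], [1, 2, 3, 4]], {1, 2, 3, 4})

    in_deg, out_deg = node_degree(freq)
    print(in_deg, out_deg)            # column sums and row sums, in stoi order
    print(round(density(freq), 3))    # 5 observed transitions / 12 possible edges ~= 0.417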
/src/skpm/utils/helpers.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 |
3 |
4 | # def flatten_list(l: list):
5 | # return [item for sublist in l for item in sublist]
6 |
7 |
8 | def infer_column_types(df, int_as_cat=False) -> tuple:
9 | """Infer column types from a dataframe."""
10 | if isinstance(df, pl.DataFrame): # For Polars DataFrame
11 | df = df.to_pandas()
12 | cat_cols = ["object", "category"]
13 | if int_as_cat:
14 | cat_cols.append("int")
15 | cat = df.select_dtypes(include=cat_cols).columns.tolist()
16 | num = df.select_dtypes(include=["number"]).columns.tolist()
17 | time = df.select_dtypes(
18 | include=[
19 | "datetime",
20 | "datetime64",
21 | "datetimetz",
22 | "datetime64[ns]",
23 | "timedelta",
24 | "timedelta64",
25 | ]
26 | ).columns.tolist()
27 |
28 | # remove the int columns from num if int_as_cat is True
29 | if int_as_cat:
30 | num = list(set(num) - set(cat))
31 |
32 | return cat, num, time
33 |
--------------------------------------------------------------------------------
/src/skpm/utils/validation.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Iterable, Union, Any
3 |
4 |
5 | def validate_methods_from_class(
6 | class_obj: Any, methods: Union[str, list[str]] = "all"
7 | ) -> list[tuple[str, callable]]:
8 | """Validate methods from a class.
9 |
10 | Args:
11 |         class_obj (Any): a class object containing class methods.
12 |         methods (Union[str, list[str]]): the method name(s) to validate, or "all" to select every public method.
13 |
14 | Returns:
15 | list[tuple[str, callable]]: a list of tuples
16 | containing the name of the methods and the callable.
17 | """
18 | available_methods = inspect.getmembers(
19 | class_obj, predicate=inspect.ismethod
20 | )
21 | out_methods = []
22 | if methods == "all":
23 | out_methods = available_methods
24 | else:
25 | if not isinstance(methods, (tuple, list)):
26 | methods = [methods]
27 | for f in available_methods:
28 | if f[0] in methods and not f[0].startswith("_"):
29 | out_methods.append(f)
30 |
31 | return out_methods
32 |
33 |
34 | def validate_columns(input_columns: Iterable, required: list) -> list:
35 | """Validate required columns.
36 |
37 | This method checks if the input columns
38 | contain the required columns.
39 |
40 | Args:
41 | input_columns (Iterable): Input columns.
42 | required (list): Required columns.
43 |
44 | Raises:
45 | ValueError: If the input is missing any
46 | of the required columns.
47 |
48 | Returns:
49 | list: the input columns
50 | """
51 | diff = set(required) - set(input_columns)
52 | if diff:
53 | raise ValueError(f"Input is missing the following columns: {diff}.")
54 | return required
55 |
56 |
57 | def ensure_list(input: Any) -> list:
58 | """Ensure input is a list.
59 |
60 | Args:
61 | input (Any): Input to be converted to a list.
62 |
63 | Returns:
64 | list: Input as a list.
65 | """
66 | if not isinstance(input, list):
67 | if isinstance(input, (str, int)):
68 | input = [input]
69 | else:
70 | input = list(input)
71 | return input
72 |
--------------------------------------------------------------------------------
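A quick sketch of how these validators are used elsewhere in the package (e.g., by TimestampExtractor.fit):

    from skpm.feature_extraction.event.time import TimestampEventLevel
    from skpm.utils import validate_columns, validate_methods_from_class

    # resolve method names into (name, callable) pairs
    pairs = validate_methods_from_class(TimestampEventLevel, ["hour_of_day", "day_of_week"])
    print([name for name, _ in pairs])   # ['day_of_week', 'hour_of_day'] (inspect.getmembers sorts alphabetically)

    # validate_columns returns the required columns or raises if one is missing
    print(validate_columns(input_columns=["case_id", "timestamp", "activity"], required=["case_id", "timestamp"]))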
/src/skpm/warnings.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | class ConceptDriftWarning(UserWarning):
4 | pass
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/__init__.py
--------------------------------------------------------------------------------
/tests/event_logs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/event_logs/__init__.py
--------------------------------------------------------------------------------
/tests/event_logs/test_bpi.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import pandas as pd
4 | from skpm.event_logs import BPI13ClosedProblems, BPI19
5 |
6 | """Shapes to validate the logs:"""
7 | # BPI12
8 | # (262200, 7)
9 | # (117546, 7) (55952, 7)
10 | # ========================================
11 | # BPI13ClosedProblems
12 | # (6660, 12)
13 | # Unbiased split parameters not supported.
14 | # ========================================
15 | # BPI13Incidents
16 | # (65533, 12)
17 | # Unbiased split parameters not supported.
18 | # ========================================
19 | # BPI13OpenProblems
20 | # (2351, 11)
21 | # Unbiased split parameters not supported.
22 | # ========================================
23 | # BPI17
24 | # (1202267, 19)
25 | # (805809, 19) (294882, 19)
26 | # ========================================
27 | # BPI19
28 | # (1595923, 21)
29 | # (538293, 21) (538880, 21)
30 | # ========================================
31 | # BPI20PrepaidTravelCosts
32 | # (18246, 22)
33 | # (10809, 22) (4928, 22)
34 | # ========================================
35 | # BPI20TravelPermitData
36 | # (86581, 173)
37 | # (51878, 173) (32390, 173)
38 | # ========================================
39 | # BPI20RequestForPayment
40 | # (36796, 14)
41 | # (23295, 14) (7053, 14)
42 | # ========================================
43 | # BPI20DomesticDeclarations
44 | # (56437, 10)
45 | # Unbiased split parameters not supported.
46 | # ========================================
47 | # BPI20InternationalDeclarations
48 | # (72151, 23)
49 | # Unbiased split parameters not supported.
50 | # ========================================
51 | # Sepsis
52 | # (15214, 32)
53 | # Unbiased split parameters not supported.
54 | # ========================================
55 |
56 | from tempfile import TemporaryDirectory
57 |
58 |
59 | def test_bpi():
60 | with TemporaryDirectory() as tmpdirname:
61 | bpi = BPI13ClosedProblems(root_folder=tmpdirname)
62 |
63 | assert isinstance(bpi.dataframe, pd.DataFrame)
64 | assert isinstance(bpi.__repr__(), str)
65 | assert isinstance(len(bpi.dataframe), int)
66 |
67 | # covering pytest when the file already exists
68 | bpi = BPI13ClosedProblems(bpi.file_path)
69 |
--------------------------------------------------------------------------------
/tests/event_logs/test_download_extract.py:
--------------------------------------------------------------------------------
1 | import os
2 | from skpm.event_logs.extract import extract_gz
3 | from skpm.event_logs.download import download_url
4 |
5 |
6 | def _download(test_folder: str):
7 | url = "https://data.4tu.nl/file/1987a2a6-9f5b-4b14-8d26-ab7056b17929/8b99119d-9525-452e-bc8f-236ac76fa9c9"
8 | file_name = "BPI_Challenge_2013_closed_problems.xes.gz"
9 | output_fold_download = download_url(
10 | url, folder=test_folder, file_name=file_name
11 | )
12 | exists = os.path.exists(output_fold_download)
13 | assert exists
14 |
15 | output_fold_extract = extract_gz(
16 | path=output_fold_download, folder=os.path.dirname(output_fold_download)
17 | )
18 | extracted_exists = os.path.exists(output_fold_download.replace(".gz", ""))
19 | assert extracted_exists
20 |
21 | duplicated = download_url(url, folder=test_folder, file_name=file_name)
22 | assert duplicated == output_fold_download
23 |
24 | no_file_name = download_url(url, folder=".", file_name=None)
25 | assert os.path.isfile(no_file_name)
26 | os.remove(no_file_name)
27 |
28 | if exists:
29 | base_output_fold_download = os.path.abspath(
30 | os.path.dirname(output_fold_download)
31 | )
32 | if base_output_fold_download != os.getcwd():
33 | import shutil
34 |
35 | shutil.rmtree(base_output_fold_download)
36 | else:
37 | os.remove(output_fold_download)
38 | os.remove(output_fold_extract)
39 |
40 |
41 | def test_download_extract():
42 | _download(test_folder="test_download_skpm")
43 | _download(test_folder=None)
44 | _download(test_folder=".")
--------------------------------------------------------------------------------
/tests/event_logs/test_parser.py:
--------------------------------------------------------------------------------
1 | # from skpm.event_logs.parser import read_xes
2 | # from skpm.event_logs import (
3 | # BPI12,
4 | # BPI13ClosedProblems,
5 | # BPI13Incidents,
6 | # BPI17,
7 | # BPI19,
8 | # BPI20,
9 | # )
10 |
11 |
12 | def test_read_xes():
13 | assert True
14 |
15 |
16 | # """ToDo
17 |
18 | # I gotta learn how to cache files on GitHub Actions.
19 | # """
20 |
21 | # logs = (
22 | # BPI12,
23 | # BPI13ClosedProblems,
24 | # BPI13Incidents,
25 | # BPI17,
26 | # BPI19,
27 | # # BPI20,
28 | # )
29 |
30 | # shapes = {
31 | # "BPI12": (262200, 7),
32 | # "BPI13ClosedProblems": (6660, 12),
33 | # "BPI13Incidents": (65533, 12),
34 | # "BPI17": (1202267, 19),
35 | # "BPI19": (1595923, 21),
36 | # }
37 |
38 | # for l in logs:
39 | # df = l()
40 | # assert df.log.shape == shapes[l.__name__]
41 |
--------------------------------------------------------------------------------
/tests/feature_extraction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/feature_extraction/__init__.py
--------------------------------------------------------------------------------
/tests/feature_extraction/case/test_variant.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | import pytest
5 | from skpm.feature_extraction.case import VariantExtractor
6 | from skpm.config import EventLogConfig as elc
7 |
8 |
9 | def test_variants():
10 | n_cases = 100
11 | np.random.seed(42)
12 | dummy_data = pd.DataFrame(
13 | {
14 | elc.case_id: np.random.randint(0, n_cases, 1000),
15 | elc.activity: np.random.randint(0, 10, 1000),
16 | }
17 | )
18 |
19 | rp = VariantExtractor()
20 | rp.fit(dummy_data)
21 | df = rp.transform(dummy_data)
22 | assert df.variant.nunique() == n_cases
23 |
24 | inv_t = rp.inverse_transform(df.variant)
25 | assert inv_t.shape == (n_cases,)
26 | assert isinstance(inv_t[0], tuple)
--------------------------------------------------------------------------------
/tests/feature_extraction/event/test_resource.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | import pytest
5 | from skpm.feature_extraction.event import ResourcePoolExtractor
6 | from skpm.config import EventLogConfig as elc
7 |
8 |
9 | def test_resource():
10 | dummy_data = pd.DataFrame(
11 | {
12 | elc.activity: np.random.randint(0, 10, 1000),
13 | elc.resource: np.random.randint(0, 3, 1000),
14 | }
15 | )
16 |
17 | dummy_data_test = pd.DataFrame(
18 | {
19 | elc.activity: np.random.randint(0, 10, 100),
20 | elc.resource: np.random.randint(0, 3, 100),
21 | }
22 | )
23 |
24 | rp = ResourcePoolExtractor()
25 | rp.fit(dummy_data)
26 | out = rp.transform(dummy_data)
27 | assert isinstance(out, pd.DataFrame)
28 | assert out.shape[1] == 1
29 | assert out.columns.tolist() == ["resource_roles"]
30 |
31 | test_out = rp.transform(dummy_data_test)
32 | assert test_out.shape[0] == dummy_data_test.shape[0]
33 |
34 | with pytest.raises(Exception):
35 | dummy_data_test[elc.resource] = dummy_data_test[elc.resource].replace(
36 | 2, np.nan
37 | )
38 | rp.transform(dummy_data_test[[elc.activity, elc.resource]])
39 |
40 | with pytest.warns():
41 | dummy_data_test[elc.resource] = dummy_data_test[elc.resource].fillna(
42 | 100
43 | )
44 | test_out = rp.transform(dummy_data_test)
45 |
46 | with pytest.raises(Exception):
47 | dummy_data_test[elc.activity] = dummy_data_test[elc.activity].replace(
48 | 2, np.nan
49 | )
50 | rp.transform(dummy_data_test[[elc.activity, elc.resource]])
51 |
--------------------------------------------------------------------------------
/tests/feature_extraction/event/test_time.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import datetime as dt
4 |
5 | import pytest
6 | from skpm.feature_extraction import TimestampExtractor
7 | from skpm.config import EventLogConfig as elc
8 |
9 | @pytest.fixture(name="dummy_data")
10 | def fixture_dummy_pd():
11 | return pd.DataFrame(
12 | {
13 | elc.case_id: np.repeat(np.arange(0, 10), 100),
14 | elc.activity: np.random.randint(0, 10, 1000),
15 | elc.timestamp: pd.date_range(
16 | start="1/1/2020", periods=1000,
17 | ),
18 | }
19 | )
20 |
21 | def test_time(dummy_data):
22 | # test TimeStampExtractor
23 | t = TimestampExtractor()
24 | t.fit(dummy_data)
25 | out = t.transform(dummy_data)
26 | assert out.shape[1] == t._n_features_out
27 | assert isinstance(out, pd.DataFrame)
28 |
29 | t = TimestampExtractor(case_features="execution_time", event_features=None)
30 | t.fit(dummy_data)
31 | out = t.transform(dummy_data)
32 | assert out.shape[1] == 1
33 | assert isinstance(out, pd.DataFrame)
34 |
35 | t = TimestampExtractor(case_features="execution_time", event_features=["month_of_year", "day_of_week"])
36 | t.fit(dummy_data)
37 | out = t.transform(dummy_data)
38 | assert out.shape[1] == 1 + 2
39 | assert isinstance(out, pd.DataFrame)
40 |
41 | with pytest.raises(Exception):
42 | t = TimestampExtractor(case_features=None, event_features=None)
43 | t.fit(dummy_data)
44 | out = t.transform(dummy_data)
45 |
46 | dummy_data = pd.DataFrame(
47 | {
48 | elc.case_id: [1, 1, 1, 2, 2, 2],
49 | elc.timestamp: ["aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", ""],
50 | }
51 | )
52 | t = TimestampExtractor()
53 | with pytest.raises(Exception):
54 | t.fit(dummy_data[[elc.case_id, elc.timestamp]])
55 |
--------------------------------------------------------------------------------
/tests/feature_extraction/event/test_wip.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from skpm.feature_extraction.event import WorkInProgress
5 | from skpm.config import EventLogConfig as elc
6 |
7 |
8 | def test_wip():
9 | # Test with random data
10 | dummy_log = pd.DataFrame(
11 | {
12 | elc.case_id: np.random.randint(1, 10, 100),
13 | elc.timestamp: pd.date_range("2021-01-01", periods=100, freq="6h"),
14 | elc.activity: np.random.choice(["a", "b", "c"], 100),
15 | }
16 | ).sort_values(elc.timestamp)
17 |
18 | # Test fit_transform with default window_size
19 | wip = WorkInProgress()
20 | wip_values = wip.fit_transform(dummy_log)
21 | assert isinstance(wip_values, pd.DataFrame)
22 | assert wip_values.shape == (len(dummy_log), 1)
23 |
24 | # Test fit_transform with different window_size
25 | wip = WorkInProgress(window_size="2D")
26 | wip_values = wip.fit_transform(dummy_log)
27 | assert isinstance(wip_values, pd.DataFrame)
28 | assert wip_values.shape == (len(dummy_log), 1)
29 |
30 | # Test set_output with transform="pandas"
31 | wip_df = WorkInProgress().fit(dummy_log).transform(dummy_log)
32 | assert isinstance(wip_df, pd.DataFrame)
33 |
34 | # Test with empty dataframe
35 | empty_log = pd.DataFrame(columns=[elc.case_id, elc.timestamp, elc.activity])
36 | wip_empty = WorkInProgress()
37 | wip_empty.fit(empty_log)
38 | with pytest.raises(TypeError):
39 | wip_empty_values = wip_empty.transform(empty_log)
40 | assert isinstance(wip_empty_values, np.ndarray)
41 | assert len(wip_empty_values) == 0
42 |
--------------------------------------------------------------------------------
/tests/feature_extraction/test_targets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | import pytest
5 | from skpm.feature_extraction.targets import next_activity, remaining_time
6 | from skpm.config import EventLogConfig as elc
7 |
8 | @pytest.fixture(name="dummy_data")
9 | def fixture_dummy_pd():
10 | return pd.DataFrame(
11 | {
12 | elc.case_id: np.repeat(np.arange(0, 10), 100),
13 | elc.activity: np.random.randint(0, 10, 1000),
14 | elc.timestamp: pd.date_range(
15 | start="1/1/2020", periods=1000,
16 | ),
17 | }
18 | )
19 |
20 | def test_next_activity(dummy_data):
21 | # test next_activity
22 | out = next_activity(dummy_data)
23 | assert len(out) == len(dummy_data)
24 | assert isinstance(out, np.ndarray)
25 | assert out.dtype == object
26 |
27 | def test_remaining_time(dummy_data):
28 | out = remaining_time(dummy_data)
29 | assert len(out) == len(dummy_data)
30 | assert out.dtype == float
31 |
--------------------------------------------------------------------------------
/tests/sequence_encoding/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/sequence_encoding/__init__.py
--------------------------------------------------------------------------------
/tests/sequence_encoding/test_aggregation.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | import pytest
3 | import numpy as np
4 | import pandas as pd
5 | from skpm.sequence_encoding import Aggregation
6 | from skpm.config import EventLogConfig as elc
7 |
8 |
9 | @pytest.fixture(name="pd_df")
10 | def fixture_dummy_pd():
11 | return pd.DataFrame(
12 | {
13 | elc.case_id: np.repeat(np.arange(0, 10), 100),
14 | elc.activity: np.random.randint(0, 10, 1000),
15 | elc.resource: np.random.randint(0, 3, 1000),
16 | }
17 | )
18 |
19 | def test_aggregation(pd_df):
20 | # Test default aggregation
21 | rp = Aggregation()
22 | rp.fit(pd_df)
23 | out = rp.transform(pd_df)
24 | assert isinstance(out, pd.DataFrame)
25 | assert out.shape[0] == pd_df.shape[0]
26 |
27 |     # Test aggregation with an explicit numerical method ("sum")
28 | rp = Aggregation(method="sum")
29 | rp.fit(pd_df)
30 | out = rp.transform(pd_df)
31 | assert isinstance(out, pd.DataFrame)
32 | assert out.shape[0] == pd_df.shape[0]
33 |
34 | # Test aggregation with invalid input data
35 | with pytest.raises(Exception):
36 | rp.transform(pd_df[[elc.activity, elc.resource]])
37 |
38 |
39 | def test_aggregation_with_window(pd_df):
40 |     # Test aggregation with an explicit prefix length
41 | rp = Aggregation(prefix_len=3)
42 | rp.fit(pd_df)
43 | out = rp.transform(pd_df)
44 | assert isinstance(out, pd.DataFrame)
45 | assert out.shape[0] == pd_df.shape[0]
46 |
47 |     # Window aggregation with a prefix length larger than len(data) must still work
48 | rp = Aggregation(prefix_len=len(pd_df) + 1)
49 | rp.fit(pd_df)
50 | out = rp.transform(pd_df)
51 | assert isinstance(out, pd.DataFrame)
52 | assert out.shape[0] == pd_df.shape[0]
53 |
54 |     # Window aggregation with an invalid prefix length must raise
55 | with pytest.raises(Exception):
56 | rp = Aggregation(prefix_len=0)
57 | rp.fit(pd_df)
58 | out = rp.transform(pd_df)
59 |
60 |
61 | def test_aggregation_with_polars(pd_df):
62 | pl_df = pl.DataFrame(pd_df)
63 |
64 | rp = Aggregation(engine="polars")
65 | rp.fit(pl_df)
66 | out = rp.transform(pl_df)
67 | assert isinstance(out, pd.DataFrame)
68 | out = pl.DataFrame(out)
69 | assert out.height == pl_df.height
70 |
71 |
72 | def test_aggregation_output(pd_df):
73 | pl_df = pl.DataFrame(pd_df)
74 |
75 | pd_agg = Aggregation(method="sum")
76 | pl_agg = Aggregation(method="sum", engine="polars")
77 |
78 | pd_agg = pd_agg.fit_transform(pd_df)
79 | pl_agg = pl_agg.fit_transform(pl_df)
80 |
81 | pd_agg = pd_agg.astype(pl_agg.dtypes)
82 | assert isinstance(pl_agg, pd.DataFrame)
83 | assert pd_agg.equals(pl_agg)
84 |
85 | pd_agg = Aggregation(prefix_len=3)
86 | pd_agg = pd_agg.fit_transform(pd_df)
87 | pl_agg = Aggregation(prefix_len=3, engine="polars")
88 | pl_agg = pl_agg.fit_transform(pl_df)
89 | pl_agg = pl_agg.astype(pd_agg.dtypes)
90 | assert isinstance(pl_agg, pd.DataFrame)
91 | assert pd_agg.equals(pl_agg)
92 |
93 |
94 | def test_invalid_input(pd_df):
96 |     # invalid method argument
96 | with pytest.raises(Exception):
97 | agg = Aggregation(method="abc")
98 | agg.fit_transform(pd_df)
99 |
100 |     # invalid engine argument
101 | from sklearn.utils._param_validation import InvalidParameterError
102 |
103 | with pytest.raises(InvalidParameterError):
104 | agg = Aggregation(engine="abc")
105 | agg.fit_transform(pd_df)
106 |
107 |     # invalid input data: fit expects a dataframe, not a numpy array
108 | with pytest.raises(AssertionError):
109 | agg = Aggregation()
110 | agg.fit(pd_df.values)
111 |
112 |     # invalid input data: transform expects a dataframe, not a numpy array
113 | with pytest.raises(AssertionError):
114 | agg = Aggregation().fit(pd_df)
115 | agg.transform(pd_df.values)
116 |
117 | def test_methods(pd_df):
118 | methods = Aggregation._parameter_constraints["method"][0].options
119 | for method in methods:
120 | out_pd = Aggregation(method=method).fit_transform(pd_df)
121 | out_pl = Aggregation(method=method, engine="polars").fit_transform(pd_df)
122 | pd.testing.assert_frame_equal(out_pd, out_pl, check_dtype=False)
123 |
124 | # pandas engine
125 | assert isinstance(out_pd, pd.DataFrame)
126 | assert out_pd.shape[0] == pd_df.shape[0]
127 |
128 | # polars engine
129 | assert isinstance(out_pl, pd.DataFrame)
130 | assert out_pl.shape[0] == pd_df.shape[0]
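131 |
132 |
133 | # Minimal usage sketch (not a test): Aggregation encodes event attributes per
134 | # case and preserves the number of rows, as asserted above. Only parameters
135 | # exercised in the tests are shown; `event_log` is a placeholder name.
136 | def _example_aggregation(event_log: pd.DataFrame) -> pd.DataFrame:
137 |     enc = Aggregation(method="sum")  # prefix_len and engine="polars" are also accepted
138 |     return enc.fit_transform(event_log)  # same number of rows as the input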
--------------------------------------------------------------------------------
/tests/sequence_encoding/test_bucketing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from skpm.sequence_encoding import Bucketing
5 | from skpm.config import EventLogConfig as elc
6 |
7 |
8 | @pytest.fixture(name="dummy_log")
9 | def get_dummy_log():
10 | return pd.DataFrame(
11 | {
12 | elc.case_id: np.random.randint(1, 10, 100),
13 | elc.timestamp: pd.date_range("2021-01-01", periods=100, freq="6h"),
14 | elc.activity: np.random.choice(["a", "b", "c"], 100),
15 | }
16 | ).sort_values(elc.timestamp)
17 |
18 |
19 | def test_single(dummy_log):
20 | bucketing = Bucketing(method="single")
21 | bucketing.fit(dummy_log)
22 | bucketing_values = bucketing.transform(dummy_log)
23 | assert isinstance(bucketing_values, pd.DataFrame)
24 | assert bucketing_values.shape == (len(dummy_log), 1)
25 | assert np.unique(bucketing_values) == "b1"
26 |
27 | bucketing = Bucketing().fit(dummy_log).transform(dummy_log)
28 | assert isinstance(bucketing, pd.DataFrame)
29 |
30 |
31 | def test_prefix(dummy_log):
32 | bucketing = Bucketing(method="prefix")
33 | bucketing.fit(dummy_log)
34 | bucketing_values = bucketing.transform(dummy_log)
35 | assert isinstance(bucketing_values, pd.DataFrame)
36 | assert bucketing_values.shape == (len(dummy_log), 1)
37 | assert isinstance(len(np.unique(bucketing_values)), int)
38 |
39 |
40 | def test_clustering_not_implemented(dummy_log):
41 | with pytest.raises(NotImplementedError):
42 | Bucketing(method="clustering").fit(dummy_log).transform(dummy_log)
43 |
44 |
45 | def test_invalid_method(dummy_log):
46 | with pytest.raises(AssertionError):
47 | Bucketing(method="invalid_method").fit(dummy_log)
48 |
49 |
50 | def test_output_feature_names():
51 | bucketing = Bucketing(method="single")
52 | feature_names = bucketing.get_feature_names_out()
53 | assert isinstance(feature_names, list)
54 | assert len(feature_names) == 1
55 | assert feature_names[0] == "bucket"
56 |
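57 |
58 | # Minimal usage sketch (not a test): Bucketing assigns every event to a bucket
59 | # column named "bucket"; method="single" puts all events in one bucket ("b1"),
60 | # method="prefix" derives prefix-based buckets. `event_log` is a placeholder.
61 | def _example_bucketing(event_log: pd.DataFrame) -> pd.DataFrame:
62 |     return Bucketing(method="prefix").fit(event_log).transform(event_log)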
--------------------------------------------------------------------------------
/tests/sequence_encoding/test_index.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | import pytest
3 | import numpy as np
4 | import pandas as pd
5 | from skpm.sequence_encoding import Indexing
6 | from skpm.config import EventLogConfig as elc
7 |
8 |
9 | @pytest.fixture(name="pd_df")
10 | def fixture_dummy_pd():
11 | return pd.DataFrame(
12 | {
13 | elc.case_id: np.repeat(np.arange(0, 10), 100),
14 | elc.activity: np.random.randint(0, 10, 1000),
15 | elc.resource: np.random.randint(0, 3, 1000),
16 | }
17 | )
18 |
19 | def test_indexing(pd_df):
20 | # Test default Indexing
21 | rp = Indexing(n=2, attributes=[elc.activity, elc.resource], fill_value=0)
22 | rp.fit(pd_df)
23 | out = rp.transform(pd_df)
24 | assert isinstance(out, pd.DataFrame)
25 | assert out.shape[0] == pd_df.shape[0]
26 |
27 | with pytest.raises(Exception):
28 | rp.transform(pd_df[[elc.activity, elc.resource]])
29 |
30 | rp = Indexing(n=2, attributes=elc.activity, fill_value=0)
31 | rp.fit(pd_df)
32 |
33 | rp = Indexing(n=2, attributes=None, fill_value=0)
34 | rp.fit(pd_df)
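35 |
36 |
37 | # Minimal usage sketch (not a test): index-based encoding with a fixed number
38 | # of positions per case for the chosen attributes. Only constructor arguments
39 | # exercised above are used; `event_log` is a placeholder for a frame with
40 | # elc.case_id plus the listed attributes.
41 | def _example_indexing(event_log: pd.DataFrame) -> pd.DataFrame:
42 |     enc = Indexing(n=2, attributes=[elc.activity, elc.resource], fill_value=0)
43 |     return enc.fit(event_log).transform(event_log)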
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raseidi/skpm/7c35b6bc3888cf64ac78210754438ad6429a869f/tests/utils/__init__.py
--------------------------------------------------------------------------------
/tests/utils/test_graph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from skpm.utils.graph import (
4 | frequency_matrix,
5 | node_degree,
6 | density,
7 | nodes_in_cycles,
8 | )
9 |
10 |
11 | @pytest.fixture
12 | def example_traces():
13 | return [[1, 2, 3], [1, 2, 3, 4]]
14 |
15 |
16 | @pytest.fixture
17 | def example_set_of_states():
18 | return {1, 2, 3, 4}
19 |
20 |
21 | @pytest.fixture
22 | def example_frequency_matrix():
23 | return np.array([[0, 2, 0, 0], [0, 0, 2, 0], [0, 0, 0, 1], [0, 0, 0, 0]])
24 |
25 |
26 | def test_frequency_matrix(
27 | example_traces, example_set_of_states, example_frequency_matrix
28 | ):
29 | freq_matrix, stoi, itos = frequency_matrix(
30 | example_traces, example_set_of_states
31 | )
32 | assert np.array_equal(freq_matrix, example_frequency_matrix)
33 | assert stoi == {1: 0, 2: 1, 3: 2, 4: 3}
34 | assert itos == {0: 1, 1: 2, 2: 3, 3: 4}
35 |
36 |
37 | @pytest.fixture
38 | def example_frequency_matrix_node_degree():
39 | return np.array([[0, 2, 0, 0], [0, 0, 2, 0], [0, 0, 0, 1], [0, 0, 0, 0]])
40 |
41 |
42 | def test_node_degree(example_frequency_matrix_node_degree):
43 | in_degree, out_degree = node_degree(example_frequency_matrix_node_degree)
44 | assert np.array_equal(in_degree, np.array([0, 2, 2, 1]))
45 | assert np.array_equal(out_degree, np.array([2, 2, 1, 0]))
46 |
47 |
48 | def test_density():
49 | graph = np.array([[0, 2, 0, 0], [0, 0, 2, 0], [0, 0, 0, 1], [0, 0, 0, 0]])
50 | assert density(graph) == 0.4166666666666667
51 |
52 |
53 | def test_nodes_in_cycles():
54 | graph = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]])
55 | assert nodes_in_cycles(graph, max_cycle_length=3) == [True, True, True]
56 |
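57 |
58 | # Minimal usage sketch (not a test): building a directly-follows frequency
59 | # matrix from traces and deriving simple graph statistics, using only the
60 | # functions imported above. `traces` is a list of activity sequences and
61 | # `states` the set of distinct activities, as in the fixtures.
62 | def _example_graph_features(traces, states):
63 |     freq, stoi, itos = frequency_matrix(traces, states)  # stoi/itos map states <-> indices
64 |     in_deg, out_deg = node_degree(freq)
65 |     return in_deg, out_deg, density(freq), nodes_in_cycles(freq, max_cycle_length=3)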
--------------------------------------------------------------------------------
/tests/utils/test_validation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import numpy as np
4 | from skpm.event_logs.download import download_url
5 | from skpm.event_logs.extract import extract_gz
6 | from skpm.utils import validation as v
7 |
8 |
9 | def test_validation():
10 | with pytest.raises(Exception):
11 | v.validate_columns(input_columns=[1, 2, 3], required=[4])
12 |
13 | out = v.ensure_list("exception")
14 | assert isinstance(out, list)
15 |
16 | out = v.ensure_list({1, 2, 3})
17 | assert isinstance(out, list)
18 |
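19 |
20 | # Minimal usage sketch (not a test): validate_columns raises when a required
21 | # column is missing from input_columns, and ensure_list normalises a scalar or
22 | # a set into a plain list, mirroring the assertions above.
23 | def _example_validation():
24 |     v.validate_columns(input_columns=["a", "b"], required=["a"])  # expected to pass: "a" is present
25 |     return v.ensure_list("a")  # returns a list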
--------------------------------------------------------------------------------