├── .gitignore
├── LICENSE
├── README.md
├── input
    └── BPI2020_DomesticDeclarations.csv
├── main.py
├── models
    ├── gat_model.py
    └── lstm_model.py
├── modules
    ├── data_preprocessing.py
    ├── process_mining.py
    └── rl_optimization.py
├── requirements.txt
└── visualization
    └── process_viz.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | venv/
25 | env/
26 | ENV/
27 | pm-venv/
28 | 
29 | # IDE
30 | .idea/
31 | .vscode/
32 | *.swp
33 | *.swo
34 | 
35 | # Data and Results
36 | results/
37 | *.pth
38 | *.png
39 | *.html
40 | 
41 | # Logs
42 | *.log
43 | experiment_results.txt
44 | 
45 | # OS
46 | .DS_Store
47 | .env
48 | .env.local 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2025 ERP.AI
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## Process Mining with Graph Neural Networks
  2 | 
  3 | An advanced implementation combining Graph Neural Networks, Deep Learning, and Process Mining techniques for business process analysis and prediction.
  4 | 
  5 | ## 1. Overview
  6 | 
  7 | This research project implements a novel approach to process mining using Graph Neural Networks (GNN) and deep learning techniques. The framework combines state-of-the-art machine learning models with traditional process mining methods to provide comprehensive process analysis and prediction capabilities.
  8 | 
  9 | ## 2. Authors
 10 | 
 11 | - **Somesh Misra** [@mathprobro](https://x.com/mathprobro)
 12 | - **Shashank Dixit** [@sphinx](https://x.com/protosphinx)
 13 | - **Research Group**: [ERP.AI](https://www.erp.ai) Research
 14 | 
 15 | ## 3. Key Components
 16 | 
 17 | 1. **Process Analysis**
 18 | - Advanced bottleneck detection using temporal analysis
 19 | - Conformance checking with inductive mining
 20 | - Cycle time analysis and prediction
 21 | - Transition pattern discovery
 22 | - Spectral clustering for process segmentation
 23 | 
 24 | 2. **Machine Learning Models**
 25 | - Graph Attention Networks (GAT) for structural learning
 26 | - LSTM networks for temporal dependencies
 27 | - Reinforcement Learning for process optimization
 28 | - Custom neural architectures for process prediction
 29 | 
 30 | 3. **Visualization Suite**
 31 | - Interactive process flow visualization
 32 | - Temporal pattern analysis
 33 | - Performance bottleneck identification
 34 | - Resource utilization patterns
 35 | - Custom process metrics
 36 | 
 37 | ## 4. Technical Architecture
 38 | 
 39 | ```
 40 | GNN/
 41 | ├── input/                # input files
 42 | ├── models/
 43 | │   ├── gat_model.py      # Graph Attention Network implementation
 44 | │   └── lstm_model.py     # LSTM sequence model
 45 | ├── modules/
 46 | │   ├── data_preprocessing.py  # Data handling and feature engineering
 47 | │   ├── process_mining.py     # Core process mining functions
 48 | │   └── rl_optimization.py    # Reinforcement learning components
 49 | ├── visualization/
 50 | │   └── process_viz.py        # Visualization toolkit
 51 | └── main.py                   # Main execution script
 52 | ```
 53 | 
 54 | ## 5. Technical Requirements
 55 | 
 56 | - Python 3.8+
 57 | - PyTorch 1.9+
 58 | - PyTorch Geometric
 59 | - PM4Py
 60 | - NetworkX
 61 | - Additional dependencies in requirements.txt
 62 | 
 63 | ## 6. Installation
 64 | 
 65 | 1. Clone the repository:
 66 | ```bash
 67 | git clone https://github.com/ERPdotAI/GNN.git
 68 | cd GNN
 69 | ```
 70 | 
 71 | 2. Install dependencies:
 72 | ```bash
 73 | pip install -r requirements.txt
 74 | ```
 75 | 
 76 | ## 7. Data Requirements
 77 | 
 78 | The system expects process event logs in CSV format with the following structure:
 79 | - case_id: Process instance identifier
 80 | - task_name: Activity name
 81 | - timestamp: Activity timestamp
 82 | - resource: Resource identifier
 83 | - amount: Numerical attribute (if applicable)
 84 | 
 85 | ## 8. Usage
 86 | 
 87 | ```bash
 88 | python main.py <input-file-path>
 89 | ```
 90 | 
 91 | Results are stored in timestamped directories under `results/` with the following structure:
 92 | ```
 93 | results/run_timestamp/
 94 | ├── models/          # Trained model weights
 95 | ├── visualizations/  # Generated visualizations
 96 | ├── metrics/         # Performance metrics
 97 | ├── analysis/        # Detailed analysis results
 98 | └── policies/        # Learned optimization policies
 99 | ```
100 | 
101 | ## 9. Technical Details
102 | 
103 | Graph Neural Network Architecture
104 | - Multi-head attention mechanisms
105 | - Dynamic graph construction
106 | - Adaptive feature learning
107 | - Custom loss functions for process-specific metrics
108 | 
109 | LSTM Implementation
110 | - Bidirectional sequence modeling
111 | - Variable-length sequence handling
112 | - Custom embedding layer for process activities
113 | 
114 | Process Mining Components
115 | - Inductive miner implementation
116 | - Token-based replay
117 | - Custom conformance checking metrics
118 | - Advanced bottleneck detection algorithms
119 | 
120 | Reinforcement Learning
121 | - Custom environment for process optimization
122 | - State-action space modeling
123 | - Policy gradient methods
124 | - Resource allocation optimization
125 | 
126 | ## 10. Contributing
127 | 
128 | We welcome contributions from the research community. Please follow these steps:
129 | 
130 | 1. Fork the repository
131 | 2. Create a feature branch
132 | 3. Implement your changes
133 | 4. Submit a pull request with detailed documentation
134 | 
135 | ## 11. Citation
136 | 
137 | If you use this code in your research, please cite:
138 | 
139 | ```bibtex
140 | @software{GNN_ProcessMining,
141 |   author = {Shashank Dixit and Somesh Misra},
142 |   title = {Process Mining with Graph Neural Networks},
143 |   year = {2025},
144 |   publisher = {ERP.AI},
145 |   url = {https://github.com/ERPdotAI/GNN}
146 | }
147 | ``` 
148 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Main script for enhanced process mining with GNN, LSTM, and RL
  6 | """
  7 | 
  8 | import os
  9 | import torch
 10 | import random
 11 | import numpy as np
 12 | from torch_geometric.loader import DataLoader
 13 | from datetime import datetime
 14 | import json
 15 | import sys
 16 | import shutil
 17 | 
 18 | # Set random seeds
 19 | random.seed(42)
 20 | np.random.seed(42)
 21 | torch.manual_seed(42)
 22 | 
 23 | # Setup device
 24 | if torch.cuda.is_available():
 25 |     device = torch.device("cuda")
 26 | elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
 27 |     device = torch.device("mps")
 28 | else:
 29 |     device = torch.device("cpu")
 30 | print("Using device:", device)
 31 | 
 32 | # Import local modules
 33 | from modules.data_preprocessing import (
 34 |     load_and_preprocess_data,
 35 |     create_feature_representation,
 36 |     build_graph_data,
 37 |     compute_class_weights
 38 | )
 39 | from models.gat_model import (
 40 |     NextTaskGAT,
 41 |     train_gat_model,
 42 |     evaluate_gat_model
 43 | )
 44 | from models.lstm_model import (
 45 |     NextActivityLSTM,
 46 |     prepare_sequence_data,
 47 |     make_padded_dataset,
 48 |     train_lstm_model,
 49 |     evaluate_lstm_model
 50 | )
 51 | from modules.process_mining import (
 52 |     analyze_bottlenecks,
 53 |     analyze_cycle_times,
 54 |     analyze_rare_transitions,
 55 |     perform_conformance_checking,
 56 |     analyze_transition_patterns,
 57 |     spectral_cluster_graph,
 58 |     build_task_adjacency
 59 | )
 60 | from modules.rl_optimization import (
 61 |     ProcessEnv,
 62 |     run_q_learning,
 63 |     get_optimal_policy
 64 | )
 65 | from visualization.process_viz import (
 66 |     plot_confusion_matrix,
 67 |     plot_embeddings,
 68 |     plot_cycle_time_distribution,
 69 |     plot_process_flow,
 70 |     plot_transition_heatmap,
 71 |     create_sankey_diagram
 72 | )
 73 | 
 74 | def setup_results_dir():
 75 |     """Create timestamped results directory structure"""
 76 |     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 77 |     # Use absolute path
 78 |     script_dir = os.path.dirname(os.path.abspath(__file__))
 79 |     base_dir = os.path.join(script_dir, "results")
 80 |     run_dir = os.path.join(base_dir, f"run_{timestamp}")
 81 |     
 82 |     # Create subdirectories
 83 |     subdirs = [
 84 |         "models",          # For saved model weights
 85 |         "visualizations",  # For all plots and diagrams
 86 |         "metrics",        # For performance metrics
 87 |         "analysis",       # For process mining analysis results
 88 |         "policies"        # For RL policies
 89 |     ]
 90 |     
 91 |     for subdir in subdirs:
 92 |         os.makedirs(os.path.join(run_dir, subdir), exist_ok=True)
 93 |     
 94 |     return run_dir
 95 | 
 96 | def save_metrics(metrics_dict, run_dir, filename):
 97 |     """Save metrics to JSON file"""
 98 |     filepath = os.path.join(run_dir, "metrics", filename)
 99 |     with open(filepath, 'w') as f:
100 |         json.dump(metrics_dict, f, indent=4)
101 | 
def main():
    """
    Run the full pipeline on the event log whose path is given as the first
    command line argument.

    Stages, in order: preprocessing, graph construction, GAT training and
    evaluation, LSTM training, process mining analysis, visualizations,
    spectral clustering, and Q-learning. Metrics, model weights and plots are
    written beneath the run directory returned by setup_results_dir().

    Returns early (after printing an error) if the dataset path does not exist.

    Raises:
        ValueError: if no dataset path is supplied on the command line.
    """
    # Validate the command line before creating any output directories, so a
    # bad invocation does not leave an empty results/run_* tree behind.
    if len(sys.argv) < 2:
        raise ValueError("Error: Missing dataset path. Please provide the path to the dataset as a command line argument.")
    data_path = sys.argv[1]
    if not os.path.exists(data_path):
        print(f"Error: dataset not found at {data_path}")
        return

    # Create results directory
    run_dir = setup_results_dir()
    print(f"Results will be saved in: {run_dir}")

    # 1. Load and preprocess data
    print("\n1. Loading and preprocessing data...")
    df = load_and_preprocess_data(data_path)
    df, le_task, le_resource = create_feature_representation(df, use_norm_features=True)

    # Save preprocessing info
    preproc_info = {
        "num_tasks": len(le_task.classes_),
        "num_resources": len(le_resource.classes_),
        "num_cases": df["case_id"].nunique(),
        "date_range": [str(df["timestamp"].min()), str(df["timestamp"].max())]
    }
    save_metrics(preproc_info, run_dir, "preprocessing_info.json")

    # 2. Build graph data
    print("\n2. Building graph data...")
    graphs = build_graph_data(df)
    # Ordered 80/20 split: the first 80% of graphs train, the remainder validate
    train_size = int(len(graphs)*0.8)
    train_graphs = graphs[:train_size]
    val_graphs = graphs[train_size:]

    train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_graphs, batch_size=32, shuffle=False)

    # 3. Train GAT model
    print("\n3. Training GAT model...")
    num_classes = len(le_task.classes_)
    class_weights = compute_class_weights(df, num_classes).to(device)

    # NOTE(review): input_dim=5 is hard-coded; presumably it matches the node
    # feature width produced by build_graph_data -- confirm.
    gat_model = NextTaskGAT(5, 64, num_classes, num_layers=2, heads=4, dropout=0.5).to(device)
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.AdamW(gat_model.parameters(), lr=0.0005, weight_decay=5e-4)

    gat_model_path = os.path.join(run_dir, "models", "best_gnn_model.pth")
    gat_model = train_gat_model(
        gat_model, train_loader, val_loader,
        criterion, optimizer, device,
        num_epochs=20, model_path=gat_model_path
    )

    # 4. Evaluate GAT model
    print("\n4. Evaluating GAT model...")
    y_true, y_pred, y_prob = evaluate_gat_model(gat_model, val_loader, device)
    plot_confusion_matrix(
        y_true, y_pred, le_task.classes_,
        os.path.join(run_dir, "visualizations", "gat_confusion_matrix.png")
    )

    # Save GAT metrics
    from sklearn.metrics import accuracy_score, matthews_corrcoef
    gat_metrics = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "mcc": float(matthews_corrcoef(y_true, y_pred))
    }
    save_metrics(gat_metrics, run_dir, "gat_metrics.json")

    # 5. Train LSTM model
    print("\n5. Training LSTM model...")
    train_seq, test_seq = prepare_sequence_data(df)
    X_train_pad, X_train_len, y_train_lstm, _ = make_padded_dataset(train_seq, num_classes)
    # NOTE(review): the test split is padded here but never evaluated in this
    # function (evaluate_lstm_model is imported but not called).
    X_test_pad, X_test_len, y_test_lstm, _ = make_padded_dataset(test_seq, num_classes)

    lstm_model = NextActivityLSTM(num_classes, emb_dim=64, hidden_dim=64, num_layers=1).to(device)
    lstm_model_path = os.path.join(run_dir, "models", "lstm_next_activity.pth")
    lstm_model = train_lstm_model(
        lstm_model, X_train_pad, X_train_len, y_train_lstm,
        device, batch_size=64, epochs=5, model_path=lstm_model_path
    )

    # 6. Process Mining Analysis
    print("\n6. Performing process mining analysis...")
    bottleneck_stats, significant_bottlenecks = analyze_bottlenecks(df)
    case_merged, long_cases, cut95 = analyze_cycle_times(df)
    rare_trans = analyze_rare_transitions(bottleneck_stats)
    replayed, n_deviant = perform_conformance_checking(df)

    # Save process mining analysis results
    process_analysis = {
        "num_long_cases": len(long_cases),
        "cycle_time_95th_percentile": float(cut95),
        "num_rare_transitions": len(rare_trans),
        "num_deviant_traces": n_deviant,
        "total_traces": len(replayed)
    }
    save_metrics(process_analysis, run_dir, "process_analysis.json")

    print(f"Found {len(long_cases)} long-running cases above 95th percentile (> {cut95:.1f}h)")
    print(f"Found {len(rare_trans)} rare transitions")
    print(f"Conformance Checking: {n_deviant} deviant traces out of {len(replayed)}")

    # 7. Visualizations
    print("\n7. Creating visualizations...")
    viz_dir = os.path.join(run_dir, "visualizations")
    plot_cycle_time_distribution(
        case_merged["duration_h"].values,
        os.path.join(viz_dir, "cycle_time_distribution.png")
    )
    plot_process_flow(
        bottleneck_stats, le_task, significant_bottlenecks.head(),
        os.path.join(viz_dir, "process_flow_bottlenecks.png")
    )

    # Get transition patterns first
    transitions, trans_count, prob_matrix = analyze_transition_patterns(df)
    plot_transition_heatmap(
        transitions, le_task,
        os.path.join(viz_dir, "transition_probability_heatmap.png")
    )
    create_sankey_diagram(
        transitions, le_task,
        os.path.join(viz_dir, "process_flow_sankey.html")
    )

    # 8. Spectral Clustering
    print("\n8. Performing spectral clustering...")
    adj_matrix = build_task_adjacency(df, num_classes)
    cluster_labels = spectral_cluster_graph(adj_matrix, k=3)

    # Save clustering results, keyed by the original (decoded) task name
    clustering_results = {
        "task_clusters": {
            le_task.inverse_transform([t_id])[0]: int(lbl)
            for t_id, lbl in enumerate(cluster_labels)
        }
    }
    save_metrics(clustering_results, run_dir, "clustering_results.json")

    print("Spectral clustering results:")
    for t_id, lbl in enumerate(cluster_labels):
        t_name = le_task.inverse_transform([t_id])[0]
        print(f" Task={t_name} => cluster {lbl}")

    # 9. Reinforcement Learning
    print("\n9. Training RL agent...")
    dummy_resources = [0, 1]  # Example with 2 resources
    env = ProcessEnv(df, le_task, dummy_resources)
    q_table = run_q_learning(env, episodes=30)

    # Get optimal policy over every (task, resource) pair
    all_actions = [(t, r) for t in env.all_tasks for r in env.resources]
    policy = get_optimal_policy(q_table, all_actions)

    # Save RL results; states are stringified because JSON keys must be strings
    rl_results = {
        "num_states": len(policy),
        "num_actions": len(all_actions),
        "policy": {
            str(state): {"task": int(action[0]), "resource": int(action[1])}
            for state, action in policy.items()
        }
    }
    save_metrics(rl_results, run_dir, "rl_results.json")

    print(f"Learned policy for {len(policy)} states")
    print(f"\nDone! Results saved in {run_dir}")
269 | 
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/models/gat_model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Graph Attention Network (GAT) model for process mining
  6 | """
  7 | 
  8 | import torch
  9 | import torch.nn as nn
 10 | from torch_geometric.nn import GATConv, global_mean_pool
 11 | 
 12 | class NextTaskGAT(nn.Module):
 13 |     """
 14 |     Graph Attention Network for next task prediction
 15 |     """
 16 |     def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, heads=4, dropout=0.5):
 17 |         super().__init__()
 18 |         self.convs = nn.ModuleList()
 19 |         self.convs.append(GATConv(input_dim, hidden_dim, heads=heads, concat=True))
 20 |         for _ in range(num_layers-1):
 21 |             self.convs.append(GATConv(hidden_dim*heads, hidden_dim, heads=heads, concat=True))
 22 |         self.fc = nn.Linear(hidden_dim*heads, output_dim)
 23 |         self.dropout = dropout
 24 | 
 25 |     def forward(self, x, edge_index, batch):
 26 |         for conv in self.convs:
 27 |             x = conv(x, edge_index)
 28 |             x = torch.nn.functional.elu(x)
 29 |             x = torch.nn.functional.dropout(x, p=self.dropout, training=self.training)
 30 |         x = global_mean_pool(x, batch)
 31 |         return self.fc(x)
 32 | 
 33 | def train_gat_model(model, train_loader, val_loader, criterion, optimizer, 
 34 |                    device, num_epochs=20, model_path="best_gnn_model.pth"):
 35 |     """
 36 |     Train the GAT model
 37 |     """
 38 |     best_val_loss = float('inf')
 39 |     
 40 |     for epoch in range(1, num_epochs+1):
 41 |         model.train()
 42 |         total_loss = 0.0
 43 |         for batch_data in train_loader:
 44 |             out = model(batch_data.x.to(device),
 45 |                        batch_data.edge_index.to(device),
 46 |                        batch_data.batch.to(device))
 47 |             graph_labels = compute_graph_label(batch_data.y, batch_data.batch).to(device, dtype=torch.long)
 48 |             loss = criterion(out, graph_labels)
 49 | 
 50 |             optimizer.zero_grad()
 51 |             loss.backward()
 52 |             optimizer.step()
 53 |             total_loss += loss.item()
 54 |         avg_train_loss = total_loss / len(train_loader)
 55 | 
 56 |         # Validation
 57 |         model.eval()
 58 |         val_loss = 0.0
 59 |         with torch.no_grad():
 60 |             for batch_data in val_loader:
 61 |                 out = model(batch_data.x.to(device),
 62 |                           batch_data.edge_index.to(device),
 63 |                           batch_data.batch.to(device))
 64 |                 glabels = compute_graph_label(batch_data.y, batch_data.batch).to(device, dtype=torch.long)
 65 |                 val_loss += criterion(out, glabels).item()
 66 |         avg_val_loss = val_loss/len(val_loader)
 67 |         
 68 |         print(f"[Epoch {epoch}/{num_epochs}] train_loss={avg_train_loss:.4f}, val_loss={avg_val_loss:.4f}")
 69 |         
 70 |         if avg_val_loss < best_val_loss:
 71 |             best_val_loss = avg_val_loss
 72 |             torch.save(model.state_dict(), model_path)
 73 |             print(f"  Saved best model (val_loss={best_val_loss:.4f})")
 74 |     
 75 |     return model
 76 | 
 77 | def compute_graph_label(y, batch):
 78 |     """
 79 |     Compute graph-level labels (MPS-compatible)
 80 |     """
 81 |     unique_batches = batch.unique()
 82 |     labels_out = []
 83 |     for bidx in unique_batches:
 84 |         mask = (batch==bidx)
 85 |         yvals_cpu = y[mask].detach().cpu()
 86 |         vals, counts = torch.unique(yvals_cpu, return_counts=True)
 87 |         lbl = vals[torch.argmax(counts)]
 88 |         labels_out.append(lbl)
 89 |     return torch.stack(labels_out)
 90 | 
 91 | def evaluate_gat_model(model, val_loader, device):
 92 |     """
 93 |     Evaluate GAT model and return predictions and probabilities
 94 |     """
 95 |     model.eval()
 96 |     y_true_all, y_pred_all, y_prob_all = [], [], []
 97 |     
 98 |     with torch.no_grad():
 99 |         for batch_data in val_loader:
100 |             logits = model(batch_data.x.to(device),
101 |                          batch_data.edge_index.to(device),
102 |                          batch_data.batch.to(device))
103 |             probs = torch.softmax(logits, dim=1).cpu().numpy()
104 |             glabels = compute_graph_label(batch_data.y, batch_data.batch)
105 |             
106 |             for i in range(logits.size(0)):
107 |                 y_pred_all.append(int(torch.argmax(logits[i]).cpu()))
108 |                 y_prob_all.append(probs[i])
109 |                 y_true_all.append(int(glabels[i]))
110 |     
111 |     return (
112 |         torch.tensor(y_true_all),
113 |         torch.tensor(y_pred_all),
114 |         torch.tensor(y_prob_all)
115 |     ) 


--------------------------------------------------------------------------------
/models/lstm_model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | LSTM model for next activity prediction in process mining
  6 | """
  7 | 
  8 | import torch
  9 | import torch.nn as nn
 10 | import torch.nn.functional as F
 11 | import numpy as np
 12 | import random
 13 | 
 14 | class NextActivityLSTM(nn.Module):
 15 |     """
 16 |     LSTM model for next activity prediction
 17 |     """
 18 |     def __init__(self, num_cls, emb_dim=64, hidden_dim=64, num_layers=1):
 19 |         super().__init__()
 20 |         self.emb = nn.Embedding(num_cls+1, emb_dim, padding_idx=0)
 21 |         self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers, batch_first=True)
 22 |         self.fc = nn.Linear(hidden_dim, num_cls)
 23 | 
 24 |     def forward(self, x, seq_len):
 25 |         seq_len_sorted, perm_idx = seq_len.sort(0, descending=True)
 26 |         x_sorted = x[perm_idx]
 27 |         x_emb = self.emb(x_sorted)
 28 |         packed = nn.utils.rnn.pack_padded_sequence(
 29 |             x_emb, seq_len_sorted.cpu(), batch_first=True, enforce_sorted=True
 30 |         )
 31 |         out_packed, (h_n, c_n) = self.lstm(packed)
 32 |         last_hidden = h_n[-1]
 33 |         _, unperm_idx = perm_idx.sort(0)
 34 |         last_hidden = last_hidden[unperm_idx]
 35 |         logits = self.fc(last_hidden)
 36 |         return logits
 37 | 
 38 | def prepare_sequence_data(df, max_len=None):
 39 |     """
 40 |     Prepare sequence data for LSTM training
 41 |     """
 42 |     prefix_samples = []
 43 |     for cid, cdata in df.groupby("case_id"):
 44 |         cdata = cdata.sort_values("timestamp")
 45 |         tasks_list = cdata["task_id"].tolist()
 46 |         for i in range(1, len(tasks_list)):
 47 |             prefix = tasks_list[:i]
 48 |             label = tasks_list[i]
 49 |             prefix_samples.append((prefix, label))
 50 |     
 51 |     random.shuffle(prefix_samples)
 52 |     split_idx = int(0.8*len(prefix_samples))
 53 |     train_seq = prefix_samples[:split_idx]
 54 |     test_seq = prefix_samples[split_idx:]
 55 |     
 56 |     return train_seq, test_seq
 57 | 
 58 | def make_padded_dataset(sample_list, num_cls):
 59 |     """
 60 |     Convert sequence data to padded tensor format
 61 |     """
 62 |     max_len = max(len(s[0]) for s in sample_list)
 63 |     X_padded, X_lens, Y_labels = [], [], []
 64 |     
 65 |     for (pfx, nxt) in sample_list:
 66 |         seqlen = len(pfx)
 67 |         X_lens.append(seqlen)
 68 |         seq = [(tid+1) for tid in pfx]  # shift for pad=0
 69 |         pad_len = max_len - seqlen
 70 |         seq += [0]*pad_len
 71 |         X_padded.append(seq)
 72 |         Y_labels.append(nxt)
 73 |     
 74 |     return (
 75 |         torch.tensor(X_padded, dtype=torch.long),
 76 |         torch.tensor(X_lens, dtype=torch.long),
 77 |         torch.tensor(Y_labels, dtype=torch.long),
 78 |         max_len
 79 |     )
 80 | 
 81 | def train_lstm_model(model, X_train_pad, X_train_len, y_train, 
 82 |                     device, batch_size=64, epochs=5, 
 83 |                     model_path="lstm_next_activity.pth"):
 84 |     """
 85 |     Train the LSTM model
 86 |     """
 87 |     loss_fn = nn.CrossEntropyLoss()
 88 |     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
 89 |     dataset_size = X_train_pad.size(0)
 90 |     
 91 |     for ep in range(1, epochs+1):
 92 |         model.train()
 93 |         indices = np.random.permutation(dataset_size)
 94 |         total_loss = 0.0
 95 |         
 96 |         for start in range(0, dataset_size, batch_size):
 97 |             end = min(start+batch_size, dataset_size)
 98 |             idx = indices[start:end]
 99 |             
100 |             bx = X_train_pad[idx].to(device)
101 |             blen = X_train_len[idx].to(device)
102 |             by = y_train[idx].to(device)
103 |             
104 |             optimizer.zero_grad()
105 |             out = model(bx, blen)
106 |             lval = loss_fn(out, by)
107 |             lval.backward()
108 |             optimizer.step()
109 |             total_loss += lval.item()
110 |             
111 |         avg_loss = total_loss/((dataset_size + batch_size - 1)//batch_size)
112 |         print(f"[LSTM Ep {ep}/{epochs}] Loss={avg_loss:.4f}")
113 |     
114 |     torch.save(model.state_dict(), model_path)
115 |     return model
116 | 
def evaluate_lstm_model(model, X_test_pad, X_test_len, batch_size, device):
    """
    Run the model over the test set in order and return predictions.

    Returns:
        ``(preds, probs)`` as numpy arrays: the arg-max class per sample and
        the row-wise softmax of the logits.
    """
    model.eval()
    n_test = X_test_pad.size(0)

    with torch.no_grad():
        logit_chunks = [
            model(
                X_test_pad[lo:lo + batch_size].to(device),
                X_test_len[lo:lo + batch_size].to(device),
            ).cpu().numpy()
            for lo in range(0, n_test, batch_size)
        ]

    logits_arr = np.concatenate(logit_chunks, axis=0)

    # Subtracting the row maximum before exponentiating keeps the softmax
    # numerically stable.
    shifted = logits_arr - np.max(logits_arr, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    preds = np.argmax(logits_arr, axis=1)

    return preds, probs


--------------------------------------------------------------------------------
/modules/data_preprocessing.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Data preprocessing module for process mining
  6 | Handles data loading, cleaning, and feature engineering
  7 | """
  8 | 
  9 | import pandas as pd
 10 | import numpy as np
 11 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler, Normalizer
 12 | import torch
 13 | from torch_geometric.data import Data
 14 | 
 15 | def load_and_preprocess_data(data_path, required_cols=None):
 16 |     """Load and preprocess the event log data"""
 17 |     if required_cols is None:
 18 |         required_cols = ["case_id", "task_name", "timestamp", "resource", "amount"]
 19 |         
 20 |     df = pd.read_csv(data_path)
 21 |     df.rename(columns={
 22 |         "case:id": "case_id",
 23 |         "concept:name": "task_name",
 24 |         "time:timestamp": "timestamp",
 25 |         "org:resource": "resource",
 26 |         "case:Amount": "amount"
 27 |     }, inplace=True, errors="ignore")
 28 | 
 29 |     # Validate required columns
 30 |     for c in required_cols:
 31 |         if c not in df.columns:
 32 |             raise ValueError(f"Missing '{c}' in CSV. Found cols: {df.columns.tolist()}")
 33 | 
 34 |     # Process timestamps
 35 |     df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
 36 |     df.dropna(subset=["timestamp"], inplace=True)
 37 |     df.sort_values(["case_id","timestamp"], inplace=True)
 38 | 
 39 |     return df
 40 | 
 41 | def create_feature_representation(df, use_norm_features=True):
 42 |     """Create scaled or normalized feature representation"""
 43 |     # Time features
 44 |     df["day_of_week"] = df["timestamp"].dt.dayofweek
 45 |     df["hour_of_day"] = df["timestamp"].dt.hour
 46 | 
 47 |     # Encode tasks and resources
 48 |     le_task = LabelEncoder()
 49 |     le_resource = LabelEncoder()
 50 |     
 51 |     df["task_id"] = le_task.fit_transform(df["task_name"])
 52 |     df["resource_id"] = le_resource.fit_transform(df["resource"])
 53 | 
 54 |     # Next task
 55 |     df["next_task"] = df.groupby("case_id")["task_id"].shift(-1)
 56 |     df.dropna(subset=["next_task"], inplace=True)
 57 |     df["next_task"] = df["next_task"].astype(int)
 58 | 
 59 |     # Feature scaling
 60 |     feature_cols = ["task_id", "resource_id", "amount", "day_of_week", "hour_of_day"]
 61 |     raw_features = df[feature_cols].values
 62 | 
 63 |     scaler = MinMaxScaler()
 64 |     features_scaled = scaler.fit_transform(raw_features)
 65 | 
 66 |     normalizer = Normalizer(norm='l2')
 67 |     features_normed = normalizer.fit_transform(raw_features)
 68 | 
 69 |     # Choose feature representation
 70 |     combined_features = features_normed if use_norm_features else features_scaled
 71 | 
 72 |     # Add features back to dataframe
 73 |     df["feat_task_id"] = combined_features[:,0]
 74 |     df["feat_resource_id"] = combined_features[:,1]
 75 |     df["feat_amount"] = combined_features[:,2]
 76 |     df["feat_day_of_week"] = combined_features[:,3]
 77 |     df["feat_hour_of_day"] = combined_features[:,4]
 78 | 
 79 |     return df, le_task, le_resource
 80 | 
 81 | def build_graph_data(df):
 82 |     """Convert preprocessed data into graph format for GNN"""
 83 |     graphs = []
 84 |     for cid, cdata in df.groupby("case_id"):
 85 |         cdata.sort_values("timestamp", inplace=True)
 86 | 
 87 |         x_data = torch.tensor(cdata[[
 88 |             "feat_task_id","feat_resource_id","feat_amount",
 89 |             "feat_day_of_week","feat_hour_of_day"
 90 |         ]].values, dtype=torch.float)
 91 | 
 92 |         n_nodes = len(cdata)
 93 |         if n_nodes > 1:
 94 |             src = list(range(n_nodes-1))
 95 |             tgt = list(range(1,n_nodes))
 96 |             edge_index = torch.tensor([src+tgt, tgt+src], dtype=torch.long)
 97 |         else:
 98 |             edge_index = torch.empty((2,0), dtype=torch.long)
 99 |             
100 |         y_data = torch.tensor(cdata["next_task"].values, dtype=torch.long)
101 |         data_obj = Data(x=x_data, edge_index=edge_index, y=y_data)
102 |         graphs.append(data_obj)
103 | 
104 |     return graphs
105 | 
def compute_class_weights(df, num_classes):
    """
    Return a float32 weight tensor of length ``num_classes``.

    Classes that occur in ``df["next_task"]`` receive scikit-learn's
    "balanced" weight; classes that never occur keep a weight of 1.
    """
    from sklearn.utils.class_weight import compute_class_weight

    labels = df["next_task"].values
    weights = np.ones(num_classes, dtype=np.float32)
    observed = np.unique(labels)
    balanced = compute_class_weight("balanced", classes=observed, y=labels)
    # ``observed`` holds distinct class ids, so one indexed assignment
    # writes each balanced weight to its own slot.
    weights[observed] = balanced
    return torch.tensor(weights, dtype=torch.float32)


--------------------------------------------------------------------------------
/modules/process_mining.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Process Mining Analysis Module
  6 | Includes bottleneck analysis, conformance checking, and cycle time analysis
  7 | """
  8 | 
  9 | import pandas as pd
 10 | import numpy as np
 11 | from pm4py.objects.log.util import dataframe_utils
 12 | from pm4py.objects.conversion.log import converter as log_converter
 13 | from pm4py.algo.discovery.inductive import algorithm as inductive_miner
 14 | from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
 15 | 
 16 | def analyze_bottlenecks(df, freq_threshold=5):
 17 |     """
 18 |     Analyze process bottlenecks based on waiting times between activities
 19 |     """
 20 |     df = df.copy()
 21 |     df["next_task_id"] = df.groupby("case_id")["task_id"].shift(-1)
 22 |     df["next_timestamp"] = df.groupby("case_id")["timestamp"].shift(-1)
 23 |     transitions = df.dropna(subset=["next_task_id"]).copy()
 24 |     transitions["wait_sec"] = (transitions["next_timestamp"] - transitions["timestamp"]).dt.total_seconds()
 25 |     
 26 |     bottleneck_stats = transitions.groupby(["task_id","next_task_id"])["wait_sec"].agg([
 27 |         "mean","count"
 28 |     ]).reset_index()
 29 |     
 30 |     bottleneck_stats["mean_hours"] = bottleneck_stats["mean"]/3600.0
 31 |     bottleneck_stats.sort_values("mean_hours", ascending=False, inplace=True)
 32 |     
 33 |     # Filter by frequency threshold
 34 |     significant_bottlenecks = bottleneck_stats[bottleneck_stats["count"] >= freq_threshold]
 35 |     
 36 |     return bottleneck_stats, significant_bottlenecks
 37 | 
 38 | def analyze_cycle_times(df):
 39 |     """
 40 |     Analyze process cycle times
 41 |     """
 42 |     case_grouped = df.groupby("case_id")["timestamp"].agg(["min","max"])
 43 |     case_grouped["cycle_time_hours"] = (
 44 |         case_grouped["max"] - case_grouped["min"]
 45 |     ).dt.total_seconds()/3600.0
 46 |     case_grouped.reset_index(inplace=True)
 47 |     
 48 |     df_feats = df.groupby("case_id").agg({
 49 |         "amount": "mean",
 50 |         "task_id": "count"
 51 |     }).rename(columns={
 52 |         "amount": "mean_amount",
 53 |         "task_id": "num_events"
 54 |     }).reset_index()
 55 |     
 56 |     case_merged = pd.merge(case_grouped, df_feats, on="case_id", how="left")
 57 |     case_merged["duration_h"] = case_merged["cycle_time_hours"]
 58 |     
 59 |     # Identify long-running cases (95th percentile)
 60 |     cut95 = case_merged["duration_h"].quantile(0.95)
 61 |     long_cases = case_merged[case_merged["duration_h"] > cut95]
 62 |     
 63 |     return case_merged, long_cases, cut95
 64 | 
 65 | def analyze_rare_transitions(bottleneck_stats, rare_threshold=2):
 66 |     """
 67 |     Identify rare transitions in the process
 68 |     """
 69 |     rare_trans = bottleneck_stats[bottleneck_stats["count"] <= rare_threshold]
 70 |     return rare_trans
 71 | 
 72 | def perform_conformance_checking(df):
 73 |     """
 74 |     Perform conformance checking using inductive miner and token replay
 75 |     """
 76 |     df_pm = df[["case_id","task_name","timestamp"]].rename(columns={
 77 |         "case_id": "case:concept:name",
 78 |         "task_name": "concept:name",
 79 |         "timestamp": "time:timestamp"
 80 |     })
 81 |     
 82 |     df_pm = dataframe_utils.convert_timestamp_columns_in_df(df_pm)
 83 |     event_log = log_converter.apply(df_pm)
 84 |     
 85 |     process_tree = inductive_miner.apply(event_log)
 86 |     from pm4py.objects.conversion.process_tree import converter as pt_converter
 87 |     net, im, fm = pt_converter.apply(process_tree)
 88 |     
 89 |     replayed = token_replay.apply(event_log, net, im, fm)
 90 |     n_deviant = sum(1 for t in replayed if not t["trace_is_fit"])
 91 |     
 92 |     return replayed, n_deviant
 93 | 
 94 | def analyze_transition_patterns(df):
 95 |     """
 96 |     Analyze transition patterns and compute transition matrix
 97 |     """
 98 |     transitions = df.copy()
 99 |     transitions["next_task_id"] = transitions.groupby("case_id")["task_id"].shift(-1)
100 |     trans_count = transitions.groupby(["task_id","next_task_id"]).size().unstack(fill_value=0)
101 |     prob_matrix = trans_count.div(trans_count.sum(axis=1), axis=0)
102 |     
103 |     return transitions, trans_count, prob_matrix
104 | 
def spectral_cluster_graph(adj_matrix, k=2):
    """
    Perform spectral clustering on process graph

    The unnormalized Laplacian ``L = D - A`` is built from the row sums of
    ``adj_matrix`` and its eigenvectors are ordered by eigenvalue.

    Args:
        adj_matrix: square adjacency matrix (array-like).
        k: number of clusters, at least 2.

    Returns:
        Array of integer cluster labels, one per node. For ``k == 2`` nodes
        are split by the sign of the second eigenvector; otherwise KMeans is
        run on eigenvectors 1..k-1.

    Raises:
        ValueError: if ``k`` is smaller than 2.

    Note: only the real part of the eigenvectors is used, which matters when
    ``adj_matrix`` is not symmetric.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")

    degrees = np.sum(adj_matrix, axis=1)
    laplacian = np.diag(degrees) - adj_matrix  # unnormalized Laplacian

    eigenvals, eigenvecs = np.linalg.eig(laplacian)
    order = np.argsort(eigenvals)
    eigenvecs = eigenvecs[:, order]

    if k == 2:
        # Fiedler vector = eigenvector of the second smallest eigenvalue;
        # partition the nodes by its sign.
        fiedler_vec = np.real(eigenvecs[:, 1])
        return (fiedler_vec >= 0).astype(int)

    # Multi-cluster case. scikit-learn is imported only here so that the
    # two-way split works without it.
    from sklearn.cluster import KMeans

    embedding = np.real(eigenvecs[:, 1:k])
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(embedding)
    return kmeans.labels_
131 | 
def build_task_adjacency(df, num_tasks):
    """
    Count directed task-to-task transitions across all cases.

    Returns a ``(num_tasks, num_tasks)`` float32 matrix whose entry
    ``[src, tgt]`` is the number of times task ``tgt`` directly followed
    task ``src`` within a case (events ordered by timestamp).
    """
    adjacency = np.zeros((num_tasks, num_tasks), dtype=np.float32)
    for _, trace in df.groupby("case_id"):
        ordered_tasks = trace.sort_values("timestamp")["task_id"].values
        # Walk over consecutive (current, following) task pairs.
        for src, tgt in zip(ordered_tasks[:-1], ordered_tasks[1:]):
            adjacency[src, tgt] += 1.0
    return adjacency


--------------------------------------------------------------------------------
/modules/rl_optimization.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Reinforcement Learning module for process optimization
  6 | """
  7 | 
  8 | import numpy as np
  9 | import random
 10 | 
 11 | class ProcessEnv:
 12 |     """
 13 |     Environment for process optimization using RL
 14 |     The agent chooses (next_activity, resource) pairs and receives rewards
 15 |     based on cost, delay, and resource utilization
 16 |     """
 17 |     def __init__(self, df, le_task, resources):
 18 |         self.df = df
 19 |         self.le_task = le_task
 20 |         self.all_tasks = sorted(df["task_id"].unique())
 21 |         self.resources = resources
 22 |         self.start_task_id = 0
 23 |         self.done = False
 24 |         self.current_task = None
 25 |         
 26 |         # Additional state information could be added here
 27 |         self.resource_usage = {r: 0 for r in resources}
 28 |         self.total_cost = 0
 29 |         self.total_delay = 0
 30 |         
 31 |     def reset(self):
 32 |         """Reset the environment to initial state"""
 33 |         self.current_task = self.start_task_id
 34 |         self.done = False
 35 |         self.resource_usage = {r: 0 for r in self.resources}
 36 |         self.total_cost = 0
 37 |         self.total_delay = 0
 38 |         return self._get_state()
 39 |     
 40 |     def _get_state(self):
 41 |         """
 42 |         Get current state representation
 43 |         Currently using one-hot encoding for current task
 44 |         Could be extended with more features
 45 |         """
 46 |         state_vec = np.zeros(len(self.all_tasks), dtype=np.float32)
 47 |         idx = self.current_task
 48 |         state_vec[idx] = 1.0
 49 |         return state_vec
 50 |     
 51 |     def step(self, action):
 52 |         """
 53 |         Take a step in the environment
 54 |         action = (next_activity_id, resource_id)
 55 |         Returns: (next_state, reward, done, info)
 56 |         """
 57 |         next_task, resource = action
 58 |         
 59 |         if next_task not in self.all_tasks:
 60 |             # Invalid action
 61 |             reward = -100.0
 62 |             self.done = True
 63 |             return self._get_state(), reward, self.done, {}
 64 |         
 65 |         # Compute costs and delays
 66 |         transition_cost = self._compute_transition_cost(self.current_task, next_task)
 67 |         processing_delay = self._compute_processing_delay(next_task, resource)
 68 |         resource_efficiency = self._compute_resource_efficiency(resource)
 69 |         
 70 |         # Update internal state
 71 |         self.total_cost += transition_cost
 72 |         self.total_delay += processing_delay
 73 |         self.resource_usage[resource] += 1
 74 |         
 75 |         # Compute reward components
 76 |         cost_penalty = -transition_cost
 77 |         delay_penalty = -processing_delay
 78 |         efficiency_bonus = resource_efficiency
 79 |         
 80 |         # Combined reward
 81 |         reward = cost_penalty + delay_penalty + efficiency_bonus
 82 |         
 83 |         # Move to next state
 84 |         self.current_task = next_task
 85 |         
 86 |         # Check if process should end
 87 |         if self._should_terminate():
 88 |             self.done = True
 89 |         
 90 |         info = {
 91 |             'transition_cost': transition_cost,
 92 |             'processing_delay': processing_delay,
 93 |             'resource_efficiency': resource_efficiency
 94 |         }
 95 |         
 96 |         return self._get_state(), reward, self.done, info
 97 |     
 98 |     def _compute_transition_cost(self, current_task, next_task):
 99 |         """
100 |         Compute cost of transitioning between tasks
101 |         Currently using a simple distance metric
102 |         Could be replaced with actual cost data
103 |         """
104 |         return abs(next_task - current_task) * 1.0
105 |     
106 |     def _compute_processing_delay(self, task, resource):
107 |         """
108 |         Compute processing delay for task-resource pair
109 |         Currently using random delays
110 |         Could be replaced with historical data
111 |         """
112 |         base_delay = random.random() * 2.0
113 |         resource_factor = 1.0 + (self.resource_usage[resource] * 0.1)
114 |         return base_delay * resource_factor
115 |     
116 |     def _compute_resource_efficiency(self, resource):
117 |         """
118 |         Compute resource utilization efficiency
119 |         Rewards balanced resource usage
120 |         """
121 |         total_usage = sum(self.resource_usage.values())
122 |         if total_usage == 0:
123 |             return 1.0
124 |         
125 |         current_usage = self.resource_usage[resource]
126 |         expected_usage = total_usage / len(self.resources)
127 |         
128 |         if current_usage <= expected_usage:
129 |             return 1.0
130 |         else:
131 |             return max(0.0, 1.0 - (current_usage - expected_usage) * 0.1)
132 |     
133 |     def _should_terminate(self):
134 |         """
135 |         Determine if the process should terminate
136 |         Currently using a simple random termination
137 |         Could be replaced with actual process end conditions
138 |         """
139 |         return random.random() < 0.1
140 | 
def run_q_learning(env, episodes=30, alpha=0.1, gamma=0.9, epsilon=0.1):
    """
    Tabular Q-learning over (task, resource) actions.

    Parameters:
    - env: environment exposing ``all_tasks``, ``resources``, ``reset()``
      and ``step(action)`` returning (next_state, reward, done, info)
    - episodes: Number of training episodes
    - alpha: Learning rate
    - gamma: Discount factor
    - epsilon: Exploration rate

    Returns:
    - Q-table: dict mapping a state key (state vector rounded to 3 decimals,
      as a tuple) to an array with one value per action. Actions are indexed
      in task-major, resource-minor order.
    """
    # All possible actions (task, resource pairs)
    all_actions = [(task, res) for task in env.all_tasks for res in env.resources]
    num_actions = len(all_actions)

    Q_table = {}

    def q_row(state):
        """Return the Q-values of ``state``, creating a zero row on first use."""
        key = tuple(state.round(3))
        if key not in Q_table:
            Q_table[key] = np.zeros(num_actions, dtype=np.float32)
        return Q_table[key]

    for ep in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            # Epsilon-greedy: explore with probability epsilon, else exploit.
            explore = random.random() < epsilon
            if explore:
                action_idx = random.randrange(num_actions)
            else:
                action_idx = int(np.argmax(q_row(state)))

            next_state, reward, done, _info = env.step(all_actions[action_idx])
            total_reward += reward

            row = q_row(state)
            next_row = q_row(next_state)
            # A terminal transition has no future value.
            best_next_q = 0.0 if done else np.max(next_row)

            # Temporal-difference update towards reward + discounted best next value.
            row[action_idx] += alpha * (
                reward + gamma * best_next_q - row[action_idx]
            )

            state = next_state

        print(f"Episode {ep+1}/{episodes}, total_reward={total_reward:.2f}")

    return Q_table
211 | 
def get_optimal_policy(Q_table, all_actions):
    """
    Derive the greedy policy from a Q-table.

    Returns:
    - Dictionary mapping each state key of ``Q_table`` to the entry of
      ``all_actions`` with the highest Q-value in that state
    """
    return {
        state: all_actions[np.argmax(q_values)]
        for state, q_values in Q_table.items()
    }


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch>=1.9.0
 2 | torch-geometric>=2.0.0
 3 | torch-scatter>=2.0.9
 4 | torch-sparse>=0.6.12
 5 | numpy>=1.19.5
 6 | pandas>=1.3.0
 7 | scikit-learn>=0.24.2
 8 | networkx>=2.6.3
 9 | matplotlib>=3.4.3
10 | seaborn>=0.11.2
11 | plotly>=5.3.1
12 | pm4py>=2.2.19
13 | umap-learn>=0.5.1
14 | xgboost>=1.5.0 


--------------------------------------------------------------------------------
/visualization/process_viz.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Visualization module for process mining analysis
  6 | """
  7 | 
  8 | import matplotlib.pyplot as plt
  9 | import seaborn as sns
 10 | import networkx as nx
 11 | import plotly.graph_objects as go
 12 | import numpy as np
 13 | from sklearn.manifold import TSNE
 14 | import umap
 15 | 
 16 | def plot_confusion_matrix(y_true, y_pred, class_names, save_path="confusion_matrix.png"):
 17 |     """Plot confusion matrix"""
 18 |     from sklearn.metrics import confusion_matrix
 19 |     
 20 |     plt.figure(figsize=(8,6))
 21 |     cm = confusion_matrix(y_true, y_pred)
 22 |     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
 23 |                 xticklabels=class_names,
 24 |                 yticklabels=class_names)
 25 |     plt.title("Confusion Matrix")
 26 |     plt.xlabel("Predicted")
 27 |     plt.ylabel("True")
 28 |     plt.tight_layout()
 29 |     plt.savefig(save_path)
 30 |     plt.close()
 31 | 
 32 | def plot_embeddings(embeddings, method="tsne", save_path=None):
 33 |     """Plot task embeddings using t-SNE or UMAP"""
 34 |     if method == "tsne":
 35 |         tsne_perp = min(30, embeddings.shape[0]-1)
 36 |         coords = TSNE(n_components=2, perplexity=tsne_perp, random_state=42).fit_transform(embeddings)
 37 |         title = "Task Embeddings - t-SNE"
 38 |     else:  # umap
 39 |         coords = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)
 40 |         title = "Task Embeddings - UMAP"
 41 |     
 42 |     plt.figure(figsize=(6,5))
 43 |     sns.scatterplot(x=coords[:,0], y=coords[:,1])
 44 |     plt.title(title)
 45 |     if save_path:
 46 |         plt.savefig(save_path)
 47 |     plt.close()
 48 | 
 49 | def plot_cycle_time_distribution(durations, save_path="cycle_time_distribution.png"):
 50 |     """Plot cycle time distribution"""
 51 |     plt.figure(figsize=(6,4))
 52 |     plt.hist(durations, bins=30, color="skyblue", edgecolor="black")
 53 |     plt.title("Cycle Time Distribution (hours)")
 54 |     plt.xlabel("Hours")
 55 |     plt.ylabel("Number of Cases")
 56 |     mean_c = np.mean(durations)
 57 |     plt.axvline(mean_c, color="red", linestyle="--", label=f"Mean={mean_c:.1f}h")
 58 |     plt.legend()
 59 |     plt.tight_layout()
 60 |     plt.savefig(save_path)
 61 |     plt.close()
 62 | 
 63 | def plot_process_flow(bottleneck_stats, le_task, top_bottlenecks, 
 64 |                      save_path="process_flow_bottlenecks.png"):
 65 |     """Plot process flow with bottlenecks highlighted"""
 66 |     G_flow = nx.DiGraph()
 67 |     for i, row in bottleneck_stats.iterrows():
 68 |         src = int(row["task_id"])
 69 |         dst = int(row["next_task_id"])
 70 |         G_flow.add_edge(src, dst, freq=int(row["count"]), mean_hours=row["mean_hours"])
 71 |     
 72 |     btop_edges = set((int(src), int(dst)) for src, dst in zip(
 73 |         top_bottlenecks["task_id"], top_bottlenecks["next_task_id"]
 74 |     ))
 75 |     
 76 |     edge_cols, edge_wids = [], []
 77 |     for (u,v) in G_flow.edges():
 78 |         if (u,v) in btop_edges:
 79 |             edge_cols.append("red")
 80 |             edge_wids.append(2.0)
 81 |         else:
 82 |             edge_cols.append("gray")
 83 |             edge_wids.append(1.0)
 84 | 
 85 |     plt.figure(figsize=(9,7))
 86 |     pos = nx.spring_layout(G_flow, seed=42)
 87 |     nx.draw_networkx_nodes(G_flow, pos, node_color="lightblue", node_size=600)
 88 |     
 89 |     labels_dict = {n: le_task.inverse_transform([int(n)])[0] for n in G_flow.nodes()}
 90 |     nx.draw_networkx_labels(G_flow, pos, labels_dict, font_size=8)
 91 |     nx.draw_networkx_edges(G_flow, pos, edge_color=edge_cols, width=edge_wids, arrows=True)
 92 | 
 93 |     edge_lbl = {}
 94 |     for (u,v) in btop_edges:
 95 |         edge_lbl[(u,v)] = f"{G_flow[u][v]['mean_hours']:.1f}h"
 96 |     nx.draw_networkx_edge_labels(G_flow, pos, edge_labels=edge_lbl, 
 97 |                                 font_color="red", font_size=7)
 98 |     
 99 |     plt.title("Process Flow with Bottlenecks (Red edges)")
100 |     plt.tight_layout()
101 |     plt.savefig(save_path)
102 |     plt.close()
103 | 
def plot_transition_heatmap(transitions, le_task, save_path="transition_probability_heatmap.png"):
    """Save a heatmap of row-normalised transition frequencies.

    Args:
        transitions: DataFrame with ``task_id`` and ``next_task_id``
            columns; each row counts as one observed transition.
        le_task: Encoder exposing ``inverse_transform``; used to turn the
            integer ids on both axes into tick labels.
        save_path: Path the image is written to.
    """
    pair_sizes = transitions.groupby(["task_id", "next_task_id"]).size()
    count_matrix = pair_sizes.unstack(fill_value=0)
    # Divide each row by its own total so every row sums to 1.
    row_totals = count_matrix.sum(axis=1)
    prob_matrix = count_matrix.div(row_totals, axis=0)

    def activity_name(code):
        return le_task.inverse_transform([int(code)])[0]

    plt.figure(figsize=(10, 8))
    col_names = [activity_name(code) for code in prob_matrix.columns]
    row_names = [activity_name(code) for code in prob_matrix.index]

    sns.heatmap(
        prob_matrix,
        cmap="YlGnBu",
        annot=False,
        xticklabels=col_names,
        yticklabels=row_names,
    )
    plt.title("Transition Probability Heatmap")
    plt.xlabel("Next Activity")
    plt.ylabel("Current Activity")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
122 | 
def create_sankey_diagram(df, le_task, save_path="process_flow_sankey.html"):
    """Write an HTML Sankey diagram of the flow between activities.

    Three groups of links are emitted: a synthetic ``Start`` node to each
    case's first task, each case's last task to a synthetic ``End`` node,
    and every observed ``task_id`` -> ``next_task_id`` transition.

    Args:
        df: DataFrame with ``case_id``, ``task_id`` and ``next_task_id``
            columns. The first and last rows of each case are taken in the
            frame's row order (presumably chronological; confirm with the
            caller).
        le_task: Encoder exposing ``classes_`` and ``inverse_transform``;
            supplies the node labels.
        save_path: Path the HTML file is written to.
    """
    # Only task_id is needed, so select it before aggregating.
    case_tasks = df.groupby("case_id")["task_id"]
    start_counts = case_tasks.first().value_counts().to_dict()
    end_counts = case_tasks.last().value_counts().to_dict()

    # size() yields only the pairs that occur. The previous
    # unstack(fill_value=0).stack() round trip produced a link for every
    # (task, next_task) combination, most of them with value 0.
    pair_counts = df.groupby(["task_id", "next_task_id"]).size()

    activity_names = list(le_task.classes_)
    unique_nodes = ["Start"] + activity_names + ["End"]
    # The synthetic nodes are addressed by position, not by name, so an
    # activity literally called "Start" or "End" keeps its own node.
    start_node = 0
    end_node = len(unique_nodes) - 1
    activity_node = {name: pos for pos, name in enumerate(activity_names, start=1)}

    def node_for(task_id):
        return activity_node[le_task.inverse_transform([int(task_id)])[0]]

    sources, targets, values = [], [], []

    # Start transitions
    for act_id, ct in start_counts.items():
        sources.append(start_node)
        targets.append(node_for(act_id))
        values.append(int(ct))

    # End transitions
    for act_id, ct in end_counts.items():
        sources.append(node_for(act_id))
        targets.append(end_node)
        values.append(int(ct))

    # Internal transitions
    for (sid, tid), ccount in pair_counts.items():
        if ccount > 0:
            sources.append(node_for(sid))
            targets.append(node_for(tid))
            values.append(int(ccount))

    sankey_fig = go.Figure(data=[go.Sankey(
        node=dict(label=unique_nodes),
        link=dict(source=sources, target=targets, value=values)
    )])

    sankey_fig.update_layout(title_text="Process Flow Sankey Diagram", font_size=10)
    sankey_fig.write_html(save_path)


--------------------------------------------------------------------------------