├── .gitignore ├── LICENSE ├── README.md ├── input └── BPI2020_DomesticDeclarations.csv ├── main.py ├── models ├── gat_model.py └── lstm_model.py ├── modules ├── data_preprocessing.py ├── process_mining.py └── rl_optimization.py ├── requirements.txt └── visualization └── process_viz.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | venv/ 25 | env/ 26 | ENV/ 27 | pm-venv/ 28 | 29 | # IDE 30 | .idea/ 31 | .vscode/ 32 | *.swp 33 | *.swo 34 | 35 | # Data and Results 36 | results/ 37 | *.pth 38 | *.png 39 | *.html 40 | 41 | # Logs 42 | *.log 43 | experiment_results.txt 44 | 45 | # OS 46 | .DS_Store 47 | .env 48 | .env.local -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2025] [ERP.AI] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Process Mining with Graph Neural Networks 2 | 3 | An advanced implementation combining Graph Neural Networks, Deep Learning, and Process Mining techniques for business process analysis and prediction. 4 | 5 | ## 1. Overview 6 | 7 | This research project implements a novel approach to process mining using Graph Neural Networks (GNN) and deep learning techniques. The framework combines state-of-the-art machine learning models with traditional process mining methods to provide comprehensive process analysis and prediction capabilities. 8 | 9 | ## 2. Authors 10 | 11 | - **Somesh Misra** [@mathprobro](https://x.com/mathprobro) 12 | - **Shashank Dixit** [@sphinx](https://x.com/protosphinx) 13 | - **Research Group**: [ERP.AI](https://www.erp.ai) Research 14 | 15 | ## 3. Key Components 16 | 17 | 1. 
**Process Analysis** 18 | - Advanced bottleneck detection using temporal analysis 19 | - Conformance checking with inductive mining 20 | - Cycle time analysis and prediction 21 | - Transition pattern discovery 22 | - Spectral clustering for process segmentation 23 | 24 | 2. **Machine Learning Models** 25 | - Graph Attention Networks (GAT) for structural learning 26 | - LSTM networks for temporal dependencies 27 | - Reinforcement Learning for process optimization 28 | - Custom neural architectures for process prediction 29 | 30 | 3. **Visualization Suite** 31 | - Interactive process flow visualization 32 | - Temporal pattern analysis 33 | - Performance bottleneck identification 34 | - Resource utilization patterns 35 | - Custom process metrics 36 | 37 | ## 4. Technical Architecture 38 | 39 | ``` 40 | src/ 41 | ├── input/ # input files 42 | ├── models/ 43 | │ ├── gat_model.py # Graph Attention Network implementation 44 | │ └── lstm_model.py # LSTM sequence model 45 | ├── modules/ 46 | │ ├── data_preprocessing.py # Data handling and feature engineering 47 | │ ├── process_mining.py # Core process mining functions 48 | │ └── rl_optimization.py # Reinforcement learning components 49 | ├── visualization/ 50 | │ └── process_viz.py # Visualization toolkit 51 | └── main.py # Main execution script 52 | ``` 53 | 54 | ## 5. Technical Requirements 55 | 56 | - Python 3.8+ 57 | - PyTorch 1.9+ 58 | - PyTorch Geometric 59 | - PM4Py 60 | - NetworkX 61 | - Additional dependencies in requirements.txt 62 | 63 | ## 6. Installation 64 | 65 | 1. Clone the repository: 66 | ```bash 67 | git clone https://github.com/ERPdotAI/GNN.git 68 | cd GNN 69 | ``` 70 | 71 | 2. Install dependencies: 72 | ```bash 73 | pip install -r requirements.txt 74 | ``` 75 | 76 | ## 7. 
Data Requirements 77 | 78 | The system expects process event logs in CSV format with the following structure: 79 | - case_id: Process instance identifier 80 | - task_name: Activity name 81 | - timestamp: Activity timestamp 82 | - resource: Resource identifier 83 | - amount: Numerical attribute (if applicable) 84 | 85 | ## 8. Usage 86 | 87 | ```bash 88 | python main.py input/BPI2020_DomesticDeclarations.csv 89 | ``` 90 | 91 | Results are stored in timestamped directories under `results/` with the following structure: 92 | ``` 93 | results/run_timestamp/ 94 | ├── models/ # Trained model weights 95 | ├── visualizations/ # Generated visualizations 96 | ├── metrics/ # Performance metrics 97 | ├── analysis/ # Detailed analysis results 98 | └── policies/ # Learned optimization policies 99 | ``` 100 | 101 | ## 9. Technical Details 102 | 103 | Graph Neural Network Architecture 104 | - Multi-head attention mechanisms 105 | - Dynamic graph construction 106 | - Adaptive feature learning 107 | - Custom loss functions for process-specific metrics 108 | 109 | LSTM Implementation 110 | - Unidirectional (forward) sequence modeling 111 | - Variable-length sequence handling 112 | - Custom embedding layer for process activities 113 | 114 | Process Mining Components 115 | - Inductive miner implementation 116 | - Token-based replay 117 | - Custom conformance checking metrics 118 | - Advanced bottleneck detection algorithms 119 | 120 | Reinforcement Learning 121 | - Custom environment for process optimization 122 | - State-action space modeling 123 | - Policy gradient methods 124 | - Resource allocation optimization 125 | 126 | ## 10. Contributing 127 | 128 | We welcome contributions from the research community. Please follow these steps: 129 | 130 | 1. Fork the repository 131 | 2. Create a feature branch 132 | 3. Implement your changes 133 | 4. Submit a pull request with detailed documentation 134 | 135 | ## 11.
Citation 136 | 137 | If you use this code in your research, please cite: 138 | 139 | ```bibtex 140 | @software{GNN_ProcessMining, 141 | author = {Shashank Dixit/Somesh Misra}, 142 | title = {Process Mining with Graph Neural Networks}, 143 | year = {2025}, 144 | publisher = {ERP.AI}, 145 | url = {https://github.com/ERPdotAI/GNN} 146 | } 147 | ``` 148 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Main script for enhanced process mining with GNN, LSTM, and RL 6 | """ 7 | 8 | import os 9 | import torch 10 | import random 11 | import numpy as np 12 | from torch_geometric.loader import DataLoader 13 | from datetime import datetime 14 | import json 15 | import sys 16 | import shutil 17 | 18 | # Set random seeds 19 | random.seed(42) 20 | np.random.seed(42) 21 | torch.manual_seed(42) 22 | 23 | # Setup device 24 | if torch.cuda.is_available(): 25 | device = torch.device("cuda") 26 | elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): 27 | device = torch.device("mps") 28 | else: 29 | device = torch.device("cpu") 30 | print("Using device:", device) 31 | 32 | # Import local modules 33 | from modules.data_preprocessing import ( 34 | load_and_preprocess_data, 35 | create_feature_representation, 36 | build_graph_data, 37 | compute_class_weights 38 | ) 39 | from models.gat_model import ( 40 | NextTaskGAT, 41 | train_gat_model, 42 | evaluate_gat_model 43 | ) 44 | from models.lstm_model import ( 45 | NextActivityLSTM, 46 | prepare_sequence_data, 47 | make_padded_dataset, 48 | train_lstm_model, 49 | evaluate_lstm_model 50 | ) 51 | from modules.process_mining import ( 52 | analyze_bottlenecks, 53 | analyze_cycle_times, 54 | analyze_rare_transitions, 55 | perform_conformance_checking, 56 | analyze_transition_patterns, 57 | spectral_cluster_graph, 58 | 
def save_metrics(metrics_dict, run_dir, filename):
    """Write *metrics_dict* as indented JSON to ``<run_dir>/metrics/<filename>``.

    Args:
        metrics_dict: JSON-serialisable mapping of metric names to values.
        run_dir: Root directory of the current run.
        filename: Name of the JSON file to create inside ``metrics``.

    Raises:
        TypeError: If *metrics_dict* contains a value ``json.dump`` cannot
            serialise.
        OSError: If the directory or file cannot be created.
    """
    metrics_dir = os.path.join(run_dir, "metrics")
    # Do not rely on the caller having pre-created the sub-directory.
    os.makedirs(metrics_dir, exist_ok=True)
    filepath = os.path.join(metrics_dir, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(metrics_dict, f, indent=4)
Loading and preprocessing data...") 116 | df = load_and_preprocess_data(data_path) 117 | df, le_task, le_resource = create_feature_representation(df, use_norm_features=True) 118 | 119 | # Save preprocessing info 120 | preproc_info = { 121 | "num_tasks": len(le_task.classes_), 122 | "num_resources": len(le_resource.classes_), 123 | "num_cases": df["case_id"].nunique(), 124 | "date_range": [str(df["timestamp"].min()), str(df["timestamp"].max())] 125 | } 126 | save_metrics(preproc_info, run_dir, "preprocessing_info.json") 127 | 128 | # 2. Build graph data 129 | print("\n2. Building graph data...") 130 | graphs = build_graph_data(df) 131 | train_size = int(len(graphs)*0.8) 132 | train_graphs = graphs[:train_size] 133 | val_graphs = graphs[train_size:] 134 | 135 | train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True) 136 | val_loader = DataLoader(val_graphs, batch_size=32, shuffle=False) 137 | 138 | # 3. Train GAT model 139 | print("\n3. Training GAT model...") 140 | num_classes = len(le_task.classes_) 141 | class_weights = compute_class_weights(df, num_classes).to(device) 142 | 143 | gat_model = NextTaskGAT(5, 64, num_classes, num_layers=2, heads=4, dropout=0.5).to(device) 144 | criterion = torch.nn.CrossEntropyLoss(weight=class_weights) 145 | optimizer = torch.optim.AdamW(gat_model.parameters(), lr=0.0005, weight_decay=5e-4) 146 | 147 | gat_model_path = os.path.join(run_dir, "models", "best_gnn_model.pth") 148 | gat_model = train_gat_model( 149 | gat_model, train_loader, val_loader, 150 | criterion, optimizer, device, 151 | num_epochs=20, model_path=gat_model_path 152 | ) 153 | 154 | # 4. Evaluate GAT model 155 | print("\n4. 
Evaluating GAT model...") 156 | y_true, y_pred, y_prob = evaluate_gat_model(gat_model, val_loader, device) 157 | plot_confusion_matrix( 158 | y_true, y_pred, le_task.classes_, 159 | os.path.join(run_dir, "visualizations", "gat_confusion_matrix.png") 160 | ) 161 | 162 | # Save GAT metrics 163 | from sklearn.metrics import accuracy_score, matthews_corrcoef 164 | gat_metrics = { 165 | "accuracy": float(accuracy_score(y_true, y_pred)), 166 | "mcc": float(matthews_corrcoef(y_true, y_pred)) 167 | } 168 | save_metrics(gat_metrics, run_dir, "gat_metrics.json") 169 | 170 | # 5. Train LSTM model 171 | print("\n5. Training LSTM model...") 172 | train_seq, test_seq = prepare_sequence_data(df) 173 | X_train_pad, X_train_len, y_train_lstm, _ = make_padded_dataset(train_seq, num_classes) 174 | X_test_pad, X_test_len, y_test_lstm, _ = make_padded_dataset(test_seq, num_classes) 175 | 176 | lstm_model = NextActivityLSTM(num_classes, emb_dim=64, hidden_dim=64, num_layers=1).to(device) 177 | lstm_model_path = os.path.join(run_dir, "models", "lstm_next_activity.pth") 178 | lstm_model = train_lstm_model( 179 | lstm_model, X_train_pad, X_train_len, y_train_lstm, 180 | device, batch_size=64, epochs=5, model_path=lstm_model_path 181 | ) 182 | 183 | # 6. Process Mining Analysis 184 | print("\n6. 
Performing process mining analysis...") 185 | bottleneck_stats, significant_bottlenecks = analyze_bottlenecks(df) 186 | case_merged, long_cases, cut95 = analyze_cycle_times(df) 187 | rare_trans = analyze_rare_transitions(bottleneck_stats) 188 | replayed, n_deviant = perform_conformance_checking(df) 189 | 190 | # Save process mining analysis results 191 | process_analysis = { 192 | "num_long_cases": len(long_cases), 193 | "cycle_time_95th_percentile": float(cut95), 194 | "num_rare_transitions": len(rare_trans), 195 | "num_deviant_traces": n_deviant, 196 | "total_traces": len(replayed) 197 | } 198 | save_metrics(process_analysis, run_dir, "process_analysis.json") 199 | 200 | print(f"Found {len(long_cases)} long-running cases above 95th percentile (> {cut95:.1f}h)") 201 | print(f"Found {len(rare_trans)} rare transitions") 202 | print(f"Conformance Checking: {n_deviant} deviant traces out of {len(replayed)}") 203 | 204 | # 7. Visualizations 205 | print("\n7. Creating visualizations...") 206 | viz_dir = os.path.join(run_dir, "visualizations") 207 | plot_cycle_time_distribution( 208 | case_merged["duration_h"].values, 209 | os.path.join(viz_dir, "cycle_time_distribution.png") 210 | ) 211 | plot_process_flow( 212 | bottleneck_stats, le_task, significant_bottlenecks.head(), 213 | os.path.join(viz_dir, "process_flow_bottlenecks.png") 214 | ) 215 | 216 | # Get transition patterns first 217 | transitions, trans_count, prob_matrix = analyze_transition_patterns(df) 218 | plot_transition_heatmap( 219 | transitions, le_task, 220 | os.path.join(viz_dir, "transition_probability_heatmap.png") 221 | ) 222 | create_sankey_diagram( 223 | transitions, le_task, 224 | os.path.join(viz_dir, "process_flow_sankey.html") 225 | ) 226 | 227 | # 8. Spectral Clustering 228 | print("\n8. 
Performing spectral clustering...") 229 | adj_matrix = build_task_adjacency(df, num_classes) 230 | cluster_labels = spectral_cluster_graph(adj_matrix, k=3) 231 | 232 | # Save clustering results 233 | clustering_results = { 234 | "task_clusters": { 235 | le_task.inverse_transform([t_id])[0]: int(lbl) 236 | for t_id, lbl in enumerate(cluster_labels) 237 | } 238 | } 239 | save_metrics(clustering_results, run_dir, "clustering_results.json") 240 | 241 | print("Spectral clustering results:") 242 | for t_id, lbl in enumerate(cluster_labels): 243 | t_name = le_task.inverse_transform([t_id])[0] 244 | print(f" Task={t_name} => cluster {lbl}") 245 | 246 | # 9. Reinforcement Learning 247 | print("\n9. Training RL agent...") 248 | dummy_resources = [0, 1] # Example with 2 resources 249 | env = ProcessEnv(df, le_task, dummy_resources) 250 | q_table = run_q_learning(env, episodes=30) 251 | 252 | # Get optimal policy 253 | all_actions = [(t, r) for t in env.all_tasks for r in env.resources] 254 | policy = get_optimal_policy(q_table, all_actions) 255 | 256 | # Save RL results 257 | rl_results = { 258 | "num_states": len(policy), 259 | "num_actions": len(all_actions), 260 | "policy": { 261 | str(state): {"task": int(action[0]), "resource": int(action[1])} 262 | for state, action in policy.items() 263 | } 264 | } 265 | save_metrics(rl_results, run_dir, "rl_results.json") 266 | 267 | print(f"Learned policy for {len(policy)} states") 268 | print(f"\nDone! 
class NextTaskGAT(nn.Module):
    """
    Graph-level next-task classifier built from stacked GAT layers.

    Every ``GATConv`` layer uses ``heads`` attention heads with concatenated
    outputs, so each layer after the first consumes ``hidden_dim * heads``
    features. Node embeddings are mean-pooled per graph and passed through a
    linear layer that yields ``output_dim`` logits.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, heads=4, dropout=0.5):
        super().__init__()
        concat_dim = hidden_dim * heads
        # At least one layer is always built; the remaining ones take the
        # concatenated multi-head output of the previous layer as input.
        layer_inputs = [input_dim] + [concat_dim] * (num_layers - 1)
        self.convs = nn.ModuleList(
            [GATConv(in_dim, hidden_dim, heads=heads, concat=True) for in_dim in layer_inputs]
        )
        self.fc = nn.Linear(concat_dim, output_dim)
        self.dropout = dropout

    def forward(self, x, edge_index, batch):
        """Return per-graph logits for node features ``x`` grouped by ``batch``."""
        hidden = x
        for layer in self.convs:
            activated = torch.nn.functional.elu(layer(hidden, edge_index))
            # Dropout is only active while the module is in training mode.
            hidden = torch.nn.functional.dropout(
                activated, p=self.dropout, training=self.training
            )
        pooled = global_mean_pool(hidden, batch)
        return self.fc(pooled)
def compute_graph_label(y, batch):
    """
    Derive one label per graph as the most frequent node label in that graph.

    ``batch`` assigns each entry of ``y`` to a graph id. Graphs are processed
    in ascending id order (the order produced by ``batch.unique()``). When
    several labels are equally frequent, the smallest one wins, because
    ``torch.unique`` returns sorted values and ``argmax`` picks the first
    maximum. Labels are moved to CPU before counting (the original notes this
    is for MPS compatibility).

    Returns a 1-D tensor with one label per distinct graph id.
    """
    def _majority_label(graph_id):
        node_labels = y[batch == graph_id].detach().cpu()
        distinct, freq = torch.unique(node_labels, return_counts=True)
        return distinct[freq.argmax()]

    return torch.stack([_majority_label(gid) for gid in batch.unique()])
class NextActivityLSTM(nn.Module):
    """
    LSTM model for next activity prediction.

    Inputs are padded sequences of activity ids, where id 0 is the padding
    token (the embedding has ``num_cls + 1`` rows and ``padding_idx=0``).
    The final hidden state of the last LSTM layer is mapped to ``num_cls``
    logits.
    """
    def __init__(self, num_cls, emb_dim=64, hidden_dim=64, num_layers=1):
        super().__init__()
        self.emb = nn.Embedding(num_cls+1, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_cls)

    def forward(self, x, seq_len):
        """
        Return next-activity logits, one row per input sequence.

        Args:
            x: Padded id tensor, batch-first.
            seq_len: True (unpadded) length of each row of ``x``; every
                length must be at least 1.

        Returns:
            Logits with one row per sequence, in the same order as ``x``.
        """
        embedded = self.emb(x)
        # enforce_sorted=False makes PyTorch sort the batch by length and
        # restore the original order of h_n, replacing the manual
        # sort / permute / un-permute bookkeeping. Lengths must be on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, seq_len.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n[-1] is the last layer's final hidden state per sequence.
        return self.fc(h_n[-1])
def make_padded_dataset(sample_list, num_cls):
    """
    Convert sequence data to padded tensor format

    Args:
        sample_list: list of ``(prefix, next_task)`` pairs, where ``prefix``
            is a list of integer task ids.
        num_cls: number of activity classes. Not used by the conversion;
            kept so existing callers keep working.

    Returns:
        Tuple ``(X_padded, X_lens, Y_labels, max_len)``:
        - X_padded: LongTensor (n_samples, max_len); task ids are shifted by
          +1 so that 0 can serve as the padding value.
        - X_lens: LongTensor (n_samples,) of unpadded prefix lengths.
        - Y_labels: LongTensor (n_samples,) of unshifted next-task ids.
        - max_len: length of the longest prefix (0 for an empty sample list).
    """
    if not sample_list:
        # max() over an empty list raises; return well-shaped empty tensors.
        return (
            torch.zeros((0, 0), dtype=torch.long),
            torch.zeros(0, dtype=torch.long),
            torch.zeros(0, dtype=torch.long),
            0,
        )

    max_len = max(len(pfx) for pfx, _ in sample_list)

    # shift for pad=0, then right-pad every prefix to max_len
    X_padded = [
        [tid + 1 for tid in pfx] + [0] * (max_len - len(pfx))
        for pfx, _ in sample_list
    ]
    X_lens = [len(pfx) for pfx, _ in sample_list]
    Y_labels = [nxt for _, nxt in sample_list]

    return (
        torch.tensor(X_padded, dtype=torch.long),
        torch.tensor(X_lens, dtype=torch.long),
        torch.tensor(Y_labels, dtype=torch.long),
        max_len
    )
def load_and_preprocess_data(data_path, required_cols=None):
    """Load and preprocess the event log data

    Reads the CSV, maps XES-style column names (``case:id``,
    ``concept:name``, ...) to the short names used in this project, parses
    timestamps, drops events whose timestamp cannot be parsed, and sorts
    events by case and time.

    Args:
        data_path: path (or file-like object) accepted by ``pd.read_csv``.
        required_cols: column names that must exist after renaming.
            Defaults to case_id, task_name, timestamp, resource, amount.

    Returns:
        DataFrame sorted by ``["case_id", "timestamp"]``; the original row
        index labels are preserved.

    Raises:
        ValueError: if any required column is missing. All missing columns
            are listed in a single message.
    """
    if required_cols is None:
        required_cols = ["case_id", "task_name", "timestamp", "resource", "amount"]

    df = pd.read_csv(data_path)
    # errors="ignore": source names absent from the CSV are simply skipped.
    df = df.rename(columns={
        "case:id": "case_id",
        "concept:name": "task_name",
        "time:timestamp": "timestamp",
        "org:resource": "resource",
        "case:Amount": "amount"
    }, errors="ignore")

    # Validate required columns (report every missing one at once, so the
    # user does not have to fix the file one column per run)
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        missing_txt = ", ".join(f"'{c}'" for c in missing)
        raise ValueError(f"Missing {missing_txt} in CSV. Found cols: {df.columns.tolist()}")

    # Process timestamps; unparseable values become NaT and are dropped
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    df = df.dropna(subset=["timestamp"])
    df = df.sort_values(["case_id", "timestamp"])

    return df
def compute_class_weights(df, num_classes):
    """Compute balanced class weights for training

    Classes present in ``df["next_task"]`` get the "balanced" weight
    ``n_samples / (n_present_classes * class_count)`` (the same formula
    scikit-learn's ``compute_class_weight("balanced", ...)`` applies when
    given only the present classes). Classes that never occur keep
    weight 1.0.

    Args:
        df: DataFrame with an integer ``next_task`` column.
        num_classes: total number of classes; length of the result.

    Returns:
        float32 tensor of shape (num_classes,).

    Raises:
        IndexError: if a label lies outside ``[0, num_classes)``. Negative
            labels are rejected explicitly because NumPy would otherwise
            wrap them around and silently overwrite another class weight.
    """
    train_labels = np.asarray(df["next_task"].values)
    class_weights = np.ones(num_classes, dtype=np.float32)

    if train_labels.size == 0:
        # Nothing to balance against: every class keeps the neutral weight.
        return torch.tensor(class_weights, dtype=torch.float32)

    present, counts = np.unique(train_labels, return_counts=True)
    if present.min() < 0 or present.max() >= num_classes:
        raise IndexError(
            f"next_task labels must lie in [0, {num_classes}); "
            f"found range [{present.min()}, {present.max()}]"
        )

    class_weights[present] = train_labels.size / (present.size * counts.astype(np.float64))
    return torch.tensor(class_weights, dtype=torch.float32)
def analyze_cycle_times(df):
    """
    Analyze process cycle times

    For every case, measures the span between its first and last event (in
    hours) and attaches the mean amount and the event count. Cases whose
    duration is strictly above the 95th percentile are flagged as
    long-running.

    Returns:
        (case_merged, long_cases, cut95) where ``case_merged`` has the
        columns case_id, min, max, cycle_time_hours, mean_amount,
        num_events, duration_h; ``long_cases`` is the subset above the
        cut-off; ``cut95`` is the 95th-percentile duration in hours.
    """
    # First/last timestamp per case and the elapsed time between them.
    span = df.groupby("case_id")["timestamp"].agg(["min", "max"])
    elapsed = span["max"] - span["min"]
    span["cycle_time_hours"] = elapsed.dt.total_seconds()/3600.0
    span = span.reset_index()

    # Per-case descriptive features.
    per_case = df.groupby("case_id").agg({"amount": "mean", "task_id": "count"})
    per_case = per_case.rename(
        columns={"amount": "mean_amount", "task_id": "num_events"}
    )
    per_case = per_case.reset_index()

    case_merged = pd.merge(span, per_case, on="case_id", how="left")
    case_merged["duration_h"] = case_merged["cycle_time_hours"]

    # Identify long-running cases (95th percentile)
    cut95 = case_merged["duration_h"].quantile(0.95)
    is_long = case_merged["duration_h"] > cut95
    long_cases = case_merged[is_long]

    return case_merged, long_cases, cut95
def spectral_cluster_graph(adj_matrix, k=2):
    """
    Perform spectral clustering on process graph

    The adjacency matrix is symmetrized first, ``(A + A.T) / 2``, so that the
    unnormalized Laplacian ``L = D - A`` is symmetric and has a real,
    well-ordered spectrum. Transition-count matrices are directed; feeding
    them to a general eigen-solver unchanged produces complex eigenpairs
    whose sort order is not meaningful. Symmetric inputs are unaffected by
    this step.

    Args:
        adj_matrix: square (n, n) array-like of non-negative edge weights.
        k: number of clusters. ``k == 2`` splits by the sign of the Fiedler
            vector; larger ``k`` runs KMeans on eigenvectors 1..k-1.

    Returns:
        Integer array of length n with a cluster label per node. The label
        numbering itself is arbitrary (eigenvector sign / KMeans ordering).
        Graphs with fewer than two nodes get the single label 0.

    Raises:
        ValueError: if ``adj_matrix`` is not a square 2-D matrix.
    """
    A = np.asarray(adj_matrix, dtype=float)
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError(f"adj_matrix must be square, got shape {A.shape}")

    n_nodes = A.shape[0]
    if n_nodes < 2:
        # No second eigenvector exists; everything is one cluster.
        return np.zeros(n_nodes, dtype=int)

    A = (A + A.T) / 2.0
    degrees = np.sum(A, axis=1)
    L = np.diag(degrees) - A  # unnormalized Laplacian

    # eigh is the solver for symmetric matrices: real eigenvalues, returned
    # in ascending order, with orthonormal eigenvectors.
    _eigenvals, eigenvecs = np.linalg.eigh(L)

    if k == 2:
        # Fiedler vector = second smallest eigenvector
        fiedler_vec = eigenvecs[:, 1]
        # Partition by sign
        labels = (fiedler_vec >= 0).astype(int)
    else:
        # multi-cluster; imported here so the k == 2 path needs only NumPy
        from sklearn.cluster import KMeans

        embedding = eigenvecs[:, 1:k]
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(embedding)
        labels = kmeans.labels_

    return labels
encoding for current task 44 | Could be extended with more features 45 | """ 46 | state_vec = np.zeros(len(self.all_tasks), dtype=np.float32) 47 | idx = self.current_task 48 | state_vec[idx] = 1.0 49 | return state_vec 50 | 51 | def step(self, action): 52 | """ 53 | Take a step in the environment 54 | action = (next_activity_id, resource_id) 55 | Returns: (next_state, reward, done, info) 56 | """ 57 | next_task, resource = action 58 | 59 | if next_task not in self.all_tasks: 60 | # Invalid action 61 | reward = -100.0 62 | self.done = True 63 | return self._get_state(), reward, self.done, {} 64 | 65 | # Compute costs and delays 66 | transition_cost = self._compute_transition_cost(self.current_task, next_task) 67 | processing_delay = self._compute_processing_delay(next_task, resource) 68 | resource_efficiency = self._compute_resource_efficiency(resource) 69 | 70 | # Update internal state 71 | self.total_cost += transition_cost 72 | self.total_delay += processing_delay 73 | self.resource_usage[resource] += 1 74 | 75 | # Compute reward components 76 | cost_penalty = -transition_cost 77 | delay_penalty = -processing_delay 78 | efficiency_bonus = resource_efficiency 79 | 80 | # Combined reward 81 | reward = cost_penalty + delay_penalty + efficiency_bonus 82 | 83 | # Move to next state 84 | self.current_task = next_task 85 | 86 | # Check if process should end 87 | if self._should_terminate(): 88 | self.done = True 89 | 90 | info = { 91 | 'transition_cost': transition_cost, 92 | 'processing_delay': processing_delay, 93 | 'resource_efficiency': resource_efficiency 94 | } 95 | 96 | return self._get_state(), reward, self.done, info 97 | 98 | def _compute_transition_cost(self, current_task, next_task): 99 | """ 100 | Compute cost of transitioning between tasks 101 | Currently using a simple distance metric 102 | Could be replaced with actual cost data 103 | """ 104 | return abs(next_task - current_task) * 1.0 105 | 106 | def _compute_processing_delay(self, task, resource): 107 
| """ 108 | Compute processing delay for task-resource pair 109 | Currently using random delays 110 | Could be replaced with historical data 111 | """ 112 | base_delay = random.random() * 2.0 113 | resource_factor = 1.0 + (self.resource_usage[resource] * 0.1) 114 | return base_delay * resource_factor 115 | 116 | def _compute_resource_efficiency(self, resource): 117 | """ 118 | Compute resource utilization efficiency 119 | Rewards balanced resource usage 120 | """ 121 | total_usage = sum(self.resource_usage.values()) 122 | if total_usage == 0: 123 | return 1.0 124 | 125 | current_usage = self.resource_usage[resource] 126 | expected_usage = total_usage / len(self.resources) 127 | 128 | if current_usage <= expected_usage: 129 | return 1.0 130 | else: 131 | return max(0.0, 1.0 - (current_usage - expected_usage) * 0.1) 132 | 133 | def _should_terminate(self): 134 | """ 135 | Determine if the process should terminate 136 | Currently using a simple random termination 137 | Could be replaced with actual process end conditions 138 | """ 139 | return random.random() < 0.1 140 | 141 | def run_q_learning(env, episodes=30, alpha=0.1, gamma=0.9, epsilon=0.1): 142 | """ 143 | Q-learning algorithm for process optimization 144 | 145 | Parameters: 146 | - env: ProcessEnv instance 147 | - episodes: Number of training episodes 148 | - alpha: Learning rate 149 | - gamma: Discount factor 150 | - epsilon: Exploration rate 151 | 152 | Returns: 153 | - Q-table mapping state-action pairs to values 154 | """ 155 | possible_tasks = env.all_tasks 156 | possible_resources = env.resources 157 | 158 | # All possible actions (task, resource pairs) 159 | all_actions = [] 160 | for t in possible_tasks: 161 | for r in possible_resources: 162 | all_actions.append((t, r)) 163 | num_actions = len(all_actions) 164 | 165 | Q_table = {} 166 | 167 | def get_state_key(state): 168 | """Convert state array to hashable tuple""" 169 | return tuple(state.round(3)) 170 | 171 | def get_Q(state): 172 | """Get 
def get_optimal_policy(Q_table, all_actions):
    """
    Derive the greedy policy from a Q-table

    Parameters:
    - Q_table: mapping of state key -> array of Q-values, one per action
    - all_actions: list of actions, indexed consistently with the Q-values

    Returns:
    - Dictionary mapping each state to the action with the highest Q-value
      (the first such action when several tie)
    """
    return {
        state_key: all_actions[np.argmax(action_values)]
        for state_key, action_values in Q_table.items()
    }
def plot_confusion_matrix(y_true, y_pred, class_names, save_path="confusion_matrix.png"):
    """Render the confusion matrix of y_true vs. y_pred as an annotated
    heatmap and write it to `save_path`.

    `class_names` supplies the tick labels on both axes (rows are true
    classes, columns are predicted classes). The figure is closed after
    saving.
    """
    from sklearn.metrics import confusion_matrix

    plt.figure(figsize=(8,6))
    counts = confusion_matrix(y_true, y_pred)

    # Integer cell annotations ("d") on a blue colour ramp.
    heatmap_options = {
        "annot": True,
        "fmt": "d",
        "cmap": "Blues",
        "xticklabels": class_names,
        "yticklabels": class_names,
    }
    sns.heatmap(counts, **heatmap_options)

    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
def plot_process_flow(bottleneck_stats, le_task, top_bottlenecks,
                      save_path="process_flow_bottlenecks.png"):
    """Plot process flow with bottlenecks highlighted

    Builds a directed graph with one edge per (task_id, next_task_id) row of
    `bottleneck_stats`, draws it with a seeded spring layout, and colours
    the edges listed in `top_bottlenecks` red, labelled with their mean
    waiting time in hours.

    Args:
        bottleneck_stats: DataFrame with columns task_id, next_task_id,
            count, mean_hours.
        le_task: fitted label encoder used to turn task ids into names.
        top_bottlenecks: DataFrame with columns task_id, next_task_id
            selecting the edges to highlight. Pairs that are not edges of
            the graph are ignored.
        save_path: output image path.
    """
    G_flow = nx.DiGraph()
    # Column-wise zip instead of iterrows: no per-row Series construction.
    for src, dst, freq, hours in zip(
        bottleneck_stats["task_id"], bottleneck_stats["next_task_id"],
        bottleneck_stats["count"], bottleneck_stats["mean_hours"]
    ):
        G_flow.add_edge(int(src), int(dst), freq=int(freq), mean_hours=hours)

    btop_edges = set((int(src), int(dst)) for src, dst in zip(
        top_bottlenecks["task_id"], top_bottlenecks["next_task_id"]
    ))

    edge_cols, edge_wids = [], []
    for edge in G_flow.edges():
        is_bottleneck = edge in btop_edges
        edge_cols.append("red" if is_bottleneck else "gray")
        edge_wids.append(2.0 if is_bottleneck else 1.0)

    plt.figure(figsize=(9,7))
    pos = nx.spring_layout(G_flow, seed=42)
    nx.draw_networkx_nodes(G_flow, pos, node_color="lightblue", node_size=600)

    # Decode all node ids in one encoder call rather than one call per node.
    node_ids = list(G_flow.nodes())
    labels_dict = {}
    if node_ids:
        labels_dict = dict(zip(node_ids, le_task.inverse_transform(node_ids)))
    nx.draw_networkx_labels(G_flow, pos, labels_dict, font_size=8)
    nx.draw_networkx_edges(G_flow, pos, edge_color=edge_cols, width=edge_wids, arrows=True)

    # Only label highlighted pairs that exist as edges; looking up a
    # missing pair in G_flow would raise KeyError.
    edge_lbl = {
        (u, v): f"{G_flow[u][v]['mean_hours']:.1f}h"
        for (u, v) in btop_edges
        if G_flow.has_edge(u, v)
    }
    nx.draw_networkx_edge_labels(G_flow, pos, edge_labels=edge_lbl,
                                 font_color="red", font_size=7)

    plt.title("Process Flow with Bottlenecks (Red edges)")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
def create_sankey_diagram(df, le_task, save_path="process_flow_sankey.html"):
    """Create Sankey diagram of process flow

    Nodes are a synthetic "Start", every activity in ``le_task.classes_``
    (in encoder order), and a synthetic "End". Links are:
    Start -> first activity of each case, last activity of each case -> End,
    and every observed (task_id, next_task_id) transition, each weighted by
    its count.

    Node positions are derived from the task id (``task_id + 1``) rather
    than looked up by activity name, so an activity that is itself called
    "Start" or "End" cannot be confused with the synthetic nodes. Only
    transitions that actually occur are emitted (no zero-valued links).

    Args:
        df: event DataFrame with columns case_id, task_id, next_task_id,
            ordered so that first/last row per case are the first/last
            event.
        le_task: fitted label encoder exposing ``classes_``.
        save_path: output HTML path.

    Raises:
        ValueError: if a task id falls outside the encoder's class range.
    """
    task_names = list(le_task.classes_)
    unique_nodes = ["Start"] + task_names + ["End"]
    start_node = 0
    end_node = len(unique_nodes) - 1

    def node_of(task_id):
        """Map an encoded task id to its position in unique_nodes."""
        tid = int(task_id)
        if not 0 <= tid < len(task_names):
            raise ValueError(f"task id {tid} is outside the encoder's classes")
        return tid + 1

    by_case = df.groupby("case_id")
    start_counts = by_case.first()["task_id"].value_counts().to_dict()
    end_counts = by_case.last()["task_id"].value_counts().to_dict()

    # groupby().size() lists only the transitions that occur, sorted by
    # (task_id, next_task_id); rows with a missing next_task_id are skipped.
    trans_count = df.groupby(["task_id","next_task_id"]).size()

    sources, targets, values = [], [], []

    # Start transitions
    for act_id, ct in start_counts.items():
        sources.append(start_node)
        targets.append(node_of(act_id))
        values.append(int(ct))

    # End transitions
    for act_id, ct in end_counts.items():
        sources.append(node_of(act_id))
        targets.append(end_node)
        values.append(int(ct))

    # Internal transitions
    for (sid, tid), ccount in trans_count.items():
        sources.append(node_of(sid))
        targets.append(node_of(tid))
        values.append(int(ccount))

    sankey_fig = go.Figure(data=[go.Sankey(
        node=dict(label=unique_nodes),
        link=dict(source=sources, target=targets, value=values)
    )])

    sankey_fig.update_layout(title_text="Process Flow Sankey Diagram", font_size=10)
    sankey_fig.write_html(save_path)