├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── dg_predictiction.py ├── dg_training.py ├── environment.yml ├── input_files └── event_logs.zip ├── model_prediction ├── __init__.py ├── event_log_predictor.py ├── interfaces.py ├── model_predictor.py ├── next_event_predictor.py ├── next_event_samples_creator.py ├── suffix_predictor.py └── suffix_samples_creator.py ├── model_training ├── __init__.py ├── embedding_training.py ├── features_manager.py ├── intercase_features │ └── __init__.py ├── model_hpc_optimizer.py ├── model_loader.py ├── model_optimizer.py ├── model_trainer.py ├── models │ ├── __init__.py │ ├── model_concatenated.py │ ├── model_concatenated_cx.py │ ├── model_gru_concatenated.py │ ├── model_gru_concatenated_cx.py │ ├── model_gru_shared_cat.py │ ├── model_gru_shared_cat_cx.py │ ├── model_gru_specialized.py │ ├── model_shared_cat.py │ ├── model_shared_cat_cx.py │ └── model_specialized.py ├── samples_creator.py └── slurm_trainer.py ├── models_spec.ini └── support_modules ├── __init__.py ├── callbacks ├── __init__.py ├── clean_models_callback.py └── time_callback.py └── role_discovery.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | output_files/ 3 | jobs_files/ 4 | .idea/* 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepGenerator: Learning Accurate Generative Models of Business Processes with LSTM Neural Networks 2 | 3 | The code here presented is able to execute different pre- and post-processing methods and architectures for building and using generative models from event logs in XES format using LSTM anf GRU neural networks. This code can perform the next tasks: 4 | 5 | 6 | * Training LSTM neuronal networks using an event log as input. 7 | * Generate full event logs using a trained LSTM neuronal network. 8 | * Predict the remaining time and the continuation (suffix) of an incomplete business process trace. 9 | 10 | 11 | ## Getting Started 12 | 13 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. 
14 | 15 | ``` 16 | git clone https://github.com/AdaptiveBProcess/GenerativeLSTM.git 17 | ``` 18 | 19 | ### Prerequisites 20 | 21 | To execute this code you just need to install Anaconda in your system, and create an environment using the *environment.yml* specification provided in the repository. 22 | ``` 23 | cd GenerativeLSTM 24 | conda env create -f environment.yml 25 | conda activate deep_generator 26 | ``` 27 | 28 | ## Running the script 29 | 30 | Once created the environment, you can perform each one of the tasks, specifying the following parameters in the lstm.py module, or by command line as is described below: 31 | 32 | *Training LSTM neuronal network:* To perform this task you need to set the required activity (-a) as 'training' followed by the name of the (-f) event log, and all the following parameters: 33 | 34 | * Filename (-f): Log filename. 35 | * Model family (-m): The available options are lstm, gru, lstm_cx and gru_cx. 36 | * Max Eval (-e): Maximum number of evaluations. 37 | * Opt method (-o): Optimization method used. The available options are hpc and bayesian. 38 | 39 | ``` 40 | (lstm_env) C:\sc_lstm>python dg_training.py -f Helpdesk.xes -m lstm -e 1 -o bayesian 41 | ``` 42 | 43 | *Predictive task:* It is possible to execute various predictive tasks with DeepGenerator, such as predicting the next event, the case continuation, and the remaining time of an ongoing case. Similarly, it is possible to generate complete event logs starting from a zero prefix size. To perform these tasks, you need to set the activity (-a) as ‘predict_next’ for the next event prediction, ‘pred_sfx’ for case continuation and remaining time, and ‘pred_log’ for the full event log generation. Additionally, it's required to indicate the folder where the predictive model is located (-c), and the name of the .h5 model (-b). 
# -*- coding: utf-8 -*-
"""
Command-line entry point for the predictive tasks of DeepGenerator
(next-event prediction, suffix prediction, and full-log generation).

Created on Tue Feb 23 19:08:25 2021

@author: Manuel Camargo
"""
import os

# Silence TensorFlow C++ logging; must be set before TensorFlow is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import sys
import getopt


# =============================================================================
# Main function
# =============================================================================
def catch_parameter(opt):
    """Map a short console flag to its internal parameter name.

    Returns None when the flag is not recognized.
    """
    switch = {'-h': 'help', '-a': 'activity', '-c': 'folder',
              '-b': 'model_file', '-v': 'variant', '-r': 'rep'}
    return switch.get(opt)


def main(argv):
    """Collect execution parameters and launch the model predictor.

    Parameters come either from the hard-coded defaults below (when no
    console arguments are given) or from the command line for batch
    operations.
    """
    parameters = dict()
    column_names = {'Case ID': 'caseid',
                    'Activity': 'task',
                    'lifecycle:transition': 'event_type',
                    'Resource': 'user'}
    parameters['one_timestamp'] = False  # Only one timestamp in the log
    parameters['read_options'] = {
        'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
        'column_names': column_names,
        'one_timestamp': parameters['one_timestamp'],
        'filter_d_attrib': False}
    # Parameters settled manually or caught by console for batch operations
    if not argv:
        # predict_next, pred_sfx
        parameters['activity'] = 'pred_log'
        parameters['folder'] = '20230302_3CC0DC8C_5A76_4ED1_8E90_AFB851EB1AA0'
        parameters['model_file'] = 'Production.h5'
        parameters['is_single_exec'] = False  # single or batch execution
        # variants and repetitions to be tested Random Choice, Arg Max
        parameters['variant'] = 'Random Choice'
        parameters['rep'] = 1
    else:
        # Catch parms by console
        try:
            opts, _ = getopt.getopt(argv, "ho:a:f:c:b:v:r:",
                                    ['one_timestamp=', 'activity=', 'folder=',
                                     'model_file=', 'variant=', 'rep='])
            for opt, arg in opts:
                key = catch_parameter(opt)
                if key is None:
                    # BUG FIX: unmapped flags used to be stored under the
                    # dictionary key None; skip them explicitly instead.
                    print('Unknown option ' + opt + ' ignored')
                    continue
                if key == 'rep':
                    parameters[key] = int(arg)
                else:
                    parameters[key] = arg
        except getopt.GetoptError:
            print('Invalid option')
            sys.exit(2)
    print(parameters['folder'])
    print(parameters['model_file'])
    # Imported here (after the TF log level is set and parameters parsed)
    # so that importing this module stays cheap and side-effect free.
    from model_prediction import model_predictor as pr
    pr.ModelPredictor(parameters)


if __name__ == "__main__":
    main(sys.argv[1:])
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TF logging before TF loads
import sys
import getopt


# =============================================================================
# Main function
# =============================================================================
def catch_parameter(opt):
    """Map a short console flag to its internal parameter name.

    Returns None when the flag is not recognized.
    """
    switch = {'-h': 'help', '-f': 'file_name', '-m': 'model_family',
              '-e': 'max_eval', '-o': 'opt_method'}
    return switch.get(opt)


def main(argv):
    """Collect training parameters and launch the model trainer.

    Parameters come from the hard-coded defaults (no console arguments)
    or from the command line for batch operations.
    """
    parameters = dict()
    column_names = {'Case ID': 'caseid',
                    'Activity': 'task',
                    'lifecycle:transition': 'event_type',
                    'Resource': 'user'}
    parameters['one_timestamp'] = False  # Only one timestamp in the log
    parameters['read_options'] = {
        'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
        'column_names': column_names,
        'one_timestamp': parameters['one_timestamp']}
    # Parameters settled manually or caught by console for batch operations
    if not argv:
        # Event-log filename
        parameters['file_name'] = 'Production.csv'
        parameters['model_family'] = 'lstm'
        parameters['opt_method'] = 'bayesian'  # 'rand_hpc', 'bayesian'
        parameters['max_eval'] = 1
    else:
        # Catch parms by console.
        # BUG FIX: the optstring was "h:f:m:e:o:", which made -h require
        # an argument; -h is a flag and takes none.
        try:
            opts, _ = getopt.getopt(argv, "hf:m:e:o:",
                                    ['file_name=', 'model_family=',
                                     'max_eval=', 'opt_method='])
            for opt, arg in opts:
                key = catch_parameter(opt)
                if key is None:
                    # Skip unmapped flags instead of storing them under None.
                    print('Unknown option ' + opt + ' ignored')
                    continue
                if key == 'max_eval':
                    parameters[key] = int(arg)
                else:
                    parameters[key] = arg
        except getopt.GetoptError:
            print('Invalid option')
            sys.exit(2)
    # Similarity btw the resources profile execution (Song et al.)
    parameters['rp_sim'] = 0.85
    parameters['batch_size'] = 32  # Usually 32/64/128/256
    parameters['norm_method'] = ['max', 'lognorm']
    parameters['imp'] = 1
    parameters['epochs'] = 200
    parameters['n_size'] = [5, 10, 15]
    parameters['l_size'] = [50, 100]
    parameters['lstm_act'] = ['selu', 'tanh']
    # Candidate architectures per model family.
    family_types = {
        'lstm': ['shared_cat', 'concatenated'],
        'gru': ['shared_cat_gru', 'concatenated_gru'],
        'lstm_cx': ['shared_cat_cx', 'concatenated_cx'],
        'gru_cx': ['shared_cat_gru_cx', 'concatenated_gru_cx']}
    family = parameters.get('model_family')
    if family not in family_types:
        # BUG FIX: an unknown family used to leave 'model_type' unset and
        # crash later with an opaque KeyError; fail fast with a clear error.
        raise ValueError('Unknown model family: {}'.format(family))
    parameters['model_type'] = family_types[family]
    parameters['dense_act'] = ['linear']
    parameters['optim'] = ['Nadam']

    # NOTE(review): 'model_type' is always a list here, so this branch is
    # currently unreachable; kept for fidelity with the original flow.
    if parameters['model_type'] == 'simple_gan':
        parameters['gan_pretrain'] = False
    parameters.pop('model_family', None)
    # Train models (deferred heavy import keeps module import cheap)
    from model_training import model_trainer as tr
    tr.ModelTrainer(parameters)


if __name__ == "__main__":
    main(sys.argv[1:])
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 16:24:38 2020

@author: Manuel Camargo
"""
from model_prediction import next_event_samples_creator as nesc
from model_prediction import suffix_samples_creator as ssc


from model_prediction import next_event_predictor as nep
from model_prediction import suffix_predictor as sp
from model_prediction import event_log_predictor as elp


class SamplesCreator:
    """Pick the sample-creation strategy matching a predictive activity
    and hand it to the predictor."""

    def create(self, predictor, activity):
        predictor.sampling(self._get_samples_creator(activity))

    def _get_samples_creator(self, activity):
        # Dispatch table: activity name -> samples-creator class.
        creators = {'predict_next': nesc.NextEventSamplesCreator,
                    'pred_sfx': ssc.SuffixSamplesCreator}
        if activity not in creators:
            raise ValueError(activity)
        return creators[activity]()


class PredictionTasksExecutioner:
    """Pick the prediction strategy matching a predictive activity
    and run it through the predictor."""

    def predict(self, predictor, activity, run_num):
        predictor.predict(self._get_predictor(activity), run_num)

    def _get_predictor(self, activity):
        # Dispatch table: activity name -> predictor class.
        executioners = {'predict_next': nep.NextEventPredictor,
                        'pred_sfx': sp.SuffixPredictor,
                        'pred_log': elp.EventLogPredictor}
        if activity not in executioners:
            raise ValueError(activity)
        return executioners[activity]()
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 10:49:28 2020

@author: Manuel Camargo
"""
import os
import json
import copy

import pandas as pd
import numpy as np
import configparser as cp

import readers.log_reader as lr
import utils.support as sup

from model_training import features_manager as feat
from model_prediction import interfaces as it
import analyzers.sim_evaluator as ev


class ModelPredictor():
    """
    Main class in charge of model evaluation: it loads the parameters
    saved at training time, restores the test log, runs the requested
    predictive task (next event, suffix, or full-log generation) and
    exports predictions and evaluation results.
    """

    def __init__(self, parms):
        self.output_route = os.path.join('output_files', parms['folder'])
        self.parms = parms
        # Load training-time parameters (indexes, scale args, dims, ...)
        self.load_parameters()
        self.model_name = os.path.join(self.output_route, parms['model_file'])
        self.log = self.load_log_test(self.output_route, self.parms)

        self.samples = dict()
        self.predictions = None
        self.sim_values = list()

        self.model_def = dict()
        self.read_model_definition(self.parms['model_type'])
        self.parms['additional_columns'] = self.model_def['additional_columns']
        # NOTE(review): execute_predictive_task has no return statement, so
        # self.acc is always None; kept for interface compatibility.
        self.acc = self.execute_predictive_task()

    def execute_predictive_task(self):
        """Prepare the samples, run the predictions, and evaluate them."""
        # create examples for next event and suffix
        if self.parms['activity'] == 'pred_log':
            self.parms['num_cases'] = len(self.log.caseid.unique())
            self.parms['start_time'] = self.log.start_timestamp.min()
        else:
            feat_mannager = feat.FeaturesMannager(self.parms)
            feat_mannager.register_scaler(self.parms['model_type'],
                                          self.model_def['vectorizer'])
            self.log, _ = feat_mannager.calculate(
                self.log, self.parms['additional_columns'])
            sampler = it.SamplesCreator()
            sampler.create(self, self.parms['activity'])
        # predict
        self.imp = self.parms['variant']
        for run_num in range(self.parms['rep']):
            self.predict_values(run_num)
            # export predictions
            self.export_predictions(run_num)
            # assessment
            evaluator = EvaluateTask()
            if self.parms['activity'] == 'pred_log':
                self.sim_values.extend(
                    evaluator.evaluate(self.parms,
                                       self.log,
                                       self.predictions,
                                       run_num))
            else:
                # BUG FIX: EvaluateTask.evaluate expects
                # (parms, log, predictions, rep_num); it was previously
                # called with only two arguments, raising a TypeError.
                evaluator.evaluate(self.parms, self.log,
                                   self.predictions, run_num)
        self._export_results(self.output_route)

    def predict_values(self, run_num):
        """Run the prediction executioner for the configured activity."""
        executioner = it.PredictionTasksExecutioner()
        executioner.predict(self, self.parms['activity'], run_num)

    @staticmethod
    def load_log_test(output_route, parms):
        """Read the test split saved at training time, dropping the
        artificial Start/End events."""
        df_test = lr.LogReader(
            os.path.join(output_route, 'parameters', 'test_log.csv'),
            parms['read_options'])
        df_test = pd.DataFrame(df_test.data)
        df_test = df_test[~df_test.task.isin(['Start', 'End'])]
        return df_test

    def load_parameters(self):
        """Load and type-cast the parameters persisted during training,
        merging them into self.parms."""
        path = os.path.join(self.output_route,
                            'parameters',
                            'model_parameters.json')
        with open(path) as file:
            data = json.load(file)
        if 'activity' in data:
            del data['activity']
        parms = {k: v for k, v in data.items()}
        # 'rep' is a runtime setting; never inherit it from training
        parms.pop('rep', None)
        self.parms = {**self.parms, **parms}
        if 'dim' in data.keys():
            self.parms['dim'] = {k: int(v) for k, v in data['dim'].items()}
        # JSON stores everything as strings; cast the scale args to float
        if self.parms['one_timestamp']:
            self.parms['scale_args'] = {
                k: float(v) for k, v in data['scale_args'].items()}
        else:
            for key in data['scale_args'].keys():
                self.parms['scale_args'][key] = {
                    k: float(v) for k, v in data['scale_args'][key].items()}
        self.parms['index_ac'] = {int(k): v
                                  for k, v in data['index_ac'].items()}
        self.parms['index_rl'] = {int(k): v
                                  for k, v in data['index_rl'].items()}
        # Reverse lookups: label -> index
        self.ac_index = {v: k for k, v in self.parms['index_ac'].items()}
        self.rl_index = {v: k for k, v in self.parms['index_rl'].items()}

    def sampling(self, sampler):
        """Create the evaluation samples with the given sampler."""
        sampler.register_sampler(self.parms['model_type'],
                                 self.model_def['vectorizer'])
        self.samples = sampler.create_samples(
            self.parms, self.log, self.ac_index,
            self.rl_index, self.model_def['additional_columns'])

    def predict(self, executioner, run_num):
        """Execute the predictions and store them as a DataFrame."""
        results = executioner.predict(self.parms,
                                      self.model_name,
                                      self.samples,
                                      self.imp,
                                      self.model_def['vectorizer'])
        self.predictions = pd.DataFrame(results)

    def export_predictions(self, r_num):
        """Write the predictions of repetition r_num as a CSV file."""
        if not os.path.exists(self.output_route):
            os.makedirs(self.output_route)
        self.predictions.to_csv(
            os.path.join(
                self.output_route, 'gen_' +
                self.parms['model_file'].split('.')[0]+'_'+str(r_num+1)+'.csv'),
            index=False)

    @staticmethod
    def scale_feature(log, feature, parms, replace=False):
        """Scales a number given a technique.
        Args:
            log: Event-log to be scaled.
            feature: Feature to be scaled.
            parms: dict with 'norm_method' (max, lognorm, normal,
                standard, or None) and the matching 'scale_args'.
            replace (optional): replace the original value or keep both.
        Returns:
            Log with a new <feature>_norm column scaled between 0 and 1.
        """
        method = parms['norm_method']
        scale_args = parms['scale_args']
        if method == 'lognorm':
            log[feature + '_log'] = np.log1p(log[feature])
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            log[feature+'_norm'] = np.divide(
                np.subtract(log[feature+'_log'], min_value),
                (max_value - min_value))
            log = log.drop((feature + '_log'), axis=1)
        elif method == 'normal':
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            log[feature+'_norm'] = np.divide(
                np.subtract(log[feature], min_value),
                (max_value - min_value))
        elif method == 'standard':
            mean = scale_args['mean']
            std = scale_args['std']
            log[feature + '_norm'] = np.divide(np.subtract(log[feature], mean),
                                               std)
        elif method == 'max':
            max_value = scale_args['max_value']
            log[feature + '_norm'] = (np.divide(log[feature], max_value)
                                      if max_value > 0 else 0)
        elif method is None:
            log[feature+'_norm'] = log[feature]
        else:
            raise ValueError(method)
        if replace:
            log = log.drop(feature, axis=1)
        return log

    def read_model_definition(self, model_type):
        """Read the model specification (extra columns, vectorizer) for
        the given model type from models_spec.ini."""
        Config = cp.ConfigParser(interpolation=None)
        Config.read('models_spec.ini')
        # File name with extension
        self.model_def['additional_columns'] = sup.reduce_list(
            Config.get(model_type, 'additional_columns'), dtype='str')
        self.model_def['vectorizer'] = Config.get(model_type, 'vectorizer')

    def _export_results(self, output_path) -> None:
        """Persist similarity values and the filtered test log."""
        # Save results
        pd.DataFrame(self.sim_values).to_csv(
            os.path.join(self.output_route, sup.file_id(prefix='SE_')),
            index=False)
        # Save logs
        log_test = self.log[~self.log.task.isin(['Start', 'End'])]
        log_test.to_csv(
            os.path.join(self.output_route, 'tst_' +
                         self.parms['model_file'].split('.')[0]+'.csv'),
            index=False)
class EvaluateTask():
    """Dispatches the evaluation of predictions according to the activity.

    NOTE(review): ``save_results`` is called but not defined in this chunk;
    it is expected to be provided elsewhere in this module.
    """

    def evaluate(self, parms, log, predictions, rep_num):
        """Evaluate *predictions* for the activity configured in *parms*.

        Args:
            parms: experiment parameters (must contain 'activity').
            log: ground-truth event log.
            predictions: predictions DataFrame.
            rep_num: repetition number.
        Returns:
            The activity-specific quality measure.
        """
        evaluator = self._get_evaluator(parms['activity'])
        # BUGFIX: the handlers have different signatures; the original code
        # called every handler as (parms, log, predictions, rep_num), which
        # raises TypeError for 'predict_next'/'pred_sfx' whose signature is
        # (data, parms, rep_num). Adapt the arguments per activity instead.
        if parms['activity'] == 'pred_log':
            return evaluator(parms, log, predictions, rep_num)
        return evaluator(predictions, parms, rep_num)

    def _get_evaluator(self, activity):
        """Map an activity name to its evaluation handler."""
        if activity == 'predict_next':
            return self._evaluate_predict_next
        elif activity == 'pred_sfx':
            return self._evaluate_pred_sfx
        elif activity == 'pred_log':
            return self._evaluate_predict_log
        else:
            raise ValueError(activity)

    def _evaluate_predict_next(self, data, parms, rep_num):
        """Measure accuracy (and time MAE) of next-event predictions."""
        exp_desc = self.clean_parameters(parms.copy())
        evaluator = ev.Evaluator(parms['one_timestamp'])
        ac_sim = evaluator.measure('accuracy', data, 'ac')
        rl_sim = evaluator.measure('accuracy', data, 'rl')
        mean_ac = ac_sim.accuracy.mean()
        # replicate the experiment description once per measured row
        exp_desc = pd.DataFrame([exp_desc])
        exp_desc = pd.concat([exp_desc] * len(ac_sim), ignore_index=True)
        ac_sim = pd.concat([ac_sim, exp_desc], axis=1).to_dict('records')
        rl_sim = pd.concat([rl_sim, exp_desc], axis=1).to_dict('records')
        self.save_results(ac_sim, 'ac', parms)
        self.save_results(rl_sim, 'rl', parms)
        if parms['one_timestamp']:
            tm_mae = evaluator.measure('mae_next', data, 'tm')
            tm_mae = pd.concat([tm_mae, exp_desc], axis=1).to_dict('records')
            self.save_results(tm_mae, 'tm', parms)
        else:
            dur_mae = evaluator.measure('mae_next', data, 'dur')
            wait_mae = evaluator.measure('mae_next', data, 'wait')
            dur_mae = pd.concat([dur_mae, exp_desc], axis=1).to_dict('records')
            wait_mae = pd.concat([wait_mae, exp_desc],
                                 axis=1).to_dict('records')
            self.save_results(dur_mae, 'dur', parms)
            self.save_results(wait_mae, 'wait', parms)
        return mean_ac

    def _evaluate_pred_sfx(self, data, parms, rep_num):
        """Measure suffix similarity (and time MAE) of suffix predictions."""
        exp_desc = self.clean_parameters(parms.copy())
        evaluator = ev.Evaluator(parms['one_timestamp'])
        ac_sim = evaluator.measure('similarity', data, 'ac')
        rl_sim = evaluator.measure('similarity', data, 'rl')
        mean_sim = ac_sim['mean'].mean()
        exp_desc = pd.DataFrame([exp_desc])
        exp_desc = pd.concat([exp_desc] * len(ac_sim), ignore_index=True)
        ac_sim = pd.concat([ac_sim, exp_desc], axis=1).to_dict('records')
        rl_sim = pd.concat([rl_sim, exp_desc], axis=1).to_dict('records')
        self.save_results(ac_sim, 'ac', parms)
        self.save_results(rl_sim, 'rl', parms)
        if parms['one_timestamp']:
            tm_mae = evaluator.measure('mae_suffix', data, 'tm')
            tm_mae = pd.concat([tm_mae, exp_desc], axis=1).to_dict('records')
            self.save_results(tm_mae, 'tm', parms)
        else:
            dur_mae = evaluator.measure('mae_suffix', data, 'dur')
            wait_mae = evaluator.measure('mae_suffix', data, 'wait')
            dur_mae = pd.concat([dur_mae, exp_desc], axis=1).to_dict('records')
            wait_mae = pd.concat([wait_mae, exp_desc],
                                 axis=1).to_dict('records')
            self.save_results(dur_mae, 'dur', parms)
            self.save_results(wait_mae, 'wait', parms)
        return mean_sim

    @staticmethod
    def _evaluate_predict_log(parms, log, sim_log, rep_num):
        """Compare a generated log against the ground-truth log.

        Args:
            parms (dict): experiment settings.
            log: ground-truth event log.
            sim_log: generated (simulated) event log.
            rep_num (int): repetition number.
        Returns:
            list: one dict of similarity metrics for this run.
        """
        import copy  # local: module-level import lives outside this chunk
        sim_values = list()
        log = copy.deepcopy(log)
        log = log[~log.task.isin(['Start', 'End', 'start', 'end'])]
        log['caseid'] = log['caseid'].astype(str)
        log['caseid'] = 'Case' + log['caseid']
        sim_log = sim_log[~sim_log.task.isin(['Start', 'End', 'start', 'end'])]
        evaluator = ev.SimilarityEvaluator(log, sim_log, parms)
        metrics = ['tsd', 'day_hour_emd', 'log_mae', 'dl', 'mae']
        for metric in metrics:
            evaluator.measure_distance(metric)
        sim_values.append({**{'run_num': rep_num}, **evaluator.similarity})
        return sim_values

    @staticmethod
    def clean_parameters(parms):
        """Strip bookkeeping keys so only the experiment settings remain."""
        exp_desc = parms.copy()
        for key in ('activity', 'read_options', 'column_names',
                    'one_timestamp', 'reorder', 'index_ac', 'index_rl',
                    'dim', 'max_dur', 'variants', 'is_single_exec'):
            exp_desc.pop(key, None)
        return exp_desc


# ---------------------------------------------------------------------------
# model_prediction/next_event_predictor.py
# Created on Tue Mar 17 20:35:53 2020  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import numpy as np

try:
    import utils.support as sup
except ImportError:  # NOTE(review): keeps the module importable standalone
    sup = None


class NextEventPredictor():
    """Predicts the next event (activity, role, time) for a set of prefixes."""

    def __init__(self):
        """constructor"""
        self.model = None     # keras model, injected in predict()
        self.spl = dict()     # samples: prefixes and expected next events
        self.imp = 'arg_max'  # next-event selection method

    def predict(self, params, model, spl, imp, vectorizer):
        """Run the model-specific predictor over the samples."""
        self.model = model
        self.spl = spl
        self.imp = imp
        predictor = self._get_predictor(params['model_type'])
        sup.print_performed_task('Predicting next events')
        return predictor(params, vectorizer)

    def _get_predictor(self, model_type):
        # OJO: extension point in case a different predictor is necessary
        return self._predict_next_event_shared_cat
41 | """ 42 | # Generation of predictions 43 | results = list() 44 | for i, _ in enumerate(self.spl['prefixes']['activities']): 45 | # Activities and roles input shape(1,5) 46 | x_ac_ngram = (np.append( 47 | np.zeros(parameters['dim']['time_dim']), 48 | np.array(self.spl['prefixes']['activities'][i]), 49 | axis=0)[-parameters['dim']['time_dim']:] 50 | .reshape((1, parameters['dim']['time_dim']))) 51 | 52 | x_rl_ngram = (np.append( 53 | np.zeros(parameters['dim']['time_dim']), 54 | np.array(self.spl['prefixes']['roles'][i]), 55 | axis=0)[-parameters['dim']['time_dim']:] 56 | .reshape((1, parameters['dim']['time_dim']))) 57 | 58 | # times input shape(1,5,1) 59 | times_attr_num = (self.spl['prefixes']['times'][i].shape[1]) 60 | x_t_ngram = np.array( 61 | [np.append(np.zeros( 62 | (parameters['dim']['time_dim'], times_attr_num)), 63 | self.spl['prefixes']['times'][i], axis=0) 64 | [-parameters['dim']['time_dim']:] 65 | .reshape((parameters['dim']['time_dim'], times_attr_num))] 66 | ) 67 | 68 | # add intercase features if necessary 69 | if vectorizer in ['basic']: 70 | inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram] 71 | elif vectorizer in ['inter']: 72 | # times input shape(1,5,1) 73 | inter_attr_num = (self.spl['prefixes']['inter_attr'][i] 74 | .shape[1]) 75 | x_inter_ngram = np.array( 76 | [np.append(np.zeros(( 77 | parameters['dim']['time_dim'], inter_attr_num)), 78 | self.spl['prefixes']['inter_attr'][i], axis=0) 79 | [-parameters['dim']['time_dim']:] 80 | .reshape( 81 | (parameters['dim']['time_dim'], inter_attr_num))] 82 | ) 83 | inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram] 84 | # predict 85 | preds = self.model.predict(inputs) 86 | if self.imp == 'random_choice': 87 | # Use this to get a random choice following as PDF 88 | pos = np.random.choice(np.arange(0, len(preds[0][0])), 89 | p=preds[0][0]) 90 | pos1 = np.random.choice(np.arange(0, len(preds[1][0])), 91 | p=preds[1][0]) 92 | elif self.imp == 'arg_max': 93 | # Use this to get the max prediction 
94 | pos = np.argmax(preds[0][0]) 95 | pos1 = np.argmax(preds[1][0]) 96 | 97 | # save results 98 | predictions = [pos, pos1, preds[2][0][0]] 99 | if not parameters['one_timestamp']: 100 | predictions.extend([preds[2][0][1]]) 101 | results.append( 102 | self.create_result_record(i, self.spl, predictions, parameters)) 103 | sup.print_done_task() 104 | return results 105 | 106 | def create_result_record(self, index, spl, preds, parms): 107 | record = dict() 108 | record['ac_prefix'] = spl['prefixes']['activities'][index] 109 | record['ac_expect'] = spl['next_evt']['activities'][index] 110 | record['ac_pred'] = preds[0] 111 | record['rl_prefix'] = spl['prefixes']['roles'][index] 112 | record['rl_expect'] = spl['next_evt']['roles'][index] 113 | record['rl_pred'] = preds[1] 114 | if parms['one_timestamp']: 115 | record['tm_prefix'] = [self.rescale( 116 | x, parms, parms['scale_args']) 117 | for x in spl['prefixes']['times'][index]] 118 | record['tm_expect'] = self.rescale( 119 | spl['next_evt']['times'][index][0], 120 | parms, parms['scale_args']) 121 | record['tm_pred'] = self.rescale( 122 | preds[2], parms, parms['scale_args']) 123 | else: 124 | # Duration 125 | record['dur_prefix'] = [self.rescale( 126 | x[0], parms, parms['scale_args']['dur']) 127 | for x in spl['prefixes']['times'][index]] 128 | record['dur_expect'] = self.rescale( 129 | spl['next_evt']['times'][index][0], parms, 130 | parms['scale_args']['dur']) 131 | record['dur_pred'] = self.rescale( 132 | preds[2], parms, parms['scale_args']['dur']) 133 | # Waiting 134 | record['wait_prefix'] = [self.rescale( 135 | x[1], parms, parms['scale_args']['wait']) 136 | for x in spl['prefixes']['times'][index]] 137 | record['wait_expect'] = self.rescale( 138 | spl['next_evt']['times'][index][1], parms, 139 | parms['scale_args']['wait']) 140 | record['wait_pred'] = self.rescale( 141 | preds[3], parms, parms['scale_args']['wait']) 142 | return record 143 | 144 | @staticmethod 145 | def rescale(value, parms, scale_args): 
# ---------------------------------------------------------------------------
# model_prediction/suffix_predictor.py
# Created on Wed Mar 18 10:35:37 2020  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import numpy as np

try:
    import utils.support as sup
except ImportError:  # NOTE(review): keeps the module importable standalone
    sup = None


class SuffixPredictor():
    """Generates whole business-process suffixes with a trained keras model."""

    def __init__(self):
        """constructor"""
        self.model = None        # keras model, injected in predict()
        self.spl = dict()        # samples: prefixes and expected suffixes
        self.imp = 'arg_max'     # next-event selection method
        self.max_trace_size = 0  # hard stop for suffix generation

    def predict(self, params, model, spl, imp, vectorizer):
        """Predict the suffix of every prefix in *spl*."""
        self.model = model
        self.spl = spl
        self.max_trace_size = params['max_trace_size']
        self.imp = imp
        predictor = self._get_predictor(params['model_type'])
        sup.print_performed_task('Predicting suffixes')
        return predictor(params, vectorizer)

    def _get_predictor(self, model_type):
        # OJO: extension point in case a different predictor is necessary
        return self._predict_suffix_shared_cat

    def _predict_suffix_shared_cat(self, parms, vectorizer):
        """Generate business process suffixes using a keras trained model.

        Args:
            parms (dict): execution parameters ('dim', 'index_ac', ...).
            vectorizer (str): sample vectorizer ('basic' or 'inter').
        Returns:
            list: one result record per prefix.
        """
        results = list()
        for i, _ in enumerate(self.spl['prefixes']['activities']):
            # Activities and roles input shape (1, time_dim)
            x_ac_ngram = (np.append(
                np.zeros(parms['dim']['time_dim']),
                np.array(self.spl['prefixes']['activities'][i]),
                axis=0)[-parms['dim']['time_dim']:]
                .reshape((1, parms['dim']['time_dim'])))

            x_rl_ngram = (np.append(
                np.zeros(parms['dim']['time_dim']),
                np.array(self.spl['prefixes']['roles'][i]),
                axis=0)[-parms['dim']['time_dim']:]
                .reshape((1, parms['dim']['time_dim'])))

            times_attr_num = self.spl['prefixes']['times'][i].shape[1]
            x_t_ngram = np.array(
                [np.append(np.zeros(
                    (parms['dim']['time_dim'], times_attr_num)),
                    self.spl['prefixes']['times'][i], axis=0)
                    [-parms['dim']['time_dim']:]
                    .reshape((parms['dim']['time_dim'], times_attr_num))])
            if vectorizer in ['basic']:
                inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
            elif vectorizer in ['inter']:
                inter_attr_num = self.spl['prefixes']['inter_attr'][i].shape[1]
                x_inter_ngram = np.array([np.append(
                    np.zeros((parms['dim']['time_dim'], inter_attr_num)),
                    self.spl['prefixes']['inter_attr'][i],
                    axis=0)[-parms['dim']['time_dim']:]
                    .reshape((parms['dim']['time_dim'], inter_attr_num))])
                inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram]
            else:
                # BUGFIX: an unknown vectorizer previously left `inputs`
                # unbound (NameError at predict time).
                raise ValueError(vectorizer)

            pref_size = len(self.spl['prefixes']['activities'][i])
            acum_dur, acum_wait = list(), list()
            ac_suf, rl_suf = list(), list()
            for _ in range(1, self.max_trace_size):
                preds = self.model.predict(inputs)
                if self.imp == 'random_choice':
                    # random choice following the predicted distribution
                    pos = np.random.choice(
                        np.arange(0, len(preds[0][0])), p=preds[0][0])
                    pos1 = np.random.choice(
                        np.arange(0, len(preds[1][0])), p=preds[1][0])
                elif self.imp == 'arg_max':
                    # maximum-probability prediction
                    pos = np.argmax(preds[0][0])
                    pos1 = np.argmax(preds[1][0])
                else:
                    # BUGFIX: unknown selection methods previously fell
                    # through and raised NameError on `pos`.
                    raise ValueError(self.imp)
                # slide the n-grams one step with the new prediction
                x_ac_ngram = np.append(x_ac_ngram, [[pos]], axis=1)
                x_ac_ngram = np.delete(x_ac_ngram, 0, 1)
                x_rl_ngram = np.append(x_rl_ngram, [[pos1]], axis=1)
                x_rl_ngram = np.delete(x_rl_ngram, 0, 1)
                x_t_ngram = np.append(x_t_ngram, [preds[2]], axis=1)
                x_t_ngram = np.delete(x_t_ngram, 0, 1)
                if vectorizer in ['basic']:
                    inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
                elif vectorizer in ['inter']:
                    x_inter_ngram = np.append(x_inter_ngram, [preds[3]],
                                              axis=1)
                    x_inter_ngram = np.delete(x_inter_ngram, 0, 1)
                    inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram,
                              x_inter_ngram]
                # Stop if the next prediction is the end of the trace,
                # otherwise continue until the defined max_size
                ac_suf.append(pos)
                rl_suf.append(pos1)
                acum_dur.append(preds[2][0][0])
                if not parms['one_timestamp']:
                    acum_wait.append(preds[2][0][1])
                if parms['index_ac'][pos] == 'end':
                    break
            # save results
            predictions = [ac_suf, rl_suf, acum_dur]
            if not parms['one_timestamp']:
                predictions.extend([acum_wait])
            results.append(
                self.create_result_record(i, self.spl, predictions, parms,
                                          pref_size))
        sup.print_done_task()
        return results

    def create_result_record(self, index, spl, preds, parms, pref_size):
        """Assemble prefix, expected suffix and predicted suffix (rescaled)."""
        record = dict()
        record['pref_size'] = pref_size
        record['ac_prefix'] = spl['prefixes']['activities'][index]
        record['ac_expect'] = spl['next_evt']['activities'][index]
        record['ac_pred'] = preds[0]
        record['rl_prefix'] = spl['prefixes']['roles'][index]
        record['rl_expect'] = spl['next_evt']['roles'][index]
        record['rl_pred'] = preds[1]
        if parms['one_timestamp']:
            record['tm_prefix'] = [self.rescale(
                x[0], parms, parms['scale_args'])
                for x in spl['prefixes']['times'][index]]
            record['tm_expect'] = [self.rescale(
                x[0], parms, parms['scale_args'])
                for x in spl['next_evt']['times'][index]]
            record['tm_pred'] = [self.rescale(
                x, parms, parms['scale_args'])
                for x in preds[2]]
        else:
            # Duration
            record['dur_prefix'] = [self.rescale(
                x[0], parms, parms['scale_args']['dur'])
                for x in spl['prefixes']['times'][index]]
            record['dur_expect'] = [self.rescale(
                x[0], parms, parms['scale_args']['dur'])
                for x in spl['next_evt']['times'][index]]
            record['dur_pred'] = [self.rescale(
                x, parms, parms['scale_args']['dur'])
                for x in preds[2]]
            # Waiting
            record['wait_prefix'] = [self.rescale(
                x[1], parms, parms['scale_args']['wait'])
                for x in spl['prefixes']['times'][index]]
            record['wait_expect'] = [self.rescale(
                x[1], parms, parms['scale_args']['wait'])
                for x in spl['next_evt']['times'][index]]
            record['wait_pred'] = [self.rescale(
                x, parms, parms['scale_args']['wait'])
                for x in preds[3]]
        return record

    @staticmethod
    def rescale(value, parms, scale_args):
        """Invert the feature scaling applied at training time."""
        method = parms['norm_method']
        if method == 'lognorm':
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            value = (value * (max_value - min_value)) + min_value
            value = np.expm1(value)
        elif method == 'normal':
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            value = (value * (max_value - min_value)) + min_value
        elif method == 'standard':
            value = (value * scale_args['std']) + scale_args['mean']
        elif method == 'max':
            value = np.rint(value * scale_args['max_value'])
        elif method is None:
            pass  # stored unscaled
        else:
            raise ValueError(method)
        return value
# ---------------------------------------------------------------------------
# model_prediction/suffix_samples_creator.py
# Created on Wed Mar 18 10:03:26 2020  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import itertools

import pandas as pd
import numpy as np


class SuffixSamplesCreator():
    """
    This is the main class in charge of creating the prefix/suffix samples
    used for suffix prediction.
    """

    def __init__(self):
        self.log = pd.DataFrame   # placeholder; set in create_samples
        self.ac_index = dict()
        self.rl_index = dict()
        self._samplers = dict()
        self._samp_dispatcher = {'basic': self._sample_suffix,
                                 'inter': self._sample_suffix_inter}

    def create_samples(self, params, log, ac_index, rl_index, add_cols):
        """Build the samples for the configured model type."""
        self.log = log
        self.ac_index = ac_index
        self.rl_index = rl_index
        columns = self.define_columns(add_cols, params['one_timestamp'])
        sampler = self._get_model_specific_sampler(params['model_type'])
        return sampler(columns, params)

    @staticmethod
    def define_columns(add_cols, one_timestamp):
        """Return the log columns needed for sampling.

        'weekday' is used as-is; every other additional column is expected
        in its scaled ('_norm') version.
        """
        columns = ['ac_index', 'rl_index', 'dur_norm']
        add_cols = [x + '_norm' if x != 'weekday' else x for x in add_cols]
        columns.extend(add_cols)
        if not one_timestamp:
            columns.extend(['wait_norm'])
        return columns

    def register_sampler(self, model_type, sampler):
        """Associate a model type with one of the dispatcher samplers."""
        try:
            self._samplers[model_type] = self._samp_dispatcher[sampler]
        except KeyError:
            raise ValueError(sampler)

    def _get_model_specific_sampler(self, model_type):
        sampler = self._samplers.get(model_type)
        if not sampler:
            raise ValueError(model_type)
        return sampler

    def _sample_suffix(self, columns, parms):
        """
        Extraction of prefixes and expected suffixes from the event log.

        Args:
            columns: log columns to vectorize.
            parms: execution parameters ('one_timestamp', ...).
        Returns:
            dict: prefixes and expected suffixes ('next_evt').
        """
        # BUGFIX: removed leftover debug `print(columns)`
        times = (['dur_norm'] if parms['one_timestamp']
                 else ['dur_norm', 'wait_norm'])
        equi = {'ac_index': 'activities', 'rl_index': 'roles'}
        vec = {'prefixes': dict(),
               'next_evt': dict()}
        x_times_dict = dict()
        y_times_dict = dict()
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        # n-gram definition
        for i, _ in enumerate(self.log):
            for x in columns:
                serie, y_serie = list(), list()
                for idx in range(1, len(self.log[i][x])):
                    serie.append(self.log[i][x][:idx])
                    y_serie.append(self.log[i][x][idx:])
                if x in list(equi.keys()):
                    vec['prefixes'][equi[x]] = (
                        vec['prefixes'][equi[x]] + serie
                        if i > 0 else serie)
                    vec['next_evt'][equi[x]] = (
                        vec['next_evt'][equi[x]] + y_serie
                        if i > 0 else y_serie)
                elif x in times:
                    x_times_dict[x] = (
                        x_times_dict[x] + serie if i > 0 else serie)
                    y_times_dict[x] = (
                        y_times_dict[x] + y_serie if i > 0 else y_serie)
        # Reshape times (prefixes, n-gram size, # attributes)
        vec['prefixes']['times'] = list()
        x_times_dict = pd.DataFrame(x_times_dict)
        for row in x_times_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            vec['prefixes']['times'].append(new_row)
        # Reshape expected times (prefixes, # attributes)
        vec['next_evt']['times'] = list()
        y_times_dict = pd.DataFrame(y_times_dict)
        for row in y_times_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            vec['next_evt']['times'].append(new_row)
        return vec

    def _sample_suffix_inter(self, columns, parms):
        """Like _sample_suffix, but also vectorizes intercase attributes.

        NOTE(review): this variant keys the expected values as 'suffixes'
        while _sample_suffix uses 'next_evt' — confirm consumers expect
        that asymmetry.
        """
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        spl = {'prefixes': dict(), 'suffixes': dict()}
        # n-gram definition
        equi = {'ac_index': 'activities',
                'rl_index': 'roles',
                'dur_norm': 'times'}
        x_inter_dict, y_inter_dict = dict(), dict()
        for i, _ in enumerate(self.log):
            for x in columns:
                serie, y_serie = list(), list()
                for idx in range(1, len(self.log[i][x])):
                    serie.append(self.log[i][x][:idx])
                    y_serie.append(self.log[i][x][idx:])
                if x in list(equi.keys()):
                    spl['prefixes'][equi[x]] = (
                        spl['prefixes'][equi[x]] + serie
                        if i > 0 else serie)
                    spl['suffixes'][equi[x]] = (
                        spl['suffixes'][equi[x]] + y_serie
                        if i > 0 else y_serie)
                else:
                    x_inter_dict[x] = (
                        x_inter_dict[x] + serie if i > 0 else serie)
                    y_inter_dict[x] = (
                        y_inter_dict[x] + y_serie if i > 0 else y_serie)
        # Reshape intercase attributes (prefixes, n-gram size, # attributes)
        spl['prefixes']['inter_attr'] = list()
        x_inter_dict = pd.DataFrame(x_inter_dict)
        for row in x_inter_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            spl['prefixes']['inter_attr'].append(new_row)
        # Reshape intercase expected attributes (prefixes, # attributes)
        spl['suffixes']['inter_attr'] = list()
        y_inter_dict = pd.DataFrame(y_inter_dict)
        for row in y_inter_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            spl['suffixes']['inter_attr'].append(new_row)
        return spl

    # =========================================================================
    # Reformat
    # =========================================================================
    def reformat_events(self, columns, one_timestamp):
        """Create series of activities, roles and relative times per trace.

        Args:
            columns: log columns to serialize.
            one_timestamp (bool): whether the log has a single timestamp.
        Returns:
            list: one dict per trace with the per-column series.
        """
        temp_data = list()
        log_df = self.log.to_dict('records')
        key = 'end_timestamp' if one_timestamp else 'start_timestamp'
        # BUGFIX: the original sorted by the literal string `key` instead of
        # each event's timestamp value, so events were never time-ordered.
        log_df = sorted(log_df, key=lambda x: (x['caseid'], x[key]))
        for key, group in itertools.groupby(log_df, key=lambda x: x['caseid']):
            trace = list(group)
            temp_dict = dict()
            for x in columns:
                serie = [y[x] for y in trace]
                if x == 'ac_index':
                    serie.insert(0, self.ac_index[('start')])
                    serie.append(self.ac_index[('end')])
                elif x == 'rl_index':
                    serie.insert(0, self.rl_index[('start')])
                    serie.append(self.rl_index[('end')])
                else:
                    serie.insert(0, 0)
                    serie.append(0)
                temp_dict = {**{x: serie}, **temp_dict}
            temp_dict = {**{'caseid': key}, **temp_dict}
            temp_data.append(temp_dict)
        return temp_data


# ---------------------------------------------------------------------------
# model_training/__init__.py  (empty)
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# model_training/embedding_training.py
# Created on Wed Nov 21 21:23:55 2018  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import os
import random
import itertools
import math
import numpy as np

# Heavy/optional dependencies are guarded so the module can be imported in
# environments without keras or the project utilities; training itself still
# requires them at runtime.
try:
    from keras.models import Model
    from keras.layers import Input, Embedding, Dot, Reshape
except ImportError:
    Model = Input = Embedding = Dot = Reshape = None

try:
    import utils.support as sup
except ImportError:
    sup = None


def training_model(parameters, log, ac_index, index_ac, rl_index, index_rl):
    """Main method of the embedding training module.

    Trains activity/role embeddings from the event log and exports them as
    .emb files under input_files/embedded_matix.

    Args:
        parameters (dict): parameters for training the embedding network
            (must contain 'file_name').
        log: event log (DataFrame).
        ac_index / index_ac: activity index and its inverse.
        rl_index / index_rl: role index and its inverse.
    """
    # Define the number of dimensions as the 4th root of the number of
    # (activity, role) combinations
    dim_number = math.ceil(
        len(list(itertools.product(*[list(ac_index.items()),
                                     list(rl_index.items())])))**0.25)

    ac_weights, rl_weights = train_embedded(log, ac_index, rl_index,
                                            dim_number)

    out_dir = os.path.join('input_files', 'embedded_matix')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    base = parameters['file_name'].split('.')[0]
    sup.create_file_from_list(
        reformat_matrix(index_ac, ac_weights),
        os.path.join(out_dir, 'ac_' + base + '.emb'))
    sup.create_file_from_list(
        reformat_matrix(index_rl, rl_weights),
        os.path.join(out_dir, 'rl_' + base + '.emb'))


# =============================================================================
# Pre-processing: embedded dimension
# =============================================================================

def train_embedded(log_df, ac_index, rl_index, dim_number):
    """Carry out the training of the embeddings."""
    # Build the list of observed (activity, role) index pairs
    pairs = list()
    for i in range(0, len(log_df)):
        pairs.append((ac_index[log_df.iloc[i]['task']],
                      rl_index[log_df.iloc[i]['role']]))

    model = ac_rl_embedding_model(ac_index, rl_index, dim_number)
    model.summary()

    n_positive = 1024
    gen = generate_batch(pairs, ac_index, rl_index,
                         n_positive, negative_ratio=2)
    # NOTE(review): fit_generator is deprecated in modern Keras (use fit);
    # kept for compatibility with the pinned environment.
    model.fit_generator(gen, epochs=100,
                        steps_per_epoch=len(pairs) // n_positive,
                        verbose=2)

    # Extract the trained embedding matrices
    ac_layer = model.get_layer('activity_embedding')
    rl_layer = model.get_layer('role_embedding')
    ac_weights = ac_layer.get_weights()[0]
    rl_weights = rl_layer.get_weights()[0]
    return ac_weights, rl_weights


def generate_batch(pairs, ac_index, rl_index, n_positive=50,
                   negative_ratio=1.0):
    """Generate batches of samples for training.

    Args:
        pairs: observed positive (activity_index, role_index) pairs.
        ac_index / rl_index: activity and role indexes.
        n_positive: number of positive examples per batch.
        negative_ratio: negatives sampled per positive.
    Yields:
        ({'activity': ..., 'role': ...}, labels) with label 1 for observed
        pairs and 0 for sampled negatives.
    """
    # BUGFIX: a float negative_ratio (e.g. the default 1.0) produced a float
    # batch_size and made np.zeros raise TypeError.
    batch_size = int(n_positive * (1 + negative_ratio))
    batch = np.zeros((batch_size, 3))
    pairs_set = set(pairs)
    activities = list(ac_index.keys())
    roles = list(rl_index.keys())
    # This creates a generator
    while True:
        # randomly choose positive examples
        idx = 0
        for idx, (activity, role) in enumerate(random.sample(pairs,
                                                             n_positive)):
            batch[idx, :] = (activity, role, 1)
            # Increment idx by 1
            idx += 1

        # Add negative examples until reaching batch size
        while idx < batch_size:
            # random selection
            random_ac = random.randrange(len(activities))
            random_rl = random.randrange(len(roles))

            # Check to make sure this is not a positive example
            if (random_ac, random_rl) not in pairs_set:
                # Add to batch and increment index, 0 due classification task
                batch[idx, :] = (random_ac, random_rl, 0)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'activity': batch[:, 0], 'role': batch[:, 1]}, batch[:, 2]


def ac_rl_embedding_model(ac_index, rl_index, embedding_size):
    """Model to embed activities and roles using the functional API."""
    # Both inputs are 1-dimensional
    activity = Input(name='activity', shape=[1])
    role = Input(name='role', shape=[1])

    # Embedding the activity (shape will be (None, 1, embedding_size))
    activity_embedding = Embedding(name='activity_embedding',
                                   input_dim=len(ac_index),
                                   output_dim=embedding_size)(activity)

    # Embedding the role (shape will be (None, 1, embedding_size))
    role_embedding = Embedding(name='role_embedding',
                               input_dim=len(rl_index),
                               output_dim=embedding_size)(role)

    # Merge the layers with a dot product
    # along the second axis (shape will be (None, 1, 1))
    merged = Dot(name='dot_product',
                 normalize=True, axes=2)([activity_embedding, role_embedding])

    # Reshape to be a single number (shape will be (None, 1))
    merged = Reshape(target_shape=[1])(merged)

    # Loss function is mean squared error
    model = Model(inputs=[activity, role], outputs=merged)
    model.compile(optimizer='Adam', loss='mse')
    return model


# =============================================================================
# Support
# =============================================================================


def reformat_matrix(index, weigths):
    """Reformat the embedded matrix for exporting.

    Args:
        index: index of activities or roles (int -> label).
        weigths: matrix of calculated coordinates.
    Returns:
        list: rows of [index, label, coord...].
    """
    matrix = list()
    for i, _ in enumerate(index):
        data = [i, index[i]]
        data.extend(weigths[i])
        matrix.append(data)
    return matrix
160 | """ 161 | matrix = list() 162 | for i, _ in enumerate(index): 163 | data = [i, index[i]] 164 | data.extend(weigths[i]) 165 | matrix.append(data) 166 | return matrix 167 | -------------------------------------------------------------------------------- /model_training/features_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 14 19:18:18 2020 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | import itertools 11 | from operator import itemgetter 12 | try: 13 | from support_modules import role_discovery as rl 14 | except: 15 | import os 16 | from importlib import util 17 | spec = util.spec_from_file_location( 18 | 'role_discovery', 19 | os.path.join(os.getcwd(), 'support_modules', 'role_discovery.py')) 20 | rl = util.module_from_spec(spec) 21 | spec.loader.exec_module(rl) 22 | 23 | class FeaturesMannager(): 24 | 25 | 26 | def __init__(self, params): 27 | """constructor""" 28 | self.model_type = params['model_type'] 29 | self.one_timestamp = params['one_timestamp'] 30 | # self.resources = pd.DataFrame 31 | self.norm_method = params['norm_method'] 32 | self._scalers = dict() 33 | self.scale_dispatcher = {'basic': self._scale_base, 34 | 'inter': self._scale_inter} 35 | 36 | def calculate(self, log, add_cols): 37 | log = self.add_calculated_times(log) 38 | log = self.filter_features(log, add_cols) 39 | return self.scale_features(log, add_cols) 40 | 41 | @staticmethod 42 | def add_resources(log, rp_sim): 43 | # Resource pool discovery 44 | res_analyzer = rl.ResourcePoolAnalyser(log, sim_threshold=rp_sim) 45 | # Role discovery 46 | resources = pd.DataFrame.from_records(res_analyzer.resource_table) 47 | resources = resources.rename(index=str, 48 | columns={"resource": "user"}) 49 | # Add roles information 50 | log = log.merge(resources, on='user', how='left') 51 | log = log[~log.task.isin(['Start', 'End'])] 52 | log = 
log.reset_index(drop=True) 53 | return log 54 | 55 | def filter_features(self, log, add_cols): 56 | # Add intercase features 57 | columns = ['caseid', 'task', 'user', 'end_timestamp', 58 | 'role', 'dur', 'ac_index', 'rl_index'] 59 | if not self.one_timestamp: 60 | columns.extend(['start_timestamp', 'wait']) 61 | columns.extend(add_cols) 62 | log = log[columns] 63 | return log 64 | 65 | def add_calculated_times(self, log): 66 | """Appends the indexes and relative time to the dataframe. 67 | parms: 68 | log: dataframe. 69 | Returns: 70 | Dataframe: The dataframe with the calculated features added. 71 | """ 72 | log['dur'] = 0 73 | log['acc_cycle'] = 0 74 | log['daytime'] = 0 75 | log = log.to_dict('records') 76 | log = sorted(log, key=lambda x: x['caseid']) 77 | for _, group in itertools.groupby(log, key=lambda x: x['caseid']): 78 | events = list(group) 79 | ordk = 'end_timestamp' if self.one_timestamp else 'start_timestamp' 80 | events = sorted(events, key=itemgetter(ordk)) 81 | for i in range(0, len(events)): 82 | # In one-timestamp approach the first activity of the trace 83 | # is taken as instantsince there is no previous timestamp 84 | # to find a range 85 | if self.one_timestamp: 86 | if i == 0: 87 | dur = 0 88 | acc = 0 89 | else: 90 | dur = (events[i]['end_timestamp'] - 91 | events[i-1]['end_timestamp']).total_seconds() 92 | acc = (events[i]['end_timestamp'] - 93 | events[0]['end_timestamp']).total_seconds() 94 | else: 95 | dur = (events[i]['end_timestamp'] - 96 | events[i]['start_timestamp']).total_seconds() 97 | acc = (events[i]['end_timestamp'] - 98 | events[0]['start_timestamp']).total_seconds() 99 | if i == 0: 100 | wit = 0 101 | else: 102 | wit = (events[i]['start_timestamp'] - 103 | events[i-1]['end_timestamp']).total_seconds() 104 | events[i]['wait'] = wit if wit >= 0 else 0 105 | events[i]['dur'] = dur 106 | events[i]['acc_cycle'] = acc 107 | time = events[i][ordk].time() 108 | time = time.second + time.minute*60 + time.hour*3600 109 | 
events[i]['daytime'] = time 110 | events[i]['weekday'] = events[i]['start_timestamp'].weekday() 111 | return pd.DataFrame.from_dict(log) 112 | 113 | def scale_features(self, log, add_cols): 114 | scaler = self._get_scaler(self.model_type) 115 | return scaler(log, add_cols) 116 | 117 | def register_scaler(self, model_type, scaler): 118 | try: 119 | self._scalers[model_type] = self.scale_dispatcher[scaler] 120 | except KeyError: 121 | raise ValueError(scaler) 122 | 123 | def _get_scaler(self, model_type): 124 | scaler = self._scalers.get(model_type) 125 | if not scaler: 126 | raise ValueError(model_type) 127 | return scaler 128 | 129 | def _scale_base(self, log, add_cols): 130 | if self.one_timestamp: 131 | log, scale_args = self.scale_feature(log, 'dur', self.norm_method) 132 | else: 133 | log, dur_scale = self.scale_feature(log, 'dur', self.norm_method) 134 | log, wait_scale = self.scale_feature(log, 'wait', self.norm_method) 135 | scale_args = {'dur': dur_scale, 'wait': wait_scale} 136 | return log, scale_args 137 | 138 | def _scale_inter(self, log, add_cols): 139 | # log, scale_args = self.scale_feature(log, 'dur', self.norm_method) 140 | if self.one_timestamp: 141 | log, scale_args = self.scale_feature(log, 'dur', self.norm_method) 142 | else: 143 | log, dur_scale = self.scale_feature(log, 'dur', self.norm_method) 144 | log, wait_scale = self.scale_feature(log, 'wait', self.norm_method) 145 | scale_args = {'dur': dur_scale, 'wait': wait_scale} 146 | for col in add_cols: 147 | if col == 'daytime': 148 | log, _ = self.scale_feature(log, 'daytime', 'day_secs', True) 149 | elif col == 'weekday': 150 | continue 151 | else: 152 | log, _ = self.scale_feature(log, col, self.norm_method, True) 153 | return log, scale_args 154 | 155 | # ========================================================================= 156 | # Scale features 157 | # ========================================================================= 158 | @staticmethod 159 | def scale_feature(log, feature, 
method, replace=False): 160 | """Scales a number given a technique. 161 | Args: 162 | log: Event-log to be scaled. 163 | feature: Feature to be scaled. 164 | method: Scaling method max, lognorm, normal, per activity. 165 | replace (optional): replace the original value or keep both. 166 | Returns: 167 | Scaleded value between 0 and 1. 168 | """ 169 | scale_args = dict() 170 | if method == 'lognorm': 171 | log[feature + '_log'] = np.log1p(log[feature]) 172 | max_value = np.max(log[feature+'_log']) 173 | min_value = np.min(log[feature+'_log']) 174 | log[feature+'_norm'] = np.divide( 175 | np.subtract(log[feature+'_log'], min_value), (max_value - min_value)) 176 | log = log.drop((feature + '_log'), axis=1) 177 | scale_args = {'max_value': max_value, 'min_value': min_value} 178 | elif method == 'normal': 179 | max_value = np.max(log[feature]) 180 | min_value = np.min(log[feature]) 181 | log[feature+'_norm'] = np.divide( 182 | np.subtract(log[feature], min_value), (max_value - min_value)) 183 | scale_args = {'max_value': max_value, 'min_value': min_value} 184 | elif method == 'standard': 185 | mean = np.mean(log[feature]) 186 | std = np.std(log[feature]) 187 | log[feature + '_norm'] = np.divide(np.subtract(log[feature], mean), 188 | std) 189 | scale_args = {'mean': mean, 'std': std} 190 | elif method == 'max': 191 | max_value = np.max(log[feature]) 192 | log[feature + '_norm'] = (np.divide(log[feature], max_value) 193 | if max_value > 0 else 0) 194 | scale_args = {'max_value': max_value} 195 | elif method == 'day_secs': 196 | max_value = 86400 197 | log[feature + '_norm'] = (np.divide(log[feature], max_value) 198 | if max_value > 0 else 0) 199 | scale_args = {'max_value': max_value} 200 | elif method is None: 201 | log[feature+'_norm'] = log[feature] 202 | else: 203 | raise ValueError(method) 204 | if replace: 205 | log = log.drop(feature, axis=1) 206 | return log, scale_args -------------------------------------------------------------------------------- 
class ModelHPCOptimizer():
    """
    Hyperparameter-optimizer that evaluates a random sample of the
    hyperparameter grid as independent SLURM jobs, then picks the trial
    with the lowest loss from the shared results CSV.
    """

    def __init__(self, parms, log, ac_index, rl_index):
        """Store the log/indexes, build the trial space and prepare the
        temp output folder, results file and SLURM connection settings.

        parms: experiment configuration (must include 'output',
            'max_eval' and the hyperparameter lists used by
            define_search_space).
        log: training event log (deep-copied so trials cannot mutate it).
        ac_index / rl_index: categorical encodings for activities/roles.
        """
        self.space = self.define_search_space(parms)
        self.log = copy.deepcopy(log)
        self.ac_index = ac_index
        self.rl_index = rl_index

        # Load settings
        self.parms = parms
        self.temp_output = parms['output']
        if not os.path.exists(self.temp_output):
            os.makedirs(self.temp_output)
            # 'opt_parms' holds one JSON per trial plus the shared train.csv
            os.makedirs(os.path.join(self.temp_output, 'opt_parms'))
        self.file_name = sup.file_id(prefix='OP_')
        # Results file: created empty so worker jobs can append to it
        if not os.path.exists(os.path.join(self.temp_output, self.file_name)):
            open(os.path.join(self.temp_output, self.file_name), 'w').close()

        # SLURM submission parameters for each worker job
        self.conn = {'partition': 'main',
                     'mem': str(32000),
                     'cpus': str(10),
                     'env': 'deep_generator_pip',
                     'script': os.path.join('model_training',
                                            'slurm_trainer.py')}
        self.slurm_workers = 50
        self.best_output = None
        self.best_parms = dict()
        self.best_loss = 1

    @staticmethod
    def define_search_space(parms):
        """Build the list of trial configurations.

        Takes the full cartesian product of the hyperparameter lists and
        randomly samples 'max_eval' of them, merging each with the fixed
        experiment parameters.
        """
        space = list()
        listOLists = [parms['lstm_act'],
                      parms['dense_act'],
                      parms['norm_method'],
                      parms['n_size'],
                      parms['l_size'],
                      parms['optim'],
                      parms['model_type']]
        # selection method definition
        preconfigs = list()
        for lists in itertools.product(*listOLists):
            preconfigs.append(dict(lstm_act=lists[0],
                                   dense_act=lists[1],
                                   norm_method=lists[2],
                                   n_size=lists[3],
                                   l_size=lists[4],
                                   optim=lists[5],
                                   model_type=lists[6]))
        def_parms = {
            'imp': parms['imp'], 'file': parms['file_name'],
            'batch_size': parms['batch_size'], 'epochs': parms['epochs'],
            'one_timestamp': parms['one_timestamp']}
        # NOTE(review): random.sample raises ValueError when 'max_eval'
        # exceeds the number of grid combinations — confirm callers bound it.
        for config in random.sample(preconfigs, parms['max_eval']):
            space.append({**config, **def_parms})
        return space

    def export_params(self):
        """Serialize each trial configuration as a JSON file (plus the
        training log as CSV) so the SLURM workers can read them.

        Returns the list of generated configuration file names.
        """
        configs_files = list()
        for config in self.space:
            config['ac_index'] = self.ac_index
            config['rl_index'] = self.rl_index
            conf_file = sup.file_id(prefix='CNF_', extension='.json')
            sup.create_json(
                config, os.path.join(self.temp_output, 'opt_parms', conf_file))
            configs_files.append(conf_file)
        self.log.to_csv(
            os.path.join(self.temp_output, 'opt_parms', 'train.csv'),
            index=False, encoding='utf-8')
        return configs_files

    def execute_trials(self):
        """Run all trials through the SLURM multiprocessor, then read the
        shared results CSV and keep the best (lowest-loss) trial in
        best_output / best_loss / best_parms.
        """
        configs_files = self.export_params()
        args = [{'p': config,
                 'f': self.temp_output,
                 'r': self.file_name} for config in configs_files]
        mprocessor = slmp.HPC_Multiprocess(self.conn,
                                           args,
                                           self.temp_output,
                                           None,
                                           self.slurm_workers,
                                           timeout=5)
        mprocessor.parallelize()
        try:
            # NOTE(review): self.file_name is mutated here from a bare file
            # name to a full path — later consumers (e.g. shutil.copy in
            # ModelTrainer) rely on the full-path form.
            self.file_name = os.path.join(self.temp_output, self.file_name)
            # NOTE(review): `ascending=bool` passes the *class* bool, which
            # is truthy and therefore behaves like ascending=True — likely
            # meant literally `ascending=True`; confirm before changing.
            results = (pd.read_csv(self.file_name)
                       .sort_values('loss', ascending=bool))
            result = results.head(1).iloc[0]
            self.best_output = result.output
            self.best_loss = result.loss
            self.best_parms = results.head(1).to_dict('records')[0]
            # scale_args was serialized as a string in the CSV; rebuild it
            self.best_parms['scale_args'] = ast.literal_eval(
                self.best_parms.get('scale_args'))
        except Exception as e:
            # Best-effort: leave the default best_* values if the results
            # file is missing or malformed
            print(e)
            traceback.print_exc()
            pass
'shared_cat_gru_cx': mshcatgi._training_model, 43 | # 'cnn_lstm': cnnl._training_model, 44 | # 'gan': mgan._training_model 45 | } 46 | 47 | def train(self, model_type, train_vec, valdn_vec, ac_weights, rl_weights, output_folder): 48 | loader = self._get_trainer(model_type) 49 | tf.compat.v1.reset_default_graph() 50 | return loader(train_vec, 51 | valdn_vec, 52 | ac_weights, 53 | rl_weights, 54 | output_folder, 55 | self.parms) 56 | 57 | def register_model(self, model_type, trainer): 58 | try: 59 | self._trainers[model_type] = self.trainer_dispatcher[trainer] 60 | except KeyError: 61 | raise ValueError(trainer) 62 | 63 | def _get_trainer(self, model_type): 64 | trainer = self._trainers.get(model_type) 65 | if not trainer: 66 | raise ValueError(model_type) 67 | return trainer -------------------------------------------------------------------------------- /model_training/model_optimizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Nov 17 10:48:57 2020 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | import copy 9 | import ast 10 | import traceback 11 | import pandas as pd 12 | import configparser as cp 13 | from hyperopt import tpe 14 | from hyperopt import Trials, hp, fmin, STATUS_OK, STATUS_FAIL 15 | 16 | import utils.support as sup 17 | import readers.log_splitter as ls 18 | 19 | import tensorflow as tf 20 | from model_training import samples_creator as sc 21 | from model_training import model_loader as mload 22 | from model_training import features_manager as feat 23 | 24 | 25 | class ModelOptimizer(): 26 | """ 27 | Hyperparameter-optimizer class 28 | """ 29 | class Decorators(object): 30 | 31 | @classmethod 32 | def safe_exec(cls, method): 33 | """ 34 | Decorator to safe execute methods and return the state 35 | ---------- 36 | method : Any method. 
    def execute_trials(self):
        """Run the Bayesian (TPE) hyperparameter search.

        Each trial scales the log, splits it, vectorizes it, trains the
        selected architecture and evaluates it on the validation split;
        the best trial's output folder, loss and parameters are stored in
        best_output / best_loss / best_params.
        """
        def exec_pipeline(trial_stg):
            # One full train-and-evaluate cycle for a sampled configuration.
            # trial_stg: dict of hyperparameters drawn from self.space.
            print(trial_stg)
            status = STATUS_OK
            # Path redefinition: give this trial its own output folder
            rsp = self._temp_path_redef(trial_stg, status=status)
            status = rsp['status']
            trial_stg = rsp['values'] if status == STATUS_OK else trial_stg
            # Model definition read from models_spec.ini
            model_def = self.read_model_definition(trial_stg['model_type'])
            # Scale values (adds scale_args to trial_stg)
            log, trial_stg = self._scale_values(self.log, trial_stg, model_def)
            # split validation (80/20 by timeline)
            log_valdn, log_train = self.split_timeline(0.8, log, trial_stg['one_timestamp'])
            print('train split size:', len(log_train))
            print('valdn split size:', len(log_valdn))
            # Vectorize input with the vectorizer declared for this model
            vectorizer = sc.SequencesCreator(
                self.parms['read_options']['one_timestamp'],
                self.ac_index, self.rl_index)
            vectorizer.register_vectorizer(trial_stg['model_type'],
                                           model_def['vectorizer'])
            train_vec = vectorizer.vectorize(trial_stg['model_type'],
                                             log_train,
                                             trial_stg,
                                             model_def['additional_columns'])
            valdn_vec = vectorizer.vectorize(trial_stg['model_type'],
                                             log_valdn,
                                             trial_stg,
                                             model_def['additional_columns'])
            # Train the architecture registered for this model type
            m_loader = mload.ModelLoader(trial_stg)
            m_loader.register_model(trial_stg['model_type'],
                                    model_def['trainer'])
            tf.compat.v1.reset_default_graph()
            model = m_loader.train(trial_stg['model_type'],
                                   train_vec,
                                   valdn_vec,
                                   self.ac_weights,
                                   self.rl_weights,
                                   trial_stg['output'])
            # evaluation on the validation vectors
            x_input = {'ac_input': valdn_vec['prefixes']['activities'],
                       'rl_input': valdn_vec['prefixes']['roles'],
                       't_input': valdn_vec['prefixes']['times']}
            # Contextual ('_cx') variants take an extra intercase input
            if trial_stg['model_type'] in ['shared_cat_cx',
                                           'concatenated_cx',
                                           'shared_cat_gru_cx',
                                           'concatenated_gru_cx']:
                x_input['inter_input']= valdn_vec['prefixes']['inter_attr']
            acc = model.evaluate(
                x=x_input,
                y={'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']},
                return_dict=True)
            # Record the trial outcome (also appended to the results CSV)
            rsp = self._define_response(trial_stg, status, acc['loss'])
            print("-- End of trial --")
            return rsp

        # Optimize: TPE-guided search over self.space
        best = fmin(fn=exec_pipeline,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=self.parms['max_eval'],
                    trials=self.bayes_trials,
                    show_progressbar=False)
        # Save results: keep the best successful ('ok') trial
        try:
            results = (pd.DataFrame(self.bayes_trials.results)
                       .sort_values('loss', ascending=True))
            result = results[results.status == 'ok'].head(1).iloc[0]
            self.best_output = result.output
            self.best_loss = result.loss
            # fmin returns hp.choice *indices*; map them back to values
            self.best_params = {k: self.parms[k][v] for k, v in best.items()}
            # Recover scale_args (stringified in the CSV) for the best trial
            opt_res = pd.read_csv(self.file_name)
            opt_res = opt_res[opt_res.output == result.output].iloc[0]
            self.best_params['scale_args'] = ast.literal_eval(opt_res.scale_args)
        except Exception as e:
            # Best-effort: keep default best_* values when no trial succeeded
            print(e)
            pass
parms['l_size'], 201 | 'lstm_act': parms['lstm_act'], 202 | 'dense_act': parms['dense_act'], 203 | 'optim': parms['optim'], 204 | 'scale_args': parms['scale_args'], 205 | 'output': parms['output']} 206 | response['output'] = parms['output'] 207 | if status == STATUS_OK: 208 | response['loss'] = loss 209 | response['status'] = status if loss > 0 else STATUS_FAIL 210 | measurements.append({**{'loss': loss, 211 | 'sim_metric': 'val_loss', 212 | 'status': response['status']}, 213 | **data}) 214 | else: 215 | response['status'] = status 216 | measurements.append({**{'loss': 1, 217 | 'sim_metric': 'val_loss', 218 | 'status': response['status']}, 219 | **data}) 220 | if os.path.getsize(self.file_name) > 0: 221 | sup.create_csv_file(measurements, self.file_name, mode='a') 222 | else: 223 | sup.create_csv_file_header(measurements, self.file_name) 224 | return response 225 | 226 | @staticmethod 227 | def split_timeline(size: float, log: pd.DataFrame, one_ts: bool) -> None: 228 | """ 229 | Split an event log dataframe by time to peform split-validation. 230 | prefered method time splitting removing incomplete traces. 231 | If the testing set is smaller than the 10% of the log size 232 | the second method is sort by traces start and split taking the whole 233 | traces no matter if they are contained in the timeframe or not 234 | 235 | Parameters 236 | ---------- 237 | size : float, validation percentage. 238 | one_ts : bool, Support only one timestamp. 
239 | """ 240 | # Split log data 241 | splitter = ls.LogSplitter(log) 242 | train, valdn = splitter.split_log('timeline_contained', size, one_ts) 243 | total_events = len(log) 244 | # Check size and change time splitting method if necesary 245 | if len(valdn) < int(total_events*0.1): 246 | train, valdn = splitter.split_log('timeline_trace', size, one_ts) 247 | # Set splits 248 | key = 'end_timestamp' if one_ts else 'start_timestamp' 249 | valdn = pd.DataFrame(valdn) 250 | train = pd.DataFrame(train) 251 | log_valdn = (valdn.sort_values(key, ascending=True).reset_index(drop=True)) 252 | log_train = (train.sort_values(key, ascending=True).reset_index(drop=True)) 253 | return log_valdn, log_train 254 | 255 | @staticmethod 256 | def read_model_definition(model_type): 257 | model_def = dict() 258 | config = cp.ConfigParser(interpolation=None) 259 | config.read('models_spec.ini') 260 | # File name with extension 261 | model_def['additional_columns'] = sup.reduce_list(config.get(model_type, 'additional_columns'), dtype='str') 262 | model_def['scaler'] = config.get(model_type, 'scaler') 263 | model_def['vectorizer'] = config.get(model_type, 'vectorizer') 264 | model_def['trainer'] = config.get(model_type, 'trainer') 265 | return model_def 266 | -------------------------------------------------------------------------------- /model_training/model_trainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 12 15:07:19 2020 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | import csv 9 | 10 | import pandas as pd 11 | import numpy as np 12 | import shutil 13 | 14 | import readers.log_reader as lr 15 | import utils.support as sup 16 | import readers.log_splitter as ls 17 | 18 | from model_training.features_manager import FeaturesMannager as feat 19 | from model_training import embedding_training as em 20 | from model_training import model_optimizer as op 21 | from model_training 
    def __init__(self, params):
        """Run the whole training pipeline: load and preprocess the log,
        execute hyperparameter optimization, and export the best model's
        results and parameters.

        params: experiment configuration dict; must include 'file_name',
            'read_options', 'rp_sim', 'one_timestamp' and 'opt_method'
            ('rand_hpc' or 'bayesian').
        """
        self.log = self.load_log(params)
        # Split validation partitions
        self.log_train = pd.DataFrame()
        self.log_test = pd.DataFrame()
        # Activities and roles indexes
        self.ac_index = dict()
        self.index_ac = dict()

        self.rl_index = dict()
        self.index_rl = dict()
        # Training examples
        self.examples = dict()
        # Embedded dimensions
        self.ac_weights = list()
        self.rl_weights = list()
        # Preprocess the event-log
        self.preprocess(params)
        # Train model
        params['output'] = os.path.join('output_files', sup.folder_id())
        # NOTE(review): if 'opt_method' is neither value below, `optimizer`
        # is never bound and the export block raises NameError — confirm
        # upstream validation of this parameter.
        if params['opt_method'] == 'rand_hpc':
            optimizer = hpc_op.ModelHPCOptimizer(params,
                                                 self.log,
                                                 self.ac_index,
                                                 self.rl_index)
            optimizer.execute_trials()
        elif params['opt_method'] == 'bayesian':
            optimizer = op.ModelOptimizer(params,
                                          self.log,
                                          self.ac_index,
                                          self.ac_weights,
                                          self.rl_index,
                                          self.rl_weights)
            optimizer.execute_trials()
        # Export results: copy the best trial's folder and the results CSV
        output_path = os.path.join('output_files', sup.folder_id())
        shutil.copytree(optimizer.best_output, output_path)
        shutil.copy(optimizer.file_name, output_path)
        # NOTE(review): ModelHPCOptimizer exposes `best_parms` (no second
        # 'a'), not `best_params` — verify this attribute exists for the
        # 'rand_hpc' path.
        self.export_parms(output_path, optimizer.best_params)
        # Remove folder
        shutil.rmtree(params['output'])
84 | self.ac_weights = self.load_embedded(self.index_ac, ac_emb_name) 85 | self.rl_weights = self.load_embedded(self.index_rl, rl_emb_name) 86 | else: 87 | em.training_model(params, 88 | self.log, 89 | self.ac_index, self.index_ac, 90 | self.rl_index, self.index_rl) 91 | self.ac_weights = self.load_embedded(self.index_ac, ac_emb_name) 92 | self.rl_weights = self.load_embedded(self.index_rl, rl_emb_name) 93 | 94 | @staticmethod 95 | def load_log(params): 96 | params['read_options']['filter_d_attrib'] = False 97 | log = lr.LogReader(os.path.join('input_files', params['file_name']), 98 | params['read_options']) 99 | log_df = pd.DataFrame(log.data) 100 | if set(['Unnamed: 0', 'role']).issubset(set(log_df.columns)): 101 | log_df.drop(columns=['Unnamed: 0', 'role'], inplace=True) 102 | log_df = log_df[~log_df.task.isin(['Start', 'End'])] 103 | return log_df 104 | 105 | def indexing(self): 106 | # Activities index creation 107 | self.ac_index = self.create_index(self.log, 'task') 108 | self.ac_index['start'] = 0 109 | self.ac_index['end'] = len(self.ac_index) 110 | self.index_ac = {v: k for k, v in self.ac_index.items()} 111 | # Roles index creation 112 | self.rl_index = self.create_index(self.log, 'role') 113 | self.rl_index['start'] = 0 114 | self.rl_index['end'] = len(self.rl_index) 115 | self.index_rl = {v: k for k, v in self.rl_index.items()} 116 | # Add index to the event log 117 | ac_idx = lambda x: self.ac_index[x['task']] 118 | self.log['ac_index'] = self.log.apply(ac_idx, axis=1) 119 | rl_idx = lambda x: self.rl_index[x['role']] 120 | self.log['rl_index'] = self.log.apply(rl_idx, axis=1) 121 | 122 | @staticmethod 123 | def create_index(log_df, column): 124 | """Creates an idx for a categorical attribute. 125 | parms: 126 | log_df: dataframe. 127 | column: column name. 128 | Returns: 129 | index of a categorical attribute pairs. 
130 | """ 131 | temp_list = log_df[[column]].values.tolist() 132 | subsec_set = {(x[0]) for x in temp_list} 133 | subsec_set = sorted(list(subsec_set)) 134 | alias = dict() 135 | for i, _ in enumerate(subsec_set): 136 | alias[subsec_set[i]] = i + 1 137 | return alias 138 | 139 | 140 | def split_timeline(self, size: float, one_ts: bool) -> None: 141 | """ 142 | Split an event log dataframe by time to peform split-validation. 143 | prefered method time splitting removing incomplete traces. 144 | If the testing set is smaller than the 10% of the log size 145 | the second method is sort by traces start and split taking the whole 146 | traces no matter if they are contained in the timeframe or not 147 | 148 | Parameters 149 | ---------- 150 | size : float, validation percentage. 151 | one_ts : bool, Support only one timestamp. 152 | """ 153 | # Split log data 154 | splitter = ls.LogSplitter(self.log) 155 | train, test = splitter.split_log('timeline_contained', size, one_ts) 156 | total_events = len(self.log) 157 | # Check size and change time splitting method if necesary 158 | if len(test) < int(total_events*0.1): 159 | train, test = splitter.split_log('timeline_trace', size, one_ts) 160 | # Set splits 161 | key = 'end_timestamp' if one_ts else 'start_timestamp' 162 | test = pd.DataFrame(test) 163 | train = pd.DataFrame(train) 164 | self.log_test = (test.sort_values(key, ascending=True) 165 | .reset_index(drop=True)) 166 | self.log_train = (train.sort_values(key, ascending=True) 167 | .reset_index(drop=True)) 168 | 169 | 170 | 171 | @staticmethod 172 | def load_embedded(index, filename): 173 | """Loading of the embedded matrices. 174 | parms: 175 | index (dict): index of activities or roles. 176 | filename (str): filename of the matrix file. 177 | Returns: 178 | numpy array: array of weights. 
179 | """ 180 | weights = list() 181 | input_folder = os.path.join('input_files', 'embedded_matix') 182 | with open(os.path.join(input_folder, filename), 'r') as csvfile: 183 | filereader = csv.reader(csvfile, delimiter=',', quotechar='"') 184 | for row in filereader: 185 | cat_ix = int(row[0]) 186 | if index[cat_ix] == row[1].strip(): 187 | weights.append([float(x) for x in row[2:]]) 188 | csvfile.close() 189 | return np.array(weights) 190 | 191 | def export_parms(self, output_folder, parms): 192 | if not os.path.exists(os.path.join(output_folder, 'parameters')): 193 | os.makedirs(os.path.join(output_folder, 'parameters')) 194 | 195 | parms['max_trace_size'] = int(self.log.groupby('caseid')['task'] 196 | .count().max()) 197 | 198 | parms['index_ac'] = self.index_ac 199 | parms['index_rl'] = self.index_rl 200 | 201 | sup.create_json(parms, os.path.join(output_folder, 202 | 'parameters', 203 | 'model_parameters.json')) 204 | self.log_test.to_csv(os.path.join(output_folder, 205 | 'parameters', 206 | 'test_log.csv'), 207 | index=False, 208 | encoding='utf-8') 209 | 210 | -------------------------------------------------------------------------------- /model_training/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /model_training/models/model_concatenated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 28 10:15:12 2019 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | 9 | from tensorflow.keras.models import Model 10 | from tensorflow.keras.layers import Input, Embedding, Concatenate 11 | from tensorflow.keras.layers import Dense, LSTM, BatchNormalization 12 | from tensorflow.keras.optimizers import Nadam, Adam, SGD, Adagrad 13 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 14 | 15 | 
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights, output_folder, args, log_path=None):
    """Build and train the concatenated LSTM next-event model.

    One shared LSTM layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding and the time features; three
    specialized LSTM + Dense heads then predict the next activity, the next
    role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' and 'next_evt'
            entries for 'activities', 'roles' and 'times'.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'],
                 **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = LSTM(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=50)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the concatenated LSTM model with inter-case features.

    One shared LSTM layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding, the time features and the
    inter-case attributes; three specialized LSTM + Dense heads then predict
    the next activity, the next role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' ('activities',
            'roles', 'times', 'inter_attr') and 'next_evt' entries.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')
    inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1],
                               train_vec['prefixes']['inter_attr'].shape[2]),
                        name='inter_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input, inter_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'],
                 **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_c3 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'],
                 **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)

    model = Model(inputs=[ac_input, rl_input, t_input, inter_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times'],
               'inter_input': train_vec['prefixes']['inter_attr']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times'],
                   'inter_input': valdn_vec['prefixes']['inter_attr']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the concatenated GRU next-event model.

    One shared GRU layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding and the time features; three
    specialized GRU + Dense heads then predict the next activity, the next
    role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' and 'next_evt'
            entries for 'activities', 'roles' and 'times'.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = GRU(args['l_size'],
               kernel_initializer='glorot_uniform',
               return_sequences=False,
               dropout=0.2,
               implementation=args['imp'],
               **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the concatenated GRU model with inter-case features.

    One shared GRU layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding, the time features and the
    inter-case attributes; three specialized GRU + Dense heads then predict
    the next activity, the next role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' ('activities',
            'roles', 'times', 'inter_attr') and 'next_evt' entries.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')
    inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1],
                               train_vec['prefixes']['inter_attr'].shape[2]),
                        name='inter_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input, inter_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_c3 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)

    model = Model(inputs=[ac_input, rl_input, t_input, inter_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times'],
               'inter_input': train_vec['prefixes']['inter_attr']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times'],
                   'inter_input': valdn_vec['prefixes']['inter_attr']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the GRU shared-categorical next-event model.

    Activity and role embeddings are concatenated and fed to a shared
    GRU layer; times get their own GRU branch. Three output heads
    predict next activity, next role and next-event time features.

    Args:
        train_vec (dict): training vectors with 'prefixes' and
            'next_evt' entries ('activities', 'roles', 'times').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model (best weights are also
        checkpointed to disk by ModelCheckpoint).

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer: activity/role index sequences plus continuous time features
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared GRU over concatenated embeddings; separate GRU for times
    # =========================================================================
    merged = Concatenate(name='concatenated', axis=2)([ac_embedding, rl_embedding])

    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(merged)

    l1_c3 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) GRU per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = GRU(args['l_size'],
               activation=args['lstm_act'],
               kernel_initializer='glorot_uniform',
               return_sequences=False,
               dropout=0.2,
               implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer: softmax for categorical heads, regression for times
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the GRU shared-categorical model with intercase features.

    Like the plain shared-cat model, but a fourth input ('inter_attr')
    carries intercase attributes that are concatenated with the time
    features before the continuous GRU branch.

    Args:
        train_vec (dict): training vectors with 'prefixes' and 'next_evt'
            entries ('activities', 'roles', 'times', 'inter_attr').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model.

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer: categorical sequences, time features, intercase features
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')
    inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1],
                               train_vec['prefixes']['inter_attr'].shape[2]),
                        name='inter_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=(train_vec['prefixes']['activities']
                                           .shape[1]),
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Concatenation layer: categorical embeddings vs continuous features
    # =========================================================================
    merged1 = Concatenate(name='conc_categorical',
                          axis=2)([ac_embedding, rl_embedding])
    merged2 = Concatenate(name='conc_continuous', axis=2)([t_input, inter_input])

    # =========================================================================
    # Layer 1
    # =========================================================================
    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(merged1)

    l1_c3 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(merged2)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) GRU per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_c4 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(train_vec['next_evt']['activities'].shape[1],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(train_vec['next_evt']['roles'].shape[1],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c4)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c4)

    model = Model(inputs=[ac_input, rl_input, t_input, inter_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times'],
               'inter_input': train_vec['prefixes']['inter_attr']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times'],
                   'inter_input': valdn_vec['prefixes']['inter_attr']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the GRU specialized next-event model.

    Unlike the shared-cat variants, each input (activities, roles,
    times) gets its own first-layer GRU branch — no concatenation.

    Args:
        train_vec (dict): training vectors with 'prefixes' and
            'next_evt' entries ('activities', 'roles', 'times').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model.

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: one GRU branch per input
    # =========================================================================
    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(ac_embedding)

    l1_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(rl_embedding)

    # NOTE(review): unlike l2_3 below, a None 'lstm_act' here yields a
    # *linear* GRU (activation=None) rather than the default tanh —
    # preserved as-is; confirm this asymmetry is intended.
    l1_c3 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch2 = BatchNormalization()(l1_c2)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) GRU per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch2)

    # The layer specialized in time prediction; keep Keras' default
    # activation when no explicit one is configured.
    if args['lstm_act'] is not None:
        l2_3 = GRU(args['l_size'],
                   activation=args['lstm_act'],
                   kernel_initializer='glorot_uniform',
                   return_sequences=False,
                   dropout=0.2,
                   implementation=args['imp'])(batch3)
    else:
        l2_3 = GRU(args['l_size'],
                   kernel_initializer='glorot_uniform',
                   return_sequences=False,
                   dropout=0.2,
                   implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the LSTM shared-categorical next-event model.

    LSTM counterpart of the GRU shared-cat architecture: concatenated
    activity/role embeddings feed a shared LSTM; times get their own
    branch; three heads predict next activity, role and time features.

    Args:
        train_vec (dict): training vectors with 'prefixes' and
            'next_evt' entries ('activities', 'roles', 'times').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model.

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared LSTM over concatenated embeddings; separate one for times
    # =========================================================================
    merged = Concatenate(name='concatenated', axis=2)([ac_embedding, rl_embedding])

    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(merged)

    l1_c3 = LSTM(args['l_size'],
                 activation=args['lstm_act'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) LSTM per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = LSTM(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
spec.loader.exec_module(tc) 24 | 25 | 26 | def _training_model(train_vec, valdn_vec, ac_weights, rl_weights, 27 | output_folder, args, log_path=None): 28 | """Example function with types documented in the docstring. 29 | Args: 30 | param1 (int): The first parameter. 31 | param2 (str): The second parameter. 32 | Returns: 33 | bool: The return value. True for success, False otherwise. 34 | """ 35 | 36 | print('Build model...') 37 | print(args) 38 | # ============================================================================= 39 | # Input layer 40 | # ============================================================================= 41 | ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ), 42 | name='ac_input') 43 | rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ), 44 | name='rl_input') 45 | t_input = Input(shape=(train_vec['prefixes']['times'].shape[1], 46 | train_vec['prefixes']['times'].shape[2]), name='t_input') 47 | inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1], 48 | train_vec['prefixes']['inter_attr'].shape[2]), 49 | name='inter_input') 50 | 51 | # ============================================================================= 52 | # Embedding layer for categorical attributes 53 | # ============================================================================= 54 | ac_embedding = Embedding(ac_weights.shape[0], 55 | ac_weights.shape[1], 56 | weights=[ac_weights], 57 | input_length=(train_vec['prefixes']['activities'] 58 | .shape[1]), 59 | trainable=False, name='ac_embedding')(ac_input) 60 | 61 | rl_embedding = Embedding(rl_weights.shape[0], 62 | rl_weights.shape[1], 63 | weights=[rl_weights], 64 | input_length=train_vec['prefixes']['roles'].shape[1], 65 | trainable=False, name='rl_embedding')(rl_input) 66 | # ============================================================================= 67 | # Concatenation layer 68 | # ============================================================================= 69 | 70 | 
merged1 = Concatenate(name='conc_categorical', 71 | axis=2)([ac_embedding, rl_embedding]) 72 | merged2 = Concatenate(name='conc_continuous', axis=2)([t_input, inter_input]) 73 | 74 | # ============================================================================= 75 | # Layer 1 76 | # ============================================================================= 77 | 78 | l1_c1 = LSTM(args['l_size'], 79 | kernel_initializer='glorot_uniform', 80 | return_sequences=True, 81 | dropout=0.2, 82 | implementation=args['imp'])(merged1) 83 | 84 | l1_c3 = LSTM(args['l_size'], 85 | activation=args['lstm_act'], 86 | kernel_initializer='glorot_uniform', 87 | return_sequences=True, 88 | dropout=0.2, 89 | implementation=args['imp'])(merged2) 90 | 91 | # ============================================================================= 92 | # Batch Normalization Layer 93 | # ============================================================================= 94 | batch1 = BatchNormalization()(l1_c1) 95 | batch3 = BatchNormalization()(l1_c3) 96 | 97 | # ============================================================================= 98 | # The layer specialized in prediction 99 | # ============================================================================= 100 | l2_c1 = LSTM(args['l_size'], 101 | kernel_initializer='glorot_uniform', 102 | return_sequences=False, 103 | dropout=0.2, 104 | implementation=args['imp'])(batch1) 105 | 106 | # The layer specialized in role prediction 107 | l2_c2 = LSTM(args['l_size'], 108 | kernel_initializer='glorot_uniform', 109 | return_sequences=False, 110 | dropout=0.2, 111 | implementation=args['imp'])(batch1) 112 | 113 | 114 | # The layer specialized in time prediction 115 | l2_c4 = LSTM(args['l_size'], 116 | activation=args['lstm_act'], 117 | kernel_initializer='glorot_uniform', 118 | return_sequences=False, 119 | dropout=0.2, 120 | implementation=args['imp'])(batch3) 121 | 122 | # ============================================================================= 123 
| # Output Layer 124 | # ============================================================================= 125 | act_output = Dense(train_vec['next_evt']['activities'].shape[1], 126 | activation='softmax', 127 | kernel_initializer='glorot_uniform', 128 | name='act_output')(l2_c1) 129 | 130 | role_output = Dense(train_vec['next_evt']['roles'].shape[1], 131 | activation='softmax', 132 | kernel_initializer='glorot_uniform', 133 | name='role_output')(l2_c2) 134 | if ('dense_act' in args) and (args['dense_act'] is not None): 135 | time_output = Dense(train_vec['next_evt']['times'].shape[1], 136 | activation=args['dense_act'], 137 | kernel_initializer='glorot_uniform', 138 | name='time_output')(l2_c4) 139 | else: 140 | time_output = Dense(train_vec['next_evt']['times'].shape[1], 141 | kernel_initializer='glorot_uniform', 142 | name='time_output')(l2_c4) 143 | 144 | model = Model(inputs=[ac_input, rl_input, t_input, inter_input], 145 | outputs=[act_output, role_output, time_output]) 146 | 147 | if args['optim'] == 'Nadam': 148 | opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999) 149 | elif args['optim'] == 'Adam': 150 | opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, 151 | amsgrad=False) 152 | elif args['optim'] == 'SGD': 153 | opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False) 154 | elif args['optim'] == 'Adagrad': 155 | opt = Adagrad(learning_rate=0.01) 156 | 157 | model.compile(loss={'act_output': 'categorical_crossentropy', 158 | 'role_output': 'categorical_crossentropy', 159 | 'time_output': 'mae'}, optimizer=opt) 160 | 161 | model.summary() 162 | 163 | early_stopping = EarlyStopping(monitor='val_loss', patience=40) 164 | if log_path: 165 | cb = tc.TimingCallback(output_folder, log_path=log_path) 166 | else: 167 | cb = tc.TimingCallback(output_folder) 168 | 169 | # Output file 170 | output_file_path = os.path.join(output_folder, 171 | os.path.splitext(args['file'])[0]+'.h5') 172 | 173 | # Saving 174 | model_checkpoint = 
ModelCheckpoint(output_file_path, 175 | monitor='val_loss', 176 | verbose=0, 177 | save_best_only=True, 178 | save_weights_only=False, 179 | mode='auto') 180 | lr_reducer = ReduceLROnPlateau(monitor='val_loss', 181 | factor=0.5, 182 | patience=10, 183 | verbose=0, 184 | mode='auto', 185 | min_delta=0.0001, 186 | cooldown=0, 187 | min_lr=0) 188 | 189 | batch_size = args['batch_size'] 190 | model.fit({'ac_input': train_vec['prefixes']['activities'], 191 | 'rl_input': train_vec['prefixes']['roles'], 192 | 't_input': train_vec['prefixes']['times'], 193 | 'inter_input': train_vec['prefixes']['inter_attr']}, 194 | {'act_output': train_vec['next_evt']['activities'], 195 | 'role_output': train_vec['next_evt']['roles'], 196 | 'time_output': train_vec['next_evt']['times']}, 197 | validation_data=( 198 | {'ac_input': valdn_vec['prefixes']['activities'], 199 | 'rl_input': valdn_vec['prefixes']['roles'], 200 | 't_input': valdn_vec['prefixes']['times'], 201 | 'inter_input': valdn_vec['prefixes']['inter_attr']}, 202 | {'act_output': valdn_vec['next_evt']['activities'], 203 | 'role_output': valdn_vec['next_evt']['roles'], 204 | 'time_output': valdn_vec['next_evt']['times']}), 205 | verbose=2, 206 | callbacks=[early_stopping, model_checkpoint, 207 | lr_reducer, cb], 208 | batch_size=batch_size, 209 | epochs=args['epochs']) 210 | return model 211 | -------------------------------------------------------------------------------- /model_training/models/model_specialized.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 28 10:15:12 2019 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | 9 | from tensorflow.keras.models import Model 10 | from tensorflow.keras.layers import Input, Embedding 11 | from tensorflow.keras.layers import Dense, LSTM, BatchNormalization 12 | from tensorflow.keras.optimizers import Nadam, Adam, SGD, Adagrad 13 | from tensorflow.keras.callbacks import 
try:
    from support_modules.callbacks import time_callback as tc
except ImportError:
    # Fallback when the package is not importable (e.g. the script is run
    # from the repository root): load the module directly from its file.
    # Narrowed from a bare `except:` so real errors are not swallowed.
    from importlib import util
    spec = util.spec_from_file_location(
        'time_callback',
        os.path.join(os.getcwd(), 'support_modules', 'callbacks',
                     'time_callback.py'))
    tc = util.module_from_spec(spec)
    spec.loader.exec_module(tc)


def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and fit the 'specialized' LSTM next-event model.

    One LSTM branch per attribute (activities, roles, times); each branch
    ends in its own prediction head, so no weights are shared between them.

    Args:
        train_vec (dict): training tensors with 'prefixes' and 'next_evt'
            entries as produced by the samples creator.
        valdn_vec (dict): validation tensors with the same layout.
        ac_weights: pre-trained activity embedding matrix (frozen).
        rl_weights: pre-trained role embedding matrix (frozen).
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): csv path for the timing callback.

    Returns:
        The fitted keras Model (best weights are also checkpointed to disk).

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: one sequence-to-sequence LSTM per attribute
    # =========================================================================
    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(ac_embedding)

    l1_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(rl_embedding)

    l1_c3 = LSTM(args['l_size'],
                 activation=args['lstm_act'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch2 = BatchNormalization()(l1_c2)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: per-attribute prediction layers (last state only)
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch2)

    # The layer specialized in time prediction; the custom activation is
    # only applied when one was actually configured.
    if args['lstm_act'] is not None:
        l2_3 = LSTM(args['l_size'],
                    activation=args['lstm_act'],
                    kernel_initializer='glorot_uniform',
                    return_sequences=False,
                    dropout=0.2,
                    implementation=args['imp'])(batch3)
    else:
        l2_3 = LSTM(args['l_size'],
                    kernel_initializer='glorot_uniform',
                    return_sequences=False,
                    dropout=0.2,
                    implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # BUG FIX: previously an unknown optimizer name left `opt` unbound
        # and crashed below with a confusing NameError.
        raise ValueError('Unsupported optimizer: ' + str(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the best weights seen so far on validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model


# -*- coding: utf-8 -*-
"""
Created on Sat Mar 14 19:13:15 2020

@author: Manuel Camargo
"""
import itertools
import numpy as np
import random

from nltk.util import ngrams
import keras.utils as ku


class SequencesCreator():
    """Turns an event-log dataframe into LSTM training tensors."""

    def __init__(self, one_timestamp, ac_index, rl_index):
        """constructor"""
        # True when the log has a single timestamp per event; False when it
        # has start/end pairs (adds the 'wait_norm' time feature).
        self.one_timestamp = one_timestamp
        # Activity name -> integer index (includes 'start'/'end' tokens).
        self.ac_index = ac_index
        # Role name -> integer index (includes 'start'/'end' tokens).
        self.rl_index = rl_index
        # model_type -> vectorizer method, filled via register_vectorizer().
        self._vectorizers = dict()
        # Named vectorization strategies available for registration.
        self._vec_dispatcher = {'basic': self._vectorize_seq,
                                'inter': self._vectorize_seq_inter,
                                'gan': self.gan_simple}

    def vectorize(self, model_type, log, params, add_cols):
        # Entry point: stash the log, resolve the feature columns and
        # delegate to the vectorizer registered for this model type.
        self.log = log
        columns = self.define_columns(add_cols, self.one_timestamp)
        loader = self._get_vectorizer(model_type)
        return loader(params, columns)

    def register_vectorizer(self, model_type, vectorizer):
        # Associate a model type with one of the known strategies
        # ('basic', 'inter', 'gan'); unknown strategy names are rejected.
        try:
            self._vectorizers[model_type] = self._vec_dispatcher[vectorizer]
        except KeyError:
            raise ValueError(vectorizer)

    def _get_vectorizer(self, model_type):
        # Look up the vectorizer registered for this model type; raise
        # when the type was never registered.
        vectorizer = self._vectorizers.get(model_type)
        if not vectorizer:
            raise ValueError(model_type)
        return vectorizer

    @staticmethod
    def define_columns(add_cols, one_timestamp):
        """Return the list of dataframe columns to vectorize."""
        columns = ['ac_index', 'rl_index', 'dur_norm']
        # Additional features use their normalized ('_norm') variant,
        # except weekday which is one-hot encoded later instead.
        add_cols = [x+'_norm' if x != 'weekday' else x for x in add_cols]
        columns.extend(add_cols)
        if not one_timestamp:
            columns.extend(['wait_norm'])
        return columns

    def _vectorize_seq(self, parms, columns):
        """
        Dataframe vectorizer.
        parms:
            columns: list of features to vectorize.
            parms (dict): parms for training the network
        Returns:
            dict: Dictionary that contains all the LSTM inputs.
        """
        # TODO: reorganize this method so that times can be vectorized with
        # one or two time features; possibly handle them the same way as
        # the intercase features.
        times = ['dur_norm'] if parms['one_timestamp'] else ['dur_norm', 'wait_norm']
        equi = {'ac_index': 'activities', 'rl_index': 'roles'}
        vec = {'prefixes': dict(),
               'next_evt': dict()}
        x_times_dict = dict()
        y_times_dict = dict()
        # Per-trace series with artificial start/end events inserted.
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        # n-gram definition
        for i, _ in enumerate(self.log):
            for x in columns:
                # Left-padded n-grams: one prefix per position, padded with
                # 0 up to n_size.
                serie = list(ngrams(self.log[i][x], parms['n_size'],
                                    pad_left=True, left_pad_symbol=0))
                # Target is the last element of each n-gram, shifted by one
                # so y[t] is the event that follows prefix t.
                y_serie = [x[-1] for x in serie]
                serie = serie[:-1]
                y_serie = y_serie[1:]
                if x in list(equi.keys()):
                    vec['prefixes'][equi[x]] = (vec['prefixes'][equi[x]] + serie
                                                if i > 0 else serie)
                    vec['next_evt'][equi[x]] = (vec['next_evt'][equi[x]] + y_serie
                                                if i > 0 else y_serie)
                elif x in times:
                    x_times_dict[x] = (
                        x_times_dict[x] + serie if i > 0 else serie)
                    y_times_dict[x] = (
                        y_times_dict[x] + y_serie if i > 0 else y_serie)
        # Transform task, dur and role prefixes in vectors
        for value in equi.values():
            vec['prefixes'][value] = np.array(vec['prefixes'][value])
            vec['next_evt'][value] = np.array(vec['next_evt'][value])
        # one-hot encode target values
        vec['next_evt']['activities'] = ku.to_categorical(
            vec['next_evt']['activities'], num_classes=len(self.ac_index))
        vec['next_evt']['roles'] = ku.to_categorical(
            vec['next_evt']['roles'], num_classes=len(self.rl_index))
        # reshape times to (examples, n_size, 1) and stack the time
        # features along the last axis
        for key, value in x_times_dict.items():
            x_times_dict[key] = np.array(value)
            x_times_dict[key] = x_times_dict[key].reshape(
                (x_times_dict[key].shape[0], x_times_dict[key].shape[1], 1))
        vec['prefixes']['times'] = np.dstack(list(x_times_dict.values()))
        # Reshape y times attributes (suffixes, number of attributes)
        vec['next_evt']['times'] = np.dstack(list(y_times_dict.values()))[0]
        return vec

    def _vectorize_seq_inter(self, parms, columns):
        """
        Dataframe vectorizer to process intercase or data attributes features.
        parms:
            columns: list of features to vectorize.
            parms (dict): parms for training the network
        Returns:
            dict: Dictionary that contains all the LSTM inputs.
        """
        # Same n-gram scheme as _vectorize_seq, plus extra buckets for
        # weekday (one-hot) and any other intercase attribute.
        times = ['dur_norm'] if parms['one_timestamp'] else ['dur_norm', 'wait_norm']
        equi = {'ac_index': 'activities', 'rl_index': 'roles'}
        vec = {'prefixes': dict(),
               'next_evt': dict()}
        x_weekday = list()
        y_weekday = list()
        # times
        x_times_dict = dict()
        y_times_dict = dict()
        # intercases
        x_inter_dict = dict()
        y_inter_dict = dict()
        # self.log = self.log[self.log.caseid.isin(['1', '1770'])].head(3)
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        for i, _ in enumerate(self.log):
            for x in columns:
                serie = list(ngrams(self.log[i][x], parms['n_size'],
                                    pad_left=True, left_pad_symbol=0))
                y_serie = [x[-1] for x in serie]
                serie = serie[:-1]
                y_serie = y_serie[1:]
                if x in list(equi.keys()):
                    vec['prefixes'][equi[x]] = (
                        vec['prefixes'][equi[x]] + serie if i > 0 else serie)
                    vec['next_evt'][equi[x]] = (
                        vec['next_evt'][equi[x]] + y_serie
                        if i > 0 else y_serie)
                elif x in times:
                    x_times_dict[x] = (
                        x_times_dict[x] + serie if i > 0 else serie)
                    y_times_dict[x] = (
                        y_times_dict[x] + y_serie if i > 0 else y_serie)
                elif x == 'weekday':
                    x_weekday = (
                        x_weekday + serie if i > 0 else serie)
                    y_weekday = (
                        y_weekday + y_serie if i > 0 else y_serie)
                else:
                    x_inter_dict[x] = (
                        x_inter_dict[x] + serie if i > 0 else serie)
                    y_inter_dict[x] = (
                        y_inter_dict[x] + y_serie if i > 0 else y_serie)
        # Transform task, dur and role prefixes in vectors
        for value in equi.values():
            vec['prefixes'][value] = np.array(vec['prefixes'][value])
            vec['next_evt'][value] = np.array(vec['next_evt'][value])
        # one-hot encode target values
        vec['next_evt']['activities'] = ku.to_categorical(
            vec['next_evt']['activities'], num_classes=len(self.ac_index))
        vec['next_evt']['roles'] = ku.to_categorical(
            vec['next_evt']['roles'], num_classes=len(self.rl_index))
        # reshape times
        for key, value in x_times_dict.items():
            x_times_dict[key] = np.array(value)
            x_times_dict[key] = x_times_dict[key].reshape(
                (x_times_dict[key].shape[0], x_times_dict[key].shape[1], 1))
        vec['prefixes']['times'] = np.dstack(list(x_times_dict.values()))
        # Reshape y times attributes (suffixes, number of attributes)
        vec['next_evt']['times'] = np.dstack(list(y_times_dict.values()))[0]
        # Reshape intercase attributes (prefixes, n-gram size, number of attributes)
        for key, value in x_inter_dict.items():
            x_inter_dict[key] = np.array(value)
            x_inter_dict[key] = x_inter_dict[key].reshape(
                (x_inter_dict[key].shape[0], x_inter_dict[key].shape[1], 1))
        vec['prefixes']['inter_attr'] = np.dstack(list(x_inter_dict.values()))
        # Reshape y intercase attributes (suffixes, number of attributes)
        vec['next_evt']['inter_attr'] = np.dstack(list(y_inter_dict.values()))[0]
        if 'weekday' in columns:
            # Onehot encode weekday and append it to the intercase tensor.
            x_weekday = ku.to_categorical(x_weekday, num_classes=7)
            y_weekday = ku.to_categorical(y_weekday, num_classes=7)
            vec['prefixes']['inter_attr'] = np.concatenate(
                [vec['prefixes']['inter_attr'], x_weekday], axis=2)
            vec['next_evt']['inter_attr'] = np.concatenate(
                [vec['next_evt']['inter_attr'], y_weekday], axis=1)
        return vec

    def gan_simple(self, parms, columns):
        # Builds (activity, role) pairs for a GAN discriminator: real pairs
        # are labelled 0, synthetic (negative) pairs are labelled 1.
        print(columns)
        vec = {'training': dict()}
        pairs = self.log.copy()
        pairs = pairs[['ac_index', 'rl_index']]
        pairs = pairs.to_records(index=False).tolist()
        # Vectorize discriminator training real inputs
        vec['training']['activities'] = [x[0] for x in pairs]
        vec['training']['activities'] = ku.to_categorical(
            vec['training']['activities'], num_classes=len(self.ac_index))
        vec['training']['roles'] = [x[1] for x in pairs]
        vec['training']['roles'] = ku.to_categorical(
            vec['training']['roles'], num_classes=len(self.rl_index))
        vec['training']['class'] = np.zeros(len(pairs))

        # If the discriminator will be pretrained create pretraining examples
        if parms['gan_pretrain']:
            # one third of real events randomly selected
            n_positive = int(round(len(pairs)/3))
            negative_ratio = 2

            batch_size = n_positive * (1 + negative_ratio)
            # columns: activity index, role index, class label
            batch = np.zeros((batch_size, 3))
            pairs_set = set(pairs)
            # NOTE(review): negatives below are drawn as positions in the
            # index dicts' key lists; this matches the index values only
            # when indices are contiguous 0..n-1 — confirm.
            activities = list(self.ac_index.keys())
            roles = list(self.rl_index.keys())
            # randomly choose positive examples (label 0)
            idx = 0
            for idx, (activity, role) in enumerate(
                    random.sample(pairs, n_positive)):
                batch[idx, :] = (activity, role, 0)
            # Increment idx by 1
            idx += 1
            # Add negative examples until reach batch size
            while idx < batch_size:
                # random selection
                random_ac = random.randrange(len(activities))
                random_rl = random.randrange(len(roles))
                # Check to make sure this is not a positive example
                if (random_ac, random_rl) not in pairs_set:
                    # Add to batch and increment index; label 1 marks a
                    # synthetic (fake) pair for the classification task
                    batch[idx, :] = (random_ac, random_rl, 1)
                    idx += 1
            vec['pretraining'] = dict()
            # Make sure to shuffle order
            np.random.shuffle(batch)
            vec['pretraining']['activities'] = ku.to_categorical(
                batch[:, 0], num_classes=len(self.ac_index))
            vec['pretraining']['roles'] = ku.to_categorical(
                batch[:, 1], num_classes=len(self.rl_index))
            vec['pretraining']['class'] = batch[:, 2]
        return vec
============================================================================= 252 | # Reformat events 253 | # ============================================================================= 254 | def reformat_events(self, columns, one_timestamp): 255 | """Creates series of activities, roles and relative times per trace. 256 | parms: 257 | self.log: dataframe. 258 | ac_index (dict): index of activities. 259 | rl_index (dict): index of roles. 260 | Returns: 261 | list: lists of activities, roles and relative times. 262 | """ 263 | temp_data = list() 264 | log_df = self.log.to_dict('records') 265 | key = 'end_timestamp' if one_timestamp else 'start_timestamp' 266 | log_df = sorted(log_df, key=lambda x: (x['caseid'], key)) 267 | for key, group in itertools.groupby(log_df, key=lambda x: x['caseid']): 268 | trace = list(group) 269 | temp_dict = dict() 270 | for x in columns: 271 | serie = [y[x] for y in trace] 272 | if x == 'ac_index': 273 | serie.insert(0, self.ac_index[('start')]) 274 | serie.append(self.ac_index[('end')]) 275 | elif x == 'rl_index': 276 | serie.insert(0, self.rl_index[('start')]) 277 | serie.append(self.rl_index[('end')]) 278 | else: 279 | serie.insert(0, 0) 280 | serie.append(0) 281 | temp_dict = {**{x: serie}, **temp_dict} 282 | temp_dict = {**{'caseid': key}, **temp_dict} 283 | temp_data.append(temp_dict) 284 | return temp_data -------------------------------------------------------------------------------- /models_spec.ini: -------------------------------------------------------------------------------- 1 | [shared_cat] 2 | scaler = basic 3 | additional_columns = [] 4 | vectorizer = basic 5 | trainer = shared_cat 6 | [specialized] 7 | scaler = basic 8 | additional_columns = [] 9 | vectorizer = basic 10 | trainer = specialized 11 | [concatenated] 12 | scaler = basic 13 | additional_columns = [] 14 | vectorizer = basic 15 | trainer = concatenated 16 | [shared_cat_cx] 17 | scaler = inter 18 | additional_columns = [daytime, weekday] 19 | vectorizer = 
inter 20 | trainer = shared_cat_cx 21 | [concatenated_cx] 22 | scaler = inter 23 | additional_columns = [daytime, weekday] 24 | vectorizer = inter 25 | trainer = concatenated_cx 26 | [shared_cat_gru_cx] 27 | scaler = inter 28 | additional_columns = [daytime, weekday] 29 | vectorizer = inter 30 | trainer = shared_cat_gru_cx 31 | [concatenated_gru_cx] 32 | scaler = inter 33 | additional_columns = [daytime, weekday] 34 | vectorizer = inter 35 | trainer = concatenated_gru_cx 36 | [shared_cat_inter] 37 | scaler = inter 38 | additional_columns = [ev_et, ev_et_t, ev_rd, ev_rp_occ] 39 | vectorizer = inter 40 | trainer = shared_cat_inter 41 | [shared_cat_inter_full] 42 | scaler = inter 43 | additional_columns = [daytime, acc_cycle, ev_et, ev_et_t, ev_rd, ev_rp_occ] 44 | vectorizer = inter 45 | trainer = shared_cat_inter 46 | [concatenated_inter] 47 | scaler = inter 48 | additional_columns = [daytime, acc_cycle, ev_et, ev_et_t, ev_rd, ev_rp_occ] 49 | vectorizer = inter 50 | trainer = concatenated_inter 51 | [simple_gan] 52 | scaler = basic 53 | additional_columns = [] 54 | vectorizer = gan 55 | trainer = gan 56 | [shared_cat_gru] 57 | scaler = basic 58 | additional_columns = [] 59 | vectorizer = basic 60 | trainer = shared_cat_gru 61 | [specialized_gru] 62 | scaler = basic 63 | additional_columns = [] 64 | vectorizer = basic 65 | trainer = specialized_gru 66 | [concatenated_gru] 67 | scaler = basic 68 | additional_columns = [] 69 | vectorizer = basic 70 | trainer = concatenated_gru -------------------------------------------------------------------------------- /support_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /support_modules/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- 
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 13 23:56:26 2019

@author: Manuel Camargo
"""

import os
import datetime

from keras.callbacks import Callback


class CleanSavedModelsCallback(Callback):
    """Deletes stale .h5 checkpoints, keeping the `num_models` newest."""

    def __init__(self, output_folder, num_models):
        # Initialize the Keras base class so its internal state exists.
        super().__init__()
        self.logs = []
        self.num_models = num_models
        self.path = output_folder

    def on_epoch_end(self, epoch, logs={}):
        # BUG FIX: the original called create_folder_list(self), passing
        # the instance itself as the unused `logs` parameter.
        files = self.create_folder_list()
        for file in files:
            os.unlink(os.path.join(self.path, file))

    def create_folder_list(self, logs=None):
        """Return checkpoint filenames older than the newest `num_models`.

        `logs` is unused; kept only for signature compatibility (the
        original used a mutable `{}` default).
        """
        file_list = list()
        for _, _, files in os.walk(self.path):
            # keep only keras .h5 checkpoints
            files_filtered = list()
            for f in files:
                _, file_extension = os.path.splitext(f)
                if file_extension == '.h5':
                    files_filtered.append(f)
            # pair each checkpoint with its modification time
            creation_list = list()
            for f in files_filtered:
                date = os.path.getmtime(os.path.join(self.path, f))
                creation_list.append(
                    dict(filename=f,
                         creation=datetime.datetime.utcfromtimestamp(date)))
            # newest first; everything past num_models is stale
            creation_list = sorted(creation_list,
                                   key=lambda x: x['creation'],
                                   reverse=True)
            for f in creation_list[self.num_models:]:
                file_list.append(f['filename'])
        return file_list
import os
import numpy as np

from time import time
from keras.callbacks import Callback
import utils.support as sup


class TimingCallback(Callback):
    """Records wall-clock time per epoch and appends a csv summary row."""

    def __init__(self, output_folder, log_path=os.path.join(
            'output_files', 'training_times.csv')):
        # Initialize the Keras base class so its internal state exists.
        super().__init__()
        self.logs = []  # per-epoch durations, in seconds
        self.output_folder = output_folder
        self.log_path = log_path

    def on_epoch_begin(self, epoch, logs={}):
        self.starttime = time()

    def on_epoch_end(self, epoch, logs={}):
        self.logs.append(time() - self.starttime)

    def on_train_end(self, logs={}):
        # ROBUSTNESS FIX: if training ended before the first epoch
        # completed, np.min/np.max on an empty list would raise.
        if not self.logs:
            return
        log_file = self.log_path
        data = [{'output_folder': self.output_folder,
                 'train_epochs': len(self.logs),
                 'avg_time': np.mean(self.logs),
                 'min_time': np.min(self.logs),
                 'max_time': np.max(self.logs)}]
        # append to an existing csv, otherwise create it with a header
        if os.path.exists(log_file):
            sup.create_csv_file(data, log_file, mode='a')
        else:
            sup.create_csv_file_header(data, log_file)


# -*- coding: utf-8 -*-
import scipy
from scipy.stats import pearsonr
import networkx as nx
import utils.support as sup
from operator import itemgetter
import pandas as pd


class ResourcePoolAnalyser():
    """
    This class evaluates the tasks durations and associates resources to it
    """

    def __init__(self, log, drawing=False, sim_threshold=0.7):
        """constructor"""
        self.data = self.read_resource_pool(log)
        self.drawing = drawing
        # minimum profile correlation for two users to share a role
        self.sim_threshold = sim_threshold

        # task/user name -> contiguous integer id
        self.tasks = {val: i for i, val in enumerate(self.data.task.unique())}
        self.users = {val: i for i, val in enumerate(self.data.user.unique())}

        self.roles, self.resource_table = self.discover_roles()

    def read_resource_pool(self, log):
        # Accept either a dataframe or an event-log object exposing .data.
        if isinstance(log, pd.DataFrame):
            filtered_list = log[['task', 'user']]
        else:
            filtered_list = pd.DataFrame(log.data)[['task', 'user']]
        # Drop artificial Start/End events and automatic resources.
        filtered_list = filtered_list[~filtered_list.task.isin(['Start', 'End'])]
        filtered_list = filtered_list[filtered_list.user != 'AUTO']
        return filtered_list
    def discover_roles(self):
        """Cluster users into roles from their task-frequency profiles.

        Returns the (records, resource_table) tuple produced by
        role_definition (unpacked by the constructor).
        """
        # (task_id, user_id) pair for every event
        associations = lambda x: (self.tasks[x['task']], self.users[x['user']])
        self.data['ac_rl'] = self.data.apply(associations, axis=1)

        # frequency of each (task, user) pair
        freq_matrix = (self.data.groupby(by='ac_rl')['task']
                       .count()
                       .reset_index()
                       .rename(columns={'task': 'freq'}))
        freq_matrix = {x['ac_rl']: x['freq'] for x in freq_matrix.to_dict('records')}

        profiles = self.build_profile(freq_matrix)

        sup.print_progress(((20 / 100)* 100),'Analysing resource pool ')
        # building of a correlation matrix between resources profiles
        correl_matrix = self.det_correl_matrix(profiles)
        sup.print_progress(((40 / 100)* 100),'Analysing resource pool ')
        # creation of a relation network between resources
        g = nx.Graph()
        for user in self.users.values():
            g.add_node(user)
        for rel in correl_matrix:
            # creation of edges between nodes excluding the same elements
            # and those below the similarity threshold
            if rel['distance'] > self.sim_threshold and rel['x'] != rel['y']:
                g.add_edge(rel['x'],
                           rel['y'],
                           weight=rel['distance'])
        sup.print_progress(((60 / 100) * 100),'Analysing resource pool ')
        # extraction of fully connected subgraphs as roles
        sub_graphs = list(nx.connected_components(g))
        sup.print_progress(((80 / 100) * 100),'Analysing resource pool ')
        # role definition from graph
        roles = self.role_definition(sub_graphs)
        # plot creation (optional)
        # if drawing == True:
        #     graph_network(g, sub_graphs)
        sup.print_progress(((100 / 100)* 100),'Analysing resource pool ')
        sup.print_done_task()
        return roles

    def build_profile(self, freq_matrix):
        """Return one task-frequency vector ('profile') per user."""
        profiles=list()
        for user, idx in self.users.items():
            # one slot per task, filled with this user's execution counts
            profile = [0,] * len(self.tasks)
            for ac_rl, freq in freq_matrix.items():
                if idx == ac_rl[1]:
                    profile[ac_rl[0]] = freq
            profiles.append({'user': idx, 'profile': profile})
        return profiles
| 86 | def det_correl_matrix(self, profiles): 87 | correl_matrix = list() 88 | for profile_x in profiles: 89 | for profile_y in profiles: 90 | x = scipy.array(profile_x['profile']) 91 | y = scipy.array(profile_y['profile']) 92 | r_row, p_value = pearsonr(x, y) 93 | correl_matrix.append(({'x': profile_x['user'], 94 | 'y': profile_y['user'], 95 | 'distance': r_row})) 96 | return correl_matrix 97 | 98 | def role_definition(self, sub_graphs): 99 | user_index = {v: k for k, v in self.users.items()} 100 | records= list() 101 | for i in range(0, len(sub_graphs)): 102 | users_names = [user_index[x] for x in sub_graphs[i]] 103 | records.append({'role': 'Role '+ str(i + 1), 104 | 'quantity': len(sub_graphs[i]), 105 | 'members': users_names}) 106 | #Sort roles by number of resources 107 | records = sorted(records, key=itemgetter('quantity'), reverse=True) 108 | for i in range(0,len(records)): 109 | records[i]['role']='Role '+ str(i + 1) 110 | resource_table = list() 111 | for record in records: 112 | for member in record['members']: 113 | resource_table.append({'role': record['role'], 114 | 'resource': member}) 115 | return records, resource_table 116 | 117 | # # == support 118 | # def random_color(size): 119 | # number_of_colors = size 120 | # color = ["#"+''.join([random.choice('0123456789ABCDEF') 121 | # for j in range(6)]) for i in range(number_of_colors)] 122 | # return color 123 | 124 | # def graph_network(g, sub_graphs): 125 | # pos = nx.spring_layout(g, k=0.5,scale=10) 126 | # color = random_color(len(sub_graphs)) 127 | # for i in range(0,len(sub_graphs)): 128 | # subgraph = sub_graphs[i] 129 | # nx.draw_networkx_nodes(g,pos, nodelist=list(subgraph), 130 | # node_color=color[i], node_size=200, alpha=0.8) 131 | # nx.draw_networkx_edges(g,pos,width=1.0,alpha=0.5) 132 | # nx.draw_networkx_edges(g,pos, edgelist=subgraph.edges, 133 | # width=8,alpha=0.5,edge_color=color[i]) 134 | # plt.draw() 135 | # plt.show() # display 136 | 137 | 
--------------------------------------------------------------------------------