├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── dg_predictiction.py ├── dg_training.py ├── environment.yml ├── input_files └── event_logs.zip ├── model_prediction ├── __init__.py ├── event_log_predictor.py ├── interfaces.py ├── model_predictor.py ├── next_event_predictor.py ├── next_event_samples_creator.py ├── suffix_predictor.py └── suffix_samples_creator.py ├── model_training ├── __init__.py ├── embedding_training.py ├── features_manager.py ├── intercase_features │ └── __init__.py ├── model_hpc_optimizer.py ├── model_loader.py ├── model_optimizer.py ├── model_trainer.py ├── models │ ├── __init__.py │ ├── model_concatenated.py │ ├── model_concatenated_cx.py │ ├── model_gru_concatenated.py │ ├── model_gru_concatenated_cx.py │ ├── model_gru_shared_cat.py │ ├── model_gru_shared_cat_cx.py │ ├── model_gru_specialized.py │ ├── model_shared_cat.py │ ├── model_shared_cat_cx.py │ └── model_specialized.py ├── samples_creator.py └── slurm_trainer.py ├── models_spec.ini └── support_modules ├── __init__.py ├── callbacks ├── __init__.py ├── clean_models_callback.py └── time_callback.py └── role_discovery.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | output_files/ 3 | jobs_files/ 4 | .idea/* 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepGenerator: Learning Accurate Generative Models of Business Processes with LSTM Neural Networks 2 | 3 | The code here presented is able to execute different pre- and post-processing methods and architectures for building and using generative models from event logs in XES format using LSTM anf GRU neural networks. This code can perform the next tasks: 4 | 5 | 6 | * Training LSTM neuronal networks using an event log as input. 7 | * Generate full event logs using a trained LSTM neuronal network. 8 | * Predict the remaining time and the continuation (suffix) of an incomplete business process trace. 9 | 10 | 11 | ## Getting Started 12 | 13 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. 
14 | 15 | ``` 16 | git clone https://github.com/AdaptiveBProcess/GenerativeLSTM.git 17 | ``` 18 | 19 | ### Prerequisites 20 | 21 | To execute this code you just need to install Anaconda in your system, and create an environment using the *environment.yml* specification provided in the repository. 22 | ``` 23 | cd GenerativeLSTM 24 | conda env create -f environment.yml 25 | conda activate deep_generator 26 | ``` 27 | 28 | ## Running the script 29 | 30 | Once created the environment, you can perform each one of the tasks, specifying the following parameters in the lstm.py module, or by command line as is described below: 31 | 32 | *Training LSTM neuronal network:* To perform this task you need to set the required activity (-a) as 'training' followed by the name of the (-f) event log, and all the following parameters: 33 | 34 | * Filename (-f): Log filename. 35 | * Model family (-m): The available options are lstm, gru, lstm_cx and gru_cx. 36 | * Max Eval (-e): Maximum number of evaluations. 37 | * Opt method (-o): Optimization method used. The available options are hpc and bayesian. 38 | 39 | ``` 40 | (lstm_env) C:\sc_lstm>python dg_training.py -f Helpdesk.xes -m lstm -e 1 -o bayesian 41 | ``` 42 | 43 | *Predictive task:* It is possible to execute various predictive tasks with DeepGenerator, such as predicting the next event, the case continuation, and the remaining time of an ongoing case. Similarly, it is possible to generate complete event logs starting from a zero prefix size. To perform these tasks, you need to set the activity (-a) as ‘predict_next’ for the next event prediction, ‘pred_sfx’ for case continuation and remaining time, and ‘pred_log’ for the full event log generation. Additionally, it's required to indicate the folder where the predictive model is located (-c), and the name of the .h5 model (-b). 
# -*- coding: utf-8 -*-
"""
Command-line entry point for the predictive tasks of DeepGenerator
(next-event prediction, suffix prediction, and full-log generation).

Created on Tue Feb 23 19:08:25 2021

@author: Manuel Camargo
"""
import os

# Silence TensorFlow C++ logging; must be set before TensorFlow is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import sys
import getopt


# =============================================================================
# Main function
# =============================================================================
def catch_parameter(opt):
    """Map a short console flag to its internal parameter name.

    Returns None when the flag is not recognized.
    """
    switch = {'-h': 'help', '-a': 'activity', '-c': 'folder',
              '-b': 'model_file', '-v': 'variant', '-r': 'rep'}
    return switch.get(opt)


def main(argv):
    """Collect execution parameters and launch the model predictor.

    Parameters come either from the hard-coded defaults below (when no
    console arguments are given) or from the command line for batch
    operations.
    """
    parameters = dict()
    column_names = {'Case ID': 'caseid',
                    'Activity': 'task',
                    'lifecycle:transition': 'event_type',
                    'Resource': 'user'}
    parameters['one_timestamp'] = False  # Only one timestamp in the log
    parameters['read_options'] = {
        'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
        'column_names': column_names,
        'one_timestamp': parameters['one_timestamp'],
        'filter_d_attrib': False}
    # Parameters settled manually or caught by console for batch operations
    if not argv:
        # predict_next, pred_sfx
        parameters['activity'] = 'pred_log'
        parameters['folder'] = '20230302_3CC0DC8C_5A76_4ED1_8E90_AFB851EB1AA0'
        parameters['model_file'] = 'Production.h5'
        parameters['is_single_exec'] = False  # single or batch execution
        # variants and repetitions to be tested Random Choice, Arg Max
        parameters['variant'] = 'Random Choice'
        parameters['rep'] = 1
    else:
        # Catch parms by console
        try:
            opts, _ = getopt.getopt(argv, "ho:a:f:c:b:v:r:",
                                    ['one_timestamp=', 'activity=', 'folder=',
                                     'model_file=', 'variant=', 'rep='])
            for opt, arg in opts:
                key = catch_parameter(opt)
                if key is None:
                    # BUG FIX: unmapped flags used to be stored under the
                    # dictionary key None; skip them explicitly instead.
                    print('Unknown option ' + opt + ' ignored')
                    continue
                if key == 'rep':
                    parameters[key] = int(arg)
                else:
                    parameters[key] = arg
        except getopt.GetoptError:
            print('Invalid option')
            sys.exit(2)
    print(parameters['folder'])
    print(parameters['model_file'])
    # Imported here (after the TF log level is set and parameters parsed)
    # so that importing this module stays cheap and side-effect free.
    from model_prediction import model_predictor as pr
    pr.ModelPredictor(parameters)


if __name__ == "__main__":
    main(sys.argv[1:])
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TF logging before TF loads
import sys
import getopt


# =============================================================================
# Main function
# =============================================================================
def catch_parameter(opt):
    """Map a short console flag to its internal parameter name.

    Returns None when the flag is not recognized.
    """
    switch = {'-h': 'help', '-f': 'file_name', '-m': 'model_family',
              '-e': 'max_eval', '-o': 'opt_method'}
    return switch.get(opt)


def main(argv):
    """Collect training parameters and launch the model trainer.

    Parameters come from the hard-coded defaults (no console arguments)
    or from the command line for batch operations.
    """
    parameters = dict()
    column_names = {'Case ID': 'caseid',
                    'Activity': 'task',
                    'lifecycle:transition': 'event_type',
                    'Resource': 'user'}
    parameters['one_timestamp'] = False  # Only one timestamp in the log
    parameters['read_options'] = {
        'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
        'column_names': column_names,
        'one_timestamp': parameters['one_timestamp']}
    # Parameters settled manually or caught by console for batch operations
    if not argv:
        # Event-log filename
        parameters['file_name'] = 'Production.csv'
        parameters['model_family'] = 'lstm'
        parameters['opt_method'] = 'bayesian'  # 'rand_hpc', 'bayesian'
        parameters['max_eval'] = 1
    else:
        # Catch parms by console.
        # BUG FIX: the optstring was "h:f:m:e:o:", which made -h require
        # an argument; -h is a flag and takes none.
        try:
            opts, _ = getopt.getopt(argv, "hf:m:e:o:",
                                    ['file_name=', 'model_family=',
                                     'max_eval=', 'opt_method='])
            for opt, arg in opts:
                key = catch_parameter(opt)
                if key is None:
                    # Skip unmapped flags instead of storing them under None.
                    print('Unknown option ' + opt + ' ignored')
                    continue
                if key == 'max_eval':
                    parameters[key] = int(arg)
                else:
                    parameters[key] = arg
        except getopt.GetoptError:
            print('Invalid option')
            sys.exit(2)
    # Similarity btw the resources profile execution (Song et al.)
    parameters['rp_sim'] = 0.85
    parameters['batch_size'] = 32  # Usually 32/64/128/256
    parameters['norm_method'] = ['max', 'lognorm']
    parameters['imp'] = 1
    parameters['epochs'] = 200
    parameters['n_size'] = [5, 10, 15]
    parameters['l_size'] = [50, 100]
    parameters['lstm_act'] = ['selu', 'tanh']
    # Candidate architectures per model family.
    family_types = {
        'lstm': ['shared_cat', 'concatenated'],
        'gru': ['shared_cat_gru', 'concatenated_gru'],
        'lstm_cx': ['shared_cat_cx', 'concatenated_cx'],
        'gru_cx': ['shared_cat_gru_cx', 'concatenated_gru_cx']}
    family = parameters.get('model_family')
    if family not in family_types:
        # BUG FIX: an unknown family used to leave 'model_type' unset and
        # crash later with an opaque KeyError; fail fast with a clear error.
        raise ValueError('Unknown model family: {}'.format(family))
    parameters['model_type'] = family_types[family]
    parameters['dense_act'] = ['linear']
    parameters['optim'] = ['Nadam']

    # NOTE(review): 'model_type' is always a list here, so this branch is
    # currently unreachable; kept for fidelity with the original flow.
    if parameters['model_type'] == 'simple_gan':
        parameters['gan_pretrain'] = False
    parameters.pop('model_family', None)
    # Train models (deferred heavy import keeps module import cheap)
    from model_training import model_trainer as tr
    tr.ModelTrainer(parameters)


if __name__ == "__main__":
    main(sys.argv[1:])
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 16:24:38 2020

@author: Manuel Camargo
"""
from model_prediction import next_event_samples_creator as nesc
from model_prediction import suffix_samples_creator as ssc


from model_prediction import next_event_predictor as nep
from model_prediction import suffix_predictor as sp
from model_prediction import event_log_predictor as elp


class SamplesCreator:
    """Pick the sample-creation strategy matching a predictive activity
    and hand it to the predictor."""

    def create(self, predictor, activity):
        predictor.sampling(self._get_samples_creator(activity))

    def _get_samples_creator(self, activity):
        # Dispatch table: activity name -> samples-creator class.
        creators = {'predict_next': nesc.NextEventSamplesCreator,
                    'pred_sfx': ssc.SuffixSamplesCreator}
        if activity not in creators:
            raise ValueError(activity)
        return creators[activity]()


class PredictionTasksExecutioner:
    """Pick the prediction strategy matching a predictive activity
    and run it through the predictor."""

    def predict(self, predictor, activity, run_num):
        predictor.predict(self._get_predictor(activity), run_num)

    def _get_predictor(self, activity):
        # Dispatch table: activity name -> predictor class.
        executioners = {'predict_next': nep.NextEventPredictor,
                        'pred_sfx': sp.SuffixPredictor,
                        'pred_log': elp.EventLogPredictor}
        if activity not in executioners:
            raise ValueError(activity)
        return executioners[activity]()
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 10:49:28 2020

@author: Manuel Camargo
"""
import os
import json
import copy

import pandas as pd
import numpy as np
import configparser as cp

import readers.log_reader as lr
import utils.support as sup

from model_training import features_manager as feat
from model_prediction import interfaces as it
import analyzers.sim_evaluator as ev


class ModelPredictor():
    """
    Main class in charge of model evaluation: it loads the parameters
    saved at training time, restores the test log, runs the requested
    predictive task (next event, suffix, or full-log generation) and
    exports predictions and evaluation results.
    """

    def __init__(self, parms):
        self.output_route = os.path.join('output_files', parms['folder'])
        self.parms = parms
        # Load training-time parameters (indexes, scale args, dims, ...)
        self.load_parameters()
        self.model_name = os.path.join(self.output_route, parms['model_file'])
        self.log = self.load_log_test(self.output_route, self.parms)

        self.samples = dict()
        self.predictions = None
        self.sim_values = list()

        self.model_def = dict()
        self.read_model_definition(self.parms['model_type'])
        self.parms['additional_columns'] = self.model_def['additional_columns']
        # NOTE(review): execute_predictive_task has no return statement, so
        # self.acc is always None; kept for interface compatibility.
        self.acc = self.execute_predictive_task()

    def execute_predictive_task(self):
        """Prepare the samples, run the predictions, and evaluate them."""
        # create examples for next event and suffix
        if self.parms['activity'] == 'pred_log':
            self.parms['num_cases'] = len(self.log.caseid.unique())
            self.parms['start_time'] = self.log.start_timestamp.min()
        else:
            feat_mannager = feat.FeaturesMannager(self.parms)
            feat_mannager.register_scaler(self.parms['model_type'],
                                          self.model_def['vectorizer'])
            self.log, _ = feat_mannager.calculate(
                self.log, self.parms['additional_columns'])
            sampler = it.SamplesCreator()
            sampler.create(self, self.parms['activity'])
        # predict
        self.imp = self.parms['variant']
        for run_num in range(self.parms['rep']):
            self.predict_values(run_num)
            # export predictions
            self.export_predictions(run_num)
            # assessment
            evaluator = EvaluateTask()
            if self.parms['activity'] == 'pred_log':
                self.sim_values.extend(
                    evaluator.evaluate(self.parms,
                                       self.log,
                                       self.predictions,
                                       run_num))
            else:
                # BUG FIX: EvaluateTask.evaluate expects
                # (parms, log, predictions, rep_num); it was previously
                # called with only two arguments, raising a TypeError.
                evaluator.evaluate(self.parms, self.log,
                                   self.predictions, run_num)
        self._export_results(self.output_route)

    def predict_values(self, run_num):
        """Run the prediction executioner for the configured activity."""
        executioner = it.PredictionTasksExecutioner()
        executioner.predict(self, self.parms['activity'], run_num)

    @staticmethod
    def load_log_test(output_route, parms):
        """Read the test split saved at training time, dropping the
        artificial Start/End events."""
        df_test = lr.LogReader(
            os.path.join(output_route, 'parameters', 'test_log.csv'),
            parms['read_options'])
        df_test = pd.DataFrame(df_test.data)
        df_test = df_test[~df_test.task.isin(['Start', 'End'])]
        return df_test

    def load_parameters(self):
        """Load and type-cast the parameters persisted during training,
        merging them into self.parms."""
        path = os.path.join(self.output_route,
                            'parameters',
                            'model_parameters.json')
        with open(path) as file:
            data = json.load(file)
        if 'activity' in data:
            del data['activity']
        parms = {k: v for k, v in data.items()}
        # 'rep' is a runtime setting; never inherit it from training
        parms.pop('rep', None)
        self.parms = {**self.parms, **parms}
        if 'dim' in data.keys():
            self.parms['dim'] = {k: int(v) for k, v in data['dim'].items()}
        # JSON stores everything as strings; cast the scale args to float
        if self.parms['one_timestamp']:
            self.parms['scale_args'] = {
                k: float(v) for k, v in data['scale_args'].items()}
        else:
            for key in data['scale_args'].keys():
                self.parms['scale_args'][key] = {
                    k: float(v) for k, v in data['scale_args'][key].items()}
        self.parms['index_ac'] = {int(k): v
                                  for k, v in data['index_ac'].items()}
        self.parms['index_rl'] = {int(k): v
                                  for k, v in data['index_rl'].items()}
        # Reverse lookups: label -> index
        self.ac_index = {v: k for k, v in self.parms['index_ac'].items()}
        self.rl_index = {v: k for k, v in self.parms['index_rl'].items()}

    def sampling(self, sampler):
        """Create the evaluation samples with the given sampler."""
        sampler.register_sampler(self.parms['model_type'],
                                 self.model_def['vectorizer'])
        self.samples = sampler.create_samples(
            self.parms, self.log, self.ac_index,
            self.rl_index, self.model_def['additional_columns'])

    def predict(self, executioner, run_num):
        """Execute the predictions and store them as a DataFrame."""
        results = executioner.predict(self.parms,
                                      self.model_name,
                                      self.samples,
                                      self.imp,
                                      self.model_def['vectorizer'])
        self.predictions = pd.DataFrame(results)

    def export_predictions(self, r_num):
        """Write the predictions of repetition r_num as a CSV file."""
        if not os.path.exists(self.output_route):
            os.makedirs(self.output_route)
        self.predictions.to_csv(
            os.path.join(
                self.output_route, 'gen_' +
                self.parms['model_file'].split('.')[0]+'_'+str(r_num+1)+'.csv'),
            index=False)

    @staticmethod
    def scale_feature(log, feature, parms, replace=False):
        """Scales a number given a technique.
        Args:
            log: Event-log to be scaled.
            feature: Feature to be scaled.
            parms: dict with 'norm_method' (max, lognorm, normal,
                standard, or None) and the matching 'scale_args'.
            replace (optional): replace the original value or keep both.
        Returns:
            Log with a new <feature>_norm column scaled between 0 and 1.
        """
        method = parms['norm_method']
        scale_args = parms['scale_args']
        if method == 'lognorm':
            log[feature + '_log'] = np.log1p(log[feature])
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            log[feature+'_norm'] = np.divide(
                np.subtract(log[feature+'_log'], min_value),
                (max_value - min_value))
            log = log.drop((feature + '_log'), axis=1)
        elif method == 'normal':
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            log[feature+'_norm'] = np.divide(
                np.subtract(log[feature], min_value),
                (max_value - min_value))
        elif method == 'standard':
            mean = scale_args['mean']
            std = scale_args['std']
            log[feature + '_norm'] = np.divide(np.subtract(log[feature], mean),
                                               std)
        elif method == 'max':
            max_value = scale_args['max_value']
            log[feature + '_norm'] = (np.divide(log[feature], max_value)
                                      if max_value > 0 else 0)
        elif method is None:
            log[feature+'_norm'] = log[feature]
        else:
            raise ValueError(method)
        if replace:
            log = log.drop(feature, axis=1)
        return log

    def read_model_definition(self, model_type):
        """Read the model specification (extra columns, vectorizer) for
        the given model type from models_spec.ini."""
        Config = cp.ConfigParser(interpolation=None)
        Config.read('models_spec.ini')
        # File name with extension
        self.model_def['additional_columns'] = sup.reduce_list(
            Config.get(model_type, 'additional_columns'), dtype='str')
        self.model_def['vectorizer'] = Config.get(model_type, 'vectorizer')

    def _export_results(self, output_path) -> None:
        """Persist similarity values and the filtered test log."""
        # Save results
        pd.DataFrame(self.sim_values).to_csv(
            os.path.join(self.output_route, sup.file_id(prefix='SE_')),
            index=False)
        # Save logs
        log_test = self.log[~self.log.task.isin(['Start', 'End'])]
        log_test.to_csv(
            os.path.join(self.output_route, 'tst_' +
                         self.parms['model_file'].split('.')[0]+'.csv'),
            index=False)
class EvaluateTask():
    """Dispatches the evaluation of predictions according to the activity.

    NOTE(review): ``save_results`` is called but not defined in this chunk;
    it is expected to be provided elsewhere in this module.
    """

    def evaluate(self, parms, log, predictions, rep_num):
        """Evaluate *predictions* for the activity configured in *parms*.

        Args:
            parms: experiment parameters (must contain 'activity').
            log: ground-truth event log.
            predictions: predictions DataFrame.
            rep_num: repetition number.
        Returns:
            The activity-specific quality measure.
        """
        evaluator = self._get_evaluator(parms['activity'])
        # BUGFIX: the handlers have different signatures; the original code
        # called every handler as (parms, log, predictions, rep_num), which
        # raises TypeError for 'predict_next'/'pred_sfx' whose signature is
        # (data, parms, rep_num). Adapt the arguments per activity instead.
        if parms['activity'] == 'pred_log':
            return evaluator(parms, log, predictions, rep_num)
        return evaluator(predictions, parms, rep_num)

    def _get_evaluator(self, activity):
        """Map an activity name to its evaluation handler."""
        if activity == 'predict_next':
            return self._evaluate_predict_next
        elif activity == 'pred_sfx':
            return self._evaluate_pred_sfx
        elif activity == 'pred_log':
            return self._evaluate_predict_log
        else:
            raise ValueError(activity)

    def _evaluate_predict_next(self, data, parms, rep_num):
        """Measure accuracy (and time MAE) of next-event predictions."""
        exp_desc = self.clean_parameters(parms.copy())
        evaluator = ev.Evaluator(parms['one_timestamp'])
        ac_sim = evaluator.measure('accuracy', data, 'ac')
        rl_sim = evaluator.measure('accuracy', data, 'rl')
        mean_ac = ac_sim.accuracy.mean()
        # replicate the experiment description once per measured row
        exp_desc = pd.DataFrame([exp_desc])
        exp_desc = pd.concat([exp_desc] * len(ac_sim), ignore_index=True)
        ac_sim = pd.concat([ac_sim, exp_desc], axis=1).to_dict('records')
        rl_sim = pd.concat([rl_sim, exp_desc], axis=1).to_dict('records')
        self.save_results(ac_sim, 'ac', parms)
        self.save_results(rl_sim, 'rl', parms)
        if parms['one_timestamp']:
            tm_mae = evaluator.measure('mae_next', data, 'tm')
            tm_mae = pd.concat([tm_mae, exp_desc], axis=1).to_dict('records')
            self.save_results(tm_mae, 'tm', parms)
        else:
            dur_mae = evaluator.measure('mae_next', data, 'dur')
            wait_mae = evaluator.measure('mae_next', data, 'wait')
            dur_mae = pd.concat([dur_mae, exp_desc], axis=1).to_dict('records')
            wait_mae = pd.concat([wait_mae, exp_desc],
                                 axis=1).to_dict('records')
            self.save_results(dur_mae, 'dur', parms)
            self.save_results(wait_mae, 'wait', parms)
        return mean_ac

    def _evaluate_pred_sfx(self, data, parms, rep_num):
        """Measure suffix similarity (and time MAE) of suffix predictions."""
        exp_desc = self.clean_parameters(parms.copy())
        evaluator = ev.Evaluator(parms['one_timestamp'])
        ac_sim = evaluator.measure('similarity', data, 'ac')
        rl_sim = evaluator.measure('similarity', data, 'rl')
        mean_sim = ac_sim['mean'].mean()
        exp_desc = pd.DataFrame([exp_desc])
        exp_desc = pd.concat([exp_desc] * len(ac_sim), ignore_index=True)
        ac_sim = pd.concat([ac_sim, exp_desc], axis=1).to_dict('records')
        rl_sim = pd.concat([rl_sim, exp_desc], axis=1).to_dict('records')
        self.save_results(ac_sim, 'ac', parms)
        self.save_results(rl_sim, 'rl', parms)
        if parms['one_timestamp']:
            tm_mae = evaluator.measure('mae_suffix', data, 'tm')
            tm_mae = pd.concat([tm_mae, exp_desc], axis=1).to_dict('records')
            self.save_results(tm_mae, 'tm', parms)
        else:
            dur_mae = evaluator.measure('mae_suffix', data, 'dur')
            wait_mae = evaluator.measure('mae_suffix', data, 'wait')
            dur_mae = pd.concat([dur_mae, exp_desc], axis=1).to_dict('records')
            wait_mae = pd.concat([wait_mae, exp_desc],
                                 axis=1).to_dict('records')
            self.save_results(dur_mae, 'dur', parms)
            self.save_results(wait_mae, 'wait', parms)
        return mean_sim

    @staticmethod
    def _evaluate_predict_log(parms, log, sim_log, rep_num):
        """Compare a generated log against the ground-truth log.

        Args:
            parms (dict): experiment settings.
            log: ground-truth event log.
            sim_log: generated (simulated) event log.
            rep_num (int): repetition number.
        Returns:
            list: one dict of similarity metrics for this run.
        """
        import copy  # local: module-level import lives outside this chunk
        sim_values = list()
        log = copy.deepcopy(log)
        log = log[~log.task.isin(['Start', 'End', 'start', 'end'])]
        log['caseid'] = log['caseid'].astype(str)
        log['caseid'] = 'Case' + log['caseid']
        sim_log = sim_log[~sim_log.task.isin(['Start', 'End', 'start', 'end'])]
        evaluator = ev.SimilarityEvaluator(log, sim_log, parms)
        metrics = ['tsd', 'day_hour_emd', 'log_mae', 'dl', 'mae']
        for metric in metrics:
            evaluator.measure_distance(metric)
        sim_values.append({**{'run_num': rep_num}, **evaluator.similarity})
        return sim_values

    @staticmethod
    def clean_parameters(parms):
        """Strip bookkeeping keys so only the experiment settings remain."""
        exp_desc = parms.copy()
        for key in ('activity', 'read_options', 'column_names',
                    'one_timestamp', 'reorder', 'index_ac', 'index_rl',
                    'dim', 'max_dur', 'variants', 'is_single_exec'):
            exp_desc.pop(key, None)
        return exp_desc


# ---------------------------------------------------------------------------
# model_prediction/next_event_predictor.py
# Created on Tue Mar 17 20:35:53 2020  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import numpy as np

try:
    import utils.support as sup
except ImportError:  # NOTE(review): keeps the module importable standalone
    sup = None


class NextEventPredictor():
    """Predicts the next event (activity, role, time) for a set of prefixes."""

    def __init__(self):
        """constructor"""
        self.model = None     # keras model, injected in predict()
        self.spl = dict()     # samples: prefixes and expected next events
        self.imp = 'arg_max'  # next-event selection method

    def predict(self, params, model, spl, imp, vectorizer):
        """Run the model-specific predictor over the samples."""
        self.model = model
        self.spl = spl
        self.imp = imp
        predictor = self._get_predictor(params['model_type'])
        sup.print_performed_task('Predicting next events')
        return predictor(params, vectorizer)

    def _get_predictor(self, model_type):
        # OJO: extension point in case a different predictor is necessary
        return self._predict_next_event_shared_cat
41 | """ 42 | # Generation of predictions 43 | results = list() 44 | for i, _ in enumerate(self.spl['prefixes']['activities']): 45 | # Activities and roles input shape(1,5) 46 | x_ac_ngram = (np.append( 47 | np.zeros(parameters['dim']['time_dim']), 48 | np.array(self.spl['prefixes']['activities'][i]), 49 | axis=0)[-parameters['dim']['time_dim']:] 50 | .reshape((1, parameters['dim']['time_dim']))) 51 | 52 | x_rl_ngram = (np.append( 53 | np.zeros(parameters['dim']['time_dim']), 54 | np.array(self.spl['prefixes']['roles'][i]), 55 | axis=0)[-parameters['dim']['time_dim']:] 56 | .reshape((1, parameters['dim']['time_dim']))) 57 | 58 | # times input shape(1,5,1) 59 | times_attr_num = (self.spl['prefixes']['times'][i].shape[1]) 60 | x_t_ngram = np.array( 61 | [np.append(np.zeros( 62 | (parameters['dim']['time_dim'], times_attr_num)), 63 | self.spl['prefixes']['times'][i], axis=0) 64 | [-parameters['dim']['time_dim']:] 65 | .reshape((parameters['dim']['time_dim'], times_attr_num))] 66 | ) 67 | 68 | # add intercase features if necessary 69 | if vectorizer in ['basic']: 70 | inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram] 71 | elif vectorizer in ['inter']: 72 | # times input shape(1,5,1) 73 | inter_attr_num = (self.spl['prefixes']['inter_attr'][i] 74 | .shape[1]) 75 | x_inter_ngram = np.array( 76 | [np.append(np.zeros(( 77 | parameters['dim']['time_dim'], inter_attr_num)), 78 | self.spl['prefixes']['inter_attr'][i], axis=0) 79 | [-parameters['dim']['time_dim']:] 80 | .reshape( 81 | (parameters['dim']['time_dim'], inter_attr_num))] 82 | ) 83 | inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram] 84 | # predict 85 | preds = self.model.predict(inputs) 86 | if self.imp == 'random_choice': 87 | # Use this to get a random choice following as PDF 88 | pos = np.random.choice(np.arange(0, len(preds[0][0])), 89 | p=preds[0][0]) 90 | pos1 = np.random.choice(np.arange(0, len(preds[1][0])), 91 | p=preds[1][0]) 92 | elif self.imp == 'arg_max': 93 | # Use this to get the max prediction 
94 | pos = np.argmax(preds[0][0]) 95 | pos1 = np.argmax(preds[1][0]) 96 | 97 | # save results 98 | predictions = [pos, pos1, preds[2][0][0]] 99 | if not parameters['one_timestamp']: 100 | predictions.extend([preds[2][0][1]]) 101 | results.append( 102 | self.create_result_record(i, self.spl, predictions, parameters)) 103 | sup.print_done_task() 104 | return results 105 | 106 | def create_result_record(self, index, spl, preds, parms): 107 | record = dict() 108 | record['ac_prefix'] = spl['prefixes']['activities'][index] 109 | record['ac_expect'] = spl['next_evt']['activities'][index] 110 | record['ac_pred'] = preds[0] 111 | record['rl_prefix'] = spl['prefixes']['roles'][index] 112 | record['rl_expect'] = spl['next_evt']['roles'][index] 113 | record['rl_pred'] = preds[1] 114 | if parms['one_timestamp']: 115 | record['tm_prefix'] = [self.rescale( 116 | x, parms, parms['scale_args']) 117 | for x in spl['prefixes']['times'][index]] 118 | record['tm_expect'] = self.rescale( 119 | spl['next_evt']['times'][index][0], 120 | parms, parms['scale_args']) 121 | record['tm_pred'] = self.rescale( 122 | preds[2], parms, parms['scale_args']) 123 | else: 124 | # Duration 125 | record['dur_prefix'] = [self.rescale( 126 | x[0], parms, parms['scale_args']['dur']) 127 | for x in spl['prefixes']['times'][index]] 128 | record['dur_expect'] = self.rescale( 129 | spl['next_evt']['times'][index][0], parms, 130 | parms['scale_args']['dur']) 131 | record['dur_pred'] = self.rescale( 132 | preds[2], parms, parms['scale_args']['dur']) 133 | # Waiting 134 | record['wait_prefix'] = [self.rescale( 135 | x[1], parms, parms['scale_args']['wait']) 136 | for x in spl['prefixes']['times'][index]] 137 | record['wait_expect'] = self.rescale( 138 | spl['next_evt']['times'][index][1], parms, 139 | parms['scale_args']['wait']) 140 | record['wait_pred'] = self.rescale( 141 | preds[3], parms, parms['scale_args']['wait']) 142 | return record 143 | 144 | @staticmethod 145 | def rescale(value, parms, scale_args): 
# ---------------------------------------------------------------------------
# model_prediction/suffix_predictor.py
# Created on Wed Mar 18 10:35:37 2020  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import numpy as np

try:
    import utils.support as sup
except ImportError:  # NOTE(review): keeps the module importable standalone
    sup = None


class SuffixPredictor():
    """Generates whole business-process suffixes with a trained keras model."""

    def __init__(self):
        """constructor"""
        self.model = None        # keras model, injected in predict()
        self.spl = dict()        # samples: prefixes and expected suffixes
        self.imp = 'arg_max'     # next-event selection method
        self.max_trace_size = 0  # hard stop for suffix generation

    def predict(self, params, model, spl, imp, vectorizer):
        """Predict the suffix of every prefix in *spl*."""
        self.model = model
        self.spl = spl
        self.max_trace_size = params['max_trace_size']
        self.imp = imp
        predictor = self._get_predictor(params['model_type'])
        sup.print_performed_task('Predicting suffixes')
        return predictor(params, vectorizer)

    def _get_predictor(self, model_type):
        # OJO: extension point in case a different predictor is necessary
        return self._predict_suffix_shared_cat

    def _predict_suffix_shared_cat(self, parms, vectorizer):
        """Generate business process suffixes using a keras trained model.

        Args:
            parms (dict): execution parameters ('dim', 'index_ac', ...).
            vectorizer (str): sample vectorizer ('basic' or 'inter').
        Returns:
            list: one result record per prefix.
        """
        results = list()
        for i, _ in enumerate(self.spl['prefixes']['activities']):
            # Activities and roles input shape (1, time_dim)
            x_ac_ngram = (np.append(
                np.zeros(parms['dim']['time_dim']),
                np.array(self.spl['prefixes']['activities'][i]),
                axis=0)[-parms['dim']['time_dim']:]
                .reshape((1, parms['dim']['time_dim'])))

            x_rl_ngram = (np.append(
                np.zeros(parms['dim']['time_dim']),
                np.array(self.spl['prefixes']['roles'][i]),
                axis=0)[-parms['dim']['time_dim']:]
                .reshape((1, parms['dim']['time_dim'])))

            times_attr_num = self.spl['prefixes']['times'][i].shape[1]
            x_t_ngram = np.array(
                [np.append(np.zeros(
                    (parms['dim']['time_dim'], times_attr_num)),
                    self.spl['prefixes']['times'][i], axis=0)
                    [-parms['dim']['time_dim']:]
                    .reshape((parms['dim']['time_dim'], times_attr_num))])
            if vectorizer in ['basic']:
                inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
            elif vectorizer in ['inter']:
                inter_attr_num = self.spl['prefixes']['inter_attr'][i].shape[1]
                x_inter_ngram = np.array([np.append(
                    np.zeros((parms['dim']['time_dim'], inter_attr_num)),
                    self.spl['prefixes']['inter_attr'][i],
                    axis=0)[-parms['dim']['time_dim']:]
                    .reshape((parms['dim']['time_dim'], inter_attr_num))])
                inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram, x_inter_ngram]
            else:
                # BUGFIX: an unknown vectorizer previously left `inputs`
                # unbound (NameError at predict time).
                raise ValueError(vectorizer)

            pref_size = len(self.spl['prefixes']['activities'][i])
            acum_dur, acum_wait = list(), list()
            ac_suf, rl_suf = list(), list()
            for _ in range(1, self.max_trace_size):
                preds = self.model.predict(inputs)
                if self.imp == 'random_choice':
                    # random choice following the predicted distribution
                    pos = np.random.choice(
                        np.arange(0, len(preds[0][0])), p=preds[0][0])
                    pos1 = np.random.choice(
                        np.arange(0, len(preds[1][0])), p=preds[1][0])
                elif self.imp == 'arg_max':
                    # maximum-probability prediction
                    pos = np.argmax(preds[0][0])
                    pos1 = np.argmax(preds[1][0])
                else:
                    # BUGFIX: unknown selection methods previously fell
                    # through and raised NameError on `pos`.
                    raise ValueError(self.imp)
                # slide the n-grams one step with the new prediction
                x_ac_ngram = np.append(x_ac_ngram, [[pos]], axis=1)
                x_ac_ngram = np.delete(x_ac_ngram, 0, 1)
                x_rl_ngram = np.append(x_rl_ngram, [[pos1]], axis=1)
                x_rl_ngram = np.delete(x_rl_ngram, 0, 1)
                x_t_ngram = np.append(x_t_ngram, [preds[2]], axis=1)
                x_t_ngram = np.delete(x_t_ngram, 0, 1)
                if vectorizer in ['basic']:
                    inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram]
                elif vectorizer in ['inter']:
                    x_inter_ngram = np.append(x_inter_ngram, [preds[3]],
                                              axis=1)
                    x_inter_ngram = np.delete(x_inter_ngram, 0, 1)
                    inputs = [x_ac_ngram, x_rl_ngram, x_t_ngram,
                              x_inter_ngram]
                # Stop if the next prediction is the end of the trace,
                # otherwise continue until the defined max_size
                ac_suf.append(pos)
                rl_suf.append(pos1)
                acum_dur.append(preds[2][0][0])
                if not parms['one_timestamp']:
                    acum_wait.append(preds[2][0][1])
                if parms['index_ac'][pos] == 'end':
                    break
            # save results
            predictions = [ac_suf, rl_suf, acum_dur]
            if not parms['one_timestamp']:
                predictions.extend([acum_wait])
            results.append(
                self.create_result_record(i, self.spl, predictions, parms,
                                          pref_size))
        sup.print_done_task()
        return results

    def create_result_record(self, index, spl, preds, parms, pref_size):
        """Assemble prefix, expected suffix and predicted suffix (rescaled)."""
        record = dict()
        record['pref_size'] = pref_size
        record['ac_prefix'] = spl['prefixes']['activities'][index]
        record['ac_expect'] = spl['next_evt']['activities'][index]
        record['ac_pred'] = preds[0]
        record['rl_prefix'] = spl['prefixes']['roles'][index]
        record['rl_expect'] = spl['next_evt']['roles'][index]
        record['rl_pred'] = preds[1]
        if parms['one_timestamp']:
            record['tm_prefix'] = [self.rescale(
                x[0], parms, parms['scale_args'])
                for x in spl['prefixes']['times'][index]]
            record['tm_expect'] = [self.rescale(
                x[0], parms, parms['scale_args'])
                for x in spl['next_evt']['times'][index]]
            record['tm_pred'] = [self.rescale(
                x, parms, parms['scale_args'])
                for x in preds[2]]
        else:
            # Duration
            record['dur_prefix'] = [self.rescale(
                x[0], parms, parms['scale_args']['dur'])
                for x in spl['prefixes']['times'][index]]
            record['dur_expect'] = [self.rescale(
                x[0], parms, parms['scale_args']['dur'])
                for x in spl['next_evt']['times'][index]]
            record['dur_pred'] = [self.rescale(
                x, parms, parms['scale_args']['dur'])
                for x in preds[2]]
            # Waiting
            record['wait_prefix'] = [self.rescale(
                x[1], parms, parms['scale_args']['wait'])
                for x in spl['prefixes']['times'][index]]
            record['wait_expect'] = [self.rescale(
                x[1], parms, parms['scale_args']['wait'])
                for x in spl['next_evt']['times'][index]]
            record['wait_pred'] = [self.rescale(
                x, parms, parms['scale_args']['wait'])
                for x in preds[3]]
        return record

    @staticmethod
    def rescale(value, parms, scale_args):
        """Invert the feature scaling applied at training time."""
        method = parms['norm_method']
        if method == 'lognorm':
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            value = (value * (max_value - min_value)) + min_value
            value = np.expm1(value)
        elif method == 'normal':
            max_value = scale_args['max_value']
            min_value = scale_args['min_value']
            value = (value * (max_value - min_value)) + min_value
        elif method == 'standard':
            value = (value * scale_args['std']) + scale_args['mean']
        elif method == 'max':
            value = np.rint(value * scale_args['max_value'])
        elif method is None:
            pass  # stored unscaled
        else:
            raise ValueError(method)
        return value
# ---------------------------------------------------------------------------
# model_prediction/suffix_samples_creator.py
# Created on Wed Mar 18 10:03:26 2020  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import itertools

import pandas as pd
import numpy as np


class SuffixSamplesCreator():
    """
    This is the main class in charge of creating the prefix/suffix samples
    used for suffix prediction.
    """

    def __init__(self):
        self.log = pd.DataFrame   # placeholder; set in create_samples
        self.ac_index = dict()
        self.rl_index = dict()
        self._samplers = dict()
        self._samp_dispatcher = {'basic': self._sample_suffix,
                                 'inter': self._sample_suffix_inter}

    def create_samples(self, params, log, ac_index, rl_index, add_cols):
        """Build the samples for the configured model type."""
        self.log = log
        self.ac_index = ac_index
        self.rl_index = rl_index
        columns = self.define_columns(add_cols, params['one_timestamp'])
        sampler = self._get_model_specific_sampler(params['model_type'])
        return sampler(columns, params)

    @staticmethod
    def define_columns(add_cols, one_timestamp):
        """Return the log columns needed for sampling.

        'weekday' is used as-is; every other additional column is expected
        in its scaled ('_norm') version.
        """
        columns = ['ac_index', 'rl_index', 'dur_norm']
        add_cols = [x + '_norm' if x != 'weekday' else x for x in add_cols]
        columns.extend(add_cols)
        if not one_timestamp:
            columns.extend(['wait_norm'])
        return columns

    def register_sampler(self, model_type, sampler):
        """Associate a model type with one of the dispatcher samplers."""
        try:
            self._samplers[model_type] = self._samp_dispatcher[sampler]
        except KeyError:
            raise ValueError(sampler)

    def _get_model_specific_sampler(self, model_type):
        sampler = self._samplers.get(model_type)
        if not sampler:
            raise ValueError(model_type)
        return sampler

    def _sample_suffix(self, columns, parms):
        """
        Extraction of prefixes and expected suffixes from the event log.

        Args:
            columns: log columns to vectorize.
            parms: execution parameters ('one_timestamp', ...).
        Returns:
            dict: prefixes and expected suffixes ('next_evt').
        """
        # BUGFIX: removed leftover debug `print(columns)`
        times = (['dur_norm'] if parms['one_timestamp']
                 else ['dur_norm', 'wait_norm'])
        equi = {'ac_index': 'activities', 'rl_index': 'roles'}
        vec = {'prefixes': dict(),
               'next_evt': dict()}
        x_times_dict = dict()
        y_times_dict = dict()
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        # n-gram definition
        for i, _ in enumerate(self.log):
            for x in columns:
                serie, y_serie = list(), list()
                for idx in range(1, len(self.log[i][x])):
                    serie.append(self.log[i][x][:idx])
                    y_serie.append(self.log[i][x][idx:])
                if x in list(equi.keys()):
                    vec['prefixes'][equi[x]] = (
                        vec['prefixes'][equi[x]] + serie
                        if i > 0 else serie)
                    vec['next_evt'][equi[x]] = (
                        vec['next_evt'][equi[x]] + y_serie
                        if i > 0 else y_serie)
                elif x in times:
                    x_times_dict[x] = (
                        x_times_dict[x] + serie if i > 0 else serie)
                    y_times_dict[x] = (
                        y_times_dict[x] + y_serie if i > 0 else y_serie)
        # Reshape times (prefixes, n-gram size, # attributes)
        vec['prefixes']['times'] = list()
        x_times_dict = pd.DataFrame(x_times_dict)
        for row in x_times_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            vec['prefixes']['times'].append(new_row)
        # Reshape expected times (prefixes, # attributes)
        vec['next_evt']['times'] = list()
        y_times_dict = pd.DataFrame(y_times_dict)
        for row in y_times_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            vec['next_evt']['times'].append(new_row)
        return vec

    def _sample_suffix_inter(self, columns, parms):
        """Like _sample_suffix, but also vectorizes intercase attributes.

        NOTE(review): this variant keys the expected values as 'suffixes'
        while _sample_suffix uses 'next_evt' — confirm consumers expect
        that asymmetry.
        """
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        spl = {'prefixes': dict(), 'suffixes': dict()}
        # n-gram definition
        equi = {'ac_index': 'activities',
                'rl_index': 'roles',
                'dur_norm': 'times'}
        x_inter_dict, y_inter_dict = dict(), dict()
        for i, _ in enumerate(self.log):
            for x in columns:
                serie, y_serie = list(), list()
                for idx in range(1, len(self.log[i][x])):
                    serie.append(self.log[i][x][:idx])
                    y_serie.append(self.log[i][x][idx:])
                if x in list(equi.keys()):
                    spl['prefixes'][equi[x]] = (
                        spl['prefixes'][equi[x]] + serie
                        if i > 0 else serie)
                    spl['suffixes'][equi[x]] = (
                        spl['suffixes'][equi[x]] + y_serie
                        if i > 0 else y_serie)
                else:
                    x_inter_dict[x] = (
                        x_inter_dict[x] + serie if i > 0 else serie)
                    y_inter_dict[x] = (
                        y_inter_dict[x] + y_serie if i > 0 else y_serie)
        # Reshape intercase attributes (prefixes, n-gram size, # attributes)
        spl['prefixes']['inter_attr'] = list()
        x_inter_dict = pd.DataFrame(x_inter_dict)
        for row in x_inter_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            spl['prefixes']['inter_attr'].append(new_row)
        # Reshape intercase expected attributes (prefixes, # attributes)
        spl['suffixes']['inter_attr'] = list()
        y_inter_dict = pd.DataFrame(y_inter_dict)
        for row in y_inter_dict.values:
            new_row = [np.array(x) for x in row]
            new_row = np.dstack(new_row)
            new_row = new_row.reshape((new_row.shape[1], new_row.shape[2]))
            spl['suffixes']['inter_attr'].append(new_row)
        return spl

    # =========================================================================
    # Reformat
    # =========================================================================
    def reformat_events(self, columns, one_timestamp):
        """Create series of activities, roles and relative times per trace.

        Args:
            columns: log columns to serialize.
            one_timestamp (bool): whether the log has a single timestamp.
        Returns:
            list: one dict per trace with the per-column series.
        """
        temp_data = list()
        log_df = self.log.to_dict('records')
        key = 'end_timestamp' if one_timestamp else 'start_timestamp'
        # BUGFIX: the original sorted by the literal string `key` instead of
        # each event's timestamp value, so events were never time-ordered.
        log_df = sorted(log_df, key=lambda x: (x['caseid'], x[key]))
        for key, group in itertools.groupby(log_df, key=lambda x: x['caseid']):
            trace = list(group)
            temp_dict = dict()
            for x in columns:
                serie = [y[x] for y in trace]
                if x == 'ac_index':
                    serie.insert(0, self.ac_index[('start')])
                    serie.append(self.ac_index[('end')])
                elif x == 'rl_index':
                    serie.insert(0, self.rl_index[('start')])
                    serie.append(self.rl_index[('end')])
                else:
                    serie.insert(0, 0)
                    serie.append(0)
                temp_dict = {**{x: serie}, **temp_dict}
            temp_dict = {**{'caseid': key}, **temp_dict}
            temp_data.append(temp_dict)
        return temp_data


# ---------------------------------------------------------------------------
# model_training/__init__.py  (empty)
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# model_training/embedding_training.py
# Created on Wed Nov 21 21:23:55 2018  @author: Manuel Camargo
# ---------------------------------------------------------------------------
import os
import random
import itertools
import math
import numpy as np

# Heavy/optional dependencies are guarded so the module can be imported in
# environments without keras or the project utilities; training itself still
# requires them at runtime.
try:
    from keras.models import Model
    from keras.layers import Input, Embedding, Dot, Reshape
except ImportError:
    Model = Input = Embedding = Dot = Reshape = None

try:
    import utils.support as sup
except ImportError:
    sup = None


def training_model(parameters, log, ac_index, index_ac, rl_index, index_rl):
    """Main method of the embedding training module.

    Trains activity/role embeddings from the event log and exports them as
    .emb files under input_files/embedded_matix.

    Args:
        parameters (dict): parameters for training the embedding network
            (must contain 'file_name').
        log: event log (DataFrame).
        ac_index / index_ac: activity index and its inverse.
        rl_index / index_rl: role index and its inverse.
    """
    # Define the number of dimensions as the 4th root of the number of
    # (activity, role) combinations
    dim_number = math.ceil(
        len(list(itertools.product(*[list(ac_index.items()),
                                     list(rl_index.items())])))**0.25)

    ac_weights, rl_weights = train_embedded(log, ac_index, rl_index,
                                            dim_number)

    out_dir = os.path.join('input_files', 'embedded_matix')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    base = parameters['file_name'].split('.')[0]
    sup.create_file_from_list(
        reformat_matrix(index_ac, ac_weights),
        os.path.join(out_dir, 'ac_' + base + '.emb'))
    sup.create_file_from_list(
        reformat_matrix(index_rl, rl_weights),
        os.path.join(out_dir, 'rl_' + base + '.emb'))


# =============================================================================
# Pre-processing: embedded dimension
# =============================================================================

def train_embedded(log_df, ac_index, rl_index, dim_number):
    """Carry out the training of the embeddings."""
    # Build the list of observed (activity, role) index pairs
    pairs = list()
    for i in range(0, len(log_df)):
        pairs.append((ac_index[log_df.iloc[i]['task']],
                      rl_index[log_df.iloc[i]['role']]))

    model = ac_rl_embedding_model(ac_index, rl_index, dim_number)
    model.summary()

    n_positive = 1024
    gen = generate_batch(pairs, ac_index, rl_index,
                         n_positive, negative_ratio=2)
    # NOTE(review): fit_generator is deprecated in modern Keras (use fit);
    # kept for compatibility with the pinned environment.
    model.fit_generator(gen, epochs=100,
                        steps_per_epoch=len(pairs) // n_positive,
                        verbose=2)

    # Extract the trained embedding matrices
    ac_layer = model.get_layer('activity_embedding')
    rl_layer = model.get_layer('role_embedding')
    ac_weights = ac_layer.get_weights()[0]
    rl_weights = rl_layer.get_weights()[0]
    return ac_weights, rl_weights


def generate_batch(pairs, ac_index, rl_index, n_positive=50,
                   negative_ratio=1.0):
    """Generate batches of samples for training.

    Args:
        pairs: observed positive (activity_index, role_index) pairs.
        ac_index / rl_index: activity and role indexes.
        n_positive: number of positive examples per batch.
        negative_ratio: negatives sampled per positive.
    Yields:
        ({'activity': ..., 'role': ...}, labels) with label 1 for observed
        pairs and 0 for sampled negatives.
    """
    # BUGFIX: a float negative_ratio (e.g. the default 1.0) produced a float
    # batch_size and made np.zeros raise TypeError.
    batch_size = int(n_positive * (1 + negative_ratio))
    batch = np.zeros((batch_size, 3))
    pairs_set = set(pairs)
    activities = list(ac_index.keys())
    roles = list(rl_index.keys())
    # This creates a generator
    while True:
        # randomly choose positive examples
        idx = 0
        for idx, (activity, role) in enumerate(random.sample(pairs,
                                                             n_positive)):
            batch[idx, :] = (activity, role, 1)
            # Increment idx by 1
            idx += 1

        # Add negative examples until reaching batch size
        while idx < batch_size:
            # random selection
            random_ac = random.randrange(len(activities))
            random_rl = random.randrange(len(roles))

            # Check to make sure this is not a positive example
            if (random_ac, random_rl) not in pairs_set:
                # Add to batch and increment index, 0 due classification task
                batch[idx, :] = (random_ac, random_rl, 0)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'activity': batch[:, 0], 'role': batch[:, 1]}, batch[:, 2]


def ac_rl_embedding_model(ac_index, rl_index, embedding_size):
    """Model to embed activities and roles using the functional API."""
    # Both inputs are 1-dimensional
    activity = Input(name='activity', shape=[1])
    role = Input(name='role', shape=[1])

    # Embedding the activity (shape will be (None, 1, embedding_size))
    activity_embedding = Embedding(name='activity_embedding',
                                   input_dim=len(ac_index),
                                   output_dim=embedding_size)(activity)

    # Embedding the role (shape will be (None, 1, embedding_size))
    role_embedding = Embedding(name='role_embedding',
                               input_dim=len(rl_index),
                               output_dim=embedding_size)(role)

    # Merge the layers with a dot product
    # along the second axis (shape will be (None, 1, 1))
    merged = Dot(name='dot_product',
                 normalize=True, axes=2)([activity_embedding, role_embedding])

    # Reshape to be a single number (shape will be (None, 1))
    merged = Reshape(target_shape=[1])(merged)

    # Loss function is mean squared error
    model = Model(inputs=[activity, role], outputs=merged)
    model.compile(optimizer='Adam', loss='mse')
    return model


# =============================================================================
# Support
# =============================================================================


def reformat_matrix(index, weigths):
    """Reformat the embedded matrix for exporting.

    Args:
        index: index of activities or roles (int -> label).
        weigths: matrix of calculated coordinates.
    Returns:
        list: rows of [index, label, coord...].
    """
    matrix = list()
    for i, _ in enumerate(index):
        data = [i, index[i]]
        data.extend(weigths[i])
        matrix.append(data)
    return matrix
160 | """ 161 | matrix = list() 162 | for i, _ in enumerate(index): 163 | data = [i, index[i]] 164 | data.extend(weigths[i]) 165 | matrix.append(data) 166 | return matrix 167 | -------------------------------------------------------------------------------- /model_training/features_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 14 19:18:18 2020 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | import itertools 11 | from operator import itemgetter 12 | try: 13 | from support_modules import role_discovery as rl 14 | except: 15 | import os 16 | from importlib import util 17 | spec = util.spec_from_file_location( 18 | 'role_discovery', 19 | os.path.join(os.getcwd(), 'support_modules', 'role_discovery.py')) 20 | rl = util.module_from_spec(spec) 21 | spec.loader.exec_module(rl) 22 | 23 | class FeaturesMannager(): 24 | 25 | 26 | def __init__(self, params): 27 | """constructor""" 28 | self.model_type = params['model_type'] 29 | self.one_timestamp = params['one_timestamp'] 30 | # self.resources = pd.DataFrame 31 | self.norm_method = params['norm_method'] 32 | self._scalers = dict() 33 | self.scale_dispatcher = {'basic': self._scale_base, 34 | 'inter': self._scale_inter} 35 | 36 | def calculate(self, log, add_cols): 37 | log = self.add_calculated_times(log) 38 | log = self.filter_features(log, add_cols) 39 | return self.scale_features(log, add_cols) 40 | 41 | @staticmethod 42 | def add_resources(log, rp_sim): 43 | # Resource pool discovery 44 | res_analyzer = rl.ResourcePoolAnalyser(log, sim_threshold=rp_sim) 45 | # Role discovery 46 | resources = pd.DataFrame.from_records(res_analyzer.resource_table) 47 | resources = resources.rename(index=str, 48 | columns={"resource": "user"}) 49 | # Add roles information 50 | log = log.merge(resources, on='user', how='left') 51 | log = log[~log.task.isin(['Start', 'End'])] 52 | log = 
log.reset_index(drop=True) 53 | return log 54 | 55 | def filter_features(self, log, add_cols): 56 | # Add intercase features 57 | columns = ['caseid', 'task', 'user', 'end_timestamp', 58 | 'role', 'dur', 'ac_index', 'rl_index'] 59 | if not self.one_timestamp: 60 | columns.extend(['start_timestamp', 'wait']) 61 | columns.extend(add_cols) 62 | log = log[columns] 63 | return log 64 | 65 | def add_calculated_times(self, log): 66 | """Appends the indexes and relative time to the dataframe. 67 | parms: 68 | log: dataframe. 69 | Returns: 70 | Dataframe: The dataframe with the calculated features added. 71 | """ 72 | log['dur'] = 0 73 | log['acc_cycle'] = 0 74 | log['daytime'] = 0 75 | log = log.to_dict('records') 76 | log = sorted(log, key=lambda x: x['caseid']) 77 | for _, group in itertools.groupby(log, key=lambda x: x['caseid']): 78 | events = list(group) 79 | ordk = 'end_timestamp' if self.one_timestamp else 'start_timestamp' 80 | events = sorted(events, key=itemgetter(ordk)) 81 | for i in range(0, len(events)): 82 | # In one-timestamp approach the first activity of the trace 83 | # is taken as instantsince there is no previous timestamp 84 | # to find a range 85 | if self.one_timestamp: 86 | if i == 0: 87 | dur = 0 88 | acc = 0 89 | else: 90 | dur = (events[i]['end_timestamp'] - 91 | events[i-1]['end_timestamp']).total_seconds() 92 | acc = (events[i]['end_timestamp'] - 93 | events[0]['end_timestamp']).total_seconds() 94 | else: 95 | dur = (events[i]['end_timestamp'] - 96 | events[i]['start_timestamp']).total_seconds() 97 | acc = (events[i]['end_timestamp'] - 98 | events[0]['start_timestamp']).total_seconds() 99 | if i == 0: 100 | wit = 0 101 | else: 102 | wit = (events[i]['start_timestamp'] - 103 | events[i-1]['end_timestamp']).total_seconds() 104 | events[i]['wait'] = wit if wit >= 0 else 0 105 | events[i]['dur'] = dur 106 | events[i]['acc_cycle'] = acc 107 | time = events[i][ordk].time() 108 | time = time.second + time.minute*60 + time.hour*3600 109 | 
events[i]['daytime'] = time 110 | events[i]['weekday'] = events[i]['start_timestamp'].weekday() 111 | return pd.DataFrame.from_dict(log) 112 | 113 | def scale_features(self, log, add_cols): 114 | scaler = self._get_scaler(self.model_type) 115 | return scaler(log, add_cols) 116 | 117 | def register_scaler(self, model_type, scaler): 118 | try: 119 | self._scalers[model_type] = self.scale_dispatcher[scaler] 120 | except KeyError: 121 | raise ValueError(scaler) 122 | 123 | def _get_scaler(self, model_type): 124 | scaler = self._scalers.get(model_type) 125 | if not scaler: 126 | raise ValueError(model_type) 127 | return scaler 128 | 129 | def _scale_base(self, log, add_cols): 130 | if self.one_timestamp: 131 | log, scale_args = self.scale_feature(log, 'dur', self.norm_method) 132 | else: 133 | log, dur_scale = self.scale_feature(log, 'dur', self.norm_method) 134 | log, wait_scale = self.scale_feature(log, 'wait', self.norm_method) 135 | scale_args = {'dur': dur_scale, 'wait': wait_scale} 136 | return log, scale_args 137 | 138 | def _scale_inter(self, log, add_cols): 139 | # log, scale_args = self.scale_feature(log, 'dur', self.norm_method) 140 | if self.one_timestamp: 141 | log, scale_args = self.scale_feature(log, 'dur', self.norm_method) 142 | else: 143 | log, dur_scale = self.scale_feature(log, 'dur', self.norm_method) 144 | log, wait_scale = self.scale_feature(log, 'wait', self.norm_method) 145 | scale_args = {'dur': dur_scale, 'wait': wait_scale} 146 | for col in add_cols: 147 | if col == 'daytime': 148 | log, _ = self.scale_feature(log, 'daytime', 'day_secs', True) 149 | elif col == 'weekday': 150 | continue 151 | else: 152 | log, _ = self.scale_feature(log, col, self.norm_method, True) 153 | return log, scale_args 154 | 155 | # ========================================================================= 156 | # Scale features 157 | # ========================================================================= 158 | @staticmethod 159 | def scale_feature(log, feature, 
method, replace=False): 160 | """Scales a number given a technique. 161 | Args: 162 | log: Event-log to be scaled. 163 | feature: Feature to be scaled. 164 | method: Scaling method max, lognorm, normal, per activity. 165 | replace (optional): replace the original value or keep both. 166 | Returns: 167 | Scaleded value between 0 and 1. 168 | """ 169 | scale_args = dict() 170 | if method == 'lognorm': 171 | log[feature + '_log'] = np.log1p(log[feature]) 172 | max_value = np.max(log[feature+'_log']) 173 | min_value = np.min(log[feature+'_log']) 174 | log[feature+'_norm'] = np.divide( 175 | np.subtract(log[feature+'_log'], min_value), (max_value - min_value)) 176 | log = log.drop((feature + '_log'), axis=1) 177 | scale_args = {'max_value': max_value, 'min_value': min_value} 178 | elif method == 'normal': 179 | max_value = np.max(log[feature]) 180 | min_value = np.min(log[feature]) 181 | log[feature+'_norm'] = np.divide( 182 | np.subtract(log[feature], min_value), (max_value - min_value)) 183 | scale_args = {'max_value': max_value, 'min_value': min_value} 184 | elif method == 'standard': 185 | mean = np.mean(log[feature]) 186 | std = np.std(log[feature]) 187 | log[feature + '_norm'] = np.divide(np.subtract(log[feature], mean), 188 | std) 189 | scale_args = {'mean': mean, 'std': std} 190 | elif method == 'max': 191 | max_value = np.max(log[feature]) 192 | log[feature + '_norm'] = (np.divide(log[feature], max_value) 193 | if max_value > 0 else 0) 194 | scale_args = {'max_value': max_value} 195 | elif method == 'day_secs': 196 | max_value = 86400 197 | log[feature + '_norm'] = (np.divide(log[feature], max_value) 198 | if max_value > 0 else 0) 199 | scale_args = {'max_value': max_value} 200 | elif method is None: 201 | log[feature+'_norm'] = log[feature] 202 | else: 203 | raise ValueError(method) 204 | if replace: 205 | log = log.drop(feature, axis=1) 206 | return log, scale_args -------------------------------------------------------------------------------- 
class ModelHPCOptimizer():
    """
    Hyperparameter-optimizer that evaluates a random sample of the
    hyperparameter grid as independent SLURM jobs, then picks the trial
    with the lowest loss from the shared results CSV.
    """

    def __init__(self, parms, log, ac_index, rl_index):
        """Store the log/indexes, build the trial space and prepare the
        temp output folder, results file and SLURM connection settings.

        parms: experiment configuration (must include 'output',
            'max_eval' and the hyperparameter lists used by
            define_search_space).
        log: training event log (deep-copied so trials cannot mutate it).
        ac_index / rl_index: categorical encodings for activities/roles.
        """
        self.space = self.define_search_space(parms)
        self.log = copy.deepcopy(log)
        self.ac_index = ac_index
        self.rl_index = rl_index

        # Load settings
        self.parms = parms
        self.temp_output = parms['output']
        if not os.path.exists(self.temp_output):
            os.makedirs(self.temp_output)
            # 'opt_parms' holds one JSON per trial plus the shared train.csv
            os.makedirs(os.path.join(self.temp_output, 'opt_parms'))
        self.file_name = sup.file_id(prefix='OP_')
        # Results file: created empty so worker jobs can append to it
        if not os.path.exists(os.path.join(self.temp_output, self.file_name)):
            open(os.path.join(self.temp_output, self.file_name), 'w').close()

        # SLURM submission parameters for each worker job
        self.conn = {'partition': 'main',
                     'mem': str(32000),
                     'cpus': str(10),
                     'env': 'deep_generator_pip',
                     'script': os.path.join('model_training',
                                            'slurm_trainer.py')}
        self.slurm_workers = 50
        self.best_output = None
        self.best_parms = dict()
        self.best_loss = 1

    @staticmethod
    def define_search_space(parms):
        """Build the list of trial configurations.

        Takes the full cartesian product of the hyperparameter lists and
        randomly samples 'max_eval' of them, merging each with the fixed
        experiment parameters.
        """
        space = list()
        listOLists = [parms['lstm_act'],
                      parms['dense_act'],
                      parms['norm_method'],
                      parms['n_size'],
                      parms['l_size'],
                      parms['optim'],
                      parms['model_type']]
        # selection method definition
        preconfigs = list()
        for lists in itertools.product(*listOLists):
            preconfigs.append(dict(lstm_act=lists[0],
                                   dense_act=lists[1],
                                   norm_method=lists[2],
                                   n_size=lists[3],
                                   l_size=lists[4],
                                   optim=lists[5],
                                   model_type=lists[6]))
        def_parms = {
            'imp': parms['imp'], 'file': parms['file_name'],
            'batch_size': parms['batch_size'], 'epochs': parms['epochs'],
            'one_timestamp': parms['one_timestamp']}
        # NOTE(review): random.sample raises ValueError when 'max_eval'
        # exceeds the number of grid combinations — confirm callers bound it.
        for config in random.sample(preconfigs, parms['max_eval']):
            space.append({**config, **def_parms})
        return space

    def export_params(self):
        """Serialize each trial configuration as a JSON file (plus the
        training log as CSV) so the SLURM workers can read them.

        Returns the list of generated configuration file names.
        """
        configs_files = list()
        for config in self.space:
            config['ac_index'] = self.ac_index
            config['rl_index'] = self.rl_index
            conf_file = sup.file_id(prefix='CNF_', extension='.json')
            sup.create_json(
                config, os.path.join(self.temp_output, 'opt_parms', conf_file))
            configs_files.append(conf_file)
        self.log.to_csv(
            os.path.join(self.temp_output, 'opt_parms', 'train.csv'),
            index=False, encoding='utf-8')
        return configs_files

    def execute_trials(self):
        """Run all trials through the SLURM multiprocessor, then read the
        shared results CSV and keep the best (lowest-loss) trial in
        best_output / best_loss / best_parms.
        """
        configs_files = self.export_params()
        args = [{'p': config,
                 'f': self.temp_output,
                 'r': self.file_name} for config in configs_files]
        mprocessor = slmp.HPC_Multiprocess(self.conn,
                                           args,
                                           self.temp_output,
                                           None,
                                           self.slurm_workers,
                                           timeout=5)
        mprocessor.parallelize()
        try:
            # NOTE(review): self.file_name is mutated here from a bare file
            # name to a full path — later consumers (e.g. shutil.copy in
            # ModelTrainer) rely on the full-path form.
            self.file_name = os.path.join(self.temp_output, self.file_name)
            # NOTE(review): `ascending=bool` passes the *class* bool, which
            # is truthy and therefore behaves like ascending=True — likely
            # meant literally `ascending=True`; confirm before changing.
            results = (pd.read_csv(self.file_name)
                       .sort_values('loss', ascending=bool))
            result = results.head(1).iloc[0]
            self.best_output = result.output
            self.best_loss = result.loss
            self.best_parms = results.head(1).to_dict('records')[0]
            # scale_args was serialized as a string in the CSV; rebuild it
            self.best_parms['scale_args'] = ast.literal_eval(
                self.best_parms.get('scale_args'))
        except Exception as e:
            # Best-effort: leave the default best_* values if the results
            # file is missing or malformed
            print(e)
            traceback.print_exc()
            pass
'shared_cat_gru_cx': mshcatgi._training_model, 43 | # 'cnn_lstm': cnnl._training_model, 44 | # 'gan': mgan._training_model 45 | } 46 | 47 | def train(self, model_type, train_vec, valdn_vec, ac_weights, rl_weights, output_folder): 48 | loader = self._get_trainer(model_type) 49 | tf.compat.v1.reset_default_graph() 50 | return loader(train_vec, 51 | valdn_vec, 52 | ac_weights, 53 | rl_weights, 54 | output_folder, 55 | self.parms) 56 | 57 | def register_model(self, model_type, trainer): 58 | try: 59 | self._trainers[model_type] = self.trainer_dispatcher[trainer] 60 | except KeyError: 61 | raise ValueError(trainer) 62 | 63 | def _get_trainer(self, model_type): 64 | trainer = self._trainers.get(model_type) 65 | if not trainer: 66 | raise ValueError(model_type) 67 | return trainer -------------------------------------------------------------------------------- /model_training/model_optimizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Nov 17 10:48:57 2020 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | import copy 9 | import ast 10 | import traceback 11 | import pandas as pd 12 | import configparser as cp 13 | from hyperopt import tpe 14 | from hyperopt import Trials, hp, fmin, STATUS_OK, STATUS_FAIL 15 | 16 | import utils.support as sup 17 | import readers.log_splitter as ls 18 | 19 | import tensorflow as tf 20 | from model_training import samples_creator as sc 21 | from model_training import model_loader as mload 22 | from model_training import features_manager as feat 23 | 24 | 25 | class ModelOptimizer(): 26 | """ 27 | Hyperparameter-optimizer class 28 | """ 29 | class Decorators(object): 30 | 31 | @classmethod 32 | def safe_exec(cls, method): 33 | """ 34 | Decorator to safe execute methods and return the state 35 | ---------- 36 | method : Any method. 
    def execute_trials(self):
        """Run the Bayesian (TPE) hyperparameter search.

        Each trial scales the log, splits it, vectorizes it, trains the
        selected architecture and evaluates it on the validation split;
        the best trial's output folder, loss and parameters are stored in
        best_output / best_loss / best_params.
        """
        def exec_pipeline(trial_stg):
            # One full train-and-evaluate cycle for a sampled configuration.
            # trial_stg: dict of hyperparameters drawn from self.space.
            print(trial_stg)
            status = STATUS_OK
            # Path redefinition: give this trial its own output folder
            rsp = self._temp_path_redef(trial_stg, status=status)
            status = rsp['status']
            trial_stg = rsp['values'] if status == STATUS_OK else trial_stg
            # Model definition read from models_spec.ini
            model_def = self.read_model_definition(trial_stg['model_type'])
            # Scale values (adds scale_args to trial_stg)
            log, trial_stg = self._scale_values(self.log, trial_stg, model_def)
            # split validation (80/20 by timeline)
            log_valdn, log_train = self.split_timeline(0.8, log, trial_stg['one_timestamp'])
            print('train split size:', len(log_train))
            print('valdn split size:', len(log_valdn))
            # Vectorize input with the vectorizer declared for this model
            vectorizer = sc.SequencesCreator(
                self.parms['read_options']['one_timestamp'],
                self.ac_index, self.rl_index)
            vectorizer.register_vectorizer(trial_stg['model_type'],
                                           model_def['vectorizer'])
            train_vec = vectorizer.vectorize(trial_stg['model_type'],
                                             log_train,
                                             trial_stg,
                                             model_def['additional_columns'])
            valdn_vec = vectorizer.vectorize(trial_stg['model_type'],
                                             log_valdn,
                                             trial_stg,
                                             model_def['additional_columns'])
            # Train the architecture registered for this model type
            m_loader = mload.ModelLoader(trial_stg)
            m_loader.register_model(trial_stg['model_type'],
                                    model_def['trainer'])
            tf.compat.v1.reset_default_graph()
            model = m_loader.train(trial_stg['model_type'],
                                   train_vec,
                                   valdn_vec,
                                   self.ac_weights,
                                   self.rl_weights,
                                   trial_stg['output'])
            # evaluation on the validation vectors
            x_input = {'ac_input': valdn_vec['prefixes']['activities'],
                       'rl_input': valdn_vec['prefixes']['roles'],
                       't_input': valdn_vec['prefixes']['times']}
            # Contextual ('_cx') variants take an extra intercase input
            if trial_stg['model_type'] in ['shared_cat_cx',
                                           'concatenated_cx',
                                           'shared_cat_gru_cx',
                                           'concatenated_gru_cx']:
                x_input['inter_input']= valdn_vec['prefixes']['inter_attr']
            acc = model.evaluate(
                x=x_input,
                y={'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']},
                return_dict=True)
            # Record the trial outcome (also appended to the results CSV)
            rsp = self._define_response(trial_stg, status, acc['loss'])
            print("-- End of trial --")
            return rsp

        # Optimize: TPE-guided search over self.space
        best = fmin(fn=exec_pipeline,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=self.parms['max_eval'],
                    trials=self.bayes_trials,
                    show_progressbar=False)
        # Save results: keep the best successful ('ok') trial
        try:
            results = (pd.DataFrame(self.bayes_trials.results)
                       .sort_values('loss', ascending=True))
            result = results[results.status == 'ok'].head(1).iloc[0]
            self.best_output = result.output
            self.best_loss = result.loss
            # fmin returns hp.choice *indices*; map them back to values
            self.best_params = {k: self.parms[k][v] for k, v in best.items()}
            # Recover scale_args (stringified in the CSV) for the best trial
            opt_res = pd.read_csv(self.file_name)
            opt_res = opt_res[opt_res.output == result.output].iloc[0]
            self.best_params['scale_args'] = ast.literal_eval(opt_res.scale_args)
        except Exception as e:
            # Best-effort: keep default best_* values when no trial succeeded
            print(e)
            pass
parms['l_size'], 201 | 'lstm_act': parms['lstm_act'], 202 | 'dense_act': parms['dense_act'], 203 | 'optim': parms['optim'], 204 | 'scale_args': parms['scale_args'], 205 | 'output': parms['output']} 206 | response['output'] = parms['output'] 207 | if status == STATUS_OK: 208 | response['loss'] = loss 209 | response['status'] = status if loss > 0 else STATUS_FAIL 210 | measurements.append({**{'loss': loss, 211 | 'sim_metric': 'val_loss', 212 | 'status': response['status']}, 213 | **data}) 214 | else: 215 | response['status'] = status 216 | measurements.append({**{'loss': 1, 217 | 'sim_metric': 'val_loss', 218 | 'status': response['status']}, 219 | **data}) 220 | if os.path.getsize(self.file_name) > 0: 221 | sup.create_csv_file(measurements, self.file_name, mode='a') 222 | else: 223 | sup.create_csv_file_header(measurements, self.file_name) 224 | return response 225 | 226 | @staticmethod 227 | def split_timeline(size: float, log: pd.DataFrame, one_ts: bool) -> None: 228 | """ 229 | Split an event log dataframe by time to peform split-validation. 230 | prefered method time splitting removing incomplete traces. 231 | If the testing set is smaller than the 10% of the log size 232 | the second method is sort by traces start and split taking the whole 233 | traces no matter if they are contained in the timeframe or not 234 | 235 | Parameters 236 | ---------- 237 | size : float, validation percentage. 238 | one_ts : bool, Support only one timestamp. 
239 | """ 240 | # Split log data 241 | splitter = ls.LogSplitter(log) 242 | train, valdn = splitter.split_log('timeline_contained', size, one_ts) 243 | total_events = len(log) 244 | # Check size and change time splitting method if necesary 245 | if len(valdn) < int(total_events*0.1): 246 | train, valdn = splitter.split_log('timeline_trace', size, one_ts) 247 | # Set splits 248 | key = 'end_timestamp' if one_ts else 'start_timestamp' 249 | valdn = pd.DataFrame(valdn) 250 | train = pd.DataFrame(train) 251 | log_valdn = (valdn.sort_values(key, ascending=True).reset_index(drop=True)) 252 | log_train = (train.sort_values(key, ascending=True).reset_index(drop=True)) 253 | return log_valdn, log_train 254 | 255 | @staticmethod 256 | def read_model_definition(model_type): 257 | model_def = dict() 258 | config = cp.ConfigParser(interpolation=None) 259 | config.read('models_spec.ini') 260 | # File name with extension 261 | model_def['additional_columns'] = sup.reduce_list(config.get(model_type, 'additional_columns'), dtype='str') 262 | model_def['scaler'] = config.get(model_type, 'scaler') 263 | model_def['vectorizer'] = config.get(model_type, 'vectorizer') 264 | model_def['trainer'] = config.get(model_type, 'trainer') 265 | return model_def 266 | -------------------------------------------------------------------------------- /model_training/model_trainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 12 15:07:19 2020 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | import csv 9 | 10 | import pandas as pd 11 | import numpy as np 12 | import shutil 13 | 14 | import readers.log_reader as lr 15 | import utils.support as sup 16 | import readers.log_splitter as ls 17 | 18 | from model_training.features_manager import FeaturesMannager as feat 19 | from model_training import embedding_training as em 20 | from model_training import model_optimizer as op 21 | from model_training 
    def __init__(self, params):
        """Run the whole training pipeline: load and preprocess the log,
        execute hyperparameter optimization, and export the best model's
        results and parameters.

        params: experiment configuration dict; must include 'file_name',
            'read_options', 'rp_sim', 'one_timestamp' and 'opt_method'
            ('rand_hpc' or 'bayesian').
        """
        self.log = self.load_log(params)
        # Split validation partitions
        self.log_train = pd.DataFrame()
        self.log_test = pd.DataFrame()
        # Activities and roles indexes
        self.ac_index = dict()
        self.index_ac = dict()

        self.rl_index = dict()
        self.index_rl = dict()
        # Training examples
        self.examples = dict()
        # Embedded dimensions
        self.ac_weights = list()
        self.rl_weights = list()
        # Preprocess the event-log
        self.preprocess(params)
        # Train model
        params['output'] = os.path.join('output_files', sup.folder_id())
        # NOTE(review): if 'opt_method' is neither value below, `optimizer`
        # is never bound and the export block raises NameError — confirm
        # upstream validation of this parameter.
        if params['opt_method'] == 'rand_hpc':
            optimizer = hpc_op.ModelHPCOptimizer(params,
                                                 self.log,
                                                 self.ac_index,
                                                 self.rl_index)
            optimizer.execute_trials()
        elif params['opt_method'] == 'bayesian':
            optimizer = op.ModelOptimizer(params,
                                          self.log,
                                          self.ac_index,
                                          self.ac_weights,
                                          self.rl_index,
                                          self.rl_weights)
            optimizer.execute_trials()
        # Export results: copy the best trial's folder and the results CSV
        output_path = os.path.join('output_files', sup.folder_id())
        shutil.copytree(optimizer.best_output, output_path)
        shutil.copy(optimizer.file_name, output_path)
        # NOTE(review): ModelHPCOptimizer exposes `best_parms` (no second
        # 'a'), not `best_params` — verify this attribute exists for the
        # 'rand_hpc' path.
        self.export_parms(output_path, optimizer.best_params)
        # Remove folder
        shutil.rmtree(params['output'])
84 | self.ac_weights = self.load_embedded(self.index_ac, ac_emb_name) 85 | self.rl_weights = self.load_embedded(self.index_rl, rl_emb_name) 86 | else: 87 | em.training_model(params, 88 | self.log, 89 | self.ac_index, self.index_ac, 90 | self.rl_index, self.index_rl) 91 | self.ac_weights = self.load_embedded(self.index_ac, ac_emb_name) 92 | self.rl_weights = self.load_embedded(self.index_rl, rl_emb_name) 93 | 94 | @staticmethod 95 | def load_log(params): 96 | params['read_options']['filter_d_attrib'] = False 97 | log = lr.LogReader(os.path.join('input_files', params['file_name']), 98 | params['read_options']) 99 | log_df = pd.DataFrame(log.data) 100 | if set(['Unnamed: 0', 'role']).issubset(set(log_df.columns)): 101 | log_df.drop(columns=['Unnamed: 0', 'role'], inplace=True) 102 | log_df = log_df[~log_df.task.isin(['Start', 'End'])] 103 | return log_df 104 | 105 | def indexing(self): 106 | # Activities index creation 107 | self.ac_index = self.create_index(self.log, 'task') 108 | self.ac_index['start'] = 0 109 | self.ac_index['end'] = len(self.ac_index) 110 | self.index_ac = {v: k for k, v in self.ac_index.items()} 111 | # Roles index creation 112 | self.rl_index = self.create_index(self.log, 'role') 113 | self.rl_index['start'] = 0 114 | self.rl_index['end'] = len(self.rl_index) 115 | self.index_rl = {v: k for k, v in self.rl_index.items()} 116 | # Add index to the event log 117 | ac_idx = lambda x: self.ac_index[x['task']] 118 | self.log['ac_index'] = self.log.apply(ac_idx, axis=1) 119 | rl_idx = lambda x: self.rl_index[x['role']] 120 | self.log['rl_index'] = self.log.apply(rl_idx, axis=1) 121 | 122 | @staticmethod 123 | def create_index(log_df, column): 124 | """Creates an idx for a categorical attribute. 125 | parms: 126 | log_df: dataframe. 127 | column: column name. 128 | Returns: 129 | index of a categorical attribute pairs. 
130 | """ 131 | temp_list = log_df[[column]].values.tolist() 132 | subsec_set = {(x[0]) for x in temp_list} 133 | subsec_set = sorted(list(subsec_set)) 134 | alias = dict() 135 | for i, _ in enumerate(subsec_set): 136 | alias[subsec_set[i]] = i + 1 137 | return alias 138 | 139 | 140 | def split_timeline(self, size: float, one_ts: bool) -> None: 141 | """ 142 | Split an event log dataframe by time to peform split-validation. 143 | prefered method time splitting removing incomplete traces. 144 | If the testing set is smaller than the 10% of the log size 145 | the second method is sort by traces start and split taking the whole 146 | traces no matter if they are contained in the timeframe or not 147 | 148 | Parameters 149 | ---------- 150 | size : float, validation percentage. 151 | one_ts : bool, Support only one timestamp. 152 | """ 153 | # Split log data 154 | splitter = ls.LogSplitter(self.log) 155 | train, test = splitter.split_log('timeline_contained', size, one_ts) 156 | total_events = len(self.log) 157 | # Check size and change time splitting method if necesary 158 | if len(test) < int(total_events*0.1): 159 | train, test = splitter.split_log('timeline_trace', size, one_ts) 160 | # Set splits 161 | key = 'end_timestamp' if one_ts else 'start_timestamp' 162 | test = pd.DataFrame(test) 163 | train = pd.DataFrame(train) 164 | self.log_test = (test.sort_values(key, ascending=True) 165 | .reset_index(drop=True)) 166 | self.log_train = (train.sort_values(key, ascending=True) 167 | .reset_index(drop=True)) 168 | 169 | 170 | 171 | @staticmethod 172 | def load_embedded(index, filename): 173 | """Loading of the embedded matrices. 174 | parms: 175 | index (dict): index of activities or roles. 176 | filename (str): filename of the matrix file. 177 | Returns: 178 | numpy array: array of weights. 
179 | """ 180 | weights = list() 181 | input_folder = os.path.join('input_files', 'embedded_matix') 182 | with open(os.path.join(input_folder, filename), 'r') as csvfile: 183 | filereader = csv.reader(csvfile, delimiter=',', quotechar='"') 184 | for row in filereader: 185 | cat_ix = int(row[0]) 186 | if index[cat_ix] == row[1].strip(): 187 | weights.append([float(x) for x in row[2:]]) 188 | csvfile.close() 189 | return np.array(weights) 190 | 191 | def export_parms(self, output_folder, parms): 192 | if not os.path.exists(os.path.join(output_folder, 'parameters')): 193 | os.makedirs(os.path.join(output_folder, 'parameters')) 194 | 195 | parms['max_trace_size'] = int(self.log.groupby('caseid')['task'] 196 | .count().max()) 197 | 198 | parms['index_ac'] = self.index_ac 199 | parms['index_rl'] = self.index_rl 200 | 201 | sup.create_json(parms, os.path.join(output_folder, 202 | 'parameters', 203 | 'model_parameters.json')) 204 | self.log_test.to_csv(os.path.join(output_folder, 205 | 'parameters', 206 | 'test_log.csv'), 207 | index=False, 208 | encoding='utf-8') 209 | 210 | -------------------------------------------------------------------------------- /model_training/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /model_training/models/model_concatenated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 28 10:15:12 2019 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | 9 | from tensorflow.keras.models import Model 10 | from tensorflow.keras.layers import Input, Embedding, Concatenate 11 | from tensorflow.keras.layers import Dense, LSTM, BatchNormalization 12 | from tensorflow.keras.optimizers import Nadam, Adam, SGD, Adagrad 13 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 14 | 15 | 
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights, output_folder, args, log_path=None):
    """Build and train the concatenated LSTM next-event model.

    One shared LSTM layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding and the time features; three
    specialized LSTM + Dense heads then predict the next activity, the next
    role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' and 'next_evt'
            entries for 'activities', 'roles' and 'times'.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'],
                 **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = LSTM(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=50)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the concatenated LSTM model with inter-case features.

    One shared LSTM layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding, the time features and the
    inter-case attributes; three specialized LSTM + Dense heads then predict
    the next activity, the next role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' ('activities',
            'roles', 'times', 'inter_attr') and 'next_evt' entries.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')
    inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1],
                               train_vec['prefixes']['inter_attr'].shape[2]),
                        name='inter_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input, inter_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'],
                 **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_c3 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'],
                 **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)

    model = Model(inputs=[ac_input, rl_input, t_input, inter_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times'],
               'inter_input': train_vec['prefixes']['inter_attr']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times'],
                   'inter_input': valdn_vec['prefixes']['inter_attr']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the concatenated GRU next-event model.

    One shared GRU layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding and the time features; three
    specialized GRU + Dense heads then predict the next activity, the next
    role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' and 'next_evt'
            entries for 'activities', 'roles' and 'times'.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = GRU(args['l_size'],
               kernel_initializer='glorot_uniform',
               return_sequences=False,
               dropout=0.2,
               implementation=args['imp'],
               **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the concatenated GRU model with inter-case features.

    One shared GRU layer reads the concatenation of the (frozen) activity
    embedding, the (frozen) role embedding, the time features and the
    inter-case attributes; three specialized GRU + Dense heads then predict
    the next activity, the next role and the next relative time.

    Args:
        train_vec (dict): training vectors with 'prefixes' ('activities',
            'roles', 'times', 'inter_attr') and 'next_evt' entries.
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights: pre-trained activity embedding matrix.
        rl_weights: pre-trained role embedding matrix.
        output_folder (str): folder where the best checkpoint (.h5) is saved.
        args (dict): hyperparameters ('l_size', 'lstm_act', 'dense_act',
            'imp', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path forwarded to the timing callback.

    Returns:
        The trained Keras Model.

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)

    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')
    inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1],
                               train_vec['prefixes']['inter_attr'].shape[2]),
                        name='inter_input')

    # =========================================================================
    # Embedding layer for categorical attributes (pre-trained, kept frozen)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared recurrent layer over the concatenated features
    # =========================================================================
    concatenate = Concatenate(name='concatenated', axis=2)(
        [ac_embedding, rl_embedding, t_input, inter_input])

    # Keras interprets an explicit activation=None as a linear activation,
    # while omitting the argument keeps the default ('tanh').  Build the
    # keyword only when an activation was requested so every layer that
    # honours 'lstm_act' behaves consistently (the original passed None
    # straight through on the time head).
    act_kwargs = ({'activation': args['lstm_act']}
                  if args['lstm_act'] is not None else {})

    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(concatenate)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)

    # =========================================================================
    # Specialized prediction layers
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_c3 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'],
                **act_kwargs)(batch1)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    dense_act = args.get('dense_act')
    if dense_act is not None:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=dense_act,
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c3)

    model = Model(inputs=[ac_input, rl_input, t_input, inter_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast with a clear message instead of a NameError on 'opt'.
        raise ValueError('Unsupported optimizer: %s' % args['optim'])

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Checkpoint file: only the best model on validation loss is kept.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times'],
               'inter_input': train_vec['prefixes']['inter_attr']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times'],
                   'inter_input': valdn_vec['prefixes']['inter_attr']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=args['batch_size'],
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the GRU shared-categorical next-event model.

    Activity and role embeddings are concatenated and fed to a shared
    GRU layer; times get their own GRU branch. Three output heads
    predict next activity, next role and next-event time features.

    Args:
        train_vec (dict): training vectors with 'prefixes' and
            'next_evt' entries ('activities', 'roles', 'times').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model (best weights are also
        checkpointed to disk by ModelCheckpoint).

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer: activity/role index sequences plus continuous time features
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared GRU over concatenated embeddings; separate GRU for times
    # =========================================================================
    merged = Concatenate(name='concatenated', axis=2)([ac_embedding, rl_embedding])

    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(merged)

    l1_c3 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) GRU per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = GRU(args['l_size'],
               activation=args['lstm_act'],
               kernel_initializer='glorot_uniform',
               return_sequences=False,
               dropout=0.2,
               implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer: softmax for categorical heads, regression for times
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the GRU shared-categorical model with intercase features.

    Like the plain shared-cat model, but a fourth input ('inter_attr')
    carries intercase attributes that are concatenated with the time
    features before the continuous GRU branch.

    Args:
        train_vec (dict): training vectors with 'prefixes' and 'next_evt'
            entries ('activities', 'roles', 'times', 'inter_attr').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model.

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer: categorical sequences, time features, intercase features
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')
    inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1],
                               train_vec['prefixes']['inter_attr'].shape[2]),
                        name='inter_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=(train_vec['prefixes']['activities']
                                           .shape[1]),
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Concatenation layer: categorical embeddings vs continuous features
    # =========================================================================
    merged1 = Concatenate(name='conc_categorical',
                          axis=2)([ac_embedding, rl_embedding])
    merged2 = Concatenate(name='conc_continuous', axis=2)([t_input, inter_input])

    # =========================================================================
    # Layer 1
    # =========================================================================
    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(merged1)

    l1_c3 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(merged2)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) GRU per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_c4 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(train_vec['next_evt']['activities'].shape[1],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(train_vec['next_evt']['roles'].shape[1],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c4)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_c4)

    model = Model(inputs=[ac_input, rl_input, t_input, inter_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times'],
               'inter_input': train_vec['prefixes']['inter_attr']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times'],
                   'inter_input': valdn_vec['prefixes']['inter_attr']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint,
                         lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the GRU specialized next-event model.

    Unlike the shared-cat variants, each input (activities, roles,
    times) gets its own first-layer GRU branch — no concatenation.

    Args:
        train_vec (dict): training vectors with 'prefixes' and
            'next_evt' entries ('activities', 'roles', 'times').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model.

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: one GRU branch per input
    # =========================================================================
    l1_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(ac_embedding)

    l1_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(rl_embedding)

    # NOTE(review): unlike l2_3 below, a None 'lstm_act' here yields a
    # *linear* GRU (activation=None) rather than the default tanh —
    # preserved as-is; confirm this asymmetry is intended.
    l1_c3 = GRU(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=True,
                dropout=0.2,
                implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch2 = BatchNormalization()(l1_c2)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) GRU per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = GRU(args['l_size'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch2)

    # The layer specialized in time prediction; keep Keras' default
    # activation when no explicit one is configured.
    if args['lstm_act'] is not None:
        l2_3 = GRU(args['l_size'],
                   activation=args['lstm_act'],
                   kernel_initializer='glorot_uniform',
                   return_sequences=False,
                   dropout=0.2,
                   implementation=args['imp'])(batch3)
    else:
        l2_3 = GRU(args['l_size'],
                   kernel_initializer='glorot_uniform',
                   return_sequences=False,
                   dropout=0.2,
                   implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and train the LSTM shared-categorical next-event model.

    LSTM counterpart of the GRU shared-cat architecture: concatenated
    activity/role embeddings feed a shared LSTM; times get their own
    branch; three heads predict next activity, role and time features.

    Args:
        train_vec (dict): training vectors with 'prefixes' and
            'next_evt' entries ('activities', 'roles', 'times').
        valdn_vec (dict): validation vectors with the same structure.
        ac_weights (numpy.ndarray): pre-trained activity embedding weights.
        rl_weights (numpy.ndarray): pre-trained role embedding weights.
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): path for the timing-callback log.

    Returns:
        tensorflow.keras.Model: the trained model.

    Raises:
        ValueError: if args['optim'] is not a supported optimizer name.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: shared LSTM over concatenated embeddings; separate one for times
    # =========================================================================
    merged = Concatenate(name='concatenated', axis=2)([ac_embedding, rl_embedding])

    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(merged)

    l1_c3 = LSTM(args['l_size'],
                 activation=args['lstm_act'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: one specialized (non-sequence) LSTM per prediction head
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in time prediction
    l2_3 = LSTM(args['l_size'],
                activation=args['lstm_act'],
                kernel_initializer='glorot_uniform',
                return_sequences=False,
                dropout=0.2,
                implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        # Linear activation by default for the time regression head.
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # Fail fast: previously an unknown name left `opt` unbound and
        # raised a confusing NameError at compile time.
        raise ValueError('Unsupported optimizer: {}'.format(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file: checkpoint named after the input event-log file.
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the model with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model
spec.loader.exec_module(tc) 24 | 25 | 26 | def _training_model(train_vec, valdn_vec, ac_weights, rl_weights, 27 | output_folder, args, log_path=None): 28 | """Example function with types documented in the docstring. 29 | Args: 30 | param1 (int): The first parameter. 31 | param2 (str): The second parameter. 32 | Returns: 33 | bool: The return value. True for success, False otherwise. 34 | """ 35 | 36 | print('Build model...') 37 | print(args) 38 | # ============================================================================= 39 | # Input layer 40 | # ============================================================================= 41 | ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ), 42 | name='ac_input') 43 | rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ), 44 | name='rl_input') 45 | t_input = Input(shape=(train_vec['prefixes']['times'].shape[1], 46 | train_vec['prefixes']['times'].shape[2]), name='t_input') 47 | inter_input = Input(shape=(train_vec['prefixes']['inter_attr'].shape[1], 48 | train_vec['prefixes']['inter_attr'].shape[2]), 49 | name='inter_input') 50 | 51 | # ============================================================================= 52 | # Embedding layer for categorical attributes 53 | # ============================================================================= 54 | ac_embedding = Embedding(ac_weights.shape[0], 55 | ac_weights.shape[1], 56 | weights=[ac_weights], 57 | input_length=(train_vec['prefixes']['activities'] 58 | .shape[1]), 59 | trainable=False, name='ac_embedding')(ac_input) 60 | 61 | rl_embedding = Embedding(rl_weights.shape[0], 62 | rl_weights.shape[1], 63 | weights=[rl_weights], 64 | input_length=train_vec['prefixes']['roles'].shape[1], 65 | trainable=False, name='rl_embedding')(rl_input) 66 | # ============================================================================= 67 | # Concatenation layer 68 | # ============================================================================= 69 | 70 | 
merged1 = Concatenate(name='conc_categorical', 71 | axis=2)([ac_embedding, rl_embedding]) 72 | merged2 = Concatenate(name='conc_continuous', axis=2)([t_input, inter_input]) 73 | 74 | # ============================================================================= 75 | # Layer 1 76 | # ============================================================================= 77 | 78 | l1_c1 = LSTM(args['l_size'], 79 | kernel_initializer='glorot_uniform', 80 | return_sequences=True, 81 | dropout=0.2, 82 | implementation=args['imp'])(merged1) 83 | 84 | l1_c3 = LSTM(args['l_size'], 85 | activation=args['lstm_act'], 86 | kernel_initializer='glorot_uniform', 87 | return_sequences=True, 88 | dropout=0.2, 89 | implementation=args['imp'])(merged2) 90 | 91 | # ============================================================================= 92 | # Batch Normalization Layer 93 | # ============================================================================= 94 | batch1 = BatchNormalization()(l1_c1) 95 | batch3 = BatchNormalization()(l1_c3) 96 | 97 | # ============================================================================= 98 | # The layer specialized in prediction 99 | # ============================================================================= 100 | l2_c1 = LSTM(args['l_size'], 101 | kernel_initializer='glorot_uniform', 102 | return_sequences=False, 103 | dropout=0.2, 104 | implementation=args['imp'])(batch1) 105 | 106 | # The layer specialized in role prediction 107 | l2_c2 = LSTM(args['l_size'], 108 | kernel_initializer='glorot_uniform', 109 | return_sequences=False, 110 | dropout=0.2, 111 | implementation=args['imp'])(batch1) 112 | 113 | 114 | # The layer specialized in time prediction 115 | l2_c4 = LSTM(args['l_size'], 116 | activation=args['lstm_act'], 117 | kernel_initializer='glorot_uniform', 118 | return_sequences=False, 119 | dropout=0.2, 120 | implementation=args['imp'])(batch3) 121 | 122 | # ============================================================================= 123 
| # Output Layer 124 | # ============================================================================= 125 | act_output = Dense(train_vec['next_evt']['activities'].shape[1], 126 | activation='softmax', 127 | kernel_initializer='glorot_uniform', 128 | name='act_output')(l2_c1) 129 | 130 | role_output = Dense(train_vec['next_evt']['roles'].shape[1], 131 | activation='softmax', 132 | kernel_initializer='glorot_uniform', 133 | name='role_output')(l2_c2) 134 | if ('dense_act' in args) and (args['dense_act'] is not None): 135 | time_output = Dense(train_vec['next_evt']['times'].shape[1], 136 | activation=args['dense_act'], 137 | kernel_initializer='glorot_uniform', 138 | name='time_output')(l2_c4) 139 | else: 140 | time_output = Dense(train_vec['next_evt']['times'].shape[1], 141 | kernel_initializer='glorot_uniform', 142 | name='time_output')(l2_c4) 143 | 144 | model = Model(inputs=[ac_input, rl_input, t_input, inter_input], 145 | outputs=[act_output, role_output, time_output]) 146 | 147 | if args['optim'] == 'Nadam': 148 | opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999) 149 | elif args['optim'] == 'Adam': 150 | opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, 151 | amsgrad=False) 152 | elif args['optim'] == 'SGD': 153 | opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False) 154 | elif args['optim'] == 'Adagrad': 155 | opt = Adagrad(learning_rate=0.01) 156 | 157 | model.compile(loss={'act_output': 'categorical_crossentropy', 158 | 'role_output': 'categorical_crossentropy', 159 | 'time_output': 'mae'}, optimizer=opt) 160 | 161 | model.summary() 162 | 163 | early_stopping = EarlyStopping(monitor='val_loss', patience=40) 164 | if log_path: 165 | cb = tc.TimingCallback(output_folder, log_path=log_path) 166 | else: 167 | cb = tc.TimingCallback(output_folder) 168 | 169 | # Output file 170 | output_file_path = os.path.join(output_folder, 171 | os.path.splitext(args['file'])[0]+'.h5') 172 | 173 | # Saving 174 | model_checkpoint = 
ModelCheckpoint(output_file_path, 175 | monitor='val_loss', 176 | verbose=0, 177 | save_best_only=True, 178 | save_weights_only=False, 179 | mode='auto') 180 | lr_reducer = ReduceLROnPlateau(monitor='val_loss', 181 | factor=0.5, 182 | patience=10, 183 | verbose=0, 184 | mode='auto', 185 | min_delta=0.0001, 186 | cooldown=0, 187 | min_lr=0) 188 | 189 | batch_size = args['batch_size'] 190 | model.fit({'ac_input': train_vec['prefixes']['activities'], 191 | 'rl_input': train_vec['prefixes']['roles'], 192 | 't_input': train_vec['prefixes']['times'], 193 | 'inter_input': train_vec['prefixes']['inter_attr']}, 194 | {'act_output': train_vec['next_evt']['activities'], 195 | 'role_output': train_vec['next_evt']['roles'], 196 | 'time_output': train_vec['next_evt']['times']}, 197 | validation_data=( 198 | {'ac_input': valdn_vec['prefixes']['activities'], 199 | 'rl_input': valdn_vec['prefixes']['roles'], 200 | 't_input': valdn_vec['prefixes']['times'], 201 | 'inter_input': valdn_vec['prefixes']['inter_attr']}, 202 | {'act_output': valdn_vec['next_evt']['activities'], 203 | 'role_output': valdn_vec['next_evt']['roles'], 204 | 'time_output': valdn_vec['next_evt']['times']}), 205 | verbose=2, 206 | callbacks=[early_stopping, model_checkpoint, 207 | lr_reducer, cb], 208 | batch_size=batch_size, 209 | epochs=args['epochs']) 210 | return model 211 | -------------------------------------------------------------------------------- /model_training/models/model_specialized.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 28 10:15:12 2019 4 | 5 | @author: Manuel Camargo 6 | """ 7 | import os 8 | 9 | from tensorflow.keras.models import Model 10 | from tensorflow.keras.layers import Input, Embedding 11 | from tensorflow.keras.layers import Dense, LSTM, BatchNormalization 12 | from tensorflow.keras.optimizers import Nadam, Adam, SGD, Adagrad 13 | from tensorflow.keras.callbacks import 
try:
    from support_modules.callbacks import time_callback as tc
except ImportError:
    # Fallback when the package is not importable (e.g. the script is run
    # from the repository root): load the module directly from its file.
    # Narrowed from a bare `except:` so real errors are not swallowed.
    from importlib import util
    spec = util.spec_from_file_location(
        'time_callback',
        os.path.join(os.getcwd(), 'support_modules', 'callbacks',
                     'time_callback.py'))
    tc = util.module_from_spec(spec)
    spec.loader.exec_module(tc)


def _training_model(train_vec, valdn_vec, ac_weights, rl_weights,
                    output_folder, args, log_path=None):
    """Build and fit the 'specialized' LSTM next-event model.

    One LSTM branch per attribute (activities, roles, times); each branch
    ends in its own prediction head, so no weights are shared between them.

    Args:
        train_vec (dict): training tensors with 'prefixes' and 'next_evt'
            entries as produced by the samples creator.
        valdn_vec (dict): validation tensors with the same layout.
        ac_weights: pre-trained activity embedding matrix (frozen).
        rl_weights: pre-trained role embedding matrix (frozen).
        output_folder (str): folder where the .h5 checkpoint is written.
        args (dict): hyperparameters ('l_size', 'imp', 'lstm_act',
            'dense_act', 'optim', 'batch_size', 'epochs', 'file').
        log_path (str, optional): csv path for the timing callback.

    Returns:
        The fitted keras Model (best weights are also checkpointed to disk).

    Raises:
        ValueError: if args['optim'] names an unsupported optimizer.
    """
    print('Build model...')
    print(args)
    # =========================================================================
    # Input layer
    # =========================================================================
    ac_input = Input(shape=(train_vec['prefixes']['activities'].shape[1], ),
                     name='ac_input')
    rl_input = Input(shape=(train_vec['prefixes']['roles'].shape[1], ),
                     name='rl_input')
    t_input = Input(shape=(train_vec['prefixes']['times'].shape[1],
                           train_vec['prefixes']['times'].shape[2]),
                    name='t_input')

    # =========================================================================
    # Embedding layer for categorical attributes (frozen pre-trained weights)
    # =========================================================================
    ac_embedding = Embedding(ac_weights.shape[0],
                             ac_weights.shape[1],
                             weights=[ac_weights],
                             input_length=train_vec['prefixes']['activities'].shape[1],
                             trainable=False, name='ac_embedding')(ac_input)

    rl_embedding = Embedding(rl_weights.shape[0],
                             rl_weights.shape[1],
                             weights=[rl_weights],
                             input_length=train_vec['prefixes']['roles'].shape[1],
                             trainable=False, name='rl_embedding')(rl_input)

    # =========================================================================
    # Layer 1: one sequence-to-sequence LSTM per attribute
    # =========================================================================
    l1_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(ac_embedding)

    l1_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(rl_embedding)

    l1_c3 = LSTM(args['l_size'],
                 activation=args['lstm_act'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=True,
                 dropout=0.2,
                 implementation=args['imp'])(t_input)

    # =========================================================================
    # Batch Normalization Layer
    # =========================================================================
    batch1 = BatchNormalization()(l1_c1)
    batch2 = BatchNormalization()(l1_c2)
    batch3 = BatchNormalization()(l1_c3)

    # =========================================================================
    # Layer 2: per-attribute prediction layers (last state only)
    # =========================================================================
    # The layer specialized in activity prediction
    l2_c1 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch1)

    # The layer specialized in role prediction
    l2_c2 = LSTM(args['l_size'],
                 kernel_initializer='glorot_uniform',
                 return_sequences=False,
                 dropout=0.2,
                 implementation=args['imp'])(batch2)

    # The layer specialized in time prediction; the custom activation is
    # only applied when one was actually configured.
    if args['lstm_act'] is not None:
        l2_3 = LSTM(args['l_size'],
                    activation=args['lstm_act'],
                    kernel_initializer='glorot_uniform',
                    return_sequences=False,
                    dropout=0.2,
                    implementation=args['imp'])(batch3)
    else:
        l2_3 = LSTM(args['l_size'],
                    kernel_initializer='glorot_uniform',
                    return_sequences=False,
                    dropout=0.2,
                    implementation=args['imp'])(batch3)

    # =========================================================================
    # Output Layer
    # =========================================================================
    act_output = Dense(ac_weights.shape[0],
                       activation='softmax',
                       kernel_initializer='glorot_uniform',
                       name='act_output')(l2_c1)

    role_output = Dense(rl_weights.shape[0],
                        activation='softmax',
                        kernel_initializer='glorot_uniform',
                        name='role_output')(l2_c2)

    if ('dense_act' in args) and (args['dense_act'] is not None):
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            activation=args['dense_act'],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)
    else:
        time_output = Dense(train_vec['next_evt']['times'].shape[1],
                            kernel_initializer='glorot_uniform',
                            name='time_output')(l2_3)

    model = Model(inputs=[ac_input, rl_input, t_input],
                  outputs=[act_output, role_output, time_output])

    if args['optim'] == 'Nadam':
        opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
    elif args['optim'] == 'Adam':
        opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                   amsgrad=False)
    elif args['optim'] == 'SGD':
        opt = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)
    elif args['optim'] == 'Adagrad':
        opt = Adagrad(learning_rate=0.01)
    else:
        # BUG FIX: previously an unknown optimizer name left `opt` unbound
        # and crashed below with a confusing NameError.
        raise ValueError('Unsupported optimizer: ' + str(args['optim']))

    model.compile(loss={'act_output': 'categorical_crossentropy',
                        'role_output': 'categorical_crossentropy',
                        'time_output': 'mae'}, optimizer=opt)

    model.summary()

    early_stopping = EarlyStopping(monitor='val_loss', patience=40)
    if log_path:
        cb = tc.TimingCallback(output_folder, log_path=log_path)
    else:
        cb = tc.TimingCallback(output_folder)

    # Output file
    output_file_path = os.path.join(output_folder,
                                    os.path.splitext(args['file'])[0]+'.h5')

    # Saving: keep only the best weights seen so far on validation loss.
    model_checkpoint = ModelCheckpoint(output_file_path,
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   min_delta=0.0001,
                                   cooldown=0,
                                   min_lr=0)

    batch_size = args['batch_size']
    model.fit({'ac_input': train_vec['prefixes']['activities'],
               'rl_input': train_vec['prefixes']['roles'],
               't_input': train_vec['prefixes']['times']},
              {'act_output': train_vec['next_evt']['activities'],
               'role_output': train_vec['next_evt']['roles'],
               'time_output': train_vec['next_evt']['times']},
              validation_data=(
                  {'ac_input': valdn_vec['prefixes']['activities'],
                   'rl_input': valdn_vec['prefixes']['roles'],
                   't_input': valdn_vec['prefixes']['times']},
                  {'act_output': valdn_vec['next_evt']['activities'],
                   'role_output': valdn_vec['next_evt']['roles'],
                   'time_output': valdn_vec['next_evt']['times']}),
              verbose=2,
              callbacks=[early_stopping, model_checkpoint, lr_reducer, cb],
              batch_size=batch_size,
              epochs=args['epochs'])
    return model


# -*- coding: utf-8 -*-
"""
Created on Sat Mar 14 19:13:15 2020

@author: Manuel Camargo
"""
import itertools
import numpy as np
import random

from nltk.util import ngrams
import keras.utils as ku


class SequencesCreator():
    """Turns an event-log dataframe into LSTM training tensors."""

    def __init__(self, one_timestamp, ac_index, rl_index):
        """constructor"""
        # True when the log has a single timestamp per event; False when it
        # has start/end pairs (adds the 'wait_norm' time feature).
        self.one_timestamp = one_timestamp
        # Activity name -> integer index (includes 'start'/'end' tokens).
        self.ac_index = ac_index
        # Role name -> integer index (includes 'start'/'end' tokens).
        self.rl_index = rl_index
        # model_type -> vectorizer method, filled via register_vectorizer().
        self._vectorizers = dict()
        # Named vectorization strategies available for registration.
        self._vec_dispatcher = {'basic': self._vectorize_seq,
                                'inter': self._vectorize_seq_inter,
                                'gan': self.gan_simple}

    def vectorize(self, model_type, log, params, add_cols):
        # Entry point: stash the log, resolve the feature columns and
        # delegate to the vectorizer registered for this model type.
        self.log = log
        columns = self.define_columns(add_cols, self.one_timestamp)
        loader = self._get_vectorizer(model_type)
        return loader(params, columns)

    def register_vectorizer(self, model_type, vectorizer):
        # Associate a model type with one of the known strategies
        # ('basic', 'inter', 'gan'); unknown strategy names are rejected.
        try:
            self._vectorizers[model_type] = self._vec_dispatcher[vectorizer]
        except KeyError:
            raise ValueError(vectorizer)

    def _get_vectorizer(self, model_type):
        # Look up the vectorizer registered for this model type; raise
        # when the type was never registered.
        vectorizer = self._vectorizers.get(model_type)
        if not vectorizer:
            raise ValueError(model_type)
        return vectorizer

    @staticmethod
    def define_columns(add_cols, one_timestamp):
        """Return the list of dataframe columns to vectorize."""
        columns = ['ac_index', 'rl_index', 'dur_norm']
        # Additional features use their normalized ('_norm') variant,
        # except weekday which is one-hot encoded later instead.
        add_cols = [x+'_norm' if x != 'weekday' else x for x in add_cols]
        columns.extend(add_cols)
        if not one_timestamp:
            columns.extend(['wait_norm'])
        return columns

    def _vectorize_seq(self, parms, columns):
        """
        Dataframe vectorizer.
        parms:
            columns: list of features to vectorize.
            parms (dict): parms for training the network
        Returns:
            dict: Dictionary that contains all the LSTM inputs.
        """
        # TODO: reorganize this method so that times can be vectorized with
        # one or two time features; possibly handle them the same way as
        # the intercase features.
        times = ['dur_norm'] if parms['one_timestamp'] else ['dur_norm', 'wait_norm']
        equi = {'ac_index': 'activities', 'rl_index': 'roles'}
        vec = {'prefixes': dict(),
               'next_evt': dict()}
        x_times_dict = dict()
        y_times_dict = dict()
        # Per-trace series with artificial start/end events inserted.
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        # n-gram definition
        for i, _ in enumerate(self.log):
            for x in columns:
                # Left-padded n-grams: one prefix per position, padded with
                # 0 up to n_size.
                serie = list(ngrams(self.log[i][x], parms['n_size'],
                                    pad_left=True, left_pad_symbol=0))
                # Target is the last element of each n-gram, shifted by one
                # so y[t] is the event that follows prefix t.
                y_serie = [x[-1] for x in serie]
                serie = serie[:-1]
                y_serie = y_serie[1:]
                if x in list(equi.keys()):
                    vec['prefixes'][equi[x]] = (vec['prefixes'][equi[x]] + serie
                                                if i > 0 else serie)
                    vec['next_evt'][equi[x]] = (vec['next_evt'][equi[x]] + y_serie
                                                if i > 0 else y_serie)
                elif x in times:
                    x_times_dict[x] = (
                        x_times_dict[x] + serie if i > 0 else serie)
                    y_times_dict[x] = (
                        y_times_dict[x] + y_serie if i > 0 else y_serie)
        # Transform task, dur and role prefixes in vectors
        for value in equi.values():
            vec['prefixes'][value] = np.array(vec['prefixes'][value])
            vec['next_evt'][value] = np.array(vec['next_evt'][value])
        # one-hot encode target values
        vec['next_evt']['activities'] = ku.to_categorical(
            vec['next_evt']['activities'], num_classes=len(self.ac_index))
        vec['next_evt']['roles'] = ku.to_categorical(
            vec['next_evt']['roles'], num_classes=len(self.rl_index))
        # reshape times to (examples, n_size, 1) and stack the time
        # features along the last axis
        for key, value in x_times_dict.items():
            x_times_dict[key] = np.array(value)
            x_times_dict[key] = x_times_dict[key].reshape(
                (x_times_dict[key].shape[0], x_times_dict[key].shape[1], 1))
        vec['prefixes']['times'] = np.dstack(list(x_times_dict.values()))
        # Reshape y times attributes (suffixes, number of attributes)
        vec['next_evt']['times'] = np.dstack(list(y_times_dict.values()))[0]
        return vec

    def _vectorize_seq_inter(self, parms, columns):
        """
        Dataframe vectorizer to process intercase or data attributes features.
        parms:
            columns: list of features to vectorize.
            parms (dict): parms for training the network
        Returns:
            dict: Dictionary that contains all the LSTM inputs.
        """
        # Same n-gram scheme as _vectorize_seq, plus extra buckets for
        # weekday (one-hot) and any other intercase attribute.
        times = ['dur_norm'] if parms['one_timestamp'] else ['dur_norm', 'wait_norm']
        equi = {'ac_index': 'activities', 'rl_index': 'roles'}
        vec = {'prefixes': dict(),
               'next_evt': dict()}
        x_weekday = list()
        y_weekday = list()
        # times
        x_times_dict = dict()
        y_times_dict = dict()
        # intercases
        x_inter_dict = dict()
        y_inter_dict = dict()
        # self.log = self.log[self.log.caseid.isin(['1', '1770'])].head(3)
        self.log = self.reformat_events(columns, parms['one_timestamp'])
        for i, _ in enumerate(self.log):
            for x in columns:
                serie = list(ngrams(self.log[i][x], parms['n_size'],
                                    pad_left=True, left_pad_symbol=0))
                y_serie = [x[-1] for x in serie]
                serie = serie[:-1]
                y_serie = y_serie[1:]
                if x in list(equi.keys()):
                    vec['prefixes'][equi[x]] = (
                        vec['prefixes'][equi[x]] + serie if i > 0 else serie)
                    vec['next_evt'][equi[x]] = (
                        vec['next_evt'][equi[x]] + y_serie
                        if i > 0 else y_serie)
                elif x in times:
                    x_times_dict[x] = (
                        x_times_dict[x] + serie if i > 0 else serie)
                    y_times_dict[x] = (
                        y_times_dict[x] + y_serie if i > 0 else y_serie)
                elif x == 'weekday':
                    x_weekday = (
                        x_weekday + serie if i > 0 else serie)
                    y_weekday = (
                        y_weekday + y_serie if i > 0 else y_serie)
                else:
                    x_inter_dict[x] = (
                        x_inter_dict[x] + serie if i > 0 else serie)
                    y_inter_dict[x] = (
                        y_inter_dict[x] + y_serie if i > 0 else y_serie)
        # Transform task, dur and role prefixes in vectors
        for value in equi.values():
            vec['prefixes'][value] = np.array(vec['prefixes'][value])
            vec['next_evt'][value] = np.array(vec['next_evt'][value])
        # one-hot encode target values
        vec['next_evt']['activities'] = ku.to_categorical(
            vec['next_evt']['activities'], num_classes=len(self.ac_index))
        vec['next_evt']['roles'] = ku.to_categorical(
            vec['next_evt']['roles'], num_classes=len(self.rl_index))
        # reshape times
        for key, value in x_times_dict.items():
            x_times_dict[key] = np.array(value)
            x_times_dict[key] = x_times_dict[key].reshape(
                (x_times_dict[key].shape[0], x_times_dict[key].shape[1], 1))
        vec['prefixes']['times'] = np.dstack(list(x_times_dict.values()))
        # Reshape y times attributes (suffixes, number of attributes)
        vec['next_evt']['times'] = np.dstack(list(y_times_dict.values()))[0]
        # Reshape intercase attributes (prefixes, n-gram size, number of attributes)
        for key, value in x_inter_dict.items():
            x_inter_dict[key] = np.array(value)
            x_inter_dict[key] = x_inter_dict[key].reshape(
                (x_inter_dict[key].shape[0], x_inter_dict[key].shape[1], 1))
        vec['prefixes']['inter_attr'] = np.dstack(list(x_inter_dict.values()))
        # Reshape y intercase attributes (suffixes, number of attributes)
        vec['next_evt']['inter_attr'] = np.dstack(list(y_inter_dict.values()))[0]
        if 'weekday' in columns:
            # Onehot encode weekday and append it to the intercase tensor.
            x_weekday = ku.to_categorical(x_weekday, num_classes=7)
            y_weekday = ku.to_categorical(y_weekday, num_classes=7)
            vec['prefixes']['inter_attr'] = np.concatenate(
                [vec['prefixes']['inter_attr'], x_weekday], axis=2)
            vec['next_evt']['inter_attr'] = np.concatenate(
                [vec['next_evt']['inter_attr'], y_weekday], axis=1)
        return vec

    def gan_simple(self, parms, columns):
        # Builds (activity, role) pairs for a GAN discriminator: real pairs
        # are labelled 0, synthetic (negative) pairs are labelled 1.
        print(columns)
        vec = {'training': dict()}
        pairs = self.log.copy()
        pairs = pairs[['ac_index', 'rl_index']]
        pairs = pairs.to_records(index=False).tolist()
        # Vectorize discriminator training real inputs
        vec['training']['activities'] = [x[0] for x in pairs]
        vec['training']['activities'] = ku.to_categorical(
            vec['training']['activities'], num_classes=len(self.ac_index))
        vec['training']['roles'] = [x[1] for x in pairs]
        vec['training']['roles'] = ku.to_categorical(
            vec['training']['roles'], num_classes=len(self.rl_index))
        vec['training']['class'] = np.zeros(len(pairs))

        # If the discriminator will be pretrained create pretraining examples
        if parms['gan_pretrain']:
            # one third of real events randomly selected
            n_positive = int(round(len(pairs)/3))
            negative_ratio = 2

            batch_size = n_positive * (1 + negative_ratio)
            # columns: activity index, role index, class label
            batch = np.zeros((batch_size, 3))
            pairs_set = set(pairs)
            # NOTE(review): negatives below are drawn as positions in the
            # index dicts' key lists; this matches the index values only
            # when indices are contiguous 0..n-1 — confirm.
            activities = list(self.ac_index.keys())
            roles = list(self.rl_index.keys())
            # randomly choose positive examples (label 0)
            idx = 0
            for idx, (activity, role) in enumerate(
                    random.sample(pairs, n_positive)):
                batch[idx, :] = (activity, role, 0)
            # Increment idx by 1
            idx += 1
            # Add negative examples until reach batch size
            while idx < batch_size:
                # random selection
                random_ac = random.randrange(len(activities))
                random_rl = random.randrange(len(roles))
                # Check to make sure this is not a positive example
                if (random_ac, random_rl) not in pairs_set:
                    # Add to batch and increment index; label 1 marks a
                    # synthetic (fake) pair for the classification task
                    batch[idx, :] = (random_ac, random_rl, 1)
                    idx += 1
            vec['pretraining'] = dict()
            # Make sure to shuffle order
            np.random.shuffle(batch)
            vec['pretraining']['activities'] = ku.to_categorical(
                batch[:, 0], num_classes=len(self.ac_index))
            vec['pretraining']['roles'] = ku.to_categorical(
                batch[:, 1], num_classes=len(self.rl_index))
            vec['pretraining']['class'] = batch[:, 2]
        return vec
============================================================================= 252 | # Reformat events 253 | # ============================================================================= 254 | def reformat_events(self, columns, one_timestamp): 255 | """Creates series of activities, roles and relative times per trace. 256 | parms: 257 | self.log: dataframe. 258 | ac_index (dict): index of activities. 259 | rl_index (dict): index of roles. 260 | Returns: 261 | list: lists of activities, roles and relative times. 262 | """ 263 | temp_data = list() 264 | log_df = self.log.to_dict('records') 265 | key = 'end_timestamp' if one_timestamp else 'start_timestamp' 266 | log_df = sorted(log_df, key=lambda x: (x['caseid'], key)) 267 | for key, group in itertools.groupby(log_df, key=lambda x: x['caseid']): 268 | trace = list(group) 269 | temp_dict = dict() 270 | for x in columns: 271 | serie = [y[x] for y in trace] 272 | if x == 'ac_index': 273 | serie.insert(0, self.ac_index[('start')]) 274 | serie.append(self.ac_index[('end')]) 275 | elif x == 'rl_index': 276 | serie.insert(0, self.rl_index[('start')]) 277 | serie.append(self.rl_index[('end')]) 278 | else: 279 | serie.insert(0, 0) 280 | serie.append(0) 281 | temp_dict = {**{x: serie}, **temp_dict} 282 | temp_dict = {**{'caseid': key}, **temp_dict} 283 | temp_data.append(temp_dict) 284 | return temp_data -------------------------------------------------------------------------------- /models_spec.ini: -------------------------------------------------------------------------------- 1 | [shared_cat] 2 | scaler = basic 3 | additional_columns = [] 4 | vectorizer = basic 5 | trainer = shared_cat 6 | [specialized] 7 | scaler = basic 8 | additional_columns = [] 9 | vectorizer = basic 10 | trainer = specialized 11 | [concatenated] 12 | scaler = basic 13 | additional_columns = [] 14 | vectorizer = basic 15 | trainer = concatenated 16 | [shared_cat_cx] 17 | scaler = inter 18 | additional_columns = [daytime, weekday] 19 | vectorizer = 
inter 20 | trainer = shared_cat_cx 21 | [concatenated_cx] 22 | scaler = inter 23 | additional_columns = [daytime, weekday] 24 | vectorizer = inter 25 | trainer = concatenated_cx 26 | [shared_cat_gru_cx] 27 | scaler = inter 28 | additional_columns = [daytime, weekday] 29 | vectorizer = inter 30 | trainer = shared_cat_gru_cx 31 | [concatenated_gru_cx] 32 | scaler = inter 33 | additional_columns = [daytime, weekday] 34 | vectorizer = inter 35 | trainer = concatenated_gru_cx 36 | [shared_cat_inter] 37 | scaler = inter 38 | additional_columns = [ev_et, ev_et_t, ev_rd, ev_rp_occ] 39 | vectorizer = inter 40 | trainer = shared_cat_inter 41 | [shared_cat_inter_full] 42 | scaler = inter 43 | additional_columns = [daytime, acc_cycle, ev_et, ev_et_t, ev_rd, ev_rp_occ] 44 | vectorizer = inter 45 | trainer = shared_cat_inter 46 | [concatenated_inter] 47 | scaler = inter 48 | additional_columns = [daytime, acc_cycle, ev_et, ev_et_t, ev_rd, ev_rp_occ] 49 | vectorizer = inter 50 | trainer = concatenated_inter 51 | [simple_gan] 52 | scaler = basic 53 | additional_columns = [] 54 | vectorizer = gan 55 | trainer = gan 56 | [shared_cat_gru] 57 | scaler = basic 58 | additional_columns = [] 59 | vectorizer = basic 60 | trainer = shared_cat_gru 61 | [specialized_gru] 62 | scaler = basic 63 | additional_columns = [] 64 | vectorizer = basic 65 | trainer = specialized_gru 66 | [concatenated_gru] 67 | scaler = basic 68 | additional_columns = [] 69 | vectorizer = basic 70 | trainer = concatenated_gru -------------------------------------------------------------------------------- /support_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /support_modules/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- 
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 13 23:56:26 2019

@author: Manuel Camargo
"""

import os
import datetime

from keras.callbacks import Callback


class CleanSavedModelsCallback(Callback):
    """Deletes stale .h5 checkpoints, keeping the `num_models` newest."""

    def __init__(self, output_folder, num_models):
        # Initialize the Keras base class so its internal state exists.
        super().__init__()
        self.logs = []
        self.num_models = num_models
        self.path = output_folder

    def on_epoch_end(self, epoch, logs={}):
        # BUG FIX: the original called create_folder_list(self), passing
        # the instance itself as the unused `logs` parameter.
        files = self.create_folder_list()
        for file in files:
            os.unlink(os.path.join(self.path, file))

    def create_folder_list(self, logs=None):
        """Return checkpoint filenames older than the newest `num_models`.

        `logs` is unused; kept only for signature compatibility (the
        original used a mutable `{}` default).
        """
        file_list = list()
        for _, _, files in os.walk(self.path):
            # keep only keras .h5 checkpoints
            files_filtered = list()
            for f in files:
                _, file_extension = os.path.splitext(f)
                if file_extension == '.h5':
                    files_filtered.append(f)
            # pair each checkpoint with its modification time
            creation_list = list()
            for f in files_filtered:
                date = os.path.getmtime(os.path.join(self.path, f))
                creation_list.append(
                    dict(filename=f,
                         creation=datetime.datetime.utcfromtimestamp(date)))
            # newest first; everything past num_models is stale
            creation_list = sorted(creation_list,
                                   key=lambda x: x['creation'],
                                   reverse=True)
            for f in creation_list[self.num_models:]:
                file_list.append(f['filename'])
        return file_list
import os
import numpy as np

from time import time
from keras.callbacks import Callback
import utils.support as sup


class TimingCallback(Callback):
    """Records wall-clock time per epoch and appends a csv summary row."""

    def __init__(self, output_folder, log_path=os.path.join(
            'output_files', 'training_times.csv')):
        # Initialize the Keras base class so its internal state exists.
        super().__init__()
        self.logs = []  # per-epoch durations, in seconds
        self.output_folder = output_folder
        self.log_path = log_path

    def on_epoch_begin(self, epoch, logs={}):
        self.starttime = time()

    def on_epoch_end(self, epoch, logs={}):
        self.logs.append(time() - self.starttime)

    def on_train_end(self, logs={}):
        # ROBUSTNESS FIX: if training ended before the first epoch
        # completed, np.min/np.max on an empty list would raise.
        if not self.logs:
            return
        log_file = self.log_path
        data = [{'output_folder': self.output_folder,
                 'train_epochs': len(self.logs),
                 'avg_time': np.mean(self.logs),
                 'min_time': np.min(self.logs),
                 'max_time': np.max(self.logs)}]
        # append to an existing csv, otherwise create it with a header
        if os.path.exists(log_file):
            sup.create_csv_file(data, log_file, mode='a')
        else:
            sup.create_csv_file_header(data, log_file)


# -*- coding: utf-8 -*-
import scipy
from scipy.stats import pearsonr
import networkx as nx
import utils.support as sup
from operator import itemgetter
import pandas as pd


class ResourcePoolAnalyser():
    """
    This class evaluates the tasks durations and associates resources to it
    """

    def __init__(self, log, drawing=False, sim_threshold=0.7):
        """constructor"""
        self.data = self.read_resource_pool(log)
        self.drawing = drawing
        # minimum profile correlation for two users to share a role
        self.sim_threshold = sim_threshold

        # task/user name -> contiguous integer id
        self.tasks = {val: i for i, val in enumerate(self.data.task.unique())}
        self.users = {val: i for i, val in enumerate(self.data.user.unique())}

        self.roles, self.resource_table = self.discover_roles()

    def read_resource_pool(self, log):
        # Accept either a dataframe or an event-log object exposing .data.
        if isinstance(log, pd.DataFrame):
            filtered_list = log[['task', 'user']]
        else:
            filtered_list = pd.DataFrame(log.data)[['task', 'user']]
        # Drop artificial Start/End events and automatic resources.
        filtered_list = filtered_list[~filtered_list.task.isin(['Start', 'End'])]
        filtered_list = filtered_list[filtered_list.user != 'AUTO']
        return filtered_list
    def discover_roles(self):
        """Cluster users into roles from their task-frequency profiles.

        Returns the (records, resource_table) tuple produced by
        role_definition (unpacked by the constructor).
        """
        # (task_id, user_id) pair for every event
        associations = lambda x: (self.tasks[x['task']], self.users[x['user']])
        self.data['ac_rl'] = self.data.apply(associations, axis=1)

        # frequency of each (task, user) pair
        freq_matrix = (self.data.groupby(by='ac_rl')['task']
                       .count()
                       .reset_index()
                       .rename(columns={'task': 'freq'}))
        freq_matrix = {x['ac_rl']: x['freq'] for x in freq_matrix.to_dict('records')}

        profiles = self.build_profile(freq_matrix)

        sup.print_progress(((20 / 100)* 100),'Analysing resource pool ')
        # building of a correlation matrix between resources profiles
        correl_matrix = self.det_correl_matrix(profiles)
        sup.print_progress(((40 / 100)* 100),'Analysing resource pool ')
        # creation of a relation network between resources
        g = nx.Graph()
        for user in self.users.values():
            g.add_node(user)
        for rel in correl_matrix:
            # creation of edges between nodes excluding the same elements
            # and those below the similarity threshold
            if rel['distance'] > self.sim_threshold and rel['x'] != rel['y']:
                g.add_edge(rel['x'],
                           rel['y'],
                           weight=rel['distance'])
        sup.print_progress(((60 / 100) * 100),'Analysing resource pool ')
        # extraction of fully connected subgraphs as roles
        sub_graphs = list(nx.connected_components(g))
        sup.print_progress(((80 / 100) * 100),'Analysing resource pool ')
        # role definition from graph
        roles = self.role_definition(sub_graphs)
        # plot creation (optional)
        # if drawing == True:
        #     graph_network(g, sub_graphs)
        sup.print_progress(((100 / 100)* 100),'Analysing resource pool ')
        sup.print_done_task()
        return roles

    def build_profile(self, freq_matrix):
        """Return one task-frequency vector ('profile') per user."""
        profiles=list()
        for user, idx in self.users.items():
            # one slot per task, filled with this user's execution counts
            profile = [0,] * len(self.tasks)
            for ac_rl, freq in freq_matrix.items():
                if idx == ac_rl[1]:
                    profile[ac_rl[0]] = freq
            profiles.append({'user': idx, 'profile': profile})
        return profiles
| 86 | def det_correl_matrix(self, profiles): 87 | correl_matrix = list() 88 | for profile_x in profiles: 89 | for profile_y in profiles: 90 | x = scipy.array(profile_x['profile']) 91 | y = scipy.array(profile_y['profile']) 92 | r_row, p_value = pearsonr(x, y) 93 | correl_matrix.append(({'x': profile_x['user'], 94 | 'y': profile_y['user'], 95 | 'distance': r_row})) 96 | return correl_matrix 97 | 98 | def role_definition(self, sub_graphs): 99 | user_index = {v: k for k, v in self.users.items()} 100 | records= list() 101 | for i in range(0, len(sub_graphs)): 102 | users_names = [user_index[x] for x in sub_graphs[i]] 103 | records.append({'role': 'Role '+ str(i + 1), 104 | 'quantity': len(sub_graphs[i]), 105 | 'members': users_names}) 106 | #Sort roles by number of resources 107 | records = sorted(records, key=itemgetter('quantity'), reverse=True) 108 | for i in range(0,len(records)): 109 | records[i]['role']='Role '+ str(i + 1) 110 | resource_table = list() 111 | for record in records: 112 | for member in record['members']: 113 | resource_table.append({'role': record['role'], 114 | 'resource': member}) 115 | return records, resource_table 116 | 117 | # # == support 118 | # def random_color(size): 119 | # number_of_colors = size 120 | # color = ["#"+''.join([random.choice('0123456789ABCDEF') 121 | # for j in range(6)]) for i in range(number_of_colors)] 122 | # return color 123 | 124 | # def graph_network(g, sub_graphs): 125 | # pos = nx.spring_layout(g, k=0.5,scale=10) 126 | # color = random_color(len(sub_graphs)) 127 | # for i in range(0,len(sub_graphs)): 128 | # subgraph = sub_graphs[i] 129 | # nx.draw_networkx_nodes(g,pos, nodelist=list(subgraph), 130 | # node_color=color[i], node_size=200, alpha=0.8) 131 | # nx.draw_networkx_edges(g,pos,width=1.0,alpha=0.5) 132 | # nx.draw_networkx_edges(g,pos, edgelist=subgraph.edges, 133 | # width=8,alpha=0.5,edge_color=color[i]) 134 | # plt.draw() 135 | # plt.show() # display 136 | 137 | 
--------------------------------------------------------------------------------