├── .gitignore ├── LICENSE ├── README.md ├── algo ├── __init__.py └── alpha.py ├── logs ├── exercise1.txt ├── exercise2.txt ├── exercise3.txt └── exercise4.txt ├── output ├── exercise1 ├── exercise1.png ├── exercise2 ├── exercise2.png ├── exercise3 ├── exercise3.png ├── exercise4 └── exercise4.png ├── pyprom.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .DS_Store 104 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Harry Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is PyProM? 2 | PyProM is a Python-based, open-source process mining package. 3 | 4 | # About Event Logs 5 | 6 | The example logs (in /logs folder) are from the ProM site (http://www.promtools.org). Each line is a case with a sequence of activities. 7 | 8 | TODO: Logs should be stored in a csv file with columns, such as Case ID, Activity, Start Time, End Time, Agent, Role, and Data. This format is used in Disco (https://fluxicon.com/disco/) 9 | 10 | # Setup Instructions 11 | Intall graphviz - we use graphviz to visualize the process in petri net format 12 | ``` 13 | brew install graphviz 14 | ``` 15 | Setup virtual environment and activate it: 16 | ``` 17 | virtualenv venv 18 | source venv/bin/activate 19 | ``` 20 | Install packages: `pip install -r requirements.txt` 21 | 22 | Run the program with different log files to generate the petri net images and related dot files in the output folder 23 | ``` 24 | python pyprom.py exercise1.txt 25 | ``` 26 | 27 | # References 28 | - PyPM: https://github.com/tdi/pypm: I referred to this project to start PyProM - many thanks to the author. 29 | - ProM: http://www.promtools.org 30 | - Disco: https://fluxicon.com/disco/ 31 | -------------------------------------------------------------------------------- /algo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/pyprom/f52eb295dac0d345ae65c5cba808336147682c6d/algo/__init__.py -------------------------------------------------------------------------------- /algo/alpha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Module for Alpha Algorithm 3 | # https://en.wikipedia.org/wiki/Alpha_algorithm 4 | import graphviz as gv 5 | 6 | 7 | def apply(log, input_file, output_file): 8 | 9 | tl = set() # all task list 10 | df = [] # direct following tasks 11 | cs = [] # causalities tasks 12 | ncs = [] # non-causality tasks 13 | par = [] # parallel tasks 14 | xl = [] 15 | yl = [] 16 | ti = [] 17 | to = [] 18 | 19 | tl, df, cs, ncs, par = build_ordering_relations(log) 20 | xl, yl, ti, to = make_sets(log, tl, df, cs, ncs) 21 | 22 | print "all tasks:", tl 23 | print "direct followers:", df 24 | print "causalities:", cs 25 | print "no_causalities:", ncs 26 | print "parallels:", par 27 | print "x list:", xl 28 | print "y list:", yl 29 | print "initial tasks:", ti 30 | print "terminal tasks:", to 31 | 32 | build_petrinet(tl, yl, ti, to, output_file) 33 | 34 | 35 | def build_ordering_relations(log): 36 | tl = set([item for sub in log for item in sub]) 37 | df = get_direct_followers(log) 38 | cs = get_causalities(tl, df) 39 | ncs = get_no_causalities(tl, df) 40 | par = get_parallels(tl, df) 41 | 42 | return tl, df, cs, ncs, par 43 | 44 | 45 | def make_sets(log, tl, df, cs, ncs): 46 | xl = make_xl_set(tl, df, cs, ncs) 47 | yl = make_yl_set(xl) 48 | ti = make_ti_set(log) 49 | to = make_to_set(log) 50 | 51 | return xl, yl, ti, to 52 | 53 | 54 | def get_direct_followers(log): 55 | df = [] 56 | for trace in log: 57 | for index, event in enumerate(trace): 58 | print index, event 59 | if index != len(trace)-1: 60 | if (event, trace[index+1]) not in df: 61 | df.append((event, trace[index+1])) 62 | return df 63 | 64 | 65 | def get_causalities(all_tasks, direct_followers): 66 | cs = [] # causalities 67 | for event in all_tasks: 68 | for event2 in all_tasks: 69 | if (event, event2) not in cs: 70 | if (event, event2) in direct_followers and \ 71 | (event2, event) not in direct_followers: 72 | cs.append((event, event2)) 73 | return cs 74 | 75 | 76 | def get_no_causalities(all_tasks, direct_followers): 77 | ncs = [] # no causalities 78 | for event in all_tasks: 79 | for event2 in all_tasks: 80 | if (event, event2) not in ncs: 81 | if (event, event2) not in direct_followers and \ 82 | (event2, event) not in direct_followers: 83 | ncs.append((event, event2)) 84 | return ncs 85 | 86 | 87 | def get_parallels(all_tasks, direct_followers): 88 | par = [] # parallel tasks 89 | for event in all_tasks: 90 | for event2 in all_tasks: 91 | if (event, event2) not in par: 92 | if (event, event2) in direct_followers and \ 93 | (event2, event) in direct_followers: 94 | par.append((event, event2)) 95 | return par 96 | 97 | 98 | def check_set(A, ncs): 99 | for event in A: 100 | for event2 in A: 101 | if (event, event2) not in ncs: 102 | return False 103 | return True 104 | 105 | 106 | def check_outsets(A, B, cs): 107 | for event in A: 108 | for event2 in B: 109 | if (event, event2) not in cs: 110 | return False 111 | return True 112 | 113 | 114 | def make_xl_set(all_tasks, direct_followers, causalities, no_causalities): 115 | import itertools 116 | xl = set() 117 | subsets = set() 118 | for i in range(1, len(all_tasks)): 119 | for s in itertools.combinations(all_tasks, i): 120 | subsets.add(s) 121 | for a in subsets: 122 | reta = check_set(a, no_causalities) 123 | for b in subsets: 124 | retb = check_set(b, no_causalities) 125 | if reta and retb and \ 126 | check_outsets(a, b, causalities): 127 | xl.add((a, b)) 128 | return xl 129 | 130 | 131 | def make_yl_set(xl): 132 | import copy 133 | yl = copy.deepcopy(xl) 134 | for a in xl: 135 | A = a[0] 136 | B = a[1] 137 | for b in xl: 138 | 139 | if set(A).issubset(b[0]) and set(B).issubset(b[1]): 140 | if a != b: 141 | yl.discard(a) 142 | return yl 143 | 144 | 145 | # Ti is the set of all tasks which occur trace-initially 146 | def make_ti_set(log): 147 | ti = set() 148 | [ti.add(event[0]) for event in log] 149 | return ti 150 | 151 | 152 | # To is the set of all tasks which occur trace-terminally 153 | def make_to_set(log): 154 | to = set() 155 | [to.add(event[-1]) for event in log] 156 | return to 157 | 158 | 159 | def build_petrinet(tl, yl, ti, to, output_file): 160 | pn = gv.Digraph(format='png') 161 | pn.attr(rankdir='LR') # left to righ layout - default is top down 162 | pn.node('start') 163 | pn.node('end') 164 | 165 | for elem in yl: 166 | for i in elem[0]: 167 | pn.edge(i, str(elem)) 168 | pn.node(i, shape='box') 169 | pn.node(str(elem), shape='circle') 170 | for i in elem[1]: 171 | pn.edge(str(elem), i) 172 | pn.node(i, shape='box') 173 | for i in ti: 174 | pn.edge('start', i) 175 | for o in to: 176 | pn.edge(o, 'end') 177 | pn.render(output_file) 178 | -------------------------------------------------------------------------------- /logs/exercise1.txt: -------------------------------------------------------------------------------- 1 | A B C D 2 | A C B D 3 | A E D 4 | -------------------------------------------------------------------------------- /logs/exercise2.txt: -------------------------------------------------------------------------------- 1 | 1x Case1 A C D 2 | 1x Case2 B C E 3 | -------------------------------------------------------------------------------- /logs/exercise3.txt: -------------------------------------------------------------------------------- 1 | A C E G 2 | A E C G 3 | B D F G 4 | B F D G 5 | -------------------------------------------------------------------------------- /logs/exercise4.txt: -------------------------------------------------------------------------------- 1 | a b c d f 2 | a c b d f 3 | a b d c f 4 | a c d b f 5 | a d e f 6 | a e d f 7 | -------------------------------------------------------------------------------- /output/exercise1: -------------------------------------------------------------------------------- 1 | digraph { 2 | rankdir=LR 3 | start 4 | end 5 | B -> "(('B', 'E'), ('D',))" 6 | B [shape=box] 7 | "(('B', 'E'), ('D',))" [shape=circle] 8 | E -> "(('B', 'E'), ('D',))" 9 | E [shape=box] 10 | "(('B', 'E'), ('D',))" [shape=circle] 11 | "(('B', 'E'), ('D',))" -> D 12 | D [shape=box] 13 | A -> "(('A',), ('B', 'E'))" 14 | A [shape=box] 15 | "(('A',), ('B', 'E'))" [shape=circle] 16 | "(('A',), ('B', 'E'))" -> B 17 | B [shape=box] 18 | "(('A',), ('B', 'E'))" -> E 19 | E [shape=box] 20 | A -> "(('A',), ('C', 'E'))" 21 | A [shape=box] 22 | "(('A',), ('C', 'E'))" [shape=circle] 23 | "(('A',), ('C', 'E'))" -> C 24 | C [shape=box] 25 | "(('A',), ('C', 'E'))" -> E 26 | E [shape=box] 27 | C -> "(('C', 'E'), ('D',))" 28 | C [shape=box] 29 | "(('C', 'E'), ('D',))" [shape=circle] 30 | E -> "(('C', 'E'), ('D',))" 31 | E [shape=box] 32 | "(('C', 'E'), ('D',))" [shape=circle] 33 | "(('C', 'E'), ('D',))" -> D 34 | D [shape=box] 35 | start -> A 36 | D -> end 37 | } 38 | -------------------------------------------------------------------------------- /output/exercise1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/pyprom/f52eb295dac0d345ae65c5cba808336147682c6d/output/exercise1.png -------------------------------------------------------------------------------- /output/exercise2: -------------------------------------------------------------------------------- 1 | digraph { 2 | rankdir=LR 3 | start 4 | end 5 | "1x" -> "(('1x',), ('Case1', 'Case2'))" 6 | "1x" [shape=box] 7 | "(('1x',), ('Case1', 'Case2'))" [shape=circle] 8 | "(('1x',), ('Case1', 'Case2'))" -> Case1 9 | Case1 [shape=box] 10 | "(('1x',), ('Case1', 'Case2'))" -> Case2 11 | Case2 [shape=box] 12 | C -> "(('C',), ('E', 'D'))" 13 | C [shape=box] 14 | "(('C',), ('E', 'D'))" [shape=circle] 15 | "(('C',), ('E', 'D'))" -> E 16 | E [shape=box] 17 | "(('C',), ('E', 'D'))" -> D 18 | D [shape=box] 19 | A -> "(('A', 'B'), ('C',))" 20 | A [shape=box] 21 | "(('A', 'B'), ('C',))" [shape=circle] 22 | B -> "(('A', 'B'), ('C',))" 23 | B [shape=box] 24 | "(('A', 'B'), ('C',))" [shape=circle] 25 | "(('A', 'B'), ('C',))" -> C 26 | C [shape=box] 27 | Case2 -> "(('Case2',), ('B',))" 28 | Case2 [shape=box] 29 | "(('Case2',), ('B',))" [shape=circle] 30 | "(('Case2',), ('B',))" -> B 31 | B [shape=box] 32 | Case1 -> "(('Case1',), ('A',))" 33 | Case1 [shape=box] 34 | "(('Case1',), ('A',))" [shape=circle] 35 | "(('Case1',), ('A',))" -> A 36 | A [shape=box] 37 | start -> "1x" 38 | E -> end 39 | D -> end 40 | } 41 | -------------------------------------------------------------------------------- /output/exercise2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/pyprom/f52eb295dac0d345ae65c5cba808336147682c6d/output/exercise2.png -------------------------------------------------------------------------------- /output/exercise3: -------------------------------------------------------------------------------- 1 | digraph { 2 | rankdir=LR 3 | start 4 | end 5 | A -> "(('A',), ('E',))" 6 | A [shape=box] 7 | "(('A',), ('E',))" [shape=circle] 8 | "(('A',), ('E',))" -> E 9 | E [shape=box] 10 | C -> "(('C', 'D'), ('G',))" 11 | C [shape=box] 12 | "(('C', 'D'), ('G',))" [shape=circle] 13 | D -> "(('C', 'D'), ('G',))" 14 | D [shape=box] 15 | "(('C', 'D'), ('G',))" [shape=circle] 16 | "(('C', 'D'), ('G',))" -> G 17 | G [shape=box] 18 | C -> "(('C', 'F'), ('G',))" 19 | C [shape=box] 20 | "(('C', 'F'), ('G',))" [shape=circle] 21 | F -> "(('C', 'F'), ('G',))" 22 | F [shape=box] 23 | "(('C', 'F'), ('G',))" [shape=circle] 24 | "(('C', 'F'), ('G',))" -> G 25 | G [shape=box] 26 | E -> "(('E', 'F'), ('G',))" 27 | E [shape=box] 28 | "(('E', 'F'), ('G',))" [shape=circle] 29 | F -> "(('E', 'F'), ('G',))" 30 | F [shape=box] 31 | "(('E', 'F'), ('G',))" [shape=circle] 32 | "(('E', 'F'), ('G',))" -> G 33 | G [shape=box] 34 | E -> "(('E', 'D'), ('G',))" 35 | E [shape=box] 36 | "(('E', 'D'), ('G',))" [shape=circle] 37 | D -> "(('E', 'D'), ('G',))" 38 | D [shape=box] 39 | "(('E', 'D'), ('G',))" [shape=circle] 40 | "(('E', 'D'), ('G',))" -> G 41 | G [shape=box] 42 | B -> "(('B',), ('F',))" 43 | B [shape=box] 44 | "(('B',), ('F',))" [shape=circle] 45 | "(('B',), ('F',))" -> F 46 | F [shape=box] 47 | B -> "(('B',), ('D',))" 48 | B [shape=box] 49 | "(('B',), ('D',))" [shape=circle] 50 | "(('B',), ('D',))" -> D 51 | D [shape=box] 52 | A -> "(('A',), ('C',))" 53 | A [shape=box] 54 | "(('A',), ('C',))" [shape=circle] 55 | "(('A',), ('C',))" -> C 56 | C [shape=box] 57 | start -> A 58 | start -> B 59 | G -> end 60 | } 61 | -------------------------------------------------------------------------------- /output/exercise3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/pyprom/f52eb295dac0d345ae65c5cba808336147682c6d/output/exercise3.png -------------------------------------------------------------------------------- /output/exercise4: -------------------------------------------------------------------------------- 1 | digraph { 2 | rankdir=LR 3 | start 4 | end 5 | d -> "(('d',), ('f',))" 6 | d [shape=box] 7 | "(('d',), ('f',))" [shape=circle] 8 | "(('d',), ('f',))" -> f 9 | f [shape=box] 10 | a -> "(('a',), ('d',))" 11 | a [shape=box] 12 | "(('a',), ('d',))" [shape=circle] 13 | "(('a',), ('d',))" -> d 14 | d [shape=box] 15 | a -> "(('a',), ('b', 'e'))" 16 | a [shape=box] 17 | "(('a',), ('b', 'e'))" [shape=circle] 18 | "(('a',), ('b', 'e'))" -> b 19 | b [shape=box] 20 | "(('a',), ('b', 'e'))" -> e 21 | e [shape=box] 22 | c -> "(('c', 'e'), ('f',))" 23 | c [shape=box] 24 | "(('c', 'e'), ('f',))" [shape=circle] 25 | e -> "(('c', 'e'), ('f',))" 26 | e [shape=box] 27 | "(('c', 'e'), ('f',))" [shape=circle] 28 | "(('c', 'e'), ('f',))" -> f 29 | f [shape=box] 30 | a -> "(('a',), ('c', 'e'))" 31 | a [shape=box] 32 | "(('a',), ('c', 'e'))" [shape=circle] 33 | "(('a',), ('c', 'e'))" -> c 34 | c [shape=box] 35 | "(('a',), ('c', 'e'))" -> e 36 | e [shape=box] 37 | b -> "(('b', 'e'), ('f',))" 38 | b [shape=box] 39 | "(('b', 'e'), ('f',))" [shape=circle] 40 | e -> "(('b', 'e'), ('f',))" 41 | e [shape=box] 42 | "(('b', 'e'), ('f',))" [shape=circle] 43 | "(('b', 'e'), ('f',))" -> f 44 | f [shape=box] 45 | start -> a 46 | f -> end 47 | } 48 | -------------------------------------------------------------------------------- /output/exercise4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/pyprom/f52eb295dac0d345ae65c5cba808336147682c6d/output/exercise4.png -------------------------------------------------------------------------------- /pyprom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | from algo import alpha 5 | 6 | 7 | def main(argv): 8 | log_dir = "logs" # log folder name 9 | output_dir = "output" # output folder name 10 | 11 | # clean the output folder 12 | # for the_file in os.listdir(output_dir): 13 | # file_path = os.path.join(output_dir, the_file) 14 | # try: 15 | # if os.path.isfile(file_path): 16 | # os.remove(file_path) 17 | # except Exception as e: 18 | # print(e) 19 | 20 | # read the log file 21 | log = [] 22 | input_file = log_dir + "/" + argv[1] 23 | output_file = os.path.splitext(output_dir + 24 | "/" + os.path.basename(argv[1]))[0] 25 | with open(input_file, "r") as f: 26 | for line in f.readlines(): 27 | line = line.split() 28 | if line not in log: # some sequence only counts once 29 | log.append(line) 30 | 31 | print log, input_file, output_file 32 | 33 | alpha.apply(log, input_file, output_file) 34 | 35 | 36 | if __name__ == "__main__": 37 | main(sys.argv) 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz==0.7.1 2 | --------------------------------------------------------------------------------