├── README.md ├── classes ├── FPGrowth.py ├── FPTree.py └── __init__.py └── main.py /README.md: -------------------------------------------------------------------------------- 1 | # FPGrowth-python 2 | This implementation is based on [FP-Growth-Java](https://github.com/goodinges/FP-Growth-Java). 3 | # Input File Format 4 | The Python script accepts an input file in either of the following formats (one transaction per line, items separated by commas or whitespace): 5 | ``` 6 | f,c,a,m,p 7 | f,c,b 8 | ``` 9 | or 10 | ``` 11 | f c a m p 12 | f c a 13 | ``` 14 | Use it along with the IBM Quest Synthetic Data Generator and the [IBM Data Converter](https://github.com/mhwong2007/IBM-Quest-Data-Converter) to produce a CSV file. 15 | 16 | # How to Use 17 | First make `main.py` executable. 18 | ``` sh 19 | chmod +x main.py 20 | ``` 21 | Run the FP-Growth algorithm with (`minsup` and `minconf` are fractions, e.g. `0.6` and `0.5`): 22 | ``` sh 23 | ./main.py input_file minsup minconf 24 | ``` 25 | 26 | # Output 27 | This program first prints the frequent patterns: 28 | ``` 29 | { frequent itemset } ( support of the frequent itemset ) 30 | ``` 31 | E.g. 32 | ``` 33 | { a } ( 3 ) 34 | { a c } ( 3 ) 35 | { a c f } ( 3 ) 36 | { a f } ( 3 ) 37 | ... 38 | ``` 39 | After that it prints the rules: 40 | ``` 41 | { frequent itemset } => { frequent itemset } ( confidence ) 42 | ``` 43 | E.g. 44 | ``` 45 | { a } => { c } ( 1.00 ) 46 | { c } => { a } ( 0.75 ) 47 | { a } => { c f } ( 1.00 ) 48 | ... 
# ---- classes/FPGrowth.py ----
# NOTE: FPTree (originally classes/FPTree.py) is defined at the end of this
# block, so no project-local import is needed here.
from collections import OrderedDict
from itertools import chain, combinations
import math
import re

__author__ = 'mhwong'

# Items inside a transaction line are separated by commas and/or whitespace.
_ITEM_SEPARATOR = re.compile(r'[\s,\r\n]+')


def build_power_set(iterable):
    """Return an iterator over the non-empty proper subsets of *iterable*.

    Subsets are produced as tuples, ordered by size (1 .. len - 1) and, within
    one size, in the order ``itertools.combinations`` yields them.  The empty
    set and the full set are deliberately left out: a rule needs a non-empty
    antecedent and a non-empty consequent.
    """
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)))


def _read_transactions(path):
    """Yield every line of *path* as a list of its distinct items.

    Items keep their first-seen order.  An item repeated on one line is
    reported once, because support counts transactions, not occurrences.
    Blank lines yield an empty list (they still count as a transaction).
    """
    with open(path) as input_file:
        for input_line in input_file:
            tokens = filter(None, _ITEM_SEPARATOR.split(input_line))
            yield list(OrderedDict.fromkeys(tokens))


def _find_child(node, item):
    """Return the child of *node* holding *item*, or None."""
    for child in node.children:
        if child.item == item:
            return child
    return None


def _find_header(header_table, item):
    """Return the header-table entry for *item*, or None."""
    for header in header_table:
        if header.item == item:
            return header
    return None


def _append_to_chain(header, node):
    """Append *node* to the end of the ``next`` chain starting at *header*."""
    tail = header
    while tail.next is not None:
        tail = tail.next
    tail.next = node


class FPGrowth:
    """Mine frequent itemsets and association rules with FP-Growth.

    Constructing an instance runs the whole pipeline: it reads the input
    file twice (item counting, then tree construction), mines the patterns,
    prints them and prints the rules.  Results stay available afterwards in
    ``frequent_patterns`` (an ``OrderedDict`` mapping item tuples to their
    support count, sorted by key).
    """

    def __init__(self, _input, _minsup, _minconf):
        """Run FP-Growth on the file *_input*.

        :param _input: path of the transaction file
        :param _minsup: minimum support as a fraction of the transactions
        :param _minconf: minimum confidence a printed rule must reach
        """
        # the threshold value (minimum support expressed as a count)
        self.threshold = 0

        # header table and fp tree
        self.header_table = []
        self.fpTree = None

        # the minimum confidence
        self.minconf = _minconf

        items_map_to_frequencies = {}
        sorted_items_by_frequencies = []
        items_to_remove = []

        self.build_item_list(_input, items_map_to_frequencies, sorted_items_by_frequencies, items_to_remove,
                             _minsup)

        # build fp tree
        self.build_fp_tree(_input, items_map_to_frequencies, sorted_items_by_frequencies, items_to_remove)

        # perform fp growth
        self.frequent_patterns = dict()
        self.fp_growth(None, self.threshold, self.header_table, self.frequent_patterns)

        # frequent patterns sorted by key
        self.frequent_patterns = OrderedDict(sorted(self.frequent_patterns.items(), key=lambda t: t[0]))

        # print result
        self.print_fp()

        # generate rules
        self.generating_rules()

    def build_item_list(self, _input, items_map_to_frequencies, sorted_items_by_frequencies, items_to_remove,
                        _minsup):
        """Count item supports and split the items into frequent / infrequent.

        The three container arguments are filled in place:
        *items_map_to_frequencies* maps each item to the number of
        transactions containing it, *sorted_items_by_frequencies* receives the
        frequent items by descending support (ties keep first-seen order) and
        *items_to_remove* receives the items below the threshold.
        ``self.threshold`` is set to the minimum support count.
        """
        trans = 0
        for transaction in _read_transactions(_input):
            trans += 1
            for item in transaction:
                items_map_to_frequencies[item] = items_map_to_frequencies.get(item, 0) + 1

        # Supports are integers, so "count >= minsup * trans" is the same as
        # "count >= ceil(minsup * trans)".  Rounding first removes float noise
        # such as 0.07 * 100 == 7.000000000000001, which would otherwise
        # reject an item that sits exactly on the threshold.
        self.threshold = math.ceil(round(_minsup * trans, 9))

        # sorted() is stable, also with reverse=True, so ties keep dict order
        by_support = sorted(items_map_to_frequencies, key=items_map_to_frequencies.get, reverse=True)
        for item in by_support:
            if items_map_to_frequencies[item] < self.threshold:
                items_to_remove.append(item)
            else:
                sorted_items_by_frequencies.append(item)

    def build_fp_tree(self, _input, items_map_to_frequencies, sorted_items_by_frequencies, items_to_remove):
        """Build ``self.fpTree`` and ``self.header_table`` from the input file.

        Every transaction is reduced to its frequent items, ordered by
        descending support (ties: descending case-insensitive item name) and
        inserted into the tree.  Afterwards each header entry gets the summed
        count of its node chain and the header table is sorted by that count,
        highest first.
        """
        # build header table
        # first element used as pointer
        self.header_table = [FPTree(item) for item in sorted_items_by_frequencies]

        self.fpTree = FPTree(None)
        self.fpTree.root = True

        infrequent = set(items_to_remove)
        for transaction in _read_transactions(_input):
            frequent_items = [item for item in transaction if item not in infrequent]
            # two stable passes: tie-break key first, primary key last
            frequent_items.sort(key=lambda item: (item.lower(), item), reverse=True)
            frequent_items.sort(key=items_map_to_frequencies.__getitem__, reverse=True)
            self.insert_into_tree(frequent_items, self.fpTree)

        # first calculate the item frequencies in tree
        for header in self.header_table:
            count = 0
            node = header.next
            while node is not None:
                count += node.count
                node = node.next
            header.count = count

        # sort header table
        self.header_table.sort(key=lambda x: x.count, reverse=True)

    def insert_into_tree(self, transaction_sorted_by_frequency, fp_tree):
        """Insert one ordered transaction below *fp_tree*.

        Walks down from *fp_tree*, incrementing the count of existing nodes
        and creating (and linking into ``self.header_table``) the missing
        ones.  The transaction list is read only; it is not consumed.
        """
        node = fp_tree
        for item in transaction_sorted_by_frequency:
            child = _find_child(node, item)
            if child is None:
                child = FPTree(item)
                child.count = 1
                child.parent = node
                node.children.append(child)
                header = _find_header(self.header_table, item)
                if header is not None:
                    _append_to_chain(header, child)
            else:
                child.count += 1
            node = child

    def fp_growth(self, base, threshold, header_table, frequent_patterns):
        """Recursively mine the tree reachable through *header_table*.

        :param base: space-separated suffix pattern mined so far, or None
        :param threshold: minimum support count
        :param header_table: header entries whose ``next`` chains list the
            tree nodes of each item
        :param frequent_patterns: dict filled in place, mapping item tuples
            to their support count
        """
        for header in header_table:
            current_pattern = header.item if base is None else base + " " + header.item
            support_of_current_pattern = 0

            # prefix path (root -> parent, space separated) -> count of the node
            conditional_pattern_base = dict()
            node = header.next
            while node is not None:
                support_of_current_pattern += node.count
                prefix = []
                ancestor = node.parent
                while not ancestor.is_root():
                    prefix.append(ancestor.item)
                    ancestor = ancestor.parent
                if prefix:
                    conditional_pattern_base[" ".join(reversed(prefix))] = node.count
                node = node.next

            frequent_patterns[tuple(current_pattern.split())] = support_of_current_pattern

            # counting frequencies of single items in conditional pattern-base
            conditional_items_map_to_frequency = dict()
            for conditional_pattern, pattern_count in conditional_pattern_base.items():
                for item in conditional_pattern.split():
                    conditional_items_map_to_frequency[item] = \
                        conditional_items_map_to_frequency.get(item, 0) + pattern_count

            # create header table for conditional fp tree
            conditional_header_table = []
            for item, count in conditional_items_map_to_frequency.items():
                if count < threshold:
                    continue
                entry = FPTree(item)
                entry.count = count
                conditional_header_table.append(entry)

            conditional_fp_tree = self.build_conditional_fp_tree(conditional_pattern_base,
                                                                 conditional_items_map_to_frequency, threshold,
                                                                 conditional_header_table)

            # header table with reverse ordering
            conditional_header_table.sort(key=lambda x: x.count, reverse=True)
            # children is not empty
            if conditional_fp_tree.children:
                self.fp_growth(current_pattern, threshold, conditional_header_table, frequent_patterns)

    def build_conditional_fp_tree(self, conditional_pattern_base, conditional_items_map_to_frequency, threshold,
                                  conditional_header_table):
        """Build and return the conditional FP-tree of a pattern base.

        Each key of *conditional_pattern_base* is a space-separated prefix
        path and its value the count of that path.  Items whose total count is
        below *threshold* are dropped before the path is inserted.
        """
        conditional_fp_tree = FPTree(None)
        conditional_fp_tree.root = True

        for pattern, pattern_count in conditional_pattern_base.items():
            # removing non-frequent items and make a list instead of string
            pattern_list = [item for item in pattern.split()
                            if conditional_items_map_to_frequency[item] >= threshold]
            self.insert_into_conditional_fp_tree(pattern_list, pattern_count, conditional_fp_tree,
                                                 conditional_header_table)
        return conditional_fp_tree

    # the insert function for conditional fp tree
    def insert_into_conditional_fp_tree(self, pattern_list, count_of_pattern, conditional_fp_tree,
                                        conditional_header_table):
        """Insert one prefix path, weighted by *count_of_pattern*.

        Existing nodes get *count_of_pattern* added; new nodes start with it
        and are linked into *conditional_header_table*.  An item without a
        header entry is skipped instead of being added to the tree.  The
        pattern list is read only; it is not consumed.
        """
        node = conditional_fp_tree
        for item in pattern_list:
            child = _find_child(node, item)
            if child is not None:
                child.count += count_of_pattern
            else:
                header = _find_header(conditional_header_table, item)
                if header is None:
                    # remove non frequents too
                    continue
                child = FPTree(item)
                child.count = count_of_pattern
                child.parent = node
                node.children.append(child)
                _append_to_chain(header, child)
            node = child

    def print_fp(self):
        """Print every frequent pattern as ``{ items } ( support )``."""
        for pattern, support in self.frequent_patterns.items():
            print("{ %s } ( %d )" % (" ".join(pattern), support))

    def generating_rules(self):
        """Print the rules ``{ X } => { Y } ( confidence )``.

        For every frequent pattern with at least two items, each non-empty
        proper subset X that is itself a key of ``frequent_patterns`` gives
        the rule X => pattern - X with confidence
        support(pattern) / support(X); rules below ``self.minconf`` are not
        printed.
        """
        for frequent_pattern, support in self.frequent_patterns.items():
            # proceed if frequent pattern's size is larger than 1
            if len(frequent_pattern) < 2:
                continue
            for subset in build_power_set(frequent_pattern):
                if subset not in self.frequent_patterns:
                    continue
                conf = support / self.frequent_patterns[subset]
                if conf >= self.minconf:
                    frequent_minus_subset = "".join(item + " " for item in frequent_pattern
                                                    if item not in subset)
                    print("{ %s } => { %s} ( %.2f )" % (" ".join(subset), frequent_minus_subset, conf))


# ---- classes/FPTree.py ----
# This is a FPTree Structure Class


class FPTree:
    """A node of an FP-tree; also used as a header-table entry."""

    def __init__(self, item):
        """Create an unlinked node holding *item* with a count of 0."""
        # to check if the node is root
        self.root = False

        # the holding item
        self.item = item

        # the node's children
        self.children = []

        # the frequency of the holding item
        self.count = 0

        # parent node
        self.parent = None

        # the next connect node, used in header table
        self.next = None

    def is_root(self):
        """Return True when this node was flagged as the tree root."""
        return self.root
#!/usr/bin/env pypy
"""Command-line entry point: run FP-Growth on a transaction file.

Usage: main.py input_file minsup minconf
"""
import time
from sys import argv

from classes.FPGrowth import FPGrowth

__author__ = 'mhwong'

if __name__ == '__main__':
    if len(argv) != 4:
        print("Usage:", argv[0], "input_file minsup minconf")
    else:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        start_time = time.perf_counter()
        FPGrowth(argv[1], float(argv[2]), float(argv[3]))
        end_time = time.perf_counter()
        print("Execution time", (end_time - start_time), "s")