├── input.txt ├── data.txt ├── items.txt ├── rules.txt ├── README.md └── eclat.py /input.txt: -------------------------------------------------------------------------------- 1 | 5 a b c d e 2 | 5 f g c h i 3 | 5 j g c h k 4 | 5 j l c d m 5 | 5 n l c d m 6 | 4 o g c h 7 | 5 o p q d k 8 | -------------------------------------------------------------------------------- /data.txt: -------------------------------------------------------------------------------- 1 | 5 1 2 3 4 5 2 | 5 6 7 3 8 9 3 | 5 10 7 3 8 11 4 | 5 10 12 3 4 13 5 | 5 14 12 3 4 13 6 | 4 15 7 3 8 7 | 5 15 16 17 4 11 8 | -------------------------------------------------------------------------------- /items.txt: -------------------------------------------------------------------------------- 1 | ['4', '5'] : 4 2 | ['3', '5'] : 5 3 | ['4'] : 5 4 | ['4', '3'] : 4 5 | ['5'] : 6 6 | ['4', '3', '5'] : 3 7 | ['3'] : 6 8 | ['7'] : 3 9 | ['7', '3'] : 3 10 | -------------------------------------------------------------------------------- /rules.txt: -------------------------------------------------------------------------------- 1 | Rule: ['3'] ==> ['5'] : 5 : 0.8333 2 | Rule: ['4'] ==> ['3'] : 4 : 0.8 3 | Rule: ['4'] ==> ['5'] : 4 : 0.8 4 | Rule: ['4', '3'] ==> ['5'] : 3 : 0.75 5 | Rule: ['7'] ==> ['3'] : 3 : 1.0 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Eclat-Python-Implementation 2 | This is a fast implementation of Eclat, a pattern-mining algorithm for finding frequent itemsets in a transactional database. 3 | In addition, the extraction of association rules from the frequent itemsets has been implemented. 
"""Eclat frequent-itemset mining with association-rule extraction.

The script reads a transaction file (one transaction per line, items separated
by a delimiter), mines frequent itemsets by intersecting transaction-id sets
(Eclat), derives association rules from them and writes the results to files.

Requirements: Python 3 with ``numpy`` and ``pandas``.

Usage notes carried over from the project README:

* Documented invocation::

      python3 eclat.py input.txt support confidence output_FreqItems Output_Rules.txt

  Example: ``python3 eclat.py input.txt 3 0.4 items.txt rules.txt``

  NOTE(review): the entry point below does not read ``sys.argv``; the input
  path, the thresholds and the output paths are the constants assigned in the
  ``__main__`` section. Also note that ``confidence`` is compared against a
  percentage here (0-100), not the fraction shown in the README example.

* If you provide your own input file, take care with the delimiter. It is the
  second argument of the ``Read_Data`` call in the ``__main__`` section.
"""
import itertools

import numpy as np
import pandas as pd

np.random.seed(1)
kot = 0
FreqItems = dict()
support = dict()


def eclat(prefix, items, dict_id, min_support=None, freq_items=None):
    """Recursively mine frequent itemsets from ``(item, tid_set)`` pairs.

    Args:
        prefix: list of items already fixed for this branch of the search.
        items: iterable of ``(item, set_of_transaction_ids)`` pairs that may
            extend ``prefix``. It is copied, so the caller's list is left
            untouched.
        dict_id: recursion counter; incremented and passed down, otherwise
            unused.
        min_support: minimum number of transactions an itemset must occur in.
            When omitted, the module-level ``minsup`` is used.
        freq_items: dict that receives ``frozenset(itemset) -> support``.
            When omitted, the module-level ``FreqItems`` dict is used.

    Returns:
        The dict the frequent itemsets were stored in.
    """
    if min_support is None:
        # Legacy call style: the threshold lives in a module-level global that
        # the script entry point assigns before mining.
        min_support = minsup
    if freq_items is None:
        freq_items = FreqItems

    pending = list(items)
    while pending:
        item, tids = pending.pop()
        item_support = len(tids)
        if item_support < min_support:
            continue

        itemset = prefix + [item]
        freq_items[frozenset(itemset)] = item_support

        # Only items still pending can extend this itemset; keep those whose
        # shared transactions still reach the threshold.
        suffix = []
        for other, other_tids in pending:
            shared = tids & other_tids
            if len(shared) >= min_support:
                suffix.append((other, shared))
        suffix.sort(key=lambda pair: len(pair[1]), reverse=True)

        dict_id += 1
        eclat(itemset, suffix, dict_id, min_support, freq_items)
    return freq_items


def _candidate_rules(FreqItems, confidence):
    """Yield ``(antecedent, consequent, support, conf, lift)`` per rule.

    Every itemset with more than one item produces one candidate per item:
    that item is the consequent and the remaining items are the antecedent.
    Only candidates whose confidence (in percent) is at least ``confidence``
    are yielded.

    NOTE(review): the value called ``lift`` is the confidence percentage
    divided by the consequent's absolute support count. It is not normalised
    by the number of transactions, so it is not the textbook lift.
    """
    for itemset, itemset_support in FreqItems.items():
        if len(itemset) < 2:
            continue
        members = tuple(itemset)
        for position, target in enumerate(members):
            antecedent = members[:position] + members[position + 1:]
            consequent = (target,)
            conf = float(itemset_support / FreqItems[frozenset(antecedent)] * 100)
            if conf < confidence:
                continue
            lift = float(conf / FreqItems[frozenset(consequent)])
            yield antecedent, consequent, itemset_support, conf, lift


def rules(FreqItems, confidence):
    """Build association rules from the frequent itemsets.

    Args:
        FreqItems: dict mapping ``frozenset`` itemsets to their support count.
            Every subset of a contained itemset must be present as well.
        confidence: minimum confidence in percent.

    Returns:
        List of ``(antecedent, consequent, support, conf, lift)`` tuples for
        the rules that meet the confidence threshold and have ``lift >= 1``.
        Each rule appears once. The printed count is the number of rules that
        met the confidence threshold, before the lift filter.

    Raises:
        KeyError: if a needed sub-itemset is missing from ``FreqItems``.
    """
    Rules = []
    cnt = 0
    for rule in _candidate_rules(FreqItems, confidence):
        cnt += 1
        if rule[-1] >= 1:
            Rules.append(rule)

    print('Found %d Rules ' % (cnt))
    return Rules


def getantecendent(FreqItems, confidence):
    """Return the antecedent of every rule that ``rules`` would keep.

    Args:
        FreqItems: dict mapping ``frozenset`` itemsets to their support count.
        confidence: minimum confidence in percent.

    Returns:
        List of antecedent tuples, one per kept rule.

    Raises:
        KeyError: if a needed sub-itemset is missing from ``FreqItems``.
    """
    ant = []
    cnt = 0
    for antecedent, _consequent, _support, _conf, lift in _candidate_rules(
            FreqItems, confidence):
        cnt += 1
        if lift >= 1:
            ant.append(antecedent)

    print('Print %d attributes' % (cnt))
    return ant


def print_Frequent_Itemsets(output_FreqItems, FreqItems):
    """Write one `` [items] : support`` line per frequent itemset.

    Args:
        output_FreqItems: path of the file to (over)write.
        FreqItems: dict mapping itemsets to their support.
    """
    with open(output_FreqItems, 'w', encoding="utf8") as file:
        for item, item_support in FreqItems.items():
            file.write(" {} : {} \n".format(list(item), round(item_support, 4)))


def print_Rules(output_Rules, Rules):
    """Write the rules, sorted, one per line.

    Args:
        output_Rules: path of the file to (over)write.
        Rules: iterable of ``(antecedent, consequent, support, conf, lift)``.
            The lift is not part of the written line.
    """
    with open(output_Rules, 'w', encoding="utf8") as file:
        for a, b, supp, conf, _lift in sorted(Rules):
            file.write("{} ==> {} support: {} confidence: {} \n".format(
                a, b, round(supp, 4), round(conf, 4)))


def print_Antecendent(ant, output_path='output_antecendent.csv'):
    """Write the antecedents, sorted, one per line.

    Args:
        ant: iterable of antecedent tuples.
        output_path: path of the file to (over)write.
    """
    with open(output_path, 'w', encoding="utf8") as file:
        for a in sorted(ant):
            # The placeholder must be "{}"; "[]" would print a literal "[]".
            file.write("{} \n".format(a))


def Read_Data(filename, delimiter=','):
    """Read a transaction file into a vertical (item -> tid set) layout.

    Each line is one transaction; transaction ids are the 1-based line
    numbers, and blank lines still consume an id.

    Args:
        filename: path of the UTF-8 encoded input file.
        delimiter: string separating the items of a transaction.

    Returns:
        Dict mapping each item to the set of transaction ids it occurs in.
        Empty items (blank lines, trailing delimiters) are skipped.
    """
    data = {}
    with open(filename, 'r', encoding="utf8") as f:
        for trans, row in enumerate(f, start=1):
            # Drop the line terminator so the last item of a row is the same
            # key as that item anywhere else in a row.
            for item in row.rstrip("\r\n").split(delimiter):
                if not item:
                    continue
                data.setdefault(item, set()).add(trans)
    return data


def _count_antecedent_items(antecedents):
    """Return ``(unique_items, counts)`` over all items in ``antecedents``."""
    if not antecedents:
        # np.hstack raises ValueError on an empty sequence.
        return np.array([], dtype=str), np.array([], dtype=int)
    return np.unique(np.hstack(antecedents), return_counts=True)


if __name__ == "__main__":
    minsup = 10
    confidence = 75
    output_FreqItems = 'output_freqitems.csv'
    output_Rules = 'output_rule.csv'
    dict_id = 0
    data = Read_Data('input.txt', ',')  # change the delimiter based on your input file
    data.pop("\n", None)
    data.pop("", None)
    print('finished reading data..... \n Starting mining .....')
    eclat([], sorted(data.items(), key=lambda item: len(item[1]), reverse=True),
          dict_id, min_support=minsup, freq_items=FreqItems)
    print('found %d Frequent items' % len(FreqItems))
    Rules = rules(FreqItems, confidence)
    print('Writing Rules .....')

    print_Frequent_Itemsets(output_FreqItems, FreqItems)
    print_Rules(output_Rules, Rules)
    Antecendent = getantecendent(FreqItems, confidence)
    print_Antecendent(Antecendent)

    # Count how often each item occurs across the rule antecedents.
    unique, counts = _count_antecedent_items(Antecendent)
    with open('candidate.csv', 'w', encoding="utf8") as appendFile:
        for word, occurrences in zip(unique, counts):
            appendFile.write(str(word) + ";" + str(occurrences) + "," + "\n")

    df = pd.DataFrame({'word': unique, 'counter': counts})
    df["counter"] = pd.to_numeric(df["counter"])
    sortcounted = df.sort_values(["counter"], axis=0, ascending=[False])
    # Keep only items that occur in at least two antecedents.
    elimcounted = sortcounted[sortcounted['counter'] >= 2]

    listfrequent = list(elimcounted.iloc[:, 0].values)