├── .gitignore
├── Apriori-algorithm-using-HashTree.py
├── FPTree-algorithm.py
├── README.md
├── basket-dataset.txt
├── chess-dataset.txt
└── small-test-input.txt

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/Apriori-algorithm-using-HashTree.py:
--------------------------------------------------------------------------------
import itertools
import time

# take the filename and minimum support count as input
print("Enter the filename:")
filename = input()
print("Enter the minimum support count:")
min_support = int(input())

# read transactions from the txt file
with open(filename) as f:
    content = f.readlines()

content = [x.strip() for x in content]

Transaction = []           # to store transactions
Frequent_items_value = {}  # to store all frequent itemsets with their support counts

# fill the transaction list from the txt file
for i in range(0, len(Transaction) + len(content)):
    if i < len(content):
        Transaction.append(content[i].split())

# function to get the frequent 1-itemsets
def frequent_one_item(Transaction, min_support):
    candidate1 = {}

    for i in range(0, len(Transaction)):
        for j in range(0, len(Transaction[i])):
            if Transaction[i][j] not in candidate1:
                candidate1[Transaction[i][j]] = 1
            else:
                candidate1[Transaction[i][j]] += 1

    frequentitem1 = []  # frequent 1-itemsets with minimum support count
    for value in candidate1:
        if candidate1[value] >= min_support:
            frequentitem1.append([value])
            # store the item as a one-element tuple; tuple(value) would split a
            # multi-character item such as '12' into ('1', '2')
            Frequent_items_value[(value,)] = candidate1[value]

    return frequentitem1

values = frequent_one_item(Transaction, min_support)
print(values)
print(Frequent_items_value)

# remove infrequent items from every transaction
Transaction1 = []
for i in range(0, len(Transaction)):
    list_val = []
    for j in range(0, len(Transaction[i])):
        if [Transaction[i][j]] in values:
            list_val.append(Transaction[i][j])
    Transaction1.append(list_val)
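# For illustration, a hypothetical run on the bundled small-test-input.txt with
# min_support = 2 (item order follows first appearance in the data):
#   frequent 1-itemsets: [['1'], ['2'], ['5'], ['4'], ['3']]
#   support counts: {('1',): 6, ('2',): 7, ('5',): 2, ('4',): 2, ('3',): 6}
# Every item clears the threshold here, so Transaction1 keeps all transactions
# unchanged.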
# class for a node of the hash tree
class Hash_node:
    def __init__(self):
        self.children = {}       # pointers to child nodes
        self.Leaf_status = True  # whether the current node is a leaf
        self.bucket = {}         # itemsets stored in this leaf's bucket

# class for constructing and querying the hash tree
class HashTree:
    # class constructor
    def __init__(self, max_leaf_count, max_child_count):
        self.root = Hash_node()
        self.max_leaf_count = max_leaf_count
        self.max_child_count = max_child_count
        self.frequent_itemsets = []

    # recursive insertion used to build the hash tree
    def recursively_insert(self, node, itemset, index, count):
        if index == len(itemset):
            if itemset in node.bucket:
                node.bucket[itemset] += count
            else:
                node.bucket[itemset] = count
            return

        if node.Leaf_status:  # if the node is a leaf
            if itemset in node.bucket:
                node.bucket[itemset] += count
            else:
                node.bucket[itemset] = count
            if len(node.bucket) == self.max_leaf_count:  # bucket capacity exceeded, split the leaf
                for old_itemset, old_count in node.bucket.items():
                    hash_key = self.hash_function(old_itemset[index])  # hash on the next index
                    if hash_key not in node.children:
                        node.children[hash_key] = Hash_node()
                    self.recursively_insert(node.children[hash_key], old_itemset, index + 1, old_count)
                # the bucket is no longer needed on an interior node
                del node.bucket
                node.Leaf_status = False
        else:  # if the node is not a leaf
            hash_key = self.hash_function(itemset[index])
            if hash_key not in node.children:
                node.children[hash_key] = Hash_node()
            self.recursively_insert(node.children[hash_key], itemset, index + 1, count)

    def insert(self, itemset):
        itemset = tuple(itemset)
        self.recursively_insert(self.root, itemset, 0, 0)

    # add support to a candidate itemset: traverse the tree and find the bucket
    # in which this itemset is stored
    def add_support(self, itemset):
        node = self.root
        itemset = tuple(itemset)
        index = 0
        while True:
            if node.Leaf_status:
                if itemset in node.bucket:     # found the itemset in this bucket
                    node.bucket[itemset] += 1  # increment its count
                break
            hash_key = self.hash_function(itemset[index])
            if hash_key in node.children:
                node = node.children[hash_key]
            else:
                break
            index += 1

    # traverse the hash tree and collect itemsets with at least the minimum support count
    def get_frequent_itemsets(self, node, support_count, frequent_itemsets):
        if node.Leaf_status:
            for key, value in node.bucket.items():
                if value >= support_count:               # meets the minimum support
                    frequent_itemsets.append(list(key))  # add it to the frequent itemsets
                    Frequent_items_value[key] = value
            return

        for child in node.children.values():
            self.get_frequent_itemsets(child, support_count, frequent_itemsets)

    # hash function used to route itemsets through the tree; items are assumed
    # to be numeric strings, as in the bundled datasets
    def hash_function(self, val):
        return int(val) % self.max_child_count
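# A minimal sketch of how candidates are routed (assuming max_child_count = 3,
# so hash_function(v) = int(v) % 3): the candidate ('2', '5') first lands in the
# root bucket; once the root splits, it is hashed on '2' at depth 0 (2 % 3 = 2)
# and, if that child splits too, on '5' at depth 1 (5 % 3 = 2). Candidates that
# share hash values along the way end up in the same bucket, so add_support
# only has to inspect one bucket per transaction subset.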
# generate a hash tree from the candidate itemsets
def generate_hash_tree(candidate_itemsets, max_leaf_count, max_child_count):
    # note the argument order: the constructor expects (max_leaf_count, max_child_count)
    htree = HashTree(max_leaf_count, max_child_count)
    for itemset in candidate_itemsets:
        htree.insert(itemset)  # insert the itemset into the hash tree
    return htree

# generate all subsets of size `length` from each transaction
def generate_k_subsets(dataset, length):
    subsets = []
    for itemset in dataset:
        subsets.extend(map(list, itertools.combinations(itemset, length)))
    return subsets

def subset_generation(ck_data, l):
    return map(list, set(itertools.combinations(ck_data, l)))

# apriori candidate generation: build Ck from L(k-1)
def apriori_generate(dataset, k):
    ck = []
    # join step: merge itemsets whose first k-2 items agree
    lenlk = len(dataset)
    for i in range(lenlk):
        for j in range(i + 1, lenlk):
            L1 = list(dataset[i])[:k - 2]
            L2 = list(dataset[j])[:k - 2]
            if L1 == L2:
                ck.append(sorted(list(set(dataset[i]) | set(dataset[j]))))

    # prune step: drop candidates that have an infrequent (k-1)-subset
    final_ck = []
    for candidate in ck:
        all_subsets = list(subset_generation(set(candidate), k - 1))
        found = True
        for i in range(len(all_subsets)):
            value = list(sorted(all_subsets[i]))
            if value not in dataset:
                found = False
        if found:
            final_ck.append(candidate)

    return ck, final_ck
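# For illustration (hypothetical values): joining L2 = [['1', '2'], ['1', '3'],
# ['2', '3']] with k = 3 produces the single candidate ['1', '2', '3'], since
# only the first two itemsets share the prefix ['1']; the prune step keeps it
# because all three of its 2-item subsets appear in L2.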
def generateL(ck, min_support):
    support_ck = {}
    for val in Transaction1:
        for val1 in ck:
            value = set(val)
            value1 = set(val1)
            if value1.issubset(value):
                if tuple(val1) not in support_ck:
                    support_ck[tuple(val1)] = 1
                else:
                    support_ck[tuple(val1)] += 1
    frequent_item = []
    for item_set in support_ck:
        if support_ck[item_set] >= min_support:
            frequent_item.append(sorted(list(item_set)))
            Frequent_items_value[item_set] = support_ck[item_set]

    return frequent_item

# main apriori algorithm function
def apriori(L1, min_support):
    k = 2
    L = []
    L.append(0)
    L.append(L1)
    print("enter max_leaf_count")   # bucket capacity of each leaf node
    max_leaf_count = int(input())
    print("enter max_child_count")  # maximum number of children per node
    max_child_count = int(input())

    start = time.time()
    while len(L[k - 1]) > 0:
        ck, final_ck = apriori_generate(L[k - 1], k)  # generate candidate itemsets
        print("C%d" % k)
        print(final_ck)
        if k == 2:
            # for k = 2, count support with the hash tree
            h_tree = generate_hash_tree(ck, max_leaf_count, max_child_count)
            k_subsets = generate_k_subsets(Transaction1, k)  # subsets of each transaction
            for subset in k_subsets:
                h_tree.add_support(subset)  # add support counts to itemsets in the hash tree
            lk = []
            h_tree.get_frequent_itemsets(h_tree.root, min_support, lk)
        else:
            # for k > 2, fall back to counting support by scanning the transactions directly
            lk = generateL(final_ck, min_support)
        print("Frequent %d itemsets" % k)
        print(lk)
        L.append(lk)
        k = k + 1
    end = time.time()
    return L, (end - start)


L_value, time_taken = apriori(values, min_support)
print("Time Taken is:")
print(time_taken)
#print("final L_value")
#print(L_value)
print("All frequent itemsets with their support count:")
print(Frequent_items_value)
--------------------------------------------------------------------------------
/FPTree-algorithm.py:
--------------------------------------------------------------------------------
import time

# load the file and return a list of transactions
def Load_data(filename):
    with open(filename) as f:
        content = f.readlines()

    content = [x.strip() for x in content]
    Transaction = []

    for i in range(0, len(content)):
        Transaction.append(content[i].split())

    return Transaction

# convert the initial transactions into a dictionary keyed by frozenset
def create_initialset(dataset):
    retDict = {}
    for trans in dataset:
        # accumulate counts so that duplicate transactions are not collapsed to 1
        retDict[frozenset(trans)] = retDict.get(frozenset(trans), 0) + 1
    return retDict
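# For illustration (hypothetical run): the first line "1 2 5" of
# small-test-input.txt becomes the key frozenset({'1', '2', '5'}) with count 1,
# and a repeated transaction now increments that count instead of resetting it.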
# class for an FP Tree node
class TreeNode:
    def __init__(self, Node_name, counter, parentNode):
        self.name = Node_name
        self.count = counter
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def increment_counter(self, counter):
        self.count += counter

# build the header table and the FP Tree from ordered itemsets
def create_FPTree(dataset, minSupport):
    HeaderTable = {}
    for transaction in dataset:
        for item in transaction:
            HeaderTable[item] = HeaderTable.get(item, 0) + dataset[transaction]
    for k in list(HeaderTable):
        if HeaderTable[k] < minSupport:
            del(HeaderTable[k])

    frequent_itemset = set(HeaderTable.keys())

    if len(frequent_itemset) == 0:
        return None, None

    for k in HeaderTable:
        HeaderTable[k] = [HeaderTable[k], None]

    retTree = TreeNode('Null Set', 1, None)
    for itemset, count in dataset.items():
        frequent_transaction = {}
        for item in itemset:
            if item in frequent_itemset:
                frequent_transaction[item] = HeaderTable[item][0]
        if len(frequent_transaction) > 0:
            # order items by descending support; ties are broken on the item
            # name so the order is consistent across all transactions
            ordered_itemset = [v[0] for v in sorted(frequent_transaction.items(), key=lambda p: (p[1], p[0]), reverse=True)]
            # update the FP Tree with this ordered transaction
            updateTree(ordered_itemset, retTree, HeaderTable, count)
    return retTree, HeaderTable

# insert an ordered itemset into the FP Tree
def updateTree(itemset, FPTree, HeaderTable, count):
    if itemset[0] in FPTree.children:
        FPTree.children[itemset[0]].increment_counter(count)
    else:
        FPTree.children[itemset[0]] = TreeNode(itemset[0], count, FPTree)

        if HeaderTable[itemset[0]][1] is None:
            HeaderTable[itemset[0]][1] = FPTree.children[itemset[0]]
        else:
            update_NodeLink(HeaderTable[itemset[0]][1], FPTree.children[itemset[0]])

    if len(itemset) > 1:
        updateTree(itemset[1::], FPTree.children[itemset[0]], HeaderTable, count)

# append a new node at the end of a node-link chain
def update_NodeLink(Test_Node, Target_Node):
    while Test_Node.nodeLink is not None:
        Test_Node = Test_Node.nodeLink

    Test_Node.nodeLink = Target_Node

# traverse the FP Tree upwards from a node to collect its prefix path
def FPTree_uptraversal(leaf_Node, prefixPath):
    if leaf_Node.parent is not None:
        prefixPath.append(leaf_Node.name)
        FPTree_uptraversal(leaf_Node.parent, prefixPath)

# find the conditional pattern bases for an item
def find_prefix_path(basePat, tree_node):
    Conditional_patterns_base = {}

    while tree_node is not None:
        prefixPath = []
        FPTree_uptraversal(tree_node, prefixPath)
        if len(prefixPath) > 1:
            Conditional_patterns_base[frozenset(prefixPath[1:])] = tree_node.count
        tree_node = tree_node.nodeLink

    return Conditional_patterns_base

# recursively mine conditional pattern bases and conditional FP Trees
def Mine_Tree(FPTree, HeaderTable, minSupport, prefix, frequent_itemset):
    bigL = [v[0] for v in sorted(HeaderTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:
        new_frequentset = prefix.copy()
        new_frequentset.add(basePat)
        # add the frequent itemset to the final list of frequent itemsets
        frequent_itemset.append(new_frequentset)
        # get all conditional pattern bases for the item
        Conditional_pattern_bases = find_prefix_path(basePat, HeaderTable[basePat][1])
        # build the conditional FP Tree from the pattern bases
        Conditional_FPTree, Conditional_header = create_FPTree(Conditional_pattern_bases, minSupport)

        if Conditional_header is not None:
            Mine_Tree(Conditional_FPTree, Conditional_header, minSupport, new_frequentset, frequent_itemset)

# take the filename and minimum support count as input
print("Enter the filename:")
filename = input()
print("Enter the minimum support count:")
min_Support = int(input())

initSet = create_initialset(Load_data(filename))
start = time.time()
FPtree, HeaderTable = create_FPTree(initSet, min_Support)

frequent_itemset = []
# mine all frequent itemsets; guard against the case where no item meets the minimum support
if FPtree is not None:
    Mine_Tree(FPtree, HeaderTable, min_Support, set([]), frequent_itemset)
end = time.time()

print("Time Taken is:")
print(end - start)
print("All frequent itemsets:")
print(frequent_itemset)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FPTree-Algorithm
Implementation of the FP-Tree algorithm and the Apriori algorithm using a hash tree for finding frequent patterns in a transactional database. Run either script and enter the filename and the minimum support count when prompted. I have also attached two input files, chess-dataset and basket-dataset (retail), from the official site http://fimi.ua.ac.be/data.
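## Example

A sample session with the bundled small-test-input.txt (the values shown are illustrative; the Apriori script additionally asks for max_leaf_count and max_child_count):

```
$ python FPTree-algorithm.py
Enter the filename:
small-test-input.txt
Enter the minimum support count:
2
```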
I have also attached two input files of chess-dataset and basket-datset (retail) from the official site http://fimi.ua.ac.be/data. 4 | -------------------------------------------------------------------------------- /small-test-input.txt: -------------------------------------------------------------------------------- 1 | 1 2 5 2 | 2 4 3 | 2 3 4 | 1 2 4 5 | 1 3 6 | 2 3 7 | 1 3 8 | 1 2 3 5 9 | 1 2 3 --------------------------------------------------------------------------------