├── .gitignore
├── Apriori-algorithm-using-HashTree.py
├── FPTree-algorithm.py
├── README.md
├── basket-dataset.txt
├── chess-dataset.txt
└── small-test-input.txt

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/Apriori-algorithm-using-HashTree.py:
--------------------------------------------------------------------------------
import itertools
import time

# take the filename and minimum support count as input
print("Enter the filename:")
filename = input()
print("Enter the minimum support count:")
min_support = int(input())

# read transactions from the txt file
with open(filename) as f:
    content = f.readlines()

content = [x.strip() for x in content]

Transaction = []           # to store transactions
Frequent_items_value = {}  # to store all frequent itemsets with their support counts

# fill the transaction list from the txt file
for i in range(0, len(Transaction) + len(content)):
    if i < len(content):
        Transaction.append(content[i].split())

# function to get the frequent 1-itemsets
def frequent_one_item(Transaction, min_support):
    candidate1 = {}

    for i in range(0, len(Transaction)):
        for j in range(0, len(Transaction[i])):
            if Transaction[i][j] not in candidate1:
                candidate1[Transaction[i][j]] = 1
            else:
                candidate1[Transaction[i][j]] += 1

    frequentitem1 = []  # frequent 1-itemsets with minimum support count
    for value in candidate1:
        if candidate1[value] >= min_support:
            frequentitem1.append([value])
            # store the item as a one-element tuple; tuple(value) would split a
            # multi-character item such as '12' into ('1', '2')
            Frequent_items_value[(value,)] = candidate1[value]

    return frequentitem1

values = frequent_one_item(Transaction, min_support)
print(values)
print(Frequent_items_value)

# remove infrequent items from every transaction
Transaction1 = []
for i in range(0, len(Transaction)):
    list_val = []
    for j in range(0, len(Transaction[i])):
        if [Transaction[i][j]] in values:
            list_val.append(Transaction[i][j])
    Transaction1.append(list_val)
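# For illustration, a hypothetical run on the bundled small-test-input.txt with
# min_support = 2 (item order follows first appearance in the data):
#   frequent 1-itemsets: [['1'], ['2'], ['5'], ['4'], ['3']]
#   support counts: {('1',): 6, ('2',): 7, ('5',): 2, ('4',): 2, ('3',): 6}
# Every item clears the threshold here, so Transaction1 keeps all transactions
# unchanged.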
# class for a node of the hash tree
class Hash_node:
    def __init__(self):
        self.children = {}       # pointers to child nodes
        self.Leaf_status = True  # whether the current node is a leaf
        self.bucket = {}         # itemsets stored in this leaf's bucket

# class for constructing and querying the hash tree
class HashTree:
    # class constructor
    def __init__(self, max_leaf_count, max_child_count):
        self.root = Hash_node()
        self.max_leaf_count = max_leaf_count
        self.max_child_count = max_child_count
        self.frequent_itemsets = []

    # recursive insertion used to build the hash tree
    def recursively_insert(self, node, itemset, index, count):
        if index == len(itemset):
            if itemset in node.bucket:
                node.bucket[itemset] += count
            else:
                node.bucket[itemset] = count
            return

        if node.Leaf_status:  # if the node is a leaf
            if itemset in node.bucket:
                node.bucket[itemset] += count
            else:
                node.bucket[itemset] = count
            if len(node.bucket) == self.max_leaf_count:  # bucket capacity exceeded, split the leaf
                for old_itemset, old_count in node.bucket.items():
                    hash_key = self.hash_function(old_itemset[index])  # hash on the next index
                    if hash_key not in node.children:
                        node.children[hash_key] = Hash_node()
                    self.recursively_insert(node.children[hash_key], old_itemset, index + 1, old_count)
                # the bucket is no longer needed on an interior node
                del node.bucket
                node.Leaf_status = False
        else:  # if the node is not a leaf
            hash_key = self.hash_function(itemset[index])
            if hash_key not in node.children:
                node.children[hash_key] = Hash_node()
            self.recursively_insert(node.children[hash_key], itemset, index + 1, count)

    def insert(self, itemset):
        itemset = tuple(itemset)
        self.recursively_insert(self.root, itemset, 0, 0)

    # add support to a candidate itemset: traverse the tree and find the bucket
    # in which this itemset is stored
    def add_support(self, itemset):
        node = self.root
        itemset = tuple(itemset)
        index = 0
        while True:
            if node.Leaf_status:
                if itemset in node.bucket:     # found the itemset in this bucket
                    node.bucket[itemset] += 1  # increment its count
                break
            hash_key = self.hash_function(itemset[index])
            if hash_key in node.children:
                node = node.children[hash_key]
            else:
                break
            index += 1

    # traverse the hash tree and collect itemsets with at least the minimum support count
    def get_frequent_itemsets(self, node, support_count, frequent_itemsets):
        if node.Leaf_status:
            for key, value in node.bucket.items():
                if value >= support_count:               # meets the minimum support
                    frequent_itemsets.append(list(key))  # add it to the frequent itemsets
                    Frequent_items_value[key] = value
            return

        for child in node.children.values():
            self.get_frequent_itemsets(child, support_count, frequent_itemsets)

    # hash function used to route itemsets through the tree; items are assumed
    # to be numeric strings, as in the bundled datasets
    def hash_function(self, val):
        return int(val) % self.max_child_count
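# A minimal sketch of how candidates are routed (assuming max_child_count = 3,
# so hash_function(v) = int(v) % 3): the candidate ('2', '5') first lands in the
# root bucket; once the root splits, it is hashed on '2' at depth 0 (2 % 3 = 2)
# and, if that child splits too, on '5' at depth 1 (5 % 3 = 2). Candidates that
# share hash values along the way end up in the same bucket, so add_support
# only has to inspect one bucket per transaction subset.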
# generate a hash tree from the candidate itemsets
def generate_hash_tree(candidate_itemsets, max_leaf_count, max_child_count):
    # note the argument order: the constructor expects (max_leaf_count, max_child_count)
    htree = HashTree(max_leaf_count, max_child_count)
    for itemset in candidate_itemsets:
        htree.insert(itemset)  # insert the itemset into the hash tree
    return htree

# generate all subsets of size `length` from each transaction
def generate_k_subsets(dataset, length):
    subsets = []
    for itemset in dataset:
        subsets.extend(map(list, itertools.combinations(itemset, length)))
    return subsets

def subset_generation(ck_data, l):
    return map(list, set(itertools.combinations(ck_data, l)))

# apriori candidate generation: build Ck from L(k-1)
def apriori_generate(dataset, k):
    ck = []
    # join step: merge itemsets whose first k-2 items agree
    lenlk = len(dataset)
    for i in range(lenlk):
        for j in range(i + 1, lenlk):
            L1 = list(dataset[i])[:k - 2]
            L2 = list(dataset[j])[:k - 2]
            if L1 == L2:
                ck.append(sorted(list(set(dataset[i]) | set(dataset[j]))))

    # prune step: drop candidates that have an infrequent (k-1)-subset
    final_ck = []
    for candidate in ck:
        all_subsets = list(subset_generation(set(candidate), k - 1))
        found = True
        for i in range(len(all_subsets)):
            value = list(sorted(all_subsets[i]))
            if value not in dataset:
                found = False
        if found:
            final_ck.append(candidate)

    return ck, final_ck
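# For illustration (hypothetical values): joining L2 = [['1', '2'], ['1', '3'],
# ['2', '3']] with k = 3 produces the single candidate ['1', '2', '3'], since
# only the first two itemsets share the prefix ['1']; the prune step keeps it
# because all three of its 2-item subsets appear in L2.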
def generateL(ck, min_support):
    support_ck = {}
    for val in Transaction1:
        for val1 in ck:
            value = set(val)
            value1 = set(val1)
            if value1.issubset(value):
                if tuple(val1) not in support_ck:
                    support_ck[tuple(val1)] = 1
                else:
                    support_ck[tuple(val1)] += 1
    frequent_item = []
    for item_set in support_ck:
        if support_ck[item_set] >= min_support:
            frequent_item.append(sorted(list(item_set)))
            Frequent_items_value[item_set] = support_ck[item_set]

    return frequent_item

# main apriori algorithm function
def apriori(L1, min_support):
    k = 2
    L = []
    L.append(0)
    L.append(L1)
    print("enter max_leaf_count")   # bucket capacity of each leaf node
    max_leaf_count = int(input())
    print("enter max_child_count")  # maximum number of children per node
    max_child_count = int(input())

    start = time.time()
    while len(L[k - 1]) > 0:
        ck, final_ck = apriori_generate(L[k - 1], k)  # generate candidate itemsets
        print("C%d" % k)
        print(final_ck)
        if k == 2:
            # for k = 2, count support with the hash tree
            h_tree = generate_hash_tree(ck, max_leaf_count, max_child_count)
            k_subsets = generate_k_subsets(Transaction1, k)  # subsets of each transaction
            for subset in k_subsets:
                h_tree.add_support(subset)  # add support counts to itemsets in the hash tree
            lk = []
            h_tree.get_frequent_itemsets(h_tree.root, min_support, lk)
        else:
            # for k > 2, fall back to counting support by scanning the transactions directly
            lk = generateL(final_ck, min_support)
        print("Frequent %d itemsets" % k)
        print(lk)
        L.append(lk)
        k = k + 1
    end = time.time()
    return L, (end - start)


L_value, time_taken = apriori(values, min_support)
print("Time Taken is:")
print(time_taken)
#print("final L_value")
#print(L_value)
print("All frequent itemsets with their support count:")
print(Frequent_items_value)
--------------------------------------------------------------------------------
/FPTree-algorithm.py:
--------------------------------------------------------------------------------
import time

# load the file and return a list of transactions
def Load_data(filename):
    with open(filename) as f:
        content = f.readlines()

    content = [x.strip() for x in content]
    Transaction = []

    for i in range(0, len(content)):
        Transaction.append(content[i].split())

    return Transaction

# convert the initial transactions into a dictionary keyed by frozenset
def create_initialset(dataset):
    retDict = {}
    for trans in dataset:
        # accumulate counts so that duplicate transactions are not collapsed to 1
        retDict[frozenset(trans)] = retDict.get(frozenset(trans), 0) + 1
    return retDict
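# For illustration (hypothetical run): the first line "1 2 5" of
# small-test-input.txt becomes the key frozenset({'1', '2', '5'}) with count 1,
# and a repeated transaction now increments that count instead of resetting it.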
# class for an FP Tree node
class TreeNode:
    def __init__(self, Node_name, counter, parentNode):
        self.name = Node_name
        self.count = counter
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def increment_counter(self, counter):
        self.count += counter

# build the header table and the FP Tree from ordered itemsets
def create_FPTree(dataset, minSupport):
    HeaderTable = {}
    for transaction in dataset:
        for item in transaction:
            HeaderTable[item] = HeaderTable.get(item, 0) + dataset[transaction]
    for k in list(HeaderTable):
        if HeaderTable[k] < minSupport:
            del(HeaderTable[k])

    frequent_itemset = set(HeaderTable.keys())

    if len(frequent_itemset) == 0:
        return None, None

    for k in HeaderTable:
        HeaderTable[k] = [HeaderTable[k], None]

    retTree = TreeNode('Null Set', 1, None)
    for itemset, count in dataset.items():
        frequent_transaction = {}
        for item in itemset:
            if item in frequent_itemset:
                frequent_transaction[item] = HeaderTable[item][0]
        if len(frequent_transaction) > 0:
            # order items by descending support; ties are broken on the item
            # name so the order is consistent across all transactions
            ordered_itemset = [v[0] for v in sorted(frequent_transaction.items(), key=lambda p: (p[1], p[0]), reverse=True)]
            # update the FP Tree with this ordered transaction
            updateTree(ordered_itemset, retTree, HeaderTable, count)
    return retTree, HeaderTable

# insert an ordered itemset into the FP Tree
def updateTree(itemset, FPTree, HeaderTable, count):
    if itemset[0] in FPTree.children:
        FPTree.children[itemset[0]].increment_counter(count)
    else:
        FPTree.children[itemset[0]] = TreeNode(itemset[0], count, FPTree)

        if HeaderTable[itemset[0]][1] is None:
            HeaderTable[itemset[0]][1] = FPTree.children[itemset[0]]
        else:
            update_NodeLink(HeaderTable[itemset[0]][1], FPTree.children[itemset[0]])

    if len(itemset) > 1:
        updateTree(itemset[1::], FPTree.children[itemset[0]], HeaderTable, count)

# append a new node at the end of a node-link chain
def update_NodeLink(Test_Node, Target_Node):
    while Test_Node.nodeLink is not None:
        Test_Node = Test_Node.nodeLink

    Test_Node.nodeLink = Target_Node

# traverse the FP Tree upwards from a node to collect its prefix path
def FPTree_uptraversal(leaf_Node, prefixPath):
    if leaf_Node.parent is not None:
        prefixPath.append(leaf_Node.name)
        FPTree_uptraversal(leaf_Node.parent, prefixPath)

# find the conditional pattern bases for an item
def find_prefix_path(basePat, tree_node):
    Conditional_patterns_base = {}

    while tree_node is not None:
        prefixPath = []
        FPTree_uptraversal(tree_node, prefixPath)
        if len(prefixPath) > 1:
            Conditional_patterns_base[frozenset(prefixPath[1:])] = tree_node.count
        tree_node = tree_node.nodeLink

    return Conditional_patterns_base

# recursively mine conditional pattern bases and conditional FP Trees
def Mine_Tree(FPTree, HeaderTable, minSupport, prefix, frequent_itemset):
    bigL = [v[0] for v in sorted(HeaderTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:
        new_frequentset = prefix.copy()
        new_frequentset.add(basePat)
        # add the frequent itemset to the final list of frequent itemsets
        frequent_itemset.append(new_frequentset)
        # get all conditional pattern bases for the item
        Conditional_pattern_bases = find_prefix_path(basePat, HeaderTable[basePat][1])
        # build the conditional FP Tree from the pattern bases
        Conditional_FPTree, Conditional_header = create_FPTree(Conditional_pattern_bases, minSupport)

        if Conditional_header is not None:
            Mine_Tree(Conditional_FPTree, Conditional_header, minSupport, new_frequentset, frequent_itemset)

# take the filename and minimum support count as input
print("Enter the filename:")
filename = input()
print("Enter the minimum support count:")
min_Support = int(input())

initSet = create_initialset(Load_data(filename))
start = time.time()
FPtree, HeaderTable = create_FPTree(initSet, min_Support)

frequent_itemset = []
# mine all frequent itemsets; guard against the case where no item meets the minimum support
if FPtree is not None:
    Mine_Tree(FPtree, HeaderTable, min_Support, set([]), frequent_itemset)
end = time.time()

print("Time Taken is:")
print(end - start)
print("All frequent itemsets:")
print(frequent_itemset)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FPTree-Algorithm
Implementation of the FP-Tree algorithm and the Apriori algorithm using a hash tree for finding frequent patterns in a transactional database. Run either script and enter the filename and the minimum support count when prompted. I have also attached two input files, chess-dataset and basket-dataset (retail), from the official site http://fimi.ua.ac.be/data.
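## Example

A sample session with the bundled small-test-input.txt (the values shown are illustrative; the Apriori script additionally asks for max_leaf_count and max_child_count):

```
$ python FPTree-algorithm.py
Enter the filename:
small-test-input.txt
Enter the minimum support count:
2
```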
I have also attached two input files of chess-dataset and basket-datset (retail) from the official site http://fimi.ua.ac.be/data. 4 | -------------------------------------------------------------------------------- /small-test-input.txt: -------------------------------------------------------------------------------- 1 | 1 2 5 2 | 2 4 3 | 2 3 4 | 1 2 4 5 | 1 3 6 | 2 3 7 | 1 3 8 | 1 2 3 5 9 | 1 2 3 --------------------------------------------------------------------------------