├── DataSet3.txt ├── DataSet1.txt ├── README.md ├── DataSet5.txt ├── DataSet2.txt ├── DataSet4.txt └── apriori.py /DataSet3.txt: -------------------------------------------------------------------------------- 1 | diaper toys scooter bike skate fan AC curtain 2 | diaper toys scooter skate paint lamp bulb 3 | toys scooter bike mattress skate 4 | mattress bed pillow paint AC fan 5 | toys mattress bed pillow 6 | scooter mattress bed pillow 7 | diaper mattress pillow 8 | diaper scooter toys mattress pillow 9 | pillow diaper bike 10 | scooter diaper bed 11 | lamp fan bulb AC 12 | lamp paint curtain 13 | fan bulb AC 14 | paint lamp AC 15 | skate bike bulb 16 | fan AC skate 17 | skate AC lamp bulb 18 | bike scooter AC 19 | paint AC skate diaper toys 20 | toys skate scooter bike 21 | -------------------------------------------------------------------------------- /DataSet1.txt: -------------------------------------------------------------------------------- 1 | books camera laptop headphones table 2 | cellphone monitor mouse TV keyboard 3 | clothes shoes cosmetics headphones TV 4 | cellphone TV books chair 5 | shoes camera clothes cosmetics 6 | cellphone books 7 | cosmetics shoes books 8 | mouse keyboard headphones cosmetics 9 | chair table books 10 | cellphone laptop TV 11 | camera laptop clothes chair 12 | table mouse books shoes 13 | cosmetics clothes shoes laptop 14 | chair mouse TV camera 15 | headphones monitor laptop camera 16 | books mouse laptop cosmetics 17 | clothes cosmetics cellphone TV 18 | table chair headphones shoes 19 | camera cosmetics laptop cellphone 20 | books camera clothes cellphone TV 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Association Rule Mining in Python 2 | 3 | File Description: 4 | - apriori.py : Python implementation of the apriori algorithm. 
This code reads a transactional database file specified by the user and, based on the user-specified support and confidence values, generates frequent itemsets and association rules. 5 | - DataSetx.txt : (x: 1,2,3,4,5) Five different dataset files containing transactions, one transaction per line. 6 | 7 | Usage: 8 | - In apriori.py, change the dataset file name (`DataSet5.txt` by default) to the dataset file to use. 9 | - Run the program from the command line: python apriori.py 10 | - When prompted, provide the support and confidence values as whole-number percentages 11 | - Frequent itemsets and association rules are generated and printed 12 | -------------------------------------------------------------------------------- /DataSet5.txt: -------------------------------------------------------------------------------- 1 | chocolate candy coke mango peanut 2 | pen luggage comforter bag cap detergent towel 3 | chocolate pen candy luggage 4 | chococate candy coke 5 | pen luggage comforter bag 6 | chocolate candy coke mango 7 | pen luggage comforter bag cap 8 | pen luggage comforter bag cap detergent 9 | pen luggage comforter 10 | candy mango luggage bag 11 | coke peanut detergent towel 12 | mango pen comforter cap 13 | bag cap towel peanut 14 | pen luggage comforter chocolate candy 15 | pen luggage comforter mango peanut 16 | comforter bag cap candy coke mango 17 | peanut candy cap towel 18 | pen peanut comforter chocolate 19 | pen peanut comforter chocolate cap candy 20 | chocolate peanut mango luggage bag towel detergent 21 | -------------------------------------------------------------------------------- /DataSet2.txt: -------------------------------------------------------------------------------- 1 | refrigerator microwave dishwasher freezer juicer expresso 2 | refrigerator microwave dishwasher juicer jewelry 3 | refrigerator microwave dishwasher expresso 4 | microwave juicer jewelry tablet 5 | microwave juicer expresso jewelry tablet 6 | microwave dishwasher freezer juicer expresso 7 | dishwasher freezer juicer expresso 8 | freezer juicer
expresso jewelry tablet 9 | dishwasher freezer expresso tablet 10 | dishwasher freezer juicer tablet 11 | tablet speakers jewelry microwave 12 | DSLR printer tablet speakers 13 | freezer DSLR printer 14 | printer tablet jewelry 15 | microwave speakers DSLR 16 | speakers jewelry microwave 17 | speakers jewelry tablet DSLR 18 | refrigerator freezer expresso DSLR 19 | refrigerator microwave DSLR printer 20 | speakers expresso juicer jewelry 21 | -------------------------------------------------------------------------------- /DataSet4.txt: -------------------------------------------------------------------------------- 1 | toothpaste brush milk cereals honey bread butter cheese yogurt 2 | milk cereals honey bread cheese razor gel shampoo 3 | milk cereals honey cheese soap shampoo 4 | honey bread butter cheese mouthwash toothpaste 5 | cereals honey bread butter gel soap 6 | cheesse yogurt milk cereals honey shampoo gel 7 | honey bread cheese razor butter yogurt 8 | honey bread cheese butter milk 9 | cereals butter cookies chips 10 | cerals cheese yogurt cookies chips 11 | toothpaste brush gel shampoo soap cookies chips 12 | toothpaste brush gel razor mouthwash milk cookies 13 | razor shampoo gel soap bread butter 14 | brush shampoo gel toothpaste mouthwash bread cheese 15 | mouthwash toothpaste soap shampoo cheese yogurt 16 | razor mouthwash soap butter bread cheese 17 | shampoo soap gel milk honey cereals 18 | toothpaste razor gel brush mouthwash shampoo 19 | gel razor shampoo milk cereals bread cookies 20 | mouthwash toothpaste milk bread cookies 21 | -------------------------------------------------------------------------------- /apriori.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | """prompt user to enter support and confidence values in percent""" 4 | 5 | support = int(raw_input("Please enter support value in %: ")) 6 | confidence = int(raw_input("Please enter confidence value in %: ")) 7 | 8 | 
"""Compute candidate 1-itemset""" 9 | C1 = {} 10 | """total number of transactions contained in the file""" 11 | transactions = 0 12 | D = [] 13 | T = [] 14 | with open("DataSet5.txt", "r") as f: 15 | for line in f: 16 | T = [] 17 | transactions += 1 18 | for word in line.split(): 19 | T.append(word) 20 | if word not in C1.keys(): 21 | C1[word] = 1 22 | else: 23 | count = C1[word] 24 | C1[word] = count + 1 25 | D.append(T) 26 | print "-------------------------TEST DATASET----------------------------" 27 | print D 28 | print "-----------------------------------------------------------------" 29 | #print "--------------------CANDIDATE 1-ITEMSET------------------------- " 30 | #print C1 31 | #print "-----------------------------------------------------------------" 32 | 33 | """Compute frequent 1-itemset""" 34 | L1 = [] 35 | for key in C1: 36 | if (100 * C1[key]/transactions) >= support: 37 | list = [] 38 | list.append(key) 39 | L1.append(list) 40 | print "----------------------FREQUENT 1-ITEMSET-------------------------" 41 | print L1 42 | print "-----------------------------------------------------------------" 43 | 44 | """apriori_gen function to compute candidate k-itemset, (Ck) , using frequent (k-1)-itemset, (Lk_1)""" 45 | 46 | def apriori_gen(Lk_1, k): 47 | length = k 48 | Ck = [] 49 | for list1 in Lk_1: 50 | for list2 in Lk_1: 51 | count = 0 52 | c = [] 53 | if list1 != list2: 54 | while count < length-1: 55 | if list1[count] != list2[count]: 56 | break 57 | else: 58 | count += 1 59 | else: 60 | if list1[length-1] < list2[length-1]: 61 | for item in list1: 62 | c.append(item) 63 | c.append(list2[length-1]) 64 | if not has_infrequent_subset(c, Lk_1, k): 65 | Ck.append(c) 66 | c = [] 67 | return Ck 68 | 69 | """function to compute 'm' element subsets of a set S""" 70 | 71 | def findsubsets(S,m): 72 | return set(itertools.combinations(S, m)) 73 | 74 | """has_infrequent_subsets function to determine if pruning is required to remove unfruitful candidates (c) using 
the Apriori property, with prior knowledge of frequent (k-1)-itemset (Lk_1)""" 75 | 76 | def has_infrequent_subset(c, Lk_1, k): 77 | list = [] 78 | list = findsubsets(c,k) 79 | for item in list: 80 | s = [] 81 | for l in item: 82 | s.append(l) 83 | s.sort() 84 | if s not in Lk_1: 85 | return True 86 | return False 87 | 88 | 89 | """frequent_itemsets function to compute all frequent itemsets""" 90 | 91 | def frequent_itemsets(): 92 | k = 2 93 | Lk_1 = [] 94 | Lk = [] 95 | L = [] 96 | count = 0 97 | transactions = 0 98 | for item in L1: 99 | Lk_1.append(item) 100 | while Lk_1 != []: 101 | Ck = [] 102 | Lk = [] 103 | Ck = apriori_gen(Lk_1, k-1) 104 | #print "-------------------------CANDIDATE %d-ITEMSET---------------------" % k 105 | #print "Ck: %s" % Ck 106 | #print "------------------------------------------------------------------" 107 | for c in Ck: 108 | count = 0 109 | transactions = 0 110 | s = set(c) 111 | for T in D: 112 | transactions += 1 113 | t = set(T) 114 | if s.issubset(t) == True: 115 | count += 1 116 | if (100 * count/transactions) >= support: 117 | c.sort() 118 | Lk.append(c) 119 | Lk_1 = [] 120 | print "-----------------------FREQUENT %d-ITEMSET------------------------" % k 121 | print Lk 122 | print "------------------------------------------------------------------" 123 | for l in Lk: 124 | Lk_1.append(l) 125 | k += 1 126 | if Lk != []: 127 | L.append(Lk) 128 | 129 | return L 130 | 131 | 132 | """generate_association_rules function to mine and print all the association rules with given support and confidence value""" 133 | 134 | def generate_association_rules(): 135 | s = [] 136 | r = [] 137 | length = 0 138 | count = 1 139 | inc1 = 0 140 | inc2 = 0 141 | num = 1 142 | m = [] 143 | L= frequent_itemsets() 144 | print "---------------------ASSOCIATION RULES------------------" 145 | print "RULES \t SUPPORT \t CONFIDENCE" 146 | print "--------------------------------------------------------" 147 | for list in L: 148 | for l in list: 149 | length = 
len(l) 150 | count = 1 151 | while count < length: 152 | s = [] 153 | r = findsubsets(l,count) 154 | count += 1 155 | for item in r: 156 | inc1 = 0 157 | inc2 = 0 158 | s = [] 159 | m = [] 160 | for i in item: 161 | s.append(i) 162 | for T in D: 163 | if set(s).issubset(set(T)) == True: 164 | inc1 += 1 165 | if set(l).issubset(set(T)) == True: 166 | inc2 += 1 167 | if 100*inc2/inc1 >= confidence: 168 | for index in l: 169 | if index not in s: 170 | m.append(index) 171 | print "Rule# %d : %s ==> %s %d %d" %(num, s, m, 100*inc2/len(D), 100*inc2/inc1) 172 | num += 1 173 | 174 | generate_association_rules() 175 | print "--------------------------------------------------------" 176 | --------------------------------------------------------------------------------