├── DataSet3.txt
├── DataSet1.txt
├── README.md
├── DataSet5.txt
├── DataSet2.txt
├── DataSet4.txt
└── apriori.py


/DataSet3.txt:
--------------------------------------------------------------------------------
 1 | diaper toys scooter bike skate fan AC curtain 
 2 | diaper toys scooter skate paint lamp bulb
 3 | toys scooter bike mattress skate 
 4 | mattress bed pillow paint AC fan
 5 | toys mattress bed pillow 
 6 | scooter mattress bed pillow
 7 | diaper mattress pillow
 8 | diaper scooter toys mattress pillow
 9 | pillow diaper bike
10 | scooter diaper bed
11 | lamp fan bulb AC
12 | lamp paint curtain 
13 | fan bulb AC
14 | paint lamp AC
15 | skate bike bulb
16 | fan AC skate
17 | skate AC lamp bulb
18 | bike scooter AC
19 | paint AC skate diaper toys 
20 | toys skate scooter bike
21 | 


--------------------------------------------------------------------------------
/DataSet1.txt:
--------------------------------------------------------------------------------
 1 | books camera laptop headphones table
 2 | cellphone monitor mouse TV keyboard
 3 | clothes shoes cosmetics headphones TV
 4 | cellphone TV books chair 
 5 | shoes camera clothes cosmetics
 6 | cellphone books 
 7 | cosmetics shoes books 
 8 | mouse keyboard headphones cosmetics
 9 | chair table books 
10 | cellphone laptop TV 
11 | camera laptop clothes chair
12 | table mouse books shoes 
13 | cosmetics clothes shoes laptop
14 | chair mouse TV camera 
15 | headphones monitor laptop camera 
16 | books mouse laptop cosmetics
17 | clothes cosmetics cellphone TV
18 | table chair headphones shoes
19 | camera cosmetics laptop cellphone
20 | books camera clothes cellphone TV 
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Association Rule Mining in Python
 2 | 
 3 | File Description:
 4 | - apriori.py : Python implementation of the Apriori algorithm. It reads a transaction database file and generates the frequent itemsets and association rules that meet the minimum support and confidence values entered by the user.
 5 | - DataSetx.txt : (x: 1,2,3,4,5) Five different dataset files containing transactions.
 6 | 
 7 | Usage:
 8 | - Edit the `open("DataSet5.txt", "r")` call near the top of apriori.py (around line 14) to specify the name of the dataset file to use.
 9 | - Run the program from the command line: python apriori.py
10 | - When prompted, enter the minimum support and confidence values as whole-number percentages (for example, 30 for 30%)
11 | - Frequent itemsets and association rules are generated
12 | 


--------------------------------------------------------------------------------
/DataSet5.txt:
--------------------------------------------------------------------------------
 1 | chocolate candy coke mango peanut
 2 | pen luggage comforter bag cap detergent towel 
 3 | chocolate pen candy luggage 
 4 | chocolate candy coke 
 5 | pen luggage comforter bag
 6 | chocolate candy coke mango
 7 | pen luggage comforter bag cap 
 8 | pen luggage comforter bag cap detergent 
 9 | pen luggage comforter
10 | candy mango luggage bag
11 | coke peanut detergent towel
12 | mango pen comforter cap
13 | bag cap towel peanut
14 | pen luggage comforter chocolate candy
15 | pen luggage comforter mango peanut
16 | comforter bag cap candy coke mango
17 | peanut candy cap towel
18 | pen peanut comforter chocolate 
19 | pen peanut comforter chocolate cap candy
20 | chocolate peanut mango luggage bag towel detergent 
21 | 


--------------------------------------------------------------------------------
/DataSet2.txt:
--------------------------------------------------------------------------------
 1 | refrigerator microwave dishwasher freezer juicer expresso
 2 | refrigerator microwave dishwasher juicer jewelry
 3 | refrigerator microwave dishwasher expresso
 4 | microwave juicer jewelry tablet
 5 | microwave juicer expresso jewelry tablet 
 6 | microwave dishwasher freezer juicer expresso
 7 | dishwasher freezer juicer expresso
 8 | freezer juicer expresso jewelry tablet 
 9 | dishwasher freezer expresso tablet 
10 | dishwasher freezer juicer tablet 
11 | tablet speakers jewelry microwave
12 | DSLR printer tablet speakers
13 | freezer DSLR printer 
14 | printer tablet jewelry
15 | microwave speakers DSLR
16 | speakers jewelry microwave
17 | speakers jewelry tablet DSLR 
18 | refrigerator freezer expresso DSLR 
19 | refrigerator microwave DSLR printer 
20 | speakers expresso juicer jewelry
21 | 


--------------------------------------------------------------------------------
/DataSet4.txt:
--------------------------------------------------------------------------------
 1 | toothpaste brush milk cereals honey bread butter cheese yogurt 
 2 | milk cereals honey bread cheese razor gel shampoo
 3 | milk cereals honey cheese soap shampoo 
 4 | honey bread butter cheese mouthwash toothpaste
 5 | cereals honey bread butter gel soap 
 6 | cheese yogurt milk cereals honey shampoo gel
 7 | honey bread cheese razor butter yogurt
 8 | honey bread cheese butter milk 
 9 | cereals butter cookies chips 
10 | cereals cheese yogurt cookies chips
11 | toothpaste brush gel shampoo soap cookies chips
12 | toothpaste brush gel razor mouthwash milk cookies 
13 | razor shampoo gel soap bread butter 
14 | brush shampoo gel toothpaste mouthwash bread cheese 
15 | mouthwash toothpaste soap shampoo cheese yogurt
16 | razor mouthwash soap butter bread cheese
17 | shampoo soap gel milk honey cereals 
18 | toothpaste razor gel brush mouthwash shampoo
19 | gel razor shampoo milk cereals bread cookies
20 | mouthwash toothpaste milk bread cookies
21 | 


--------------------------------------------------------------------------------
/apriori.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | 
  3 | """prompt user to enter support and confidence values in percent"""
  4 | 
  5 | support = int(raw_input("Please enter support value in %: "))
  6 | confidence = int(raw_input("Please enter confidence value in %: "))
  7 | 
  8 | """Compute candidate 1-itemset"""
  9 | C1 = {}
 10 | """total number of transactions contained in the file"""
 11 | transactions = 0
 12 | D = []
 13 | T = []
 14 | with open("DataSet5.txt", "r") as f:
 15 |     for line in f:
 16 |         T = []
 17 |         transactions += 1
 18 |         for word in line.split():
 19 |             T.append(word)
 20 |             if word not in C1.keys():
 21 |                 C1[word] = 1
 22 |             else:
 23 |                 count = C1[word]
 24 |                 C1[word] = count + 1
 25 |         D.append(T)
 26 | print "-------------------------TEST DATASET----------------------------"
 27 | print D
 28 | print "-----------------------------------------------------------------"
 29 | #print "--------------------CANDIDATE 1-ITEMSET------------------------- "
 30 | #print C1
 31 | #print "-----------------------------------------------------------------"
 32 | 
 33 | """Compute frequent 1-itemset"""
 34 | L1 = []
 35 | for key in C1:
 36 |     if (100 * C1[key]/transactions) >= support:
 37 |         list = []
 38 |         list.append(key)
 39 |         L1.append(list)
 40 | print "----------------------FREQUENT 1-ITEMSET-------------------------"
 41 | print L1
 42 | print "-----------------------------------------------------------------"
 43 | 
 44 | """apriori_gen function to compute candidate k-itemset, (Ck) , using frequent (k-1)-itemset, (Lk_1)"""
 45 |  
 46 | def apriori_gen(Lk_1, k):
 47 |     length = k
 48 |     Ck = [] 
 49 |     for list1 in Lk_1:
 50 |         for list2 in Lk_1:
 51 |             count = 0
 52 |             c = []
 53 |             if list1 != list2:
 54 |                 while count < length-1:
 55 |                     if list1[count] != list2[count]:
 56 |                         break
 57 |                     else:
 58 |                         count += 1
 59 |                 else:
 60 |                     if list1[length-1] < list2[length-1]:
 61 |                         for item in list1:
 62 |                             c.append(item)
 63 |                         c.append(list2[length-1])
 64 |                         if not has_infrequent_subset(c, Lk_1, k):
 65 |                             Ck.append(c) 
 66 |                             c = []
 67 |     return Ck
 68 | 
 69 | """function to compute 'm' element subsets of a set S"""
 70 | 
 71 | def findsubsets(S,m):
 72 |     return set(itertools.combinations(S, m))
 73 | 
 74 | """has_infrequent_subsets function to determine if pruning is required to remove unfruitful candidates (c) using the Apriori property, with prior knowledge of frequent (k-1)-itemset (Lk_1)"""
 75 |    
 76 | def has_infrequent_subset(c, Lk_1, k):
 77 |     list = []
 78 |     list = findsubsets(c,k)
 79 |     for item in list: 
 80 |         s = []
 81 |         for l in item:
 82 |             s.append(l)
 83 |         s.sort()
 84 |         if s not in Lk_1:
 85 |             return True
 86 |     return False
 87 | 
 88 | 
 89 | """frequent_itemsets function to compute all frequent itemsets"""
 90 | 
 91 | def frequent_itemsets():
 92 |     k = 2
 93 |     Lk_1 = []
 94 |     Lk = []
 95 |     L = []
 96 |     count = 0
 97 |     transactions = 0
 98 |     for item in L1:
 99 |         Lk_1.append(item)
100 |     while Lk_1 != []:
101 |         Ck = []
102 |         Lk = []
103 |         Ck = apriori_gen(Lk_1, k-1)
104 |         #print "-------------------------CANDIDATE %d-ITEMSET---------------------" % k
105 |         #print "Ck: %s" % Ck
106 |         #print "------------------------------------------------------------------"
107 |         for c in Ck:
108 |             count = 0
109 |             transactions = 0
110 |             s = set(c)
111 |             for T in D:
112 |                 transactions += 1
113 |                 t = set(T)
114 |                 if s.issubset(t) == True:
115 |                     count += 1
116 |             if (100 * count/transactions) >= support:
117 |                 c.sort()
118 |                 Lk.append(c)
119 |         Lk_1 = []
120 |         print "-----------------------FREQUENT %d-ITEMSET------------------------" % k
121 |         print Lk
122 |         print "------------------------------------------------------------------"
123 |         for l in Lk:
124 |             Lk_1.append(l)
125 |         k += 1
126 |         if Lk != []:
127 |             L.append(Lk)
128 |     
129 |     return L
130 |      
131 |         
132 | """generate_association_rules function to mine and print all the association rules with given support and confidence value"""
133 | 
def generate_association_rules():
    """Print every association rule that meets the confidence threshold.

    Calls ``frequent_itemsets()`` and, for each frequent itemset, tries
    every non-empty proper subset as the antecedent of a rule
    ``antecedent ==> remaining items``. The confidence of a rule is the
    integer percentage of transactions containing the antecedent that also
    contain the whole itemset; rules whose confidence is at least the
    module-level ``confidence`` are printed together with the itemset's
    integer support percentage over the module-level ``D``.

    Returns None; the rules are written to standard output.
    """
    L = frequent_itemsets()
    print("---------------------ASSOCIATION RULES------------------")
    print("RULES \t SUPPORT \t CONFIDENCE")
    print("--------------------------------------------------------")
    # Convert each transaction to a set once instead of once per rule.
    baskets = [set(T) for T in D]
    total = len(D)
    num = 1
    for level in L:
        for l in level:
            whole = set(l)
            # The itemset's own count is the same for all of its rules.
            inc2 = sum(1 for basket in baskets if whole.issubset(basket))
            for size in range(1, len(l)):
                for item in findsubsets(l, size):
                    s = [i for i in item]
                    antecedent = set(s)
                    inc1 = sum(
                        1 for basket in baskets if antecedent.issubset(basket)
                    )
                    if inc1 == 0:
                        # An antecedent that occurs in no transaction has no
                        # defined confidence (reachable when the support
                        # threshold is 0); skip it rather than divide by 0.
                        continue
                    if 100 * inc2 // inc1 >= confidence:
                        m = [index for index in l if index not in s]
                        print("Rule#  %d : %s ==> %s %d %d" % (
                            num, s, m,
                            100 * inc2 // total,
                            100 * inc2 // inc1))
                        num += 1
173 | 
# Script entry point: mine and print the association rules, then print the
# closing rule of the report.
generate_association_rules()
print("--------------------------------------------------------")
176 | 


--------------------------------------------------------------------------------