├── collaborative_filtering.py
├── recommendation_data.py
└── recommendation_data.pyc


/collaborative_filtering.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Implementation of collaborative filtering recommendation engine
  3 | 
  4 | from recommendation_data import dataset
  5 | from math import sqrt
  6 | 
  7 | def similarity_score(person1,person2):
  8 | 	
  9 | 	# Returns ratio Euclidean distance score of person1 and person2 
 10 |  
 11 | 	both_viewed = {}		# To get both rated items by person1 and person2
 12 |  
 13 | 	for item in dataset[person1]:
 14 | 		if item in dataset[person2]:
 15 | 			both_viewed[item] = 1
 16 |  
 17 | 		# Conditions to check they both have an common rating items	
 18 | 		if len(both_viewed) == 0:
 19 | 			return 0
 20 |  
 21 | 		# Finding Euclidean distance 
 22 | 		sum_of_eclidean_distance = []	
 23 |  
 24 | 		for item in dataset[person1]:
 25 | 			if item in dataset[person2]:
 26 | 				sum_of_eclidean_distance.append(pow(dataset[person1][item] - dataset[person2][item],2))
 27 | 		sum_of_eclidean_distance = sum(sum_of_eclidean_distance)
 28 |  
 29 | 		return 1/(1+sqrt(sum_of_eclidean_distance))
 30 | 
 31 | def pearson_correlation(person1,person2):
 32 |  
 33 | 	# To get both rated items
 34 | 	both_rated = {}
 35 | 	for item in dataset[person1]:
 36 | 		if item in dataset[person2]:
 37 | 			both_rated[item] = 1
 38 |  
 39 | 	number_of_ratings = len(both_rated)		
 40 | 	
 41 | 	# Checking for number of ratings in common
 42 | 	if number_of_ratings == 0:
 43 | 		return 0
 44 |  
 45 | 	# Add up all the preferences of each user
 46 | 	person1_preferences_sum = sum([dataset[person1][item] for item in both_rated])
 47 | 	person2_preferences_sum = sum([dataset[person2][item] for item in both_rated])
 48 |  
 49 | 	# Sum up the squares of preferences of each user
 50 | 	person1_square_preferences_sum = sum([pow(dataset[person1][item],2) for item in both_rated])
 51 | 	person2_square_preferences_sum = sum([pow(dataset[person2][item],2) for item in both_rated])
 52 |  
 53 | 	# Sum up the product value of both preferences for each item
 54 | 	product_sum_of_both_users = sum([dataset[person1][item] * dataset[person2][item] for item in both_rated])
 55 |  
 56 | 	# Calculate the pearson score
 57 | 	numerator_value = product_sum_of_both_users - (person1_preferences_sum*person2_preferences_sum/number_of_ratings)
 58 | 	denominator_value = sqrt((person1_square_preferences_sum - pow(person1_preferences_sum,2)/number_of_ratings) * (person2_square_preferences_sum -pow(person2_preferences_sum,2)/number_of_ratings))
 59 | 	if denominator_value == 0:
 60 | 		return 0
 61 | 	else:
 62 | 		r = numerator_value/denominator_value
 63 | 		return r
 64 | 
 65 | def most_similar_users(person,number_of_users):
 66 | 	# returns the number_of_users (similar persons) for a given specific person.
 67 | 	scores = [(pearson_correlation(person,other_person),other_person) for other_person in dataset if  other_person != person ]
 68 | 	
 69 | 	# Sort the similar persons so that highest scores person will appear at the first
 70 | 	scores.sort()
 71 | 	scores.reverse()
 72 | 	return scores[0:number_of_users]
 73 | 
 74 | def user_reommendations(person):
 75 |  
 76 | 	# Gets recommendations for a person by using a weighted average of every other user's rankings
 77 | 	totals = {}
 78 | 	simSums = {}
 79 | 	rankings_list =[]
 80 | 	for other in dataset:
 81 | 		# don't compare me to myself
 82 | 		if other == person:
 83 | 			continue
 84 | 		sim = pearson_correlation(person,other)
 85 |  
 86 | 		# ignore scores of zero or lower
 87 | 		if sim == 0: 
 88 | 			continue
 89 | 		for item in dataset[other]:
 90 |  
 91 | 			# only score movies i haven't seen yet
 92 | 			if item not in dataset[person] or dataset[person][item] == 0:
 93 |  
 94 | 			# Similrity * score
 95 | 				totals.setdefault(item,0)
 96 | 				totals[item] += dataset[other][item]* sim
 97 | 				# sum of similarities
 98 | 				simSums.setdefault(item,0)
 99 | 				simSums[item]+= sim
100 |  
101 | 		# Create the normalized list
102 |  
103 | 	rankings = [(total/simSums[item],item) for item,total in totals.items()]
104 | 	rankings.sort()
105 | 	rankings.reverse()
106 | 	# returns the recommended items
107 | 	recommendataions_list = [recommend_item for score,recommend_item in rankings]
108 | 	return recommendataions_list
# Demo output, executed whenever this module is run or imported.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print` statement is a SyntaxError on Python 3.
print("")
print("Lisa Rose rating on Lady in the water score: {}".format(dataset['Lisa Rose']['Lady in the Water']))
print("Michael Phillips rating on Lady in the water score: {}".format(dataset['Michael Phillips']['Lady in the Water']))
print("")
print('**************Jack Matthews ratings**************')
print(dataset['Jack Matthews'])
print("")
print('************** Smililarity Score Euclidean Distance Socre Lisa and Jack **************')
print(similarity_score('Lisa Rose', 'Jack Matthews'))
print("")
print('************** Lisa : Who is most similar Users **************')
print(most_similar_users('Lisa Rose', 3))
print("")
print("************** Recommendations for Toby **************")
print('Toby :')
print(user_reommendations('Toby'))
125 | 


--------------------------------------------------------------------------------
/recommendation_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 |  
 3 | # Collaborative filtering data set taken from the Collective Intelligence book.
 4 |  
 5 | dataset={
 6 |  'Lisa Rose': {
 7 |  'Lady in the Water': 2.5,
 8 |  'Snakes on a Plane': 3.5,
 9 |  'Just My Luck': 3.0,
10 |  'Superman Returns': 3.5,
11 |  'You, Me and Dupree': 2.5,
12 |  'The Night Listener': 3.0},
13 |  'Gene Seymour': {'Lady in the Water': 3.0,
14 |  'Snakes on a Plane': 3.5,
15 |  'Just My Luck': 1.5,
16 |  'Superman Returns': 5.0,
17 |  'The Night Listener': 3.0,
18 |  'You, Me and Dupree': 3.5},
19 |  
20 |  'Michael Phillips': {'Lady in the Water': 2.5,
21 |  'Snakes on a Plane': 3.0,
22 |  'Superman Returns': 3.5,
23 |  'The Night Listener': 4.0},
24 |  'Claudia Puig': {'Snakes on a Plane': 3.5,
25 |  'Just My Luck': 3.0,
26 |  'The Night Listener': 4.5,
27 |  'Superman Returns': 4.0,
28 |  'You, Me and Dupree': 2.5},
29 |  
30 |  'Mick LaSalle': {'Lady in the Water': 3.0,
31 |  'Snakes on a Plane': 4.0,
32 |  'Just My Luck': 2.0,
33 |  'Superman Returns': 3.0,
34 |  'The Night Listener': 3.0,
35 |  'You, Me and Dupree': 2.0},
36 |  
37 |  'Jack Matthews': {'Lady in the Water': 3.0,
38 |  'Snakes on a Plane': 4.0,
39 |  'The Night Listener': 3.0,
40 |  'Superman Returns': 5.0,
41 |  'You, Me and Dupree': 3.5},
42 |  
43 |  'Toby': {'Snakes on a Plane':4.5,
44 |  'You, Me and Dupree':1.0,
45 |  'Superman Returns':4.0}}


--------------------------------------------------------------------------------
/recommendation_data.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddps-lab/instacart/4ae445be340731e1f56cfd958685429c608638de/recommendation_data.pyc


--------------------------------------------------------------------------------