# coding: utf-8
#
# Repository layout:
#   ├── .gitignore
#   ├── README.md
#   └── recommendationSystem.py
#
# /.gitignore:
#   ml-latest-small.zip
#   ml-latest-small
#
# /README.md:
#   # content-based-recommendation-system
#   Content-based recommendation system using numpy, scipy and pandas.
#   The goal here is to implement a content-based recommendation algorithm.
#   It uses the list of genres for a movie as the content, and the data comes
#   from the MovieLens project.
#   The pandas library is used as the data structure; tf-idf and cosine
#   similarity are used in this recommendation system. Part of the assignment
#   series for the CS579 course.
#
# /recommendationSystem.py:

# # Assignment 3: Recommendation systems
#
# Here we'll implement a content-based recommendation algorithm.
# It will use the list of genres for a movie as the content.
# The data come from the MovieLens project: http://grouplens.org/datasets/movielens/

# Please only use these imports.
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile


def download_data():
    """ DONE. Download and unzip data.

    Fetches 'ml-latest-small.zip' into the current working directory and
    extracts it there.
    """
    url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile('ml-latest-small.zip') as zfile:
        zfile.extractall()


def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.

    Lower-cases my_string and returns the list of maximal runs of word
    characters and hyphens (so 'Sci-Fi' stays a single token).
    """
    # Raw string: '\w' and '\-' are regex escapes, not Python string escapes.
    return re.findall(r'[\w\-]+', my_string.lower())


def tokenize(movies):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genres' field of each movie, using tokenize_string above.

    Note: the movies parameter is modified in place.
    Params:
      movies...The movies DataFrame
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.

    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    movies['tokens'] = [tokenize_string(genres) for genres in movies['genres']]
    return movies


def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame; must already have a 'tokens' column.
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int, sorted alphabetically
        (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    doclist = movies['tokens'].tolist()
    num_docs = len(doclist)

    # One pass over the corpus: count each term once per document.
    doc_freq = Counter(term for doc in doclist for term in set(doc))
    vocab = {term: col for col, term in enumerate(sorted(doc_freq))}
    idf = {term: math.log10(num_docs / freq) for term, freq in doc_freq.items()}

    features = []
    for doc in doclist:
        # Build the row densely so that zero tf-idf values (terms present in
        # every document) are not stored in the resulting sparse matrix.
        dense_row = np.zeros((1, len(vocab)), dtype='float')
        if doc:
            term_counts = Counter(doc)
            max_tf = max(term_counts.values())
            for term, count in term_counts.items():
                dense_row[0, vocab[term]] = count / max_tf * idf[term]
        features.append(csr_matrix(dense_row))

    movies['features'] = features
    return (movies, vocab)


def train_test_split(ratings):
    """DONE.
    Returns a split of the ratings matrix into a training and testing set.

    The split is deterministic: every 1000th row (positions 0, 1000, 2000, ...)
    goes to the test set and all remaining rows go to the training set.

    Returns:
      A tuple (ratings_train, ratings_test) of DataFrames.
    """
    test = set(range(len(ratings))[::1000])
    train = sorted(set(range(len(ratings))) - test)
    test = sorted(test)
    return ratings.iloc[train], ratings.iloc[test]


def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents the tf-idf feature vector of a movie.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      The cosine similarity, defined as: dot(a, b) / (||a|| * ||b||)
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
      If either vector has zero norm the similarity is undefined; 0.0 is
      returned in that case so callers can treat it as "not similar".
    """
    v1 = a.toarray()[0]
    v2 = b.toarray()[0]
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return float(np.dot(v1, v2) / norm_product)


def make_predictions(movies, ratings_train, ratings_test):
    """
    Using the ratings in ratings_train, predict the ratings for each
    row in ratings_test.

    To predict the rating of user u for movie i: Compute the weighted average
    rating for every other movie that u has rated. Restrict this weighted
    average to movies that have a positive cosine similarity with movie
    i. The weight for movie m corresponds to the cosine similarity between m
    and i.

    If there are no other movies with positive cosine similarity to use in the
    prediction (including when movie i itself has no features), use the mean
    rating of the target user in ratings_train as the prediction. If the user
    has no ratings in ratings_train at all, the mean of all training ratings
    is used.

    Params:
      movies..........The movies DataFrame (needs 'movieId' and 'features').
      ratings_train...The subset of ratings used for making predictions. These are the "historical" data.
      ratings_test....The subset of ratings that need to predicted. These are the "future" data.
    Returns:
      A numpy array containing one predicted rating for each element of ratings_test.
    """
    # Look features up by movieId so every similarity is paired with the
    # rating of the same movie, regardless of row order in either DataFrame.
    features_by_movie = dict(zip(movies['movieId'], movies['features']))

    # Group each user's training ratings once instead of re-filtering the
    # whole DataFrame for every test row.
    history = defaultdict(list)
    for user_id, movie_id, rating in zip(ratings_train['userId'],
                                         ratings_train['movieId'],
                                         ratings_train['rating']):
        history[user_id].append((movie_id, rating))

    global_mean = ratings_train['rating'].mean()

    predictions = []
    for user_id, movie_id in zip(ratings_test['userId'], ratings_test['movieId']):
        rated = history.get(user_id, [])
        target = features_by_movie.get(movie_id)

        weighted_sum = 0.0
        weight_total = 0.0
        if target is not None:
            for other_id, rating in rated:
                other = features_by_movie.get(other_id)
                if other is None:
                    # Rated movie has no feature vector; it cannot contribute.
                    continue
                sim = cosine_sim(other, target)
                if sim > 0:
                    weighted_sum += sim * rating
                    weight_total += sim

        if weight_total > 0:
            predictions.append(weighted_sum / weight_total)
        elif rated:
            predictions.append(np.mean([rating for _, rating in rated]))
        else:
            predictions.append(global_mean)
    return np.array(predictions)


def mean_absolute_error(predictions, ratings_test):
    """DONE.
    Return the mean absolute error of the predictions.

    Params:
      predictions....Array of predicted ratings, one per row of ratings_test.
      ratings_test...DataFrame whose 'rating' column holds the true ratings.
    """
    return np.abs(predictions - np.array(ratings_test.rating)).mean()


def main():
    """Download the data, build features, predict held-out ratings and print the error."""
    download_data()
    path = 'ml-latest-small'
    ratings = pd.read_csv(os.path.join(path, 'ratings.csv'))
    movies = pd.read_csv(os.path.join(path, 'movies.csv'))
    movies = tokenize(movies)
    movies, vocab = featurize(movies)
    print('vocab:')
    print(sorted(vocab.items())[:10])
    ratings_train, ratings_test = train_test_split(ratings)
    print('%d training ratings; %d testing ratings' % (len(ratings_train), len(ratings_test)))
    predictions = make_predictions(movies, ratings_train, ratings_test)
    print('error=%f' % mean_absolute_error(predictions, ratings_test))
    print(predictions[:10])


if __name__ == '__main__':
    main()