├── .gitignore
├── README.md
└── recommendationSystem.py


/.gitignore:
--------------------------------------------------------------------------------
1 | ml-latest-small.zip
2 | ml-latest-small
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # content-based-recommendation-system
2 | A content-based recommendation system built with NumPy, SciPy, and pandas.
3 | The goal is to implement a content-based recommendation algorithm. It uses the list of genres for a movie as the content, and the data comes from the MovieLens project.
4 | pandas DataFrames are used as the data structure, and tf-idf weighting with cosine similarity is used to make the recommendations. Part of the assignment series for the CS579 course.
5 | 


--------------------------------------------------------------------------------
/recommendationSystem.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | # # Assignment 3:  Recommendation systems
  4 | #
  5 | # Here we'll implement a content-based recommendation algorithm.
  6 | # It will use the list of genres for a movie as the content.
  7 | # The data come from the MovieLens project: http://grouplens.org/datasets/movielens/
  8 | 
  9 | # Please only use these imports.
 10 | from collections import Counter, defaultdict
 11 | import math
 12 | import numpy as np
 13 | import os
 14 | import pandas as pd
 15 | import re
 16 | from scipy.sparse import csr_matrix
 17 | import urllib.request
 18 | import zipfile
 19 | 
 20 | def download_data():
 21 |     """ DONE. Download and unzip data.
 22 |     """
 23 |     url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
 24 |     urllib.request.urlretrieve(url, 'ml-latest-small.zip')
 25 |     zfile = zipfile.ZipFile('ml-latest-small.zip')
 26 |     zfile.extractall()
 27 |     zfile.close()
 28 | 
 29 | 
 30 | def tokenize_string(my_string):
 31 |     """ DONE. You should use this in your tokenize function.
 32 |     """
 33 |     return re.findall('[\w\-]+', my_string.lower())
 34 | 
 35 | 
 36 | def tokenize(movies):
 37 |     """
 38 |     Append a new column to the movies DataFrame with header 'tokens'.
 39 |     This will contain a list of strings, one per token, extracted
 40 |     from the 'genre' field of each movie. Use the tokenize_string method above.
 41 | 
 42 |     Note: you may modify the movies parameter directly; no need to make
 43 |     a new copy.
 44 |     Params:
 45 |       movies...The movies DataFrame
 46 |     Returns:
 47 |       The movies DataFrame, augmented to include a new column called 'tokens'.
 48 | 
 49 |     >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
 50 |     >>> movies = tokenize(movies)
 51 |     >>> movies['tokens'].tolist()
 52 |     [['horror', 'romance'], ['sci-fi']]
 53 |     """
 54 |     tokenlist=[]
 55 |     for index,row in movies.iterrows():
 56 |         tokenlist.append(tokenize_string(row.genres))
 57 |     movies['tokens']=tokenlist
 58 |     return movies
 59 | 
 60 | def featurize(movies):
 61 |     """
 62 |     Append a new column to the movies DataFrame with header 'features'.
 63 |     Each row will contain a csr_matrix of shape (1, num_features). Each
 64 |     entry in this matrix will contain the tf-idf value of the term, as
 65 |     defined in class:
 66 |     tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
 67 |     where:
 68 |     i is a term
 69 |     d is a document (movie)
 70 |     tf(i, d) is the frequency of term i in document d
 71 |     max_k tf(k, d) is the maximum frequency of any term in document d
 72 |     N is the number of documents (movies)
 73 |     df(i) is the number of unique documents containing term i
 74 | 
 75 |     Params:
 76 |       movies...The movies DataFrame
 77 |     Returns:
 78 |       A tuple containing:
 79 |       - The movies DataFrame, which has been modified to include a column named 'features'.
 80 |       - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
 81 |     """
 82 |     def tf(word,doc):
 83 |         return doc.count(word) / Counter(doc).most_common()[0][1]
 84 | 
 85 |     def df(word, doclist):
 86 |         return sum(1 for d in doclist if word in d)
 87 | 
 88 |     def tfidf(word, doc, dfdict, N):
 89 |         return tf(word, doc) * math.log10((N/dfdict[word]))
 90 | 
 91 |     def getcsrmatrix(tokens,dfdict,N,vocab):
 92 |         matrixRow_list = []
 93 |         matrixRow_list = np.zeros((1,len(vocab)),dtype='float')
 94 |         for t in tokens:
 95 |             if t in vocab:
 96 |                 matrixRow_list[0][vocab[t]] = tfidf(t,tokens,dfdict,N)
 97 |         return csr_matrix(matrixRow_list)
 98 | 
 99 |     N=len(movies)
100 |     doclist = movies['tokens'].tolist()
101 |     vocab = { i:x for x,i in enumerate(sorted(list(set(i for s in doclist for i in s)))) }
102 | 
103 |     dfdict = {}
104 |     for v in vocab.items():
105 |         dfdict[v[0]] = df(v[0],doclist)
106 | 
107 |     csrlist = []
108 |     for index, row in movies.iterrows():
109 |          csrlist.append(getcsrmatrix(row['tokens'],dfdict,N,vocab))
110 | 
111 |     movies['features'] =  csrlist
112 |     return (movies,vocab)
113 | 
114 | 
def train_test_split(ratings):
    """DONE.
    Split the ratings DataFrame into a training and a testing set.

    The split is deterministic: every row whose position is a multiple of
    1000 (0, 1000, 2000, ...) goes to the test set, and all remaining rows
    go to the training set. Both parts keep the original row order.

    Params:
      ratings...The ratings DataFrame
    Returns:
      A tuple (train, test) of DataFrames selected by row position.
    """
    n_rows = len(ratings)
    test_positions = list(range(0, n_rows, 1000))
    train_positions = [pos for pos in range(n_rows) if pos % 1000 != 0]
    return ratings.iloc[train_positions], ratings.iloc[test_positions]
123 | 
124 | 
def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents the tf-idf feature vector of a movie.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      The cosine similarity, defined as: dot(a, b) / (||a|| * ||b||)
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
      If either vector has zero norm the similarity is undefined; 0.0 is
      returned in that case so callers can treat it as "not similar".
    """
    v1 = a.toarray()[0]
    v2 = b.toarray()[0]
    norm_product = math.sqrt(np.dot(v1, v1)) * math.sqrt(np.dot(v2, v2))
    # Guard against 0/0, which would otherwise yield nan (or raise).
    if norm_product == 0:
        return 0.0
    return float(np.dot(v1, v2) / norm_product)
139 | 
def make_predictions(movies, ratings_train, ratings_test):
    """
    Using the ratings in ratings_train, predict the ratings for each
    row in ratings_test.

    To predict the rating of user u for movie i: Compute the weighted average
    rating for every other movie that u has rated.  Restrict this weighted
    average to movies that have a positive cosine similarity with movie
    i. The weight for movie m corresponds to the cosine similarity between m
    and i.

    If there are no other movies with positive cosine similarity to use in the
    prediction, use the mean rating of the target user in ratings_train as the
    prediction. The same fallback is used when the target movie, or every
    movie the user rated, has no entry in the movies DataFrame.

    Params:
      movies..........The movies DataFrame.
      ratings_train...The subset of ratings used for making predictions. These are the "historical" data.
      ratings_test....The subset of ratings that need to predicted. These are the "future" data.
    Returns:
      A numpy array containing one predicted rating for each element of ratings_test.
    """
    # movieId -> feature vector. Looking features up by id keeps every
    # similarity paired with the rating of the same movie, independent of
    # the row order of movies and ratings_train.
    features_by_movie = dict(zip(movies['movieId'], movies['features']))

    # userId -> [(movieId, rating), ...], built once instead of filtering
    # ratings_train again for every test row.
    history = defaultdict(list)
    for user_id, movie_id, rating in zip(ratings_train['userId'],
                                         ratings_train['movieId'],
                                         ratings_train['rating']):
        history[user_id].append((movie_id, rating))

    predictions = []
    for user_id, movie_id in zip(ratings_test['userId'], ratings_test['movieId']):
        rated = history.get(user_id, [])
        target = features_by_movie.get(movie_id)
        weighted_sum = 0.0
        weight_total = 0.0
        if target is not None:
            for rated_id, rating in rated:
                rated_features = features_by_movie.get(rated_id)
                if rated_features is None:
                    # Rated movie is unknown to movies: no similarity available.
                    continue
                sim = cosine_sim(rated_features, target)
                if sim > 0:
                    weighted_sum += sim * rating
                    weight_total += sim
        if weight_total > 0:
            predictions.append(weighted_sum / weight_total)
        else:
            # No usable neighbours: fall back to the user's mean training rating.
            predictions.append(np.mean([rating for _, rating in rated]))
    return np.array(predictions)
175 | 
def mean_absolute_error(predictions, ratings_test):
    """DONE.
    Compute the mean absolute error of the predictions.

    Params:
      predictions....Predicted ratings, one per row of ratings_test.
      ratings_test...Object whose 'rating' attribute holds the true ratings.
    Returns:
      The average of |prediction - true rating| over all rows.
    """
    actual = np.array(ratings_test.rating)
    abs_errors = np.abs(predictions - actual)
    return abs_errors.mean()
181 | 
182 | 
def main():
    """Run the whole pipeline: download, featurize, split, predict, report.

    Prints the first vocabulary entries, the train/test sizes, the mean
    absolute error on the test split, and the first ten predictions.
    """
    download_data()
    data_dir = 'ml-latest-small'
    ratings = pd.read_csv(os.path.join(data_dir, 'ratings.csv'))
    movies = pd.read_csv(os.path.join(data_dir, 'movies.csv'))
    movies, vocab = featurize(tokenize(movies))
    print('vocab:')
    print(sorted(vocab.items())[:10])
    ratings_train, ratings_test = train_test_split(ratings)
    split_sizes = (len(ratings_train), len(ratings_test))
    print('%d training ratings; %d testing ratings' % split_sizes)
    predictions = make_predictions(movies, ratings_train, ratings_test)
    error = mean_absolute_error(predictions, ratings_test)
    print('error=%f' % error)
    print(predictions[:10])
197 | 
198 | 
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
201 | 


--------------------------------------------------------------------------------