├── .DS_Store ├── Report.pdf ├── Presentation.pptx ├── Results ├── .DS_Store ├── images │ ├── ndcg.png │ ├── rating.png │ ├── Algo_analysis.png │ ├── Hybrid_Model.png │ ├── knn_neighbors.png │ ├── KNN_similarity.png │ ├── prec_recall_fm.png │ ├── genre_distribution.png │ ├── vector_generation.png │ ├── genre_based_popularity.png │ └── mae_rmse_including_pearson.png ├── Final_model_results.xlsx ├── README.md └── algo_results.csv ├── README.md └── Code ├── README.md ├── test_ndcg.py ├── evaluating_recs.py ├── generating_predictions.py ├── combined_model.ipynb ├── surprise_model_predictions.ipynb ├── cold_start_analysis.ipynb ├── movie_era_based_recs.ipynb └── preprocessing.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/.DS_Store -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Report.pdf -------------------------------------------------------------------------------- /Presentation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Presentation.pptx -------------------------------------------------------------------------------- /Results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/.DS_Store -------------------------------------------------------------------------------- /Results/images/ndcg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/ndcg.png 
-------------------------------------------------------------------------------- /Results/images/rating.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/rating.png -------------------------------------------------------------------------------- /Results/Final_model_results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/Final_model_results.xlsx -------------------------------------------------------------------------------- /Results/images/Algo_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/Algo_analysis.png -------------------------------------------------------------------------------- /Results/images/Hybrid_Model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/Hybrid_Model.png -------------------------------------------------------------------------------- /Results/images/knn_neighbors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/knn_neighbors.png -------------------------------------------------------------------------------- /Results/images/KNN_similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/KNN_similarity.png -------------------------------------------------------------------------------- /Results/images/prec_recall_fm.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/prec_recall_fm.png -------------------------------------------------------------------------------- /Results/images/genre_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/genre_distribution.png -------------------------------------------------------------------------------- /Results/images/vector_generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/vector_generation.png -------------------------------------------------------------------------------- /Results/images/genre_based_popularity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/genre_based_popularity.png -------------------------------------------------------------------------------- /Results/images/mae_rmse_including_pearson.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/mae_rmse_including_pearson.png -------------------------------------------------------------------------------- /Results/README.md: -------------------------------------------------------------------------------- 1 | #### Analysis Plots: 2 | 3 | 1. Comparison of methods: 4 | ![Model results](images/prec_recall_fm.png) 5 | 6 | 2. Surprise models: 7 | ![Model results](images/Algo_analysis.png) 8 | 9 | 3. 
Vector generation in content based approach: 10 | ![Content based vector](images/vector_generation.png) 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Movie-Recommendation-System 2 | 3 | **Dataset used:** 4 | 1. [MovieLens](https://grouplens.org/datasets/movielens/) 5 | 2. [The Movie Database: tmdb](https://www.kaggle.com/tmdb/tmdb-movie-metadata) 6 | 7 | **Aim:** Build a movie recommendation system by integrating the aspects of personalization of user with the overall features of movie such as genre, popularity etc.
8 | 9 | **Models:** 10 | * Popularity model 11 | * Content based model: genre, year of release, ratings of movies 12 | * Collaborative filtering: User vs item, KNN similarity measures 13 | * Latent Factor based SVD 14 | * Combined linear model using surprise library (CF + SVD) 15 | * Hybrid model (content based + popularity based + item-item CF + svd) 16 | 17 | **Results:** 18 | 19 | ![Hybrid model](Results/images/Hybrid_Model.png) 20 | 21 | All the models are implemented in Python using pandas, sklearn and [surprise](http://surpriselib.com/) library. The hyperparameter tuning, testing accuracy (RMSE and MAE) and evaluation of recommendations (precision, recall, f-measure and ndcg) for each model are thoroughly performed. The detailed analysis of the models is presented in the report. 22 | -------------------------------------------------------------------------------- /Results/algo_results.csv: -------------------------------------------------------------------------------- 1 | ,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG 2 | 0,KNNBaseline (pearson_baseline,0.8527048407985288,0.6481831050363938,9.183264017105103,6.47820782661438,0.8311748633879806,0.4131693896380987,0.5519630281626648,0.9631023306515691 3 | 1,CoClustering,0.9522076927566949,0.7332637064416873,1.8252005577087402,0.1734142303466797,0.7826229508196745,0.3809810109755996,0.5124844754841872,0.9557478562520727 4 | 2,BaselineOnly,0.873457272373568,0.6717695910159173,0.1396622657775879,0.09251093864440918,0.8074316939890736,0.4003500565116332,0.5352876447815149,0.9590854168667904 5 | 3,KNNWithZScore,0.8991840819159082,0.6786227249764061,0.1465740203857422,1.7950918674468994,0.7948087431694016,0.3915249143270615,0.524620410302617,0.9541951019768238 6 | 4,KNNWithMeans,0.9000344577297286,0.6836932871288725,0.10967230796813965,1.5405960083007812,0.8009562841530078,0.3871394034076282,0.521981084939778,0.9527093414526527 7 | 
5,KNNBaseline,0.8762934782625659,0.6659918272130076,0.21010994911193848,1.9365522861480713,0.7964207650273253,0.4158810520288814,0.546425487378205,0.9562275612497768 8 | 6,NMF,0.9291418868270431,0.7094694164253142,5.15004301071167,0.20406055450439453,0.7792896174863412,0.38074469729699817,0.5115545044405817,0.9548841017932751 9 | 7,SlopeOne,0.9056446586210445,0.6876919079393096,4.759229898452759,5.59233283996582,0.8075409836065599,0.3965250076150167,0.5318814699668782,0.9555464596666479 10 | 8,SVDpp,0.8691186490330676,0.6640513869365521,480.6970820426941,7.9973015785217285,0.8178415300546472,0.3978840866429081,0.5353282446879383,0.9603166226261703 11 | 9,SVD,0.8794364853143987,0.6739493321877302,4.6099772453308105,0.12506961822509766,0.8033060109289645,0.38554157622502233,0.52102198631871,0.956595790499096 12 | 10,KNNBasic,0.9507724809063621,0.7266525395708078,0.09878921508789062,1.3892457485198975,0.7838797814207674,0.4215349947874059,0.5482474018023665,0.9586757463063268 13 | -------------------------------------------------------------------------------- /Code/README.md: -------------------------------------------------------------------------------- 1 | ### Description: 2 | 3 | #### 1. cold_start_analysis: 4 | Analyses the performance of different approaches in case of a new user or a user with few interactions with the system, namely the cold start problem. Computed the RMSE and MAE for those customers who have rated fewer than 18 movies and also for those who have rated more than 1000 movies.
5 | For fewer interactions, content based and item-item based collaborative filtering approaches work better. As the number of interactions per customer increases, SVD and collaborative approaches work better. 6 | 7 | #### 2. combined_model: 8 | Combination of different surprise model results by applying weighted linear combination to generate final rating. 9 | 10 | #### 3. content_based_recommendation: 11 | Generating user and movie vectors based on genre and predicting the ratings for movies in test data. 12 | 13 | #### 4. evaluating_recs: 14 | Code for Precision, Recall, F-1 score and NDCG. 15 | 16 | #### 5. generating_predictions: 17 | Generating rating predictions for test data using surprise library. 18 | 19 | #### 6. hybrid_model: 20 | Code for the hybrid model based on combining recommendations from different models such as content based, CF, SVD to improve accuracy and quality of recommendations. 21 | 22 | #### 7. knn_analysis: 23 | Analysis of KNN algorithms by changing different parameters like: 24 | * number of neighbors 25 | * similarity metrics 26 | * user v/s item based CF 27 | 28 | #### 8. model_hyperparameter_tuning: 29 | Fine-tuned surprise models by experimenting with different hyperparameters for training and model. Compared models based on RMSE and MAE. 30 | 31 | #### 9. movie_era_based_recs: 32 | Content based approach to include the time period in which the movie was launched in the user vector. This method personalizes the user's recommendations to include this feature. 33 | 34 | #### 10. movie_similarity_based_recs: 35 | Content based approach to include the user's genre preference and recommend movies similar to user's highly rated movies. 36 | 37 | #### 11. movie_year_analysis: 38 | Experiments with the year of the movie release. Analysed the distribution of data and determined the appropriate era intervals to classify movies. Used the content based approach to form a user vector based on the era preference. 39 | 40 | #### 12. 
popularity_model: 41 | Model which uses the popularity attribute as well as the average rating and voter count in the TMDB data to generate popular movies genre wise. The genres are determined using the IMDB data. 42 | 43 | #### 13. preprocessing: 44 | Code for splitting the data into training and testing sets for each user such that 80% of the ratings are in training and 20% are for testing. 45 | 46 | #### 14. surprise_model_predictions: 47 | Code for generating ratings for test data using surprise models such as KNN (CF), SVD, Baseline approach, Slopeone etc. 48 | 49 | #### 15. surprise_model_recs: 50 | Comparison between the surprise models based on test data ratings (RMSE and MAE) and quality of recommendations (precision, recall, ndcg, f-measure). 51 | 52 | #### 16. test_ndcg: 53 | Code to test implementation of [NDCG metric](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) for evaluating recommendations. 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /Code/test_ndcg.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import defaultdict 3 | 4 | from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader 5 | from surprise import Dataset 6 | from surprise.model_selection import cross_validate 7 | from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore 8 | from surprise import accuracy 9 | from surprise.model_selection import train_test_split 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe): 16 | reader = Reader(rating_scale=(0, 5)) 17 | trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader) 18 | testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader) 19 | trainset = 
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Wrap the train/test rating frames in the objects Surprise algorithms consume.

    Args:
        training_dataframe: DataFrame with 'userId', 'movieId' and 'rating' columns.
        testing_dataframe: DataFrame with the same three columns.

    Returns:
        ``(trainset, testset)`` as built by ``construct_trainset`` and
        ``construct_testset`` from the raw ratings of each loaded dataset.
    """
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset


def get_top_n(predictions, n):
    """Group predictions per user and keep each user's n best-estimated items.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, _)``.
        n: Number of items to keep per user.

    Returns:
        ``(top_n, org_ratings)``: ``top_n[uid]`` is a list of ``(iid, est)``
        sorted by ``est`` descending and cut to ``n`` entries;
        ``org_ratings[uid]`` is the list of ``(iid, true_r)`` for every
        prediction of that user, in input order.
    """
    top_n = defaultdict(list)
    org_ratings = defaultdict(list)

    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
        org_ratings[uid].append((iid, true_r))

    # Rank each user's items by estimated rating and keep the n best.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n, org_ratings


def dcg_at_k(scores):
    """Discounted cumulative gain of ``scores`` taken in the given rank order.

    The first score is not discounted; the score at 1-based rank ``i >= 2`` is
    divided by ``log2(i)``. An empty sequence has a DCG of 0.0 (the original
    raised IndexError on ``scores[0]``).
    """
    if len(scores) == 0:
        return 0.0
    # math.log2 avoids the rounding error of the two-argument math.log(x, 2).
    return scores[0] + sum(sc / math.log2(rank) for rank, sc in enumerate(scores[1:], start=2))


def ndcg_at_k(scores):
    """Normalised DCG: DCG of ``scores`` as ranked over DCG of their best ordering.

    Returns 0.0 when the ideal DCG is not positive (empty or all-zero scores).
    """
    idcg = dcg_at_k(sorted(scores, reverse=True))
    return (dcg_at_k(scores) / idcg) if idcg > 0.0 else 0.0
# Driver: fit SVD++ on the pre-split data, report RMSE/MAE, then compute the
# mean NDCG of each user's top-5 recommendations.
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

print("Starting algo")
algo = SVDpp()
algo.fit(trainset)
test_predictions = algo.test(testset)
test_rmse = accuracy.rmse(test_predictions)
test_mae = accuracy.mae(test_predictions)
print("Ended algo")

top_n, org_ratings = get_top_n(test_predictions, 5)

ndcg_scores = dict()
for uid, user_ratings in top_n.items():
    # Index the user's true ratings once instead of rescanning the list for
    # every recommended item. setdefault keeps the FIRST rating seen for an
    # item, matching the original linear scan that stopped at the first hit.
    true_by_item = {}
    for i, r in org_ratings[uid]:
        true_by_item.setdefault(i, r)
    # True relevance of each recommended item, in recommended order; an item
    # without a known rating counts as 0.
    scores = [true_by_item.get(iid, 0) for iid, est_r in user_ratings]
    ndcg_scores[uid] = ndcg_at_k(scores)

# Guard the average so an empty test set reports 0.0 instead of dividing by zero.
ndcg_score = sum(ndcg_scores.values()) / len(ndcg_scores) if ndcg_scores else 0.0
print(ndcg_score)
def get_top_n(predictions, algo_weights, n):
    '''Return the top-N recommendations for each user from blended predictions.

    Args:
        predictions(pandas.DataFrame): One row per (user, item) pair. The
            first three columns are read by position as user id, item id and
            true rating; the columns 'svd_rating', 'knn_rating',
            'svdpp_rating', 'slopeone_rating' and 'baseline_rating' are read
            by name.
        algo_weights(dict): Weights of the linear blend, keyed by 'svd',
            'knn', 'svdpp', 'slope' and 'baseline'.
        n(int): The number of recommendations to keep for each user.

    Returns:
        A pair ``(top_n, top_n_ndcg)`` of dicts keyed by user id. Both are
        sorted by blended estimate, highest first, and cut to n entries:
        ``top_n`` holds ``(item id, estimate)`` tuples and ``top_n_ndcg``
        holds ``(item id, true rating, estimate)`` tuples.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    top_n_ndcg = defaultdict(list)
    for i in range(len(predictions)):
        row = predictions.iloc[i, :]
        final_est = algo_weights['svd']*float(row['svd_rating']) + algo_weights['knn']*float(row['knn_rating']) + \
                    algo_weights['svdpp']*float(row['svdpp_rating']) + algo_weights['slope']*float(row['slopeone_rating']) + \
                    algo_weights['baseline']*float(row['baseline_rating'])
        # Positional access goes through .iloc: integer keys on a
        # string-labelled Series (row[0]) are deprecated in pandas.
        # NOTE(review): a row pulled out with .iloc may carry ids upcast to
        # float when other columns are float - confirm consumers do not care.
        uid, iid, true_r = row.iloc[0], row.iloc[1], row.iloc[2]
        top_n[uid].append((iid, final_est))
        top_n_ndcg[uid].append((iid, true_r, final_est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    for uid, user_ratings in top_n_ndcg.items():
        user_ratings.sort(key=lambda x: x[2], reverse=True)
        top_n_ndcg[uid] = user_ratings[:n]

    return top_n, top_n_ndcg
def precision_recall_at_k(predictions, algo_weights, k, threshold):
    '''Return precision and recall at k metrics for each user.

    Args:
        predictions(pandas.DataFrame): One row per (user, item) pair. Column 0
            is read by position as the user id and column 2 as the true
            rating; the five '*_rating' model columns are read by name.
        algo_weights(dict): Weights of the linear blend, keyed by 'svd',
            'knn', 'svdpp', 'slope' and 'baseline'.
        k(int): Number of top-ranked items considered per user.
        threshold(float): A rating at or above this value counts as relevant
            (true rating) or recommended (estimate).

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id. Precision is 1
        when nothing in the top k is recommended, and recall is 1 when the
        user has no relevant item.
    '''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for i in range(len(predictions)):
        row = predictions.iloc[i, :]
        final_est = algo_weights['svd']*float(row['svd_rating']) + algo_weights['knn']*float(row['knn_rating']) + \
                    algo_weights['svdpp']*float(row['svdpp_rating']) + algo_weights['slope']*float(row['slopeone_rating']) + \
                    algo_weights['baseline']*float(row['baseline_rating'])
        # .iloc for positional access: integer keys on a string-labelled
        # Series (row[0], row[2]) are deprecated in pandas.
        user_est_true[row.iloc[0]].append((final_est, row.iloc[2]))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k/n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k/n_rel if n_rel != 0 else 1

    return precisions, recalls


def dcg_at_k(scores):
    '''Discounted cumulative gain of ``scores`` taken in the given rank order.

    The first score is not discounted; the score at 1-based rank ``i >= 2`` is
    divided by ``log2(i)``. An empty sequence has a DCG of 0.0 (the original
    raised IndexError on ``scores[0]``).
    '''
    if len(scores) == 0:
        return 0.0
    # math.log2 avoids the rounding error of the two-argument math.log(x, 2).
    return scores[0] + sum(sc/math.log2(ind) for sc, ind in zip(scores[1:], range(2, len(scores) + 1)))


def ndcg_at_k(predicted_scores, actual_scores):
    '''DCG of ``predicted_scores`` divided by the ideal DCG of ``actual_scores``.

    For the result to be a true NDCG in [0, 1], ``predicted_scores`` must hold
    the actual relevance of the items in the order the model ranked them;
    passing estimated ratings instead mixes two different scales.

    Returns 0.0 when the ideal DCG is not positive.
    '''
    idcg = dcg_at_k(sorted(actual_scores, reverse=True))
    return (dcg_at_k(predicted_scores)/idcg) if idcg > 0.0 else 0.0
# Driver: score one blend of the per-model predictions (here SVD++ only).
predictions = pd.read_csv("test_prediction_HP.csv", usecols=range(1, 9))
algo_weights = dict()
algo_weights['svd'] = 0
algo_weights['knn'] = 0
algo_weights['svdpp'] = 1
algo_weights['slope'] = 0
algo_weights['baseline'] = 0
n = 5
threshold = 3.75
top_n, top_n_ndcg = get_top_n(predictions, algo_weights, n)
with open('top5_svdpp.csv', 'w', newline="") as csv_file:
    writer = csv.writer(csv_file)
    for key, value in top_n.items():
        writer.writerow([key, value])

ndcg_scores = dict()
for uid, user_ratings in top_n_ndcg.items():
    # user_ratings is already ordered by estimated rating, so this is the
    # TRUE relevance of each recommended item in ranked order.
    true = [tru_r for _, tru_r, _ in user_ratings]
    # DCG must be accumulated over true relevances, not over the estimates:
    # the estimates only decide the ranking. Dividing a DCG of estimates by an
    # ideal DCG of true ratings is not bounded by 1 and rewards inflated
    # predictions.
    ndcg_scores[uid] = ndcg_at_k(true, true)

# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

precisions, recalls = precision_recall_at_k(predictions, algo_weights, n, threshold)
precision = sum(prec for prec in precisions.values())/len(precisions)
recall = sum(rec for rec in recalls.values())/len(recalls)
# F-measure is defined as 0 when both precision and recall are 0.
fmeasure = (2*precision*recall)/(precision + recall) if (precision + recall) > 0 else 0.0
ndcg_score = sum(ndcg for ndcg in ndcg_scores.values())/len(ndcg_scores)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-Measure", fmeasure)
print("NDCG Score: ", ndcg_score)
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Turn the train/test rating frames into a Surprise trainset and testset.

    Only the 'userId', 'movieId' and 'rating' columns of each frame are used.
    Returns the pair ``(trainset, testset)``.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(0, 5))
    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)
    built_trainset = train_data.construct_trainset(train_data.raw_ratings)
    built_testset = test_data.construct_testset(test_data.raw_ratings)
    return built_trainset, built_testset


def recommendation(algo, trainset, testset):
    """Fit ``algo`` on ``trainset`` and score it on ``testset``.

    Returns ``(rmse, mae, predictions)`` where ``predictions`` is whatever
    ``algo.test(testset)`` produced and the two errors are computed from it.
    Training-set error is deliberately not computed.
    """
    algo.fit(trainset)

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)

    return rmse, mae, predictions
# Driver: fit five Surprise models on the pre-split data and dump their
# test-set predictions, combined and per model, to CSV files.
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)


print("1")
# (A stray, discarded `BaselineOnly()` instantiation was removed here.)
algo = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred = recommendation(algo, trainset, testset)

print("2")
# basic collaborative filtering algorithm taking into account a baseline rating.
sim_options = {'name': 'pearson_baseline',
               'user_based': False  # compute similarities between items
               }
algo = KNNBaseline(sim_options=sim_options)
test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(algo, trainset, testset)

print("3")
# SlopeOne
algo = SlopeOne()
test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(algo, trainset, testset)

print("4")
# SVD
algo = SVD()
test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)

print("5")
# SVDpp
algo = SVDpp()
test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)

print("6")
combined_columns = ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating',
                    'baseline_rating']
id_columns = ['uid', 'iid', 'og_rating']

# Collect one row per test rating and build the frame once. The original
# concatenated a one-row frame onto a growing frame per prediction, which
# copies the whole frame each time (quadratic) and concatenates with empty
# frames.
# NOTE(review): like the original, this assumes the five prediction lists are
# in the same order (all come from algo.test on the same testset); ids and the
# true rating are taken from the SVD prediction - confirm.
rows = [
    (svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, svdpp.est, slopeone.est, baseline.est)
    for svd, knn, svdpp, slopeone, baseline in zip(
        test_svd_pred, test_knn_pred, test_svdpp_pred, test_slopeone_pred, test_base_pred)
]
# The original prepended every new row, so its output was in reverse
# prediction order; keep that order so the CSV row numbering is unchanged.
rows.reverse()
test_pred_df = pd.DataFrame(rows, columns=combined_columns)

# Per-model frames share the id/true-rating columns and expose that model's
# estimate as 'est_rating'.
test_svd_df = test_pred_df[id_columns + ['svd_rating']].rename(columns={'svd_rating': 'est_rating'})
test_svdpp_df = test_pred_df[id_columns + ['svdpp_rating']].rename(columns={'svdpp_rating': 'est_rating'})
test_knnb_df = test_pred_df[id_columns + ['knn_rating']].rename(columns={'knn_rating': 'est_rating'})
test_slope_df = test_pred_df[id_columns + ['slopeone_rating']].rename(columns={'slopeone_rating': 'est_rating'})
test_bonly_df = test_pred_df[id_columns + ['baseline_rating']].rename(columns={'baseline_rating': 'est_rating'})

print("7")
test_pred_df.to_csv('test_prediction_HP.csv')
test_svd_df.to_csv('test_predictions_svd.csv')
test_svdpp_df.to_csv('test_predictions_svdpp.csv')
test_knnb_df.to_csv('test_predictions_knnb.csv')
test_slope_df.to_csv('test_predictions_slope.csv')
test_bonly_df.to_csv('test_predictions_bonly.csv')
The ratings from individual models are combined using a weighted linear combination to form a resultant rating. This method helps overcome the shortcomings of individual method. \\\\\n", 24 | "The ratings are those generated using surprise library." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "metadata": { 30 | "id": "fRRCvSdKBeX_", 31 | "colab_type": "code", 32 | "colab": {} 33 | }, 34 | "source": [ 35 | "import pandas as pd\n", 36 | "import numpy as np\n", 37 | "import math" 38 | ], 39 | "execution_count": null, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "NBUye_PbsZdg", 46 | "colab_type": "code", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/", 49 | "height": 195 50 | }, 51 | "outputId": "b9048120-dea6-4485-f0b9-118915146f4b" 52 | }, 53 | "source": [ 54 | "pred_data = pd.read_csv('test_prediction_HP.csv')\n", 55 | "pred_data.head() " 56 | ], 57 | "execution_count": null, 58 | "outputs": [ 59 | { 60 | "output_type": "execute_result", 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
Unnamed: 0uidiidog_ratingsvd_ratingknn_ratingsvdpp_ratingslopeone_ratingbaseline_rating
006101639813.53.5716373.6032563.5385273.5010783.603256
116101623503.53.4300783.5172003.3235702.7528713.601820
226101615824.03.7157224.0250553.8368454.2531103.760107
336101590933.03.8891873.7171443.4994363.7372763.728456
446101567264.53.2093413.8632983.0042461.8500293.439723
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " Unnamed: 0 uid iid ... svdpp_rating slopeone_rating baseline_rating\n", 159 | "0 0 610 163981 ... 3.538527 3.501078 3.603256\n", 160 | "1 1 610 162350 ... 3.323570 2.752871 3.601820\n", 161 | "2 2 610 161582 ... 3.836845 4.253110 3.760107\n", 162 | "3 3 610 159093 ... 3.499436 3.737276 3.728456\n", 163 | "4 4 610 156726 ... 3.004246 1.850029 3.439723\n", 164 | "\n", 165 | "[5 rows x 9 columns]" 166 | ] 167 | }, 168 | "metadata": { 169 | "tags": [] 170 | }, 171 | "execution_count": 2 172 | } 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "metadata": { 178 | "id": "Uc2p2jUlSLJC", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "source": [ 183 | "# pred_data = pred_data.drop(169639)" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "a9zxxdMpRSSv", 192 | "colab_type": "code", 193 | "colab": { 194 | "base_uri": "https://localhost:8080/", 195 | "height": 34 196 | }, 197 | "outputId": "6513a5da-0d2d-4f34-80cf-4d006d57df72" 198 | }, 199 | "source": [ 200 | "# num of rows:\n", 201 | "T = pred_data.shape[0]\n", 202 | "print(T)" 203 | ], 204 | "execution_count": null, 205 | "outputs": [ 206 | { 207 | "output_type": "stream", 208 | "text": [ 209 | "20168\n" 210 | ], 211 | "name": "stdout" 212 | } 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "metadata": { 218 | "id": "kqfVFGZ8sxLg", 219 | "colab_type": "code", 220 | "colab": {} 221 | }, 222 | "source": [ 223 | "svd_wt = 0.05\n", 224 | "knn_wt = 0.6\n", 225 | "svdpp_wt = 0.4\n", 226 | "slopeone_wt = 0\n", 227 | "baseline_wt = 0" 228 | ], 229 | "execution_count": null, 230 | "outputs": [] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "metadata": { 235 | "id": "5lWF0bq2OhV9", 236 | "colab_type": "code", 237 | "colab": { 238 | "base_uri": "https://localhost:8080/", 239 | "height": 50 240 | }, 241 | "outputId": "04f216b1-c95a-400a-9599-2c1044bfcaad" 242 | }, 243 | 
"source": [ 244 | "rmse = ((pred_data.og_rating - pred_data.knn_rating) ** 2).mean() ** .5\n", 245 | "print(rmse)\n", 246 | "mae = (((pred_data.og_rating - pred_data.knn_rating) ** 2) ** .5).mean()\n", 247 | "print(mae)" 248 | ], 249 | "execution_count": null, 250 | "outputs": [ 251 | { 252 | "output_type": "stream", 253 | "text": [ 254 | "0.8527048407985283\n", 255 | "0.64818310503639\n" 256 | ], 257 | "name": "stdout" 258 | } 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "metadata": { 264 | "id": "oxcquF0lQOVa", 265 | "colab_type": "code", 266 | "colab": { 267 | "base_uri": "https://localhost:8080/", 268 | "height": 50 269 | }, 270 | "outputId": "cfe99592-5a54-4ade-c80f-a5f58817f727" 271 | }, 272 | "source": [ 273 | "rmse = ((pred_data.og_rating - pred_data.svdpp_rating) ** 2).mean() ** .5\n", 274 | "print(rmse)\n", 275 | "mae = (((pred_data.og_rating - pred_data.svdpp_rating) ** 2) ** .5).mean()\n", 276 | "print(mae)" 277 | ], 278 | "execution_count": null, 279 | "outputs": [ 280 | { 281 | "output_type": "stream", 282 | "text": [ 283 | "0.8668435463304792\n", 284 | "0.6611243052231001\n" 285 | ], 286 | "name": "stdout" 287 | } 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "metadata": { 293 | "id": "IEcpZCbBLTCS", 294 | "colab_type": "code", 295 | "colab": { 296 | "base_uri": "https://localhost:8080/", 297 | "height": 50 298 | }, 299 | "outputId": "a9b0788a-8986-487a-ecae-9d1583d3d51b" 300 | }, 301 | "source": [ 302 | "sqr_sum = 0\n", 303 | "abs_sum = 0\n", 304 | "\n", 305 | "for ind, row in pred_data.iterrows():\n", 306 | " org_r = row['og_rating']\n", 307 | " pred_r = svd_wt*row['svd_rating'] + knn_wt*row['knn_rating'] + svdpp_wt*row['svdpp_rating'] + slopeone_wt*row['slopeone_rating'] + baseline_wt*row['baseline_rating']\n", 308 | " diff = np.abs(org_r - pred_r)\n", 309 | " # print(diff)\n", 310 | " abs_sum += diff\n", 311 | " sqr_sum += diff**2\n", 312 | "\n", 313 | "rmse = np.sqrt(sqr_sum/T)\n", 314 | "print(\"RMSE\", rmse)\n", 315 | 
"mae = abs_sum/T\n", 316 | "print(\"MAE\", mae)" 317 | ], 318 | "execution_count": null, 319 | "outputs": [ 320 | { 321 | "output_type": "stream", 322 | "text": [ 323 | "RMSE 0.8440081164615088\n", 324 | "MAE 0.6426598370928285\n" 325 | ], 326 | "name": "stdout" 327 | } 328 | ] 329 | } 330 | ] 331 | } -------------------------------------------------------------------------------- /Code/surprise_model_predictions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "surprise_model_predictions.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "fi8oBmWXD1rA", 20 | "colab_type": "code", 21 | "colab": { 22 | "base_uri": "https://localhost:8080/", 23 | "height": 118 24 | }, 25 | "outputId": "d86444ed-d23b-4e76-b327-9d766fd375f4" 26 | }, 27 | "source": [ 28 | "!pip install surprise" 29 | ], 30 | "execution_count": null, 31 | "outputs": [ 32 | { 33 | "output_type": "stream", 34 | "text": [ 35 | "Requirement already satisfied: surprise in /usr/local/lib/python3.6/dist-packages (0.1)\n", 36 | "Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.6/dist-packages (from surprise) (1.1.0)\n", 37 | "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.12.0)\n", 38 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (0.14.1)\n", 39 | "Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.18.2)\n", 40 | "Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.4.1)\n" 41 | ], 42 | 
"name": "stdout" 43 | } 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "id": "J9FfIKsk0bDJ", 50 | "colab_type": "code", 51 | "colab": {} 52 | }, 53 | "source": [ 54 | "from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader\n", 55 | "from surprise import Dataset\n", 56 | "from surprise.model_selection import cross_validate\n", 57 | "from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore\n", 58 | "from surprise import accuracy\n", 59 | "from surprise.model_selection import train_test_split" 60 | ], 61 | "execution_count": null, 62 | "outputs": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "metadata": { 67 | "id": "mCZrwlMiOZPg", 68 | "colab_type": "code", 69 | "colab": {} 70 | }, 71 | "source": [ 72 | "import pandas as pd\n", 73 | "import numpy as np" 74 | ], 75 | "execution_count": null, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "olgICu7ZYyM7", 82 | "colab_type": "code", 83 | "colab": { 84 | "base_uri": "https://localhost:8080/", 85 | "height": 67 86 | }, 87 | "outputId": "1c8bb714-5fe1-497a-9344-130c4fbb91ef" 88 | }, 89 | "source": [ 90 | "# Load the movielens-1M dataset\n", 91 | "data = Dataset.load_builtin('ml-1m')" 92 | ], 93 | "execution_count": null, 94 | "outputs": [ 95 | { 96 | "output_type": "stream", 97 | "text": [ 98 | "Dataset ml-1m could not be found. Do you want to download it? [Y/n] y\n", 99 | "Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...\n", 100 | "Done! 
Dataset ml-1m has been saved to /root/.surprise_data/ml-1m\n" 101 | ], 102 | "name": "stdout" 103 | } 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "metadata": { 109 | "id": "DnLayS6VaXZL", 110 | "colab_type": "code", 111 | "colab": {} 112 | }, 113 | "source": [ 114 | "# sample random trainset and testset\n", 115 | "# test set is made of 20% of the ratings.\n", 116 | "trainset, testset = train_test_split(data, test_size=.20)" 117 | ], 118 | "execution_count": null, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "AC2Mt8xUyccA", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "source": [ 129 | "def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):\n", 130 | " reader = Reader(rating_scale=(0, 5))\n", 131 | " trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)\n", 132 | " testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)\n", 133 | " trainset = trainset.construct_trainset(trainset.raw_ratings)\n", 134 | " testset = testset.construct_testset(testset.raw_ratings)\n", 135 | " return trainset, testset" 136 | ], 137 | "execution_count": null, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "metadata": { 143 | "id": "dHBcLq3eyi0T", 144 | "colab_type": "code", 145 | "colab": {} 146 | }, 147 | "source": [ 148 | "file_path_train = 'training_data.csv'\n", 149 | "file_path_test = 'testing_data.csv'\n", 150 | "traindf = pd.read_csv(file_path_train)\n", 151 | "testdf = pd.read_csv(file_path_test)\n", 152 | "trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)" 153 | ], 154 | "execution_count": null, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "AlitWSrNb2wZ", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "def recommendation(algo, trainset, testset):\n", 166 | " # Train the 
algorithm on the trainset, and predict ratings for the testset\n", 167 | " algo.fit(trainset)\n", 168 | "\n", 169 | " # Predictions on testing set\n", 170 | " test_predictions = algo.test(testset)\n", 171 | " test_rmse = accuracy.rmse(test_predictions)\n", 172 | " test_mae = accuracy.mae(test_predictions)\n", 173 | " \n", 174 | " return test_rmse, test_mae, test_predictions" 175 | ], 176 | "execution_count": null, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "9ZblN_7unqoU", 183 | "colab_type": "code", 184 | "colab": {} 185 | }, 186 | "source": [ 187 | "# results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=False)" 188 | ], 189 | "execution_count": null, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "Iv9GSCQx24RI", 196 | "colab_type": "text" 197 | }, 198 | "source": [ 199 | "#### Experimenting" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "E777XIBI26SQ", 206 | "colab_type": "code", 207 | "colab": { 208 | "base_uri": "https://localhost:8080/", 209 | "height": 84 210 | }, 211 | "outputId": "b5629514-2562-4eda-b44f-67cfbfa18a8a" 212 | }, 213 | "source": [ 214 | "print('Using ALS')\n", 215 | "bsl_options = {'method': 'als',\n", 216 | " 'n_epochs': 5,\n", 217 | " 'reg_u': 12,\n", 218 | " 'reg_i': 5\n", 219 | " }\n", 220 | "algo = BaselineOnly(bsl_options=bsl_options)\n", 221 | "test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)" 222 | ], 223 | "execution_count": null, 224 | "outputs": [ 225 | { 226 | "output_type": "stream", 227 | "text": [ 228 | "Using ALS\n", 229 | "Estimating biases using als...\n", 230 | "RMSE: 0.8677\n", 231 | "MAE: 0.6659\n" 232 | ], 233 | "name": "stdout" 234 | } 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "luHqcF-H30jl", 241 | "colab_type": "code", 242 | "colab": { 243 | "base_uri": "https://localhost:8080/", 244 | 
"height": 84 245 | }, 246 | "outputId": "a96a611a-eab4-49ee-e34d-c99c847b584f" 247 | }, 248 | "source": [ 249 | "print('Using SGD')\n", 250 | "# bsl_options = {'method': 'sgd',\n", 251 | "# 'learning_rate': .00005,\n", 252 | "# }\n", 253 | "algo = BaselineOnly() # bsl_options=bsl_options\n", 254 | "test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)" 255 | ], 256 | "execution_count": null, 257 | "outputs": [ 258 | { 259 | "output_type": "stream", 260 | "text": [ 261 | "Using SGD\n", 262 | "Estimating biases using als...\n", 263 | "RMSE: 0.8735\n", 264 | "MAE: 0.6718\n" 265 | ], 266 | "name": "stdout" 267 | } 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": { 273 | "id": "gBCl6LOoBPgQ", 274 | "colab_type": "text" 275 | }, 276 | "source": [ 277 | "##### Calculating predictions for the top methods:" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "metadata": { 283 | "id": "KuTTB-6Th8ZN", 284 | "colab_type": "code", 285 | "colab": { 286 | "base_uri": "https://localhost:8080/", 287 | "height": 101 288 | }, 289 | "outputId": "a202ac5a-0dc8-4a9b-9349-847ce601c9fc" 290 | }, 291 | "source": [ 292 | "# KNNBaseline\n", 293 | "\n", 294 | "algo = KNNBaseline()\n", 295 | "test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(algo, trainset, testset)" 296 | ], 297 | "execution_count": null, 298 | "outputs": [ 299 | { 300 | "output_type": "stream", 301 | "text": [ 302 | "Estimating biases using als...\n", 303 | "Computing the msd similarity matrix...\n", 304 | "Done computing similarity matrix.\n", 305 | "RMSE: 0.8763\n", 306 | "MAE: 0.6660\n" 307 | ], 308 | "name": "stdout" 309 | } 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "metadata": { 315 | "id": "ndRC8sVBBoje", 316 | "colab_type": "code", 317 | "colab": { 318 | "base_uri": "https://localhost:8080/", 319 | "height": 50 320 | }, 321 | "outputId": "8f1c4945-631c-4a2c-e7ef-92a13db48983" 322 | }, 323 | "source": [ 324 | "# SlopeOne\n", 325 | "\n", 326 | 
"algo = SlopeOne()\n", 327 | "test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(algo, trainset, testset)" 328 | ], 329 | "execution_count": null, 330 | "outputs": [ 331 | { 332 | "output_type": "stream", 333 | "text": [ 334 | "RMSE: 0.9070\n", 335 | "MAE: 0.7145\n" 336 | ], 337 | "name": "stdout" 338 | } 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "INAgGkTFBxlT", 345 | "colab_type": "code", 346 | "colab": { 347 | "base_uri": "https://localhost:8080/", 348 | "height": 50 349 | }, 350 | "outputId": "0e9a390a-e87d-4f55-a97e-6284f7348074" 351 | }, 352 | "source": [ 353 | "# SVD\n", 354 | "\n", 355 | "algo = SVD()\n", 356 | "test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)" 357 | ], 358 | "execution_count": null, 359 | "outputs": [ 360 | { 361 | "output_type": "stream", 362 | "text": [ 363 | "RMSE: 0.8743\n", 364 | "MAE: 0.6858\n" 365 | ], 366 | "name": "stdout" 367 | } 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "metadata": { 373 | "id": "uJCkdey1B02t", 374 | "colab_type": "code", 375 | "colab": { 376 | "base_uri": "https://localhost:8080/", 377 | "height": 50 378 | }, 379 | "outputId": "662ecf2e-f6b3-4f46-d55e-40c825c9009b" 380 | }, 381 | "source": [ 382 | "# SVDpp\n", 383 | "\n", 384 | "algo = SVDpp()\n", 385 | "test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)" 386 | ], 387 | "execution_count": null, 388 | "outputs": [ 389 | { 390 | "output_type": "stream", 391 | "text": [ 392 | "RMSE: 0.8697\n", 393 | "MAE: 0.6643\n" 394 | ], 395 | "name": "stdout" 396 | } 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "metadata": { 402 | "id": "nprC9tRcymxk", 403 | "colab_type": "code", 404 | "colab": { 405 | "base_uri": "https://localhost:8080/", 406 | "height": 67 407 | }, 408 | "outputId": "8b64d3ec-8a8b-4963-8a0f-54ef0ae090c4" 409 | }, 410 | "source": [ 411 | "# BaselineOnly()\n", 412 | "\n", 413 | "algo = 
BaselineOnly()\n", 414 | "test_base_rmse, test_base_mae, test_base_pred = recommendation(algo, trainset, testset)" 415 | ], 416 | "execution_count": null, 417 | "outputs": [ 418 | { 419 | "output_type": "stream", 420 | "text": [ 421 | "Estimating biases using als...\n", 422 | "RMSE: 0.8735\n", 423 | "MAE: 0.6718\n" 424 | ], 425 | "name": "stdout" 426 | } 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "metadata": { 432 | "id": "5OqtsT-5MPAh", 433 | "colab_type": "code", 434 | "colab": {} 435 | }, 436 | "source": [ 437 | "test_pred_df = pd.DataFrame(columns= ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating', 'baseline_rating'])" 438 | ], 439 | "execution_count": null, 440 | "outputs": [] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "metadata": { 445 | "id": "ExyS3zHlzxsP", 446 | "colab_type": "code", 447 | "colab": { 448 | "base_uri": "https://localhost:8080/", 449 | "height": 34 450 | }, 451 | "outputId": "3e7f502e-2286-4806-cf93-217aa64a4b08" 452 | }, 453 | "source": [ 454 | "num_test = len(test_base_pred)\n", 455 | "print(num_test)" 456 | ], 457 | "execution_count": null, 458 | "outputs": [ 459 | { 460 | "output_type": "stream", 461 | "text": [ 462 | "200042\n" 463 | ], 464 | "name": "stdout" 465 | } 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": { 471 | "id": "qHYtUbMyrOA9", 472 | "colab_type": "text" 473 | }, 474 | "source": [ 475 | "##### Storing testing set predictions:" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "metadata": { 481 | "id": "YSPLXRAgzfka", 482 | "colab_type": "code", 483 | "colab": {} 484 | }, 485 | "source": [ 486 | "for i in range(num_test): \n", 487 | " svd = test_svd_pred[i]\n", 488 | " slopeone = test_slopeone_pred[i]\n", 489 | " knn = test_knn_pred[i]\n", 490 | " svdpp = test_svdpp_pred[i]\n", 491 | " baseline = test_base_pred[i]\n", 492 | " df = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, svdpp.est, slopeone.est, 
baseline.est]], columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating','baseline_rating'])\n", 493 | " # print(df)\n", 494 | " test_pred_df = pd.concat([df, test_pred_df], ignore_index=True)" 495 | ], 496 | "execution_count": null, 497 | "outputs": [] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "metadata": { 502 | "id": "fJdb0S-A5PiX", 503 | "colab_type": "code", 504 | "colab": { 505 | "base_uri": "https://localhost:8080/", 506 | "height": 402 507 | }, 508 | "outputId": "7e1bbd47-57f4-464e-ea62-95cda20ad0b5" 509 | }, 510 | "source": [ 511 | "test_pred_df" 512 | ], 513 | "execution_count": null, 514 | "outputs": [ 515 | { 516 | "output_type": "execute_result", 517 | "data": { 518 | "text/html": [ 519 | "
\n", 520 | "\n", 533 | "\n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | "
uidiidog_ratingsvd_ratingknn_ratingsvdpp_ratingslopeone_ratingbaseline_rating
069527914.03.5076853.8158403.9366854.2407114.146045
1601636683.03.4048773.5579223.6589793.3901323.442060
2548212215.04.6224524.4916654.4513634.6690424.554867
3338929594.03.8999923.2175744.2352803.5593923.450094
443036084.04.0937494.2504974.7574544.2827074.180708
...........................
200037144734124.02.6789373.4126083.3098913.1921293.238168
20003830133964.04.2925834.2283404.5946474.1281574.114891
20003998439273.03.5376463.4460793.4869743.5142103.475889
200040467223694.02.6386342.8824402.6767852.7424152.817915
200041523435565.03.9702033.6566313.9884563.7799913.712596
\n", 671 | "

200042 rows × 8 columns

\n", 672 | "
" 673 | ], 674 | "text/plain": [ 675 | " uid iid og_rating ... svdpp_rating slopeone_rating baseline_rating\n", 676 | "0 695 2791 4.0 ... 3.936685 4.240711 4.146045\n", 677 | "1 6016 3668 3.0 ... 3.658979 3.390132 3.442060\n", 678 | "2 5482 1221 5.0 ... 4.451363 4.669042 4.554867\n", 679 | "3 3389 2959 4.0 ... 4.235280 3.559392 3.450094\n", 680 | "4 4303 608 4.0 ... 4.757454 4.282707 4.180708\n", 681 | "... ... ... ... ... ... ... ...\n", 682 | "200037 1447 3412 4.0 ... 3.309891 3.192129 3.238168\n", 683 | "200038 301 3396 4.0 ... 4.594647 4.128157 4.114891\n", 684 | "200039 984 3927 3.0 ... 3.486974 3.514210 3.475889\n", 685 | "200040 4672 2369 4.0 ... 2.676785 2.742415 2.817915\n", 686 | "200041 5234 3556 5.0 ... 3.988456 3.779991 3.712596\n", 687 | "\n", 688 | "[200042 rows x 8 columns]" 689 | ] 690 | }, 691 | "metadata": { 692 | "tags": [] 693 | }, 694 | "execution_count": 20 695 | } 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "metadata": { 701 | "id": "tSwp06K6JClS", 702 | "colab_type": "code", 703 | "colab": {} 704 | }, 705 | "source": [ 706 | "test_pred_df.to_csv('test_prediction.csv')" 707 | ], 708 | "execution_count": null, 709 | "outputs": [] 710 | } 711 | ] 712 | } -------------------------------------------------------------------------------- /Code/cold_start_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "cold_start_analysis.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "S5rY5TFX_Fzq", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "#### Cold Start Analysis:\n", 24 | "\n", 25 | "This notebook analyses the performance of different approaches in case of a new user or a user with less number of 
interactions with the system, namely the cold start problem. \\\\\n", 26 | "We compute the rmse and mae for those customers who have rated fewer than 18 movies and so on. \\\\\n", 27 | "We also observe the performance of approaches for customers who have rated more than 1000 movies. " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "l6H9h87h_DXr", 34 | "colab_type": "code", 35 | "colab": {} 36 | }, 37 | "source": [ 38 | "!pip install surprise" 39 | ], 40 | "execution_count": null, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "G8QgQWPZ3kFu", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 | "source": [ 51 | "import pickle\n", 52 | "import os\n", 53 | "\n", 54 | "import pandas as pd\n", 55 | "\n", 56 | "from surprise import SVD, SVDpp\n", 57 | "from surprise import KNNBasic, KNNBaseline, BaselineOnly\n", 58 | "from surprise import Dataset \n", 59 | "from surprise import Reader \n", 60 | "from surprise import dump\n", 61 | "from surprise.accuracy import rmse" 62 | ], 63 | "execution_count": null, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "Y3nN4GjS3sjT", 70 | "colab_type": "code", 71 | "colab": {} 72 | }, 73 | "source": [ 74 | "def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):\n", 75 | " reader = Reader(rating_scale=(0, 5))\n", 76 | " trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)\n", 77 | " testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)\n", 78 | " trainset = trainset.construct_trainset(trainset.raw_ratings)\n", 79 | " testset = testset.construct_testset(testset.raw_ratings)\n", 80 | " return trainset, testset" 81 | ], 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "z6OJ9U-E3zQP", 89 | "colab_type": "code", 90 | "colab": {} 91 | }, 92 | "source": [ 93
| "file_path_train = 'training_data.csv'\n", 94 | "file_path_test = 'testing_data.csv'\n", 95 | "traindf = pd.read_csv(file_path_train)\n", 96 | "testdf = pd.read_csv(file_path_test)\n", 97 | "trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "-aF-xIYxX4aB", 106 | "colab_type": "code", 107 | "colab": { 108 | "base_uri": "https://localhost:8080/", 109 | "height": 195 110 | }, 111 | "outputId": "d1aa662c-301c-4d5f-cedf-23ed3984f02e" 112 | }, 113 | "source": [ 114 | "traindf.head()" 115 | ], 116 | "execution_count": null, 117 | "outputs": [ 118 | { 119 | "output_type": "execute_result", 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
userIdmovieIdratingtimestampgenrestag
0114.0964982703['Adventure', 'Animation', 'Children', 'Comedy...[]
1164.0964982224['Action', 'Crime', 'Thriller'][]
21475.0964983815['Mystery', 'Thriller'][]
31505.0964982931['Crime', 'Mystery', 'Thriller'][]
41703.0964982400['Action', 'Comedy', 'Horror', 'Thriller'][]
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " userId movieId ... genres tag\n", 200 | "0 1 1 ... ['Adventure', 'Animation', 'Children', 'Comedy... []\n", 201 | "1 1 6 ... ['Action', 'Crime', 'Thriller'] []\n", 202 | "2 1 47 ... ['Mystery', 'Thriller'] []\n", 203 | "3 1 50 ... ['Crime', 'Mystery', 'Thriller'] []\n", 204 | "4 1 70 ... ['Action', 'Comedy', 'Horror', 'Thriller'] []\n", 205 | "\n", 206 | "[5 rows x 6 columns]" 207 | ] 208 | }, 209 | "metadata": { 210 | "tags": [] 211 | }, 212 | "execution_count": 5 213 | } 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "metadata": { 219 | "id": "TzdYPykH4DMR", 220 | "colab_type": "code", 221 | "colab": { 222 | "base_uri": "https://localhost:8080/", 223 | "height": 50 224 | }, 225 | "outputId": "91278275-9801-4593-f78c-3dc9ca7caf08" 226 | }, 227 | "source": [ 228 | "algo_svd = SVD() \n", 229 | "algo_svdpp = SVDpp() \n", 230 | "algo_knn = KNNBasic()\n", 231 | "\n", 232 | "\n", 233 | "algo_svd.fit(trainset) \n", 234 | "predictions_svd = algo_svd.test(testset)\n", 235 | "\n", 236 | "algo_svdpp.fit(trainset) \n", 237 | "predictions_svdpp = algo_svdpp.test(testset)\n", 238 | "\n", 239 | "algo_knn.fit(trainset)\n", 240 | "predictions_knn = algo_knn.test(testset)\n", 241 | "\n", 242 | "# rmse(predictions_svd)\n", 243 | "# rmse(predictions_knn) \n", 244 | "\n", 245 | "dump.dump('./dump_SVD', predictions_svd, algo_svd)\n", 246 | "dump.dump('./dump_SVDpp', predictions_svdpp, algo_svdpp)\n", 247 | "dump.dump('./dump_KNN', predictions_knn, algo_knn)" 248 | ], 249 | "execution_count": null, 250 | "outputs": [ 251 | { 252 | "output_type": "stream", 253 | "text": [ 254 | "Computing the msd similarity matrix...\n", 255 | "Done computing similarity matrix.\n" 256 | ], 257 | "name": "stdout" 258 | } 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "metadata": { 264 | "id": "IWpBO-jw4gR4", 265 | "colab_type": "code", 266 | "colab": {} 267 | }, 268 | "source": [ 269 | "df_svd = pd.DataFrame(predictions_svd, columns=['uid', 
'iid', 'rui', 'est', 'details']) \n", 270 | "df_svdpp = pd.DataFrame(predictions_svdpp, columns=['uid', 'iid', 'rui', 'est', 'details']) \n", 271 | "df_knn = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'rui', 'est', 'details']) " 272 | ], 273 | "execution_count": null, 274 | "outputs": [] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "id": "5ytiPn_6Z4D5", 280 | "colab_type": "code", 281 | "colab": { 282 | "base_uri": "https://localhost:8080/", 283 | "height": 67 284 | }, 285 | "outputId": "c172233e-b73b-4226-faea-c9505e9c0b09" 286 | }, 287 | "source": [ 288 | "sim_options = {'name': 'pearson_baseline',\n", 289 | " 'user_based': False # compute similarities between items\n", 290 | " }\n", 291 | "# algo = KNNBaseline(sim_options=sim_options)\n", 292 | "algo_knnbaseline = KNNBaseline(sim_options=sim_options)\n", 293 | "algo_knnbaseline.fit(trainset)\n", 294 | "predictions_knnbaseline = algo_knnbaseline.test(testset)" 295 | ], 296 | "execution_count": null, 297 | "outputs": [ 298 | { 299 | "output_type": "stream", 300 | "text": [ 301 | "Estimating biases using als...\n", 302 | "Computing the pearson_baseline similarity matrix...\n", 303 | "Done computing similarity matrix.\n" 304 | ], 305 | "name": "stdout" 306 | } 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "metadata": { 312 | "id": "-qpAZxicab7y", 313 | "colab_type": "code", 314 | "colab": {} 315 | }, 316 | "source": [ 317 | "df_knnbaseline = pd.DataFrame(predictions_knnbaseline, columns=['uid', 'iid', 'rui', 'est', 'details']) \n", 318 | "df_knnbaseline['err'] = abs(df_knnbaseline.est - df_knnbaseline.rui)\n", 319 | "df_knnbaseline['sqr_err'] = (df_knnbaseline.est - df_knnbaseline.rui)**2" 320 | ], 321 | "execution_count": null, 322 | "outputs": [] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "metadata": { 327 | "id": "yIlRty-X4z2T", 328 | "colab_type": "code", 329 | "colab": {} 330 | }, 331 | "source": [ 332 | "df_svd['err'] = abs(df_svd.est - df_svd.rui)\n", 333 
| "df_svdpp['err'] = abs(df_svdpp.est - df_svdpp.rui)\n", 334 | "df_knn['err'] = abs(df_knn.est - df_knn.rui)" 335 | ], 336 | "execution_count": null, 337 | "outputs": [] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "metadata": { 342 | "id": "VdC0IyDxY4xB", 343 | "colab_type": "code", 344 | "colab": {} 345 | }, 346 | "source": [ 347 | "df_svd['sqr_err'] = (df_svd.est - df_svd.rui)**2\n", 348 | "df_svdpp['sqr_err'] = (df_svdpp.est - df_svdpp.rui)**2\n", 349 | "df_knn['sqr_err'] = (df_knn.est - df_knn.rui)**2" 350 | ], 351 | "execution_count": null, 352 | "outputs": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "metadata": { 357 | "id": "t4gOt7SHcVnO", 358 | "colab_type": "code", 359 | "colab": { 360 | "base_uri": "https://localhost:8080/", 361 | "height": 34 362 | }, 363 | "outputId": "21a9276e-f43b-4dfc-9afb-65987e0cd1f1" 364 | }, 365 | "source": [ 366 | "algo_baselineonly = BaselineOnly()\n", 367 | "algo_baselineonly.fit(trainset)\n", 368 | "predictions_baselineonly = algo_baselineonly.test(testset)\n", 369 | "\n", 370 | "df_baselineonly = pd.DataFrame(predictions_baselineonly, columns=['uid', 'iid', 'rui', 'est', 'details']) \n", 371 | "df_baselineonly['err'] = abs(df_baselineonly.est - df_baselineonly.rui)\n", 372 | "df_baselineonly['sqr_err'] = (df_baselineonly.est - df_baselineonly.rui)**2\n", 373 | "df_baselineonly['Iu'] = df_baselineonly.uid.apply(get_Iu)" 374 | ], 375 | "execution_count": null, 376 | "outputs": [ 377 | { 378 | "output_type": "stream", 379 | "text": [ 380 | "Estimating biases using als...\n" 381 | ], 382 | "name": "stdout" 383 | } 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "metadata": { 389 | "id": "falRCQt3dYFC", 390 | "colab_type": "code", 391 | "colab": { 392 | "base_uri": "https://localhost:8080/", 393 | "height": 67 394 | }, 395 | "outputId": "f3e8404f-77c9-427e-c663-f9ddadd0fb11" 396 | }, 397 | "source": [ 398 | "sim_options = {'name': 'pearson_baseline',\n", 399 | " 'user_based': True # compute similarities 
between users\n", 400 | " }\n", 401 | "algo_knnbaseline_user = KNNBaseline(sim_options=sim_options)\n", 402 | "algo_knnbaseline_user.fit(trainset)\n", 403 | "predictions_knnbaseline_user = algo_knnbaseline_user.test(testset)\n", 404 | "\n", 405 | "df_knn_user = pd.DataFrame(predictions_knnbaseline_user, columns=['uid', 'iid', 'rui', 'est', 'details']) \n", 406 | "df_knn_user['err'] = abs(df_knn_user.est - df_knn_user.rui)\n", 407 | "df_knn_user['sqr_err'] = (df_knn_user.est - df_knn_user.rui)**2\n", 408 | "df_knn_user['Iu'] = df_knn_user.uid.apply(get_Iu)" 409 | ], 410 | "execution_count": null, 411 | "outputs": [ 412 | { 413 | "output_type": "stream", 414 | "text": [ 415 | "Estimating biases using als...\n", 416 | "Computing the pearson_baseline similarity matrix...\n", 417 | "Done computing similarity matrix.\n" 418 | ], 419 | "name": "stdout" 420 | } 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "metadata": { 426 | "id": "SSoLuqrV65pK", 427 | "colab_type": "code", 428 | "colab": { 429 | "base_uri": "https://localhost:8080/", 430 | "height": 195 431 | }, 432 | "outputId": "897911ad-5086-4798-914e-58df7da6b068" 433 | }, 434 | "source": [ 435 | "df_svd.head()" 436 | ], 437 | "execution_count": null, 438 | "outputs": [ 439 | { 440 | "output_type": "execute_result", 441 | "data": { 442 | "text/html": [ 443 | "
\n", 444 | "\n", 457 | "\n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | "
uidiidruiestdetailserrIu
0134.04.200548{'was_impossible': False}0.200548186
111635.04.261322{'was_impossible': False}0.738678186
213163.04.024986{'was_impossible': False}1.024986186
313494.04.443186{'was_impossible': False}0.443186186
414414.04.758104{'was_impossible': False}0.758104186
\n", 523 | "
" 524 | ], 525 | "text/plain": [ 526 | " uid iid rui est details err Iu\n", 527 | "0 1 3 4.0 4.200548 {'was_impossible': False} 0.200548 186\n", 528 | "1 1 163 5.0 4.261322 {'was_impossible': False} 0.738678 186\n", 529 | "2 1 316 3.0 4.024986 {'was_impossible': False} 1.024986 186\n", 530 | "3 1 349 4.0 4.443186 {'was_impossible': False} 0.443186 186\n", 531 | "4 1 441 4.0 4.758104 {'was_impossible': False} 0.758104 186" 532 | ] 533 | }, 534 | "metadata": { 535 | "tags": [] 536 | }, 537 | "execution_count": 12 538 | } 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "metadata": { 544 | "id": "gyU3U3mLWG42", 545 | "colab_type": "code", 546 | "colab": {} 547 | }, 548 | "source": [ 549 | "content = pd.read_csv('content_based_genre_ratings.csv')" 550 | ], 551 | "execution_count": null, 552 | "outputs": [] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "metadata": { 557 | "id": "V9pCMloU45Sh", 558 | "colab_type": "code", 559 | "colab": {} 560 | }, 561 | "source": [ 562 | "def get_Iu(uid):\n", 563 | " \"\"\"Return the number of items rated by given user\n", 564 | " \n", 565 | " Args:\n", 566 | " uid: The raw id of the user.\n", 567 | " Returns:\n", 568 | " The number of items rated by the user.\n", 569 | " \"\"\"\n", 570 | " \n", 571 | " try:\n", 572 | " return traindf[traindf['userId'] == uid].shape[0]\n", 573 | " except ValueError: # user was not part of the trainset\n", 574 | " return 0" 575 | ], 576 | "execution_count": null, 577 | "outputs": [] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "metadata": { 582 | "id": "Xaia-Iy2WPYY", 583 | "colab_type": "code", 584 | "colab": {} 585 | }, 586 | "source": [ 587 | "content['Iu'] = content.userId.apply(get_Iu)" 588 | ], 589 | "execution_count": null, 590 | "outputs": [] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "metadata": { 595 | "id": "8a_bM4hsWyHI", 596 | "colab_type": "code", 597 | "colab": {} 598 | }, 599 | "source": [ 600 | "content['err'] = abs(content.pred_rating - content.og_rating)\n", 
601 | "content['sqr_err'] = (content.pred_rating - content.og_rating)**2\n", 602 | "# rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5\n", 603 | "# mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()\n" 604 | ], 605 | "execution_count": null, 606 | "outputs": [] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "metadata": { 611 | "id": "yobQqAZTWd_4", 612 | "colab_type": "code", 613 | "colab": { 614 | "base_uri": "https://localhost:8080/", 615 | "height": 50 616 | }, 617 | "outputId": "5c486dcc-0636-4452-b859-b51d6b2da9af" 618 | }, 619 | "source": [ 620 | "print(\"Content based \",content[content.Iu < 18].err.mean())\n", 621 | "print(\"Content based \",content[content.Iu < 18].sqr_err.mean()** .5)" 622 | ], 623 | "execution_count": null, 624 | "outputs": [ 625 | { 626 | "output_type": "stream", 627 | "text": [ 628 | "Content based 0.7942792057878261\n", 629 | "Content based 1.0584107905057996\n" 630 | ], 631 | "name": "stdout" 632 | } 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "metadata": { 638 | "id": "wz1Pkbzd4-dl", 639 | "colab_type": "code", 640 | "colab": {} 641 | }, 642 | "source": [ 643 | "df_knn['Iu'] = df_knn.uid.apply(get_Iu)\n", 644 | "df_svd['Iu'] = df_svd.uid.apply(get_Iu)\n", 645 | "df_svdpp['Iu'] = df_svdpp.uid.apply(get_Iu)\n", 646 | "df_knnbaseline['Iu'] = df_knnbaseline.uid.apply(get_Iu)" 647 | ], 648 | "execution_count": null, 649 | "outputs": [] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "metadata": { 654 | "id": "q-VorHYTayVQ", 655 | "colab_type": "code", 656 | "colab": { 657 | "base_uri": "https://localhost:8080/", 658 | "height": 134 659 | }, 660 | "outputId": "bb83cb23-6ebd-4ba1-ec31-8a599d8dd4ec" 661 | }, 662 | "source": [ 663 | "print(\"--------------------------MAE-----------------------\")\n", 664 | "print(\"KNN Basic \",df_knn[df_knn.Iu < 18].err.mean())\n", 665 | "print(\"SVD \", df_svd[df_svd.Iu < 18].err.mean())\n", 666 | "print(\"SVDpp 
\", df_svdpp[df_svdpp.Iu < 18].err.mean())\n", 667 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu < 18].err.mean())\n", 668 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu < 18].err.mean() )\n", 669 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu < 18].err.mean() )" 670 | ], 671 | "execution_count": null, 672 | "outputs": [ 673 | { 674 | "output_type": "stream", 675 | "text": [ 676 | "--------------------------MAE-----------------------\n", 677 | "KNN Basic 0.9356541418761788\n", 678 | "SVD 0.8174986369636367\n", 679 | "SVDpp 0.7853538665933238\n", 680 | "KNN Baseline (item-item) 0.7549100058171629\n", 681 | "BaselineOnly 0.828373767989461\n", 682 | "KNN Baseline (user-user) 0.8527037143570998\n" 683 | ], 684 | "name": "stdout" 685 | } 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "metadata": { 691 | "id": "nQOEO64Jf9BE", 692 | "colab_type": "code", 693 | "colab": { 694 | "base_uri": "https://localhost:8080/", 695 | "height": 134 696 | }, 697 | "outputId": "d2da200e-954a-44d4-89c2-890afc4b14e5" 698 | }, 699 | "source": [ 700 | "print(\"--------------------------RMSE-----------------------\")\n", 701 | "print(\"KNN Basic \",df_knn[df_knn.Iu < 18].sqr_err.mean()** .5)\n", 702 | "print(\"SVD \", df_svd[df_svd.Iu < 18].sqr_err.mean()** .5)\n", 703 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu < 18].sqr_err.mean()** .5)\n", 704 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu < 18].sqr_err.mean()** .5)\n", 705 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu < 18].sqr_err.mean()** .5 )\n", 706 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu < 18].sqr_err.mean()** .5)" 707 | ], 708 | "execution_count": null, 709 | "outputs": [ 710 | { 711 | "output_type": "stream", 712 | "text": [ 713 | "--------------------------RMSE-----------------------\n", 714 | "KNN Basic 1.1998253947989697\n", 715 | "SVD 1.0549483774463828\n", 716 | "SVDpp 1.0083634724152428\n", 
717 | "KNN Baseline (item-item) 0.9896562169806813\n", 718 | "BaselineOnly 1.0612306019619604\n", 719 | "KNN Baseline (user-user) 1.1082756354422056\n" 720 | ], 721 | "name": "stdout" 722 | } 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "metadata": { 728 | "id": "xnzmsButgiyw", 729 | "colab_type": "code", 730 | "colab": { 731 | "base_uri": "https://localhost:8080/", 732 | "height": 134 733 | }, 734 | "outputId": "f326c366-38ea-45b8-9133-cf22dcb72358" 735 | }, 736 | "source": [ 737 | "print(\"--------------------------MAE-----------------------\")\n", 738 | "print(\"KNN Basic \",df_knn[df_knn.Iu > 1000].err.mean())\n", 739 | "print(\"SVD \", df_svd[df_svd.Iu > 1000].err.mean())\n", 740 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu > 1000].err.mean())\n", 741 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu > 1000].err.mean())\n", 742 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu > 1000].err.mean() )\n", 743 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu > 1000].err.mean() )" 744 | ], 745 | "execution_count": null, 746 | "outputs": [ 747 | { 748 | "output_type": "stream", 749 | "text": [ 750 | "--------------------------MAE-----------------------\n", 751 | "KNN Basic 0.7118277630004157\n", 752 | "SVD 0.6349197611192368\n", 753 | "SVDpp 0.626063757313411\n", 754 | "KNN Baseline (item-item) 0.6120430789383057\n", 755 | "BaselineOnly 0.6306031032475772\n", 756 | "KNN Baseline (user-user) 0.6330297364319998\n" 757 | ], 758 | "name": "stdout" 759 | } 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "metadata": { 765 | "id": "K1CnsM3mg0wi", 766 | "colab_type": "code", 767 | "colab": { 768 | "base_uri": "https://localhost:8080/", 769 | "height": 134 770 | }, 771 | "outputId": "c79eaa77-b9a5-4d90-d470-9539c3af5858" 772 | }, 773 | "source": [ 774 | "print(\"--------------------------RMSE-----------------------\")\n", 775 | "print(\"KNN Basic \",df_knn[df_knn.Iu > 1000].sqr_err.mean()** .5)\n", 776 | 
"print(\"SVD \", df_svd[df_svd.Iu > 1000].sqr_err.mean()** .5)\n", 777 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu > 1000].sqr_err.mean()** .5)\n", 778 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu > 1000].sqr_err.mean()** .5)\n", 779 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu > 1000].sqr_err.mean()** .5 )\n", 780 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu > 1000].sqr_err.mean()** .5)" 781 | ], 782 | "execution_count": null, 783 | "outputs": [ 784 | { 785 | "output_type": "stream", 786 | "text": [ 787 | "--------------------------RMSE-----------------------\n", 788 | "KNN Basic 0.9174613388905646\n", 789 | "SVD 0.8207944406250214\n", 790 | "SVDpp 0.8136491891525117\n", 791 | "KNN Baseline (item-item) 0.789275629286978\n", 792 | "BaselineOnly 0.799990922710614\n", 793 | "KNN Baseline (user-user) 0.8198697577732832\n" 794 | ], 795 | "name": "stdout" 796 | } 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "metadata": { 802 | "id": "7aWKXObN6uhT", 803 | "colab_type": "code", 804 | "colab": { 805 | "base_uri": "https://localhost:8080/", 806 | "height": 34 807 | }, 808 | "outputId": "2a40f35d-3055-43aa-d508-be83732ad842" 809 | }, 810 | "source": [ 811 | "iid_df = traindf.groupby(['userId'],as_index=False).movieId.count()\n", 812 | "iid_df.movieId.max()" 813 | ], 814 | "execution_count": null, 815 | "outputs": [ 816 | { 817 | "output_type": "execute_result", 818 | "data": { 819 | "text/plain": [ 820 | "2158" 821 | ] 822 | }, 823 | "metadata": { 824 | "tags": [] 825 | }, 826 | "execution_count": 47 827 | } 828 | ] 829 | } 830 | ] 831 | } -------------------------------------------------------------------------------- /Code/movie_era_based_recs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 
| }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.6" 21 | }, 22 | "colab": { 23 | "name": "movie_era_based_recs.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "4Cox5k2AMKsw", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "This notebook uses the content-based approach to include the time period in which the movie was launched. This method personalizes the user's recommendations to include this feature." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "bC1OAh-JMLd7", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "import numpy as np\n", 47 | "import pandas as pd\n", 48 | "import matplotlib.pyplot as plt\n", 49 | "from ast import literal_eval\n", 50 | "import pdb" 51 | ], 52 | "execution_count": null, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "metadata": { 58 | "id": "wLkmHYnGLw5Z", 59 | "colab_type": "code", 60 | "colab": {} 61 | }, 62 | "source": [ 63 | "genre_user_vector = pd.read_csv(\"user_info.csv\")\n", 64 | "genre_user_vector = genre_user_vector[['userId', 'user_vector']]\n", 65 | "\n", 66 | "genre_user_vector['user_vector'] = genre_user_vector['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n", 67 | "genre_user_vector['user_vector'] = genre_user_vector['user_vector'].apply(lambda x: np.asarray(x).astype(float))" 68 | ], 69 | "execution_count": null, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "m4m_b7JbLw5c", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "era_user_vector = pd.read_csv(\"user_era_vector.csv\")\n", 81 | "era_user_vector = 
era_user_vector[['userId', 'user_era_vector']]\n", 82 | "\n", 83 | "era_user_vector['user_era_vector'] = era_user_vector['user_era_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n", 84 | "era_user_vector['user_era_vector'] = era_user_vector['user_era_vector'].apply(lambda x: np.asarray(x).astype(float))" 85 | ], 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "mCq1wzsaLw5f", 93 | "colab_type": "code", 94 | "colab": {} 95 | }, 96 | "source": [ 97 | "merged_user = genre_user_vector.join(era_user_vector['user_era_vector'])" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "Obc2U0jILw5h", 106 | "colab_type": "code", 107 | "colab": {}, 108 | "outputId": "f2a77bef-df14-4981-acba-8bd0fe07b36d" 109 | }, 110 | "source": [ 111 | "merged_user.head()" 112 | ], 113 | "execution_count": null, 114 | "outputs": [ 115 | { 116 | "output_type": "execute_result", 117 | "data": { 118 | "text/html": [ 119 | "
\n", 120 | "\n", 133 | "\n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
userIduser_vectoruser_era_vector
01[4.39189189, 4.65217391, 4.48571429, 4.2676056...[4.63265306, 4.27272727, 4.6, 0.0]
12[4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666...[0.0, 3.83333333, 4.05, 3.85]
23[2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333...[2.45833333, 2.6875, 0.5, 0.0]
34[3.47826087, 4.0, 3.77777778, 3.43902439, 3.53...[4.4375, 3.25663717, 3.32142857, 0.0]
45[3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ...[5.0, 3.55882353, 0.0, 0.0]
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " userId user_vector \\\n", 179 | "0 1 [4.39189189, 4.65217391, 4.48571429, 4.2676056... \n", 180 | "1 2 [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666... \n", 181 | "2 3 [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333... \n", 182 | "3 4 [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53... \n", 183 | "4 5 [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ... \n", 184 | "\n", 185 | " user_era_vector \n", 186 | "0 [4.63265306, 4.27272727, 4.6, 0.0] \n", 187 | "1 [0.0, 3.83333333, 4.05, 3.85] \n", 188 | "2 [2.45833333, 2.6875, 0.5, 0.0] \n", 189 | "3 [4.4375, 3.25663717, 3.32142857, 0.0] \n", 190 | "4 [5.0, 3.55882353, 0.0, 0.0] " 191 | ] 192 | }, 193 | "metadata": { 194 | "tags": [] 195 | }, 196 | "execution_count": 5 197 | } 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "scrolled": false, 204 | "id": "FqxlInOILw5k", 205 | "colab_type": "code", 206 | "colab": {} 207 | }, 208 | "source": [ 209 | "merged_user['final_user_vector'] = merged_user.apply(lambda x: np.concatenate((2*x['user_vector'], x['user_era_vector'])), axis=1)" 210 | ], 211 | "execution_count": null, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "metadata": { 217 | "id": "R-USwe0RLw5m", 218 | "colab_type": "code", 219 | "colab": {}, 220 | "outputId": "df1b11eb-6da3-4070-fbf7-149e619605b7" 221 | }, 222 | "source": [ 223 | "merged_user.head()" 224 | ], 225 | "execution_count": null, 226 | "outputs": [ 227 | { 228 | "output_type": "execute_result", 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | "
userIduser_vectoruser_era_vectorfinal_user_vector
01[4.39189189, 4.65217391, 4.48571429, 4.2676056...[4.63265306, 4.27272727, 4.6, 0.0][8.78378378, 9.30434782, 8.97142858, 8.5352112...
12[4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666...[0.0, 3.83333333, 4.05, 3.85][8.33333334, 0.0, 0.0, 8.4, 0.0, 9.0, 7.733333...
23[2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333...[2.45833333, 2.6875, 0.5, 0.0][5.0, 1.0, 1.0, 1.0, 8.66666666, 1.0, 1.666666...
34[3.47826087, 4.0, 3.77777778, 3.43902439, 3.53...[4.4375, 3.25663717, 3.32142857, 0.0][6.95652174, 8.0, 7.55555556, 6.87804878, 7.06...
45[3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ...[5.0, 3.55882353, 0.0, 0.0][6.33333334, 8.5, 8.0, 6.72727272, 8.0, 6.2, 7...
\n", 293 | "
" 294 | ], 295 | "text/plain": [ 296 | " userId user_vector \\\n", 297 | "0 1 [4.39189189, 4.65217391, 4.48571429, 4.2676056... \n", 298 | "1 2 [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666... \n", 299 | "2 3 [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333... \n", 300 | "3 4 [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53... \n", 301 | "4 5 [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ... \n", 302 | "\n", 303 | " user_era_vector \\\n", 304 | "0 [4.63265306, 4.27272727, 4.6, 0.0] \n", 305 | "1 [0.0, 3.83333333, 4.05, 3.85] \n", 306 | "2 [2.45833333, 2.6875, 0.5, 0.0] \n", 307 | "3 [4.4375, 3.25663717, 3.32142857, 0.0] \n", 308 | "4 [5.0, 3.55882353, 0.0, 0.0] \n", 309 | "\n", 310 | " final_user_vector \n", 311 | "0 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n", 312 | "1 [8.33333334, 0.0, 0.0, 8.4, 0.0, 9.0, 7.733333... \n", 313 | "2 [5.0, 1.0, 1.0, 1.0, 8.66666666, 1.0, 1.666666... \n", 314 | "3 [6.95652174, 8.0, 7.55555556, 6.87804878, 7.06... \n", 315 | "4 [6.33333334, 8.5, 8.0, 6.72727272, 8.0, 6.2, 7... 
" 316 | ] 317 | }, 318 | "metadata": { 319 | "tags": [] 320 | }, 321 | "execution_count": 7 322 | } 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "metadata": { 328 | "id": "0-zBOAM8Lw5q", 329 | "colab_type": "code", 330 | "colab": {} 331 | }, 332 | "source": [ 333 | "movie_genre_vector = pd.read_csv(\"movie_vector.csv\")\n", 334 | "movie_genre_vector = movie_genre_vector[['movieId', 'movie_vector']]\n", 335 | "\n", 336 | "movie_genre_vector['movie_vector'] = movie_genre_vector['movie_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n", 337 | "movie_genre_vector['movie_vector'] = movie_genre_vector['movie_vector'].apply(lambda x: np.asarray(x).astype(float))" 338 | ], 339 | "execution_count": null, 340 | "outputs": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "metadata": { 345 | "id": "4awx_m7pLw5s", 346 | "colab_type": "code", 347 | "colab": {} 348 | }, 349 | "source": [ 350 | "movie_era_vector = pd.read_csv(\"movie_era_vector.csv\")\n", 351 | "movie_era_vector = movie_era_vector[['movieId', 'era_vector']]\n", 352 | "\n", 353 | "movie_era_vector['era_vector'] = movie_era_vector['era_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n", 354 | "movie_era_vector['era_vector'] = movie_era_vector['era_vector'].apply(lambda x: np.asarray(x).astype(float))" 355 | ], 356 | "execution_count": null, 357 | "outputs": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "metadata": { 362 | "id": "v92UsULNLw5u", 363 | "colab_type": "code", 364 | "colab": {} 365 | }, 366 | "source": [ 367 | "merged_movie = movie_genre_vector.join(movie_era_vector['era_vector'])\n", 368 | "merged_movie['final_movie_vector'] = merged_movie.apply(lambda x: np.concatenate((2*x['movie_vector'], x['era_vector'])), axis=1)" 369 | ], 370 | "execution_count": null, 371 | "outputs": [] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "metadata": { 376 | "id": "jHxejQEiLw5w", 377 | "colab_type": "code", 378 | "colab": {}, 379 
| "outputId": "87e2afb0-4ac5-4411-de9e-23d316b3b758" 380 | }, 381 | "source": [ 382 | "merged_movie.head()" 383 | ], 384 | "execution_count": null, 385 | "outputs": [ 386 | { 387 | "output_type": "execute_result", 388 | "data": { 389 | "text/html": [ 390 | "
\n", 391 | "\n", 404 | "\n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | "
movieIdmovie_vectorera_vectorfinal_movie_vector
01[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...[0.0, 1.0, 0.0, 0.0][2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...
12[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...[0.0, 1.0, 0.0, 0.0][2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...
23[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...[0.0, 1.0, 0.0, 0.0][0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...
34[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...[0.0, 1.0, 0.0, 0.0][0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, ...
45[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...[0.0, 1.0, 0.0, 0.0][0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
\n", 452 | "
" 453 | ], 454 | "text/plain": [ 455 | " movieId movie_vector \\\n", 456 | "0 1 [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... \n", 457 | "1 2 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... \n", 458 | "2 3 [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... \n", 459 | "3 4 [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ... \n", 460 | "4 5 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... \n", 461 | "\n", 462 | " era_vector final_movie_vector \n", 463 | "0 [0.0, 1.0, 0.0, 0.0] [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, ... \n", 464 | "1 [0.0, 1.0, 0.0, 0.0] [2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ... \n", 465 | "2 [0.0, 1.0, 0.0, 0.0] [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ... \n", 466 | "3 [0.0, 1.0, 0.0, 0.0] [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, ... \n", 467 | "4 [0.0, 1.0, 0.0, 0.0] [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... " 468 | ] 469 | }, 470 | "metadata": { 471 | "tags": [] 472 | }, 473 | "execution_count": 11 474 | } 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": { 480 | "id": "KF3gzUpwLw5y", 481 | "colab_type": "text" 482 | }, 483 | "source": [ 484 | "## Test" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "metadata": { 490 | "id": "NlP1KEMgLw5z", 491 | "colab_type": "code", 492 | "colab": {}, 493 | "outputId": "96cb9714-bb7e-49c5-84e9-48a99eb3a0f8" 494 | }, 495 | "source": [ 496 | "ratings_test = pd.read_csv(\"testing_data.csv\", converters={\"genres\": literal_eval, \"tag\": literal_eval}) \n", 497 | "ratings_test.head()" 498 | ], 499 | "execution_count": null, 500 | "outputs": [ 501 | { 502 | "output_type": "execute_result", 503 | "data": { 504 | "text/html": [ 505 | "
\n", 506 | "\n", 519 | "\n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | "
userIdmovieIdratingtimestampgenrestag
0134.0964981247[Comedy, Romance][]
111635.0964983650[Action, Romance, Western][]
213163.0964982310[Action, Adventure, Sci-Fi][]
313494.0964982563[Action, Crime, Drama, Thriller][]
414414.0964980868[Comedy][]
\n", 579 | "
" 580 | ], 581 | "text/plain": [ 582 | " userId movieId rating timestamp genres tag\n", 583 | "0 1 3 4.0 964981247 [Comedy, Romance] []\n", 584 | "1 1 163 5.0 964983650 [Action, Romance, Western] []\n", 585 | "2 1 316 3.0 964982310 [Action, Adventure, Sci-Fi] []\n", 586 | "3 1 349 4.0 964982563 [Action, Crime, Drama, Thriller] []\n", 587 | "4 1 441 4.0 964980868 [Comedy] []" 588 | ] 589 | }, 590 | "metadata": { 591 | "tags": [] 592 | }, 593 | "execution_count": 12 594 | } 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "metadata": { 600 | "id": "3X3-GJDNLw51", 601 | "colab_type": "code", 602 | "colab": {} 603 | }, 604 | "source": [ 605 | "ratings_test = pd.read_csv(\"testing_data.csv\", converters={\"genres\": literal_eval, \"tag\": literal_eval}) \n", 606 | "ratings_test.head()\n", 607 | "\n", 608 | "algo_predictions = pd.DataFrame(columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])\n", 609 | "error_count = 0\n", 610 | "for ind, row in ratings_test.iterrows():\n", 611 | " userId = row['userId']\n", 612 | " movieId = row['movieId']\n", 613 | " og_rating = row['rating']\n", 614 | " \n", 615 | " user_vector = merged_user[merged_user['userId'] == int(userId)].final_user_vector.values[0]\n", 616 | " if len(merged_movie[merged_movie['movieId'] == int(movieId)].final_movie_vector.values):\n", 617 | " movie_vector = merged_movie[merged_movie['movieId'] == int(movieId)].final_movie_vector.values[0]\n", 618 | " else:\n", 619 | " error_count += 1\n", 620 | " print(\"Movie vector not found!\", movieId)\n", 621 | " predicted_rating = user_vector*movie_vector\n", 622 | "\n", 623 | " if predicted_rating.any():\n", 624 | " predicted_rating = np.nanmean(np.where(predicted_rating!=0, predicted_rating, np.nan))\n", 625 | " else:\n", 626 | " predicted_rating = 0\n", 627 | "\n", 628 | " row_df = pd.DataFrame([[userId, movieId, user_vector, movie_vector, og_rating, predicted_rating]], \n", 629 | " columns=['userId', 'movieId', 
'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])\n", 630 | " algo_predictions = pd.concat([algo_predictions, row_df], ignore_index=True)" 631 | ], 632 | "execution_count": null, 633 | "outputs": [] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "metadata": { 638 | "id": "spMCRnulLw53", 639 | "colab_type": "code", 640 | "colab": {}, 641 | "outputId": "89e8b4bf-20e2-4cdf-d227-6bfc6fc71934" 642 | }, 643 | "source": [ 644 | "algo_predictions" 645 | ], 646 | "execution_count": null, 647 | "outputs": [ 648 | { 649 | "output_type": "execute_result", 650 | "data": { 651 | "text/html": [ 652 | "
\n", 653 | "\n", 666 | "\n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | "
userIdmovieIduser_vectormovie_vectorog_ratingpred_rating
013[8.78378378, 9.30434782, 8.97142858, 8.5352112...[0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...4.012.892161
11163[8.78378378, 9.30434782, 8.97142858, 8.5352112...[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, ...5.014.134848
21316[8.78378378, 9.30434782, 8.97142858, 8.5352112...[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...3.013.986955
31349[8.78378378, 9.30434782, 8.97142858, 8.5352112...[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ...4.014.707133
41441[8.78378378, 9.30434782, 8.97142858, 8.5352112...[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...4.010.671575
.....................
20163610156726[7.38967136, 7.8490566, 7.34210526, 7.44984802...[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...4.58.852847
20164610159093[7.38967136, 7.8490566, 7.34210526, 7.44984802...[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...3.011.726114
20165610161582[7.38967136, 7.8490566, 7.34210526, 7.44984802...[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ...4.011.335023
20166610162350[7.38967136, 7.8490566, 7.34210526, 7.44984802...[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...3.511.036977
20167610163981[7.38967136, 7.8490566, 7.34210526, 7.44984802...[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...3.58.701610
\n", 780 | "

20168 rows × 6 columns

\n", 781 | "
" 782 | ], 783 | "text/plain": [ 784 | " userId movieId user_vector \\\n", 785 | "0 1 3 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n", 786 | "1 1 163 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n", 787 | "2 1 316 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n", 788 | "3 1 349 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n", 789 | "4 1 441 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n", 790 | "... ... ... ... \n", 791 | "20163 610 156726 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n", 792 | "20164 610 159093 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n", 793 | "20165 610 161582 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n", 794 | "20166 610 162350 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n", 795 | "20167 610 163981 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n", 796 | "\n", 797 | " movie_vector og_rating \\\n", 798 | "0 [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ... 4.0 \n", 799 | "1 [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, ... 5.0 \n", 800 | "2 [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... 3.0 \n", 801 | "3 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ... 4.0 \n", 802 | "4 [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 4.0 \n", 803 | "... ... ... \n", 804 | "20163 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 4.5 \n", 805 | "20164 [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... 3.0 \n", 806 | "20165 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ... 4.0 \n", 807 | "20166 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... 3.5 \n", 808 | "20167 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 3.5 \n", 809 | "\n", 810 | " pred_rating \n", 811 | "0 12.892161 \n", 812 | "1 14.134848 \n", 813 | "2 13.986955 \n", 814 | "3 14.707133 \n", 815 | "4 10.671575 \n", 816 | "... ... 
\n", 817 | "20163 8.852847 \n", 818 | "20164 11.726114 \n", 819 | "20165 11.335023 \n", 820 | "20166 11.036977 \n", 821 | "20167 8.701610 \n", 822 | "\n", 823 | "[20168 rows x 6 columns]" 824 | ] 825 | }, 826 | "metadata": { 827 | "tags": [] 828 | }, 829 | "execution_count": 14 830 | } 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "metadata": { 836 | "id": "wzJucPlCLw55", 837 | "colab_type": "code", 838 | "colab": {} 839 | }, 840 | "source": [ 841 | "# algo_predictions.to_csv(\"genre_era_predictions.csv\")" 842 | ], 843 | "execution_count": null, 844 | "outputs": [] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "metadata": { 849 | "scrolled": true, 850 | "id": "xnnjyw58Lw57", 851 | "colab_type": "code", 852 | "colab": {}, 853 | "outputId": "67264ae5-6d78-4260-cef3-992bcdb1eed7" 854 | }, 855 | "source": [ 856 | "rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating/3) ** 2).mean() ** .5\n", 857 | "rmse" 858 | ], 859 | "execution_count": null, 860 | "outputs": [ 861 | { 862 | "output_type": "execute_result", 863 | "data": { 864 | "text/plain": [ 865 | "0.9898749125266205" 866 | ] 867 | }, 868 | "metadata": { 869 | "tags": [] 870 | }, 871 | "execution_count": 16 872 | } 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "metadata": { 878 | "id": "wDfACdMsLw59", 879 | "colab_type": "code", 880 | "colab": {}, 881 | "outputId": "b46389e7-6071-4f44-be9b-3731cf8fa46a" 882 | }, 883 | "source": [ 884 | "mae = (((algo_predictions.og_rating - algo_predictions.pred_rating/3) ** 2) ** .5).mean()\n", 885 | "mae" 886 | ], 887 | "execution_count": null, 888 | "outputs": [ 889 | { 890 | "output_type": "execute_result", 891 | "data": { 892 | "text/plain": [ 893 | "0.7651172008808977" 894 | ] 895 | }, 896 | "metadata": { 897 | "tags": [] 898 | }, 899 | "execution_count": 17 900 | } 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "metadata": { 906 | "id": "KQj2WqJZLw6A", 907 | "colab_type": "code", 908 | "colab": {} 909 | }, 910 | "source": 
[ 911 | "" 912 | ], 913 | "execution_count": null, 914 | "outputs": [] 915 | } 916 | ] 917 | } -------------------------------------------------------------------------------- /Code/preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.6" 21 | }, 22 | "colab": { 23 | "name": "preprocessing.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [] 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "slsyFHKMOwm9", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "This notebook splits the data into training and testing sets for each user such that 80% of the ratings are in the training set and 20% are in the testing set." 
37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "fauYg6bNOu24", 43 | "colab_type": "code", 44 | "colab": {} 45 | }, 46 | "source": [ 47 | "import pandas as pd\n", 48 | "import numpy as np" 49 | ], 50 | "execution_count": null, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "WdMSaP4bOu26", 57 | "colab_type": "code", 58 | "colab": {}, 59 | "outputId": "060f423a-f966-4a2b-c38d-55cf40e5c075" 60 | }, 61 | "source": [ 62 | "movies = pd.read_csv('ml-latest-small/movies.csv')\n", 63 | "ratings = pd.read_csv('ml-latest-small/ratings.csv')\n", 64 | "tags = pd.read_csv('ml-latest-small/tags.csv')\n", 65 | "print('movies: ', movies.shape)\n", 66 | "print('ratings: ', ratings.shape)\n", 67 | "print('tags: ', tags.shape)" 68 | ], 69 | "execution_count": null, 70 | "outputs": [ 71 | { 72 | "output_type": "stream", 73 | "text": [ 74 | "movies: (9742, 3)\n", 75 | "ratings: (100836, 4)\n", 76 | "tags: (3683, 4)\n" 77 | ], 78 | "name": "stdout" 79 | } 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "scrolled": false, 86 | "id": "pTK-7Pn_Ou29", 87 | "colab_type": "code", 88 | "colab": {}, 89 | "outputId": "19cee6e2-8455-4a4f-f413-3a10fd4b2a9e" 90 | }, 91 | "source": [ 92 | "movies.head(5)" 93 | ], 94 | "execution_count": null, 95 | "outputs": [ 96 | { 97 | "output_type": "execute_result", 98 | "data": { 99 | "text/html": [ 100 | "
\n", 101 | "\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | " movieId title \\\n", 160 | "0 1 Toy Story (1995) \n", 161 | "1 2 Jumanji (1995) \n", 162 | "2 3 Grumpier Old Men (1995) \n", 163 | "3 4 Waiting to Exhale (1995) \n", 164 | "4 5 Father of the Bride Part II (1995) \n", 165 | "\n", 166 | " genres \n", 167 | "0 Adventure|Animation|Children|Comedy|Fantasy \n", 168 | "1 Adventure|Children|Fantasy \n", 169 | "2 Comedy|Romance \n", 170 | "3 Comedy|Drama|Romance \n", 171 | "4 Comedy " 172 | ] 173 | }, 174 | "metadata": { 175 | "tags": [] 176 | }, 177 | "execution_count": 4 178 | } 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "metadata": { 184 | "id": "9szjN9tSOu3A", 185 | "colab_type": "code", 186 | "colab": {}, 187 | "outputId": "9cded57b-9cf5-4c5f-eaeb-619018a65bcf" 188 | }, 189 | "source": [ 190 | "ratings.head(5)" 191 | ], 192 | "execution_count": null, 193 | "outputs": [ 194 | { 195 | "output_type": "execute_result", 196 | "data": { 197 | "text/html": [ 198 | "
\n", 199 | "\n", 212 | "\n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | "
userIdmovieIdratingtimestamp
0114.0964982703
1134.0964981247
2164.0964982224
31475.0964983815
41505.0964982931
\n", 260 | "
" 261 | ], 262 | "text/plain": [ 263 | " userId movieId rating timestamp\n", 264 | "0 1 1 4.0 964982703\n", 265 | "1 1 3 4.0 964981247\n", 266 | "2 1 6 4.0 964982224\n", 267 | "3 1 47 5.0 964983815\n", 268 | "4 1 50 5.0 964982931" 269 | ] 270 | }, 271 | "metadata": { 272 | "tags": [] 273 | }, 274 | "execution_count": 5 275 | } 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "metadata": { 281 | "id": "o6mexZ2NOu3C", 282 | "colab_type": "code", 283 | "colab": {}, 284 | "outputId": "7f6eebb9-213f-4642-cabf-64780a92d5aa" 285 | }, 286 | "source": [ 287 | "tags.head(5)" 288 | ], 289 | "execution_count": null, 290 | "outputs": [ 291 | { 292 | "output_type": "execute_result", 293 | "data": { 294 | "text/html": [ 295 | "
\n", 296 | "\n", 309 | "\n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | "
userIdmovieIdtagtimestamp
0260756funny1445714994
1260756Highly quotable1445714996
2260756will ferrell1445714992
3289774Boxing story1445715207
4289774MMA1445715200
\n", 357 | "
" 358 | ], 359 | "text/plain": [ 360 | " userId movieId tag timestamp\n", 361 | "0 2 60756 funny 1445714994\n", 362 | "1 2 60756 Highly quotable 1445714996\n", 363 | "2 2 60756 will ferrell 1445714992\n", 364 | "3 2 89774 Boxing story 1445715207\n", 365 | "4 2 89774 MMA 1445715200" 366 | ] 367 | }, 368 | "metadata": { 369 | "tags": [] 370 | }, 371 | "execution_count": 6 372 | } 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "metadata": { 378 | "id": "LIjAsKI8Ou3F", 379 | "colab_type": "code", 380 | "colab": {}, 381 | "outputId": "54655a32-c28b-4134-c0e4-3c19fa3b3c61" 382 | }, 383 | "source": [ 384 | "df = pd.merge(ratings, movies, on='movieId' , how='left')\n", 385 | "df = df.drop('title', axis=1)\n", 386 | "df.head(5)" 387 | ], 388 | "execution_count": null, 389 | "outputs": [ 390 | { 391 | "output_type": "execute_result", 392 | "data": { 393 | "text/html": [ 394 | "
\n", 395 | "\n", 408 | "\n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | "
userIdmovieIdratingtimestampgenres
0114.0964982703Adventure|Animation|Children|Comedy|Fantasy
1134.0964981247Comedy|Romance
2164.0964982224Action|Crime|Thriller
31475.0964983815Mystery|Thriller
41505.0964982931Crime|Mystery|Thriller
\n", 462 | "
" 463 | ], 464 | "text/plain": [ 465 | " userId movieId rating timestamp \\\n", 466 | "0 1 1 4.0 964982703 \n", 467 | "1 1 3 4.0 964981247 \n", 468 | "2 1 6 4.0 964982224 \n", 469 | "3 1 47 5.0 964983815 \n", 470 | "4 1 50 5.0 964982931 \n", 471 | "\n", 472 | " genres \n", 473 | "0 Adventure|Animation|Children|Comedy|Fantasy \n", 474 | "1 Comedy|Romance \n", 475 | "2 Action|Crime|Thriller \n", 476 | "3 Mystery|Thriller \n", 477 | "4 Crime|Mystery|Thriller " 478 | ] 479 | }, 480 | "metadata": { 481 | "tags": [] 482 | }, 483 | "execution_count": 7 484 | } 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "metadata": { 490 | "id": "ymbJzpjJOu3H", 491 | "colab_type": "code", 492 | "colab": {} 493 | }, 494 | "source": [ 495 | "df['genres'] = df['genres'].str.split('|')" 496 | ], 497 | "execution_count": null, 498 | "outputs": [] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "metadata": { 503 | "id": "MiUn_Qp0Ou3J", 504 | "colab_type": "code", 505 | "colab": {}, 506 | "outputId": "879c002c-96f8-4f77-c957-1653c0b06b2f" 507 | }, 508 | "source": [ 509 | "df.head(5)" 510 | ], 511 | "execution_count": null, 512 | "outputs": [ 513 | { 514 | "output_type": "execute_result", 515 | "data": { 516 | "text/html": [ 517 | "
\n", 518 | "\n", 531 | "\n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | "
userIdmovieIdratingtimestampgenres
0114.0964982703[Adventure, Animation, Children, Comedy, Fantasy]
1134.0964981247[Comedy, Romance]
2164.0964982224[Action, Crime, Thriller]
31475.0964983815[Mystery, Thriller]
41505.0964982931[Crime, Mystery, Thriller]
\n", 585 | "
" 586 | ], 587 | "text/plain": [ 588 | " userId movieId rating timestamp \\\n", 589 | "0 1 1 4.0 964982703 \n", 590 | "1 1 3 4.0 964981247 \n", 591 | "2 1 6 4.0 964982224 \n", 592 | "3 1 47 5.0 964983815 \n", 593 | "4 1 50 5.0 964982931 \n", 594 | "\n", 595 | " genres \n", 596 | "0 [Adventure, Animation, Children, Comedy, Fantasy] \n", 597 | "1 [Comedy, Romance] \n", 598 | "2 [Action, Crime, Thriller] \n", 599 | "3 [Mystery, Thriller] \n", 600 | "4 [Crime, Mystery, Thriller] " 601 | ] 602 | }, 603 | "metadata": { 604 | "tags": [] 605 | }, 606 | "execution_count": 9 607 | } 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "metadata": { 613 | "id": "B3xDyEX_Ou3L", 614 | "colab_type": "code", 615 | "colab": {} 616 | }, 617 | "source": [ 618 | "tags['tag'] = tags['tag'].str.split('|')\n", 619 | "tags.drop('timestamp', axis=1, inplace=True)" 620 | ], 621 | "execution_count": null, 622 | "outputs": [] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "metadata": { 627 | "id": "4OEft6CkOu3O", 628 | "colab_type": "code", 629 | "colab": {}, 630 | "outputId": "7f500eee-1b28-4735-a48f-900f6cd3be1b" 631 | }, 632 | "source": [ 633 | "tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()\n", 634 | "tags.head(5)" 635 | ], 636 | "execution_count": null, 637 | "outputs": [ 638 | { 639 | "output_type": "execute_result", 640 | "data": { 641 | "text/html": [ 642 | "
\n", 643 | "\n", 656 | "\n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | "
userIdmovieIdtag
0260756['funny'],['Highly quotable'],['will ferrell']
1289774['Boxing story'],['MMA'],['Tom Hardy']
22106782['drugs'],['Leonardo DiCaprio'],['Martin Scors...
3748516['way too long']
418431['Al Pacino'],['gangster'],['mafia']
\n", 698 | "
" 699 | ], 700 | "text/plain": [ 701 | " userId movieId tag\n", 702 | "0 2 60756 ['funny'],['Highly quotable'],['will ferrell']\n", 703 | "1 2 89774 ['Boxing story'],['MMA'],['Tom Hardy']\n", 704 | "2 2 106782 ['drugs'],['Leonardo DiCaprio'],['Martin Scors...\n", 705 | "3 7 48516 ['way too long']\n", 706 | "4 18 431 ['Al Pacino'],['gangster'],['mafia']" 707 | ] 708 | }, 709 | "metadata": { 710 | "tags": [] 711 | }, 712 | "execution_count": 11 713 | } 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "metadata": { 719 | "id": "Q-ukKMhbOu3Q", 720 | "colab_type": "code", 721 | "colab": {} 722 | }, 723 | "source": [ 724 | "df = pd.merge(df, tags, on=['userId','movieId'], how='left')" 725 | ], 726 | "execution_count": null, 727 | "outputs": [] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "metadata": { 732 | "id": "05Y3LfkMOu3S", 733 | "colab_type": "code", 734 | "colab": {}, 735 | "outputId": "6d8110de-34dd-463d-c123-dd69218f3954" 736 | }, 737 | "source": [ 738 | "df.shape" 739 | ], 740 | "execution_count": null, 741 | "outputs": [ 742 | { 743 | "output_type": "execute_result", 744 | "data": { 745 | "text/plain": [ 746 | "(100836, 6)" 747 | ] 748 | }, 749 | "metadata": { 750 | "tags": [] 751 | }, 752 | "execution_count": 13 753 | } 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "metadata": { 759 | "id": "2RZvAHrHOu3U", 760 | "colab_type": "code", 761 | "colab": {} 762 | }, 763 | "source": [ 764 | "df['tag'] = df['tag'].apply(lambda d: d if isinstance(d, list) else [])\n", 765 | "df['genres'] = df['genres'].apply(lambda d: d if isinstance(d, list) else [])" 766 | ], 767 | "execution_count": null, 768 | "outputs": [] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "metadata": { 773 | "id": "M0M_IJc5Ou3W", 774 | "colab_type": "code", 775 | "colab": {}, 776 | "outputId": "a95ea1a2-3435-4268-a6ab-a1db53e5a464" 777 | }, 778 | "source": [ 779 | "df.head()" 780 | ], 781 | "execution_count": null, 782 | "outputs": [ 783 | { 784 | "output_type": 
"execute_result", 785 | "data": { 786 | "text/html": [ 787 | "
\n", 788 | "\n", 801 | "\n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | "
userIdmovieIdratingtimestampgenrestag
0114.0964982703[Adventure, Animation, Children, Comedy, Fantasy][]
1134.0964981247[Comedy, Romance][]
2164.0964982224[Action, Crime, Thriller][]
31475.0964983815[Mystery, Thriller][]
41505.0964982931[Crime, Mystery, Thriller][]
\n", 861 | "
" 862 | ], 863 | "text/plain": [ 864 | " userId movieId rating timestamp \\\n", 865 | "0 1 1 4.0 964982703 \n", 866 | "1 1 3 4.0 964981247 \n", 867 | "2 1 6 4.0 964982224 \n", 868 | "3 1 47 5.0 964983815 \n", 869 | "4 1 50 5.0 964982931 \n", 870 | "\n", 871 | " genres tag \n", 872 | "0 [Adventure, Animation, Children, Comedy, Fantasy] [] \n", 873 | "1 [Comedy, Romance] [] \n", 874 | "2 [Action, Crime, Thriller] [] \n", 875 | "3 [Mystery, Thriller] [] \n", 876 | "4 [Crime, Mystery, Thriller] [] " 877 | ] 878 | }, 879 | "metadata": { 880 | "tags": [] 881 | }, 882 | "execution_count": 15 883 | } 884 | ] 885 | }, 886 | { 887 | "cell_type": "markdown", 888 | "metadata": { 889 | "id": "PPkusLncOu3Y", 890 | "colab_type": "text" 891 | }, 892 | "source": [ 893 | "#### Split into train and test data" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "metadata": { 899 | "id": "2-tF8n50Ou3Y", 900 | "colab_type": "code", 901 | "colab": {} 902 | }, 903 | "source": [ 904 | "from sklearn.model_selection import train_test_split\n", 905 | "train_data, test_data = train_test_split(df, test_size=0.2, stratify=df.userId)" 906 | ], 907 | "execution_count": null, 908 | "outputs": [] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "metadata": { 913 | "id": "vFCvxS3yOu3a", 914 | "colab_type": "code", 915 | "colab": {}, 916 | "outputId": "b6cfc121-411d-49e8-c807-726cfebdcac7" 917 | }, 918 | "source": [ 919 | "train_data = train_data.sort_values(['userId', 'movieId'])\n", 920 | "train_data.head()" 921 | ], 922 | "execution_count": null, 923 | "outputs": [ 924 | { 925 | "output_type": "execute_result", 926 | "data": { 927 | "text/html": [ 928 | "
\n", 929 | "\n", 942 | "\n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | "
userIdmovieIdratingtimestampgenrestag
0114.0964982703[Adventure, Animation, Children, Comedy, Fantasy][]
1134.0964981247[Comedy, Romance][]
2164.0964982224[Action, Crime, Thriller][]
31475.0964983815[Mystery, Thriller][]
51703.0964982400[Action, Comedy, Horror, Thriller][]
\n", 1002 | "
" 1003 | ], 1004 | "text/plain": [ 1005 | " userId movieId rating timestamp \\\n", 1006 | "0 1 1 4.0 964982703 \n", 1007 | "1 1 3 4.0 964981247 \n", 1008 | "2 1 6 4.0 964982224 \n", 1009 | "3 1 47 5.0 964983815 \n", 1010 | "5 1 70 3.0 964982400 \n", 1011 | "\n", 1012 | " genres tag \n", 1013 | "0 [Adventure, Animation, Children, Comedy, Fantasy] [] \n", 1014 | "1 [Comedy, Romance] [] \n", 1015 | "2 [Action, Crime, Thriller] [] \n", 1016 | "3 [Mystery, Thriller] [] \n", 1017 | "5 [Action, Comedy, Horror, Thriller] [] " 1018 | ] 1019 | }, 1020 | "metadata": { 1021 | "tags": [] 1022 | }, 1023 | "execution_count": 17 1024 | } 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "metadata": { 1030 | "scrolled": true, 1031 | "id": "ojrEaFoMOu3c", 1032 | "colab_type": "code", 1033 | "colab": {}, 1034 | "outputId": "b617ba3b-9a8b-44f3-b41f-22109097a9f0" 1035 | }, 1036 | "source": [ 1037 | "test_data = test_data.sort_values(['userId','movieId'])\n", 1038 | "test_data.head()" 1039 | ], 1040 | "execution_count": null, 1041 | "outputs": [ 1042 | { 1043 | "output_type": "execute_result", 1044 | "data": { 1045 | "text/html": [ 1046 | "
\n", 1047 | "\n", 1060 | "\n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | "
userIdmovieIdratingtimestampgenrestag
41505.0964982931[Crime, Mystery, Thriller][]
611015.0964980868[Adventure, Comedy, Crime, Romance][]
1112165.0964981208[Comedy][]
1612963.0964982967[Comedy, Crime, Drama, Thriller][]
1713163.0964982310[Action, Adventure, Sci-Fi][]
\n", 1120 | "
" 1121 | ], 1122 | "text/plain": [ 1123 | " userId movieId rating timestamp genres \\\n", 1124 | "4 1 50 5.0 964982931 [Crime, Mystery, Thriller] \n", 1125 | "6 1 101 5.0 964980868 [Adventure, Comedy, Crime, Romance] \n", 1126 | "11 1 216 5.0 964981208 [Comedy] \n", 1127 | "16 1 296 3.0 964982967 [Comedy, Crime, Drama, Thriller] \n", 1128 | "17 1 316 3.0 964982310 [Action, Adventure, Sci-Fi] \n", 1129 | "\n", 1130 | " tag \n", 1131 | "4 [] \n", 1132 | "6 [] \n", 1133 | "11 [] \n", 1134 | "16 [] \n", 1135 | "17 [] " 1136 | ] 1137 | }, 1138 | "metadata": { 1139 | "tags": [] 1140 | }, 1141 | "execution_count": 18 1142 | } 1143 | ] 1144 | }, 1145 | { 1146 | "cell_type": "markdown", 1147 | "metadata": { 1148 | "id": "zlFDgR0COu3f", 1149 | "colab_type": "text" 1150 | }, 1151 | "source": [ 1152 | "#### Save the dataframes as csv files" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "metadata": { 1158 | "id": "U8ssEJZ0Ou3f", 1159 | "colab_type": "code", 1160 | "colab": {} 1161 | }, 1162 | "source": [ 1163 | "# train_data.to_csv('training_data.csv', index = False)\n", 1164 | "# test_data.to_csv('testing_data.csv', index = False)" 1165 | ], 1166 | "execution_count": null, 1167 | "outputs": [] 1168 | }, 1169 | { 1170 | "cell_type": "markdown", 1171 | "metadata": { 1172 | "id": "dCl7m8u3Ou3h", 1173 | "colab_type": "text" 1174 | }, 1175 | "source": [ 1176 | "## Pre-process the movie data" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "metadata": { 1182 | "scrolled": true, 1183 | "id": "SNQrqxwwOu3h", 1184 | "colab_type": "code", 1185 | "colab": {}, 1186 | "outputId": "06ea051d-cab0-42be-d43e-36cf3da4d733" 1187 | }, 1188 | "source": [ 1189 | "movies['genres'] = movies['genres'].str.split('|')\n", 1190 | "movies['genres'] = movies['genres'].apply(lambda d: d if isinstance(d, list) else [])\n", 1191 | "movies.head()\n", 1192 | "# movies.to_csv('movies.csv', index = False)" 1193 | ], 1194 | "execution_count": null, 1195 | "outputs": [ 1196 | { 1197 
| "output_type": "execute_result", 1198 | "data": { 1199 | "text/html": [ 1200 | "
\n", 1201 | "\n", 1214 | "\n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | "
movieIdtitlegenres
01Toy Story (1995)[Adventure, Animation, Children, Comedy, Fantasy]
12Jumanji (1995)[Adventure, Children, Fantasy]
23Grumpier Old Men (1995)[Comedy, Romance]
34Waiting to Exhale (1995)[Comedy, Drama, Romance]
45Father of the Bride Part II (1995)[Comedy]
\n", 1256 | "
" 1257 | ], 1258 | "text/plain": [ 1259 | " movieId title \\\n", 1260 | "0 1 Toy Story (1995) \n", 1261 | "1 2 Jumanji (1995) \n", 1262 | "2 3 Grumpier Old Men (1995) \n", 1263 | "3 4 Waiting to Exhale (1995) \n", 1264 | "4 5 Father of the Bride Part II (1995) \n", 1265 | "\n", 1266 | " genres \n", 1267 | "0 [Adventure, Animation, Children, Comedy, Fantasy] \n", 1268 | "1 [Adventure, Children, Fantasy] \n", 1269 | "2 [Comedy, Romance] \n", 1270 | "3 [Comedy, Drama, Romance] \n", 1271 | "4 [Comedy] " 1272 | ] 1273 | }, 1274 | "metadata": { 1275 | "tags": [] 1276 | }, 1277 | "execution_count": 20 1278 | } 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "metadata": { 1284 | "id": "tUOMY7C9Ou3j", 1285 | "colab_type": "code", 1286 | "colab": {} 1287 | }, 1288 | "source": [ 1289 | "" 1290 | ], 1291 | "execution_count": null, 1292 | "outputs": [] 1293 | } 1294 | ] 1295 | } --------------------------------------------------------------------------------