├── .DS_Store
├── Report.pdf
├── Presentation.pptx
├── Results
├── .DS_Store
├── images
│ ├── ndcg.png
│ ├── rating.png
│ ├── Algo_analysis.png
│ ├── Hybrid_Model.png
│ ├── knn_neighbors.png
│ ├── KNN_similarity.png
│ ├── prec_recall_fm.png
│ ├── genre_distribution.png
│ ├── vector_generation.png
│ ├── genre_based_popularity.png
│ └── mae_rmse_including_pearson.png
├── Final_model_results.xlsx
├── README.md
└── algo_results.csv
├── README.md
└── Code
├── README.md
├── test_ndcg.py
├── evaluating_recs.py
├── generating_predictions.py
├── combined_model.ipynb
├── surprise_model_predictions.ipynb
├── cold_start_analysis.ipynb
├── movie_era_based_recs.ipynb
└── preprocessing.ipynb
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/.DS_Store
--------------------------------------------------------------------------------
/Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Report.pdf
--------------------------------------------------------------------------------
/Presentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Presentation.pptx
--------------------------------------------------------------------------------
/Results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/.DS_Store
--------------------------------------------------------------------------------
/Results/images/ndcg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/ndcg.png
--------------------------------------------------------------------------------
/Results/images/rating.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/rating.png
--------------------------------------------------------------------------------
/Results/Final_model_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/Final_model_results.xlsx
--------------------------------------------------------------------------------
/Results/images/Algo_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/Algo_analysis.png
--------------------------------------------------------------------------------
/Results/images/Hybrid_Model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/Hybrid_Model.png
--------------------------------------------------------------------------------
/Results/images/knn_neighbors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/knn_neighbors.png
--------------------------------------------------------------------------------
/Results/images/KNN_similarity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/KNN_similarity.png
--------------------------------------------------------------------------------
/Results/images/prec_recall_fm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/prec_recall_fm.png
--------------------------------------------------------------------------------
/Results/images/genre_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/genre_distribution.png
--------------------------------------------------------------------------------
/Results/images/vector_generation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/vector_generation.png
--------------------------------------------------------------------------------
/Results/images/genre_based_popularity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/genre_based_popularity.png
--------------------------------------------------------------------------------
/Results/images/mae_rmse_including_pearson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/mae_rmse_including_pearson.png
--------------------------------------------------------------------------------
/Results/README.md:
--------------------------------------------------------------------------------
1 | #### Analysis Plots:
2 |
3 | 1. Comparison of methods:
4 | 
5 |
6 | 2. Surprise models:
7 | 
8 |
9 | 3. Vector generation in content based approach:
10 | 
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Movie-Recommendation-System
2 |
3 | **Dataset used:**
4 | 1. [MovieLens](https://grouplens.org/datasets/movielens/)
5 | 2. [The Movie Database: tmdb](https://www.kaggle.com/tmdb/tmdb-movie-metadata)
6 |
7 | **Aim:** Build a movie recommendation system that integrates user-level personalization with overall movie features such as genre and popularity.
8 |
9 | **Models:**
10 | * Popularity model
11 | * Content based model: genre, year of release, ratings of movies
12 | * Collaborative filtering: User vs item, KNN similarity measures
13 | * Latent Factor based SVD
14 | * Combined linear model using surprise library (CF + SVD)
15 | * Hybrid model (content based + popularity based + item-item CF + svd)
16 |
17 | **Results:**
18 |
19 | 
20 |
21 | All the models are implemented in Python using pandas, sklearn and [surprise](http://surpriselib.com/) library. The hyperparameter tuning, testing accuracy (RMSE and MAE) and evaluation of recommendations (precision, recall, f-measure and ndcg) for each model are thoroughly performed. The detailed analysis of the models is presented in the report.
22 |
--------------------------------------------------------------------------------
/Results/algo_results.csv:
--------------------------------------------------------------------------------
1 | ,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG
2 | 0,KNNBaseline (pearson_baseline,0.8527048407985288,0.6481831050363938,9.183264017105103,6.47820782661438,0.8311748633879806,0.4131693896380987,0.5519630281626648,0.9631023306515691
3 | 1,CoClustering,0.9522076927566949,0.7332637064416873,1.8252005577087402,0.1734142303466797,0.7826229508196745,0.3809810109755996,0.5124844754841872,0.9557478562520727
4 | 2,BaselineOnly,0.873457272373568,0.6717695910159173,0.1396622657775879,0.09251093864440918,0.8074316939890736,0.4003500565116332,0.5352876447815149,0.9590854168667904
5 | 3,KNNWithZScore,0.8991840819159082,0.6786227249764061,0.1465740203857422,1.7950918674468994,0.7948087431694016,0.3915249143270615,0.524620410302617,0.9541951019768238
6 | 4,KNNWithMeans,0.9000344577297286,0.6836932871288725,0.10967230796813965,1.5405960083007812,0.8009562841530078,0.3871394034076282,0.521981084939778,0.9527093414526527
7 | 5,KNNBaseline,0.8762934782625659,0.6659918272130076,0.21010994911193848,1.9365522861480713,0.7964207650273253,0.4158810520288814,0.546425487378205,0.9562275612497768
8 | 6,NMF,0.9291418868270431,0.7094694164253142,5.15004301071167,0.20406055450439453,0.7792896174863412,0.38074469729699817,0.5115545044405817,0.9548841017932751
9 | 7,SlopeOne,0.9056446586210445,0.6876919079393096,4.759229898452759,5.59233283996582,0.8075409836065599,0.3965250076150167,0.5318814699668782,0.9555464596666479
10 | 8,SVDpp,0.8691186490330676,0.6640513869365521,480.6970820426941,7.9973015785217285,0.8178415300546472,0.3978840866429081,0.5353282446879383,0.9603166226261703
11 | 9,SVD,0.8794364853143987,0.6739493321877302,4.6099772453308105,0.12506961822509766,0.8033060109289645,0.38554157622502233,0.52102198631871,0.956595790499096
12 | 10,KNNBasic,0.9507724809063621,0.7266525395708078,0.09878921508789062,1.3892457485198975,0.7838797814207674,0.4215349947874059,0.5482474018023665,0.9586757463063268
13 |
--------------------------------------------------------------------------------
/Code/README.md:
--------------------------------------------------------------------------------
1 | ### Description:
2 |
3 | #### 1. cold_start_analysis:
4 | Analyses the performance of different approaches for a new user or a user with few interactions with the system, i.e. the cold start problem. Computes the RMSE and MAE for users who have rated fewer than 18 movies and for users who have rated more than 1000 movies.
5 | With few interactions, content based and item-item collaborative filtering approaches work better. As the number of interactions per user increases, SVD and collaborative approaches work better.
6 |
7 | #### 2. combined_model:
8 | Combination of different surprise model results by applying weighted linear combination to generate final rating.
9 |
10 | #### 3. content_based_recommendation:
11 | Generating user and movie vectors based on genre and predicting the ratings for movies in test data.
12 |
13 | #### 4. evaluating_recs:
14 | Code for Precision, Recall, F-1 score and NDCG.
15 |
16 | #### 5. generating_predictions:
17 | Generating rating predictions for test data using surprise library.
18 |
19 | #### 6. hybrid_model:
20 | Code for the hybrid model based on combining recommendations from different models such as content based, CF, SVD to improve accuracy and quality of recommendations.
21 |
22 | #### 7. knn_analysis:
23 | Analysis of KNN algorithms by changing different parameters like:
24 | * number of neighbors
25 | * similarity metrics
26 | * user v/s item based CF
27 |
28 | #### 8. model_hyperparameter_tuning:
29 | Fine-tuned surprise models by experimenting with different hyperparameters for training and model. Compared models based on RMSE and MAE.
30 |
31 | #### 9. movie_era_based_recs:
32 | Content based approach to include the time period in which the movie was launched in the user vector. This method personalizes the user's recommendations to include this feature.
33 |
34 | #### 10. movie_similarity_based_recs:
35 | Content based approach to include the user's genre preference and recommend movies similar to user's highly rated movies.
36 |
37 | #### 11. movie_year_analysis:
38 | Experiments with the year of the movie release. Analyses the distribution of the data and determines appropriate era intervals to classify movies. Uses the content based approach to form a user vector based on the era preference.
39 |
40 | #### 12. popularity_model:
41 | Model which uses the popularity attribute as well as the average rating and voter count in the TMDB data to generate popular movies genre wise. The genres are determined using the IMDB data.
42 |
43 | #### 13. preprocessing:
44 | Code for splitting the data into training and testing sets for each user such that 80% of the ratings are in training and 20% are for testing.
45 |
46 | #### 14. surprise_model_predictions:
47 | Code for generating ratings for test data using surprise models such as KNN (CF), SVD, Baseline approach, Slopeone etc.
48 |
49 | #### 15. surprise_model_recs:
50 | Comparison between the surprise models based on test data ratings (RMSE and MAE) and quality of recommendations (precision, recall, ndcg, f-measure).
51 |
52 | #### 16. test_ndcg:
53 | Code to test implementation of [NDCG metric](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) for evaluating recommendations.
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/Code/test_ndcg.py:
--------------------------------------------------------------------------------
1 | import math
2 | from collections import defaultdict
3 |
4 | from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
5 | from surprise import Dataset
6 | from surprise.model_selection import cross_validate
7 | from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
8 | from surprise import accuracy
9 | from surprise.model_selection import train_test_split
10 |
11 | import pandas as pd
12 | import numpy as np
13 |
14 |
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Convert train/test rating DataFrames into Surprise trainset/testset objects.

    Both frames are narrowed to their ``userId``, ``movieId`` and ``rating``
    columns and loaded through a ``Reader`` declared with a (0, 5) rating scale.

    Args:
        training_dataframe: DataFrame holding the training ratings.
        testing_dataframe: DataFrame holding the held-out ratings.

    Returns:
        tuple: ``(trainset, testset)`` as built by ``construct_trainset`` and
        ``construct_testset`` from the raw ratings of each loaded dataset.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    scale_reader = Reader(rating_scale=(0, 5))

    train_data = Dataset.load_from_df(training_dataframe[rating_columns], scale_reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], scale_reader)

    built_trainset = train_data.construct_trainset(train_data.raw_ratings)
    built_testset = test_data.construct_testset(test_data.raw_ratings)
    return built_trainset, built_testset
22 |
23 |
# Variant of get_top_n that also hands back the true ratings ------------
# Besides the n best estimates per user, every (item, true rating) pair of
# that user is collected so callers can look up real relevance values.
def get_top_n(predictions, n):
    """Group predictions per user and keep the n highest estimates.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n (int): number of top estimates to keep for every user.

    Returns:
        tuple: ``(top_n, org_ratings)`` where ``top_n[uid]`` is a list of at
        most ``n`` ``(iid, est)`` pairs sorted by estimate, highest first, and
        ``org_ratings[uid]`` is the untruncated list of ``(iid, true_r)`` pairs
        in input order.
    """
    estimates_by_user = defaultdict(list)
    org_ratings = defaultdict(list)

    # Bucket every prediction under its user.
    for uid, iid, true_r, est, _details in predictions:
        estimates_by_user[uid].append((iid, est))
        org_ratings[uid].append((iid, true_r))

    # Rank each user's estimates (stable, descending) and truncate to n.
    for uid in list(estimates_by_user):
        ranked = sorted(estimates_by_user[uid], key=lambda pair: pair[1], reverse=True)
        estimates_by_user[uid] = ranked[:n]

    return estimates_by_user, org_ratings
# -------------------------------------------------------------------
42 |
43 |
def dcg_at_k(scores):
    """Discounted cumulative gain of a ranked list of relevance scores.

    The first score is taken undiscounted; the score at 1-based rank
    ``i >= 2`` is divided by ``log2(i)``.

    Args:
        scores: sequence of relevance values ordered by rank, best rank first.

    Returns:
        The DCG of ``scores``; ``0.0`` when ``scores`` is empty (the original
        indexed ``scores[0]`` unconditionally and raised ``IndexError``).
    """
    if len(scores) == 0:
        return 0.0
    discounted_tail = sum(sc / math.log(rank, 2)
                          for rank, sc in enumerate(scores[1:], start=2))
    return scores[0] + discounted_tail
46 |
47 |
# Single-argument NDCG: the ideal ranking is derived from the same scores ----
def ndcg_at_k(scores):
    """Normalised DCG of ``scores`` against their own best possible ordering.

    Args:
        scores: relevance values in the order the items were ranked.

    Returns:
        ``dcg_at_k(scores)`` divided by the DCG of ``scores`` sorted in
        descending order, or ``0.0`` when that ideal DCG is not positive.
    """
    ideal_dcg = dcg_at_k(sorted(scores, reverse=True))
    if ideal_dcg > 0.0:
        return dcg_at_k(scores) / ideal_dcg
    return 0.0
# ---------------------------------------------------------------------
53 |
54 |
# --- Script: fit SVD++ on the training split and report the mean NDCG ---
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

print("Starting algo")
algo = SVDpp()
algo.fit(trainset)
test_predictions = algo.test(testset)
# accuracy.rmse / accuracy.mae are called for the values they return.
test_rmse = accuracy.rmse(test_predictions)
test_mae = accuracy.mae(test_predictions)
print("Ended algo")

# Top-5 estimates per user plus every (item, true rating) pair of that user.
top_n, org_ratings = get_top_n(test_predictions, 5)

ndcg_scores = dict()

for uid, user_ratings in top_n.items():
    # Index the user's true ratings by item once instead of scanning the list
    # for every recommended item. setdefault keeps the first rating seen for
    # an item, matching the first-match behaviour of the former linear search.
    true_rating_by_item = {}
    for i, r in org_ratings[uid]:
        true_rating_by_item.setdefault(i, r)

    # True relevance of each recommended item, in predicted rank order;
    # items without a recorded true rating count as 0.
    scores = [true_rating_by_item.get(iid, 0) for iid, est_r in user_ratings]
    ndcg_scores[uid] = ndcg_at_k(scores)

# Mean NDCG over users; 0.0 when there were no predictions at all
# (avoids a ZeroDivisionError on an empty test set).
ndcg_score = sum(ndcg_scores.values())/len(ndcg_scores) if ndcg_scores else 0.0
print(ndcg_score)
91 |
--------------------------------------------------------------------------------
/Code/evaluating_recs.py:
--------------------------------------------------------------------------------
1 | import math
2 | from collections import defaultdict
3 | import csv
4 | from sklearn.metrics import ndcg_score
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
def get_top_n(predictions, algo_weights, n=10):
    '''Return the top-N recommendations for each user from blended predictions.

    Each row's final estimate is the weighted sum of the per-algorithm
    estimates found in that row.

    Args:
        predictions(pandas.DataFrame): One row per (user, item) pair. The first
            three columns are read by position and are expected to be the user
            id, the item id and the true rating. The columns ``svd_rating``,
            ``knn_rating``, ``svdpp_rating``, ``slopeone_rating`` and
            ``baseline_rating`` are read by name.
        algo_weights(dict): Weights under the keys ``'svd'``, ``'knn'``,
            ``'svdpp'``, ``'slope'`` and ``'baseline'``.
        n(int): The number of recommendations to output for each user. Default
            is 10.

    Returns:
        A tuple ``(top_n, top_n_ndcg)`` of dicts keyed by user id:
        ``top_n[uid]`` is a list of ``(item id, final estimate)`` tuples and
        ``top_n_ndcg[uid]`` a list of ``(item id, true rating, final estimate)``
        tuples. Both are sorted by final estimate, highest first, and hold at
        most ``n`` entries.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    top_n_ndcg = defaultdict(list)
    for i in range(len(predictions)):
        row = predictions.iloc[i, :]
        final_est = algo_weights['svd']*float(row['svd_rating']) + algo_weights['knn']*float(row['knn_rating']) + \
            algo_weights['svdpp']*float(row['svdpp_rating']) + algo_weights['slope']*float(row['slopeone_rating']) + \
            algo_weights['baseline']*float(row['baseline_rating'])
        # Explicit positional access: ``row[0]`` on a string-labelled Series
        # relies on a deprecated integer-key fallback in pandas.
        uid = row.iloc[0]
        iid = row.iloc[1]
        true_r = row.iloc[2]
        top_n[uid].append((iid, final_est))
        top_n_ndcg[uid].append((iid, true_r, final_est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    for uid, user_ratings in top_n_ndcg.items():
        user_ratings.sort(key=lambda x: x[2], reverse=True)
        top_n_ndcg[uid] = user_ratings[:n]

    return top_n, top_n_ndcg
44 |
45 |
def precision_recall_at_k(predictions, algo_weights, k, threshold):
    '''Return precision and recall at k metrics for each user.

    Args:
        predictions(pandas.DataFrame): One row per (user, item) pair. The first
            column is read by position as the user id and the third as the true
            rating. The columns ``svd_rating``, ``knn_rating``,
            ``svdpp_rating``, ``slopeone_rating`` and ``baseline_rating`` are
            read by name.
        algo_weights(dict): Weights under the keys ``'svd'``, ``'knn'``,
            ``'svdpp'``, ``'slope'`` and ``'baseline'`` used to blend the
            per-algorithm estimates into one final estimate.
        k(int): Number of top-ranked items considered per user.
        threshold(float): Ratings/estimates at or above this value count as
            relevant/recommended.

    Returns:
        A tuple ``(precisions, recalls)`` of dicts keyed by user id. Precision
        is set to 1 when none of the user's top-k estimates reaches the
        threshold, and recall is set to 1 when the user has no relevant item.
    '''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for i in range(len(predictions)):
        row = predictions.iloc[i, :]
        final_est = algo_weights['svd']*float(row['svd_rating']) + algo_weights['knn']*float(row['knn_rating']) + \
            algo_weights['svdpp']*float(row['svdpp_rating']) + algo_weights['slope']*float(row['slopeone_rating']) + \
            algo_weights['baseline']*float(row['baseline_rating'])
        # Explicit positional access: ``row[0]`` / ``row[2]`` on a
        # string-labelled Series relies on a deprecated pandas fallback.
        user_est_true[row.iloc[0]].append((final_est, row.iloc[2]))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k/n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k/n_rel if n_rel != 0 else 1

    return precisions, recalls
81 |
82 |
def dcg_at_k(scores):
    '''Discounted cumulative gain of a ranked list of relevance scores.

    The first score is taken undiscounted; the score at 1-based rank
    ``i >= 2`` is divided by ``log2(i)``.

    Args:
        scores: sequence of relevance values ordered by rank, best rank first.

    Returns:
        The DCG of ``scores``; ``0.0`` when ``scores`` is empty (the original
        indexed ``scores[0]`` unconditionally and raised ``IndexError``).
    '''
    if len(scores) == 0:
        return 0.0
    discounted_tail = sum(sc / math.log(rank, 2)
                          for rank, sc in enumerate(scores[1:], start=2))
    return scores[0] + discounted_tail
85 |
86 |
def ndcg_at_k(predicted_scores, actual_scores):
    '''Ratio of the DCG of one score list to the ideal DCG of another.

    Args:
        predicted_scores: scores whose DCG is taken in the order given.
        actual_scores: scores that are sorted in descending order to obtain
            the ideal (normalising) DCG.

    Returns:
        ``dcg_at_k(predicted_scores)`` divided by the ideal DCG, or ``0.0``
        when the ideal DCG is not positive.

    Note:
        The result is a conventional NDCG (bounded by 1) only when
        ``predicted_scores`` holds the true relevances arranged in predicted
        rank order; nothing here enforces that.
    '''
    ideal_dcg = dcg_at_k(sorted(actual_scores, reverse=True))
    if ideal_dcg > 0.0:
        return dcg_at_k(predicted_scores) / ideal_dcg
    return 0.0
90 |
91 |
# --- Script: evaluate the blended recommendations of one weight setting ---
predictions = pd.read_csv("test_prediction_HP.csv", usecols=range(1, 9))

# Blend weights per algorithm; as configured only SVD++ contributes.
algo_weights = dict()
algo_weights['svd'] = 0
algo_weights['knn'] = 0
algo_weights['svdpp'] = 1
algo_weights['slope'] = 0
algo_weights['baseline'] = 0
n = 5
threshold = 3.75
top_n, top_n_ndcg = get_top_n(predictions, algo_weights, n)

# Persist each user's top-n list, one "uid, [(iid, est), ...]" row per user.
with open('top5_svdpp.csv', 'w', newline="") as csv_file:
    writer = csv.writer(csv_file)
    for key, value in top_n.items():
        writer.writerow([key, value])

ndcg_scores = dict()
for uid, user_ratings in top_n_ndcg.items():
    # user_ratings is already ordered by final estimate (highest first), so
    # this is the list of TRUE ratings in predicted rank order.
    true = [tru_r for _, tru_r, _est_r in user_ratings]
    # NDCG compares the DCG of the true relevances in predicted order with
    # the DCG of the same relevances in ideal order. The earlier call passed
    # the estimated rating values as the numerator, which mixes predicted
    # values with true ones and is not bounded by 1.
    ndcg_scores[uid] = ndcg_at_k(true, true)

# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

precisions, recalls = precision_recall_at_k(predictions, algo_weights, n, threshold)
precision = sum(prec for prec in precisions.values())/len(precisions)
recall = sum(rec for rec in recalls.values())/len(recalls)
# F-measure is defined as 0 when precision and recall are both 0
# (avoids a ZeroDivisionError).
fmeasure = (2*precision*recall)/(precision + recall) if (precision + recall) != 0 else 0.0
# NOTE: this module-level name shadows the ndcg_score imported from
# sklearn.metrics; the name is kept unchanged from the original script.
ndcg_score = sum(ndcg for ndcg in ndcg_scores.values())/len(ndcg_scores)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-Measure", fmeasure)
print("NDCG Score: ", ndcg_score)
130 |
--------------------------------------------------------------------------------
/Code/generating_predictions.py:
--------------------------------------------------------------------------------
1 | from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
2 | from surprise import Dataset
3 | from surprise.model_selection import cross_validate
4 | from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
5 | from surprise import accuracy
6 | from surprise.model_selection import train_test_split
7 |
8 | import pandas as pd
9 | import numpy as np
10 |
11 |
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Build Surprise trainset/testset objects from two rating DataFrames.

    Only the ``userId``, ``movieId`` and ``rating`` columns of each frame are
    used; both are loaded with a ``Reader`` declared with a (0, 5) rating scale.

    Args:
        training_dataframe: DataFrame with the training ratings.
        testing_dataframe: DataFrame with the held-out ratings.

    Returns:
        tuple: ``(trainset, testset)`` obtained from ``construct_trainset`` and
        ``construct_testset`` on the raw ratings of the loaded datasets.
    """
    wanted = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(0, 5))

    loaded_train = Dataset.load_from_df(training_dataframe[wanted], reader)
    loaded_test = Dataset.load_from_df(testing_dataframe[wanted], reader)

    return (
        loaded_train.construct_trainset(loaded_train.raw_ratings),
        loaded_test.construct_testset(loaded_test.raw_ratings),
    )
19 |
20 |
def recommendation(algo, trainset, testset):
    """Fit ``algo`` on ``trainset`` and score its predictions on ``testset``.

    Only the test split is evaluated; no predictions are made on the
    training data.

    Args:
        algo: algorithm object exposing ``fit`` and ``test``.
        trainset: training data passed to ``algo.fit``.
        testset: held-out data passed to ``algo.test``.

    Returns:
        tuple: ``(test_rmse, test_mae, test_predictions)`` where the two error
        values come from ``accuracy.rmse`` / ``accuracy.mae`` on the test
        predictions.
    """
    algo.fit(trainset)

    predictions_on_test = algo.test(testset)
    rmse_on_test = accuracy.rmse(predictions_on_test)
    mae_on_test = accuracy.mae(predictions_on_test)

    return rmse_on_test, mae_on_test, predictions_on_test
36 |
37 |
# --- Script: fit five Surprise models and dump their test predictions ---
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)


print("1")
# Baseline estimates only. (A stray ``BaselineOnly()`` call whose result was
# discarded used to precede this line; it has been removed.)
algo = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred = recommendation(algo, trainset, testset)

print("2")
# basic collaborative filtering algorithm taking into account a baseline rating.
sim_options = {'name': 'pearson_baseline',
               'user_based': False  # compute similarities between items
               }
algo = KNNBaseline(sim_options=sim_options)
test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(algo, trainset, testset)

print("3")
# SlopeOne
algo = SlopeOne()
test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(algo, trainset, testset)

print("4")
# SVD
algo = SVD()
test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)

print("5")
# SVDpp
algo = SVDpp()
test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)

print("6")
combined_columns = ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating',
                    'baseline_rating']
single_columns = ['uid', 'iid', 'og_rating', 'est_rating']

# Collect plain row lists first and build each DataFrame once at the end.
# Growing a DataFrame with pd.concat one row at a time re-copies the whole
# frame on every iteration (quadratic in the number of predictions).
combined_rows = []
svd_rows = []
svdpp_rows = []
knnb_rows = []
slope_rows = []
bonly_rows = []

num_test = len(test_base_pred)
for i in range(num_test):
    # The i-th prediction of every model is taken to describe the same
    # (user, item) pair, since all models were tested on the same testset;
    # the identifying fields are read from the SVD prediction.
    svd = test_svd_pred[i]
    slopeone = test_slopeone_pred[i]
    knn = test_knn_pred[i]
    svdpp = test_svdpp_pred[i]
    baseline = test_base_pred[i]

    key = [svd.uid, svd.iid, svd.r_ui]
    combined_rows.append(key + [svd.est, knn.est, svdpp.est, slopeone.est, baseline.est])
    svd_rows.append(key + [svd.est])
    svdpp_rows.append(key + [svdpp.est])
    knnb_rows.append(key + [knn.est])
    slope_rows.append(key + [slopeone.est])
    bonly_rows.append(key + [baseline.est])

# The former loop prepended each new row, so the saved files list the
# predictions in reverse order; the reversal keeps that row order.
test_pred_df = pd.DataFrame(combined_rows[::-1], columns=combined_columns)
test_svd_df = pd.DataFrame(svd_rows[::-1], columns=single_columns)
test_svdpp_df = pd.DataFrame(svdpp_rows[::-1], columns=single_columns)
test_knnb_df = pd.DataFrame(knnb_rows[::-1], columns=single_columns)
test_slope_df = pd.DataFrame(slope_rows[::-1], columns=single_columns)
test_bonly_df = pd.DataFrame(bonly_rows[::-1], columns=single_columns)

print("7")
test_pred_df.to_csv('test_prediction_HP.csv')
test_svd_df.to_csv('test_predictions_svd.csv')
test_svdpp_df.to_csv('test_predictions_svdpp.csv')
test_knnb_df.to_csv('test_predictions_knnb.csv')
test_slope_df.to_csv('test_predictions_slope.csv')
test_bonly_df.to_csv('test_predictions_bonly.csv')
122 |
--------------------------------------------------------------------------------
/Code/combined_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "combined_model.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "6IBmQTfmBf2k",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "This notebook combines the individual models' ratings to form a unified model which performs better. The ratings from individual models are combined using a weighted linear combination to form a resultant rating. This method helps overcome the shortcomings of the individual methods. \\\\\n",
24 | "The ratings are those generated using surprise library."
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "metadata": {
30 | "id": "fRRCvSdKBeX_",
31 | "colab_type": "code",
32 | "colab": {}
33 | },
34 | "source": [
35 | "import pandas as pd\n",
36 | "import numpy as np\n",
37 | "import math"
38 | ],
39 | "execution_count": null,
40 | "outputs": []
41 | },
42 | {
43 | "cell_type": "code",
44 | "metadata": {
45 | "id": "NBUye_PbsZdg",
46 | "colab_type": "code",
47 | "colab": {
48 | "base_uri": "https://localhost:8080/",
49 | "height": 195
50 | },
51 | "outputId": "b9048120-dea6-4485-f0b9-118915146f4b"
52 | },
53 | "source": [
54 | "pred_data = pd.read_csv('test_prediction_HP.csv')\n",
55 | "pred_data.head() "
56 | ],
57 | "execution_count": null,
58 | "outputs": [
59 | {
60 | "output_type": "execute_result",
61 | "data": {
62 | "text/html": [
63 | "
\n",
64 | "\n",
77 | "
\n",
78 | " \n",
79 | " \n",
80 | " | \n",
81 | " Unnamed: 0 | \n",
82 | " uid | \n",
83 | " iid | \n",
84 | " og_rating | \n",
85 | " svd_rating | \n",
86 | " knn_rating | \n",
87 | " svdpp_rating | \n",
88 | " slopeone_rating | \n",
89 | " baseline_rating | \n",
90 | "
\n",
91 | " \n",
92 | " \n",
93 | " \n",
94 | " | 0 | \n",
95 | " 0 | \n",
96 | " 610 | \n",
97 | " 163981 | \n",
98 | " 3.5 | \n",
99 | " 3.571637 | \n",
100 | " 3.603256 | \n",
101 | " 3.538527 | \n",
102 | " 3.501078 | \n",
103 | " 3.603256 | \n",
104 | "
\n",
105 | " \n",
106 | " | 1 | \n",
107 | " 1 | \n",
108 | " 610 | \n",
109 | " 162350 | \n",
110 | " 3.5 | \n",
111 | " 3.430078 | \n",
112 | " 3.517200 | \n",
113 | " 3.323570 | \n",
114 | " 2.752871 | \n",
115 | " 3.601820 | \n",
116 | "
\n",
117 | " \n",
118 | " | 2 | \n",
119 | " 2 | \n",
120 | " 610 | \n",
121 | " 161582 | \n",
122 | " 4.0 | \n",
123 | " 3.715722 | \n",
124 | " 4.025055 | \n",
125 | " 3.836845 | \n",
126 | " 4.253110 | \n",
127 | " 3.760107 | \n",
128 | "
\n",
129 | " \n",
130 | " | 3 | \n",
131 | " 3 | \n",
132 | " 610 | \n",
133 | " 159093 | \n",
134 | " 3.0 | \n",
135 | " 3.889187 | \n",
136 | " 3.717144 | \n",
137 | " 3.499436 | \n",
138 | " 3.737276 | \n",
139 | " 3.728456 | \n",
140 | "
\n",
141 | " \n",
142 | " | 4 | \n",
143 | " 4 | \n",
144 | " 610 | \n",
145 | " 156726 | \n",
146 | " 4.5 | \n",
147 | " 3.209341 | \n",
148 | " 3.863298 | \n",
149 | " 3.004246 | \n",
150 | " 1.850029 | \n",
151 | " 3.439723 | \n",
152 | "
\n",
153 | " \n",
154 | "
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " Unnamed: 0 uid iid ... svdpp_rating slopeone_rating baseline_rating\n",
159 | "0 0 610 163981 ... 3.538527 3.501078 3.603256\n",
160 | "1 1 610 162350 ... 3.323570 2.752871 3.601820\n",
161 | "2 2 610 161582 ... 3.836845 4.253110 3.760107\n",
162 | "3 3 610 159093 ... 3.499436 3.737276 3.728456\n",
163 | "4 4 610 156726 ... 3.004246 1.850029 3.439723\n",
164 | "\n",
165 | "[5 rows x 9 columns]"
166 | ]
167 | },
168 | "metadata": {
169 | "tags": []
170 | },
171 | "execution_count": 2
172 | }
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "metadata": {
178 | "id": "Uc2p2jUlSLJC",
179 | "colab_type": "code",
180 | "colab": {}
181 | },
182 | "source": [
183 | "# pred_data = pred_data.drop(169639)"
184 | ],
185 | "execution_count": null,
186 | "outputs": []
187 | },
188 | {
189 | "cell_type": "code",
190 | "metadata": {
191 | "id": "a9zxxdMpRSSv",
192 | "colab_type": "code",
193 | "colab": {
194 | "base_uri": "https://localhost:8080/",
195 | "height": 34
196 | },
197 | "outputId": "6513a5da-0d2d-4f34-80cf-4d006d57df72"
198 | },
199 | "source": [
200 | "# num of rows:\n",
201 | "T = pred_data.shape[0]\n",
202 | "print(T)"
203 | ],
204 | "execution_count": null,
205 | "outputs": [
206 | {
207 | "output_type": "stream",
208 | "text": [
209 | "20168\n"
210 | ],
211 | "name": "stdout"
212 | }
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "metadata": {
218 | "id": "kqfVFGZ8sxLg",
219 | "colab_type": "code",
220 | "colab": {}
221 | },
222 | "source": [
223 | "svd_wt = 0.05\n",
224 | "knn_wt = 0.6\n",
225 | "svdpp_wt = 0.4\n",
226 | "slopeone_wt = 0\n",
227 | "baseline_wt = 0"
228 | ],
229 | "execution_count": null,
230 | "outputs": []
231 | },
232 | {
233 | "cell_type": "code",
234 | "metadata": {
235 | "id": "5lWF0bq2OhV9",
236 | "colab_type": "code",
237 | "colab": {
238 | "base_uri": "https://localhost:8080/",
239 | "height": 50
240 | },
241 | "outputId": "04f216b1-c95a-400a-9599-2c1044bfcaad"
242 | },
243 | "source": [
244 | "rmse = ((pred_data.og_rating - pred_data.knn_rating) ** 2).mean() ** .5\n",
245 | "print(rmse)\n",
246 | "mae = (((pred_data.og_rating - pred_data.knn_rating) ** 2) ** .5).mean()\n",
247 | "print(mae)"
248 | ],
249 | "execution_count": null,
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "text": [
254 | "0.8527048407985283\n",
255 | "0.64818310503639\n"
256 | ],
257 | "name": "stdout"
258 | }
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "metadata": {
264 | "id": "oxcquF0lQOVa",
265 | "colab_type": "code",
266 | "colab": {
267 | "base_uri": "https://localhost:8080/",
268 | "height": 50
269 | },
270 | "outputId": "cfe99592-5a54-4ade-c80f-a5f58817f727"
271 | },
272 | "source": [
273 | "rmse = ((pred_data.og_rating - pred_data.svdpp_rating) ** 2).mean() ** .5\n",
274 | "print(rmse)\n",
275 | "mae = (((pred_data.og_rating - pred_data.svdpp_rating) ** 2) ** .5).mean()\n",
276 | "print(mae)"
277 | ],
278 | "execution_count": null,
279 | "outputs": [
280 | {
281 | "output_type": "stream",
282 | "text": [
283 | "0.8668435463304792\n",
284 | "0.6611243052231001\n"
285 | ],
286 | "name": "stdout"
287 | }
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "metadata": {
293 | "id": "IEcpZCbBLTCS",
294 | "colab_type": "code",
295 | "colab": {
296 | "base_uri": "https://localhost:8080/",
297 | "height": 50
298 | },
299 | "outputId": "a9b0788a-8986-487a-ecae-9d1583d3d51b"
300 | },
301 | "source": [
302 | "sqr_sum = 0\n",
303 | "abs_sum = 0\n",
304 | "\n",
305 | "for ind, row in pred_data.iterrows():\n",
306 | " org_r = row['og_rating']\n",
307 | " pred_r = svd_wt*row['svd_rating'] + knn_wt*row['knn_rating'] + svdpp_wt*row['svdpp_rating'] + slopeone_wt*row['slopeone_rating'] + baseline_wt*row['baseline_rating']\n",
308 | " diff = np.abs(org_r - pred_r)\n",
309 | " # print(diff)\n",
310 | " abs_sum += diff\n",
311 | " sqr_sum += diff**2\n",
312 | "\n",
313 | "rmse = np.sqrt(sqr_sum/T)\n",
314 | "print(\"RMSE\", rmse)\n",
315 | "mae = abs_sum/T\n",
316 | "print(\"MAE\", mae)"
317 | ],
318 | "execution_count": null,
319 | "outputs": [
320 | {
321 | "output_type": "stream",
322 | "text": [
323 | "RMSE 0.8440081164615088\n",
324 | "MAE 0.6426598370928285\n"
325 | ],
326 | "name": "stdout"
327 | }
328 | ]
329 | }
330 | ]
331 | }
--------------------------------------------------------------------------------
/Code/surprise_model_predictions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "surprise_model_predictions.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "code",
18 | "metadata": {
19 | "id": "fi8oBmWXD1rA",
20 | "colab_type": "code",
21 | "colab": {
22 | "base_uri": "https://localhost:8080/",
23 | "height": 118
24 | },
25 | "outputId": "d86444ed-d23b-4e76-b327-9d766fd375f4"
26 | },
27 | "source": [
28 | "!pip install surprise"
29 | ],
30 | "execution_count": null,
31 | "outputs": [
32 | {
33 | "output_type": "stream",
34 | "text": [
35 | "Requirement already satisfied: surprise in /usr/local/lib/python3.6/dist-packages (0.1)\n",
36 | "Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.6/dist-packages (from surprise) (1.1.0)\n",
37 | "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.12.0)\n",
38 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (0.14.1)\n",
39 | "Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.18.2)\n",
40 | "Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.4.1)\n"
41 | ],
42 | "name": "stdout"
43 | }
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "metadata": {
49 | "id": "J9FfIKsk0bDJ",
50 | "colab_type": "code",
51 | "colab": {}
52 | },
53 | "source": [
54 | "from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader\n",
55 | "from surprise import Dataset\n",
56 | "from surprise.model_selection import cross_validate\n",
57 | "from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore\n",
58 | "from surprise import accuracy\n",
59 | "from surprise.model_selection import train_test_split"
60 | ],
61 | "execution_count": null,
62 | "outputs": []
63 | },
64 | {
65 | "cell_type": "code",
66 | "metadata": {
67 | "id": "mCZrwlMiOZPg",
68 | "colab_type": "code",
69 | "colab": {}
70 | },
71 | "source": [
72 | "import pandas as pd\n",
73 | "import numpy as np"
74 | ],
75 | "execution_count": null,
76 | "outputs": []
77 | },
78 | {
79 | "cell_type": "code",
80 | "metadata": {
81 | "id": "olgICu7ZYyM7",
82 | "colab_type": "code",
83 | "colab": {
84 | "base_uri": "https://localhost:8080/",
85 | "height": 67
86 | },
87 | "outputId": "1c8bb714-5fe1-497a-9344-130c4fbb91ef"
88 | },
89 | "source": [
90 | "# Load the movielens-1M dataset\n",
91 | "data = Dataset.load_builtin('ml-1m')"
92 | ],
93 | "execution_count": null,
94 | "outputs": [
95 | {
96 | "output_type": "stream",
97 | "text": [
98 | "Dataset ml-1m could not be found. Do you want to download it? [Y/n] y\n",
99 | "Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...\n",
100 | "Done! Dataset ml-1m has been saved to /root/.surprise_data/ml-1m\n"
101 | ],
102 | "name": "stdout"
103 | }
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "metadata": {
109 | "id": "DnLayS6VaXZL",
110 | "colab_type": "code",
111 | "colab": {}
112 | },
113 | "source": [
114 | "# sample random trainset and testset\n",
115 | "# test set is made of 20% of the ratings.\n",
116 | "trainset, testset = train_test_split(data, test_size=.20)"
117 | ],
118 | "execution_count": null,
119 | "outputs": []
120 | },
121 | {
122 | "cell_type": "code",
123 | "metadata": {
124 | "id": "AC2Mt8xUyccA",
125 | "colab_type": "code",
126 | "colab": {}
127 | },
128 | "source": [
129 | "def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):\n",
130 | " reader = Reader(rating_scale=(0, 5))\n",
131 | " trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)\n",
132 | " testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)\n",
133 | " trainset = trainset.construct_trainset(trainset.raw_ratings)\n",
134 | " testset = testset.construct_testset(testset.raw_ratings)\n",
135 | " return trainset, testset"
136 | ],
137 | "execution_count": null,
138 | "outputs": []
139 | },
140 | {
141 | "cell_type": "code",
142 | "metadata": {
143 | "id": "dHBcLq3eyi0T",
144 | "colab_type": "code",
145 | "colab": {}
146 | },
147 | "source": [
148 | "file_path_train = 'training_data.csv'\n",
149 | "file_path_test = 'testing_data.csv'\n",
150 | "traindf = pd.read_csv(file_path_train)\n",
151 | "testdf = pd.read_csv(file_path_test)\n",
152 | "trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)"
153 | ],
154 | "execution_count": null,
155 | "outputs": []
156 | },
157 | {
158 | "cell_type": "code",
159 | "metadata": {
160 | "id": "AlitWSrNb2wZ",
161 | "colab_type": "code",
162 | "colab": {}
163 | },
164 | "source": [
165 | "def recommendation(algo, trainset, testset):\n",
166 | " # Train the algorithm on the trainset, and predict ratings for the testset\n",
167 | " algo.fit(trainset)\n",
168 | "\n",
169 | " # Predictions on testing set\n",
170 | " test_predictions = algo.test(testset)\n",
171 | " test_rmse = accuracy.rmse(test_predictions)\n",
172 | " test_mae = accuracy.mae(test_predictions)\n",
173 | " \n",
174 | " return test_rmse, test_mae, test_predictions"
175 | ],
176 | "execution_count": null,
177 | "outputs": []
178 | },
179 | {
180 | "cell_type": "code",
181 | "metadata": {
182 | "id": "9ZblN_7unqoU",
183 | "colab_type": "code",
184 | "colab": {}
185 | },
186 | "source": [
187 | "# results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=False)"
188 | ],
189 | "execution_count": null,
190 | "outputs": []
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {
195 | "id": "Iv9GSCQx24RI",
196 | "colab_type": "text"
197 | },
198 | "source": [
199 | "#### Experimenting"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "metadata": {
205 | "id": "E777XIBI26SQ",
206 | "colab_type": "code",
207 | "colab": {
208 | "base_uri": "https://localhost:8080/",
209 | "height": 84
210 | },
211 | "outputId": "b5629514-2562-4eda-b44f-67cfbfa18a8a"
212 | },
213 | "source": [
214 | "print('Using ALS')\n",
215 | "bsl_options = {'method': 'als',\n",
216 | " 'n_epochs': 5,\n",
217 | " 'reg_u': 12,\n",
218 | " 'reg_i': 5\n",
219 | " }\n",
220 | "algo = BaselineOnly(bsl_options=bsl_options)\n",
221 | "test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)"
222 | ],
223 | "execution_count": null,
224 | "outputs": [
225 | {
226 | "output_type": "stream",
227 | "text": [
228 | "Using ALS\n",
229 | "Estimating biases using als...\n",
230 | "RMSE: 0.8677\n",
231 | "MAE: 0.6659\n"
232 | ],
233 | "name": "stdout"
234 | }
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "metadata": {
240 | "id": "luHqcF-H30jl",
241 | "colab_type": "code",
242 | "colab": {
243 | "base_uri": "https://localhost:8080/",
244 | "height": 84
245 | },
246 | "outputId": "a96a611a-eab4-49ee-e34d-c99c847b584f"
247 | },
248 | "source": [
249 | "print('Using SGD')\n",
250 | "# bsl_options = {'method': 'sgd',\n",
251 | "# 'learning_rate': .00005,\n",
252 | "# }\n",
253 | "algo = BaselineOnly() # bsl_options=bsl_options\n",
254 | "test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)"
255 | ],
256 | "execution_count": null,
257 | "outputs": [
258 | {
259 | "output_type": "stream",
260 | "text": [
261 | "Using SGD\n",
262 | "Estimating biases using als...\n",
263 | "RMSE: 0.8735\n",
264 | "MAE: 0.6718\n"
265 | ],
266 | "name": "stdout"
267 | }
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {
273 | "id": "gBCl6LOoBPgQ",
274 | "colab_type": "text"
275 | },
276 | "source": [
277 | "##### Calculating predictions for the top methods:"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "metadata": {
283 | "id": "KuTTB-6Th8ZN",
284 | "colab_type": "code",
285 | "colab": {
286 | "base_uri": "https://localhost:8080/",
287 | "height": 101
288 | },
289 | "outputId": "a202ac5a-0dc8-4a9b-9349-847ce601c9fc"
290 | },
291 | "source": [
292 | "# KNNBaseline\n",
293 | "\n",
294 | "algo = KNNBaseline()\n",
295 | "test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(algo, trainset, testset)"
296 | ],
297 | "execution_count": null,
298 | "outputs": [
299 | {
300 | "output_type": "stream",
301 | "text": [
302 | "Estimating biases using als...\n",
303 | "Computing the msd similarity matrix...\n",
304 | "Done computing similarity matrix.\n",
305 | "RMSE: 0.8763\n",
306 | "MAE: 0.6660\n"
307 | ],
308 | "name": "stdout"
309 | }
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "metadata": {
315 | "id": "ndRC8sVBBoje",
316 | "colab_type": "code",
317 | "colab": {
318 | "base_uri": "https://localhost:8080/",
319 | "height": 50
320 | },
321 | "outputId": "8f1c4945-631c-4a2c-e7ef-92a13db48983"
322 | },
323 | "source": [
324 | "# SlopeOne\n",
325 | "\n",
326 | "algo = SlopeOne()\n",
327 | "test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(algo, trainset, testset)"
328 | ],
329 | "execution_count": null,
330 | "outputs": [
331 | {
332 | "output_type": "stream",
333 | "text": [
334 | "RMSE: 0.9070\n",
335 | "MAE: 0.7145\n"
336 | ],
337 | "name": "stdout"
338 | }
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "metadata": {
344 | "id": "INAgGkTFBxlT",
345 | "colab_type": "code",
346 | "colab": {
347 | "base_uri": "https://localhost:8080/",
348 | "height": 50
349 | },
350 | "outputId": "0e9a390a-e87d-4f55-a97e-6284f7348074"
351 | },
352 | "source": [
353 | "# SVD\n",
354 | "\n",
355 | "algo = SVD()\n",
356 | "test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)"
357 | ],
358 | "execution_count": null,
359 | "outputs": [
360 | {
361 | "output_type": "stream",
362 | "text": [
363 | "RMSE: 0.8743\n",
364 | "MAE: 0.6858\n"
365 | ],
366 | "name": "stdout"
367 | }
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "metadata": {
373 | "id": "uJCkdey1B02t",
374 | "colab_type": "code",
375 | "colab": {
376 | "base_uri": "https://localhost:8080/",
377 | "height": 50
378 | },
379 | "outputId": "662ecf2e-f6b3-4f46-d55e-40c825c9009b"
380 | },
381 | "source": [
382 | "# SVDpp\n",
383 | "\n",
384 | "algo = SVDpp()\n",
385 | "test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)"
386 | ],
387 | "execution_count": null,
388 | "outputs": [
389 | {
390 | "output_type": "stream",
391 | "text": [
392 | "RMSE: 0.8697\n",
393 | "MAE: 0.6643\n"
394 | ],
395 | "name": "stdout"
396 | }
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "metadata": {
402 | "id": "nprC9tRcymxk",
403 | "colab_type": "code",
404 | "colab": {
405 | "base_uri": "https://localhost:8080/",
406 | "height": 67
407 | },
408 | "outputId": "8b64d3ec-8a8b-4963-8a0f-54ef0ae090c4"
409 | },
410 | "source": [
411 | "# BaselineOnly()\n",
412 | "\n",
413 | "algo = BaselineOnly()\n",
414 | "test_base_rmse, test_base_mae, test_base_pred = recommendation(algo, trainset, testset)"
415 | ],
416 | "execution_count": null,
417 | "outputs": [
418 | {
419 | "output_type": "stream",
420 | "text": [
421 | "Estimating biases using als...\n",
422 | "RMSE: 0.8735\n",
423 | "MAE: 0.6718\n"
424 | ],
425 | "name": "stdout"
426 | }
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "metadata": {
432 | "id": "5OqtsT-5MPAh",
433 | "colab_type": "code",
434 | "colab": {}
435 | },
436 | "source": [
437 | "test_pred_df = pd.DataFrame(columns= ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating', 'baseline_rating'])"
438 | ],
439 | "execution_count": null,
440 | "outputs": []
441 | },
442 | {
443 | "cell_type": "code",
444 | "metadata": {
445 | "id": "ExyS3zHlzxsP",
446 | "colab_type": "code",
447 | "colab": {
448 | "base_uri": "https://localhost:8080/",
449 | "height": 34
450 | },
451 | "outputId": "3e7f502e-2286-4806-cf93-217aa64a4b08"
452 | },
453 | "source": [
454 | "num_test = len(test_base_pred)\n",
455 | "print(num_test)"
456 | ],
457 | "execution_count": null,
458 | "outputs": [
459 | {
460 | "output_type": "stream",
461 | "text": [
462 | "200042\n"
463 | ],
464 | "name": "stdout"
465 | }
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "metadata": {
471 | "id": "qHYtUbMyrOA9",
472 | "colab_type": "text"
473 | },
474 | "source": [
475 | "##### Storing testing set predictions:"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "metadata": {
481 | "id": "YSPLXRAgzfka",
482 | "colab_type": "code",
483 | "colab": {}
484 | },
485 | "source": [
486 | "for i in range(num_test): \n",
487 | " svd = test_svd_pred[i]\n",
488 | " slopeone = test_slopeone_pred[i]\n",
489 | " knn = test_knn_pred[i]\n",
490 | " svdpp = test_svdpp_pred[i]\n",
491 | " baseline = test_base_pred[i]\n",
492 | " df = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, svdpp.est, slopeone.est, baseline.est]], columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating','baseline_rating'])\n",
493 | " # print(df)\n",
494 | " test_pred_df = pd.concat([df, test_pred_df], ignore_index=True)"
495 | ],
496 | "execution_count": null,
497 | "outputs": []
498 | },
499 | {
500 | "cell_type": "code",
501 | "metadata": {
502 | "id": "fJdb0S-A5PiX",
503 | "colab_type": "code",
504 | "colab": {
505 | "base_uri": "https://localhost:8080/",
506 | "height": 402
507 | },
508 | "outputId": "7e1bbd47-57f4-464e-ea62-95cda20ad0b5"
509 | },
510 | "source": [
511 | "test_pred_df"
512 | ],
513 | "execution_count": null,
514 | "outputs": [
515 | {
516 | "output_type": "execute_result",
517 | "data": {
518 | "text/html": [
519 | "\n",
520 | "\n",
533 | "
\n",
534 | " \n",
535 | " \n",
536 | " | \n",
537 | " uid | \n",
538 | " iid | \n",
539 | " og_rating | \n",
540 | " svd_rating | \n",
541 | " knn_rating | \n",
542 | " svdpp_rating | \n",
543 | " slopeone_rating | \n",
544 | " baseline_rating | \n",
545 | "
\n",
546 | " \n",
547 | " \n",
548 | " \n",
549 | " | 0 | \n",
550 | " 695 | \n",
551 | " 2791 | \n",
552 | " 4.0 | \n",
553 | " 3.507685 | \n",
554 | " 3.815840 | \n",
555 | " 3.936685 | \n",
556 | " 4.240711 | \n",
557 | " 4.146045 | \n",
558 | "
\n",
559 | " \n",
560 | " | 1 | \n",
561 | " 6016 | \n",
562 | " 3668 | \n",
563 | " 3.0 | \n",
564 | " 3.404877 | \n",
565 | " 3.557922 | \n",
566 | " 3.658979 | \n",
567 | " 3.390132 | \n",
568 | " 3.442060 | \n",
569 | "
\n",
570 | " \n",
571 | " | 2 | \n",
572 | " 5482 | \n",
573 | " 1221 | \n",
574 | " 5.0 | \n",
575 | " 4.622452 | \n",
576 | " 4.491665 | \n",
577 | " 4.451363 | \n",
578 | " 4.669042 | \n",
579 | " 4.554867 | \n",
580 | "
\n",
581 | " \n",
582 | " | 3 | \n",
583 | " 3389 | \n",
584 | " 2959 | \n",
585 | " 4.0 | \n",
586 | " 3.899992 | \n",
587 | " 3.217574 | \n",
588 | " 4.235280 | \n",
589 | " 3.559392 | \n",
590 | " 3.450094 | \n",
591 | "
\n",
592 | " \n",
593 | " | 4 | \n",
594 | " 4303 | \n",
595 | " 608 | \n",
596 | " 4.0 | \n",
597 | " 4.093749 | \n",
598 | " 4.250497 | \n",
599 | " 4.757454 | \n",
600 | " 4.282707 | \n",
601 | " 4.180708 | \n",
602 | "
\n",
603 | " \n",
604 | " | ... | \n",
605 | " ... | \n",
606 | " ... | \n",
607 | " ... | \n",
608 | " ... | \n",
609 | " ... | \n",
610 | " ... | \n",
611 | " ... | \n",
612 | " ... | \n",
613 | "
\n",
614 | " \n",
615 | " | 200037 | \n",
616 | " 1447 | \n",
617 | " 3412 | \n",
618 | " 4.0 | \n",
619 | " 2.678937 | \n",
620 | " 3.412608 | \n",
621 | " 3.309891 | \n",
622 | " 3.192129 | \n",
623 | " 3.238168 | \n",
624 | "
\n",
625 | " \n",
626 | " | 200038 | \n",
627 | " 301 | \n",
628 | " 3396 | \n",
629 | " 4.0 | \n",
630 | " 4.292583 | \n",
631 | " 4.228340 | \n",
632 | " 4.594647 | \n",
633 | " 4.128157 | \n",
634 | " 4.114891 | \n",
635 | "
\n",
636 | " \n",
637 | " | 200039 | \n",
638 | " 984 | \n",
639 | " 3927 | \n",
640 | " 3.0 | \n",
641 | " 3.537646 | \n",
642 | " 3.446079 | \n",
643 | " 3.486974 | \n",
644 | " 3.514210 | \n",
645 | " 3.475889 | \n",
646 | "
\n",
647 | " \n",
648 | " | 200040 | \n",
649 | " 4672 | \n",
650 | " 2369 | \n",
651 | " 4.0 | \n",
652 | " 2.638634 | \n",
653 | " 2.882440 | \n",
654 | " 2.676785 | \n",
655 | " 2.742415 | \n",
656 | " 2.817915 | \n",
657 | "
\n",
658 | " \n",
659 | " | 200041 | \n",
660 | " 5234 | \n",
661 | " 3556 | \n",
662 | " 5.0 | \n",
663 | " 3.970203 | \n",
664 | " 3.656631 | \n",
665 | " 3.988456 | \n",
666 | " 3.779991 | \n",
667 | " 3.712596 | \n",
668 | "
\n",
669 | " \n",
670 | "
\n",
671 | "
200042 rows × 8 columns
\n",
672 | "
"
673 | ],
674 | "text/plain": [
675 | " uid iid og_rating ... svdpp_rating slopeone_rating baseline_rating\n",
676 | "0 695 2791 4.0 ... 3.936685 4.240711 4.146045\n",
677 | "1 6016 3668 3.0 ... 3.658979 3.390132 3.442060\n",
678 | "2 5482 1221 5.0 ... 4.451363 4.669042 4.554867\n",
679 | "3 3389 2959 4.0 ... 4.235280 3.559392 3.450094\n",
680 | "4 4303 608 4.0 ... 4.757454 4.282707 4.180708\n",
681 | "... ... ... ... ... ... ... ...\n",
682 | "200037 1447 3412 4.0 ... 3.309891 3.192129 3.238168\n",
683 | "200038 301 3396 4.0 ... 4.594647 4.128157 4.114891\n",
684 | "200039 984 3927 3.0 ... 3.486974 3.514210 3.475889\n",
685 | "200040 4672 2369 4.0 ... 2.676785 2.742415 2.817915\n",
686 | "200041 5234 3556 5.0 ... 3.988456 3.779991 3.712596\n",
687 | "\n",
688 | "[200042 rows x 8 columns]"
689 | ]
690 | },
691 | "metadata": {
692 | "tags": []
693 | },
694 | "execution_count": 20
695 | }
696 | ]
697 | },
698 | {
699 | "cell_type": "code",
700 | "metadata": {
701 | "id": "tSwp06K6JClS",
702 | "colab_type": "code",
703 | "colab": {}
704 | },
705 | "source": [
706 | "test_pred_df.to_csv('test_prediction.csv')"
707 | ],
708 | "execution_count": null,
709 | "outputs": []
710 | }
711 | ]
712 | }
--------------------------------------------------------------------------------
/Code/cold_start_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "cold_start_analysis.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "S5rY5TFX_Fzq",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "#### Cold Start Analysis:\n",
24 | "\n",
25 | "This notebook analyses the performance of different approaches in the case of a new user or a user with few interactions with the system, namely the cold start problem. \\\\\n",
26 | "We compute the RMSE and MAE for those users who have rated fewer than 18 movies and so on. \\\\\n",
27 | "We also observe the performance of the approaches for users who have rated more than 1000 movies. "
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "metadata": {
33 | "id": "l6H9h87h_DXr",
34 | "colab_type": "code",
35 | "colab": {}
36 | },
37 | "source": [
38 | "!pip install surprise"
39 | ],
40 | "execution_count": null,
41 | "outputs": []
42 | },
43 | {
44 | "cell_type": "code",
45 | "metadata": {
46 | "id": "G8QgQWPZ3kFu",
47 | "colab_type": "code",
48 | "colab": {}
49 | },
50 | "source": [
51 | "import pickle\n",
52 | "import os\n",
53 | "\n",
54 | "import pandas as pd\n",
55 | "\n",
56 | "from surprise import SVD, SVDpp\n",
57 | "from surprise import KNNBasic, KNNBaseline, BaselineOnly\n",
58 | "from surprise import Dataset \n",
59 | "from surprise import Reader \n",
60 | "from surprise import dump\n",
61 | "from surprise.accuracy import rmse"
62 | ],
63 | "execution_count": null,
64 | "outputs": []
65 | },
66 | {
67 | "cell_type": "code",
68 | "metadata": {
69 | "id": "Y3nN4GjS3sjT",
70 | "colab_type": "code",
71 | "colab": {}
72 | },
73 | "source": [
74 | "def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):\n",
75 | " reader = Reader(rating_scale=(0, 5))\n",
76 | " trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)\n",
77 | " testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)\n",
78 | " trainset = trainset.construct_trainset(trainset.raw_ratings)\n",
79 | " testset = testset.construct_testset(testset.raw_ratings)\n",
80 | " return trainset, testset"
81 | ],
82 | "execution_count": null,
83 | "outputs": []
84 | },
85 | {
86 | "cell_type": "code",
87 | "metadata": {
88 | "id": "z6OJ9U-E3zQP",
89 | "colab_type": "code",
90 | "colab": {}
91 | },
92 | "source": [
93 | "file_path_train = 'training_data.csv'\n",
94 | "file_path_test = 'testing_data.csv'\n",
95 | "traindf = pd.read_csv(file_path_train)\n",
96 | "testdf = pd.read_csv(file_path_test)\n",
97 | "trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)"
98 | ],
99 | "execution_count": null,
100 | "outputs": []
101 | },
102 | {
103 | "cell_type": "code",
104 | "metadata": {
105 | "id": "-aF-xIYxX4aB",
106 | "colab_type": "code",
107 | "colab": {
108 | "base_uri": "https://localhost:8080/",
109 | "height": 195
110 | },
111 | "outputId": "d1aa662c-301c-4d5f-cedf-23ed3984f02e"
112 | },
113 | "source": [
114 | "traindf.head()"
115 | ],
116 | "execution_count": null,
117 | "outputs": [
118 | {
119 | "output_type": "execute_result",
120 | "data": {
121 | "text/html": [
122 | "\n",
123 | "\n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " | \n",
140 | " userId | \n",
141 | " movieId | \n",
142 | " rating | \n",
143 | " timestamp | \n",
144 | " genres | \n",
145 | " tag | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " | 0 | \n",
151 | " 1 | \n",
152 | " 1 | \n",
153 | " 4.0 | \n",
154 | " 964982703 | \n",
155 | " ['Adventure', 'Animation', 'Children', 'Comedy... | \n",
156 | " [] | \n",
157 | "
\n",
158 | " \n",
159 | " | 1 | \n",
160 | " 1 | \n",
161 | " 6 | \n",
162 | " 4.0 | \n",
163 | " 964982224 | \n",
164 | " ['Action', 'Crime', 'Thriller'] | \n",
165 | " [] | \n",
166 | "
\n",
167 | " \n",
168 | " | 2 | \n",
169 | " 1 | \n",
170 | " 47 | \n",
171 | " 5.0 | \n",
172 | " 964983815 | \n",
173 | " ['Mystery', 'Thriller'] | \n",
174 | " [] | \n",
175 | "
\n",
176 | " \n",
177 | " | 3 | \n",
178 | " 1 | \n",
179 | " 50 | \n",
180 | " 5.0 | \n",
181 | " 964982931 | \n",
182 | " ['Crime', 'Mystery', 'Thriller'] | \n",
183 | " [] | \n",
184 | "
\n",
185 | " \n",
186 | " | 4 | \n",
187 | " 1 | \n",
188 | " 70 | \n",
189 | " 3.0 | \n",
190 | " 964982400 | \n",
191 | " ['Action', 'Comedy', 'Horror', 'Thriller'] | \n",
192 | " [] | \n",
193 | "
\n",
194 | " \n",
195 | "
\n",
196 | "
"
197 | ],
198 | "text/plain": [
199 | " userId movieId ... genres tag\n",
200 | "0 1 1 ... ['Adventure', 'Animation', 'Children', 'Comedy... []\n",
201 | "1 1 6 ... ['Action', 'Crime', 'Thriller'] []\n",
202 | "2 1 47 ... ['Mystery', 'Thriller'] []\n",
203 | "3 1 50 ... ['Crime', 'Mystery', 'Thriller'] []\n",
204 | "4 1 70 ... ['Action', 'Comedy', 'Horror', 'Thriller'] []\n",
205 | "\n",
206 | "[5 rows x 6 columns]"
207 | ]
208 | },
209 | "metadata": {
210 | "tags": []
211 | },
212 | "execution_count": 5
213 | }
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "metadata": {
219 | "id": "TzdYPykH4DMR",
220 | "colab_type": "code",
221 | "colab": {
222 | "base_uri": "https://localhost:8080/",
223 | "height": 50
224 | },
225 | "outputId": "91278275-9801-4593-f78c-3dc9ca7caf08"
226 | },
227 | "source": [
228 | "algo_svd = SVD() \n",
229 | "algo_svdpp = SVDpp() \n",
230 | "algo_knn = KNNBasic()\n",
231 | "\n",
232 | "\n",
233 | "algo_svd.fit(trainset) \n",
234 | "predictions_svd = algo_svd.test(testset)\n",
235 | "\n",
236 | "algo_svdpp.fit(trainset) \n",
237 | "predictions_svdpp = algo_svdpp.test(testset)\n",
238 | "\n",
239 | "algo_knn.fit(trainset)\n",
240 | "predictions_knn = algo_knn.test(testset)\n",
241 | "\n",
242 | "# rmse(predictions_svd)\n",
243 | "# rmse(predictions_knn) \n",
244 | "\n",
245 | "dump.dump('./dump_SVD', predictions_svd, algo_svd)\n",
246 | "dump.dump('./dump_SVDpp', predictions_svdpp, algo_svdpp)\n",
247 | "dump.dump('./dump_KNN', predictions_knn, algo_knn)"
248 | ],
249 | "execution_count": null,
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "text": [
254 | "Computing the msd similarity matrix...\n",
255 | "Done computing similarity matrix.\n"
256 | ],
257 | "name": "stdout"
258 | }
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "metadata": {
264 | "id": "IWpBO-jw4gR4",
265 | "colab_type": "code",
266 | "colab": {}
267 | },
268 | "source": [
269 | "df_svd = pd.DataFrame(predictions_svd, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
270 | "df_svdpp = pd.DataFrame(predictions_svdpp, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
271 | "df_knn = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'rui', 'est', 'details']) "
272 | ],
273 | "execution_count": null,
274 | "outputs": []
275 | },
276 | {
277 | "cell_type": "code",
278 | "metadata": {
279 | "id": "5ytiPn_6Z4D5",
280 | "colab_type": "code",
281 | "colab": {
282 | "base_uri": "https://localhost:8080/",
283 | "height": 67
284 | },
285 | "outputId": "c172233e-b73b-4226-faea-c9505e9c0b09"
286 | },
287 | "source": [
288 | "sim_options = {'name': 'pearson_baseline',\n",
289 | " 'user_based': False # compute similarities between items\n",
290 | " }\n",
291 | "# algo = KNNBaseline(sim_options=sim_options)\n",
292 | "algo_knnbaseline = KNNBaseline(sim_options=sim_options)\n",
293 | "algo_knnbaseline.fit(trainset)\n",
294 | "predictions_knnbaseline = algo_knnbaseline.test(testset)"
295 | ],
296 | "execution_count": null,
297 | "outputs": [
298 | {
299 | "output_type": "stream",
300 | "text": [
301 | "Estimating biases using als...\n",
302 | "Computing the pearson_baseline similarity matrix...\n",
303 | "Done computing similarity matrix.\n"
304 | ],
305 | "name": "stdout"
306 | }
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "metadata": {
312 | "id": "-qpAZxicab7y",
313 | "colab_type": "code",
314 | "colab": {}
315 | },
316 | "source": [
317 | "df_knnbaseline = pd.DataFrame(predictions_knnbaseline, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
318 | "df_knnbaseline['err'] = abs(df_knnbaseline.est - df_knnbaseline.rui)\n",
319 | "df_knnbaseline['sqr_err'] = (df_knnbaseline.est - df_knnbaseline.rui)**2"
320 | ],
321 | "execution_count": null,
322 | "outputs": []
323 | },
324 | {
325 | "cell_type": "code",
326 | "metadata": {
327 | "id": "yIlRty-X4z2T",
328 | "colab_type": "code",
329 | "colab": {}
330 | },
331 | "source": [
332 | "df_svd['err'] = abs(df_svd.est - df_svd.rui)\n",
333 | "df_svdpp['err'] = abs(df_svdpp.est - df_svdpp.rui)\n",
334 | "df_knn['err'] = abs(df_knn.est - df_knn.rui)"
335 | ],
336 | "execution_count": null,
337 | "outputs": []
338 | },
339 | {
340 | "cell_type": "code",
341 | "metadata": {
342 | "id": "VdC0IyDxY4xB",
343 | "colab_type": "code",
344 | "colab": {}
345 | },
346 | "source": [
347 | "df_svd['sqr_err'] = (df_svd.est - df_svd.rui)**2\n",
348 | "df_svdpp['sqr_err'] = (df_svdpp.est - df_svdpp.rui)**2\n",
349 | "df_knn['sqr_err'] = (df_knn.est - df_knn.rui)**2"
350 | ],
351 | "execution_count": null,
352 | "outputs": []
353 | },
354 | {
355 | "cell_type": "code",
356 | "metadata": {
357 | "id": "t4gOt7SHcVnO",
358 | "colab_type": "code",
359 | "colab": {
360 | "base_uri": "https://localhost:8080/",
361 | "height": 34
362 | },
363 | "outputId": "21a9276e-f43b-4dfc-9afb-65987e0cd1f1"
364 | },
365 | "source": [
366 | "algo_baselineonly = BaselineOnly()\n",
367 | "algo_baselineonly.fit(trainset)\n",
368 | "predictions_baselineonly = algo_baselineonly.test(testset)\n",
369 | "\n",
370 | "df_baselineonly = pd.DataFrame(predictions_baselineonly, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
371 | "df_baselineonly['err'] = abs(df_baselineonly.est - df_baselineonly.rui)\n",
372 | "df_baselineonly['sqr_err'] = (df_baselineonly.est - df_baselineonly.rui)**2\n",
373 | "df_baselineonly['Iu'] = df_baselineonly.uid.apply(get_Iu)"
374 | ],
375 | "execution_count": null,
376 | "outputs": [
377 | {
378 | "output_type": "stream",
379 | "text": [
380 | "Estimating biases using als...\n"
381 | ],
382 | "name": "stdout"
383 | }
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "metadata": {
389 | "id": "falRCQt3dYFC",
390 | "colab_type": "code",
391 | "colab": {
392 | "base_uri": "https://localhost:8080/",
393 | "height": 67
394 | },
395 | "outputId": "f3e8404f-77c9-427e-c663-f9ddadd0fb11"
396 | },
397 | "source": [
398 | "sim_options = {'name': 'pearson_baseline',\n",
399 | " 'user_based': True # compute similarities between users\n",
400 | " }\n",
401 | "algo_knnbaseline_user = KNNBaseline(sim_options=sim_options)\n",
402 | "algo_knnbaseline_user.fit(trainset)\n",
403 | "predictions_knnbaseline_user = algo_knnbaseline_user.test(testset)\n",
404 | "\n",
405 | "df_knn_user = pd.DataFrame(predictions_knnbaseline_user, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
406 | "df_knn_user['err'] = abs(df_knn_user.est - df_knn_user.rui)\n",
407 | "df_knn_user['sqr_err'] = (df_knn_user.est - df_knn_user.rui)**2\n",
408 | "df_knn_user['Iu'] = df_knn_user.uid.apply(get_Iu)"
409 | ],
410 | "execution_count": null,
411 | "outputs": [
412 | {
413 | "output_type": "stream",
414 | "text": [
415 | "Estimating biases using als...\n",
416 | "Computing the pearson_baseline similarity matrix...\n",
417 | "Done computing similarity matrix.\n"
418 | ],
419 | "name": "stdout"
420 | }
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "metadata": {
426 | "id": "SSoLuqrV65pK",
427 | "colab_type": "code",
428 | "colab": {
429 | "base_uri": "https://localhost:8080/",
430 | "height": 195
431 | },
432 | "outputId": "897911ad-5086-4798-914e-58df7da6b068"
433 | },
434 | "source": [
435 | "df_svd.head()"
436 | ],
437 | "execution_count": null,
438 | "outputs": [
439 | {
440 | "output_type": "execute_result",
441 | "data": {
442 | "text/html": [
443 | "\n",
444 | "\n",
457 | "
\n",
458 | " \n",
459 | " \n",
460 | " | \n",
461 | " uid | \n",
462 | " iid | \n",
463 | " rui | \n",
464 | " est | \n",
465 | " details | \n",
466 | " err | \n",
467 | " Iu | \n",
468 | "
\n",
469 | " \n",
470 | " \n",
471 | " \n",
472 | " | 0 | \n",
473 | " 1 | \n",
474 | " 3 | \n",
475 | " 4.0 | \n",
476 | " 4.200548 | \n",
477 | " {'was_impossible': False} | \n",
478 | " 0.200548 | \n",
479 | " 186 | \n",
480 | "
\n",
481 | " \n",
482 | " | 1 | \n",
483 | " 1 | \n",
484 | " 163 | \n",
485 | " 5.0 | \n",
486 | " 4.261322 | \n",
487 | " {'was_impossible': False} | \n",
488 | " 0.738678 | \n",
489 | " 186 | \n",
490 | "
\n",
491 | " \n",
492 | " | 2 | \n",
493 | " 1 | \n",
494 | " 316 | \n",
495 | " 3.0 | \n",
496 | " 4.024986 | \n",
497 | " {'was_impossible': False} | \n",
498 | " 1.024986 | \n",
499 | " 186 | \n",
500 | "
\n",
501 | " \n",
502 | " | 3 | \n",
503 | " 1 | \n",
504 | " 349 | \n",
505 | " 4.0 | \n",
506 | " 4.443186 | \n",
507 | " {'was_impossible': False} | \n",
508 | " 0.443186 | \n",
509 | " 186 | \n",
510 | "
\n",
511 | " \n",
512 | " | 4 | \n",
513 | " 1 | \n",
514 | " 441 | \n",
515 | " 4.0 | \n",
516 | " 4.758104 | \n",
517 | " {'was_impossible': False} | \n",
518 | " 0.758104 | \n",
519 | " 186 | \n",
520 | "
\n",
521 | " \n",
522 | "
\n",
523 | "
"
524 | ],
525 | "text/plain": [
526 | " uid iid rui est details err Iu\n",
527 | "0 1 3 4.0 4.200548 {'was_impossible': False} 0.200548 186\n",
528 | "1 1 163 5.0 4.261322 {'was_impossible': False} 0.738678 186\n",
529 | "2 1 316 3.0 4.024986 {'was_impossible': False} 1.024986 186\n",
530 | "3 1 349 4.0 4.443186 {'was_impossible': False} 0.443186 186\n",
531 | "4 1 441 4.0 4.758104 {'was_impossible': False} 0.758104 186"
532 | ]
533 | },
534 | "metadata": {
535 | "tags": []
536 | },
537 | "execution_count": 12
538 | }
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "metadata": {
544 | "id": "gyU3U3mLWG42",
545 | "colab_type": "code",
546 | "colab": {}
547 | },
548 | "source": [
549 | "content = pd.read_csv('content_based_genre_ratings.csv')"
550 | ],
551 | "execution_count": null,
552 | "outputs": []
553 | },
554 | {
555 | "cell_type": "code",
556 | "metadata": {
557 | "id": "V9pCMloU45Sh",
558 | "colab_type": "code",
559 | "colab": {}
560 | },
561 | "source": [
562 | "def get_Iu(uid):\n",
563 | " \"\"\"Return the number of items rated by given user\n",
564 | " \n",
565 | " Args:\n",
566 | " uid: The raw id of the user.\n",
567 | " Returns:\n",
568 | " The number of items rated by the user.\n",
569 | " \"\"\"\n",
570 | " \n",
571 | " try:\n",
572 | " return traindf[traindf['userId'] == uid].shape[0]\n",
573 | " except ValueError: # user was not part of the trainset\n",
574 | " return 0"
575 | ],
576 | "execution_count": null,
577 | "outputs": []
578 | },
579 | {
580 | "cell_type": "code",
581 | "metadata": {
582 | "id": "Xaia-Iy2WPYY",
583 | "colab_type": "code",
584 | "colab": {}
585 | },
586 | "source": [
587 | "content['Iu'] = content.userId.apply(get_Iu)"
588 | ],
589 | "execution_count": null,
590 | "outputs": []
591 | },
592 | {
593 | "cell_type": "code",
594 | "metadata": {
595 | "id": "8a_bM4hsWyHI",
596 | "colab_type": "code",
597 | "colab": {}
598 | },
599 | "source": [
600 | "content['err'] = abs(content.pred_rating - content.og_rating)\n",
601 | "content['sqr_err'] = (content.pred_rating - content.og_rating)**2\n",
602 | "# rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5\n",
603 | "# mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()\n"
604 | ],
605 | "execution_count": null,
606 | "outputs": []
607 | },
608 | {
609 | "cell_type": "code",
610 | "metadata": {
611 | "id": "yobQqAZTWd_4",
612 | "colab_type": "code",
613 | "colab": {
614 | "base_uri": "https://localhost:8080/",
615 | "height": 50
616 | },
617 | "outputId": "5c486dcc-0636-4452-b859-b51d6b2da9af"
618 | },
619 | "source": [
620 | "print(\"Content based \",content[content.Iu < 18].err.mean())\n",
621 | "print(\"Content based \",content[content.Iu < 18].sqr_err.mean()** .5)"
622 | ],
623 | "execution_count": null,
624 | "outputs": [
625 | {
626 | "output_type": "stream",
627 | "text": [
628 | "Content based 0.7942792057878261\n",
629 | "Content based 1.0584107905057996\n"
630 | ],
631 | "name": "stdout"
632 | }
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "metadata": {
638 | "id": "wz1Pkbzd4-dl",
639 | "colab_type": "code",
640 | "colab": {}
641 | },
642 | "source": [
643 | "df_knn['Iu'] = df_knn.uid.apply(get_Iu)\n",
644 | "df_svd['Iu'] = df_svd.uid.apply(get_Iu)\n",
645 | "df_svdpp['Iu'] = df_svdpp.uid.apply(get_Iu)\n",
646 | "df_knnbaseline['Iu'] = df_knnbaseline.uid.apply(get_Iu)"
647 | ],
648 | "execution_count": null,
649 | "outputs": []
650 | },
651 | {
652 | "cell_type": "code",
653 | "metadata": {
654 | "id": "q-VorHYTayVQ",
655 | "colab_type": "code",
656 | "colab": {
657 | "base_uri": "https://localhost:8080/",
658 | "height": 134
659 | },
660 | "outputId": "bb83cb23-6ebd-4ba1-ec31-8a599d8dd4ec"
661 | },
662 | "source": [
663 | "print(\"--------------------------MAE-----------------------\")\n",
664 | "print(\"KNN Basic \",df_knn[df_knn.Iu < 18].err.mean())\n",
665 | "print(\"SVD \", df_svd[df_svd.Iu < 18].err.mean())\n",
666 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu < 18].err.mean())\n",
667 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu < 18].err.mean())\n",
668 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu < 18].err.mean() )\n",
669 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu < 18].err.mean() )"
670 | ],
671 | "execution_count": null,
672 | "outputs": [
673 | {
674 | "output_type": "stream",
675 | "text": [
676 | "--------------------------MAE-----------------------\n",
677 | "KNN Basic 0.9356541418761788\n",
678 | "SVD 0.8174986369636367\n",
679 | "SVDpp 0.7853538665933238\n",
680 | "KNN Baseline (item-item) 0.7549100058171629\n",
681 | "BaselineOnly 0.828373767989461\n",
682 | "KNN Baseline (user-user) 0.8527037143570998\n"
683 | ],
684 | "name": "stdout"
685 | }
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "metadata": {
691 | "id": "nQOEO64Jf9BE",
692 | "colab_type": "code",
693 | "colab": {
694 | "base_uri": "https://localhost:8080/",
695 | "height": 134
696 | },
697 | "outputId": "d2da200e-954a-44d4-89c2-890afc4b14e5"
698 | },
699 | "source": [
700 | "print(\"--------------------------RMSE-----------------------\")\n",
701 | "print(\"KNN Basic \",df_knn[df_knn.Iu < 18].sqr_err.mean()** .5)\n",
702 | "print(\"SVD \", df_svd[df_svd.Iu < 18].sqr_err.mean()** .5)\n",
703 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu < 18].sqr_err.mean()** .5)\n",
704 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu < 18].sqr_err.mean()** .5)\n",
705 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu < 18].sqr_err.mean()** .5 )\n",
706 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu < 18].sqr_err.mean()** .5)"
707 | ],
708 | "execution_count": null,
709 | "outputs": [
710 | {
711 | "output_type": "stream",
712 | "text": [
713 | "--------------------------RMSE-----------------------\n",
714 | "KNN Basic 1.1998253947989697\n",
715 | "SVD 1.0549483774463828\n",
716 | "SVDpp 1.0083634724152428\n",
717 | "KNN Baseline (item-item) 0.9896562169806813\n",
718 | "BaselineOnly 1.0612306019619604\n",
719 | "KNN Baseline (user-user) 1.1082756354422056\n"
720 | ],
721 | "name": "stdout"
722 | }
723 | ]
724 | },
725 | {
726 | "cell_type": "code",
727 | "metadata": {
728 | "id": "xnzmsButgiyw",
729 | "colab_type": "code",
730 | "colab": {
731 | "base_uri": "https://localhost:8080/",
732 | "height": 134
733 | },
734 | "outputId": "f326c366-38ea-45b8-9133-cf22dcb72358"
735 | },
736 | "source": [
737 | "print(\"--------------------------MAE-----------------------\")\n",
738 | "print(\"KNN Basic \",df_knn[df_knn.Iu > 1000].err.mean())\n",
739 | "print(\"SVD \", df_svd[df_svd.Iu > 1000].err.mean())\n",
740 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu > 1000].err.mean())\n",
741 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu > 1000].err.mean())\n",
742 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu > 1000].err.mean() )\n",
743 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu > 1000].err.mean() )"
744 | ],
745 | "execution_count": null,
746 | "outputs": [
747 | {
748 | "output_type": "stream",
749 | "text": [
750 | "--------------------------MAE-----------------------\n",
751 | "KNN Basic 0.7118277630004157\n",
752 | "SVD 0.6349197611192368\n",
753 | "SVDpp 0.626063757313411\n",
754 | "KNN Baseline (item-item) 0.6120430789383057\n",
755 | "BaselineOnly 0.6306031032475772\n",
756 | "KNN Baseline (user-user) 0.6330297364319998\n"
757 | ],
758 | "name": "stdout"
759 | }
760 | ]
761 | },
762 | {
763 | "cell_type": "code",
764 | "metadata": {
765 | "id": "K1CnsM3mg0wi",
766 | "colab_type": "code",
767 | "colab": {
768 | "base_uri": "https://localhost:8080/",
769 | "height": 134
770 | },
771 | "outputId": "c79eaa77-b9a5-4d90-d470-9539c3af5858"
772 | },
773 | "source": [
774 | "print(\"--------------------------RMSE-----------------------\")\n",
775 | "print(\"KNN Basic \",df_knn[df_knn.Iu > 1000].sqr_err.mean()** .5)\n",
776 | "print(\"SVD \", df_svd[df_svd.Iu > 1000].sqr_err.mean()** .5)\n",
777 | "print(\"SVDpp \", df_svdpp[df_svdpp.Iu > 1000].sqr_err.mean()** .5)\n",
778 | "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu > 1000].sqr_err.mean()** .5)\n",
779 | "print(\"BaselineOnly \",df_baselineonly[df_baselineonly.Iu > 1000].sqr_err.mean()** .5 )\n",
780 | "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu > 1000].sqr_err.mean()** .5)"
781 | ],
782 | "execution_count": null,
783 | "outputs": [
784 | {
785 | "output_type": "stream",
786 | "text": [
787 | "--------------------------RMSE-----------------------\n",
788 | "KNN Basic 0.9174613388905646\n",
789 | "SVD 0.8207944406250214\n",
790 | "SVDpp 0.8136491891525117\n",
791 | "KNN Baseline (item-item) 0.789275629286978\n",
792 | "BaselineOnly 0.799990922710614\n",
793 | "KNN Baseline (user-user) 0.8198697577732832\n"
794 | ],
795 | "name": "stdout"
796 | }
797 | ]
798 | },
799 | {
800 | "cell_type": "code",
801 | "metadata": {
802 | "id": "7aWKXObN6uhT",
803 | "colab_type": "code",
804 | "colab": {
805 | "base_uri": "https://localhost:8080/",
806 | "height": 34
807 | },
808 | "outputId": "2a40f35d-3055-43aa-d508-be83732ad842"
809 | },
810 | "source": [
811 | "iid_df = traindf.groupby(['userId'],as_index=False).movieId.count()\n",
812 | "iid_df.movieId.max()"
813 | ],
814 | "execution_count": null,
815 | "outputs": [
816 | {
817 | "output_type": "execute_result",
818 | "data": {
819 | "text/plain": [
820 | "2158"
821 | ]
822 | },
823 | "metadata": {
824 | "tags": []
825 | },
826 | "execution_count": 47
827 | }
828 | ]
829 | }
830 | ]
831 | }
--------------------------------------------------------------------------------
/Code/movie_era_based_recs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.7.6"
21 | },
22 | "colab": {
23 | "name": "movie_era_based_recs.ipynb",
24 | "provenance": []
25 | }
26 | },
27 | "cells": [
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "4Cox5k2AMKsw",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "This notebook uses the content-based approach to include the time period in which the movie was launched. This method personalizes the user's recommendations to include this feature."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "bC1OAh-JMLd7",
42 | "colab_type": "code",
43 | "colab": {}
44 | },
45 | "source": [
46 | "import numpy as np\n",
47 | "import pandas as pd\n",
48 | "import matplotlib.pyplot as plt\n",
49 | "from ast import literal_eval\n",
50 | "import pdb"
51 | ],
52 | "execution_count": null,
53 | "outputs": []
54 | },
55 | {
56 | "cell_type": "code",
57 | "metadata": {
58 | "id": "wLkmHYnGLw5Z",
59 | "colab_type": "code",
60 | "colab": {}
61 | },
62 | "source": [
63 | "genre_user_vector = pd.read_csv(\"user_info.csv\")\n",
64 | "genre_user_vector = genre_user_vector[['userId', 'user_vector']]\n",
65 | "\n",
66 | "genre_user_vector['user_vector'] = genre_user_vector['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
67 | "genre_user_vector['user_vector'] = genre_user_vector['user_vector'].apply(lambda x: np.asarray(x).astype(float))"
68 | ],
69 | "execution_count": null,
70 | "outputs": []
71 | },
72 | {
73 | "cell_type": "code",
74 | "metadata": {
75 | "id": "m4m_b7JbLw5c",
76 | "colab_type": "code",
77 | "colab": {}
78 | },
79 | "source": [
80 | "era_user_vector = pd.read_csv(\"user_era_vector.csv\")\n",
81 | "era_user_vector = era_user_vector[['userId', 'user_era_vector']]\n",
82 | "\n",
83 | "era_user_vector['user_era_vector'] = era_user_vector['user_era_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
84 | "era_user_vector['user_era_vector'] = era_user_vector['user_era_vector'].apply(lambda x: np.asarray(x).astype(float))"
85 | ],
86 | "execution_count": null,
87 | "outputs": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "metadata": {
92 | "id": "mCq1wzsaLw5f",
93 | "colab_type": "code",
94 | "colab": {}
95 | },
96 | "source": [
97 | "merged_user = genre_user_vector.join(era_user_vector['user_era_vector'])"
98 | ],
99 | "execution_count": null,
100 | "outputs": []
101 | },
102 | {
103 | "cell_type": "code",
104 | "metadata": {
105 | "id": "Obc2U0jILw5h",
106 | "colab_type": "code",
107 | "colab": {},
108 | "outputId": "f2a77bef-df14-4981-acba-8bd0fe07b36d"
109 | },
110 | "source": [
111 | "merged_user.head()"
112 | ],
113 | "execution_count": null,
114 | "outputs": [
115 | {
116 | "output_type": "execute_result",
117 | "data": {
118 | "text/html": [
119 | "\n",
120 | "\n",
133 | "
\n",
134 | " \n",
135 | " \n",
136 | " | \n",
137 | " userId | \n",
138 | " user_vector | \n",
139 | " user_era_vector | \n",
140 | "
\n",
141 | " \n",
142 | " \n",
143 | " \n",
144 | " | 0 | \n",
145 | " 1 | \n",
146 | " [4.39189189, 4.65217391, 4.48571429, 4.2676056... | \n",
147 | " [4.63265306, 4.27272727, 4.6, 0.0] | \n",
148 | "
\n",
149 | " \n",
150 | " | 1 | \n",
151 | " 2 | \n",
152 | " [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666... | \n",
153 | " [0.0, 3.83333333, 4.05, 3.85] | \n",
154 | "
\n",
155 | " \n",
156 | " | 2 | \n",
157 | " 3 | \n",
158 | " [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333... | \n",
159 | " [2.45833333, 2.6875, 0.5, 0.0] | \n",
160 | "
\n",
161 | " \n",
162 | " | 3 | \n",
163 | " 4 | \n",
164 | " [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53... | \n",
165 | " [4.4375, 3.25663717, 3.32142857, 0.0] | \n",
166 | "
\n",
167 | " \n",
168 | " | 4 | \n",
169 | " 5 | \n",
170 | " [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ... | \n",
171 | " [5.0, 3.55882353, 0.0, 0.0] | \n",
172 | "
\n",
173 | " \n",
174 | "
\n",
175 | "
"
176 | ],
177 | "text/plain": [
178 | " userId user_vector \\\n",
179 | "0 1 [4.39189189, 4.65217391, 4.48571429, 4.2676056... \n",
180 | "1 2 [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666... \n",
181 | "2 3 [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333... \n",
182 | "3 4 [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53... \n",
183 | "4 5 [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ... \n",
184 | "\n",
185 | " user_era_vector \n",
186 | "0 [4.63265306, 4.27272727, 4.6, 0.0] \n",
187 | "1 [0.0, 3.83333333, 4.05, 3.85] \n",
188 | "2 [2.45833333, 2.6875, 0.5, 0.0] \n",
189 | "3 [4.4375, 3.25663717, 3.32142857, 0.0] \n",
190 | "4 [5.0, 3.55882353, 0.0, 0.0] "
191 | ]
192 | },
193 | "metadata": {
194 | "tags": []
195 | },
196 | "execution_count": 5
197 | }
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "metadata": {
203 | "scrolled": false,
204 | "id": "FqxlInOILw5k",
205 | "colab_type": "code",
206 | "colab": {}
207 | },
208 | "source": [
209 | "merged_user['final_user_vector'] = merged_user.apply(lambda x: np.concatenate((2*x['user_vector'], x['user_era_vector'])), axis=1)"
210 | ],
211 | "execution_count": null,
212 | "outputs": []
213 | },
214 | {
215 | "cell_type": "code",
216 | "metadata": {
217 | "id": "R-USwe0RLw5m",
218 | "colab_type": "code",
219 | "colab": {},
220 | "outputId": "df1b11eb-6da3-4070-fbf7-149e619605b7"
221 | },
222 | "source": [
223 | "merged_user.head()"
224 | ],
225 | "execution_count": null,
226 | "outputs": [
227 | {
228 | "output_type": "execute_result",
229 | "data": {
230 | "text/html": [
231 | "\n",
232 | "\n",
245 | "
\n",
246 | " \n",
247 | " \n",
248 | " | \n",
249 | " userId | \n",
250 | " user_vector | \n",
251 | " user_era_vector | \n",
252 | " final_user_vector | \n",
253 | "
\n",
254 | " \n",
255 | " \n",
256 | " \n",
257 | " | 0 | \n",
258 | " 1 | \n",
259 | " [4.39189189, 4.65217391, 4.48571429, 4.2676056... | \n",
260 | " [4.63265306, 4.27272727, 4.6, 0.0] | \n",
261 | " [8.78378378, 9.30434782, 8.97142858, 8.5352112... | \n",
262 | "
\n",
263 | " \n",
264 | " | 1 | \n",
265 | " 2 | \n",
266 | " [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666... | \n",
267 | " [0.0, 3.83333333, 4.05, 3.85] | \n",
268 | " [8.33333334, 0.0, 0.0, 8.4, 0.0, 9.0, 7.733333... | \n",
269 | "
\n",
270 | " \n",
271 | " | 2 | \n",
272 | " 3 | \n",
273 | " [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333... | \n",
274 | " [2.45833333, 2.6875, 0.5, 0.0] | \n",
275 | " [5.0, 1.0, 1.0, 1.0, 8.66666666, 1.0, 1.666666... | \n",
276 | "
\n",
277 | " \n",
278 | " | 3 | \n",
279 | " 4 | \n",
280 | " [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53... | \n",
281 | " [4.4375, 3.25663717, 3.32142857, 0.0] | \n",
282 | " [6.95652174, 8.0, 7.55555556, 6.87804878, 7.06... | \n",
283 | "
\n",
284 | " \n",
285 | " | 4 | \n",
286 | " 5 | \n",
287 | " [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ... | \n",
288 | " [5.0, 3.55882353, 0.0, 0.0] | \n",
289 | " [6.33333334, 8.5, 8.0, 6.72727272, 8.0, 6.2, 7... | \n",
290 | "
\n",
291 | " \n",
292 | "
\n",
293 | "
"
294 | ],
295 | "text/plain": [
296 | " userId user_vector \\\n",
297 | "0 1 [4.39189189, 4.65217391, 4.48571429, 4.2676056... \n",
298 | "1 2 [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666... \n",
299 | "2 3 [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333... \n",
300 | "3 4 [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53... \n",
301 | "4 5 [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ... \n",
302 | "\n",
303 | " user_era_vector \\\n",
304 | "0 [4.63265306, 4.27272727, 4.6, 0.0] \n",
305 | "1 [0.0, 3.83333333, 4.05, 3.85] \n",
306 | "2 [2.45833333, 2.6875, 0.5, 0.0] \n",
307 | "3 [4.4375, 3.25663717, 3.32142857, 0.0] \n",
308 | "4 [5.0, 3.55882353, 0.0, 0.0] \n",
309 | "\n",
310 | " final_user_vector \n",
311 | "0 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n",
312 | "1 [8.33333334, 0.0, 0.0, 8.4, 0.0, 9.0, 7.733333... \n",
313 | "2 [5.0, 1.0, 1.0, 1.0, 8.66666666, 1.0, 1.666666... \n",
314 | "3 [6.95652174, 8.0, 7.55555556, 6.87804878, 7.06... \n",
315 | "4 [6.33333334, 8.5, 8.0, 6.72727272, 8.0, 6.2, 7... "
316 | ]
317 | },
318 | "metadata": {
319 | "tags": []
320 | },
321 | "execution_count": 7
322 | }
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "metadata": {
328 | "id": "0-zBOAM8Lw5q",
329 | "colab_type": "code",
330 | "colab": {}
331 | },
332 | "source": [
333 | "movie_genre_vector = pd.read_csv(\"movie_vector.csv\")\n",
334 | "movie_genre_vector = movie_genre_vector[['movieId', 'movie_vector']]\n",
335 | "\n",
336 | "movie_genre_vector['movie_vector'] = movie_genre_vector['movie_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
337 | "movie_genre_vector['movie_vector'] = movie_genre_vector['movie_vector'].apply(lambda x: np.asarray(x).astype(float))"
338 | ],
339 | "execution_count": null,
340 | "outputs": []
341 | },
342 | {
343 | "cell_type": "code",
344 | "metadata": {
345 | "id": "4awx_m7pLw5s",
346 | "colab_type": "code",
347 | "colab": {}
348 | },
349 | "source": [
350 | "movie_era_vector = pd.read_csv(\"movie_era_vector.csv\")\n",
351 | "movie_era_vector = movie_era_vector[['movieId', 'era_vector']]\n",
352 | "\n",
353 | "movie_era_vector['era_vector'] = movie_era_vector['era_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
354 | "movie_era_vector['era_vector'] = movie_era_vector['era_vector'].apply(lambda x: np.asarray(x).astype(float))"
355 | ],
356 | "execution_count": null,
357 | "outputs": []
358 | },
359 | {
360 | "cell_type": "code",
361 | "metadata": {
362 | "id": "v92UsULNLw5u",
363 | "colab_type": "code",
364 | "colab": {}
365 | },
366 | "source": [
367 | "merged_movie = movie_genre_vector.join(movie_era_vector['era_vector'])\n",
368 | "merged_movie['final_movie_vector'] = merged_movie.apply(lambda x: np.concatenate((2*x['movie_vector'], x['era_vector'])), axis=1)"
369 | ],
370 | "execution_count": null,
371 | "outputs": []
372 | },
373 | {
374 | "cell_type": "code",
375 | "metadata": {
376 | "id": "jHxejQEiLw5w",
377 | "colab_type": "code",
378 | "colab": {},
379 | "outputId": "87e2afb0-4ac5-4411-de9e-23d316b3b758"
380 | },
381 | "source": [
382 | "merged_movie.head()"
383 | ],
384 | "execution_count": null,
385 | "outputs": [
386 | {
387 | "output_type": "execute_result",
388 | "data": {
389 | "text/html": [
390 | "\n",
391 | "\n",
404 | "
\n",
405 | " \n",
406 | " \n",
407 | " | \n",
408 | " movieId | \n",
409 | " movie_vector | \n",
410 | " era_vector | \n",
411 | " final_movie_vector | \n",
412 | "
\n",
413 | " \n",
414 | " \n",
415 | " \n",
416 | " | 0 | \n",
417 | " 1 | \n",
418 | " [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
419 | " [0.0, 1.0, 0.0, 0.0] | \n",
420 | " [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
421 | "
\n",
422 | " \n",
423 | " | 1 | \n",
424 | " 2 | \n",
425 | " [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
426 | " [0.0, 1.0, 0.0, 0.0] | \n",
427 | " [2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
428 | "
\n",
429 | " \n",
430 | " | 2 | \n",
431 | " 3 | \n",
432 | " [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... | \n",
433 | " [0.0, 1.0, 0.0, 0.0] | \n",
434 | " [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ... | \n",
435 | "
\n",
436 | " \n",
437 | " | 3 | \n",
438 | " 4 | \n",
439 | " [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ... | \n",
440 | " [0.0, 1.0, 0.0, 0.0] | \n",
441 | " [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, ... | \n",
442 | "
\n",
443 | " \n",
444 | " | 4 | \n",
445 | " 5 | \n",
446 | " [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
447 | " [0.0, 1.0, 0.0, 0.0] | \n",
448 | " [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
449 | "
\n",
450 | " \n",
451 | "
\n",
452 | "
"
453 | ],
454 | "text/plain": [
455 | " movieId movie_vector \\\n",
456 | "0 1 [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... \n",
457 | "1 2 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... \n",
458 | "2 3 [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... \n",
459 | "3 4 [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ... \n",
460 | "4 5 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... \n",
461 | "\n",
462 | " era_vector final_movie_vector \n",
463 | "0 [0.0, 1.0, 0.0, 0.0] [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, ... \n",
464 | "1 [0.0, 1.0, 0.0, 0.0] [2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ... \n",
465 | "2 [0.0, 1.0, 0.0, 0.0] [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ... \n",
466 | "3 [0.0, 1.0, 0.0, 0.0] [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, ... \n",
467 | "4 [0.0, 1.0, 0.0, 0.0] [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... "
468 | ]
469 | },
470 | "metadata": {
471 | "tags": []
472 | },
473 | "execution_count": 11
474 | }
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {
480 | "id": "KF3gzUpwLw5y",
481 | "colab_type": "text"
482 | },
483 | "source": [
484 | "## Test"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "metadata": {
490 | "id": "NlP1KEMgLw5z",
491 | "colab_type": "code",
492 | "colab": {},
493 | "outputId": "96cb9714-bb7e-49c5-84e9-48a99eb3a0f8"
494 | },
495 | "source": [
496 | "ratings_test = pd.read_csv(\"testing_data.csv\", converters={\"genres\": literal_eval, \"tag\": literal_eval}) \n",
497 | "ratings_test.head()"
498 | ],
499 | "execution_count": null,
500 | "outputs": [
501 | {
502 | "output_type": "execute_result",
503 | "data": {
504 | "text/html": [
505 | "\n",
506 | "\n",
519 | "
\n",
520 | " \n",
521 | " \n",
522 | " | \n",
523 | " userId | \n",
524 | " movieId | \n",
525 | " rating | \n",
526 | " timestamp | \n",
527 | " genres | \n",
528 | " tag | \n",
529 | "
\n",
530 | " \n",
531 | " \n",
532 | " \n",
533 | " | 0 | \n",
534 | " 1 | \n",
535 | " 3 | \n",
536 | " 4.0 | \n",
537 | " 964981247 | \n",
538 | " [Comedy, Romance] | \n",
539 | " [] | \n",
540 | "
\n",
541 | " \n",
542 | " | 1 | \n",
543 | " 1 | \n",
544 | " 163 | \n",
545 | " 5.0 | \n",
546 | " 964983650 | \n",
547 | " [Action, Romance, Western] | \n",
548 | " [] | \n",
549 | "
\n",
550 | " \n",
551 | " | 2 | \n",
552 | " 1 | \n",
553 | " 316 | \n",
554 | " 3.0 | \n",
555 | " 964982310 | \n",
556 | " [Action, Adventure, Sci-Fi] | \n",
557 | " [] | \n",
558 | "
\n",
559 | " \n",
560 | " | 3 | \n",
561 | " 1 | \n",
562 | " 349 | \n",
563 | " 4.0 | \n",
564 | " 964982563 | \n",
565 | " [Action, Crime, Drama, Thriller] | \n",
566 | " [] | \n",
567 | "
\n",
568 | " \n",
569 | " | 4 | \n",
570 | " 1 | \n",
571 | " 441 | \n",
572 | " 4.0 | \n",
573 | " 964980868 | \n",
574 | " [Comedy] | \n",
575 | " [] | \n",
576 | "
\n",
577 | " \n",
578 | "
\n",
579 | "
"
580 | ],
581 | "text/plain": [
582 | " userId movieId rating timestamp genres tag\n",
583 | "0 1 3 4.0 964981247 [Comedy, Romance] []\n",
584 | "1 1 163 5.0 964983650 [Action, Romance, Western] []\n",
585 | "2 1 316 3.0 964982310 [Action, Adventure, Sci-Fi] []\n",
586 | "3 1 349 4.0 964982563 [Action, Crime, Drama, Thriller] []\n",
587 | "4 1 441 4.0 964980868 [Comedy] []"
588 | ]
589 | },
590 | "metadata": {
591 | "tags": []
592 | },
593 | "execution_count": 12
594 | }
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "metadata": {
600 | "id": "3X3-GJDNLw51",
601 | "colab_type": "code",
602 | "colab": {}
603 | },
604 | "source": [
605 | "ratings_test = pd.read_csv(\"testing_data.csv\", converters={\"genres\": literal_eval, \"tag\": literal_eval}) \n",
606 | "ratings_test.head()\n",
607 | "\n",
608 | "algo_predictions = pd.DataFrame(columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])\n",
609 | "error_count = 0\n",
610 | "for ind, row in ratings_test.iterrows():\n",
611 | " userId = row['userId']\n",
612 | " movieId = row['movieId']\n",
613 | " og_rating = row['rating']\n",
614 | " \n",
615 | " user_vector = merged_user[merged_user['userId'] == int(userId)].final_user_vector.values[0]\n",
616 | " if len(merged_movie[merged_movie['movieId'] == int(movieId)].final_movie_vector.values):\n",
617 | " movie_vector = merged_movie[merged_movie['movieId'] == int(movieId)].final_movie_vector.values[0]\n",
618 | " else:\n",
619 | " error_count += 1\n",
620 | " print(\"Movie vector not found!\", movieId)\n",
621 | " predicted_rating = user_vector*movie_vector\n",
622 | "\n",
623 | " if predicted_rating.any():\n",
624 | " predicted_rating = np.nanmean(np.where(predicted_rating!=0, predicted_rating, np.nan))\n",
625 | " else:\n",
626 | " predicted_rating = 0\n",
627 | "\n",
628 | " row_df = pd.DataFrame([[userId, movieId, user_vector, movie_vector, og_rating, predicted_rating]], \n",
629 | " columns=['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating'])\n",
630 | " algo_predictions = pd.concat([algo_predictions, row_df], ignore_index=True)"
631 | ],
632 | "execution_count": null,
633 | "outputs": []
634 | },
635 | {
636 | "cell_type": "code",
637 | "metadata": {
638 | "id": "spMCRnulLw53",
639 | "colab_type": "code",
640 | "colab": {},
641 | "outputId": "89e8b4bf-20e2-4cdf-d227-6bfc6fc71934"
642 | },
643 | "source": [
644 | "algo_predictions"
645 | ],
646 | "execution_count": null,
647 | "outputs": [
648 | {
649 | "output_type": "execute_result",
650 | "data": {
651 | "text/html": [
652 | "\n",
653 | "\n",
666 | "
\n",
667 | " \n",
668 | " \n",
669 | " | \n",
670 | " userId | \n",
671 | " movieId | \n",
672 | " user_vector | \n",
673 | " movie_vector | \n",
674 | " og_rating | \n",
675 | " pred_rating | \n",
676 | "
\n",
677 | " \n",
678 | " \n",
679 | " \n",
680 | " | 0 | \n",
681 | " 1 | \n",
682 | " 3 | \n",
683 | " [8.78378378, 9.30434782, 8.97142858, 8.5352112... | \n",
684 | " [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ... | \n",
685 | " 4.0 | \n",
686 | " 12.892161 | \n",
687 | "
\n",
688 | " \n",
689 | " | 1 | \n",
690 | " 1 | \n",
691 | " 163 | \n",
692 | " [8.78378378, 9.30434782, 8.97142858, 8.5352112... | \n",
693 | " [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, ... | \n",
694 | " 5.0 | \n",
695 | " 14.134848 | \n",
696 | "
\n",
697 | " \n",
698 | " | 2 | \n",
699 | " 1 | \n",
700 | " 316 | \n",
701 | " [8.78378378, 9.30434782, 8.97142858, 8.5352112... | \n",
702 | " [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... | \n",
703 | " 3.0 | \n",
704 | " 13.986955 | \n",
705 | "
\n",
706 | " \n",
707 | " | 3 | \n",
708 | " 1 | \n",
709 | " 349 | \n",
710 | " [8.78378378, 9.30434782, 8.97142858, 8.5352112... | \n",
711 | " [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ... | \n",
712 | " 4.0 | \n",
713 | " 14.707133 | \n",
714 | "
\n",
715 | " \n",
716 | " | 4 | \n",
717 | " 1 | \n",
718 | " 441 | \n",
719 | " [8.78378378, 9.30434782, 8.97142858, 8.5352112... | \n",
720 | " [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
721 | " 4.0 | \n",
722 | " 10.671575 | \n",
723 | "
\n",
724 | " \n",
725 | " | ... | \n",
726 | " ... | \n",
727 | " ... | \n",
728 | " ... | \n",
729 | " ... | \n",
730 | " ... | \n",
731 | " ... | \n",
732 | "
\n",
733 | " \n",
734 | " | 20163 | \n",
735 | " 610 | \n",
736 | " 156726 | \n",
737 | " [7.38967136, 7.8490566, 7.34210526, 7.44984802... | \n",
738 | " [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
739 | " 4.5 | \n",
740 | " 8.852847 | \n",
741 | "
\n",
742 | " \n",
743 | " | 20164 | \n",
744 | " 610 | \n",
745 | " 159093 | \n",
746 | " [7.38967136, 7.8490566, 7.34210526, 7.44984802... | \n",
747 | " [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... | \n",
748 | " 3.0 | \n",
749 | " 11.726114 | \n",
750 | "
\n",
751 | " \n",
752 | " | 20165 | \n",
753 | " 610 | \n",
754 | " 161582 | \n",
755 | " [7.38967136, 7.8490566, 7.34210526, 7.44984802... | \n",
756 | " [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ... | \n",
757 | " 4.0 | \n",
758 | " 11.335023 | \n",
759 | "
\n",
760 | " \n",
761 | " | 20166 | \n",
762 | " 610 | \n",
763 | " 162350 | \n",
764 | " [7.38967136, 7.8490566, 7.34210526, 7.44984802... | \n",
765 | " [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... | \n",
766 | " 3.5 | \n",
767 | " 11.036977 | \n",
768 | "
\n",
769 | " \n",
770 | " | 20167 | \n",
771 | " 610 | \n",
772 | " 163981 | \n",
773 | " [7.38967136, 7.8490566, 7.34210526, 7.44984802... | \n",
774 | " [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n",
775 | " 3.5 | \n",
776 | " 8.701610 | \n",
777 | "
\n",
778 | " \n",
779 | "
\n",
780 | "
20168 rows × 6 columns
\n",
781 | "
"
782 | ],
783 | "text/plain": [
784 | " userId movieId user_vector \\\n",
785 | "0 1 3 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n",
786 | "1 1 163 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n",
787 | "2 1 316 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n",
788 | "3 1 349 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n",
789 | "4 1 441 [8.78378378, 9.30434782, 8.97142858, 8.5352112... \n",
790 | "... ... ... ... \n",
791 | "20163 610 156726 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n",
792 | "20164 610 159093 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n",
793 | "20165 610 161582 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n",
794 | "20166 610 162350 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n",
795 | "20167 610 163981 [7.38967136, 7.8490566, 7.34210526, 7.44984802... \n",
796 | "\n",
797 | " movie_vector og_rating \\\n",
798 | "0 [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ... 4.0 \n",
799 | "1 [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, ... 5.0 \n",
800 | "2 [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... 3.0 \n",
801 | "3 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ... 4.0 \n",
802 | "4 [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 4.0 \n",
803 | "... ... ... \n",
804 | "20163 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 4.5 \n",
805 | "20164 [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... 3.0 \n",
806 | "20165 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ... 4.0 \n",
807 | "20166 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ... 3.5 \n",
808 | "20167 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 3.5 \n",
809 | "\n",
810 | " pred_rating \n",
811 | "0 12.892161 \n",
812 | "1 14.134848 \n",
813 | "2 13.986955 \n",
814 | "3 14.707133 \n",
815 | "4 10.671575 \n",
816 | "... ... \n",
817 | "20163 8.852847 \n",
818 | "20164 11.726114 \n",
819 | "20165 11.335023 \n",
820 | "20166 11.036977 \n",
821 | "20167 8.701610 \n",
822 | "\n",
823 | "[20168 rows x 6 columns]"
824 | ]
825 | },
826 | "metadata": {
827 | "tags": []
828 | },
829 | "execution_count": 14
830 | }
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "metadata": {
836 | "id": "wzJucPlCLw55",
837 | "colab_type": "code",
838 | "colab": {}
839 | },
840 | "source": [
841 | "# algo_predictions.to_csv(\"genre_era_predictions.csv\")"
842 | ],
843 | "execution_count": null,
844 | "outputs": []
845 | },
846 | {
847 | "cell_type": "code",
848 | "metadata": {
849 | "scrolled": true,
850 | "id": "xnnjyw58Lw57",
851 | "colab_type": "code",
852 | "colab": {},
853 | "outputId": "67264ae5-6d78-4260-cef3-992bcdb1eed7"
854 | },
855 | "source": [
856 | "rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating/3) ** 2).mean() ** .5\n",
857 | "rmse"
858 | ],
859 | "execution_count": null,
860 | "outputs": [
861 | {
862 | "output_type": "execute_result",
863 | "data": {
864 | "text/plain": [
865 | "0.9898749125266205"
866 | ]
867 | },
868 | "metadata": {
869 | "tags": []
870 | },
871 | "execution_count": 16
872 | }
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "metadata": {
878 | "id": "wDfACdMsLw59",
879 | "colab_type": "code",
880 | "colab": {},
881 | "outputId": "b46389e7-6071-4f44-be9b-3731cf8fa46a"
882 | },
883 | "source": [
884 | "mae = (((algo_predictions.og_rating - algo_predictions.pred_rating/3) ** 2) ** .5).mean()\n",
885 | "mae"
886 | ],
887 | "execution_count": null,
888 | "outputs": [
889 | {
890 | "output_type": "execute_result",
891 | "data": {
892 | "text/plain": [
893 | "0.7651172008808977"
894 | ]
895 | },
896 | "metadata": {
897 | "tags": []
898 | },
899 | "execution_count": 17
900 | }
901 | ]
902 | },
903 | {
904 | "cell_type": "code",
905 | "metadata": {
906 | "id": "KQj2WqJZLw6A",
907 | "colab_type": "code",
908 | "colab": {}
909 | },
910 | "source": [
911 | ""
912 | ],
913 | "execution_count": null,
914 | "outputs": []
915 | }
916 | ]
917 | }
--------------------------------------------------------------------------------
/Code/preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.7.6"
21 | },
22 | "colab": {
23 | "name": "preprocessing.ipynb",
24 | "provenance": [],
25 | "collapsed_sections": []
26 | }
27 | },
28 | "cells": [
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "slsyFHKMOwm9",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 | "This notebook splits the ratings data into training and testing sets, stratified by user, so that roughly 80% of each user's ratings are used for training and the remaining 20% for testing."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "metadata": {
42 | "id": "fauYg6bNOu24",
43 | "colab_type": "code",
44 | "colab": {}
45 | },
46 | "source": [
47 | "import pandas as pd\n",
48 | "import numpy as np"
49 | ],
50 | "execution_count": null,
51 | "outputs": []
52 | },
53 | {
54 | "cell_type": "code",
55 | "metadata": {
56 | "id": "WdMSaP4bOu26",
57 | "colab_type": "code",
58 | "colab": {},
59 | "outputId": "060f423a-f966-4a2b-c38d-55cf40e5c075"
60 | },
61 | "source": [
62 | "movies = pd.read_csv('ml-latest-small/movies.csv')\n",
63 | "ratings = pd.read_csv('ml-latest-small/ratings.csv')\n",
64 | "tags = pd.read_csv('ml-latest-small/tags.csv')\n",
65 | "print('movies: ', movies.shape)\n",
66 | "print('ratings: ', ratings.shape)\n",
67 | "print('tags: ', tags.shape)"
68 | ],
69 | "execution_count": null,
70 | "outputs": [
71 | {
72 | "output_type": "stream",
73 | "text": [
74 | "movies: (9742, 3)\n",
75 | "ratings: (100836, 4)\n",
76 | "tags: (3683, 4)\n"
77 | ],
78 | "name": "stdout"
79 | }
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "metadata": {
85 | "scrolled": false,
86 | "id": "pTK-7Pn_Ou29",
87 | "colab_type": "code",
88 | "colab": {},
89 | "outputId": "19cee6e2-8455-4a4f-f413-3a10fd4b2a9e"
90 | },
91 | "source": [
92 | "movies.head(5)"
93 | ],
94 | "execution_count": null,
95 | "outputs": [
96 | {
97 | "output_type": "execute_result",
98 | "data": {
99 | "text/html": [
100 | "\n",
101 | "\n",
114 | "
\n",
115 | " \n",
116 | " \n",
117 | " | \n",
118 | " movieId | \n",
119 | " title | \n",
120 | " genres | \n",
121 | "
\n",
122 | " \n",
123 | " \n",
124 | " \n",
125 | " | 0 | \n",
126 | " 1 | \n",
127 | " Toy Story (1995) | \n",
128 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
129 | "
\n",
130 | " \n",
131 | " | 1 | \n",
132 | " 2 | \n",
133 | " Jumanji (1995) | \n",
134 | " Adventure|Children|Fantasy | \n",
135 | "
\n",
136 | " \n",
137 | " | 2 | \n",
138 | " 3 | \n",
139 | " Grumpier Old Men (1995) | \n",
140 | " Comedy|Romance | \n",
141 | "
\n",
142 | " \n",
143 | " | 3 | \n",
144 | " 4 | \n",
145 | " Waiting to Exhale (1995) | \n",
146 | " Comedy|Drama|Romance | \n",
147 | "
\n",
148 | " \n",
149 | " | 4 | \n",
150 | " 5 | \n",
151 | " Father of the Bride Part II (1995) | \n",
152 | " Comedy | \n",
153 | "
\n",
154 | " \n",
155 | "
\n",
156 | "
"
157 | ],
158 | "text/plain": [
159 | " movieId title \\\n",
160 | "0 1 Toy Story (1995) \n",
161 | "1 2 Jumanji (1995) \n",
162 | "2 3 Grumpier Old Men (1995) \n",
163 | "3 4 Waiting to Exhale (1995) \n",
164 | "4 5 Father of the Bride Part II (1995) \n",
165 | "\n",
166 | " genres \n",
167 | "0 Adventure|Animation|Children|Comedy|Fantasy \n",
168 | "1 Adventure|Children|Fantasy \n",
169 | "2 Comedy|Romance \n",
170 | "3 Comedy|Drama|Romance \n",
171 | "4 Comedy "
172 | ]
173 | },
174 | "metadata": {
175 | "tags": []
176 | },
177 | "execution_count": 4
178 | }
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "metadata": {
184 | "id": "9szjN9tSOu3A",
185 | "colab_type": "code",
186 | "colab": {},
187 | "outputId": "9cded57b-9cf5-4c5f-eaeb-619018a65bcf"
188 | },
189 | "source": [
190 | "ratings.head(5)"
191 | ],
192 | "execution_count": null,
193 | "outputs": [
194 | {
195 | "output_type": "execute_result",
196 | "data": {
197 | "text/html": [
198 | "\n",
199 | "\n",
212 | "
\n",
213 | " \n",
214 | " \n",
215 | " | \n",
216 | " userId | \n",
217 | " movieId | \n",
218 | " rating | \n",
219 | " timestamp | \n",
220 | "
\n",
221 | " \n",
222 | " \n",
223 | " \n",
224 | " | 0 | \n",
225 | " 1 | \n",
226 | " 1 | \n",
227 | " 4.0 | \n",
228 | " 964982703 | \n",
229 | "
\n",
230 | " \n",
231 | " | 1 | \n",
232 | " 1 | \n",
233 | " 3 | \n",
234 | " 4.0 | \n",
235 | " 964981247 | \n",
236 | "
\n",
237 | " \n",
238 | " | 2 | \n",
239 | " 1 | \n",
240 | " 6 | \n",
241 | " 4.0 | \n",
242 | " 964982224 | \n",
243 | "
\n",
244 | " \n",
245 | " | 3 | \n",
246 | " 1 | \n",
247 | " 47 | \n",
248 | " 5.0 | \n",
249 | " 964983815 | \n",
250 | "
\n",
251 | " \n",
252 | " | 4 | \n",
253 | " 1 | \n",
254 | " 50 | \n",
255 | " 5.0 | \n",
256 | " 964982931 | \n",
257 | "
\n",
258 | " \n",
259 | "
\n",
260 | "
"
261 | ],
262 | "text/plain": [
263 | " userId movieId rating timestamp\n",
264 | "0 1 1 4.0 964982703\n",
265 | "1 1 3 4.0 964981247\n",
266 | "2 1 6 4.0 964982224\n",
267 | "3 1 47 5.0 964983815\n",
268 | "4 1 50 5.0 964982931"
269 | ]
270 | },
271 | "metadata": {
272 | "tags": []
273 | },
274 | "execution_count": 5
275 | }
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "metadata": {
281 | "id": "o6mexZ2NOu3C",
282 | "colab_type": "code",
283 | "colab": {},
284 | "outputId": "7f6eebb9-213f-4642-cabf-64780a92d5aa"
285 | },
286 | "source": [
287 | "tags.head(5)"
288 | ],
289 | "execution_count": null,
290 | "outputs": [
291 | {
292 | "output_type": "execute_result",
293 | "data": {
294 | "text/html": [
295 | "\n",
296 | "\n",
309 | "
\n",
310 | " \n",
311 | " \n",
312 | " | \n",
313 | " userId | \n",
314 | " movieId | \n",
315 | " tag | \n",
316 | " timestamp | \n",
317 | "
\n",
318 | " \n",
319 | " \n",
320 | " \n",
321 | " | 0 | \n",
322 | " 2 | \n",
323 | " 60756 | \n",
324 | " funny | \n",
325 | " 1445714994 | \n",
326 | "
\n",
327 | " \n",
328 | " | 1 | \n",
329 | " 2 | \n",
330 | " 60756 | \n",
331 | " Highly quotable | \n",
332 | " 1445714996 | \n",
333 | "
\n",
334 | " \n",
335 | " | 2 | \n",
336 | " 2 | \n",
337 | " 60756 | \n",
338 | " will ferrell | \n",
339 | " 1445714992 | \n",
340 | "
\n",
341 | " \n",
342 | " | 3 | \n",
343 | " 2 | \n",
344 | " 89774 | \n",
345 | " Boxing story | \n",
346 | " 1445715207 | \n",
347 | "
\n",
348 | " \n",
349 | " | 4 | \n",
350 | " 2 | \n",
351 | " 89774 | \n",
352 | " MMA | \n",
353 | " 1445715200 | \n",
354 | "
\n",
355 | " \n",
356 | "
\n",
357 | "
"
358 | ],
359 | "text/plain": [
360 | " userId movieId tag timestamp\n",
361 | "0 2 60756 funny 1445714994\n",
362 | "1 2 60756 Highly quotable 1445714996\n",
363 | "2 2 60756 will ferrell 1445714992\n",
364 | "3 2 89774 Boxing story 1445715207\n",
365 | "4 2 89774 MMA 1445715200"
366 | ]
367 | },
368 | "metadata": {
369 | "tags": []
370 | },
371 | "execution_count": 6
372 | }
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "metadata": {
378 | "id": "LIjAsKI8Ou3F",
379 | "colab_type": "code",
380 | "colab": {},
381 | "outputId": "54655a32-c28b-4134-c0e4-3c19fa3b3c61"
382 | },
383 | "source": [
384 | "df = pd.merge(ratings, movies, on='movieId' , how='left')\n",
385 | "df = df.drop('title', axis=1)\n",
386 | "df.head(5)"
387 | ],
388 | "execution_count": null,
389 | "outputs": [
390 | {
391 | "output_type": "execute_result",
392 | "data": {
393 | "text/html": [
394 | "\n",
395 | "\n",
408 | "
\n",
409 | " \n",
410 | " \n",
411 | " | \n",
412 | " userId | \n",
413 | " movieId | \n",
414 | " rating | \n",
415 | " timestamp | \n",
416 | " genres | \n",
417 | "
\n",
418 | " \n",
419 | " \n",
420 | " \n",
421 | " | 0 | \n",
422 | " 1 | \n",
423 | " 1 | \n",
424 | " 4.0 | \n",
425 | " 964982703 | \n",
426 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
427 | "
\n",
428 | " \n",
429 | " | 1 | \n",
430 | " 1 | \n",
431 | " 3 | \n",
432 | " 4.0 | \n",
433 | " 964981247 | \n",
434 | " Comedy|Romance | \n",
435 | "
\n",
436 | " \n",
437 | " | 2 | \n",
438 | " 1 | \n",
439 | " 6 | \n",
440 | " 4.0 | \n",
441 | " 964982224 | \n",
442 | " Action|Crime|Thriller | \n",
443 | "
\n",
444 | " \n",
445 | " | 3 | \n",
446 | " 1 | \n",
447 | " 47 | \n",
448 | " 5.0 | \n",
449 | " 964983815 | \n",
450 | " Mystery|Thriller | \n",
451 | "
\n",
452 | " \n",
453 | " | 4 | \n",
454 | " 1 | \n",
455 | " 50 | \n",
456 | " 5.0 | \n",
457 | " 964982931 | \n",
458 | " Crime|Mystery|Thriller | \n",
459 | "
\n",
460 | " \n",
461 | "
\n",
462 | "
"
463 | ],
464 | "text/plain": [
465 | " userId movieId rating timestamp \\\n",
466 | "0 1 1 4.0 964982703 \n",
467 | "1 1 3 4.0 964981247 \n",
468 | "2 1 6 4.0 964982224 \n",
469 | "3 1 47 5.0 964983815 \n",
470 | "4 1 50 5.0 964982931 \n",
471 | "\n",
472 | " genres \n",
473 | "0 Adventure|Animation|Children|Comedy|Fantasy \n",
474 | "1 Comedy|Romance \n",
475 | "2 Action|Crime|Thriller \n",
476 | "3 Mystery|Thriller \n",
477 | "4 Crime|Mystery|Thriller "
478 | ]
479 | },
480 | "metadata": {
481 | "tags": []
482 | },
483 | "execution_count": 7
484 | }
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "metadata": {
490 | "id": "ymbJzpjJOu3H",
491 | "colab_type": "code",
492 | "colab": {}
493 | },
494 | "source": [
495 | "df['genres'] = df['genres'].str.split('|')"
496 | ],
497 | "execution_count": null,
498 | "outputs": []
499 | },
500 | {
501 | "cell_type": "code",
502 | "metadata": {
503 | "id": "MiUn_Qp0Ou3J",
504 | "colab_type": "code",
505 | "colab": {},
506 | "outputId": "879c002c-96f8-4f77-c957-1653c0b06b2f"
507 | },
508 | "source": [
509 | "df.head(5)"
510 | ],
511 | "execution_count": null,
512 | "outputs": [
513 | {
514 | "output_type": "execute_result",
515 | "data": {
516 | "text/html": [
517 | "\n",
518 | "\n",
531 | "
\n",
532 | " \n",
533 | " \n",
534 | " | \n",
535 | " userId | \n",
536 | " movieId | \n",
537 | " rating | \n",
538 | " timestamp | \n",
539 | " genres | \n",
540 | "
\n",
541 | " \n",
542 | " \n",
543 | " \n",
544 | " | 0 | \n",
545 | " 1 | \n",
546 | " 1 | \n",
547 | " 4.0 | \n",
548 | " 964982703 | \n",
549 | " [Adventure, Animation, Children, Comedy, Fantasy] | \n",
550 | "
\n",
551 | " \n",
552 | " | 1 | \n",
553 | " 1 | \n",
554 | " 3 | \n",
555 | " 4.0 | \n",
556 | " 964981247 | \n",
557 | " [Comedy, Romance] | \n",
558 | "
\n",
559 | " \n",
560 | " | 2 | \n",
561 | " 1 | \n",
562 | " 6 | \n",
563 | " 4.0 | \n",
564 | " 964982224 | \n",
565 | " [Action, Crime, Thriller] | \n",
566 | "
\n",
567 | " \n",
568 | " | 3 | \n",
569 | " 1 | \n",
570 | " 47 | \n",
571 | " 5.0 | \n",
572 | " 964983815 | \n",
573 | " [Mystery, Thriller] | \n",
574 | "
\n",
575 | " \n",
576 | " | 4 | \n",
577 | " 1 | \n",
578 | " 50 | \n",
579 | " 5.0 | \n",
580 | " 964982931 | \n",
581 | " [Crime, Mystery, Thriller] | \n",
582 | "
\n",
583 | " \n",
584 | "
\n",
585 | "
"
586 | ],
587 | "text/plain": [
588 | " userId movieId rating timestamp \\\n",
589 | "0 1 1 4.0 964982703 \n",
590 | "1 1 3 4.0 964981247 \n",
591 | "2 1 6 4.0 964982224 \n",
592 | "3 1 47 5.0 964983815 \n",
593 | "4 1 50 5.0 964982931 \n",
594 | "\n",
595 | " genres \n",
596 | "0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
597 | "1 [Comedy, Romance] \n",
598 | "2 [Action, Crime, Thriller] \n",
599 | "3 [Mystery, Thriller] \n",
600 | "4 [Crime, Mystery, Thriller] "
601 | ]
602 | },
603 | "metadata": {
604 | "tags": []
605 | },
606 | "execution_count": 9
607 | }
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "metadata": {
613 | "id": "B3xDyEX_Ou3L",
614 | "colab_type": "code",
615 | "colab": {}
616 | },
617 | "source": [
618 | "tags['tag'] = tags['tag'].str.split('|')\n",
619 | "tags.drop('timestamp', axis=1, inplace=True)"
620 | ],
621 | "execution_count": null,
622 | "outputs": []
623 | },
624 | {
625 | "cell_type": "code",
626 | "metadata": {
627 | "id": "4OEft6CkOu3O",
628 | "colab_type": "code",
629 | "colab": {},
630 | "outputId": "7f500eee-1b28-4735-a48f-900f6cd3be1b"
631 | },
632 | "source": [
633 | "tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()\n",
634 | "tags.head(5)"
635 | ],
636 | "execution_count": null,
637 | "outputs": [
638 | {
639 | "output_type": "execute_result",
640 | "data": {
641 | "text/html": [
642 | "\n",
643 | "\n",
656 | "
\n",
657 | " \n",
658 | " \n",
659 | " | \n",
660 | " userId | \n",
661 | " movieId | \n",
662 | " tag | \n",
663 | "
\n",
664 | " \n",
665 | " \n",
666 | " \n",
667 | " | 0 | \n",
668 | " 2 | \n",
669 | " 60756 | \n",
670 | " ['funny'],['Highly quotable'],['will ferrell'] | \n",
671 | "
\n",
672 | " \n",
673 | " | 1 | \n",
674 | " 2 | \n",
675 | " 89774 | \n",
676 | " ['Boxing story'],['MMA'],['Tom Hardy'] | \n",
677 | "
\n",
678 | " \n",
679 | " | 2 | \n",
680 | " 2 | \n",
681 | " 106782 | \n",
682 | " ['drugs'],['Leonardo DiCaprio'],['Martin Scors... | \n",
683 | "
\n",
684 | " \n",
685 | " | 3 | \n",
686 | " 7 | \n",
687 | " 48516 | \n",
688 | " ['way too long'] | \n",
689 | "
\n",
690 | " \n",
691 | " | 4 | \n",
692 | " 18 | \n",
693 | " 431 | \n",
694 | " ['Al Pacino'],['gangster'],['mafia'] | \n",
695 | "
\n",
696 | " \n",
697 | "
\n",
698 | "
"
699 | ],
700 | "text/plain": [
701 | " userId movieId tag\n",
702 | "0 2 60756 ['funny'],['Highly quotable'],['will ferrell']\n",
703 | "1 2 89774 ['Boxing story'],['MMA'],['Tom Hardy']\n",
704 | "2 2 106782 ['drugs'],['Leonardo DiCaprio'],['Martin Scors...\n",
705 | "3 7 48516 ['way too long']\n",
706 | "4 18 431 ['Al Pacino'],['gangster'],['mafia']"
707 | ]
708 | },
709 | "metadata": {
710 | "tags": []
711 | },
712 | "execution_count": 11
713 | }
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "metadata": {
719 | "id": "Q-ukKMhbOu3Q",
720 | "colab_type": "code",
721 | "colab": {}
722 | },
723 | "source": [
724 | "df = pd.merge(df, tags, on=['userId','movieId'], how='left')"
725 | ],
726 | "execution_count": null,
727 | "outputs": []
728 | },
729 | {
730 | "cell_type": "code",
731 | "metadata": {
732 | "id": "05Y3LfkMOu3S",
733 | "colab_type": "code",
734 | "colab": {},
735 | "outputId": "6d8110de-34dd-463d-c123-dd69218f3954"
736 | },
737 | "source": [
738 | "df.shape"
739 | ],
740 | "execution_count": null,
741 | "outputs": [
742 | {
743 | "output_type": "execute_result",
744 | "data": {
745 | "text/plain": [
746 | "(100836, 6)"
747 | ]
748 | },
749 | "metadata": {
750 | "tags": []
751 | },
752 | "execution_count": 13
753 | }
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "metadata": {
759 | "id": "2RZvAHrHOu3U",
760 | "colab_type": "code",
761 | "colab": {}
762 | },
763 | "source": [
764 | "df['tag'] = df['tag'].apply(lambda d: d if isinstance(d, list) else [])\n",
765 | "df['genres'] = df['genres'].apply(lambda d: d if isinstance(d, list) else [])"
766 | ],
767 | "execution_count": null,
768 | "outputs": []
769 | },
770 | {
771 | "cell_type": "code",
772 | "metadata": {
773 | "id": "M0M_IJc5Ou3W",
774 | "colab_type": "code",
775 | "colab": {},
776 | "outputId": "a95ea1a2-3435-4268-a6ab-a1db53e5a464"
777 | },
778 | "source": [
779 | "df.head()"
780 | ],
781 | "execution_count": null,
782 | "outputs": [
783 | {
784 | "output_type": "execute_result",
785 | "data": {
786 | "text/html": [
787 | "\n",
788 | "\n",
801 | "
\n",
802 | " \n",
803 | " \n",
804 | " | \n",
805 | " userId | \n",
806 | " movieId | \n",
807 | " rating | \n",
808 | " timestamp | \n",
809 | " genres | \n",
810 | " tag | \n",
811 | "
\n",
812 | " \n",
813 | " \n",
814 | " \n",
815 | " | 0 | \n",
816 | " 1 | \n",
817 | " 1 | \n",
818 | " 4.0 | \n",
819 | " 964982703 | \n",
820 | " [Adventure, Animation, Children, Comedy, Fantasy] | \n",
821 | " [] | \n",
822 | "
\n",
823 | " \n",
824 | " | 1 | \n",
825 | " 1 | \n",
826 | " 3 | \n",
827 | " 4.0 | \n",
828 | " 964981247 | \n",
829 | " [Comedy, Romance] | \n",
830 | " [] | \n",
831 | "
\n",
832 | " \n",
833 | " | 2 | \n",
834 | " 1 | \n",
835 | " 6 | \n",
836 | " 4.0 | \n",
837 | " 964982224 | \n",
838 | " [Action, Crime, Thriller] | \n",
839 | " [] | \n",
840 | "
\n",
841 | " \n",
842 | " | 3 | \n",
843 | " 1 | \n",
844 | " 47 | \n",
845 | " 5.0 | \n",
846 | " 964983815 | \n",
847 | " [Mystery, Thriller] | \n",
848 | " [] | \n",
849 | "
\n",
850 | " \n",
851 | " | 4 | \n",
852 | " 1 | \n",
853 | " 50 | \n",
854 | " 5.0 | \n",
855 | " 964982931 | \n",
856 | " [Crime, Mystery, Thriller] | \n",
857 | " [] | \n",
858 | "
\n",
859 | " \n",
860 | "
\n",
861 | "
"
862 | ],
863 | "text/plain": [
864 | " userId movieId rating timestamp \\\n",
865 | "0 1 1 4.0 964982703 \n",
866 | "1 1 3 4.0 964981247 \n",
867 | "2 1 6 4.0 964982224 \n",
868 | "3 1 47 5.0 964983815 \n",
869 | "4 1 50 5.0 964982931 \n",
870 | "\n",
871 | " genres tag \n",
872 | "0 [Adventure, Animation, Children, Comedy, Fantasy] [] \n",
873 | "1 [Comedy, Romance] [] \n",
874 | "2 [Action, Crime, Thriller] [] \n",
875 | "3 [Mystery, Thriller] [] \n",
876 | "4 [Crime, Mystery, Thriller] [] "
877 | ]
878 | },
879 | "metadata": {
880 | "tags": []
881 | },
882 | "execution_count": 15
883 | }
884 | ]
885 | },
886 | {
887 | "cell_type": "markdown",
888 | "metadata": {
889 | "id": "PPkusLncOu3Y",
890 | "colab_type": "text"
891 | },
892 | "source": [
893 | "#### Split into training and testing data (80/20, stratified by user)"
894 | ]
895 | },
896 | {
897 | "cell_type": "code",
898 | "metadata": {
899 | "id": "2-tF8n50Ou3Y",
900 | "colab_type": "code",
901 | "colab": {}
902 | },
903 | "source": [
904 | "from sklearn.model_selection import train_test_split\n",
905 | "train_data, test_data = train_test_split(df, test_size=0.2, stratify=df.userId)"
906 | ],
907 | "execution_count": null,
908 | "outputs": []
909 | },
910 | {
911 | "cell_type": "code",
912 | "metadata": {
913 | "id": "vFCvxS3yOu3a",
914 | "colab_type": "code",
915 | "colab": {},
916 | "outputId": "b6cfc121-411d-49e8-c807-726cfebdcac7"
917 | },
918 | "source": [
919 | "train_data = train_data.sort_values(['userId', 'movieId'])\n",
920 | "train_data.head()"
921 | ],
922 | "execution_count": null,
923 | "outputs": [
924 | {
925 | "output_type": "execute_result",
926 | "data": {
927 | "text/html": [
928 | "\n",
929 | "\n",
942 | "
\n",
943 | " \n",
944 | " \n",
945 | " | \n",
946 | " userId | \n",
947 | " movieId | \n",
948 | " rating | \n",
949 | " timestamp | \n",
950 | " genres | \n",
951 | " tag | \n",
952 | "
\n",
953 | " \n",
954 | " \n",
955 | " \n",
956 | " | 0 | \n",
957 | " 1 | \n",
958 | " 1 | \n",
959 | " 4.0 | \n",
960 | " 964982703 | \n",
961 | " [Adventure, Animation, Children, Comedy, Fantasy] | \n",
962 | " [] | \n",
963 | "
\n",
964 | " \n",
965 | " | 1 | \n",
966 | " 1 | \n",
967 | " 3 | \n",
968 | " 4.0 | \n",
969 | " 964981247 | \n",
970 | " [Comedy, Romance] | \n",
971 | " [] | \n",
972 | "
\n",
973 | " \n",
974 | " | 2 | \n",
975 | " 1 | \n",
976 | " 6 | \n",
977 | " 4.0 | \n",
978 | " 964982224 | \n",
979 | " [Action, Crime, Thriller] | \n",
980 | " [] | \n",
981 | "
\n",
982 | " \n",
983 | " | 3 | \n",
984 | " 1 | \n",
985 | " 47 | \n",
986 | " 5.0 | \n",
987 | " 964983815 | \n",
988 | " [Mystery, Thriller] | \n",
989 | " [] | \n",
990 | "
\n",
991 | " \n",
992 | " | 5 | \n",
993 | " 1 | \n",
994 | " 70 | \n",
995 | " 3.0 | \n",
996 | " 964982400 | \n",
997 | " [Action, Comedy, Horror, Thriller] | \n",
998 | " [] | \n",
999 | "
\n",
1000 | " \n",
1001 | "
\n",
1002 | "
"
1003 | ],
1004 | "text/plain": [
1005 | " userId movieId rating timestamp \\\n",
1006 | "0 1 1 4.0 964982703 \n",
1007 | "1 1 3 4.0 964981247 \n",
1008 | "2 1 6 4.0 964982224 \n",
1009 | "3 1 47 5.0 964983815 \n",
1010 | "5 1 70 3.0 964982400 \n",
1011 | "\n",
1012 | " genres tag \n",
1013 | "0 [Adventure, Animation, Children, Comedy, Fantasy] [] \n",
1014 | "1 [Comedy, Romance] [] \n",
1015 | "2 [Action, Crime, Thriller] [] \n",
1016 | "3 [Mystery, Thriller] [] \n",
1017 | "5 [Action, Comedy, Horror, Thriller] [] "
1018 | ]
1019 | },
1020 | "metadata": {
1021 | "tags": []
1022 | },
1023 | "execution_count": 17
1024 | }
1025 | ]
1026 | },
1027 | {
1028 | "cell_type": "code",
1029 | "metadata": {
1030 | "scrolled": true,
1031 | "id": "ojrEaFoMOu3c",
1032 | "colab_type": "code",
1033 | "colab": {},
1034 | "outputId": "b617ba3b-9a8b-44f3-b41f-22109097a9f0"
1035 | },
1036 | "source": [
1037 | "test_data = test_data.sort_values(['userId','movieId'])\n",
1038 | "test_data.head()"
1039 | ],
1040 | "execution_count": null,
1041 | "outputs": [
1042 | {
1043 | "output_type": "execute_result",
1044 | "data": {
1045 | "text/html": [
1046 | "\n",
1047 | "\n",
1060 | "
\n",
1061 | " \n",
1062 | " \n",
1063 | " | \n",
1064 | " userId | \n",
1065 | " movieId | \n",
1066 | " rating | \n",
1067 | " timestamp | \n",
1068 | " genres | \n",
1069 | " tag | \n",
1070 | "
\n",
1071 | " \n",
1072 | " \n",
1073 | " \n",
1074 | " | 4 | \n",
1075 | " 1 | \n",
1076 | " 50 | \n",
1077 | " 5.0 | \n",
1078 | " 964982931 | \n",
1079 | " [Crime, Mystery, Thriller] | \n",
1080 | " [] | \n",
1081 | "
\n",
1082 | " \n",
1083 | " | 6 | \n",
1084 | " 1 | \n",
1085 | " 101 | \n",
1086 | " 5.0 | \n",
1087 | " 964980868 | \n",
1088 | " [Adventure, Comedy, Crime, Romance] | \n",
1089 | " [] | \n",
1090 | "
\n",
1091 | " \n",
1092 | " | 11 | \n",
1093 | " 1 | \n",
1094 | " 216 | \n",
1095 | " 5.0 | \n",
1096 | " 964981208 | \n",
1097 | " [Comedy] | \n",
1098 | " [] | \n",
1099 | "
\n",
1100 | " \n",
1101 | " | 16 | \n",
1102 | " 1 | \n",
1103 | " 296 | \n",
1104 | " 3.0 | \n",
1105 | " 964982967 | \n",
1106 | " [Comedy, Crime, Drama, Thriller] | \n",
1107 | " [] | \n",
1108 | "
\n",
1109 | " \n",
1110 | " | 17 | \n",
1111 | " 1 | \n",
1112 | " 316 | \n",
1113 | " 3.0 | \n",
1114 | " 964982310 | \n",
1115 | " [Action, Adventure, Sci-Fi] | \n",
1116 | " [] | \n",
1117 | "
\n",
1118 | " \n",
1119 | "
\n",
1120 | "
"
1121 | ],
1122 | "text/plain": [
1123 | " userId movieId rating timestamp genres \\\n",
1124 | "4 1 50 5.0 964982931 [Crime, Mystery, Thriller] \n",
1125 | "6 1 101 5.0 964980868 [Adventure, Comedy, Crime, Romance] \n",
1126 | "11 1 216 5.0 964981208 [Comedy] \n",
1127 | "16 1 296 3.0 964982967 [Comedy, Crime, Drama, Thriller] \n",
1128 | "17 1 316 3.0 964982310 [Action, Adventure, Sci-Fi] \n",
1129 | "\n",
1130 | " tag \n",
1131 | "4 [] \n",
1132 | "6 [] \n",
1133 | "11 [] \n",
1134 | "16 [] \n",
1135 | "17 [] "
1136 | ]
1137 | },
1138 | "metadata": {
1139 | "tags": []
1140 | },
1141 | "execution_count": 18
1142 | }
1143 | ]
1144 | },
1145 | {
1146 | "cell_type": "markdown",
1147 | "metadata": {
1148 | "id": "zlFDgR0COu3f",
1149 | "colab_type": "text"
1150 | },
1151 | "source": [
1152 | "#### Save the DataFrames as CSV files"
1153 | ]
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "metadata": {
1158 | "id": "U8ssEJZ0Ou3f",
1159 | "colab_type": "code",
1160 | "colab": {}
1161 | },
1162 | "source": [
1163 | "# train_data.to_csv('training_data.csv', index = False)\n",
1164 | "# test_data.to_csv('testing_data.csv', index = False)"
1165 | ],
1166 | "execution_count": null,
1167 | "outputs": []
1168 | },
1169 | {
1170 | "cell_type": "markdown",
1171 | "metadata": {
1172 | "id": "dCl7m8u3Ou3h",
1173 | "colab_type": "text"
1174 | },
1175 | "source": [
1176 | "## Pre-process the movie data"
1177 | ]
1178 | },
1179 | {
1180 | "cell_type": "code",
1181 | "metadata": {
1182 | "scrolled": true,
1183 | "id": "SNQrqxwwOu3h",
1184 | "colab_type": "code",
1185 | "colab": {},
1186 | "outputId": "06ea051d-cab0-42be-d43e-36cf3da4d733"
1187 | },
1188 | "source": [
1189 | "movies['genres'] = movies['genres'].str.split('|')\n",
1190 | "movies['genres'] = movies['genres'].apply(lambda d: d if isinstance(d, list) else [])\n",
1191 | "movies.head()\n",
1192 | "# movies.to_csv('movies.csv', index = False)"
1193 | ],
1194 | "execution_count": null,
1195 | "outputs": [
1196 | {
1197 | "output_type": "execute_result",
1198 | "data": {
1199 | "text/html": [
1200 | "\n",
1201 | "\n",
1214 | "
\n",
1215 | " \n",
1216 | " \n",
1217 | " | \n",
1218 | " movieId | \n",
1219 | " title | \n",
1220 | " genres | \n",
1221 | "
\n",
1222 | " \n",
1223 | " \n",
1224 | " \n",
1225 | " | 0 | \n",
1226 | " 1 | \n",
1227 | " Toy Story (1995) | \n",
1228 | " [Adventure, Animation, Children, Comedy, Fantasy] | \n",
1229 | "
\n",
1230 | " \n",
1231 | " | 1 | \n",
1232 | " 2 | \n",
1233 | " Jumanji (1995) | \n",
1234 | " [Adventure, Children, Fantasy] | \n",
1235 | "
\n",
1236 | " \n",
1237 | " | 2 | \n",
1238 | " 3 | \n",
1239 | " Grumpier Old Men (1995) | \n",
1240 | " [Comedy, Romance] | \n",
1241 | "
\n",
1242 | " \n",
1243 | " | 3 | \n",
1244 | " 4 | \n",
1245 | " Waiting to Exhale (1995) | \n",
1246 | " [Comedy, Drama, Romance] | \n",
1247 | "
\n",
1248 | " \n",
1249 | " | 4 | \n",
1250 | " 5 | \n",
1251 | " Father of the Bride Part II (1995) | \n",
1252 | " [Comedy] | \n",
1253 | "
\n",
1254 | " \n",
1255 | "
\n",
1256 | "
"
1257 | ],
1258 | "text/plain": [
1259 | " movieId title \\\n",
1260 | "0 1 Toy Story (1995) \n",
1261 | "1 2 Jumanji (1995) \n",
1262 | "2 3 Grumpier Old Men (1995) \n",
1263 | "3 4 Waiting to Exhale (1995) \n",
1264 | "4 5 Father of the Bride Part II (1995) \n",
1265 | "\n",
1266 | " genres \n",
1267 | "0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
1268 | "1 [Adventure, Children, Fantasy] \n",
1269 | "2 [Comedy, Romance] \n",
1270 | "3 [Comedy, Drama, Romance] \n",
1271 | "4 [Comedy] "
1272 | ]
1273 | },
1274 | "metadata": {
1275 | "tags": []
1276 | },
1277 | "execution_count": 20
1278 | }
1279 | ]
1280 | },
1281 | {
1282 | "cell_type": "code",
1283 | "metadata": {
1284 | "id": "tUOMY7C9Ou3j",
1285 | "colab_type": "code",
1286 | "colab": {}
1287 | },
1288 | "source": [
1289 | ""
1290 | ],
1291 | "execution_count": null,
1292 | "outputs": []
1293 | }
1294 | ]
1295 | }
--------------------------------------------------------------------------------