├── .DS_Store
├── Report.pdf
├── Presentation.pptx
├── Results
    ├── .DS_Store
    ├── images
    │   ├── ndcg.png
    │   ├── rating.png
    │   ├── Algo_analysis.png
    │   ├── Hybrid_Model.png
    │   ├── knn_neighbors.png
    │   ├── KNN_similarity.png
    │   ├── prec_recall_fm.png
    │   ├── genre_distribution.png
    │   ├── vector_generation.png
    │   ├── genre_based_popularity.png
    │   └── mae_rmse_including_pearson.png
    ├── Final_model_results.xlsx
    ├── README.md
    └── algo_results.csv
├── README.md
└── Code
    ├── README.md
    ├── test_ndcg.py
    ├── evaluating_recs.py
    ├── generating_predictions.py
    ├── combined_model.ipynb
    ├── surprise_model_predictions.ipynb
    ├── cold_start_analysis.ipynb
    ├── movie_era_based_recs.ipynb
    └── preprocessing.ipynb


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/.DS_Store


--------------------------------------------------------------------------------
/Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Report.pdf


--------------------------------------------------------------------------------
/Presentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Presentation.pptx


--------------------------------------------------------------------------------
/Results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/.DS_Store


--------------------------------------------------------------------------------
/Results/images/ndcg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/ndcg.png


--------------------------------------------------------------------------------
/Results/images/rating.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/rating.png


--------------------------------------------------------------------------------
/Results/Final_model_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/Final_model_results.xlsx


--------------------------------------------------------------------------------
/Results/images/Algo_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/Algo_analysis.png


--------------------------------------------------------------------------------
/Results/images/Hybrid_Model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/Hybrid_Model.png


--------------------------------------------------------------------------------
/Results/images/knn_neighbors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/knn_neighbors.png


--------------------------------------------------------------------------------
/Results/images/KNN_similarity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/KNN_similarity.png


--------------------------------------------------------------------------------
/Results/images/prec_recall_fm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/prec_recall_fm.png


--------------------------------------------------------------------------------
/Results/images/genre_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/genre_distribution.png


--------------------------------------------------------------------------------
/Results/images/vector_generation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/vector_generation.png


--------------------------------------------------------------------------------
/Results/images/genre_based_popularity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/genre_based_popularity.png


--------------------------------------------------------------------------------
/Results/images/mae_rmse_including_pearson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prakruti-joshi/Movie-Recommendation-System/HEAD/Results/images/mae_rmse_including_pearson.png


--------------------------------------------------------------------------------
/Results/README.md:
--------------------------------------------------------------------------------
 1 | #### Analysis Plots:
 2 | 
 3 | 1. Comparison of methods:
 4 | ![Model results](images/prec_recall_fm.png)
 5 | 
 6 | 2. Surprise models:
 7 | ![Model results](images/Algo_analysis.png)
 8 | 
 9 | 3. Vector generation in content based approach:
10 | ![Content based vector](images/vector_generation.png)
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Movie-Recommendation-System
 2 | 
 3 | **Dataset used:** 
 4 | 1. [MovieLens](https://grouplens.org/datasets/movielens/)
 5 | 2. [The Movie Database: tmdb](https://www.kaggle.com/tmdb/tmdb-movie-metadata)
 6 | 
 7 | **Aim:** Build a movie recommendation system that combines user-level personalization with overall movie features such as genre and popularity. <br>
 8 | 
 9 | **Models:**
10 | * Popularity model
11 | * Content based model: genre, year of release, ratings of movies
12 | * Collaborative filtering: User vs item, KNN similarity measures
13 | * Latent Factor based SVD
14 | * Combined linear model using surprise library (CF + SVD)
15 | * Hybrid model (content based + popularity based + item-item CF + svd)
16 | 
17 | **Results:**
18 | 
19 | ![Hybrid model](Results/images/Hybrid_Model.png)
20 | 
21 | All the models are implemented in Python using pandas, sklearn and [surprise](http://surpriselib.com/) library. The hyperparameter tuning, testing accuracy (RMSE and MAE) and evaluation of recommendations (precision, recall, f-measure and ndcg) for each model are thoroughly performed. The detailed analysis of the models is presented in the report. 
22 | 


--------------------------------------------------------------------------------
/Results/algo_results.csv:
--------------------------------------------------------------------------------
 1 | ,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG
 2 | 0,KNNBaseline (pearson_baseline),0.8527048407985288,0.6481831050363938,9.183264017105103,6.47820782661438,0.8311748633879806,0.4131693896380987,0.5519630281626648,0.9631023306515691
 3 | 1,CoClustering,0.9522076927566949,0.7332637064416873,1.8252005577087402,0.1734142303466797,0.7826229508196745,0.3809810109755996,0.5124844754841872,0.9557478562520727
 4 | 2,BaselineOnly,0.873457272373568,0.6717695910159173,0.1396622657775879,0.09251093864440918,0.8074316939890736,0.4003500565116332,0.5352876447815149,0.9590854168667904
 5 | 3,KNNWithZScore,0.8991840819159082,0.6786227249764061,0.1465740203857422,1.7950918674468994,0.7948087431694016,0.3915249143270615,0.524620410302617,0.9541951019768238
 6 | 4,KNNWithMeans,0.9000344577297286,0.6836932871288725,0.10967230796813965,1.5405960083007812,0.8009562841530078,0.3871394034076282,0.521981084939778,0.9527093414526527
 7 | 5,KNNBaseline,0.8762934782625659,0.6659918272130076,0.21010994911193848,1.9365522861480713,0.7964207650273253,0.4158810520288814,0.546425487378205,0.9562275612497768
 8 | 6,NMF,0.9291418868270431,0.7094694164253142,5.15004301071167,0.20406055450439453,0.7792896174863412,0.38074469729699817,0.5115545044405817,0.9548841017932751
 9 | 7,SlopeOne,0.9056446586210445,0.6876919079393096,4.759229898452759,5.59233283996582,0.8075409836065599,0.3965250076150167,0.5318814699668782,0.9555464596666479
10 | 8,SVDpp,0.8691186490330676,0.6640513869365521,480.6970820426941,7.9973015785217285,0.8178415300546472,0.3978840866429081,0.5353282446879383,0.9603166226261703
11 | 9,SVD,0.8794364853143987,0.6739493321877302,4.6099772453308105,0.12506961822509766,0.8033060109289645,0.38554157622502233,0.52102198631871,0.956595790499096
12 | 10,KNNBasic,0.9507724809063621,0.7266525395708078,0.09878921508789062,1.3892457485198975,0.7838797814207674,0.4215349947874059,0.5482474018023665,0.9586757463063268
13 | 


--------------------------------------------------------------------------------
/Code/README.md:
--------------------------------------------------------------------------------
 1 | ### Description:
 2 | 
 3 | #### 1. cold_start_analysis:
 4 | Analyses the performance of different approaches for a new user or a user with few interactions with the system, i.e. the cold start problem. Computes the RMSE and MAE for users who have rated fewer than 18 movies and for users who have rated more than 1000 movies. <br>
 5 | With few interactions, content based and item-item collaborative filtering approaches work better. As the number of interactions per user increases, SVD and collaborative approaches work better.
 6 | 
 7 | #### 2. combined_model:
 8 | Combination of different surprise model results by applying weighted linear combination to generate final rating.
 9 | 
10 | #### 3. content_based_recommendation:
11 | Generating user and movie vectors based on genre and predicting the ratings for movies in test data.
12 | 
13 | #### 4. evaluating_recs:
14 | Code for Precision, Recall, F-1 score and NDCG.
15 | 
16 | #### 5. generating_predictions:
17 | Generating rating predictions for test data using surprise library.
18 | 
19 | #### 6. hybrid_model:
20 | Code for the hybrid model based on combining recommendations from different models such as content based, CF, SVD to improve accuracy and quality of recommendations.
21 | 
22 | #### 7. knn_analysis:
23 | Analysis of KNN algorithms by changing different parameters like:
24 | * number of neighbors
25 | * similarity metrics
26 | * user v/s item based CF
27 | 
28 | #### 8. model_hyperparameter_tuning:
29 | Fine-tuned surprise models by experimenting with different hyperparameters for training and model. Compared models based on RMSE and MAE.
30 | 
31 | #### 9. movie_era_based_recs:
32 | Content based approach to include the time period in which the movie was launched in the user vector. This method personalizes the user's recommendations to include this feature.
33 | 
34 | #### 10. movie_similarity_based_recs:
35 | Content based approach to include the user's genre preference and recommend movies similar to user's highly rated movies.
36 | 
37 | #### 11. movie_year_analysis:
38 | Experiments with the year of the movie release. Analysed the distribution of data and determined the appropriate era intervals to classify movies. Used the content based approach to form a user vector based on the era preference.
39 | 
40 | #### 12. popularity_model:
41 | Model which uses the popularity attribute as well as the average rating and voter count in the TMDB data to generate popular movies genre-wise. The genres are determined using the IMDB data.
42 | 
43 | #### 13. preprocessing:
44 | Code for splitting the data into training and testing sets for each user such that 80% of the ratings are in training and 20% are for testing.
45 | 
46 | #### 14. surprise_model_predictions:
47 | Code for generating ratings for test data using surprise models such as KNN (CF), SVD, Baseline approach, Slopeone etc. 
48 | 
49 | #### 15. surprise_model_recs:
50 | Comparison between the surprise models based on test data ratings (RMSE and MAE) and quality of recommendations (precision, recall, ndcg, f-measure).
51 | 
52 | #### 16. test_ndcg:
53 | Code to test implementation of [NDCG metric](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) for evaluating recommendations.
54 | 
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/Code/test_ndcg.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from collections import defaultdict
 3 | 
 4 | from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
 5 | from surprise import Dataset
 6 | from surprise.model_selection import cross_validate
 7 | from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
 8 | from surprise import accuracy
 9 | from surprise.model_selection import train_test_split
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | 
14 | 
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Convert the train/test rating frames into Surprise trainset/testset objects.

    Args:
        training_dataframe: frame providing 'userId', 'movieId' and 'rating'
            columns; used to build the trainset.
        testing_dataframe: frame providing the same three columns; used to
            build the testset.

    Returns:
        A ``(trainset, testset)`` pair produced by ``construct_trainset`` and
        ``construct_testset`` from the raw ratings of each loaded dataset.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    # Both frames are loaded through the same reader (rating scale 0 to 5).
    reader = Reader(rating_scale=(0, 5))

    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)

    built_trainset = train_data.construct_trainset(train_data.raw_ratings)
    built_testset = test_data.construct_testset(test_data.raw_ratings)
    return built_trainset, built_testset
22 | 
23 | 
24 | # Modified get_top_n function   -----------------------------------
25 | # org_ratings: list of (iid, actual rating) pairs for all iids of each user
def get_top_n(predictions, n):
    """Group predictions per user and keep each user's n highest estimates.

    Args:
        predictions: iterable of 5-item records, unpacked as
            (uid, iid, true_r, est, details).
        n: number of highest-estimate items kept for each user.

    Returns:
        A ``(top_n, org_ratings)`` pair of defaultdicts keyed by uid:
        ``top_n[uid]`` is the list of (iid, est) sorted by est in descending
        order and cut to n entries; ``org_ratings[uid]`` is every
        (iid, true_r) pair of that user in input order.
    """
    estimates_by_user = defaultdict(list)
    org_ratings = defaultdict(list)

    # Bucket every prediction under its user id.
    for uid, iid, true_r, est, _details in predictions:
        estimates_by_user[uid].append((iid, est))
        org_ratings[uid].append((iid, true_r))

    # Rank each bucket by estimated rating (stable, best first) and truncate.
    for uid in list(estimates_by_user):
        ranked = sorted(estimates_by_user[uid], key=lambda pair: pair[1], reverse=True)
        estimates_by_user[uid] = ranked[:n]

    return estimates_by_user, org_ratings
41 | # -------------------------------------------------------------------
42 | 
43 | 
def dcg_at_k(scores):
    """Discounted cumulative gain of a ranked list of relevance scores.

    The first score is taken undiscounted and the score at 1-based rank i
    (i >= 2) is divided by log2(i).

    Args:
        scores: sequence of relevance scores in ranked order.

    Returns:
        The DCG value, or 0.0 when ``scores`` is empty (the original raised
        IndexError on an empty sequence).
    """
    if len(scores) == 0:
        return 0.0
    # math.log2 is exact for powers of two, unlike math.log(x, 2).
    return scores[0] + sum(gain / math.log2(rank)
                           for rank, gain in enumerate(scores[1:], start=2))
46 | 
47 | 
48 | # Modified to include only one parameter-------------------------------
def ndcg_at_k(scores):
    """Normalise the DCG of ``scores`` by the DCG of the same scores sorted descending.

    Args:
        scores: relevance scores in ranked order.

    Returns:
        ``dcg_at_k(scores)`` divided by the ideal DCG, or 0.0 when the ideal
        DCG is not greater than zero.
    """
    ideal_order = sorted(scores, reverse=True)
    ideal_dcg = dcg_at_k(ideal_order)
    if ideal_dcg > 0.0:
        return dcg_at_k(scores) / ideal_dcg
    return 0.0
52 | # ---------------------------------------------------------------------
53 | 
54 | 
# Load the training and testing ratings and convert them for Surprise.
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

# Fit SVD++ on the training set and score it on the test set.
print("Starting algo")
algo = SVDpp()
algo.fit(trainset)
test_predictions = algo.test(testset)
test_rmse = accuracy.rmse(test_predictions)
test_mae = accuracy.mae(test_predictions)
print("Ended algo")

top_n, org_ratings = get_top_n(test_predictions, 5)   # --------------- Modified this line

ndcg_scores = dict()

# Modified----------------------
for uid, user_ratings in top_n.items():
    # Index the user's actual ratings once instead of scanning the list for
    # every recommended item. setdefault keeps the first rating seen for an
    # item, which is what the original linear scan (break on first match) did.
    actual_by_iid = {}
    for iid, true_r in org_ratings[uid]:
        actual_by_iid.setdefault(iid, true_r)

    # Actual ratings in predicted order; an item without a rating scores 0.
    scores = [actual_by_iid.get(iid, 0) for iid, _est_r in user_ratings]
    ndcg_scores[uid] = ndcg_at_k(scores)
# --------------------------------

# Mean NDCG over users; 0.0 when there were no predictions at all
# (the original divided by zero in that case).
ndcg_score = sum(ndcg_scores.values())/len(ndcg_scores) if ndcg_scores else 0.0
print(ndcg_score)
91 | 


--------------------------------------------------------------------------------
/Code/evaluating_recs.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from collections import defaultdict
  3 | import csv
  4 | from sklearn.metrics import ndcg_score
  5 | import numpy as np
  6 | import pandas as pd
  7 | 
  8 | 
  9 | def get_top_n(predictions, algo_weights, n):
 10 |     '''Return the top-N recommendation for each user from a set of predictions.
 11 | 
 12 |     Args:
 13 |         predictions(list of Prediction objects): The list of predictions, as
 14 |             returned by the test method of an algorithm.
 15 |         n(int): The number of recommendation to output for each user. Default
 16 |             is 10.
 17 | 
 18 |     Returns:
 19 |     A dict where keys are user (raw) ids and values are lists of tuples:
 20 |         [(raw item id, rating estimation), ...] of size n.
 21 |     '''
 22 | 
 23 |     # First map the predictions to each user.
 24 |     top_n = defaultdict(list)
 25 |     top_n_ndcg = defaultdict(list)
 26 |     for i in range(len(predictions)):
 27 |         row = predictions.iloc[i, :]
 28 |         final_est = algo_weights['svd']*float(row['svd_rating']) + algo_weights['knn']*float(row['knn_rating']) + \
 29 |                     algo_weights['svdpp']*float(row['svdpp_rating']) + algo_weights['slope']*float(row['slopeone_rating']) + \
 30 |                     algo_weights['baseline']*float(row['baseline_rating'])
 31 |         top_n[row[0]].append((row[1], final_est))
 32 |         top_n_ndcg[row[0]].append((row[1], row[2], final_est))
 33 | 
 34 |     # Then sort the predictions for each user and retrieve the k highest ones.
 35 |     for uid, user_ratings in top_n.items():
 36 |         user_ratings.sort(key=lambda x: x[1], reverse=True)
 37 |         top_n[uid] = user_ratings[:n]
 38 | 
 39 |     for uid, user_ratings in top_n_ndcg.items():
 40 |         user_ratings.sort(key=lambda x: x[2], reverse=True)
 41 |         top_n_ndcg[uid] = user_ratings[:n]
 42 | 
 43 |     return top_n, top_n_ndcg
 44 | 
 45 | 
 46 | def precision_recall_at_k(predictions, algo_weights, k, threshold):
 47 |     '''Return precision and recall at k metrics for each user.'''
 48 | 
 49 |     # First map the predictions to each user.
 50 |     user_est_true = defaultdict(list)
 51 |     for i in range(len(predictions)):
 52 |         row = predictions.iloc[i, :]
 53 |         final_est = algo_weights['svd']*float(row['svd_rating']) + algo_weights['knn']*float(row['knn_rating']) + \
 54 |                     algo_weights['svdpp']*float(row['svdpp_rating']) + algo_weights['slope']*float(row['slopeone_rating']) + \
 55 |                     algo_weights['baseline']*float(row['baseline_rating'])
 56 |         user_est_true[row[0]].append((final_est, row[2]))
 57 | 
 58 |     precisions = dict()
 59 |     recalls = dict()
 60 |     for uid, user_ratings in user_est_true.items():
 61 |         # Sort user ratings by estimated value
 62 |         user_ratings.sort(key=lambda x: x[0], reverse=True)
 63 | 
 64 |         # Number of relevant items
 65 |         n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
 66 | 
 67 |         # Number of recommended items in top k
 68 |         n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
 69 | 
 70 |         # Number of relevant and recommended items in top k
 71 |         n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
 72 |                               for (est, true_r) in user_ratings[:k])
 73 | 
 74 |         # Precision@K: Proportion of recommended items that are relevant
 75 |         precisions[uid] = n_rel_and_rec_k/n_rec_k if n_rec_k != 0 else 1
 76 | 
 77 |         # Recall@K: Proportion of relevant items that are recommended
 78 |         recalls[uid] = n_rel_and_rec_k/n_rel if n_rel != 0 else 1
 79 | 
 80 |     return precisions, recalls
 81 | 
 82 | 
 83 | def dcg_at_k(scores):
 84 |     return scores[0] + sum(sc/math.log(ind, 2) for sc, ind in zip(scores[1:], range(2, len(scores) + 1)))
 85 | 
 86 | 
 87 | def ndcg_at_k(predicted_scores, actual_scores):
 88 |     idcg = dcg_at_k(sorted(actual_scores, reverse=True))
 89 |     return (dcg_at_k(predicted_scores)/idcg) if idcg > 0.0 else 0.0
 90 | 
 91 | 
 92 | predictions = pd.read_csv("test_prediction_HP.csv", usecols=range(1, 9))
 93 | algo_weights = dict()
 94 | algo_weights['svd'] = 0
 95 | algo_weights['knn'] = 0
 96 | algo_weights['svdpp'] = 1
 97 | algo_weights['slope'] = 0
 98 | algo_weights['baseline'] = 0
 99 | n = 5
100 | threshold = 3.75
101 | top_n, top_n_ndcg = get_top_n(predictions, algo_weights, n)
102 | with open('top5_svdpp.csv', 'w', newline="") as csv_file:
103 |     writer = csv.writer(csv_file)
104 |     for key, value in top_n.items():
105 |         writer.writerow([key, value])
106 | 
107 | ndcg_scores = dict()
108 | for uid, user_ratings in top_n_ndcg.items():
109 |     true = []
110 |     est = []
111 |     for _, tru_r, est_r in user_ratings:
112 |         true.append(tru_r)
113 |         est.append(est_r)
114 |     ndcg = ndcg_at_k(est, true)
115 |     ndcg_scores[uid] = ndcg
116 | 
117 | # Print the recommended items for each user
118 | # for uid, user_ratings in top_n.items():
119 | #     print(uid, [iid for (iid, _) in user_ratings])
120 | 
121 | precisions, recalls = precision_recall_at_k(predictions, algo_weights, n, threshold)
122 | precision = sum(prec for prec in precisions.values())/len(precisions)
123 | recall = sum(rec for rec in recalls.values())/len(recalls)
124 | fmeasure = (2*precision*recall)/(precision + recall)
125 | ndcg_score = sum(ndcg for ndcg in ndcg_scores.values())/len(ndcg_scores)
126 | print("Precision: ", precision)
127 | print("Recall: ", recall)
128 | print("F-Measure", fmeasure)
129 | print("NDCG Score: ", ndcg_score)
130 | 


--------------------------------------------------------------------------------
/Code/generating_predictions.py:
--------------------------------------------------------------------------------
  1 | from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
  2 | from surprise import Dataset
  3 | from surprise.model_selection import cross_validate
  4 | from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
  5 | from surprise import accuracy
  6 | from surprise.model_selection import train_test_split
  7 | 
  8 | import pandas as pd
  9 | import numpy as np
 10 | 
 11 | 
 12 | def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
 13 |     reader = Reader(rating_scale=(0, 5))
 14 |     trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
 15 |     testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
 16 |     trainset = trainset.construct_trainset(trainset.raw_ratings)
 17 |     testset = testset.construct_testset(testset.raw_ratings)
 18 |     return trainset, testset
 19 | 
 20 | 
 21 | def recommendation(algo, trainset, testset):
 22 |     # Train the algorithm on the trainset, and predict ratings for the testset
 23 |     algo.fit(trainset)
 24 | 
 25 |     # # Predictions on training set
 26 |     # train_predictions = algo.test(trainset)
 27 |     # train_rmse = accuracy.rmse(train_predictions)
 28 |     # train_mae = accuracy.mae(train_predictions)
 29 | 
 30 |     # Predictions on testing set
 31 |     test_predictions = algo.test(testset)
 32 |     test_rmse = accuracy.rmse(test_predictions)
 33 |     test_mae = accuracy.mae(test_predictions)
 34 | 
 35 |     return test_rmse, test_mae, test_predictions
 36 | 
 37 | 
 38 | file_path_train = 'training_data.csv'
 39 | file_path_test = 'testing_data.csv'
 40 | traindf = pd.read_csv(file_path_train)
 41 | testdf = pd.read_csv(file_path_test)
 42 | trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)
 43 | 
 44 | 
 45 | print("1")
 46 | BaselineOnly()
 47 | 
 48 | algo = BaselineOnly()
 49 | test_base_rmse, test_base_mae, test_base_pred = recommendation(algo, trainset, testset)
 50 | 
 51 | print("2")
 52 | # basic collaborative filtering algorithm taking into account a baseline rating.
 53 | sim_options = {'name': 'pearson_baseline',
 54 |                'user_based': False  # compute  similarities between items
 55 |                }
 56 | algo = KNNBaseline(sim_options=sim_options)
 57 | test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(algo, trainset, testset)
 58 | 
 59 | print("3")
 60 | # SlopeOne
 61 | algo = SlopeOne()
 62 | test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(algo, trainset, testset)
 63 | 
 64 | print("4")
 65 | # SVD
 66 | algo = SVD()
 67 | test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)
 68 | 
 69 | print("5")
 70 | # SVDpp
 71 | algo = SVDpp()
 72 | test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)
 73 | 
 74 | print("6")
 75 | test_pred_df = pd.DataFrame(
 76 |     columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating',
 77 |              'baseline_rating'])
 78 | test_svd_df = pd.DataFrame(
 79 |     columns=['uid', 'iid', 'og_rating', 'est_rating'])
 80 | test_svdpp_df = pd.DataFrame(
 81 |     columns=['uid', 'iid', 'og_rating', 'est_rating'])
 82 | test_knnb_df = pd.DataFrame(
 83 |     columns=['uid', 'iid', 'og_rating', 'est_rating'])
 84 | test_slope_df = pd.DataFrame(
 85 |     columns=['uid', 'iid', 'og_rating', 'est_rating'])
 86 | test_bonly_df = pd.DataFrame(
 87 |     columns=['uid', 'iid', 'og_rating', 'est_rating'])
 88 | num_test = len(test_base_pred)
 89 | for i in range(num_test):
 90 |     svd = test_svd_pred[i]
 91 |     slopeone = test_slopeone_pred[i]
 92 |     knn = test_knn_pred[i]
 93 |     svdpp = test_svdpp_pred[i]
 94 |     baseline = test_base_pred[i]
 95 |     df = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, svdpp.est, slopeone.est, baseline.est]],
 96 |                       columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating',
 97 |                                'baseline_rating'])
 98 |     df_svd = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est]],
 99 |                           columns=['uid', 'iid', 'og_rating', 'est_rating'])
100 |     df_svdpp = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svdpp.est]],
101 |                             columns=['uid', 'iid', 'og_rating', 'est_rating'])
102 |     df_knnb = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, knn.est]],
103 |                            columns=['uid', 'iid', 'og_rating', 'est_rating'])
104 |     df_slope = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, slopeone.est]],
105 |                             columns=['uid', 'iid', 'og_rating', 'est_rating'])
106 |     df_bonly = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, baseline.est]],
107 |                             columns=['uid', 'iid', 'og_rating', 'est_rating'])
108 |     test_pred_df = pd.concat([df, test_pred_df], ignore_index=True)
109 |     test_svd_df = pd.concat([df_svd, test_svd_df], ignore_index=True)
110 |     test_svdpp_df = pd.concat([df_svdpp, test_svdpp_df], ignore_index=True)
111 |     test_slope_df = pd.concat([df_slope, test_slope_df], ignore_index=True)
112 |     test_knnb_df = pd.concat([df_knnb, test_knnb_df], ignore_index=True)
113 |     test_bonly_df = pd.concat([df_bonly, test_bonly_df], ignore_index=True)
114 | 
115 | print("7")
116 | test_pred_df.to_csv('test_prediction_HP.csv')
117 | test_svd_df.to_csv('test_predictions_svd.csv')
118 | test_svdpp_df.to_csv('test_predictions_svdpp.csv')
119 | test_knnb_df.to_csv('test_predictions_knnb.csv')
120 | test_slope_df.to_csv('test_predictions_slope.csv')
121 | test_bonly_df.to_csv('test_predictions_bonly.csv')
122 | 


--------------------------------------------------------------------------------
/Code/combined_model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "combined_model.ipynb",
  7 |       "provenance": [],
  8 |       "collapsed_sections": []
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     }
 14 |   },
 15 |   "cells": [
 16 |     {
 17 |       "cell_type": "markdown",
 18 |       "metadata": {
 19 |         "id": "6IBmQTfmBf2k",
 20 |         "colab_type": "text"
 21 |       },
 22 |       "source": [
 23 |         "This notebook combines the individual models' ratings to form a unified model which performs better. The ratings from the individual models are combined using a weighted linear combination to form a resultant rating. This method helps overcome the shortcomings of the individual methods. \\\\\n",
 24 |         "The ratings are those generated using surprise library."
 25 |       ]
 26 |     },
 27 |     {
 28 |       "cell_type": "code",
 29 |       "metadata": {
 30 |         "id": "fRRCvSdKBeX_",
 31 |         "colab_type": "code",
 32 |         "colab": {}
 33 |       },
 34 |       "source": [
 35 |         "import pandas as pd\n",
 36 |         "import numpy as np\n",
 37 |         "import math"
 38 |       ],
 39 |       "execution_count": null,
 40 |       "outputs": []
 41 |     },
 42 |     {
 43 |       "cell_type": "code",
 44 |       "metadata": {
 45 |         "id": "NBUye_PbsZdg",
 46 |         "colab_type": "code",
 47 |         "colab": {
 48 |           "base_uri": "https://localhost:8080/",
 49 |           "height": 195
 50 |         },
 51 |         "outputId": "b9048120-dea6-4485-f0b9-118915146f4b"
 52 |       },
 53 |       "source": [
 54 |         "pred_data = pd.read_csv('test_prediction_HP.csv')\n",
 55 |         "pred_data.head() "
 56 |       ],
 57 |       "execution_count": null,
 58 |       "outputs": [
 59 |         {
 60 |           "output_type": "execute_result",
 61 |           "data": {
 62 |             "text/html": [
 63 |               "<div>\n",
 64 |               "<style scoped>\n",
 65 |               "    .dataframe tbody tr th:only-of-type {\n",
 66 |               "        vertical-align: middle;\n",
 67 |               "    }\n",
 68 |               "\n",
 69 |               "    .dataframe tbody tr th {\n",
 70 |               "        vertical-align: top;\n",
 71 |               "    }\n",
 72 |               "\n",
 73 |               "    .dataframe thead th {\n",
 74 |               "        text-align: right;\n",
 75 |               "    }\n",
 76 |               "</style>\n",
 77 |               "<table border=\"1\" class=\"dataframe\">\n",
 78 |               "  <thead>\n",
 79 |               "    <tr style=\"text-align: right;\">\n",
 80 |               "      <th></th>\n",
 81 |               "      <th>Unnamed: 0</th>\n",
 82 |               "      <th>uid</th>\n",
 83 |               "      <th>iid</th>\n",
 84 |               "      <th>og_rating</th>\n",
 85 |               "      <th>svd_rating</th>\n",
 86 |               "      <th>knn_rating</th>\n",
 87 |               "      <th>svdpp_rating</th>\n",
 88 |               "      <th>slopeone_rating</th>\n",
 89 |               "      <th>baseline_rating</th>\n",
 90 |               "    </tr>\n",
 91 |               "  </thead>\n",
 92 |               "  <tbody>\n",
 93 |               "    <tr>\n",
 94 |               "      <th>0</th>\n",
 95 |               "      <td>0</td>\n",
 96 |               "      <td>610</td>\n",
 97 |               "      <td>163981</td>\n",
 98 |               "      <td>3.5</td>\n",
 99 |               "      <td>3.571637</td>\n",
100 |               "      <td>3.603256</td>\n",
101 |               "      <td>3.538527</td>\n",
102 |               "      <td>3.501078</td>\n",
103 |               "      <td>3.603256</td>\n",
104 |               "    </tr>\n",
105 |               "    <tr>\n",
106 |               "      <th>1</th>\n",
107 |               "      <td>1</td>\n",
108 |               "      <td>610</td>\n",
109 |               "      <td>162350</td>\n",
110 |               "      <td>3.5</td>\n",
111 |               "      <td>3.430078</td>\n",
112 |               "      <td>3.517200</td>\n",
113 |               "      <td>3.323570</td>\n",
114 |               "      <td>2.752871</td>\n",
115 |               "      <td>3.601820</td>\n",
116 |               "    </tr>\n",
117 |               "    <tr>\n",
118 |               "      <th>2</th>\n",
119 |               "      <td>2</td>\n",
120 |               "      <td>610</td>\n",
121 |               "      <td>161582</td>\n",
122 |               "      <td>4.0</td>\n",
123 |               "      <td>3.715722</td>\n",
124 |               "      <td>4.025055</td>\n",
125 |               "      <td>3.836845</td>\n",
126 |               "      <td>4.253110</td>\n",
127 |               "      <td>3.760107</td>\n",
128 |               "    </tr>\n",
129 |               "    <tr>\n",
130 |               "      <th>3</th>\n",
131 |               "      <td>3</td>\n",
132 |               "      <td>610</td>\n",
133 |               "      <td>159093</td>\n",
134 |               "      <td>3.0</td>\n",
135 |               "      <td>3.889187</td>\n",
136 |               "      <td>3.717144</td>\n",
137 |               "      <td>3.499436</td>\n",
138 |               "      <td>3.737276</td>\n",
139 |               "      <td>3.728456</td>\n",
140 |               "    </tr>\n",
141 |               "    <tr>\n",
142 |               "      <th>4</th>\n",
143 |               "      <td>4</td>\n",
144 |               "      <td>610</td>\n",
145 |               "      <td>156726</td>\n",
146 |               "      <td>4.5</td>\n",
147 |               "      <td>3.209341</td>\n",
148 |               "      <td>3.863298</td>\n",
149 |               "      <td>3.004246</td>\n",
150 |               "      <td>1.850029</td>\n",
151 |               "      <td>3.439723</td>\n",
152 |               "    </tr>\n",
153 |               "  </tbody>\n",
154 |               "</table>\n",
155 |               "</div>"
156 |             ],
157 |             "text/plain": [
158 |               "   Unnamed: 0  uid     iid  ...  svdpp_rating  slopeone_rating  baseline_rating\n",
159 |               "0           0  610  163981  ...      3.538527         3.501078         3.603256\n",
160 |               "1           1  610  162350  ...      3.323570         2.752871         3.601820\n",
161 |               "2           2  610  161582  ...      3.836845         4.253110         3.760107\n",
162 |               "3           3  610  159093  ...      3.499436         3.737276         3.728456\n",
163 |               "4           4  610  156726  ...      3.004246         1.850029         3.439723\n",
164 |               "\n",
165 |               "[5 rows x 9 columns]"
166 |             ]
167 |           },
168 |           "metadata": {
169 |             "tags": []
170 |           },
171 |           "execution_count": 2
172 |         }
173 |       ]
174 |     },
175 |     {
176 |       "cell_type": "code",
177 |       "metadata": {
178 |         "id": "Uc2p2jUlSLJC",
179 |         "colab_type": "code",
180 |         "colab": {}
181 |       },
182 |       "source": [
183 |         "# pred_data = pred_data.drop(169639)"
184 |       ],
185 |       "execution_count": null,
186 |       "outputs": []
187 |     },
188 |     {
189 |       "cell_type": "code",
190 |       "metadata": {
191 |         "id": "a9zxxdMpRSSv",
192 |         "colab_type": "code",
193 |         "colab": {
194 |           "base_uri": "https://localhost:8080/",
195 |           "height": 34
196 |         },
197 |         "outputId": "6513a5da-0d2d-4f34-80cf-4d006d57df72"
198 |       },
199 |       "source": [
200 |         "# Number of rows (test predictions); used below as the denominator for RMSE and MAE:\n",
201 |         "T = pred_data.shape[0]\n",
202 |         "print(T)"
203 |       ],
204 |       "execution_count": null,
205 |       "outputs": [
206 |         {
207 |           "output_type": "stream",
208 |           "text": [
209 |             "20168\n"
210 |           ],
211 |           "name": "stdout"
212 |         }
213 |       ]
214 |     },
215 |     {
216 |       "cell_type": "code",
217 |       "metadata": {
218 |         "id": "kqfVFGZ8sxLg",
219 |         "colab_type": "code",
220 |         "colab": {}
221 |       },
222 |       "source": [
223 |         "svd_wt = 0.05\n",
224 |         "knn_wt = 0.6\n",
225 |         "svdpp_wt = 0.4\n",
226 |         "slopeone_wt = 0\n",
227 |         "baseline_wt = 0"
228 |       ],
229 |       "execution_count": null,
230 |       "outputs": []
231 |     },
232 |     {
233 |       "cell_type": "code",
234 |       "metadata": {
235 |         "id": "5lWF0bq2OhV9",
236 |         "colab_type": "code",
237 |         "colab": {
238 |           "base_uri": "https://localhost:8080/",
239 |           "height": 50
240 |         },
241 |         "outputId": "04f216b1-c95a-400a-9599-2c1044bfcaad"
242 |       },
243 |       "source": [
244 |         "rmse = ((pred_data.og_rating - pred_data.knn_rating) ** 2).mean() ** .5\n",
245 |         "print(rmse)\n",
246 |         "mae = (((pred_data.og_rating - pred_data.knn_rating) ** 2) ** .5).mean()\n",
247 |         "print(mae)"
248 |       ],
249 |       "execution_count": null,
250 |       "outputs": [
251 |         {
252 |           "output_type": "stream",
253 |           "text": [
254 |             "0.8527048407985283\n",
255 |             "0.64818310503639\n"
256 |           ],
257 |           "name": "stdout"
258 |         }
259 |       ]
260 |     },
261 |     {
262 |       "cell_type": "code",
263 |       "metadata": {
264 |         "id": "oxcquF0lQOVa",
265 |         "colab_type": "code",
266 |         "colab": {
267 |           "base_uri": "https://localhost:8080/",
268 |           "height": 50
269 |         },
270 |         "outputId": "cfe99592-5a54-4ade-c80f-a5f58817f727"
271 |       },
272 |       "source": [
273 |         "rmse = ((pred_data.og_rating - pred_data.svdpp_rating) ** 2).mean() ** .5\n",
274 |         "print(rmse)\n",
275 |         "mae = (((pred_data.og_rating - pred_data.svdpp_rating) ** 2) ** .5).mean()\n",
276 |         "print(mae)"
277 |       ],
278 |       "execution_count": null,
279 |       "outputs": [
280 |         {
281 |           "output_type": "stream",
282 |           "text": [
283 |             "0.8668435463304792\n",
284 |             "0.6611243052231001\n"
285 |           ],
286 |           "name": "stdout"
287 |         }
288 |       ]
289 |     },
290 |     {
291 |       "cell_type": "code",
292 |       "metadata": {
293 |         "id": "IEcpZCbBLTCS",
294 |         "colab_type": "code",
295 |         "colab": {
296 |           "base_uri": "https://localhost:8080/",
297 |           "height": 50
298 |         },
299 |         "outputId": "a9b0788a-8986-487a-ecae-9d1583d3d51b"
300 |       },
301 |       "source": [
302 |         "sqr_sum = 0\n",
303 |         "abs_sum = 0\n",
304 |         "\n",
305 |         "for ind, row in pred_data.iterrows():\n",
306 |         "  org_r = row['og_rating']\n",
307 |         "  pred_r = svd_wt*row['svd_rating'] + knn_wt*row['knn_rating'] + svdpp_wt*row['svdpp_rating'] + slopeone_wt*row['slopeone_rating'] + baseline_wt*row['baseline_rating']\n",
308 |         "  diff = np.abs(org_r - pred_r)\n",
309 |         "  # print(diff)\n",
310 |         "  abs_sum += diff\n",
311 |         "  sqr_sum += diff**2\n",
312 |         "\n",
313 |         "rmse = np.sqrt(sqr_sum/T)\n",
314 |         "print(\"RMSE\", rmse)\n",
315 |         "mae = abs_sum/T\n",
316 |         "print(\"MAE\", mae)"
317 |       ],
318 |       "execution_count": null,
319 |       "outputs": [
320 |         {
321 |           "output_type": "stream",
322 |           "text": [
323 |             "RMSE 0.8440081164615088\n",
324 |             "MAE 0.6426598370928285\n"
325 |           ],
326 |           "name": "stdout"
327 |         }
328 |       ]
329 |     }
330 |   ]
331 | }


--------------------------------------------------------------------------------
/Code/surprise_model_predictions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "surprise_model_predictions.ipynb",
  7 |       "provenance": [],
  8 |       "collapsed_sections": []
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     }
 14 |   },
 15 |   "cells": [
 16 |     {
 17 |       "cell_type": "code",
 18 |       "metadata": {
 19 |         "id": "fi8oBmWXD1rA",
 20 |         "colab_type": "code",
 21 |         "colab": {
 22 |           "base_uri": "https://localhost:8080/",
 23 |           "height": 118
 24 |         },
 25 |         "outputId": "d86444ed-d23b-4e76-b327-9d766fd375f4"
 26 |       },
 27 |       "source": [
 28 |         "!pip install surprise"
 29 |       ],
 30 |       "execution_count": null,
 31 |       "outputs": [
 32 |         {
 33 |           "output_type": "stream",
 34 |           "text": [
 35 |             "Requirement already satisfied: surprise in /usr/local/lib/python3.6/dist-packages (0.1)\n",
 36 |             "Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.6/dist-packages (from surprise) (1.1.0)\n",
 37 |             "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.12.0)\n",
 38 |             "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (0.14.1)\n",
 39 |             "Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.18.2)\n",
 40 |             "Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.4.1)\n"
 41 |           ],
 42 |           "name": "stdout"
 43 |         }
 44 |       ]
 45 |     },
 46 |     {
 47 |       "cell_type": "code",
 48 |       "metadata": {
 49 |         "id": "J9FfIKsk0bDJ",
 50 |         "colab_type": "code",
 51 |         "colab": {}
 52 |       },
 53 |       "source": [
 54 |         "from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader\n",
 55 |         "from surprise import Dataset\n",
 56 |         "from surprise.model_selection import cross_validate\n",
 57 |         "from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore\n",
 58 |         "from surprise import accuracy\n",
 59 |         "from surprise.model_selection import train_test_split"
 60 |       ],
 61 |       "execution_count": null,
 62 |       "outputs": []
 63 |     },
 64 |     {
 65 |       "cell_type": "code",
 66 |       "metadata": {
 67 |         "id": "mCZrwlMiOZPg",
 68 |         "colab_type": "code",
 69 |         "colab": {}
 70 |       },
 71 |       "source": [
 72 |         "import pandas as pd\n",
 73 |         "import numpy as np"
 74 |       ],
 75 |       "execution_count": null,
 76 |       "outputs": []
 77 |     },
 78 |     {
 79 |       "cell_type": "code",
 80 |       "metadata": {
 81 |         "id": "olgICu7ZYyM7",
 82 |         "colab_type": "code",
 83 |         "colab": {
 84 |           "base_uri": "https://localhost:8080/",
 85 |           "height": 67
 86 |         },
 87 |         "outputId": "1c8bb714-5fe1-497a-9344-130c4fbb91ef"
 88 |       },
 89 |       "source": [
 90 |         "# Load the movielens-1M dataset\n",
 91 |         "data = Dataset.load_builtin('ml-1m')"
 92 |       ],
 93 |       "execution_count": null,
 94 |       "outputs": [
 95 |         {
 96 |           "output_type": "stream",
 97 |           "text": [
 98 |             "Dataset ml-1m could not be found. Do you want to download it? [Y/n] y\n",
 99 |             "Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...\n",
100 |             "Done! Dataset ml-1m has been saved to /root/.surprise_data/ml-1m\n"
101 |           ],
102 |           "name": "stdout"
103 |         }
104 |       ]
105 |     },
106 |     {
107 |       "cell_type": "code",
108 |       "metadata": {
109 |         "id": "DnLayS6VaXZL",
110 |         "colab_type": "code",
111 |         "colab": {}
112 |       },
113 |       "source": [
114 |         "# sample random trainset and testset\n",
115 |         "# test set is made of 20% of the ratings.\n",
116 |         "trainset, testset = train_test_split(data, test_size=.20)"
117 |       ],
118 |       "execution_count": null,
119 |       "outputs": []
120 |     },
121 |     {
122 |       "cell_type": "code",
123 |       "metadata": {
124 |         "id": "AC2Mt8xUyccA",
125 |         "colab_type": "code",
126 |         "colab": {}
127 |       },
128 |       "source": [
129 |         "def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):\n",
130 |         "    \"\"\"Turn train/test rating DataFrames (userId, movieId, rating columns) into a Surprise trainset and testset.\"\"\"\n",
131 |         "    rating_columns = ['userId', 'movieId', 'rating']\n",
132 |         "    reader = Reader(rating_scale=(0, 5))\n",
133 |         "    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)\n",
134 |         "    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)\n",
135 |         "    return train_data.construct_trainset(train_data.raw_ratings), test_data.construct_testset(test_data.raw_ratings)"
136 |       ],
137 |       "execution_count": null,
138 |       "outputs": []
139 |     },
140 |     {
141 |       "cell_type": "code",
142 |       "metadata": {
143 |         "id": "dHBcLq3eyi0T",
144 |         "colab_type": "code",
145 |         "colab": {}
146 |       },
147 |       "source": [
148 |         "file_path_train = 'training_data.csv'\n",
149 |         "file_path_test = 'testing_data.csv'\n",
150 |         "traindf = pd.read_csv(file_path_train)\n",
151 |         "testdf = pd.read_csv(file_path_test)\n",
152 |         "trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)"
153 |       ],
154 |       "execution_count": null,
155 |       "outputs": []
156 |     },
157 |     {
158 |       "cell_type": "code",
159 |       "metadata": {
160 |         "id": "AlitWSrNb2wZ",
161 |         "colab_type": "code",
162 |         "colab": {}
163 |       },
164 |       "source": [
165 |         "def recommendation(algo, trainset, testset):\n",
166 |         "  \"\"\"Fit `algo` on `trainset`, score it on `testset`, and return (rmse, mae, predictions).\"\"\"\n",
167 |         "  algo.fit(trainset)\n",
168 |         "\n",
169 |         "  # Rating predictions for every entry of the testing set\n",
170 |         "  predictions = algo.test(testset)\n",
171 |         "\n",
172 |         "  # Tuple elements are evaluated left to right, so RMSE is computed before MAE\n",
173 |         "  # and the predictions themselves are handed back for later use.\n",
174 |         "  return accuracy.rmse(predictions), accuracy.mae(predictions), predictions"
175 |       ],
176 |       "execution_count": null,
177 |       "outputs": []
178 |     },
179 |     {
180 |       "cell_type": "code",
181 |       "metadata": {
182 |         "id": "9ZblN_7unqoU",
183 |         "colab_type": "code",
184 |         "colab": {}
185 |       },
186 |       "source": [
187 |         "# results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=False)"
188 |       ],
189 |       "execution_count": null,
190 |       "outputs": []
191 |     },
192 |     {
193 |       "cell_type": "markdown",
194 |       "metadata": {
195 |         "id": "Iv9GSCQx24RI",
196 |         "colab_type": "text"
197 |       },
198 |       "source": [
199 |         "#### Experimenting"
200 |       ]
201 |     },
202 |     {
203 |       "cell_type": "code",
204 |       "metadata": {
205 |         "id": "E777XIBI26SQ",
206 |         "colab_type": "code",
207 |         "colab": {
208 |           "base_uri": "https://localhost:8080/",
209 |           "height": 84
210 |         },
211 |         "outputId": "b5629514-2562-4eda-b44f-67cfbfa18a8a"
212 |       },
213 |       "source": [
214 |         "print('Using ALS')\n",
215 |         "bsl_options = {'method': 'als',\n",
216 |         "               'n_epochs': 5,\n",
217 |         "               'reg_u': 12,\n",
218 |         "               'reg_i': 5\n",
219 |         "               }\n",
220 |         "algo = BaselineOnly(bsl_options=bsl_options)\n",
221 |         "test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)"
222 |       ],
223 |       "execution_count": null,
224 |       "outputs": [
225 |         {
226 |           "output_type": "stream",
227 |           "text": [
228 |             "Using ALS\n",
229 |             "Estimating biases using als...\n",
230 |             "RMSE: 0.8677\n",
231 |             "MAE:  0.6659\n"
232 |           ],
233 |           "name": "stdout"
234 |         }
235 |       ]
236 |     },
237 |     {
238 |       "cell_type": "code",
239 |       "metadata": {
240 |         "id": "luHqcF-H30jl",
241 |         "colab_type": "code",
242 |         "colab": {
243 |           "base_uri": "https://localhost:8080/",
244 |           "height": 84
245 |         },
246 |         "outputId": "a96a611a-eab4-49ee-e34d-c99c847b584f"
247 |       },
248 |       "source": [
249 |         "print('Using default baseline options (SGD options below are disabled)')\n",
250 |         "# bsl_options = {'method': 'sgd',\n",
251 |         "#                'learning_rate': .00005,\n",
252 |         "#                }\n",
253 |         "algo = BaselineOnly()  # no bsl_options passed, so the library defaults apply (ALS, per the cell output)\n",
254 |         "test_rmse, test_mae, test_pred = recommendation(algo, trainset, testset)"
255 |       ],
256 |       "execution_count": null,
257 |       "outputs": [
258 |         {
259 |           "output_type": "stream",
260 |           "text": [
261 |             "Using SGD\n",
262 |             "Estimating biases using als...\n",
263 |             "RMSE: 0.8735\n",
264 |             "MAE:  0.6718\n"
265 |           ],
266 |           "name": "stdout"
267 |         }
268 |       ]
269 |     },
270 |     {
271 |       "cell_type": "markdown",
272 |       "metadata": {
273 |         "id": "gBCl6LOoBPgQ",
274 |         "colab_type": "text"
275 |       },
276 |       "source": [
277 |         "##### Calculating predictions for the top methods:"
278 |       ]
279 |     },
280 |     {
281 |       "cell_type": "code",
282 |       "metadata": {
283 |         "id": "KuTTB-6Th8ZN",
284 |         "colab_type": "code",
285 |         "colab": {
286 |           "base_uri": "https://localhost:8080/",
287 |           "height": 101
288 |         },
289 |         "outputId": "a202ac5a-0dc8-4a9b-9349-847ce601c9fc"
290 |       },
291 |       "source": [
292 |         "# KNNBaseline\n",
293 |         "\n",
294 |         "algo = KNNBaseline()\n",
295 |         "test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(algo, trainset, testset)"
296 |       ],
297 |       "execution_count": null,
298 |       "outputs": [
299 |         {
300 |           "output_type": "stream",
301 |           "text": [
302 |             "Estimating biases using als...\n",
303 |             "Computing the msd similarity matrix...\n",
304 |             "Done computing similarity matrix.\n",
305 |             "RMSE: 0.8763\n",
306 |             "MAE:  0.6660\n"
307 |           ],
308 |           "name": "stdout"
309 |         }
310 |       ]
311 |     },
312 |     {
313 |       "cell_type": "code",
314 |       "metadata": {
315 |         "id": "ndRC8sVBBoje",
316 |         "colab_type": "code",
317 |         "colab": {
318 |           "base_uri": "https://localhost:8080/",
319 |           "height": 50
320 |         },
321 |         "outputId": "8f1c4945-631c-4a2c-e7ef-92a13db48983"
322 |       },
323 |       "source": [
324 |         "# SlopeOne\n",
325 |         "\n",
326 |         "algo = SlopeOne()\n",
327 |         "test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(algo, trainset, testset)"
328 |       ],
329 |       "execution_count": null,
330 |       "outputs": [
331 |         {
332 |           "output_type": "stream",
333 |           "text": [
334 |             "RMSE: 0.9070\n",
335 |             "MAE:  0.7145\n"
336 |           ],
337 |           "name": "stdout"
338 |         }
339 |       ]
340 |     },
341 |     {
342 |       "cell_type": "code",
343 |       "metadata": {
344 |         "id": "INAgGkTFBxlT",
345 |         "colab_type": "code",
346 |         "colab": {
347 |           "base_uri": "https://localhost:8080/",
348 |           "height": 50
349 |         },
350 |         "outputId": "0e9a390a-e87d-4f55-a97e-6284f7348074"
351 |       },
352 |       "source": [
353 |         "# SVD\n",
354 |         "\n",
355 |         "algo = SVD()\n",
356 |         "test_svd_rmse, test_svd_mae, test_svd_pred  = recommendation(algo, trainset, testset)"
357 |       ],
358 |       "execution_count": null,
359 |       "outputs": [
360 |         {
361 |           "output_type": "stream",
362 |           "text": [
363 |             "RMSE: 0.8743\n",
364 |             "MAE:  0.6858\n"
365 |           ],
366 |           "name": "stdout"
367 |         }
368 |       ]
369 |     },
370 |     {
371 |       "cell_type": "code",
372 |       "metadata": {
373 |         "id": "uJCkdey1B02t",
374 |         "colab_type": "code",
375 |         "colab": {
376 |           "base_uri": "https://localhost:8080/",
377 |           "height": 50
378 |         },
379 |         "outputId": "662ecf2e-f6b3-4f46-d55e-40c825c9009b"
380 |       },
381 |       "source": [
382 |         "# SVDpp\n",
383 |         "\n",
384 |         "algo = SVDpp()\n",
385 |         "test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)"
386 |       ],
387 |       "execution_count": null,
388 |       "outputs": [
389 |         {
390 |           "output_type": "stream",
391 |           "text": [
392 |             "RMSE: 0.8697\n",
393 |             "MAE:  0.6643\n"
394 |           ],
395 |           "name": "stdout"
396 |         }
397 |       ]
398 |     },
399 |     {
400 |       "cell_type": "code",
401 |       "metadata": {
402 |         "id": "nprC9tRcymxk",
403 |         "colab_type": "code",
404 |         "colab": {
405 |           "base_uri": "https://localhost:8080/",
406 |           "height": 67
407 |         },
408 |         "outputId": "8b64d3ec-8a8b-4963-8a0f-54ef0ae090c4"
409 |       },
410 |       "source": [
411 |         "# BaselineOnly()\n",
412 |         "\n",
413 |         "algo = BaselineOnly()\n",
414 |         "test_base_rmse, test_base_mae, test_base_pred  = recommendation(algo, trainset, testset)"
415 |       ],
416 |       "execution_count": null,
417 |       "outputs": [
418 |         {
419 |           "output_type": "stream",
420 |           "text": [
421 |             "Estimating biases using als...\n",
422 |             "RMSE: 0.8735\n",
423 |             "MAE:  0.6718\n"
424 |           ],
425 |           "name": "stdout"
426 |         }
427 |       ]
428 |     },
429 |     {
430 |       "cell_type": "code",
431 |       "metadata": {
432 |         "id": "5OqtsT-5MPAh",
433 |         "colab_type": "code",
434 |         "colab": {}
435 |       },
436 |       "source": [
437 |         "test_pred_df = pd.DataFrame(columns= ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating', 'baseline_rating'])"
438 |       ],
439 |       "execution_count": null,
440 |       "outputs": []
441 |     },
442 |     {
443 |       "cell_type": "code",
444 |       "metadata": {
445 |         "id": "ExyS3zHlzxsP",
446 |         "colab_type": "code",
447 |         "colab": {
448 |           "base_uri": "https://localhost:8080/",
449 |           "height": 34
450 |         },
451 |         "outputId": "3e7f502e-2286-4806-cf93-217aa64a4b08"
452 |       },
453 |       "source": [
454 |         "num_test = len(test_base_pred)\n",
455 |         "print(num_test)"
456 |       ],
457 |       "execution_count": null,
458 |       "outputs": [
459 |         {
460 |           "output_type": "stream",
461 |           "text": [
462 |             "200042\n"
463 |           ],
464 |           "name": "stdout"
465 |         }
466 |       ]
467 |     },
468 |     {
469 |       "cell_type": "markdown",
470 |       "metadata": {
471 |         "id": "qHYtUbMyrOA9",
472 |         "colab_type": "text"
473 |       },
474 |       "source": [
475 |         "##### Storing testing set predictions:"
476 |       ]
477 |     },
478 |     {
479 |       "cell_type": "code",
480 |       "metadata": {
481 |         "id": "YSPLXRAgzfka",
482 |         "colab_type": "code",
483 |         "colab": {}
484 |       },
485 |       "source": [
486 |         "# Build all rows first and create the DataFrame once; concatenating inside the loop is quadratic.\n",
487 |         "pred_columns = ['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating', 'baseline_rating']\n",
488 |         "aligned_preds = zip(test_svd_pred, test_knn_pred, test_svdpp_pred, test_slopeone_pred, test_base_pred)\n",
489 |         "rows = [[svd.uid, svd.iid, svd.r_ui, svd.est, knn.est, svdpp.est, slopeone.est, baseline.est]\n",
490 |         "        for svd, knn, svdpp, slopeone, baseline in aligned_preds]\n",
491 |         "\n",
492 |         "# The original loop prepended each new row, so reverse to keep the same final row order.\n",
493 |         "rows.reverse()\n",
494 |         "test_pred_df = pd.DataFrame(rows, columns=pred_columns)"
495 |       ],
496 |       "execution_count": null,
497 |       "outputs": []
498 |     },
499 |     {
500 |       "cell_type": "code",
501 |       "metadata": {
502 |         "id": "fJdb0S-A5PiX",
503 |         "colab_type": "code",
504 |         "colab": {
505 |           "base_uri": "https://localhost:8080/",
506 |           "height": 402
507 |         },
508 |         "outputId": "7e1bbd47-57f4-464e-ea62-95cda20ad0b5"
509 |       },
510 |       "source": [
511 |         "test_pred_df"
512 |       ],
513 |       "execution_count": null,
514 |       "outputs": [
515 |         {
516 |           "output_type": "execute_result",
517 |           "data": {
518 |             "text/html": [
519 |               "<div>\n",
520 |               "<style scoped>\n",
521 |               "    .dataframe tbody tr th:only-of-type {\n",
522 |               "        vertical-align: middle;\n",
523 |               "    }\n",
524 |               "\n",
525 |               "    .dataframe tbody tr th {\n",
526 |               "        vertical-align: top;\n",
527 |               "    }\n",
528 |               "\n",
529 |               "    .dataframe thead th {\n",
530 |               "        text-align: right;\n",
531 |               "    }\n",
532 |               "</style>\n",
533 |               "<table border=\"1\" class=\"dataframe\">\n",
534 |               "  <thead>\n",
535 |               "    <tr style=\"text-align: right;\">\n",
536 |               "      <th></th>\n",
537 |               "      <th>uid</th>\n",
538 |               "      <th>iid</th>\n",
539 |               "      <th>og_rating</th>\n",
540 |               "      <th>svd_rating</th>\n",
541 |               "      <th>knn_rating</th>\n",
542 |               "      <th>svdpp_rating</th>\n",
543 |               "      <th>slopeone_rating</th>\n",
544 |               "      <th>baseline_rating</th>\n",
545 |               "    </tr>\n",
546 |               "  </thead>\n",
547 |               "  <tbody>\n",
548 |               "    <tr>\n",
549 |               "      <th>0</th>\n",
550 |               "      <td>695</td>\n",
551 |               "      <td>2791</td>\n",
552 |               "      <td>4.0</td>\n",
553 |               "      <td>3.507685</td>\n",
554 |               "      <td>3.815840</td>\n",
555 |               "      <td>3.936685</td>\n",
556 |               "      <td>4.240711</td>\n",
557 |               "      <td>4.146045</td>\n",
558 |               "    </tr>\n",
559 |               "    <tr>\n",
560 |               "      <th>1</th>\n",
561 |               "      <td>6016</td>\n",
562 |               "      <td>3668</td>\n",
563 |               "      <td>3.0</td>\n",
564 |               "      <td>3.404877</td>\n",
565 |               "      <td>3.557922</td>\n",
566 |               "      <td>3.658979</td>\n",
567 |               "      <td>3.390132</td>\n",
568 |               "      <td>3.442060</td>\n",
569 |               "    </tr>\n",
570 |               "    <tr>\n",
571 |               "      <th>2</th>\n",
572 |               "      <td>5482</td>\n",
573 |               "      <td>1221</td>\n",
574 |               "      <td>5.0</td>\n",
575 |               "      <td>4.622452</td>\n",
576 |               "      <td>4.491665</td>\n",
577 |               "      <td>4.451363</td>\n",
578 |               "      <td>4.669042</td>\n",
579 |               "      <td>4.554867</td>\n",
580 |               "    </tr>\n",
581 |               "    <tr>\n",
582 |               "      <th>3</th>\n",
583 |               "      <td>3389</td>\n",
584 |               "      <td>2959</td>\n",
585 |               "      <td>4.0</td>\n",
586 |               "      <td>3.899992</td>\n",
587 |               "      <td>3.217574</td>\n",
588 |               "      <td>4.235280</td>\n",
589 |               "      <td>3.559392</td>\n",
590 |               "      <td>3.450094</td>\n",
591 |               "    </tr>\n",
592 |               "    <tr>\n",
593 |               "      <th>4</th>\n",
594 |               "      <td>4303</td>\n",
595 |               "      <td>608</td>\n",
596 |               "      <td>4.0</td>\n",
597 |               "      <td>4.093749</td>\n",
598 |               "      <td>4.250497</td>\n",
599 |               "      <td>4.757454</td>\n",
600 |               "      <td>4.282707</td>\n",
601 |               "      <td>4.180708</td>\n",
602 |               "    </tr>\n",
603 |               "    <tr>\n",
604 |               "      <th>...</th>\n",
605 |               "      <td>...</td>\n",
606 |               "      <td>...</td>\n",
607 |               "      <td>...</td>\n",
608 |               "      <td>...</td>\n",
609 |               "      <td>...</td>\n",
610 |               "      <td>...</td>\n",
611 |               "      <td>...</td>\n",
612 |               "      <td>...</td>\n",
613 |               "    </tr>\n",
614 |               "    <tr>\n",
615 |               "      <th>200037</th>\n",
616 |               "      <td>1447</td>\n",
617 |               "      <td>3412</td>\n",
618 |               "      <td>4.0</td>\n",
619 |               "      <td>2.678937</td>\n",
620 |               "      <td>3.412608</td>\n",
621 |               "      <td>3.309891</td>\n",
622 |               "      <td>3.192129</td>\n",
623 |               "      <td>3.238168</td>\n",
624 |               "    </tr>\n",
625 |               "    <tr>\n",
626 |               "      <th>200038</th>\n",
627 |               "      <td>301</td>\n",
628 |               "      <td>3396</td>\n",
629 |               "      <td>4.0</td>\n",
630 |               "      <td>4.292583</td>\n",
631 |               "      <td>4.228340</td>\n",
632 |               "      <td>4.594647</td>\n",
633 |               "      <td>4.128157</td>\n",
634 |               "      <td>4.114891</td>\n",
635 |               "    </tr>\n",
636 |               "    <tr>\n",
637 |               "      <th>200039</th>\n",
638 |               "      <td>984</td>\n",
639 |               "      <td>3927</td>\n",
640 |               "      <td>3.0</td>\n",
641 |               "      <td>3.537646</td>\n",
642 |               "      <td>3.446079</td>\n",
643 |               "      <td>3.486974</td>\n",
644 |               "      <td>3.514210</td>\n",
645 |               "      <td>3.475889</td>\n",
646 |               "    </tr>\n",
647 |               "    <tr>\n",
648 |               "      <th>200040</th>\n",
649 |               "      <td>4672</td>\n",
650 |               "      <td>2369</td>\n",
651 |               "      <td>4.0</td>\n",
652 |               "      <td>2.638634</td>\n",
653 |               "      <td>2.882440</td>\n",
654 |               "      <td>2.676785</td>\n",
655 |               "      <td>2.742415</td>\n",
656 |               "      <td>2.817915</td>\n",
657 |               "    </tr>\n",
658 |               "    <tr>\n",
659 |               "      <th>200041</th>\n",
660 |               "      <td>5234</td>\n",
661 |               "      <td>3556</td>\n",
662 |               "      <td>5.0</td>\n",
663 |               "      <td>3.970203</td>\n",
664 |               "      <td>3.656631</td>\n",
665 |               "      <td>3.988456</td>\n",
666 |               "      <td>3.779991</td>\n",
667 |               "      <td>3.712596</td>\n",
668 |               "    </tr>\n",
669 |               "  </tbody>\n",
670 |               "</table>\n",
671 |               "<p>200042 rows × 8 columns</p>\n",
672 |               "</div>"
673 |             ],
674 |             "text/plain": [
675 |               "         uid   iid  og_rating  ...  svdpp_rating  slopeone_rating  baseline_rating\n",
676 |               "0        695  2791        4.0  ...      3.936685         4.240711         4.146045\n",
677 |               "1       6016  3668        3.0  ...      3.658979         3.390132         3.442060\n",
678 |               "2       5482  1221        5.0  ...      4.451363         4.669042         4.554867\n",
679 |               "3       3389  2959        4.0  ...      4.235280         3.559392         3.450094\n",
680 |               "4       4303   608        4.0  ...      4.757454         4.282707         4.180708\n",
681 |               "...      ...   ...        ...  ...           ...              ...              ...\n",
682 |               "200037  1447  3412        4.0  ...      3.309891         3.192129         3.238168\n",
683 |               "200038   301  3396        4.0  ...      4.594647         4.128157         4.114891\n",
684 |               "200039   984  3927        3.0  ...      3.486974         3.514210         3.475889\n",
685 |               "200040  4672  2369        4.0  ...      2.676785         2.742415         2.817915\n",
686 |               "200041  5234  3556        5.0  ...      3.988456         3.779991         3.712596\n",
687 |               "\n",
688 |               "[200042 rows x 8 columns]"
689 |             ]
690 |           },
691 |           "metadata": {
692 |             "tags": []
693 |           },
694 |           "execution_count": 20
695 |         }
696 |       ]
697 |     },
698 |     {
699 |       "cell_type": "code",
700 |       "metadata": {
701 |         "id": "tSwp06K6JClS",
702 |         "colab_type": "code",
703 |         "colab": {}
704 |       },
705 |       "source": [
706 |         "test_pred_df.to_csv('test_prediction.csv')"
707 |       ],
708 |       "execution_count": null,
709 |       "outputs": []
710 |     }
711 |   ]
712 | }


--------------------------------------------------------------------------------
/Code/cold_start_analysis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "cold_start_analysis.ipynb",
  7 |       "provenance": [],
  8 |       "collapsed_sections": []
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     }
 14 |   },
 15 |   "cells": [
 16 |     {
 17 |       "cell_type": "markdown",
 18 |       "metadata": {
 19 |         "id": "S5rY5TFX_Fzq",
 20 |         "colab_type": "text"
 21 |       },
 22 |       "source": [
 23 |         "#### Cold Start Analysis:\n",
 24 |         "\n",
 25 |         "This notebook analyses the performance of different approaches for a new user or a user with few interactions with the system, namely the cold start problem. \\\\\n",
 26 |         "We compute the RMSE and MAE for those users who have rated fewer than 18 movies and so on. \\\\\n",
 27 |         "We also observe the performance of the approaches for users who have rated more than 1000 movies."
 28 |       ]
 29 |     },
 30 |     {
 31 |       "cell_type": "code",
 32 |       "metadata": {
 33 |         "id": "l6H9h87h_DXr",
 34 |         "colab_type": "code",
 35 |         "colab": {}
 36 |       },
 37 |       "source": [
 38 |         "!pip install surprise"
 39 |       ],
 40 |       "execution_count": null,
 41 |       "outputs": []
 42 |     },
 43 |     {
 44 |       "cell_type": "code",
 45 |       "metadata": {
 46 |         "id": "G8QgQWPZ3kFu",
 47 |         "colab_type": "code",
 48 |         "colab": {}
 49 |       },
 50 |       "source": [
 51 |         "import pickle\n",
 52 |         "import os\n",
 53 |         "\n",
 54 |         "import pandas as pd\n",
 55 |         "\n",
 56 |         "from surprise import SVD, SVDpp\n",
 57 |         "from surprise import KNNBasic, KNNBaseline, BaselineOnly\n",
 58 |         "from surprise import Dataset                                                     \n",
 59 |         "from surprise import Reader                                                      \n",
 60 |         "from surprise import dump\n",
 61 |         "from surprise.accuracy import rmse"
 62 |       ],
 63 |       "execution_count": null,
 64 |       "outputs": []
 65 |     },
 66 |     {
 67 |       "cell_type": "code",
 68 |       "metadata": {
 69 |         "id": "Y3nN4GjS3sjT",
 70 |         "colab_type": "code",
 71 |         "colab": {}
 72 |       },
 73 |       "source": [
 74 |         "def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):\n",
 75 |         "    reader = Reader(rating_scale=(0, 5))\n",
 76 |         "    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)\n",
 77 |         "    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)\n",
 78 |         "    trainset = trainset.construct_trainset(trainset.raw_ratings)\n",
 79 |         "    testset = testset.construct_testset(testset.raw_ratings)\n",
 80 |         "    return trainset, testset"
 81 |       ],
 82 |       "execution_count": null,
 83 |       "outputs": []
 84 |     },
 85 |     {
 86 |       "cell_type": "code",
 87 |       "metadata": {
 88 |         "id": "z6OJ9U-E3zQP",
 89 |         "colab_type": "code",
 90 |         "colab": {}
 91 |       },
 92 |       "source": [
 93 |         "file_path_train = 'training_data.csv'\n",
 94 |         "file_path_test = 'testing_data.csv'\n",
 95 |         "traindf = pd.read_csv(file_path_train)\n",
 96 |         "testdf = pd.read_csv(file_path_test)\n",
 97 |         "trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)"
 98 |       ],
 99 |       "execution_count": null,
100 |       "outputs": []
101 |     },
102 |     {
103 |       "cell_type": "code",
104 |       "metadata": {
105 |         "id": "-aF-xIYxX4aB",
106 |         "colab_type": "code",
107 |         "colab": {
108 |           "base_uri": "https://localhost:8080/",
109 |           "height": 195
110 |         },
111 |         "outputId": "d1aa662c-301c-4d5f-cedf-23ed3984f02e"
112 |       },
113 |       "source": [
114 |         "traindf.head()"
115 |       ],
116 |       "execution_count": null,
117 |       "outputs": [
118 |         {
119 |           "output_type": "execute_result",
120 |           "data": {
121 |             "text/html": [
122 |               "<div>\n",
123 |               "<style scoped>\n",
124 |               "    .dataframe tbody tr th:only-of-type {\n",
125 |               "        vertical-align: middle;\n",
126 |               "    }\n",
127 |               "\n",
128 |               "    .dataframe tbody tr th {\n",
129 |               "        vertical-align: top;\n",
130 |               "    }\n",
131 |               "\n",
132 |               "    .dataframe thead th {\n",
133 |               "        text-align: right;\n",
134 |               "    }\n",
135 |               "</style>\n",
136 |               "<table border=\"1\" class=\"dataframe\">\n",
137 |               "  <thead>\n",
138 |               "    <tr style=\"text-align: right;\">\n",
139 |               "      <th></th>\n",
140 |               "      <th>userId</th>\n",
141 |               "      <th>movieId</th>\n",
142 |               "      <th>rating</th>\n",
143 |               "      <th>timestamp</th>\n",
144 |               "      <th>genres</th>\n",
145 |               "      <th>tag</th>\n",
146 |               "    </tr>\n",
147 |               "  </thead>\n",
148 |               "  <tbody>\n",
149 |               "    <tr>\n",
150 |               "      <th>0</th>\n",
151 |               "      <td>1</td>\n",
152 |               "      <td>1</td>\n",
153 |               "      <td>4.0</td>\n",
154 |               "      <td>964982703</td>\n",
155 |               "      <td>['Adventure', 'Animation', 'Children', 'Comedy...</td>\n",
156 |               "      <td>[]</td>\n",
157 |               "    </tr>\n",
158 |               "    <tr>\n",
159 |               "      <th>1</th>\n",
160 |               "      <td>1</td>\n",
161 |               "      <td>6</td>\n",
162 |               "      <td>4.0</td>\n",
163 |               "      <td>964982224</td>\n",
164 |               "      <td>['Action', 'Crime', 'Thriller']</td>\n",
165 |               "      <td>[]</td>\n",
166 |               "    </tr>\n",
167 |               "    <tr>\n",
168 |               "      <th>2</th>\n",
169 |               "      <td>1</td>\n",
170 |               "      <td>47</td>\n",
171 |               "      <td>5.0</td>\n",
172 |               "      <td>964983815</td>\n",
173 |               "      <td>['Mystery', 'Thriller']</td>\n",
174 |               "      <td>[]</td>\n",
175 |               "    </tr>\n",
176 |               "    <tr>\n",
177 |               "      <th>3</th>\n",
178 |               "      <td>1</td>\n",
179 |               "      <td>50</td>\n",
180 |               "      <td>5.0</td>\n",
181 |               "      <td>964982931</td>\n",
182 |               "      <td>['Crime', 'Mystery', 'Thriller']</td>\n",
183 |               "      <td>[]</td>\n",
184 |               "    </tr>\n",
185 |               "    <tr>\n",
186 |               "      <th>4</th>\n",
187 |               "      <td>1</td>\n",
188 |               "      <td>70</td>\n",
189 |               "      <td>3.0</td>\n",
190 |               "      <td>964982400</td>\n",
191 |               "      <td>['Action', 'Comedy', 'Horror', 'Thriller']</td>\n",
192 |               "      <td>[]</td>\n",
193 |               "    </tr>\n",
194 |               "  </tbody>\n",
195 |               "</table>\n",
196 |               "</div>"
197 |             ],
198 |             "text/plain": [
199 |               "   userId  movieId  ...                                             genres  tag\n",
200 |               "0       1        1  ...  ['Adventure', 'Animation', 'Children', 'Comedy...   []\n",
201 |               "1       1        6  ...                    ['Action', 'Crime', 'Thriller']   []\n",
202 |               "2       1       47  ...                            ['Mystery', 'Thriller']   []\n",
203 |               "3       1       50  ...                   ['Crime', 'Mystery', 'Thriller']   []\n",
204 |               "4       1       70  ...         ['Action', 'Comedy', 'Horror', 'Thriller']   []\n",
205 |               "\n",
206 |               "[5 rows x 6 columns]"
207 |             ]
208 |           },
209 |           "metadata": {
210 |             "tags": []
211 |           },
212 |           "execution_count": 5
213 |         }
214 |       ]
215 |     },
216 |     {
217 |       "cell_type": "code",
218 |       "metadata": {
219 |         "id": "TzdYPykH4DMR",
220 |         "colab_type": "code",
221 |         "colab": {
222 |           "base_uri": "https://localhost:8080/",
223 |           "height": 50
224 |         },
225 |         "outputId": "91278275-9801-4593-f78c-3dc9ca7caf08"
226 |       },
227 |       "source": [
228 |         "algo_svd = SVD()     \n",
229 |         "algo_svdpp = SVDpp()                                    \n",
230 |         "algo_knn = KNNBasic()\n",
231 |         "\n",
232 |         "\n",
233 |         "algo_svd.fit(trainset)                             \n",
234 |         "predictions_svd = algo_svd.test(testset)\n",
235 |         "\n",
236 |         "algo_svdpp.fit(trainset)                             \n",
237 |         "predictions_svdpp = algo_svdpp.test(testset)\n",
238 |         "\n",
239 |         "algo_knn.fit(trainset)\n",
240 |         "predictions_knn = algo_knn.test(testset)\n",
241 |         "\n",
242 |         "# rmse(predictions_svd)\n",
243 |         "# rmse(predictions_knn)                                                                           \n",
244 |         "\n",
245 |         "dump.dump('./dump_SVD', predictions_svd, algo_svd)\n",
246 |         "dump.dump('./dump_SVDpp', predictions_svdpp, algo_svdpp)\n",
247 |         "dump.dump('./dump_KNN', predictions_knn, algo_knn)"
248 |       ],
249 |       "execution_count": null,
250 |       "outputs": [
251 |         {
252 |           "output_type": "stream",
253 |           "text": [
254 |             "Computing the msd similarity matrix...\n",
255 |             "Done computing similarity matrix.\n"
256 |           ],
257 |           "name": "stdout"
258 |         }
259 |       ]
260 |     },
261 |     {
262 |       "cell_type": "code",
263 |       "metadata": {
264 |         "id": "IWpBO-jw4gR4",
265 |         "colab_type": "code",
266 |         "colab": {}
267 |       },
268 |       "source": [
269 |         "df_svd = pd.DataFrame(predictions_svd, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
270 |         "df_svdpp = pd.DataFrame(predictions_svdpp, columns=['uid', 'iid', 'rui', 'est', 'details'])        \n",
271 |         "df_knn = pd.DataFrame(predictions_knn, columns=['uid', 'iid', 'rui', 'est', 'details']) "
272 |       ],
273 |       "execution_count": null,
274 |       "outputs": []
275 |     },
276 |     {
277 |       "cell_type": "code",
278 |       "metadata": {
279 |         "id": "5ytiPn_6Z4D5",
280 |         "colab_type": "code",
281 |         "colab": {
282 |           "base_uri": "https://localhost:8080/",
283 |           "height": 67
284 |         },
285 |         "outputId": "c172233e-b73b-4226-faea-c9505e9c0b09"
286 |       },
287 |       "source": [
288 |         "sim_options = {'name': 'pearson_baseline',\n",
289 |         "               'user_based': False  # compute  similarities between items\n",
290 |         "               }\n",
291 |         "# algo = KNNBaseline(sim_options=sim_options)\n",
292 |         "algo_knnbaseline = KNNBaseline(sim_options=sim_options)\n",
293 |         "algo_knnbaseline.fit(trainset)\n",
294 |         "predictions_knnbaseline = algo_knnbaseline.test(testset)"
295 |       ],
296 |       "execution_count": null,
297 |       "outputs": [
298 |         {
299 |           "output_type": "stream",
300 |           "text": [
301 |             "Estimating biases using als...\n",
302 |             "Computing the pearson_baseline similarity matrix...\n",
303 |             "Done computing similarity matrix.\n"
304 |           ],
305 |           "name": "stdout"
306 |         }
307 |       ]
308 |     },
309 |     {
310 |       "cell_type": "code",
311 |       "metadata": {
312 |         "id": "-qpAZxicab7y",
313 |         "colab_type": "code",
314 |         "colab": {}
315 |       },
316 |       "source": [
317 |         "df_knnbaseline = pd.DataFrame(predictions_knnbaseline, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
318 |         "df_knnbaseline['err'] = abs(df_knnbaseline.est - df_knnbaseline.rui)\n",
319 |         "df_knnbaseline['sqr_err'] = (df_knnbaseline.est - df_knnbaseline.rui)**2"
320 |       ],
321 |       "execution_count": null,
322 |       "outputs": []
323 |     },
324 |     {
325 |       "cell_type": "code",
326 |       "metadata": {
327 |         "id": "yIlRty-X4z2T",
328 |         "colab_type": "code",
329 |         "colab": {}
330 |       },
331 |       "source": [
332 |         "df_svd['err'] = abs(df_svd.est - df_svd.rui)\n",
333 |         "df_svdpp['err'] = abs(df_svdpp.est - df_svdpp.rui)\n",
334 |         "df_knn['err'] = abs(df_knn.est - df_knn.rui)"
335 |       ],
336 |       "execution_count": null,
337 |       "outputs": []
338 |     },
339 |     {
340 |       "cell_type": "code",
341 |       "metadata": {
342 |         "id": "VdC0IyDxY4xB",
343 |         "colab_type": "code",
344 |         "colab": {}
345 |       },
346 |       "source": [
347 |         "df_svd['sqr_err'] = (df_svd.est - df_svd.rui)**2\n",
348 |         "df_svdpp['sqr_err'] = (df_svdpp.est - df_svdpp.rui)**2\n",
349 |         "df_knn['sqr_err'] = (df_knn.est - df_knn.rui)**2"
350 |       ],
351 |       "execution_count": null,
352 |       "outputs": []
353 |     },
354 |     {
355 |       "cell_type": "code",
356 |       "metadata": {
357 |         "id": "t4gOt7SHcVnO",
358 |         "colab_type": "code",
359 |         "colab": {
360 |           "base_uri": "https://localhost:8080/",
361 |           "height": 34
362 |         },
363 |         "outputId": "21a9276e-f43b-4dfc-9afb-65987e0cd1f1"
364 |       },
365 |       "source": [
366 |         "algo_baselineonly = BaselineOnly()\n",
367 |         "algo_baselineonly.fit(trainset)\n",
368 |         "predictions_baselineonly = algo_baselineonly.test(testset)\n",
369 |         "\n",
370 |         "df_baselineonly = pd.DataFrame(predictions_baselineonly, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
371 |         "df_baselineonly['err'] = abs(df_baselineonly.est - df_baselineonly.rui)\n",
372 |         "df_baselineonly['sqr_err'] = (df_baselineonly.est - df_baselineonly.rui)**2\n",
373 |         "df_baselineonly['Iu'] = df_baselineonly.uid.apply(get_Iu)"
374 |       ],
375 |       "execution_count": null,
376 |       "outputs": [
377 |         {
378 |           "output_type": "stream",
379 |           "text": [
380 |             "Estimating biases using als...\n"
381 |           ],
382 |           "name": "stdout"
383 |         }
384 |       ]
385 |     },
386 |     {
387 |       "cell_type": "code",
388 |       "metadata": {
389 |         "id": "falRCQt3dYFC",
390 |         "colab_type": "code",
391 |         "colab": {
392 |           "base_uri": "https://localhost:8080/",
393 |           "height": 67
394 |         },
395 |         "outputId": "f3e8404f-77c9-427e-c663-f9ddadd0fb11"
396 |       },
397 |       "source": [
398 |         "sim_options = {'name': 'pearson_baseline',\n",
399 |         "               'user_based': True  # compute similarities between users\n",
400 |         "               }\n",
401 |         "algo_knnbaseline_user = KNNBaseline(sim_options=sim_options)\n",
402 |         "algo_knnbaseline_user.fit(trainset)\n",
403 |         "predictions_knnbaseline_user = algo_knnbaseline_user.test(testset)\n",
404 |         "\n",
405 |         "df_knn_user = pd.DataFrame(predictions_knnbaseline_user, columns=['uid', 'iid', 'rui', 'est', 'details']) \n",
406 |         "df_knn_user['err'] = abs(df_knn_user.est - df_knn_user.rui)\n",
407 |         "df_knn_user['sqr_err'] = (df_knn_user.est - df_knn_user.rui)**2\n",
408 |         "df_knn_user['Iu'] = df_knn_user.uid.apply(get_Iu)"
409 |       ],
410 |       "execution_count": null,
411 |       "outputs": [
412 |         {
413 |           "output_type": "stream",
414 |           "text": [
415 |             "Estimating biases using als...\n",
416 |             "Computing the pearson_baseline similarity matrix...\n",
417 |             "Done computing similarity matrix.\n"
418 |           ],
419 |           "name": "stdout"
420 |         }
421 |       ]
422 |     },
423 |     {
424 |       "cell_type": "code",
425 |       "metadata": {
426 |         "id": "SSoLuqrV65pK",
427 |         "colab_type": "code",
428 |         "colab": {
429 |           "base_uri": "https://localhost:8080/",
430 |           "height": 195
431 |         },
432 |         "outputId": "897911ad-5086-4798-914e-58df7da6b068"
433 |       },
434 |       "source": [
435 |         "df_svd.head()"
436 |       ],
437 |       "execution_count": null,
438 |       "outputs": [
439 |         {
440 |           "output_type": "execute_result",
441 |           "data": {
442 |             "text/html": [
443 |               "<div>\n",
444 |               "<style scoped>\n",
445 |               "    .dataframe tbody tr th:only-of-type {\n",
446 |               "        vertical-align: middle;\n",
447 |               "    }\n",
448 |               "\n",
449 |               "    .dataframe tbody tr th {\n",
450 |               "        vertical-align: top;\n",
451 |               "    }\n",
452 |               "\n",
453 |               "    .dataframe thead th {\n",
454 |               "        text-align: right;\n",
455 |               "    }\n",
456 |               "</style>\n",
457 |               "<table border=\"1\" class=\"dataframe\">\n",
458 |               "  <thead>\n",
459 |               "    <tr style=\"text-align: right;\">\n",
460 |               "      <th></th>\n",
461 |               "      <th>uid</th>\n",
462 |               "      <th>iid</th>\n",
463 |               "      <th>rui</th>\n",
464 |               "      <th>est</th>\n",
465 |               "      <th>details</th>\n",
466 |               "      <th>err</th>\n",
467 |               "      <th>Iu</th>\n",
468 |               "    </tr>\n",
469 |               "  </thead>\n",
470 |               "  <tbody>\n",
471 |               "    <tr>\n",
472 |               "      <th>0</th>\n",
473 |               "      <td>1</td>\n",
474 |               "      <td>3</td>\n",
475 |               "      <td>4.0</td>\n",
476 |               "      <td>4.200548</td>\n",
477 |               "      <td>{'was_impossible': False}</td>\n",
478 |               "      <td>0.200548</td>\n",
479 |               "      <td>186</td>\n",
480 |               "    </tr>\n",
481 |               "    <tr>\n",
482 |               "      <th>1</th>\n",
483 |               "      <td>1</td>\n",
484 |               "      <td>163</td>\n",
485 |               "      <td>5.0</td>\n",
486 |               "      <td>4.261322</td>\n",
487 |               "      <td>{'was_impossible': False}</td>\n",
488 |               "      <td>0.738678</td>\n",
489 |               "      <td>186</td>\n",
490 |               "    </tr>\n",
491 |               "    <tr>\n",
492 |               "      <th>2</th>\n",
493 |               "      <td>1</td>\n",
494 |               "      <td>316</td>\n",
495 |               "      <td>3.0</td>\n",
496 |               "      <td>4.024986</td>\n",
497 |               "      <td>{'was_impossible': False}</td>\n",
498 |               "      <td>1.024986</td>\n",
499 |               "      <td>186</td>\n",
500 |               "    </tr>\n",
501 |               "    <tr>\n",
502 |               "      <th>3</th>\n",
503 |               "      <td>1</td>\n",
504 |               "      <td>349</td>\n",
505 |               "      <td>4.0</td>\n",
506 |               "      <td>4.443186</td>\n",
507 |               "      <td>{'was_impossible': False}</td>\n",
508 |               "      <td>0.443186</td>\n",
509 |               "      <td>186</td>\n",
510 |               "    </tr>\n",
511 |               "    <tr>\n",
512 |               "      <th>4</th>\n",
513 |               "      <td>1</td>\n",
514 |               "      <td>441</td>\n",
515 |               "      <td>4.0</td>\n",
516 |               "      <td>4.758104</td>\n",
517 |               "      <td>{'was_impossible': False}</td>\n",
518 |               "      <td>0.758104</td>\n",
519 |               "      <td>186</td>\n",
520 |               "    </tr>\n",
521 |               "  </tbody>\n",
522 |               "</table>\n",
523 |               "</div>"
524 |             ],
525 |             "text/plain": [
526 |               "   uid  iid  rui       est                    details       err   Iu\n",
527 |               "0    1    3  4.0  4.200548  {'was_impossible': False}  0.200548  186\n",
528 |               "1    1  163  5.0  4.261322  {'was_impossible': False}  0.738678  186\n",
529 |               "2    1  316  3.0  4.024986  {'was_impossible': False}  1.024986  186\n",
530 |               "3    1  349  4.0  4.443186  {'was_impossible': False}  0.443186  186\n",
531 |               "4    1  441  4.0  4.758104  {'was_impossible': False}  0.758104  186"
532 |             ]
533 |           },
534 |           "metadata": {
535 |             "tags": []
536 |           },
537 |           "execution_count": 12
538 |         }
539 |       ]
540 |     },
541 |     {
542 |       "cell_type": "code",
543 |       "metadata": {
544 |         "id": "gyU3U3mLWG42",
545 |         "colab_type": "code",
546 |         "colab": {}
547 |       },
548 |       "source": [
549 |         "content = pd.read_csv('content_based_genre_ratings.csv')"
550 |       ],
551 |       "execution_count": null,
552 |       "outputs": []
553 |     },
554 |     {
555 |       "cell_type": "code",
556 |       "metadata": {
557 |         "id": "V9pCMloU45Sh",
558 |         "colab_type": "code",
559 |         "colab": {}
560 |       },
561 |       "source": [
562 |         "def get_Iu(uid):\n",
563 |         "    \"\"\"Return the number of items rated by given user\n",
564 |         "    \n",
565 |         "    Args:\n",
566 |         "        uid: The raw id of the user.\n",
567 |         "    Returns:\n",
568 |         "        The number of items rated by the user.\n",
569 |         "    \"\"\"\n",
570 |         "    \n",
571 |         "    try:\n",
572 |         "        return traindf[traindf['userId'] == uid].shape[0]\n",
573 |         "    except ValueError:  # user was not part of the trainset\n",
574 |         "        return 0"
575 |       ],
576 |       "execution_count": null,
577 |       "outputs": []
578 |     },
579 |     {
580 |       "cell_type": "code",
581 |       "metadata": {
582 |         "id": "Xaia-Iy2WPYY",
583 |         "colab_type": "code",
584 |         "colab": {}
585 |       },
586 |       "source": [
587 |         "content['Iu'] = content.userId.apply(get_Iu)"
588 |       ],
589 |       "execution_count": null,
590 |       "outputs": []
591 |     },
592 |     {
593 |       "cell_type": "code",
594 |       "metadata": {
595 |         "id": "8a_bM4hsWyHI",
596 |         "colab_type": "code",
597 |         "colab": {}
598 |       },
599 |       "source": [
600 |         "content['err'] = abs(content.pred_rating - content.og_rating)\n",
601 |         "content['sqr_err'] = (content.pred_rating - content.og_rating)**2\n",
602 |         "# rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2).mean() ** .5\n",
603 |         "# mae = (((algo_predictions.og_rating - algo_predictions.pred_rating) ** 2) ** .5).mean()\n"
604 |       ],
605 |       "execution_count": null,
606 |       "outputs": []
607 |     },
608 |     {
609 |       "cell_type": "code",
610 |       "metadata": {
611 |         "id": "yobQqAZTWd_4",
612 |         "colab_type": "code",
613 |         "colab": {
614 |           "base_uri": "https://localhost:8080/",
615 |           "height": 50
616 |         },
617 |         "outputId": "5c486dcc-0636-4452-b859-b51d6b2da9af"
618 |       },
619 |       "source": [
620 |         "print(\"Content based                 \",content[content.Iu < 18].err.mean())\n",
621 |         "print(\"Content based                \",content[content.Iu < 18].sqr_err.mean()** .5)"
622 |       ],
623 |       "execution_count": null,
624 |       "outputs": [
625 |         {
626 |           "output_type": "stream",
627 |           "text": [
628 |             "Content based                  0.7942792057878261\n",
629 |             "Content based                 1.0584107905057996\n"
630 |           ],
631 |           "name": "stdout"
632 |         }
633 |       ]
634 |     },
635 |     {
636 |       "cell_type": "code",
637 |       "metadata": {
638 |         "id": "wz1Pkbzd4-dl",
639 |         "colab_type": "code",
640 |         "colab": {}
641 |       },
642 |       "source": [
643 |         "df_knn['Iu'] = df_knn.uid.apply(get_Iu)\n",
644 |         "df_svd['Iu'] = df_svd.uid.apply(get_Iu)\n",
645 |         "df_svdpp['Iu'] = df_svdpp.uid.apply(get_Iu)\n",
646 |         "df_knnbaseline['Iu'] = df_knnbaseline.uid.apply(get_Iu)"
647 |       ],
648 |       "execution_count": null,
649 |       "outputs": []
650 |     },
651 |     {
652 |       "cell_type": "code",
653 |       "metadata": {
654 |         "id": "q-VorHYTayVQ",
655 |         "colab_type": "code",
656 |         "colab": {
657 |           "base_uri": "https://localhost:8080/",
658 |           "height": 134
659 |         },
660 |         "outputId": "bb83cb23-6ebd-4ba1-ec31-8a599d8dd4ec"
661 |       },
662 |       "source": [
663 |         "print(\"--------------------------MAE-----------------------\")\n",
664 |         "print(\"KNN Basic                 \",df_knn[df_knn.Iu < 18].err.mean())\n",
665 |         "print(\"SVD                       \", df_svd[df_svd.Iu < 18].err.mean())\n",
666 |         "print(\"SVDpp                     \",  df_svdpp[df_svdpp.Iu < 18].err.mean())\n",
667 |         "print(\"KNN Baseline (item-item)  \", df_knnbaseline[df_knnbaseline.Iu < 18].err.mean())\n",
668 |         "print(\"BaselineOnly              \",df_baselineonly[df_baselineonly.Iu < 18].err.mean() )\n",
669 |         "print(\"KNN Baseline (user-user)  \",df_knn_user[df_knn_user.Iu < 18].err.mean() )"
670 |       ],
671 |       "execution_count": null,
672 |       "outputs": [
673 |         {
674 |           "output_type": "stream",
675 |           "text": [
676 |             "--------------------------MAE-----------------------\n",
677 |             "KNN Basic                  0.9356541418761788\n",
678 |             "SVD                        0.8174986369636367\n",
679 |             "SVDpp                      0.7853538665933238\n",
680 |             "KNN Baseline (item-item)   0.7549100058171629\n",
681 |             "BaselineOnly               0.828373767989461\n",
682 |             "KNN Baseline (user-user)   0.8527037143570998\n"
683 |           ],
684 |           "name": "stdout"
685 |         }
686 |       ]
687 |     },
688 |     {
689 |       "cell_type": "code",
690 |       "metadata": {
691 |         "id": "nQOEO64Jf9BE",
692 |         "colab_type": "code",
693 |         "colab": {
694 |           "base_uri": "https://localhost:8080/",
695 |           "height": 134
696 |         },
697 |         "outputId": "d2da200e-954a-44d4-89c2-890afc4b14e5"
698 |       },
699 |       "source": [
700 |         "print(\"--------------------------RMSE-----------------------\")\n",
701 |         "print(\"KNN Basic                \",df_knn[df_knn.Iu < 18].sqr_err.mean()** .5)\n",
702 |         "print(\"SVD                      \", df_svd[df_svd.Iu < 18].sqr_err.mean()** .5)\n",
703 |         "print(\"SVDpp                    \",  df_svdpp[df_svdpp.Iu < 18].sqr_err.mean()** .5)\n",
704 |         "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu < 18].sqr_err.mean()** .5)\n",
705 |         "print(\"BaselineOnly             \",df_baselineonly[df_baselineonly.Iu < 18].sqr_err.mean()** .5 )\n",
706 |         "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu < 18].sqr_err.mean()** .5)"
707 |       ],
708 |       "execution_count": null,
709 |       "outputs": [
710 |         {
711 |           "output_type": "stream",
712 |           "text": [
713 |             "--------------------------RMSE-----------------------\n",
714 |             "KNN Basic                 1.1998253947989697\n",
715 |             "SVD                       1.0549483774463828\n",
716 |             "SVDpp                     1.0083634724152428\n",
717 |             "KNN Baseline (item-item)  0.9896562169806813\n",
718 |             "BaselineOnly              1.0612306019619604\n",
719 |             "KNN Baseline (user-user)  1.1082756354422056\n"
720 |           ],
721 |           "name": "stdout"
722 |         }
723 |       ]
724 |     },
725 |     {
726 |       "cell_type": "code",
727 |       "metadata": {
728 |         "id": "xnzmsButgiyw",
729 |         "colab_type": "code",
730 |         "colab": {
731 |           "base_uri": "https://localhost:8080/",
732 |           "height": 134
733 |         },
734 |         "outputId": "f326c366-38ea-45b8-9133-cf22dcb72358"
735 |       },
736 |       "source": [
737 |         "print(\"--------------------------MAE-----------------------\")\n",
738 |         "print(\"KNN Basic                 \",df_knn[df_knn.Iu > 1000].err.mean())\n",
739 |         "print(\"SVD                       \", df_svd[df_svd.Iu > 1000].err.mean())\n",
740 |         "print(\"SVDpp                     \",  df_svdpp[df_svdpp.Iu > 1000].err.mean())\n",
741 |         "print(\"KNN Baseline (item-item)  \", df_knnbaseline[df_knnbaseline.Iu > 1000].err.mean())\n",
742 |         "print(\"BaselineOnly              \",df_baselineonly[df_baselineonly.Iu > 1000].err.mean() )\n",
743 |         "print(\"KNN Baseline (user-user)  \",df_knn_user[df_knn_user.Iu > 1000].err.mean() )"
744 |       ],
745 |       "execution_count": null,
746 |       "outputs": [
747 |         {
748 |           "output_type": "stream",
749 |           "text": [
750 |             "--------------------------MAE-----------------------\n",
751 |             "KNN Basic                  0.7118277630004157\n",
752 |             "SVD                        0.6349197611192368\n",
753 |             "SVDpp                      0.626063757313411\n",
754 |             "KNN Baseline (item-item)   0.6120430789383057\n",
755 |             "BaselineOnly               0.6306031032475772\n",
756 |             "KNN Baseline (user-user)   0.6330297364319998\n"
757 |           ],
758 |           "name": "stdout"
759 |         }
760 |       ]
761 |     },
762 |     {
763 |       "cell_type": "code",
764 |       "metadata": {
765 |         "id": "K1CnsM3mg0wi",
766 |         "colab_type": "code",
767 |         "colab": {
768 |           "base_uri": "https://localhost:8080/",
769 |           "height": 134
770 |         },
771 |         "outputId": "c79eaa77-b9a5-4d90-d470-9539c3af5858"
772 |       },
773 |       "source": [
774 |         "print(\"--------------------------RMSE-----------------------\")\n",
775 |         "print(\"KNN Basic                \",df_knn[df_knn.Iu > 1000].sqr_err.mean()** .5)\n",
776 |         "print(\"SVD                      \", df_svd[df_svd.Iu > 1000].sqr_err.mean()** .5)\n",
777 |         "print(\"SVDpp                    \",  df_svdpp[df_svdpp.Iu > 1000].sqr_err.mean()** .5)\n",
778 |         "print(\"KNN Baseline (item-item) \", df_knnbaseline[df_knnbaseline.Iu > 1000].sqr_err.mean()** .5)\n",
779 |         "print(\"BaselineOnly             \",df_baselineonly[df_baselineonly.Iu > 1000].sqr_err.mean()** .5 )\n",
780 |         "print(\"KNN Baseline (user-user) \",df_knn_user[df_knn_user.Iu > 1000].sqr_err.mean()** .5)"
781 |       ],
782 |       "execution_count": null,
783 |       "outputs": [
784 |         {
785 |           "output_type": "stream",
786 |           "text": [
787 |             "--------------------------RMSE-----------------------\n",
788 |             "KNN Basic                 0.9174613388905646\n",
789 |             "SVD                       0.8207944406250214\n",
790 |             "SVDpp                     0.8136491891525117\n",
791 |             "KNN Baseline (item-item)  0.789275629286978\n",
792 |             "BaselineOnly              0.799990922710614\n",
793 |             "KNN Baseline (user-user)  0.8198697577732832\n"
794 |           ],
795 |           "name": "stdout"
796 |         }
797 |       ]
798 |     },
799 |     {
800 |       "cell_type": "code",
801 |       "metadata": {
802 |         "id": "7aWKXObN6uhT",
803 |         "colab_type": "code",
804 |         "colab": {
805 |           "base_uri": "https://localhost:8080/",
806 |           "height": 34
807 |         },
808 |         "outputId": "2a40f35d-3055-43aa-d508-be83732ad842"
809 |       },
810 |       "source": [
811 |         "iid_df = traindf.groupby(['userId'],as_index=False).movieId.count()\n",
812 |         "iid_df.movieId.max()"
813 |       ],
814 |       "execution_count": null,
815 |       "outputs": [
816 |         {
817 |           "output_type": "execute_result",
818 |           "data": {
819 |             "text/plain": [
820 |               "2158"
821 |             ]
822 |           },
823 |           "metadata": {
824 |             "tags": []
825 |           },
826 |           "execution_count": 47
827 |         }
828 |       ]
829 |     }
830 |   ]
831 | }


--------------------------------------------------------------------------------
/Code/movie_era_based_recs.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "kernelspec": {
  6 |       "display_name": "Python 3",
  7 |       "language": "python",
  8 |       "name": "python3"
  9 |     },
 10 |     "language_info": {
 11 |       "codemirror_mode": {
 12 |         "name": "ipython",
 13 |         "version": 3
 14 |       },
 15 |       "file_extension": ".py",
 16 |       "mimetype": "text/x-python",
 17 |       "name": "python",
 18 |       "nbconvert_exporter": "python",
 19 |       "pygments_lexer": "ipython3",
 20 |       "version": "3.7.6"
 21 |     },
 22 |     "colab": {
 23 |       "name": "movie_era_based_recs.ipynb",
 24 |       "provenance": []
 25 |     }
 26 |   },
 27 |   "cells": [
 28 |     {
 29 |       "cell_type": "markdown",
 30 |       "metadata": {
 31 |         "id": "4Cox5k2AMKsw",
 32 |         "colab_type": "text"
 33 |       },
 34 |       "source": [
 35 |         "This notebook uses the content-based approach and adds the time period (era) in which a movie was launched as a feature. This method personalizes each user's recommendations to take this feature into account."
 36 |       ]
 37 |     },
 38 |     {
 39 |       "cell_type": "code",
 40 |       "metadata": {
 41 |         "id": "bC1OAh-JMLd7",
 42 |         "colab_type": "code",
 43 |         "colab": {}
 44 |       },
 45 |       "source": [
 46 |         "import numpy as np\n",
 47 |         "import pandas as pd\n",
 48 |         "import matplotlib.pyplot as plt\n",
 49 |         "from ast import literal_eval\n",
 50 |         "import pdb"
 51 |       ],
 52 |       "execution_count": null,
 53 |       "outputs": []
 54 |     },
 55 |     {
 56 |       "cell_type": "code",
 57 |       "metadata": {
 58 |         "id": "wLkmHYnGLw5Z",
 59 |         "colab_type": "code",
 60 |         "colab": {}
 61 |       },
 62 |       "source": [
 63 |         "genre_user_vector = pd.read_csv(\"user_info.csv\")\n",
 64 |         "genre_user_vector = genre_user_vector[['userId', 'user_vector']]\n",
 65 |         "\n",
 66 |         "genre_user_vector['user_vector'] = genre_user_vector['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
 67 |         "genre_user_vector['user_vector'] = genre_user_vector['user_vector'].apply(lambda x: np.asarray(x).astype(float))"
 68 |       ],
 69 |       "execution_count": null,
 70 |       "outputs": []
 71 |     },
 72 |     {
 73 |       "cell_type": "code",
 74 |       "metadata": {
 75 |         "id": "m4m_b7JbLw5c",
 76 |         "colab_type": "code",
 77 |         "colab": {}
 78 |       },
 79 |       "source": [
 80 |         "era_user_vector = pd.read_csv(\"user_era_vector.csv\")\n",
 81 |         "era_user_vector = era_user_vector[['userId', 'user_era_vector']]\n",
 82 |         "\n",
 83 |         "era_user_vector['user_era_vector'] = era_user_vector['user_era_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
 84 |         "era_user_vector['user_era_vector'] = era_user_vector['user_era_vector'].apply(lambda x: np.asarray(x).astype(float))"
 85 |       ],
 86 |       "execution_count": null,
 87 |       "outputs": []
 88 |     },
 89 |     {
 90 |       "cell_type": "code",
 91 |       "metadata": {
 92 |         "id": "mCq1wzsaLw5f",
 93 |         "colab_type": "code",
 94 |         "colab": {}
 95 |       },
 96 |       "source": [
 97 |         "merged_user = genre_user_vector.join(era_user_vector['user_era_vector'])"
 98 |       ],
 99 |       "execution_count": null,
100 |       "outputs": []
101 |     },
102 |     {
103 |       "cell_type": "code",
104 |       "metadata": {
105 |         "id": "Obc2U0jILw5h",
106 |         "colab_type": "code",
107 |         "colab": {},
108 |         "outputId": "f2a77bef-df14-4981-acba-8bd0fe07b36d"
109 |       },
110 |       "source": [
111 |         "merged_user.head()"
112 |       ],
113 |       "execution_count": null,
114 |       "outputs": [
115 |         {
116 |           "output_type": "execute_result",
117 |           "data": {
118 |             "text/html": [
119 |               "<div>\n",
120 |               "<style scoped>\n",
121 |               "    .dataframe tbody tr th:only-of-type {\n",
122 |               "        vertical-align: middle;\n",
123 |               "    }\n",
124 |               "\n",
125 |               "    .dataframe tbody tr th {\n",
126 |               "        vertical-align: top;\n",
127 |               "    }\n",
128 |               "\n",
129 |               "    .dataframe thead th {\n",
130 |               "        text-align: right;\n",
131 |               "    }\n",
132 |               "</style>\n",
133 |               "<table border=\"1\" class=\"dataframe\">\n",
134 |               "  <thead>\n",
135 |               "    <tr style=\"text-align: right;\">\n",
136 |               "      <th></th>\n",
137 |               "      <th>userId</th>\n",
138 |               "      <th>user_vector</th>\n",
139 |               "      <th>user_era_vector</th>\n",
140 |               "    </tr>\n",
141 |               "  </thead>\n",
142 |               "  <tbody>\n",
143 |               "    <tr>\n",
144 |               "      <th>0</th>\n",
145 |               "      <td>1</td>\n",
146 |               "      <td>[4.39189189, 4.65217391, 4.48571429, 4.2676056...</td>\n",
147 |               "      <td>[4.63265306, 4.27272727, 4.6, 0.0]</td>\n",
148 |               "    </tr>\n",
149 |               "    <tr>\n",
150 |               "      <th>1</th>\n",
151 |               "      <td>2</td>\n",
152 |               "      <td>[4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666...</td>\n",
153 |               "      <td>[0.0, 3.83333333, 4.05, 3.85]</td>\n",
154 |               "    </tr>\n",
155 |               "    <tr>\n",
156 |               "      <th>2</th>\n",
157 |               "      <td>3</td>\n",
158 |               "      <td>[2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333...</td>\n",
159 |               "      <td>[2.45833333, 2.6875, 0.5, 0.0]</td>\n",
160 |               "    </tr>\n",
161 |               "    <tr>\n",
162 |               "      <th>3</th>\n",
163 |               "      <td>4</td>\n",
164 |               "      <td>[3.47826087, 4.0, 3.77777778, 3.43902439, 3.53...</td>\n",
165 |               "      <td>[4.4375, 3.25663717, 3.32142857, 0.0]</td>\n",
166 |               "    </tr>\n",
167 |               "    <tr>\n",
168 |               "      <th>4</th>\n",
169 |               "      <td>5</td>\n",
170 |               "      <td>[3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ...</td>\n",
171 |               "      <td>[5.0, 3.55882353, 0.0, 0.0]</td>\n",
172 |               "    </tr>\n",
173 |               "  </tbody>\n",
174 |               "</table>\n",
175 |               "</div>"
176 |             ],
177 |             "text/plain": [
178 |               "   userId                                        user_vector  \\\n",
179 |               "0       1  [4.39189189, 4.65217391, 4.48571429, 4.2676056...   \n",
180 |               "1       2  [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666...   \n",
181 |               "2       3  [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333...   \n",
182 |               "3       4  [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53...   \n",
183 |               "4       5  [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ...   \n",
184 |               "\n",
185 |               "                         user_era_vector  \n",
186 |               "0     [4.63265306, 4.27272727, 4.6, 0.0]  \n",
187 |               "1          [0.0, 3.83333333, 4.05, 3.85]  \n",
188 |               "2         [2.45833333, 2.6875, 0.5, 0.0]  \n",
189 |               "3  [4.4375, 3.25663717, 3.32142857, 0.0]  \n",
190 |               "4            [5.0, 3.55882353, 0.0, 0.0]  "
191 |             ]
192 |           },
193 |           "metadata": {
194 |             "tags": []
195 |           },
196 |           "execution_count": 5
197 |         }
198 |       ]
199 |     },
200 |     {
201 |       "cell_type": "code",
202 |       "metadata": {
203 |         "scrolled": false,
204 |         "id": "FqxlInOILw5k",
205 |         "colab_type": "code",
206 |         "colab": {}
207 |       },
208 |       "source": [
209 |         "merged_user['final_user_vector'] = merged_user.apply(lambda x: np.concatenate((2*x['user_vector'], x['user_era_vector'])), axis=1)"
210 |       ],
211 |       "execution_count": null,
212 |       "outputs": []
213 |     },
214 |     {
215 |       "cell_type": "code",
216 |       "metadata": {
217 |         "id": "R-USwe0RLw5m",
218 |         "colab_type": "code",
219 |         "colab": {},
220 |         "outputId": "df1b11eb-6da3-4070-fbf7-149e619605b7"
221 |       },
222 |       "source": [
223 |         "merged_user.head()"
224 |       ],
225 |       "execution_count": null,
226 |       "outputs": [
227 |         {
228 |           "output_type": "execute_result",
229 |           "data": {
230 |             "text/html": [
231 |               "<div>\n",
232 |               "<style scoped>\n",
233 |               "    .dataframe tbody tr th:only-of-type {\n",
234 |               "        vertical-align: middle;\n",
235 |               "    }\n",
236 |               "\n",
237 |               "    .dataframe tbody tr th {\n",
238 |               "        vertical-align: top;\n",
239 |               "    }\n",
240 |               "\n",
241 |               "    .dataframe thead th {\n",
242 |               "        text-align: right;\n",
243 |               "    }\n",
244 |               "</style>\n",
245 |               "<table border=\"1\" class=\"dataframe\">\n",
246 |               "  <thead>\n",
247 |               "    <tr style=\"text-align: right;\">\n",
248 |               "      <th></th>\n",
249 |               "      <th>userId</th>\n",
250 |               "      <th>user_vector</th>\n",
251 |               "      <th>user_era_vector</th>\n",
252 |               "      <th>final_user_vector</th>\n",
253 |               "    </tr>\n",
254 |               "  </thead>\n",
255 |               "  <tbody>\n",
256 |               "    <tr>\n",
257 |               "      <th>0</th>\n",
258 |               "      <td>1</td>\n",
259 |               "      <td>[4.39189189, 4.65217391, 4.48571429, 4.2676056...</td>\n",
260 |               "      <td>[4.63265306, 4.27272727, 4.6, 0.0]</td>\n",
261 |               "      <td>[8.78378378, 9.30434782, 8.97142858, 8.5352112...</td>\n",
262 |               "    </tr>\n",
263 |               "    <tr>\n",
264 |               "      <th>1</th>\n",
265 |               "      <td>2</td>\n",
266 |               "      <td>[4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666...</td>\n",
267 |               "      <td>[0.0, 3.83333333, 4.05, 3.85]</td>\n",
268 |               "      <td>[8.33333334, 0.0, 0.0, 8.4, 0.0, 9.0, 7.733333...</td>\n",
269 |               "    </tr>\n",
270 |               "    <tr>\n",
271 |               "      <th>2</th>\n",
272 |               "      <td>3</td>\n",
273 |               "      <td>[2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333...</td>\n",
274 |               "      <td>[2.45833333, 2.6875, 0.5, 0.0]</td>\n",
275 |               "      <td>[5.0, 1.0, 1.0, 1.0, 8.66666666, 1.0, 1.666666...</td>\n",
276 |               "    </tr>\n",
277 |               "    <tr>\n",
278 |               "      <th>3</th>\n",
279 |               "      <td>4</td>\n",
280 |               "      <td>[3.47826087, 4.0, 3.77777778, 3.43902439, 3.53...</td>\n",
281 |               "      <td>[4.4375, 3.25663717, 3.32142857, 0.0]</td>\n",
282 |               "      <td>[6.95652174, 8.0, 7.55555556, 6.87804878, 7.06...</td>\n",
283 |               "    </tr>\n",
284 |               "    <tr>\n",
285 |               "      <th>4</th>\n",
286 |               "      <td>5</td>\n",
287 |               "      <td>[3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ...</td>\n",
288 |               "      <td>[5.0, 3.55882353, 0.0, 0.0]</td>\n",
289 |               "      <td>[6.33333334, 8.5, 8.0, 6.72727272, 8.0, 6.2, 7...</td>\n",
290 |               "    </tr>\n",
291 |               "  </tbody>\n",
292 |               "</table>\n",
293 |               "</div>"
294 |             ],
295 |             "text/plain": [
296 |               "   userId                                        user_vector  \\\n",
297 |               "0       1  [4.39189189, 4.65217391, 4.48571429, 4.2676056...   \n",
298 |               "1       2  [4.16666667, 0.0, 0.0, 4.2, 0.0, 4.5, 3.866666...   \n",
299 |               "2       3  [2.5, 0.5, 0.5, 0.5, 4.33333333, 0.5, 0.833333...   \n",
300 |               "3       4  [3.47826087, 4.0, 3.77777778, 3.43902439, 3.53...   \n",
301 |               "4       5  [3.16666667, 4.25, 4.0, 3.36363636, 4.0, 3.1, ...   \n",
302 |               "\n",
303 |               "                         user_era_vector  \\\n",
304 |               "0     [4.63265306, 4.27272727, 4.6, 0.0]   \n",
305 |               "1          [0.0, 3.83333333, 4.05, 3.85]   \n",
306 |               "2         [2.45833333, 2.6875, 0.5, 0.0]   \n",
307 |               "3  [4.4375, 3.25663717, 3.32142857, 0.0]   \n",
308 |               "4            [5.0, 3.55882353, 0.0, 0.0]   \n",
309 |               "\n",
310 |               "                                   final_user_vector  \n",
311 |               "0  [8.78378378, 9.30434782, 8.97142858, 8.5352112...  \n",
312 |               "1  [8.33333334, 0.0, 0.0, 8.4, 0.0, 9.0, 7.733333...  \n",
313 |               "2  [5.0, 1.0, 1.0, 1.0, 8.66666666, 1.0, 1.666666...  \n",
314 |               "3  [6.95652174, 8.0, 7.55555556, 6.87804878, 7.06...  \n",
315 |               "4  [6.33333334, 8.5, 8.0, 6.72727272, 8.0, 6.2, 7...  "
316 |             ]
317 |           },
318 |           "metadata": {
319 |             "tags": []
320 |           },
321 |           "execution_count": 7
322 |         }
323 |       ]
324 |     },
325 |     {
326 |       "cell_type": "code",
327 |       "metadata": {
328 |         "id": "0-zBOAM8Lw5q",
329 |         "colab_type": "code",
330 |         "colab": {}
331 |       },
332 |       "source": [
333 |         "movie_genre_vector = pd.read_csv(\"movie_vector.csv\")\n",
334 |         "movie_genre_vector = movie_genre_vector[['movieId', 'movie_vector']]\n",
335 |         "\n",
336 |         "movie_genre_vector['movie_vector'] = movie_genre_vector['movie_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
337 |         "movie_genre_vector['movie_vector'] = movie_genre_vector['movie_vector'].apply(lambda x: np.asarray(x).astype(float))"
338 |       ],
339 |       "execution_count": null,
340 |       "outputs": []
341 |     },
342 |     {
343 |       "cell_type": "code",
344 |       "metadata": {
345 |         "id": "4awx_m7pLw5s",
346 |         "colab_type": "code",
347 |         "colab": {}
348 |       },
349 |       "source": [
350 |         "movie_era_vector = pd.read_csv(\"movie_era_vector.csv\")\n",
351 |         "movie_era_vector = movie_era_vector[['movieId', 'era_vector']]\n",
352 |         "\n",
353 |         "movie_era_vector['era_vector'] = movie_era_vector['era_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())\n",
354 |         "movie_era_vector['era_vector'] = movie_era_vector['era_vector'].apply(lambda x: np.asarray(x).astype(float))"
355 |       ],
356 |       "execution_count": null,
357 |       "outputs": []
358 |     },
359 |     {
360 |       "cell_type": "code",
361 |       "metadata": {
362 |         "id": "v92UsULNLw5u",
363 |         "colab_type": "code",
364 |         "colab": {}
365 |       },
366 |       "source": [
367 |         "merged_movie = movie_genre_vector.join(movie_era_vector['era_vector'])\n",
368 |         "merged_movie['final_movie_vector'] = merged_movie.apply(lambda x: np.concatenate((2*x['movie_vector'], x['era_vector'])), axis=1)"
369 |       ],
370 |       "execution_count": null,
371 |       "outputs": []
372 |     },
373 |     {
374 |       "cell_type": "code",
375 |       "metadata": {
376 |         "id": "jHxejQEiLw5w",
377 |         "colab_type": "code",
378 |         "colab": {},
379 |         "outputId": "87e2afb0-4ac5-4411-de9e-23d316b3b758"
380 |       },
381 |       "source": [
382 |         "merged_movie.head()"
383 |       ],
384 |       "execution_count": null,
385 |       "outputs": [
386 |         {
387 |           "output_type": "execute_result",
388 |           "data": {
389 |             "text/html": [
390 |               "<div>\n",
391 |               "<style scoped>\n",
392 |               "    .dataframe tbody tr th:only-of-type {\n",
393 |               "        vertical-align: middle;\n",
394 |               "    }\n",
395 |               "\n",
396 |               "    .dataframe tbody tr th {\n",
397 |               "        vertical-align: top;\n",
398 |               "    }\n",
399 |               "\n",
400 |               "    .dataframe thead th {\n",
401 |               "        text-align: right;\n",
402 |               "    }\n",
403 |               "</style>\n",
404 |               "<table border=\"1\" class=\"dataframe\">\n",
405 |               "  <thead>\n",
406 |               "    <tr style=\"text-align: right;\">\n",
407 |               "      <th></th>\n",
408 |               "      <th>movieId</th>\n",
409 |               "      <th>movie_vector</th>\n",
410 |               "      <th>era_vector</th>\n",
411 |               "      <th>final_movie_vector</th>\n",
412 |               "    </tr>\n",
413 |               "  </thead>\n",
414 |               "  <tbody>\n",
415 |               "    <tr>\n",
416 |               "      <th>0</th>\n",
417 |               "      <td>1</td>\n",
418 |               "      <td>[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
419 |               "      <td>[0.0, 1.0, 0.0, 0.0]</td>\n",
420 |               "      <td>[2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
421 |               "    </tr>\n",
422 |               "    <tr>\n",
423 |               "      <th>1</th>\n",
424 |               "      <td>2</td>\n",
425 |               "      <td>[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
426 |               "      <td>[0.0, 1.0, 0.0, 0.0]</td>\n",
427 |               "      <td>[2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
428 |               "    </tr>\n",
429 |               "    <tr>\n",
430 |               "      <th>2</th>\n",
431 |               "      <td>3</td>\n",
432 |               "      <td>[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...</td>\n",
433 |               "      <td>[0.0, 1.0, 0.0, 0.0]</td>\n",
434 |               "      <td>[0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...</td>\n",
435 |               "    </tr>\n",
436 |               "    <tr>\n",
437 |               "      <th>3</th>\n",
438 |               "      <td>4</td>\n",
439 |               "      <td>[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...</td>\n",
440 |               "      <td>[0.0, 1.0, 0.0, 0.0]</td>\n",
441 |               "      <td>[0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, ...</td>\n",
442 |               "    </tr>\n",
443 |               "    <tr>\n",
444 |               "      <th>4</th>\n",
445 |               "      <td>5</td>\n",
446 |               "      <td>[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
447 |               "      <td>[0.0, 1.0, 0.0, 0.0]</td>\n",
448 |               "      <td>[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
449 |               "    </tr>\n",
450 |               "  </tbody>\n",
451 |               "</table>\n",
452 |               "</div>"
453 |             ],
454 |             "text/plain": [
455 |               "   movieId                                       movie_vector  \\\n",
456 |               "0        1  [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
457 |               "1        2  [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
458 |               "2        3  [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...   \n",
459 |               "3        4  [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...   \n",
460 |               "4        5  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   \n",
461 |               "\n",
462 |               "             era_vector                                 final_movie_vector  \n",
463 |               "0  [0.0, 1.0, 0.0, 0.0]  [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...  \n",
464 |               "1  [0.0, 1.0, 0.0, 0.0]  [2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...  \n",
465 |               "2  [0.0, 1.0, 0.0, 0.0]  [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...  \n",
466 |               "3  [0.0, 1.0, 0.0, 0.0]  [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 0.0, ...  \n",
467 |               "4  [0.0, 1.0, 0.0, 0.0]  [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  "
468 |             ]
469 |           },
470 |           "metadata": {
471 |             "tags": []
472 |           },
473 |           "execution_count": 11
474 |         }
475 |       ]
476 |     },
477 |     {
478 |       "cell_type": "markdown",
479 |       "metadata": {
480 |         "id": "KF3gzUpwLw5y",
481 |         "colab_type": "text"
482 |       },
483 |       "source": [
484 |         "## Test"
485 |       ]
486 |     },
487 |     {
488 |       "cell_type": "code",
489 |       "metadata": {
490 |         "id": "NlP1KEMgLw5z",
491 |         "colab_type": "code",
492 |         "colab": {},
493 |         "outputId": "96cb9714-bb7e-49c5-84e9-48a99eb3a0f8"
494 |       },
495 |       "source": [
496 |         "ratings_test = pd.read_csv(\"testing_data.csv\", converters={\"genres\": literal_eval, \"tag\": literal_eval}) \n",
497 |         "ratings_test.head()"
498 |       ],
499 |       "execution_count": null,
500 |       "outputs": [
501 |         {
502 |           "output_type": "execute_result",
503 |           "data": {
504 |             "text/html": [
505 |               "<div>\n",
506 |               "<style scoped>\n",
507 |               "    .dataframe tbody tr th:only-of-type {\n",
508 |               "        vertical-align: middle;\n",
509 |               "    }\n",
510 |               "\n",
511 |               "    .dataframe tbody tr th {\n",
512 |               "        vertical-align: top;\n",
513 |               "    }\n",
514 |               "\n",
515 |               "    .dataframe thead th {\n",
516 |               "        text-align: right;\n",
517 |               "    }\n",
518 |               "</style>\n",
519 |               "<table border=\"1\" class=\"dataframe\">\n",
520 |               "  <thead>\n",
521 |               "    <tr style=\"text-align: right;\">\n",
522 |               "      <th></th>\n",
523 |               "      <th>userId</th>\n",
524 |               "      <th>movieId</th>\n",
525 |               "      <th>rating</th>\n",
526 |               "      <th>timestamp</th>\n",
527 |               "      <th>genres</th>\n",
528 |               "      <th>tag</th>\n",
529 |               "    </tr>\n",
530 |               "  </thead>\n",
531 |               "  <tbody>\n",
532 |               "    <tr>\n",
533 |               "      <th>0</th>\n",
534 |               "      <td>1</td>\n",
535 |               "      <td>3</td>\n",
536 |               "      <td>4.0</td>\n",
537 |               "      <td>964981247</td>\n",
538 |               "      <td>[Comedy, Romance]</td>\n",
539 |               "      <td>[]</td>\n",
540 |               "    </tr>\n",
541 |               "    <tr>\n",
542 |               "      <th>1</th>\n",
543 |               "      <td>1</td>\n",
544 |               "      <td>163</td>\n",
545 |               "      <td>5.0</td>\n",
546 |               "      <td>964983650</td>\n",
547 |               "      <td>[Action, Romance, Western]</td>\n",
548 |               "      <td>[]</td>\n",
549 |               "    </tr>\n",
550 |               "    <tr>\n",
551 |               "      <th>2</th>\n",
552 |               "      <td>1</td>\n",
553 |               "      <td>316</td>\n",
554 |               "      <td>3.0</td>\n",
555 |               "      <td>964982310</td>\n",
556 |               "      <td>[Action, Adventure, Sci-Fi]</td>\n",
557 |               "      <td>[]</td>\n",
558 |               "    </tr>\n",
559 |               "    <tr>\n",
560 |               "      <th>3</th>\n",
561 |               "      <td>1</td>\n",
562 |               "      <td>349</td>\n",
563 |               "      <td>4.0</td>\n",
564 |               "      <td>964982563</td>\n",
565 |               "      <td>[Action, Crime, Drama, Thriller]</td>\n",
566 |               "      <td>[]</td>\n",
567 |               "    </tr>\n",
568 |               "    <tr>\n",
569 |               "      <th>4</th>\n",
570 |               "      <td>1</td>\n",
571 |               "      <td>441</td>\n",
572 |               "      <td>4.0</td>\n",
573 |               "      <td>964980868</td>\n",
574 |               "      <td>[Comedy]</td>\n",
575 |               "      <td>[]</td>\n",
576 |               "    </tr>\n",
577 |               "  </tbody>\n",
578 |               "</table>\n",
579 |               "</div>"
580 |             ],
581 |             "text/plain": [
582 |               "   userId  movieId  rating  timestamp                            genres tag\n",
583 |               "0       1        3     4.0  964981247                 [Comedy, Romance]  []\n",
584 |               "1       1      163     5.0  964983650        [Action, Romance, Western]  []\n",
585 |               "2       1      316     3.0  964982310       [Action, Adventure, Sci-Fi]  []\n",
586 |               "3       1      349     4.0  964982563  [Action, Crime, Drama, Thriller]  []\n",
587 |               "4       1      441     4.0  964980868                          [Comedy]  []"
588 |             ]
589 |           },
590 |           "metadata": {
591 |             "tags": []
592 |           },
593 |           "execution_count": 12
594 |         }
595 |       ]
596 |     },
597 |     {
598 |       "cell_type": "code",
599 |       "metadata": {
600 |         "id": "3X3-GJDNLw51",
601 |         "colab_type": "code",
602 |         "colab": {}
603 |       },
604 |       "source": [
605 |         "ratings_test = pd.read_csv(\"testing_data.csv\", converters={\"genres\": literal_eval, \"tag\": literal_eval}) \n",
606 |         "ratings_test.head()\n",
607 |         "\n",
608 |         "prediction_columns = ['userId', 'movieId', 'user_vector', 'movie_vector', 'og_rating', 'pred_rating']\n",
609 |         "prediction_rows = []  # one entry per scored test rating; the DataFrame is built once after the loop\n",
610 |         "error_count = 0\n",
611 |         "for ind, row in ratings_test.iterrows():\n",
612 |         "    userId = row['userId']\n",
613 |         "    movieId = row['movieId']\n",
614 |         "    og_rating = row['rating']\n",
615 |         "    user_vector = merged_user[merged_user['userId'] == int(userId)].final_user_vector.values[0]\n",
616 |         "    movie_matches = merged_movie[merged_movie['movieId'] == int(movieId)].final_movie_vector.values\n",
617 |         "    if not len(movie_matches):\n",
618 |         "        # No vector for this movie: skip the row rather than silently reusing the previous movie's vector.\n",
619 |         "        error_count += 1\n",
620 |         "        print(\"Movie vector not found!\", movieId)\n",
621 |         "        continue\n",
622 |         "    movie_vector = movie_matches[0]\n",
623 |         "    predicted_rating = user_vector*movie_vector\n",
624 |         "    if predicted_rating.any():\n",
625 |         "        predicted_rating = np.nanmean(np.where(predicted_rating!=0, predicted_rating, np.nan))\n",
626 |         "    else:\n",
627 |         "        predicted_rating = 0\n",
628 |         "    prediction_rows.append([userId, movieId, user_vector, movie_vector, og_rating, predicted_rating])\n",
629 |         "\n",
630 |         "algo_predictions = pd.DataFrame(prediction_rows, columns=prediction_columns)"
631 |       ],
632 |       "execution_count": null,
633 |       "outputs": []
634 |     },
635 |     {
636 |       "cell_type": "code",
637 |       "metadata": {
638 |         "id": "spMCRnulLw53",
639 |         "colab_type": "code",
640 |         "colab": {},
641 |         "outputId": "89e8b4bf-20e2-4cdf-d227-6bfc6fc71934"
642 |       },
643 |       "source": [
644 |         "algo_predictions"
645 |       ],
646 |       "execution_count": null,
647 |       "outputs": [
648 |         {
649 |           "output_type": "execute_result",
650 |           "data": {
651 |             "text/html": [
652 |               "<div>\n",
653 |               "<style scoped>\n",
654 |               "    .dataframe tbody tr th:only-of-type {\n",
655 |               "        vertical-align: middle;\n",
656 |               "    }\n",
657 |               "\n",
658 |               "    .dataframe tbody tr th {\n",
659 |               "        vertical-align: top;\n",
660 |               "    }\n",
661 |               "\n",
662 |               "    .dataframe thead th {\n",
663 |               "        text-align: right;\n",
664 |               "    }\n",
665 |               "</style>\n",
666 |               "<table border=\"1\" class=\"dataframe\">\n",
667 |               "  <thead>\n",
668 |               "    <tr style=\"text-align: right;\">\n",
669 |               "      <th></th>\n",
670 |               "      <th>userId</th>\n",
671 |               "      <th>movieId</th>\n",
672 |               "      <th>user_vector</th>\n",
673 |               "      <th>movie_vector</th>\n",
674 |               "      <th>og_rating</th>\n",
675 |               "      <th>pred_rating</th>\n",
676 |               "    </tr>\n",
677 |               "  </thead>\n",
678 |               "  <tbody>\n",
679 |               "    <tr>\n",
680 |               "      <th>0</th>\n",
681 |               "      <td>1</td>\n",
682 |               "      <td>3</td>\n",
683 |               "      <td>[8.78378378, 9.30434782, 8.97142858, 8.5352112...</td>\n",
684 |               "      <td>[0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...</td>\n",
685 |               "      <td>4.0</td>\n",
686 |               "      <td>12.892161</td>\n",
687 |               "    </tr>\n",
688 |               "    <tr>\n",
689 |               "      <th>1</th>\n",
690 |               "      <td>1</td>\n",
691 |               "      <td>163</td>\n",
692 |               "      <td>[8.78378378, 9.30434782, 8.97142858, 8.5352112...</td>\n",
693 |               "      <td>[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, ...</td>\n",
694 |               "      <td>5.0</td>\n",
695 |               "      <td>14.134848</td>\n",
696 |               "    </tr>\n",
697 |               "    <tr>\n",
698 |               "      <th>2</th>\n",
699 |               "      <td>1</td>\n",
700 |               "      <td>316</td>\n",
701 |               "      <td>[8.78378378, 9.30434782, 8.97142858, 8.5352112...</td>\n",
702 |               "      <td>[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...</td>\n",
703 |               "      <td>3.0</td>\n",
704 |               "      <td>13.986955</td>\n",
705 |               "    </tr>\n",
706 |               "    <tr>\n",
707 |               "      <th>3</th>\n",
708 |               "      <td>1</td>\n",
709 |               "      <td>349</td>\n",
710 |               "      <td>[8.78378378, 9.30434782, 8.97142858, 8.5352112...</td>\n",
711 |               "      <td>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ...</td>\n",
712 |               "      <td>4.0</td>\n",
713 |               "      <td>14.707133</td>\n",
714 |               "    </tr>\n",
715 |               "    <tr>\n",
716 |               "      <th>4</th>\n",
717 |               "      <td>1</td>\n",
718 |               "      <td>441</td>\n",
719 |               "      <td>[8.78378378, 9.30434782, 8.97142858, 8.5352112...</td>\n",
720 |               "      <td>[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
721 |               "      <td>4.0</td>\n",
722 |               "      <td>10.671575</td>\n",
723 |               "    </tr>\n",
724 |               "    <tr>\n",
725 |               "      <th>...</th>\n",
726 |               "      <td>...</td>\n",
727 |               "      <td>...</td>\n",
728 |               "      <td>...</td>\n",
729 |               "      <td>...</td>\n",
730 |               "      <td>...</td>\n",
731 |               "      <td>...</td>\n",
732 |               "    </tr>\n",
733 |               "    <tr>\n",
734 |               "      <th>20163</th>\n",
735 |               "      <td>610</td>\n",
736 |               "      <td>156726</td>\n",
737 |               "      <td>[7.38967136, 7.8490566, 7.34210526, 7.44984802...</td>\n",
738 |               "      <td>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
739 |               "      <td>4.5</td>\n",
740 |               "      <td>8.852847</td>\n",
741 |               "    </tr>\n",
742 |               "    <tr>\n",
743 |               "      <th>20164</th>\n",
744 |               "      <td>610</td>\n",
745 |               "      <td>159093</td>\n",
746 |               "      <td>[7.38967136, 7.8490566, 7.34210526, 7.44984802...</td>\n",
747 |               "      <td>[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...</td>\n",
748 |               "      <td>3.0</td>\n",
749 |               "      <td>11.726114</td>\n",
750 |               "    </tr>\n",
751 |               "    <tr>\n",
752 |               "      <th>20165</th>\n",
753 |               "      <td>610</td>\n",
754 |               "      <td>161582</td>\n",
755 |               "      <td>[7.38967136, 7.8490566, 7.34210526, 7.44984802...</td>\n",
756 |               "      <td>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ...</td>\n",
757 |               "      <td>4.0</td>\n",
758 |               "      <td>11.335023</td>\n",
759 |               "    </tr>\n",
760 |               "    <tr>\n",
761 |               "      <th>20166</th>\n",
762 |               "      <td>610</td>\n",
763 |               "      <td>162350</td>\n",
764 |               "      <td>[7.38967136, 7.8490566, 7.34210526, 7.44984802...</td>\n",
765 |               "      <td>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...</td>\n",
766 |               "      <td>3.5</td>\n",
767 |               "      <td>11.036977</td>\n",
768 |               "    </tr>\n",
769 |               "    <tr>\n",
770 |               "      <th>20167</th>\n",
771 |               "      <td>610</td>\n",
772 |               "      <td>163981</td>\n",
773 |               "      <td>[7.38967136, 7.8490566, 7.34210526, 7.44984802...</td>\n",
774 |               "      <td>[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...</td>\n",
775 |               "      <td>3.5</td>\n",
776 |               "      <td>8.701610</td>\n",
777 |               "    </tr>\n",
778 |               "  </tbody>\n",
779 |               "</table>\n",
780 |               "<p>20168 rows × 6 columns</p>\n",
781 |               "</div>"
782 |             ],
783 |             "text/plain": [
784 |               "      userId movieId                                        user_vector  \\\n",
785 |               "0          1       3  [8.78378378, 9.30434782, 8.97142858, 8.5352112...   \n",
786 |               "1          1     163  [8.78378378, 9.30434782, 8.97142858, 8.5352112...   \n",
787 |               "2          1     316  [8.78378378, 9.30434782, 8.97142858, 8.5352112...   \n",
788 |               "3          1     349  [8.78378378, 9.30434782, 8.97142858, 8.5352112...   \n",
789 |               "4          1     441  [8.78378378, 9.30434782, 8.97142858, 8.5352112...   \n",
790 |               "...      ...     ...                                                ...   \n",
791 |               "20163    610  156726  [7.38967136, 7.8490566, 7.34210526, 7.44984802...   \n",
792 |               "20164    610  159093  [7.38967136, 7.8490566, 7.34210526, 7.44984802...   \n",
793 |               "20165    610  161582  [7.38967136, 7.8490566, 7.34210526, 7.44984802...   \n",
794 |               "20166    610  162350  [7.38967136, 7.8490566, 7.34210526, 7.44984802...   \n",
795 |               "20167    610  163981  [7.38967136, 7.8490566, 7.34210526, 7.44984802...   \n",
796 |               "\n",
797 |               "                                            movie_vector  og_rating  \\\n",
798 |               "0      [0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...        4.0   \n",
799 |               "1      [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, ...        5.0   \n",
800 |               "2      [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...        3.0   \n",
801 |               "3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, ...        4.0   \n",
802 |               "4      [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...        4.0   \n",
803 |               "...                                                  ...        ...   \n",
804 |               "20163  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...        4.5   \n",
805 |               "20164  [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...        3.0   \n",
806 |               "20165  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ...        4.0   \n",
807 |               "20166  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...        3.5   \n",
808 |               "20167  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...        3.5   \n",
809 |               "\n",
810 |               "       pred_rating  \n",
811 |               "0        12.892161  \n",
812 |               "1        14.134848  \n",
813 |               "2        13.986955  \n",
814 |               "3        14.707133  \n",
815 |               "4        10.671575  \n",
816 |               "...            ...  \n",
817 |               "20163     8.852847  \n",
818 |               "20164    11.726114  \n",
819 |               "20165    11.335023  \n",
820 |               "20166    11.036977  \n",
821 |               "20167     8.701610  \n",
822 |               "\n",
823 |               "[20168 rows x 6 columns]"
824 |             ]
825 |           },
826 |           "metadata": {
827 |             "tags": []
828 |           },
829 |           "execution_count": 14
830 |         }
831 |       ]
832 |     },
833 |     {
834 |       "cell_type": "code",
835 |       "metadata": {
836 |         "id": "wzJucPlCLw55",
837 |         "colab_type": "code",
838 |         "colab": {}
839 |       },
840 |       "source": [
841 |         "# algo_predictions.to_csv(\"genre_era_predictions.csv\")"
842 |       ],
843 |       "execution_count": null,
844 |       "outputs": []
845 |     },
846 |     {
847 |       "cell_type": "code",
848 |       "metadata": {
849 |         "scrolled": true,
850 |         "id": "xnnjyw58Lw57",
851 |         "colab_type": "code",
852 |         "colab": {},
853 |         "outputId": "67264ae5-6d78-4260-cef3-992bcdb1eed7"
854 |       },
855 |       "source": [
856 |         "rmse = ((algo_predictions.og_rating - algo_predictions.pred_rating/3) ** 2).mean() ** .5\n",
857 |         "rmse"
858 |       ],
859 |       "execution_count": null,
860 |       "outputs": [
861 |         {
862 |           "output_type": "execute_result",
863 |           "data": {
864 |             "text/plain": [
865 |               "0.9898749125266205"
866 |             ]
867 |           },
868 |           "metadata": {
869 |             "tags": []
870 |           },
871 |           "execution_count": 16
872 |         }
873 |       ]
874 |     },
875 |     {
876 |       "cell_type": "code",
877 |       "metadata": {
878 |         "id": "wDfACdMsLw59",
879 |         "colab_type": "code",
880 |         "colab": {},
881 |         "outputId": "b46389e7-6071-4f44-be9b-3731cf8fa46a"
882 |       },
883 |       "source": [
884 |         "mae = (((algo_predictions.og_rating - algo_predictions.pred_rating/3) ** 2) ** .5).mean()\n",
885 |         "mae"
886 |       ],
887 |       "execution_count": null,
888 |       "outputs": [
889 |         {
890 |           "output_type": "execute_result",
891 |           "data": {
892 |             "text/plain": [
893 |               "0.7651172008808977"
894 |             ]
895 |           },
896 |           "metadata": {
897 |             "tags": []
898 |           },
899 |           "execution_count": 17
900 |         }
901 |       ]
902 |     },
903 |     {
904 |       "cell_type": "code",
905 |       "metadata": {
906 |         "id": "KQj2WqJZLw6A",
907 |         "colab_type": "code",
908 |         "colab": {}
909 |       },
910 |       "source": [
911 |         ""
912 |       ],
913 |       "execution_count": null,
914 |       "outputs": []
915 |     }
916 |   ]
917 | }


--------------------------------------------------------------------------------
/Code/preprocessing.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |   "nbformat": 4,
   3 |   "nbformat_minor": 0,
   4 |   "metadata": {
   5 |     "kernelspec": {
   6 |       "display_name": "Python 3",
   7 |       "language": "python",
   8 |       "name": "python3"
   9 |     },
  10 |     "language_info": {
  11 |       "codemirror_mode": {
  12 |         "name": "ipython",
  13 |         "version": 3
  14 |       },
  15 |       "file_extension": ".py",
  16 |       "mimetype": "text/x-python",
  17 |       "name": "python",
  18 |       "nbconvert_exporter": "python",
  19 |       "pygments_lexer": "ipython3",
  20 |       "version": "3.7.6"
  21 |     },
  22 |     "colab": {
  23 |       "name": "preprocessing.ipynb",
  24 |       "provenance": [],
  25 |       "collapsed_sections": []
  26 |     }
  27 |   },
  28 |   "cells": [
  29 |     {
  30 |       "cell_type": "markdown",
  31 |       "metadata": {
  32 |         "id": "slsyFHKMOwm9",
  33 |         "colab_type": "text"
  34 |       },
  35 |       "source": [
  36 |         "This notebook splits each user's ratings into training and testing sets such that 80% of the ratings are used for training and 20% for testing."
  37 |       ]
  38 |     },
  39 |     {
  40 |       "cell_type": "code",
  41 |       "metadata": {
  42 |         "id": "fauYg6bNOu24",
  43 |         "colab_type": "code",
  44 |         "colab": {}
  45 |       },
  46 |       "source": [
  47 |         "import pandas as pd\n",
  48 |         "import numpy as np"
  49 |       ],
  50 |       "execution_count": null,
  51 |       "outputs": []
  52 |     },
  53 |     {
  54 |       "cell_type": "code",
  55 |       "metadata": {
  56 |         "id": "WdMSaP4bOu26",
  57 |         "colab_type": "code",
  58 |         "colab": {},
  59 |         "outputId": "060f423a-f966-4a2b-c38d-55cf40e5c075"
  60 |       },
  61 |       "source": [
  62 |         "movies = pd.read_csv('ml-latest-small/movies.csv')\n",
  63 |         "ratings = pd.read_csv('ml-latest-small/ratings.csv')\n",
  64 |         "tags = pd.read_csv('ml-latest-small/tags.csv')\n",
  65 |         "print('movies: ', movies.shape)\n",
  66 |         "print('ratings: ', ratings.shape)\n",
  67 |         "print('tags: ', tags.shape)"
  68 |       ],
  69 |       "execution_count": null,
  70 |       "outputs": [
  71 |         {
  72 |           "output_type": "stream",
  73 |           "text": [
  74 |             "movies:  (9742, 3)\n",
  75 |             "ratings:  (100836, 4)\n",
  76 |             "tags:  (3683, 4)\n"
  77 |           ],
  78 |           "name": "stdout"
  79 |         }
  80 |       ]
  81 |     },
  82 |     {
  83 |       "cell_type": "code",
  84 |       "metadata": {
  85 |         "scrolled": false,
  86 |         "id": "pTK-7Pn_Ou29",
  87 |         "colab_type": "code",
  88 |         "colab": {},
  89 |         "outputId": "19cee6e2-8455-4a4f-f413-3a10fd4b2a9e"
  90 |       },
  91 |       "source": [
  92 |         "movies.head(5)"
  93 |       ],
  94 |       "execution_count": null,
  95 |       "outputs": [
  96 |         {
  97 |           "output_type": "execute_result",
  98 |           "data": {
  99 |             "text/html": [
 100 |               "<div>\n",
 101 |               "<style scoped>\n",
 102 |               "    .dataframe tbody tr th:only-of-type {\n",
 103 |               "        vertical-align: middle;\n",
 104 |               "    }\n",
 105 |               "\n",
 106 |               "    .dataframe tbody tr th {\n",
 107 |               "        vertical-align: top;\n",
 108 |               "    }\n",
 109 |               "\n",
 110 |               "    .dataframe thead th {\n",
 111 |               "        text-align: right;\n",
 112 |               "    }\n",
 113 |               "</style>\n",
 114 |               "<table border=\"1\" class=\"dataframe\">\n",
 115 |               "  <thead>\n",
 116 |               "    <tr style=\"text-align: right;\">\n",
 117 |               "      <th></th>\n",
 118 |               "      <th>movieId</th>\n",
 119 |               "      <th>title</th>\n",
 120 |               "      <th>genres</th>\n",
 121 |               "    </tr>\n",
 122 |               "  </thead>\n",
 123 |               "  <tbody>\n",
 124 |               "    <tr>\n",
 125 |               "      <th>0</th>\n",
 126 |               "      <td>1</td>\n",
 127 |               "      <td>Toy Story (1995)</td>\n",
 128 |               "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
 129 |               "    </tr>\n",
 130 |               "    <tr>\n",
 131 |               "      <th>1</th>\n",
 132 |               "      <td>2</td>\n",
 133 |               "      <td>Jumanji (1995)</td>\n",
 134 |               "      <td>Adventure|Children|Fantasy</td>\n",
 135 |               "    </tr>\n",
 136 |               "    <tr>\n",
 137 |               "      <th>2</th>\n",
 138 |               "      <td>3</td>\n",
 139 |               "      <td>Grumpier Old Men (1995)</td>\n",
 140 |               "      <td>Comedy|Romance</td>\n",
 141 |               "    </tr>\n",
 142 |               "    <tr>\n",
 143 |               "      <th>3</th>\n",
 144 |               "      <td>4</td>\n",
 145 |               "      <td>Waiting to Exhale (1995)</td>\n",
 146 |               "      <td>Comedy|Drama|Romance</td>\n",
 147 |               "    </tr>\n",
 148 |               "    <tr>\n",
 149 |               "      <th>4</th>\n",
 150 |               "      <td>5</td>\n",
 151 |               "      <td>Father of the Bride Part II (1995)</td>\n",
 152 |               "      <td>Comedy</td>\n",
 153 |               "    </tr>\n",
 154 |               "  </tbody>\n",
 155 |               "</table>\n",
 156 |               "</div>"
 157 |             ],
 158 |             "text/plain": [
 159 |               "   movieId                               title  \\\n",
 160 |               "0        1                    Toy Story (1995)   \n",
 161 |               "1        2                      Jumanji (1995)   \n",
 162 |               "2        3             Grumpier Old Men (1995)   \n",
 163 |               "3        4            Waiting to Exhale (1995)   \n",
 164 |               "4        5  Father of the Bride Part II (1995)   \n",
 165 |               "\n",
 166 |               "                                        genres  \n",
 167 |               "0  Adventure|Animation|Children|Comedy|Fantasy  \n",
 168 |               "1                   Adventure|Children|Fantasy  \n",
 169 |               "2                               Comedy|Romance  \n",
 170 |               "3                         Comedy|Drama|Romance  \n",
 171 |               "4                                       Comedy  "
 172 |             ]
 173 |           },
 174 |           "metadata": {
 175 |             "tags": []
 176 |           },
 177 |           "execution_count": 4
 178 |         }
 179 |       ]
 180 |     },
 181 |     {
 182 |       "cell_type": "code",
 183 |       "metadata": {
 184 |         "id": "9szjN9tSOu3A",
 185 |         "colab_type": "code",
 186 |         "colab": {},
 187 |         "outputId": "9cded57b-9cf5-4c5f-eaeb-619018a65bcf"
 188 |       },
 189 |       "source": [
 190 |         "ratings.head(5)"
 191 |       ],
 192 |       "execution_count": null,
 193 |       "outputs": [
 194 |         {
 195 |           "output_type": "execute_result",
 196 |           "data": {
 197 |             "text/html": [
 198 |               "<div>\n",
 199 |               "<style scoped>\n",
 200 |               "    .dataframe tbody tr th:only-of-type {\n",
 201 |               "        vertical-align: middle;\n",
 202 |               "    }\n",
 203 |               "\n",
 204 |               "    .dataframe tbody tr th {\n",
 205 |               "        vertical-align: top;\n",
 206 |               "    }\n",
 207 |               "\n",
 208 |               "    .dataframe thead th {\n",
 209 |               "        text-align: right;\n",
 210 |               "    }\n",
 211 |               "</style>\n",
 212 |               "<table border=\"1\" class=\"dataframe\">\n",
 213 |               "  <thead>\n",
 214 |               "    <tr style=\"text-align: right;\">\n",
 215 |               "      <th></th>\n",
 216 |               "      <th>userId</th>\n",
 217 |               "      <th>movieId</th>\n",
 218 |               "      <th>rating</th>\n",
 219 |               "      <th>timestamp</th>\n",
 220 |               "    </tr>\n",
 221 |               "  </thead>\n",
 222 |               "  <tbody>\n",
 223 |               "    <tr>\n",
 224 |               "      <th>0</th>\n",
 225 |               "      <td>1</td>\n",
 226 |               "      <td>1</td>\n",
 227 |               "      <td>4.0</td>\n",
 228 |               "      <td>964982703</td>\n",
 229 |               "    </tr>\n",
 230 |               "    <tr>\n",
 231 |               "      <th>1</th>\n",
 232 |               "      <td>1</td>\n",
 233 |               "      <td>3</td>\n",
 234 |               "      <td>4.0</td>\n",
 235 |               "      <td>964981247</td>\n",
 236 |               "    </tr>\n",
 237 |               "    <tr>\n",
 238 |               "      <th>2</th>\n",
 239 |               "      <td>1</td>\n",
 240 |               "      <td>6</td>\n",
 241 |               "      <td>4.0</td>\n",
 242 |               "      <td>964982224</td>\n",
 243 |               "    </tr>\n",
 244 |               "    <tr>\n",
 245 |               "      <th>3</th>\n",
 246 |               "      <td>1</td>\n",
 247 |               "      <td>47</td>\n",
 248 |               "      <td>5.0</td>\n",
 249 |               "      <td>964983815</td>\n",
 250 |               "    </tr>\n",
 251 |               "    <tr>\n",
 252 |               "      <th>4</th>\n",
 253 |               "      <td>1</td>\n",
 254 |               "      <td>50</td>\n",
 255 |               "      <td>5.0</td>\n",
 256 |               "      <td>964982931</td>\n",
 257 |               "    </tr>\n",
 258 |               "  </tbody>\n",
 259 |               "</table>\n",
 260 |               "</div>"
 261 |             ],
 262 |             "text/plain": [
 263 |               "   userId  movieId  rating  timestamp\n",
 264 |               "0       1        1     4.0  964982703\n",
 265 |               "1       1        3     4.0  964981247\n",
 266 |               "2       1        6     4.0  964982224\n",
 267 |               "3       1       47     5.0  964983815\n",
 268 |               "4       1       50     5.0  964982931"
 269 |             ]
 270 |           },
 271 |           "metadata": {
 272 |             "tags": []
 273 |           },
 274 |           "execution_count": 5
 275 |         }
 276 |       ]
 277 |     },
 278 |     {
 279 |       "cell_type": "code",
 280 |       "metadata": {
 281 |         "id": "o6mexZ2NOu3C",
 282 |         "colab_type": "code",
 283 |         "colab": {},
 284 |         "outputId": "7f6eebb9-213f-4642-cabf-64780a92d5aa"
 285 |       },
 286 |       "source": [
 287 |         "tags.head(5)"
 288 |       ],
 289 |       "execution_count": null,
 290 |       "outputs": [
 291 |         {
 292 |           "output_type": "execute_result",
 293 |           "data": {
 294 |             "text/html": [
 295 |               "<div>\n",
 296 |               "<style scoped>\n",
 297 |               "    .dataframe tbody tr th:only-of-type {\n",
 298 |               "        vertical-align: middle;\n",
 299 |               "    }\n",
 300 |               "\n",
 301 |               "    .dataframe tbody tr th {\n",
 302 |               "        vertical-align: top;\n",
 303 |               "    }\n",
 304 |               "\n",
 305 |               "    .dataframe thead th {\n",
 306 |               "        text-align: right;\n",
 307 |               "    }\n",
 308 |               "</style>\n",
 309 |               "<table border=\"1\" class=\"dataframe\">\n",
 310 |               "  <thead>\n",
 311 |               "    <tr style=\"text-align: right;\">\n",
 312 |               "      <th></th>\n",
 313 |               "      <th>userId</th>\n",
 314 |               "      <th>movieId</th>\n",
 315 |               "      <th>tag</th>\n",
 316 |               "      <th>timestamp</th>\n",
 317 |               "    </tr>\n",
 318 |               "  </thead>\n",
 319 |               "  <tbody>\n",
 320 |               "    <tr>\n",
 321 |               "      <th>0</th>\n",
 322 |               "      <td>2</td>\n",
 323 |               "      <td>60756</td>\n",
 324 |               "      <td>funny</td>\n",
 325 |               "      <td>1445714994</td>\n",
 326 |               "    </tr>\n",
 327 |               "    <tr>\n",
 328 |               "      <th>1</th>\n",
 329 |               "      <td>2</td>\n",
 330 |               "      <td>60756</td>\n",
 331 |               "      <td>Highly quotable</td>\n",
 332 |               "      <td>1445714996</td>\n",
 333 |               "    </tr>\n",
 334 |               "    <tr>\n",
 335 |               "      <th>2</th>\n",
 336 |               "      <td>2</td>\n",
 337 |               "      <td>60756</td>\n",
 338 |               "      <td>will ferrell</td>\n",
 339 |               "      <td>1445714992</td>\n",
 340 |               "    </tr>\n",
 341 |               "    <tr>\n",
 342 |               "      <th>3</th>\n",
 343 |               "      <td>2</td>\n",
 344 |               "      <td>89774</td>\n",
 345 |               "      <td>Boxing story</td>\n",
 346 |               "      <td>1445715207</td>\n",
 347 |               "    </tr>\n",
 348 |               "    <tr>\n",
 349 |               "      <th>4</th>\n",
 350 |               "      <td>2</td>\n",
 351 |               "      <td>89774</td>\n",
 352 |               "      <td>MMA</td>\n",
 353 |               "      <td>1445715200</td>\n",
 354 |               "    </tr>\n",
 355 |               "  </tbody>\n",
 356 |               "</table>\n",
 357 |               "</div>"
 358 |             ],
 359 |             "text/plain": [
 360 |               "   userId  movieId              tag   timestamp\n",
 361 |               "0       2    60756            funny  1445714994\n",
 362 |               "1       2    60756  Highly quotable  1445714996\n",
 363 |               "2       2    60756     will ferrell  1445714992\n",
 364 |               "3       2    89774     Boxing story  1445715207\n",
 365 |               "4       2    89774              MMA  1445715200"
 366 |             ]
 367 |           },
 368 |           "metadata": {
 369 |             "tags": []
 370 |           },
 371 |           "execution_count": 6
 372 |         }
 373 |       ]
 374 |     },
 375 |     {
 376 |       "cell_type": "code",
 377 |       "metadata": {
 378 |         "id": "LIjAsKI8Ou3F",
 379 |         "colab_type": "code",
 380 |         "colab": {},
 381 |         "outputId": "54655a32-c28b-4134-c0e4-3c19fa3b3c61"
 382 |       },
 383 |       "source": [
 384 |         "df = pd.merge(ratings, movies, on='movieId' , how='left')\n",
 385 |         "df = df.drop('title', axis=1)\n",
 386 |         "df.head(5)"
 387 |       ],
 388 |       "execution_count": null,
 389 |       "outputs": [
 390 |         {
 391 |           "output_type": "execute_result",
 392 |           "data": {
 393 |             "text/html": [
 394 |               "<div>\n",
 395 |               "<style scoped>\n",
 396 |               "    .dataframe tbody tr th:only-of-type {\n",
 397 |               "        vertical-align: middle;\n",
 398 |               "    }\n",
 399 |               "\n",
 400 |               "    .dataframe tbody tr th {\n",
 401 |               "        vertical-align: top;\n",
 402 |               "    }\n",
 403 |               "\n",
 404 |               "    .dataframe thead th {\n",
 405 |               "        text-align: right;\n",
 406 |               "    }\n",
 407 |               "</style>\n",
 408 |               "<table border=\"1\" class=\"dataframe\">\n",
 409 |               "  <thead>\n",
 410 |               "    <tr style=\"text-align: right;\">\n",
 411 |               "      <th></th>\n",
 412 |               "      <th>userId</th>\n",
 413 |               "      <th>movieId</th>\n",
 414 |               "      <th>rating</th>\n",
 415 |               "      <th>timestamp</th>\n",
 416 |               "      <th>genres</th>\n",
 417 |               "    </tr>\n",
 418 |               "  </thead>\n",
 419 |               "  <tbody>\n",
 420 |               "    <tr>\n",
 421 |               "      <th>0</th>\n",
 422 |               "      <td>1</td>\n",
 423 |               "      <td>1</td>\n",
 424 |               "      <td>4.0</td>\n",
 425 |               "      <td>964982703</td>\n",
 426 |               "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
 427 |               "    </tr>\n",
 428 |               "    <tr>\n",
 429 |               "      <th>1</th>\n",
 430 |               "      <td>1</td>\n",
 431 |               "      <td>3</td>\n",
 432 |               "      <td>4.0</td>\n",
 433 |               "      <td>964981247</td>\n",
 434 |               "      <td>Comedy|Romance</td>\n",
 435 |               "    </tr>\n",
 436 |               "    <tr>\n",
 437 |               "      <th>2</th>\n",
 438 |               "      <td>1</td>\n",
 439 |               "      <td>6</td>\n",
 440 |               "      <td>4.0</td>\n",
 441 |               "      <td>964982224</td>\n",
 442 |               "      <td>Action|Crime|Thriller</td>\n",
 443 |               "    </tr>\n",
 444 |               "    <tr>\n",
 445 |               "      <th>3</th>\n",
 446 |               "      <td>1</td>\n",
 447 |               "      <td>47</td>\n",
 448 |               "      <td>5.0</td>\n",
 449 |               "      <td>964983815</td>\n",
 450 |               "      <td>Mystery|Thriller</td>\n",
 451 |               "    </tr>\n",
 452 |               "    <tr>\n",
 453 |               "      <th>4</th>\n",
 454 |               "      <td>1</td>\n",
 455 |               "      <td>50</td>\n",
 456 |               "      <td>5.0</td>\n",
 457 |               "      <td>964982931</td>\n",
 458 |               "      <td>Crime|Mystery|Thriller</td>\n",
 459 |               "    </tr>\n",
 460 |               "  </tbody>\n",
 461 |               "</table>\n",
 462 |               "</div>"
 463 |             ],
 464 |             "text/plain": [
 465 |               "   userId  movieId  rating  timestamp  \\\n",
 466 |               "0       1        1     4.0  964982703   \n",
 467 |               "1       1        3     4.0  964981247   \n",
 468 |               "2       1        6     4.0  964982224   \n",
 469 |               "3       1       47     5.0  964983815   \n",
 470 |               "4       1       50     5.0  964982931   \n",
 471 |               "\n",
 472 |               "                                        genres  \n",
 473 |               "0  Adventure|Animation|Children|Comedy|Fantasy  \n",
 474 |               "1                               Comedy|Romance  \n",
 475 |               "2                        Action|Crime|Thriller  \n",
 476 |               "3                             Mystery|Thriller  \n",
 477 |               "4                       Crime|Mystery|Thriller  "
 478 |             ]
 479 |           },
 480 |           "metadata": {
 481 |             "tags": []
 482 |           },
 483 |           "execution_count": 7
 484 |         }
 485 |       ]
 486 |     },
 487 |     {
 488 |       "cell_type": "code",
 489 |       "metadata": {
 490 |         "id": "ymbJzpjJOu3H",
 491 |         "colab_type": "code",
 492 |         "colab": {}
 493 |       },
 494 |       "source": [
 495 |         "df['genres'] = df['genres'].str.split('|')"
 496 |       ],
 497 |       "execution_count": null,
 498 |       "outputs": []
 499 |     },
 500 |     {
 501 |       "cell_type": "code",
 502 |       "metadata": {
 503 |         "id": "MiUn_Qp0Ou3J",
 504 |         "colab_type": "code",
 505 |         "colab": {},
 506 |         "outputId": "879c002c-96f8-4f77-c957-1653c0b06b2f"
 507 |       },
 508 |       "source": [
 509 |         "df.head(5)"
 510 |       ],
 511 |       "execution_count": null,
 512 |       "outputs": [
 513 |         {
 514 |           "output_type": "execute_result",
 515 |           "data": {
 516 |             "text/html": [
 517 |               "<div>\n",
 518 |               "<style scoped>\n",
 519 |               "    .dataframe tbody tr th:only-of-type {\n",
 520 |               "        vertical-align: middle;\n",
 521 |               "    }\n",
 522 |               "\n",
 523 |               "    .dataframe tbody tr th {\n",
 524 |               "        vertical-align: top;\n",
 525 |               "    }\n",
 526 |               "\n",
 527 |               "    .dataframe thead th {\n",
 528 |               "        text-align: right;\n",
 529 |               "    }\n",
 530 |               "</style>\n",
 531 |               "<table border=\"1\" class=\"dataframe\">\n",
 532 |               "  <thead>\n",
 533 |               "    <tr style=\"text-align: right;\">\n",
 534 |               "      <th></th>\n",
 535 |               "      <th>userId</th>\n",
 536 |               "      <th>movieId</th>\n",
 537 |               "      <th>rating</th>\n",
 538 |               "      <th>timestamp</th>\n",
 539 |               "      <th>genres</th>\n",
 540 |               "    </tr>\n",
 541 |               "  </thead>\n",
 542 |               "  <tbody>\n",
 543 |               "    <tr>\n",
 544 |               "      <th>0</th>\n",
 545 |               "      <td>1</td>\n",
 546 |               "      <td>1</td>\n",
 547 |               "      <td>4.0</td>\n",
 548 |               "      <td>964982703</td>\n",
 549 |               "      <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
 550 |               "    </tr>\n",
 551 |               "    <tr>\n",
 552 |               "      <th>1</th>\n",
 553 |               "      <td>1</td>\n",
 554 |               "      <td>3</td>\n",
 555 |               "      <td>4.0</td>\n",
 556 |               "      <td>964981247</td>\n",
 557 |               "      <td>[Comedy, Romance]</td>\n",
 558 |               "    </tr>\n",
 559 |               "    <tr>\n",
 560 |               "      <th>2</th>\n",
 561 |               "      <td>1</td>\n",
 562 |               "      <td>6</td>\n",
 563 |               "      <td>4.0</td>\n",
 564 |               "      <td>964982224</td>\n",
 565 |               "      <td>[Action, Crime, Thriller]</td>\n",
 566 |               "    </tr>\n",
 567 |               "    <tr>\n",
 568 |               "      <th>3</th>\n",
 569 |               "      <td>1</td>\n",
 570 |               "      <td>47</td>\n",
 571 |               "      <td>5.0</td>\n",
 572 |               "      <td>964983815</td>\n",
 573 |               "      <td>[Mystery, Thriller]</td>\n",
 574 |               "    </tr>\n",
 575 |               "    <tr>\n",
 576 |               "      <th>4</th>\n",
 577 |               "      <td>1</td>\n",
 578 |               "      <td>50</td>\n",
 579 |               "      <td>5.0</td>\n",
 580 |               "      <td>964982931</td>\n",
 581 |               "      <td>[Crime, Mystery, Thriller]</td>\n",
 582 |               "    </tr>\n",
 583 |               "  </tbody>\n",
 584 |               "</table>\n",
 585 |               "</div>"
 586 |             ],
 587 |             "text/plain": [
 588 |               "   userId  movieId  rating  timestamp  \\\n",
 589 |               "0       1        1     4.0  964982703   \n",
 590 |               "1       1        3     4.0  964981247   \n",
 591 |               "2       1        6     4.0  964982224   \n",
 592 |               "3       1       47     5.0  964983815   \n",
 593 |               "4       1       50     5.0  964982931   \n",
 594 |               "\n",
 595 |               "                                              genres  \n",
 596 |               "0  [Adventure, Animation, Children, Comedy, Fantasy]  \n",
 597 |               "1                                  [Comedy, Romance]  \n",
 598 |               "2                          [Action, Crime, Thriller]  \n",
 599 |               "3                                [Mystery, Thriller]  \n",
 600 |               "4                         [Crime, Mystery, Thriller]  "
 601 |             ]
 602 |           },
 603 |           "metadata": {
 604 |             "tags": []
 605 |           },
 606 |           "execution_count": 9
 607 |         }
 608 |       ]
 609 |     },
 610 |     {
 611 |       "cell_type": "code",
 612 |       "metadata": {
 613 |         "id": "B3xDyEX_Ou3L",
 614 |         "colab_type": "code",
 615 |         "colab": {}
 616 |       },
 617 |       "source": [
 618 |         "tags.drop(columns=['timestamp'], inplace=True)\n",
 619 |         "tags['tag'] = tags['tag'].str.split('|')  # each tag string becomes a list of its '|'-separated parts"
 620 |       ],
 621 |       "execution_count": null,
 622 |       "outputs": []
 623 |     },
 624 |     {
 625 |       "cell_type": "code",
 626 |       "metadata": {
 627 |         "id": "4OEft6CkOu3O",
 628 |         "colab_type": "code",
 629 |         "colab": {},
 630 |         "outputId": "7f500eee-1b28-4735-a48f-900f6cd3be1b"
 631 |       },
 632 |       "source": [
 633 |         "tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: [t for parts in x if isinstance(parts, list) for t in parts]).reset_index()  # one flat list of tags per (userId, movieId); joining str(list) produced a string that the later isinstance(d, list) check replaced with []\n",
 634 |         "tags.head(5)"
 635 |       ],
 636 |       "execution_count": null,
 637 |       "outputs": [
 638 |         {
 639 |           "output_type": "execute_result",
 640 |           "data": {
 641 |             "text/html": [
 642 |               "<div>\n",
 643 |               "<style scoped>\n",
 644 |               "    .dataframe tbody tr th:only-of-type {\n",
 645 |               "        vertical-align: middle;\n",
 646 |               "    }\n",
 647 |               "\n",
 648 |               "    .dataframe tbody tr th {\n",
 649 |               "        vertical-align: top;\n",
 650 |               "    }\n",
 651 |               "\n",
 652 |               "    .dataframe thead th {\n",
 653 |               "        text-align: right;\n",
 654 |               "    }\n",
 655 |               "</style>\n",
 656 |               "<table border=\"1\" class=\"dataframe\">\n",
 657 |               "  <thead>\n",
 658 |               "    <tr style=\"text-align: right;\">\n",
 659 |               "      <th></th>\n",
 660 |               "      <th>userId</th>\n",
 661 |               "      <th>movieId</th>\n",
 662 |               "      <th>tag</th>\n",
 663 |               "    </tr>\n",
 664 |               "  </thead>\n",
 665 |               "  <tbody>\n",
 666 |               "    <tr>\n",
 667 |               "      <th>0</th>\n",
 668 |               "      <td>2</td>\n",
 669 |               "      <td>60756</td>\n",
 670 |               "      <td>['funny'],['Highly quotable'],['will ferrell']</td>\n",
 671 |               "    </tr>\n",
 672 |               "    <tr>\n",
 673 |               "      <th>1</th>\n",
 674 |               "      <td>2</td>\n",
 675 |               "      <td>89774</td>\n",
 676 |               "      <td>['Boxing story'],['MMA'],['Tom Hardy']</td>\n",
 677 |               "    </tr>\n",
 678 |               "    <tr>\n",
 679 |               "      <th>2</th>\n",
 680 |               "      <td>2</td>\n",
 681 |               "      <td>106782</td>\n",
 682 |               "      <td>['drugs'],['Leonardo DiCaprio'],['Martin Scors...</td>\n",
 683 |               "    </tr>\n",
 684 |               "    <tr>\n",
 685 |               "      <th>3</th>\n",
 686 |               "      <td>7</td>\n",
 687 |               "      <td>48516</td>\n",
 688 |               "      <td>['way too long']</td>\n",
 689 |               "    </tr>\n",
 690 |               "    <tr>\n",
 691 |               "      <th>4</th>\n",
 692 |               "      <td>18</td>\n",
 693 |               "      <td>431</td>\n",
 694 |               "      <td>['Al Pacino'],['gangster'],['mafia']</td>\n",
 695 |               "    </tr>\n",
 696 |               "  </tbody>\n",
 697 |               "</table>\n",
 698 |               "</div>"
 699 |             ],
 700 |             "text/plain": [
 701 |               "   userId  movieId                                                tag\n",
 702 |               "0       2    60756     ['funny'],['Highly quotable'],['will ferrell']\n",
 703 |               "1       2    89774             ['Boxing story'],['MMA'],['Tom Hardy']\n",
 704 |               "2       2   106782  ['drugs'],['Leonardo DiCaprio'],['Martin Scors...\n",
 705 |               "3       7    48516                                   ['way too long']\n",
 706 |               "4      18      431               ['Al Pacino'],['gangster'],['mafia']"
 707 |             ]
 708 |           },
 709 |           "metadata": {
 710 |             "tags": []
 711 |           },
 712 |           "execution_count": 11
 713 |         }
 714 |       ]
 715 |     },
 716 |     {
 717 |       "cell_type": "code",
 718 |       "metadata": {
 719 |         "id": "Q-ukKMhbOu3Q",
 720 |         "colab_type": "code",
 721 |         "colab": {}
 722 |       },
 723 |       "source": [
 724 |         "df = pd.merge(df, tags, on=['userId','movieId'], how='left')"
 725 |       ],
 726 |       "execution_count": null,
 727 |       "outputs": []
 728 |     },
 729 |     {
 730 |       "cell_type": "code",
 731 |       "metadata": {
 732 |         "id": "05Y3LfkMOu3S",
 733 |         "colab_type": "code",
 734 |         "colab": {},
 735 |         "outputId": "6d8110de-34dd-463d-c123-dd69218f3954"
 736 |       },
 737 |       "source": [
 738 |         "df.shape"
 739 |       ],
 740 |       "execution_count": null,
 741 |       "outputs": [
 742 |         {
 743 |           "output_type": "execute_result",
 744 |           "data": {
 745 |             "text/plain": [
 746 |               "(100836, 6)"
 747 |             ]
 748 |           },
 749 |           "metadata": {
 750 |             "tags": []
 751 |           },
 752 |           "execution_count": 13
 753 |         }
 754 |       ]
 755 |     },
 756 |     {
 757 |       "cell_type": "code",
 758 |       "metadata": {
 759 |         "id": "2RZvAHrHOu3U",
 760 |         "colab_type": "code",
 761 |         "colab": {}
 762 |       },
 763 |       "source": [
 764 |         "df['genres'] = df['genres'].map(lambda cell: cell if isinstance(cell, list) else [])  # any non-list cell becomes []\n",
 765 |         "df['tag'] = df['tag'].map(lambda cell: cell if isinstance(cell, list) else [])"
 766 |       ],
 767 |       "execution_count": null,
 768 |       "outputs": []
 769 |     },
 770 |     {
 771 |       "cell_type": "code",
 772 |       "metadata": {
 773 |         "id": "M0M_IJc5Ou3W",
 774 |         "colab_type": "code",
 775 |         "colab": {},
 776 |         "outputId": "a95ea1a2-3435-4268-a6ab-a1db53e5a464"
 777 |       },
 778 |       "source": [
 779 |         "df.head()"
 780 |       ],
 781 |       "execution_count": null,
 782 |       "outputs": [
 783 |         {
 784 |           "output_type": "execute_result",
 785 |           "data": {
 786 |             "text/html": [
 787 |               "<div>\n",
 788 |               "<style scoped>\n",
 789 |               "    .dataframe tbody tr th:only-of-type {\n",
 790 |               "        vertical-align: middle;\n",
 791 |               "    }\n",
 792 |               "\n",
 793 |               "    .dataframe tbody tr th {\n",
 794 |               "        vertical-align: top;\n",
 795 |               "    }\n",
 796 |               "\n",
 797 |               "    .dataframe thead th {\n",
 798 |               "        text-align: right;\n",
 799 |               "    }\n",
 800 |               "</style>\n",
 801 |               "<table border=\"1\" class=\"dataframe\">\n",
 802 |               "  <thead>\n",
 803 |               "    <tr style=\"text-align: right;\">\n",
 804 |               "      <th></th>\n",
 805 |               "      <th>userId</th>\n",
 806 |               "      <th>movieId</th>\n",
 807 |               "      <th>rating</th>\n",
 808 |               "      <th>timestamp</th>\n",
 809 |               "      <th>genres</th>\n",
 810 |               "      <th>tag</th>\n",
 811 |               "    </tr>\n",
 812 |               "  </thead>\n",
 813 |               "  <tbody>\n",
 814 |               "    <tr>\n",
 815 |               "      <th>0</th>\n",
 816 |               "      <td>1</td>\n",
 817 |               "      <td>1</td>\n",
 818 |               "      <td>4.0</td>\n",
 819 |               "      <td>964982703</td>\n",
 820 |               "      <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
 821 |               "      <td>[]</td>\n",
 822 |               "    </tr>\n",
 823 |               "    <tr>\n",
 824 |               "      <th>1</th>\n",
 825 |               "      <td>1</td>\n",
 826 |               "      <td>3</td>\n",
 827 |               "      <td>4.0</td>\n",
 828 |               "      <td>964981247</td>\n",
 829 |               "      <td>[Comedy, Romance]</td>\n",
 830 |               "      <td>[]</td>\n",
 831 |               "    </tr>\n",
 832 |               "    <tr>\n",
 833 |               "      <th>2</th>\n",
 834 |               "      <td>1</td>\n",
 835 |               "      <td>6</td>\n",
 836 |               "      <td>4.0</td>\n",
 837 |               "      <td>964982224</td>\n",
 838 |               "      <td>[Action, Crime, Thriller]</td>\n",
 839 |               "      <td>[]</td>\n",
 840 |               "    </tr>\n",
 841 |               "    <tr>\n",
 842 |               "      <th>3</th>\n",
 843 |               "      <td>1</td>\n",
 844 |               "      <td>47</td>\n",
 845 |               "      <td>5.0</td>\n",
 846 |               "      <td>964983815</td>\n",
 847 |               "      <td>[Mystery, Thriller]</td>\n",
 848 |               "      <td>[]</td>\n",
 849 |               "    </tr>\n",
 850 |               "    <tr>\n",
 851 |               "      <th>4</th>\n",
 852 |               "      <td>1</td>\n",
 853 |               "      <td>50</td>\n",
 854 |               "      <td>5.0</td>\n",
 855 |               "      <td>964982931</td>\n",
 856 |               "      <td>[Crime, Mystery, Thriller]</td>\n",
 857 |               "      <td>[]</td>\n",
 858 |               "    </tr>\n",
 859 |               "  </tbody>\n",
 860 |               "</table>\n",
 861 |               "</div>"
 862 |             ],
 863 |             "text/plain": [
 864 |               "   userId  movieId  rating  timestamp  \\\n",
 865 |               "0       1        1     4.0  964982703   \n",
 866 |               "1       1        3     4.0  964981247   \n",
 867 |               "2       1        6     4.0  964982224   \n",
 868 |               "3       1       47     5.0  964983815   \n",
 869 |               "4       1       50     5.0  964982931   \n",
 870 |               "\n",
 871 |               "                                              genres tag  \n",
 872 |               "0  [Adventure, Animation, Children, Comedy, Fantasy]  []  \n",
 873 |               "1                                  [Comedy, Romance]  []  \n",
 874 |               "2                          [Action, Crime, Thriller]  []  \n",
 875 |               "3                                [Mystery, Thriller]  []  \n",
 876 |               "4                         [Crime, Mystery, Thriller]  []  "
 877 |             ]
 878 |           },
 879 |           "metadata": {
 880 |             "tags": []
 881 |           },
 882 |           "execution_count": 15
 883 |         }
 884 |       ]
 885 |     },
 886 |     {
 887 |       "cell_type": "markdown",
 888 |       "metadata": {
 889 |         "id": "PPkusLncOu3Y",
 890 |         "colab_type": "text"
 891 |       },
 892 |       "source": [
 893 |         "#### Split into train and test data"
 894 |       ]
 895 |     },
 896 |     {
 897 |       "cell_type": "code",
 898 |       "metadata": {
 899 |         "id": "2-tF8n50Ou3Y",
 900 |         "colab_type": "code",
 901 |         "colab": {}
 902 |       },
 903 |       "source": [
 904 |         "from sklearn.model_selection import train_test_split\n",
 905 |         "train_data, test_data = train_test_split(df, test_size=0.2, stratify=df.userId, random_state=42)  # stratify on userId, fixed seed so the 80/20 split is reproducible across runs"
 906 |       ],
 907 |       "execution_count": null,
 908 |       "outputs": []
 909 |     },
 910 |     {
 911 |       "cell_type": "code",
 912 |       "metadata": {
 913 |         "id": "vFCvxS3yOu3a",
 914 |         "colab_type": "code",
 915 |         "colab": {},
 916 |         "outputId": "b6cfc121-411d-49e8-c807-726cfebdcac7"
 917 |       },
 918 |       "source": [
 919 |         "train_data = train_data.sort_values(['userId', 'movieId'])\n",
 920 |         "train_data.head()"
 921 |       ],
 922 |       "execution_count": null,
 923 |       "outputs": [
 924 |         {
 925 |           "output_type": "execute_result",
 926 |           "data": {
 927 |             "text/html": [
 928 |               "<div>\n",
 929 |               "<style scoped>\n",
 930 |               "    .dataframe tbody tr th:only-of-type {\n",
 931 |               "        vertical-align: middle;\n",
 932 |               "    }\n",
 933 |               "\n",
 934 |               "    .dataframe tbody tr th {\n",
 935 |               "        vertical-align: top;\n",
 936 |               "    }\n",
 937 |               "\n",
 938 |               "    .dataframe thead th {\n",
 939 |               "        text-align: right;\n",
 940 |               "    }\n",
 941 |               "</style>\n",
 942 |               "<table border=\"1\" class=\"dataframe\">\n",
 943 |               "  <thead>\n",
 944 |               "    <tr style=\"text-align: right;\">\n",
 945 |               "      <th></th>\n",
 946 |               "      <th>userId</th>\n",
 947 |               "      <th>movieId</th>\n",
 948 |               "      <th>rating</th>\n",
 949 |               "      <th>timestamp</th>\n",
 950 |               "      <th>genres</th>\n",
 951 |               "      <th>tag</th>\n",
 952 |               "    </tr>\n",
 953 |               "  </thead>\n",
 954 |               "  <tbody>\n",
 955 |               "    <tr>\n",
 956 |               "      <th>0</th>\n",
 957 |               "      <td>1</td>\n",
 958 |               "      <td>1</td>\n",
 959 |               "      <td>4.0</td>\n",
 960 |               "      <td>964982703</td>\n",
 961 |               "      <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
 962 |               "      <td>[]</td>\n",
 963 |               "    </tr>\n",
 964 |               "    <tr>\n",
 965 |               "      <th>1</th>\n",
 966 |               "      <td>1</td>\n",
 967 |               "      <td>3</td>\n",
 968 |               "      <td>4.0</td>\n",
 969 |               "      <td>964981247</td>\n",
 970 |               "      <td>[Comedy, Romance]</td>\n",
 971 |               "      <td>[]</td>\n",
 972 |               "    </tr>\n",
 973 |               "    <tr>\n",
 974 |               "      <th>2</th>\n",
 975 |               "      <td>1</td>\n",
 976 |               "      <td>6</td>\n",
 977 |               "      <td>4.0</td>\n",
 978 |               "      <td>964982224</td>\n",
 979 |               "      <td>[Action, Crime, Thriller]</td>\n",
 980 |               "      <td>[]</td>\n",
 981 |               "    </tr>\n",
 982 |               "    <tr>\n",
 983 |               "      <th>3</th>\n",
 984 |               "      <td>1</td>\n",
 985 |               "      <td>47</td>\n",
 986 |               "      <td>5.0</td>\n",
 987 |               "      <td>964983815</td>\n",
 988 |               "      <td>[Mystery, Thriller]</td>\n",
 989 |               "      <td>[]</td>\n",
 990 |               "    </tr>\n",
 991 |               "    <tr>\n",
 992 |               "      <th>5</th>\n",
 993 |               "      <td>1</td>\n",
 994 |               "      <td>70</td>\n",
 995 |               "      <td>3.0</td>\n",
 996 |               "      <td>964982400</td>\n",
 997 |               "      <td>[Action, Comedy, Horror, Thriller]</td>\n",
 998 |               "      <td>[]</td>\n",
 999 |               "    </tr>\n",
1000 |               "  </tbody>\n",
1001 |               "</table>\n",
1002 |               "</div>"
1003 |             ],
1004 |             "text/plain": [
1005 |               "   userId  movieId  rating  timestamp  \\\n",
1006 |               "0       1        1     4.0  964982703   \n",
1007 |               "1       1        3     4.0  964981247   \n",
1008 |               "2       1        6     4.0  964982224   \n",
1009 |               "3       1       47     5.0  964983815   \n",
1010 |               "5       1       70     3.0  964982400   \n",
1011 |               "\n",
1012 |               "                                              genres tag  \n",
1013 |               "0  [Adventure, Animation, Children, Comedy, Fantasy]  []  \n",
1014 |               "1                                  [Comedy, Romance]  []  \n",
1015 |               "2                          [Action, Crime, Thriller]  []  \n",
1016 |               "3                                [Mystery, Thriller]  []  \n",
1017 |               "5                 [Action, Comedy, Horror, Thriller]  []  "
1018 |             ]
1019 |           },
1020 |           "metadata": {
1021 |             "tags": []
1022 |           },
1023 |           "execution_count": 17
1024 |         }
1025 |       ]
1026 |     },
1027 |     {
1028 |       "cell_type": "code",
1029 |       "metadata": {
1030 |         "scrolled": true,
1031 |         "id": "ojrEaFoMOu3c",
1032 |         "colab_type": "code",
1033 |         "colab": {},
1034 |         "outputId": "b617ba3b-9a8b-44f3-b41f-22109097a9f0"
1035 |       },
1036 |       "source": [
1037 |         "test_data = test_data.sort_values(['userId','movieId'])\n",
1038 |         "test_data.head()"
1039 |       ],
1040 |       "execution_count": null,
1041 |       "outputs": [
1042 |         {
1043 |           "output_type": "execute_result",
1044 |           "data": {
1045 |             "text/html": [
1046 |               "<div>\n",
1047 |               "<style scoped>\n",
1048 |               "    .dataframe tbody tr th:only-of-type {\n",
1049 |               "        vertical-align: middle;\n",
1050 |               "    }\n",
1051 |               "\n",
1052 |               "    .dataframe tbody tr th {\n",
1053 |               "        vertical-align: top;\n",
1054 |               "    }\n",
1055 |               "\n",
1056 |               "    .dataframe thead th {\n",
1057 |               "        text-align: right;\n",
1058 |               "    }\n",
1059 |               "</style>\n",
1060 |               "<table border=\"1\" class=\"dataframe\">\n",
1061 |               "  <thead>\n",
1062 |               "    <tr style=\"text-align: right;\">\n",
1063 |               "      <th></th>\n",
1064 |               "      <th>userId</th>\n",
1065 |               "      <th>movieId</th>\n",
1066 |               "      <th>rating</th>\n",
1067 |               "      <th>timestamp</th>\n",
1068 |               "      <th>genres</th>\n",
1069 |               "      <th>tag</th>\n",
1070 |               "    </tr>\n",
1071 |               "  </thead>\n",
1072 |               "  <tbody>\n",
1073 |               "    <tr>\n",
1074 |               "      <th>4</th>\n",
1075 |               "      <td>1</td>\n",
1076 |               "      <td>50</td>\n",
1077 |               "      <td>5.0</td>\n",
1078 |               "      <td>964982931</td>\n",
1079 |               "      <td>[Crime, Mystery, Thriller]</td>\n",
1080 |               "      <td>[]</td>\n",
1081 |               "    </tr>\n",
1082 |               "    <tr>\n",
1083 |               "      <th>6</th>\n",
1084 |               "      <td>1</td>\n",
1085 |               "      <td>101</td>\n",
1086 |               "      <td>5.0</td>\n",
1087 |               "      <td>964980868</td>\n",
1088 |               "      <td>[Adventure, Comedy, Crime, Romance]</td>\n",
1089 |               "      <td>[]</td>\n",
1090 |               "    </tr>\n",
1091 |               "    <tr>\n",
1092 |               "      <th>11</th>\n",
1093 |               "      <td>1</td>\n",
1094 |               "      <td>216</td>\n",
1095 |               "      <td>5.0</td>\n",
1096 |               "      <td>964981208</td>\n",
1097 |               "      <td>[Comedy]</td>\n",
1098 |               "      <td>[]</td>\n",
1099 |               "    </tr>\n",
1100 |               "    <tr>\n",
1101 |               "      <th>16</th>\n",
1102 |               "      <td>1</td>\n",
1103 |               "      <td>296</td>\n",
1104 |               "      <td>3.0</td>\n",
1105 |               "      <td>964982967</td>\n",
1106 |               "      <td>[Comedy, Crime, Drama, Thriller]</td>\n",
1107 |               "      <td>[]</td>\n",
1108 |               "    </tr>\n",
1109 |               "    <tr>\n",
1110 |               "      <th>17</th>\n",
1111 |               "      <td>1</td>\n",
1112 |               "      <td>316</td>\n",
1113 |               "      <td>3.0</td>\n",
1114 |               "      <td>964982310</td>\n",
1115 |               "      <td>[Action, Adventure, Sci-Fi]</td>\n",
1116 |               "      <td>[]</td>\n",
1117 |               "    </tr>\n",
1118 |               "  </tbody>\n",
1119 |               "</table>\n",
1120 |               "</div>"
1121 |             ],
1122 |             "text/plain": [
1123 |               "    userId  movieId  rating  timestamp                               genres  \\\n",
1124 |               "4        1       50     5.0  964982931           [Crime, Mystery, Thriller]   \n",
1125 |               "6        1      101     5.0  964980868  [Adventure, Comedy, Crime, Romance]   \n",
1126 |               "11       1      216     5.0  964981208                             [Comedy]   \n",
1127 |               "16       1      296     3.0  964982967     [Comedy, Crime, Drama, Thriller]   \n",
1128 |               "17       1      316     3.0  964982310          [Action, Adventure, Sci-Fi]   \n",
1129 |               "\n",
1130 |               "   tag  \n",
1131 |               "4   []  \n",
1132 |               "6   []  \n",
1133 |               "11  []  \n",
1134 |               "16  []  \n",
1135 |               "17  []  "
1136 |             ]
1137 |           },
1138 |           "metadata": {
1139 |             "tags": []
1140 |           },
1141 |           "execution_count": 18
1142 |         }
1143 |       ]
1144 |     },
1145 |     {
1146 |       "cell_type": "markdown",
1147 |       "metadata": {
1148 |         "id": "zlFDgR0COu3f",
1149 |         "colab_type": "text"
1150 |       },
1151 |       "source": [
1152 |         "#### Save the dataframes as csv files"
1153 |       ]
1154 |     },
1155 |     {
1156 |       "cell_type": "code",
1157 |       "metadata": {
1158 |         "id": "U8ssEJZ0Ou3f",
1159 |         "colab_type": "code",
1160 |         "colab": {}
1161 |       },
1162 |       "source": [
1163 |         "# train_data.to_csv('training_data.csv', index = False)\n",
1164 |         "# test_data.to_csv('testing_data.csv', index = False)"
1165 |       ],
1166 |       "execution_count": null,
1167 |       "outputs": []
1168 |     },
1169 |     {
1170 |       "cell_type": "markdown",
1171 |       "metadata": {
1172 |         "id": "dCl7m8u3Ou3h",
1173 |         "colab_type": "text"
1174 |       },
1175 |       "source": [
1176 |         "## Pre-process the movie data"
1177 |       ]
1178 |     },
1179 |     {
1180 |       "cell_type": "code",
1181 |       "metadata": {
1182 |         "scrolled": true,
1183 |         "id": "SNQrqxwwOu3h",
1184 |         "colab_type": "code",
1185 |         "colab": {},
1186 |         "outputId": "06ea051d-cab0-42be-d43e-36cf3da4d733"
1187 |       },
1188 |       "source": [
1189 |         "movies['genres'] = movies['genres'].str.split('|')\n",
1190 |         "movies['genres'] = movies['genres'].apply(lambda d: d if isinstance(d, list) else [])\n",
1191 |         "movies.head()\n",
1192 |         "# movies.to_csv('movies.csv', index = False)"
1193 |       ],
1194 |       "execution_count": null,
1195 |       "outputs": [
1196 |         {
1197 |           "output_type": "execute_result",
1198 |           "data": {
1199 |             "text/html": [
1200 |               "<div>\n",
1201 |               "<style scoped>\n",
1202 |               "    .dataframe tbody tr th:only-of-type {\n",
1203 |               "        vertical-align: middle;\n",
1204 |               "    }\n",
1205 |               "\n",
1206 |               "    .dataframe tbody tr th {\n",
1207 |               "        vertical-align: top;\n",
1208 |               "    }\n",
1209 |               "\n",
1210 |               "    .dataframe thead th {\n",
1211 |               "        text-align: right;\n",
1212 |               "    }\n",
1213 |               "</style>\n",
1214 |               "<table border=\"1\" class=\"dataframe\">\n",
1215 |               "  <thead>\n",
1216 |               "    <tr style=\"text-align: right;\">\n",
1217 |               "      <th></th>\n",
1218 |               "      <th>movieId</th>\n",
1219 |               "      <th>title</th>\n",
1220 |               "      <th>genres</th>\n",
1221 |               "    </tr>\n",
1222 |               "  </thead>\n",
1223 |               "  <tbody>\n",
1224 |               "    <tr>\n",
1225 |               "      <th>0</th>\n",
1226 |               "      <td>1</td>\n",
1227 |               "      <td>Toy Story (1995)</td>\n",
1228 |               "      <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
1229 |               "    </tr>\n",
1230 |               "    <tr>\n",
1231 |               "      <th>1</th>\n",
1232 |               "      <td>2</td>\n",
1233 |               "      <td>Jumanji (1995)</td>\n",
1234 |               "      <td>[Adventure, Children, Fantasy]</td>\n",
1235 |               "    </tr>\n",
1236 |               "    <tr>\n",
1237 |               "      <th>2</th>\n",
1238 |               "      <td>3</td>\n",
1239 |               "      <td>Grumpier Old Men (1995)</td>\n",
1240 |               "      <td>[Comedy, Romance]</td>\n",
1241 |               "    </tr>\n",
1242 |               "    <tr>\n",
1243 |               "      <th>3</th>\n",
1244 |               "      <td>4</td>\n",
1245 |               "      <td>Waiting to Exhale (1995)</td>\n",
1246 |               "      <td>[Comedy, Drama, Romance]</td>\n",
1247 |               "    </tr>\n",
1248 |               "    <tr>\n",
1249 |               "      <th>4</th>\n",
1250 |               "      <td>5</td>\n",
1251 |               "      <td>Father of the Bride Part II (1995)</td>\n",
1252 |               "      <td>[Comedy]</td>\n",
1253 |               "    </tr>\n",
1254 |               "  </tbody>\n",
1255 |               "</table>\n",
1256 |               "</div>"
1257 |             ],
1258 |             "text/plain": [
1259 |               "   movieId                               title  \\\n",
1260 |               "0        1                    Toy Story (1995)   \n",
1261 |               "1        2                      Jumanji (1995)   \n",
1262 |               "2        3             Grumpier Old Men (1995)   \n",
1263 |               "3        4            Waiting to Exhale (1995)   \n",
1264 |               "4        5  Father of the Bride Part II (1995)   \n",
1265 |               "\n",
1266 |               "                                              genres  \n",
1267 |               "0  [Adventure, Animation, Children, Comedy, Fantasy]  \n",
1268 |               "1                     [Adventure, Children, Fantasy]  \n",
1269 |               "2                                  [Comedy, Romance]  \n",
1270 |               "3                           [Comedy, Drama, Romance]  \n",
1271 |               "4                                           [Comedy]  "
1272 |             ]
1273 |           },
1274 |           "metadata": {
1275 |             "tags": []
1276 |           },
1277 |           "execution_count": 20
1278 |         }
1279 |       ]
1280 |     },
1281 |     {
1282 |       "cell_type": "code",
1283 |       "metadata": {
1284 |         "id": "tUOMY7C9Ou3j",
1285 |         "colab_type": "code",
1286 |         "colab": {}
1287 |       },
1288 |       "source": [
1289 |         ""
1290 |       ],
1291 |       "execution_count": null,
1292 |       "outputs": []
1293 |     }
1294 |   ]
1295 | }


--------------------------------------------------------------------------------