├── .idea
│   ├── Reccomender-Systems-Using-Python.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── 01 - Evaluating Recommender Systems
│   ├── .ipynb_checkpoints
│   │   ├── MovieLens-checkpoint.py
│   │   └── Test Evaluation Metrics-checkpoint.ipynb
│   ├── MovieLens.py
│   ├── RecommenderMetrics.py
│   ├── Test Evaluation Metrics.ipynb
│   └── __pycache__
│       ├── MovieLens.cpython-38.pyc
│       └── RecommenderMetrics.cpython-38.pyc
├── 02 - Recommender Engine Framework
│   ├── .ipynb_checkpoints
│   │   ├── EvaluatedAlgorithm-checkpoint.py
│   │   ├── EvaluationData-checkpoint.py
│   │   ├── Evaluator-checkpoint.py
│   │   ├── MovieLens-checkpoint.py
│   │   ├── RecSys Framework Notebook-checkpoint.ipynb
│   │   └── RecommenderMetrics-checkpoint.py
│   ├── EvaluatedAlgorithm.py
│   ├── EvaluationData.py
│   ├── Evaluator.py
│   ├── MovieLens.py
│   ├── RecSys Framework Notebook.ipynb
│   ├── RecommenderMetrics.py
│   └── __pycache__
│       ├── EvaluatedAlgorithm.cpython-38.pyc
│       ├── EvaluationData.cpython-38.pyc
│       ├── Evaluator.cpython-38.pyc
│       ├── MovieLens.cpython-38.pyc
│       └── RecommenderMetrics.cpython-38.pyc
├── 03 - Content Based Recommendation
│   ├── .ipynb_checkpoints
│   │   ├── Content Based Recommendation with MisEnScene-checkpoint.ipynb
│   │   ├── Content Based Recommendation-checkpoint.ipynb
│   │   ├── ContentKNNAlgorithm-checkpoint.py
│   │   ├── ContentKNNwithMisEnScene-checkpoint.py
│   │   ├── ContentRecs-checkpoint.py
│   │   ├── EvaluationData-checkpoint.py
│   │   ├── Evaluator-checkpoint.py
│   │   ├── MovieLens-checkpoint.py
│   │   └── RecommenderMetrics-checkpoint.py
│   ├── Content Based Recommendation with MisEnScene.ipynb
│   ├── Content Based Recommendation.ipynb
│   ├── ContentKNNAlgorithm.py
│   ├── ContentKNNwithMisEnScene.py
│   ├── ContentRecs.py
│   ├── EvaluatedAlgorithm.py
│   ├── EvaluationData.py
│   ├── Evaluator.py
│   ├── LLVisualFeatures13K_Log.csv
│   ├── MovieLens.py
│   ├── RecommenderMetrics.py
│   └── __pycache__
│       ├── ContentKNNAlgorithm.cpython-38.pyc
│       ├── ContentKNNwithMisEnScene.cpython-38.pyc
│       ├── EvaluatedAlgorithm.cpython-38.pyc
│       ├── EvaluationData.cpython-38.pyc
│       ├── Evaluator.cpython-38.pyc
│       ├── MovieLens.cpython-38.pyc
│       └── RecommenderMetrics.cpython-38.pyc
├── 04 - Neighborhood Based Collaborative Filtering
│   ├── .ipynb_checkpoints
│   │   ├── EvaluateUserCF-checkpoint.py
│   │   ├── EvaluatedAlgorithm-checkpoint.py
│   │   ├── EvaluationData-checkpoint.py
│   │   ├── Evaluator-checkpoint.py
│   │   ├── Item-Based Collaborative Filtering-checkpoint.ipynb
│   │   ├── KNNBakeOff-checkpoint.py
│   │   ├── MovieLens-checkpoint.py
│   │   ├── RecommenderMetrics-checkpoint.py
│   │   ├── SimpleItemCF-checkpoint.py
│   │   └── User-Based Collaborative Filtering-checkpoint.ipynb
│   ├── EvaluateUserCF.py
│   ├── EvaluatedAlgorithm.py
│   ├── EvaluationData.py
│   ├── Evaluator.py
│   ├── Item-Based Collaborative Filtering.ipynb
│   ├── KNNBakeOff.py
│   ├── MovieLens.py
│   ├── RecommenderMetrics.py
│   ├── SimpleItemCF.py
│   ├── User-Based Collaborative Filtering.ipynb
│   └── __pycache__
│       └── MovieLens.cpython-38.pyc
├── 05 - Matrix Factorization Methods
│   ├── .ipynb_checkpoints
│   │   ├── EvaluatedAlgorithm-checkpoint.py
│   │   ├── EvaluationData-checkpoint.py
│   │   ├── Evaluator-checkpoint.py
│   │   ├── MovieLens-checkpoint.py
│   │   ├── RecommenderMetrics-checkpoint.py
│   │   ├── SVD Matrix Factorization-checkpoint.ipynb
│   │   ├── SVDBakeOff-checkpoint.py
│   │   └── SVDTuning-checkpoint.py
│   ├── EvaluatedAlgorithm.py
│   ├── EvaluationData.py
│   ├── Evaluator.py
│   ├── MovieLens.py
│   ├── RecommenderMetrics.py
│   ├── SVD Matrix Factorization.ipynb
│   └── __pycache__
│       ├── EvaluatedAlgorithm.cpython-38.pyc
│       ├── EvaluationData.cpython-38.pyc
│       ├── Evaluator.cpython-38.pyc
│       ├── MovieLens.cpython-38.pyc
│       └── RecommenderMetrics.cpython-38.pyc
├── 06 - Deep Learning for Recommender Systems
│   ├── .ipynb_checkpoints
│   │   ├── AutoRec-checkpoint.py
│   │   ├── AutoRecAlgorithm-checkpoint.py
│   │   ├── AutoRecBakeOff-checkpoint.py
│   │   ├── EvaluatedAlgorithm-checkpoint.py
│   │   ├── EvaluationData-checkpoint.py
│   │   ├── Evaluator-checkpoint.py
│   │   ├── MovieLens-checkpoint.py
│   │   ├── RBM-checkpoint.py
│   │   ├── RBMAlgorithm-checkpoint.py
│   │   ├── RBMBakeOff-checkpoint.py
│   │   ├── RBMTuning-checkpoint.py
│   │   ├── Recommendations using Restricted Boltzmann Machine(RBM)-checkpoint.ipynb
│   │   ├── Recommendations with Deep Neural Networks-checkpoint.ipynb
│   │   └── RecommenderMetrics-checkpoint.py
│   ├── AutoRec.py
│   ├── AutoRecAlgorithm.py
│   ├── AutoRecBakeOff.py
│   ├── EvaluatedAlgorithm.py
│   ├── EvaluationData.py
│   ├── Evaluator.py
│   ├── MovieLens.py
│   ├── RBM.py
│   ├── RBMAlgorithm.py
│   ├── RBMBakeOff.py
│   ├── RBMTuning.py
│   ├── Recommendations using Restricted Boltzmann Machine(RBM).ipynb
│   ├── Recommendations with Deep Neural Networks.ipynb
│   ├── RecommenderMetrics.py
│   └── __pycache__
│       ├── AutoRec.cpython-38.pyc
│       ├── AutoRecAlgorithm.cpython-38.pyc
│       ├── EvaluatedAlgorithm.cpython-38.pyc
│       ├── EvaluationData.cpython-38.pyc
│       ├── Evaluator.cpython-38.pyc
│       ├── MovieLens.cpython-38.pyc
│       ├── RBM.cpython-38.pyc
│       ├── RBMAlgorithm.cpython-38.pyc
│       └── RecommenderMetrics.cpython-38.pyc
├── README.md
└── ml-latest-small
    ├── README.txt
    ├── links.csv
    ├── movies.csv
    ├── ratings.csv
    └── tags.csv
/01 - Evaluating Recommender Systems/__pycache__/MovieLens.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/01 - Evaluating Recommender Systems/__pycache__/MovieLens.cpython-38.pyc
--------------------------------------------------------------------------------
/01 - Evaluating Recommender Systems/__pycache__/RecommenderMetrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/01 - Evaluating Recommender Systems/__pycache__/RecommenderMetrics.cpython-38.pyc
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/.ipynb_checkpoints/EvaluatedAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 3 10:45:33 2018
4 |
5 | @author: Frank
6 | """
7 | from RecommenderMetrics import RecommenderMetrics
8 | from EvaluationData import EvaluationData
9 |
10 | class EvaluatedAlgorithm:
11 |
12 | def __init__(self, algorithm, name):
13 | self.algorithm = algorithm
14 | self.name = name
15 |
16 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
17 | metrics = {}
18 | # Compute accuracy
19 | if (verbose):
20 | print("Evaluating accuracy...")
21 | self.algorithm.fit(evaluationData.GetTrainSet())
22 | predictions = self.algorithm.test(evaluationData.GetTestSet())
23 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
24 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
25 |
26 | if (doTopN):
27 | # Evaluate top-10 with Leave One Out testing
28 | if (verbose):
29 | print("Evaluating top-N with leave-one-out...")
30 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
31 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
32 | # Build predictions for all ratings not in the training set
33 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
34 | # Compute top 10 recs for each user
35 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
36 | if (verbose):
37 | print("Computing hit-rate and rank metrics...")
38 | # See how often we recommended a movie the user actually rated
39 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
40 | # See how often we recommended a movie the user actually liked
41 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
42 | # Compute ARHR
43 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
44 |
45 | #Evaluate properties of recommendations on full training set
46 | if (verbose):
47 | print("Computing recommendations with full data set...")
48 | self.algorithm.fit(evaluationData.GetFullTrainSet())
49 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
50 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
51 | if (verbose):
52 | print("Analyzing coverage, diversity, and novelty...")
53 | # Print user coverage with a minimum predicted rating of 4.0:
54 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
55 | evaluationData.GetFullTrainSet().n_users,
56 | ratingThreshold=4.0)
57 | # Measure diversity of recommendations:
58 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
59 |
60 | # Measure novelty (average popularity rank of recommendations):
61 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
62 | evaluationData.GetPopularityRankings())
63 |
64 | if (verbose):
65 | print("Analysis complete.")
66 |
67 | return metrics
68 |
69 | def GetName(self):
70 | return self.name
71 |
72 | def GetAlgorithm(self):
73 | return self.algorithm
74 |
75 |
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/.ipynb_checkpoints/EvaluationData-checkpoint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 3 10:48:02 2018
4 |
5 | @author: Frank
6 | """
7 | from surprise.model_selection import train_test_split
8 | from surprise.model_selection import LeaveOneOut
9 | from surprise import KNNBaseline
10 |
11 | class EvaluationData:
12 |
13 | def __init__(self, data, popularityRankings):
14 |
15 | self.rankings = popularityRankings
16 |
17 | #Build a full training set for evaluating overall properties
18 | self.fullTrainSet = data.build_full_trainset()
19 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
20 |
21 | #Build a 75/25 train/test split for measuring accuracy
22 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
23 |
24 | #Build a "leave one out" train/test split for evaluating top-N recommenders
25 | #And build an anti-test-set for building predictions
26 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
27 | for train, test in LOOCV.split(data):
28 | self.LOOCVTrain = train
29 | self.LOOCVTest = test
30 |
31 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
32 |
33 |         #Compute similarity matrix between items so we can measure diversity
34 | sim_options = {'name': 'cosine', 'user_based': False}
35 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
36 | self.simsAlgo.fit(self.fullTrainSet)
37 |
38 | def GetFullTrainSet(self):
39 | return self.fullTrainSet
40 |
41 | def GetFullAntiTestSet(self):
42 | return self.fullAntiTestSet
43 |
44 | def GetAntiTestSetForUser(self, testSubject):
45 | trainset = self.fullTrainSet
46 | fill = trainset.global_mean
47 | anti_testset = []
48 | u = trainset.to_inner_uid(str(testSubject))
49 | user_items = set([j for (j, _) in trainset.ur[u]])
50 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
51 | i in trainset.all_items() if
52 | i not in user_items]
53 | return anti_testset
54 |
55 | def GetTrainSet(self):
56 | return self.trainSet
57 |
58 | def GetTestSet(self):
59 | return self.testSet
60 |
61 | def GetLOOCVTrainSet(self):
62 | return self.LOOCVTrain
63 |
64 | def GetLOOCVTestSet(self):
65 | return self.LOOCVTest
66 |
67 | def GetLOOCVAntiTestSet(self):
68 | return self.LOOCVAntiTestSet
69 |
70 | def GetSimilarities(self):
71 | return self.simsAlgo
72 |
73 | def GetPopularityRankings(self):
74 | return self.rankings
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/.ipynb_checkpoints/Evaluator-checkpoint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 3 10:22:34 2018
4 |
5 | @author: Frank
6 | """
7 | from EvaluationData import EvaluationData
8 | from EvaluatedAlgorithm import EvaluatedAlgorithm
9 |
10 | class Evaluator:
11 |
12 | algorithms = []
13 |
14 | def __init__(self, dataset, rankings):
15 | ed = EvaluationData(dataset, rankings)
16 | self.dataset = ed
17 |
18 | def AddAlgorithm(self, algorithm, name):
19 | alg = EvaluatedAlgorithm(algorithm, name)
20 | self.algorithms.append(alg)
21 |
22 | def Evaluate(self, doTopN):
23 | results = {}
24 | for algorithm in self.algorithms:
25 | print("Evaluating ", algorithm.GetName(), "...")
26 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
27 |
28 | # Print results
29 | print("\n")
30 |
31 | if (doTopN):
32 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
33 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
36 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
37 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
38 | else:
39 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
40 | for (name, metrics) in results.items():
41 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
42 |
43 | print("\nLegend:\n")
44 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
45 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
46 | if (doTopN):
47 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
48 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
49 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
50 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
51 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
52 | print(" for a given user. Higher means more diverse.")
53 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
54 |
55 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
56 |
57 | for algo in self.algorithms:
58 | print("\nUsing recommender ", algo.GetName())
59 |
60 | print("\nBuilding recommendation model...")
61 | trainSet = self.dataset.GetFullTrainSet()
62 | algo.GetAlgorithm().fit(trainSet)
63 |
64 | print("Computing recommendations...")
65 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
66 |
67 | predictions = algo.GetAlgorithm().test(testSet)
68 |
69 | recommendations = []
70 |
71 | print ("\nWe recommend:")
72 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
73 | intMovieID = int(movieID)
74 | recommendations.append((intMovieID, estimatedRating))
75 |
76 | recommendations.sort(key=lambda x: x[1], reverse=True)
77 |
78 |             for ratings in recommendations[:k]:
79 | print(ml.getMovieName(ratings[0]), ratings[1])
80 |
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/EvaluatedAlgorithm.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 | # Print user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
70 |
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/EvaluationData.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
69 |
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/Evaluator.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
6 | algorithms = []
7 |
8 | def __init__(self, dataset, rankings):
9 | ed = EvaluationData(dataset, rankings)
10 | self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |             for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
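
A note on Evaluator.py above: `algorithms = []` is declared at class level, so every Evaluator instance shares the same list; a second Evaluator constructed in the same process would start with the first one's algorithms already added. This is harmless in the scripts here, which create only one Evaluator, but a minimal fix (not in the original source) is to make the list an instance attribute:

class Evaluator:

    def __init__(self, dataset, rankings):
        self.algorithms = []  # per-instance list; avoids state shared across instances
        self.dataset = EvaluationData(dataset, rankings)
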
/02 - Recommender Engine Framework/RecommenderMetrics.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 | from surprise import accuracy
4 | from collections import defaultdict
5 |
6 | class RecommenderMetrics:
7 |
8 | def MAE(predictions):
9 | return accuracy.mae(predictions, verbose=False)
10 |
11 | def RMSE(predictions):
12 | return accuracy.rmse(predictions, verbose=False)
13 |
14 | def GetTopN(predictions, n=10, minimumRating=4.0):
15 | topN = defaultdict(list)
16 |
17 |
18 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
19 | if (estimatedRating >= minimumRating):
20 | topN[int(userID)].append((int(movieID), estimatedRating))
21 |
22 | for userID, ratings in topN.items():
23 | ratings.sort(key=lambda x: x[1], reverse=True)
24 | topN[int(userID)] = ratings[:n]
25 |
26 | return topN
27 |
28 | def HitRate(topNPredicted, leftOutPredictions):
29 | hits = 0
30 | total = 0
31 |
32 | # For each left-out rating
33 | for leftOut in leftOutPredictions:
34 | userID = leftOut[0]
35 | leftOutMovieID = leftOut[1]
36 | # Is it in the predicted top 10 for this user?
37 | hit = False
38 | for movieID, predictedRating in topNPredicted[int(userID)]:
39 | if (int(leftOutMovieID) == int(movieID)):
40 | hit = True
41 | break
42 | if (hit) :
43 | hits += 1
44 |
45 | total += 1
46 |
47 |         # Compute overall hit rate
48 | return hits/total
49 |
50 | def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
51 | hits = 0
52 | total = 0
53 |
54 | # For each left-out rating
55 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
56 | # Only look at ability to recommend things the users actually liked...
57 | if (actualRating >= ratingCutoff):
58 | # Is it in the predicted top 10 for this user?
59 | hit = False
60 | for movieID, predictedRating in topNPredicted[int(userID)]:
61 | if (int(leftOutMovieID) == movieID):
62 | hit = True
63 | break
64 | if (hit) :
65 | hits += 1
66 |
67 | total += 1
68 |
69 |         # Compute overall hit rate
70 | return hits/total
71 |
72 | def RatingHitRate(topNPredicted, leftOutPredictions):
73 | hits = defaultdict(float)
74 | total = defaultdict(float)
75 |
76 | # For each left-out rating
77 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
78 | # Is it in the predicted top N for this user?
79 | hit = False
80 | for movieID, predictedRating in topNPredicted[int(userID)]:
81 | if (int(leftOutMovieID) == movieID):
82 | hit = True
83 | break
84 | if (hit) :
85 | hits[actualRating] += 1
86 |
87 | total[actualRating] += 1
88 |
89 |         # Compute hit rate for each rating value
90 | for rating in sorted(hits.keys()):
91 | print (rating, hits[rating] / total[rating])
92 |
93 | def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
94 | summation = 0
95 | total = 0
96 | # For each left-out rating
97 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
98 | # Is it in the predicted top N for this user?
99 | hitRank = 0
100 | rank = 0
101 | for movieID, predictedRating in topNPredicted[int(userID)]:
102 | rank = rank + 1
103 | if (int(leftOutMovieID) == movieID):
104 | hitRank = rank
105 | break
106 | if (hitRank > 0) :
107 | summation += 1.0 / hitRank
108 |
109 | total += 1
110 |
111 | return summation / total
112 |
113 | # What percentage of users have at least one "good" recommendation
114 | def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
115 | hits = 0
116 | for userID in topNPredicted.keys():
117 | hit = False
118 | for movieID, predictedRating in topNPredicted[userID]:
119 | if (predictedRating >= ratingThreshold):
120 | hit = True
121 | break
122 | if (hit):
123 | hits += 1
124 |
125 | return hits / numUsers
126 |
127 | def Diversity(topNPredicted, simsAlgo):
128 | n = 0
129 | total = 0
130 | simsMatrix = simsAlgo.compute_similarities()
131 | for userID in topNPredicted.keys():
132 | pairs = itertools.combinations(topNPredicted[userID], 2)
133 | for pair in pairs:
134 | movie1 = pair[0][0]
135 | movie2 = pair[1][0]
136 | innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
137 | innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
138 | similarity = simsMatrix[innerID1][innerID2]
139 | total += similarity
140 | n += 1
141 |
142 | S = total / n
143 | return (1-S)
144 |
145 | def Novelty(topNPredicted, rankings):
146 | n = 0
147 | total = 0
148 | for userID in topNPredicted.keys():
149 | for rating in topNPredicted[userID]:
150 | movieID = rating[0]
151 | rank = rankings[movieID]
152 | total += rank
153 | n += 1
154 | return total / n
155 |
--------------------------------------------------------------------------------
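
A minimal sketch of exercising RecommenderMetrics directly, outside the Evaluator framework. It assumes scikit-surprise is installed and that the script is run from inside a chapter folder (e.g. 02 - Recommender Engine Framework) so that RecommenderMetrics is importable and the repo-root ml-latest-small data resolves; it is not part of the original source:

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

from RecommenderMetrics import RecommenderMetrics

# Load the bundled MovieLens ratings into a Surprise dataset
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('../ml-latest-small/ratings.csv', reader=reader)

# 75/25 split, matching the convention used in EvaluationData
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)

algo = SVD(random_state=10)
algo.fit(trainSet)
predictions = algo.test(testSet)

print("RMSE:", RecommenderMetrics.RMSE(predictions))
print("MAE: ", RecommenderMetrics.MAE(predictions))

# Top-N lists per user, keeping only items whose predicted rating is >= 4.0
topN = RecommenderMetrics.GetTopN(predictions, n=10)
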
/02 - Recommender Engine Framework/__pycache__/EvaluatedAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/02 - Recommender Engine Framework/__pycache__/EvaluatedAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/__pycache__/EvaluationData.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/02 - Recommender Engine Framework/__pycache__/EvaluationData.cpython-38.pyc
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/__pycache__/Evaluator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/02 - Recommender Engine Framework/__pycache__/Evaluator.cpython-38.pyc
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/__pycache__/MovieLens.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/02 - Recommender Engine Framework/__pycache__/MovieLens.cpython-38.pyc
--------------------------------------------------------------------------------
/02 - Recommender Engine Framework/__pycache__/RecommenderMetrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/02 - Recommender Engine Framework/__pycache__/RecommenderMetrics.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/Content Based Recommendation with MisEnScene-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Content Based Movie Recommendation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Importing Dependencies"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from MovieLens import MovieLens\n",
24 | "from ContentKNNAlgorithm import ContentKNNAlgorithm\n",
25 | "from Evaluator import Evaluator\n",
26 | "from surprise import NormalPredictor\n",
27 | "\n",
28 | "import random\n",
29 | "import numpy as np"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Loading Data"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "def LoadMovieLensData():\n",
46 | " ml = MovieLens()\n",
47 | " print(\"Loading movie ratings...\")\n",
48 | " data = ml.loadMovieLensLatestSmall()\n",
49 | " print(\"\\nComputing movie popularity ranks so we can measure novelty later...\")\n",
50 | " rankings = ml.getPopularityRanks()\n",
51 | " return (ml, data, rankings)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "np.random.seed(0)\n",
61 | "random.seed(0)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "Loading movie ratings...\n",
74 | "\n",
75 | "Computing movie popularity ranks so we can measure novelty later...\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "# Load up common data set for the recommender algorithms\n",
81 | "(ml, evaluationData, rankings) = LoadMovieLensData()"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Instantiating Evaluator"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 5,
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "Estimating biases using als...\n",
101 | "Computing the cosine similarity matrix...\n",
102 | "Done computing similarity matrix.\n"
103 | ]
104 | }
105 | ],
106 | "source": [
107 | "# Construct an Evaluator to, you know, evaluate them\n",
108 | "evaluator = Evaluator(evaluationData, rankings)"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "## Bulding Recommender Algorithms"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "### Adding a Content KNN Algorithm"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 6,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "contentKNN = ContentKNNAlgorithm()\n",
132 | "evaluator.AddAlgorithm(contentKNN, \"ContentKNN\")"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "### Adding a Random Recommendation Algorithm"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 7,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# Just make random recommendations\n",
149 | "Random = NormalPredictor()\n",
150 | "evaluator.AddAlgorithm(Random, \"Random\")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Evaluate"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "Evaluating ContentKNN ...\n",
170 | "Evaluating accuracy...\n",
171 | "Computing content-based similarity matrix...\n",
172 | "0 of 8211\n",
173 | "1000 of 8211\n",
174 | "2000 of 8211\n"
175 | ]
176 | }
177 | ],
178 | "source": [
179 | "evaluator.Evaluate(False)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": []
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.8.2"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 4
211 | }
212 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/Content Based Recommendation-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/ContentKNNAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | from MovieLens import MovieLens
4 | import math
5 | import numpy as np
6 | import heapq
7 |
8 | class ContentKNNAlgorithm(AlgoBase):
9 |
10 | def __init__(self, k=40, sim_options={}):
11 | AlgoBase.__init__(self)
12 | self.k = k
13 |
14 | def fit(self, trainset):
15 | AlgoBase.fit(self, trainset)
16 |
17 | # Compute item similarity matrix based on content attributes
18 |
19 | # Load up genre vectors for every movie
20 | ml = MovieLens()
21 | genres = ml.getGenres()
22 | years = ml.getYears()
23 | mes = ml.getMiseEnScene()
24 |
25 | print("Computing content-based similarity matrix...")
26 |
27 |         # Compute genre distance for every movie combination as an n_items x n_items matrix
28 | self.similarities = np.zeros((self.trainset.n_items, self.trainset.n_items))
29 |
30 | for thisRating in range(self.trainset.n_items):
31 | if (thisRating % 1000 == 0):
32 | print(thisRating, " of ", self.trainset.n_items)
33 | for otherRating in range(thisRating+1, self.trainset.n_items):
34 | thisMovieID = int(self.trainset.to_raw_iid(thisRating))
35 | otherMovieID = int(self.trainset.to_raw_iid(otherRating))
36 | genreSimilarity = self.computeGenreSimilarity(thisMovieID, otherMovieID, genres)
37 | yearSimilarity = self.computeYearSimilarity(thisMovieID, otherMovieID, years)
38 | #mesSimilarity = self.computeMiseEnSceneSimilarity(thisMovieID, otherMovieID, mes)
39 | self.similarities[thisRating, otherRating] = genreSimilarity * yearSimilarity
40 | self.similarities[otherRating, thisRating] = self.similarities[thisRating, otherRating]
41 |
42 | print("...done.")
43 |
44 | return self
45 |
46 | def computeGenreSimilarity(self, movie1, movie2, genres):
47 | genres1 = genres[movie1]
48 | genres2 = genres[movie2]
49 | sumxx, sumxy, sumyy = 0, 0, 0
50 | for i in range(len(genres1)):
51 | x = genres1[i]
52 | y = genres2[i]
53 | sumxx += x * x
54 | sumyy += y * y
55 | sumxy += x * y
56 |
57 | return sumxy/math.sqrt(sumxx*sumyy)
58 |
59 | def computeYearSimilarity(self, movie1, movie2, years):
60 | diff = abs(years[movie1] - years[movie2])
61 | sim = math.exp(-diff / 10.0)
62 | return sim
63 |
64 | def computeMiseEnSceneSimilarity(self, movie1, movie2, mes):
65 | mes1 = mes[movie1]
66 | mes2 = mes[movie2]
67 | if (mes1 and mes2):
68 | shotLengthDiff = math.fabs(mes1[0] - mes2[0])
69 | colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
70 | motionDiff = math.fabs(mes1[3] - mes2[3])
71 | lightingDiff = math.fabs(mes1[5] - mes2[5])
72 | numShotsDiff = math.fabs(mes1[6] - mes2[6])
73 | return shotLengthDiff * colorVarianceDiff * motionDiff * lightingDiff * numShotsDiff
74 | else:
75 | return 0
76 |
77 | def estimate(self, u, i):
78 |
79 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
80 |             raise PredictionImpossible('User and/or item is unknown.')
81 |
82 | # Build up similarity scores between this item and everything the user rated
83 | neighbors = []
84 | for rating in self.trainset.ur[u]:
85 | genreSimilarity = self.similarities[i,rating[0]]
86 | neighbors.append( (genreSimilarity, rating[1]) )
87 |
88 | # Extract the top-K most-similar ratings
89 | k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
90 |
91 | # Compute average sim score of K neighbors weighted by user ratings
92 | simTotal = weightedSum = 0
93 | for (simScore, rating) in k_neighbors:
94 | if (simScore > 0):
95 | simTotal += simScore
96 | weightedSum += simScore * rating
97 |
98 | if (simTotal == 0):
99 | raise PredictionImpossible('No neighbors')
100 |
101 | predictedRating = weightedSum / simTotal
102 |
103 | return predictedRating
104 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/ContentKNNwithMisEnScene-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | from MovieLens import MovieLens
4 | import math
5 | import numpy as np
6 | import heapq
7 |
8 | class ContentKNNwithMisEnScene(AlgoBase):
9 |
10 | def __init__(self, k=40, sim_options={}):
11 | AlgoBase.__init__(self)
12 | self.k = k
13 |
14 | def fit(self, trainset):
15 | AlgoBase.fit(self, trainset)
16 |
17 | # Compute item similarity matrix based on content attributes
18 |
19 | # Load up genre vectors for every movie
20 | ml = MovieLens()
21 | genres = ml.getGenres()
22 | years = ml.getYears()
23 | mes = ml.getMiseEnScene()
24 |
25 | print("Computing content-based similarity matrix...")
26 |
27 |         # Compute genre distance for every movie combination as an n_items x n_items matrix
28 | self.similarities = np.zeros((self.trainset.n_items, self.trainset.n_items))
29 |
30 | for thisRating in range(self.trainset.n_items):
31 | if (thisRating % 1000 == 0):
32 | print(thisRating, " of ", self.trainset.n_items)
33 | for otherRating in range(thisRating+1, self.trainset.n_items):
34 | thisMovieID = int(self.trainset.to_raw_iid(thisRating))
35 | otherMovieID = int(self.trainset.to_raw_iid(otherRating))
36 | genreSimilarity = self.computeGenreSimilarity(thisMovieID, otherMovieID, genres)
37 | yearSimilarity = self.computeYearSimilarity(thisMovieID, otherMovieID, years)
38 | mesSimilarity = self.computeMiseEnSceneSimilarity(thisMovieID, otherMovieID, mes)
39 | self.similarities[thisRating, otherRating] = genreSimilarity * yearSimilarity * mesSimilarity
40 | self.similarities[otherRating, thisRating] = self.similarities[thisRating, otherRating]
41 |
42 | print("...done.")
43 |
44 | return self
45 |
46 | def computeGenreSimilarity(self, movie1, movie2, genres):
47 | genres1 = genres[movie1]
48 | genres2 = genres[movie2]
49 | sumxx, sumxy, sumyy = 0, 0, 0
50 | for i in range(len(genres1)):
51 | x = genres1[i]
52 | y = genres2[i]
53 | sumxx += x * x
54 | sumyy += y * y
55 | sumxy += x * y
56 |
57 | return sumxy/math.sqrt(sumxx*sumyy)
58 |
59 | def computeYearSimilarity(self, movie1, movie2, years):
60 | diff = abs(years[movie1] - years[movie2])
61 | sim = math.exp(-diff / 10.0)
62 | return sim
63 |
64 | def computeMiseEnSceneSimilarity(self, movie1, movie2, mes):
65 | mes1 = mes[movie1]
66 | mes2 = mes[movie2]
67 | if (mes1 and mes2):
68 | shotLengthDiff = math.fabs(mes1[0] - mes2[0])
69 | colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
70 | motionDiff = math.fabs(mes1[3] - mes2[3])
71 | lightingDiff = math.fabs(mes1[5] - mes2[5])
72 | numShotsDiff = math.fabs(mes1[6] - mes2[6])
73 | return shotLengthDiff * colorVarianceDiff * motionDiff * lightingDiff * numShotsDiff
74 | else:
75 | return 0
76 |
77 | def estimate(self, u, i):
78 |
79 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
80 |             raise PredictionImpossible('User and/or item is unknown.')
81 |
82 | # Build up similarity scores between this item and everything the user rated
83 | neighbors = []
84 | for rating in self.trainset.ur[u]:
85 | genreSimilarity = self.similarities[i,rating[0]]
86 | neighbors.append( (genreSimilarity, rating[1]) )
87 |
88 | # Extract the top-K most-similar ratings
89 | k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
90 |
91 | # Compute average sim score of K neighbors weighted by user ratings
92 | simTotal = weightedSum = 0
93 | for (simScore, rating) in k_neighbors:
94 | if (simScore > 0):
95 | simTotal += simScore
96 | weightedSum += simScore * rating
97 |
98 | if (simTotal == 0):
99 | raise PredictionImpossible('No neighbors')
100 |
101 | predictedRating = weightedSum / simTotal
102 |
103 | return predictedRating
104 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/ContentRecs-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from ContentKNNAlgorithm import ContentKNNAlgorithm
3 | from Evaluator import Evaluator
4 | from surprise import NormalPredictor
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | contentKNN = ContentKNNAlgorithm()
27 | evaluator.AddAlgorithm(contentKNN, "ContentKNN")
28 |
29 | # Just make random recommendations
30 | Random = NormalPredictor()
31 | evaluator.AddAlgorithm(Random, "Random")
32 |
33 | evaluator.Evaluate(False)
34 |
35 | evaluator.SampleTopNRecs(ml)
36 |
37 |
38 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/EvaluationData-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/.ipynb_checkpoints/Evaluator-checkpoint.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
6 | algorithms = []
7 |
8 | def __init__(self, dataset, rankings):
9 | ed = EvaluationData(dataset, rankings)
10 | self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |             for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/ContentKNNAlgorithm.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | from MovieLens import MovieLens
4 | import math
5 | import numpy as np
6 | import heapq
7 |
8 | class ContentKNNAlgorithm(AlgoBase):
9 |
10 | def __init__(self, k=40, sim_options={}):
11 | AlgoBase.__init__(self)
12 | self.k = k
13 |
14 | def fit(self, trainset):
15 | AlgoBase.fit(self, trainset)
16 |
17 | # Compute item similarity matrix based on content attributes
18 |
19 | # Load up genre vectors for every movie
20 | ml = MovieLens()
21 | genres = ml.getGenres()
22 | years = ml.getYears()
23 | mes = ml.getMiseEnScene()
24 |
25 | print("Computing content-based similarity matrix...")
26 |
27 |         # Compute genre distance for every movie combination as an n_items x n_items matrix
28 | self.similarities = np.zeros((self.trainset.n_items, self.trainset.n_items))
29 |
30 | for thisRating in range(self.trainset.n_items):
31 | if (thisRating % 1000 == 0):
32 | print(thisRating, " of ", self.trainset.n_items)
33 | for otherRating in range(thisRating+1, self.trainset.n_items):
34 | thisMovieID = int(self.trainset.to_raw_iid(thisRating))
35 | otherMovieID = int(self.trainset.to_raw_iid(otherRating))
36 | genreSimilarity = self.computeGenreSimilarity(thisMovieID, otherMovieID, genres)
37 | yearSimilarity = self.computeYearSimilarity(thisMovieID, otherMovieID, years)
38 | #mesSimilarity = self.computeMiseEnSceneSimilarity(thisMovieID, otherMovieID, mes)
39 | self.similarities[thisRating, otherRating] = genreSimilarity * yearSimilarity
40 | self.similarities[otherRating, thisRating] = self.similarities[thisRating, otherRating]
41 |
42 | print("...done.")
43 |
44 | return self
45 |
46 | def computeGenreSimilarity(self, movie1, movie2, genres):
47 | genres1 = genres[movie1]
48 | genres2 = genres[movie2]
49 | sumxx, sumxy, sumyy = 0, 0, 0
50 | for i in range(len(genres1)):
51 | x = genres1[i]
52 | y = genres2[i]
53 | sumxx += x * x
54 | sumyy += y * y
55 | sumxy += x * y
56 |
57 | return sumxy/math.sqrt(sumxx*sumyy)
58 |
59 | def computeYearSimilarity(self, movie1, movie2, years):
60 | diff = abs(years[movie1] - years[movie2])
61 | sim = math.exp(-diff / 10.0)
62 | return sim
63 |
64 | def computeMiseEnSceneSimilarity(self, movie1, movie2, mes):
65 | mes1 = mes[movie1]
66 | mes2 = mes[movie2]
67 | if (mes1 and mes2):
68 | shotLengthDiff = math.fabs(mes1[0] - mes2[0])
69 | colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
70 | motionDiff = math.fabs(mes1[3] - mes2[3])
71 | lightingDiff = math.fabs(mes1[5] - mes2[5])
72 | numShotsDiff = math.fabs(mes1[6] - mes2[6])
73 | return shotLengthDiff * colorVarianceDiff * motionDiff * lightingDiff * numShotsDiff
74 | else:
75 | return 0
76 |
77 | def estimate(self, u, i):
78 |
79 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
80 |             raise PredictionImpossible('User and/or item is unknown.')
81 |
82 | # Build up similarity scores between this item and everything the user rated
83 | neighbors = []
84 | for rating in self.trainset.ur[u]:
85 | genreSimilarity = self.similarities[i,rating[0]]
86 | neighbors.append( (genreSimilarity, rating[1]) )
87 |
88 | # Extract the top-K most-similar ratings
89 | k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
90 |
91 | # Compute average sim score of K neighbors weighted by user ratings
92 | simTotal = weightedSum = 0
93 | for (simScore, rating) in k_neighbors:
94 | if (simScore > 0):
95 | simTotal += simScore
96 | weightedSum += simScore * rating
97 |
98 | if (simTotal == 0):
99 | raise PredictionImpossible('No neighbors')
100 |
101 | predictedRating = weightedSum / simTotal
102 |
103 | return predictedRating
104 |
--------------------------------------------------------------------------------
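A side note on the genre measure above: computeGenreSimilarity is a plain cosine between binary genre vectors. As a quick cross-check (not part of the repository code), the same computation can be written with NumPy; g1 and g2 below are hypothetical 0/1 genre vectors of the kind ml.getGenres() returns:

import numpy as np

# Hypothetical binary genre vectors (1 = movie carries that genre)
g1 = np.array([1, 0, 1, 0, 0])
g2 = np.array([1, 1, 0, 0, 0])

# Cosine similarity, equivalent to computeGenreSimilarity's explicit loop
cosine = g1.dot(g2) / np.sqrt(g1.dot(g1) * g2.dot(g2))
print(cosine)  # 0.5 for these two vectors
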
/03 - Content Based Recommendation/ContentKNNwithMisEnScene.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | from MovieLens import MovieLens
4 | import math
5 | import numpy as np
6 | import heapq
7 |
8 | class ContentKNNwithMisEnScene(AlgoBase):
9 |
10 |     def __init__(self, k=40, sim_options=None):  # sim_options is accepted but not used
11 | AlgoBase.__init__(self)
12 | self.k = k
13 |
14 | def fit(self, trainset):
15 | AlgoBase.fit(self, trainset)
16 |
17 | # Compute item similarity matrix based on content attributes
18 |
19 |         # Load up genre vectors, years, and mise-en-scene data for every movie
20 | ml = MovieLens()
21 | genres = ml.getGenres()
22 | years = ml.getYears()
23 | mes = ml.getMiseEnScene()
24 |
25 | print("Computing content-based similarity matrix...")
26 |
27 |         # Compute content similarity for every movie pair as an n_items x n_items matrix
28 | self.similarities = np.zeros((self.trainset.n_items, self.trainset.n_items))
29 |
30 | for thisRating in range(self.trainset.n_items):
31 | if (thisRating % 1000 == 0):
32 | print(thisRating, " of ", self.trainset.n_items)
33 | for otherRating in range(thisRating+1, self.trainset.n_items):
34 | thisMovieID = int(self.trainset.to_raw_iid(thisRating))
35 | otherMovieID = int(self.trainset.to_raw_iid(otherRating))
36 | genreSimilarity = self.computeGenreSimilarity(thisMovieID, otherMovieID, genres)
37 | yearSimilarity = self.computeYearSimilarity(thisMovieID, otherMovieID, years)
38 | mesSimilarity = self.computeMiseEnSceneSimilarity(thisMovieID, otherMovieID, mes)
39 | self.similarities[thisRating, otherRating] = genreSimilarity * yearSimilarity * mesSimilarity
40 | self.similarities[otherRating, thisRating] = self.similarities[thisRating, otherRating]
41 |
42 | print("...done.")
43 |
44 | return self
45 |
46 | def computeGenreSimilarity(self, movie1, movie2, genres):
47 | genres1 = genres[movie1]
48 | genres2 = genres[movie2]
49 | sumxx, sumxy, sumyy = 0, 0, 0
50 | for i in range(len(genres1)):
51 | x = genres1[i]
52 | y = genres2[i]
53 | sumxx += x * x
54 | sumyy += y * y
55 | sumxy += x * y
56 |
57 |         return sumxy / math.sqrt(sumxx * sumyy) if sumxx * sumyy else 0  # guard against empty genre vectors
58 |
59 | def computeYearSimilarity(self, movie1, movie2, years):
60 | diff = abs(years[movie1] - years[movie2])
61 | sim = math.exp(-diff / 10.0)
62 | return sim
63 |
64 | def computeMiseEnSceneSimilarity(self, movie1, movie2, mes):
65 | mes1 = mes[movie1]
66 | mes2 = mes[movie2]
67 | if (mes1 and mes2):
68 | shotLengthDiff = math.fabs(mes1[0] - mes2[0])
69 | colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
70 | motionDiff = math.fabs(mes1[3] - mes2[3])
71 | lightingDiff = math.fabs(mes1[5] - mes2[5])
72 | numShotsDiff = math.fabs(mes1[6] - mes2[6])
73 |             return shotLengthDiff * colorVarianceDiff * motionDiff * lightingDiff * numShotsDiff  # a product of differences, which grows as movies get less alike
74 | else:
75 | return 0
76 |
77 | def estimate(self, u, i):
78 |
79 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
80 |             raise PredictionImpossible('User and/or item is unknown.')
81 |
82 | # Build up similarity scores between this item and everything the user rated
83 | neighbors = []
84 | for rating in self.trainset.ur[u]:
85 |             similarity = self.similarities[i, rating[0]]
86 |             neighbors.append( (similarity, rating[1]) )
87 |
88 | # Extract the top-K most-similar ratings
89 | k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
90 |
91 |         # Compute the similarity-weighted average of the K neighbors' ratings
92 | simTotal = weightedSum = 0
93 | for (simScore, rating) in k_neighbors:
94 | if (simScore > 0):
95 | simTotal += simScore
96 | weightedSum += simScore * rating
97 |
98 | if (simTotal == 0):
99 | raise PredictionImpossible('No neighbors')
100 |
101 | predictedRating = weightedSum / simTotal
102 |
103 | return predictedRating
104 |
--------------------------------------------------------------------------------
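The year term in both classes is an exponential decay with a 10-year scale, so movies released the same year score 1.0 and the score falls off smoothly with distance. A few spot checks, easy to reproduce with math.exp:

import math

# Same formula as computeYearSimilarity: exp(-diff / 10)
for diff in (0, 5, 10, 30):
    print(diff, round(math.exp(-diff / 10.0), 3))
# 0 -> 1.0, 5 -> 0.607, 10 -> 0.368, 30 -> 0.05
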
/03 - Content Based Recommendation/ContentRecs.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from ContentKNNAlgorithm import ContentKNNAlgorithm
3 | from Evaluator import Evaluator
4 | from surprise import NormalPredictor
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | contentKNN = ContentKNNAlgorithm()
27 | evaluator.AddAlgorithm(contentKNN, "ContentKNN")
28 |
29 | # Just make random recommendations
30 | Random = NormalPredictor()
31 | evaluator.AddAlgorithm(Random, "Random")
32 |
33 | evaluator.Evaluate(False)
34 |
35 | evaluator.SampleTopNRecs(ml)
36 |
37 |
38 |
--------------------------------------------------------------------------------
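ContentRecs.py passes False to Evaluate, so it reports accuracy (RMSE/MAE) only. If you also want the hit-rate, coverage, diversity, and novelty columns, the same evaluator object supports it at the cost of extra leave-one-out fits:

# Slower, but also reports HR, cHR, ARHR, Coverage, Diversity, and Novelty
evaluator.Evaluate(True)
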
/03 - Content Based Recommendation/EvaluatedAlgorithm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 3 10:45:33 2018
4 |
5 | @author: Frank
6 | """
7 | from RecommenderMetrics import RecommenderMetrics
8 | from EvaluationData import EvaluationData
9 |
10 | class EvaluatedAlgorithm:
11 |
12 | def __init__(self, algorithm, name):
13 | self.algorithm = algorithm
14 | self.name = name
15 |
16 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
17 | metrics = {}
18 | # Compute accuracy
19 | if (verbose):
20 | print("Evaluating accuracy...")
21 | self.algorithm.fit(evaluationData.GetTrainSet())
22 | predictions = self.algorithm.test(evaluationData.GetTestSet())
23 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
24 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
25 |
26 | if (doTopN):
27 | # Evaluate top-10 with Leave One Out testing
28 | if (verbose):
29 | print("Evaluating top-N with leave-one-out...")
30 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
31 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
32 | # Build predictions for all ratings not in the training set
33 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
34 | # Compute top 10 recs for each user
35 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
36 | if (verbose):
37 | print("Computing hit-rate and rank metrics...")
38 | # See how often we recommended a movie the user actually rated
39 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
40 | # See how often we recommended a movie the user actually liked
41 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
42 | # Compute ARHR
43 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
44 |
45 | #Evaluate properties of recommendations on full training set
46 | if (verbose):
47 | print("Computing recommendations with full data set...")
48 | self.algorithm.fit(evaluationData.GetFullTrainSet())
49 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
50 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
51 | if (verbose):
52 | print("Analyzing coverage, diversity, and novelty...")
53 | # Print user coverage with a minimum predicted rating of 4.0:
54 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
55 | evaluationData.GetFullTrainSet().n_users,
56 | ratingThreshold=4.0)
57 | # Measure diversity of recommendations:
58 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
59 |
60 | # Measure novelty (average popularity rank of recommendations):
61 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
62 | evaluationData.GetPopularityRankings())
63 |
64 | if (verbose):
65 | print("Analysis complete.")
66 |
67 | return metrics
68 |
69 | def GetName(self):
70 | return self.name
71 |
72 | def GetAlgorithm(self):
73 | return self.algorithm
74 |
75 |
--------------------------------------------------------------------------------
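The RMSE and MAE values that Evaluate collects are easy to sanity-check by hand. A minimal sketch over made-up (actual, predicted) rating pairs, matching the standard definitions that Surprise's accuracy module implements:

import math

pairs = [(4.0, 3.5), (3.0, 3.0), (5.0, 4.0)]  # made-up (actual, predicted) ratings

mae = sum(abs(a - p) for a, p in pairs) / len(pairs)
rmse = math.sqrt(sum((a - p) ** 2 for a, p in pairs) / len(pairs))
print(mae, rmse)  # 0.5 and roughly 0.645
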
/03 - Content Based Recommendation/EvaluationData.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
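The various "anti-test-sets" built here are simply every (user, item) pair the user has not rated, with the training set's global mean filled in so Surprise will produce an estimate for each one. A toy illustration of the same idea, independent of Surprise's Trainset class (all names here are made up):

ratings = {("u1", "m1"): 4.0, ("u1", "m2"): 3.0}  # hypothetical known ratings
all_items = ["m1", "m2", "m3", "m4"]
global_mean = 3.5

# Everything u1 has NOT rated, with the global mean as a placeholder
anti_testset = [("u1", item, global_mean)
                for item in all_items
                if ("u1", item) not in ratings]
print(anti_testset)  # [('u1', 'm3', 3.5), ('u1', 'm4', 3.5)]
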
/03 - Content Based Recommendation/Evaluator.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
6 |     def __init__(self, dataset, rankings):
7 |         # Instance attribute, so separate Evaluators don't share one algorithm list
8 |         self.algorithms = []
9 |         ed = EvaluationData(dataset, rankings)
10 |         self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |         for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/RecommenderMetrics.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 | from surprise import accuracy
4 | from collections import defaultdict
5 |
6 | class RecommenderMetrics:
7 |
8 | def MAE(predictions):
9 | return accuracy.mae(predictions, verbose=False)
10 |
11 | def RMSE(predictions):
12 | return accuracy.rmse(predictions, verbose=False)
13 |
14 | def GetTopN(predictions, n=10, minimumRating=4.0):
15 | topN = defaultdict(list)
16 |
17 |
18 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
19 | if (estimatedRating >= minimumRating):
20 | topN[int(userID)].append((int(movieID), estimatedRating))
21 |
22 | for userID, ratings in topN.items():
23 | ratings.sort(key=lambda x: x[1], reverse=True)
24 | topN[int(userID)] = ratings[:n]
25 |
26 | return topN
27 |
28 | def HitRate(topNPredicted, leftOutPredictions):
29 | hits = 0
30 | total = 0
31 |
32 | # For each left-out rating
33 | for leftOut in leftOutPredictions:
34 | userID = leftOut[0]
35 | leftOutMovieID = leftOut[1]
36 | # Is it in the predicted top 10 for this user?
37 | hit = False
38 | for movieID, predictedRating in topNPredicted[int(userID)]:
39 | if (int(leftOutMovieID) == int(movieID)):
40 | hit = True
41 | break
42 |             if (hit):
43 |                 hits += 1
44 | 
45 |             total += 1
46 | 
47 |         # Compute overall hit rate
48 |         return hits/total
49 |
50 | def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
51 | hits = 0
52 | total = 0
53 |
54 | # For each left-out rating
55 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
56 | # Only look at ability to recommend things the users actually liked...
57 | if (actualRating >= ratingCutoff):
58 | # Is it in the predicted top 10 for this user?
59 | hit = False
60 | for movieID, predictedRating in topNPredicted[int(userID)]:
61 | if (int(leftOutMovieID) == movieID):
62 | hit = True
63 | break
64 |                 if (hit):
65 |                     hits += 1
66 | 
67 |                 total += 1
68 | 
69 |         # Compute overall hit rate
70 |         return hits/total
71 |
72 | def RatingHitRate(topNPredicted, leftOutPredictions):
73 | hits = defaultdict(float)
74 | total = defaultdict(float)
75 |
76 | # For each left-out rating
77 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
78 | # Is it in the predicted top N for this user?
79 | hit = False
80 | for movieID, predictedRating in topNPredicted[int(userID)]:
81 | if (int(leftOutMovieID) == movieID):
82 | hit = True
83 | break
84 |             if (hit):
85 |                 hits[actualRating] += 1
86 | 
87 |             total[actualRating] += 1
88 | 
89 |         # Print hit rate broken down by actual rating value
90 |         for rating in sorted(hits.keys()):
91 |             print(rating, hits[rating] / total[rating])
92 |
93 | def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
94 | summation = 0
95 | total = 0
96 | # For each left-out rating
97 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
98 | # Is it in the predicted top N for this user?
99 | hitRank = 0
100 | rank = 0
101 | for movieID, predictedRating in topNPredicted[int(userID)]:
102 | rank = rank + 1
103 | if (int(leftOutMovieID) == movieID):
104 | hitRank = rank
105 | break
106 |             if (hitRank > 0):
107 | summation += 1.0 / hitRank
108 |
109 | total += 1
110 |
111 | return summation / total
112 |
113 | # What percentage of users have at least one "good" recommendation
114 | def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
115 | hits = 0
116 | for userID in topNPredicted.keys():
117 | hit = False
118 | for movieID, predictedRating in topNPredicted[userID]:
119 | if (predictedRating >= ratingThreshold):
120 | hit = True
121 | break
122 | if (hit):
123 | hits += 1
124 |
125 | return hits / numUsers
126 |
127 | def Diversity(topNPredicted, simsAlgo):
128 | n = 0
129 | total = 0
130 | simsMatrix = simsAlgo.compute_similarities()
131 | for userID in topNPredicted.keys():
132 | pairs = itertools.combinations(topNPredicted[userID], 2)
133 | for pair in pairs:
134 | movie1 = pair[0][0]
135 | movie2 = pair[1][0]
136 | innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
137 | innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
138 | similarity = simsMatrix[innerID1][innerID2]
139 | total += similarity
140 | n += 1
141 |
142 | S = total / n
143 | return (1-S)
144 |
145 | def Novelty(topNPredicted, rankings):
146 | n = 0
147 | total = 0
148 | for userID in topNPredicted.keys():
149 | for rating in topNPredicted[userID]:
150 | movieID = rating[0]
151 | rank = rankings[movieID]
152 | total += rank
153 | n += 1
154 | return total / n
155 |
--------------------------------------------------------------------------------
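HitRate counts how often each user's single left-out movie shows up in that user's top-N list. A toy check with two users (made-up IDs), where only the first user's left-out movie was actually recommended:

# Top-N lists keyed by user: (movieID, predicted rating) pairs
topN = {1: [(50, 4.5), (99, 4.2)], 2: [(10, 4.8)]}

# One left-out (userID, movieID) pair per user
leftOut = [(1, 99), (2, 77)]

hits = sum(1 for user, movie in leftOut
           if any(m == movie for m, _ in topN[user]))
print(hits / len(leftOut))  # 0.5: user 1 is a hit, user 2 is not
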
/03 - Content Based Recommendation/__pycache__/ContentKNNAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/ContentKNNAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/__pycache__/ContentKNNwithMisEnScene.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/ContentKNNwithMisEnScene.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/__pycache__/EvaluatedAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/EvaluatedAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/__pycache__/EvaluationData.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/EvaluationData.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/__pycache__/Evaluator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/Evaluator.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/__pycache__/MovieLens.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/MovieLens.cpython-38.pyc
--------------------------------------------------------------------------------
/03 - Content Based Recommendation/__pycache__/RecommenderMetrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/03 - Content Based Recommendation/__pycache__/RecommenderMetrics.cpython-38.pyc
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/EvaluateUserCF-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import KNNBasic
3 | import heapq
4 | from collections import defaultdict
5 | from operator import itemgetter
6 | from surprise.model_selection import LeaveOneOut
7 | from RecommenderMetrics import RecommenderMetrics
8 | from EvaluationData import EvaluationData
9 |
10 | def LoadMovieLensData():
11 | ml = MovieLens()
12 | print("Loading movie ratings...")
13 | data = ml.loadMovieLensLatestSmall()
14 | print("\nComputing movie popularity ranks so we can measure novelty later...")
15 | rankings = ml.getPopularityRanks()
16 | return (ml, data, rankings)
17 |
18 | ml, data, rankings = LoadMovieLensData()
19 |
20 | evalData = EvaluationData(data, rankings)
21 |
22 | # Train on the leave-one-out train set
23 | trainSet = evalData.GetLOOCVTrainSet()
24 | sim_options = {'name': 'cosine',
25 | 'user_based': True
26 | }
27 |
28 | model = KNNBasic(sim_options=sim_options)
29 | model.fit(trainSet)
30 | simsMatrix = model.compute_similarities()
31 |
32 | leftOutTestSet = evalData.GetLOOCVTestSet()
33 |
34 | # Build up a dict mapping each user to a list of (movieID, predicted rating) pairs
35 | topN = defaultdict(list)
36 | k = 10
37 | for uiid in range(trainSet.n_users):
38 | # Get top N similar users to this one
39 | similarityRow = simsMatrix[uiid]
40 |
41 | similarUsers = []
42 | for innerID, score in enumerate(similarityRow):
43 | if (innerID != uiid):
44 | similarUsers.append( (innerID, score) )
45 |
46 | kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
47 |
48 | # Get the stuff they rated, and add up ratings for each item, weighted by user similarity
49 | candidates = defaultdict(float)
50 | for similarUser in kNeighbors:
51 | innerID = similarUser[0]
52 | userSimilarityScore = similarUser[1]
53 | theirRatings = trainSet.ur[innerID]
54 | for rating in theirRatings:
55 | candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
56 |
57 | # Build a dictionary of stuff the user has already seen
58 | watched = {}
59 | for itemID, rating in trainSet.ur[uiid]:
60 | watched[itemID] = 1
61 |
62 | # Get top-rated items from similar users:
63 | pos = 0
64 | for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
65 |         if itemID not in watched:
66 |             movieID = trainSet.to_raw_iid(itemID)
67 |             topN[int(trainSet.to_raw_uid(uiid))].append( (int(movieID), 0.0) )
68 |             pos += 1
69 |             if (pos >= 40):  # keep at most 40 candidates per user
70 |                 break
71 |
72 | # Measure
73 | print("HR", RecommenderMetrics.HitRate(topN, leftOutTestSet))
74 |
75 |
76 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/EvaluatedAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 | # Print user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/EvaluationData-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/Evaluator-checkpoint.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
6 |     def __init__(self, dataset, rankings):
7 |         # Instance attribute, so separate Evaluators don't share one algorithm list
8 |         self.algorithms = []
9 |         ed = EvaluationData(dataset, rankings)
10 |         self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |         for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/KNNBakeOff-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import KNNBasic
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | # User-based KNN
27 | UserKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})
28 | evaluator.AddAlgorithm(UserKNN, "User KNN")
29 |
30 | # Item-based KNN
31 | ItemKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})
32 | evaluator.AddAlgorithm(ItemKNN, "Item KNN")
33 |
34 | # Just make random recommendations
35 | Random = NormalPredictor()
36 | evaluator.AddAlgorithm(Random, "Random")
37 |
38 | # Fight!
39 | evaluator.Evaluate(False)
40 |
41 | evaluator.SampleTopNRecs(ml)
42 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/SimpleItemCF-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import KNNBasic
3 | import heapq
4 | from collections import defaultdict
5 | from operator import itemgetter
6 |
7 | testSubject = '85'
8 | k = 10
9 |
10 | ml = MovieLens()
11 | data = ml.loadMovieLensLatestSmall()
12 |
13 | trainSet = data.build_full_trainset()
14 |
15 | sim_options = {'name': 'cosine',
16 | 'user_based': False
17 | }
18 |
19 | model = KNNBasic(sim_options=sim_options)
20 | model.fit(trainSet)
21 | simsMatrix = model.compute_similarities()
22 |
23 | testUserInnerID = trainSet.to_inner_uid(testSubject)
24 |
25 | # Get the top K items we rated
26 | testUserRatings = trainSet.ur[testUserInnerID]
27 | kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])
28 |
29 | # Get similar items to stuff we liked (weighted by rating)
30 | candidates = defaultdict(float)
31 | for itemID, rating in kNeighbors:
32 | similarityRow = simsMatrix[itemID]
33 | for innerID, score in enumerate(similarityRow):
34 | candidates[innerID] += score * (rating / 5.0)
35 |
36 | # Build a dictionary of stuff the user has already seen
37 | watched = {}
38 | for itemID, rating in trainSet.ur[testUserInnerID]:
39 | watched[itemID] = 1
40 |
41 | # Get the top-scoring items the user hasn't seen yet:
42 | pos = 0
43 | for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
44 |     if itemID not in watched:
45 |         movieID = trainSet.to_raw_iid(itemID)
46 |         print(ml.getMovieName(int(movieID)), ratingSum)
47 |         pos += 1
48 |         if (pos >= 10):  # stop after 10 recommendations
49 |             break
50 |
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/.ipynb_checkpoints/User-Based Collaborative Filtering-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/EvaluateUserCF.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import KNNBasic
3 | import heapq
4 | from collections import defaultdict
5 | from operator import itemgetter
6 | from surprise.model_selection import LeaveOneOut
7 | from RecommenderMetrics import RecommenderMetrics
8 | from EvaluationData import EvaluationData
9 |
10 | def LoadMovieLensData():
11 | ml = MovieLens()
12 | print("Loading movie ratings...")
13 | data = ml.loadMovieLensLatestSmall()
14 | print("\nComputing movie popularity ranks so we can measure novelty later...")
15 | rankings = ml.getPopularityRanks()
16 | return (ml, data, rankings)
17 |
18 | ml, data, rankings = LoadMovieLensData()
19 |
20 | evalData = EvaluationData(data, rankings)
21 |
22 | # Train on the leave-one-out train set
23 | trainSet = evalData.GetLOOCVTrainSet()
24 | sim_options = {'name': 'cosine',
25 | 'user_based': True
26 | }
27 |
28 | model = KNNBasic(sim_options=sim_options)
29 | model.fit(trainSet)
30 | simsMatrix = model.compute_similarities()
31 |
32 | leftOutTestSet = evalData.GetLOOCVTestSet()
33 |
34 | # Build up a dict mapping each user to a list of (movieID, predicted rating) pairs
35 | topN = defaultdict(list)
36 | k = 10
37 | for uiid in range(trainSet.n_users):
38 | # Get top N similar users to this one
39 | similarityRow = simsMatrix[uiid]
40 |
41 | similarUsers = []
42 | for innerID, score in enumerate(similarityRow):
43 | if (innerID != uiid):
44 | similarUsers.append( (innerID, score) )
45 |
46 | kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
47 |
48 | # Get the stuff they rated, and add up ratings for each item, weighted by user similarity
49 | candidates = defaultdict(float)
50 | for similarUser in kNeighbors:
51 | innerID = similarUser[0]
52 | userSimilarityScore = similarUser[1]
53 | theirRatings = trainSet.ur[innerID]
54 | for rating in theirRatings:
55 | candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
56 |
57 | # Build a dictionary of stuff the user has already seen
58 | watched = {}
59 | for itemID, rating in trainSet.ur[uiid]:
60 | watched[itemID] = 1
61 |
62 | # Get top-rated items from similar users:
63 | pos = 0
64 | for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
65 |         if itemID not in watched:
66 |             movieID = trainSet.to_raw_iid(itemID)
67 |             topN[int(trainSet.to_raw_uid(uiid))].append( (int(movieID), 0.0) )
68 |             pos += 1
69 |             if (pos >= 40):  # keep at most 40 candidates per user
70 |                 break
71 |
72 | # Measure
73 | print("HR", RecommenderMetrics.HitRate(topN, leftOutTestSet))
74 |
75 |
76 |
--------------------------------------------------------------------------------
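The heart of the script above is the line candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore: each neighbor "votes" for every item they rated, weighted by how much they liked it (rating scaled to 0..1) and by how similar they are to the target user. A worked toy example with two hypothetical neighbors:

from collections import defaultdict

# (similarity to the target user, {item: rating}) for two made-up neighbors
neighbors = [(0.9, {"A": 5.0, "B": 3.0}),
             (0.4, {"B": 5.0, "C": 4.0})]

candidates = defaultdict(float)
for sim, ratings in neighbors:
    for item, rating in ratings.items():
        candidates[item] += (rating / 5.0) * sim

print(dict(candidates))  # {'A': 0.9, 'B': 0.94, 'C': 0.32}
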
/04 - Neighborhood Based Collaborative Filtering/EvaluatedAlgorithm.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 | # Print user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/EvaluationData.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/Evaluator.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
6 |     def __init__(self, dataset, rankings):
7 |         # Instance attribute, so separate Evaluators don't share one algorithm list
8 |         self.algorithms = []
9 |         ed = EvaluationData(dataset, rankings)
10 |         self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |         for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/Item-Based Collaborative Filtering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Item-Based Collaborative Movie Recommendation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Importing Dependencies"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from MovieLens import MovieLens\n",
24 | "from surprise import KNNBasic\n",
25 | "import heapq\n",
26 | "from collections import defaultdict\n",
27 | "from operator import itemgetter"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Loading Dataset"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "ml = MovieLens()\n",
44 | "data = ml.loadMovieLensLatestSmall()\n",
45 | "trainSet = data.build_full_trainset()"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "## Calculating Item Similarities using Supriselib"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "Computing the cosine similarity matrix...\n",
65 | "Done computing similarity matrix.\n",
66 | "Computing the cosine similarity matrix...\n",
67 | "Done computing similarity matrix.\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "sim_options = {'name': 'cosine',\n",
73 | " 'user_based': False\n",
74 | " }\n",
75 | "\n",
76 | "model = KNNBasic(sim_options=sim_options)\n",
77 | "model.fit(trainSet)\n",
78 | "simsMatrix = model.compute_similarities()"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "## Getting Top-N User Recommendations"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "testSubject = '85'\n",
95 | "k = 10"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 5,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "testUserInnerID = trainSet.to_inner_uid(testSubject)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 6,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# Get the top K items we rated\n",
114 | "testUserRatings = trainSet.ur[testUserInnerID]\n",
115 | "kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 7,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Get similar items to stuff we liked (weighted by rating)\n",
125 | "candidates = defaultdict(float)\n",
126 | "for itemID, rating in kNeighbors:\n",
127 | " similarityRow = simsMatrix[itemID]\n",
128 | " for innerID, score in enumerate(similarityRow):\n",
129 | " candidates[innerID] += score * (rating / 5.0)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 8,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# Build a dictionary of stuff the user has already seen\n",
139 | "watched = {}\n",
140 | "for itemID, rating in trainSet.ur[testUserInnerID]:\n",
141 | " watched[itemID] = 1"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 9,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "James Dean Story, The (1957) 10.0\n",
154 | "Get Real (1998) 9.987241120712646\n",
155 | "Kiss of Death (1995) 9.966881877751941\n",
156 | "Set It Off (1996) 9.963732215657119\n",
157 | "How Green Was My Valley (1941) 9.943984081065269\n",
158 | "Amos & Andrew (1993) 9.93973694500253\n",
159 | "My Crazy Life (Mi vida loca) (1993) 9.938290487546041\n",
160 | "Grace of My Heart (1996) 9.926255896645218\n",
161 | "Fanny and Alexander (Fanny och Alexander) (1982) 9.925699671455906\n",
162 | "Wild Reeds (Les roseaux sauvages) (1994) 9.916226404418774\n",
163 | "Edge of Seventeen (1998) 9.913028764691676\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "# Get top-rated items from similar users:\n",
169 | "pos = 0\n",
170 | "for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):\n",
171 | " if not itemID in watched:\n",
172 | " movieID = trainSet.to_raw_iid(itemID)\n",
173 | " print(ml.getMovieName(int(movieID)), ratingSum)\n",
174 | " pos += 1\n",
175 | " if (pos > 10):\n",
176 | " break"
177 | ]
178 | }
179 | ],
180 | "metadata": {
181 | "kernelspec": {
182 | "display_name": "Python 3",
183 | "language": "python",
184 | "name": "python3"
185 | },
186 | "language_info": {
187 | "codemirror_mode": {
188 | "name": "ipython",
189 | "version": 3
190 | },
191 | "file_extension": ".py",
192 | "mimetype": "text/x-python",
193 | "name": "python",
194 | "nbconvert_exporter": "python",
195 | "pygments_lexer": "ipython3",
196 | "version": "3.8.2"
197 | }
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 4
201 | }
202 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/KNNBakeOff.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import KNNBasic
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | # User-based KNN
27 | UserKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})
28 | evaluator.AddAlgorithm(UserKNN, "User KNN")
29 |
30 | # Item-based KNN
31 | ItemKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})
32 | evaluator.AddAlgorithm(ItemKNN, "Item KNN")
33 |
34 | # Just make random recommendations
35 | Random = NormalPredictor()
36 | evaluator.AddAlgorithm(Random, "Random")
37 |
38 | # Fight!
39 | evaluator.Evaluate(False)
40 |
41 | evaluator.SampleTopNRecs(ml)
42 |
--------------------------------------------------------------------------------
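KNNBakeOff configures its contenders entirely through sim_options. Surprise's other built-in similarity measures can be dropped into the same bake-off; for example, a Pearson-based user KNN (an optional extra, not part of the original script):

# Optional extra contender: Pearson correlation instead of cosine
UserKNNPearson = KNNBasic(sim_options={'name': 'pearson', 'user_based': True})
evaluator.AddAlgorithm(UserKNNPearson, "User KNN (Pearson)")
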
/04 - Neighborhood Based Collaborative Filtering/SimpleItemCF.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import KNNBasic
3 | import heapq
4 | from collections import defaultdict
5 | from operator import itemgetter
6 |
7 | testSubject = '85'
8 | k = 10
9 |
10 | ml = MovieLens()
11 | data = ml.loadMovieLensLatestSmall()
12 |
13 | trainSet = data.build_full_trainset()
14 |
15 | sim_options = {'name': 'cosine',
16 | 'user_based': False
17 | }
18 |
19 | model = KNNBasic(sim_options=sim_options)
20 | model.fit(trainSet)
21 | simsMatrix = model.compute_similarities()
22 |
23 | testUserInnerID = trainSet.to_inner_uid(testSubject)
24 |
25 | # Get the top K items we rated
26 | testUserRatings = trainSet.ur[testUserInnerID]
27 | kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])
28 |
29 | # Get similar items to stuff we liked (weighted by rating)
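   | # Each candidate's score is the sum, over our top-K rated items, of
   | #     similarity(candidate, item) * (our rating of item / 5.0)
   | # e.g. a candidate with cosine similarity 0.9 to something we rated 4.0 picks up 0.9 * 0.8 = 0.72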
30 | candidates = defaultdict(float)
31 | for itemID, rating in kNeighbors:
32 | similarityRow = simsMatrix[itemID]
33 | for innerID, score in enumerate(similarityRow):
34 | candidates[innerID] += score * (rating / 5.0)
35 |
36 | # Build a dictionary of stuff the user has already seen
37 | watched = {}
38 | for itemID, rating in trainSet.ur[testUserInnerID]:
39 | watched[itemID] = 1
40 |
41 | # Get the highest-scoring candidate items the user hasn't seen yet:
42 | pos = 0
43 | for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
44 |     if itemID not in watched:
45 |         movieID = trainSet.to_raw_iid(itemID)
46 |         print(ml.getMovieName(int(movieID)), ratingSum)
47 |         pos += 1
48 |         if (pos >= 10):  # stop once 10 unseen items have been printed
49 |             break
50 |
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/04 - Neighborhood Based Collaborative Filtering/__pycache__/MovieLens.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/04 - Neighborhood Based Collaborative Filtering/__pycache__/MovieLens.cpython-38.pyc
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/.ipynb_checkpoints/EvaluatedAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 |             # Measure user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/.ipynb_checkpoints/EvaluationData-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/.ipynb_checkpoints/Evaluator-checkpoint.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
 6 |     algorithms = []  # note: class-level, so this list is shared by every Evaluator instance
7 |
8 | def __init__(self, dataset, rankings):
9 | ed = EvaluationData(dataset, rankings)
10 | self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |             for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/.ipynb_checkpoints/SVD Matrix Factorization-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/.ipynb_checkpoints/SVDBakeOff-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import SVD, SVDpp
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | # SVD
27 | SVDAlg = SVD()
28 | evaluator.AddAlgorithm(SVDAlg, "SVD")
29 |
30 | # SVD++
31 | SVDPlusPlus = SVDpp()
32 | evaluator.AddAlgorithm(SVDPlusPlus, "SVD++")
33 |
34 | # Just make random recommendations
35 | Random = NormalPredictor()
36 | evaluator.AddAlgorithm(Random, "Random")
37 |
38 | # Fight!
39 | evaluator.Evaluate(False)
40 |
41 | evaluator.SampleTopNRecs(ml)
42 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/.ipynb_checkpoints/SVDTuning-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from surprise import SVD
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 | from surprise.model_selection import GridSearchCV
6 |
7 | import random
8 | import numpy as np
9 |
10 | def LoadMovieLensData():
11 | ml = MovieLens()
12 | print("Loading movie ratings...")
13 | data = ml.loadMovieLensLatestSmall()
14 | print("\nComputing movie popularity ranks so we can measure novelty later...")
15 | rankings = ml.getPopularityRanks()
16 | return (ml, data, rankings)
17 |
18 | np.random.seed(0)
19 | random.seed(0)
20 |
21 | # Load up common data set for the recommender algorithms
22 | (ml, evaluationData, rankings) = LoadMovieLensData()
23 |
24 | print("Searching for best parameters...")
25 | param_grid = {'n_epochs': [20, 30], 'lr_all': [0.005, 0.010],
26 | 'n_factors': [50, 100]}
27 | gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
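   | # 2 x 2 x 2 = 8 parameter combinations, each cross-validated over 3 folds (24 fits in total)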
28 |
29 | gs.fit(evaluationData)
30 |
31 | # best RMSE score
32 | print("Best RMSE score attained: ", gs.best_score['rmse'])
33 |
34 | # combination of parameters that gave the best RMSE score
35 | print(gs.best_params['rmse'])
36 |
37 | # Construct an Evaluator to, you know, evaluate them
38 | evaluator = Evaluator(evaluationData, rankings)
39 |
40 | params = gs.best_params['rmse']
41 | SVDtuned = SVD(n_epochs = params['n_epochs'], lr_all = params['lr_all'], n_factors = params['n_factors'])
42 | evaluator.AddAlgorithm(SVDtuned, "SVD - Tuned")
43 |
44 | SVDUntuned = SVD()
45 | evaluator.AddAlgorithm(SVDUntuned, "SVD - Untuned")
46 |
47 | # Just make random recommendations
48 | Random = NormalPredictor()
49 | evaluator.AddAlgorithm(Random, "Random")
50 |
51 | # Fight!
52 | evaluator.Evaluate(False)
53 |
54 | evaluator.SampleTopNRecs(ml)
55 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/EvaluatedAlgorithm.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 |             # Measure user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/EvaluationData.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
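   |         # The anti-test set is every (user, item) pair this user has NOT rated; Surprise's
   |         # testset format requires a "true" rating, so the global mean is used as a placeholder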
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/Evaluator.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
 6 |     algorithms = []  # note: class-level, so this list is shared by every Evaluator instance
7 |
8 | def __init__(self, dataset, rankings):
9 | ed = EvaluationData(dataset, rankings)
10 | self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |             for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/RecommenderMetrics.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 | from surprise import accuracy
4 | from collections import defaultdict
5 |
6 | class RecommenderMetrics:
7 |
8 | def MAE(predictions):
9 | return accuracy.mae(predictions, verbose=False)
10 |
11 | def RMSE(predictions):
12 | return accuracy.rmse(predictions, verbose=False)
13 |
14 | def GetTopN(predictions, n=10, minimumRating=4.0):
15 | topN = defaultdict(list)
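   |         # topN maps int(userID) -> [(movieID, estimatedRating), ...], sorted by estimated
   |         # rating (descending) and truncated to the n best entries below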
16 |
17 |
18 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
19 | if (estimatedRating >= minimumRating):
20 | topN[int(userID)].append((int(movieID), estimatedRating))
21 |
22 | for userID, ratings in topN.items():
23 | ratings.sort(key=lambda x: x[1], reverse=True)
24 | topN[int(userID)] = ratings[:n]
25 |
26 | return topN
27 |
28 | def HitRate(topNPredicted, leftOutPredictions):
29 | hits = 0
30 | total = 0
31 |
32 | # For each left-out rating
33 | for leftOut in leftOutPredictions:
34 | userID = leftOut[0]
35 | leftOutMovieID = leftOut[1]
36 | # Is it in the predicted top 10 for this user?
37 | hit = False
38 | for movieID, predictedRating in topNPredicted[int(userID)]:
39 | if (int(leftOutMovieID) == int(movieID)):
40 | hit = True
41 | break
42 | if (hit) :
43 | hits += 1
44 |
45 | total += 1
46 |
47 |         # Compute overall hit rate
48 | return hits/total
49 |
50 | def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
51 | hits = 0
52 | total = 0
53 |
54 | # For each left-out rating
55 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
56 | # Only look at ability to recommend things the users actually liked...
57 | if (actualRating >= ratingCutoff):
58 | # Is it in the predicted top 10 for this user?
59 | hit = False
60 | for movieID, predictedRating in topNPredicted[int(userID)]:
61 | if (int(leftOutMovieID) == movieID):
62 | hit = True
63 | break
64 | if (hit) :
65 | hits += 1
66 |
67 | total += 1
68 |
69 |         # Compute overall hit rate
70 | return hits/total
71 |
72 | def RatingHitRate(topNPredicted, leftOutPredictions):
73 | hits = defaultdict(float)
74 | total = defaultdict(float)
75 |
76 | # For each left-out rating
77 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
78 | # Is it in the predicted top N for this user?
79 | hit = False
80 | for movieID, predictedRating in topNPredicted[int(userID)]:
81 | if (int(leftOutMovieID) == movieID):
82 | hit = True
83 | break
84 | if (hit) :
85 | hits[actualRating] += 1
86 |
87 | total[actualRating] += 1
88 |
89 |         # Print the hit rate broken down by rating value
90 | for rating in sorted(hits.keys()):
91 | print (rating, hits[rating] / total[rating])
92 |
93 | def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
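   |         # Each left-out item found at rank r in its user's top-N contributes 1/r:
   |         # a hit at rank 1 scores 1.0, a hit at rank 3 scores ~0.33, and a miss scores 0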
94 | summation = 0
95 | total = 0
96 | # For each left-out rating
97 | for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
98 | # Is it in the predicted top N for this user?
99 | hitRank = 0
100 | rank = 0
101 | for movieID, predictedRating in topNPredicted[int(userID)]:
102 | rank = rank + 1
103 | if (int(leftOutMovieID) == movieID):
104 | hitRank = rank
105 | break
106 | if (hitRank > 0) :
107 | summation += 1.0 / hitRank
108 |
109 | total += 1
110 |
111 | return summation / total
112 |
113 | # What percentage of users have at least one "good" recommendation
114 | def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
115 | hits = 0
116 | for userID in topNPredicted.keys():
117 | hit = False
118 | for movieID, predictedRating in topNPredicted[userID]:
119 | if (predictedRating >= ratingThreshold):
120 | hit = True
121 | break
122 | if (hit):
123 | hits += 1
124 |
125 | return hits / numUsers
126 |
127 | def Diversity(topNPredicted, simsAlgo):
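   |         # Reported as 1 - S, where S is the mean pairwise similarity among each user's
   |         # top-N items, so higher values mean less similar (more diverse) recommendations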
128 | n = 0
129 | total = 0
130 | simsMatrix = simsAlgo.compute_similarities()
131 | for userID in topNPredicted.keys():
132 | pairs = itertools.combinations(topNPredicted[userID], 2)
133 | for pair in pairs:
134 | movie1 = pair[0][0]
135 | movie2 = pair[1][0]
136 | innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
137 | innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
138 | similarity = simsMatrix[innerID1][innerID2]
139 | total += similarity
140 | n += 1
141 |
142 | S = total / n
143 | return (1-S)
144 |
145 | def Novelty(topNPredicted, rankings):
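   |         # Averages the popularity rank of recommended items (rank 1 = most popular),
   |         # so a higher score means the recommender favors more obscure movies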
146 | n = 0
147 | total = 0
148 | for userID in topNPredicted.keys():
149 | for rating in topNPredicted[userID]:
150 | movieID = rating[0]
151 | rank = rankings[movieID]
152 | total += rank
153 | n += 1
154 | return total / n
155 |
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/__pycache__/EvaluatedAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/05 - Matrix Factorization Methods/__pycache__/EvaluatedAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/__pycache__/EvaluationData.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/05 - Matrix Factorization Methods/__pycache__/EvaluationData.cpython-38.pyc
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/__pycache__/Evaluator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/05 - Matrix Factorization Methods/__pycache__/Evaluator.cpython-38.pyc
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/__pycache__/MovieLens.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/05 - Matrix Factorization Methods/__pycache__/MovieLens.cpython-38.pyc
--------------------------------------------------------------------------------
/05 - Matrix Factorization Methods/__pycache__/RecommenderMetrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/05 - Matrix Factorization Methods/__pycache__/RecommenderMetrics.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/AutoRec-checkpoint.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | class AutoRec(object):
5 |
6 | def __init__(self, visibleDimensions, epochs=200, hiddenDimensions=50, learningRate=0.1, batchSize=100):
7 |
8 | self.visibleDimensions = visibleDimensions
9 | self.epochs = epochs
10 | self.hiddenDimensions = hiddenDimensions
11 | self.learningRate = learningRate
12 | self.batchSize = batchSize
13 | self.optimizer = tf.keras.optimizers.RMSprop(self.learningRate)
14 |
15 |
16 | def Train(self, X):
17 |
18 | for epoch in range(self.epochs):
19 | for i in range(0, X.shape[0], self.batchSize):
20 | epochX = X[i:i+self.batchSize]
21 | self.run_optimization(epochX)
22 |
23 |
24 | print("Trained epoch ", epoch)
25 |
26 | def GetRecommendations(self, inputUser):
27 |
28 | # Feed through a single user and return predictions from the output layer.
29 | rec = self.neural_net(inputUser)
30 |
31 |         # rec is an EagerTensor, so it can be indexed directly; return the first (and only) row of predictions.
32 | return rec[0]
33 |
34 |
35 | def neural_net(self, inputUser):
36 |
37 | #tf.set_random_seed(0)
38 |
39 |         # Create variables for the encoding (visible->hidden) and decoding (hidden->output)
40 |         # weights and biases, randomly initialized -- only once, so training updates persist
41 |         if not hasattr(self, 'weights'):
42 |             self.weights = {
43 |                 'h1': tf.Variable(tf.random.normal([self.visibleDimensions, self.hiddenDimensions])),
44 |                 'out': tf.Variable(tf.random.normal([self.hiddenDimensions, self.visibleDimensions]))
45 |             }
46 |             self.biases = {
47 |                 'b1': tf.Variable(tf.random.normal([self.hiddenDimensions])),
48 |                 'out': tf.Variable(tf.random.normal([self.visibleDimensions]))
49 |             }
50 |
51 | # Create the input layer
52 | self.inputLayer = inputUser
53 |
54 | # hidden layer
55 | hidden = tf.nn.sigmoid(tf.add(tf.matmul(self.inputLayer, self.weights['h1']), self.biases['b1']))
56 |
57 | # output layer for our predictions.
58 | self.outputLayer = tf.nn.sigmoid(tf.add(tf.matmul(hidden, self.weights['out']), self.biases['out']))
59 |
60 | return self.outputLayer
61 |
62 | def run_optimization(self, inputUser):
63 | with tf.GradientTape() as g:
64 | pred = self.neural_net(inputUser)
65 | loss = tf.keras.losses.MSE(inputUser, pred)
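   |             # Note: this penalizes reconstruction error on every item, including unrated
   |             # (zero) entries -- a simplification of the AutoRec paper, which only
   |             # backpropagates error through observed ratings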
66 |
67 | trainable_variables = list(self.weights.values()) + list(self.biases.values())
68 |
69 | gradients = g.gradient(loss, trainable_variables)
70 |
71 | self.optimizer.apply_gradients(zip(gradients, trainable_variables))
72 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/AutoRecAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | import numpy as np
4 | from AutoRec import AutoRec
5 |
6 | class AutoRecAlgorithm(AlgoBase):
7 |
8 | def __init__(self, epochs=100, hiddenDim=100, learningRate=0.01, batchSize=100, sim_options={}):
9 | AlgoBase.__init__(self)
10 | self.epochs = epochs
11 | self.hiddenDim = hiddenDim
12 | self.learningRate = learningRate
13 | self.batchSize = batchSize
14 |
15 | def fit(self, trainset):
16 | AlgoBase.fit(self, trainset)
17 |
18 | numUsers = trainset.n_users
19 | numItems = trainset.n_items
20 |
21 | trainingMatrix = np.zeros([numUsers, numItems], dtype=np.float32)
22 |
23 | for (uid, iid, rating) in trainset.all_ratings():
24 | trainingMatrix[int(uid), int(iid)] = rating / 5.0
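   |             # Ratings are scaled into [0, 1] to match the sigmoid output layer;
   |             # predictions are scaled back up by 5.0 below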
25 |
26 |         # Create an AutoRec autoencoder with one visible node per item
27 | autoRec = AutoRec(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
28 | autoRec.Train(trainingMatrix)
29 |
30 | self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
31 |
32 | for uiid in range(trainset.n_users):
33 | if (uiid % 50 == 0):
34 | print("Processing user ", uiid)
35 | recs = autoRec.GetRecommendations([trainingMatrix[uiid]])
36 |
37 | for itemID, rec in enumerate(recs):
38 | self.predictedRatings[uiid, itemID] = rec * 5.0
39 |
40 | return self
41 |
42 |
43 | def estimate(self, u, i):
44 |
45 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
46 |             raise PredictionImpossible('User and/or item is unknown.')
47 |
48 | rating = self.predictedRatings[u, i]
49 |
50 | if (rating < 0.001):
51 | raise PredictionImpossible('No valid prediction exists.')
52 |
53 | return rating
54 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/AutoRecBakeOff-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from AutoRecAlgorithm import AutoRecAlgorithm
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | #Autoencoder
27 | AutoRec = AutoRecAlgorithm()
28 | evaluator.AddAlgorithm(AutoRec, "AutoRec")
29 |
30 | # Just make random recommendations
31 | Random = NormalPredictor()
32 | evaluator.AddAlgorithm(Random, "Random")
33 |
34 | # Fight!
35 | evaluator.Evaluate(True)
36 |
37 | evaluator.SampleTopNRecs(ml)
38 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/EvaluatedAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 |             # Measure user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/EvaluationData-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/Evaluator-checkpoint.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
 6 |     algorithms = []  # note: class-level, so this list is shared by every Evaluator instance
7 |
8 | def __init__(self, dataset, rankings):
9 | ed = EvaluationData(dataset, rankings)
10 | self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |             for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/RBM-checkpoint.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | class RBM(object):
5 |
6 | def __init__(self, visibleDimensions, epochs=20, hiddenDimensions=50, ratingValues=10, learningRate=0.001, batchSize=100):
7 |
8 | self.visibleDimensions = visibleDimensions
9 | self.epochs = epochs
10 | self.hiddenDimensions = hiddenDimensions
11 | self.ratingValues = ratingValues
12 | self.learningRate = learningRate
13 | self.batchSize = batchSize
14 |
15 |
16 | def Train(self, X):
17 |
18 | for epoch in range(self.epochs):
19 | np.random.shuffle(X)
20 |
21 | trX = np.array(X)
22 | for i in range(0, trX.shape[0], self.batchSize):
23 | epochX = trX[i:i+self.batchSize]
24 | self.MakeGraph(epochX)
25 |
26 | print("Trained epoch ", epoch)
27 |
28 |
29 | def GetRecommendations(self, inputUser):
30 |
31 | feed = self.MakeHidden(inputUser)
32 | rec = self.MakeVisible(feed)
33 | return rec[0]
34 |
35 | def MakeGraph(self, inputUser):
36 |
37 |         # Initialize weights and biases only once; re-creating them each batch would discard all learning
38 |         if not hasattr(self, 'weights'):
39 |             maxWeight = 4.0 * np.sqrt(6.0 / (self.hiddenDimensions + self.visibleDimensions))
40 |             self.weights = tf.Variable(tf.random.uniform([self.visibleDimensions, self.hiddenDimensions], minval=-maxWeight, maxval=maxWeight), tf.float32, name="weights")
41 |             self.hiddenBias = tf.Variable(tf.zeros([self.hiddenDimensions], tf.float32, name="hiddenBias"))
42 |             self.visibleBias = tf.Variable(tf.zeros([self.visibleDimensions], tf.float32, name="visibleBias"))
43 |
44 |         # Perform Gibbs sampling for Contrastive Divergence. Per the paper we use k=1
45 |         # (a single forward/backward pass) rather than iterating, since it works well in practice.
46 |
47 | # Forward pass
48 | # Sample hidden layer given visible...
49 | # Get tensor of hidden probabilities
50 | hProb0 = tf.nn.sigmoid(tf.matmul(inputUser, self.weights) + self.hiddenBias)
51 | # Sample from all of the distributions
52 | hSample = tf.nn.relu(tf.sign(hProb0 - tf.random.uniform(tf.shape(hProb0))))
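   |         # sign(p - uniform) is +1 with probability p and -1 otherwise, and relu maps
   |         # that to 1/0 -- i.e. an independent Bernoulli sample for each hidden unit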
53 | # Stitch it together
54 | forward = tf.matmul(tf.transpose(inputUser), hSample)
55 |
56 | # Backward pass
57 | # Reconstruct visible layer given hidden layer sample
58 | v = tf.matmul(hSample, tf.transpose(self.weights)) + self.visibleBias
59 |
60 | # Build up our mask for missing ratings
61 | vMask = tf.sign(inputUser) # Make sure everything is 0 or 1
62 | vMask3D = tf.reshape(vMask, [tf.shape(v)[0], -1, self.ratingValues]) # Reshape into arrays of individual ratings
63 |         vMask3D = tf.reduce_max(vMask3D, axis=[2], keepdims=True) # Collapse each rating's one-hot slots: 1 where a rating exists, 0 where it is missing
64 |
65 | # Extract rating vectors for each individual set of 10 rating binary values
66 | v = tf.reshape(v, [tf.shape(v)[0], -1, self.ratingValues])
67 | vProb = tf.nn.softmax(v * vMask3D) # Apply softmax activation function
68 | vProb = tf.reshape(vProb, [tf.shape(v)[0], -1]) # And shove them back into the flattened state. Reconstruction is done now.
69 | # Stitch it together to define the backward pass and updated hidden biases
70 | hProb1 = tf.nn.sigmoid(tf.matmul(vProb, self.weights) + self.hiddenBias)
71 | backward = tf.matmul(tf.transpose(vProb), hProb1)
72 |
73 |         # Apply this batch's contrastive divergence update:
74 |         # (forward statistics - backward statistics), scaled by the learning rate
75 | weightUpdate = self.weights.assign_add(self.learningRate * (forward - backward))
76 | # Update hidden bias, minimizing the divergence in the hidden nodes
77 | hiddenBiasUpdate = self.hiddenBias.assign_add(self.learningRate * tf.reduce_mean(hProb0 - hProb1, 0))
78 |         # Update the visible bias, minimizing divergence in the visible results
79 | visibleBiasUpdate = self.visibleBias.assign_add(self.learningRate * tf.reduce_mean(inputUser - vProb, 0))
80 |
81 | self.update = [weightUpdate, hiddenBiasUpdate, visibleBiasUpdate]
82 |
83 |     def MakeHidden(self, inputUser):
84 |         # Project the user's ratings into the hidden layer using the learned weights (no training step here)
85 |         hidden = tf.nn.sigmoid(tf.matmul(inputUser, self.weights) + self.hiddenBias)
86 |         return hidden
87 |
88 | def MakeVisible(self, feed):
89 | visible = tf.nn.sigmoid(tf.matmul(feed, tf.transpose(self.weights)) + self.visibleBias)
90 | #self.MakeGraph(feed)
91 | return visible
92 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/RBMAlgorithm-checkpoint.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | import numpy as np
4 | from RBM import RBM
5 |
6 | class RBMAlgorithm(AlgoBase):
7 |
8 | def __init__(self, epochs=20, hiddenDim=100, learningRate=0.001, batchSize=100, sim_options={}):
9 | AlgoBase.__init__(self)
10 | self.epochs = epochs
11 | self.hiddenDim = hiddenDim
12 | self.learningRate = learningRate
13 | self.batchSize = batchSize
14 |
15 | def softmax(self, x):
16 | return np.exp(x) / np.sum(np.exp(x), axis=0)
17 |
18 | def fit(self, trainset):
19 | AlgoBase.fit(self, trainset)
20 |
21 | numUsers = trainset.n_users
22 | numItems = trainset.n_items
23 |
24 | trainingMatrix = np.zeros([numUsers, numItems, 10], dtype=np.float32)
25 |
26 | for (uid, iid, rating) in trainset.all_ratings():
27 | adjustedRating = int(float(rating)*2.0) - 1
28 | trainingMatrix[int(uid), int(iid), adjustedRating] = 1
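   |             # Ratings come in half-star steps (0.5 .. 5.0); rating*2 - 1 maps them onto the
   |             # ten one-hot slots 0 .. 9 (e.g. a 3.5-star rating lights up slot 6)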
29 |
30 | # Flatten to a 2D array, with nodes for each possible rating type on each possible item, for every user.
31 | trainingMatrix = np.reshape(trainingMatrix, [trainingMatrix.shape[0], -1])
32 |
33 | # Create an RBM with (num items * rating values) visible nodes
34 | rbm = RBM(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
35 | rbm.Train(trainingMatrix)
36 |
37 | self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
38 | for uiid in range(trainset.n_users):
39 | if (uiid % 50 == 0):
40 | print("Processing user ", uiid)
41 | recs = rbm.GetRecommendations([trainingMatrix[uiid]])
42 | recs = np.reshape(recs, [numItems, 10])
43 |
44 | for itemID, rec in enumerate(recs):
45 | # The obvious thing would be to just take the rating with the highest score:
46 | #rating = rec.argmax()
47 | # ... but this just leads to a huge multi-way tie for 5-star predictions.
48 | # The paper suggests performing normalization over K values to get probabilities
49 | # and take the expectation as your prediction, so we'll do that instead:
50 | normalized = self.softmax(rec)
51 | rating = np.average(np.arange(10), weights=normalized)
52 | self.predictedRatings[uiid, itemID] = (rating + 1) * 0.5
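   |                 # (rating + 1) * 0.5 inverts the earlier encoding, mapping the expected
   |                 # slot index 0 .. 9 back onto the 0.5 .. 5.0 star scale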
53 |
54 | return self
55 |
56 |
57 | def estimate(self, u, i):
58 |
59 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
60 |             raise PredictionImpossible('User and/or item is unknown.')
61 |
62 | rating = self.predictedRatings[u, i]
63 |
64 | if (rating < 0.001):
65 | raise PredictionImpossible('No valid prediction exists.')
66 |
67 | return rating
68 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/RBMBakeOff-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from RBMAlgorithm import RBMAlgorithm
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | #RBM
27 | RBM = RBMAlgorithm(epochs=20)
28 | evaluator.AddAlgorithm(RBM, "RBM")
29 |
30 | # Just make random recommendations
31 | Random = NormalPredictor()
32 | evaluator.AddAlgorithm(Random, "Random")
33 |
34 | # Fight!
35 | evaluator.Evaluate(True)
36 |
37 | evaluator.SampleTopNRecs(ml)
38 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/RBMTuning-checkpoint.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from RBMAlgorithm import RBMAlgorithm
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 | from surprise.model_selection import GridSearchCV
6 |
7 | import random
8 | import numpy as np
9 |
10 | def LoadMovieLensData():
11 | ml = MovieLens()
12 | print("Loading movie ratings...")
13 | data = ml.loadMovieLensLatestSmall()
14 | print("\nComputing movie popularity ranks so we can measure novelty later...")
15 | rankings = ml.getPopularityRanks()
16 | return (ml, data, rankings)
17 |
18 | np.random.seed(0)
19 | random.seed(0)
20 |
21 | # Load up common data set for the recommender algorithms
22 | (ml, evaluationData, rankings) = LoadMovieLensData()
23 |
24 | print("Searching for best parameters...")
25 | param_grid = {'hiddenDim': [20, 10], 'learningRate': [0.1, 0.01]}
26 | gs = GridSearchCV(RBMAlgorithm, param_grid, measures=['rmse', 'mae'], cv=3)
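   | # 2 x 2 = 4 parameter combinations, each cross-validated over 3 folds (12 RBM training runs)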
27 |
28 | gs.fit(evaluationData)
29 |
30 | # best RMSE score
31 | print("Best RMSE score attained: ", gs.best_score['rmse'])
32 |
33 | # combination of parameters that gave the best RMSE score
34 | print(gs.best_params['rmse'])
35 |
36 | # Construct an Evaluator to, you know, evaluate them
37 | evaluator = Evaluator(evaluationData, rankings)
38 |
39 | params = gs.best_params['rmse']
40 | RBMtuned = RBMAlgorithm(hiddenDim = params['hiddenDim'], learningRate = params['learningRate'])
41 | evaluator.AddAlgorithm(RBMtuned, "RBM - Tuned")
42 |
43 | RBMUntuned = RBMAlgorithm()
44 | evaluator.AddAlgorithm(RBMUntuned, "RBM - Untuned")
45 |
46 | # Just make random recommendations
47 | Random = NormalPredictor()
48 | evaluator.AddAlgorithm(Random, "Random")
49 |
50 | # Fight!
51 | evaluator.Evaluate(False)
52 |
53 | evaluator.SampleTopNRecs(ml)
54 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/Recommendations using Restricted Boltzmann Machine(RBM)-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/.ipynb_checkpoints/Recommendations with Deep Neural Networks-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/AutoRec.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | class AutoRec(object):
5 |
6 | def __init__(self, visibleDimensions, epochs=200, hiddenDimensions=50, learningRate=0.1, batchSize=100):
7 |
8 | self.visibleDimensions = visibleDimensions
9 | self.epochs = epochs
10 | self.hiddenDimensions = hiddenDimensions
11 | self.learningRate = learningRate
12 | self.batchSize = batchSize
13 | self.optimizer = tf.keras.optimizers.RMSprop(self.learningRate)
14 |
15 |
16 | def Train(self, X):
17 |
18 | for epoch in range(self.epochs):
19 | for i in range(0, X.shape[0], self.batchSize):
20 | epochX = X[i:i+self.batchSize]
21 | self.run_optimization(epochX)
22 |
23 |
24 | print("Trained epoch ", epoch)
25 |
26 | def GetRecommendations(self, inputUser):
27 |
28 | # Feed through a single user and return predictions from the output layer.
29 | rec = self.neural_net(inputUser)
30 |
31 |         # rec is an eager tensor; return the prediction row for the single user we fed in.
32 | return rec[0]
33 |
34 |
35 |     def neural_net(self, inputUser):
36 |
37 |         # Create variables for the encoding (visible->hidden) and decoding (hidden->output)
38 |         # weights and biases only once, randomly initialized; re-creating them on every
39 |         # call would throw away all prior training, so reuse them if they already exist.
40 |         if not hasattr(self, 'weights'):
41 |             self.weights = {
42 |                 'h1': tf.Variable(tf.random.normal([self.visibleDimensions, self.hiddenDimensions])),
43 |                 'out': tf.Variable(tf.random.normal([self.hiddenDimensions, self.visibleDimensions]))
44 |             }
45 |             # Create biases
46 |             self.biases = {
47 |                 'b1': tf.Variable(tf.random.normal([self.hiddenDimensions])),
48 |                 'out': tf.Variable(tf.random.normal([self.visibleDimensions]))
49 |             }
50 |
51 | # Create the input layer
52 | self.inputLayer = inputUser
53 |
54 | # hidden layer
55 | hidden = tf.nn.sigmoid(tf.add(tf.matmul(self.inputLayer, self.weights['h1']), self.biases['b1']))
56 |
57 | # output layer for our predictions.
58 | self.outputLayer = tf.nn.sigmoid(tf.add(tf.matmul(hidden, self.weights['out']), self.biases['out']))
59 |
60 | return self.outputLayer
61 |
62 | def run_optimization(self, inputUser):
63 | with tf.GradientTape() as g:
64 | pred = self.neural_net(inputUser)
65 | loss = tf.keras.losses.MSE(inputUser, pred)
66 |
67 | trainable_variables = list(self.weights.values()) + list(self.biases.values())
68 |
69 | gradients = g.gradient(loss, trainable_variables)
70 |
71 | self.optimizer.apply_gradients(zip(gradients, trainable_variables))
72 |
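
A note on the loss: tf.keras.losses.MSE above compares the reconstruction against the full input row, so unrated items are treated as true ratings of zero. The original AutoRec paper instead masks the loss to observed entries only. A minimal sketch of that variant (hypothetical method name; it assumes the same weights/biases layout as above):

    def run_optimization_masked(self, inputUser):
        with tf.GradientTape() as g:
            pred = self.neural_net(inputUser)
            mask = tf.sign(inputUser)  # 1 where a rating exists, 0 elsewhere
            # Mean squared error over observed ratings only
            loss = tf.reduce_sum(tf.square((inputUser - pred) * mask)) / tf.reduce_sum(mask)
        trainable_variables = list(self.weights.values()) + list(self.biases.values())
        gradients = g.gradient(loss, trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))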
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/AutoRecAlgorithm.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | import numpy as np
4 | from AutoRec import AutoRec
5 |
6 | class AutoRecAlgorithm(AlgoBase):
7 |
8 | def __init__(self, epochs=100, hiddenDim=100, learningRate=0.01, batchSize=100, sim_options={}):
9 | AlgoBase.__init__(self)
10 | self.epochs = epochs
11 | self.hiddenDim = hiddenDim
12 | self.learningRate = learningRate
13 | self.batchSize = batchSize
14 |
15 | def fit(self, trainset):
16 | AlgoBase.fit(self, trainset)
17 |
18 | numUsers = trainset.n_users
19 | numItems = trainset.n_items
20 |
21 | trainingMatrix = np.zeros([numUsers, numItems], dtype=np.float32)
22 |
23 | for (uid, iid, rating) in trainset.all_ratings():
24 | trainingMatrix[int(uid), int(iid)] = rating / 5.0
25 |
26 |         # Create an AutoRec autoencoder with one visible node per item
27 | autoRec = AutoRec(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
28 | autoRec.Train(trainingMatrix)
29 |
30 | self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
31 |
32 | for uiid in range(trainset.n_users):
33 | if (uiid % 50 == 0):
34 | print("Processing user ", uiid)
35 | recs = autoRec.GetRecommendations([trainingMatrix[uiid]])
36 |
37 | for itemID, rec in enumerate(recs):
38 | self.predictedRatings[uiid, itemID] = rec * 5.0
39 |
40 | return self
41 |
42 |
43 | def estimate(self, u, i):
44 |
45 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
46 |             raise PredictionImpossible('User and/or item is unknown.')
47 |
48 | rating = self.predictedRatings[u, i]
49 |
50 | if (rating < 0.001):
51 | raise PredictionImpossible('No valid prediction exists.')
52 |
53 | return rating
54 |
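
Since AutoRecAlgorithm subclasses surprise's AlgoBase, it also works with surprise's stock tooling; a quick sanity-check sketch using cross-validation (the epoch count here is arbitrary):

    from surprise.model_selection import cross_validate
    from MovieLens import MovieLens
    from AutoRecAlgorithm import AutoRecAlgorithm

    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()
    # 3-fold cross-validation of RMSE and MAE, as with any surprise algorithm
    cross_validate(AutoRecAlgorithm(epochs=20), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)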
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/AutoRecBakeOff.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from AutoRecAlgorithm import AutoRecAlgorithm
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | #Autoencoder
27 | AutoRec = AutoRecAlgorithm()
28 | evaluator.AddAlgorithm(AutoRec, "AutoRec")
29 |
30 | # Just make random recommendations
31 | Random = NormalPredictor()
32 | evaluator.AddAlgorithm(Random, "Random")
33 |
34 | # Fight!
35 | evaluator.Evaluate(True)
36 |
37 | evaluator.SampleTopNRecs(ml)
38 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/EvaluatedAlgorithm.py:
--------------------------------------------------------------------------------
1 | from RecommenderMetrics import RecommenderMetrics
2 | from EvaluationData import EvaluationData
3 |
4 | class EvaluatedAlgorithm:
5 |
6 | def __init__(self, algorithm, name):
7 | self.algorithm = algorithm
8 | self.name = name
9 |
10 | def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
11 | metrics = {}
12 | # Compute accuracy
13 | if (verbose):
14 | print("Evaluating accuracy...")
15 | self.algorithm.fit(evaluationData.GetTrainSet())
16 | predictions = self.algorithm.test(evaluationData.GetTestSet())
17 | metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
18 | metrics["MAE"] = RecommenderMetrics.MAE(predictions)
19 |
20 | if (doTopN):
21 | # Evaluate top-10 with Leave One Out testing
22 | if (verbose):
23 | print("Evaluating top-N with leave-one-out...")
24 | self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
25 | leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
26 | # Build predictions for all ratings not in the training set
27 | allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
28 | # Compute top 10 recs for each user
29 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
30 | if (verbose):
31 | print("Computing hit-rate and rank metrics...")
32 | # See how often we recommended a movie the user actually rated
33 | metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
34 | # See how often we recommended a movie the user actually liked
35 | metrics["cHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
36 | # Compute ARHR
37 | metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
38 |
39 | #Evaluate properties of recommendations on full training set
40 | if (verbose):
41 | print("Computing recommendations with full data set...")
42 | self.algorithm.fit(evaluationData.GetFullTrainSet())
43 | allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
44 | topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
45 | if (verbose):
46 | print("Analyzing coverage, diversity, and novelty...")
47 | # Print user coverage with a minimum predicted rating of 4.0:
48 | metrics["Coverage"] = RecommenderMetrics.UserCoverage( topNPredicted,
49 | evaluationData.GetFullTrainSet().n_users,
50 | ratingThreshold=4.0)
51 | # Measure diversity of recommendations:
52 | metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
53 |
54 | # Measure novelty (average popularity rank of recommendations):
55 | metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
56 | evaluationData.GetPopularityRankings())
57 |
58 | if (verbose):
59 | print("Analysis complete.")
60 |
61 | return metrics
62 |
63 | def GetName(self):
64 | return self.name
65 |
66 | def GetAlgorithm(self):
67 | return self.algorithm
68 |
69 |
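For reference, the hit-rate style metrics used above reduce to a loop over the left-out predictions; a stripped-down sketch of plain hit rate (assuming, as in RecommenderMetrics, that topNPredicted maps inner user IDs to lists of (movieID, estimatedRating) pairs):

    def HitRate(topNPredicted, leftOutPredictions):
        hits = 0
        # surprise predictions unpack as (userID, movieID, actualRating, estimatedRating, details)
        for userID, leftOutMovieID, _, _, _ in leftOutPredictions:
            if any(int(movieID) == int(leftOutMovieID)
                   for movieID, _ in topNPredicted[int(userID)]):
                hits += 1
        return hits / len(leftOutPredictions)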
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/EvaluationData.py:
--------------------------------------------------------------------------------
1 | from surprise.model_selection import train_test_split
2 | from surprise.model_selection import LeaveOneOut
3 | from surprise import KNNBaseline
4 |
5 | class EvaluationData:
6 |
7 | def __init__(self, data, popularityRankings):
8 |
9 | self.rankings = popularityRankings
10 |
11 | #Build a full training set for evaluating overall properties
12 | self.fullTrainSet = data.build_full_trainset()
13 | self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
14 |
15 | #Build a 75/25 train/test split for measuring accuracy
16 | self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
17 |
18 | #Build a "leave one out" train/test split for evaluating top-N recommenders
19 | #And build an anti-test-set for building predictions
20 | LOOCV = LeaveOneOut(n_splits=1, random_state=1)
21 | for train, test in LOOCV.split(data):
22 | self.LOOCVTrain = train
23 | self.LOOCVTest = test
24 |
25 | self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
26 |
27 |         #Compute similarity matrix between items so we can measure diversity
28 | sim_options = {'name': 'cosine', 'user_based': False}
29 | self.simsAlgo = KNNBaseline(sim_options=sim_options)
30 | self.simsAlgo.fit(self.fullTrainSet)
31 |
32 | def GetFullTrainSet(self):
33 | return self.fullTrainSet
34 |
35 | def GetFullAntiTestSet(self):
36 | return self.fullAntiTestSet
37 |
38 | def GetAntiTestSetForUser(self, testSubject):
39 | trainset = self.fullTrainSet
40 | fill = trainset.global_mean
41 | anti_testset = []
42 | u = trainset.to_inner_uid(str(testSubject))
43 | user_items = set([j for (j, _) in trainset.ur[u]])
44 | anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
45 | i in trainset.all_items() if
46 | i not in user_items]
47 | return anti_testset
48 |
49 | def GetTrainSet(self):
50 | return self.trainSet
51 |
52 | def GetTestSet(self):
53 | return self.testSet
54 |
55 | def GetLOOCVTrainSet(self):
56 | return self.LOOCVTrain
57 |
58 | def GetLOOCVTestSet(self):
59 | return self.LOOCVTest
60 |
61 | def GetLOOCVAntiTestSet(self):
62 | return self.LOOCVAntiTestSet
63 |
64 | def GetSimilarities(self):
65 | return self.simsAlgo
66 |
67 | def GetPopularityRankings(self):
68 | return self.rankings
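
GetAntiTestSetForUser mirrors surprise's build_anti_testset but restricted to a single user: it returns every movie that user has not rated, filled with the global mean rating so test() can score it. A small usage sketch (user 85 is just an example ID):

    evalData = EvaluationData(data, rankings)
    candidates = evalData.GetAntiTestSetForUser(85)
    # Each entry is (raw user ID, raw movie ID, global-mean fill rating)
    print(len(candidates), candidates[0])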
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/Evaluator.py:
--------------------------------------------------------------------------------
1 | from EvaluationData import EvaluationData
2 | from EvaluatedAlgorithm import EvaluatedAlgorithm
3 |
4 | class Evaluator:
5 |
6 |     def __init__(self, dataset, rankings):
7 |         # Keep the algorithm list per-instance; a class-level list would be shared across Evaluators
8 |         self.algorithms = []
9 |         ed = EvaluationData(dataset, rankings)
10 |         self.dataset = ed
11 |
12 | def AddAlgorithm(self, algorithm, name):
13 | alg = EvaluatedAlgorithm(algorithm, name)
14 | self.algorithms.append(alg)
15 |
16 | def Evaluate(self, doTopN):
17 | results = {}
18 | for algorithm in self.algorithms:
19 | print("Evaluating ", algorithm.GetName(), "...")
20 | results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
21 |
22 | # Print results
23 | print("\n")
24 |
25 | if (doTopN):
26 | print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
27 | "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
28 | for (name, metrics) in results.items():
29 | print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
30 | name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
31 | metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
32 | else:
33 | print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
34 | for (name, metrics) in results.items():
35 | print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
36 |
37 | print("\nLegend:\n")
38 | print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
39 | print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
40 | if (doTopN):
41 | print("HR: Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
42 | print("cHR: Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
43 | print("ARHR: Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
44 | print("Coverage: Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
45 | print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
46 | print(" for a given user. Higher means more diverse.")
47 | print("Novelty: Average popularity rank of recommended items. Higher means more novel.")
48 |
49 | def SampleTopNRecs(self, ml, testSubject=85, k=10):
50 |
51 | for algo in self.algorithms:
52 | print("\nUsing recommender ", algo.GetName())
53 |
54 | print("\nBuilding recommendation model...")
55 | trainSet = self.dataset.GetFullTrainSet()
56 | algo.GetAlgorithm().fit(trainSet)
57 |
58 | print("Computing recommendations...")
59 | testSet = self.dataset.GetAntiTestSetForUser(testSubject)
60 |
61 | predictions = algo.GetAlgorithm().test(testSet)
62 |
63 | recommendations = []
64 |
65 | print ("\nWe recommend:")
66 | for userID, movieID, actualRating, estimatedRating, _ in predictions:
67 | intMovieID = int(movieID)
68 | recommendations.append((intMovieID, estimatedRating))
69 |
70 | recommendations.sort(key=lambda x: x[1], reverse=True)
71 |
72 |             for ratings in recommendations[:k]:
73 | print(ml.getMovieName(ratings[0]), ratings[1])
74 |
75 |
76 |
77 |
78 |
79 |
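The Diversity number in the legend is one minus the mean similarity over every pair of items in each user's top-N list; a stripped-down sketch of the idea (assuming simsAlgo is the fitted KNNBaseline held by EvaluationData):

    from itertools import combinations

    def Diversity(topNPredicted, simsAlgo):
        simsMatrix = simsAlgo.compute_similarities()
        total, n = 0.0, 0
        for userID in topNPredicted.keys():
            for (m1, _), (m2, _) in combinations(topNPredicted[userID], 2):
                # Map raw movie IDs to inner IDs before indexing the similarity matrix
                i1 = simsAlgo.trainset.to_inner_iid(str(m1))
                i2 = simsAlgo.trainset.to_inner_iid(str(m2))
                total += simsMatrix[i1][i2]
                n += 1
        return 1 - (total / n)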
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/RBM.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | class RBM(object):
5 |
6 | def __init__(self, visibleDimensions, epochs=20, hiddenDimensions=50, ratingValues=10, learningRate=0.001, batchSize=100):
7 |
8 | self.visibleDimensions = visibleDimensions
9 | self.epochs = epochs
10 | self.hiddenDimensions = hiddenDimensions
11 | self.ratingValues = ratingValues
12 | self.learningRate = learningRate
13 | self.batchSize = batchSize
14 |
15 |
16 | def Train(self, X):
17 |
18 | for epoch in range(self.epochs):
19 | np.random.shuffle(X)
20 |
21 | trX = np.array(X)
22 | for i in range(0, trX.shape[0], self.batchSize):
23 | epochX = trX[i:i+self.batchSize]
24 | self.MakeGraph(epochX)
25 |
26 | print("Trained epoch ", epoch)
27 |
28 |
29 | def GetRecommendations(self, inputUser):
30 |
31 | feed = self.MakeHidden(inputUser)
32 | rec = self.MakeVisible(feed)
33 | return rec[0]
34 |
35 |     def MakeGraph(self, inputUser):
36 |
37 |         # Initialize weights and biases only once (re-creating them on every batch would
38 |         # discard all prior learning); reuse them on subsequent calls.
39 |         if not hasattr(self, 'weights'):
40 |             maxWeight = 4.0 * np.sqrt(6.0 / (self.hiddenDimensions + self.visibleDimensions))
41 |             self.weights = tf.Variable(tf.random.uniform([self.visibleDimensions, self.hiddenDimensions], minval=-maxWeight, maxval=maxWeight), tf.float32, name="weights")
42 |             self.hiddenBias = tf.Variable(tf.zeros([self.hiddenDimensions], tf.float32, name="hiddenBias"))
43 |             self.visibleBias = tf.Variable(tf.zeros([self.visibleDimensions], tf.float32, name="visibleBias"))
44 |         # Perform Gibbs sampling for contrastive divergence. Per the paper, we use k=1
45 |         # rather than iterating the forward pass multiple times, since it works well in practice.
46 |
47 | # Forward pass
48 | # Sample hidden layer given visible...
49 | # Get tensor of hidden probabilities
50 | hProb0 = tf.nn.sigmoid(tf.matmul(inputUser, self.weights) + self.hiddenBias)
51 | # Sample from all of the distributions
52 | hSample = tf.nn.relu(tf.sign(hProb0 - tf.random.uniform(tf.shape(hProb0))))
53 | # Stitch it together
54 | forward = tf.matmul(tf.transpose(inputUser), hSample)
55 |
56 | # Backward pass
57 | # Reconstruct visible layer given hidden layer sample
58 | v = tf.matmul(hSample, tf.transpose(self.weights)) + self.visibleBias
59 |
60 | # Build up our mask for missing ratings
61 | vMask = tf.sign(inputUser) # Make sure everything is 0 or 1
62 | vMask3D = tf.reshape(vMask, [tf.shape(v)[0], -1, self.ratingValues]) # Reshape into arrays of individual ratings
63 |         vMask3D = tf.reduce_max(vMask3D, axis=[2], keepdims=True) # reduce_max gives 1 for ratings that exist and 0 for missing ratings
64 |
65 | # Extract rating vectors for each individual set of 10 rating binary values
66 | v = tf.reshape(v, [tf.shape(v)[0], -1, self.ratingValues])
67 | vProb = tf.nn.softmax(v * vMask3D) # Apply softmax activation function
68 | vProb = tf.reshape(vProb, [tf.shape(v)[0], -1]) # And shove them back into the flattened state. Reconstruction is done now.
69 | # Stitch it together to define the backward pass and updated hidden biases
70 | hProb1 = tf.nn.sigmoid(tf.matmul(vProb, self.weights) + self.hiddenBias)
71 | backward = tf.matmul(tf.transpose(vProb), hProb1)
72 |
73 | # Now define what each epoch will do...
74 | # Run the forward and backward passes, and update the weights
75 | weightUpdate = self.weights.assign_add(self.learningRate * (forward - backward))
76 | # Update hidden bias, minimizing the divergence in the hidden nodes
77 | hiddenBiasUpdate = self.hiddenBias.assign_add(self.learningRate * tf.reduce_mean(hProb0 - hProb1, 0))
78 |         # Update the visible bias, minimizing divergence in the visible results
79 | visibleBiasUpdate = self.visibleBias.assign_add(self.learningRate * tf.reduce_mean(inputUser - vProb, 0))
80 |
81 | self.update = [weightUpdate, hiddenBiasUpdate, visibleBiasUpdate]
82 |
83 |     def MakeHidden(self, inputUser):
84 |         # Project a user's visible ratings into the hidden layer with the trained weights
85 |         hidden = tf.nn.sigmoid(tf.matmul(inputUser, self.weights) + self.hiddenBias)
86 |         return hidden
87 |
88 | def MakeVisible(self, feed):
89 | visible = tf.nn.sigmoid(tf.matmul(feed, tf.transpose(self.weights)) + self.visibleBias)
90 | #self.MakeGraph(feed)
91 | return visible
92 |
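In equation form, the three assign_add updates above are the standard CD-1 learning rule. Writing v for the input ratings, \tilde{v} for the softmax reconstruction, h_s for the sampled hidden units, and h_0, h_1 for the hidden probabilities before and after reconstruction:

    \Delta W   = \eta \, (v^{T} h_s - \tilde{v}^{T} h_1)
    \Delta b_h = \eta \, \mathrm{mean}(h_0 - h_1)
    \Delta b_v = \eta \, \mathrm{mean}(v - \tilde{v})

where \eta is the learning rate and the means are taken over the batch dimension, matching the tf.reduce_mean calls.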
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/RBMAlgorithm.py:
--------------------------------------------------------------------------------
1 | from surprise import AlgoBase
2 | from surprise import PredictionImpossible
3 | import numpy as np
4 | from RBM import RBM
5 |
6 | class RBMAlgorithm(AlgoBase):
7 |
8 | def __init__(self, epochs=20, hiddenDim=100, learningRate=0.001, batchSize=100, sim_options={}):
9 | AlgoBase.__init__(self)
10 | self.epochs = epochs
11 | self.hiddenDim = hiddenDim
12 | self.learningRate = learningRate
13 | self.batchSize = batchSize
14 |
15 | def softmax(self, x):
16 | return np.exp(x) / np.sum(np.exp(x), axis=0)
17 |
18 | def fit(self, trainset):
19 | AlgoBase.fit(self, trainset)
20 |
21 | numUsers = trainset.n_users
22 | numItems = trainset.n_items
23 |
24 | trainingMatrix = np.zeros([numUsers, numItems, 10], dtype=np.float32)
25 |
26 | for (uid, iid, rating) in trainset.all_ratings():
27 | adjustedRating = int(float(rating)*2.0) - 1
28 | trainingMatrix[int(uid), int(iid), adjustedRating] = 1
29 |
30 | # Flatten to a 2D array, with nodes for each possible rating type on each possible item, for every user.
31 | trainingMatrix = np.reshape(trainingMatrix, [trainingMatrix.shape[0], -1])
32 |
33 | # Create an RBM with (num items * rating values) visible nodes
34 | rbm = RBM(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate, batchSize=self.batchSize, epochs=self.epochs)
35 | rbm.Train(trainingMatrix)
36 |
37 | self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
38 | for uiid in range(trainset.n_users):
39 | if (uiid % 50 == 0):
40 | print("Processing user ", uiid)
41 | recs = rbm.GetRecommendations([trainingMatrix[uiid]])
42 | recs = np.reshape(recs, [numItems, 10])
43 |
44 | for itemID, rec in enumerate(recs):
45 | # The obvious thing would be to just take the rating with the highest score:
46 | #rating = rec.argmax()
47 | # ... but this just leads to a huge multi-way tie for 5-star predictions.
48 | # The paper suggests performing normalization over K values to get probabilities
49 | # and take the expectation as your prediction, so we'll do that instead:
50 | normalized = self.softmax(rec)
51 | rating = np.average(np.arange(10), weights=normalized)
52 | self.predictedRatings[uiid, itemID] = (rating + 1) * 0.5
53 |
54 | return self
55 |
56 |
57 | def estimate(self, u, i):
58 |
59 | if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
60 |             raise PredictionImpossible('User and/or item is unknown.')
61 |
62 | rating = self.predictedRatings[u, i]
63 |
64 | if (rating < 0.001):
65 | raise PredictionImpossible('No valid prediction exists.')
66 |
67 | return rating
68 |
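The rating encode/decode round trip in fit is easy to verify by hand; a small sketch showing how a 4.0-star rating maps onto one of the 10 binary rating nodes and back (mirroring the arithmetic above):

    import numpy as np

    rating = 4.0
    slot = int(rating * 2.0) - 1        # half-star scale: 4.0 stars -> slot index 7
    onehot = np.zeros(10)
    onehot[slot] = 1

    # Decoding: expected slot index under the (here perfectly confident)
    # reconstruction, rescaled back onto the 0.5..5.0 star range
    expected = np.average(np.arange(10), weights=onehot)
    stars = (expected + 1) * 0.5        # -> 4.0
    print(slot, stars)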
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/RBMBakeOff.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from RBMAlgorithm import RBMAlgorithm
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 |
6 | import random
7 | import numpy as np
8 |
9 | def LoadMovieLensData():
10 | ml = MovieLens()
11 | print("Loading movie ratings...")
12 | data = ml.loadMovieLensLatestSmall()
13 | print("\nComputing movie popularity ranks so we can measure novelty later...")
14 | rankings = ml.getPopularityRanks()
15 | return (ml, data, rankings)
16 |
17 | np.random.seed(0)
18 | random.seed(0)
19 |
20 | # Load up common data set for the recommender algorithms
21 | (ml, evaluationData, rankings) = LoadMovieLensData()
22 |
23 | # Construct an Evaluator to, you know, evaluate them
24 | evaluator = Evaluator(evaluationData, rankings)
25 |
26 | #RBM
27 | RBM = RBMAlgorithm(epochs=20)
28 | evaluator.AddAlgorithm(RBM, "RBM")
29 |
30 | # Just make random recommendations
31 | Random = NormalPredictor()
32 | evaluator.AddAlgorithm(Random, "Random")
33 |
34 | # Fight!
35 | evaluator.Evaluate(True)
36 |
37 | evaluator.SampleTopNRecs(ml)
38 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/RBMTuning.py:
--------------------------------------------------------------------------------
1 | from MovieLens import MovieLens
2 | from RBMAlgorithm import RBMAlgorithm
3 | from surprise import NormalPredictor
4 | from Evaluator import Evaluator
5 | from surprise.model_selection import GridSearchCV
6 |
7 | import random
8 | import numpy as np
9 |
10 | def LoadMovieLensData():
11 | ml = MovieLens()
12 | print("Loading movie ratings...")
13 | data = ml.loadMovieLensLatestSmall()
14 | print("\nComputing movie popularity ranks so we can measure novelty later...")
15 | rankings = ml.getPopularityRanks()
16 | return (ml, data, rankings)
17 |
18 | np.random.seed(0)
19 | random.seed(0)
20 |
21 | # Load up common data set for the recommender algorithms
22 | (ml, evaluationData, rankings) = LoadMovieLensData()
23 |
24 | print("Searching for best parameters...")
25 | param_grid = {'hiddenDim': [20, 10], 'learningRate': [0.1, 0.01]}
26 | gs = GridSearchCV(RBMAlgorithm, param_grid, measures=['rmse', 'mae'], cv=3)
27 |
28 | gs.fit(evaluationData)
29 |
30 | # best RMSE score
31 | print("Best RMSE score attained: ", gs.best_score['rmse'])
32 |
33 | # combination of parameters that gave the best RMSE score
34 | print(gs.best_params['rmse'])
35 |
36 | # Construct an Evaluator to, you know, evaluate them
37 | evaluator = Evaluator(evaluationData, rankings)
38 |
39 | params = gs.best_params['rmse']
40 | RBMtuned = RBMAlgorithm(hiddenDim = params['hiddenDim'], learningRate = params['learningRate'])
41 | evaluator.AddAlgorithm(RBMtuned, "RBM - Tuned")
42 |
43 | RBMUntuned = RBMAlgorithm()
44 | evaluator.AddAlgorithm(RBMUntuned, "RBM - Untuned")
45 |
46 | # Just make random recommendations
47 | Random = NormalPredictor()
48 | evaluator.AddAlgorithm(Random, "Random")
49 |
50 | # Fight!
51 | evaluator.Evaluate(False)
52 |
53 | evaluator.SampleTopNRecs(ml)
54 |
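As a side note, surprise's GridSearchCV also exposes best_estimator, so the tuned algorithm can be pulled out directly instead of being re-constructed from best_params; a minimal alternative sketch:

    # best_estimator maps each measure to an algorithm instance already
    # configured with the best parameters found for that measure
    RBMtuned = gs.best_estimator['rmse']
    evaluator.AddAlgorithm(RBMtuned, "RBM - Tuned")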
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/Recommendations with Deep Neural Networks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Recommendations with Deep Neural Networks"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Importing Dependencies"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from MovieLens import MovieLens\n",
24 | "from AutoRecAlgorithm import AutoRecAlgorithm\n",
25 | "from surprise import NormalPredictor\n",
26 | "from Evaluator import Evaluator"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "import random\n",
36 | "import numpy as np"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Loading Data"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "def LoadMovieLensData():\n",
53 | " ml = MovieLens()\n",
54 | " print(\"Loading movie ratings...\")\n",
55 | " data = ml.loadMovieLensLatestSmall()\n",
56 | " print(\"\\nComputing movie popularity ranks so we can measure novelty later...\")\n",
57 | " rankings = ml.getPopularityRanks()\n",
58 | " return (ml, data, rankings)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "np.random.seed(0)\n",
68 | "random.seed(0)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 5,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "Loading movie ratings...\n",
81 | "\n",
82 | "Computing movie popularity ranks so we can measure novelty later...\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "# Load up common data set for the recommender algorithms\n",
88 | "(ml, evaluationData, rankings) = LoadMovieLensData()"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "## Instantiating the Evaluator"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 6,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "Estimating biases using als...\n",
108 | "Computing the cosine similarity matrix...\n",
109 | "Done computing similarity matrix.\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "# Construct an Evaluator to, you know, evaluate them\n",
115 | "evaluator = Evaluator(evaluationData, rankings)"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "## Recommendation Algorithm"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 7,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "#Autoencoder\n",
132 | "AutoRec = AutoRecAlgorithm()\n",
133 | "evaluator.AddAlgorithm(AutoRec, \"AutoRec\")"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 8,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "# Just make random recommendations\n",
143 | "Random = NormalPredictor()\n",
144 | "evaluator.AddAlgorithm(Random, \"Random\")"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "## Evaluation"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "Evaluating AutoRec ...\n",
164 | "Evaluating accuracy...\n",
165 | "Trained epoch 0\n",
166 | "Trained epoch 1\n",
167 | "Trained epoch 2\n",
168 | "Trained epoch 3\n",
169 | "Trained epoch 4\n",
170 | "Trained epoch 5\n",
171 | "Trained epoch 6\n",
172 | "Trained epoch 7\n"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "evaluator.Evaluate(True)"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "evaluator.SampleTopNRecs(ml)"
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.8.2"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 4
211 | }
212 |
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/AutoRec.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/AutoRec.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/AutoRecAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/AutoRecAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/EvaluatedAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/EvaluatedAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/EvaluationData.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/EvaluationData.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/Evaluator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/Evaluator.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/MovieLens.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/MovieLens.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/RBM.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/RBM.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/RBMAlgorithm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/RBMAlgorithm.cpython-38.pyc
--------------------------------------------------------------------------------
/06 - Deep Learning for Recommender Systems/__pycache__/RecommenderMetrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amanjeetsahu/Recommender-Systems-Using-Python/012a21902c05bb0a20acbb91bd0df1bf49f67f99/06 - Deep Learning for Recommender Systems/__pycache__/RecommenderMetrics.cpython-38.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reccomender-Systems-Using-Python
--------------------------------------------------------------------------------