"
806 | ]
807 | },
808 | "output_type": "execute_result",
809 | "metadata": {}
810 | }
811 | ],
812 | "source": [
813 | "dataDf_train.head()"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "metadata": {},
819 | "outputs": [],
820 | "source": [
821 | ""
822 | ]
823 | }
824 | ],
825 | "metadata": {
826 | "anaconda-cloud": {},
827 | "kernelspec": {
828 | "display_name": "Python [Root]",
829 | "language": "python",
830 | "name": "Python [Root]"
831 | },
832 | "language_info": {
833 | "codemirror_mode": {
834 | "name": "ipython",
835 | "version": 2.0
836 | },
837 | "file_extension": ".py",
838 | "mimetype": "text/x-python",
839 | "name": "python",
840 | "nbconvert_exporter": "python",
841 | "pygments_lexer": "ipython2",
842 | "version": "2.7.12"
843 | }
844 | },
845 | "nbformat": 4,
846 | "nbformat_minor": 0
847 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RecommenderSystem
2 | A general form of recommender system
3 |
4 |
5 | # To start:
6 | Download the MovieLens 100k dataset from https://grouplens.org/datasets/movielens/100k/ \
7 | Unzip it into a directory that sits next to this project folder, under the same parent directory. \
8 | Your file system layout should look like this: \
9 | /project - mk-100/
10 | \ RecommenderSystem/
11 |
12 | Inside RecommenderSystem/, run Preprocessing.ipynb in Jupyter Notebook. \
13 | Then copy mk-100/DATA to RecommenderSystem/. \
14 |
15 | Now DATA/ contains all the data this code needs to run.
16 |
--------------------------------------------------------------------------------
/Ranker.py:
--------------------------------------------------------------------------------
1 | # Ranker
2 |
3 | import logging
4 | import numpy as np
5 |
6 | # rank the items from each recommendation module
7 | # highly influenced by business strategy and varies from system to system
8 | from DatabaseInterface import DatabaseInterface
9 |
10 |
class Ranker(object):
    """Merge the candidate lists from each recommendation module and pick
    the items that are finally served.

    How candidates are ranked is highly influenced by business strategy and
    varies from system to system; the strategy here is a uniform random
    sample of the merged candidates, preferring items the user has not
    interacted with yet.
    """
    logging.basicConfig(level=logging.INFO)

    def __init__(self, numberToServe, database):
        """
        :param numberToServe: number of items to return from ``rerank``
        :param database: object whose ``extract(key)`` returns the stored
            table for that key; the history table is read once, here
        """
        self.numberToServe = numberToServe
        # who rated what; indexed below by its "user_id" / "item_id" columns
        # NOTE(review): presumably a pandas DataFrame - confirm in DatabaseInterface
        self.userHistoryDB = database.extract(DatabaseInterface.HISTORY_KEY)
        self.log = logging.getLogger(__name__)

    def _getUsedItems(self, userId):
        """Return a set of the item ids found in the history rows of ``userId``.

        ``userId == -1`` is the anonymous user, who has no history.
        """
        if userId == -1:
            return set([])
        history = self.userHistoryDB
        ownRows = history[history.loc[:, "user_id"] == userId]
        return set(ownRows.loc[:, "item_id"])

    def rerank(self, recommendationsTuple):
        """Choose the items to serve from the raw recommendations.

        :param recommendationsTuple: ``(userId, recommendations)`` where
            ``recommendations`` is a dict of lists ``{RecType: items}`` and
            RecType can be "online", "offline" or "popular"; a ``userId``
            of -1 means the request came from an anonymous user
        :return: numpy array of distinct item ids, ``numberToServe`` long
            whenever the merged candidates contain that many distinct items

        Strategy: take up to ``numberToServe`` items from every section,
        drop the items the user has already seen and sample uniformly from
        what is left. If the user has seen so much that too few fresh items
        remain, all fresh items are served and the remainder is filled with
        already-seen candidates.
        """
        userId = recommendationsTuple[0]
        recommendations = recommendationsTuple[1]

        usedItems = self._getUsedItems(userId)

        self.log.info("Recommendations received in Ranker: %s" % recommendations)
        self.log.info("Recommendation types received in Ranker: %s" % recommendations.keys())
        results = []

        if "online" in recommendations:  # online exists as long as the user has been active
            results.extend(recommendations["online"][:self.numberToServe])

        if "offline" in recommendations:  # offline exists only for registered users (CF or KNN recs)
            results.extend(recommendations["offline"][:self.numberToServe])

        if "popular" in recommendations:  # most popular should always exist
            # if there are no personalized recs, the remainder is filled by most popular
            results.extend(recommendations["popular"][:self.numberToServe])
        else:
            self.log.error("recommendations do not contain popular items")

        # The same item can arrive from several sections; sample from the
        # distinct candidates so an item is never served twice.
        candidates = set(results)
        unseen = list(candidates - usedItems)
        if unseen and len(unseen) >= self.numberToServe:
            return np.random.choice(unseen, self.numberToServe, replace=False)

        # Too few fresh items (the user may have watched a lot). Sampling
        # the raw merged list instead could serve duplicates, and would
        # raise ValueError when it is shorter than numberToServe. Keep every
        # fresh item and top up with distinct already-seen candidates,
        # never asking for a larger sample than the pool holds.
        seen = list(candidates & usedItems)
        shortfall = min(self.numberToServe - len(unseen), len(seen))
        if shortfall > 0:
            filler = list(np.random.choice(seen, shortfall, replace=False))
        else:
            filler = []
        return np.array(unseen + filler)
if __name__ == "__main__":
    # Manual smoke test: load the data under DATA/ and list the items that
    # user 1 has already rated.
    from DatabaseInterface import DatabaseInterface
    db = DatabaseInterface("DATA")
    db.startEngine()
    ranker = Ranker(numberToServe=10, database=db)
    # print(...) with a single argument behaves identically on Python 2 and
    # Python 3, whereas the print statement is a SyntaxError on Python 3.
    print(sorted(ranker._getUsedItems(1)))
--------------------------------------------------------------------------------
/RecEngine.py:
--------------------------------------------------------------------------------
1 | # Recommendation Engine
2 |
3 | from ModelStore import ModelStore
4 | import logging
5 |
class RecEngine(object):
    """Collect raw recommendations for a request from the stored models.

    The result is organised in up to three sections: "popular", "online"
    and "offline".
    """
    logging.basicConfig(level=logging.INFO)

    def __init__(self, userAnalyzer, modelStore, userActivityTable):
        """
        :param userAnalyzer: object whose ``analyze(request, table)``
            classifies the requesting user
        :param modelStore: object whose ``getModel(key[, userId])`` returns
            a model exposing ``provideRec``
        :param userActivityTable: handed to the analyzer on every request
        """
        self.userAnalyzer = userAnalyzer
        self.modelStore = modelStore
        self.userActivityTable = userActivityTable
        # The most popular items do not depend on the user, so they are
        # computed once here instead of on every request.
        self._cacheMostPopular()
        self.log = logging.getLogger(__name__)

    def resetCache(self):
        """Recompute the cached most-popular list."""
        self._cacheMostPopular()

    def _cacheMostPopular(self):
        """Store the most-popular model's recommendations on the instance."""
        popularModel = self.modelStore.getModel(ModelStore.MP_MODEL_KEY)
        self.mostPopularList = popularModel.provideRec()

    def provideRecommendation(self, request):
        """Build the recommendation sections for ``request``.

        :param request: object carrying a ``userId`` attribute
        :return: tuple ``(userId, recommendations)`` where ``userId`` is the
            second element of the analyzer's answer and ``recommendations``
            is a dict with the keys "popular" (always), "online" (only when
            the online model returned something) and "offline" (only for
            "new" and "old" users)
        """
        # "popular" is always served from the cache.
        recommendations = {"popular": self.mostPopularList}
        requestAnalyzed = self.userAnalyzer.analyze(request, self.userActivityTable)

        # online recommendation, from the model kept for this user id
        onlineModel = self.modelStore.getModel(ModelStore.SI_MODEL_KEY, request.userId)
        onlineRecs = onlineModel.provideRec()

        userType = requestAnalyzed[0]
        self.log.info("user type: %s" %userType)

        if len(onlineRecs) > 0:
            recommendations["online"] = onlineRecs  # a list of ids

        # Pick the offline model by user type: KNN for a new user, CF for
        # an old user. Any other type gets no offline section.
        if userType == "new":
            offlineKey = ModelStore.KNN_MODEL_KEY
        elif userType == "old":
            offlineKey = ModelStore.CF_MODEL_KEY
        else:
            return requestAnalyzed[1], recommendations

        offlineModel = self.modelStore.getModel(offlineKey)
        recommendations["offline"] = offlineModel.provideRec(requestAnalyzed[2].userId)
        return requestAnalyzed[1], recommendations
51 |
52 |
53 |
--------------------------------------------------------------------------------
/UserAnalyzer.py:
--------------------------------------------------------------------------------
1 | # User Type Analyzer
2 | # Determine different type of user, send different user to different recommendation module
3 |
class UserAnalyzer(object):
    """Determine the type of a user so that different users can be sent to
    different recommendation modules.
    """

    # Activity count from which a registered user is treated as "old".
    DEFAULT_OLD_USER_THRESHOLD = 30

    # Class-level fallback so the threshold is defined even on an instance
    # that was created without running __init__.
    oldUserThreshold = DEFAULT_OLD_USER_THRESHOLD

    def __init__(self, oldUserThreshold=DEFAULT_OLD_USER_THRESHOLD):
        """
        :param oldUserThreshold: minimum activity count (inclusive) for a
            registered user to be classified as "old"; defaults to 30
        """
        self.oldUserThreshold = oldUserThreshold

    def analyze(self, request, userActivityDB):
        """Classify the user behind ``request``.

        :param request: object carrying a ``userId`` attribute; a ``str``
            id marks an anonymous request
        :param userActivityDB: per-user activity counts, looked up through
            ``.index`` and ``[userId]``
            (NOTE(review): presumably a pandas Series - confirm in
            DatabaseInterface)
        :return: list ``[userType, userId, request]`` where ``userType`` is
            "anonymous", "old" or "new"; anonymous requests get the id -1
        """
        userId = request.userId
        if isinstance(userId, str):
            # it is an anonymous request
            return ["anonymous", -1, request]
        if userId in userActivityDB.index and userActivityDB[userId] >= self.oldUserThreshold:
            # the user already has at least `oldUserThreshold` recorded
            # activities, so we call it an old user
            return ["old", userId, request]
        # too little recorded activity, or none at all
        return ["new", userId, request]

    def analyzeAction(self, action):
        """Return "anonymous" when ``action.userId`` is a ``str``, otherwise
        "registered".
        """
        if isinstance(action.userId, str):
            return "anonymous"
        return "registered"
28 |
--------------------------------------------------------------------------------
/Webserver.py:
--------------------------------------------------------------------------------
1 | # A simulation framework
2 | import logging
3 |
4 | from DatabaseInterface import DatabaseInterface
5 | from RecEngine import RecEngine
6 | from Ranker import Ranker
7 | from Learners.OfflineLearner import OfflineLearner
8 | from Learners.OnlineLearner import OnlineLearner
9 | from UserAnalyzer import UserAnalyzer
10 | from ModelStore import ModelStore
11 |
12 |
class WebServer(object):
    """A simulation framework that wires the recommender components together.

    Each collaborating object simulates an API call through the network;
    passing an object A to the constructor of B means the two communicate.
    """
    logging.basicConfig(level=logging.INFO)

    def __init__(self, configMap):
        """
        :param configMap: dict with the keys 'data_dir' (handed to
            DatabaseInterface) and 'numberToServe' (the number of items
            finally served to the users)
        """
        self.db = DatabaseInterface(configMap['data_dir'])
        self.numberToServe = configMap['numberToServe']
        self.log = logging.getLogger(__name__)

    def start(self):
        """Start the database, build all components and train the offline
        models, so that recommendations can be served once this returns.
        """
        self.db.startEngine()
        self.ranker = Ranker(self.numberToServe, self.db)
        self.userAnalyzer = UserAnalyzer()
        self.modelStore = ModelStore()
        self.offlineLearner = OfflineLearner(self.db, self.modelStore)
        self.onlineLearner = OnlineLearner(self.db, self.modelStore)
        # The offline models must be trained before the engine is built,
        # so that recommendations are available right after start().
        self.offlineLearner.trainModel()
        userActivity = self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY)
        self.recEngine = RecEngine(self.userAnalyzer, self.modelStore, userActivity)

    def getAction(self, action):
        """Take an action from a user: feed it to the online learner and,
        for registered users only, record it in the database.
        """
        assert isinstance(action, Action)
        self.onlineLearner.trainModel(action)
        # analyze the action type; only a registered user's action is saved
        if self.userAnalyzer.analyzeAction(action) == "registered":
            self.log.info("Recording action %s" %action)
            self.db.putAction(action)

    def provideRecommendation(self, request):
        """Return the ids of the items recommended for ``request``."""
        assert isinstance(request, Request)
        self.log.info("responding to request: %s" %request)
        rawRecommendations = self.recEngine.provideRecommendation(request)
        return self.ranker.rerank(rawRecommendations)  # a list of item ids

    def renderRecommendation(self, request):
        """Return the inventory entries of the recommended items.

        For the purpose of testing the output is ordered by the id value.
        """
        assert isinstance(request, Request)
        itemIds = self.provideRecommendation(request)
        inventory = self.db.extract(DatabaseInterface.INVENTORY_KEY)
        return inventory.loc[itemIds].sort_index()

    def increment(self):
        """Advance the whole system by one day: retrain the offline models,
        clean the online model and refresh the engine's cache.
        """
        self.log.info("incrementing the system, update the models")
        self.offlineLearner.trainModel()
        self.modelStore.cleanOnlineModel()
        self.recEngine.resetCache()

    def getFromInventory(self, itemId):
        """For demo purposes: given an itemId, return its inventory entry."""
        inventory = self.db.extract(DatabaseInterface.INVENTORY_KEY)
        return inventory.loc[itemId]
73 |
74 | # simulate a web request
class Request(object):
    """Simulate a web request issued on behalf of one user."""

    def __init__(self, userId):
        """
        :param userId: id of the requesting user
        """
        self.userId = userId

    def __str__(self):
        """Return a short human-readable description of the request."""
        userText = str(self.userId)
        return "request for user: " + userText
81 |
82 | # simulate a tracking event or a user's rating
class Action(object):
    """Simulate a tracking event, i.e. a user's rating of an item."""

    def __init__(self, userId, itemId,rating):
        """
        :param userId: id of the acting user
        :param itemId: id of the rated item
        :param rating: the score given to the item
        """
        self.userId, self.itemId, self.rating = userId, itemId, rating

    def __str__(self):
        """Return a short human-readable description of the action."""
        fields = (self.userId, self.itemId, self.rating)
        return "user: %s, item: %s, rating %s" % fields
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # main.py
2 | # simulate different request coming into the system
3 |
4 | from Webserver import WebServer, Request, Action
5 |
# Every print below uses the call form print(x): with a single argument it
# behaves identically on Python 2 and Python 3, whereas the print statement
# is a SyntaxError on Python 3.
configMap = {"numberToServe": 10, "data_dir": "DATA"}
server = WebServer(configMap)
server.start()  # load all the data in the database, start the first model training

# now experiment
reqX1 = Request(userId='X1')  # anonymous user
req1 = Request(userId=1)  # if it is a registered user, we use integer
print(reqX1)
print(req1)

recX1 = server.renderRecommendation(reqX1)  # output recommendations
print(recX1)

rec1 = server.renderRecommendation(req1)  # output recommendations
print(rec1)

# now we start an action
action1 = Action(1, 255, 5)  # user 1 rated item 255 as score 5
print(server.getFromInventory(255))  # find out the name of item 255
server.getAction(action1)  # feed the action to the server
rec1_afteraction = server.renderRecommendation(req1)  # get recommendation after the system knows about the action
print(rec1_afteraction)

actionX1 = Action('X1', 123, 5)  # anonymous user's action won't be saved in database
print(server.getFromInventory(123))
server.getAction(actionX1)
recX1_afteraction = server.renderRecommendation(reqX1)
print(recX1_afteraction)

# update the system, e.g. one day has passed
server.increment()
# the system should forget about actionX1
recX1_aftercleaning = server.renderRecommendation(reqX1)
print(recX1_aftercleaning)  # should be similar to recX1


req19 = Request(userId=19)  # the one with very few history, so it is a new user
rec19 = server.renderRecommendation(req19)
print(rec19)
--------------------------------------------------------------------------------