├── .idea └── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── DatabaseInterface.py ├── Learners ├── OfflineLearner.py ├── OnlineLearner.py └── __init__.py ├── ModelStore.py ├── Models ├── CFmodel.py ├── ClusteringModel.py ├── KNNmodel.py ├── MostPopularModel.py ├── SimilarItemModel.py └── __init__.py ├── Preprocessing.ipynb ├── README.md ├── Ranker.py ├── RecEngine.py ├── UserAnalyzer.py ├── Webserver.py └── main.py
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(IDE inspection profile; content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(IDE profile settings; content not captured in this dump)
--------------------------------------------------------------------------------
/DatabaseInterface.py:
--------------------------------------------------------------------------------
 1 | # Database Interface
 2 | # to simulate some database operations
 3 | 
 4 | import os
 5 | import pandas as pd
 6 | import logging
 7 | 
 8 | class DatabaseInterface(object):
 9 |     logging.basicConfig(level=logging.INFO)
10 | 
11 |     # in reality, this should come from a configuration file
12 |     HISTORY = "ratings.csv"
13 |     USER_FEATURE = "userFeature.csv"
14 |     ITEM_FEATURE = "itemFeature.csv"
15 |     INVENTORY = "inventory.csv"  # in reality, the inventory stores all the representations, such as video links
16 | 
17 |     HISTORY_KEY = "history"
18 |     USER_FEATURE_KEY = "user_feature"
19 |     ITEM_FEATURE_KEY = "item_feature"
20 |     INVENTORY_KEY = "inventory"
21 |     USER_ACTIVITY_KEY = "user_activity"
22 | 
23 |     # register the static tables first
24 |     dbTable = {HISTORY_KEY: HISTORY,
25 |                USER_FEATURE_KEY: USER_FEATURE,
26 |                ITEM_FEATURE_KEY: ITEM_FEATURE,
27 |                INVENTORY_KEY: INVENTORY}
28 | 
29 |     def __init__(self, path):
30 |         self.log = logging.getLogger(__name__)
31 |         self.path = path
32 |         self.started = False
33 |         self.connTable = {}
34 | 
35 |     def startEngine(self):
36 |         if self.started:
37 |             self.log.warning("the database has already started")
38 |             # restarting a running engine is not permitted here since it would remove all unsaved data
39 |         else:
40 |             self.log.info("starting the database engine...")
41 |             for tableName, tablePath in self.dbTable.iteritems():
42 |                 self.log.info("loading table %s..." % tableName)
43 |                 self.connTable[tableName] = pd.read_csv(os.path.join(self.path, tablePath), index_col=0)
44 | 
45 |             self.log.info("creating table user_activity...")
46 |             self.connTable[self.USER_ACTIVITY_KEY] = self.connTable["history"].groupby("user_id").size()  # actually a Series
47 | 
48 |             self.log.info("database successfully started")
49 |             self.started = True
50 | 
51 |     # ideally SQL would be used to query a database; in this case, pandas operations are used by the clients instead
52 |     # https://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html
53 |     def extract(self, tableName):
54 |         return self.connTable[tableName]
55 | 
56 |     def putAction(self, action):
57 |         self.insertRow(self.connTable[self.HISTORY_KEY], [action.userId, action.itemId, action.rating])
58 | 
59 |     def insertRow(self, df, row):
60 |         # unsafe insertion into a pandas dataframe: assumes a default 0..n-1 integer index
61 |         df.loc[len(df)] = row
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     connector = DatabaseInterface("DATA")
66 |     connector.startEngine()
67 |     df1 = connector.connTable["history"]
68 |     print df1.head()
69 |     df2 = connector.connTable["user_activity"]
70 |     print df2[10]
71 |     df3 = connector.connTable["item_feature"]
72 |     print df3.loc[:,"unknown":]
73 |     df4 = connector.connTable["user_feature"]
74 |     print df4.loc[:,"age":]
75 |     print set(df1[df1.loc[:,"user_id"]==2].loc[:,"item_id"])
76 | 
77 | 
--------------------------------------------------------------------------------
/Learners/OfflineLearner.py:
--------------------------------------------------------------------------------
 1 | # Offline Learner
 2 | # Read in user history data, make models
 3 | 
 4 | from ModelStore import ModelStore
 5 | from DatabaseInterface import DatabaseInterface
 6 | import logging
 7 | import numpy as np
 8 | 
 9 | class OfflineLearner(object):
10 |     logging.basicConfig(level=logging.INFO)
11 | 
12 |     def __init__(self, database, modelStore):
13 |         self.database = database
14 |         self.modelStore = modelStore
15 |         self.log = logging.getLogger(__name__)
16 |         self.modelRegistry = [(ModelStore.KNN_MODEL_KEY, "k nearest neighbor most popular model"),
17 |                               (ModelStore.MP_MODEL_KEY, "most popular item model"),
18 |                               (ModelStore.CL_MODEL_KEY, "item feature clustering model"),
19 |                               (ModelStore.CF_MODEL_KEY, "collaborative filtering model")]
20 | 
21 |     def trainModel(self):
22 |         self.log.info("Start offline training...")
23 |         self.log.info("creating training data...")
24 |         # now extract the data; the four offline models are:
25 |         # K nearest neighbor: provides similar users;
26 |         #     trained on user features;
27 |         #     given a user feature vector, predict the nearest users;
28 |         #     recommend based on the nearest neighbors' ratings (uses the user-item rating matrix)
29 |         # most popular item:
30 |         #     trained on the user-item rating history;
31 |         #     predict the most popular items;
32 |         #     recommend the most popular items
33 |         # item feature clustering model (only used by the online similar-item model):
34 |         #     trained on item features;
35 |         #     given item features, predict groups
36 |         # collaborative filtering:
37 |         #     trained on the user-item rating matrix;
38 |         #     given a userId or itemId, give the full prediction;
39 |         #     recommend items with high predicted ratings
40 |         historyRating = self.database.extract(DatabaseInterface.HISTORY_KEY)
41 |         itemFeatureTable = self.database.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":]
42 |         userFeatureTable = self.database.extract(DatabaseInterface.USER_FEATURE_KEY).loc[:, "age":]
43 |         ratingsMat = self.transformToMat(historyRating)
44 | 
45 |         # update the models and push them back
46 |         # for offline models, we only implement full retraining here, but one should
always have a backup 47 | # here we directly update the current model 48 | self.log.info("loading models...") 49 | for record in self.modelRegistry: 50 | model = self.modelStore.getModel(record[0]) 51 | self.log.info("training %s" %record[1]) 52 | if record[0] == ModelStore.KNN_MODEL_KEY: 53 | model.train(userFeatureTable, ratingsMat) 54 | elif record[0] == ModelStore.MP_MODEL_KEY: 55 | model.train(historyRating) 56 | elif record[0] == ModelStore.CL_MODEL_KEY: 57 | model.train(itemFeatureTable) 58 | elif record[0] == ModelStore.CF_MODEL_KEY: 59 | model.train(ratingsMat, itemFeatureTable) 60 | else: 61 | raise Exception("model registry may be broken") 62 | 63 | self.log.info("updating %s", record[1]) 64 | self.pushModel(model, record[0]) 65 | 66 | 67 | def pushModel(self, model, key): 68 | self.modelStore.setModel(model, key) 69 | 70 | @staticmethod 71 | def transformToMat(historyRating): 72 | n_users = historyRating.user_id.max() 73 | n_items = historyRating.item_id.max() 74 | ratingsMat = np.zeros([n_users, n_items]) 75 | for r in historyRating.itertuples(): 76 | ratingsMat[r[1]-1,r[2]-1] = r[3] 77 | return ratingsMat 78 | 79 | 80 | if __name__=="__main__": 81 | db = DatabaseInterface("DATA") 82 | db.startEngine() 83 | modelStore = ModelStore() 84 | learner = OfflineLearner(db, modelStore) 85 | learner.trainModel() -------------------------------------------------------------------------------- /Learners/OnlineLearner.py: -------------------------------------------------------------------------------- 1 | # Online Learner 2 | # take in the user action, serve models 3 | 4 | from ModelStore import ModelStore 5 | from DatabaseInterface import DatabaseInterface 6 | import logging 7 | 8 | class OnlineLearner(object): 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | def __init__(self, database, modelStore): 12 | self.database = database 13 | self.modelStore = modelStore 14 | self.log = logging.getLogger(__name__) 15 | 16 | def trainModel(self, action): 17 | self.log.info("training on action: (%s)" %action) 18 | # action has three fields (userId, itemId, rate) 19 | userId = action.userId 20 | itemId = action.itemId 21 | rating = action.rating 22 | model = self.modelStore.getModel(ModelStore.SI_MODEL_KEY, userId) 23 | itemFeatureTable = self.database.extract(DatabaseInterface.ITEM_FEATURE_KEY) 24 | itemFeature = itemFeatureTable.loc[itemId, "unknown":] 25 | model.train(itemFeature, rating) 26 | self.pushModel(model, userId) 27 | 28 | def pushModel(self, model, userId): 29 | self.log.info("pushing model for user: %s" %userId) 30 | self.modelStore.setModel(model, ModelStore.SI_MODEL_KEY, userId) 31 | -------------------------------------------------------------------------------- /Learners/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ModelStore.py: -------------------------------------------------------------------------------- 1 | # model store 2 | # keep all the models 3 | # responsible to send the models to RecEngine 4 | 5 | from Models.ClusteringModel import ClusteringModel 6 | from Models.SimilarItemModel import SimilarItemModel 7 | from Models.CFmodel import CFmodel 8 | from Models.MostPopularModel import MostPopularModel 9 | from Models.KNNmodel import KNNmodel 10 | 11 | 12 | class ModelStore(object): 13 | 14 | CF_MODEL_KEY = "cf_model_key" # collaborative filtering 15 | KNN_MODEL_KEY = "knn_model_key" # K nearest neighbor most popular model 16 | 
MP_MODEL_KEY = "mp_model_key" # most popular 17 | SI_MODEL_KEY = "si_model_key" # similar item 18 | CL_MODEL_KEY = "cl_model_key" # clustering model, used for similarity item model 19 | 20 | def __init__(self): 21 | self.persistModels = {self.KNN_MODEL_KEY: KNNmodel(), 22 | self.MP_MODEL_KEY: MostPopularModel(), 23 | self.CL_MODEL_KEY: ClusteringModel(), 24 | self.CF_MODEL_KEY: CFmodel()} 25 | 26 | # similarity model is used for each user 27 | # online recommendation, trained by online learner 28 | self.transientModels = {self.SI_MODEL_KEY: {}} 29 | 30 | def setModel(self, model, key, memberId = None): 31 | if memberId is None: 32 | self.persistModels[key] = model 33 | else: 34 | self.transientModels[key][memberId] = model 35 | 36 | def getModel(self, key, memberId = None): 37 | #send out the object of models to learning system 38 | if memberId is None: 39 | return self.persistModels[key] 40 | else: 41 | transientModels = self.transientModels[key] 42 | if memberId in transientModels: 43 | return transientModels[memberId] 44 | else: 45 | # it means it is the first time we build the online model for this particular user 46 | assert self.persistModels[self.CL_MODEL_KEY].trained 47 | # since the online model we used is depending on the clustering model trained offline 48 | # this assert is to make sure the clustering model is already trained. 49 | 50 | # in this case, we create a new model for this user 51 | return SimilarItemModel(self.persistModels[self.CL_MODEL_KEY]) 52 | 53 | def cleanOnlineModel(self): 54 | self.transientModels = {self.SI_MODEL_KEY: {}} 55 | 56 | -------------------------------------------------------------------------------- /Models/CFmodel.py: -------------------------------------------------------------------------------- 1 | # Collaborative filtering model 2 | import numpy as np 3 | from sklearn.neighbors import NearestNeighbors 4 | import logging 5 | 6 | 7 | class CFmodel(): 8 | RARECASE_THRESHOLD = 5 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | def __init__(self): 12 | self.knnModel = NearestNeighbors(n_neighbors=15) 13 | self.log = logging.getLogger(__name__) 14 | 15 | def _CFSVD(self, ratingsMat): 16 | user_ratings_mean = np.mean(ratingsMat, axis = 1) # mean over user ratings 17 | R_demeaned = ratingsMat - user_ratings_mean.reshape(-1, 1) 18 | from scipy.sparse.linalg import svds 19 | U, sigma, Vt = svds(R_demeaned, k = 10) 20 | sigma = np.diag(sigma) 21 | self.all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1) 22 | 23 | 24 | def train(self, ratingsMat, itemFeatureTable): 25 | # the logic: 26 | # using content-based modeling for rare items, predict some ratings 27 | # using the ratings matrix filled with the predicted ratings from the content-based model to do matrix factorization 28 | # itemFeatureTable is used for content-based model, which will predict for those items with few ratings 29 | # SVD will be used for collaborative filtering after the rare items have enough ratings 30 | indices = itemFeatureTable.index 31 | self.knnModel.fit(itemFeatureTable) 32 | # print ratingsMat.shape 33 | assert(ratingsMat.shape[1] == itemFeatureTable.index.max()) 34 | rareCases = np.where((ratingsMat>0).sum(axis=0) < self.RARECASE_THRESHOLD)[0] 35 | # if an item has less than 5 ratings, it is considered as a rare case 36 | # it is the 0-based matrix indices 37 | self.log.info("Number of rare cases: %s" %rareCases.shape[0]) 38 | 39 | fillCount = 0 40 | ratingsMatFinal = ratingsMat.copy() 41 | for case in rareCases: 42 | if 
case+1 in itemFeatureTable.index:
43 |                 features = itemFeatureTable.loc[case+1]
44 |                 neighbors = self.knnModel.kneighbors(features.values.reshape(1, -1), return_distance=False)[0]
45 |                 neighborPos = indices[neighbors]-1
46 |                 # compute the number of ratings each user has given to the item's neighbors
47 |                 target_count = (ratingsMat[:,neighborPos] > 0).sum(axis=1)
48 |                 # compute the predicted ratings generated by the content-based model
49 |                 target_ratings = ratingsMat[:,neighborPos].sum(axis=1).astype(float)/target_count
50 |                 # i.e. each user's nonzero mean over the neighbors
51 | 
52 |                 for i in range(ratingsMat.shape[0]):
53 |                     if ratingsMat[i, case] == 0 and target_count[i]>10:
54 |                         # fill only if the rating is missing and user i has rated more than 10 of the item's neighbors
55 |                         if target_ratings[i]!=0:
56 |                             ratingsMatFinal[i,case] = target_ratings[i]
57 |                             fillCount += 1
58 | 
59 |         # now we have the filled matrix for matrix factorization
60 |         self.log.info("Number of ratings added by content-based model: %s" %fillCount)
61 | 
62 |         self._CFSVD(ratingsMatFinal)
63 | 
64 | 
65 | 
66 |     def predict(self, userId):
67 |         return self.all_user_predicted_ratings[userId-1]
68 | 
69 |     def provideRec(self, userId):
70 |         # sort the user's predicted scores from large to small, then report the item ids
71 |         return self.all_user_predicted_ratings[userId-1].argsort()[::-1]+1
72 | 
73 | if __name__=="__main__":
74 |     from DatabaseInterface import DatabaseInterface
75 |     from Learners.OfflineLearner import OfflineLearner
76 |     db = DatabaseInterface("../DATA")
77 |     db.startEngine()
78 |     history = db.extract("history")
79 |     itemFeatureTable = db.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":]
80 |     ratingsMat = OfflineLearner.transformToMat(history)
81 | 
82 |     model = CFmodel()
83 |     model.train(ratingsMat, itemFeatureTable)
84 | 
85 |     recs = model.provideRec(1)
86 |     print recs
87 |     print ratingsMat[0,recs-1]
--------------------------------------------------------------------------------
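A quick toy check of the demeaned-SVD reconstruction used in CFmodel._CFSVD (a sketch; the 5x4 matrix and k=2 are made up for illustration):

    import numpy as np
    from scipy.sparse.linalg import svds

    R = np.array([[5., 3., 0., 1.],
                  [4., 0., 0., 1.],
                  [1., 1., 0., 5.],
                  [1., 0., 0., 4.],
                  [0., 1., 5., 4.]])
    user_mean = R.mean(axis=1).reshape(-1, 1)           # demean per user, as in _CFSVD
    U, sigma, Vt = svds(R - user_mean, k=2)             # truncated SVD of the demeaned matrix
    approx = U.dot(np.diag(sigma)).dot(Vt) + user_mean  # add the user means back
    print np.round(approx, 2)                           # dense predictions, including the unrated (zero) cells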
/Models/ClusteringModel.py:
--------------------------------------------------------------------------------
 1 | from sklearn.cluster import KMeans
 2 | 
 3 | # the clustering model groups items with similar features
 4 | # it is used for the online recommendation
 5 | 
 6 | class ClusteringModel():
 7 |     def __init__(self, n_cluster=10):
 8 |         self.model = KMeans(n_clusters=n_cluster, random_state=12345)  # set the random state for reproducibility
 9 |         self.groups = {}  # keyed by cluster index; values are itemIds
10 |         self.trained = False
11 | 
12 |     def train(self, itemFeatures):
13 |         self.indices = itemFeatures.index  # the itemIds
14 |         self.model.fit(itemFeatures)
15 |         self.labels = self.model.labels_
16 |         # the label given to each data point indicates which cluster it belongs to
17 |         # for example, if we have four data points 1,2,3,4:
18 |         # dataId, clusterId
19 |         # 1       1
20 |         # 2       1
21 |         # 3       2
22 |         # 4       2
23 |         # and we want {1:[1,2], 2:[3,4]}, called self.groups
24 |         for k, v in zip(self.labels, itemFeatures.index.tolist()):
25 |             self.groups.setdefault(k,[]).append(v)
26 |         self.trained = True
27 | 
28 |     def predict(self, itemFeatures):
29 |         centers = self.model.predict(itemFeatures)
30 |         # based on the predicted centers, find the corresponding cluster members
31 |         return centers, [self.groups[c] for c in centers]
32 | 
33 | 
34 | if __name__=="__main__":
35 |     from DatabaseInterface import DatabaseInterface
36 |     db = DatabaseInterface("../DATA")
37 |     db.startEngine()
38 |     itemFeatureTable = db.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":]
39 | 
40 |     model = ClusteringModel()
41 |     model.train(itemFeatureTable)
42 | 
43 |     print model.predict(itemFeatureTable.loc[1].values.reshape(1,-1))
44 |     print itemFeatureTable.loc[[1,422]]
45 |     print model.labels[:20]
--------------------------------------------------------------------------------
/Models/KNNmodel.py:
--------------------------------------------------------------------------------
 1 | # KNN model
 2 | # for a new user: use the user features to find the k nearest neighbors, then use their ratings for the recommendation
 3 | 
 4 | import numpy as np
 5 | from sklearn.neighbors import NearestNeighbors
 6 | 
 7 | 
 8 | class KNNmodel():
 9 |     def __init__(self):
10 |         self.knnModel = None
11 | 
12 |     def train(self, userFeatureTable, ratingsMat):
13 |         userFeatureTable.loc[:,"age"] = userFeatureTable.loc[:,"age"]/10.
14 |         # ad hoc normalization to keep the features' ranges similar (note: this mutates the caller's dataframe)
15 |         self.knnModel = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(userFeatureTable)
16 | 
17 |         # ratingsMat is the rating matrix
18 |         self.ratingsMat = ratingsMat
19 |         self.userFeatureTable = userFeatureTable
20 |         self.userIds = self.userFeatureTable.index  # the actual order seen by the knn model
21 | 
22 |     def predict(self, userFeature):
23 |         distances, indices = self.knnModel.kneighbors(userFeature)
24 |         # indices are the nearest neighbors' positions in the matrix, which differ from the userIds
25 |         return self.userIds[indices[0]]
26 | 
27 |     def provideRec(self, userId):
28 |         # look up the user's own feature vector and find the nearest users
29 |         userIds = self.predict(self.userFeatureTable.loc[userId].values.reshape(1,-1))
30 |         # remove the user himself from the nearest neighbors
31 |         userIds = np.array(list(set(userIds) - set([userId])))
32 | 
33 |         # over all nearest neighbors, compute the average score per item, sort from large to small,
34 |         # then report the item ids
35 |         return self.ratingsMat[userIds-1].mean(axis = 0).argsort()[::-1]+1
36 | 
37 | 
38 | 
39 | if __name__=="__main__":
40 |     from DatabaseInterface import DatabaseInterface
41 |     from Learners.OfflineLearner import OfflineLearner
42 |     db = DatabaseInterface("../DATA")
43 |     db.startEngine()
44 |     history = db.extract("history")
45 |     userFeatureTable = db.extract(DatabaseInterface.USER_FEATURE_KEY).loc[:, "age":]
46 |     ratingsMat = OfflineLearner.transformToMat(history)
47 | 
48 |     model = KNNmodel()
49 |     model.train(userFeatureTable, ratingsMat)
50 |     print model.provideRec(97)[:20]
51 |     print ratingsMat[96,model.provideRec(97)-1][:20]
--------------------------------------------------------------------------------
/Models/MostPopularModel.py:
--------------------------------------------------------------------------------
 1 | # most popular model
 2 | # a simple design: find the items with the highest average rating among those rated by enough users
 3 | 
 4 | class MostPopularModel():
 5 |     N_Freq_limit = 0.001  # an item must appear in at least 0.1% of all rating records before we consider it for most popular
 6 | 
 7 |     def __init__(self):
 8 |         pass
 9 | 
10 |     def train(self, history):
11 |         # history must be a dataframe whose second column is the itemID and third column is the ratings
12 |         itemID = list(history)[1]
13 |         ratings = list(history)[2]
14 | 
15 |         # what if an item got rated by only one user, and the rating is 5: are we confident it is the most popular?
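        # a worked check (assuming the full MovieLens 100k history of 100,000 ratings):
        # nLimit = int(100000 * 0.001) = 100, so an item needs more than 100 ratings
        # before its mean rating can qualify it as "most popular"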
16 | nLimit = int(history.shape[0]*self.N_Freq_limit) 17 | itemRatingGrouped = history.groupby(itemID) 18 | itemRatingGroupedCount = itemRatingGrouped[ratings].count() 19 | # print itemRatingGrouped[ratings].mean() 20 | self.mostPopular = itemRatingGrouped[ratings].mean()[itemRatingGroupedCount>nLimit].sort_values(ascending=False) 21 | 22 | def predict(self,X): 23 | # X can only be a list of itemID's 24 | return [self.mostPopular.index.get_loc(x) for x in X] 25 | 26 | def provideRec(self): 27 | return self.mostPopular.index.tolist() 28 | 29 | 30 | 31 | if __name__=="__main__": 32 | from DatabaseInterface import DatabaseInterface 33 | db = DatabaseInterface("DATA") 34 | db.startEngine() 35 | df = db.extract("history") 36 | print df.head() 37 | model = MostPopularModel() 38 | model.train(df) 39 | print model.mostPopular 40 | print model.predict([408]) 41 | print model.provideRec() -------------------------------------------------------------------------------- /Models/SimilarItemModel.py: -------------------------------------------------------------------------------- 1 | # similar item model 2 | # underneath it is to use a clustering model 3 | # for simplicity, return all in the same cluster if rating is higher or equal to 3; return empty cluster otherwise 4 | 5 | class SimilarItemModel(): 6 | THRESHOLD = 3.0 # if ratings are below threshold, it will not be used 7 | 8 | def __init__(self, clusteringModel): 9 | # we use a trained clustering model (trained offline) 10 | self.clusteringModel = clusteringModel 11 | self.recs = [] 12 | 13 | def train(self, itemFeature, rating): 14 | # itemFeature: the feature of the item in the Action 15 | # rating: the rating of the user to the item, also in the Action 16 | # only single record 17 | # each model learns one person's current interest 18 | itemFeature = itemFeature.values.reshape(1,-1) 19 | center, indices = self.clusteringModel.predict(itemFeature) 20 | 21 | # indices: the itemIds that are in the same cluster as the item we get 22 | # that is, the similar items 23 | 24 | if rating >= self.THRESHOLD: 25 | self.recs = indices[0] # the indices is a list of list, like: [[1,2,3,4,5]] 26 | else: 27 | self.recs = [] 28 | 29 | def predict(self, itemFeature): 30 | # X should be item's category feature, only single record 31 | # return the similar items 32 | itemFeature = itemFeature.values.reshape(1,-1) 33 | center, indices = self.clusteringModel.predict(itemFeature) 34 | return indices[0] 35 | 36 | def provideRec(self): 37 | return self.recs 38 | 39 | 40 | if __name__=="__main__": 41 | from DatabaseInterface import DatabaseInterface 42 | from Models.ClusteringModel import ClusteringModel 43 | db = DatabaseInterface("../DATA") 44 | db.startEngine() 45 | itemFeatureTable = db.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":] 46 | 47 | model = ClusteringModel() 48 | model.train(itemFeatureTable) 49 | 50 | modelSI = SimilarItemModel(model) 51 | modelSI.train(itemFeatureTable.loc[1], 4) 52 | print modelSI.provideRec() 53 | modelSI.train(itemFeatureTable.loc[1], 2) 54 | print modelSI.provideRec() 55 | -------------------------------------------------------------------------------- /Models/__init__.py: -------------------------------------------------------------------------------- 1 | # All models should recommend the indices of items in the inventory 2 | -------------------------------------------------------------------------------- /Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 144, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 145, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "\u001b[34mDATA\u001b[m\u001b[m/ u.genre u2.test ua.test\r\n", 23 | "README u.info u3.base ub.base\r\n", 24 | "\u001b[31mallbut.pl\u001b[m\u001b[m* u.item u3.test ub.test\r\n", 25 | "itemFeature.csv u.occupation u4.base user_test.csv\r\n", 26 | "\u001b[31mmku.sh\u001b[m\u001b[m* u.user u4.test user_train.csv\r\n", 27 | "ratings_test.csv u1.base u5.base\r\n", 28 | "ratings_train.csv u1.test u5.test\r\n", 29 | "u.data u2.base ua.base\r\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "ls ../ml-100k/" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 146, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "/Users/Ke/Google Drive/Bit-tiger-AI-Engineer/Week3/Project/ml-100k\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "cd ../ml-100k/" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 147, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "\u001b[34mDATA\u001b[m\u001b[m/ u.genre u2.test ua.test\r\n", 64 | "README u.info u3.base ub.base\r\n", 65 | "\u001b[31mallbut.pl\u001b[m\u001b[m* u.item u3.test ub.test\r\n", 66 | "itemFeature.csv u.occupation u4.base user_test.csv\r\n", 67 | "\u001b[31mmku.sh\u001b[m\u001b[m* u.user u4.test user_train.csv\r\n", 68 | "ratings_test.csv u1.base u5.base\r\n", 69 | "ratings_train.csv u1.test u5.test\r\n", 70 | "u.data u2.base ua.base\r\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "ls" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 148, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "mkdir: DATA: File exists\r\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "mkdir DATA" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 149, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "1|24|M|technician|85711\r\n", 105 | "2|53|F|other|94043\r\n", 106 | "3|23|M|writer|32067\r\n", 107 | "4|24|M|technician|43537\r\n", 108 | "5|33|F|other|15213\r\n", 109 | "6|42|M|executive|98101\r\n", 110 | "7|57|M|administrator|91344\r\n", 111 | "8|36|M|administrator|05201\r\n", 112 | "9|29|M|student|01002\r\n", 113 | "10|53|M|lawyer|90703\r\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "!head u.user" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 150, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "names = ['user_id','age','gender','occupation','zipcode']\n", 128 | "userDf = pd.read_csv('u.user',sep='|',names=names, index_col=0) \n", 129 | "userDf = userDf.loc[:, 'age':'occupation']" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 151, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/html": [ 140 | "
\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | "
agegender_Fgender_Moccupation_administratoroccupation_artistoccupation_doctoroccupation_educatoroccupation_engineeroccupation_entertainmentoccupation_executive...occupation_marketingoccupation_noneoccupation_otheroccupation_programmeroccupation_retiredoccupation_salesmanoccupation_scientistoccupation_studentoccupation_technicianoccupation_writer
user_id
1240.01.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.00.0
2531.00.00.00.00.00.00.00.00.0...0.00.01.00.00.00.00.00.00.00.0
3230.01.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
4240.01.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.00.0
5331.00.00.00.00.00.00.00.00.0...0.00.01.00.00.00.00.00.00.00.0
\n", 315 | "

5 rows × 24 columns

\n", 316 | "
" 317 | ] 318 | }, 319 | "output_type": "execute_result", 320 | "metadata": {} 321 | } 322 | ], 323 | "source": [ 324 | "userDf = pd.get_dummies(userDf)\n", 325 | "userDf.head()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 152, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "userDf.to_csv(\"DATA/userFeature.csv\")" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 153, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "unknown|0\r\n", 347 | "Action|1\r\n", 348 | "Adventure|2\r\n", 349 | "Animation|3\r\n", 350 | "Children's|4\r\n", 351 | "Comedy|5\r\n", 352 | "Crime|6\r\n", 353 | "Documentary|7\r\n", 354 | "Drama|8\r\n", 355 | "Fantasy|9\r\n", 356 | "Film-Noir|10\r\n", 357 | "Horror|11\r\n", 358 | "Musical|12\r\n", 359 | "Mystery|13\r\n", 360 | "Romance|14\r\n", 361 | "Sci-Fi|15\r\n", 362 | "Thriller|16\r\n", 363 | "War|17\r\n", 364 | "Western|18\r\n", 365 | "\r\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "!cat u.genre" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 154, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0\r\n", 383 | "2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0\r\n", 384 | "3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0\r\n", 385 | "4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 386 | "5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0\r\n", 387 | "6|Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)|01-Jan-1995||http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 388 | "7|Twelve Monkeys (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|1|0|0|0\r\n", 389 | "8|Babe (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Babe%20(1995)|0|0|0|0|1|1|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 390 | "9|Dead Man Walking (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 391 | "10|Richard III (1995)|22-Jan-1996||http://us.imdb.com/M/title-exact?Richard%20III%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|1|0\r\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "!head u.item" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 155, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "genreNames = pd.read_csv('u.genre',sep='|', index_col=0, names=[\"name\",\"index\"]).index.tolist()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 156, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stderr", 415 | "output_type": "stream", 416 | "text": [ 417 | "/Users/Ke/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying 
engine='python'.\n", 418 | " if __name__ == '__main__':\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "itemFeatureDf = pd.read_csv('u.item',sep='\\|\\|?', index_col=0, names=[\"itemId\",\"itemName\",\"Date\",\"URL\"]+genreNames)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 157, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/html": [ 434 | "
\n", 435 | "\n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | "
itemNameDateURLunknownActionAdventureAnimationChildren'sComedyCrime...FantasyFilm-NoirHorrorMusicalMysteryRomanceSci-FiThrillerWarWestern
itemId
267unknownNaN10000000...000000000NaN
1358The Deadly Cure (1996)16-Sep-1996NaN0100000...0000000000.0
1359Boys in Venice (1996)24-Sep-1996NaN0000000...0000000000.0
\n", 561 | "

3 rows × 22 columns

\n", 562 | "
" 563 | ] 564 | }, 565 | "output_type": "execute_result", 566 | "metadata": {} 567 | } 568 | ], 569 | "source": [ 570 | "itemFeatureDf[itemFeatureDf.isnull().any(axis=1)]" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 158, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/plain": [ 581 | "itemName False\n", 582 | "Date False\n", 583 | "URL True\n", 584 | "unknown False\n", 585 | "Action False\n", 586 | "Adventure False\n", 587 | "Animation False\n", 588 | "Children's False\n", 589 | "Comedy False\n", 590 | "Crime False\n", 591 | "Documentary False\n", 592 | "Drama False\n", 593 | "Fantasy False\n", 594 | "Film-Noir False\n", 595 | "Horror False\n", 596 | "Musical False\n", 597 | "Mystery False\n", 598 | "Romance False\n", 599 | "Sci-Fi False\n", 600 | "Thriller False\n", 601 | "War False\n", 602 | "Western False\n", 603 | "dtype: bool" 604 | ] 605 | }, 606 | "execution_count": 158, 607 | "output_type": "execute_result", 608 | "metadata": {} 609 | } 610 | ], 611 | "source": [ 612 | "itemFeatureDf = itemFeatureDf.drop(267)\n", 613 | "itemFeatureDf.isnull().any()" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 159, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "itemFeatureDf.to_csv(\"DATA/itemFeature.csv\")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 160, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "itemFeatureDf.loc[:,[\"itemName\"]].to_csv(\"DATA/inventory.csv\")" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 161, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stdout", 641 | "output_type": "stream", 642 | "text": [ 643 | "196\t242\t3\t881250949\r\n", 644 | "186\t302\t3\t891717742\r\n", 645 | "22\t377\t1\t878887116\r\n", 646 | "244\t51\t2\t880606923\r\n", 647 | "166\t346\t1\t886397596\r\n", 648 | "298\t474\t4\t884182806\r\n", 649 | "115\t265\t2\t881171488\r\n", 650 | "253\t465\t5\t891628467\r\n", 651 | "305\t451\t3\t886324817\r\n", 652 | "6\t86\t3\t883603013\r\n" 653 | ] 654 | } 655 | ], 656 | "source": [ 657 | "!head u.data" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 162, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "names = ['user_id','item_id','rating','timestamp']\n", 667 | "dataDf = pd.read_csv('u.data',sep='\\t',names=names)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 163, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "dataDf = dataDf.drop('timestamp',1)" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 164, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "# remove test users from training users" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 165, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "userTrainIndex = userDf.sample(frac = 0.90).index\n", 695 | "dataDf_train = dataDf[dataDf.user_id.isin(userTrainIndex.tolist())]" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": 166, 701 | "metadata": {}, 702 | "outputs": [ 703 | { 704 | "name": "stdout", 705 | "output_type": "stream", 706 | "text": [ 707 | "(100000, 3)\n", 708 | "(90876, 3)\n" 709 | ] 710 | } 711 | ], 712 | "source": [ 713 | "print dataDf.shape\n", 714 | "print dataDf_train.shape" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 167, 720 | "metadata": {}, 
721 | "outputs": [], 722 | "source": [ 723 | "dataDf_test = dataDf.drop(dataDf_train.index)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 168, 729 | "metadata": {}, 730 | "outputs": [ 731 | { 732 | "name": "stdout", 733 | "output_type": "stream", 734 | "text": [ 735 | "(9124, 3)\n" 736 | ] 737 | } 738 | ], 739 | "source": [ 740 | "print dataDf_test.shape" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 169, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "dataDf_train.to_csv('DATA/ratings_train.csv')\n", 750 | "dataDf_test.to_csv('DATA/ratings_test.csv')\n", 751 | "dataDf.to_csv('DATA/ratings.csv')" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 170, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/html": [ 762 | "
\n", 763 | "\n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | "
user_iditem_idrating
01962423
11863023
2223771
3244512
41663461
\n", 805 | "
" 806 | ] 807 | }, 808 | "output_type": "execute_result", 809 | "metadata": {} 810 | } 811 | ], 812 | "source": [ 813 | "dataDf_train.head()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "" 822 | ] 823 | } 824 | ], 825 | "metadata": { 826 | "anaconda-cloud": {}, 827 | "kernelspec": { 828 | "display_name": "Python [Root]", 829 | "language": "python", 830 | "name": "Python [Root]" 831 | }, 832 | "language_info": { 833 | "codemirror_mode": { 834 | "name": "ipython", 835 | "version": 2.0 836 | }, 837 | "file_extension": ".py", 838 | "mimetype": "text/x-python", 839 | "name": "python", 840 | "nbconvert_exporter": "python", 841 | "pygments_lexer": "ipython2", 842 | "version": "2.7.12" 843 | } 844 | }, 845 | "nbformat": 4, 846 | "nbformat_minor": 0 847 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RecommenderSystem 2 | A general form of recommender system 3 | 4 | 5 | # To start: 6 | download https://grouplens.org/datasets/movielens/100k/ \ 7 | unzip to a directory that is in the same root directory of this project folder \ 8 | your file system layout should be like: \ 9 | /project - mk-100/ 10 | \ RecommenderSystem/ 11 | 12 | Inside RecommenderSystem/, run the Preprocessing.ipynb in the jupyter notebook \ 13 | copy mk-100/DATA to RecommenderSystem/ \ 14 | 15 | now in the DATA/, you have all the data you need for this code to run 16 | -------------------------------------------------------------------------------- /Ranker.py: -------------------------------------------------------------------------------- 1 | # Ranker 2 | 3 | import logging 4 | import numpy as np 5 | 6 | # rank the items from each recommendation module 7 | # highly influenced by business strategy and varies from system to system 8 | from DatabaseInterface import DatabaseInterface 9 | 10 | 11 | class Ranker(object): 12 | logging.basicConfig(level=logging.INFO) 13 | def __init__(self, numberToServe, database): 14 | self.numberToServe = numberToServe 15 | self.userHistoryDB = database.extract(DatabaseInterface.HISTORY_KEY) # who rated what 16 | self.log = logging.getLogger(__name__) 17 | 18 | def _getUsedItems(self, userId): 19 | # return a python set of all the movies that have been seen 20 | if userId == -1 : 21 | return set([]) 22 | else: 23 | return set(self.userHistoryDB[self.userHistoryDB.loc[:,"user_id"]==userId].loc[:,"item_id"]) 24 | 25 | def rerank(self,recommendationsTuple): 26 | # recommendationTupe is a tuple of (userId, recommendations) 27 | # recommendations is a dictionary of lists {RecType: Items}, RecType can be "online", "offline", "popular" 28 | # return the ranked recommendation 29 | # here is the strategy: 30 | # if the userId is -1, it means it is from anonymous user. 
31 |         # else, remove the items the user has already watched and serve the rest
32 | 
33 |         userId = recommendationsTuple[0]
34 |         recommendations = recommendationsTuple[1]
35 | 
36 |         usedItems = self._getUsedItems(userId)
37 | 
38 | 
39 |         self.log.info("Recommendations received in Ranker: %s" %recommendations)
40 |         self.log.info("Recommendation types received in Ranker: %s" %recommendations.keys())
41 |         results = []
42 | 
43 |         if "online" in recommendations:  # online exists as long as the user has been active
44 |             results.extend(recommendations["online"][:self.numberToServe])  # there should only be one online list
45 | 
46 |         if "offline" in recommendations:  # offline exists only if the user is registered; the recs could come from CF or KNN
47 |             results.extend(recommendations["offline"][:self.numberToServe])
48 | 
49 |         if "popular" in recommendations:  # most popular should always exist
50 |             # if there are no personalized recs, the remainder is filled by the most popular items
51 |             results.extend(recommendations["popular"][:self.numberToServe])
52 |         else:
53 |             self.log.error("recommendations do not contain popular items")
54 | 
55 |         try:
56 |             # remove the already visited items
57 |             results = np.random.choice(list(set(results)-usedItems), self.numberToServe, replace=False)
58 |         except ValueError:
59 |             # sometimes the user may have watched a lot, leaving fewer than numberToServe candidates
60 |             # this is apparently not a good strategy, why?
61 |             # (hint: the fallback below can re-serve items the user has already watched)
62 |             results = np.random.choice(results, self.numberToServe, replace=False)
63 | 
64 | 
65 |         return results
66 | 
67 | if __name__=="__main__":
68 |     from DatabaseInterface import DatabaseInterface
69 |     db = DatabaseInterface("DATA")
70 |     db.startEngine()
71 |     ranker = Ranker(numberToServe=10, database=db)
72 |     print sorted(ranker._getUsedItems(1))
--------------------------------------------------------------------------------
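A minimal illustration of the rerank contract (the candidate lists below are made-up values; only the dict keys come from the code):

    from DatabaseInterface import DatabaseInterface
    from Ranker import Ranker

    db = DatabaseInterface("DATA")
    db.startEngine()
    ranker = Ranker(numberToServe=10, database=db)
    # userId 1 with pooled candidate lists keyed by recommendation type
    recs = ranker.rerank((1, {"offline": range(1, 31), "popular": range(31, 61)}))
    print recs  # 10 item ids drawn from the pool, with already-rated items filtered out when possible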
/RecEngine.py:
--------------------------------------------------------------------------------
 1 | # Recommendation Engine
 2 | 
 3 | from ModelStore import ModelStore
 4 | import logging
 5 | 
 6 | class RecEngine(object):
 7 |     logging.basicConfig(level=logging.INFO)
 8 | 
 9 |     def __init__(self, userAnalyzer, modelStore, userActivityTable):
10 |         self.userAnalyzer = userAnalyzer
11 |         self.modelStore = modelStore
12 |         self.userActivityTable = userActivityTable
13 |         self._cacheMostPopular()
14 |         # pre-compute the most popular items, because this recommendation is independent of the user
15 |         self.log = logging.getLogger(__name__)
16 | 
17 |     def resetCache(self):
18 |         self._cacheMostPopular()
19 | 
20 |     def _cacheMostPopular(self):
21 |         self.mostPopularList = self.modelStore.getModel(ModelStore.MP_MODEL_KEY).provideRec()
22 | 
23 |     def provideRecommendation(self, request):
24 |         recommendations = {}
25 |         # construct the recommendation content, implemented as a dictionary;
26 |         # three sections may be filled: popular, online, offline
27 | 
28 |         recommendations["popular"] = self.mostPopularList
29 |         requestAnalyzed = self.userAnalyzer.analyze(request, self.userActivityTable)
30 | 
31 |         # online recommendation
32 |         onlineRecs = self.modelStore.getModel(ModelStore.SI_MODEL_KEY, request.userId).provideRec()
33 | 
34 |         self.log.info("user type: %s" %requestAnalyzed[0])
35 | 
36 |         # now we start to construct our recommendation data
37 |         if len(onlineRecs)>0:
38 |             recommendations["online"] = onlineRecs  # a list of ids
39 | 
40 |         if requestAnalyzed[0] == "new":
41 |             # for new users, we use the KNN model for the offline recommendation
42 |             recommendations["offline"] = self.modelStore.getModel(ModelStore.KNN_MODEL_KEY)\
43 |                                              .provideRec(requestAnalyzed[2].userId)
44 |         elif requestAnalyzed[0] == "old":
45 |             # for old users, we use the CF model for the offline recommendation
46 |             recommendations["offline"] = self.modelStore.getModel(ModelStore.CF_MODEL_KEY)\
47 |                                              .provideRec(requestAnalyzed[2].userId)
48 | 
49 |         return requestAnalyzed[1], recommendations
--------------------------------------------------------------------------------
/UserAnalyzer.py:
--------------------------------------------------------------------------------
 1 | # User Type Analyzer
 2 | # determine the type of each user and send it to the right recommendation module
 3 | 
 4 | class UserAnalyzer(object):
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     def analyze(self, request, userActivityDB):
 9 |         # returns an identifier so the recommendation engine knows what to do
10 |         # userActivityDB is defined in DatabaseInterface and counts each user's total amount of activity
11 |         if isinstance(request.userId, str):
12 |             # it is an anonymous request
13 |             return ["anonymous", -1, request]
14 |         elif request.userId in userActivityDB.index:
15 |             if userActivityDB[request.userId] >= 30:
16 |                 # if the user has already rated at least 30 items, we call it an old user
17 |                 return ["old", request.userId, request]
18 |             else:
19 |                 return ["new", request.userId, request]
20 |         else:
21 |             return ["new", request.userId, request]
22 | 
23 |     def analyzeAction(self, action):
24 |         if isinstance(action.userId, str):
25 |             return "anonymous"
26 |         else:
27 |             return "registered"
--------------------------------------------------------------------------------
/Webserver.py:
--------------------------------------------------------------------------------
 1 | # A simulation framework
 2 | import logging
 3 | 
 4 | from DatabaseInterface import DatabaseInterface
 5 | from RecEngine import RecEngine
 6 | from Ranker import Ranker
 7 | from Learners.OfflineLearner import OfflineLearner
 8 | from Learners.OnlineLearner import OnlineLearner
 9 | from UserAnalyzer import UserAnalyzer
10 | from ModelStore import ModelStore
11 | 
12 | 
13 | class WebServer(object):
14 |     logging.basicConfig(level=logging.INFO)
15 | 
16 |     def __init__(self, configMap):
17 |         self.db = DatabaseInterface(configMap['data_dir'])
18 |         # numberToServe: the number of items finally served to the users
19 |         self.numberToServe = configMap['numberToServe']
20 |         self.log = logging.getLogger(__name__)
21 | 
22 |     def start(self):
23 |         # each object here simulates the API calls through a network
24 |         # passing an object A to the constructor of B means B will communicate with A
25 |         self.db.startEngine()
26 |         self.ranker = Ranker(self.numberToServe, self.db)
27 |         self.userAnalyzer = UserAnalyzer()
28 |         self.modelStore = ModelStore()
29 |         self.offlineLearner = OfflineLearner(self.db, self.modelStore)
30 |         self.onlineLearner = OnlineLearner(self.db, self.modelStore)
31 |         self.offlineLearner.trainModel()
32 |         # when we start the webserver, the offline learner trains the models,
33 |         # so that after start() we can immediately serve recommendations
34 |         self.recEngine = RecEngine(self.userAnalyzer, self.modelStore, self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
35 | 
36 | 
37 |     def getAction(self, action):
38 |         assert(isinstance(action, Action))
39 |         # taking an action from a user
40 |         self.onlineLearner.trainModel(action)
41 |         # analyze the action type, and save the registered user's action
42 |         actionType = self.userAnalyzer.analyzeAction(action)
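        # note: the online model above is updated for every action,
        # but only registered users' ratings are persisted to the history table below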
43 |         if actionType == "registered":
44 |             self.log.info("Recording action %s" %action)
45 |             self.db.putAction(action)
46 | 
47 |     def provideRecommendation(self, request):
48 |         # return the IDs of the recommended items
49 |         assert(isinstance(request, Request))
50 |         # provide recommendations to the user
51 |         self.log.info("responding to request: %s" %request)
52 |         recommendations = self.recEngine.provideRecommendation(request)
53 |         recsReranked = self.ranker.rerank(recommendations)
54 |         return recsReranked  # a list of item ids
55 | 
56 |     def renderRecommendation(self, request):
57 |         assert(isinstance(request, Request))
58 |         recsReranked = self.provideRecommendation(request)
59 |         # for the purpose of testing, we sort the index and output the item names
60 |         # the output is ordered by the id value
61 |         return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[recsReranked].sort_index()
62 | 
63 |     def increment(self):
64 |         self.log.info("incrementing the system, update the models")
65 |         # increment the whole system by one day, triggering offline training
66 |         self.offlineLearner.trainModel()
67 |         self.modelStore.cleanOnlineModel()
68 |         self.recEngine.resetCache()
69 | 
70 |     # for demo purposes, given an itemId, return the item name
71 |     def getFromInventory(self, itemId):
72 |         return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
73 | 
74 | # simulate a web request
75 | class Request(object):
76 |     def __init__(self, userId):
77 |         self.userId = userId
78 | 
79 |     def __str__(self):
80 |         return "request for user: "+str(self.userId)
81 | 
82 | # simulate a tracking event or a user's rating
83 | class Action(object):
84 |     def __init__(self, userId, itemId, rating):
85 |         self.userId = userId
86 |         self.itemId = itemId
87 |         self.rating = rating
88 | 
89 |     def __str__(self):
90 |         return "user: %s, item: %s, rating %s" %(self.userId, self.itemId, self.rating)
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | # main.py
 2 | # simulate different requests coming into the system
 3 | 
 4 | from Webserver import WebServer, Request, Action
 5 | 
 6 | configMap = {"numberToServe": 10, "data_dir": "DATA"}
 7 | server = WebServer(configMap)
 8 | server.start()  # load all the data in the database, start the first model training
 9 | 
10 | # now experiment
11 | reqX1 = Request(userId='X1')  # anonymous user
12 | req1 = Request(userId=1)  # for a registered user, we use an integer id
13 | print(reqX1)
14 | print(req1)
15 | 
16 | recX1 = server.renderRecommendation(reqX1)  # output recommendations
17 | print recX1
18 | 
19 | rec1 = server.renderRecommendation(req1)  # output recommendations
20 | print(rec1)
21 | 
22 | # now we start an action
23 | action1 = Action(1, 255, 5)  # user 1 rated item 255 with score 5
24 | print server.getFromInventory(255)  # find out the name of item 255
25 | server.getAction(action1)  # feed the action to the server
26 | rec1_afteraction = server.renderRecommendation(req1)  # get recommendations after the system knows about the action
27 | print(rec1_afteraction)
28 | 
29 | actionX1 = Action('X1', 123, 5)  # an anonymous user's action won't be saved in the database
30 | print server.getFromInventory(123)
31 | server.getAction(actionX1)
32 | recX1_afteraction = server.renderRecommendation(reqX1)
33 | print(recX1_afteraction)
34 | 
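# note: actionX1 above only updated X1's transient similar-item model in the ModelStore;
# nothing was written to the history table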
35 | # update the system, e.g. one day has passed
36 | server.increment()
37 | # the system should forget about actionX1
38 | recX1_aftercleaning = server.renderRecommendation(reqX1)
39 | print(recX1_aftercleaning)  # should be similar to recX1
40 | 
41 | 
42 | req19 = Request(userId=19)  # a user with very little history, so treated as a new user
43 | rec19 = server.renderRecommendation(req19)
44 | print(rec19)
--------------------------------------------------------------------------------
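A possible smoke test to append to main.py (a sketch, not part of the repo): replay a few held-out users from DATA/ratings_test.csv, which is written by Preprocessing.ipynb, and count how many served items they actually rated highly. The hit-rate metric, the rating threshold, and the sample size here are arbitrary illustrative choices.

    import pandas as pd
    from Webserver import WebServer, Request

    server = WebServer({"numberToServe": 10, "data_dir": "DATA"})
    server.start()

    test = pd.read_csv("DATA/ratings_test.csv", index_col=0)
    hits, served = 0, 0
    for userId in test.user_id.unique()[:20]:  # a small sample of held-out users
        liked = set(test[(test.user_id == userId) & (test.rating >= 4)].item_id)
        recs = server.provideRecommendation(Request(userId=int(userId)))
        hits += len(set(recs) & liked)
        served += len(recs)
    print "hit rate: %.3f" % (hits / float(served))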