├── .idea └── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── DatabaseInterface.py ├── Learners ├── OfflineLearner.py ├── OnlineLearner.py └── __init__.py ├── ModelStore.py ├── Models ├── CFmodel.py ├── ClusteringModel.py ├── KNNmodel.py ├── MostPopularModel.py ├── SimilarItemModel.py └── __init__.py ├── Preprocessing.ipynb ├── README.md ├── Ranker.py ├── RecEngine.py ├── UserAnalyzer.py ├── Webserver.py └── main.py
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(IDE inspection profile; content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(IDE profile settings; content not captured in this dump)
--------------------------------------------------------------------------------
/DatabaseInterface.py:
--------------------------------------------------------------------------------
 1 | # Database Interface
 2 | # to simulate some database operations
 3 | 
 4 | import os
 5 | import pandas as pd
 6 | import logging
 7 | 
 8 | class DatabaseInterface(object):
 9 |     logging.basicConfig(level=logging.INFO)
10 | 
11 |     # in reality, this should come from a configuration file
12 |     HISTORY = "ratings.csv"
13 |     USER_FEATURE = "userFeature.csv"
14 |     ITEM_FEATURE = "itemFeature.csv"
15 |     INVENTORY = "inventory.csv"  # in reality, the inventory stores all the representations, such as video links
16 | 
17 |     HISTORY_KEY = "history"
18 |     USER_FEATURE_KEY = "user_feature"
19 |     ITEM_FEATURE_KEY = "item_feature"
20 |     INVENTORY_KEY = "inventory"
21 |     USER_ACTIVITY_KEY = "user_activity"
22 | 
23 |     # register the static tables first
24 |     dbTable = {HISTORY_KEY: HISTORY,
25 |                USER_FEATURE_KEY: USER_FEATURE,
26 |                ITEM_FEATURE_KEY: ITEM_FEATURE,
27 |                INVENTORY_KEY: INVENTORY}
28 | 
29 |     def __init__(self, path):
30 |         self.log = logging.getLogger(__name__)
31 |         self.path = path
32 |         self.started = False
33 |         self.connTable = {}
34 | 
35 |     def startEngine(self):
36 |         if self.started:
37 |             self.log.warning("the database has already started")
38 |             # restarting a running engine is not permitted here since it would remove all unsaved data
39 |         else:
40 |             self.log.info("starting the database engine...")
41 |             for tableName, tablePath in self.dbTable.iteritems():
42 |                 self.log.info("loading table %s..." % tableName)
43 |                 self.connTable[tableName] = pd.read_csv(os.path.join(self.path, tablePath), index_col=0)
44 | 
45 |             self.log.info("creating table user_activity...")
46 |             self.connTable[self.USER_ACTIVITY_KEY] = self.connTable["history"].groupby("user_id").size()  # actually a Series
47 | 
48 |             self.log.info("database successfully started")
49 |             self.started = True
50 | 
51 |     # ideally SQL would be used to query a database; in this case, pandas operations are used by the clients instead
52 |     # https://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html
53 |     def extract(self, tableName):
54 |         return self.connTable[tableName]
55 | 
56 |     def putAction(self, action):
57 |         self.insertRow(self.connTable[self.HISTORY_KEY], [action.userId, action.itemId, action.rating])
58 | 
59 |     def insertRow(self, df, row):
60 |         # unsafe insertion into a pandas dataframe: assumes a default 0..n-1 integer index
61 |         df.loc[len(df)] = row
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     connector = DatabaseInterface("DATA")
66 |     connector.startEngine()
67 |     df1 = connector.connTable["history"]
68 |     print df1.head()
69 |     df2 = connector.connTable["user_activity"]
70 |     print df2[10]
71 |     df3 = connector.connTable["item_feature"]
72 |     print df3.loc[:,"unknown":]
73 |     df4 = connector.connTable["user_feature"]
74 |     print df4.loc[:,"age":]
75 |     print set(df1[df1.loc[:,"user_id"]==2].loc[:,"item_id"])
76 | 
77 | 
--------------------------------------------------------------------------------
/Learners/OfflineLearner.py:
--------------------------------------------------------------------------------
 1 | # Offline Learner
 2 | # Read in user history data, make models
 3 | 
 4 | from ModelStore import ModelStore
 5 | from DatabaseInterface import DatabaseInterface
 6 | import logging
 7 | import numpy as np
 8 | 
 9 | class OfflineLearner(object):
10 |     logging.basicConfig(level=logging.INFO)
11 | 
12 |     def __init__(self, database, modelStore):
13 |         self.database = database
14 |         self.modelStore = modelStore
15 |         self.log = logging.getLogger(__name__)
16 |         self.modelRegistry = [(ModelStore.KNN_MODEL_KEY, "k nearest neighbor most popular model"),
17 |                               (ModelStore.MP_MODEL_KEY, "most popular item model"),
18 |                               (ModelStore.CL_MODEL_KEY, "item feature clustering model"),
19 |                               (ModelStore.CF_MODEL_KEY, "collaborative filtering model")]
20 | 
21 |     def trainModel(self):
22 |         self.log.info("Start offline training...")
23 |         self.log.info("creating training data...")
24 |         # now extract the data; the four offline models are:
25 |         # K nearest neighbor: provides similar users;
26 |         #     trained on user features;
27 |         #     given a user feature vector, predict the nearest users;
28 |         #     recommend based on the nearest neighbors' ratings (uses the user-item rating matrix)
29 |         # most popular item:
30 |         #     trained on the user-item rating history;
31 |         #     predict the most popular items;
32 |         #     recommend the most popular items
33 |         # item feature clustering model (only used by the online similar-item model):
34 |         #     trained on item features;
35 |         #     given item features, predict groups
36 |         # collaborative filtering:
37 |         #     trained on the user-item rating matrix;
38 |         #     given a userId or itemId, give the full prediction;
39 |         #     recommend items with high predicted ratings
40 |         historyRating = self.database.extract(DatabaseInterface.HISTORY_KEY)
41 |         itemFeatureTable = self.database.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":]
42 |         userFeatureTable = self.database.extract(DatabaseInterface.USER_FEATURE_KEY).loc[:, "age":]
43 |         ratingsMat = self.transformToMat(historyRating)
44 | 
45 |         # update the models and push them back
46 |         # for offline models, we only implement full retraining here, but one should
always have a backup 47 | # here we directly update the current model 48 | self.log.info("loading models...") 49 | for record in self.modelRegistry: 50 | model = self.modelStore.getModel(record[0]) 51 | self.log.info("training %s" %record[1]) 52 | if record[0] == ModelStore.KNN_MODEL_KEY: 53 | model.train(userFeatureTable, ratingsMat) 54 | elif record[0] == ModelStore.MP_MODEL_KEY: 55 | model.train(historyRating) 56 | elif record[0] == ModelStore.CL_MODEL_KEY: 57 | model.train(itemFeatureTable) 58 | elif record[0] == ModelStore.CF_MODEL_KEY: 59 | model.train(ratingsMat, itemFeatureTable) 60 | else: 61 | raise Exception("model registry may be broken") 62 | 63 | self.log.info("updating %s", record[1]) 64 | self.pushModel(model, record[0]) 65 | 66 | 67 | def pushModel(self, model, key): 68 | self.modelStore.setModel(model, key) 69 | 70 | @staticmethod 71 | def transformToMat(historyRating): 72 | n_users = historyRating.user_id.max() 73 | n_items = historyRating.item_id.max() 74 | ratingsMat = np.zeros([n_users, n_items]) 75 | for r in historyRating.itertuples(): 76 | ratingsMat[r[1]-1,r[2]-1] = r[3] 77 | return ratingsMat 78 | 79 | 80 | if __name__=="__main__": 81 | db = DatabaseInterface("DATA") 82 | db.startEngine() 83 | modelStore = ModelStore() 84 | learner = OfflineLearner(db, modelStore) 85 | learner.trainModel() -------------------------------------------------------------------------------- /Learners/OnlineLearner.py: -------------------------------------------------------------------------------- 1 | # Online Learner 2 | # take in the user action, serve models 3 | 4 | from ModelStore import ModelStore 5 | from DatabaseInterface import DatabaseInterface 6 | import logging 7 | 8 | class OnlineLearner(object): 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | def __init__(self, database, modelStore): 12 | self.database = database 13 | self.modelStore = modelStore 14 | self.log = logging.getLogger(__name__) 15 | 16 | def trainModel(self, action): 17 | self.log.info("training on action: (%s)" %action) 18 | # action has three fields (userId, itemId, rate) 19 | userId = action.userId 20 | itemId = action.itemId 21 | rating = action.rating 22 | model = self.modelStore.getModel(ModelStore.SI_MODEL_KEY, userId) 23 | itemFeatureTable = self.database.extract(DatabaseInterface.ITEM_FEATURE_KEY) 24 | itemFeature = itemFeatureTable.loc[itemId, "unknown":] 25 | model.train(itemFeature, rating) 26 | self.pushModel(model, userId) 27 | 28 | def pushModel(self, model, userId): 29 | self.log.info("pushing model for user: %s" %userId) 30 | self.modelStore.setModel(model, ModelStore.SI_MODEL_KEY, userId) 31 | -------------------------------------------------------------------------------- /Learners/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ModelStore.py: -------------------------------------------------------------------------------- 1 | # model store 2 | # keep all the models 3 | # responsible to send the models to RecEngine 4 | 5 | from Models.ClusteringModel import ClusteringModel 6 | from Models.SimilarItemModel import SimilarItemModel 7 | from Models.CFmodel import CFmodel 8 | from Models.MostPopularModel import MostPopularModel 9 | from Models.KNNmodel import KNNmodel 10 | 11 | 12 | class ModelStore(object): 13 | 14 | CF_MODEL_KEY = "cf_model_key" # collaborative filtering 15 | KNN_MODEL_KEY = "knn_model_key" # K nearest neighbor most popular model 16 | 
MP_MODEL_KEY = "mp_model_key" # most popular 17 | SI_MODEL_KEY = "si_model_key" # similar item 18 | CL_MODEL_KEY = "cl_model_key" # clustering model, used for similarity item model 19 | 20 | def __init__(self): 21 | self.persistModels = {self.KNN_MODEL_KEY: KNNmodel(), 22 | self.MP_MODEL_KEY: MostPopularModel(), 23 | self.CL_MODEL_KEY: ClusteringModel(), 24 | self.CF_MODEL_KEY: CFmodel()} 25 | 26 | # similarity model is used for each user 27 | # online recommendation, trained by online learner 28 | self.transientModels = {self.SI_MODEL_KEY: {}} 29 | 30 | def setModel(self, model, key, memberId = None): 31 | if memberId is None: 32 | self.persistModels[key] = model 33 | else: 34 | self.transientModels[key][memberId] = model 35 | 36 | def getModel(self, key, memberId = None): 37 | #send out the object of models to learning system 38 | if memberId is None: 39 | return self.persistModels[key] 40 | else: 41 | transientModels = self.transientModels[key] 42 | if memberId in transientModels: 43 | return transientModels[memberId] 44 | else: 45 | # it means it is the first time we build the online model for this particular user 46 | assert self.persistModels[self.CL_MODEL_KEY].trained 47 | # since the online model we used is depending on the clustering model trained offline 48 | # this assert is to make sure the clustering model is already trained. 49 | 50 | # in this case, we create a new model for this user 51 | return SimilarItemModel(self.persistModels[self.CL_MODEL_KEY]) 52 | 53 | def cleanOnlineModel(self): 54 | self.transientModels = {self.SI_MODEL_KEY: {}} 55 | 56 | -------------------------------------------------------------------------------- /Models/CFmodel.py: -------------------------------------------------------------------------------- 1 | # Collaborative filtering model 2 | import numpy as np 3 | from sklearn.neighbors import NearestNeighbors 4 | import logging 5 | 6 | 7 | class CFmodel(): 8 | RARECASE_THRESHOLD = 5 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | def __init__(self): 12 | self.knnModel = NearestNeighbors(n_neighbors=15) 13 | self.log = logging.getLogger(__name__) 14 | 15 | def _CFSVD(self, ratingsMat): 16 | user_ratings_mean = np.mean(ratingsMat, axis = 1) # mean over user ratings 17 | R_demeaned = ratingsMat - user_ratings_mean.reshape(-1, 1) 18 | from scipy.sparse.linalg import svds 19 | U, sigma, Vt = svds(R_demeaned, k = 10) 20 | sigma = np.diag(sigma) 21 | self.all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1) 22 | 23 | 24 | def train(self, ratingsMat, itemFeatureTable): 25 | # the logic: 26 | # using content-based modeling for rare items, predict some ratings 27 | # using the ratings matrix filled with the predicted ratings from the content-based model to do matrix factorization 28 | # itemFeatureTable is used for content-based model, which will predict for those items with few ratings 29 | # SVD will be used for collaborative filtering after the rare items have enough ratings 30 | indices = itemFeatureTable.index 31 | self.knnModel.fit(itemFeatureTable) 32 | # print ratingsMat.shape 33 | assert(ratingsMat.shape[1] == itemFeatureTable.index.max()) 34 | rareCases = np.where((ratingsMat>0).sum(axis=0) < self.RARECASE_THRESHOLD)[0] 35 | # if an item has less than 5 ratings, it is considered as a rare case 36 | # it is the 0-based matrix indices 37 | self.log.info("Number of rare cases: %s" %rareCases.shape[0]) 38 | 39 | fillCount = 0 40 | ratingsMatFinal = ratingsMat.copy() 41 | for case in rareCases: 42 | if 
case+1 in itemFeatureTable.index:
43 |                 features = itemFeatureTable.loc[case+1]
44 |                 neighbors = self.knnModel.kneighbors(features.values.reshape(1, -1), return_distance=False)[0]
45 |                 neighborPos = indices[neighbors]-1
46 |                 # compute the number of ratings each user has given to the item's neighbors
47 |                 target_count = (ratingsMat[:,neighborPos] > 0).sum(axis=1)
48 |                 # compute the predicted ratings generated by the content-based model
49 |                 target_ratings = ratingsMat[:,neighborPos].sum(axis=1).astype(float)/target_count
50 |                 # i.e. each user's nonzero mean over the neighbors
51 | 
52 |                 for i in range(ratingsMat.shape[0]):
53 |                     if ratingsMat[i, case] == 0 and target_count[i]>10:
54 |                         # fill only if the rating is missing and user i has rated more than 10 of the item's neighbors
55 |                         if target_ratings[i]!=0:
56 |                             ratingsMatFinal[i,case] = target_ratings[i]
57 |                             fillCount += 1
58 | 
59 |         # now we have the filled matrix for matrix factorization
60 |         self.log.info("Number of ratings added by content-based model: %s" %fillCount)
61 | 
62 |         self._CFSVD(ratingsMatFinal)
63 | 
64 | 
65 | 
66 |     def predict(self, userId):
67 |         return self.all_user_predicted_ratings[userId-1]
68 | 
69 |     def provideRec(self, userId):
70 |         # sort the user's predicted scores from large to small, then report the item ids
71 |         return self.all_user_predicted_ratings[userId-1].argsort()[::-1]+1
72 | 
73 | if __name__=="__main__":
74 |     from DatabaseInterface import DatabaseInterface
75 |     from Learners.OfflineLearner import OfflineLearner
76 |     db = DatabaseInterface("../DATA")
77 |     db.startEngine()
78 |     history = db.extract("history")
79 |     itemFeatureTable = db.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":]
80 |     ratingsMat = OfflineLearner.transformToMat(history)
81 | 
82 |     model = CFmodel()
83 |     model.train(ratingsMat, itemFeatureTable)
84 | 
85 |     recs = model.provideRec(1)
86 |     print recs
87 |     print ratingsMat[0,recs-1]
--------------------------------------------------------------------------------
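A quick toy check of the demeaned-SVD reconstruction used in CFmodel._CFSVD (a sketch; the 5x4 matrix and k=2 are made up for illustration):

    import numpy as np
    from scipy.sparse.linalg import svds

    R = np.array([[5., 3., 0., 1.],
                  [4., 0., 0., 1.],
                  [1., 1., 0., 5.],
                  [1., 0., 0., 4.],
                  [0., 1., 5., 4.]])
    user_mean = R.mean(axis=1).reshape(-1, 1)           # demean per user, as in _CFSVD
    U, sigma, Vt = svds(R - user_mean, k=2)             # truncated SVD of the demeaned matrix
    approx = U.dot(np.diag(sigma)).dot(Vt) + user_mean  # add the user means back
    print np.round(approx, 2)                           # dense predictions, including the unrated (zero) cells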
/Models/ClusteringModel.py:
--------------------------------------------------------------------------------
 1 | from sklearn.cluster import KMeans
 2 | 
 3 | # the clustering model groups items with similar features
 4 | # it is used for the online recommendation
 5 | 
 6 | class ClusteringModel():
 7 |     def __init__(self, n_cluster=10):
 8 |         self.model = KMeans(n_clusters=n_cluster, random_state=12345)  # set the random state for reproducibility
 9 |         self.groups = {}  # keyed by cluster index; values are itemIds
10 |         self.trained = False
11 | 
12 |     def train(self, itemFeatures):
13 |         self.indices = itemFeatures.index  # the itemIds
14 |         self.model.fit(itemFeatures)
15 |         self.labels = self.model.labels_
16 |         # the label given to each data point indicates which cluster it belongs to
17 |         # for example, if we have four data points 1,2,3,4:
18 |         # dataId, clusterId
19 |         # 1       1
20 |         # 2       1
21 |         # 3       2
22 |         # 4       2
23 |         # and we want {1:[1,2], 2:[3,4]}, called self.groups
24 |         for k, v in zip(self.labels, itemFeatures.index.tolist()):
25 |             self.groups.setdefault(k,[]).append(v)
26 |         self.trained = True
27 | 
28 |     def predict(self, itemFeatures):
29 |         centers = self.model.predict(itemFeatures)
30 |         # based on the predicted centers, find the corresponding cluster members
31 |         return centers, [self.groups[c] for c in centers]
32 | 
33 | 
34 | if __name__=="__main__":
35 |     from DatabaseInterface import DatabaseInterface
36 |     db = DatabaseInterface("../DATA")
37 |     db.startEngine()
38 |     itemFeatureTable = db.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":]
39 | 
40 |     model = ClusteringModel()
41 |     model.train(itemFeatureTable)
42 | 
43 |     print model.predict(itemFeatureTable.loc[1].values.reshape(1,-1))
44 |     print itemFeatureTable.loc[[1,422]]
45 |     print model.labels[:20]
--------------------------------------------------------------------------------
/Models/KNNmodel.py:
--------------------------------------------------------------------------------
 1 | # KNN model
 2 | # for a new user: use the user features to find the k nearest neighbors, then use their ratings for the recommendation
 3 | 
 4 | import numpy as np
 5 | from sklearn.neighbors import NearestNeighbors
 6 | 
 7 | 
 8 | class KNNmodel():
 9 |     def __init__(self):
10 |         self.knnModel = None
11 | 
12 |     def train(self, userFeatureTable, ratingsMat):
13 |         userFeatureTable.loc[:,"age"] = userFeatureTable.loc[:,"age"]/10.
14 |         # ad hoc normalization to keep the features' ranges similar (note: this mutates the caller's dataframe)
15 |         self.knnModel = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(userFeatureTable)
16 | 
17 |         # ratingsMat is the rating matrix
18 |         self.ratingsMat = ratingsMat
19 |         self.userFeatureTable = userFeatureTable
20 |         self.userIds = self.userFeatureTable.index  # the actual order seen by the knn model
21 | 
22 |     def predict(self, userFeature):
23 |         distances, indices = self.knnModel.kneighbors(userFeature)
24 |         # indices are the nearest neighbors' positions in the matrix, which differ from the userIds
25 |         return self.userIds[indices[0]]
26 | 
27 |     def provideRec(self, userId):
28 |         # look up the user's own feature vector and find the nearest users
29 |         userIds = self.predict(self.userFeatureTable.loc[userId].values.reshape(1,-1))
30 |         # remove the user himself from the nearest neighbors
31 |         userIds = np.array(list(set(userIds) - set([userId])))
32 | 
33 |         # over all nearest neighbors, compute the average score per item, sort from large to small,
34 |         # then report the item ids
35 |         return self.ratingsMat[userIds-1].mean(axis = 0).argsort()[::-1]+1
36 | 
37 | 
38 | 
39 | if __name__=="__main__":
40 |     from DatabaseInterface import DatabaseInterface
41 |     from Learners.OfflineLearner import OfflineLearner
42 |     db = DatabaseInterface("../DATA")
43 |     db.startEngine()
44 |     history = db.extract("history")
45 |     userFeatureTable = db.extract(DatabaseInterface.USER_FEATURE_KEY).loc[:, "age":]
46 |     ratingsMat = OfflineLearner.transformToMat(history)
47 | 
48 |     model = KNNmodel()
49 |     model.train(userFeatureTable, ratingsMat)
50 |     print model.provideRec(97)[:20]
51 |     print ratingsMat[96,model.provideRec(97)-1][:20]
--------------------------------------------------------------------------------
/Models/MostPopularModel.py:
--------------------------------------------------------------------------------
 1 | # most popular model
 2 | # a simple design: find the items with the highest average rating among those rated by enough users
 3 | 
 4 | class MostPopularModel():
 5 |     N_Freq_limit = 0.001  # an item must appear in at least 0.1% of all rating records before we consider it for most popular
 6 | 
 7 |     def __init__(self):
 8 |         pass
 9 | 
10 |     def train(self, history):
11 |         # history must be a dataframe whose second column is the itemID and third column is the ratings
12 |         itemID = list(history)[1]
13 |         ratings = list(history)[2]
14 | 
15 |         # what if an item got rated by only one user, and the rating is 5: are we confident it is the most popular?
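        # a worked check (assuming the full MovieLens 100k history of 100,000 ratings):
        # nLimit = int(100000 * 0.001) = 100, so an item needs more than 100 ratings
        # before its mean rating can qualify it as "most popular"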
16 | nLimit = int(history.shape[0]*self.N_Freq_limit) 17 | itemRatingGrouped = history.groupby(itemID) 18 | itemRatingGroupedCount = itemRatingGrouped[ratings].count() 19 | # print itemRatingGrouped[ratings].mean() 20 | self.mostPopular = itemRatingGrouped[ratings].mean()[itemRatingGroupedCount>nLimit].sort_values(ascending=False) 21 | 22 | def predict(self,X): 23 | # X can only be a list of itemID's 24 | return [self.mostPopular.index.get_loc(x) for x in X] 25 | 26 | def provideRec(self): 27 | return self.mostPopular.index.tolist() 28 | 29 | 30 | 31 | if __name__=="__main__": 32 | from DatabaseInterface import DatabaseInterface 33 | db = DatabaseInterface("DATA") 34 | db.startEngine() 35 | df = db.extract("history") 36 | print df.head() 37 | model = MostPopularModel() 38 | model.train(df) 39 | print model.mostPopular 40 | print model.predict([408]) 41 | print model.provideRec() -------------------------------------------------------------------------------- /Models/SimilarItemModel.py: -------------------------------------------------------------------------------- 1 | # similar item model 2 | # underneath it is to use a clustering model 3 | # for simplicity, return all in the same cluster if rating is higher or equal to 3; return empty cluster otherwise 4 | 5 | class SimilarItemModel(): 6 | THRESHOLD = 3.0 # if ratings are below threshold, it will not be used 7 | 8 | def __init__(self, clusteringModel): 9 | # we use a trained clustering model (trained offline) 10 | self.clusteringModel = clusteringModel 11 | self.recs = [] 12 | 13 | def train(self, itemFeature, rating): 14 | # itemFeature: the feature of the item in the Action 15 | # rating: the rating of the user to the item, also in the Action 16 | # only single record 17 | # each model learns one person's current interest 18 | itemFeature = itemFeature.values.reshape(1,-1) 19 | center, indices = self.clusteringModel.predict(itemFeature) 20 | 21 | # indices: the itemIds that are in the same cluster as the item we get 22 | # that is, the similar items 23 | 24 | if rating >= self.THRESHOLD: 25 | self.recs = indices[0] # the indices is a list of list, like: [[1,2,3,4,5]] 26 | else: 27 | self.recs = [] 28 | 29 | def predict(self, itemFeature): 30 | # X should be item's category feature, only single record 31 | # return the similar items 32 | itemFeature = itemFeature.values.reshape(1,-1) 33 | center, indices = self.clusteringModel.predict(itemFeature) 34 | return indices[0] 35 | 36 | def provideRec(self): 37 | return self.recs 38 | 39 | 40 | if __name__=="__main__": 41 | from DatabaseInterface import DatabaseInterface 42 | from Models.ClusteringModel import ClusteringModel 43 | db = DatabaseInterface("../DATA") 44 | db.startEngine() 45 | itemFeatureTable = db.extract(DatabaseInterface.ITEM_FEATURE_KEY).loc[:, "unknown":] 46 | 47 | model = ClusteringModel() 48 | model.train(itemFeatureTable) 49 | 50 | modelSI = SimilarItemModel(model) 51 | modelSI.train(itemFeatureTable.loc[1], 4) 52 | print modelSI.provideRec() 53 | modelSI.train(itemFeatureTable.loc[1], 2) 54 | print modelSI.provideRec() 55 | -------------------------------------------------------------------------------- /Models/__init__.py: -------------------------------------------------------------------------------- 1 | # All models should recommend the indices of items in the inventory 2 | -------------------------------------------------------------------------------- /Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 144, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 145, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "\u001b[34mDATA\u001b[m\u001b[m/ u.genre u2.test ua.test\r\n", 23 | "README u.info u3.base ub.base\r\n", 24 | "\u001b[31mallbut.pl\u001b[m\u001b[m* u.item u3.test ub.test\r\n", 25 | "itemFeature.csv u.occupation u4.base user_test.csv\r\n", 26 | "\u001b[31mmku.sh\u001b[m\u001b[m* u.user u4.test user_train.csv\r\n", 27 | "ratings_test.csv u1.base u5.base\r\n", 28 | "ratings_train.csv u1.test u5.test\r\n", 29 | "u.data u2.base ua.base\r\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "ls ../ml-100k/" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 146, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "/Users/Ke/Google Drive/Bit-tiger-AI-Engineer/Week3/Project/ml-100k\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "cd ../ml-100k/" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 147, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "\u001b[34mDATA\u001b[m\u001b[m/ u.genre u2.test ua.test\r\n", 64 | "README u.info u3.base ub.base\r\n", 65 | "\u001b[31mallbut.pl\u001b[m\u001b[m* u.item u3.test ub.test\r\n", 66 | "itemFeature.csv u.occupation u4.base user_test.csv\r\n", 67 | "\u001b[31mmku.sh\u001b[m\u001b[m* u.user u4.test user_train.csv\r\n", 68 | "ratings_test.csv u1.base u5.base\r\n", 69 | "ratings_train.csv u1.test u5.test\r\n", 70 | "u.data u2.base ua.base\r\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "ls" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 148, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "mkdir: DATA: File exists\r\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "mkdir DATA" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 149, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "1|24|M|technician|85711\r\n", 105 | "2|53|F|other|94043\r\n", 106 | "3|23|M|writer|32067\r\n", 107 | "4|24|M|technician|43537\r\n", 108 | "5|33|F|other|15213\r\n", 109 | "6|42|M|executive|98101\r\n", 110 | "7|57|M|administrator|91344\r\n", 111 | "8|36|M|administrator|05201\r\n", 112 | "9|29|M|student|01002\r\n", 113 | "10|53|M|lawyer|90703\r\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "!head u.user" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 150, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "names = ['user_id','age','gender','occupation','zipcode']\n", 128 | "userDf = pd.read_csv('u.user',sep='|',names=names, index_col=0) \n", 129 | "userDf = userDf.loc[:, 'age':'occupation']" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 151, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/html": [ 140 | "
\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | "
agegender_Fgender_Moccupation_administratoroccupation_artistoccupation_doctoroccupation_educatoroccupation_engineeroccupation_entertainmentoccupation_executive...occupation_marketingoccupation_noneoccupation_otheroccupation_programmeroccupation_retiredoccupation_salesmanoccupation_scientistoccupation_studentoccupation_technicianoccupation_writer
user_id
1240.01.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.00.0
2531.00.00.00.00.00.00.00.00.0...0.00.01.00.00.00.00.00.00.00.0
3230.01.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
4240.01.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.01.00.0
5331.00.00.00.00.00.00.00.00.0...0.00.01.00.00.00.00.00.00.00.0
\n", 315 | "

5 rows × 24 columns

\n", 316 | "
" 317 | ] 318 | }, 319 | "output_type": "execute_result", 320 | "metadata": {} 321 | } 322 | ], 323 | "source": [ 324 | "userDf = pd.get_dummies(userDf)\n", 325 | "userDf.head()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 152, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "userDf.to_csv(\"DATA/userFeature.csv\")" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 153, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "unknown|0\r\n", 347 | "Action|1\r\n", 348 | "Adventure|2\r\n", 349 | "Animation|3\r\n", 350 | "Children's|4\r\n", 351 | "Comedy|5\r\n", 352 | "Crime|6\r\n", 353 | "Documentary|7\r\n", 354 | "Drama|8\r\n", 355 | "Fantasy|9\r\n", 356 | "Film-Noir|10\r\n", 357 | "Horror|11\r\n", 358 | "Musical|12\r\n", 359 | "Mystery|13\r\n", 360 | "Romance|14\r\n", 361 | "Sci-Fi|15\r\n", 362 | "Thriller|16\r\n", 363 | "War|17\r\n", 364 | "Western|18\r\n", 365 | "\r\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "!cat u.genre" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 154, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0\r\n", 383 | "2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0\r\n", 384 | "3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0\r\n", 385 | "4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 386 | "5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0\r\n", 387 | "6|Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)|01-Jan-1995||http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 388 | "7|Twelve Monkeys (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|1|0|0|0\r\n", 389 | "8|Babe (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Babe%20(1995)|0|0|0|0|1|1|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 390 | "9|Dead Man Walking (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\r\n", 391 | "10|Richard III (1995)|22-Jan-1996||http://us.imdb.com/M/title-exact?Richard%20III%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|1|0\r\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "!head u.item" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 155, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "genreNames = pd.read_csv('u.genre',sep='|', index_col=0, names=[\"name\",\"index\"]).index.tolist()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 156, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stderr", 415 | "output_type": "stream", 416 | "text": [ 417 | "/Users/Ke/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying 
engine='python'.\n", 418 | " if __name__ == '__main__':\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "itemFeatureDf = pd.read_csv('u.item',sep='\\|\\|?', index_col=0, names=[\"itemId\",\"itemName\",\"Date\",\"URL\"]+genreNames)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 157, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/html": [ 434 | "
\n", 435 | "\n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | "
itemNameDateURLunknownActionAdventureAnimationChildren'sComedyCrime...FantasyFilm-NoirHorrorMusicalMysteryRomanceSci-FiThrillerWarWestern
itemId
267unknownNaN10000000...000000000NaN
1358The Deadly Cure (1996)16-Sep-1996NaN0100000...0000000000.0
1359Boys in Venice (1996)24-Sep-1996NaN0000000...0000000000.0
\n", 561 | "

3 rows × 22 columns

\n", 562 | "
" 563 | ] 564 | }, 565 | "output_type": "execute_result", 566 | "metadata": {} 567 | } 568 | ], 569 | "source": [ 570 | "itemFeatureDf[itemFeatureDf.isnull().any(axis=1)]" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 158, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/plain": [ 581 | "itemName False\n", 582 | "Date False\n", 583 | "URL True\n", 584 | "unknown False\n", 585 | "Action False\n", 586 | "Adventure False\n", 587 | "Animation False\n", 588 | "Children's False\n", 589 | "Comedy False\n", 590 | "Crime False\n", 591 | "Documentary False\n", 592 | "Drama False\n", 593 | "Fantasy False\n", 594 | "Film-Noir False\n", 595 | "Horror False\n", 596 | "Musical False\n", 597 | "Mystery False\n", 598 | "Romance False\n", 599 | "Sci-Fi False\n", 600 | "Thriller False\n", 601 | "War False\n", 602 | "Western False\n", 603 | "dtype: bool" 604 | ] 605 | }, 606 | "execution_count": 158, 607 | "output_type": "execute_result", 608 | "metadata": {} 609 | } 610 | ], 611 | "source": [ 612 | "itemFeatureDf = itemFeatureDf.drop(267)\n", 613 | "itemFeatureDf.isnull().any()" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 159, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "itemFeatureDf.to_csv(\"DATA/itemFeature.csv\")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 160, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "itemFeatureDf.loc[:,[\"itemName\"]].to_csv(\"DATA/inventory.csv\")" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 161, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stdout", 641 | "output_type": "stream", 642 | "text": [ 643 | "196\t242\t3\t881250949\r\n", 644 | "186\t302\t3\t891717742\r\n", 645 | "22\t377\t1\t878887116\r\n", 646 | "244\t51\t2\t880606923\r\n", 647 | "166\t346\t1\t886397596\r\n", 648 | "298\t474\t4\t884182806\r\n", 649 | "115\t265\t2\t881171488\r\n", 650 | "253\t465\t5\t891628467\r\n", 651 | "305\t451\t3\t886324817\r\n", 652 | "6\t86\t3\t883603013\r\n" 653 | ] 654 | } 655 | ], 656 | "source": [ 657 | "!head u.data" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 162, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "names = ['user_id','item_id','rating','timestamp']\n", 667 | "dataDf = pd.read_csv('u.data',sep='\\t',names=names)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 163, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "dataDf = dataDf.drop('timestamp',1)" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 164, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "# remove test users from training users" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 165, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "userTrainIndex = userDf.sample(frac = 0.90).index\n", 695 | "dataDf_train = dataDf[dataDf.user_id.isin(userTrainIndex.tolist())]" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": 166, 701 | "metadata": {}, 702 | "outputs": [ 703 | { 704 | "name": "stdout", 705 | "output_type": "stream", 706 | "text": [ 707 | "(100000, 3)\n", 708 | "(90876, 3)\n" 709 | ] 710 | } 711 | ], 712 | "source": [ 713 | "print dataDf.shape\n", 714 | "print dataDf_train.shape" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 167, 720 | "metadata": {}, 
721 | "outputs": [], 722 | "source": [ 723 | "dataDf_test = dataDf.drop(dataDf_train.index)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 168, 729 | "metadata": {}, 730 | "outputs": [ 731 | { 732 | "name": "stdout", 733 | "output_type": "stream", 734 | "text": [ 735 | "(9124, 3)\n" 736 | ] 737 | } 738 | ], 739 | "source": [ 740 | "print dataDf_test.shape" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 169, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "dataDf_train.to_csv('DATA/ratings_train.csv')\n", 750 | "dataDf_test.to_csv('DATA/ratings_test.csv')\n", 751 | "dataDf.to_csv('DATA/ratings.csv')" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 170, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/html": [ 762 | "
\n", 763 | "\n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | "
user_iditem_idrating
01962423
11863023
2223771
3244512
41663461
\n", 805 | "
" 806 | ] 807 | }, 808 | "output_type": "execute_result", 809 | "metadata": {} 810 | } 811 | ], 812 | "source": [ 813 | "dataDf_train.head()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "" 822 | ] 823 | } 824 | ], 825 | "metadata": { 826 | "anaconda-cloud": {}, 827 | "kernelspec": { 828 | "display_name": "Python [Root]", 829 | "language": "python", 830 | "name": "Python [Root]" 831 | }, 832 | "language_info": { 833 | "codemirror_mode": { 834 | "name": "ipython", 835 | "version": 2.0 836 | }, 837 | "file_extension": ".py", 838 | "mimetype": "text/x-python", 839 | "name": "python", 840 | "nbconvert_exporter": "python", 841 | "pygments_lexer": "ipython2", 842 | "version": "2.7.12" 843 | } 844 | }, 845 | "nbformat": 4, 846 | "nbformat_minor": 0 847 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RecommenderSystem 2 | A general form of recommender system 3 | 4 | 5 | # To start: 6 | download https://grouplens.org/datasets/movielens/100k/ \ 7 | unzip to a directory that is in the same root directory of this project folder \ 8 | your file system layout should be like: \ 9 | /project - mk-100/ 10 | \ RecommenderSystem/ 11 | 12 | Inside RecommenderSystem/, run the Preprocessing.ipynb in the jupyter notebook \ 13 | copy mk-100/DATA to RecommenderSystem/ \ 14 | 15 | now in the DATA/, you have all the data you need for this code to run 16 | -------------------------------------------------------------------------------- /Ranker.py: -------------------------------------------------------------------------------- 1 | # Ranker 2 | 3 | import logging 4 | import numpy as np 5 | 6 | # rank the items from each recommendation module 7 | # highly influenced by business strategy and varies from system to system 8 | from DatabaseInterface import DatabaseInterface 9 | 10 | 11 | class Ranker(object): 12 | logging.basicConfig(level=logging.INFO) 13 | def __init__(self, numberToServe, database): 14 | self.numberToServe = numberToServe 15 | self.userHistoryDB = database.extract(DatabaseInterface.HISTORY_KEY) # who rated what 16 | self.log = logging.getLogger(__name__) 17 | 18 | def _getUsedItems(self, userId): 19 | # return a python set of all the movies that have been seen 20 | if userId == -1 : 21 | return set([]) 22 | else: 23 | return set(self.userHistoryDB[self.userHistoryDB.loc[:,"user_id"]==userId].loc[:,"item_id"]) 24 | 25 | def rerank(self,recommendationsTuple): 26 | # recommendationTupe is a tuple of (userId, recommendations) 27 | # recommendations is a dictionary of lists {RecType: Items}, RecType can be "online", "offline", "popular" 28 | # return the ranked recommendation 29 | # here is the strategy: 30 | # if the userId is -1, it means it is from anonymous user. 
31 |         # else, remove the items the user has already watched and serve the rest
32 | 
33 |         userId = recommendationsTuple[0]
34 |         recommendations = recommendationsTuple[1]
35 | 
36 |         usedItems = self._getUsedItems(userId)
37 | 
38 | 
39 |         self.log.info("Recommendations received in Ranker: %s" %recommendations)
40 |         self.log.info("Recommendation types received in Ranker: %s" %recommendations.keys())
41 |         results = []
42 | 
43 |         if "online" in recommendations:  # online exists as long as the user has been active
44 |             results.extend(recommendations["online"][:self.numberToServe])  # there should only be one online list
45 | 
46 |         if "offline" in recommendations:  # offline exists only if the user is registered; the recs could come from CF or KNN
47 |             results.extend(recommendations["offline"][:self.numberToServe])
48 | 
49 |         if "popular" in recommendations:  # most popular should always exist
50 |             # if there are no personalized recs, the remainder is filled by the most popular items
51 |             results.extend(recommendations["popular"][:self.numberToServe])
52 |         else:
53 |             self.log.error("recommendations do not contain popular items")
54 | 
55 |         try:
56 |             # remove the already visited items
57 |             results = np.random.choice(list(set(results)-usedItems), self.numberToServe, replace=False)
58 |         except ValueError:
59 |             # sometimes the user may have watched a lot, leaving fewer than numberToServe candidates
60 |             # this is apparently not a good strategy, why?
61 |             # (hint: the fallback below can re-serve items the user has already watched)
62 |             results = np.random.choice(results, self.numberToServe, replace=False)
63 | 
64 | 
65 |         return results
66 | 
67 | if __name__=="__main__":
68 |     from DatabaseInterface import DatabaseInterface
69 |     db = DatabaseInterface("DATA")
70 |     db.startEngine()
71 |     ranker = Ranker(numberToServe=10, database=db)
72 |     print sorted(ranker._getUsedItems(1))
--------------------------------------------------------------------------------
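A minimal illustration of the rerank contract (the candidate lists below are made-up values; only the dict keys come from the code):

    from DatabaseInterface import DatabaseInterface
    from Ranker import Ranker

    db = DatabaseInterface("DATA")
    db.startEngine()
    ranker = Ranker(numberToServe=10, database=db)
    # userId 1 with pooled candidate lists keyed by recommendation type
    recs = ranker.rerank((1, {"offline": range(1, 31), "popular": range(31, 61)}))
    print recs  # 10 item ids drawn from the pool, with already-rated items filtered out when possible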
/RecEngine.py:
--------------------------------------------------------------------------------
 1 | # Recommendation Engine
 2 | 
 3 | from ModelStore import ModelStore
 4 | import logging
 5 | 
 6 | class RecEngine(object):
 7 |     logging.basicConfig(level=logging.INFO)
 8 | 
 9 |     def __init__(self, userAnalyzer, modelStore, userActivityTable):
10 |         self.userAnalyzer = userAnalyzer
11 |         self.modelStore = modelStore
12 |         self.userActivityTable = userActivityTable
13 |         self._cacheMostPopular()
14 |         # pre-compute the most popular items, because this recommendation is independent of the user
15 |         self.log = logging.getLogger(__name__)
16 | 
17 |     def resetCache(self):
18 |         self._cacheMostPopular()
19 | 
20 |     def _cacheMostPopular(self):
21 |         self.mostPopularList = self.modelStore.getModel(ModelStore.MP_MODEL_KEY).provideRec()
22 | 
23 |     def provideRecommendation(self, request):
24 |         recommendations = {}
25 |         # construct the recommendation content, implemented as a dictionary;
26 |         # three sections may be filled: popular, online, offline
27 | 
28 |         recommendations["popular"] = self.mostPopularList
29 |         requestAnalyzed = self.userAnalyzer.analyze(request, self.userActivityTable)
30 | 
31 |         # online recommendation
32 |         onlineRecs = self.modelStore.getModel(ModelStore.SI_MODEL_KEY, request.userId).provideRec()
33 | 
34 |         self.log.info("user type: %s" %requestAnalyzed[0])
35 | 
36 |         # now we start to construct our recommendation data
37 |         if len(onlineRecs)>0:
38 |             recommendations["online"] = onlineRecs  # a list of ids
39 | 
40 |         if requestAnalyzed[0] == "new":
41 |             # for new users, we use the KNN model for the offline recommendation
42 |             recommendations["offline"] = self.modelStore.getModel(ModelStore.KNN_MODEL_KEY)\
43 |                                              .provideRec(requestAnalyzed[2].userId)
44 |         elif requestAnalyzed[0] == "old":
45 |             # for old users, we use the CF model for the offline recommendation
46 |             recommendations["offline"] = self.modelStore.getModel(ModelStore.CF_MODEL_KEY)\
47 |                                              .provideRec(requestAnalyzed[2].userId)
48 | 
49 |         return requestAnalyzed[1], recommendations
--------------------------------------------------------------------------------
/UserAnalyzer.py:
--------------------------------------------------------------------------------
 1 | # User Type Analyzer
 2 | # determine the type of each user and send it to the right recommendation module
 3 | 
 4 | class UserAnalyzer(object):
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     def analyze(self, request, userActivityDB):
 9 |         # returns an identifier so the recommendation engine knows what to do
10 |         # userActivityDB is defined in DatabaseInterface and counts each user's total amount of activity
11 |         if isinstance(request.userId, str):
12 |             # it is an anonymous request
13 |             return ["anonymous", -1, request]
14 |         elif request.userId in userActivityDB.index:
15 |             if userActivityDB[request.userId] >= 30:
16 |                 # if the user has already rated at least 30 items, we call it an old user
17 |                 return ["old", request.userId, request]
18 |             else:
19 |                 return ["new", request.userId, request]
20 |         else:
21 |             return ["new", request.userId, request]
22 | 
23 |     def analyzeAction(self, action):
24 |         if isinstance(action.userId, str):
25 |             return "anonymous"
26 |         else:
27 |             return "registered"
--------------------------------------------------------------------------------
/Webserver.py:
--------------------------------------------------------------------------------
 1 | # A simulation framework
 2 | import logging
 3 | 
 4 | from DatabaseInterface import DatabaseInterface
 5 | from RecEngine import RecEngine
 6 | from Ranker import Ranker
 7 | from Learners.OfflineLearner import OfflineLearner
 8 | from Learners.OnlineLearner import OnlineLearner
 9 | from UserAnalyzer import UserAnalyzer
10 | from ModelStore import ModelStore
11 | 
12 | 
13 | class WebServer(object):
14 |     logging.basicConfig(level=logging.INFO)
15 | 
16 |     def __init__(self, configMap):
17 |         self.db = DatabaseInterface(configMap['data_dir'])
18 |         # numberToServe: the number of items finally served to the users
19 |         self.numberToServe = configMap['numberToServe']
20 |         self.log = logging.getLogger(__name__)
21 | 
22 |     def start(self):
23 |         # each object here simulates the API calls through a network
24 |         # passing an object A to the constructor of B means B will communicate with A
25 |         self.db.startEngine()
26 |         self.ranker = Ranker(self.numberToServe, self.db)
27 |         self.userAnalyzer = UserAnalyzer()
28 |         self.modelStore = ModelStore()
29 |         self.offlineLearner = OfflineLearner(self.db, self.modelStore)
30 |         self.onlineLearner = OnlineLearner(self.db, self.modelStore)
31 |         self.offlineLearner.trainModel()
32 |         # when we start the webserver, the offline learner trains the models,
33 |         # so that after start() we can immediately serve recommendations
34 |         self.recEngine = RecEngine(self.userAnalyzer, self.modelStore, self.db.extract(DatabaseInterface.USER_ACTIVITY_KEY))
35 | 
36 | 
37 |     def getAction(self, action):
38 |         assert(isinstance(action, Action))
39 |         # taking an action from a user
40 |         self.onlineLearner.trainModel(action)
41 |         # analyze the action type, and save the registered user's action
42 |         actionType = self.userAnalyzer.analyzeAction(action)
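        # note: the online model above is updated for every action,
        # but only registered users' ratings are persisted to the history table below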
43 |         if actionType == "registered":
44 |             self.log.info("Recording action %s" %action)
45 |             self.db.putAction(action)
46 | 
47 |     def provideRecommendation(self, request):
48 |         # return the IDs of the recommended items
49 |         assert(isinstance(request, Request))
50 |         # provide recommendations to the user
51 |         self.log.info("responding to request: %s" %request)
52 |         recommendations = self.recEngine.provideRecommendation(request)
53 |         recsReranked = self.ranker.rerank(recommendations)
54 |         return recsReranked  # a list of item ids
55 | 
56 |     def renderRecommendation(self, request):
57 |         assert(isinstance(request, Request))
58 |         recsReranked = self.provideRecommendation(request)
59 |         # for the purpose of testing, we sort the index and output the item names
60 |         # the output is ordered by the id value
61 |         return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[recsReranked].sort_index()
62 | 
63 |     def increment(self):
64 |         self.log.info("incrementing the system, update the models")
65 |         # increment the whole system by one day, triggering offline training
66 |         self.offlineLearner.trainModel()
67 |         self.modelStore.cleanOnlineModel()
68 |         self.recEngine.resetCache()
69 | 
70 |     # for demo purposes, given an itemId, return the item name
71 |     def getFromInventory(self, itemId):
72 |         return self.db.extract(DatabaseInterface.INVENTORY_KEY).loc[itemId]
73 | 
74 | # simulate a web request
75 | class Request(object):
76 |     def __init__(self, userId):
77 |         self.userId = userId
78 | 
79 |     def __str__(self):
80 |         return "request for user: "+str(self.userId)
81 | 
82 | # simulate a tracking event or a user's rating
83 | class Action(object):
84 |     def __init__(self, userId, itemId, rating):
85 |         self.userId = userId
86 |         self.itemId = itemId
87 |         self.rating = rating
88 | 
89 |     def __str__(self):
90 |         return "user: %s, item: %s, rating %s" %(self.userId, self.itemId, self.rating)
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | # main.py
 2 | # simulate different requests coming into the system
 3 | 
 4 | from Webserver import WebServer, Request, Action
 5 | 
 6 | configMap = {"numberToServe": 10, "data_dir": "DATA"}
 7 | server = WebServer(configMap)
 8 | server.start()  # load all the data in the database, start the first model training
 9 | 
10 | # now experiment
11 | reqX1 = Request(userId='X1')  # anonymous user
12 | req1 = Request(userId=1)  # for a registered user, we use an integer id
13 | print(reqX1)
14 | print(req1)
15 | 
16 | recX1 = server.renderRecommendation(reqX1)  # output recommendations
17 | print recX1
18 | 
19 | rec1 = server.renderRecommendation(req1)  # output recommendations
20 | print(rec1)
21 | 
22 | # now we start an action
23 | action1 = Action(1, 255, 5)  # user 1 rated item 255 with score 5
24 | print server.getFromInventory(255)  # find out the name of item 255
25 | server.getAction(action1)  # feed the action to the server
26 | rec1_afteraction = server.renderRecommendation(req1)  # get recommendations after the system knows about the action
27 | print(rec1_afteraction)
28 | 
29 | actionX1 = Action('X1', 123, 5)  # an anonymous user's action won't be saved in the database
30 | print server.getFromInventory(123)
31 | server.getAction(actionX1)
32 | recX1_afteraction = server.renderRecommendation(reqX1)
33 | print(recX1_afteraction)
34 | 
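# note: actionX1 above only updated X1's transient similar-item model in the ModelStore;
# nothing was written to the history table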
35 | # update the system, e.g. one day has passed
36 | server.increment()
37 | # the system should forget about actionX1
38 | recX1_aftercleaning = server.renderRecommendation(reqX1)
39 | print(recX1_aftercleaning)  # should be similar to recX1
40 | 
41 | 
42 | req19 = Request(userId=19)  # a user with very little history, so treated as a new user
43 | rec19 = server.renderRecommendation(req19)
44 | print(rec19)
--------------------------------------------------------------------------------
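A possible smoke test to append to main.py (a sketch, not part of the repo): replay a few held-out users from DATA/ratings_test.csv, which is written by Preprocessing.ipynb, and count how many served items they actually rated highly. The hit-rate metric, the rating threshold, and the sample size here are arbitrary illustrative choices.

    import pandas as pd
    from Webserver import WebServer, Request

    server = WebServer({"numberToServe": 10, "data_dir": "DATA"})
    server.start()

    test = pd.read_csv("DATA/ratings_test.csv", index_col=0)
    hits, served = 0, 0
    for userId in test.user_id.unique()[:20]:  # a small sample of held-out users
        liked = set(test[(test.user_id == userId) & (test.rating >= 4)].item_id)
        recs = server.provideRecommendation(Request(userId=int(userId)))
        hits += len(set(recs) & liked)
        served += len(recs)
    print "hit rate: %.3f" % (hits / float(served))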