├── .ipynb_checkpoints ├── Untitled-checkpoint.ipynb └── recommender-checkpoint.ipynb ├── Error Chart.png ├── LICENSE ├── Presentation.pdf ├── Presentation_Images ├── 1.png ├── 2.png └── 3.png ├── README.md ├── __pycache__ └── mf.cpython-36.pyc ├── feasible_data_10000.txt ├── feasible_data_100000.txt ├── feasible_data_1024.txt ├── feasible_data_150000.txt ├── feasible_data_175000.txt ├── feasible_data_200000.txt ├── feasible_data_25000.txt ├── feasible_data_5000.txt ├── feasible_data_50000.txt ├── feasible_data_75000.txt ├── mf.py ├── movie_titles.csv ├── presentation.tex ├── recommender.ipynb ├── recommender_final.py └── recommender_final_toy_dataset.py /.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/recommender-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Setup Complete\n", 13 | "\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "#Setting up prerequisites\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "import math\n", 22 | "import re\n", 23 | "import sklearn\n", 24 | "from scipy.sparse import csr_matrix\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns\n", 27 | "from surprise import Reader, Dataset, SVD, evaluate\n", 28 | "sns.set_style(\"darkgrid\")\n", 29 | "\n", 30 | "from cvxpy import *\n", 31 | "from numpy import matrix\n", 32 | "\n", 33 | "print(\"Setup Complete\\n\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | 
"name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Dataset 1 shape: (1024, 3)\n", 46 | "-Dataset examples-\n", 47 | " Cust_Id Rating Date\n", 48 | "0 1: NaN NaN\n", 49 | "100 2630337 5.0 20050310.0\n", 50 | "200 573434 4.0 20040526.0\n", 51 | "300 638824 5.0 20040519.0\n", 52 | "400 1653834 4.0 20040822.0\n", 53 | "500 1033930 3.0 20050811.0\n", 54 | "600 349407 5.0 20050102.0\n", 55 | "700 656399 4.0 20030920.0\n", 56 | "800 1456369 4.0 20030708.0\n", 57 | "900 253037 3.0 20050805.0\n", 58 | "1000 1369550 3.0 20041011.0\n", 59 | "float64\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "df1 = pd.read_csv('netflix-prize-data/toy_combined_data.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2])\n", 65 | "df1['Rating'] = df1['Rating'].astype(float)\n", 66 | "df1['Date'] = df1['Date'].astype(str)\n", 67 | "df1['Date'] = df1['Date'].map( lambda s : (s[:4])+(s[5:7])+(s[8:]))\n", 68 | "df1['Date'] = df1['Date'].astype(float)\n", 69 | "print('Dataset 1 shape: {}'.format(df1.shape))\n", 70 | "print('-Dataset examples-')\n", 71 | "print(df1.iloc[::100, :])\n", 72 | "print(df1['Date'].dtype)\n", 73 | "df = df1" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "See Overview of the Data\n" 86 | ] 87 | }, 88 | { 89 | "data": { 90 | "image/png": "\n", 91 | "text/plain": [ 92 | "
" 93 | ] 94 | }, 95 | "metadata": {}, 96 | "output_type": "display_data" 97 | } 98 | ], 99 | "source": [ 100 | "#Seeing the distribution of ratings given by the users\n", 101 | "print(\"See Overview of the Data\")\n", 102 | "p = df.groupby('Rating')['Rating'].agg(['count'])\n", 103 | "# get movie count\n", 104 | "movie_count = df.isnull().sum()[1]\n", 105 | "# get customer count\n", 106 | "cust_count = df['Cust_Id'].nunique() - movie_count\n", 107 | "# get rating count\n", 108 | "rating_count = df['Cust_Id'].count() - movie_count\n", 109 | "ax = p.plot(kind = 'barh', legend = False, figsize = (15,10))\n", 110 | "plt.title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20)\n", 111 | "plt.axis('off')\n", 112 | "for i in range(1,6):\n", 113 | " ax.text(p.iloc[i-1][0]/4, i-1, 'Rated {}: {:.0f}%'.format(i, p.iloc[i-1][0]*100 / p.sum()[0]), color = 'white', weight = 'bold')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Movie IDs extracted from the extra rows given\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "#Adding movie IDs to the dataset\n", 131 | "movie_np = []\n", 132 | "movie_id = 0\n", 133 | "for x in range(df.shape[0]):\n", 134 | " if(np.isnan(df.iloc[x]['Rating'])):\n", 135 | " movie_id = movie_id+1\n", 136 | " movie_np = np.append(movie_np,movie_id)\n", 137 | "\n", 138 | "#print(movie_np)\n", 139 | "#print(len(movie_np))\n", 140 | "df['Movie_Id'] = movie_np.astype(int)\n", 141 | "print(\"Movie IDs extracted from the extra rows given\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "-Dataset examples-\n", 154 | " Cust_Id Rating Date Movie_Id\n", 155 | "1 1488844 3.0 
20050906.0 1\n", 156 | "101 1155747 3.0 20050703.0 1\n", 157 | "201 1141189 4.0 20041215.0 1\n", 158 | "301 2256485 1.0 20040819.0 1\n", 159 | "401 2322840 3.0 20050712.0 1\n", 160 | "501 45117 5.0 20050815.0 1\n", 161 | "602 2596999 4.0 20051007.0 2\n", 162 | "703 1644750 3.0 20030319.0 3\n", 163 | "803 372528 3.0 20040630.0 3\n", 164 | "903 1115632 3.0 20031124.0 3\n", 165 | "1003 2085230 4.0 20040315.0 3\n", 166 | "\n", 167 | "\n", 168 | "These are the final datatypes of the dataset\n", 169 | "Cust_Id int64\n", 170 | "Rating float64\n", 171 | "Date float64\n", 172 | "Movie_Id int64\n", 173 | "dtype: object\n" 174 | ] 175 | }, 176 | { 177 | "name": "stderr", 178 | "output_type": "stream", 179 | "text": [ 180 | "/usr/lib/python3/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n", 181 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 182 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 183 | "\n", 184 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 185 | " This is separate from the ipykernel package so we can avoid doing imports until\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "# remove the extra Movie ID rows\n", 191 | "df = df[pd.notnull(df['Rating'])]\n", 192 | "df['Cust_Id'] = df['Cust_Id'].astype(int)\n", 193 | "print('-Dataset examples-')\n", 194 | "print(df.iloc[::100, :])\n", 195 | "\n", 196 | "\n", 197 | "print(\"\\n\\nThese are the final datatypes of the dataset\")\n", 198 | "print(df.dtypes)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "(1009, 3)\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "#Creating Data Matrix\n", 216 | "df_matrix=pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id')\n", 217 | "print(df_matrix.shape)" 218 | ] 
219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 7, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "See some Movie ID- Movie Title Mapping : \n", 230 | "\n", 231 | " Year Name\n", 232 | "Movie_Id \n", 233 | "1 2003.0 Dinosaur Planet\n", 234 | "2 2004.0 Isle of Man TT 2004 Review\n", 235 | "3 1997.0 Character\n", 236 | "4 1994.0 Paula Abdul's Get Up & Dance\n", 237 | "5 2004.0 The Rise and Fall of ECW\n", 238 | "6 1997.0 Sick\n", 239 | "7 1992.0 8 Man\n", 240 | "8 2004.0 What the #$*! Do We Know!?\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "#Loading the Movie ID- Movie Title Mapping File\n", 246 | "\n", 247 | "df_title = pd.read_csv('netflix-prize-data/movie_titles.csv', encoding = \"ISO-8859-1\", header = None, names = ['Movie_Id', 'Year', 'Name'])\n", 248 | "df_title.set_index('Movie_Id', inplace = True)\n", 249 | "print(\"See some Movie ID- Movie Title Mapping : \\n\")\n", 250 | "print (df_title.head(8))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 8, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "\n", 263 | "\n", 264 | "Data Cleaning Complete.\n", 265 | " See head of the Data Matrix:\n", 266 | "\n", 267 | "Movie_Id 1 2 3\n", 268 | "Cust_Id \n", 269 | "915 5.0 NaN NaN\n", 270 | "1333 NaN NaN 4.0\n", 271 | "2442 3.0 NaN NaN\n", 272 | "3321 3.0 NaN NaN\n", 273 | "4326 4.0 NaN NaN\n", 274 | "\n", 275 | "Num of movies = 3\n", 276 | "Num of users = 1009\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "print(\"\\n\\nData Cleaning Complete.\\n See head of the Data Matrix:\\n\")\n", 282 | "print(df_matrix.head())\n", 283 | "\n", 284 | "n_movies = movie_count\n", 285 | "n_customers = cust_count\n", 286 | "\n", 287 | "print(\"\\nNum of movies =\", movie_count)\n", 288 | "print(\"Num of users =\", cust_count)" 289 | ] 290 | }, 291 | { 292 | "cell_type": 
"code", 293 | "execution_count": 9, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "Movie_Id 1 2 3\n", 301 | "Cust_Id \n", 302 | "915 5.0 0.0 0.0\n", 303 | "1333 0.0 0.0 4.0\n", 304 | "2442 3.0 0.0 0.0\n", 305 | "3321 3.0 0.0 0.0\n", 306 | "4326 4.0 0.0 0.0\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "#Choosing the number of latent attributes\n", 312 | "n_attr= 100*1000000\n", 313 | "#print(type(n_attr),type(n_movies), type(n_customers))\n", 314 | "Q = Variable((n_attr,n_movies))\n", 315 | "P = Variable((n_attr, n_customers))\n", 316 | "\n", 317 | "\n", 318 | "\n", 319 | "acq_data = df_matrix.fillna(0.0)\n", 320 | "print(acq_data.head())\n", 321 | "\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 49, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "class MF():\n", 331 | "\n", 332 | " def __init__(self, R, K, alpha, beta, iterations):\n", 333 | " \"\"\"\n", 334 | " Perform matrix factorization to predict empty\n", 335 | " entries in a matrix.\n", 336 | "\n", 337 | " Arguments\n", 338 | " - R (ndarray) : user-item rating matrix\n", 339 | " - K (int) : number of latent dimensions\n", 340 | " - alpha (float) : learning rate\n", 341 | " - beta (float) : regularization parameter\n", 342 | " \"\"\"\n", 343 | "\n", 344 | " self.R = R\n", 345 | " self.num_users, self.num_items = R.shape\n", 346 | " self.K = K\n", 347 | " self.alpha = alpha\n", 348 | " self.beta = beta\n", 349 | " self.iterations = iterations\n", 350 | "\n", 351 | " def train(self):\n", 352 | " # Initialize user and item latent feature matrice\n", 353 | " self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))\n", 354 | " self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))\n", 355 | "\n", 356 | " # Initialize the biases\n", 357 | " self.b_u = np.zeros(self.num_users)\n", 358 | " self.b_i = np.zeros(self.num_items)\n", 359 | " self.b 
= np.mean(self.R[np.where(self.R != 0)])\n", 360 | "\n", 361 | " # Create a list of training samples\n", 362 | " self.samples = [\n", 363 | " (i, j, self.R[i, j])\n", 364 | " for i in range(self.num_users)\n", 365 | " for j in range(self.num_items)\n", 366 | " if self.R[i, j] > 0\n", 367 | " ]\n", 368 | "\n", 369 | " # Perform stochastic gradient descent for number of iterations\n", 370 | " training_process = []\n", 371 | " for i in range(self.iterations):\n", 372 | " np.random.shuffle(self.samples)\n", 373 | " self.sgd()\n", 374 | " mse = self.mse()\n", 375 | " training_process.append((i, mse))\n", 376 | " if (i+1) % 100 == 0:\n", 377 | " print(\"Iteration: %d ; error = %.4f\" % (i+1, mse))\n", 378 | "\n", 379 | " return training_process\n", 380 | "\n", 381 | " def mse(self):\n", 382 | " \"\"\"\n", 383 | " A function to compute the total mean square error\n", 384 | " \"\"\"\n", 385 | " xs, ys = self.R.nonzero()\n", 386 | " predicted = self.full_matrix()\n", 387 | " error = 0\n", 388 | " for x, y in zip(xs, ys):\n", 389 | " error += pow(self.R[x, y] - predicted[x, y], 2)\n", 390 | " return np.sqrt(error)\n", 391 | "\n", 392 | " def sgd(self):\n", 393 | " \"\"\"\n", 394 | " Perform stochastic graident descent\n", 395 | " \"\"\"\n", 396 | " for i, j, r in self.samples:\n", 397 | " # Computer prediction and error\n", 398 | " prediction = self.get_rating(i, j)\n", 399 | " e = (r - prediction)\n", 400 | "\n", 401 | " # Update biases\n", 402 | " self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])\n", 403 | " self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])\n", 404 | "\n", 405 | " # Update user and item latent feature matrices\n", 406 | " self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])\n", 407 | " self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])\n", 408 | "\n", 409 | " def get_rating(self, i, j):\n", 410 | " \"\"\"\n", 411 | " Get the predicted rating of user i and item j\n", 412 | " \"\"\"\n", 413 | " 
prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)\n", 414 | " return prediction\n", 415 | "\n", 416 | " def full_matrix(self):\n", 417 | " \"\"\"\n", 418 | " Computer the full matrix using the resultant biases, P and Q\n", 419 | " \"\"\"\n", 420 | " return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 50, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "\n", 433 | "P x Q:\n", 434 | "[[4.46271506 3.82325605 3.80535557]\n", 435 | " [3.85422058 3.64783311 3.68900504]\n", 436 | " [3.36439893 3.43993058 3.44318097]\n", 437 | " ...\n", 438 | " [3.58337245 3.46726285 3.37777364]\n", 439 | " [3.91898975 3.63739457 3.58991505]\n", 440 | " [3.63308644 3.42857823 3.47186301]]\n", 441 | "\n", 442 | "Global bias:\n", 443 | "3.6787463271302645\n", 444 | "\n", 445 | "User bias:\n", 446 | "[ 0.20857453 0.06703809 -0.12422918 ... 
-0.10967525 0.04237086\n", 447 | " -0.10272106]\n", 448 | "\n", 449 | "Item bias:\n", 450 | "[ 0.0937364 -0.10371626 -0.10917139]\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "#R = np.array([\n", 456 | "# [5, 3, 0, 1],\n", 457 | "# [4, 0, 0, 1],\n", 458 | "# [1, 1, 0, 5],\n", 459 | "# [1, 0, 0, 4],\n", 460 | "# [0, 1, 5, 4],\n", 461 | "#])\n", 462 | "\n", 463 | "R = np.array(acq_data)\n", 464 | "\n", 465 | "mf = MF(R, K=100, alpha=0.01, beta=0.01, iterations=20)\n", 466 | "training_process = mf.train()\n", 467 | "print()\n", 468 | "print(\"P x Q:\")\n", 469 | "print(mf.full_matrix())\n", 470 | "print()\n", 471 | "print(\"Global bias:\")\n", 472 | "print(mf.b)\n", 473 | "print()\n", 474 | "print(\"User bias:\")\n", 475 | "print(mf.b_u)\n", 476 | "print()\n", 477 | "print(\"Item bias:\")\n", 478 | "print(mf.b_i)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 51, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "This is the Result for Training Set:\n", 491 | "\n" 492 | ] 493 | }, 494 | { 495 | "data": { 496 | "image/png": "\n", 497 | "text/plain": [ 498 | "
" 499 | ] 500 | }, 501 | "metadata": {}, 502 | "output_type": "display_data" 503 | } 504 | ], 505 | "source": [ 506 | "x = [x for x, y in training_process]\n", 507 | "y = [y for x, y in training_process]\n", 508 | "plt.figure(figsize=((16,4)))\n", 509 | "plt.plot(x, np.sqrt(y))\n", 510 | "plt.xticks(x, x)\n", 511 | "\n", 512 | "print(\"This is the Result for Training Set:\\n\")\n", 513 | "plt.xlabel(\"Iterations\")\n", 514 | "plt.ylabel(\"Root Mean Square Error\")\n", 515 | "plt.grid(axis=\"y\")" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 58, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "Original:\n", 528 | " [[5 3 0 1]\n", 529 | " [4 0 0 1]\n", 530 | " [1 1 0 5]\n", 531 | " [1 0 0 4]\n", 532 | " [0 1 5 4]]\n", 533 | "Test Set:\n", 534 | " [[5 3 0 1]\n", 535 | " [4 0 0 1]\n", 536 | " [0 1 0 5]\n", 537 | " [1 0 0 4]\n", 538 | " [0 1 5 4]]\n", 539 | "MSE= 0.22360679774997896\n", 540 | "Iteration: 100 ; error = 4.3799\n", 541 | "Iteration: 200 ; error = 0.3920\n", 542 | "Iteration: 300 ; error = 0.1484\n", 543 | "Iteration: 400 ; error = 0.0773\n", 544 | "Iteration: 500 ; error = 0.0555\n", 545 | "Iteration: 600 ; error = 0.0488\n", 546 | "Iteration: 700 ; error = 0.0465\n", 547 | "Iteration: 800 ; error = 0.0457\n", 548 | "Iteration: 900 ; error = 0.0452\n", 549 | "Iteration: 1000 ; error = 0.0449\n", 550 | "Iteration: 1100 ; error = 0.0447\n", 551 | "Iteration: 1200 ; error = 0.0445\n", 552 | "Iteration: 1300 ; error = 0.0444\n", 553 | "Iteration: 1400 ; error = 0.0442\n", 554 | "Iteration: 1500 ; error = 0.0440\n", 555 | "Iteration: 1600 ; error = 0.0439\n", 556 | "Iteration: 1700 ; error = 0.0438\n", 557 | "Iteration: 1800 ; error = 0.0437\n", 558 | "Iteration: 1900 ; error = 0.0436\n", 559 | "Iteration: 2000 ; error = 0.0434\n", 560 | "Iteration: 2100 ; error = 0.0433\n", 561 | "Iteration: 2200 ; error = 0.0433\n", 562 | "Iteration: 2300 ; error = 
0.0431\n", 563 | "Iteration: 2400 ; error = 0.0430\n", 564 | "Iteration: 2500 ; error = 0.0430\n", 565 | "Iteration: 2600 ; error = 0.0429\n", 566 | "Iteration: 2700 ; error = 0.0429\n", 567 | "Iteration: 2800 ; error = 0.0427\n", 568 | "Iteration: 2900 ; error = 0.0426\n", 569 | "Iteration: 3000 ; error = 0.0426\n", 570 | "Iteration: 3100 ; error = 0.0425\n", 571 | "Iteration: 3200 ; error = 0.0425\n", 572 | "Iteration: 3300 ; error = 0.0423\n", 573 | "Iteration: 3400 ; error = 0.0423\n", 574 | "Iteration: 3500 ; error = 0.0423\n", 575 | "Iteration: 3600 ; error = 0.0422\n", 576 | "Iteration: 3700 ; error = 0.0421\n", 577 | "Iteration: 3800 ; error = 0.0420\n", 578 | "Iteration: 3900 ; error = 0.0420\n", 579 | "Iteration: 4000 ; error = 0.0419\n", 580 | "Iteration: 4100 ; error = 0.0420\n", 581 | "Iteration: 4200 ; error = 0.0418\n", 582 | "Iteration: 4300 ; error = 0.0418\n", 583 | "Iteration: 4400 ; error = 0.0418\n", 584 | "Iteration: 4500 ; error = 0.0418\n", 585 | "Iteration: 4600 ; error = 0.0417\n", 586 | "Iteration: 4700 ; error = 0.0417\n", 587 | "Iteration: 4800 ; error = 0.0416\n", 588 | "Iteration: 4900 ; error = 0.0416\n", 589 | "Iteration: 5000 ; error = 0.0416\n", 590 | "Iteration: 5100 ; error = 0.0416\n", 591 | "Iteration: 5200 ; error = 0.0415\n", 592 | "Iteration: 5300 ; error = 0.0415\n", 593 | "Iteration: 5400 ; error = 0.0414\n", 594 | "Iteration: 5500 ; error = 0.0414\n", 595 | "Iteration: 5600 ; error = 0.0413\n", 596 | "Iteration: 5700 ; error = 0.0413\n", 597 | "Iteration: 5800 ; error = 0.0413\n", 598 | "Iteration: 5900 ; error = 0.0413\n", 599 | "Iteration: 6000 ; error = 0.0414\n", 600 | "Iteration: 6100 ; error = 0.0413\n", 601 | "Iteration: 6200 ; error = 0.0412\n", 602 | "Iteration: 6300 ; error = 0.0412\n", 603 | "Iteration: 6400 ; error = 0.0412\n", 604 | "Iteration: 6500 ; error = 0.0411\n", 605 | "Iteration: 6600 ; error = 0.0412\n", 606 | "Iteration: 6700 ; error = 0.0412\n", 607 | "Iteration: 6800 ; error = 0.0411\n", 608 | 
"Iteration: 6900 ; error = 0.0411\n", 609 | "Iteration: 7000 ; error = 0.0411\n", 610 | "Iteration: 7100 ; error = 0.0410\n", 611 | "Iteration: 7200 ; error = 0.0410\n", 612 | "Iteration: 7300 ; error = 0.0411\n", 613 | "Iteration: 7400 ; error = 0.0411\n", 614 | "Iteration: 7500 ; error = 0.0410\n", 615 | "Iteration: 7600 ; error = 0.0410\n", 616 | "Iteration: 7700 ; error = 0.0410\n", 617 | "Iteration: 7800 ; error = 0.0410\n", 618 | "Iteration: 7900 ; error = 0.0409\n", 619 | "Iteration: 8000 ; error = 0.0409\n", 620 | "Iteration: 8100 ; error = 0.0409\n", 621 | "Iteration: 8200 ; error = 0.0408\n", 622 | "Iteration: 8300 ; error = 0.0409\n", 623 | "Iteration: 8400 ; error = 0.0409\n", 624 | "Iteration: 8500 ; error = 0.0409\n", 625 | "Iteration: 8600 ; error = 0.0409\n", 626 | "Iteration: 8700 ; error = 0.0408\n", 627 | "Iteration: 8800 ; error = 0.0408\n", 628 | "Iteration: 8900 ; error = 0.0408\n", 629 | "Iteration: 9000 ; error = 0.0408\n", 630 | "Iteration: 9100 ; error = 0.0408\n", 631 | "Iteration: 9200 ; error = 0.0408\n", 632 | "Iteration: 9300 ; error = 0.0408\n", 633 | "Iteration: 9400 ; error = 0.0408\n", 634 | "Iteration: 9500 ; error = 0.0408\n", 635 | "Iteration: 9600 ; error = 0.0407\n", 636 | "Iteration: 9700 ; error = 0.0407\n", 637 | "Iteration: 9800 ; error = 0.0408\n", 638 | "Iteration: 9900 ; error = 0.0407\n", 639 | "Iteration: 10000 ; error = 0.0407\n", 640 | "Learnt=\n", 641 | " [[4.98310779 3.00307074 3.28549611 1.01519266]\n", 642 | " [3.99763226 2.207354 3.07663016 1.01276062]\n", 643 | " [1.57652706 1.00963135 5.71488038 4.9846755 ]\n", 644 | " [1.01285017 0.37091408 4.80884404 3.99512921]\n", 645 | " [1.82542641 1.0173647 4.9885115 3.99623184]]\n" 646 | ] 647 | } 648 | ], 649 | "source": [ 650 | "R = np.array([\n", 651 | " [5, 3, 0, 1],\n", 652 | " [4, 0, 0, 1],\n", 653 | " [1, 1, 0, 5],\n", 654 | " [1, 0, 0, 4],\n", 655 | " [0, 1, 5, 4],\n", 656 | "])\n", 657 | "\n", 658 | "R1= np.array([\n", 659 | " [5, 3, 0, 1],\n", 660 | " [4, 
0, 0, 1],\n", 661 | " [1, 1, 0, 5],\n", 662 | " [1, 0, 0, 4],\n", 663 | " [0, 1, 5, 4],\n", 664 | "])\n", 665 | "\n", 666 | "#Set the number of values to replace. For example 20%:\n", 667 | "\n", 668 | "# Edit: changed len(mat) for mat.size\n", 669 | "prop = int(R.size * 0.2)\n", 670 | "\n", 671 | "#Randomly choose indices of the numpy array:\n", 672 | "i = [np.random.choice(range(R.shape[0])) for _ in range(prop)]\n", 673 | "j = [np.random.choice(range(R.shape[1])) for _ in range(prop)]\n", 674 | "\n", 675 | "#Change values with NaN\n", 676 | "R[i,j] = 0\n", 677 | "print(\"Original:\\n\",R1)\n", 678 | "print(\"Test Set:\\n\",R)\n", 679 | "R=np.rint(R)\n", 680 | "\n", 681 | "from sklearn.metrics import mean_squared_error\n", 682 | "mse = mean_squared_error(R, R1)\n", 683 | "\n", 684 | "print(\"MSE=\",mse**0.5)\n", 685 | "\n", 686 | "mf = MF(R, K=10000, alpha=0.01, beta=0.01, iterations=10000)\n", 687 | "training_process = mf.train()\n", 688 | "\n", 689 | "print(\"Learnt=\\n\",mf.full_matrix())\n" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "metadata": {}, 696 | "outputs": [ 697 | { 698 | "name": "stdout", 699 | "output_type": "stream", 700 | "text": [ 701 | "Original:\n", 702 | " [[5. 0. 0.]\n", 703 | " [0. 0. 4.]\n", 704 | " [3. 0. 0.]\n", 705 | " ...\n", 706 | " [0. 0. 3.]\n", 707 | " [4. 0. 0.]\n", 708 | " [0. 3. 0.]]\n", 709 | "Test Set:\n", 710 | " [[5. 0. 0.]\n", 711 | " [0. 0. 0.]\n", 712 | " [0. 0. 0.]\n", 713 | " ...\n", 714 | " [0. 0. 3.]\n", 715 | " [4. 0. 0.]\n", 716 | " [0. 3. 
0.]]\n", 717 | "RMSE= 0.9737214516879918\n", 718 | "Learnt=\n", 719 | " [[3.81775822 3.68859571 3.70036228]\n", 720 | " [3.69523794 3.56623209 3.57800652]\n", 721 | " [3.69523284 3.56623826 3.57801557]\n", 722 | " ...\n", 723 | " [3.63535051 3.50635783 3.5180917 ]\n", 724 | " [3.72342232 3.59438358 3.60615318]\n", 725 | " [3.64198163 3.51296124 3.52476696]]\n" 726 | ] 727 | } 728 | ], 729 | "source": [ 730 | "R = np.array(acq_data)\n", 731 | "\n", 732 | "R1= np.array(acq_data)\n", 733 | "\n", 734 | "#Set the number of values to replace. For example 20%:\n", 735 | "\n", 736 | "# Edit: changed len(mat) for mat.size\n", 737 | "prop = int(R.size * 0.2)\n", 738 | "\n", 739 | "#Randomly choose indices of the numpy array:\n", 740 | "i = [np.random.choice(range(R.shape[0])) for _ in range(prop)]\n", 741 | "j = [np.random.choice(range(R.shape[1])) for _ in range(prop)]\n", 742 | "\n", 743 | "#Change values with NaN\n", 744 | "R[i,j] = 0\n", 745 | "print(\"Original:\\n\",R1)\n", 746 | "print(\"Test Set:\\n\",R)\n", 747 | "R=np.rint(R)\n", 748 | "\n", 749 | "from sklearn.metrics import mean_squared_error\n", 750 | "mse = mean_squared_error(R, R1)\n", 751 | "print(\"RMSE=\",mse**0.5)\n", 752 | "\n", 753 | "mf = MF(R, K=10000, alpha=0.01, beta=0.01, iterations=10)\n", 754 | "training_process = mf.train()\n", 755 | "\n", 756 | "print(\"Learnt=\\n\",mf.full_matrix())\n", 757 | "msef=0.0\n", 758 | "for i1 in range(len(i)):\n", 759 | " for i2 in range(len(j)):\n", 760 | " msef = msef + (R1[i,j]-(mf.full_matrix())[i,j])**2\n", 761 | "msef = (msef/(len(j)*len(i)))\n", 762 | "print(\"RMSE f=\",msef**0.5)" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [] 771 | } 772 | ], 773 | "metadata": { 774 | "kernelspec": { 775 | "display_name": "Python 3", 776 | "language": "python", 777 | "name": "python3" 778 | }, 779 | "language_info": { 780 | "codemirror_mode": { 781 | "name": "ipython", 782 | 
"version": 3 783 | }, 784 | "file_extension": ".py", 785 | "mimetype": "text/x-python", 786 | "name": "python", 787 | "nbconvert_exporter": "python", 788 | "pygments_lexer": "ipython3", 789 | "version": "3.6.7" 790 | } 791 | }, 792 | "nbformat": 4, 793 | "nbformat_minor": 2 794 | } 795 | -------------------------------------------------------------------------------- /Error Chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/752452296cbee241df0100a82b90e885c9ef6ec7/Error Chart.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Harsh Raj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/752452296cbee241df0100a82b90e885c9ef6ec7/Presentation.pdf -------------------------------------------------------------------------------- /Presentation_Images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/752452296cbee241df0100a82b90e885c9ef6ec7/Presentation_Images/1.png -------------------------------------------------------------------------------- /Presentation_Images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/752452296cbee241df0100a82b90e885c9ef6ec7/Presentation_Images/2.png -------------------------------------------------------------------------------- /Presentation_Images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/752452296cbee241df0100a82b90e885c9ef6ec7/Presentation_Images/3.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix 2 | [![license](https://img.shields.io/github/license/DAVFoundation/captain-n3m0.svg?style=flat-square)](https://github.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/blob/master/LICENSE) [![Python 
3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) 3 | ## IEEE paper **"Matrix Factorization Techniques for Recommender Systems"** 4 | ### - Yehuda Koren, Robert Bell, Chris Volinsky 5 | ### Python 3.6 6 | 7 | Links to the original paper published by the IEEE Computer Society : [[1]](https://ieeexplore.ieee.org/document/5197422), [[2]](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) 8 | 9 | Link to the Netflix dataset used : [[1]](https://www.kaggle.com/netflix-inc/netflix-prize-data) 10 | 11 | ### Files 12 | 13 | 1) **Presentation.pdf** : Explains the paper. It was written in LaTeX Beamer; the TeX source is in _presentation.tex_ 14 | 15 | 2) **recommender_final.py** : The final recommender. Includes biases and regularization. Requires **mf.py** to be imported to run. Use directly on any dataset by changing line 19 in **recommender_final.py**. 16 | 17 | 3) **recommender_final_toy_dataset.py** shows exactly how Matrix Factorization Techniques work by considering a 5x5 toy dataset. 18 | 19 | 4) The **.ipynb** files include visualizations of RMSE decreasing with iterations when fitting on the training dataset. All **.ipynb** files are standalone and do not require importing **mf.py** 20 | 21 | 5) **feasible_data_n.txt** : Files with only the first n datapoints from the whole dataset. Used for testing. 22 | 23 | 6) **Training** and **Testing Data** : 24 | Not given separately. The program randomly separates k% of the data as test data, trains on the remainder, then tests on the held-out k% of values. Default k=20, can be changed on line 154. 
25 | 26 | 27 | ### Error Analysis 28 | 29 | ![img](https://github.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/blob/master/Error%20Chart.png) 30 | -------------------------------------------------------------------------------- /__pycache__/mf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshraj11584/Paper-Implementation-Matrix-Factorization-Recommender-Systems-Netflix/752452296cbee241df0100a82b90e885c9ef6ec7/__pycache__/mf.cpython-36.pyc -------------------------------------------------------------------------------- /feasible_data_1024.txt: -------------------------------------------------------------------------------- 1 | 1: 2 | 1488844,3,2005-09-06 3 | 822109,5,2005-05-13 4 | 885013,4,2005-10-19 5 | 30878,4,2005-12-26 6 | 823519,3,2004-05-03 7 | 893988,3,2005-11-17 8 | 124105,4,2004-08-05 9 | 1248029,3,2004-04-22 10 | 1842128,4,2004-05-09 11 | 2238063,3,2005-05-11 12 | 1503895,4,2005-05-19 13 | 2207774,5,2005-06-06 14 | 2590061,3,2004-08-12 15 | 2442,3,2004-04-14 16 | 543865,4,2004-05-28 17 | 1209119,4,2004-03-23 18 | 804919,4,2004-06-10 19 | 1086807,3,2004-12-28 20 | 1711859,4,2005-05-08 21 | 372233,5,2005-11-23 22 | 1080361,3,2005-03-28 23 | 1245640,3,2005-12-19 24 | 558634,4,2004-12-14 25 | 2165002,4,2004-04-06 26 | 1181550,3,2004-02-01 27 | 1227322,4,2004-02-06 28 | 427928,4,2004-02-26 29 | 814701,5,2005-09-29 30 | 808731,4,2005-10-31 31 | 662870,5,2005-08-24 32 | 337541,5,2005-03-23 33 | 786312,3,2004-11-16 34 | 1133214,4,2004-03-07 35 | 1537427,4,2004-03-29 36 | 1209954,5,2005-05-09 37 | 2381599,3,2005-09-12 38 | 525356,2,2004-07-11 39 | 1910569,4,2004-04-12 40 | 2263586,4,2004-08-20 41 | 2421815,2,2004-02-26 42 | 1009622,1,2005-01-19 43 | 1481961,2,2005-05-24 44 | 401047,4,2005-06-03 45 | 2179073,3,2004-08-29 46 | 1434636,3,2004-05-01 47 | 93986,5,2005-10-06 48 | 1308744,5,2005-10-29 49 | 2647871,4,2005-12-30 50 | 
1905581,5,2005-08-16 51 | 2508819,3,2004-05-18 52 | 1578279,1,2005-05-19 53 | 1159695,4,2005-02-15 54 | 2588432,3,2005-03-31 55 | 2423091,3,2005-09-12 56 | 470232,4,2004-04-08 57 | 2148699,2,2004-06-05 58 | 1342007,3,2004-07-16 59 | 466135,4,2004-07-13 60 | 2472440,3,2005-08-13 61 | 1283744,3,2004-04-17 62 | 1927580,4,2004-11-08 63 | 716874,5,2005-05-06 64 | 4326,4,2005-10-29 65 | 1546549,5,2004-07-20 66 | 1493697,1,2005-11-01 67 | 880166,5,2005-07-12 68 | 535396,2,2005-02-03 69 | 494609,4,2004-12-08 70 | 1961619,5,2005-06-01 71 | 883478,4,2005-12-16 72 | 793564,4,2004-04-19 73 | 1567202,2,2004-03-01 74 | 573537,4,2005-02-03 75 | 1972040,4,2005-06-02 76 | 1838912,3,2005-11-08 77 | 411705,4,2004-05-11 78 | 2244518,5,2004-09-15 79 | 584542,5,2005-02-01 80 | 667730,5,2005-05-26 81 | 2488120,5,2005-09-20 82 | 1926776,1,2005-10-05 83 | 38052,3,2004-06-03 84 | 1196100,4,2004-10-30 85 | 314933,3,2005-09-12 86 | 1792741,2,2004-02-09 87 | 769643,1,2004-11-18 88 | 2477242,5,2005-01-30 89 | 1421006,3,2005-08-03 90 | 729846,4,2005-08-09 91 | 1719610,2,2005-08-19 92 | 1696031,4,2005-01-25 93 | 1817215,4,2005-09-01 94 | 406057,4,2004-09-27 95 | 636262,1,2005-09-26 96 | 1245406,4,2004-12-30 97 | 1834590,3,2005-02-05 98 | 593225,3,2004-07-11 99 | 1011918,4,2005-05-28 100 | 1665054,4,2005-07-06 101 | 2630337,5,2005-03-10 102 | 1155747,3,2005-07-03 103 | 2439493,1,2004-02-06 104 | 479924,5,2005-08-02 105 | 530789,5,2005-03-29 106 | 765860,4,2005-06-13 107 | 231001,3,2005-01-03 108 | 1493615,5,2005-05-23 109 | 1850615,2,2005-07-12 110 | 68959,3,2005-08-11 111 | 147386,5,2004-09-19 112 | 624035,5,2004-02-19 113 | 782308,5,2004-07-02 114 | 1116080,5,2005-08-08 115 | 421374,5,2004-11-17 116 | 1158759,4,2004-06-13 117 | 1025798,3,2005-09-02 118 | 1215397,5,2005-07-19 119 | 2475251,4,2005-08-25 120 | 321111,2,2005-05-02 121 | 2162676,3,2005-08-01 122 | 2635437,4,2004-12-19 123 | 2389367,1,2004-07-01 124 | 485622,5,2005-08-01 125 | 235553,4,2004-07-15 126 | 831869,4,2005-06-02 127 | 
99400,5,2005-09-16 128 | 684876,4,2005-11-01 129 | 1871179,3,2004-12-08 130 | 1107678,5,2005-03-15 131 | 642036,3,2005-09-21 132 | 700890,5,2004-05-06 133 | 2289956,5,2004-04-29 134 | 2040859,1,2004-11-23 135 | 1524964,3,2005-04-19 136 | 121318,4,2005-06-06 137 | 317050,5,2005-11-15 138 | 2287003,5,2004-12-20 139 | 59052,2,2004-07-12 140 | 893742,4,2005-04-27 141 | 1346257,3,2005-08-23 142 | 55016,3,2004-05-12 143 | 30245,5,2004-10-19 144 | 743633,4,2005-08-09 145 | 1596531,5,2004-01-23 146 | 1125499,5,2005-01-31 147 | 706832,4,2005-05-06 148 | 2465337,3,2005-04-04 149 | 2291422,1,2005-08-28 150 | 1777406,3,2005-01-02 151 | 1904905,4,2005-05-13 152 | 2450433,3,2005-01-08 153 | 1348967,2,2004-06-21 154 | 638020,3,2005-08-10 155 | 2217779,4,2005-02-14 156 | 194280,1,2004-12-14 157 | 493009,4,2005-03-27 158 | 1567167,4,2005-09-11 159 | 850327,5,2005-09-18 160 | 520386,3,2005-10-25 161 | 320540,2,2005-07-26 162 | 1188228,2,2004-12-05 163 | 57961,4,2004-11-09 164 | 1113230,3,2005-05-04 165 | 1374216,2,2005-09-23 166 | 595778,3,2004-03-03 167 | 209573,4,2005-10-31 168 | 2354601,5,2005-12-10 169 | 2563596,4,2004-04-04 170 | 835265,4,2005-09-01 171 | 1819474,3,2004-04-03 172 | 1447104,3,2004-08-28 173 | 1100940,1,2005-10-24 174 | 143274,3,2005-08-31 175 | 2329565,4,2005-01-19 176 | 181592,4,2005-03-26 177 | 936396,2,2004-02-09 178 | 1125797,3,2004-07-05 179 | 2283366,3,2004-12-27 180 | 514495,4,2005-04-13 181 | 1772176,3,2005-09-27 182 | 1877347,4,2005-07-13 183 | 1287892,4,2005-05-17 184 | 255443,2,2005-05-23 185 | 890669,4,2005-11-02 186 | 1989766,4,2005-07-08 187 | 2315073,4,2004-03-24 188 | 14756,4,2005-12-27 189 | 907623,3,2004-05-11 190 | 991423,4,2004-10-24 191 | 1604238,4,2005-06-02 192 | 1027056,3,2005-12-03 193 | 2025883,5,2005-03-24 194 | 732936,5,2005-04-24 195 | 563962,5,2005-11-09 196 | 799442,4,2005-11-21 197 | 352635,5,2004-05-11 198 | 2537543,5,2005-10-26 199 | 1564395,4,2005-07-09 200 | 1655178,4,2004-08-26 201 | 573434,4,2004-05-26 202 | 
1141189,4,2004-12-15 203 | 383247,5,2005-02-03 204 | 1763921,5,2004-05-10 205 | 1943970,5,2004-09-20 206 | 322009,3,2004-05-20 207 | 2333817,3,2004-10-22 208 | 2095681,2,2005-10-29 209 | 1149588,4,2005-12-13 210 | 2354740,5,2005-05-05 211 | 2421360,5,2005-10-19 212 | 496087,2,2004-04-25 213 | 2191781,1,2005-04-03 214 | 1694083,4,2005-10-03 215 | 818416,3,2005-07-27 216 | 701960,5,2005-04-29 217 | 2090477,4,2005-06-28 218 | 1664010,5,2005-10-12 219 | 2583822,5,2005-08-17 220 | 369646,5,2005-04-30 221 | 2234063,4,2005-05-11 222 | 259799,4,2005-06-23 223 | 1077982,4,2004-03-16 224 | 2631796,4,2005-09-27 225 | 1122383,3,2004-10-22 226 | 1508526,3,2004-07-25 227 | 1600207,5,2004-04-17 228 | 1283117,5,2005-03-18 229 | 1727869,5,2005-07-11 230 | 1522799,4,2004-10-19 231 | 1394012,5,2005-12-19 232 | 1558286,3,2005-03-15 233 | 1155602,3,2005-07-05 234 | 361066,3,2004-09-24 235 | 1743210,5,2005-09-22 236 | 1148389,4,2004-04-03 237 | 2268101,4,2005-10-28 238 | 519684,5,2005-07-25 239 | 767518,5,2005-08-02 240 | 122197,1,2005-03-09 241 | 2112162,4,2005-10-14 242 | 1073367,3,2005-07-25 243 | 400162,5,2004-11-08 244 | 1524343,5,2005-03-24 245 | 741245,4,2005-06-28 246 | 2563768,3,2005-07-04 247 | 1406595,4,2005-08-27 248 | 1137010,4,2004-10-05 249 | 60343,5,2005-05-08 250 | 225765,4,2004-05-10 251 | 2530404,3,2004-05-01 252 | 437881,3,2004-11-17 253 | 1935793,1,2005-04-18 254 | 134001,4,2005-05-06 255 | 2607300,3,2005-07-25 256 | 1008986,4,2004-10-25 257 | 94565,4,2004-12-15 258 | 828410,4,2005-01-03 259 | 1805202,4,2005-10-03 260 | 1922925,4,2004-04-03 261 | 1435717,5,2005-08-11 262 | 2277395,4,2004-03-09 263 | 2305014,5,2004-08-31 264 | 166041,4,2005-09-02 265 | 2413320,4,2004-02-06 266 | 87113,2,2005-09-21 267 | 722591,5,2004-03-08 268 | 2291306,1,2004-03-01 269 | 2010770,4,2004-12-30 270 | 255383,5,2005-04-10 271 | 1873429,4,2005-10-27 272 | 1647618,4,2005-08-27 273 | 608234,5,2005-02-25 274 | 42930,3,2005-05-14 275 | 1462072,5,2005-06-22 276 | 685565,5,2005-09-26 277 | 
3321,3,2005-09-27 278 | 2554942,4,2005-10-03 279 | 1874547,4,2004-08-11 280 | 2269844,5,2005-11-02 281 | 34907,3,2005-08-17 282 | 1779903,4,2005-06-04 283 | 2576424,4,2005-08-10 284 | 230112,3,2004-08-26 285 | 508727,3,2004-07-11 286 | 1603525,3,2004-10-13 287 | 172264,4,2005-09-17 288 | 1182185,4,2005-10-06 289 | 2275470,2,2005-10-05 290 | 491531,5,2005-02-17 291 | 1346432,4,2005-11-15 292 | 1554712,5,2004-10-27 293 | 1450941,5,2005-09-13 294 | 1714116,3,2005-08-09 295 | 2016488,4,2004-11-22 296 | 1782762,4,2005-01-19 297 | 1343170,5,2005-02-26 298 | 2565752,4,2004-12-10 299 | 435841,3,2005-09-19 300 | 2242821,5,2005-08-29 301 | 638824,5,2004-05-19 302 | 2256485,1,2004-08-19 303 | 101597,5,2004-10-01 304 | 623036,5,2005-05-16 305 | 1559445,5,2005-06-08 306 | 1723381,5,2005-08-30 307 | 1824586,4,2005-03-03 308 | 2233105,4,2005-09-08 309 | 682963,3,2005-06-21 310 | 2529547,5,2005-11-18 311 | 504620,2,2005-08-12 312 | 1682104,4,2005-08-30 313 | 16272,4,2005-01-20 314 | 2491785,5,2005-05-09 315 | 978412,5,2005-07-02 316 | 2054145,3,2005-07-28 317 | 2444240,3,2005-08-14 318 | 547732,3,2005-06-11 319 | 811790,5,2005-09-02 320 | 31913,4,2004-10-15 321 | 437111,4,2005-06-27 322 | 640588,4,2004-09-06 323 | 2625019,3,2005-09-12 324 | 2605190,5,2005-11-05 325 | 915,5,2005-08-17 326 | 1430587,4,2005-05-18 327 | 2544219,5,2005-11-20 328 | 2603381,5,2005-11-29 329 | 305344,1,2004-02-08 330 | 2569099,1,2005-08-16 331 | 2430356,4,2004-07-15 332 | 885165,4,2005-06-02 333 | 2380806,5,2005-09-06 334 | 1512406,1,2005-10-03 335 | 1774623,4,2005-11-23 336 | 2226525,4,2005-02-08 337 | 2537076,4,2005-10-17 338 | 2060858,4,2005-05-09 339 | 498469,5,2005-03-22 340 | 68033,4,2005-10-04 341 | 1819146,5,2005-08-15 342 | 2088415,4,2005-02-01 343 | 473070,5,2005-04-06 344 | 1823641,5,2004-03-29 345 | 1839976,2,2004-03-31 346 | 14924,5,2005-10-04 347 | 1852606,4,2004-07-23 348 | 453694,5,2004-07-21 349 | 921487,2,2004-07-19 350 | 1022254,5,2004-09-15 351 | 2464081,4,2005-01-22 352 | 
1228324,4,2005-11-12 353 | 1563530,4,2004-08-18 354 | 1181170,3,2004-09-08 355 | 1357013,3,2004-10-02 356 | 21722,4,2005-02-07 357 | 288420,5,2005-06-02 358 | 1739170,5,2005-09-19 359 | 2584676,3,2005-08-06 360 | 2013504,4,2005-08-10 361 | 1245176,4,2004-07-27 362 | 269524,3,2005-03-05 363 | 661344,3,2005-03-16 364 | 652324,3,2004-04-15 365 | 2239213,3,2005-08-24 366 | 863302,4,2004-08-19 367 | 758850,4,2004-09-21 368 | 1884755,2,2004-11-15 369 | 544833,3,2005-09-27 370 | 1562707,1,2005-07-25 371 | 810700,5,2004-08-31 372 | 837756,5,2004-10-26 373 | 155164,4,2004-10-29 374 | 493945,5,2005-04-12 375 | 1565175,5,2004-08-10 376 | 2005193,4,2005-11-17 377 | 1605780,4,2004-09-17 378 | 1294335,2,2004-09-22 379 | 608576,4,2005-03-19 380 | 659505,4,2005-05-16 381 | 1604707,4,2005-10-17 382 | 2630797,5,2005-12-09 383 | 402266,5,2004-10-16 384 | 752642,3,2004-02-24 385 | 1906145,4,2005-07-19 386 | 389872,2,2005-08-09 387 | 1462866,2,2004-10-09 388 | 1952116,4,2005-04-28 389 | 54774,4,2005-05-25 390 | 1776980,5,2005-10-13 391 | 1494196,5,2004-02-29 392 | 253794,5,2004-08-10 393 | 1569513,3,2004-02-26 394 | 596728,2,2004-04-26 395 | 1107588,1,2004-02-22 396 | 1133763,3,2005-05-15 397 | 1398076,4,2004-07-02 398 | 1178171,4,2004-07-01 399 | 984369,3,2005-08-25 400 | 2618594,4,2004-07-27 401 | 1653834,4,2004-08-22 402 | 2322840,3,2005-07-12 403 | 2207647,4,2004-08-12 404 | 1994111,4,2005-01-11 405 | 1824044,4,2004-04-29 406 | 2255037,3,2004-06-01 407 | 2056022,3,2004-11-22 408 | 1458179,4,2005-01-26 409 | 1508350,4,2005-06-27 410 | 1168571,5,2005-09-14 411 | 766489,3,2005-10-01 412 | 1424199,5,2005-08-08 413 | 2054180,3,2004-07-09 414 | 448902,5,2005-07-20 415 | 1547173,3,2005-11-18 416 | 1751103,4,2004-08-05 417 | 121073,5,2004-12-16 418 | 2609436,4,2004-11-09 419 | 1398626,2,2004-12-03 420 | 1311231,3,2004-03-30 421 | 2279000,3,2005-02-18 422 | 236921,5,2005-03-19 423 | 2566259,5,2005-04-06 424 | 758937,4,2005-10-24 425 | 2260684,4,2004-11-08 426 | 1190829,4,2004-02-10 427 | 
136106,3,2005-08-08 428 | 344753,3,2004-07-21 429 | 568930,5,2005-05-02 430 | 206115,4,2005-08-23 431 | 2390644,3,2004-09-06 432 | 2078679,5,2005-01-07 433 | 1682651,4,2005-05-02 434 | 386915,4,2005-05-27 435 | 972136,3,2005-06-20 436 | 1806515,3,2005-09-29 437 | 11589,3,2005-10-19 438 | 2118461,5,2005-10-24 439 | 444411,3,2004-09-05 440 | 691108,4,2005-02-27 441 | 332401,3,2005-04-28 442 | 1278488,4,2005-04-27 443 | 358776,4,2005-11-21 444 | 387418,1,2004-02-08 445 | 872408,4,2005-08-26 446 | 646098,4,2004-07-19 447 | 396595,5,2005-02-13 448 | 1366860,4,2004-01-26 449 | 1046882,3,2004-08-12 450 | 470861,5,2004-06-28 451 | 1455257,4,2004-09-17 452 | 1274780,3,2004-10-11 453 | 379184,4,2005-02-11 454 | 1273630,4,2005-09-08 455 | 492291,3,2005-06-06 456 | 145873,3,2004-02-25 457 | 1388284,5,2004-12-21 458 | 712610,4,2005-04-27 459 | 1116065,1,2005-05-03 460 | 660499,1,2005-07-08 461 | 1918987,4,2005-07-10 462 | 1357894,3,2004-09-09 463 | 190418,3,2004-09-30 464 | 1060658,3,2005-03-23 465 | 1443203,4,2005-05-22 466 | 1772839,5,2005-09-19 467 | 2385774,3,2004-04-07 468 | 1059319,3,2005-10-10 469 | 831775,4,2005-10-15 470 | 881346,5,2005-11-07 471 | 1066317,4,2004-05-07 472 | 13651,3,2004-06-16 473 | 208920,4,2005-01-23 474 | 308753,5,2005-10-31 475 | 2564257,3,2005-11-27 476 | 565041,4,2004-08-23 477 | 1602153,4,2005-06-13 478 | 173930,4,2005-06-30 479 | 202811,3,2005-07-18 480 | 353369,3,2005-08-01 481 | 1201176,4,2005-05-02 482 | 2047577,3,2005-05-27 483 | 685113,4,2005-10-24 484 | 1686060,5,2004-03-21 485 | 151004,5,2004-07-09 486 | 2126192,3,2004-04-16 487 | 1981464,4,2005-08-16 488 | 1862581,4,2004-06-08 489 | 1255780,5,2005-03-28 490 | 1962300,3,2005-06-22 491 | 1515355,3,2004-05-12 492 | 1001779,4,2005-09-09 493 | 2093105,3,2004-06-17 494 | 1123959,3,2005-01-05 495 | 1876297,5,2005-05-19 496 | 1364481,4,2004-05-29 497 | 998236,5,2004-06-23 498 | 328415,3,2004-11-01 499 | 1347129,4,2005-01-24 500 | 1117062,4,2005-07-07 501 | 1033930,3,2005-08-11 502 | 
45117,5,2005-08-15 503 | 1005769,5,2004-09-28 504 | 712609,4,2005-03-20 505 | 740495,4,2005-02-05 506 | 2497991,4,2005-07-07 507 | 1017324,4,2005-04-05 508 | 120491,5,2004-09-13 509 | 1645794,4,2005-04-05 510 | 1658790,3,2005-06-23 511 | 2451020,4,2004-09-14 512 | 1878798,4,2005-01-11 513 | 1790903,4,2005-04-23 514 | 1254683,1,2004-02-13 515 | 874943,5,2005-03-22 516 | 121456,4,2005-04-19 517 | 1140108,4,2005-09-20 518 | 515436,1,2005-02-13 519 | 272689,5,2005-03-03 520 | 1247177,3,2005-12-04 521 | 263240,3,2004-07-07 522 | 2539549,3,2005-02-23 523 | 2565654,5,2004-11-15 524 | 334701,3,2005-02-07 525 | 42921,3,2005-10-04 526 | 2011399,5,2005-08-08 527 | 433945,5,2004-11-06 528 | 2151149,4,2005-01-13 529 | 1415954,2,2005-02-14 530 | 1086360,3,2005-03-10 531 | 2419258,4,2005-08-12 532 | 2380848,5,2005-01-11 533 | 1550216,1,2005-02-07 534 | 596533,5,2005-03-20 535 | 287901,5,2005-05-30 536 | 188613,4,2005-09-15 537 | 1654508,3,2005-04-25 538 | 1313126,5,2005-04-27 539 | 51334,4,2005-05-18 540 | 2374451,4,2005-06-05 541 | 2031093,4,2005-06-30 542 | 548064,5,2005-12-02 543 | 946102,5,2005-02-02 544 | 1790158,4,2005-05-17 545 | 1403184,3,2005-11-12 546 | 1535440,4,2005-08-18 547 | 1426604,4,2005-09-01 548 | 1815755,5,2004-07-20 549 | 2: 550 | 2059652,4,2005-09-05 551 | 1666394,3,2005-04-19 552 | 1759415,4,2005-04-22 553 | 1959936,5,2005-11-21 554 | 998862,4,2004-11-13 555 | 2625420,2,2004-12-06 556 | 573975,3,2005-07-21 557 | 392722,4,2004-12-10 558 | 1401650,4,2005-02-24 559 | 988104,3,2005-05-23 560 | 977632,4,2004-11-12 561 | 2557870,4,2005-03-27 562 | 1793899,5,2005-06-04 563 | 1340535,5,2004-12-12 564 | 1888322,5,2005-01-20 565 | 1283598,3,2004-11-30 566 | 1784150,4,2005-06-17 567 | 2271251,5,2005-08-19 568 | 65932,3,2005-07-19 569 | 1828884,5,2004-12-21 570 | 1878728,4,2005-12-01 571 | 1922778,3,2005-02-07 572 | 1176404,4,2005-02-23 573 | 2265116,3,2005-09-06 574 | 1078701,4,2005-10-12 575 | 1832577,4,2005-09-07 576 | 748922,5,2005-07-05 577 | 1013802,1,2005-05-30 
578 | 1131325,2,2005-11-14 579 | 2244378,4,2005-02-09 580 | 494639,2,2005-09-26 581 | 636262,1,2005-08-23 582 | 1903158,4,2005-03-04 583 | 220427,4,2005-05-05 584 | 2439493,1,2005-02-10 585 | 2225116,4,2005-08-29 586 | 1445632,5,2005-01-30 587 | 2592823,4,2005-02-06 588 | 1288603,5,2005-06-20 589 | 2556926,3,2005-07-09 590 | 1190070,4,2005-11-05 591 | 1312846,3,2005-08-26 592 | 2226229,3,2005-04-24 593 | 1563935,1,2005-08-12 594 | 69809,5,2005-04-08 595 | 1349753,3,2005-03-23 596 | 785768,3,2005-04-19 597 | 426476,5,2005-05-10 598 | 810636,4,2005-04-25 599 | 468713,5,2005-07-18 600 | 222290,4,2005-03-14 601 | 349407,5,2005-01-02 602 | 311232,2,2005-06-28 603 | 2596999,4,2005-10-07 604 | 1025601,5,2005-11-09 605 | 1743759,4,2005-01-30 606 | 2385553,5,2005-05-24 607 | 1374216,1,2005-09-25 608 | 526466,4,2005-03-05 609 | 2648861,3,2005-05-25 610 | 1210631,3,2005-03-24 611 | 2314531,4,2005-07-15 612 | 618272,1,2005-08-14 613 | 2532807,3,2005-01-28 614 | 412535,4,2005-05-23 615 | 1315005,4,2005-06-28 616 | 1358911,5,2005-12-13 617 | 507603,1,2005-07-23 618 | 1507649,5,2005-02-17 619 | 845529,5,2005-04-26 620 | 1479907,5,2005-02-28 621 | 236271,2,2005-06-21 622 | 2422676,3,2005-04-07 623 | 1636093,5,2005-10-20 624 | 995594,5,2005-01-19 625 | 1664010,4,2005-10-13 626 | 2431481,3,2004-11-18 627 | 1980668,5,2004-12-16 628 | 402321,4,2005-04-18 629 | 1344564,3,2005-01-25 630 | 1632603,3,2005-09-09 631 | 2567280,3,2005-03-17 632 | 1623166,3,2005-02-10 633 | 521932,4,2005-01-11 634 | 105086,5,2005-09-20 635 | 2072554,5,2005-05-05 636 | 2231529,3,2005-08-25 637 | 2103439,2,2005-06-05 638 | 261764,1,2005-11-10 639 | 193476,5,2005-03-23 640 | 1576540,4,2005-04-14 641 | 1783594,5,2005-03-07 642 | 503334,4,2005-11-25 643 | 183903,5,2005-01-30 644 | 2606799,1,2005-05-16 645 | 1236127,4,2005-06-07 646 | 2375962,3,2005-07-04 647 | 2212071,3,2005-09-17 648 | 1252841,3,2005-02-06 649 | 247898,5,2005-07-15 650 | 970975,3,2005-09-02 651 | 305344,1,2004-10-16 652 | 1581186,4,2005-02-27 653 
| 1129620,3,2005-09-03 654 | 584750,3,2005-11-27 655 | 11409,5,2005-01-13 656 | 1875495,2,2005-01-12 657 | 1403217,2,2005-08-17 658 | 2147527,1,2005-11-18 659 | 2418486,4,2005-11-08 660 | 1476323,5,2004-11-22 661 | 2345723,4,2004-12-26 662 | 2640085,5,2005-04-26 663 | 1803154,2,2004-12-14 664 | 1251170,2,2005-02-11 665 | 527491,4,2005-07-30 666 | 391517,4,2005-05-16 667 | 1398626,3,2005-11-26 668 | 828919,5,2004-12-18 669 | 196494,5,2005-10-03 670 | 715897,5,2005-10-18 671 | 268917,2,2005-06-03 672 | 41422,4,2005-09-11 673 | 1806515,3,2005-09-23 674 | 2118461,4,2005-12-20 675 | 387418,1,2004-11-19 676 | 2019055,5,2005-05-12 677 | 348960,1,2005-09-07 678 | 1167731,4,2005-06-07 679 | 2468831,5,2005-07-05 680 | 219925,4,2005-04-18 681 | 1025193,5,2005-07-07 682 | 630887,5,2005-01-20 683 | 1461435,1,2005-03-21 684 | 1838586,1,2005-08-17 685 | 1515430,3,2005-01-27 686 | 1807053,5,2005-04-25 687 | 1172326,5,2005-06-18 688 | 1785842,3,2005-06-01 689 | 803752,3,2004-11-17 690 | 1581265,3,2005-04-15 691 | 515436,1,2005-02-13 692 | 1824543,4,2005-04-20 693 | 1283204,3,2004-12-23 694 | 1272122,5,2005-07-25 695 | 3: 696 | 1025579,4,2003-03-29 697 | 712664,5,2004-02-01 698 | 1331154,4,2004-07-03 699 | 2632461,3,2005-07-22 700 | 44937,5,2004-06-22 701 | 656399,4,2003-09-20 702 | 439011,1,2004-01-22 703 | 1436762,3,2003-03-17 704 | 1644750,3,2003-03-19 705 | 2031561,4,2004-03-31 706 | 616720,4,2003-08-10 707 | 2467008,4,2004-03-15 708 | 975874,5,2004-02-09 709 | 701730,2,2005-10-05 710 | 1614320,4,2003-08-11 711 | 115498,3,2003-07-16 712 | 931626,2,2004-07-08 713 | 699878,4,2003-05-02 714 | 1694958,3,2005-08-10 715 | 66414,5,2004-02-21 716 | 2519847,5,2003-03-10 717 | 948069,3,2003-05-09 718 | 67315,4,2003-08-25 719 | 704249,4,2004-08-16 720 | 454417,4,2004-04-18 721 | 1995318,3,2004-07-14 722 | 2158448,5,2005-05-31 723 | 574843,5,2005-06-13 724 | 714960,4,2005-06-21 725 | 620771,2,2005-09-26 726 | 253876,4,2005-11-01 727 | 1632700,4,2005-11-14 728 | 603277,3,2003-06-21 729 | 
79160,4,2005-07-11 730 | 1859725,4,2003-05-30 731 | 283774,5,2003-08-23 732 | 1983667,2,2003-08-08 733 | 2267507,4,2004-05-25 734 | 1813349,4,2003-03-18 735 | 2424721,3,2005-09-04 736 | 1275804,4,2003-08-27 737 | 1204327,4,2004-03-16 738 | 2143489,1,2004-08-23 739 | 672980,5,2004-11-18 740 | 166100,4,2004-12-17 741 | 2537764,5,2005-01-06 742 | 1650301,2,2005-06-17 743 | 553931,4,2005-10-04 744 | 214166,3,2005-10-09 745 | 6689,4,2003-02-20 746 | 109089,5,2003-05-27 747 | 1854303,1,2004-05-29 748 | 525003,5,2003-04-30 749 | 2312349,4,2003-05-13 750 | 188416,3,2004-01-27 751 | 2213550,4,2004-07-01 752 | 24344,4,2004-08-20 753 | 2344483,4,2004-09-28 754 | 531155,4,2005-02-28 755 | 1959707,3,2005-03-19 756 | 2120279,5,2005-05-15 757 | 1977959,4,2003-02-12 758 | 21983,4,2003-04-19 759 | 2173816,1,2003-07-17 760 | 78931,5,2003-07-09 761 | 2145227,4,2003-04-07 762 | 2463079,3,2003-08-26 763 | 1286051,3,2004-01-28 764 | 958104,4,2003-09-03 765 | 489962,3,2003-09-03 766 | 2297863,4,2004-01-13 767 | 958382,4,2004-08-22 768 | 248932,4,2005-04-03 769 | 1756658,3,2005-06-16 770 | 2579794,4,2005-10-26 771 | 1628475,4,2005-11-14 772 | 206809,5,2003-03-05 773 | 1333,4,2004-05-18 774 | 445828,2,2003-10-28 775 | 2079559,5,2004-08-28 776 | 1007809,4,2003-12-16 777 | 1562675,1,2003-03-06 778 | 1477923,4,2003-03-25 779 | 44783,3,2003-11-22 780 | 52540,1,2003-03-22 781 | 2436327,2,2004-07-13 782 | 1830211,4,2004-10-06 783 | 1857979,2,2005-03-08 784 | 1198785,2,2005-06-30 785 | 870391,2,2003-05-19 786 | 2164676,4,2003-03-03 787 | 1281996,4,2003-07-23 788 | 1853885,4,2004-06-18 789 | 2646060,3,2003-09-24 790 | 709342,4,2003-09-10 791 | 1195585,5,2003-08-04 792 | 1319527,5,2005-01-05 793 | 1478381,4,2005-05-26 794 | 1658752,3,2003-03-09 795 | 41371,5,2004-01-21 796 | 1479793,3,2004-01-18 797 | 1406148,3,2004-09-17 798 | 2446687,5,2004-12-14 799 | 968796,3,2005-02-06 800 | 2266857,3,2003-06-13 801 | 1456369,4,2003-07-08 802 | 1078792,4,2005-09-09 803 | 104768,5,2003-11-25 804 | 
372528,3,2004-06-30 805 | 2240742,5,2004-11-05 806 | 1401399,4,2005-04-03 807 | 402377,4,2005-04-10 808 | 51230,4,2005-05-22 809 | 2229289,4,2005-12-30 810 | 2554745,4,2004-04-08 811 | 1710932,3,2004-08-16 812 | 1355097,4,2003-03-26 813 | 1231910,3,2003-12-23 814 | 2599552,4,2003-04-14 815 | 1394444,5,2005-02-17 816 | 1094443,2,2003-09-04 817 | 77266,2,2003-03-06 818 | 153249,4,2003-06-24 819 | 2590630,3,2003-10-10 820 | 2596383,3,2005-11-03 821 | 2601294,4,2004-02-10 822 | 2623268,4,2004-12-14 823 | 1756597,5,2005-04-11 824 | 1673185,3,2005-04-14 825 | 2611525,3,2004-02-05 826 | 2013198,4,2003-03-10 827 | 1704175,4,2004-04-29 828 | 2186436,2,2004-04-21 829 | 2252223,5,2004-06-24 830 | 780282,4,2004-06-06 831 | 203667,5,2003-03-25 832 | 2338873,5,2003-03-18 833 | 479779,4,2004-01-13 834 | 1927897,5,2004-07-27 835 | 719833,4,2003-09-08 836 | 871489,5,2004-09-14 837 | 968765,5,2004-10-11 838 | 1057518,4,2005-01-03 839 | 257517,4,2004-05-17 840 | 2003554,4,2003-04-25 841 | 2203875,4,2003-03-07 842 | 2213289,5,2003-06-06 843 | 2630072,3,2003-12-02 844 | 1142291,4,2005-08-22 845 | 1733188,4,2004-03-24 846 | 1614895,4,2003-09-13 847 | 1947922,2,2004-02-19 848 | 1036823,3,2004-09-20 849 | 786312,3,2004-11-06 850 | 1197233,2,2005-02-17 851 | 1100037,3,2005-04-02 852 | 1130826,3,2005-05-02 853 | 620147,5,2005-05-26 854 | 1479047,4,2005-09-27 855 | 1221390,4,2003-02-17 856 | 2193643,5,2003-06-04 857 | 544496,2,2004-02-02 858 | 357507,3,2003-03-04 859 | 976059,3,2004-06-22 860 | 820624,4,2004-02-09 861 | 924839,5,2004-05-18 862 | 966255,4,2004-07-09 863 | 108052,4,2005-03-22 864 | 375319,4,2005-09-15 865 | 309333,4,2005-10-16 866 | 1599030,5,2003-03-05 867 | 2443370,4,2003-07-07 868 | 871580,3,2003-05-13 869 | 311641,1,2005-08-27 870 | 532382,4,2003-08-21 871 | 2378011,5,2004-01-27 872 | 946970,4,2004-01-23 873 | 175763,4,2004-05-04 874 | 619721,4,2004-10-27 875 | 1248452,3,2005-01-20 876 | 1863843,2,2005-06-29 877 | 1915354,4,2005-10-20 878 | 920625,2,2005-11-14 879 | 
1100170,4,2005-12-18 880 | 1733406,4,2003-03-26 881 | 755319,3,2005-08-14 882 | 1743030,3,2004-04-05 883 | 309567,2,2003-05-07 884 | 2096587,5,2003-06-09 885 | 345869,4,2004-08-08 886 | 1544094,5,2005-02-22 887 | 1022903,4,2005-09-14 888 | 290951,4,2003-02-24 889 | 1737484,4,2004-08-16 890 | 183215,4,2005-08-15 891 | 2065639,4,2004-04-02 892 | 250836,3,2005-09-08 893 | 2126122,4,2005-02-22 894 | 1206452,4,2005-03-21 895 | 2068821,3,2005-05-30 896 | 1924939,4,2005-10-25 897 | 1940163,4,2005-07-14 898 | 1033433,2,2004-06-05 899 | 1213801,3,2003-04-09 900 | 1045221,1,2003-05-13 901 | 253037,3,2005-08-05 902 | 341954,2,2003-12-15 903 | 697945,4,2003-05-28 904 | 1115632,3,2003-11-24 905 | 2103655,3,2003-08-01 906 | 2495200,5,2004-10-05 907 | 714550,4,2005-04-02 908 | 979820,1,2005-06-26 909 | 514312,3,2004-05-19 910 | 2025577,3,2003-08-22 911 | 1589677,4,2003-06-12 912 | 425033,3,2003-06-13 913 | 2352327,4,2004-09-08 914 | 156078,5,2004-08-03 915 | 851855,4,2003-05-20 916 | 2441707,3,2003-05-23 917 | 1278394,4,2003-06-07 918 | 962955,2,2003-10-03 919 | 811218,5,2004-05-11 920 | 636475,3,2005-01-29 921 | 1087412,4,2005-02-20 922 | 410537,4,2005-06-06 923 | 1586499,3,2003-07-28 924 | 917063,4,2003-02-15 925 | 1023101,3,2003-08-04 926 | 2393306,4,2004-08-31 927 | 788774,4,2003-04-25 928 | 2586963,4,2003-06-08 929 | 2368791,3,2003-04-30 930 | 244266,3,2003-12-04 931 | 2622138,1,2004-02-23 932 | 793228,4,2004-05-19 933 | 1283965,4,2005-11-08 934 | 2135038,4,2003-10-20 935 | 722006,4,2005-02-20 936 | 1511683,1,2003-05-30 937 | 1939663,4,2003-07-20 938 | 1763372,3,2003-06-18 939 | 1834472,3,2003-04-08 940 | 209549,5,2004-06-15 941 | 515850,5,2003-09-23 942 | 1455472,2,2004-06-22 943 | 2301782,4,2004-11-08 944 | 770921,5,2004-12-13 945 | 297498,5,2005-04-04 946 | 386510,5,2005-09-16 947 | 2494367,2,2005-10-08 948 | 1607574,4,2005-10-20 949 | 1594095,4,2004-09-08 950 | 1124822,4,2005-08-15 951 | 544022,3,2004-08-19 952 | 1817216,2,2003-06-10 953 | 1931698,2,2004-07-08 954 | 
569099,3,2003-05-19 955 | 1771085,2,2003-07-21 956 | 604949,4,2004-09-19 957 | 213541,3,2004-10-31 958 | 790920,3,2004-12-02 959 | 2554707,4,2005-03-08 960 | 376148,5,2005-04-25 961 | 1689439,5,2004-06-25 962 | 2485566,5,2004-08-09 963 | 2232958,4,2004-03-01 964 | 2267858,3,2003-04-24 965 | 1956967,4,2005-08-13 966 | 2494005,2,2005-08-27 967 | 323148,3,2003-11-29 968 | 2158065,4,2003-10-28 969 | 2370268,4,2003-09-29 970 | 2152838,5,2004-08-01 971 | 1407746,3,2004-08-19 972 | 1666581,4,2005-05-25 973 | 871548,5,2005-03-10 974 | 2385706,4,2003-04-23 975 | 1969676,5,2004-01-26 976 | 1927329,5,2004-01-06 977 | 1938559,5,2004-06-21 978 | 199769,4,2004-01-22 979 | 2576108,1,2003-06-16 980 | 162854,4,2004-10-25 981 | 215406,5,2005-02-18 982 | 2095263,4,2005-05-22 983 | 556045,5,2005-10-18 984 | 817851,4,2004-01-17 985 | 2491399,2,2005-07-24 986 | 1134816,4,2003-07-23 987 | 660454,4,2003-03-24 988 | 581199,4,2003-10-27 989 | 1545189,4,2003-07-07 990 | 1929487,3,2005-09-04 991 | 528384,5,2003-09-26 992 | 2646115,3,2003-10-08 993 | 727242,1,2005-01-04 994 | 883478,1,2005-10-27 995 | 247940,4,2003-08-29 996 | 369761,3,2003-08-19 997 | 1065126,4,2004-03-09 998 | 1101467,4,2003-07-29 999 | 393413,3,2004-04-06 1000 | 478176,4,2004-05-26 1001 | 1369550,3,2004-10-11 1002 | 2428502,4,2005-12-05 1003 | 282525,4,2003-02-21 1004 | 2085230,4,2004-03-15 1005 | 282522,3,2003-06-18 1006 | 2246070,3,2004-01-06 1007 | 532649,3,2004-01-19 1008 | 1053903,2,2004-05-02 1009 | 1521266,3,2004-09-28 1010 | 2303969,4,2004-12-27 1011 | 2580481,2,2005-03-08 1012 | 2551806,5,2003-02-17 1013 | 1749903,4,2003-07-30 1014 | 2549926,5,2003-06-20 1015 | 781779,3,2003-06-17 1016 | 22853,4,2004-02-10 1017 | 1788346,1,2003-05-19 1018 | 1858421,4,2004-02-01 1019 | 354704,3,2003-10-15 1020 | 841137,5,2004-07-19 1021 | 475797,4,2004-05-11 1022 | 1876156,3,2005-01-12 1023 | 769670,4,2005-01-19 1024 | 1272379,1,2005-01-19 -------------------------------------------------------------------------------- /mf.py: 
import numpy as np


class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as ``b + b_u[i] + b_i[j] + P[i] . Q[j]`` where ``b``
    is the global mean rating, ``b_u`` / ``b_i`` are per-user / per-item
    biases and ``P`` / ``Q`` hold the K-dimensional latent factors.
    An entry of 0 in ``R`` means "not rated".
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - R (ndarray)      : user-item rating matrix (0 = missing rating)
        - K (int)          : number of latent dimensions
        - alpha (float)    : learning rate
        - beta (float)     : regularization parameter
        - iterations (int) : number of SGD passes over the known ratings
        """

        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """
        Fit the model and return the training history.

        Returns a list of ``(iteration_index, error)`` tuples, one per pass,
        where ``error`` is the value returned by ``mse()`` after that pass.
        """
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        ratings = np.asarray(self.R)
        rated = ratings != 0
        # The mean of an empty selection is NaN and would poison every
        # prediction, so fall back to 0.0 when there are no ratings at all.
        self.b = ratings[rated].mean() if rated.any() else 0.0

        # Create a list of training samples (user, item, rating).
        # np.nonzero walks the matrix in row-major order, i.e. the same order
        # as a nested "for user / for item" loop, without the Python-level scan.
        rows, cols = np.nonzero(ratings > 0)
        self.samples = [
            (i, j, ratings[i, j])
            for i, j in zip(rows.tolist(), cols.tolist())
        ]

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            training_process.append((i, self.mse()))

        return training_process

    def mse(self):
        """
        Compute the training error over all known (non-zero) ratings.

        NOTE: despite the name, the value returned is the square root of the
        *summed* squared error (it is not divided by the number of ratings).
        The scale is kept as-is because callers record and plot this value.
        """
        ratings = np.asarray(self.R)
        xs, ys = ratings.nonzero()
        residuals = ratings[xs, ys] - self.full_matrix()[xs, ys]
        return np.sqrt(np.sum(residuals ** 2))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, r in self.samples:
            # Compute prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Update user and item latent feature matrices.
            # Both gradients must be evaluated at the values used for the
            # prediction, so keep a copy of P[i] before it is overwritten.
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_old - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])

    def full_matrix(self):
        """
        Compute the full (num_users x num_items) prediction matrix using the
        resultant biases, P and Q
        """
        # b_u varies along rows (users), b_i along columns (items).
        return (
            self.b
            + self.b_u[:, np.newaxis]
            + self.b_i[np.newaxis, :]
            + self.P.dot(self.Q.T)
        )
| \usepackage{tikz} 9 | \usepackage{pgfplots} 10 | \usepgfplotslibrary{dateplot} 11 | \usepackage{wrapfig} 12 | \usepackage{xspace} 13 | \newcommand{\themename}{\textbf{\textsc{metropolis}}\xspace} 14 | 15 | \usepackage{graphicx} 16 | 17 | \usepackage{subcaption} 18 | 19 | \usepackage[export]{adjustbox} 20 | \title{EE5327 : Optimization} 21 | % \date{\today} 22 | \date{} 23 | \author{Harsh Raj - MA17BTECH11003 \newline Aravind Reddy K V - MA17BTECH11010} 24 | \institute{Mathematics and Computing, IIT-Hyderabad} 25 | % \titlegraphic{\hfill\includegraphics[height=1.5cm]{logo.pdf}} 26 | \begin{document} 27 | \maketitle 28 | %begin{frame}{Table of contents} 29 | % \setbeamertemplate{section in toc}[sections numbered] 30 | % \tableofcontents[hideallsubsections] 31 | %\end{frame} 32 | 33 | %\section{Recommender Systems Strategies} 34 | 35 | %\begin{frame}[fragile]{Recommender Systems Strategies} 36 | % \begin{itemize} 37 | % \item Electronic retailers and content providers offer a huge selection of products to meet a variety of special needs and tastes. 38 | % \item Matching consumers with the most appropriate products is key to enhancing user satisfaction and loyalty. 39 | % \item Therefore, more retailers and e-commerce leaders like Amazon and Netflix have become interested in recommender systems, which analyze patterns of %user interest in products to provide personalized recommendations that suit a user's taste. 
40 | % \end{itemize} 41 | %\end{frame} 42 | \section{Recommender Systems Strategies} 43 | \begin{frame}[fragile]{Recommender Systems Strategies} 44 | \begin{center} 45 | \begin{tikzpicture}[sibling distance=10em, 46 | every node/.style = {shape=rectangle, rounded corners, 47 | draw, align=center, 48 | top color=white, bottom color=blue!20}] 49 | \node {Recommender System Strategies} 50 | child { node {Neighbour Method} } 51 | child { node {Latent Factor Methods} }; 52 | \end{tikzpicture} 53 | \end{center} 54 | \\ 55 | %\begin{itemize} 56 | 57 | %\item The \textbf{content filtering} approach creates a profile for each user or product to characterize its nature. 58 | %\item The profiles allow programs to associate users with matching products. 59 | %\item Content-based strategies require gathering external information that might not be available or easy to collect. 60 | %\end{itemize} 61 | %\end{frame} 62 | 63 | %\begin{frame}[fragile]{Collaborative Filtering} 64 | % \textbf{Collaborative filtering} relies only on past user behavior—for example, previous transactions or product ratings—without requiring the creation of explicit profiles. 65 | % \\ 66 | % \vspace{3mm} 67 | % The two primary areas of collaborative filtering are the \textbf{neighborhood methods} and \textbf{latent factor models}. 68 | \begin{center} 69 | \graphicspath{ {./images/} } 70 | \includegraphics [scale=0.2] {3} 71 | \end{center} 72 | \end{frame} 73 | 74 | \begin{frame}[fragile]{Neighbourhood Method} 75 | 76 | This method involves finding $K$-nearest neighbours (\textbf{K-NN} algorithm and its variants).\\ 77 | \vspace{3mm} 78 | \begin{center} 79 | \graphicspath{ {./images/} } 80 | \includegraphics [scale=0.2] {1} 81 | \end{center} 82 | Why is this method very slow and not always accurate? 83 | \newline 84 | - If predicting among all possibilities, requires \textbf{O(n)} iterations for each prediction.
85 | \newline 86 | - Only predicts within a predetermined cluster if the time is reduced to \textbf{O(1)}. 87 | \newline 88 | - Cannot incorporate item/user-based bias. 89 | \end{frame} 90 | 91 | \begin{frame}[fragile]{Latent Factor Models} 92 | \textbf{Latent factor models} try to explain the ratings by characterizing both items and users on factors inferred from the rating patterns. 93 | \begin{center} 94 | \graphicspath{ {./images/} } 95 | \includegraphics [scale=0.2] {2} 96 | \end{center} 97 | User's predicted relative rating for a movie = 98 | \newline \textbf{Dot product} of the movie's and user's location vectors in the \textbf{Latent Space}. 99 | \end{frame} 100 | 101 | %\begin{frame}[fragile]{Matrix Factorization Meth%ods} 102 | % \begin{item%ize} 103 | % \item Some of the most successful realizations of latent factor models are based on \textbf{matrix factorizati%on}. 104 | % \item In its basic form, matrix factorization characterizes both items and users by vectors of factors inferred from item rating patte%rns. 105 | % \end{item%ize} 106 | % Recommender systems rely on different types of \textbf{input data}, which are often placed in a matrix with one dimension representing users and the other %dimension representing items of interes%t.\\ 107 | % \vspace{%3mm} 108 | % The input form could be either \textbf{explicit feedback} or \textbf{implicit feedb%ack} 109 | %\end{frame} 110 | 111 | 112 | \begin{frame}[fragile]{Input Data} 113 | \begin{enumerate} 114 | \item \textbf{\textit{Explicit Feedback}} 115 | \vspace{3mm} 116 | \begin{itemize} 117 | \item Explicit input by users regarding their interest in products. 118 | \vspace{3mm} 119 | \item Comprises a \textbf{Sparse Matrix}, since any single user is likely to have rated only a small percentage of possible items. 120 | \vspace{3mm} 121 | \item \textbf{High confidence} on this data. 
122 | \end{itemize} 123 | \vspace{3mm} 124 | \item \textbf{\textit{Implicit Feedback}} 125 | \vspace{3mm} 126 | \begin{itemize} 127 | \item Observing user behavior, including purchase history, browsing history, search patterns etc. 128 | \vspace{3mm} 129 | \item Denotes the presence or absence of an event, so it is typically represented by a \textbf{Dense Matrix} 130 | \vspace{3mm} 131 | \item \textbf{Low confidence} on this data. 132 | \end{itemize} 133 | \end{enumerate} 134 | \end{frame} 135 | 136 | \begin{frame}[fragile]{Matrix Factorization Model} 137 | Matrix factorization models map both users and items to a joint \textbf{Latent Factor Space} of dimensionality \boldsymbol{f}.\\ 138 | \vspace{3mm} 139 | Each item \boldsymbol{i} is associated with a vector \boldsymbol{q_i} $\in \mathbb{R}^{f} $, quantizing the amount of each attribute present in item {i}. 140 | \newline Each user \boldsymbol{u} is associated with a vector \boldsymbol{p_u} $\in \mathbb{R}^{f}$, quantizing the weightage of each attribute in the user's final decision.\\ 141 | \vspace{3mm} 142 | The resulting dot product, \boldsymbol{q_i^{T} p_u}, captures the user \boldsymbol{u}’s overall interest in the item \boldsymbol{i}.\\ 143 | \vspace{3mm} 144 | This approximates user \boldsymbol{u}'s estimated rating of item \boldsymbol{i}, denoted by \boldsymbol{\hat{r}_{ui}}: 145 | \begin{equation} 146 | \boldsymbol{\hat{r}_{ui}=q_{i}^{T}p_{u}}. 
147 | \end{equation} 148 | \end{frame} 149 | 150 | \begin{frame}{Example} 151 | 152 | For 5 movies and 7 latent attributes, the predicted ratings of a single user are: 153 | \newline \newline 154 | $\begin{bmatrix} 155 | q_{11} & q_{12}& q_{13}& q_{14}& q_{15}& q_{16}& q_{17} \\ 156 | q_{21} & q_{22}& q_{23}& q_{24}& q_{25}& q_{26}& q_{27} \\ 157 | q_{31} & q_{32}& q_{33}& q_{34}& q_{35}& q_{36}& q_{37} \\ 158 | q_{41} & q_{42}& q_{43}& q_{44}& q_{45}& q_{46}& q_{47} \\ 159 | q_{51} & q_{52}& q_{53}& q_{54}& q_{55}& q_{56}& q_{57} \\ 160 | \end{bmatrix} 161 | \begin{bmatrix} 162 | p_{1} \\ 163 | p_{2} \\ 164 | p_{3} \\ 165 | p_{4} \\ 166 | p_{5} \\ 167 | p_{6} \\ 168 | p_{7} \\ 169 | \end{bmatrix} 170 | = 171 | \begin{bmatrix} 172 | \hat{r}_{1} \\ 173 | \hat{r}_{2} \\ 174 | \hat{r}_{3} \\ 175 | \hat{r}_{4} \\ 176 | \hat{r}_{5} \\ 177 | \end{bmatrix}$ 178 | \end{frame} 179 | 180 | 181 | \begin{frame}{Optimization Problem} 182 | %Minimizes the regularized squared error on the set of known ratings 183 | Introduce a regularization parameter $\lambda$ to avoid overfitting. 184 | To learn the factor vectors $\boldsymbol{p_u}$ and $\boldsymbol{q_i}$, the system minimizes the regularized squared error on the set of known ratings: 185 | \begin{center} 186 | {$\min\limits_{q^{\star}, p^{\star}}\sum\limits_{(u,i)\in \kappa}(r_{ui}-q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2})$} 187 | \end{center} 188 | 189 | The constant $\lambda$ controls the extent of regularization by keeping each attribute close to zero. $\lambda$ is determined by cross-validation. 
190 | \end{frame} 191 | 192 | \begin{frame}{Learning Algorithm : SGD} 193 | One option is to use Stochastic Gradient Descent Algorithm, i.e., 194 | $e_{ui} = r_{ui}- {q_i}^T p_u$ 195 | \newline 196 | $q_i \longleftarrow q_i + \gamma(e_{ui}p_u - \lambda q_i)$ 197 | \newline 198 | $p_u \longleftarrow p_u + \gamma(e_{ui}q_i - \lambda p_u)$ 199 | \newline \newline 200 | \textbf{Problems :} 201 | \newline \newline 202 | - Requires $\textbf{O(n)}$ operations for each iteration. 203 | \newline. \hspace{2mm} Feasible only for $\textbf{Sparse Matrix}$. 204 | \newline. \hspace{3mm}$\implies$ Cannot Use $\textbf{Implicit Feedback}$ Data. 205 | \newline 206 | - All operations must be performed in serial order. 207 | \end{frame} 208 | 209 | \begin{frame}{Learning Algorithm : ALS} 210 | \textbf{Alternating Least Squares:} 211 | \newline \newline 212 | As both \boldsymbol{p_u} and \boldsymbol{q_i} are unknown, the objective is not Convex. 213 | \begin{equation} 214 | \sum\limits_{(u,i)\in \kappa}(r_{ui}-q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2}) 215 | \end{equation} 216 | \newline 217 | If we fix one of the unknowns, the optimization problem becomes Quadratic Convex (QCP) and can be solved optimally. 218 | \newline \newline 219 | ALS technique rotates between fixing \boldsymbol{q_i}’s and \boldsymbol{p_u}’s. 220 | \newline When all \boldsymbol{p_u}’s are fixed, the system recomputes the \boldsymbol{q_i}’s by Directly solving a least-squares problem, and vice-versa. 221 | \newline Each step decreases objective function until convergence. 
222 | \end{frame} 223 | 224 | \begin{frame}{Learning Algorithm : ALS} 225 | \textbf{Repeat Until Convergence: } 226 | \begin{center} 227 | {(i) $\min\limits_{q^{\star}}\sum\limits_{(u,i)\in \kappa}(r_{ui}-q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2})$} 228 | \end{center} 229 | \begin{center} 230 | {(ii) $\min\limits_{p^{\star}}\sum\limits_{(u,i)\in \kappa}(r_{ui}-q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2})$} 231 | \end{center} 232 | \textbf{Advantages :} 233 | \newline \newline 234 | - Feasible for $\textbf{Dense Matrix}$. 235 | \newline. \hspace{2mm}$\implies$ Can Use $\textbf{Implicit Feedback}$ Data. 236 | \newline 237 | - All $\boldsymbol{p_u}$ are computed independently of one another (likewise for all $\boldsymbol{q_i}$). 238 | \newline. \hspace{2mm}$\implies$ Parallelization can be done here. 239 | \end{frame} 240 | 241 | \begin{frame}{Adding Biases and Confidence} 242 | 243 | Incorporate \textbf{Bias} in this model - 244 | \newline \newline 245 | (i) $\boldsymbol{\mu}$ : Shifts the Prediction Mean from 0 to $\boldsymbol{\mu}$ 246 | \newline . \hspace{7mm} where $\boldsymbol{\mu}$ = Overall Average Rating 247 | \newline 248 | (ii) $\boldsymbol{b_i}$ : Item Based Bias 249 | \newline . \hspace{7mm} where $\boldsymbol{b_i}$ = Average Rating of Item i $-$ Overall Average Rating 250 | \newline 251 | (iii) $\boldsymbol{b_u}$ : User Based Bias 252 | \newline . \hspace{7mm} where $\boldsymbol{b_u}$ = Average Rating by User u $-$ Overall Average Rating 253 | \newline \newline 254 | 255 | Incorporate \textbf{Confidence} in this model - 256 | \newline \newline 257 | (iv) $\boldsymbol{c_{ui}}$ : Confidence in observing $\boldsymbol{r_{ui}}$ 258 | 259 | %(iv) $\boldsymbol{{\mid N(u) \mid}}^{-0.5}$ $(\sum\limits_{i \in N(u)}\boldsymbol{x_i} )$: Normalized Implicit Feedback 260 | %\vspace{2mm} 261 | %\newline . \hspace{7mm} where \boldsymbol{\mid N(u)\mid}= Items with Implicit Feedback from User u 262 | %\newline . 
\hspace{7mm} and \boldsymbol{x_i} = Implicit Feedback Vector for i $\in \boldsymbol{N(u)}$ 263 | %\newline 264 | \end{frame} 265 | 266 | 267 | \begin{frame}{Final Recommender} 268 | \newline Final Prediction is : \newline 269 | $ \boldsymbol{\hat{r}_{ui}}=c_{ui}(\mu + b_u + b_i + {p_u}^T q_i) $ 270 | \newline \newline 271 | Final form of Recommender: 272 | \begin{center} 273 | {$\min\limits_{q^{\star}, p^{\star},b^{\star}}\sum\limits_{(u,i)}c_{ui}(r_{ui}- \mu -b_u -b_i - q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2} +\Vert b_{u}\Vert^{2} +\Vert b_{i}\Vert^{2} )$ 274 | } 275 | \end{center} 276 | subject to : $c_{ui} \geqslant 0 $ $\forall (u,i) $ 277 | \newline. \hspace{1.53cm} $ \lambda \geqslant 0 $ 278 | 279 | 280 | 281 | \end{frame} 282 | 283 | \begin{frame}{Proof of Convexity} 284 | \textbf{Claim:} For a fixed $p_u$, \newline $\sum\limits_{(u,i)}c_{ui}(r_{ui}- \mu -b_u -b_i - q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2} +\Vert b_{u}\Vert^{2} +\Vert b_{i}\Vert^{2} )$ \newline 285 | is convex in $q_{i}$. 286 | \newline \newline \textbf{Proof:}\newline 287 | i) $(r_{ui}- \mu -b_u -b_i - q_{i}^{T} p_{u})$ is affine in $q_{i}$ 288 | \newline 289 | ii) $(r_{ui}- \mu -b_u -b_i - q_{i}^{T} p_{u})^{2}$ is convex in $q_{i}$ as it is the square of an affine function 290 | \newline 291 | iii) $\lambda(\Vert q_{i}\Vert^{2} )$ is convex in $q_{i}$ because it is a non-negative multiple ($\lambda \geqslant 0$) of a squared norm. 292 | \newline 293 | iv) As a non-negatively weighted ($c_{ui} \geqslant 0$) sum of convex functions is convex, adding ii) and iii), we get, \newline $\sum\limits_{(u,i)}c_{ui}(r_{ui}- \mu -b_u -b_i - q_{i}^{T} p_{u})^{2}+\lambda(\Vert q_{i}\Vert^{2}+\Vert p_{u}\Vert^{2} +\Vert b_{u}\Vert^{2} +\Vert b_{i}\Vert^{2} )$ is convex. 
\newline(Proved) 294 | \end{frame} 295 | 296 | \begin{frame}{Accuracy Improvement} 297 | 298 | Original Netflix system : \newline RMSE = 0.9514 299 | \newline \newline 300 | Plain Matrix Factorization Model : \newline RMSE = 0.9025 301 | \newline \newline 302 | Included User and Item Biases : \newline RMSE = 0.9000 303 | \newline \newline 304 | Included Implicit Feedback and Confidence Parameter : \newline RMSE = 0.8925 305 | 306 | \end{frame} 307 | 308 | \end{document} 309 | -------------------------------------------------------------------------------- /recommender.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 116, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Setup Complete\n", 13 | "\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "#Setting up prerequisites\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "import math\n", 22 | "import re\n", 23 | "import sklearn\n", 24 | "from scipy.sparse import csr_matrix\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns\n", 27 | "from surprise import Reader, Dataset, SVD, evaluate\n", 28 | "sns.set_style(\"darkgrid\")\n", 29 | "\n", 30 | "from cvxpy import *\n", 31 | "from numpy import matrix\n", 32 | "\n", 33 | "print(\"Setup Complete\\n\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 117, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Dataset 1 shape: (1024, 3)\n", 46 | "-Dataset examples-\n", 47 | " Cust_Id Rating Date\n", 48 | "0 1: NaN NaN\n", 49 | "100 2630337 5.0 20050310.0\n", 50 | "200 573434 4.0 20040526.0\n", 51 | "300 638824 5.0 20040519.0\n", 52 | "400 1653834 4.0 20040822.0\n", 53 | "500 1033930 3.0 20050811.0\n", 54 | "600 349407 5.0 20050102.0\n", 55 | "700 656399 4.0 20030920.0\n", 
56 | "800 1456369 4.0 20030708.0\n", 57 | "900 253037 3.0 20050805.0\n", 58 | "1000 1369550 3.0 20041011.0\n", 59 | "float64\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "df1 = pd.read_csv('netflix-prize-data/toy_combined_data.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2])\n", 65 | "df1['Rating'] = df1['Rating'].astype(float)\n", 66 | "df1['Date'] = df1['Date'].astype(str)\n", 67 | "df1['Date'] = df1['Date'].map( lambda s : (s[:4])+(s[5:7])+(s[8:]))\n", 68 | "df1['Date'] = df1['Date'].astype(float)\n", 69 | "print('Dataset 1 shape: {}'.format(df1.shape))\n", 70 | "print('-Dataset examples-')\n", 71 | "print(df1.iloc[::100, :])\n", 72 | "print(df1['Date'].dtype)\n", 73 | "df = df1" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 118, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "See Overview of the Data\n" 86 | ] 87 | }, 88 | { 89 | "data": { 90 | "image/png": "\n", 91 | "text/plain": [ 92 | "
" 93 | ] 94 | }, 95 | "metadata": {}, 96 | "output_type": "display_data" 97 | } 98 | ], 99 | "source": [ 100 | "#Seeing the distribution of ratings given by the users\n", 101 | "print(\"See Overview of the Data\")\n", 102 | "p = df.groupby('Rating')['Rating'].agg(['count'])\n", 103 | "# get movie count\n", 104 | "movie_count = df.isnull().sum()[1]\n", 105 | "# get customer count\n", 106 | "cust_count = df['Cust_Id'].nunique() - movie_count\n", 107 | "# get rating count\n", 108 | "rating_count = df['Cust_Id'].count() - movie_count\n", 109 | "ax = p.plot(kind = 'barh', legend = False, figsize = (15,10))\n", 110 | "plt.title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20)\n", 111 | "plt.axis('off')\n", 112 | "for i in range(1,6):\n", 113 | " ax.text(p.iloc[i-1][0]/4, i-1, 'Rated {}: {:.0f}%'.format(i, p.iloc[i-1][0]*100 / p.sum()[0]), color = 'white', weight = 'bold')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 119, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Movie IDs extracted from the extra rows given\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "#Adding movie IDs to the dataset\n", 131 | "movie_np = []\n", 132 | "movie_id = 0\n", 133 | "for x in range(df.shape[0]):\n", 134 | " if(np.isnan(df.iloc[x]['Rating'])):\n", 135 | " movie_id = movie_id+1\n", 136 | " movie_np = np.append(movie_np,movie_id)\n", 137 | "\n", 138 | "#print(movie_np)\n", 139 | "#print(len(movie_np))\n", 140 | "df['Movie_Id'] = movie_np.astype(int)\n", 141 | "print(\"Movie IDs extracted from the extra rows given\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 120, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "-Dataset examples-\n", 154 | " Cust_Id Rating Date Movie_Id\n", 155 | "1 1488844 
3.0 20050906.0 1\n", 156 | "101 1155747 3.0 20050703.0 1\n", 157 | "201 1141189 4.0 20041215.0 1\n", 158 | "301 2256485 1.0 20040819.0 1\n", 159 | "401 2322840 3.0 20050712.0 1\n", 160 | "501 45117 5.0 20050815.0 1\n", 161 | "602 2596999 4.0 20051007.0 2\n", 162 | "703 1644750 3.0 20030319.0 3\n", 163 | "803 372528 3.0 20040630.0 3\n", 164 | "903 1115632 3.0 20031124.0 3\n", 165 | "1003 2085230 4.0 20040315.0 3\n", 166 | "\n", 167 | "\n", 168 | "These are the final datatypes of the dataset\n", 169 | "Cust_Id int64\n", 170 | "Rating float64\n", 171 | "Date float64\n", 172 | "Movie_Id int64\n", 173 | "dtype: object\n" 174 | ] 175 | }, 176 | { 177 | "name": "stderr", 178 | "output_type": "stream", 179 | "text": [ 180 | "/usr/lib/python3/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n", 181 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 182 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 183 | "\n", 184 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 185 | " This is separate from the ipykernel package so we can avoid doing imports until\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "# remove the extra Movie ID rows\n", 191 | "df = df[pd.notnull(df['Rating'])]\n", 192 | "df['Cust_Id'] = df['Cust_Id'].astype(int)\n", 193 | "print('-Dataset examples-')\n", 194 | "print(df.iloc[::100, :])\n", 195 | "\n", 196 | "\n", 197 | "print(\"\\n\\nThese are the final datatypes of the dataset\")\n", 198 | "print(df.dtypes)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 121, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "(1009, 3)\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "#Creating Data Matrix\n", 216 | "df_matrix=pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id')\n", 217 | "print(df_matrix.shape)" 218 
| ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 122, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "See some Movie ID- Movie Title Mapping : \n", 230 | "\n", 231 | " Year Name\n", 232 | "Movie_Id \n", 233 | "1 2003.0 Dinosaur Planet\n", 234 | "2 2004.0 Isle of Man TT 2004 Review\n", 235 | "3 1997.0 Character\n", 236 | "4 1994.0 Paula Abdul's Get Up & Dance\n", 237 | "5 2004.0 The Rise and Fall of ECW\n", 238 | "6 1997.0 Sick\n", 239 | "7 1992.0 8 Man\n", 240 | "8 2004.0 What the #$*! Do We Know!?\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "#Loading the Movie ID- Movie Title Mapping File\n", 246 | "\n", 247 | "df_title = pd.read_csv('netflix-prize-data/movie_titles.csv', encoding = \"ISO-8859-1\", header = None, names = ['Movie_Id', 'Year', 'Name'])\n", 248 | "df_title.set_index('Movie_Id', inplace = True)\n", 249 | "print(\"See some Movie ID- Movie Title Mapping : \\n\")\n", 250 | "print (df_title.head(8))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 123, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "\n", 263 | "\n", 264 | "Data Cleaning Complete.\n", 265 | " See head of the Data Matrix:\n", 266 | "\n", 267 | "Movie_Id 1 2 3\n", 268 | "Cust_Id \n", 269 | "915 5.0 NaN NaN\n", 270 | "1333 NaN NaN 4.0\n", 271 | "2442 3.0 NaN NaN\n", 272 | "3321 3.0 NaN NaN\n", 273 | "4326 4.0 NaN NaN\n", 274 | "\n", 275 | "Num of movies = 3\n", 276 | "Num of users = 1009\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "print(\"\\n\\nData Cleaning Complete.\\n See head of the Data Matrix:\\n\")\n", 282 | "print(df_matrix.head())\n", 283 | "\n", 284 | "n_movies = movie_count\n", 285 | "n_customers = cust_count\n", 286 | "\n", 287 | "print(\"\\nNum of movies =\", movie_count)\n", 288 | "print(\"Num of users =\", cust_count)" 289 | ] 290 | }, 291 | { 292 | 
"cell_type": "code", 293 | "execution_count": 124, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "Movie_Id 1 2 3\n", 301 | "Cust_Id \n", 302 | "915 5.0 0.0 0.0\n", 303 | "1333 0.0 0.0 4.0\n", 304 | "2442 3.0 0.0 0.0\n", 305 | "3321 3.0 0.0 0.0\n", 306 | "4326 4.0 0.0 0.0\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "#Choosing the number of latent attributes\n", 312 | "n_attr= 100*1000000\n", 313 | "#print(type(n_attr),type(n_movies), type(n_customers))\n", 314 | "Q = Variable((n_attr,n_movies))\n", 315 | "P = Variable((n_attr, n_customers))\n", 316 | "\n", 317 | "\n", 318 | "\n", 319 | "acq_data = df_matrix.fillna(0.0)\n", 320 | "print(acq_data.head())\n", 321 | "\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 125, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "class MF():\n", 331 | "\n", 332 | " def __init__(self, R, K, alpha, beta, iterations):\n", 333 | " \"\"\"\n", 334 | " Perform matrix factorization to predict empty\n", 335 | " entries in a matrix.\n", 336 | "\n", 337 | " Arguments\n", 338 | " - R (ndarray) : user-item rating matrix\n", 339 | " - K (int) : number of latent dimensions\n", 340 | " - alpha (float) : learning rate\n", 341 | " - beta (float) : regularization parameter\n", 342 | " \"\"\"\n", 343 | "\n", 344 | " self.R = R\n", 345 | " self.num_users, self.num_items = R.shape\n", 346 | " self.K = K\n", 347 | " self.alpha = alpha\n", 348 | " self.beta = beta\n", 349 | " self.iterations = iterations\n", 350 | "\n", 351 | " def train(self):\n", 352 | " # Initialize user and item latent feature matrice\n", 353 | " self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))\n", 354 | " self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))\n", 355 | "\n", 356 | " # Initialize the biases\n", 357 | " self.b_u = np.zeros(self.num_users)\n", 358 | " self.b_i = np.zeros(self.num_items)\n", 
359 | " self.b = np.mean(self.R[np.where(self.R != 0)])\n", 360 | "\n", 361 | " # Create a list of training samples\n", 362 | " self.samples = [\n", 363 | " (i, j, self.R[i, j])\n", 364 | " for i in range(self.num_users)\n", 365 | " for j in range(self.num_items)\n", 366 | " if self.R[i, j] > 0\n", 367 | " ]\n", 368 | "\n", 369 | " # Perform stochastic gradient descent for number of iterations\n", 370 | " training_process = []\n", 371 | " for i in range(self.iterations):\n", 372 | " np.random.shuffle(self.samples)\n", 373 | " self.sgd()\n", 374 | " mse = self.mse()\n", 375 | " training_process.append((i, mse))\n", 376 | " #if (i+1) % 100 == 0:\n", 377 | " # print(\"Iteration: %d ; error = %.4f\" % (i+1, mse))\n", 378 | "\n", 379 | " return training_process\n", 380 | "\n", 381 | " def mse(self):\n", 382 | " \"\"\"\n", 383 | " A function to compute the total mean square error\n", 384 | " \"\"\"\n", 385 | " xs, ys = self.R.nonzero()\n", 386 | " predicted = self.full_matrix()\n", 387 | " error = 0\n", 388 | " for x, y in zip(xs, ys):\n", 389 | " error += pow(self.R[x, y] - predicted[x, y], 2)\n", 390 | " return np.sqrt(error)\n", 391 | "\n", 392 | " def sgd(self):\n", 393 | " \"\"\"\n", 394 | " Perform stochastic graident descent\n", 395 | " \"\"\"\n", 396 | " for i, j, r in self.samples:\n", 397 | " # Computer prediction and error\n", 398 | " prediction = self.get_rating(i, j)\n", 399 | " e = (r - prediction)\n", 400 | "\n", 401 | " # Update biases\n", 402 | " self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])\n", 403 | " self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])\n", 404 | "\n", 405 | " # Update user and item latent feature matrices\n", 406 | " self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])\n", 407 | " self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])\n", 408 | "\n", 409 | " def get_rating(self, i, j):\n", 410 | " \"\"\"\n", 411 | " Get the predicted rating of user i and item j\n", 412 | " 
\"\"\"\n", 413 | " prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)\n", 414 | " return prediction\n", 415 | "\n", 416 | " def full_matrix(self):\n", 417 | " \"\"\"\n", 418 | " Computer the full matrix using the resultant biases, P and Q\n", 419 | " \"\"\"\n", 420 | " return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 101, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "Original:\n", 433 | " [[5 3 0 1]\n", 434 | " [4 0 0 1]\n", 435 | " [1 1 0 5]\n", 436 | " [1 0 0 4]\n", 437 | " [0 1 5 4]]\n", 438 | "Test Set:\n", 439 | " [[5 3 0 1]\n", 440 | " [4 0 0 1]\n", 441 | " [1 1 0 5]\n", 442 | " [0 0 0 4]\n", 443 | " [0 1 5 0]]\n", 444 | "MSE= 0.9219544457292888\n", 445 | "\n", 446 | "Training ...\n", 447 | "\n", 448 | "Learnt=\n", 449 | " [[5. 3. 3. 1.]\n", 450 | " [4. 2. 3. 1.]\n", 451 | " [1. 1. 6. 5.]\n", 452 | " [3. 2. 5. 4.]\n", 453 | " [1. 1. 5. 4.]]\n", 454 | "RMSE f= 0.8660254037844386\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "#This cell works on Toy Dataset\n", 460 | "#The next cell is for real data\n", 461 | "R = np.array([\n", 462 | " [5, 3, 0, 1],\n", 463 | " [4, 0, 0, 1],\n", 464 | " [1, 1, 0, 5],\n", 465 | " [1, 0, 0, 4],\n", 466 | " [0, 1, 5, 4],\n", 467 | "])\n", 468 | "\n", 469 | "R1= np.array([\n", 470 | " [5, 3, 0, 1],\n", 471 | " [4, 0, 0, 1],\n", 472 | " [1, 1, 0, 5],\n", 473 | " [1, 0, 0, 4],\n", 474 | " [0, 1, 5, 4],\n", 475 | "])\n", 476 | "\n", 477 | "#Set the number of values to replace. 
For example 20%:\n", 478 | "prop = int(R.size * 0.2)\n", 479 | "\n", 480 | "#Randomly choose indices of the numpy array:\n", 481 | "i = [np.random.choice(range(R.shape[0])) for _ in range(prop)]\n", 482 | "j = [np.random.choice(range(R.shape[1])) for _ in range(prop)]\n", 483 | "\n", 484 | "#Change values with 0\n", 485 | "R[i,j] = 0\n", 486 | "print(\"Original:\\n\",R1)\n", 487 | "print(\"Test Set:\\n\",R)\n", 488 | "R=np.rint(R)\n", 489 | "\n", 490 | "from sklearn.metrics import mean_squared_error\n", 491 | "mse = mean_squared_error(R, R1)\n", 492 | "\n", 493 | "print(\"MSE=\",mse**0.5)\n", 494 | "\n", 495 | "print(\"\\nTraining ...\\n\")\n", 496 | "\n", 497 | "\n", 498 | "mf = MF(R, K=10000, alpha=0.01, beta=0.01, iterations=10000)\n", 499 | "training_process = mf.train()\n", 500 | "L=np.rint(mf.full_matrix())\n", 501 | "\n", 502 | "\n", 503 | "\n", 504 | "print(\"Learnt=\\n\",L)\n", 505 | "msef=0.0\n", 506 | "for i1 in range(len(i)):\n", 507 | " for i2 in range(len(j)):\n", 508 | " if R1.item(i[i1],j[i2])!=0:\n", 509 | " msef = msef + (R1.item((i[i1],j[i2]))-(L).item((i[i1],j[i2])))**2\n", 510 | "msef = (msef/(len(j)*len(i)))\n", 511 | "print(\"RMSE f=\",msef**0.5)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 139, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stdout", 521 | "output_type": "stream", 522 | "text": [ 523 | "Original:\n", 524 | " [[5. 0. 0.]\n", 525 | " [0. 0. 4.]\n", 526 | " [3. 0. 0.]\n", 527 | " ...\n", 528 | " [0. 0. 3.]\n", 529 | " [4. 0. 0.]\n", 530 | " [0. 3. 0.]]\n", 531 | "Test Set:\n", 532 | " [[5. 0. 0.]\n", 533 | " [0. 0. 4.]\n", 534 | " [3. 0. 0.]\n", 535 | " ...\n", 536 | " [0. 0. 3.]\n", 537 | " [4. 0. 0.]\n", 538 | " [0. 3. 
0.]]\n", 539 | "MSE= 0.9522806592149013\n", 540 | "\n", 541 | "Training ...\n", 542 | "\n", 543 | "\n", 544 | "Done\n", 545 | "\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "#This cell works on Real DataSet\n", 551 | "\n", 552 | "R = np.array(acq_data)\n", 553 | "\n", 554 | "R1= np.array(acq_data)\n", 555 | "\n", 556 | "#Set the number of values to replace. For example 20%:\n", 557 | "prop = int(R.size * 0.2)\n", 558 | "\n", 559 | "#Randomly choose indices of the numpy array:\n", 560 | "i = [np.random.choice(range(R.shape[0])) for _ in range(prop)]\n", 561 | "j = [np.random.choice(range(R.shape[1])) for _ in range(prop)]\n", 562 | "\n", 563 | "#Change values with 0\n", 564 | "R[i,j] = 0\n", 565 | "print(\"Original:\\n\",R1)\n", 566 | "print(\"Test Set:\\n\",R)\n", 567 | "R=np.rint(R)\n", 568 | "\n", 569 | "from sklearn.metrics import mean_squared_error\n", 570 | "mse = mean_squared_error(R, R1)\n", 571 | "\n", 572 | "print(\"MSE=\",mse**0.5)\n", 573 | "\n", 574 | "print(\"\\nTraining ...\\n\")\n", 575 | "\n", 576 | "\n", 577 | "mf = MF(R, K=10000, alpha=0.01, beta=0.01, iterations=100)\n", 578 | "training_process = mf.train()\n", 579 | "L=np.rint(mf.full_matrix())\n", 580 | "\n", 581 | "print(\"\\nDone\\n\")" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 138, 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "name": "stdout", 591 | "output_type": "stream", 592 | "text": [ 593 | "Minimizing Error on Training Set:\n", 594 | "\n" 595 | ] 596 | }, 597 | { 598 | "data": { 599 | "image/png": "\n", 600 | "text/plain": [ 601 | "
" 602 | ] 603 | }, 604 | "metadata": {}, 605 | "output_type": "display_data" 606 | } 607 | ], 608 | "source": [ 609 | "x = [x for x, y in training_process]\n", 610 | "y = [y for x, y in training_process]\n", 611 | "x = x[::10]\n", 612 | "y = y[::10]\n", 613 | "plt.figure(figsize=((16,4)))\n", 614 | "plt.plot(x, np.sqrt(y))\n", 615 | "plt.xticks(x, x)\n", 616 | "\n", 617 | "print(\"Minimizing Error on Training Set:\\n\")\n", 618 | "plt.xlabel(\"Iterations\")\n", 619 | "plt.ylabel(\"Root Mean Square Error\")\n", 620 | "plt.grid(axis=\"y\")" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 135, 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "name": "stdout", 630 | "output_type": "stream", 631 | "text": [ 632 | "Learnt=\n", 633 | " [[4.92519455 4.45237587 4.03105201]\n", 634 | " [3.92656424 3.75922101 3.97495875]\n", 635 | " [3.03875459 3.06278695 3.34900645]\n", 636 | " ...\n", 637 | " [3.76684225 3.59991078 3.61421131]\n", 638 | " [3.76735095 3.6001197 3.61418442]\n", 639 | " [3.33974773 3.02411301 3.35034179]]\n", 640 | "\n", 641 | "Rating predictions=\n", 642 | " [[5. 4. 4.]\n", 643 | " [4. 4. 4.]\n", 644 | " [3. 3. 3.]\n", 645 | " ...\n", 646 | " [4. 4. 4.]\n", 647 | " [4. 4. 4.]\n", 648 | " [3. 3. 3.]]\n", 649 | "\n", 650 | "P x Q:\n", 651 | "[[4.92519455 4.45237587 4.03105201]\n", 652 | " [3.92656424 3.75922101 3.97495875]\n", 653 | " [3.03875459 3.06278695 3.34900645]\n", 654 | " ...\n", 655 | " [3.76684225 3.59991078 3.61421131]\n", 656 | " [3.76735095 3.6001197 3.61418442]\n", 657 | " [3.33974773 3.02411301 3.35034179]]\n", 658 | "\n", 659 | "Global bias:\n", 660 | "3.688861985472155\n", 661 | "\n", 662 | "User bias:\n", 663 | "[ 0.42868902 0.16245438 -0.27258023 ... 0. 
0.\n", 664 | " -0.2687577 ]\n", 665 | "\n", 666 | "Item bias:\n", 667 | "[ 0.07818754 -0.08891514 -0.07470079]\n", 668 | "\n", 669 | "Finding Error on test set...\n", 670 | "\n", 671 | "RMSE f= 0.46105629990963165\n" 672 | ] 673 | } 674 | ], 675 | "source": [ 676 | "print(\"Learnt=\\n\",mf.full_matrix())\n", 677 | "print(\"\\nRating predictions=\\n\",L)\n", 678 | "\n", 679 | "print()\n", 680 | "print(\"P x Q:\")\n", 681 | "print(mf.full_matrix())\n", 682 | "print()\n", 683 | "print(\"Global bias:\")\n", 684 | "print(mf.b)\n", 685 | "print()\n", 686 | "print(\"User bias:\")\n", 687 | "print(mf.b_u)\n", 688 | "print()\n", 689 | "print(\"Item bias:\")\n", 690 | "print(mf.b_i)\n", 691 | "\n", 692 | "print(\"\\nFinding Error on test set...\\n\")\n", 693 | "msef=0.0\n", 694 | "for i1 in range(len(i)):\n", 695 | " for i2 in range(len(j)):\n", 696 | " if R1.item(i[i1],j[i2])!=0:\n", 697 | " msef = msef + (R1.item((i[i1],j[i2]))-(L).item((i[i1],j[i2])))**2\n", 698 | "msef = (msef/(len(j)*len(i)))\n", 699 | "print(\"RMSE f=\",msef**0.5)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [] 708 | } 709 | ], 710 | "metadata": { 711 | "kernelspec": { 712 | "display_name": "Python 3", 713 | "language": "python", 714 | "name": "python3" 715 | }, 716 | "language_info": { 717 | "codemirror_mode": { 718 | "name": "ipython", 719 | "version": 3 720 | }, 721 | "file_extension": ".py", 722 | "mimetype": "text/x-python", 723 | "name": "python", 724 | "nbconvert_exporter": "python", 725 | "pygments_lexer": "ipython3", 726 | "version": "3.6.7" 727 | } 728 | }, 729 | "nbformat": 4, 730 | "nbformat_minor": 2 731 | } 732 | -------------------------------------------------------------------------------- /recommender_final.py: -------------------------------------------------------------------------------- 1 | #Setting up prerequisites 2 | #from numba import prange 3 | from mf import MF 4 | import pandas 
as pd 5 | import numpy as np 6 | import math 7 | import re 8 | import sklearn 9 | from scipy.sparse import csr_matrix 10 | import matplotlib.pyplot as plt 11 | import seaborn as sns 12 | from surprise import Reader, Dataset, SVD, evaluate 13 | sns.set_style("darkgrid") 14 | from cvxpy import * 15 | from numpy import matrix 16 | print("Setup Complete\n") 17 | 18 | 19 | print("Select Number of DataPoints to Train on: \n1: 1024 \t2: 10000 \n3: 25000 \t4: 75000 \n5: 100000 \t6: 200000\n\n") 20 | choice = int(input()) 21 | print("\nLoading Data\n") 22 | if (choice==1 or choice==1024): 23 | df1 = pd.read_csv('feasible_data_1024.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 24 | elif (choice==2 or choice==10000): 25 | df1 = pd.read_csv('feasible_data_10000.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 26 | elif (choice==3 or choice==25000): 27 | df1 = pd.read_csv('feasible_data_25000.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 28 | elif (choice==4 or choice==75000): 29 | df1 = pd.read_csv('feasible_data_75000.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 30 | elif (choice==5 or choice==100000): 31 | df1 = pd.read_csv('feasible_data_100000.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 32 | elif (choice==6 or choice==200000): 33 | df1 = pd.read_csv('feasible_data_200000.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 34 | 35 | 36 | df1['Rating'] = df1['Rating'].astype(float) 37 | df1['Date'] = df1['Date'].astype(str) 38 | df1['Date'] = df1['Date'].map( lambda s : (s[:4])+(s[5:7])+(s[8:])) 39 | df1['Date'] = df1['Date'].astype(float) 40 | print('Dataset 1 shape: {}'.format(df1.shape)) 41 | print('-Dataset examples-') 42 | print(df1.iloc[::10000, :]) 43 | #print(df1['Date'].dtype) 44 | df = df1 45 | 46 | 47 | 48 | 49 | #Seeing the distribution of ratings given by the users 50 | 
# ---------------------------------------------------------------------------
# Overview of the data: distribution of the ratings given by the users.
# The raw file interleaves "movie header" rows (Rating is NaN) with the
# rating rows that belong to that movie.
# ---------------------------------------------------------------------------
p = df.groupby('Rating')['Rating'].agg(['count'])
# Movie header rows are exactly the rows without a rating. Select the column
# by label: positional `[]` on a label-indexed Series is deprecated in pandas.
movie_count = df['Rating'].isnull().sum()
# Every movie header row contributes one extra "customer id", so subtract them.
cust_count = df['Cust_Id'].nunique() - movie_count
rating_count = df['Cust_Id'].count() - movie_count
ax = p.plot(kind='barh', legend=False, figsize=(15, 10))
plt.title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20)
plt.axis('off')
# Annotate each bar with its share. Iterate over the rating levels that are
# actually present instead of assuming that all of 1..5 occur.
total_ratings = p['count'].sum()
for bar_pos, (rating, n_rated) in enumerate(p['count'].items()):
    ax.text(n_rated / 4, bar_pos, 'Rated {:.0f}: {:.0f}%'.format(rating, n_rated * 100 / total_ratings), color='white', weight='bold')


# ---------------------------------------------------------------------------
# Adding movie IDs to the dataset.
# Each header row (NaN rating) starts a new movie, so the running count of
# NaN ratings is the movie id of every row. One vectorised cumsum replaces
# the former per-row loop, which re-allocated the id array on every
# iteration (quadratic in the number of rows).
# ---------------------------------------------------------------------------
print("\nExtracting Movie IDs\n")
df['Movie_Id'] = df['Rating'].isna().cumsum().astype(int)
print("Movie IDs extracted from the extra rows given")


# Remove the extra movie header rows. `.copy()` makes the filtered frame
# independent so the column assignment below is not a chained assignment.
print("\nRemoving extra Movie ID rows\n")
df = df[pd.notnull(df['Rating'])].copy()
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df.iloc[::100, :])
print("\n\nThese are the final datatypes of the dataset")
print(df.dtypes)


# Creating the data matrix: one row per customer, one column per movie,
# NaN where a customer has not rated a movie.
df_matrix = pd.pivot_table(df, values='Rating', index='Cust_Id', columns='Movie_Id')
print(df_matrix.shape)


# Loading the Movie ID - Movie Title mapping file.
print("\nLoading the Movie ID- Movie Title Mapping File\n")
import os
title_path = 'netflix-prize-data/movie_titles.csv'
if not os.path.exists(title_path):
    # The repository keeps movie_titles.csv next to the scripts.
    title_path = 'movie_titles.csv'
df_title = pd.read_csv(title_path, encoding="ISO-8859-1", header=None, names=['Movie_Id', 'Year', 'Name'])
df_title.set_index('Movie_Id', inplace=True)
print("See some Movie ID- Movie Title Mapping : \n")
print(df_title.head(8))


print("\n\nData Cleaning Complete.\n See head of the Data Matrix:\n")
print(df_matrix.head())
n_movies = movie_count
n_customers = cust_count
print("\nNum of movies =", movie_count)
print("Num of users =", cust_count)
print()


# Choosing the number of latent attributes.
# NOTE(review): Q and P are cvxpy variables that nothing later in this script
# references; they look like leftovers from an earlier formulation - confirm
# before removing.
n_attr = 100 * 1000000
Q = Variable((n_attr, n_movies))
P = Variable((n_attr, n_customers))
# Unrated cells become 0.0 in the dense matrix.
acq_data = df_matrix.fillna(0.0)
print(acq_data.head())


print("\nRandomly Distributing Test and Train Set by removing 20% values...\n")
# This section works on the real dataset.
# R is the training matrix (cells get hidden from it below); R1 keeps the
# original ratings for evaluation. They must be two independent arrays.
R = np.array(acq_data)
R1 = np.array(acq_data)
#Set the number of values to replace.
# Hold out a share of the rating matrix as the test set, for example 20% of
# all cells:
prop = int(R.size * 0.2)
# Randomly choose (row, column) index pairs. Pairs are drawn with
# replacement, so the number of distinct hidden cells can be below `prop`,
# and many of them are cells that never held a rating.
test_rows = np.random.randint(0, R.shape[0], size=prop)
test_cols = np.random.randint(0, R.shape[1], size=prop)
print("Done\n")
# Hide the chosen cells from the training matrix by writing 0 into them.
R[test_rows, test_cols] = 0
print("Original:\n", R1)
print("Test Set:\n", R)
R = np.rint(R)
from sklearn.metrics import mean_squared_error
# Baseline: error introduced purely by hiding the cells.
mse = mean_squared_error(R, R1)
print("RMSE=", mse**0.5)
print("\nTraining ...\n")
mf = MF(R, K=2, alpha=0.01, beta=0.01, iterations=100)
training_process = mf.train()
# Predicted ratings rounded to the nearest integer.
L = np.rint(mf.full_matrix())
print("\nDone\n")
# training_process is unpacked as (iteration, error) pairs; plot every 10th.
iterations = [step for step, _ in training_process][::10]
errors = [err for _, err in training_process][::10]
# NOTE(review): nothing in this script calls plt.show() or plt.savefig(), so
# the figure is only visible in an interactive/notebook session.
plt.figure(figsize=((16, 4)))
plt.plot(iterations, np.sqrt(errors))
plt.xticks(iterations, iterations)
print("Minimizing Error on Training Set:\n")
plt.xlabel("Iterations")
plt.ylabel("Root Mean Square Error")
plt.grid(axis="y")
print("Learnt=\n", mf.full_matrix())
print("\nRating predictions=\n", L)
print()
print()
print("\nFinding Error on test set...\n")
# Test error = mean squared error over the cells that were hidden from
# training AND originally held a rating (non-zero in R1).
# The previous formula summed over every rated cell (training cells included)
# and divided by len(i) * len(j), i.e. prop squared, which is neither the
# number of test cells nor the number of rated cells.
held_out = np.zeros(R1.shape, dtype=bool)
held_out[test_rows, test_cols] = True
held_out &= R1 != 0
n_test = int(held_out.sum())
if n_test:
    msef = float(np.mean(np.square(R1[held_out] - L[held_out])))
else:
    # No rated cell ended up in the test set: the error is undefined.
    msef = float("nan")
217 | print("RMSE final=",msef**0.5) -------------------------------------------------------------------------------- /recommender_final_toy_dataset.py: -------------------------------------------------------------------------------- 1 | #Setting up prerequisites 2 | from mf import MF 3 | import pandas as pd 4 | import numpy as np 5 | import math 6 | import re 7 | import sklearn 8 | from scipy.sparse import csr_matrix 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | from surprise import Reader, Dataset, SVD, evaluate 12 | sns.set_style("darkgrid") 13 | from cvxpy import * 14 | from numpy import matrix 15 | print("Setup Complete\n") 16 | 17 | 18 | # df1 = pd.read_csv('netflix-prize-data/toy_combined_data.txt', header = None, names = ['Cust_Id', 'Rating', 'Date'], usecols = [0,1,2]) 19 | # df1['Rating'] = df1['Rating'].astype(float) 20 | # df1['Date'] = df1['Date'].astype(str) 21 | # df1['Date'] = df1['Date'].map( lambda s : (s[:4])+(s[5:7])+(s[8:])) 22 | # df1['Date'] = df1['Date'].astype(float) 23 | # print('Dataset 1 shape: {}'.format(df1.shape)) 24 | # print('-Dataset examples-') 25 | # print(df1.iloc[::100, :]) 26 | # print(df1['Date'].dtype) 27 | # df = df1 28 | 29 | 30 | 31 | 32 | 33 | 34 | # #Seeing the distribution of ratings given by the users 35 | # print("See Overview of the Data") 36 | # p = df.groupby('Rating')['Rating'].agg(['count']) 37 | # # get movie count 38 | # movie_count = df.isnull().sum()[1] 39 | # # get customer count 40 | # cust_count = df['Cust_Id'].nunique() - movie_count 41 | # # get rating count 42 | # rating_count = df['Cust_Id'].count() - movie_count 43 | # ax = p.plot(kind = 'barh', legend = False, figsize = (15,10)) 44 | # plt.title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20) 45 | # plt.axis('off') 46 | # for i in range(1,6): 47 | # ax.text(p.iloc[i-1][0]/4, i-1, 'Rated {}: {:.0f}%'.format(i, p.iloc[i-1][0]*100 / p.sum()[0]), color = 
'white', weight = 'bold') 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | # #Adding movie IDs to the dataset 59 | # movie_np = [] 60 | # movie_id = 0 61 | # for x in range(df.shape[0]): 62 | # if(np.isnan(df.iloc[x]['Rating'])): 63 | # movie_id = movie_id+1 64 | # movie_np = np.append(movie_np,movie_id) 65 | # #print(movie_np) 66 | # #print(len(movie_np)) 67 | # df['Movie_Id'] = movie_np.astype(int) 68 | # print("Movie IDs extracted from the extra rows given") 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | # # remove the extra Movie ID rows 78 | # df = df[pd.notnull(df['Rating'])] 79 | # df['Cust_Id'] = df['Cust_Id'].astype(int) 80 | # print('-Dataset examples-') 81 | # print(df.iloc[::100, :]) 82 | # print("\n\nThese are the final datatypes of the dataset") 83 | # print(df.dtypes) 84 | 85 | 86 | 87 | 88 | # #Creating Data Matrix 89 | # df_matrix=pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id') 90 | # print(df_matrix.shape) 91 | 92 | 93 | 94 | # #Loading the Movie ID- Movie Title Mapping File 95 | # df_title = pd.read_csv('netflix-prize-data/movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name']) 96 | # df_title.set_index('Movie_Id', inplace = True) 97 | # print("See some Movie ID- Movie Title Mapping : \n") 98 | # print (df_title.head(8)) 99 | 100 | 101 | 102 | 103 | # print("\n\nData Cleaning Complete.\n See head of the Data Matrix:\n") 104 | # print(df_matrix.head()) 105 | # n_movies = movie_count 106 | # n_customers = cust_count 107 | # print("\nNum of movies =", movie_count) 108 | # print("Num of users =", cust_count) 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | # #Choosing the number of latent attributes 119 | # n_attr= 100*1000000 120 | # #print(type(n_attr),type(n_movies), type(n_customers)) 121 | # Q = Variable((n_attr,n_movies)) 122 | # P = Variable((n_attr, n_customers)) 123 | # acq_data = df_matrix.fillna(0.0) 124 | # print(acq_data.head()) 125 | 126 | 127 | 128 | 129 | 
# Toy dataset: a 5 x 4 user/item rating matrix where 0 means "not rated".
# R is the training matrix (cells get hidden from it below); R1 keeps the
# original ratings for evaluation.
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])
R1 = R.copy()
# Set the number of values to replace. For example 20%:
prop = int(R.size * 0.2)
# Randomly choose (row, column) index pairs of the numpy array. Pairs are
# drawn with replacement and may hit cells that never held a rating.
test_rows = [np.random.choice(range(R.shape[0])) for _ in range(prop)]
test_cols = [np.random.choice(range(R.shape[1])) for _ in range(prop)]
# Hide the chosen cells from the training matrix.
R[test_rows, test_cols] = 0
print("Original:\n", R1)
print("Test Set:\n", R)
R = np.rint(R)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(R, R1)
# The printed value is the square root of the MSE, so label it as RMSE.
print("RMSE=", mse**0.5)
print("\nTraining ...\n")
mf = MF(R, K=10000, alpha=0.01, beta=0.01, iterations=10000)
training_process = mf.train()
# Predicted ratings rounded to the nearest integer.
L = np.rint(mf.full_matrix())
print("Learnt=\n", L)
print("\nFinding Error on test set...\n")
# Test error = mean squared error over the cells that were hidden from
# training AND originally held a rating (non-zero in R1).
# The previous nested loop walked the cross product of all chosen rows and
# all chosen columns, so it scored cells that were never hidden, counted
# cells repeatedly, and divided by len(i) * len(j) instead of the number of
# cells actually scored.
held_out = np.zeros(R1.shape, dtype=bool)
held_out[test_rows, test_cols] = True
held_out &= R1 != 0
n_test = int(held_out.sum())
if n_test:
    msef = float(np.mean(np.square(R1[held_out] - L[held_out])))
else:
    # No rated cell ended up in the test set: the error is undefined.
    msef = float("nan")
print("RMSE f=", msef**0.5)