├── README.md └── CF Recommendation System - Examples.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # JNBforBlogs 2 | Jupyter notebooks used in Blogs 3 | -------------------------------------------------------------------------------- /CF Recommendation System - Examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Examples of Collaborative Filtering based Recommendation Systems**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#make necesarry imports\n", 17 | "import numpy as np\n", 18 | "import pandas as pd\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import sklearn.metrics as metrics\n", 21 | "import numpy as np\n", 22 | "from sklearn.neighbors import NearestNeighbors\n", 23 | "from scipy.spatial.distance import correlation, cosine\n", 24 | "import ipywidgets as widgets\n", 25 | "from IPython.display import display, clear_output\n", 26 | "from sklearn.metrics import pairwise_distances\n", 27 | "from sklearn.metrics import mean_squared_error\n", 28 | "from math import sqrt\n", 29 | "import sys, os\n", 30 | "from contextlib import contextmanager" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "#M is user-item ratings matrix where ratings are integers from 1-10\n", 42 | "M = np.asarray([[3,7,4,9,9,7], \n", 43 | " [7,0,5,3,8,8],\n", 44 | " [7,5,5,0,8,4],\n", 45 | " [5,6,8,5,9,8],\n", 46 | " [5,8,8,8,10,9],\n", 47 | " [7,7,0,4,7,8]])\n", 48 | "M=pd.DataFrame(M)\n", 49 | "\n", 50 | "#declaring k,metric as global which can be changed by the user later\n", 51 | "global k,metric\n", 52 | "k=4\n", 53 | "metric='cosine' #can be changed to 'correlation' for Pearson correlation similaries" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
012345
0374997
1705388
2755084
3568598
45888109
5770478
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " 0 1 2 3 4 5\n", 151 | "0 3 7 4 9 9 7\n", 152 | "1 7 0 5 3 8 8\n", 153 | "2 7 5 5 0 8 4\n", 154 | "3 5 6 8 5 9 8\n", 155 | "4 5 8 8 8 10 9\n", 156 | "5 7 7 0 4 7 8" 157 | ] 158 | }, 159 | "execution_count": 3, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "M" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "**User-based Recommendation Systems**" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 4, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence\n", 184 | "#similarities are obtained by subtracting distances from 1\n", 185 | "cosine_sim = 1-pairwise_distances(M, metric=\"cosine\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/html": [ 196 | "
\n", 197 | "\n", 210 | "\n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | "
012345
01.0000000.7992680.7792270.9346220.9738900.884600
10.7992681.0000000.8747440.9058500.8661460.827036
20.7792270.8747441.0000000.9095130.8654540.853275
30.9346220.9058500.9095131.0000000.9893440.865614
40.9738900.8661460.8654540.9893441.0000000.881640
50.8846000.8270360.8532750.8656140.8816401.000000
\n", 279 | "
" 280 | ], 281 | "text/plain": [ 282 | " 0 1 2 3 4 5\n", 283 | "0 1.000000 0.799268 0.779227 0.934622 0.973890 0.884600\n", 284 | "1 0.799268 1.000000 0.874744 0.905850 0.866146 0.827036\n", 285 | "2 0.779227 0.874744 1.000000 0.909513 0.865454 0.853275\n", 286 | "3 0.934622 0.905850 0.909513 1.000000 0.989344 0.865614\n", 287 | "4 0.973890 0.866146 0.865454 0.989344 1.000000 0.881640\n", 288 | "5 0.884600 0.827036 0.853275 0.865614 0.881640 1.000000" 289 | ] 290 | }, 291 | "execution_count": 5, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "#Cosine similarity matrix\n", 298 | "pd.DataFrame(cosine_sim)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 6, 304 | "metadata": { 305 | "collapsed": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "#get pearson similarities for ratings matrix M\n", 310 | "pearson_sim = 1-pairwise_distances(M, metric=\"correlation\")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 7, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/html": [ 321 | "
\n", 322 | "\n", 335 | "\n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | "
012345
01.000000-0.137446-0.3573980.2081790.7619050.277350
1-0.1374461.0000000.4538970.5159100.1124560.218328
2-0.3573980.4538971.0000000.451378-0.0428880.297373
30.2081790.5159100.4513781.0000000.763325-0.057739
40.7619050.112456-0.0428880.7633251.0000000.039621
50.2773500.2183280.297373-0.0577390.0396211.000000
\n", 404 | "
" 405 | ], 406 | "text/plain": [ 407 | " 0 1 2 3 4 5\n", 408 | "0 1.000000 -0.137446 -0.357398 0.208179 0.761905 0.277350\n", 409 | "1 -0.137446 1.000000 0.453897 0.515910 0.112456 0.218328\n", 410 | "2 -0.357398 0.453897 1.000000 0.451378 -0.042888 0.297373\n", 411 | "3 0.208179 0.515910 0.451378 1.000000 0.763325 -0.057739\n", 412 | "4 0.761905 0.112456 -0.042888 0.763325 1.000000 0.039621\n", 413 | "5 0.277350 0.218328 0.297373 -0.057739 0.039621 1.000000" 414 | ] 415 | }, 416 | "execution_count": 7, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "#Pearson correlation similarity matrix\n", 423 | "pd.DataFrame(pearson_sim)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 8, 429 | "metadata": { 430 | "collapsed": true 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "#This function finds k similar users given the user_id and ratings matrix M\n", 435 | "#Note that the similarities are same as obtained via using pairwise_distances\n", 436 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n", 437 | " similarities=[]\n", 438 | " indices=[]\n", 439 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') \n", 440 | " model_knn.fit(ratings)\n", 441 | "\n", 442 | " distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k)\n", 443 | " similarities = 1-distances.flatten()\n", 444 | " print '{0} most similar users for User {1}:\\n'.format(k-1,user_id)\n", 445 | " for i in range(0, len(indices.flatten())):\n", 446 | " if indices.flatten()[i]+1 == user_id:\n", 447 | " continue;\n", 448 | "\n", 449 | " else:\n", 450 | " print '{0}: User {1}, with similarity of {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i])\n", 451 | " \n", 452 | " return similarities,indices" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 9, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "3 most similar users for User 1:\n", 465 | "\n", 466 | "1: User 5, with similarity of 0.973889935402\n", 467 | "2: User 4, with similarity of 0.934621684178\n", 468 | "3: User 6, with similarity of 0.88460045723\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "similarities,indices = findksimilarusers(1,M, metric='cosine')" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 10, 479 | "metadata": { 480 | "scrolled": true 481 | }, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "3 most similar users for User 1:\n", 488 | "\n", 489 | "1: User 5, with similarity of 0.761904761905\n", 490 | "2: User 6, with similarity of 0.277350098113\n", 491 | "3: User 4, with similarity of 0.208179450927\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "similarities,indices = findksimilarusers(1,M, metric='correlation')" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 11, 502 | "metadata": { 503 | "collapsed": true 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "#This function predicts rating for specified user-item combination based on user-based approach\n", 508 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n", 509 | " prediction=0\n", 510 | " similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity\n", 511 | " mean_rating = ratings.loc[user_id-1,:].mean() #to adjust for zero based indexing\n", 512 | " sum_wt = np.sum(similarities)-1\n", 513 | " product=1\n", 514 | " wtd_sum = 0 \n", 515 | " \n", 516 | " for i in range(0, len(indices.flatten())):\n", 517 | " if indices.flatten()[i]+1 == user_id:\n", 518 | " continue;\n", 519 | " else: \n", 520 | " ratings_diff = ratings.iloc[indices.flatten()[i],item_id-1]-np.mean(ratings.iloc[indices.flatten()[i],:])\n", 521 | " product = ratings_diff * (similarities[i])\n", 522 | " wtd_sum = wtd_sum + product\n", 523 | " \n", 524 | " prediction = int(round(mean_rating + (wtd_sum/sum_wt)))\n", 525 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)\n", 526 | "\n", 527 | " return prediction" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 12, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "3 most similar users for User 3:\n", 540 | "\n", 541 | "1: User 4, with similarity of 0.90951268934\n", 542 | "2: User 2, with similarity of 0.874744414849\n", 543 | "3: User 5, with similarity of 0.86545387815\n", 544 | "\n", 545 | "Predicted rating for user 3 -> item 4: 3\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "predict_userbased(3,4,M);" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "**Item-based Recommendation Systems**" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 13, 563 | "metadata": { 564 | "collapsed": true 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "#This function finds k similar items given the item_id and ratings matrix M\n", 569 | "\n", 570 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n", 571 | " similarities=[]\n", 572 | " indices=[] \n", 573 | " ratings=ratings.T\n", 574 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n", 575 | " model_knn.fit(ratings)\n", 576 | "\n", 577 | " distances, indices = model_knn.kneighbors(ratings.iloc[item_id-1, :].values.reshape(1, -1), n_neighbors = k)\n", 578 | " similarities = 1-distances.flatten()\n", 579 | " print '{0} most similar items for item {1}:\\n'.format(k-1,item_id)\n", 580 | " for i in range(0, len(indices.flatten())):\n", 581 | " if indices.flatten()[i]+1 == item_id:\n", 582 | " continue;\n", 583 | "\n", 584 | " else:\n", 585 | " print '{0}: Item {1} :, with similarity of {2}'.format(i,indices.flatten()[i]+1, similarities.flatten()[i])\n", 586 | "\n", 587 | "\n", 588 | " return similarities,indices" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 14, 594 | "metadata": {}, 595 | "outputs": [ 596 | { 597 | "name": "stdout", 598 | "output_type": "stream", 599 | "text": [ 600 | "3 most similar items for item 3:\n", 601 | "\n", 602 | "1: Item 5 :, with similarity of 0.918336125535\n", 603 | "2: Item 6 :, with similarity of 0.874759773038\n", 604 | "3: Item 1 :, with similarity of 0.810364746222\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "similarities,indices=findksimilaritems(3,M)" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 15, 615 | "metadata": { 616 | "collapsed": true 617 | }, 618 | "outputs": [], 619 | "source": [ 620 | "#This function predicts the rating for specified user-item combination based on item-based approach\n", 621 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n", 622 | " prediction= wtd_sum =0\n", 623 | " similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients\n", 624 | " sum_wt = np.sum(similarities)-1\n", 625 | " product=1\n", 626 | " \n", 627 | " for i in range(0, len(indices.flatten())):\n", 628 | " if indices.flatten()[i]+1 == item_id:\n", 629 | " continue;\n", 630 | " else:\n", 631 | " product = ratings.iloc[user_id-1,indices.flatten()[i]] * (similarities[i])\n", 632 | " wtd_sum = wtd_sum + product \n", 633 | " prediction = int(round(wtd_sum/sum_wt))\n", 634 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n", 635 | "\n", 636 | " return prediction" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 16, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "3 most similar items for item 3:\n", 649 | "\n", 650 | "1: Item 5 :, with similarity of 0.918336125535\n", 651 | "2: Item 6 :, with similarity of 0.874759773038\n", 652 | "3: Item 1 :, with similarity of 0.810364746222\n", 653 | "\n", 654 | "Predicted rating for user 1 -> item 3: 6\n" 655 | ] 656 | } 657 | ], 658 | "source": [ 659 | "prediction = predict_itembased(1,3,M)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 17, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "#This function is used to compute adjusted cosine similarity matrix for items\n", 669 | "def computeAdjCosSim(M):\n", 670 | " sim_matrix = np.zeros((M.shape[1], M.shape[1]))\n", 671 | " M_u = M.mean(axis=1) #means\n", 672 | " \n", 673 | " for i in range(M.shape[1]):\n", 674 | " for j in range(M.shape[1]):\n", 675 | " if i == j:\n", 676 | " \n", 677 | " sim_matrix[i][j] = 1\n", 678 | " else: \n", 679 | " if i\n", 727 | "\n", 740 | "\n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | "
012345
01.0000000.2369080.421263-0.519085-0.1258920.010090
10.2369081.000000-0.8052430.0857410.2372730.520625
20.421263-0.8052431.000000-0.767941-0.230521-0.053640
3-0.5190850.085741-0.7679411.000000-0.299059-0.644550
4-0.1258920.237273-0.230521-0.2990591.0000000.599158
50.0100900.520625-0.053640-0.6445500.5991581.000000
\n", 809 | "" 810 | ], 811 | "text/plain": [ 812 | " 0 1 2 3 4 5\n", 813 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n", 814 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n", 815 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n", 816 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n", 817 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n", 818 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000" 819 | ] 820 | }, 821 | "execution_count": 19, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "adjcos_sim" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 20, 833 | "metadata": { 834 | "collapsed": true 835 | }, 836 | "outputs": [], 837 | "source": [ 838 | "#This function finds k similar items given the item_id and ratings matrix M\n", 839 | "\n", 840 | "def findksimilaritems_adjcos(item_id, ratings, k=k):\n", 841 | " \n", 842 | " sim_matrix = computeAdjCosSim(ratings)\n", 843 | " similarities = sim_matrix[item_id-1].sort_values(ascending=False)[:k].values\n", 844 | " indices = sim_matrix[item_id-1].sort_values(ascending=False)[:k].index\n", 845 | " \n", 846 | " print '{0} most similar items for item {1}:\\n'.format(k-1,item_id)\n", 847 | " for i in range(0, len(indices)):\n", 848 | " if indices[i]+1 == item_id:\n", 849 | " continue;\n", 850 | "\n", 851 | " else:\n", 852 | " print '{0}: Item {1} :, with similarity of {2}'.format(i,indices[i]+1, similarities[i])\n", 853 | " \n", 854 | " return similarities ,indices " 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": 21, 860 | "metadata": {}, 861 | "outputs": [ 862 | { 863 | "name": "stdout", 864 | "output_type": "stream", 865 | "text": [ 866 | "3 most similar items for item 3:\n", 867 | "\n", 868 | "1: Item 1 :, with similarity of 0.421262731871\n", 869 | "2: Item 6 :, with similarity of -0.0536398904889\n", 870 | "3: Item 5 :, with similarity of -0.230521358269\n" 871 | ] 872 | } 873 | ], 874 | "source": [ 875 | "similarities, indices = findksimilaritems_adjcos(3,M)" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 22, 881 | "metadata": { 882 | "collapsed": true 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "#This function predicts the rating for specified user-item combination for adjusted cosine item-based approach\n", 887 | "#As the adjusted cosine similarities range from -1,+1, sometimes the predicted rating can be negative or greater than max value\n", 888 | "#Hack to deal with this: Rating is set to min if prediction is negative, Rating is set to max if prediction is above max\n", 889 | "def predict_itembased_adjcos(user_id, item_id, ratings):\n", 890 | " prediction=0\n", 891 | "\n", 892 | " similarities, indices=findksimilaritems_adjcos(item_id, ratings) #similar users based on correlation coefficients\n", 893 | " sum_wt = np.sum(similarities)-1\n", 894 | "\n", 895 | " product=1\n", 896 | " wtd_sum = 0 \n", 897 | " for i in range(0, len(indices)):\n", 898 | " if indices[i]+1 == item_id:\n", 899 | " continue;\n", 900 | " else:\n", 901 | " product = ratings.iloc[user_id-1,indices[i]] * (similarities[i])\n", 902 | " wtd_sum = wtd_sum + product \n", 903 | " prediction = int(round(wtd_sum/sum_wt))\n", 904 | " if prediction < 0:\n", 905 | " prediction = 1\n", 906 | " elif prediction >10:\n", 907 | " prediction = 10\n", 908 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n", 909 | " \n", 910 | " return prediction" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 23, 916 | "metadata": {}, 917 | "outputs": [ 918 | { 919 | "name": "stdout", 920 | "output_type": "stream", 921 | "text": [ 922 | "3 most similar items for item 4:\n", 923 | "\n", 924 | "1: Item 2 :, with similarity of 0.0857414341149\n", 925 | "2: Item 5 :, with similarity of -0.29905882779\n", 926 | "3: Item 1 :, with similarity of -0.519085268895\n", 927 | "\n", 928 | "Predicted rating for user 3 -> item 4: 8\n" 929 | ] 930 | } 931 | ], 932 | "source": [ 933 | "prediction=predict_itembased_adjcos(3,4,M)" 934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": 24, 939 | "metadata": {}, 940 | "outputs": [ 941 | { 942 | "data": { 943 | "text/html": [ 944 | "
\n", 945 | "\n", 958 | "\n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | "
012345
01.0000000.2369080.421263-0.519085-0.1258920.010090
10.2369081.000000-0.8052430.0857410.2372730.520625
20.421263-0.8052431.000000-0.767941-0.230521-0.053640
3-0.5190850.085741-0.7679411.000000-0.299059-0.644550
4-0.1258920.237273-0.230521-0.2990591.0000000.599158
50.0100900.520625-0.053640-0.6445500.5991581.000000
\n", 1027 | "
" 1028 | ], 1029 | "text/plain": [ 1030 | " 0 1 2 3 4 5\n", 1031 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n", 1032 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n", 1033 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n", 1034 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n", 1035 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n", 1036 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000" 1037 | ] 1038 | }, 1039 | "execution_count": 24, 1040 | "metadata": {}, 1041 | "output_type": "execute_result" 1042 | } 1043 | ], 1044 | "source": [ 1045 | "adjcos_sim" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 25, 1051 | "metadata": { 1052 | "collapsed": true 1053 | }, 1054 | "outputs": [], 1055 | "source": [ 1056 | "#This function utilizes above function to recommend items for selected approach. Recommendations are made if the predicted\n", 1057 | "#rating for an item is greater than or equal to 6, and the items has not been rated already\n", 1058 | "def recommendItem(user_id, item_id, ratings):\n", 1059 | " \n", 1060 | " if user_id<1 or user_id>6 or type(user_id) is not int:\n", 1061 | " print 'Userid does not exist. Enter numbers from 1-6'\n", 1062 | " else: \n", 1063 | " ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)',\n", 1064 | " 'Item-based CF (adjusted cosine)']\n", 1065 | "\n", 1066 | " approach = widgets.Dropdown(options=ids, value=ids[0],\n", 1067 | " description='Select Approach', width='500px')\n", 1068 | " \n", 1069 | " def on_change(change):\n", 1070 | " prediction = 0\n", 1071 | " clear_output(wait=True)\n", 1072 | " if change['type'] == 'change' and change['name'] == 'value': \n", 1073 | " if (approach.value == 'User-based CF (cosine)'):\n", 1074 | " metric = 'cosine'\n", 1075 | " prediction = predict_userbased(user_id, item_id, ratings, metric)\n", 1076 | " elif (approach.value == 'User-based CF (correlation)') : \n", 1077 | " metric = 'correlation' \n", 1078 | " prediction = predict_userbased(user_id, item_id, ratings, metric)\n", 1079 | " elif (approach.value == 'Item-based CF (cosine)'):\n", 1080 | " prediction = predict_itembased(user_id, item_id, ratings)\n", 1081 | " else:\n", 1082 | " prediction = predict_itembased_adjcos(user_id,item_id,ratings)\n", 1083 | "\n", 1084 | " if ratings[item_id-1][user_id-1] != 0: \n", 1085 | " print 'Item already rated'\n", 1086 | " else:\n", 1087 | " if prediction>=6:\n", 1088 | " print '\\nItem recommended'\n", 1089 | " else:\n", 1090 | " print 'Item not recommended'\n", 1091 | "\n", 1092 | " approach.observe(on_change)\n", 1093 | " display(approach)" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": 26, 1099 | "metadata": {}, 1100 | "outputs": [ 1101 | { 1102 | "name": "stdout", 1103 | "output_type": "stream", 1104 | "text": [ 1105 | "Userid does not exist. Enter numbers from 1-6\n" 1106 | ] 1107 | } 1108 | ], 1109 | "source": [ 1110 | "#check for incorrect entries\n", 1111 | "recommendItem(-1,3,M)" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": 27, 1117 | "metadata": {}, 1118 | "outputs": [ 1119 | { 1120 | "name": "stdout", 1121 | "output_type": "stream", 1122 | "text": [ 1123 | "3 most similar users for User 3:\n", 1124 | "\n", 1125 | "1: User 4, with similarity of 0.90951268934\n", 1126 | "2: User 2, with similarity of 0.874744414849\n", 1127 | "3: User 5, with similarity of 0.86545387815\n", 1128 | "\n", 1129 | "Predicted rating for user 3 -> item 4: 3\n", 1130 | "Item not recommended\n" 1131 | ] 1132 | } 1133 | ], 1134 | "source": [ 1135 | "recommendItem(3,4,M)" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 28, 1141 | "metadata": {}, 1142 | "outputs": [ 1143 | { 1144 | "name": "stdout", 1145 | "output_type": "stream", 1146 | "text": [ 1147 | "3 most similar users for User 3:\n", 1148 | "\n", 1149 | "1: User 2, with similarity of 0.453897185842\n", 1150 | "2: User 4, with similarity of 0.451378005098\n", 1151 | "3: User 6, with similarity of 0.297373304825\n", 1152 | "\n", 1153 | "Predicted rating for user 3 -> item 4: 3\n", 1154 | "Item not recommended\n" 1155 | ] 1156 | } 1157 | ], 1158 | "source": [ 1159 | "recommendItem(3,4,M)" 1160 | ] 1161 | }, 1162 | { 1163 | "cell_type": "code", 1164 | "execution_count": 29, 1165 | "metadata": {}, 1166 | "outputs": [ 1167 | { 1168 | "name": "stdout", 1169 | "output_type": "stream", 1170 | "text": [ 1171 | "3 most similar items for item 4:\n", 1172 | "\n", 1173 | "1: Item 6 :, with similarity of 0.89977997614\n", 1174 | "2: Item 2 :, with similarity of 0.887160079571\n", 1175 | "3: Item 5 :, with similarity of 0.88180009273\n", 1176 | "\n", 1177 | "Predicted rating for user 3 -> item 4: 6\n", 1178 | "\n", 1179 | "Item recommended\n" 1180 | ] 1181 | } 1182 | ], 1183 | "source": [ 1184 | "recommendItem(3,4,M)" 1185 | ] 1186 | }, 1187 | { 1188 | "cell_type": "code", 1189 | "execution_count": 30, 1190 | "metadata": {}, 1191 | "outputs": [ 1192 | { 1193 | "name": "stdout", 1194 | "output_type": "stream", 1195 | "text": [ 1196 | "3 most similar items for item 4:\n", 1197 | "\n", 1198 | "1: Item 2 :, with similarity of 0.0857414341149\n", 1199 | "2: Item 5 :, with similarity of -0.29905882779\n", 1200 | "3: Item 1 :, with similarity of -0.519085268895\n", 1201 | "\n", 1202 | "Predicted rating for user 3 -> item 4: 8\n", 1203 | "\n", 1204 | "Item recommended\n" 1205 | ] 1206 | } 1207 | ], 1208 | "source": [ 1209 | "recommendItem(3,4,M)" 1210 | ] 1211 | }, 1212 | { 1213 | "cell_type": "code", 1214 | "execution_count": 31, 1215 | "metadata": {}, 1216 | "outputs": [ 1217 | { 1218 | "name": "stdout", 1219 | "output_type": "stream", 1220 | "text": [ 1221 | "3 most similar users for User 2:\n", 1222 | "\n", 1223 | "1: User 4, with similarity of 0.515910067398\n", 1224 | "2: User 3, with similarity of 0.453897185842\n", 1225 | "3: User 6, with similarity of 0.218327934565\n", 1226 | "\n", 1227 | "Predicted rating for user 2 -> item 1: 5\n", 1228 | "Item already rated\n" 1229 | ] 1230 | } 1231 | ], 1232 | "source": [ 1233 | "#if the item is already rated, it is not recommended\n", 1234 | "recommendItem(2,1,M)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": 35, 1240 | "metadata": {}, 1241 | "outputs": [], 1242 | "source": [ 1243 | "#This is a quick way to temporarily suppress stdout in particular code section\n", 1244 | "@contextmanager\n", 1245 | "def suppress_stdout():\n", 1246 | " with open(os.devnull, \"w\") as devnull:\n", 1247 | " old_stdout = sys.stdout\n", 1248 | " sys.stdout = devnull\n", 1249 | " try: \n", 1250 | " yield\n", 1251 | " finally:\n", 1252 | " sys.stdout = old_stdout" 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": 45, 1258 | "metadata": {}, 1259 | "outputs": [], 1260 | "source": [ 1261 | "#This is final function to evaluate the performance of selected recommendation approach and the metric used here is RMSE\n", 1262 | "#suppress_stdout function is used to suppress the print outputs of all the functions inside this function. It will only print \n", 1263 | "#RMSE values\n", 1264 | "def evaluateRS(ratings):\n", 1265 | " ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']\n", 1266 | " approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')\n", 1267 | " n_users = ratings.shape[0]\n", 1268 | " n_items = ratings.shape[1]\n", 1269 | " prediction = np.zeros((n_users, n_items))\n", 1270 | " prediction= pd.DataFrame(prediction)\n", 1271 | " def on_change(change):\n", 1272 | " clear_output(wait=True)\n", 1273 | " with suppress_stdout():\n", 1274 | " if change['type'] == 'change' and change['name'] == 'value': \n", 1275 | " if (approach.value == 'User-based CF (cosine)'):\n", 1276 | " metric = 'cosine'\n", 1277 | " for i in range(n_users):\n", 1278 | " for j in range(n_items):\n", 1279 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)\n", 1280 | " elif (approach.value == 'User-based CF (correlation)') : \n", 1281 | " metric = 'correlation' \n", 1282 | " for i in range(n_users):\n", 1283 | " for j in range(n_items):\n", 1284 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)\n", 1285 | " elif (approach.value == 'Item-based CF (cosine)'):\n", 1286 | " for i in range(n_users):\n", 1287 | " for j in range(n_items):\n", 1288 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings)\n", 1289 | " else:\n", 1290 | " for i in range(n_users):\n", 1291 | " for j in range(n_items):\n", 1292 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings)\n", 1293 | " \n", 1294 | " MSE = mean_squared_error(prediction, ratings)\n", 1295 | " RMSE = round(sqrt(MSE),3)\n", 1296 | " print \"RMSE using {0} approach is: {1}\".format(approach.value,RMSE)\n", 1297 | " \n", 1298 | " approach.observe(on_change)\n", 1299 | " display(approach)" 1300 | ] 1301 | }, 1302 | { 1303 | "cell_type": "code", 1304 | "execution_count": 46, 1305 | "metadata": {}, 1306 | "outputs": [ 1307 | { 1308 | "name": "stdout", 1309 | "output_type": "stream", 1310 | "text": [ 1311 | "RMSE using User-based CF (cosine) approach is: 2.667\n" 1312 | ] 1313 | } 1314 | ], 1315 | "source": [ 1316 | "evaluateRS(M)" 1317 | ] 1318 | }, 1319 | { 1320 | "cell_type": "code", 1321 | "execution_count": 47, 1322 | "metadata": {}, 1323 | "outputs": [ 1324 | { 1325 | "name": "stdout", 1326 | "output_type": "stream", 1327 | "text": [ 1328 | "RMSE using User-based CF (correlation) approach is: 2.764\n" 1329 | ] 1330 | } 1331 | ], 1332 | "source": [ 1333 | "evaluateRS(M)" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "markdown", 1338 | "metadata": {}, 1339 | "source": [ 1340 | "**Thanks for reading this notebook**" 1341 | ] 1342 | } 1343 | ], 1344 | "metadata": { 1345 | "kernelspec": { 1346 | "display_name": "Python 2", 1347 | "language": "python", 1348 | "name": "python2" 1349 | }, 1350 | "language_info": { 1351 | "codemirror_mode": { 1352 | "name": "ipython", 1353 | "version": 2 1354 | }, 1355 | "file_extension": ".py", 1356 | "mimetype": "text/x-python", 1357 | "name": "python", 1358 | "nbconvert_exporter": "python", 1359 | "pygments_lexer": "ipython2", 1360 | "version": "2.7.13" 1361 | } 1362 | }, 1363 | "nbformat": 4, 1364 | "nbformat_minor": 2 1365 | } 1366 | --------------------------------------------------------------------------------