├── README.md ├── CF Recommendation System-Examples.ipynb └── Book Recommendation System.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # JupyterNotebooks-Medium -------------------------------------------------------------------------------- /CF Recommendation System-Examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Examples of Collaborative Filtering based Recommendation Systems**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "#make necesarry imports\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import sklearn.metrics as metrics\n", 23 | "import numpy as np\n", 24 | "from sklearn.neighbors import NearestNeighbors\n", 25 | "from scipy.spatial.distance import correlation, cosine\n", 26 | "import ipywidgets as widgets\n", 27 | "from IPython.display import display, clear_output\n", 28 | "from sklearn.metrics import pairwise_distances\n", 29 | "from sklearn.metrics import mean_squared_error\n", 30 | "from math import sqrt\n", 31 | "import sys, os\n", 32 | "from contextlib import contextmanager" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "#M is user-item ratings matrix where ratings are integers from 1-10\n", 44 | "M = np.asarray([[3,7,4,9,9,7], \n", 45 | " [7,0,5,3,8,8],\n", 46 | " [7,5,5,0,8,4],\n", 47 | " [5,6,8,5,9,8],\n", 48 | " [5,8,8,8,10,9],\n", 49 | " [7,7,0,4,7,8]])\n", 50 | "M=pd.DataFrame(M)\n", 51 | "\n", 52 | "#declaring k,metric as global which can be changed by the user later\n", 53 | "global k,metric\n", 54 | "k=4\n", 55 | "metric='cosine' #can be 
changed to 'correlation' for Pearson correlation similarities" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "

\n", 67 | "\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "

	0	1	2	3	4	5
0	3	7	4	9	9	7
1	7	0	5	3	8	8
2	7	5	5	0	8	4
3	5	6	8	5	9	8
4	5	8	8	8	10	9
5	7	7	0	4	7	8

\n", 149 | "

" 150 | ], 151 | "text/plain": [ 152 | " 0 1 2 3 4 5\n", 153 | "0 3 7 4 9 9 7\n", 154 | "1 7 0 5 3 8 8\n", 155 | "2 7 5 5 0 8 4\n", 156 | "3 5 6 8 5 9 8\n", 157 | "4 5 8 8 8 10 9\n", 158 | "5 7 7 0 4 7 8" 159 | ] 160 | }, 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "M" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "**User-based Recommendation Systems**" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence\n", 186 | "#similarities are obtained by subtracting distances from 1\n", 187 | "cosine_sim = 1-pairwise_distances(M, metric=\"cosine\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/html": [ 198 | "

\n", 199 | "\n", 212 | "\n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | "

	0	1	2	3	4	5
0	1.000000	0.799268	0.779227	0.934622	0.973890	0.884600
1	0.799268	1.000000	0.874744	0.905850	0.866146	0.827036
2	0.779227	0.874744	1.000000	0.909513	0.865454	0.853275
3	0.934622	0.905850	0.909513	1.000000	0.989344	0.865614
4	0.973890	0.866146	0.865454	0.989344	1.000000	0.881640
5	0.884600	0.827036	0.853275	0.865614	0.881640	1.000000

\n", 281 | "

" 282 | ], 283 | "text/plain": [ 284 | " 0 1 2 3 4 5\n", 285 | "0 1.000000 0.799268 0.779227 0.934622 0.973890 0.884600\n", 286 | "1 0.799268 1.000000 0.874744 0.905850 0.866146 0.827036\n", 287 | "2 0.779227 0.874744 1.000000 0.909513 0.865454 0.853275\n", 288 | "3 0.934622 0.905850 0.909513 1.000000 0.989344 0.865614\n", 289 | "4 0.973890 0.866146 0.865454 0.989344 1.000000 0.881640\n", 290 | "5 0.884600 0.827036 0.853275 0.865614 0.881640 1.000000" 291 | ] 292 | }, 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "#Cosine similarity matrix\n", 300 | "pd.DataFrame(cosine_sim)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 7, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "#get pearson similarities for ratings matrix M\n", 312 | "pearson_sim = 1-pairwise_distances(M, metric=\"correlation\")" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "

\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "

	0	1	2	3	4	5
0	1.000000	-0.137446	-0.357398	0.208179	0.761905	0.277350
1	-0.137446	1.000000	0.453897	0.515910	0.112456	0.218328
2	-0.357398	0.453897	1.000000	0.451378	-0.042888	0.297373
3	0.208179	0.515910	0.451378	1.000000	0.763325	-0.057739
4	0.761905	0.112456	-0.042888	0.763325	1.000000	0.039621
5	0.277350	0.218328	0.297373	-0.057739	0.039621	1.000000

\n", 406 | "

" 407 | ], 408 | "text/plain": [ 409 | " 0 1 2 3 4 5\n", 410 | "0 1.000000 -0.137446 -0.357398 0.208179 0.761905 0.277350\n", 411 | "1 -0.137446 1.000000 0.453897 0.515910 0.112456 0.218328\n", 412 | "2 -0.357398 0.453897 1.000000 0.451378 -0.042888 0.297373\n", 413 | "3 0.208179 0.515910 0.451378 1.000000 0.763325 -0.057739\n", 414 | "4 0.761905 0.112456 -0.042888 0.763325 1.000000 0.039621\n", 415 | "5 0.277350 0.218328 0.297373 -0.057739 0.039621 1.000000" 416 | ] 417 | }, 418 | "execution_count": 8, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "#Pearson correlation similarity matrix\n", 425 | "pd.DataFrame(pearson_sim)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 9, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "#This function finds k similar users given the user_id and ratings matrix M\n", 435 | "#Note that the similarities are same as obtained via using pairwise_distances\n", 436 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n", 437 | " similarities=[]\n", 438 | " indices=[]\n", 439 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') \n", 440 | " model_knn.fit(ratings)\n", 441 | "\n", 442 | " distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)\n", 443 | " similarities = 1-distances.flatten()\n", 444 | " print '{0} most similar users for User {1}:\\n'.format(k,user_id)\n", 445 | " for i in range(0, len(indices.flatten())):\n", 446 | " if indices.flatten()[i]+1 == user_id:\n", 447 | " continue;\n", 448 | "\n", 449 | " else:\n", 450 | " print '{0}: User {1}, with similarity of {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i])\n", 451 | " \n", 452 | " return similarities,indices" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 10, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | 
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating of ``item_id`` by ``user_id`` with user-based CF.

    The prediction is the user's mean rating plus the similarity-weighted
    average of the neighbours' mean-centred ratings for the item.

    Parameters
    ----------
    user_id, item_id : int
        1-based user and item numbers.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows are users, columns are items).
    metric : str
        Distance metric forwarded to ``findksimilarusers``.
    k : int
        Number of neighbours forwarded to ``findksimilarusers``.

    Returns
    -------
    int
        The rounded predicted rating.
    """
    similarities, indices = findksimilarusers(user_id, ratings, metric, k)
    neighbour_rows = indices.flatten()
    # positional access, consistent with every other lookup in this notebook
    mean_rating = ratings.iloc[user_id - 1, :].mean()

    wtd_sum = 0.0
    sum_wt = 0.0
    for i in range(len(neighbour_rows)):
        if neighbour_rows[i] + 1 == user_id:
            continue
        neighbour = ratings.iloc[neighbour_rows[i], :]
        ratings_diff = neighbour.iloc[item_id - 1] - neighbour.mean()
        wtd_sum += ratings_diff * similarities[i]
        # Accumulate the weights of the neighbours actually used instead of
        # assuming the query user is present with similarity exactly 1.
        sum_wt += similarities[i]

    if sum_wt == 0:
        # no usable neighbour weight: fall back to the user's own mean
        prediction = int(round(mean_rating))
    else:
        prediction = int(round(mean_rating + (wtd_sum / sum_wt)))
    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id, item_id, prediction))

    return prediction


def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Find and print the k items most similar to ``item_id``.

    Same procedure as the user variant, applied to the transposed ratings
    matrix so that every row describes one item.

    Parameters
    ----------
    item_id : int
        1-based item number (column ``item_id - 1`` of ``ratings``).
    ratings : pandas.DataFrame
        User-item ratings matrix (rows are users, columns are items).
    metric : str
        Distance metric handed to ``NearestNeighbors``.
    k : int
        Number of neighbours to report.

    Returns
    -------
    similarities : numpy.ndarray
        Flat array of ``1 - distance`` values, nearest first.
    indices : numpy.ndarray
        0-based item positions as returned by ``kneighbors``.
    """
    item_ratings = ratings.T
    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(item_ratings)

    # k+1 because the query item is returned as its own nearest neighbour;
    # clamp so a large k does not exceed the number of items.
    n_neighbors = min(k + 1, item_ratings.shape[0])
    query = item_ratings.iloc[item_id - 1, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)
    similarities = 1 - distances.flatten()
    neighbour_cols = indices.flatten()

    print('{0} most similar items for item {1}:\n'.format(k, item_id))
    for i in range(len(neighbour_cols)):
        if neighbour_cols[i] + 1 == item_id:
            continue
        print('{0}: Item {1} :, with similarity of {2}'.format(
            i, neighbour_cols[i] + 1, similarities[i]))

    return similarities, indices


def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating of ``item_id`` by ``user_id`` with item-based CF.

    The prediction is the similarity-weighted average of the ratings the
    user gave to the items most similar to ``item_id``.

    Parameters
    ----------
    user_id, item_id : int
        1-based user and item numbers.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows are users, columns are items).
    metric : str
        Distance metric forwarded to ``findksimilaritems``.
    k : int
        Number of neighbours forwarded to ``findksimilaritems``.

    Returns
    -------
    int
        The rounded predicted rating.
    """
    # metric and k are forwarded; previously they were accepted but ignored
    similarities, indices = findksimilaritems(item_id, ratings, metric, k)
    neighbour_cols = indices.flatten()

    wtd_sum = 0.0
    sum_wt = 0.0
    for i in range(len(neighbour_cols)):
        if neighbour_cols[i] + 1 == item_id:
            continue
        wtd_sum += ratings.iloc[user_id - 1, neighbour_cols[i]] * similarities[i]
        sum_wt += similarities[i]

    if sum_wt == 0:
        # no usable neighbour weight: fall back to the user's mean rating
        prediction = int(round(ratings.iloc[user_id - 1, :].mean()))
    else:
        prediction = int(round(wtd_sum / sum_wt))
    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id, item_id, prediction))

    return prediction
644 | "cell_type": "code", 645 | "execution_count": 17, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "4 most similar items for item 3:\n", 653 | "\n", 654 | "1: Item 5 :, with similarity of 0.918336125535\n", 655 | "2: Item 6 :, with similarity of 0.874759773038\n", 656 | "3: Item 1 :, with similarity of 0.810364746222\n", 657 | "4: Item 4 :, with similarity of 0.796917800302\n", 658 | "\n", 659 | "Predicted rating for user 1 -> item 3: 7\n" 660 | ] 661 | } 662 | ], 663 | "source": [ 664 | "prediction = predict_itembased(1,3,M)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 18, 670 | "metadata": { 671 | "collapsed": true 672 | }, 673 | "outputs": [], 674 | "source": [ 675 | "#This function is used to compute adjusted cosine similarity matrix for items\n", 676 | "def computeAdjCosSim(M):\n", 677 | " sim_matrix = np.zeros((M.shape[1], M.shape[1]))\n", 678 | " M_u = M.mean(axis=1) #means\n", 679 | " \n", 680 | " for i in range(M.shape[1]):\n", 681 | " for j in range(M.shape[1]):\n", 682 | " if i == j:\n", 683 | " \n", 684 | " sim_matrix[i][j] = 1\n", 685 | " else: \n", 686 | " if i\n", 736 | "\n", 749 | "\n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " 
\n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | "

	0	1	2	3	4	5
0	1.000000	0.236908	0.421263	-0.519085	-0.125892	0.010090
1	0.236908	1.000000	-0.805243	0.085741	0.237273	0.520625
2	0.421263	-0.805243	1.000000	-0.767941	-0.230521	-0.053640
3	-0.519085	0.085741	-0.767941	1.000000	-0.299059	-0.644550
4	-0.125892	0.237273	-0.230521	-0.299059	1.000000	0.599158
5	0.010090	0.520625	-0.053640	-0.644550	0.599158	1.000000

def findksimilaritems_adjcos(item_id, ratings, k=k):
    """Find and print the k items most similar to ``item_id`` (adjusted cosine).

    Parameters
    ----------
    item_id : int
        1-based item number.
    ratings : pandas.DataFrame
        User-item ratings matrix passed to ``computeAdjCosSim``.
    k : int
        Number of neighbours to report.

    Returns
    -------
    similarities : numpy.ndarray
        The k+1 largest similarities of the item's column, descending.
    indices : pandas.Index
        Matching 0-based item labels, in the same order.
    """
    # presumably an item x item DataFrame with a default integer index;
    # verify against computeAdjCosSim
    sim_matrix = computeAdjCosSim(ratings)

    # Sort once and derive values and labels from the same slice.
    # k+1 entries are kept because the item's own similarity is in the column.
    top = sim_matrix[item_id - 1].sort_values(ascending=False).iloc[:k + 1]
    similarities = top.values
    indices = top.index

    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print('{0} most similar items for item {1}:\n'.format(k, item_id))
    for i in range(len(indices)):
        if indices[i] + 1 == item_id:
            continue
        print('{0}: Item {1} :, with similarity of {2}'.format(i, indices[i] + 1, similarities[i]))

    return similarities, indices
def predict_itembased_adjcos(user_id, item_id, ratings):
    """Predict a rating with item-based CF using adjusted cosine similarity.

    Adjusted cosine similarities range over [-1, +1], so the weighted
    average can fall outside the rating scale. The result is therefore
    clamped to the 1-10 scale used by this notebook.

    Parameters
    ----------
    user_id, item_id : int
        1-based user and item numbers.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows are users, columns are items).

    Returns
    -------
    int
        The rounded predicted rating, clamped to the range 1..10.
    """
    similarities, indices = findksimilaritems_adjcos(item_id, ratings)

    wtd_sum = 0.0
    sum_wt = 0.0
    for i in range(len(indices)):
        if indices[i] + 1 == item_id:
            continue
        wtd_sum += ratings.iloc[user_id - 1, indices[i]] * similarities[i]
        # sum the weights of the neighbours actually used
        sum_wt += similarities[i]

    if sum_wt == 0:
        # weights cancel out: fall back to the user's mean rating
        prediction = int(round(ratings.iloc[user_id - 1, :].mean()))
    else:
        prediction = int(round(wtd_sum / sum_wt))

    # Clamp to the valid scale. The lower test is "< 1" (not "< 0") so that a
    # prediction of 0, which is not a valid rating, is raised to 1 as well.
    if prediction < 1:
        prediction = 1
    elif prediction > 10:
        prediction = 10
    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id, item_id, prediction))

    return prediction
similarity of -0.29905882779\n", 936 | "3: Item 1 :, with similarity of -0.519085268895\n", 937 | "4: Item 6 :, with similarity of -0.644550286954\n", 938 | "\n", 939 | "Predicted rating for user 3 -> item 4: 6\n" 940 | ] 941 | } 942 | ], 943 | "source": [ 944 | "prediction=predict_itembased_adjcos(3,4,M)" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": 30, 950 | "metadata": {}, 951 | "outputs": [ 952 | { 953 | "data": { 954 | "text/html": [ 955 | "

\n", 956 | "\n", 969 | "\n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | "

	0	1	2	3	4	5
0	1.000000	0.236908	0.421263	-0.519085	-0.125892	0.010090
1	0.236908	1.000000	-0.805243	0.085741	0.237273	0.520625
2	0.421263	-0.805243	1.000000	-0.767941	-0.230521	-0.053640
3	-0.519085	0.085741	-0.767941	1.000000	-0.299059	-0.644550
4	-0.125892	0.237273	-0.230521	-0.299059	1.000000	0.599158
5	0.010090	0.520625	-0.053640	-0.644550	0.599158	1.000000

\n", 1038 | "

def recommendItem(user_id, item_id, ratings, min_rating=6):
    """Show a dropdown that recommends (or not) ``item_id`` to ``user_id``.

    Selecting an approach in the dropdown predicts the rating with that
    approach and prints whether the item is recommended. An item is
    recommended when it has not been rated yet (stored rating is 0) and
    the predicted rating is at least ``min_rating``.

    Parameters
    ----------
    user_id, item_id : int
        1-based user and item numbers; both are validated against the
        shape of ``ratings``.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows are users, columns are items).
    min_rating : int, optional
        Smallest predicted rating that counts as a recommendation.

    Returns
    -------
    None
        Output is printed or displayed.
    """
    n_users, n_items = ratings.shape

    # Check the type before comparing, so a non-int never reaches "<" / ">".
    if type(user_id) is not int or user_id < 1 or user_id > n_users:
        print('Userid does not exist. Enter numbers from 1-{0}'.format(n_users))
        return
    if type(item_id) is not int or item_id < 1 or item_id > n_items:
        print('Itemid does not exist. Enter numbers from 1-{0}'.format(n_items))
        return

    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)',
           'Item-based CF (adjusted cosine)']

    approach = widgets.Dropdown(options=ids, value=ids[0],
                                description='Select Approach', width='500px')

    def on_change(change):
        """Predict with the selected approach and print the verdict."""
        clear_output(wait=True)
        if change['type'] != 'change' or change['name'] != 'value':
            return

        if approach.value == 'User-based CF (cosine)':
            prediction = predict_userbased(user_id, item_id, ratings, 'cosine')
        elif approach.value == 'User-based CF (correlation)':
            prediction = predict_userbased(user_id, item_id, ratings, 'correlation')
        elif approach.value == 'Item-based CF (cosine)':
            prediction = predict_itembased(user_id, item_id, ratings)
        else:
            prediction = predict_itembased_adjcos(user_id, item_id, ratings)

        # positional lookup: row = user, column = item
        if ratings.iloc[user_id - 1, item_id - 1] != 0:
            print('Item already rated')
        elif prediction >= min_rating:
            print('\nItem recommended')
        else:
            print('Item not recommended')

    # Only react to changes of the selected value, not to every trait change.
    approach.observe(on_change, names='value')
    display(approach)
Enter numbers from 1-6\n" 1117 | ] 1118 | } 1119 | ], 1120 | "source": [ 1121 | "#check for incorrect entries\n", 1122 | "recommendItem(-1,3,M)" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": 33, 1128 | "metadata": {}, 1129 | "outputs": [ 1130 | { 1131 | "name": "stdout", 1132 | "output_type": "stream", 1133 | "text": [ 1134 | "4 most similar users for User 3:\n", 1135 | "\n", 1136 | "1: User 4, with similarity of 0.90951268934\n", 1137 | "2: User 2, with similarity of 0.874744414849\n", 1138 | "3: User 5, with similarity of 0.86545387815\n", 1139 | "4: User 6, with similarity of 0.853274963344\n", 1140 | "\n", 1141 | "Predicted rating for user 3 -> item 4: 3\n", 1142 | "Item not recommended\n" 1143 | ] 1144 | } 1145 | ], 1146 | "source": [ 1147 | "recommendItem(3,4,M)" 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": 34, 1153 | "metadata": {}, 1154 | "outputs": [ 1155 | { 1156 | "name": "stdout", 1157 | "output_type": "stream", 1158 | "text": [ 1159 | "4 most similar users for User 3:\n", 1160 | "\n", 1161 | "1: User 2, with similarity of 0.453897185842\n", 1162 | "2: User 4, with similarity of 0.451378005098\n", 1163 | "3: User 6, with similarity of 0.297373304825\n", 1164 | "4: User 5, with similarity of -0.04288778794\n", 1165 | "\n", 1166 | "Predicted rating for user 3 -> item 4: 3\n", 1167 | "Item not recommended\n" 1168 | ] 1169 | } 1170 | ], 1171 | "source": [ 1172 | "recommendItem(3,4,M)" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 35, 1178 | "metadata": {}, 1179 | "outputs": [ 1180 | { 1181 | "name": "stdout", 1182 | "output_type": "stream", 1183 | "text": [ 1184 | "4 most similar items for item 4:\n", 1185 | "\n", 1186 | "1: Item 6 :, with similarity of 0.89977997614\n", 1187 | "2: Item 2 :, with similarity of 0.887160079571\n", 1188 | "3: Item 5 :, with similarity of 0.88180009273\n", 1189 | "4: Item 3 :, with similarity of 0.796917800302\n", 1190 | 
@contextmanager
def suppress_stdout():
    """Temporarily redirect ``sys.stdout`` to ``os.devnull``.

    The previous ``sys.stdout`` is restored when the ``with`` block exits,
    including when the block raises.
    """
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout


def evaluateRS(ratings):
    """Show a dropdown that reports the RMSE of the selected CF approach.

    For the chosen approach a rating is predicted for every (user, item)
    cell of ``ratings``, and the root mean squared error between the
    predictions and ``ratings`` is printed. The prints of the prediction
    helpers are silenced with ``suppress_stdout``; only the RMSE line is
    shown.

    NOTE(review): the error is computed over every cell of ``ratings``,
    including cells holding 0 - confirm that this is the intended
    evaluation protocol.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User-item ratings matrix (rows are users, columns are items).

    Returns
    -------
    None
        Output is printed or displayed.
    """
    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']
    approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')
    n_users, n_items = ratings.shape

    # One predictor per dropdown entry. The item-based entries now call the
    # item-based predictors; previously every entry used predict_userbased.
    predictors = {
        'User-based CF (cosine)': lambda u, i: predict_userbased(u, i, ratings, 'cosine'),
        'User-based CF (correlation)': lambda u, i: predict_userbased(u, i, ratings, 'correlation'),
        'Item-based CF (cosine)': lambda u, i: predict_itembased(u, i, ratings),
        'Item-based CF (adjusted cosine)': lambda u, i: predict_itembased_adjcos(u, i, ratings),
    }

    def on_change(change):
        """Recompute all predictions for the selected approach and print RMSE."""
        clear_output(wait=True)
        if change['type'] != 'change' or change['name'] != 'value':
            return

        predict = predictors[approach.value]
        prediction = np.zeros((n_users, n_items))
        with suppress_stdout():
            for i in range(n_users):
                for j in range(n_items):
                    # [row=user, column=item]; DataFrame-style prediction[i][j]
                    # would address column i / row j and transpose the result.
                    prediction[i, j] = predict(i + 1, j + 1)

        MSE = mean_squared_error(ratings, prediction)
        RMSE = round(sqrt(MSE), 3)
        print("RMSE using {0} approach is: {1}".format(approach.value, RMSE))

    # Only react to changes of the selected value, not to every trait change.
    approach.observe(on_change, names='value')
    display(approach)
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**About Book Crossing Dataset**
\n", 8 | "\n", 9 | "This dataset was compiled by Cai-Nicolas Ziegler in 2004, and it comprises three tables: users, books and ratings. Explicit ratings are expressed on a scale from 1-10 (higher values denoting higher appreciation) and an implicit rating is expressed by 0" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Link to dataset files
\n", 17 | "http://www2.informatik.uni-freiburg.de/~cziegler/BX/ " 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "**About this Project**\n", 25 | "\n", 26 | "This project entails building a Book Recommender System for users based on user-based and item-based collaborative filtering approaches" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 177, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "#Making necesarry imports\n", 38 | "import pandas as pd\n", 39 | "import matplotlib.pyplot as plt\n", 40 | "import sklearn.metrics as metrics\n", 41 | "import numpy as np\n", 42 | "from sklearn.neighbors import NearestNeighbors\n", 43 | "from scipy.spatial.distance import correlation\n", 44 | "from sklearn.metrics.pairwise import pairwise_distances\n", 45 | "import ipywidgets as widgets\n", 46 | "from IPython.display import display, clear_output\n", 47 | "from contextlib import contextmanager\n", 48 | "import warnings\n", 49 | "warnings.filterwarnings('ignore')\n", 50 | "import numpy as np\n", 51 | "import os, sys\n", 52 | "import re\n", 53 | "import seaborn as sns" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 178, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "#Setting the current working directory\n", 65 | "os.chdir('D:\\Data Science\\Projects\\Book Crossing Dataset - Recommender System')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 179, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/html": [ 76 | "\n", 88 | "To toggle on/off output_stderr, click here." 
# %% Toggle link for stderr output
# NOTE(review): only the text visible in this cell is reproduced here; confirm
# whether the original string also carried markup that is not shown.
from IPython.display import HTML
HTML('''
To toggle on/off output_stderr, click here.''')

# %% Loading data
# Malformed rows are skipped (error_bad_lines=False) rather than aborting the load.
books = pd.read_csv('books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users = pd.read_csv('users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
ratings = pd.read_csv('ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

# %% Checking shapes of the datasets
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(books.shape)
print(users.shape)
print(ratings.shape)

# %% Exploring books dataset
books.head()

# %% Dropping the three image-URL columns, which are not required for analysis
# Reassign instead of inplace=True so re-running the cell never mutates a frame
# that an earlier cell already displayed.
books = books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'], axis=1)

# %% Now the books dataset looks like....
books.head()

# %% Checking data types of columns
books.dtypes

# %% Making this setting to display full text in columns
pd.set_option('display.max_colwidth', -1)

# %% **yearOfPublication**
# yearOfPublication should have an integer dtype; check its unique values first.
books.yearOfPublication.unique()

# The output shows some incorrect entries in this field. Publisher names
# 'DK Publishing Inc' and 'Gallimard' were loaded as yearOfPublication because of
# errors in the csv file. Some years are also stored as strings while the same
# years appear as numbers elsewhere.


# %% Helper for the mis-parsed rows
def _repair_book_row(frame, isbn, year, author, publisher, title):
    """Overwrite the four shifted fields of the row(s) of ``frame`` whose ISBN is ``isbn``.

    ``frame`` is modified in place. Used for rows where the author was loaded
    as part of the title and the remaining fields shifted one column.
    """
    row = frame.ISBN == isbn
    frame.loc[row, 'yearOfPublication'] = year
    frame.loc[row, 'bookAuthor'] = author
    frame.loc[row, 'publisher'] = publisher
    frame.loc[row, 'bookTitle'] = title


# %% Investigating the rows having 'DK Publishing Inc' as yearOfPublication
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]

# %% bookAuthor was loaded inside bookTitle for these rows; making the required corrections
_repair_book_row(books, '0789466953', 2000, "James Buckley", "DK Publishing Inc",
                 "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)")
_repair_book_row(books, '078946697X', 2000, "Michael Teitelbaum", "DK Publishing Inc",
                 "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)")

# %% Rechecking
books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'),:]
# corrections done

# %% Investigating the rows having 'Gallimard' as yearOfPublication
books.loc[books.yearOfPublication == 'Gallimard',:]

# %% Making required corrections as above, keeping other fields intact
_repair_book_row(books, '2070426769', 2003, "Jean-Marie Gustave Le ClÃ?Â©zio", "Gallimard",
                 "Peuple du ciel, suivi de 'Les Bergers")

# %% Rechecking
books.loc[books.ISBN == '2070426769',:]
# corrections done

# %% Correcting the dtype of yearOfPublication
# errors='coerce' turns anything that is still non-numeric into NaN.
books.yearOfPublication = pd.to_numeric(books.yearOfPublication, errors='coerce')

# %%
print(sorted(books['yearOfPublication'].unique()))
# Now it can be seen that yearOfPublication has all values as integers

# %% Marking invalid years
# The value 0 is invalid. As this dataset was published in 2004, years after 2006
# are assumed invalid, keeping some margin in case the dataset was updated thereafter.
# np.nan is used because the upper-case alias is not available in every NumPy release.
books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.nan

# %% Replacing NaNs with the (rounded) mean value of yearOfPublication
# Assign the filled column back: fillna(inplace=True) on an attribute-accessed
# column is a chained assignment and is not guaranteed to reach the frame.
books['yearOfPublication'] = books.yearOfPublication.fillna(round(books.yearOfPublication.mean()))

# %% Rechecking
books.yearOfPublication.isnull().sum()
# No NaNs

# %% Resetting the dtype as int32
books.yearOfPublication = books.yearOfPublication.astype(np.int32)

# %% **publisher**
# Exploring 'publisher' column
books.loc[books.publisher.isnull(),:]
# two NaNs

# %% Investigating rows having NaNs
# Checking rows having bookTitle 'Tyrant Moon' to see if we can get any clues
books.loc[(books.bookTitle == 'Tyrant Moon'),:]
# no clues

\n", 1150 | "\n", 1163 | "\n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | "

	ISBN	bookTitle	bookAuthor	yearOfPublication	publisher
10799	082177364X	Finders Keepers	Fern Michaels	2002	Zebra Books
42019	0070465037	Finders Keepers	Barbara Nickolae	1989	McGraw-Hill Companies
58264	0688118461	Finders Keepers	Emily Rodda	1993	Harpercollins Juvenile Books
66678	1575663236	Finders Keepers	Fern Michaels	1998	Kensington Publishing Corporation
129037	1931696993	Finders Keepers	Linnea Sinclair	2001	NaN
134309	0156309505	Finders Keepers	Will	1989	Voyager Books
173473	0973146907	Finders Keepers	Sean M. Costello	2002	Red Tower Publications
195885	0061083909	Finders Keepers	Sharon Sala	2003	HarperTorch
211874	0373261160	Finders Keepers	Elizabeth Travis	1993	Worldwide Library

\n", 1249 | "

" 1250 | ], 1251 | "text/plain": [ 1252 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1253 | "10799 082177364X Finders Keepers Fern Michaels 2002 \n", 1254 | "42019 0070465037 Finders Keepers Barbara Nickolae 1989 \n", 1255 | "58264 0688118461 Finders Keepers Emily Rodda 1993 \n", 1256 | "66678 1575663236 Finders Keepers Fern Michaels 1998 \n", 1257 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n", 1258 | "134309 0156309505 Finders Keepers Will 1989 \n", 1259 | "173473 0973146907 Finders Keepers Sean M. Costello 2002 \n", 1260 | "195885 0061083909 Finders Keepers Sharon Sala 2003 \n", 1261 | "211874 0373261160 Finders Keepers Elizabeth Travis 1993 \n", 1262 | "\n", 1263 | " publisher \n", 1264 | "10799 Zebra Books \n", 1265 | "42019 McGraw-Hill Companies \n", 1266 | "58264 Harpercollins Juvenile Books \n", 1267 | "66678 Kensington Publishing Corporation \n", 1268 | "129037 NaN \n", 1269 | "134309 Voyager Books \n", 1270 | "173473 Red Tower Publications \n", 1271 | "195885 HarperTorch \n", 1272 | "211874 Worldwide Library " 1273 | ] 1274 | }, 1275 | "execution_count": 205, 1276 | "metadata": {}, 1277 | "output_type": "execute_result" 1278 | } 1279 | ], 1280 | "source": [ 1281 | "#Checking with rows having bookTitle as Finder Keepers to see if we can get any clues\n", 1282 | "books.loc[(books.bookTitle == 'Finders Keepers'),:]\n", 1283 | "#all rows with different publisher and bookAuthor" 1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "code", 1288 | "execution_count": 206, 1289 | "metadata": {}, 1290 | "outputs": [ 1291 | { 1292 | "data": { 1293 | "text/html": [ 1294 | "

\n", 1295 | "\n", 1308 | "\n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | "

	ISBN	bookTitle	bookAuthor	yearOfPublication	publisher
126762	1931696934	Winter's Orphans	Elaine Corvidae	2001	Novelbooks
128890	193169656X	Tyrant Moon	Elaine Corvidae	2002	NaN
129001	0759901880	Wolfkin	Elaine Corvidae	2001	Hard Shell Word Factory

\n", 1346 | "

" 1347 | ], 1348 | "text/plain": [ 1349 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1350 | "126762 1931696934 Winter's Orphans Elaine Corvidae 2001 \n", 1351 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 \n", 1352 | "129001 0759901880 Wolfkin Elaine Corvidae 2001 \n", 1353 | "\n", 1354 | " publisher \n", 1355 | "126762 Novelbooks \n", 1356 | "128890 NaN \n", 1357 | "129001 Hard Shell Word Factory " 1358 | ] 1359 | }, 1360 | "execution_count": 206, 1361 | "metadata": {}, 1362 | "output_type": "execute_result" 1363 | } 1364 | ], 1365 | "source": [ 1366 | "#checking by bookAuthor to find patterns\n", 1367 | "books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]\n", 1368 | "#all having different publisher...no clues here" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "code", 1373 | "execution_count": 207, 1374 | "metadata": {}, 1375 | "outputs": [ 1376 | { 1377 | "data": { 1378 | "text/html": [ 1379 | "

\n", 1380 | "\n", 1393 | "\n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | "

	ISBN	bookTitle	bookAuthor	yearOfPublication	publisher
129037	1931696993	Finders Keepers	Linnea Sinclair	2001	NaN

\n", 1415 | "

" 1416 | ], 1417 | "text/plain": [ 1418 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1419 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n", 1420 | "\n", 1421 | " publisher \n", 1422 | "129037 NaN " 1423 | ] 1424 | }, 1425 | "execution_count": 207, 1426 | "metadata": {}, 1427 | "output_type": "execute_result" 1428 | } 1429 | ], 1430 | "source": [ 1431 | "#checking by bookAuthor to find patterns\n", 1432 | "books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "code", 1437 | "execution_count": 208, 1438 | "metadata": { 1439 | "collapsed": true 1440 | }, 1441 | "outputs": [], 1442 | "source": [ 1443 | "#since there is nothing in common to infer publisher for NaNs, replacing these with 'other'\n", 1444 | "books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'\n", 1445 | "books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'" 1446 | ] 1447 | }, 1448 | { 1449 | "cell_type": "markdown", 1450 | "metadata": {}, 1451 | "source": [ 1452 | "**Users**" 1453 | ] 1454 | }, 1455 | { 1456 | "cell_type": "code", 1457 | "execution_count": 209, 1458 | "metadata": { 1459 | "scrolled": true 1460 | }, 1461 | "outputs": [ 1462 | { 1463 | "name": "stdout", 1464 | "output_type": "stream", 1465 | "text": [ 1466 | "(278858, 3)\n" 1467 | ] 1468 | }, 1469 | { 1470 | "data": { 1471 | "text/html": [ 1472 | "

\n", 1473 | "\n", 1486 | "\n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | "

	userID	Location	Age
0	1	nyc, new york, usa	NaN
1	2	stockton, california, usa	18.0
2	3	moscow, yukon territory, russia	NaN
3	4	porto, v.n.gaia, portugal	17.0
4	5	farnborough, hants, united kingdom	NaN

\n", 1528 | "

" 1529 | ], 1530 | "text/plain": [ 1531 | " userID Location Age\n", 1532 | "0 1 nyc, new york, usa NaN \n", 1533 | "1 2 stockton, california, usa 18.0\n", 1534 | "2 3 moscow, yukon territory, russia NaN \n", 1535 | "3 4 porto, v.n.gaia, portugal 17.0\n", 1536 | "4 5 farnborough, hants, united kingdom NaN " 1537 | ] 1538 | }, 1539 | "execution_count": 209, 1540 | "metadata": {}, 1541 | "output_type": "execute_result" 1542 | } 1543 | ], 1544 | "source": [ 1545 | "print users.shape\n", 1546 | "users.head()" 1547 | ] 1548 | }, 1549 | { 1550 | "cell_type": "code", 1551 | "execution_count": 210, 1552 | "metadata": { 1553 | "scrolled": true 1554 | }, 1555 | "outputs": [ 1556 | { 1557 | "data": { 1558 | "text/plain": [ 1559 | "userID int64 \n", 1560 | "Location object \n", 1561 | "Age float64\n", 1562 | "dtype: object" 1563 | ] 1564 | }, 1565 | "execution_count": 210, 1566 | "metadata": {}, 1567 | "output_type": "execute_result" 1568 | } 1569 | ], 1570 | "source": [ 1571 | "users.dtypes" 1572 | ] 1573 | }, 1574 | { 1575 | "cell_type": "markdown", 1576 | "metadata": {}, 1577 | "source": [ 1578 | "**userID**" 1579 | ] 1580 | }, 1581 | { 1582 | "cell_type": "code", 1583 | "execution_count": 211, 1584 | "metadata": {}, 1585 | "outputs": [ 1586 | { 1587 | "data": { 1588 | "text/plain": [ 1589 | "array([ 1, 2, 3, ..., 278856, 278857, 278858], dtype=int64)" 1590 | ] 1591 | }, 1592 | "execution_count": 211, 1593 | "metadata": {}, 1594 | "output_type": "execute_result" 1595 | } 1596 | ], 1597 | "source": [ 1598 | "users.userID.values\n", 1599 | "#it can be seen that these are unique" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "markdown", 1604 | "metadata": {}, 1605 | "source": [ 1606 | "**Age**" 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "code", 1611 | "execution_count": 212, 1612 | "metadata": {}, 1613 | "outputs": [ 1614 | { 1615 | "name": "stdout", 1616 | "output_type": "stream", 1617 | "text": [ 1618 | "[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 
11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0, 228.0, 229.0, 230.0, 231.0, 237.0, 239.0, 244.0]\n" 1619 | ] 1620 | } 1621 | ], 1622 | "source": [ 1623 | "print sorted(users.Age.unique())\n", 1624 | "#Age column has some invalid entries like nan, 0 and very high values like 100 and above" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 213, 1630 | "metadata": { 1631 | "collapsed": true 1632 | }, 1633 | "outputs": [], 1634 | "source": [ 1635 | "#In my view values below 5 and above 90 do not make much sense for our book rating case...hence replacing these by NaNs\n", 1636 | "users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan" 1637 | ] 1638 | }, 1639 | { 1640 | "cell_type": "code", 1641 | "execution_count": 214, 1642 | "metadata": { 1643 | "collapsed": true 1644 | }, 1645 | "outputs": [], 1646 | "source": [ 1647 | "#replacing NaNs with mean\n", 1648 | "users.Age = users.Age.fillna(users.Age.mean())" 1649 | ] 1650 | }, 1651 | { 1652 | "cell_type": "code", 1653 | "execution_count": 215, 1654 | "metadata": { 1655 | "collapsed": true 1656 | }, 1657 | 
"outputs": [], 1658 | "source": [ 1659 | "#setting the data type as int\n", 1660 | "users.Age = users.Age.astype(np.int32)" 1661 | ] 1662 | }, 1663 | { 1664 | "cell_type": "code", 1665 | "execution_count": 216, 1666 | "metadata": { 1667 | "scrolled": true 1668 | }, 1669 | "outputs": [ 1670 | { 1671 | "name": "stdout", 1672 | "output_type": "stream", 1673 | "text": [ 1674 | "[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]\n" 1675 | ] 1676 | } 1677 | ], 1678 | "source": [ 1679 | "#rechecking\n", 1680 | "print sorted(users.Age.unique())\n", 1681 | "#looks good now" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "markdown", 1686 | "metadata": {}, 1687 | "source": [ 1688 | "**Ratings Dataset**" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": 217, 1694 | "metadata": {}, 1695 | "outputs": [ 1696 | { 1697 | "data": { 1698 | "text/plain": [ 1699 | "(1149780, 3)" 1700 | ] 1701 | }, 1702 | "execution_count": 217, 1703 | "metadata": {}, 1704 | "output_type": "execute_result" 1705 | } 1706 | ], 1707 | "source": [ 1708 | "#checking shape\n", 1709 | "ratings.shape" 1710 | ] 1711 | }, 1712 | { 1713 | "cell_type": "code", 1714 | "execution_count": 218, 1715 | "metadata": {}, 1716 | "outputs": [ 1717 | { 1718 | "name": "stdout", 1719 | "output_type": "stream", 1720 | "text": [ 1721 | "75670906880\n" 1722 | ] 1723 | } 1724 | ], 1725 | "source": [ 1726 | "#ratings dataset will have n_users*n_books entries if every user rated every item, this shows that the dataset is very sparse\n", 1727 | "n_users = users.shape[0]\n", 1728 | "n_books = books.shape[0]\n", 1729 | "print n_users * n_books" 1730 | ] 1731 | }, 1732 | { 1733 | "cell_type": "code", 1734 | 
"execution_count": 219, 1735 | "metadata": { 1736 | "scrolled": true 1737 | }, 1738 | "outputs": [ 1739 | { 1740 | "data": { 1741 | "text/html": [ 1742 | "

\n", 1743 | "\n", 1756 | "\n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | "

	userID	ISBN	bookRating
0	276725	034545104X	0
1	276726	0155061224	5
2	276727	0446520802	0
3	276729	052165615X	3
4	276729	0521795028	6

\n", 1798 | "

" 1799 | ], 1800 | "text/plain": [ 1801 | " userID ISBN bookRating\n", 1802 | "0 276725 034545104X 0 \n", 1803 | "1 276726 0155061224 5 \n", 1804 | "2 276727 0446520802 0 \n", 1805 | "3 276729 052165615X 3 \n", 1806 | "4 276729 0521795028 6 " 1807 | ] 1808 | }, 1809 | "execution_count": 219, 1810 | "metadata": {}, 1811 | "output_type": "execute_result" 1812 | } 1813 | ], 1814 | "source": [ 1815 | "#checking first few rows...\n", 1816 | "ratings.head(5)" 1817 | ] 1818 | }, 1819 | { 1820 | "cell_type": "code", 1821 | "execution_count": 220, 1822 | "metadata": {}, 1823 | "outputs": [ 1824 | { 1825 | "data": { 1826 | "text/plain": [ 1827 | "array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)" 1828 | ] 1829 | }, 1830 | "execution_count": 220, 1831 | "metadata": {}, 1832 | "output_type": "execute_result" 1833 | } 1834 | ], 1835 | "source": [ 1836 | "ratings.bookRating.unique()" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 221, 1842 | "metadata": { 1843 | "collapsed": true 1844 | }, 1845 | "outputs": [], 1846 | "source": [ 1847 | "#ratings dataset should have books only which exist in our books dataset, unless new books are added to books dataset\n", 1848 | "ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]" 1849 | ] 1850 | }, 1851 | { 1852 | "cell_type": "code", 1853 | "execution_count": 222, 1854 | "metadata": {}, 1855 | "outputs": [ 1856 | { 1857 | "name": "stdout", 1858 | "output_type": "stream", 1859 | "text": [ 1860 | "(1149780, 3)\n", 1861 | "(1031136, 3)\n" 1862 | ] 1863 | } 1864 | ], 1865 | "source": [ 1866 | "print ratings.shape\n", 1867 | "print ratings_new.shape\n", 1868 | "#it can be seen that many rows having book ISBN not part of books dataset got dropped off" 1869 | ] 1870 | }, 1871 | { 1872 | "cell_type": "code", 1873 | "execution_count": 223, 1874 | "metadata": { 1875 | "collapsed": true 1876 | }, 1877 | "outputs": [], 1878 | "source": [ 1879 | "#ratings dataset should have ratings from users which exist in 
users dataset, unless new users are added to users dataset\n", 1880 | "ratings = ratings[ratings.userID.isin(users.userID)]" 1881 | ] 1882 | }, 1883 | { 1884 | "cell_type": "code", 1885 | "execution_count": 224, 1886 | "metadata": {}, 1887 | "outputs": [ 1888 | { 1889 | "name": "stdout", 1890 | "output_type": "stream", 1891 | "text": [ 1892 | "(1149780, 3)\n", 1893 | "(1031136, 3)\n" 1894 | ] 1895 | } 1896 | ], 1897 | "source": [ 1898 | "print ratings.shape\n", 1899 | "print ratings_new.shape\n", 1900 | "#no new users added, hence we will go with above dataset ratings_new (1031136, 3)" 1901 | ] 1902 | }, 1903 | { 1904 | "cell_type": "code", 1905 | "execution_count": 225, 1906 | "metadata": {}, 1907 | "outputs": [ 1908 | { 1909 | "name": "stdout", 1910 | "output_type": "stream", 1911 | "text": [ 1912 | "number of users: 278858\n", 1913 | "number of books: 271360\n" 1914 | ] 1915 | } 1916 | ], 1917 | "source": [ 1918 | "print \"number of users: \" + str(n_users)\n", 1919 | "print \"number of books: \" + str(n_books)" 1920 | ] 1921 | }, 1922 | { 1923 | "cell_type": "code", 1924 | "execution_count": 226, 1925 | "metadata": {}, 1926 | "outputs": [ 1927 | { 1928 | "name": "stdout", 1929 | "output_type": "stream", 1930 | "text": [ 1931 | "The sparsity level of Book Crossing dataset is 99.9986373416 %\n" 1932 | ] 1933 | } 1934 | ], 1935 | "source": [ 1936 | "#Sparsity of dataset in %\n", 1937 | "sparsity=1.0-len(ratings_new)/float(n_users*n_books)\n", 1938 | "print 'The sparsity level of Book Crossing dataset is ' + str(sparsity*100) + ' %'" 1939 | ] 1940 | }, 1941 | { 1942 | "cell_type": "code", 1943 | "execution_count": 228, 1944 | "metadata": {}, 1945 | "outputs": [ 1946 | { 1947 | "data": { 1948 | "text/plain": [ 1949 | "array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)" 1950 | ] 1951 | }, 1952 | "execution_count": 228, 1953 | "metadata": {}, 1954 | "output_type": "execute_result" 1955 | } 1956 | ], 1957 | "source": [ 1958 | "#As quoted in the description of the 
dataset -\n", 1959 | "#BX-Book-Ratings contains the book rating information. Ratings are either explicit, expressed on a scale from 1-10 \n", 1960 | "#higher values denoting higher appreciation, or implicit, expressed by 0\n", 1961 | "ratings.bookRating.unique()" 1962 | ] 1963 | }, 1964 | { 1965 | "cell_type": "code", 1966 | "execution_count": 229, 1967 | "metadata": { 1968 | "collapsed": true 1969 | }, 1970 | "outputs": [], 1971 | "source": [ 1972 | "#Hence segregating implicit and explicit ratings datasets\n", 1973 | "ratings_explicit = ratings_new[ratings_new.bookRating != 0]\n", 1974 | "ratings_implicit = ratings_new[ratings_new.bookRating == 0]" 1975 | ] 1976 | }, 1977 | { 1978 | "cell_type": "code", 1979 | "execution_count": 230, 1980 | "metadata": {}, 1981 | "outputs": [ 1982 | { 1983 | "name": "stdout", 1984 | "output_type": "stream", 1985 | "text": [ 1986 | "(1031136, 3)\n", 1987 | "(383842, 3)\n", 1988 | "(647294, 3)\n" 1989 | ] 1990 | } 1991 | ], 1992 | "source": [ 1993 | "#checking shapes\n", 1994 | "print ratings_new.shape\n", 1995 | "print ratings_explicit.shape\n", 1996 | "print ratings_implicit.shape" 1997 | ] 1998 | }, 1999 | { 2000 | "cell_type": "code", 2001 | "execution_count": 231, 2002 | "metadata": { 2003 | "scrolled": true 2004 | }, 2005 | "outputs": [ 2006 | { 2007 | "data": { 2008 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAf0AAAFXCAYAAACoS5cAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHsdJREFUeJzt3X9Y1fXh9/HX4Zc/OAeRpW3NUClZtoa/SGvXkXK51Gu3\n05la0K3utrViusLS8DcpkuMq0JVz2LK7LhSQ0pXXfTVbmoMMIy+uwNJs5SwVi9Do6hxEOMDn/uN7\n7VwyFU7Njwd4Px9/6YcP8DomPfkcjuc4LMuyBAAAur2QYA8AAABXBtEHAMAQRB8AAEMQfQAADEH0\nAQAwBNEHAMAQYcEeYLfaWk+wJwAAcEX16+e66HGu9AEAMATRBwDAEEQfAABDEH0AAAxB9AEAMATR\nBwDAEEQfAABDEH0AAAxB9AEAMATRBwDAEEQfAABDEH0AAAxB9AEAMES3f5U9ADBJ8b6zwZ7Qxkx3\n72BPwHm40gcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAA\nQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcA\nwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQB\nADAE0QcAwBBhdn1gn8+nxYsXq7q6WiEhIcrMzFRYWJgWL14sh8OhIUOGKCMjQyEhISouLlZRUZHC\nwsKUmpqqcePG6dy5c1q0aJHOnDmjyMhIZWdnKyYmRpWVlcrKylJoaKjcbrfmz59v100AAKBbse1K\nv6SkRM3NzSoqKtK8efO0fv16rV27VmlpaSooKJBlWdqzZ49qa2uVn5+voqIibd68Wbm5uWpqalJh\nYaHi4+NVUFCgqVOnauPGjZKkjIwM5eTkqLCwUFVVVTp8+LBdNwEAgG7FtugPHjxYLS0tam1tldfr\nVVhYmA4dOqTRo0dLkpKSklRWVqaDBw9qxIgRioiIkMvlUmxsrI4cOaKKigqNHTvWf+7+/fvl9XrV\n1NSk2NhYORwOud1ulZWV2XUTAADoVmy7e793796qrq7WpEmTVFdXp7y8PB04cEAOh0OSFBkZKY/H\nI6/XK5fL5X+/yMhIeb3eNsfPP9fpdLY598SJE+3u6Nu3t8LCQm24hQDQGZ0N9oA2+vVzdXwSrhjb\nov/CCy/I7Xbr0Ucf1eeff645c+bI5/P5315fX6+oqCg5nU7V19e3Oe5yudocb+/cqKiodnfU1XWu\nLwAAMEltrSfYE4x0qW+2bLt7Pyoqyn+l3qdPHzU3N+vGG29UeXm5JKm0tFSJiYlKSEhQRUWFGhsb\n5fF4dPToUcXHx2vkyJEqKSnxnztq1Cg5nU6Fh4fr+PHjsixL+/btU2Jiol03AQCAbsVhWZZlxweu\nr6/X0qVLVVtbK5/Pp9mzZ+umm27SihUr5PP5FBcXpzVr1ig0NFTFxcXatm2bLMvSAw88oAkTJqih\noUHp6emqra1VeHi4cnJy1K9fP1VWVuqJJ55QS0uL3G63FixY0O4OvssEYJLifZ3r3s2Z7t7BnmCk\nS13p2xb9zoLoAzAJ0YcUhLv3AQBA50L0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8A\nAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEH\nAMAQRB8AAEMQfQAADEH0AQAwRFiwBwAA0NW0bv1nsCf4hdwbH/i5Nu4AAACdCNEHAMAQRB8AAEMQ\nfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQ\nvLQuALTjt6VVwZ7g92zSsGBPQBfHlT4AAIYg+gAAGILoAwBgC
KIPAIAhiD4AAIYg+gAAGILoAwBg\nCKIPAIAhiD4AAIYg+gAAGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBgCKIPAIAhwuz84Js2bdKb\nb74pn8+n5ORkjR49WosXL5bD4dCQIUOUkZGhkJAQFRcXq6ioSGFhYUpNTdW4ceN07tw5LVq0SGfO\nnFFkZKSys7MVExOjyspKZWVlKTQ0VG63W/Pnz7fzJgAAbHby1YZgT2hjwJRewZ5gG9uu9MvLy/Xe\ne++psLBQ+fn5+uKLL7R27VqlpaWpoKBAlmVpz549qq2tVX5+voqKirR582bl5uaqqalJhYWFio+P\nV0FBgaZOnaqNGzdKkjIyMpSTk6PCwkJVVVXp8OHDdt0EAAC6Fduiv2/fPsXHx2vevHl68MEHdfvt\nt+vQoUMaPXq0JCkpKUllZWU6ePCgRowYoYiICLlcLsXGxurIkSOqqKjQ2LFj/efu379fXq9XTU1N\nio2NlcPhkNvtVllZmV03AQCAbsW2u/fr6up06tQp5eXl6eTJk0pNTZVlWXI4HJKkyMhIeTweeb1e\nuVwu//tFRkbK6/W2OX7+uU6ns825J06caHdH3769FRYWasMtBIArq18/V8cn6aztO76NQDafVOe6\nez+QzTVXYEegAvt78T9si350dLTi4uIUERGhuLg49ejRQ1988YX/7fX19YqKipLT6VR9fX2b4y6X\nq83x9s6Niopqd0ddXef6AgCA76q21hPsCd8am+13sb2X+kbAtrv3R40apbfeekuWZammpkYNDQ26\n9dZbVV5eLkkqLS1VYmKiEhISVFFRocbGRnk8Hh09elTx8fEaOXKkSkpK/OeOGjVKTqdT4eHhOn78\nuCzL0r59+5SYmGjXTQAAoFux7Up/3LhxOnDggKZPny7LsrRy5UoNGDBAK1asUG5uruLi4jRhwgSF\nhoZq1qxZSklJkWVZWrBggXr06KHk5GSlp6crOTlZ4eHhysnJkSStWrVKCxcuVEtLi9xut4YNG2bX\nTQAAoFtxWJZlBXuEnbra3TQAOpffllYFe4Lfs0kdX+QU7+tcP9Kc6e7d4Tld8Z/stW795xVYEpiQ\ne+MvOHbF794HAACdC9EHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEH\nAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADBFQ\n9DMzMy84lp6eftnHAAAA+4S198Zly5bpxIkT+uCDD/Txxx/7jzc3N8vj8dg+DgAAXD7tRj81NVXV\n1dXKysrS/Pnz/cdDQ0N13XXX2T4OAABcPu1Gf8CAARowYIB27twpr9crj8cjy7IkSWfPnlV0dPQV\nGQkAAP577Ub/3zZt2qRNmza1ibzD4dCePXtsGwYAAC6vgKL/0ksvaffu3YqJibF7DwAAsElAj97/\nwQ9+oD59+ti9BQAA2CigK/1BgwYpJSVFY8aMUUREhP/4+Q/uAwAAnVtA0b/66qt19dVX270FAADY\nKKDoc0UPAEDXF1D0b7jhBjkcjjbH+vfvr5KSEltGAQCAyy+g6B85csT/a5/Pp927d6uystK2UQAA\n4PL71i+4Ex4erkmTJumdd96xYw8AALBJQFf6r7zyiv/XlmXp448/Vnh4uG2jAADA5RdQ9MvLy9v8\nvm/fvlq3bp0tgwAAgD0Civ7atWvl8/l07NgxtbS0aMiQIQoLC+hdAQBAJxFQuT/44AM99NBDio6O\nVmtrq06fPq0//elPGjZsmN37AADAZRJQ9NesWaN169b5I19ZWanMzEy9/PLLto4DAACXT0CP3j97\n9mybq/rhw4ersbHRtlEAAODyCyj6ffr00e7du/2/3717d5uX2QUAAJ1fQHfvZ2Zm6oEHHtCyZcv8\nx4qKimwbBaBjc/b9MdgT2
njR/XCwJwDoQEBX+qWlperVq5f27t2rF198UTExMXr33Xft3gYAAC6j\ngKJfXFyswsJC9e7dWzfccIN27NihLVu22L0NAABcRgFF3+fztXkGPp6NDwCAriegn+mPHz9ec+bM\n0aRJkyRJf//733XHHXfYOgwAAFxeAUV/0aJF2rVrlw4cOKCwsDDNnj1b48ePt3sbAAC4jAJ+Lt2J\nEydq4sSJdm4BAAA2+tYvrQsAALomog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhrA1+mfO\nnNFtt92mo0eP6rPPPlNycrJSUlKUkZGh1tZWSf/zvP7Tpk3TzJkztXfvXknSuXPn9Pvf/14pKSm6\n//779dVXX0mSKisrNWPGDN1zzz3asGGDndMBAOh2bIu+z+fTypUr1bNnT0nS2rVrlZaWpoKCAlmW\npT179qi2tlb5+fkqKirS5s2blZubq6amJhUWFio+Pl4FBQWaOnWqNm7cKEnKyMhQTk6OCgsLVVVV\npcOHD9s1HwCAbse26GdnZ+uee+5R//79JUmHDh3S6NGjJUlJSUkqKyvTwYMHNWLECEVERMjlcik2\nNlZHjhxRRUWFxo4d6z93//798nq9ampqUmxsrBwOh9xut8rKyuyaDwBAtxPw0/B+Gzt27FBMTIzG\njh2rZ599VpJkWZYcDockKTIyUh6PR16vVy6Xy/9+kZGR8nq9bY6ff67T6Wxz7okTJzrc0rdvb4WF\nhV7OmwfgIvr1c3V8Ev4rgf0Zn7V9x7cRyOaTargCSwIXyOaaK7AjUN/ma8+W6G/fvl0Oh0P79+/X\nhx9+qPT0dP/P5SWpvr5eUVFRcjqdqq+vb3Pc5XK1Od7euVFRUR1uqavrXF8AQHdVW+sJ9oRuryv+\nGbPZfhfbe6lvBGy5e3/r1q3asmWL8vPzNXToUGVnZyspKUnl5eWSpNLSUiUmJiohIUEVFRVqbGyU\nx+PR0aNHFR8fr5EjR6qkpMR/7qhRo+R0OhUeHq7jx4/Lsizt27dPiYmJdswHAKBbsuVK/2LS09O1\nYsUK5ebmKi4uThMmTFBoaKhmzZqllJQUWZalBQsWqEePHkpOTlZ6erqSk5MVHh6unJwcSdKqVau0\ncOFCtbS0yO12a9iwYVdqPgAAXZ7t0c/Pz/f/esuWLRe8febMmZo5c2abY7169dLTTz99wbnDhw9X\ncXHx5R8JAIABeHIeAAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAM\nQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAA\nQ4QFewAAc/yfkp3BntDG/73tl8GeAFxRXOkDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGI\nPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAI\nog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAY\ngugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGCIMDs+qM/n09KlS1VdXa2mpialpqbq+uuv1+LFi+Vw\nODRkyBBlZGQoJCRExcXFKioqUlhYmFJTUzVu3DidO3dOixYt0pkzZxQZGans7GzFxMSosrJSWVlZ\nCg0Nldvt1vz58+2YDwBAt2TLlf7OnTsVHR2tgoICPffcc8rMzNTatWuVlpamgoICWZalPXv2qLa2\nVvn5+SoqKtLmzZuVm5urpqYmFRYWKj4+XgUFBZo6dao2btwoScrIyFBOTo4KCwtVVVWlw4cP2zEf\nAIBuyZboT5w4UQ8//LAkybIshYaG6tChQxo9erQkKSkpSWVlZTp48KBGjBihiIgIuVwuxcb
G6siR\nI6qoqNDYsWP95+7fv19er1dNTU2KjY2Vw+GQ2+1WWVmZHfMBAOiWbLl7PzIyUpLk9Xr10EMPKS0t\nTdnZ2XI4HP63ezweeb1euVyuNu/n9XrbHD//XKfT2ebcEydOdLilb9/eCgsLvZw3D8BF9Ovn6vik\nTqarbQ5s71nbd3wbgWw+qYYrsCRwgWyuuQI7AvVt/h7bEn1J+vzzzzVv3jylpKRo8uTJevLJJ/1v\nq6+vV1RUlJxOp+rr69scd7lcbY63d25UVFSHO+rqOtcXANBd1dZ6gj3hW+tqm7vaXonNV8LF9l7q\nGwFb7t4/ffq05s6dq0WLFmn69OmSpBtvvFHl5eWSpNLSUiUmJiohIUEVFRVqbGyUx+PR0aNHFR8f\nr5EjR6qkpMR/7qhRo+R0OhUeHq7jx4/Lsizt27dPiYmJdswHAKBbsuVKPy8vT9988402btzofxDe\nsmXLtGbNGuXm5iouLk4TJkxQaGioZs2apZSUFFmWpQULFqhHjx5KTk5Wenq6kpOTFR4erpycHEnS\nqlWrtHDhQrW0tMjtdmvYsGF2zAcAoFuyJfrLly/X8uXLLzi+ZcuWC47NnDlTM2fObHOsV69eevrp\npy84d/jw4SouLr58QwEAMAhPzgMAgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugD\nAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6\nAAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGI\nPgAAhiD6AAAYgugDAGAIog8AgCHCgj0A6Cz+tHd6sCf4zRv3crAnAOiGuNIHAMAQRB8AAEMQfQAA\nDEH0AQAwBNEHAMAQRB8AAEMQfQAADMG/04ctSv7fjGBPaOO2//VSsCcAQNBxpQ8AgCGIPgAAhiD6\nAAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgn+n3wWcKf7fwZ7Qxvdmbgn2BADAd8CVPgAAhiD6AAAY\nwsy7919+NdgL2po+JdgLAAAG4EofAABDdLkr/dbWVj3++OP66KOPFBERoTVr1mjgwIHBngUAQKfX\n5a70d+/eraamJm3btk2PPvqo/vCHPwR7EgAAXUKXi35FRYXGjh0rSRo+fLg++OCDIC8CAKBrcFiW\nZQV7xLexbNky3XnnnbrtttskSbfffrt2796tsLAu95MKAACuqC53pe90OlVfX+//fWtrK8EHACAA\nXS76I0eOVGlpqSSpsrJS8fHxQV4EAEDX0OXu3v/3o/f/+c9/yrIsPfHEE7ruuuuCPQsAgE6vy0Uf\nAAB8N13u7n0AAPDdEH0AAAzBw96/o6qqKj311FPKz88P9pQO+Xw+LV26VNXV1WpqalJqaqruuOOO\nYM9qV0tLi5YvX65jx47J4XBo1apVXeJBm2fOnNG0adP0/PPPd4nHmvzqV7+S0+mUJA0YMEBr164N\n8qL2bdq0SW+++aZ8Pp+Sk5M1Y8aMYE9q144dO/TXv/5VktTY2KgPP/xQb7/9tqKiooK87NJ8Pp8W\nL16s6upqhYSEKDMzs1P/XW5qatKSJUt04sQJOZ1OrVy5UoMGDQr2rEs6vx2fffaZFi9eLIfDoSFD\nhigjI0MhIfZeixP97+Avf/mLdu7cqV69egV7SkB27typ6OhoPfnkk/r66681derUTh/9vXv3SpKK\niopUXl6udevW6c9//nOQV7XP5/Np5cqV6tmzZ7CnBKSxsVGWZXWJb1wlqby8XO+9954KCwvV0NCg\n559/PtiTOjRt2jRNmzZNkrRq1SrdddddnTr4klRSUqLm5mYVFRXp7bff1vr16/XMM88Ee9YlFRcX\nq3fv3iouLta//vUvZWZmavPmzcGedVH/2Y61a9cqLS1
NY8aM0cqVK7Vnzx79/Oc/t3UDd+9/B7Gx\nsZ36i+A/TZw4UQ8//LAkybIshYaGBnlRx8aPH6/MzExJ0qlTpzr9/yglKTs7W/fcc4/69+8f7CkB\nOXLkiBoaGjR37lzNnj1blZWVwZ7Urn379ik+Pl7z5s3Tgw8+qNtvvz3YkwL2/vvv65NPPtHdd98d\n7CkdGjx4sFpaWtTa2iqv19vpnwflk08+UVJSkiQpLi5OR48eDfKiS/vPdhw6dEijR4+WJCUlJams\nrMz2DZ37v2YnNWHCBJ08eTLYMwIWGRkpSfJ6vXrooYeUlpYW5EWBCQsLU3p6ut544w09/fTTwZ7T\nrh07digmJkZjx47Vs88+G+w5AenZs6fuu+8+zZgxQ59++qnuv/9+7dq1q9P+T76urk6nTp1SXl6e\nTp48qdTUVO3atUsOhyPY0zq0adMmzZs3L9gzAtK7d29VV1dr0qRJqqurU15eXrAntWvo0KHau3ev\nxo8fr6qqKtXU1KilpaVTXtz8Zzssy/L//Y2MjJTH47F9A1f6hvj88881e/ZsTZkyRZMnTw72nIBl\nZ2fr9ddf14oVK3T27Nlgz7mk7du3q6ysTLNmzdKHH36o9PR01dbWBntWuwYPHqxf/vKXcjgcGjx4\nsKKjozv15ujoaLndbkVERCguLk49evTQV199FexZHfrmm2907Ngx3XLLLcGeEpAXXnhBbrdbr7/+\nul599VUtXrxYjY2NwZ51SXfddZecTqdSUlL0xhtv6Mc//nGnDP7FnP/z+/r6+ityjybRN8Dp06c1\nd+5cLVq0SNOnTw/2nIC88sor2rRpkySpV69ecjgctj/A5b+xdetWbdmyRfn5+Ro6dKiys7PVr1+/\nYM9q18svv+x/lcqamhp5vd5OvXnUqFF66623ZFmWampq1NDQoOjo6GDP6tCBAwd06623BntGwKKi\nouRyuSRJffr0UXNzs1paWoK86tLef/993XrrrSosLNTEiRN17bXXBntSwG688UaVl5dLkkpLS5WY\nmGj75+yc9+PhssrLy9M333yjjRs3auPGjZL+5wElnfkBZ3feeaeWLFmie++9V83NzVq6dGmn3tsV\nTZ8+XUuWLFFycrIcDoeeeOKJTnvXviSNGzdOBw4c0PTp02VZllauXNklruiOHTumAQMGBHtGwH79\n619r6dKlSklJkc/n04IFC9S7d+9gz7qkgQMH6o9//KPy8vLkcrmUlZUV7EkBS09P14oVK5Sbm6u4\nuDhNmDDB9s/JM/IBAGCIznt/KQAAuKyIPgAAhiD6AAAYgugDAGAIog8AgCGIPmCQ8vJyzZo167/+\nOLNmzfL/++J/O3nypG666SZNmTLF/yRQP/vZzwJ6NsXzN02ZMuW/3gfg4jrvP8oF0OX0799fr776\nqv/3NTU1mjBhgn7xi1+0+0pt7777rv/X578/gMuL6AOGqaur03333acvv/xSCQkJysjI8L+aWmtr\nq6699lqtXr1aV111lSorK5WVlaXGxkb17dtXq1ev1sCBA/0f68yZM5ozZ47S0tJ0ww03XPC5amtr\nZVmWIiMj1dzcrMcff1wff/yxTp8+rcGDB2vDhg166qmnJEkzZszQSy+9pB/96Ef66KOP9Mwzz6im\npkafffaZqqurNWPGDKWmpsrn8ykjI0MVFRW6+uqr5XA49Lvf/U5jxoy5Yn+GQFdF9AHDnDx5Uhs2\nbNDAgQO1YMECPfvss9q2bZsKCws1YMAAPffcc1q9erWeeuopPfLII1q/fr0SEhL0t7/9TY888oi2\nb98uSfJ4PPrtb3+r+fPna/z48Tp58qS+/PJLTZkyRY2Njaqrq9NPfvITbdiwQd///vd14MABhYeH\na9u2bWptbdWcOXNUUlKi5cuXKz8/Xy+99NIFWz/66CNt3bpVHo9H48eP17333qtXX31VDQ0N2rVr\nl06dOtWlXksCCDZ
+pg8YJjExUYMGDZLD4dDkyZP14osvKiEhwf9UsXfffbfeeecdffrpp4qKilJC\nQoIkadKkSTp+/Lj/lcAyMjLU3NysO++80/+x/333/muvvaYpU6bI5/P5X2jm5ptvVkpKirZu3aqs\nrCx9+umnHb6I0pgxYxQREaHvfe97io6Olsfj0dtvv63JkyfL4XDohz/8YZd6Xnsg2Ig+YJjzn1//\n/Jf2PP9Yc3OzWltbL3hfy7L8L75y//33KyYmRoWFhRecFxISoscee0xnzpzR888/L0nas2ePFi5c\nqJ49e2ratGm6+eab1dGzgPfo0cP/a4fDIcuyFBoaetFtADpG9AHDVFRU6NSpU2ptbdUrr7yi3/zm\nN6qqqvK/zve2bds0ZswYxcXF6euvv9bBgwclSa+99pquueYa/yvbDR06VBkZGdqwYYNqamou+Dxh\nYWF67LHHlJeXp9raWu3fv1+TJk3SXXfdpauuukoHDhzwfwMRGhqq5ubmgPb/9Kc/1WuvveZ/tb13\n3333gm9cAFwcP9MHDHP99ddr6dKlqq2t1S233KL77rtP119/vebPny+fz6drrrlGWVlZioiI0Lp1\n65SZmamGhgb16dNH69ata/OxBg0apHvvvVerV6/WkiVLLvhcSUlJGj58uNavX6/Zs2dr4cKF2rVr\nlyIiIjR8+HD/Nxp33HGHpkyZoh07dnS4f+bMmTpy5IgmT56sfv366ZprruEVGIEA8Sp7ALqUf/zj\nH7IsS+PGjZPH49HUqVO1fft2/z0QAC6N6APoUk6cOKHHHnvM/yDAuXPn8oQ+QICIPgAAhuCBfAAA\nGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBgiP8PjIYQhxu3xXEAAAAASUVORK5CYII=\n", 2009 | "text/plain": [ 2010 | "" 2011 | ] 2012 | }, 2013 | "metadata": {}, 2014 | "output_type": "display_data" 2015 | } 2016 | ], 2017 | "source": [ 2018 | "#plotting count of bookRating\n", 2019 | "sns.countplot(data=ratings_explicit , x='bookRating')\n", 2020 | "plt.show()\n", 2021 | "#It can be seen that higher ratings are more common amongst users and rating 8 has been rated highest number of times" 2022 | ] 2023 | }, 2024 | { 2025 | "cell_type": "markdown", 2026 | "metadata": {}, 2027 | "source": [ 2028 | "**Simple Popularity Based Recommendation System**" 2029 | ] 2030 | }, 2031 | { 2032 | "cell_type": "code", 2033 | "execution_count": 232, 2034 | "metadata": { 2035 | "scrolled": true 2036 | }, 2037 | "outputs": [ 2038 | { 2039 | "name": "stdout", 2040 | "output_type": "stream", 2041 | "text": [ 2042 | "Following books are recommended\n" 2043 | ] 2044 | }, 2045 | { 2046 | "data": { 2047 | "text/html": [ 2048 | "

\n", 2049 | "\n", 2062 | "\n", 2063 | " \n", 2064 | " \n", 2065 | " \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | " \n", 2096 | " \n", 2097 | " \n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | "

	bookRating	ISBN	bookTitle	bookAuthor	yearOfPublication	publisher
408	5787	0316666343	The Lovely Bones: A Novel	Alice Sebold	2002	Little, Brown
748	4108	0385504209	The Da Vinci Code	Dan Brown	2003	Doubleday
522	3134	0312195516	The Red Tent (Bestselling Backlist)	Anita Diamant	1998	Picador USA
2143	2798	059035342X	Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))	J. K. Rowling	1999	Arthur A. Levine Books
356	2595	0142001740	The Secret Life of Bees	Sue Monk Kidd	2003	Penguin Books
26	2551	0971880107	Wild Animus	Rich Shapero	2004	Too Far
1105	2524	0060928336	Divine Secrets of the Ya-Ya Sisterhood: A Novel	Rebecca Wells	1997	Perennial
706	2402	0446672211	Where the Heart Is (Oprah's Book Club (Paperback))	Billie Letts	1998	Warner Books
231	2219	0452282152	Girl with a Pearl Earring	Tracy Chevalier	2001	Plume Books
118	2179	0671027360	Angels & Demons	Dan Brown	2001	Pocket Star

\n", 2167 | "

" 2168 | ], 2169 | "text/plain": [ 2170 | " bookRating ISBN \\\n", 2171 | "408 5787 0316666343 \n", 2172 | "748 4108 0385504209 \n", 2173 | "522 3134 0312195516 \n", 2174 | "2143 2798 059035342X \n", 2175 | "356 2595 0142001740 \n", 2176 | "26 2551 0971880107 \n", 2177 | "1105 2524 0060928336 \n", 2178 | "706 2402 0446672211 \n", 2179 | "231 2219 0452282152 \n", 2180 | "118 2179 0671027360 \n", 2181 | "\n", 2182 | " bookTitle \\\n", 2183 | "408 The Lovely Bones: A Novel \n", 2184 | "748 The Da Vinci Code \n", 2185 | "522 The Red Tent (Bestselling Backlist) \n", 2186 | "2143 Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) \n", 2187 | "356 The Secret Life of Bees \n", 2188 | "26 Wild Animus \n", 2189 | "1105 Divine Secrets of the Ya-Ya Sisterhood: A Novel \n", 2190 | "706 Where the Heart Is (Oprah's Book Club (Paperback)) \n", 2191 | "231 Girl with a Pearl Earring \n", 2192 | "118 Angels & Demons \n", 2193 | "\n", 2194 | " bookAuthor yearOfPublication publisher \n", 2195 | "408 Alice Sebold 2002 Little, Brown \n", 2196 | "748 Dan Brown 2003 Doubleday \n", 2197 | "522 Anita Diamant 1998 Picador USA \n", 2198 | "2143 J. K. Rowling 1999 Arthur A. 
Levine Books \n", 2199 | "356 Sue Monk Kidd 2003 Penguin Books \n", 2200 | "26 Rich Shapero 2004 Too Far \n", 2201 | "1105 Rebecca Wells 1997 Perennial \n", 2202 | "706 Billie Letts 1998 Warner Books \n", 2203 | "231 Tracy Chevalier 2001 Plume Books \n", 2204 | "118 Dan Brown 2001 Pocket Star " 2205 | ] 2206 | }, 2207 | "execution_count": 232, 2208 | "metadata": {}, 2209 | "output_type": "execute_result" 2210 | } 2211 | ], 2212 | "source": [ 2213 | "#At this point , a simple popularity based recommendation system can be built based on count of user ratings for different books\n", 2214 | "ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['bookRating'].sum())\n", 2215 | "top10 = ratings_count.sort_values('bookRating', ascending = False).head(10)\n", 2216 | "print \"Following books are recommended\"\n", 2217 | "top10.merge(books, left_index = True, right_on = 'ISBN')\n", 2218 | "\n", 2219 | "#Given below are top 10 recommendations based on popularity. It is evident that books authored by J.K. 
Rowling are among the most popular" 2220 | ] 2221 | }, 2222 | { 2223 | "cell_type": "code", 2224 | "execution_count": 233, 2225 | "metadata": { 2226 | "collapsed": true 2227 | }, 2228 | "outputs": [], 2229 | "source": [ 2230 | "#Similarly segregating users who have given explicit ratings from 1-10 and those whose implicit behavior was tracked\n", 2231 | "users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]\n", 2232 | "users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]" 2233 | ] 2234 | }, 2235 | { 2236 | "cell_type": "code", 2237 | "execution_count": 234, 2238 | "metadata": {}, 2239 | "outputs": [ 2240 | { 2241 | "name": "stdout", 2242 | "output_type": "stream", 2243 | "text": [ 2244 | "(278858, 3)\n", 2245 | "(68091, 3)\n", 2246 | "(52451, 3)\n" 2247 | ] 2248 | } 2249 | ], 2250 | "source": [ 2251 | "#checking shapes\n", 2252 | "print users.shape\n", 2253 | "print users_exp_ratings.shape\n", 2254 | "print users_imp_ratings.shape" 2255 | ] 2256 | }, 2257 | { 2258 | "cell_type": "markdown", 2259 | "metadata": {}, 2260 | "source": [ 2261 | "**Collaborative Filtering Based Recommendation Systems**" 2262 | ] 2263 | }, 2264 | { 2265 | "cell_type": "code", 2266 | "execution_count": 235, 2267 | "metadata": { 2268 | "collapsed": true 2269 | }, 2270 | "outputs": [], 2271 | "source": [ 2272 | "#To cope with the computing power I have and to reduce the dataset size, I am considering only users who have rated at least 100 books\n", 2273 | "#and books which have at least 100 ratings (NOTE: the second filter below counts 'bookRating' values, not 'ISBN', so it keeps rating values that occur at least 100 times)\n", 2274 | "counts1 = ratings_explicit['userID'].value_counts()\n", 2275 | "ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1 >= 100].index)]\n", 2276 | "counts = ratings_explicit['bookRating'].value_counts()\n", 2277 | "ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(counts[counts >= 100].index)]" 2278 | ] 2279 | }, 2280 | { 2281 | "cell_type": "code", 2282 | "execution_count": 236, 2283 | "metadata": {}, 2284 | "outputs": [ 
2285 | { 2286 | "name": "stdout", 2287 | "output_type": "stream", 2288 | "text": [ 2289 | "(449, 66574)\n" 2290 | ] 2291 | }, 2292 | { 2293 | "data": { 2294 | "text/html": [ 2295 | "

\n", 2296 | "\n", 2309 | "\n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | " \n", 2435 | " \n", 2436 | " \n", 2437 | " \n", 2438 | " \n", 2439 | " \n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | " \n", 2446 | " \n", 2447 | " \n", 2448 | " \n", 2449 | " \n", 2450 | " 
\n", 2451 | " \n", 2452 | " \n", 2453 | " \n", 2454 | " \n", 2455 | " \n", 2456 | " \n", 2457 | " \n", 2458 | " \n", 2459 | " \n", 2460 | " \n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | " \n", 2467 | " \n", 2468 | " \n", 2469 | " \n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " \n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | "

ISBN	0000913154	0001046438	000104687X	0001047213	0001047973	000104799X	0001048082	0001053736	0001053744	0001055607	...	B000092Q0A	B00009EF82	B00009NDAN	B0000DYXID	B0000T6KHI	B0000VZEJQ	B0000X8HIE	B00013AX9E	B0001I1KOG	B000234N3A
userID
2033	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2110	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2276	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4017	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4385	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

\n", 2483 | "

5 rows × 66574 columns

\n", 2484 | "

" 2485 | ], 2486 | "text/plain": [ 2487 | "ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 \\\n", 2488 | "userID \n", 2489 | "2033 NaN NaN NaN NaN NaN \n", 2490 | "2110 NaN NaN NaN NaN NaN \n", 2491 | "2276 NaN NaN NaN NaN NaN \n", 2492 | "4017 NaN NaN NaN NaN NaN \n", 2493 | "4385 NaN NaN NaN NaN NaN \n", 2494 | "\n", 2495 | "ISBN 000104799X 0001048082 0001053736 0001053744 0001055607 \\\n", 2496 | "userID \n", 2497 | "2033 NaN NaN NaN NaN NaN \n", 2498 | "2110 NaN NaN NaN NaN NaN \n", 2499 | "2276 NaN NaN NaN NaN NaN \n", 2500 | "4017 NaN NaN NaN NaN NaN \n", 2501 | "4385 NaN NaN NaN NaN NaN \n", 2502 | "\n", 2503 | "ISBN ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID \\\n", 2504 | "userID ... \n", 2505 | "2033 ... NaN NaN NaN NaN \n", 2506 | "2110 ... NaN NaN NaN NaN \n", 2507 | "2276 ... NaN NaN NaN NaN \n", 2508 | "4017 ... NaN NaN NaN NaN \n", 2509 | "4385 ... NaN NaN NaN NaN \n", 2510 | "\n", 2511 | "ISBN B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A \n", 2512 | "userID \n", 2513 | "2033 NaN NaN NaN NaN NaN NaN \n", 2514 | "2110 NaN NaN NaN NaN NaN NaN \n", 2515 | "2276 NaN NaN NaN NaN NaN NaN \n", 2516 | "4017 NaN NaN NaN NaN NaN NaN \n", 2517 | "4385 NaN NaN NaN NaN NaN NaN \n", 2518 | "\n", 2519 | "[5 rows x 66574 columns]" 2520 | ] 2521 | }, 2522 | "execution_count": 236, 2523 | "metadata": {}, 2524 | "output_type": "execute_result" 2525 | } 2526 | ], 2527 | "source": [ 2528 | "#Generating ratings matrix from explicit ratings table\n", 2529 | "ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')\n", 2530 | "userID = ratings_matrix.index\n", 2531 | "ISBN = ratings_matrix.columns\n", 2532 | "print(ratings_matrix.shape)\n", 2533 | "ratings_matrix.head()\n", 2534 | "#Notice that most of the values are NaN (undefined) implying absence of ratings" 2535 | ] 2536 | }, 2537 | { 2538 | "cell_type": "code", 2539 | "execution_count": 237, 2540 | "metadata": {}, 2541 | "outputs": [ 2542 | { 2543 
| "name": "stdout", 2544 | "output_type": "stream", 2545 | "text": [ 2546 | "449 66574\n" 2547 | ] 2548 | } 2549 | ], 2550 | "source": [ 2551 | "n_users = ratings_matrix.shape[0] #considering only those users who gave explicit ratings\n", 2552 | "n_books = ratings_matrix.shape[1]\n", 2553 | "print n_users, n_books" 2554 | ] 2555 | }, 2556 | { 2557 | "cell_type": "code", 2558 | "execution_count": 238, 2559 | "metadata": { 2560 | "collapsed": true 2561 | }, 2562 | "outputs": [], 2563 | "source": [ 2564 | "#since NaNs cannot be handled by training algorithms, replacing these by 0, which indicates absence of ratings\n", 2565 | "#setting data type\n", 2566 | "ratings_matrix.fillna(0, inplace = True)\n", 2567 | "ratings_matrix = ratings_matrix.astype(np.int32)" 2568 | ] 2569 | }, 2570 | { 2571 | "cell_type": "code", 2572 | "execution_count": 239, 2573 | "metadata": {}, 2574 | "outputs": [ 2575 | { 2576 | "data": { 2577 | "text/html": [ 2578 | "

\n", 2579 | "\n", 2592 | "\n", 2593 | " \n", 2594 | " \n", 2595 | " \n", 2596 | " \n", 2597 | " \n", 2598 | " \n", 2599 | " \n", 2600 | " \n", 2601 | " \n", 2602 | " \n", 2603 | " \n", 2604 | " \n", 2605 | " \n", 2606 | " \n", 2607 | " \n", 2608 | " \n", 2609 | " \n", 2610 | " \n", 2611 | " \n", 2612 | " \n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | " \n", 2656 | " \n", 2657 | " \n", 2658 | " \n", 2659 | " \n", 2660 | " \n", 2661 | " \n", 2662 | " \n", 2663 | " \n", 2664 | " \n", 2665 | " \n", 2666 | " \n", 2667 | " \n", 2668 | " \n", 2669 | " \n", 2670 | " \n", 2671 | " \n", 2672 | " \n", 2673 | " \n", 2674 | " \n", 2675 | " \n", 2676 | " \n", 2677 | " \n", 2678 | " \n", 2679 | " \n", 2680 | " \n", 2681 | " \n", 2682 | " \n", 2683 | " \n", 2684 | " \n", 2685 | " \n", 2686 | " \n", 2687 | " \n", 2688 | " \n", 2689 | " \n", 2690 | " \n", 2691 | " \n", 2692 | " \n", 2693 | " \n", 2694 | " \n", 2695 | " \n", 2696 | " \n", 2697 | " \n", 2698 | " \n", 2699 | " \n", 2700 | " \n", 2701 | " \n", 2702 | " \n", 2703 | " \n", 2704 | " \n", 2705 | " \n", 2706 | " \n", 2707 | " \n", 2708 | " \n", 2709 | " \n", 2710 | " \n", 2711 | " \n", 2712 | " \n", 2713 | " \n", 2714 | " \n", 2715 | " \n", 2716 | " \n", 2717 | " \n", 2718 | " \n", 2719 | " \n", 2720 | " \n", 2721 | " \n", 2722 | " \n", 2723 | " \n", 2724 | " \n", 2725 | " \n", 2726 | " \n", 2727 | " \n", 2728 | " \n", 2729 | " \n", 2730 | " \n", 2731 | " \n", 2732 | " \n", 2733 | " 
\n", 2734 | " \n", 2735 | " \n", 2736 | " \n", 2737 | " \n", 2738 | " \n", 2739 | " \n", 2740 | " \n", 2741 | " \n", 2742 | " \n", 2743 | " \n", 2744 | " \n", 2745 | " \n", 2746 | " \n", 2747 | " \n", 2748 | " \n", 2749 | " \n", 2750 | " \n", 2751 | " \n", 2752 | " \n", 2753 | " \n", 2754 | " \n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 | " \n", 2763 | " \n", 2764 | " \n", 2765 | "

ISBN	0000913154	0001046438	000104687X	0001047213	0001047973	000104799X	0001048082	0001053736	0001053744	0001055607	...	B000092Q0A	B00009EF82	B00009NDAN	B0000DYXID	B0000T6KHI	B0000VZEJQ	B0000X8HIE	B00013AX9E	B0001I1KOG	B000234N3A
userID
2033	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2110	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2276	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4017	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4385	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

\n", 2766 | "

5 rows × 66574 columns

\n", 2767 | "

" 2768 | ], 2769 | "text/plain": [ 2770 | "ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 \\\n", 2771 | "userID \n", 2772 | "2033 0 0 0 0 0 \n", 2773 | "2110 0 0 0 0 0 \n", 2774 | "2276 0 0 0 0 0 \n", 2775 | "4017 0 0 0 0 0 \n", 2776 | "4385 0 0 0 0 0 \n", 2777 | "\n", 2778 | "ISBN 000104799X 0001048082 0001053736 0001053744 0001055607 \\\n", 2779 | "userID \n", 2780 | "2033 0 0 0 0 0 \n", 2781 | "2110 0 0 0 0 0 \n", 2782 | "2276 0 0 0 0 0 \n", 2783 | "4017 0 0 0 0 0 \n", 2784 | "4385 0 0 0 0 0 \n", 2785 | "\n", 2786 | "ISBN ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID \\\n", 2787 | "userID ... \n", 2788 | "2033 ... 0 0 0 0 \n", 2789 | "2110 ... 0 0 0 0 \n", 2790 | "2276 ... 0 0 0 0 \n", 2791 | "4017 ... 0 0 0 0 \n", 2792 | "4385 ... 0 0 0 0 \n", 2793 | "\n", 2794 | "ISBN B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A \n", 2795 | "userID \n", 2796 | "2033 0 0 0 0 0 0 \n", 2797 | "2110 0 0 0 0 0 0 \n", 2798 | "2276 0 0 0 0 0 0 \n", 2799 | "4017 0 0 0 0 0 0 \n", 2800 | "4385 0 0 0 0 0 0 \n", 2801 | "\n", 2802 | "[5 rows x 66574 columns]" 2803 | ] 2804 | }, 2805 | "execution_count": 239, 2806 | "metadata": {}, 2807 | "output_type": "execute_result" 2808 | } 2809 | ], 2810 | "source": [ 2811 | "#checking first few rows\n", 2812 | "ratings_matrix.head(5)" 2813 | ] 2814 | }, 2815 | { 2816 | "cell_type": "code", 2817 | "execution_count": 240, 2818 | "metadata": {}, 2819 | "outputs": [ 2820 | { 2821 | "name": "stdout", 2822 | "output_type": "stream", 2823 | "text": [ 2824 | "The sparsity level of Book Crossing dataset is 99.9977218411 %\n" 2825 | ] 2826 | } 2827 | ], 2828 | "source": [ 2829 | "#rechecking the sparsity\n", 2830 | "sparsity=1.0-len(ratings_explicit)/float(users_exp_ratings.shape[0]*n_books)\n", 2831 | "print 'The sparsity level of Book Crossing dataset is ' + str(sparsity*100) + ' %'" 2832 | ] 2833 | }, 2834 | { 2835 | "cell_type": "markdown", 2836 | "metadata": {}, 2837 | "source": [ 2838 | "**Training our recommendation 
system**" 2839 | ] 2840 | }, 2841 | { 2842 | "cell_type": "code", 2843 | "execution_count": 241, 2844 | "metadata": { 2845 | "collapsed": true 2846 | }, 2847 | "outputs": [], 2848 | "source": [ 2849 | "#setting global variables\n", 2850 | "global metric,k\n", 2851 | "k=10\n", 2852 | "metric='cosine'" 2853 | ] 2854 | }, 2855 | { 2856 | "cell_type": "markdown", 2857 | "metadata": {}, 2858 | "source": [ 2859 | "**User-based Recommendation System**" 2860 | ] 2861 | }, 2862 | { 2863 | "cell_type": "code", 2864 | "execution_count": 242, 2865 | "metadata": { 2866 | "collapsed": true 2867 | }, 2868 | "outputs": [], 2869 | "source": [ 2870 | "#This function finds k similar users given the user_id and ratings matrix \n", 2871 | "#These similarities are same as obtained via using pairwise_distances\n", 2872 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n", 2873 | " similarities=[]\n", 2874 | " indices=[]\n", 2875 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') \n", 2876 | " model_knn.fit(ratings)\n", 2877 | " loc = ratings.index.get_loc(user_id)\n", 2878 | " distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)\n", 2879 | " similarities = 1-distances.flatten()\n", 2880 | " \n", 2881 | " return similarities,indices" 2882 | ] 2883 | }, 2884 | { 2885 | "cell_type": "code", 2886 | "execution_count": 243, 2887 | "metadata": { 2888 | "collapsed": true 2889 | }, 2890 | "outputs": [], 2891 | "source": [ 2892 | "#This function predicts rating for specified user-item combination based on user-based approach\n", 2893 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n", 2894 | " prediction=0\n", 2895 | " user_loc = ratings.index.get_loc(user_id)\n", 2896 | " item_loc = ratings.columns.get_loc(item_id)\n", 2897 | " similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity\n", 2898 | " mean_rating = 
ratings.iloc[user_loc,:].mean() #mean of this user's row of the ratings matrix\n", 2899 | " sum_wt = np.sum(similarities)-1\n", 2900 | " product=1\n", 2901 | " wtd_sum = 0 \n", 2902 | " \n", 2903 | " for i in range(0, len(indices.flatten())):\n", 2904 | " if indices.flatten()[i] == user_loc:\n", 2905 | " continue;\n", 2906 | " else: \n", 2907 | " ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])\n", 2908 | " product = ratings_diff * (similarities[i])\n", 2909 | " wtd_sum = wtd_sum + product\n", 2910 | " \n", 2911 | " #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings\n", 2912 | " #which are meant to be handled below (NOTE: this check runs before prediction is assigned on the later line, so it does not affect the returned value)\n", 2913 | " if prediction <= 0:\n", 2914 | " prediction = 1 \n", 2915 | " elif prediction >10:\n", 2916 | " prediction = 10\n", 2917 | " \n", 2918 | " prediction = int(round(mean_rating + (wtd_sum/sum_wt)))\n", 2919 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)\n", 2920 | "\n", 2921 | " return prediction" 2922 | ] 2923 | }, 2924 | { 2925 | "cell_type": "code", 2926 | "execution_count": 244, 2927 | "metadata": {}, 2928 | "outputs": [ 2929 | { 2930 | "name": "stdout", 2931 | "output_type": "stream", 2932 | "text": [ 2933 | "\n", 2934 | "Predicted rating for user 11676 -> item 0001056107: 2\n" 2935 | ] 2936 | } 2937 | ], 2938 | "source": [ 2939 | "predict_userbased(11676,'0001056107',ratings_matrix);" 2940 | ] 2941 | }, 2942 | { 2943 | "cell_type": "markdown", 2944 | "metadata": {}, 2945 | "source": [ 2946 | "**Item-based Recommendation Systems**" 2947 | ] 2948 | }, 2949 | { 2950 | "cell_type": "code", 2951 | "execution_count": 245, 2952 | "metadata": { 2953 | "collapsed": true 2954 | }, 2955 | "outputs": [], 2956 | "source": [ 2957 | "#This function finds k similar items given the item_id and ratings matrix\n", 2958 | "\n", 2959 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n", 2960 
| " similarities=[]\n", 2961 | " indices=[]\n", 2962 | " ratings=ratings.T\n", 2963 | " loc = ratings.index.get_loc(item_id)\n", 2964 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n", 2965 | " model_knn.fit(ratings)\n", 2966 | " \n", 2967 | " distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)\n", 2968 | " similarities = 1-distances.flatten()\n", 2969 | "\n", 2970 | " return similarities,indices" 2971 | ] 2972 | }, 2973 | { 2974 | "cell_type": "code", 2975 | "execution_count": 246, 2976 | "metadata": { 2977 | "collapsed": true 2978 | }, 2979 | "outputs": [], 2980 | "source": [ 2981 | "similarities,indices=findksimilaritems('0001056107',ratings_matrix)" 2982 | ] 2983 | }, 2984 | { 2985 | "cell_type": "code", 2986 | "execution_count": 247, 2987 | "metadata": { 2988 | "collapsed": true 2989 | }, 2990 | "outputs": [], 2991 | "source": [ 2992 | "#This function predicts the rating for specified user-item combination based on item-based approach\n", 2993 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n", 2994 | " prediction= wtd_sum =0\n", 2995 | " user_loc = ratings.index.get_loc(user_id)\n", 2996 | " item_loc = ratings.columns.get_loc(item_id)\n", 2997 | " similarities, indices=findksimilaritems(item_id, ratings) #similar items; metric and k are not forwarded here, so findksimilaritems uses its own defaults\n", 2998 | " sum_wt = np.sum(similarities)-1\n", 2999 | " product=1\n", 3000 | " for i in range(0, len(indices.flatten())):\n", 3001 | " if indices.flatten()[i] == item_loc:\n", 3002 | " continue;\n", 3003 | " else:\n", 3004 | " product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])\n", 3005 | " wtd_sum = wtd_sum + product \n", 3006 | " prediction = int(round(wtd_sum/sum_wt))\n", 3007 | " \n", 3008 | " #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings\n", 3009 | " #which are handled here as below //code has been validated 
without the code snippet below, below snippet is to avoid negative\n", 3010 | " #predictions which might arise in case of very sparse datasets when using correlation metric\n", 3011 | " if prediction <= 0:\n", 3012 | " prediction = 1 \n", 3013 | " elif prediction >10:\n", 3014 | " prediction = 10\n", 3015 | "\n", 3016 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n", 3017 | " \n", 3018 | " return prediction" 3019 | ] 3020 | }, 3021 | { 3022 | "cell_type": "code", 3023 | "execution_count": 248, 3024 | "metadata": {}, 3025 | "outputs": [ 3026 | { 3027 | "name": "stdout", 3028 | "output_type": "stream", 3029 | "text": [ 3030 | "\n", 3031 | "Predicted rating for user 11676 -> item 0001056107: 1\n" 3032 | ] 3033 | } 3034 | ], 3035 | "source": [ 3036 | "prediction = predict_itembased(11676,'0001056107',ratings_matrix)" 3037 | ] 3038 | }, 3039 | { 3040 | "cell_type": "code", 3041 | "execution_count": 249, 3042 | "metadata": { 3043 | "collapsed": true 3044 | }, 3045 | "outputs": [], 3046 | "source": [ 3047 | "@contextmanager\n", 3048 | "def suppress_stdout():\n", 3049 | " with open(os.devnull, \"w\") as devnull:\n", 3050 | " old_stdout = sys.stdout\n", 3051 | " sys.stdout = devnull\n", 3052 | " try: \n", 3053 | " yield\n", 3054 | " finally:\n", 3055 | " sys.stdout = old_stdout" 3056 | ] 3057 | }, 3058 | { 3059 | "cell_type": "code", 3060 | "execution_count": 252, 3061 | "metadata": { 3062 | "collapsed": true 3063 | }, 3064 | "outputs": [], 3065 | "source": [ 3066 | "#This function utilizes above functions to recommend items for item/user based approach and cosine/correlation. 
\n", 3067 | "#The 10 items with the highest predicted ratings are recommended; items that have already been rated are meant to be excluded (NOTE: assuming 0 marks an unrated item, as in ratings_matrix, the !=0 checks below select already rated items instead - TODO confirm)\n", 3068 | "def recommendItem(user_id, ratings, metric=metric): \n", 3069 | " if (user_id not in ratings.index.values) or type(user_id) is not int:\n", 3070 | " print \"User id should be a valid integer from this list :\\n\\n {} \".format(re.sub('[\\[\\]]', '', np.array_str(ratings_matrix.index.values)))\n", 3071 | " else: \n", 3072 | " ids = ['Item-based (correlation)','Item-based (cosine)','User-based (correlation)','User-based (cosine)']\n", 3073 | " select = widgets.Dropdown(options=ids, value=ids[0],description='Select approach', width='1000px')\n", 3074 | " def on_change(change):\n", 3075 | " clear_output(wait=True)\n", 3076 | " prediction = [] \n", 3077 | " if change['type'] == 'change' and change['name'] == 'value': \n", 3078 | " if (select.value == 'Item-based (correlation)') | (select.value == 'User-based (correlation)') :\n", 3079 | " metric = 'correlation'\n", 3080 | " else: \n", 3081 | " metric = 'cosine' \n", 3082 | " with suppress_stdout():\n", 3083 | " if (select.value == 'Item-based (correlation)') | (select.value == 'Item-based (cosine)'):\n", 3084 | " for i in range(ratings.shape[1]):\n", 3085 | " if (ratings[str(ratings.columns[i])][user_id] !=0): #not rated already\n", 3086 | " prediction.append(predict_itembased(user_id, str(ratings.columns[i]) ,ratings, metric))\n", 3087 | " else: \n", 3088 | " prediction.append(-1) #for already rated items\n", 3089 | " else:\n", 3090 | " for i in range(ratings.shape[1]):\n", 3091 | " if (ratings[str(ratings.columns[i])][user_id] !=0): #not rated already\n", 3092 | " prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))\n", 3093 | " else: \n", 3094 | " prediction.append(-1) #for already rated items\n", 3095 | " prediction = pd.Series(prediction)\n", 3096 | " prediction = prediction.sort_values(ascending=False)\n", 3097 | " recommended = 
prediction[:10]\n", 3098 | " print \"As per {0} approach....Following books are recommended...\".format(select.value)\n", 3099 | " for i in range(len(recommended)):\n", 3100 | " print \"{0}. {1}\".format(i+1,books.bookTitle[recommended.index[i]].encode('utf-8')) \n", 3101 | " select.observe(on_change)\n", 3102 | " display(select)" 3103 | ] 3104 | }, 3105 | { 3106 | "cell_type": "code", 3107 | "execution_count": 255, 3108 | "metadata": {}, 3109 | "outputs": [ 3110 | { 3111 | "name": "stdout", 3112 | "output_type": "stream", 3113 | "text": [ 3114 | "User id should be a valid integer from this list :\n", 3115 | "\n", 3116 | " 2033 2110 2276 4017 4385 5582 6242 6251 6543 6575\n", 3117 | " 7286 7346 8067 8245 8681 8890 10560 11676 11993 12538\n", 3118 | " 12824 12982 13552 13850 14422 15408 15418 16634 16795 16966\n", 3119 | " 17950 19085 21014 23768 23872 23902 25409 25601 25981 26535\n", 3120 | " 26544 26583 28591 28634 29259 30276 30511 30711 30735 30810\n", 3121 | " 31315 31556 31826 32773 33145 35433 35836 35857 35859 36299\n", 3122 | " 36554 36606 36609 36836 36907 37644 37712 37950 38023 38273\n", 3123 | " 38281 39281 39467 40889 40943 43246 43910 46398 47316 48025\n", 3124 | " 48494 49144 49889 51883 52199 52350 52584 52614 52917 53220\n", 3125 | " 55187 55490 55492 56271 56399 56447 56554 56959 59172 60244\n", 3126 | " 60337 60707 63714 63956 65258 66942 67840 68555 69078 69389\n", 3127 | " 69697 70415 70594 70666 72352 73681 75591 75819 76151 76223\n", 3128 | " 76499 76626 78553 78783 78834 78973 79441 81492 81560 83287\n", 3129 | " 83637 83671 85526 85656 86189 86947 87141 87555 88283 88677\n", 3130 | " 88693 88733 89602 91113 92652 92810 93047 93363 93629 94242\n", 3131 | " 94347 94853 94951 95010 95359 95902 95932 96448 97754 97874\n", 3132 | " 98391 98758 100459 100906 101209 101606 101851 102359 102647 102702\n", 3133 | " 102967 104399 104636 105028 105517 105979 106007 107784 107951 109574\n", 3134 | " 109901 109955 110483 110912 110934 110973 112001 
113270 113519 114368\n", 3135 | " 114868 114988 115002 115003 116599 117384 120565 122429 122793 123094\n", 3136 | " 123608 123883 123981 125519 125774 126492 126736 127200 127359 128835\n", 3137 | " 129074 129716 129851 130554 130571 132492 132836 133747 134434 135149\n", 3138 | " 135265 136010 136139 136348 136382 138578 138844 140000 140358 141902\n", 3139 | " 142524 143175 143253 143415 145449 146113 146348 147847 148199 148258\n", 3140 | " 148744 148966 149907 149908 150979 153662 156150 156269 156300 156467\n", 3141 | " 157247 157273 158226 158295 158433 159506 160295 162052 162639 162738\n", 3142 | " 163759 163761 163804 163973 164096 164323 164533 164828 164905 165308\n", 3143 | " 165319 165758 166123 166596 168047 168245 169682 170513 170634 171118\n", 3144 | " 172030 172742 172888 173291 173415 174304 174892 177072 177432 177458\n", 3145 | " 178522 179718 179978 180378 180651 181176 182085 182086 182993 183958\n", 3146 | " 183995 184299 184532 185233 185384 187145 187256 187517 189139 189334\n", 3147 | " 189835 189973 190708 190925 193458 193560 193898 194600 196077 196160\n", 3148 | " 196502 197659 199416 200226 201290 203240 204864 205735 205943 206534\n", 3149 | " 207782 208406 208671 209516 210485 211426 211919 212965 214786 216012\n", 3150 | " 216444 216683 217106 217318 217740 218552 218608 219546 219683 222204\n", 3151 | " 222296 223087 223501 224349 224525 224646 224764 225087 225199 225232\n", 3152 | " 225595 225763 226965 227250 227447 227520 227705 229011 229329 229551\n", 3153 | " 229741 230522 231210 232131 232945 233911 234359 234828 235105 235282\n", 3154 | " 235935 236058 236283 236340 236757 236948 239584 239594 240144 240403\n", 3155 | " 240543 240567 240568 241198 241666 241980 242006 242083 242409 242465\n", 3156 | " 244627 244685 245410 245827 246311 247429 247447 248718 249894 250405\n", 3157 | " 250709 251394 251843 251844 252695 252820 254206 254465 254899 255489\n", 3158 | " 257204 258152 258185 258534 261105 261829 262998 264031 
264082 264321\n", 3159 | " 264525 265115 265313 265889 266056 266226 268110 268300 268932 269566\n", 3160 | " 270713 271448 271705 273113 274061 274301 275970 277427 278418 \n" 3161 | ] 3162 | } 3163 | ], 3164 | "source": [ 3165 | "#checking for incorrect entries\n", 3166 | "recommendItem(999999,ratings_matrix)" 3167 | ] 3168 | }, 3169 | { 3170 | "cell_type": "code", 3171 | "execution_count": 253, 3172 | "metadata": { 3173 | "scrolled": true 3174 | }, 3175 | "outputs": [ 3176 | { 3177 | "name": "stdout", 3178 | "output_type": "stream", 3179 | "text": [ 3180 | "As per Item-based (cosine) approach....Following books are recommended...\n", 3181 | "1. My Wicked Wicked Ways\n", 3182 | "2. Fair Peril\n", 3183 | "3. Wolfpointe\n", 3184 | "4. A Nest of Ninnies\n", 3185 | "5. A Bitter Legacy\n", 3186 | "6. A Hymn Before Battle\n", 3187 | "7. Thomas the Rhymer\n", 3188 | "8. Gatherer of Clouds (Initiate Brother Duology)\n", 3189 | "9. Wege zum Ruhm: 13 Hilfestellungen fÃ¼r junge KÃ¼nstler und 1 Warnung\n", 3190 | "10. Love In Bloom's\n" 3191 | ] 3192 | } 3193 | ], 3194 | "source": [ 3195 | "recommendItem(4385, ratings_matrix)" 3196 | ] 3197 | }, 3198 | { 3199 | "cell_type": "code", 3200 | "execution_count": 254, 3201 | "metadata": {}, 3202 | "outputs": [ 3203 | { 3204 | "name": "stdout", 3205 | "output_type": "stream", 3206 | "text": [ 3207 | "As per User-based (correlation) approach....Following books are recommended...\n", 3208 | "1. The Gift\n", 3209 | "2. A Close Run Thing : A Novel of Wellington's Army of 1815\n", 3210 | "3. The Romantic: A Novel\n", 3211 | "4. Mazurka for Two Dead Men\n", 3212 | "5. The Titanic Conspiracy: Cover-Ups and Mysteries of the World's Most Famous Sea Disaster\n", 3213 | "6. And Never Let Her Go : Thomas Capano: The Deadly Seducer\n", 3214 | "7. Chop Wood, Carry Water: A Guide to Finding Spiritual Fulfillment in Everyday Life\n", 3215 | "8. WHO NEEDS GOD\n", 3216 | "9. Lords of the White Castle\n", 3217 | "10. 
Prince Charming Isn't Coming: How Women Get Smart About Money\n" 3218 | ] 3219 | } 3220 | ], 3221 | "source": [ 3222 | "recommendItem(4385, ratings_matrix)" 3223 | ] 3224 | }, 3225 | { 3226 | "cell_type": "markdown", 3227 | "metadata": {}, 3228 | "source": [ 3229 | "**Thanks for reading this notebook**" 3230 | ] 3231 | } 3232 | ], 3233 | "metadata": { 3234 | "kernelspec": { 3235 | "display_name": "Python 2", 3236 | "language": "python", 3237 | "name": "python2" 3238 | }, 3239 | "language_info": { 3240 | "codemirror_mode": { 3241 | "name": "ipython", 3242 | "version": 2 3243 | }, 3244 | "file_extension": ".py", 3245 | "mimetype": "text/x-python", 3246 | "name": "python", 3247 | "nbconvert_exporter": "python", 3248 | "pygments_lexer": "ipython2", 3249 | "version": "2.7.13" 3250 | } 3251 | }, 3252 | "nbformat": 4, 3253 | "nbformat_minor": 2 3254 | } 3255 | --------------------------------------------------------------------------------