├── Book Recommendation System.ipynb ├── CF Recommendation System-Examples.ipynb └── README.md /Book Recommendation System.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**About Book Crossing Dataset**
\n", 8 | "\n", 9 | "This dataset has been compiled by Cai-Nicolas Ziegler in 2004, and it comprises of three tables for users, books and ratings. Explicit ratings are expressed on a scale from 1-10 (higher values denoting higher appreciation) and implicit rating is expressed by 0" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Link to dataset files
\n", 17 | "http://www2.informatik.uni-freiburg.de/~cziegler/BX/ " 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "**About this Project**\n", 25 | "\n", 26 | "This project entails building a Book Recommender System for users based on user-based and item-based collaborative filtering approaches" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 177, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "#Making necesarry imports\n", 38 | "import pandas as pd\n", 39 | "import matplotlib.pyplot as plt\n", 40 | "import sklearn.metrics as metrics\n", 41 | "import numpy as np\n", 42 | "from sklearn.neighbors import NearestNeighbors\n", 43 | "from scipy.spatial.distance import correlation\n", 44 | "from sklearn.metrics.pairwise import pairwise_distances\n", 45 | "import ipywidgets as widgets\n", 46 | "from IPython.display import display, clear_output\n", 47 | "from contextlib import contextmanager\n", 48 | "import warnings\n", 49 | "warnings.filterwarnings('ignore')\n", 50 | "import numpy as np\n", 51 | "import os, sys\n", 52 | "import re\n", 53 | "import seaborn as sns" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 178, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "#Setting the current working directory\n", 65 | "os.chdir('D:\\Data Science\\Projects\\Book Crossing Dataset - Recommender System')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 179, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/html": [ 76 | "\n", 88 | "To toggle on/off output_stderr, click here." 89 | ], 90 | "text/plain": [ 91 | "" 92 | ] 93 | }, 94 | "execution_count": 179, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "from IPython.display import HTML\n", 101 | "HTML('''\n", 113 | "To toggle on/off output_stderr, click here.''')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 180, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "Skipping line 6452: expected 8 fields, saw 9\n", 126 | "Skipping line 43667: expected 8 fields, saw 10\n", 127 | "Skipping line 51751: expected 8 fields, saw 9\n", 128 | "\n", 129 | "Skipping line 92038: expected 8 fields, saw 9\n", 130 | "Skipping line 104319: expected 8 fields, saw 9\n", 131 | "Skipping line 121768: expected 8 fields, saw 9\n", 132 | "\n", 133 | "Skipping line 144058: expected 8 fields, saw 9\n", 134 | "Skipping line 150789: expected 8 fields, saw 9\n", 135 | "Skipping line 157128: expected 8 fields, saw 9\n", 136 | "Skipping line 180189: expected 8 fields, saw 9\n", 137 | "Skipping line 185738: expected 8 fields, saw 9\n", 138 | "\n", 139 | "Skipping line 209388: expected 8 fields, saw 9\n", 140 | "Skipping line 220626: expected 8 fields, saw 9\n", 141 | "Skipping line 227933: expected 8 fields, saw 11\n", 142 | "Skipping line 228957: expected 8 fields, saw 10\n", 143 | "Skipping line 245933: expected 8 fields, saw 9\n", 144 | "Skipping line 251296: expected 8 fields, saw 9\n", 145 | "Skipping line 259941: expected 8 fields, saw 9\n", 146 | "Skipping line 261529: expected 8 fields, saw 9\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "#Loading data\n", 153 | "books = pd.read_csv('books.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n", 154 | "books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']\n", 155 | "users = pd.read_csv('users.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n", 156 | "users.columns = ['userID', 'Location', 'Age']\n", 157 | "ratings = pd.read_csv('ratings.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n", 158 | "ratings.columns = ['userID', 'ISBN', 'bookRating']" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 181, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "(271360, 8)\n", 171 | "(278858, 3)\n", 172 | "(1149780, 3)\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "#checking shapes of the datasets\n", 178 | "print books.shape\n", 179 | "print users.shape\n", 180 | "print ratings.shape" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 182, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/html": [ 191 | "
\n", 192 | "\n", 205 | "\n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisherimageUrlSimageUrlMimageUrlL
00195153448Classical MythologyMark P. O. Morford2002Oxford University Presshttp://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpghttp://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpghttp://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg
10002005018Clara CallanRichard Bruce Wright2001HarperFlamingo Canadahttp://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpghttp://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpghttp://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg
20060973129Decision in NormandyCarlo D'Este1991HarperPerennialhttp://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpghttp://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpghttp://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg
30374157065Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused ItGina Bari Kolata1999Farrar Straus Girouxhttp://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpghttp://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpghttp://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg
40393045218The Mummies of UrumchiE. J. W. Barber1999W. W. Norton & Companyhttp://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpghttp://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpghttp://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg
\n", 277 | "
" 278 | ], 279 | "text/plain": [ 280 | " ISBN \\\n", 281 | "0 0195153448 \n", 282 | "1 0002005018 \n", 283 | "2 0060973129 \n", 284 | "3 0374157065 \n", 285 | "4 0393045218 \n", 286 | "\n", 287 | " bookTitle \\\n", 288 | "0 Classical Mythology \n", 289 | "1 Clara Callan \n", 290 | "2 Decision in Normandy \n", 291 | "3 Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It \n", 292 | "4 The Mummies of Urumchi \n", 293 | "\n", 294 | " bookAuthor yearOfPublication publisher \\\n", 295 | "0 Mark P. O. Morford 2002 Oxford University Press \n", 296 | "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n", 297 | "2 Carlo D'Este 1991 HarperPerennial \n", 298 | "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n", 299 | "4 E. J. W. Barber 1999 W. W. Norton & Company \n", 300 | "\n", 301 | " imageUrlS \\\n", 302 | "0 http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg \n", 303 | "1 http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg \n", 304 | "2 http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg \n", 305 | "3 http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg \n", 306 | "4 http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg \n", 307 | "\n", 308 | " imageUrlM \\\n", 309 | "0 http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg \n", 310 | "1 http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg \n", 311 | "2 http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg \n", 312 | "3 http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg \n", 313 | "4 http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg \n", 314 | "\n", 315 | " imageUrlL \n", 316 | "0 http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg \n", 317 | "1 http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg \n", 318 | "2 http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg \n", 319 | "3 http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg \n", 320 | "4 http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg " 321 | ] 322 | }, 323 | "execution_count": 182, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "#Exploring books dataset\n", 330 | "books.head()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 183, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "#dropping last three columns containing image URLs which will not be required for analysis\n", 342 | "books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 184, 348 | "metadata": { 349 | "scrolled": true 350 | }, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/html": [ 355 | "
\n", 356 | "\n", 369 | "\n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
00195153448Classical MythologyMark P. O. Morford2002Oxford University Press
10002005018Clara CallanRichard Bruce Wright2001HarperFlamingo Canada
20060973129Decision in NormandyCarlo D'Este1991HarperPerennial
30374157065Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused ItGina Bari Kolata1999Farrar Straus Giroux
40393045218The Mummies of UrumchiE. J. W. Barber1999W. W. Norton & Company
\n", 423 | "
" 424 | ], 425 | "text/plain": [ 426 | " ISBN \\\n", 427 | "0 0195153448 \n", 428 | "1 0002005018 \n", 429 | "2 0060973129 \n", 430 | "3 0374157065 \n", 431 | "4 0393045218 \n", 432 | "\n", 433 | " bookTitle \\\n", 434 | "0 Classical Mythology \n", 435 | "1 Clara Callan \n", 436 | "2 Decision in Normandy \n", 437 | "3 Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It \n", 438 | "4 The Mummies of Urumchi \n", 439 | "\n", 440 | " bookAuthor yearOfPublication publisher \n", 441 | "0 Mark P. O. Morford 2002 Oxford University Press \n", 442 | "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n", 443 | "2 Carlo D'Este 1991 HarperPerennial \n", 444 | "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n", 445 | "4 E. J. W. Barber 1999 W. W. Norton & Company " 446 | ] 447 | }, 448 | "execution_count": 184, 449 | "metadata": {}, 450 | "output_type": "execute_result" 451 | } 452 | ], 453 | "source": [ 454 | "#Now the books datasets looks like....\n", 455 | "books.head()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 135, 461 | "metadata": { 462 | "scrolled": false 463 | }, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/plain": [ 468 | "ISBN object\n", 469 | "bookTitle object\n", 470 | "bookAuthor object\n", 471 | "yearOfPublication object\n", 472 | "publisher object\n", 473 | "dtype: object" 474 | ] 475 | }, 476 | "execution_count": 135, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "#checking data types of columns\n", 483 | "books.dtypes" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 188, 489 | "metadata": { 490 | "collapsed": true 491 | }, 492 | "outputs": [], 493 | "source": [ 494 | "#making this setting to display full text in columns\n", 495 | "pd.set_option('display.max_colwidth', -1)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "**yearOfPublication**" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 189, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "array([2002L, 2001L, 1991L, 1999L, 2000L, 1993L, 1996L, 1988L, 2004L,\n", 514 | " 1998L, 1994L, 2003L, 1997L, 1983L, 1979L, 1995L, 1982L, 1985L,\n", 515 | " 1992L, 1986L, 1978L, 1980L, 1952L, 1987L, 1990L, 1981L, 1989L,\n", 516 | " 1984L, 0L, 1968L, 1961L, 1958L, 1974L, 1976L, 1971L, 1977L, 1975L,\n", 517 | " 1965L, 1941L, 1970L, 1962L, 1973L, 1972L, 1960L, 1966L, 1920L,\n", 518 | " 1956L, 1959L, 1953L, 1951L, 1942L, 1963L, 1964L, 1969L, 1954L,\n", 519 | " 1950L, 1967L, 2005L, 1957L, 1940L, 1937L, 1955L, 1946L, 1936L,\n", 520 | " 1930L, 2011L, 1925L, 1948L, 1943L, 1947L, 1945L, 1923L, 2020L,\n", 521 | " 1939L, 1926L, 1938L, 2030L, 1911L, 1904L, 1949L, 1932L, 1928L,\n", 522 | " 1929L, 1927L, 1931L, 1914L, 2050L, 1934L, 1910L, 1933L, 1902L,\n", 523 | " 1924L, 1921L, 1900L, 2038L, 2026L, 1944L, 1917L, 1901L, 2010L,\n", 524 | " 1908L, 1906L, 1935L, 1806L, 2021L, u'2000', u'1995', u'1999',\n", 525 | " u'2004', u'2003', u'1990', u'1994', u'1986', u'1989', u'2002',\n", 526 | " u'1981', u'1993', u'1983', u'1982', u'1976', u'1991', u'1977',\n", 527 | " u'1998', u'1992', u'1996', u'0', u'1997', u'2001', u'1974', u'1968',\n", 528 | " u'1987', u'1984', u'1988', u'1963', u'1956', u'1970', u'1985',\n", 529 | " u'1978', u'1973', u'1980', u'1979', u'1975', u'1969', u'1961',\n", 530 | " u'1965', u'1939', u'1958', u'1950', u'1953', u'1966', u'1971',\n", 531 | " u'1959', u'1972', u'1955', u'1957', u'1945', u'1960', u'1967',\n", 532 | " u'1932', u'1924', u'1964', u'2012', u'1911', u'1927', u'1948',\n", 533 | " u'1962', u'2006', u'1952', u'1940', u'1951', u'1931', u'1954',\n", 534 | " u'2005', u'1930', u'1941', u'1944', u'DK Publishing Inc', u'1943',\n", 535 | " u'1938', u'1900', u'1942', u'1923', u'1920', u'1933', u'Gallimard',\n", 536 | " u'1909', u'1946', u'2008', u'1378', u'2030', u'1936', u'1947',\n", 537 | " u'2011', u'2020', u'1919', u'1949', u'1922', u'1897', u'2024',\n", 538 | " u'1376', u'1926', u'2037'], dtype=object)" 539 | ] 540 | }, 541 | "execution_count": 189, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "#yearOfPublication should be set as having dtype as int\n", 548 | "#checking the unique values of yearOfPublication\n", 549 | "books.yearOfPublication.unique()\n", 550 | "\n", 551 | "#as it can be seen from below that there are some incorrect entries in this field. It looks like Publisher names \n", 552 | "#'DK Publishing Inc' and 'Gallimard' have been incorrectly loaded as yearOfPublication in dataset due to some errors in csv file\n", 553 | "#Also some of the entries are strings and same years have been entered as numbers in some places" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 190, 559 | "metadata": { 560 | "scrolled": true 561 | }, 562 | "outputs": [ 563 | { 564 | "data": { 565 | "text/html": [ 566 | "
\n", 567 | "\n", 580 | "\n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
209538078946697XDK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\\\";Michael Teitelbaum\"2000DK Publishing Inchttp://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg
2216780789466953DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\\\";James Buckley\"2000DK Publishing Inchttp://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg
\n", 610 | "
" 611 | ], 612 | "text/plain": [ 613 | " ISBN \\\n", 614 | "209538 078946697X \n", 615 | "221678 0789466953 \n", 616 | "\n", 617 | " bookTitle \\\n", 618 | "209538 DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\\\";Michael Teitelbaum\" \n", 619 | "221678 DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\\\";James Buckley\" \n", 620 | "\n", 621 | " bookAuthor yearOfPublication \\\n", 622 | "209538 2000 DK Publishing Inc \n", 623 | "221678 2000 DK Publishing Inc \n", 624 | "\n", 625 | " publisher \n", 626 | "209538 http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg \n", 627 | "221678 http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg " 628 | ] 629 | }, 630 | "execution_count": 190, 631 | "metadata": {}, 632 | "output_type": "execute_result" 633 | } 634 | ], 635 | "source": [ 636 | "#investigating the rows having 'DK Publishing Inc' as yearOfPublication\n", 637 | "books.loc[books.yearOfPublication == 'DK Publishing Inc',:]" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 191, 643 | "metadata": { 644 | "collapsed": true 645 | }, 646 | "outputs": [], 647 | "source": [ 648 | "#From above, it is seen that bookAuthor is incorrectly loaded with bookTitle, hence making required corrections\n", 649 | "#ISBN '0789466953'\n", 650 | "books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000\n", 651 | "books.loc[books.ISBN == '0789466953','bookAuthor'] = \"James Buckley\"\n", 652 | "books.loc[books.ISBN == '0789466953','publisher'] = \"DK Publishing Inc\"\n", 653 | "books.loc[books.ISBN == '0789466953','bookTitle'] = \"DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\"" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 192, 659 | "metadata": { 660 | "collapsed": true 661 | }, 662 | "outputs": [], 663 | "source": [ 664 | "#ISBN '078946697X'\n", 665 | "books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000\n", 666 | "books.loc[books.ISBN == '078946697X','bookAuthor'] = \"Michael Teitelbaum\"\n", 667 | "books.loc[books.ISBN == '078946697X','publisher'] = \"DK Publishing Inc\"\n", 668 | "books.loc[books.ISBN == '078946697X','bookTitle'] = \"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\"" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 193, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/html": [ 679 | "
\n", 680 | "\n", 693 | "\n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
209538078946697XDK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)Michael Teitelbaum2000DK Publishing Inc
2216780789466953DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)James Buckley2000DK Publishing Inc
\n", 723 | "
" 724 | ], 725 | "text/plain": [ 726 | " ISBN \\\n", 727 | "209538 078946697X \n", 728 | "221678 0789466953 \n", 729 | "\n", 730 | " bookTitle \\\n", 731 | "209538 DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers) \n", 732 | "221678 DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers) \n", 733 | "\n", 734 | " bookAuthor yearOfPublication publisher \n", 735 | "209538 Michael Teitelbaum 2000 DK Publishing Inc \n", 736 | "221678 James Buckley 2000 DK Publishing Inc " 737 | ] 738 | }, 739 | "execution_count": 193, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "#rechecking\n", 746 | "books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'),:]\n", 747 | "#corrections done" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 194, 753 | "metadata": { 754 | "scrolled": true 755 | }, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/html": [ 760 | "
\n", 761 | "\n", 774 | "\n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
2207312070426769Peuple du ciel, suivi de 'Les Bergers\\\";Jean-Marie Gustave Le Cl�©zio\"2003Gallimardhttp://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg
\n", 796 | "
" 797 | ], 798 | "text/plain": [ 799 | " ISBN \\\n", 800 | "220731 2070426769 \n", 801 | "\n", 802 | " bookTitle \\\n", 803 | "220731 Peuple du ciel, suivi de 'Les Bergers\\\";Jean-Marie Gustave Le Cl�©zio\" \n", 804 | "\n", 805 | " bookAuthor yearOfPublication \\\n", 806 | "220731 2003 Gallimard \n", 807 | "\n", 808 | " publisher \n", 809 | "220731 http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg " 810 | ] 811 | }, 812 | "execution_count": 194, 813 | "metadata": {}, 814 | "output_type": "execute_result" 815 | } 816 | ], 817 | "source": [ 818 | "#investigating the rows having 'Gallimard' as yearOfPublication\n", 819 | "books.loc[books.yearOfPublication == 'Gallimard',:]" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": 195, 825 | "metadata": { 826 | "collapsed": true 827 | }, 828 | "outputs": [], 829 | "source": [ 830 | "#making required corrections as above, keeping other fields intact\n", 831 | "books.loc[books.ISBN == '2070426769','yearOfPublication'] = 2003\n", 832 | "books.loc[books.ISBN == '2070426769','bookAuthor'] = \"Jean-Marie Gustave Le Cl�©zio\"\n", 833 | "books.loc[books.ISBN == '2070426769','publisher'] = \"Gallimard\"\n", 834 | "books.loc[books.ISBN == '2070426769','bookTitle'] = \"Peuple du ciel, suivi de 'Les Bergers\"" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 196, 840 | "metadata": { 841 | "scrolled": true 842 | }, 843 | "outputs": [ 844 | { 845 | "data": { 846 | "text/html": [ 847 | "
\n", 848 | "\n", 861 | "\n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
2207312070426769Peuple du ciel, suivi de 'Les BergersJean-Marie Gustave Le Cl�©zio2003Gallimard
\n", 883 | "
" 884 | ], 885 | "text/plain": [ 886 | " ISBN bookTitle \\\n", 887 | "220731 2070426769 Peuple du ciel, suivi de 'Les Bergers \n", 888 | "\n", 889 | " bookAuthor yearOfPublication publisher \n", 890 | "220731 Jean-Marie Gustave Le Cl�©zio 2003 Gallimard " 891 | ] 892 | }, 893 | "execution_count": 196, 894 | "metadata": {}, 895 | "output_type": "execute_result" 896 | } 897 | ], 898 | "source": [ 899 | "#rechecking\n", 900 | "books.loc[books.ISBN == '2070426769',:]\n", 901 | "#corrections done" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 197, 907 | "metadata": { 908 | "collapsed": true 909 | }, 910 | "outputs": [], 911 | "source": [ 912 | "#Correcting the dtypes of yearOfPublication\n", 913 | "books.yearOfPublication=pd.to_numeric(books.yearOfPublication, errors='coerce')" 914 | ] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": 198, 919 | "metadata": {}, 920 | "outputs": [ 921 | { 922 | "name": "stdout", 923 | "output_type": "stream", 924 | "text": [ 925 | "[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]\n" 926 | ] 927 | } 928 | ], 929 | "source": [ 930 | "print sorted(books['yearOfPublication'].unique())\n", 931 | "#Now it can be seen that yearOfPublication has all values as integers" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 199, 937 | "metadata": { 938 | "collapsed": true 939 | }, 940 | "outputs": [], 941 | "source": [ 942 | "#However, the value 0 is invalid and as this dataset was published in 2004, I have assumed the the years after 2006 to be \n", 943 | "#invalid keeping some margin in case dataset was updated thereafer\n", 944 | "#setting invalid years as NaN\n", 945 | "books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.NAN" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": 200, 951 | "metadata": { 952 | "collapsed": true 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "#replacing NaNs with mean value of yearOfPublication\n", 957 | "books.yearOfPublication.fillna(round(books.yearOfPublication.mean()), inplace=True)" 958 | ] 959 | }, 960 | { 961 | "cell_type": "code", 962 | "execution_count": 201, 963 | "metadata": {}, 964 | "outputs": [ 965 | { 966 | "data": { 967 | "text/plain": [ 968 | "0" 969 | ] 970 | }, 971 | "execution_count": 201, 972 | "metadata": {}, 973 | "output_type": "execute_result" 974 | } 975 | ], 976 | "source": [ 977 | "#rechecking\n", 978 | "books.yearOfPublication.isnull().sum()\n", 979 | "#No NaNs" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 202, 985 | "metadata": { 986 | "collapsed": true 987 | }, 988 | "outputs": [], 989 | "source": [ 990 | "#resetting the dtype as int32\n", 991 | "books.yearOfPublication = books.yearOfPublication.astype(np.int32)" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "metadata": {}, 997 | "source": [ 998 | "**publisher**" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": 203, 1004 | "metadata": {}, 1005 | "outputs": [ 1006 | { 1007 | "data": { 1008 | "text/html": [ 1009 | "
\n", 1010 | "\n", 1023 | "\n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
128890193169656XTyrant MoonElaine Corvidae2002NaN
1290371931696993Finders KeepersLinnea Sinclair2001NaN
\n", 1053 | "
" 1054 | ], 1055 | "text/plain": [ 1056 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1057 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 \n", 1058 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n", 1059 | "\n", 1060 | " publisher \n", 1061 | "128890 NaN \n", 1062 | "129037 NaN " 1063 | ] 1064 | }, 1065 | "execution_count": 203, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "#exploring 'publisher' column\n", 1072 | "books.loc[books.publisher.isnull(),:]\n", 1073 | "#two NaNs" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": 204, 1079 | "metadata": { 1080 | "scrolled": true 1081 | }, 1082 | "outputs": [ 1083 | { 1084 | "data": { 1085 | "text/html": [ 1086 | "
\n", 1087 | "\n", 1100 | "\n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
128890193169656XTyrant MoonElaine Corvidae2002NaN
\n", 1122 | "
" 1123 | ], 1124 | "text/plain": [ 1125 | " ISBN bookTitle bookAuthor yearOfPublication publisher\n", 1126 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 NaN " 1127 | ] 1128 | }, 1129 | "execution_count": 204, 1130 | "metadata": {}, 1131 | "output_type": "execute_result" 1132 | } 1133 | ], 1134 | "source": [ 1135 | "#investigating rows having NaNs\n", 1136 | "#Checking with rows having bookTitle as Tyrant Moon to see if we can get any clues\n", 1137 | "books.loc[(books.bookTitle == 'Tyrant Moon'),:]\n", 1138 | "#no clues" 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "code", 1143 | "execution_count": 205, 1144 | "metadata": {}, 1145 | "outputs": [ 1146 | { 1147 | "data": { 1148 | "text/html": [ 1149 | "
\n", 1150 | "\n", 1163 | "\n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
10799082177364XFinders KeepersFern Michaels2002Zebra Books
420190070465037Finders KeepersBarbara Nickolae1989McGraw-Hill Companies
582640688118461Finders KeepersEmily Rodda1993Harpercollins Juvenile Books
666781575663236Finders KeepersFern Michaels1998Kensington Publishing Corporation
1290371931696993Finders KeepersLinnea Sinclair2001NaN
1343090156309505Finders KeepersWill1989Voyager Books
1734730973146907Finders KeepersSean M. Costello2002Red Tower Publications
1958850061083909Finders KeepersSharon Sala2003HarperTorch
2118740373261160Finders KeepersElizabeth Travis1993Worldwide Library
\n", 1249 | "
" 1250 | ], 1251 | "text/plain": [ 1252 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1253 | "10799 082177364X Finders Keepers Fern Michaels 2002 \n", 1254 | "42019 0070465037 Finders Keepers Barbara Nickolae 1989 \n", 1255 | "58264 0688118461 Finders Keepers Emily Rodda 1993 \n", 1256 | "66678 1575663236 Finders Keepers Fern Michaels 1998 \n", 1257 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n", 1258 | "134309 0156309505 Finders Keepers Will 1989 \n", 1259 | "173473 0973146907 Finders Keepers Sean M. Costello 2002 \n", 1260 | "195885 0061083909 Finders Keepers Sharon Sala 2003 \n", 1261 | "211874 0373261160 Finders Keepers Elizabeth Travis 1993 \n", 1262 | "\n", 1263 | " publisher \n", 1264 | "10799 Zebra Books \n", 1265 | "42019 McGraw-Hill Companies \n", 1266 | "58264 Harpercollins Juvenile Books \n", 1267 | "66678 Kensington Publishing Corporation \n", 1268 | "129037 NaN \n", 1269 | "134309 Voyager Books \n", 1270 | "173473 Red Tower Publications \n", 1271 | "195885 HarperTorch \n", 1272 | "211874 Worldwide Library " 1273 | ] 1274 | }, 1275 | "execution_count": 205, 1276 | "metadata": {}, 1277 | "output_type": "execute_result" 1278 | } 1279 | ], 1280 | "source": [ 1281 | "#Checking with rows having bookTitle as Finder Keepers to see if we can get any clues\n", 1282 | "books.loc[(books.bookTitle == 'Finders Keepers'),:]\n", 1283 | "#all rows with different publisher and bookAuthor" 1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "code", 1288 | "execution_count": 206, 1289 | "metadata": {}, 1290 | "outputs": [ 1291 | { 1292 | "data": { 1293 | "text/html": [ 1294 | "
\n", 1295 | "\n", 1308 | "\n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
1267621931696934Winter's OrphansElaine Corvidae2001Novelbooks
128890193169656XTyrant MoonElaine Corvidae2002NaN
1290010759901880WolfkinElaine Corvidae2001Hard Shell Word Factory
\n", 1346 | "
" 1347 | ], 1348 | "text/plain": [ 1349 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1350 | "126762 1931696934 Winter's Orphans Elaine Corvidae 2001 \n", 1351 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 \n", 1352 | "129001 0759901880 Wolfkin Elaine Corvidae 2001 \n", 1353 | "\n", 1354 | " publisher \n", 1355 | "126762 Novelbooks \n", 1356 | "128890 NaN \n", 1357 | "129001 Hard Shell Word Factory " 1358 | ] 1359 | }, 1360 | "execution_count": 206, 1361 | "metadata": {}, 1362 | "output_type": "execute_result" 1363 | } 1364 | ], 1365 | "source": [ 1366 | "#checking by bookAuthor to find patterns\n", 1367 | "books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]\n", 1368 | "#all having different publisher...no clues here" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "code", 1373 | "execution_count": 207, 1374 | "metadata": {}, 1375 | "outputs": [ 1376 | { 1377 | "data": { 1378 | "text/html": [ 1379 | "
\n", 1380 | "\n", 1393 | "\n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | "
ISBNbookTitlebookAuthoryearOfPublicationpublisher
1290371931696993Finders KeepersLinnea Sinclair2001NaN
\n", 1415 | "
" 1416 | ], 1417 | "text/plain": [ 1418 | " ISBN bookTitle bookAuthor yearOfPublication \\\n", 1419 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n", 1420 | "\n", 1421 | " publisher \n", 1422 | "129037 NaN " 1423 | ] 1424 | }, 1425 | "execution_count": 207, 1426 | "metadata": {}, 1427 | "output_type": "execute_result" 1428 | } 1429 | ], 1430 | "source": [ 1431 | "#checking by bookAuthor to find patterns\n", 1432 | "books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "code", 1437 | "execution_count": 208, 1438 | "metadata": { 1439 | "collapsed": true 1440 | }, 1441 | "outputs": [], 1442 | "source": [ 1443 | "#since there is nothing in common to infer publisher for NaNs, replacing these with 'other\n", 1444 | "books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'\n", 1445 | "books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'" 1446 | ] 1447 | }, 1448 | { 1449 | "cell_type": "markdown", 1450 | "metadata": {}, 1451 | "source": [ 1452 | "**Users**" 1453 | ] 1454 | }, 1455 | { 1456 | "cell_type": "code", 1457 | "execution_count": 209, 1458 | "metadata": { 1459 | "scrolled": true 1460 | }, 1461 | "outputs": [ 1462 | { 1463 | "name": "stdout", 1464 | "output_type": "stream", 1465 | "text": [ 1466 | "(278858, 3)\n" 1467 | ] 1468 | }, 1469 | { 1470 | "data": { 1471 | "text/html": [ 1472 | "
\n", 1473 | "\n", 1486 | "\n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | "
userIDLocationAge
01nyc, new york, usaNaN
12stockton, california, usa18.0
23moscow, yukon territory, russiaNaN
34porto, v.n.gaia, portugal17.0
45farnborough, hants, united kingdomNaN
\n", 1528 | "
" 1529 | ], 1530 | "text/plain": [ 1531 | " userID Location Age\n", 1532 | "0 1 nyc, new york, usa NaN \n", 1533 | "1 2 stockton, california, usa 18.0\n", 1534 | "2 3 moscow, yukon territory, russia NaN \n", 1535 | "3 4 porto, v.n.gaia, portugal 17.0\n", 1536 | "4 5 farnborough, hants, united kingdom NaN " 1537 | ] 1538 | }, 1539 | "execution_count": 209, 1540 | "metadata": {}, 1541 | "output_type": "execute_result" 1542 | } 1543 | ], 1544 | "source": [ 1545 | "print users.shape\n", 1546 | "users.head()" 1547 | ] 1548 | }, 1549 | { 1550 | "cell_type": "code", 1551 | "execution_count": 210, 1552 | "metadata": { 1553 | "scrolled": true 1554 | }, 1555 | "outputs": [ 1556 | { 1557 | "data": { 1558 | "text/plain": [ 1559 | "userID int64 \n", 1560 | "Location object \n", 1561 | "Age float64\n", 1562 | "dtype: object" 1563 | ] 1564 | }, 1565 | "execution_count": 210, 1566 | "metadata": {}, 1567 | "output_type": "execute_result" 1568 | } 1569 | ], 1570 | "source": [ 1571 | "users.dtypes" 1572 | ] 1573 | }, 1574 | { 1575 | "cell_type": "markdown", 1576 | "metadata": {}, 1577 | "source": [ 1578 | "**userID**" 1579 | ] 1580 | }, 1581 | { 1582 | "cell_type": "code", 1583 | "execution_count": 211, 1584 | "metadata": {}, 1585 | "outputs": [ 1586 | { 1587 | "data": { 1588 | "text/plain": [ 1589 | "array([ 1, 2, 3, ..., 278856, 278857, 278858], dtype=int64)" 1590 | ] 1591 | }, 1592 | "execution_count": 211, 1593 | "metadata": {}, 1594 | "output_type": "execute_result" 1595 | } 1596 | ], 1597 | "source": [ 1598 | "users.userID.values\n", 1599 | "#it can be seen that these are unique" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "markdown", 1604 | "metadata": {}, 1605 | "source": [ 1606 | "**Age**" 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "code", 1611 | "execution_count": 212, 1612 | "metadata": {}, 1613 | "outputs": [ 1614 | { 1615 | "name": "stdout", 1616 | "output_type": "stream", 1617 | "text": [ 1618 | "[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0, 228.0, 229.0, 230.0, 231.0, 237.0, 239.0, 244.0]\n" 1619 | ] 1620 | } 1621 | ], 1622 | "source": [ 1623 | "print sorted(users.Age.unique())\n", 1624 | "#Age column has some invalid entries like nan, 0 and very high values like 100 and above" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 213, 1630 | "metadata": { 1631 | "collapsed": true 1632 | }, 1633 | "outputs": [], 1634 | "source": [ 1635 | "#In my view values below 5 and above 90 do not make much sense for our book rating case...hence replacing these by NaNs\n", 1636 | "users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan" 1637 | ] 1638 | }, 1639 | { 1640 | "cell_type": "code", 1641 | "execution_count": 214, 1642 | "metadata": { 1643 | "collapsed": true 1644 | }, 1645 | "outputs": [], 1646 | "source": [ 1647 | "#replacing NaNs with mean\n", 1648 | "users.Age = users.Age.fillna(users.Age.mean())" 1649 | ] 1650 | }, 1651 | { 1652 | "cell_type": "code", 1653 | "execution_count": 215, 1654 | "metadata": { 1655 | "collapsed": true 1656 | }, 1657 | "outputs": [], 1658 | "source": [ 1659 | "#setting the data type as int\n", 1660 | "users.Age = users.Age.astype(np.int32)" 1661 | ] 1662 | }, 1663 | { 1664 | "cell_type": "code", 1665 | "execution_count": 216, 1666 | "metadata": { 1667 | "scrolled": true 1668 | }, 1669 | "outputs": [ 1670 | { 1671 | "name": "stdout", 1672 | "output_type": "stream", 1673 | "text": [ 1674 | "[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]\n" 1675 | ] 1676 | } 1677 | ], 1678 | "source": [ 1679 | "#rechecking\n", 1680 | "print sorted(users.Age.unique())\n", 1681 | "#looks good now" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "markdown", 1686 | "metadata": {}, 1687 | "source": [ 1688 | "**Ratings Dataset**" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": 217, 1694 | "metadata": {}, 1695 | "outputs": [ 1696 | { 1697 | "data": { 1698 | "text/plain": [ 1699 | "(1149780, 3)" 1700 | ] 1701 | }, 1702 | "execution_count": 217, 1703 | "metadata": {}, 1704 | "output_type": "execute_result" 1705 | } 1706 | ], 1707 | "source": [ 1708 | "#checking shape\n", 1709 | "ratings.shape" 1710 | ] 1711 | }, 1712 | { 1713 | "cell_type": "code", 1714 | "execution_count": 218, 1715 | "metadata": {}, 1716 | "outputs": [ 1717 | { 1718 | "name": "stdout", 1719 | "output_type": "stream", 1720 | "text": [ 1721 | "75670906880\n" 1722 | ] 1723 | } 1724 | ], 1725 | "source": [ 1726 | "#ratings dataset will have n_users*n_books entries if every user rated every item, this shows that the dataset is very sparse\n", 1727 | "n_users = users.shape[0]\n", 1728 | "n_books = books.shape[0]\n", 1729 | "print n_users * n_books" 1730 | ] 1731 | }, 1732 | { 1733 | "cell_type": "code", 1734 | "execution_count": 219, 1735 | "metadata": { 1736 | "scrolled": true 1737 | }, 1738 | "outputs": [ 1739 | { 1740 | "data": { 1741 | "text/html": [ 1742 | "
\n", 1743 | "\n", 1756 | "\n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | "
userIDISBNbookRating
0276725034545104X0
127672601550612245
227672704465208020
3276729052165615X3
427672905217950286
\n", 1798 | "
" 1799 | ], 1800 | "text/plain": [ 1801 | " userID ISBN bookRating\n", 1802 | "0 276725 034545104X 0 \n", 1803 | "1 276726 0155061224 5 \n", 1804 | "2 276727 0446520802 0 \n", 1805 | "3 276729 052165615X 3 \n", 1806 | "4 276729 0521795028 6 " 1807 | ] 1808 | }, 1809 | "execution_count": 219, 1810 | "metadata": {}, 1811 | "output_type": "execute_result" 1812 | } 1813 | ], 1814 | "source": [ 1815 | "#checking first few rows...\n", 1816 | "ratings.head(5)" 1817 | ] 1818 | }, 1819 | { 1820 | "cell_type": "code", 1821 | "execution_count": 220, 1822 | "metadata": {}, 1823 | "outputs": [ 1824 | { 1825 | "data": { 1826 | "text/plain": [ 1827 | "array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)" 1828 | ] 1829 | }, 1830 | "execution_count": 220, 1831 | "metadata": {}, 1832 | "output_type": "execute_result" 1833 | } 1834 | ], 1835 | "source": [ 1836 | "ratings.bookRating.unique()" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 221, 1842 | "metadata": { 1843 | "collapsed": true 1844 | }, 1845 | "outputs": [], 1846 | "source": [ 1847 | "#ratings dataset should have books only which exist in our books dataset, unless new books are added to books dataset\n", 1848 | "ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]" 1849 | ] 1850 | }, 1851 | { 1852 | "cell_type": "code", 1853 | "execution_count": 222, 1854 | "metadata": {}, 1855 | "outputs": [ 1856 | { 1857 | "name": "stdout", 1858 | "output_type": "stream", 1859 | "text": [ 1860 | "(1149780, 3)\n", 1861 | "(1031136, 3)\n" 1862 | ] 1863 | } 1864 | ], 1865 | "source": [ 1866 | "print ratings.shape\n", 1867 | "print ratings_new.shape\n", 1868 | "#it can be seen that many rows having book ISBN not part of books dataset got dropped off" 1869 | ] 1870 | }, 1871 | { 1872 | "cell_type": "code", 1873 | "execution_count": 223, 1874 | "metadata": { 1875 | "collapsed": true 1876 | }, 1877 | "outputs": [], 1878 | "source": [ 1879 | "#ratings dataset should have ratings from users which exist in users dataset, unless new users are added to users dataset\n", 1880 | "ratings = ratings[ratings.userID.isin(users.userID)]" 1881 | ] 1882 | }, 1883 | { 1884 | "cell_type": "code", 1885 | "execution_count": 224, 1886 | "metadata": {}, 1887 | "outputs": [ 1888 | { 1889 | "name": "stdout", 1890 | "output_type": "stream", 1891 | "text": [ 1892 | "(1149780, 3)\n", 1893 | "(1031136, 3)\n" 1894 | ] 1895 | } 1896 | ], 1897 | "source": [ 1898 | "print ratings.shape\n", 1899 | "print ratings_new.shape\n", 1900 | "#no new users added, hence we will go with above dataset ratings_new (1031136, 3)" 1901 | ] 1902 | }, 1903 | { 1904 | "cell_type": "code", 1905 | "execution_count": 225, 1906 | "metadata": {}, 1907 | "outputs": [ 1908 | { 1909 | "name": "stdout", 1910 | "output_type": "stream", 1911 | "text": [ 1912 | "number of users: 278858\n", 1913 | "number of books: 271360\n" 1914 | ] 1915 | } 1916 | ], 1917 | "source": [ 1918 | "print \"number of users: \" + str(n_users)\n", 1919 | "print \"number of books: \" + str(n_books)" 1920 | ] 1921 | }, 1922 | { 1923 | "cell_type": "code", 1924 | "execution_count": 226, 1925 | "metadata": {}, 1926 | "outputs": [ 1927 | { 1928 | "name": "stdout", 1929 | "output_type": "stream", 1930 | "text": [ 1931 | "The sparsity level of Book Crossing dataset is 99.9986373416 %\n" 1932 | ] 1933 | } 1934 | ], 1935 | "source": [ 1936 | "#Sparsity of dataset in %\n", 1937 | "sparsity=1.0-len(ratings_new)/float(n_users*n_books)\n", 1938 | "print 'The sparsity level of Book Crossing dataset is ' + str(sparsity*100) + ' %'" 1939 | ] 1940 | }, 1941 | { 1942 | "cell_type": "code", 1943 | "execution_count": 228, 1944 | "metadata": {}, 1945 | "outputs": [ 1946 | { 1947 | "data": { 1948 | "text/plain": [ 1949 | "array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)" 1950 | ] 1951 | }, 1952 | "execution_count": 228, 1953 | "metadata": {}, 1954 | "output_type": "execute_result" 1955 | } 1956 | ], 1957 | "source": [ 1958 | "#As quoted in the description of the dataset -\n", 1959 | "#BX-Book-Ratings contains the book rating information. Ratings are either explicit, expressed on a scale from 1-10 \n", 1960 | "#higher values denoting higher appreciation, or implicit, expressed by 0\n", 1961 | "ratings.bookRating.unique()" 1962 | ] 1963 | }, 1964 | { 1965 | "cell_type": "code", 1966 | "execution_count": 229, 1967 | "metadata": { 1968 | "collapsed": true 1969 | }, 1970 | "outputs": [], 1971 | "source": [ 1972 | "#Hence segragating implicit and explict ratings datasets\n", 1973 | "ratings_explicit = ratings_new[ratings_new.bookRating != 0]\n", 1974 | "ratings_implicit = ratings_new[ratings_new.bookRating == 0]" 1975 | ] 1976 | }, 1977 | { 1978 | "cell_type": "code", 1979 | "execution_count": 230, 1980 | "metadata": {}, 1981 | "outputs": [ 1982 | { 1983 | "name": "stdout", 1984 | "output_type": "stream", 1985 | "text": [ 1986 | "(1031136, 3)\n", 1987 | "(383842, 3)\n", 1988 | "(647294, 3)\n" 1989 | ] 1990 | } 1991 | ], 1992 | "source": [ 1993 | "#checking shapes\n", 1994 | "print ratings_new.shape\n", 1995 | "print ratings_explicit.shape\n", 1996 | "print ratings_implicit.shape" 1997 | ] 1998 | }, 1999 | { 2000 | "cell_type": "code", 2001 | "execution_count": 231, 2002 | "metadata": { 2003 | "scrolled": true 2004 | }, 2005 | "outputs": [ 2006 | { 2007 | "data": { 2008 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAf0AAAFXCAYAAACoS5cAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHsdJREFUeJzt3X9Y1fXh9/HX4Zc/OAeRpW3NUClZtoa/SGvXkXK51Gu3\n05la0K3utrViusLS8DcpkuMq0JVz2LK7LhSQ0pXXfTVbmoMMIy+uwNJs5SwVi9Do6hxEOMDn/uN7\n7VwyFU7Njwd4Px9/6YcP8DomPfkcjuc4LMuyBAAAur2QYA8AAABXBtEHAMAQRB8AAEMQfQAADEH0\nAQAwBNEHAMAQYcEeYLfaWk+wJwAAcEX16+e66HGu9AEAMATRBwDAEEQfAABDEH0AAAxB9AEAMATR\nBwDAEEQfAABDEH0AAAxB9AEAMATRBwDAEEQfAABDEH0AAAxB9AEAMES3f5U9ADBJ8b6zwZ7Qxkx3\n72BPwHm40gcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAA\nQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcA\nwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQB\nADAE0QcAwBBhdn1gn8+nxYsXq7q6WiEhIcrMzFRYWJgWL14sh8OhIUOGKCMjQyEhISouLlZRUZHC\nwsKUmpqqcePG6dy5c1q0aJHOnDmjyMhIZWdnKyYmRpWVlcrKylJoaKjcbrfmz59v100AAKBbse1K\nv6SkRM3NzSoqKtK8efO0fv16rV27VmlpaSooKJBlWdqzZ49qa2uVn5+voqIibd68Wbm5uWpqalJh\nYaHi4+NVUFCgqVOnauPGjZKkjIwM5eTkqLCwUFVVVTp8+LBdNwEAgG7FtugPHjxYLS0tam1tldfr\nVVhYmA4dOqTRo0dLkpKSklRWVqaDBw9qxIgRioiIkMvlUmxsrI4cOaKKigqNHTvWf+7+/fvl9XrV\n1NSk2NhYORwOud1ulZWV2XUTAADoVmy7e793796qrq7WpEmTVFdXp7y8PB04cEAOh0OSFBkZKY/H\nI6/XK5fL5X+/yMhIeb3eNsfPP9fpdLY598SJE+3u6Nu3t8LCQm24hQDQGZ0N9oA2+vVzdXwSrhjb\nov/CCy/I7Xbr0Ucf1eeff645c+bI5/P5315fX6+oqCg5nU7V19e3Oe5yudocb+/cqKiodnfU1XWu\nLwAAMEltrSfYE4x0qW+2bLt7Pyoqyn+l3qdPHzU3N+vGG29UeXm5JKm0tFSJiYlKSEhQRUWFGhsb\n5fF4dPToUcXHx2vkyJEqKSnxnztq1Cg5nU6Fh4fr+PHjsixL+/btU2Jiol03AQCAbsVhWZZlxweu\nr6/X0qVLVVtbK5/Pp9mzZ+umm27SihUr5PP5FBcXpzVr1ig0NFTFxcXatm2bLMvSAw88oAkTJqih\noUHp6emqra1VeHi4cnJy1K9fP1VWVuqJJ55QS0uL3G63FixY0O4OvssEYJLifZ3r3s2Z7t7BnmCk\nS13p2xb9zoLoAzAJ0YcUhLv3AQBA50L0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8A\nAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEH\nAMAQRB8AAEMQfQAADEH0AQAwRFiwBwAA0NW0bv1nsCf4hdwbH/i5Nu4AAACdCNEHAMAQRB8AAEMQ\nfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQ\nvLQuALTjt6VVwZ7g92zSsGBPQBfHlT4AAIYg+gAAGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBg\nCKIPAIAhiD4AAIYg+gAAGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBgCKIPAIAhwuz84Js2bdKb\nb74pn8+n5ORkjR49WosXL5bD4dCQIUOUkZGhkJAQFRcXq6ioSGFhYUpNTdW4ceN07tw5LVq0SGfO\nnFFkZKSys7MVExOjyspKZWVlKTQ0VG63W/Pnz7fzJgAAbHby1YZgT2hjwJRewZ5gG9uu9MvLy/Xe\ne++psLBQ+fn5+uKLL7R27VqlpaWpoKBAlmVpz549qq2tVX5+voqKirR582bl5uaqqalJhYWFio+P\nV0FBgaZOnaqNGzdKkjIyMpSTk6PCwkJVVVXp8OHDdt0EAAC6Fduiv2/fPsXHx2vevHl68MEHdfvt\nt+vQoUMaPXq0JCkpKUllZWU6ePCgRowYoYiICLlcLsXGxurIkSOqqKjQ2LFj/efu379fXq9XTU1N\nio2NlcPhkNvtVllZmV03AQCAbsW2u/fr6up06tQp5eXl6eTJk0pNTZVlWXI4HJKkyMhIeTweeb1e\nuVwu//tFRkbK6/W2OX7+uU6ns825J06caHdH3769FRYWasMtBIArq18/V8cn6aztO76NQDafVOe6\nez+QzTVXYEegAvt78T9si350dLTi4uIUERGhuLg49ejRQ1988YX/7fX19YqKipLT6VR9fX2b4y6X\nq83x9s6Niopqd0ddXef6AgCA76q21hPsCd8am+13sb2X+kbAtrv3R40apbfeekuWZammpkYNDQ26\n9dZbVV5eLkkqLS1VYmKiEhISVFFRocbGRnk8Hh09elTx8fEaOXKkSkpK/OeOGjVKTqdT4eHhOn78\nuCzL0r59+5SYmGjXTQAAoFux7Up/3LhxOnDggKZPny7LsrRy5UoNGDBAK1asUG5uruLi4jRhwgSF\nhoZq1qxZSklJkWVZWrBggXr06KHk5GSlp6crOTlZ4eHhysnJkSStWrVKCxcuVEtLi9xut4YNG2bX\nTQAAoFtxWJZlBXuEnbra3TQAOpffllYFe4Lfs0kdX+QU7+tcP9Kc6e7d4Tld8Z/stW795xVYEpiQ\ne+MvOHbF794HAACdC9EHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEH\nAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADBFQ\n9DMzMy84lp6eftnHAAAA+4S198Zly5bpxIkT+uCDD/Txxx/7jzc3N8vj8dg+DgAAXD7tRj81NVXV\n1dXKysrS/Pnz/cdDQ0N13XXX2T4OAABcPu1Gf8CAARowYIB27twpr9crj8cjy7IkSWfPnlV0dPQV\nGQkAAP577Ub/3zZt2qRNmza1ibzD4dCePXtsGwYAAC6vgKL/0ksvaffu3YqJibF7DwAAsElAj97/\nwQ9+oD59+ti9BQAA2CigK/1BgwYpJSVFY8aMUUREhP/4+Q/uAwAAnVtA0b/66qt19dVX270FAADY\nKKDoc0UPAEDXF1D0b7jhBjkcjjbH+vfvr5KSEltGAQCAyy+g6B85csT/a5/Pp927d6uystK2UQAA\n4PL71i+4Ex4erkmTJumdd96xYw8AALBJQFf6r7zyiv/XlmXp448/Vnh4uG2jAADA5RdQ9MvLy9v8\nvm/fvlq3bp0tgwAAgD0Civ7atWvl8/l07NgxtbS0aMiQIQoLC+hdAQBAJxFQuT/44AM99NBDio6O\nVmtrq06fPq0//elPGjZsmN37AADAZRJQ9NesWaN169b5I19ZWanMzEy9/PLLto4DAACXT0CP3j97\n9mybq/rhw4ersbHRtlEAAODyCyj6ffr00e7du/2/3717d5uX2QUAAJ1fQHfvZ2Zm6oEHHtCyZcv8\nx4qKimwbBaBjc/b9MdgT2njR/XCwJwDoQEBX+qWlperVq5f27t2rF198UTExMXr33Xft3gYAAC6j\ngKJfXFyswsJC9e7dWzfccIN27NihLVu22L0NAABcRgFF3+fztXkGPp6NDwCAriegn+mPHz9ec+bM\n0aRJkyRJf//733XHHXfYOgwAAFxeAUV/0aJF2rVrlw4cOKCwsDDNnj1b48ePt3sbAAC4jAJ+Lt2J\nEydq4sSJdm4BAAA2+tYvrQsAALomog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhrA1+mfO\nnNFtt92mo0eP6rPPPlNycrJSUlKUkZGh1tZWSf/zvP7Tpk3TzJkztXfvXknSuXPn9Pvf/14pKSm6\n//779dVXX0mSKisrNWPGDN1zzz3asGGDndMBAOh2bIu+z+fTypUr1bNnT0nS2rVrlZaWpoKCAlmW\npT179qi2tlb5+fkqKirS5s2blZubq6amJhUWFio+Pl4FBQWaOnWqNm7cKEnKyMhQTk6OCgsLVVVV\npcOHD9s1HwCAbse26GdnZ+uee+5R//79JUmHDh3S6NGjJUlJSUkqKyvTwYMHNWLECEVERMjlcik2\nNlZHjhxRRUWFxo4d6z93//798nq9ampqUmxsrBwOh9xut8rKyuyaDwBAtxPw0/B+Gzt27FBMTIzG\njh2rZ599VpJkWZYcDockKTIyUh6PR16vVy6Xy/9+kZGR8nq9bY6ff67T6Wxz7okTJzrc0rdvb4WF\nhV7OmwfgIvr1c3V8Ev4rgf0Zn7V9x7cRyOaTargCSwIXyOaaK7AjUN/ma8+W6G/fvl0Oh0P79+/X\nhx9+qPT0dP/P5SWpvr5eUVFRcjqdqq+vb3Pc5XK1Od7euVFRUR1uqavrXF8AQHdVW+sJ9oRuryv+\nGbPZfhfbe6lvBGy5e3/r1q3asmWL8vPzNXToUGVnZyspKUnl5eWSpNLSUiUmJiohIUEVFRVqbGyU\nx+PR0aNHFR8fr5EjR6qkpMR/7qhRo+R0OhUeHq7jx4/Lsizt27dPiYmJdswHAKBbsuVK/2LS09O1\nYsUK5ebmKi4uThMmTFBoaKhmzZqllJQUWZalBQsWqEePHkpOTlZ6erqSk5MVHh6unJwcSdKqVau0\ncOFCtbS0yO12a9iwYVdqPgAAXZ7t0c/Pz/f/esuWLRe8febMmZo5c2abY7169dLTTz99wbnDhw9X\ncXHx5R8JAIABeHIeAAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAM\nQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAA\nQ4QFewAAc/yfkp3BntDG/73tl8GeAFxRXOkDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGI\nPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAI\nog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAY\ngugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGCIMDs+qM/n09KlS1VdXa2mpialpqbq+uuv1+LFi+Vw\nODRkyBBlZGQoJCRExcXFKioqUlhYmFJTUzVu3DidO3dOixYt0pkzZxQZGans7GzFxMSosrJSWVlZ\nCg0Nldvt1vz58+2YDwBAt2TLlf7OnTsVHR2tgoICPffcc8rMzNTatWuVlpamgoICWZalPXv2qLa2\nVvn5+SoqKtLmzZuVm5urpqYmFRYWKj4+XgUFBZo6dao2btwoScrIyFBOTo4KCwtVVVWlw4cP2zEf\nAIBuyZboT5w4UQ8//LAkybIshYaG6tChQxo9erQkKSkpSWVlZTp48KBGjBihiIgIuVwuxcbG6siR\nI6qoqNDYsWP95+7fv19er1dNTU2KjY2Vw+GQ2+1WWVmZHfMBAOiWbLl7PzIyUpLk9Xr10EMPKS0t\nTdnZ2XI4HP63ezweeb1euVyuNu/n9XrbHD//XKfT2ebcEydOdLilb9/eCgsLvZw3D8BF9Ovn6vik\nTqarbQ5s71nbd3wbgWw+qYYrsCRwgWyuuQI7AvVt/h7bEn1J+vzzzzVv3jylpKRo8uTJevLJJ/1v\nq6+vV1RUlJxOp+rr69scd7lcbY63d25UVFSHO+rqOtcXANBd1dZ6gj3hW+tqm7vaXonNV8LF9l7q\nGwFb7t4/ffq05s6dq0WLFmn69OmSpBtvvFHl5eWSpNLSUiUmJiohIUEVFRVqbGyUx+PR0aNHFR8f\nr5EjR6qkpMR/7qhRo+R0OhUeHq7jx4/Lsizt27dPiYmJdswHAKBbsuVKPy8vT9988402btzofxDe\nsmXLtGbNGuXm5iouLk4TJkxQaGioZs2apZSUFFmWpQULFqhHjx5KTk5Wenq6kpOTFR4erpycHEnS\nqlWrtHDhQrW0tMjtdmvYsGF2zAcAoFuyJfrLly/X8uXLLzi+ZcuWC47NnDlTM2fObHOsV69eevrp\npy84d/jw4SouLr58QwEAMAhPzgMAgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugD\nAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6\nAAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGI\nPgAAhiD6AAAYgugDAGAIog8AgCHCgj0A6Cz+tHd6sCf4zRv3crAnAOiGuNIHAMAQRB8AAEMQfQAA\nDEH0AQAwBNEHAMAQRB8AAEMQfQAADMG/04ctSv7fjGBPaOO2//VSsCcAQNBxpQ8AgCGIPgAAhiD6\nAAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgn+n3wWcKf7fwZ7Qxvdmbgn2BADAd8CVPgAAhiD6AAAY\nwsy7919+NdgL2po+JdgLAAAG4EofAABDdLkr/dbWVj3++OP66KOPFBERoTVr1mjgwIHBngUAQKfX\n5a70d+/eraamJm3btk2PPvqo/vCHPwR7EgAAXUKXi35FRYXGjh0rSRo+fLg++OCDIC8CAKBrcFiW\nZQV7xLexbNky3XnnnbrtttskSbfffrt2796tsLAu95MKAACuqC53pe90OlVfX+//fWtrK8EHACAA\nXS76I0eOVGlpqSSpsrJS8fHxQV4EAEDX0OXu3v/3o/f/+c9/yrIsPfHEE7ruuuuCPQsAgE6vy0Uf\nAAB8N13u7n0AAPDdEH0AAAzBw96/o6qqKj311FPKz88P9pQO+Xw+LV26VNXV1WpqalJqaqruuOOO\nYM9qV0tLi5YvX65jx47J4XBo1apVXeJBm2fOnNG0adP0/PPPd4nHmvzqV7+S0+mUJA0YMEBr164N\n8qL2bdq0SW+++aZ8Pp+Sk5M1Y8aMYE9q144dO/TXv/5VktTY2KgPP/xQb7/9tqKiooK87NJ8Pp8W\nL16s6upqhYSEKDMzs1P/XW5qatKSJUt04sQJOZ1OrVy5UoMGDQr2rEs6vx2fffaZFi9eLIfDoSFD\nhigjI0MhIfZeixP97+Avf/mLdu7cqV69egV7SkB27typ6OhoPfnkk/r66681derUTh/9vXv3SpKK\niopUXl6udevW6c9//nOQV7XP5/Np5cqV6tmzZ7CnBKSxsVGWZXWJb1wlqby8XO+9954KCwvV0NCg\n559/PtiTOjRt2jRNmzZNkrRq1SrdddddnTr4klRSUqLm5mYVFRXp7bff1vr16/XMM88Ee9YlFRcX\nq3fv3iouLta//vUvZWZmavPmzcGedVH/2Y61a9cqLS1NY8aM0cqVK7Vnzx79/Oc/t3UDd+9/B7Gx\nsZ36i+A/TZw4UQ8//LAkybIshYaGBnlRx8aPH6/MzExJ0qlTpzr9/yglKTs7W/fcc4/69+8f7CkB\nOXLkiBoaGjR37lzNnj1blZWVwZ7Urn379ik+Pl7z5s3Tgw8+qNtvvz3YkwL2/vvv65NPPtHdd98d\n7CkdGjx4sFpaWtTa2iqv19vpnwflk08+UVJSkiQpLi5OR48eDfKiS/vPdhw6dEijR4+WJCUlJams\nrMz2DZ37v2YnNWHCBJ08eTLYMwIWGRkpSfJ6vXrooYeUlpYW5EWBCQsLU3p6ut544w09/fTTwZ7T\nrh07digmJkZjx47Vs88+G+w5AenZs6fuu+8+zZgxQ59++qnuv/9+7dq1q9P+T76urk6nTp1SXl6e\nTp48qdTUVO3atUsOhyPY0zq0adMmzZs3L9gzAtK7d29VV1dr0qRJqqurU15eXrAntWvo0KHau3ev\nxo8fr6qqKtXU1KilpaVTXtz8Zzssy/L//Y2MjJTH47F9A1f6hvj88881e/ZsTZkyRZMnTw72nIBl\nZ2fr9ddf14oVK3T27Nlgz7mk7du3q6ysTLNmzdKHH36o9PR01dbWBntWuwYPHqxf/vKXcjgcGjx4\nsKKjozv15ujoaLndbkVERCguLk49evTQV199FexZHfrmm2907Ngx3XLLLcGeEpAXXnhBbrdbr7/+\nul599VUtXrxYjY2NwZ51SXfddZecTqdSUlL0xhtv6Mc//nGnDP7FnP/z+/r6+ityjybRN8Dp06c1\nd+5cLVq0SNOnTw/2nIC88sor2rRpkySpV69ecjgctj/A5b+xdetWbdmyRfn5+Ro6dKiys7PVr1+/\nYM9q18svv+x/lcqamhp5vd5OvXnUqFF66623ZFmWampq1NDQoOjo6GDP6tCBAwd06623BntGwKKi\nouRyuSRJffr0UXNzs1paWoK86tLef/993XrrrSosLNTEiRN17bXXBntSwG688UaVl5dLkkpLS5WY\nmGj75+yc9+PhssrLy9M333yjjRs3auPGjZL+5wElnfkBZ3feeaeWLFmie++9V83NzVq6dGmn3tsV\nTZ8+XUuWLFFycrIcDoeeeOKJTnvXviSNGzdOBw4c0PTp02VZllauXNklruiOHTumAQMGBHtGwH79\n619r6dKlSklJkc/n04IFC9S7d+9gz7qkgQMH6o9//KPy8vLkcrmUlZUV7EkBS09P14oVK5Sbm6u4\nuDhNmDDB9s/JM/IBAGCIznt/KQAAuKyIPgAAhiD6AAAYgugDAGAIog8AgCGIPmCQ8vJyzZo167/+\nOLNmzfL/++J/O3nypG666SZNmTLF/yRQP/vZzwJ6NsXzN02ZMuW/3gfg4jrvP8oF0OX0799fr776\nqv/3NTU1mjBhgn7xi1+0+0pt7777rv/X578/gMuL6AOGqaur03333acvv/xSCQkJysjI8L+aWmtr\nq6699lqtXr1aV111lSorK5WVlaXGxkb17dtXq1ev1sCBA/0f68yZM5ozZ47S0tJ0ww03XPC5amtr\nZVmWIiMj1dzcrMcff1wff/yxTp8+rcGDB2vDhg166qmnJEkzZszQSy+9pB/96Ef66KOP9Mwzz6im\npkafffaZqqurNWPGDKWmpsrn8ykjI0MVFRW6+uqr5XA49Lvf/U5jxoy5Yn+GQFdF9AHDnDx5Uhs2\nbNDAgQO1YMECPfvss9q2bZsKCws1YMAAPffcc1q9erWeeuopPfLII1q/fr0SEhL0t7/9TY888oi2\nb98uSfJ4PPrtb3+r+fPna/z48Tp58qS+/PJLTZkyRY2Njaqrq9NPfvITbdiwQd///vd14MABhYeH\na9u2bWptbdWcOXNUUlKi5cuXKz8/Xy+99NIFWz/66CNt3bpVHo9H48eP17333qtXX31VDQ0N2rVr\nl06dOtWlXksCCDZ+pg8YJjExUYMGDZLD4dDkyZP14osvKiEhwf9UsXfffbfeeecdffrpp4qKilJC\nQoIkadKkSTp+/Lj/lcAyMjLU3NysO++80/+x/333/muvvaYpU6bI5/P5X2jm5ptvVkpKirZu3aqs\nrCx9+umnHb6I0pgxYxQREaHvfe97io6Olsfj0dtvv63JkyfL4XDohz/8YZd6Xnsg2Ig+YJjzn1//\n/Jf2PP9Yc3OzWltbL3hfy7L8L75y//33KyYmRoWFhRecFxISoscee0xnzpzR888/L0nas2ePFi5c\nqJ49e2ratGm6+eab1dGzgPfo0cP/a4fDIcuyFBoaetFtADpG9AHDVFRU6NSpU2ptbdUrr7yi3/zm\nN6qqqvK/zve2bds0ZswYxcXF6euvv9bBgwclSa+99pquueYa/yvbDR06VBkZGdqwYYNqamou+Dxh\nYWF67LHHlJeXp9raWu3fv1+TJk3SXXfdpauuukoHDhzwfwMRGhqq5ubmgPb/9Kc/1WuvveZ/tb13\n3333gm9cAFwcP9MHDHP99ddr6dKlqq2t1S233KL77rtP119/vebPny+fz6drrrlGWVlZioiI0Lp1\n65SZmamGhgb16dNH69ata/OxBg0apHvvvVerV6/WkiVLLvhcSUlJGj58uNavX6/Zs2dr4cKF2rVr\nlyIiIjR8+HD/Nxp33HGHpkyZoh07dnS4f+bMmTpy5IgmT56sfv366ZprruEVGIEA8Sp7ALqUf/zj\nH7IsS+PGjZPH49HUqVO1fft2/z0QAC6N6APoUk6cOKHHHnvM/yDAuXPn8oQ+QICIPgAAhuCBfAAA\nGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBgiP8PjIYQhxu3xXEAAAAASUVORK5CYII=\n", 2009 | "text/plain": [ 2010 | "" 2011 | ] 2012 | }, 2013 | "metadata": {}, 2014 | "output_type": "display_data" 2015 | } 2016 | ], 2017 | "source": [ 2018 | "#plotting count of bookRating\n", 2019 | "sns.countplot(data=ratings_explicit , x='bookRating')\n", 2020 | "plt.show()\n", 2021 | "#It can be seen that higher ratings are more common amongst users and rating 8 has been rated highest number of times" 2022 | ] 2023 | }, 2024 | { 2025 | "cell_type": "markdown", 2026 | "metadata": {}, 2027 | "source": [ 2028 | "**Simple Popularity Based Recommendation System**" 2029 | ] 2030 | }, 2031 | { 2032 | "cell_type": "code", 2033 | "execution_count": 232, 2034 | "metadata": { 2035 | "scrolled": true 2036 | }, 2037 | "outputs": [ 2038 | { 2039 | "name": "stdout", 2040 | "output_type": "stream", 2041 | "text": [ 2042 | "Following books are recommended\n" 2043 | ] 2044 | }, 2045 | { 2046 | "data": { 2047 | "text/html": [ 2048 | "
\n", 2049 | "\n", 2062 | "\n", 2063 | " \n", 2064 | " \n", 2065 | " \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | " \n", 2096 | " \n", 2097 | " \n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | "
bookRatingISBNbookTitlebookAuthoryearOfPublicationpublisher
40857870316666343The Lovely Bones: A NovelAlice Sebold2002Little, Brown
74841080385504209The Da Vinci CodeDan Brown2003Doubleday
52231340312195516The Red Tent (Bestselling Backlist)Anita Diamant1998Picador USA
21432798059035342XHarry Potter and the Sorcerer's Stone (Harry Potter (Paperback))J. K. Rowling1999Arthur A. Levine Books
35625950142001740The Secret Life of BeesSue Monk Kidd2003Penguin Books
2625510971880107Wild AnimusRich Shapero2004Too Far
110525240060928336Divine Secrets of the Ya-Ya Sisterhood: A NovelRebecca Wells1997Perennial
70624020446672211Where the Heart Is (Oprah's Book Club (Paperback))Billie Letts1998Warner Books
23122190452282152Girl with a Pearl EarringTracy Chevalier2001Plume Books
11821790671027360Angels &amp; DemonsDan Brown2001Pocket Star
\n", 2167 | "
" 2168 | ], 2169 | "text/plain": [ 2170 | " bookRating ISBN \\\n", 2171 | "408 5787 0316666343 \n", 2172 | "748 4108 0385504209 \n", 2173 | "522 3134 0312195516 \n", 2174 | "2143 2798 059035342X \n", 2175 | "356 2595 0142001740 \n", 2176 | "26 2551 0971880107 \n", 2177 | "1105 2524 0060928336 \n", 2178 | "706 2402 0446672211 \n", 2179 | "231 2219 0452282152 \n", 2180 | "118 2179 0671027360 \n", 2181 | "\n", 2182 | " bookTitle \\\n", 2183 | "408 The Lovely Bones: A Novel \n", 2184 | "748 The Da Vinci Code \n", 2185 | "522 The Red Tent (Bestselling Backlist) \n", 2186 | "2143 Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) \n", 2187 | "356 The Secret Life of Bees \n", 2188 | "26 Wild Animus \n", 2189 | "1105 Divine Secrets of the Ya-Ya Sisterhood: A Novel \n", 2190 | "706 Where the Heart Is (Oprah's Book Club (Paperback)) \n", 2191 | "231 Girl with a Pearl Earring \n", 2192 | "118 Angels & Demons \n", 2193 | "\n", 2194 | " bookAuthor yearOfPublication publisher \n", 2195 | "408 Alice Sebold 2002 Little, Brown \n", 2196 | "748 Dan Brown 2003 Doubleday \n", 2197 | "522 Anita Diamant 1998 Picador USA \n", 2198 | "2143 J. K. Rowling 1999 Arthur A. Levine Books \n", 2199 | "356 Sue Monk Kidd 2003 Penguin Books \n", 2200 | "26 Rich Shapero 2004 Too Far \n", 2201 | "1105 Rebecca Wells 1997 Perennial \n", 2202 | "706 Billie Letts 1998 Warner Books \n", 2203 | "231 Tracy Chevalier 2001 Plume Books \n", 2204 | "118 Dan Brown 2001 Pocket Star " 2205 | ] 2206 | }, 2207 | "execution_count": 232, 2208 | "metadata": {}, 2209 | "output_type": "execute_result" 2210 | } 2211 | ], 2212 | "source": [ 2213 | "#At this point , a simple popularity based recommendation system can be built based on count of user ratings for different books\n", 2214 | "ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['bookRating'].sum())\n", 2215 | "top10 = ratings_count.sort_values('bookRating', ascending = False).head(10)\n", 2216 | "print \"Following books are recommended\"\n", 2217 | "top10.merge(books, left_index = True, right_on = 'ISBN')\n", 2218 | "\n", 2219 | "#Given below are top 10 recommendations based on popularity. It is evident that books authored by J.K. Rowling are most popular" 2220 | ] 2221 | }, 2222 | { 2223 | "cell_type": "code", 2224 | "execution_count": 233, 2225 | "metadata": { 2226 | "collapsed": true 2227 | }, 2228 | "outputs": [], 2229 | "source": [ 2230 | "#Similarly segregating users who have given explicit ratings from 1-10 and those whose implicit behavior was tracked\n", 2231 | "users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]\n", 2232 | "users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]" 2233 | ] 2234 | }, 2235 | { 2236 | "cell_type": "code", 2237 | "execution_count": 234, 2238 | "metadata": {}, 2239 | "outputs": [ 2240 | { 2241 | "name": "stdout", 2242 | "output_type": "stream", 2243 | "text": [ 2244 | "(278858, 3)\n", 2245 | "(68091, 3)\n", 2246 | "(52451, 3)\n" 2247 | ] 2248 | } 2249 | ], 2250 | "source": [ 2251 | "#checking shapes\n", 2252 | "print users.shape\n", 2253 | "print users_exp_ratings.shape\n", 2254 | "print users_imp_ratings.shape" 2255 | ] 2256 | }, 2257 | { 2258 | "cell_type": "markdown", 2259 | "metadata": {}, 2260 | "source": [ 2261 | "**Collaborative Filtering Based Recommendation Systems**" 2262 | ] 2263 | }, 2264 | { 2265 | "cell_type": "code", 2266 | "execution_count": 235, 2267 | "metadata": { 2268 | "collapsed": true 2269 | }, 2270 | "outputs": [], 2271 | "source": [ 2272 | "#To cope up with computing power I have and to reduce the dataset size, I am considering users who have rated atleast 100 books\n", 2273 | "#and books which have atleast 100 ratings\n", 2274 | "counts1 = ratings_explicit['userID'].value_counts()\n", 2275 | "ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1 >= 100].index)]\n", 2276 | "counts = ratings_explicit['bookRating'].value_counts()\n", 2277 | "ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(counts[counts >= 100].index)]" 2278 | ] 2279 | }, 2280 | { 2281 | "cell_type": "code", 2282 | "execution_count": 236, 2283 | "metadata": {}, 2284 | "outputs": [ 2285 | { 2286 | "name": "stdout", 2287 | "output_type": "stream", 2288 | "text": [ 2289 | "(449, 66574)\n" 2290 | ] 2291 | }, 2292 | { 2293 | "data": { 2294 | "text/html": [ 2295 | "
\n", 2296 | "\n", 2309 | "\n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | " \n", 2435 | " \n", 2436 | " \n", 2437 | " \n", 2438 | " \n", 2439 | " \n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | " \n", 2446 | " \n", 2447 | " \n", 2448 | " \n", 2449 | " \n", 2450 | " \n", 2451 | " \n", 2452 | " \n", 2453 | " \n", 2454 | " \n", 2455 | " \n", 2456 | " \n", 2457 | " \n", 2458 | " \n", 2459 | " \n", 2460 | " \n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | " \n", 2467 | " \n", 2468 | " \n", 2469 | " \n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " \n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | "
ISBN00009131540001046438000104687X00010472130001047973000104799X0001048082000105373600010537440001055607...B000092Q0AB00009EF82B00009NDANB0000DYXIDB0000T6KHIB0000VZEJQB0000X8HIEB00013AX9EB0001I1KOGB000234N3A
userID
2033NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2110NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2276NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4017NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4385NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 2483 | "

5 rows × 66574 columns

\n", 2484 | "
" 2485 | ], 2486 | "text/plain": [ 2487 | "ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 \\\n", 2488 | "userID \n", 2489 | "2033 NaN NaN NaN NaN NaN \n", 2490 | "2110 NaN NaN NaN NaN NaN \n", 2491 | "2276 NaN NaN NaN NaN NaN \n", 2492 | "4017 NaN NaN NaN NaN NaN \n", 2493 | "4385 NaN NaN NaN NaN NaN \n", 2494 | "\n", 2495 | "ISBN 000104799X 0001048082 0001053736 0001053744 0001055607 \\\n", 2496 | "userID \n", 2497 | "2033 NaN NaN NaN NaN NaN \n", 2498 | "2110 NaN NaN NaN NaN NaN \n", 2499 | "2276 NaN NaN NaN NaN NaN \n", 2500 | "4017 NaN NaN NaN NaN NaN \n", 2501 | "4385 NaN NaN NaN NaN NaN \n", 2502 | "\n", 2503 | "ISBN ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID \\\n", 2504 | "userID ... \n", 2505 | "2033 ... NaN NaN NaN NaN \n", 2506 | "2110 ... NaN NaN NaN NaN \n", 2507 | "2276 ... NaN NaN NaN NaN \n", 2508 | "4017 ... NaN NaN NaN NaN \n", 2509 | "4385 ... NaN NaN NaN NaN \n", 2510 | "\n", 2511 | "ISBN B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A \n", 2512 | "userID \n", 2513 | "2033 NaN NaN NaN NaN NaN NaN \n", 2514 | "2110 NaN NaN NaN NaN NaN NaN \n", 2515 | "2276 NaN NaN NaN NaN NaN NaN \n", 2516 | "4017 NaN NaN NaN NaN NaN NaN \n", 2517 | "4385 NaN NaN NaN NaN NaN NaN \n", 2518 | "\n", 2519 | "[5 rows x 66574 columns]" 2520 | ] 2521 | }, 2522 | "execution_count": 236, 2523 | "metadata": {}, 2524 | "output_type": "execute_result" 2525 | } 2526 | ], 2527 | "source": [ 2528 | "#Generating ratings matrix from explicit ratings table\n", 2529 | "ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')\n", 2530 | "userID = ratings_matrix.index\n", 2531 | "ISBN = ratings_matrix.columns\n", 2532 | "print(ratings_matrix.shape)\n", 2533 | "ratings_matrix.head()\n", 2534 | "#Notice that most of the values are NaN (undefined) implying absence of ratings" 2535 | ] 2536 | }, 2537 | { 2538 | "cell_type": "code", 2539 | "execution_count": 237, 2540 | "metadata": {}, 2541 | "outputs": [ 2542 | { 2543 | "name": "stdout", 2544 | "output_type": "stream", 2545 | "text": [ 2546 | "449 66574\n" 2547 | ] 2548 | } 2549 | ], 2550 | "source": [ 2551 | "n_users = ratings_matrix.shape[0] #considering only those users who gave explicit ratings\n", 2552 | "n_books = ratings_matrix.shape[1]\n", 2553 | "print n_users, n_books" 2554 | ] 2555 | }, 2556 | { 2557 | "cell_type": "code", 2558 | "execution_count": 238, 2559 | "metadata": { 2560 | "collapsed": true 2561 | }, 2562 | "outputs": [], 2563 | "source": [ 2564 | "#since NaNs cannot be handled by training algorithms, replacing these by 0, which indicates absence of ratings\n", 2565 | "#setting data type\n", 2566 | "ratings_matrix.fillna(0, inplace = True)\n", 2567 | "ratings_matrix = ratings_matrix.astype(np.int32)" 2568 | ] 2569 | }, 2570 | { 2571 | "cell_type": "code", 2572 | "execution_count": 239, 2573 | "metadata": {}, 2574 | "outputs": [ 2575 | { 2576 | "data": { 2577 | "text/html": [ 2578 | "
\n", 2579 | "\n", 2592 | "\n", 2593 | " \n", 2594 | " \n", 2595 | " \n", 2596 | " \n", 2597 | " \n", 2598 | " \n", 2599 | " \n", 2600 | " \n", 2601 | " \n", 2602 | " \n", 2603 | " \n", 2604 | " \n", 2605 | " \n", 2606 | " \n", 2607 | " \n", 2608 | " \n", 2609 | " \n", 2610 | " \n", 2611 | " \n", 2612 | " \n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | " \n", 2656 | " \n", 2657 | " \n", 2658 | " \n", 2659 | " \n", 2660 | " \n", 2661 | " \n", 2662 | " \n", 2663 | " \n", 2664 | " \n", 2665 | " \n", 2666 | " \n", 2667 | " \n", 2668 | " \n", 2669 | " \n", 2670 | " \n", 2671 | " \n", 2672 | " \n", 2673 | " \n", 2674 | " \n", 2675 | " \n", 2676 | " \n", 2677 | " \n", 2678 | " \n", 2679 | " \n", 2680 | " \n", 2681 | " \n", 2682 | " \n", 2683 | " \n", 2684 | " \n", 2685 | " \n", 2686 | " \n", 2687 | " \n", 2688 | " \n", 2689 | " \n", 2690 | " \n", 2691 | " \n", 2692 | " \n", 2693 | " \n", 2694 | " \n", 2695 | " \n", 2696 | " \n", 2697 | " \n", 2698 | " \n", 2699 | " \n", 2700 | " \n", 2701 | " \n", 2702 | " \n", 2703 | " \n", 2704 | " \n", 2705 | " \n", 2706 | " \n", 2707 | " \n", 2708 | " \n", 2709 | " \n", 2710 | " \n", 2711 | " \n", 2712 | " \n", 2713 | " \n", 2714 | " \n", 2715 | " \n", 2716 | " \n", 2717 | " \n", 2718 | " \n", 2719 | " \n", 2720 | " \n", 2721 | " \n", 2722 | " \n", 2723 | " \n", 2724 | " \n", 2725 | " \n", 2726 | " \n", 2727 | " \n", 2728 | " \n", 2729 | " \n", 2730 | " \n", 2731 | " \n", 2732 | " \n", 2733 | " \n", 2734 | " \n", 2735 | " \n", 2736 | " \n", 2737 | " \n", 2738 | " \n", 2739 | " \n", 2740 | " \n", 2741 | " \n", 2742 | " \n", 2743 | " \n", 2744 | " \n", 2745 | " \n", 2746 | " \n", 2747 | " \n", 2748 | " \n", 2749 | " \n", 2750 | " \n", 2751 | " \n", 2752 | " \n", 2753 | " \n", 2754 | " \n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 | " \n", 2763 | " \n", 2764 | " \n", 2765 | "
ISBN00009131540001046438000104687X00010472130001047973000104799X0001048082000105373600010537440001055607...B000092Q0AB00009EF82B00009NDANB0000DYXIDB0000T6KHIB0000VZEJQB0000X8HIEB00013AX9EB0001I1KOGB000234N3A
userID
20330000000000...0000000000
21100000000000...0000000000
22760000000000...0000000000
40170000000000...0000000000
43850000000000...0000000000
\n", 2766 | "

5 rows × 66574 columns

\n", 2767 | "
" 2768 | ], 2769 | "text/plain": [ 2770 | "ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 \\\n", 2771 | "userID \n", 2772 | "2033 0 0 0 0 0 \n", 2773 | "2110 0 0 0 0 0 \n", 2774 | "2276 0 0 0 0 0 \n", 2775 | "4017 0 0 0 0 0 \n", 2776 | "4385 0 0 0 0 0 \n", 2777 | "\n", 2778 | "ISBN 000104799X 0001048082 0001053736 0001053744 0001055607 \\\n", 2779 | "userID \n", 2780 | "2033 0 0 0 0 0 \n", 2781 | "2110 0 0 0 0 0 \n", 2782 | "2276 0 0 0 0 0 \n", 2783 | "4017 0 0 0 0 0 \n", 2784 | "4385 0 0 0 0 0 \n", 2785 | "\n", 2786 | "ISBN ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID \\\n", 2787 | "userID ... \n", 2788 | "2033 ... 0 0 0 0 \n", 2789 | "2110 ... 0 0 0 0 \n", 2790 | "2276 ... 0 0 0 0 \n", 2791 | "4017 ... 0 0 0 0 \n", 2792 | "4385 ... 0 0 0 0 \n", 2793 | "\n", 2794 | "ISBN B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A \n", 2795 | "userID \n", 2796 | "2033 0 0 0 0 0 0 \n", 2797 | "2110 0 0 0 0 0 0 \n", 2798 | "2276 0 0 0 0 0 0 \n", 2799 | "4017 0 0 0 0 0 0 \n", 2800 | "4385 0 0 0 0 0 0 \n", 2801 | "\n", 2802 | "[5 rows x 66574 columns]" 2803 | ] 2804 | }, 2805 | "execution_count": 239, 2806 | "metadata": {}, 2807 | "output_type": "execute_result" 2808 | } 2809 | ], 2810 | "source": [ 2811 | "#checking first few rows\n", 2812 | "ratings_matrix.head(5)" 2813 | ] 2814 | }, 2815 | { 2816 | "cell_type": "code", 2817 | "execution_count": 240, 2818 | "metadata": {}, 2819 | "outputs": [ 2820 | { 2821 | "name": "stdout", 2822 | "output_type": "stream", 2823 | "text": [ 2824 | "The sparsity level of Book Crossing dataset is 99.9977218411 %\n" 2825 | ] 2826 | } 2827 | ], 2828 | "source": [ 2829 | "#rechecking the sparsity\n", 2830 | "sparsity=1.0-len(ratings_explicit)/float(users_exp_ratings.shape[0]*n_books)\n", 2831 | "print 'The sparsity level of Book Crossing dataset is ' + str(sparsity*100) + ' %'" 2832 | ] 2833 | }, 2834 | { 2835 | "cell_type": "markdown", 2836 | "metadata": {}, 2837 | "source": [ 2838 | "**Training our recommendation system**" 2839 | ] 2840 | }, 2841 | { 2842 | "cell_type": "code", 2843 | "execution_count": 241, 2844 | "metadata": { 2845 | "collapsed": true 2846 | }, 2847 | "outputs": [], 2848 | "source": [ 2849 | "#setting global variables\n", 2850 | "global metric,k\n", 2851 | "k=10\n", 2852 | "metric='cosine'" 2853 | ] 2854 | }, 2855 | { 2856 | "cell_type": "markdown", 2857 | "metadata": {}, 2858 | "source": [ 2859 | "**User-based Recommendation System**" 2860 | ] 2861 | }, 2862 | { 2863 | "cell_type": "code", 2864 | "execution_count": 242, 2865 | "metadata": { 2866 | "collapsed": true 2867 | }, 2868 | "outputs": [], 2869 | "source": [ 2870 | "#This function finds k similar users given the user_id and ratings matrix \n", 2871 | "#These similarities are same as obtained via using pairwise_distances\n", 2872 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n", 2873 | " similarities=[]\n", 2874 | " indices=[]\n", 2875 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') \n", 2876 | " model_knn.fit(ratings)\n", 2877 | " loc = ratings.index.get_loc(user_id)\n", 2878 | " distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)\n", 2879 | " similarities = 1-distances.flatten()\n", 2880 | " \n", 2881 | " return similarities,indices" 2882 | ] 2883 | }, 2884 | { 2885 | "cell_type": "code", 2886 | "execution_count": 243, 2887 | "metadata": { 2888 | "collapsed": true 2889 | }, 2890 | "outputs": [], 2891 | "source": [ 2892 | "#This function predicts rating for specified user-item combination based on user-based approach\n", 2893 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n", 2894 | " prediction=0\n", 2895 | " user_loc = ratings.index.get_loc(user_id)\n", 2896 | " item_loc = ratings.columns.get_loc(item_id)\n", 2897 | " similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity\n", 2898 | " mean_rating = ratings.iloc[user_loc,:].mean() #to adjust for zero based indexing\n", 2899 | " sum_wt = np.sum(similarities)-1\n", 2900 | " product=1\n", 2901 | " wtd_sum = 0 \n", 2902 | " \n", 2903 | " for i in range(0, len(indices.flatten())):\n", 2904 | " if indices.flatten()[i] == user_loc:\n", 2905 | " continue;\n", 2906 | " else: \n", 2907 | " ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])\n", 2908 | " product = ratings_diff * (similarities[i])\n", 2909 | " wtd_sum = wtd_sum + product\n", 2910 | " \n", 2911 | " #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings\n", 2912 | " #which are handled here as below\n", 2913 | " if prediction <= 0:\n", 2914 | " prediction = 1 \n", 2915 | " elif prediction >10:\n", 2916 | " prediction = 10\n", 2917 | " \n", 2918 | " prediction = int(round(mean_rating + (wtd_sum/sum_wt)))\n", 2919 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)\n", 2920 | "\n", 2921 | " return prediction" 2922 | ] 2923 | }, 2924 | { 2925 | "cell_type": "code", 2926 | "execution_count": 244, 2927 | "metadata": {}, 2928 | "outputs": [ 2929 | { 2930 | "name": "stdout", 2931 | "output_type": "stream", 2932 | "text": [ 2933 | "\n", 2934 | "Predicted rating for user 11676 -> item 0001056107: 2\n" 2935 | ] 2936 | } 2937 | ], 2938 | "source": [ 2939 | "predict_userbased(11676,'0001056107',ratings_matrix);" 2940 | ] 2941 | }, 2942 | { 2943 | "cell_type": "markdown", 2944 | "metadata": {}, 2945 | "source": [ 2946 | "**Item-based Recommendation Systems**" 2947 | ] 2948 | }, 2949 | { 2950 | "cell_type": "code", 2951 | "execution_count": 245, 2952 | "metadata": { 2953 | "collapsed": true 2954 | }, 2955 | "outputs": [], 2956 | "source": [ 2957 | "#This function finds k similar items given the item_id and ratings matrix\n", 2958 | "\n", 2959 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n", 2960 | " similarities=[]\n", 2961 | " indices=[]\n", 2962 | " ratings=ratings.T\n", 2963 | " loc = ratings.index.get_loc(item_id)\n", 2964 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n", 2965 | " model_knn.fit(ratings)\n", 2966 | " \n", 2967 | " distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)\n", 2968 | " similarities = 1-distances.flatten()\n", 2969 | "\n", 2970 | " return similarities,indices" 2971 | ] 2972 | }, 2973 | { 2974 | "cell_type": "code", 2975 | "execution_count": 246, 2976 | "metadata": { 2977 | "collapsed": true 2978 | }, 2979 | "outputs": [], 2980 | "source": [ 2981 | "similarities,indices=findksimilaritems('0001056107',ratings_matrix)" 2982 | ] 2983 | }, 2984 | { 2985 | "cell_type": "code", 2986 | "execution_count": 247, 2987 | "metadata": { 2988 | "collapsed": true 2989 | }, 2990 | "outputs": [], 2991 | "source": [ 2992 | "#This function predicts the rating for specified user-item combination based on item-based approach\n", 2993 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n", 2994 | " prediction= wtd_sum =0\n", 2995 | " user_loc = ratings.index.get_loc(user_id)\n", 2996 | " item_loc = ratings.columns.get_loc(item_id)\n", 2997 | " similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients\n", 2998 | " sum_wt = np.sum(similarities)-1\n", 2999 | " product=1\n", 3000 | " for i in range(0, len(indices.flatten())):\n", 3001 | " if indices.flatten()[i] == item_loc:\n", 3002 | " continue;\n", 3003 | " else:\n", 3004 | " product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])\n", 3005 | " wtd_sum = wtd_sum + product \n", 3006 | " prediction = int(round(wtd_sum/sum_wt))\n", 3007 | " \n", 3008 | " #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings\n", 3009 | " #which are handled here as below //code has been validated without the code snippet below, below snippet is to avoid negative\n", 3010 | " #predictions which might arise in case of very sparse datasets when using correlation metric\n", 3011 | " if prediction <= 0:\n", 3012 | " prediction = 1 \n", 3013 | " elif prediction >10:\n", 3014 | " prediction = 10\n", 3015 | "\n", 3016 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n", 3017 | " \n", 3018 | " return prediction" 3019 | ] 3020 | }, 3021 | { 3022 | "cell_type": "code", 3023 | "execution_count": 248, 3024 | "metadata": {}, 3025 | "outputs": [ 3026 | { 3027 | "name": "stdout", 3028 | "output_type": "stream", 3029 | "text": [ 3030 | "\n", 3031 | "Predicted rating for user 11676 -> item 0001056107: 1\n" 3032 | ] 3033 | } 3034 | ], 3035 | "source": [ 3036 | "prediction = predict_itembased(11676,'0001056107',ratings_matrix)" 3037 | ] 3038 | }, 3039 | { 3040 | "cell_type": "code", 3041 | "execution_count": 249, 3042 | "metadata": { 3043 | "collapsed": true 3044 | }, 3045 | "outputs": [], 3046 | "source": [ 3047 | "@contextmanager\n", 3048 | "def suppress_stdout():\n", 3049 | " with open(os.devnull, \"w\") as devnull:\n", 3050 | " old_stdout = sys.stdout\n", 3051 | " sys.stdout = devnull\n", 3052 | " try: \n", 3053 | " yield\n", 3054 | " finally:\n", 3055 | " sys.stdout = old_stdout" 3056 | ] 3057 | }, 3058 | { 3059 | "cell_type": "code", 3060 | "execution_count": 252, 3061 | "metadata": { 3062 | "collapsed": true 3063 | }, 3064 | "outputs": [], 3065 | "source": [ 3066 | "#This function utilizes above functions to recommend items for item/user based approach and cosine/correlation. \n", 3067 | "#Recommendations are made if the predicted rating for an item is >= to 6,and the items have not been rated already\n", 3068 | "def recommendItem(user_id, ratings, metric=metric): \n", 3069 | " if (user_id not in ratings.index.values) or type(user_id) is not int:\n", 3070 | " print \"User id should be a valid integer from this list :\\n\\n {} \".format(re.sub('[\\[\\]]', '', np.array_str(ratings_matrix.index.values)))\n", 3071 | " else: \n", 3072 | " ids = ['Item-based (correlation)','Item-based (cosine)','User-based (correlation)','User-based (cosine)']\n", 3073 | " select = widgets.Dropdown(options=ids, value=ids[0],description='Select approach', width='1000px')\n", 3074 | " def on_change(change):\n", 3075 | " clear_output(wait=True)\n", 3076 | " prediction = [] \n", 3077 | " if change['type'] == 'change' and change['name'] == 'value': \n", 3078 | " if (select.value == 'Item-based (correlation)') | (select.value == 'User-based (correlation)') :\n", 3079 | " metric = 'correlation'\n", 3080 | " else: \n", 3081 | " metric = 'cosine' \n", 3082 | " with suppress_stdout():\n", 3083 | " if (select.value == 'Item-based (correlation)') | (select.value == 'Item-based (cosine)'):\n", 3084 | " for i in range(ratings.shape[1]):\n", 3085 | " if (ratings[str(ratings.columns[i])][user_id] !=0): #not rated already\n", 3086 | " prediction.append(predict_itembased(user_id, str(ratings.columns[i]) ,ratings, metric))\n", 3087 | " else: \n", 3088 | " prediction.append(-1) #for already rated items\n", 3089 | " else:\n", 3090 | " for i in range(ratings.shape[1]):\n", 3091 | " if (ratings[str(ratings.columns[i])][user_id] !=0): #not rated already\n", 3092 | " prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))\n", 3093 | " else: \n", 3094 | " prediction.append(-1) #for already rated items\n", 3095 | " prediction = pd.Series(prediction)\n", 3096 | " prediction = prediction.sort_values(ascending=False)\n", 3097 | " recommended = prediction[:10]\n", 3098 | " print \"As per {0} approach....Following books are recommended...\".format(select.value)\n", 3099 | " for i in range(len(recommended)):\n", 3100 | " print \"{0}. {1}\".format(i+1,books.bookTitle[recommended.index[i]].encode('utf-8')) \n", 3101 | " select.observe(on_change)\n", 3102 | " display(select)" 3103 | ] 3104 | }, 3105 | { 3106 | "cell_type": "code", 3107 | "execution_count": 255, 3108 | "metadata": {}, 3109 | "outputs": [ 3110 | { 3111 | "name": "stdout", 3112 | "output_type": "stream", 3113 | "text": [ 3114 | "User id should be a valid integer from this list :\n", 3115 | "\n", 3116 | " 2033 2110 2276 4017 4385 5582 6242 6251 6543 6575\n", 3117 | " 7286 7346 8067 8245 8681 8890 10560 11676 11993 12538\n", 3118 | " 12824 12982 13552 13850 14422 15408 15418 16634 16795 16966\n", 3119 | " 17950 19085 21014 23768 23872 23902 25409 25601 25981 26535\n", 3120 | " 26544 26583 28591 28634 29259 30276 30511 30711 30735 30810\n", 3121 | " 31315 31556 31826 32773 33145 35433 35836 35857 35859 36299\n", 3122 | " 36554 36606 36609 36836 36907 37644 37712 37950 38023 38273\n", 3123 | " 38281 39281 39467 40889 40943 43246 43910 46398 47316 48025\n", 3124 | " 48494 49144 49889 51883 52199 52350 52584 52614 52917 53220\n", 3125 | " 55187 55490 55492 56271 56399 56447 56554 56959 59172 60244\n", 3126 | " 60337 60707 63714 63956 65258 66942 67840 68555 69078 69389\n", 3127 | " 69697 70415 70594 70666 72352 73681 75591 75819 76151 76223\n", 3128 | " 76499 76626 78553 78783 78834 78973 79441 81492 81560 83287\n", 3129 | " 83637 83671 85526 85656 86189 86947 87141 87555 88283 88677\n", 3130 | " 88693 88733 89602 91113 92652 92810 93047 93363 93629 94242\n", 3131 | " 94347 94853 94951 95010 95359 95902 95932 96448 97754 97874\n", 3132 | " 98391 98758 100459 100906 101209 101606 101851 102359 102647 102702\n", 3133 | " 102967 104399 104636 105028 105517 105979 106007 107784 107951 109574\n", 3134 | " 109901 109955 110483 110912 110934 110973 112001 113270 113519 114368\n", 3135 | " 114868 114988 115002 115003 116599 117384 120565 122429 122793 123094\n", 3136 | " 123608 123883 123981 125519 125774 126492 126736 127200 127359 128835\n", 3137 | " 129074 129716 129851 130554 130571 132492 132836 133747 134434 135149\n", 3138 | " 135265 136010 136139 136348 136382 138578 138844 140000 140358 141902\n", 3139 | " 142524 143175 143253 143415 145449 146113 146348 147847 148199 148258\n", 3140 | " 148744 148966 149907 149908 150979 153662 156150 156269 156300 156467\n", 3141 | " 157247 157273 158226 158295 158433 159506 160295 162052 162639 162738\n", 3142 | " 163759 163761 163804 163973 164096 164323 164533 164828 164905 165308\n", 3143 | " 165319 165758 166123 166596 168047 168245 169682 170513 170634 171118\n", 3144 | " 172030 172742 172888 173291 173415 174304 174892 177072 177432 177458\n", 3145 | " 178522 179718 179978 180378 180651 181176 182085 182086 182993 183958\n", 3146 | " 183995 184299 184532 185233 185384 187145 187256 187517 189139 189334\n", 3147 | " 189835 189973 190708 190925 193458 193560 193898 194600 196077 196160\n", 3148 | " 196502 197659 199416 200226 201290 203240 204864 205735 205943 206534\n", 3149 | " 207782 208406 208671 209516 210485 211426 211919 212965 214786 216012\n", 3150 | " 216444 216683 217106 217318 217740 218552 218608 219546 219683 222204\n", 3151 | " 222296 223087 223501 224349 224525 224646 224764 225087 225199 225232\n", 3152 | " 225595 225763 226965 227250 227447 227520 227705 229011 229329 229551\n", 3153 | " 229741 230522 231210 232131 232945 233911 234359 234828 235105 235282\n", 3154 | " 235935 236058 236283 236340 236757 236948 239584 239594 240144 240403\n", 3155 | " 240543 240567 240568 241198 241666 241980 242006 242083 242409 242465\n", 3156 | " 244627 244685 245410 245827 246311 247429 247447 248718 249894 250405\n", 3157 | " 250709 251394 251843 251844 252695 252820 254206 254465 254899 255489\n", 3158 | " 257204 258152 258185 258534 261105 261829 262998 264031 264082 264321\n", 3159 | " 264525 265115 265313 265889 266056 266226 268110 268300 268932 269566\n", 3160 | " 270713 271448 271705 273113 274061 274301 275970 277427 278418 \n" 3161 | ] 3162 | } 3163 | ], 3164 | "source": [ 3165 | "#checking for incorrect entries\n", 3166 | "recommendItem(999999,ratings_matrix)" 3167 | ] 3168 | }, 3169 | { 3170 | "cell_type": "code", 3171 | "execution_count": 253, 3172 | "metadata": { 3173 | "scrolled": true 3174 | }, 3175 | "outputs": [ 3176 | { 3177 | "name": "stdout", 3178 | "output_type": "stream", 3179 | "text": [ 3180 | "As per Item-based (cosine) approach....Following books are recommended...\n", 3181 | "1. My Wicked Wicked Ways\n", 3182 | "2. Fair Peril\n", 3183 | "3. Wolfpointe\n", 3184 | "4. A Nest of Ninnies\n", 3185 | "5. A Bitter Legacy\n", 3186 | "6. A Hymn Before Battle\n", 3187 | "7. Thomas the Rhymer\n", 3188 | "8. Gatherer of Clouds (Initiate Brother Duology)\n", 3189 | "9. Wege zum Ruhm: 13 Hilfestellungen für junge Künstler und 1 Warnung\n", 3190 | "10. Love In Bloom's\n" 3191 | ] 3192 | } 3193 | ], 3194 | "source": [ 3195 | "recommendItem(4385, ratings_matrix)" 3196 | ] 3197 | }, 3198 | { 3199 | "cell_type": "code", 3200 | "execution_count": 254, 3201 | "metadata": {}, 3202 | "outputs": [ 3203 | { 3204 | "name": "stdout", 3205 | "output_type": "stream", 3206 | "text": [ 3207 | "As per User-based (correlation) approach....Following books are recommended...\n", 3208 | "1. The Gift\n", 3209 | "2. A Close Run Thing : A Novel of Wellington's Army of 1815\n", 3210 | "3. The Romantic: A Novel\n", 3211 | "4. Mazurka for Two Dead Men\n", 3212 | "5. The Titanic Conspiracy: Cover-Ups and Mysteries of the World's Most Famous Sea Disaster\n", 3213 | "6. And Never Let Her Go : Thomas Capano: The Deadly Seducer\n", 3214 | "7. Chop Wood, Carry Water: A Guide to Finding Spiritual Fulfillment in Everyday Life\n", 3215 | "8. WHO NEEDS GOD\n", 3216 | "9. Lords of the White Castle\n", 3217 | "10. Prince Charming Isn't Coming: How Women Get Smart About Money\n" 3218 | ] 3219 | } 3220 | ], 3221 | "source": [ 3222 | "recommendItem(4385, ratings_matrix)" 3223 | ] 3224 | }, 3225 | { 3226 | "cell_type": "markdown", 3227 | "metadata": {}, 3228 | "source": [ 3229 | "**Thanks for reading this notebook**" 3230 | ] 3231 | } 3232 | ], 3233 | "metadata": { 3234 | "kernelspec": { 3235 | "display_name": "Python 2", 3236 | "language": "python", 3237 | "name": "python2" 3238 | }, 3239 | "language_info": { 3240 | "codemirror_mode": { 3241 | "name": "ipython", 3242 | "version": 2 3243 | }, 3244 | "file_extension": ".py", 3245 | "mimetype": "text/x-python", 3246 | "name": "python", 3247 | "nbconvert_exporter": "python", 3248 | "pygments_lexer": "ipython2", 3249 | "version": "2.7.13" 3250 | } 3251 | }, 3252 | "nbformat": 4, 3253 | "nbformat_minor": 2 3254 | } 3255 | -------------------------------------------------------------------------------- /CF Recommendation System-Examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Examples of Collaborative Filtering based Recommendation Systems**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "#make necesarry imports\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import sklearn.metrics as metrics\n", 23 | "import numpy as np\n", 24 | "from sklearn.neighbors import NearestNeighbors\n", 25 | "from scipy.spatial.distance import correlation, cosine\n", 26 | "import ipywidgets as widgets\n", 27 | "from IPython.display import display, clear_output\n", 28 | "from sklearn.metrics import pairwise_distances\n", 29 | "from sklearn.metrics import mean_squared_error\n", 30 | "from math import sqrt\n", 31 | "import sys, os\n", 32 | "from contextlib import contextmanager" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "#M is user-item ratings matrix where ratings are integers from 1-10\n", 44 | "M = np.asarray([[3,7,4,9,9,7], \n", 45 | " [7,0,5,3,8,8],\n", 46 | " [7,5,5,0,8,4],\n", 47 | " [5,6,8,5,9,8],\n", 48 | " [5,8,8,8,10,9],\n", 49 | " [7,7,0,4,7,8]])\n", 50 | "M=pd.DataFrame(M)\n", 51 | "\n", 52 | "#declaring k,metric as global which can be changed by the user later\n", 53 | "global k,metric\n", 54 | "k=4\n", 55 | "metric='cosine' #can be changed to 'correlation' for Pearson correlation similaries" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | "\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
012345
0374997
1705388
2755084
3568598
45888109
5770478
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " 0 1 2 3 4 5\n", 153 | "0 3 7 4 9 9 7\n", 154 | "1 7 0 5 3 8 8\n", 155 | "2 7 5 5 0 8 4\n", 156 | "3 5 6 8 5 9 8\n", 157 | "4 5 8 8 8 10 9\n", 158 | "5 7 7 0 4 7 8" 159 | ] 160 | }, 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "M" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "**User-based Recommendation Systems**" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence\n", 186 | "#similarities are obtained by subtracting distances from 1\n", 187 | "cosine_sim = 1-pairwise_distances(M, metric=\"cosine\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/html": [ 198 | "
\n", 199 | "\n", 212 | "\n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | "
012345
01.0000000.7992680.7792270.9346220.9738900.884600
10.7992681.0000000.8747440.9058500.8661460.827036
20.7792270.8747441.0000000.9095130.8654540.853275
30.9346220.9058500.9095131.0000000.9893440.865614
40.9738900.8661460.8654540.9893441.0000000.881640
50.8846000.8270360.8532750.8656140.8816401.000000
\n", 281 | "
" 282 | ], 283 | "text/plain": [ 284 | " 0 1 2 3 4 5\n", 285 | "0 1.000000 0.799268 0.779227 0.934622 0.973890 0.884600\n", 286 | "1 0.799268 1.000000 0.874744 0.905850 0.866146 0.827036\n", 287 | "2 0.779227 0.874744 1.000000 0.909513 0.865454 0.853275\n", 288 | "3 0.934622 0.905850 0.909513 1.000000 0.989344 0.865614\n", 289 | "4 0.973890 0.866146 0.865454 0.989344 1.000000 0.881640\n", 290 | "5 0.884600 0.827036 0.853275 0.865614 0.881640 1.000000" 291 | ] 292 | }, 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "#Cosine similarity matrix\n", 300 | "pd.DataFrame(cosine_sim)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 7, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "#get pearson similarities for ratings matrix M\n", 312 | "pearson_sim = 1-pairwise_distances(M, metric=\"correlation\")" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "
\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "
012345
01.000000-0.137446-0.3573980.2081790.7619050.277350
1-0.1374461.0000000.4538970.5159100.1124560.218328
2-0.3573980.4538971.0000000.451378-0.0428880.297373
30.2081790.5159100.4513781.0000000.763325-0.057739
40.7619050.112456-0.0428880.7633251.0000000.039621
50.2773500.2183280.297373-0.0577390.0396211.000000
\n", 406 | "
" 407 | ], 408 | "text/plain": [ 409 | " 0 1 2 3 4 5\n", 410 | "0 1.000000 -0.137446 -0.357398 0.208179 0.761905 0.277350\n", 411 | "1 -0.137446 1.000000 0.453897 0.515910 0.112456 0.218328\n", 412 | "2 -0.357398 0.453897 1.000000 0.451378 -0.042888 0.297373\n", 413 | "3 0.208179 0.515910 0.451378 1.000000 0.763325 -0.057739\n", 414 | "4 0.761905 0.112456 -0.042888 0.763325 1.000000 0.039621\n", 415 | "5 0.277350 0.218328 0.297373 -0.057739 0.039621 1.000000" 416 | ] 417 | }, 418 | "execution_count": 8, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "#Pearson correlation similarity matrix\n", 425 | "pd.DataFrame(pearson_sim)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 9, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "#This function finds k similar users given the user_id and ratings matrix M\n", 435 | "#Note that the similarities are same as obtained via using pairwise_distances\n", 436 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n", 437 | " similarities=[]\n", 438 | " indices=[]\n", 439 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') \n", 440 | " model_knn.fit(ratings)\n", 441 | "\n", 442 | " distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)\n", 443 | " similarities = 1-distances.flatten()\n", 444 | " print '{0} most similar users for User {1}:\\n'.format(k,user_id)\n", 445 | " for i in range(0, len(indices.flatten())):\n", 446 | " if indices.flatten()[i]+1 == user_id:\n", 447 | " continue;\n", 448 | "\n", 449 | " else:\n", 450 | " print '{0}: User {1}, with similarity of {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i])\n", 451 | " \n", 452 | " return similarities,indices" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 10, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "4 most similar users for User 1:\n", 465 | "\n", 466 | "1: User 5, with similarity of 0.973889935402\n", 467 | "2: User 4, with similarity of 0.934621684178\n", 468 | "3: User 6, with similarity of 0.88460045723\n", 469 | "4: User 2, with similarity of 0.799267978052\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "similarities,indices = findksimilarusers(1,M, metric='cosine')" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 11, 480 | "metadata": { 481 | "scrolled": true 482 | }, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "4 most similar users for User 1:\n", 489 | "\n", 490 | "1: User 5, with similarity of 0.761904761905\n", 491 | "2: User 6, with similarity of 0.277350098113\n", 492 | "3: User 4, with similarity of 0.208179450927\n", 493 | "4: User 2, with similarity of -0.137446320513\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "similarities,indices = findksimilarusers(1,M, metric='correlation')" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 12, 504 | "metadata": { 505 | "collapsed": true 506 | }, 507 | "outputs": [], 508 | "source": [ 509 | "#This function predicts rating for specified user-item combination based on user-based approach\n", 510 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n", 511 | " prediction=0\n", 512 | " similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity\n", 513 | " mean_rating = ratings.loc[user_id-1,:].mean() #to adjust for zero based indexing\n", 514 | " sum_wt = np.sum(similarities)-1\n", 515 | " product=1\n", 516 | " wtd_sum = 0 \n", 517 | " \n", 518 | " for i in range(0, len(indices.flatten())):\n", 519 | " if indices.flatten()[i]+1 == user_id:\n", 520 | " continue;\n", 521 | " else: \n", 522 | " ratings_diff = ratings.iloc[indices.flatten()[i],item_id-1]-np.mean(ratings.iloc[indices.flatten()[i],:])\n", 523 | " product = ratings_diff * (similarities[i])\n", 524 | " wtd_sum = wtd_sum + product\n", 525 | " \n", 526 | " prediction = int(round(mean_rating + (wtd_sum/sum_wt)))\n", 527 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)\n", 528 | "\n", 529 | " return prediction" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 13, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "4 most similar users for User 3:\n", 542 | "\n", 543 | "1: User 4, with similarity of 0.90951268934\n", 544 | "2: User 2, with similarity of 0.874744414849\n", 545 | "3: User 5, with similarity of 0.86545387815\n", 546 | "4: User 6, with similarity of 0.853274963344\n", 547 | "\n", 548 | "Predicted rating for user 3 -> item 4: 3\n" 549 | ] 550 | } 551 | ], 552 | "source": [ 553 | "predict_userbased(3,4,M);" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "**Item-based Recommendation Systems**" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 14, 566 | "metadata": { 567 | "collapsed": true 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "#This function finds k similar items given the item_id and ratings matrix M\n", 572 | "\n", 573 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n", 574 | " similarities=[]\n", 575 | " indices=[] \n", 576 | " ratings=ratings.T\n", 577 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n", 578 | " model_knn.fit(ratings)\n", 579 | "\n", 580 | " distances, indices = model_knn.kneighbors(ratings.iloc[item_id-1, :].values.reshape(1, -1), n_neighbors = k+1)\n", 581 | " similarities = 1-distances.flatten()\n", 582 | " print '{0} most similar items for item {1}:\\n'.format(k,item_id)\n", 583 | " for i in range(0, len(indices.flatten())):\n", 584 | " if indices.flatten()[i]+1 == item_id:\n", 585 | " continue;\n", 586 | "\n", 587 | " else:\n", 588 | " print '{0}: Item {1} :, with similarity of {2}'.format(i,indices.flatten()[i]+1, similarities.flatten()[i])\n", 589 | "\n", 590 | "\n", 591 | " return similarities,indices" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 15, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "name": "stdout", 601 | "output_type": "stream", 602 | "text": [ 603 | "4 most similar items for item 3:\n", 604 | "\n", 605 | "1: Item 5 :, with similarity of 0.918336125535\n", 606 | "2: Item 6 :, with similarity of 0.874759773038\n", 607 | "3: Item 1 :, with similarity of 0.810364746222\n", 608 | "4: Item 4 :, with similarity of 0.796917800302\n" 609 | ] 610 | } 611 | ], 612 | "source": [ 613 | "similarities,indices=findksimilaritems(3,M)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 16, 619 | "metadata": { 620 | "collapsed": true 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "#This function predicts the rating for specified user-item combination based on item-based approach\n", 625 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n", 626 | " prediction= wtd_sum =0\n", 627 | " similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients\n", 628 | " sum_wt = np.sum(similarities)-1\n", 629 | " product=1\n", 630 | " \n", 631 | " for i in range(0, len(indices.flatten())):\n", 632 | " if indices.flatten()[i]+1 == item_id:\n", 633 | " continue;\n", 634 | " else:\n", 635 | " product = ratings.iloc[user_id-1,indices.flatten()[i]] * (similarities[i])\n", 636 | " wtd_sum = wtd_sum + product \n", 637 | " prediction = int(round(wtd_sum/sum_wt))\n", 638 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n", 639 | "\n", 640 | " return prediction" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 17, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "4 most similar items for item 3:\n", 653 | "\n", 654 | "1: Item 5 :, with similarity of 0.918336125535\n", 655 | "2: Item 6 :, with similarity of 0.874759773038\n", 656 | "3: Item 1 :, with similarity of 0.810364746222\n", 657 | "4: Item 4 :, with similarity of 0.796917800302\n", 658 | "\n", 659 | "Predicted rating for user 1 -> item 3: 7\n" 660 | ] 661 | } 662 | ], 663 | "source": [ 664 | "prediction = predict_itembased(1,3,M)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 18, 670 | "metadata": { 671 | "collapsed": true 672 | }, 673 | "outputs": [], 674 | "source": [ 675 | "#This function is used to compute adjusted cosine similarity matrix for items\n", 676 | "def computeAdjCosSim(M):\n", 677 | " sim_matrix = np.zeros((M.shape[1], M.shape[1]))\n", 678 | " M_u = M.mean(axis=1) #means\n", 679 | " \n", 680 | " for i in range(M.shape[1]):\n", 681 | " for j in range(M.shape[1]):\n", 682 | " if i == j:\n", 683 | " \n", 684 | " sim_matrix[i][j] = 1\n", 685 | " else: \n", 686 | " if i\n", 736 | "\n", 749 | "\n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | "
012345
01.0000000.2369080.421263-0.519085-0.1258920.010090
10.2369081.000000-0.8052430.0857410.2372730.520625
20.421263-0.8052431.000000-0.767941-0.230521-0.053640
3-0.5190850.085741-0.7679411.000000-0.299059-0.644550
4-0.1258920.237273-0.230521-0.2990591.0000000.599158
50.0100900.520625-0.053640-0.6445500.5991581.000000
\n", 818 | "" 819 | ], 820 | "text/plain": [ 821 | " 0 1 2 3 4 5\n", 822 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n", 823 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n", 824 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n", 825 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n", 826 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n", 827 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000" 828 | ] 829 | }, 830 | "execution_count": 20, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "adjcos_sim" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 26, 842 | "metadata": { 843 | "collapsed": true 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "#This function finds k similar items given the item_id and ratings matrix M\n", 848 | "\n", 849 | "def findksimilaritems_adjcos(item_id, ratings, k=k):\n", 850 | " \n", 851 | " sim_matrix = computeAdjCosSim(ratings)\n", 852 | " similarities = sim_matrix[item_id-1].sort_values(ascending=False)[:k+1].values\n", 853 | " indices = sim_matrix[item_id-1].sort_values(ascending=False)[:k+1].index\n", 854 | " \n", 855 | " print '{0} most similar items for item {1}:\\n'.format(k,item_id)\n", 856 | " for i in range(0, len(indices)):\n", 857 | " if indices[i]+1 == item_id:\n", 858 | " continue;\n", 859 | "\n", 860 | " else:\n", 861 | " print '{0}: Item {1} :, with similarity of {2}'.format(i,indices[i]+1, similarities[i])\n", 862 | " \n", 863 | " return similarities ,indices " 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": 27, 869 | "metadata": {}, 870 | "outputs": [ 871 | { 872 | "name": "stdout", 873 | "output_type": "stream", 874 | "text": [ 875 | "4 most similar items for item 3:\n", 876 | "\n", 877 | "1: Item 1 :, with similarity of 0.421262731871\n", 878 | "2: Item 6 :, with similarity of -0.0536398904889\n", 879 | "3: Item 5 :, with similarity of -0.230521358269\n", 880 | "4: Item 4 :, with similarity of -0.767941046575\n" 881 | ] 882 | } 883 | ], 884 | "source": [ 885 | "similarities, indices = findksimilaritems_adjcos(3,M)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 28, 891 | "metadata": { 892 | "collapsed": true 893 | }, 894 | "outputs": [], 895 | "source": [ 896 | "#This function predicts the rating for specified user-item combination for adjusted cosine item-based approach\n", 897 | "#As the adjusted cosine similarities range from -1,+1, sometimes the predicted rating can be negative or greater than max value\n", 898 | "#Hack to deal with this: Rating is set to min if prediction is negative, Rating is set to max if prediction is above max\n", 899 | "def predict_itembased_adjcos(user_id, item_id, ratings):\n", 900 | " prediction=0\n", 901 | "\n", 902 | " similarities, indices=findksimilaritems_adjcos(item_id, ratings) #similar users based on correlation coefficients\n", 903 | " sum_wt = np.sum(similarities)-1\n", 904 | "\n", 905 | " product=1\n", 906 | " wtd_sum = 0 \n", 907 | " for i in range(0, len(indices)):\n", 908 | " if indices[i]+1 == item_id:\n", 909 | " continue;\n", 910 | " else:\n", 911 | " product = ratings.iloc[user_id-1,indices[i]] * (similarities[i])\n", 912 | " wtd_sum = wtd_sum + product \n", 913 | " prediction = int(round(wtd_sum/sum_wt))\n", 914 | " if prediction < 0:\n", 915 | " prediction = 1\n", 916 | " elif prediction >10:\n", 917 | " prediction = 10\n", 918 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n", 919 | " \n", 920 | " return prediction" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 29, 926 | "metadata": {}, 927 | "outputs": [ 928 | { 929 | "name": "stdout", 930 | "output_type": "stream", 931 | "text": [ 932 | "4 most similar items for item 4:\n", 933 | "\n", 934 | "1: Item 2 :, with similarity of 0.0857414341149\n", 935 | "2: Item 5 :, with similarity of -0.29905882779\n", 936 | "3: Item 1 :, with similarity of -0.519085268895\n", 937 | "4: Item 6 :, with similarity of -0.644550286954\n", 938 | "\n", 939 | "Predicted rating for user 3 -> item 4: 6\n" 940 | ] 941 | } 942 | ], 943 | "source": [ 944 | "prediction=predict_itembased_adjcos(3,4,M)" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": 30, 950 | "metadata": {}, 951 | "outputs": [ 952 | { 953 | "data": { 954 | "text/html": [ 955 | "
\n", 956 | "\n", 969 | "\n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | "
012345
01.0000000.2369080.421263-0.519085-0.1258920.010090
10.2369081.000000-0.8052430.0857410.2372730.520625
20.421263-0.8052431.000000-0.767941-0.230521-0.053640
3-0.5190850.085741-0.7679411.000000-0.299059-0.644550
4-0.1258920.237273-0.230521-0.2990591.0000000.599158
50.0100900.520625-0.053640-0.6445500.5991581.000000
\n", 1038 | "
" 1039 | ], 1040 | "text/plain": [ 1041 | " 0 1 2 3 4 5\n", 1042 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n", 1043 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n", 1044 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n", 1045 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n", 1046 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n", 1047 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000" 1048 | ] 1049 | }, 1050 | "execution_count": 30, 1051 | "metadata": {}, 1052 | "output_type": "execute_result" 1053 | } 1054 | ], 1055 | "source": [ 1056 | "adjcos_sim" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "code", 1061 | "execution_count": 31, 1062 | "metadata": { 1063 | "collapsed": true 1064 | }, 1065 | "outputs": [], 1066 | "source": [ 1067 | "#This function utilizes above function to recommend items for selected approach. Recommendations are made if the predicted\n", 1068 | "#rating for an item is greater than or equal to 6, and the items has not been rated already\n", 1069 | "def recommendItem(user_id, item_id, ratings):\n", 1070 | " \n", 1071 | " if user_id<1 or user_id>6 or type(user_id) is not int:\n", 1072 | " print 'Userid does not exist. Enter numbers from 1-6'\n", 1073 | " else: \n", 1074 | " ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)',\n", 1075 | " 'Item-based CF (adjusted cosine)']\n", 1076 | "\n", 1077 | " approach = widgets.Dropdown(options=ids, value=ids[0],\n", 1078 | " description='Select Approach', width='500px')\n", 1079 | " \n", 1080 | " def on_change(change):\n", 1081 | " prediction = 0\n", 1082 | " clear_output(wait=True)\n", 1083 | " if change['type'] == 'change' and change['name'] == 'value': \n", 1084 | " if (approach.value == 'User-based CF (cosine)'):\n", 1085 | " metric = 'cosine'\n", 1086 | " prediction = predict_userbased(user_id, item_id, ratings, metric)\n", 1087 | " elif (approach.value == 'User-based CF (correlation)') : \n", 1088 | " metric = 'correlation' \n", 1089 | " prediction = predict_userbased(user_id, item_id, ratings, metric)\n", 1090 | " elif (approach.value == 'Item-based CF (cosine)'):\n", 1091 | " prediction = predict_itembased(user_id, item_id, ratings)\n", 1092 | " else:\n", 1093 | " prediction = predict_itembased_adjcos(user_id,item_id,ratings)\n", 1094 | "\n", 1095 | " if ratings[item_id-1][user_id-1] != 0: \n", 1096 | " print 'Item already rated'\n", 1097 | " else:\n", 1098 | " if prediction>=6:\n", 1099 | " print '\\nItem recommended'\n", 1100 | " else:\n", 1101 | " print 'Item not recommended'\n", 1102 | "\n", 1103 | " approach.observe(on_change)\n", 1104 | " display(approach)" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": 32, 1110 | "metadata": {}, 1111 | "outputs": [ 1112 | { 1113 | "name": "stdout", 1114 | "output_type": "stream", 1115 | "text": [ 1116 | "Userid does not exist. Enter numbers from 1-6\n" 1117 | ] 1118 | } 1119 | ], 1120 | "source": [ 1121 | "#check for incorrect entries\n", 1122 | "recommendItem(-1,3,M)" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": 33, 1128 | "metadata": {}, 1129 | "outputs": [ 1130 | { 1131 | "name": "stdout", 1132 | "output_type": "stream", 1133 | "text": [ 1134 | "4 most similar users for User 3:\n", 1135 | "\n", 1136 | "1: User 4, with similarity of 0.90951268934\n", 1137 | "2: User 2, with similarity of 0.874744414849\n", 1138 | "3: User 5, with similarity of 0.86545387815\n", 1139 | "4: User 6, with similarity of 0.853274963344\n", 1140 | "\n", 1141 | "Predicted rating for user 3 -> item 4: 3\n", 1142 | "Item not recommended\n" 1143 | ] 1144 | } 1145 | ], 1146 | "source": [ 1147 | "recommendItem(3,4,M)" 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": 34, 1153 | "metadata": {}, 1154 | "outputs": [ 1155 | { 1156 | "name": "stdout", 1157 | "output_type": "stream", 1158 | "text": [ 1159 | "4 most similar users for User 3:\n", 1160 | "\n", 1161 | "1: User 2, with similarity of 0.453897185842\n", 1162 | "2: User 4, with similarity of 0.451378005098\n", 1163 | "3: User 6, with similarity of 0.297373304825\n", 1164 | "4: User 5, with similarity of -0.04288778794\n", 1165 | "\n", 1166 | "Predicted rating for user 3 -> item 4: 3\n", 1167 | "Item not recommended\n" 1168 | ] 1169 | } 1170 | ], 1171 | "source": [ 1172 | "recommendItem(3,4,M)" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 35, 1178 | "metadata": {}, 1179 | "outputs": [ 1180 | { 1181 | "name": "stdout", 1182 | "output_type": "stream", 1183 | "text": [ 1184 | "4 most similar items for item 4:\n", 1185 | "\n", 1186 | "1: Item 6 :, with similarity of 0.89977997614\n", 1187 | "2: Item 2 :, with similarity of 0.887160079571\n", 1188 | "3: Item 5 :, with similarity of 0.88180009273\n", 1189 | "4: Item 3 :, with similarity of 0.796917800302\n", 1190 | "\n", 1191 | "Predicted rating for user 3 -> item 4: 6\n", 1192 | "\n", 1193 | "Item recommended\n" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "recommendItem(3,4,M)" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 36, 1204 | "metadata": {}, 1205 | "outputs": [ 1206 | { 1207 | "name": "stdout", 1208 | "output_type": "stream", 1209 | "text": [ 1210 | "4 most similar items for item 4:\n", 1211 | "\n", 1212 | "1: Item 2 :, with similarity of 0.0857414341149\n", 1213 | "2: Item 5 :, with similarity of -0.29905882779\n", 1214 | "3: Item 1 :, with similarity of -0.519085268895\n", 1215 | "4: Item 6 :, with similarity of -0.644550286954\n", 1216 | "\n", 1217 | "Predicted rating for user 3 -> item 4: 6\n", 1218 | "\n", 1219 | "Item recommended\n" 1220 | ] 1221 | } 1222 | ], 1223 | "source": [ 1224 | "recommendItem(3,4,M)" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": 37, 1230 | "metadata": {}, 1231 | "outputs": [ 1232 | { 1233 | "name": "stdout", 1234 | "output_type": "stream", 1235 | "text": [ 1236 | "4 most similar users for User 2:\n", 1237 | "\n", 1238 | "1: User 4, with similarity of 0.515910067398\n", 1239 | "2: User 3, with similarity of 0.453897185842\n", 1240 | "3: User 6, with similarity of 0.218327934565\n", 1241 | "4: User 5, with similarity of 0.11245608042\n", 1242 | "\n", 1243 | "Predicted rating for user 2 -> item 1: 5\n", 1244 | "Item already rated\n" 1245 | ] 1246 | } 1247 | ], 1248 | "source": [ 1249 | "#if the item is already rated, it is not recommended\n", 1250 | "recommendItem(2,1,M)" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": 38, 1256 | "metadata": { 1257 | "collapsed": true 1258 | }, 1259 | "outputs": [], 1260 | "source": [ 1261 | "#This is a quick way to temporarily suppress stdout in particular code section\n", 1262 | "@contextmanager\n", 1263 | "def suppress_stdout():\n", 1264 | " with open(os.devnull, \"w\") as devnull:\n", 1265 | " old_stdout = sys.stdout\n", 1266 | " sys.stdout = devnull\n", 1267 | " try: \n", 1268 | " yield\n", 1269 | " finally:\n", 1270 | " sys.stdout = old_stdout" 1271 | ] 1272 | }, 1273 | { 1274 | "cell_type": "code", 1275 | "execution_count": 39, 1276 | "metadata": { 1277 | "collapsed": true 1278 | }, 1279 | "outputs": [], 1280 | "source": [ 1281 | "#This is final function to evaluate the performance of selected recommendation approach and the metric used here is RMSE\n", 1282 | "#suppress_stdout function is used to suppress the print outputs of all the functions inside this function. It will only print \n", 1283 | "#RMSE values\n", 1284 | "def evaluateRS(ratings):\n", 1285 | " ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']\n", 1286 | " approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')\n", 1287 | " n_users = ratings.shape[0]\n", 1288 | " n_items = ratings.shape[1]\n", 1289 | " prediction = np.zeros((n_users, n_items))\n", 1290 | " prediction= pd.DataFrame(prediction)\n", 1291 | " def on_change(change):\n", 1292 | " clear_output(wait=True)\n", 1293 | " with suppress_stdout():\n", 1294 | " if change['type'] == 'change' and change['name'] == 'value': \n", 1295 | " if (approach.value == 'User-based CF (cosine)'):\n", 1296 | " metric = 'cosine'\n", 1297 | " for i in range(n_users):\n", 1298 | " for j in range(n_items):\n", 1299 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)\n", 1300 | " elif (approach.value == 'User-based CF (correlation)') : \n", 1301 | " metric = 'correlation' \n", 1302 | " for i in range(n_users):\n", 1303 | " for j in range(n_items):\n", 1304 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)\n", 1305 | " elif (approach.value == 'Item-based CF (cosine)'):\n", 1306 | " for i in range(n_users):\n", 1307 | " for j in range(n_items):\n", 1308 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings)\n", 1309 | " else:\n", 1310 | " for i in range(n_users):\n", 1311 | " for j in range(n_items):\n", 1312 | " prediction[i][j] = predict_userbased(i+1, j+1, ratings)\n", 1313 | " \n", 1314 | " MSE = mean_squared_error(prediction, ratings)\n", 1315 | " RMSE = round(sqrt(MSE),3)\n", 1316 | " print \"RMSE using {0} approach is: {1}\".format(approach.value,RMSE)\n", 1317 | " \n", 1318 | " approach.observe(on_change)\n", 1319 | " display(approach)" 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": 40, 1325 | "metadata": {}, 1326 | "outputs": [ 1327 | { 1328 | "name": "stdout", 1329 | "output_type": "stream", 1330 | "text": [ 1331 | "RMSE using Item-based CF (cosine) approach is: 2.804\n" 1332 | ] 1333 | } 1334 | ], 1335 | "source": [ 1336 | "evaluateRS(M)" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": 41, 1342 | "metadata": {}, 1343 | "outputs": [ 1344 | { 1345 | "name": "stdout", 1346 | "output_type": "stream", 1347 | "text": [ 1348 | "RMSE using Item-based CF (cosine) approach is: 2.804\n" 1349 | ] 1350 | } 1351 | ], 1352 | "source": [ 1353 | "evaluateRS(M)" 1354 | ] 1355 | }, 1356 | { 1357 | "cell_type": "markdown", 1358 | "metadata": {}, 1359 | "source": [ 1360 | "**Thanks for reading this notebook**" 1361 | ] 1362 | } 1363 | ], 1364 | "metadata": { 1365 | "kernelspec": { 1366 | "display_name": "Python 2", 1367 | "language": "python", 1368 | "name": "python2" 1369 | }, 1370 | "language_info": { 1371 | "codemirror_mode": { 1372 | "name": "ipython", 1373 | "version": 2 1374 | }, 1375 | "file_extension": ".py", 1376 | "mimetype": "text/x-python", 1377 | "name": "python", 1378 | "nbconvert_exporter": "python", 1379 | "pygments_lexer": "ipython2", 1380 | "version": "2.7.13" 1381 | } 1382 | }, 1383 | "nbformat": 4, 1384 | "nbformat_minor": 2 1385 | } 1386 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JupyterNotebooks-Medium --------------------------------------------------------------------------------