├── README.md
├── CF Recommendation System-Examples.ipynb
└── Book Recommendation System.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # JupyterNotebooks-Medium
--------------------------------------------------------------------------------
/CF Recommendation System-Examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "**Examples of Collaborative Filtering based Recommendation Systems**"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#make necessary imports\n",
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "import matplotlib.pyplot as plt\n",
22 | "import sklearn.metrics as metrics\n",
23 | "import numpy as np\n",
24 | "from sklearn.neighbors import NearestNeighbors\n",
25 | "from scipy.spatial.distance import correlation, cosine\n",
26 | "import ipywidgets as widgets\n",
27 | "from IPython.display import display, clear_output\n",
28 | "from sklearn.metrics import pairwise_distances\n",
29 | "from sklearn.metrics import mean_squared_error\n",
30 | "from math import sqrt\n",
31 | "import sys, os\n",
32 | "from contextlib import contextmanager"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "metadata": {
39 | "collapsed": true
40 | },
41 | "outputs": [],
42 | "source": [
43 | "#M is the user-item ratings matrix: one row per user, one column per item, ratings are integers from 1-10\n",
44 | "#and 0 marks an item the user has not rated yet\n",
45 | "M = pd.DataFrame(np.asarray([[3,7,4,9,9,7],\n",
46 | "                             [7,0,5,3,8,8],\n",
47 | "                             [7,5,5,0,8,4],\n",
48 | "                             [5,6,8,5,9,8],\n",
49 | "                             [5,8,8,8,10,9],\n",
50 | "                             [7,7,0,4,7,8]]))\n",
51 | "\n",
52 | "#k (number of neighbours) and metric are the defaults used by the functions below; each function captures them\n",
53 | "#as default arguments when its cell is run, so change them here and re-run those cells for a change to apply\n",
54 | "k=4\n",
55 | "metric='cosine' #can be changed to 'correlation' for Pearson correlation similarities"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/html": [
66 | "
\n",
67 | "\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " | \n",
84 | " 0 | \n",
85 | " 1 | \n",
86 | " 2 | \n",
87 | " 3 | \n",
88 | " 4 | \n",
89 | " 5 | \n",
90 | "
\n",
91 | " \n",
92 | " \n",
93 | " \n",
94 | " | 0 | \n",
95 | " 3 | \n",
96 | " 7 | \n",
97 | " 4 | \n",
98 | " 9 | \n",
99 | " 9 | \n",
100 | " 7 | \n",
101 | "
\n",
102 | " \n",
103 | " | 1 | \n",
104 | " 7 | \n",
105 | " 0 | \n",
106 | " 5 | \n",
107 | " 3 | \n",
108 | " 8 | \n",
109 | " 8 | \n",
110 | "
\n",
111 | " \n",
112 | " | 2 | \n",
113 | " 7 | \n",
114 | " 5 | \n",
115 | " 5 | \n",
116 | " 0 | \n",
117 | " 8 | \n",
118 | " 4 | \n",
119 | "
\n",
120 | " \n",
121 | " | 3 | \n",
122 | " 5 | \n",
123 | " 6 | \n",
124 | " 8 | \n",
125 | " 5 | \n",
126 | " 9 | \n",
127 | " 8 | \n",
128 | "
\n",
129 | " \n",
130 | " | 4 | \n",
131 | " 5 | \n",
132 | " 8 | \n",
133 | " 8 | \n",
134 | " 8 | \n",
135 | " 10 | \n",
136 | " 9 | \n",
137 | "
\n",
138 | " \n",
139 | " | 5 | \n",
140 | " 7 | \n",
141 | " 7 | \n",
142 | " 0 | \n",
143 | " 4 | \n",
144 | " 7 | \n",
145 | " 8 | \n",
146 | "
\n",
147 | " \n",
148 | "
\n",
149 | "
"
150 | ],
151 | "text/plain": [
152 | " 0 1 2 3 4 5\n",
153 | "0 3 7 4 9 9 7\n",
154 | "1 7 0 5 3 8 8\n",
155 | "2 7 5 5 0 8 4\n",
156 | "3 5 6 8 5 9 8\n",
157 | "4 5 8 8 8 10 9\n",
158 | "5 7 7 0 4 7 8"
159 | ]
160 | },
161 | "execution_count": 4,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "M"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "**User-based Recommendation Systems**"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 5,
180 | "metadata": {
181 | "collapsed": true
182 | },
183 | "outputs": [],
184 | "source": [
185 | "#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence\n",
186 | "#similarities are obtained by subtracting distances from 1\n",
187 | "cosine_sim = 1-pairwise_distances(M, metric=\"cosine\")"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 6,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "data": {
197 | "text/html": [
198 | "\n",
199 | "\n",
212 | "
\n",
213 | " \n",
214 | " \n",
215 | " | \n",
216 | " 0 | \n",
217 | " 1 | \n",
218 | " 2 | \n",
219 | " 3 | \n",
220 | " 4 | \n",
221 | " 5 | \n",
222 | "
\n",
223 | " \n",
224 | " \n",
225 | " \n",
226 | " | 0 | \n",
227 | " 1.000000 | \n",
228 | " 0.799268 | \n",
229 | " 0.779227 | \n",
230 | " 0.934622 | \n",
231 | " 0.973890 | \n",
232 | " 0.884600 | \n",
233 | "
\n",
234 | " \n",
235 | " | 1 | \n",
236 | " 0.799268 | \n",
237 | " 1.000000 | \n",
238 | " 0.874744 | \n",
239 | " 0.905850 | \n",
240 | " 0.866146 | \n",
241 | " 0.827036 | \n",
242 | "
\n",
243 | " \n",
244 | " | 2 | \n",
245 | " 0.779227 | \n",
246 | " 0.874744 | \n",
247 | " 1.000000 | \n",
248 | " 0.909513 | \n",
249 | " 0.865454 | \n",
250 | " 0.853275 | \n",
251 | "
\n",
252 | " \n",
253 | " | 3 | \n",
254 | " 0.934622 | \n",
255 | " 0.905850 | \n",
256 | " 0.909513 | \n",
257 | " 1.000000 | \n",
258 | " 0.989344 | \n",
259 | " 0.865614 | \n",
260 | "
\n",
261 | " \n",
262 | " | 4 | \n",
263 | " 0.973890 | \n",
264 | " 0.866146 | \n",
265 | " 0.865454 | \n",
266 | " 0.989344 | \n",
267 | " 1.000000 | \n",
268 | " 0.881640 | \n",
269 | "
\n",
270 | " \n",
271 | " | 5 | \n",
272 | " 0.884600 | \n",
273 | " 0.827036 | \n",
274 | " 0.853275 | \n",
275 | " 0.865614 | \n",
276 | " 0.881640 | \n",
277 | " 1.000000 | \n",
278 | "
\n",
279 | " \n",
280 | "
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " 0 1 2 3 4 5\n",
285 | "0 1.000000 0.799268 0.779227 0.934622 0.973890 0.884600\n",
286 | "1 0.799268 1.000000 0.874744 0.905850 0.866146 0.827036\n",
287 | "2 0.779227 0.874744 1.000000 0.909513 0.865454 0.853275\n",
288 | "3 0.934622 0.905850 0.909513 1.000000 0.989344 0.865614\n",
289 | "4 0.973890 0.866146 0.865454 0.989344 1.000000 0.881640\n",
290 | "5 0.884600 0.827036 0.853275 0.865614 0.881640 1.000000"
291 | ]
292 | },
293 | "execution_count": 6,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "#Cosine similarity matrix\n",
300 | "pd.DataFrame(cosine_sim)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 7,
306 | "metadata": {
307 | "collapsed": true
308 | },
309 | "outputs": [],
310 | "source": [
311 | "#get pearson similarities for ratings matrix M\n",
312 | "pearson_sim = 1-pairwise_distances(M, metric=\"correlation\")"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 8,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/html": [
323 | "\n",
324 | "\n",
337 | "
\n",
338 | " \n",
339 | " \n",
340 | " | \n",
341 | " 0 | \n",
342 | " 1 | \n",
343 | " 2 | \n",
344 | " 3 | \n",
345 | " 4 | \n",
346 | " 5 | \n",
347 | "
\n",
348 | " \n",
349 | " \n",
350 | " \n",
351 | " | 0 | \n",
352 | " 1.000000 | \n",
353 | " -0.137446 | \n",
354 | " -0.357398 | \n",
355 | " 0.208179 | \n",
356 | " 0.761905 | \n",
357 | " 0.277350 | \n",
358 | "
\n",
359 | " \n",
360 | " | 1 | \n",
361 | " -0.137446 | \n",
362 | " 1.000000 | \n",
363 | " 0.453897 | \n",
364 | " 0.515910 | \n",
365 | " 0.112456 | \n",
366 | " 0.218328 | \n",
367 | "
\n",
368 | " \n",
369 | " | 2 | \n",
370 | " -0.357398 | \n",
371 | " 0.453897 | \n",
372 | " 1.000000 | \n",
373 | " 0.451378 | \n",
374 | " -0.042888 | \n",
375 | " 0.297373 | \n",
376 | "
\n",
377 | " \n",
378 | " | 3 | \n",
379 | " 0.208179 | \n",
380 | " 0.515910 | \n",
381 | " 0.451378 | \n",
382 | " 1.000000 | \n",
383 | " 0.763325 | \n",
384 | " -0.057739 | \n",
385 | "
\n",
386 | " \n",
387 | " | 4 | \n",
388 | " 0.761905 | \n",
389 | " 0.112456 | \n",
390 | " -0.042888 | \n",
391 | " 0.763325 | \n",
392 | " 1.000000 | \n",
393 | " 0.039621 | \n",
394 | "
\n",
395 | " \n",
396 | " | 5 | \n",
397 | " 0.277350 | \n",
398 | " 0.218328 | \n",
399 | " 0.297373 | \n",
400 | " -0.057739 | \n",
401 | " 0.039621 | \n",
402 | " 1.000000 | \n",
403 | "
\n",
404 | " \n",
405 | "
\n",
406 | "
"
407 | ],
408 | "text/plain": [
409 | " 0 1 2 3 4 5\n",
410 | "0 1.000000 -0.137446 -0.357398 0.208179 0.761905 0.277350\n",
411 | "1 -0.137446 1.000000 0.453897 0.515910 0.112456 0.218328\n",
412 | "2 -0.357398 0.453897 1.000000 0.451378 -0.042888 0.297373\n",
413 | "3 0.208179 0.515910 0.451378 1.000000 0.763325 -0.057739\n",
414 | "4 0.761905 0.112456 -0.042888 0.763325 1.000000 0.039621\n",
415 | "5 0.277350 0.218328 0.297373 -0.057739 0.039621 1.000000"
416 | ]
417 | },
418 | "execution_count": 8,
419 | "metadata": {},
420 | "output_type": "execute_result"
421 | }
422 | ],
423 | "source": [
424 | "#Pearson correlation similarity matrix\n",
425 | "pd.DataFrame(pearson_sim)"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 9,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "#This function finds the k users most similar to the given user_id in the ratings matrix M\n",
435 | "#Note that the similarities are the same as those obtained using pairwise_distances\n",
436 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n",
437 | "    \"\"\"Print and return the k users most similar to user_id (1-based row of ratings).\n",
438 | "\n",
439 | "    A brute-force NearestNeighbors model fitted on ratings is queried for k+1 neighbours of the\n",
440 | "    user's row; every neighbour except the user is printed. Returns (similarities, indices):\n",
441 | "    similarities is 1 - distance as a flat array, indices is the 2-D array from kneighbors.\n",
442 | "    \"\"\"\n",
443 | "    knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n",
444 | "    knn.fit(ratings)\n",
445 | "    query = ratings.iloc[user_id-1, :].values.reshape(1, -1)\n",
446 | "    distances, indices = knn.kneighbors(query, n_neighbors = k+1)\n",
447 | "    similarities = 1-distances.flatten()\n",
448 | "    print('{0} most similar users for User {1}:\\n'.format(k,user_id))\n",
449 | "    for rank, (idx, sim) in enumerate(zip(indices.flatten(), similarities)):\n",
450 | "        if idx+1 != user_id:\n",
451 | "            print('{0}: User {1}, with similarity of {2}'.format(rank, idx+1, sim))\n",
452 | "    return similarities,indices"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 10,
458 | "metadata": {},
459 | "outputs": [
460 | {
461 | "name": "stdout",
462 | "output_type": "stream",
463 | "text": [
464 | "4 most similar users for User 1:\n",
465 | "\n",
466 | "1: User 5, with similarity of 0.973889935402\n",
467 | "2: User 4, with similarity of 0.934621684178\n",
468 | "3: User 6, with similarity of 0.88460045723\n",
469 | "4: User 2, with similarity of 0.799267978052\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "similarities,indices = findksimilarusers(1,M, metric='cosine')"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": 11,
480 | "metadata": {
481 | "scrolled": true
482 | },
483 | "outputs": [
484 | {
485 | "name": "stdout",
486 | "output_type": "stream",
487 | "text": [
488 | "4 most similar users for User 1:\n",
489 | "\n",
490 | "1: User 5, with similarity of 0.761904761905\n",
491 | "2: User 6, with similarity of 0.277350098113\n",
492 | "3: User 4, with similarity of 0.208179450927\n",
493 | "4: User 2, with similarity of -0.137446320513\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "similarities,indices = findksimilarusers(1,M, metric='correlation')"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 12,
504 | "metadata": {
505 | "collapsed": true
506 | },
507 | "outputs": [],
508 | "source": [
509 | "#This function predicts rating for specified user-item combination based on user-based approach\n",
510 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n",
511 | "    \"\"\"Predict user_id's rating of item_id (both 1-based) with user-based CF.\n",
512 | "\n",
513 | "    The prediction is the user's mean rating plus the similarity-weighted average of how far each\n",
514 | "    neighbour's rating of the item deviates from that neighbour's own mean, rounded to an int.\n",
515 | "    \"\"\"\n",
516 | "    similarities, indices = findksimilarusers(user_id, ratings, metric, k)  #neighbours under the requested metric\n",
517 | "    mean_rating = ratings.iloc[user_id-1, :].mean()  #positional lookup, consistent with the .iloc calls below\n",
518 | "    sum_wt = 0.0\n",
519 | "    wtd_sum = 0.0\n",
520 | "    for idx, sim in zip(indices.flatten(), similarities):\n",
521 | "        if idx+1 == user_id:\n",
522 | "            continue  #the user itself is among the neighbours and must not vote\n",
523 | "        neighbour = ratings.iloc[idx, :]\n",
524 | "        wtd_sum += (neighbour.iloc[item_id-1] - neighbour.mean()) * sim\n",
525 | "        sum_wt += sim  #normalise by the weights actually used rather than assuming sum(similarities)-1\n",
526 | "    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))\n",
527 | "    print('\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))\n",
528 | "\n",
529 | "    return prediction"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 13,
535 | "metadata": {},
536 | "outputs": [
537 | {
538 | "name": "stdout",
539 | "output_type": "stream",
540 | "text": [
541 | "4 most similar users for User 3:\n",
542 | "\n",
543 | "1: User 4, with similarity of 0.90951268934\n",
544 | "2: User 2, with similarity of 0.874744414849\n",
545 | "3: User 5, with similarity of 0.86545387815\n",
546 | "4: User 6, with similarity of 0.853274963344\n",
547 | "\n",
548 | "Predicted rating for user 3 -> item 4: 3\n"
549 | ]
550 | }
551 | ],
552 | "source": [
553 | "predict_userbased(3,4,M);"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "**Item-based Recommendation Systems**"
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 14,
566 | "metadata": {
567 | "collapsed": true
568 | },
569 | "outputs": [],
570 | "source": [
571 | "#This function finds k similar items given the item_id and ratings matrix M\n",
572 | "\n",
573 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n",
574 | "    \"\"\"Print and return the k items most similar to item_id (1-based column of ratings).\n",
575 | "\n",
576 | "    The ratings matrix is transposed so each item becomes a row, then a brute-force NearestNeighbors\n",
577 | "    model is queried for k+1 neighbours of the item; every neighbour except the item is printed.\n",
578 | "    Returns (similarities, indices): similarities is 1 - distance as a flat array, indices is the\n",
579 | "    2-D array from kneighbors.\n",
580 | "    \"\"\"\n",
581 | "    item_vectors = ratings.T\n",
582 | "    knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n",
583 | "    knn.fit(item_vectors)\n",
584 | "    query = item_vectors.iloc[item_id-1, :].values.reshape(1, -1)\n",
585 | "    distances, indices = knn.kneighbors(query, n_neighbors = k+1)\n",
586 | "    similarities = 1-distances.flatten()\n",
587 | "    print('{0} most similar items for item {1}:\\n'.format(k,item_id))\n",
588 | "    for rank, (idx, sim) in enumerate(zip(indices.flatten(), similarities)):\n",
589 | "        if idx+1 != item_id:\n",
590 | "            print('{0}: Item {1} :, with similarity of {2}'.format(rank, idx+1, sim))\n",
591 | "    return similarities,indices"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 15,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "name": "stdout",
601 | "output_type": "stream",
602 | "text": [
603 | "4 most similar items for item 3:\n",
604 | "\n",
605 | "1: Item 5 :, with similarity of 0.918336125535\n",
606 | "2: Item 6 :, with similarity of 0.874759773038\n",
607 | "3: Item 1 :, with similarity of 0.810364746222\n",
608 | "4: Item 4 :, with similarity of 0.796917800302\n"
609 | ]
610 | }
611 | ],
612 | "source": [
613 | "similarities,indices=findksimilaritems(3,M)"
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 16,
619 | "metadata": {
620 | "collapsed": true
621 | },
622 | "outputs": [],
623 | "source": [
624 | "#This function predicts the rating for specified user-item combination based on item-based approach\n",
625 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n",
626 | "    \"\"\"Predict user_id's rating of item_id (both 1-based) with item-based CF.\n",
627 | "\n",
628 | "    The prediction is the similarity-weighted average of the user's ratings of the k items most\n",
629 | "    similar to item_id under the given metric, rounded to an int.\n",
630 | "    \"\"\"\n",
631 | "    similarities, indices = findksimilaritems(item_id, ratings, metric, k)  #forward the caller's metric and k\n",
632 | "    sum_wt = 0.0\n",
633 | "    wtd_sum = 0.0\n",
634 | "    for idx, sim in zip(indices.flatten(), similarities):\n",
635 | "        if idx+1 != item_id:  #the item itself is among the neighbours and must not vote\n",
636 | "            wtd_sum += ratings.iloc[user_id-1, idx] * sim\n",
637 | "            sum_wt += sim  #normalise by the weights actually used rather than assuming sum(similarities)-1\n",
638 | "    prediction = int(round(wtd_sum/sum_wt))\n",
639 | "    print('\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))\n",
640 | "    return prediction"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 17,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "name": "stdout",
650 | "output_type": "stream",
651 | "text": [
652 | "4 most similar items for item 3:\n",
653 | "\n",
654 | "1: Item 5 :, with similarity of 0.918336125535\n",
655 | "2: Item 6 :, with similarity of 0.874759773038\n",
656 | "3: Item 1 :, with similarity of 0.810364746222\n",
657 | "4: Item 4 :, with similarity of 0.796917800302\n",
658 | "\n",
659 | "Predicted rating for user 1 -> item 3: 7\n"
660 | ]
661 | }
662 | ],
663 | "source": [
664 | "prediction = predict_itembased(1,3,M)"
665 | ]
666 | },
667 | {
668 | "cell_type": "code",
669 | "execution_count": 18,
670 | "metadata": {
671 | "collapsed": true
672 | },
673 | "outputs": [],
674 | "source": [
675 | "#This function is used to compute adjusted cosine similarity matrix for items\n",
676 | "def computeAdjCosSim(M):\n",
677 | " sim_matrix = np.zeros((M.shape[1], M.shape[1]))\n",
678 | " M_u = M.mean(axis=1) #means\n",
679 | " \n",
680 | " for i in range(M.shape[1]):\n",
681 | " for j in range(M.shape[1]):\n",
682 | " if i == j:\n",
683 | " \n",
684 | " sim_matrix[i][j] = 1\n",
685 | " else: \n",
686 | " if i\n",
736 | "\n",
749 | "\n",
750 | " \n",
751 | " \n",
752 | " | \n",
753 | " 0 | \n",
754 | " 1 | \n",
755 | " 2 | \n",
756 | " 3 | \n",
757 | " 4 | \n",
758 | " 5 | \n",
759 | "
\n",
760 | " \n",
761 | " \n",
762 | " \n",
763 | " | 0 | \n",
764 | " 1.000000 | \n",
765 | " 0.236908 | \n",
766 | " 0.421263 | \n",
767 | " -0.519085 | \n",
768 | " -0.125892 | \n",
769 | " 0.010090 | \n",
770 | "
\n",
771 | " \n",
772 | " | 1 | \n",
773 | " 0.236908 | \n",
774 | " 1.000000 | \n",
775 | " -0.805243 | \n",
776 | " 0.085741 | \n",
777 | " 0.237273 | \n",
778 | " 0.520625 | \n",
779 | "
\n",
780 | " \n",
781 | " | 2 | \n",
782 | " 0.421263 | \n",
783 | " -0.805243 | \n",
784 | " 1.000000 | \n",
785 | " -0.767941 | \n",
786 | " -0.230521 | \n",
787 | " -0.053640 | \n",
788 | "
\n",
789 | " \n",
790 | " | 3 | \n",
791 | " -0.519085 | \n",
792 | " 0.085741 | \n",
793 | " -0.767941 | \n",
794 | " 1.000000 | \n",
795 | " -0.299059 | \n",
796 | " -0.644550 | \n",
797 | "
\n",
798 | " \n",
799 | " | 4 | \n",
800 | " -0.125892 | \n",
801 | " 0.237273 | \n",
802 | " -0.230521 | \n",
803 | " -0.299059 | \n",
804 | " 1.000000 | \n",
805 | " 0.599158 | \n",
806 | "
\n",
807 | " \n",
808 | " | 5 | \n",
809 | " 0.010090 | \n",
810 | " 0.520625 | \n",
811 | " -0.053640 | \n",
812 | " -0.644550 | \n",
813 | " 0.599158 | \n",
814 | " 1.000000 | \n",
815 | "
\n",
816 | " \n",
817 | "
\n",
818 | ""
819 | ],
820 | "text/plain": [
821 | " 0 1 2 3 4 5\n",
822 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n",
823 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n",
824 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n",
825 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n",
826 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n",
827 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000"
828 | ]
829 | },
830 | "execution_count": 20,
831 | "metadata": {},
832 | "output_type": "execute_result"
833 | }
834 | ],
835 | "source": [
836 | "adjcos_sim"
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": 26,
842 | "metadata": {
843 | "collapsed": true
844 | },
845 | "outputs": [],
846 | "source": [
847 | "#This function finds k similar items given the item_id and ratings matrix M, using adjusted cosine similarity\n",
848 | "\n",
849 | "def findksimilaritems_adjcos(item_id, ratings, k=k):\n",
850 | "    \"\"\"Print and return the k items most similar to item_id (1-based) by adjusted cosine.\n",
851 | "\n",
852 | "    Entry item_id-1 of computeAdjCosSim(ratings) is sorted in descending order and its first k+1\n",
853 | "    values are kept; every kept item except item_id is printed. Returns (similarities, indices):\n",
854 | "    the kept values and their index labels.\n",
855 | "    \"\"\"\n",
856 | "    top = computeAdjCosSim(ratings)[item_id-1].sort_values(ascending=False)[:k+1]\n",
857 | "    similarities, indices = top.values, top.index\n",
858 | "    print('{0} most similar items for item {1}:\\n'.format(k,item_id))\n",
859 | "    for rank, (idx, sim) in enumerate(zip(indices, similarities)):\n",
860 | "        if idx+1 != item_id:\n",
861 | "            print('{0}: Item {1} :, with similarity of {2}'.format(rank, idx+1, sim))\n",
862 | "\n",
863 | "    return similarities ,indices"
864 | ]
865 | },
866 | {
867 | "cell_type": "code",
868 | "execution_count": 27,
869 | "metadata": {},
870 | "outputs": [
871 | {
872 | "name": "stdout",
873 | "output_type": "stream",
874 | "text": [
875 | "4 most similar items for item 3:\n",
876 | "\n",
877 | "1: Item 1 :, with similarity of 0.421262731871\n",
878 | "2: Item 6 :, with similarity of -0.0536398904889\n",
879 | "3: Item 5 :, with similarity of -0.230521358269\n",
880 | "4: Item 4 :, with similarity of -0.767941046575\n"
881 | ]
882 | }
883 | ],
884 | "source": [
885 | "similarities, indices = findksimilaritems_adjcos(3,M)"
886 | ]
887 | },
888 | {
889 | "cell_type": "code",
890 | "execution_count": 28,
891 | "metadata": {
892 | "collapsed": true
893 | },
894 | "outputs": [],
895 | "source": [
896 | "#This function predicts the rating for specified user-item combination for adjusted cosine item-based approach\n",
897 | "#As the adjusted cosine similarities range from -1,+1, sometimes the predicted rating can fall outside the 1-10 rating scale\n",
898 | "#Hack to deal with this: Rating is set to min (1) if prediction is below min, Rating is set to max (10) if prediction is above max\n",
899 | "def predict_itembased_adjcos(user_id, item_id, ratings):\n",
900 | "    \"\"\"Predict user_id's rating of item_id (both 1-based) with adjusted-cosine item-based CF.\n",
901 | "\n",
902 | "    The prediction is the similarity-weighted average of the user's ratings of the items returned by\n",
903 | "    findksimilaritems_adjcos, rounded to an int and clamped to the 1-10 rating scale.\n",
904 | "    \"\"\"\n",
905 | "    min_rating, max_rating = 1, 10\n",
906 | "    similarities, indices = findksimilaritems_adjcos(item_id, ratings)\n",
907 | "    sum_wt = 0.0\n",
908 | "    wtd_sum = 0.0\n",
909 | "    for idx, sim in zip(indices, similarities):\n",
910 | "        if idx+1 != item_id:  #the item itself is among the neighbours and must not vote\n",
911 | "            wtd_sum += ratings.iloc[user_id-1, idx] * sim\n",
912 | "            sum_wt += sim  #normalise by the weights actually used rather than assuming sum(similarities)-1\n",
913 | "    prediction = int(round(wtd_sum/sum_wt))\n",
914 | "    #A rounded prediction of 0 is also off the scale (0 means 'unrated'), so clamp everything below min_rating\n",
915 | "    if prediction < min_rating:\n",
916 | "        prediction = min_rating\n",
917 | "    elif prediction > max_rating:\n",
918 | "        prediction = max_rating\n",
919 | "    print('\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))\n",
920 | "    return prediction"
921 | ]
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": 29,
926 | "metadata": {},
927 | "outputs": [
928 | {
929 | "name": "stdout",
930 | "output_type": "stream",
931 | "text": [
932 | "4 most similar items for item 4:\n",
933 | "\n",
934 | "1: Item 2 :, with similarity of 0.0857414341149\n",
935 | "2: Item 5 :, with similarity of -0.29905882779\n",
936 | "3: Item 1 :, with similarity of -0.519085268895\n",
937 | "4: Item 6 :, with similarity of -0.644550286954\n",
938 | "\n",
939 | "Predicted rating for user 3 -> item 4: 6\n"
940 | ]
941 | }
942 | ],
943 | "source": [
944 | "prediction=predict_itembased_adjcos(3,4,M)"
945 | ]
946 | },
947 | {
948 | "cell_type": "code",
949 | "execution_count": 30,
950 | "metadata": {},
951 | "outputs": [
952 | {
953 | "data": {
954 | "text/html": [
955 | "\n",
956 | "\n",
969 | "
\n",
970 | " \n",
971 | " \n",
972 | " | \n",
973 | " 0 | \n",
974 | " 1 | \n",
975 | " 2 | \n",
976 | " 3 | \n",
977 | " 4 | \n",
978 | " 5 | \n",
979 | "
\n",
980 | " \n",
981 | " \n",
982 | " \n",
983 | " | 0 | \n",
984 | " 1.000000 | \n",
985 | " 0.236908 | \n",
986 | " 0.421263 | \n",
987 | " -0.519085 | \n",
988 | " -0.125892 | \n",
989 | " 0.010090 | \n",
990 | "
\n",
991 | " \n",
992 | " | 1 | \n",
993 | " 0.236908 | \n",
994 | " 1.000000 | \n",
995 | " -0.805243 | \n",
996 | " 0.085741 | \n",
997 | " 0.237273 | \n",
998 | " 0.520625 | \n",
999 | "
\n",
1000 | " \n",
1001 | " | 2 | \n",
1002 | " 0.421263 | \n",
1003 | " -0.805243 | \n",
1004 | " 1.000000 | \n",
1005 | " -0.767941 | \n",
1006 | " -0.230521 | \n",
1007 | " -0.053640 | \n",
1008 | "
\n",
1009 | " \n",
1010 | " | 3 | \n",
1011 | " -0.519085 | \n",
1012 | " 0.085741 | \n",
1013 | " -0.767941 | \n",
1014 | " 1.000000 | \n",
1015 | " -0.299059 | \n",
1016 | " -0.644550 | \n",
1017 | "
\n",
1018 | " \n",
1019 | " | 4 | \n",
1020 | " -0.125892 | \n",
1021 | " 0.237273 | \n",
1022 | " -0.230521 | \n",
1023 | " -0.299059 | \n",
1024 | " 1.000000 | \n",
1025 | " 0.599158 | \n",
1026 | "
\n",
1027 | " \n",
1028 | " | 5 | \n",
1029 | " 0.010090 | \n",
1030 | " 0.520625 | \n",
1031 | " -0.053640 | \n",
1032 | " -0.644550 | \n",
1033 | " 0.599158 | \n",
1034 | " 1.000000 | \n",
1035 | "
\n",
1036 | " \n",
1037 | "
\n",
1038 | "
"
1039 | ],
1040 | "text/plain": [
1041 | " 0 1 2 3 4 5\n",
1042 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n",
1043 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n",
1044 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n",
1045 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n",
1046 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n",
1047 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000"
1048 | ]
1049 | },
1050 | "execution_count": 30,
1051 | "metadata": {},
1052 | "output_type": "execute_result"
1053 | }
1054 | ],
1055 | "source": [
1056 | "adjcos_sim"
1057 | ]
1058 | },
1059 | {
1060 | "cell_type": "code",
1061 | "execution_count": 31,
1062 | "metadata": {
1063 | "collapsed": true
1064 | },
1065 | "outputs": [],
1066 | "source": [
1067 | "#This function uses the prediction functions above to recommend an item for the selected approach. A recommendation is made if\n",
1068 | "#the predicted rating for the item is greater than or equal to 6 and the item has not been rated already\n",
1069 | "def recommendItem(user_id, item_id, ratings):\n",
1070 | "    \"\"\"Show a dropdown of CF approaches; on selection print the prediction and the recommendation.\n",
1071 | "\n",
1072 | "    user_id and item_id are 1-based. After the prediction, 'Item already rated' is printed when the\n",
1073 | "    user's rating of the item is non-zero; otherwise the item is recommended if the prediction is >= 6.\n",
1074 | "    \"\"\"\n",
1075 | "    n_users, n_items = ratings.shape\n",
1076 | "    if type(user_id) is not int or user_id<1 or user_id>n_users:  #type check first; bound comes from the matrix\n",
1077 | "        print('Userid does not exist. Enter numbers from 1-{0}'.format(n_users))\n",
1078 | "        return\n",
1079 | "    if type(item_id) is not int or item_id<1 or item_id>n_items:\n",
1080 | "        print('Itemid does not exist. Enter numbers from 1-{0}'.format(n_items))\n",
1081 | "        return\n",
1082 | "    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']\n",
1083 | "    approach = widgets.Dropdown(options=ids, value=ids[0],\n",
1084 | "                                description='Select Approach', width='500px')\n",
1085 | "    def on_change(change):\n",
1086 | "        prediction = 0\n",
1087 | "        clear_output(wait=True)\n",
1088 | "        if change['type'] == 'change' and change['name'] == 'value':\n",
1089 | "            if approach.value == 'User-based CF (cosine)':\n",
1090 | "                prediction = predict_userbased(user_id, item_id, ratings, 'cosine')\n",
1091 | "            elif approach.value == 'User-based CF (correlation)':\n",
1092 | "                prediction = predict_userbased(user_id, item_id, ratings, 'correlation')\n",
1093 | "            elif approach.value == 'Item-based CF (cosine)':\n",
1094 | "                prediction = predict_itembased(user_id, item_id, ratings, 'cosine')\n",
1095 | "            else:\n",
1096 | "                prediction = predict_itembased_adjcos(user_id,item_id,ratings)\n",
1097 | "            if ratings.iloc[user_id-1, item_id-1] != 0:  #row = user, column = item; non-zero means already rated\n",
1098 | "                print('Item already rated')\n",
1099 | "            elif prediction>=6:\n",
1100 | "                print('\\nItem recommended')\n",
1101 | "            else:\n",
1102 | "                print('Item not recommended')\n",
1103 | "    approach.observe(on_change)\n",
1104 | "    display(approach)"
1105 | ]
1106 | },
1107 | {
1108 | "cell_type": "code",
1109 | "execution_count": 32,
1110 | "metadata": {},
1111 | "outputs": [
1112 | {
1113 | "name": "stdout",
1114 | "output_type": "stream",
1115 | "text": [
1116 | "Userid does not exist. Enter numbers from 1-6\n"
1117 | ]
1118 | }
1119 | ],
1120 | "source": [
1121 | "#check for incorrect entries\n",
1122 | "recommendItem(-1,3,M)"
1123 | ]
1124 | },
1125 | {
1126 | "cell_type": "code",
1127 | "execution_count": 33,
1128 | "metadata": {},
1129 | "outputs": [
1130 | {
1131 | "name": "stdout",
1132 | "output_type": "stream",
1133 | "text": [
1134 | "4 most similar users for User 3:\n",
1135 | "\n",
1136 | "1: User 4, with similarity of 0.90951268934\n",
1137 | "2: User 2, with similarity of 0.874744414849\n",
1138 | "3: User 5, with similarity of 0.86545387815\n",
1139 | "4: User 6, with similarity of 0.853274963344\n",
1140 | "\n",
1141 | "Predicted rating for user 3 -> item 4: 3\n",
1142 | "Item not recommended\n"
1143 | ]
1144 | }
1145 | ],
1146 | "source": [
1147 | "recommendItem(3,4,M)"
1148 | ]
1149 | },
1150 | {
1151 | "cell_type": "code",
1152 | "execution_count": 34,
1153 | "metadata": {},
1154 | "outputs": [
1155 | {
1156 | "name": "stdout",
1157 | "output_type": "stream",
1158 | "text": [
1159 | "4 most similar users for User 3:\n",
1160 | "\n",
1161 | "1: User 2, with similarity of 0.453897185842\n",
1162 | "2: User 4, with similarity of 0.451378005098\n",
1163 | "3: User 6, with similarity of 0.297373304825\n",
1164 | "4: User 5, with similarity of -0.04288778794\n",
1165 | "\n",
1166 | "Predicted rating for user 3 -> item 4: 3\n",
1167 | "Item not recommended\n"
1168 | ]
1169 | }
1170 | ],
1171 | "source": [
1172 | "recommendItem(3,4,M)"
1173 | ]
1174 | },
1175 | {
1176 | "cell_type": "code",
1177 | "execution_count": 35,
1178 | "metadata": {},
1179 | "outputs": [
1180 | {
1181 | "name": "stdout",
1182 | "output_type": "stream",
1183 | "text": [
1184 | "4 most similar items for item 4:\n",
1185 | "\n",
1186 | "1: Item 6 :, with similarity of 0.89977997614\n",
1187 | "2: Item 2 :, with similarity of 0.887160079571\n",
1188 | "3: Item 5 :, with similarity of 0.88180009273\n",
1189 | "4: Item 3 :, with similarity of 0.796917800302\n",
1190 | "\n",
1191 | "Predicted rating for user 3 -> item 4: 6\n",
1192 | "\n",
1193 | "Item recommended\n"
1194 | ]
1195 | }
1196 | ],
1197 | "source": [
1198 | "recommendItem(3,4,M)"
1199 | ]
1200 | },
1201 | {
1202 | "cell_type": "code",
1203 | "execution_count": 36,
1204 | "metadata": {},
1205 | "outputs": [
1206 | {
1207 | "name": "stdout",
1208 | "output_type": "stream",
1209 | "text": [
1210 | "4 most similar items for item 4:\n",
1211 | "\n",
1212 | "1: Item 2 :, with similarity of 0.0857414341149\n",
1213 | "2: Item 5 :, with similarity of -0.29905882779\n",
1214 | "3: Item 1 :, with similarity of -0.519085268895\n",
1215 | "4: Item 6 :, with similarity of -0.644550286954\n",
1216 | "\n",
1217 | "Predicted rating for user 3 -> item 4: 6\n",
1218 | "\n",
1219 | "Item recommended\n"
1220 | ]
1221 | }
1222 | ],
1223 | "source": [
1224 | "recommendItem(3,4,M)"
1225 | ]
1226 | },
1227 | {
1228 | "cell_type": "code",
1229 | "execution_count": 37,
1230 | "metadata": {},
1231 | "outputs": [
1232 | {
1233 | "name": "stdout",
1234 | "output_type": "stream",
1235 | "text": [
1236 | "4 most similar users for User 2:\n",
1237 | "\n",
1238 | "1: User 4, with similarity of 0.515910067398\n",
1239 | "2: User 3, with similarity of 0.453897185842\n",
1240 | "3: User 6, with similarity of 0.218327934565\n",
1241 | "4: User 5, with similarity of 0.11245608042\n",
1242 | "\n",
1243 | "Predicted rating for user 2 -> item 1: 5\n",
1244 | "Item already rated\n"
1245 | ]
1246 | }
1247 | ],
1248 | "source": [
1249 | "#if the item is already rated, it is not recommended\n",
1250 | "recommendItem(2,1,M)"
1251 | ]
1252 | },
1253 | {
1254 | "cell_type": "code",
1255 | "execution_count": 38,
1256 | "metadata": {
1257 | "collapsed": true
1258 | },
1259 | "outputs": [],
1260 | "source": [
1261 | "#This is a quick way to temporarily suppress stdout in a particular code section\n",
1262 | "@contextmanager\n",
1263 | "def suppress_stdout():\n",
1264 | "    \"\"\"Temporarily point sys.stdout at os.devnull; the previous stream is restored on exit.\"\"\"\n",
1265 | "    with open(os.devnull, 'w') as sink:\n",
1266 | "        saved_stdout, sys.stdout = sys.stdout, sink\n",
1267 | "        try:\n",
1268 | "            yield\n",
1269 | "        finally:\n",
1270 | "            sys.stdout = saved_stdout"
1271 | ]
1272 | },
1273 | {
1274 | "cell_type": "code",
1275 | "execution_count": 39,
1276 | "metadata": {
1277 | "collapsed": true
1278 | },
1279 | "outputs": [],
1280 | "source": [
1281 | "#Evaluates the selected recommendation approach on the ratings matrix; the metric reported is RMSE.\n",
1282 | "#suppress_stdout silences the prints of the prediction helpers so that only the RMSE line is shown.\n",
1283 | "def evaluateRS(ratings):\n",
1284 | "    \"\"\"Show a dropdown of CF approaches and print the RMSE of the chosen one.\n",
1285 | "\n",
1286 | "    ratings: user-item ratings matrix (rows are users, columns are items).\n",
1287 | "    Each time the selection changes, a rating is predicted for every (user, item)\n",
1288 | "    cell and the RMSE between predictions and ratings is printed. Returns None.\n",
1289 | "    \"\"\"\n",
1290 | "    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']\n",
1291 | "    approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')\n",
1292 | "    n_users, n_items = ratings.shape\n",
1293 | "    #prediction[i, j] holds the predicted rating of user i+1 for item j+1, same layout as ratings\n",
1294 | "    prediction = np.zeros((n_users, n_items))\n",
1295 | "    def on_change(change):\n",
1296 | "        clear_output(wait=True)\n",
1297 | "        with suppress_stdout():\n",
1298 | "            if approach.value == 'User-based CF (cosine)':\n",
1299 | "                extra_args = ('cosine',)\n",
1300 | "            elif approach.value == 'User-based CF (correlation)':\n",
1301 | "                extra_args = ('correlation',)\n",
1302 | "            else:\n",
1303 | "                #NOTE(review): both item-based options still call predict_userbased with its default\n",
1304 | "                #metric, exactly as before; TODO switch them to the item-based predictors\n",
1305 | "                extra_args = ()\n",
1306 | "            for i in range(n_users):\n",
1307 | "                for j in range(n_items):\n",
1308 | "                    #index [row=user, column=item]; the old chained prediction[i][j] on a\n",
1309 | "                    #DataFrame selected column i first and so stored the matrix transposed\n",
1310 | "                    prediction[i, j] = predict_userbased(i+1, j+1, ratings, *extra_args)\n",
1311 | "\n",
1312 | "        MSE = mean_squared_error(ratings, prediction)\n",
1313 | "        RMSE = round(sqrt(MSE),3)\n",
1314 | "        #printed outside suppress_stdout so it is the only visible output\n",
1315 | "        print(\"RMSE using {0} approach is: {1}\".format(approach.value,RMSE))\n",
1316 | "\n",
1317 | "    #observe only the 'value' trait so the evaluation runs once per selection\n",
1318 | "    approach.observe(on_change, names='value')\n",
1319 | "    display(approach)"
1320 | ]
1321 | },
1322 | {
1323 | "cell_type": "code",
1324 | "execution_count": 40,
1325 | "metadata": {},
1326 | "outputs": [
1327 | {
1328 | "name": "stdout",
1329 | "output_type": "stream",
1330 | "text": [
1331 | "RMSE using Item-based CF (cosine) approach is: 2.804\n"
1332 | ]
1333 | }
1334 | ],
1335 | "source": [
1336 | "evaluateRS(M)"
1337 | ]
1338 | },
1339 | {
1340 | "cell_type": "code",
1341 | "execution_count": 41,
1342 | "metadata": {},
1343 | "outputs": [
1344 | {
1345 | "name": "stdout",
1346 | "output_type": "stream",
1347 | "text": [
1348 | "RMSE using Item-based CF (cosine) approach is: 2.804\n"
1349 | ]
1350 | }
1351 | ],
1352 | "source": [
1353 | "evaluateRS(M)"
1354 | ]
1355 | },
1356 | {
1357 | "cell_type": "markdown",
1358 | "metadata": {},
1359 | "source": [
1360 | "**Thanks for reading this notebook**"
1361 | ]
1362 | }
1363 | ],
1364 | "metadata": {
1365 | "kernelspec": {
1366 | "display_name": "Python 2",
1367 | "language": "python",
1368 | "name": "python2"
1369 | },
1370 | "language_info": {
1371 | "codemirror_mode": {
1372 | "name": "ipython",
1373 | "version": 2
1374 | },
1375 | "file_extension": ".py",
1376 | "mimetype": "text/x-python",
1377 | "name": "python",
1378 | "nbconvert_exporter": "python",
1379 | "pygments_lexer": "ipython2",
1380 | "version": "2.7.13"
1381 | }
1382 | },
1383 | "nbformat": 4,
1384 | "nbformat_minor": 2
1385 | }
1386 |
--------------------------------------------------------------------------------
/Book Recommendation System.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "**About Book Crossing Dataset**\n",
8 | "\n",
9 | "This dataset was compiled by Cai-Nicolas Ziegler in 2004 and comprises three tables: users, books and ratings. Explicit ratings are expressed on a scale from 1-10 (higher values denoting higher appreciation) and implicit ratings are expressed by 0"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "Link to dataset files:\n",
17 | "http://www2.informatik.uni-freiburg.de/~cziegler/BX/ "
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "**About this Project**\n",
25 | "\n",
26 | "This project entails building a Book Recommender System for users based on user-based and item-based collaborative filtering approaches"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 177,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "#Making necessary imports\n",
38 | "import pandas as pd\n",
39 | "import matplotlib.pyplot as plt\n",
40 | "import sklearn.metrics as metrics\n",
41 | "import numpy as np\n",
42 | "from sklearn.neighbors import NearestNeighbors\n",
43 | "from scipy.spatial.distance import correlation\n",
44 | "from sklearn.metrics.pairwise import pairwise_distances\n",
45 | "import ipywidgets as widgets\n",
46 | "from IPython.display import display, clear_output\n",
47 | "from contextlib import contextmanager\n",
48 | "import warnings\n",
49 | "warnings.filterwarnings('ignore')\n",
50 | "import numpy as np\n",
51 | "import os, sys\n",
52 | "import re\n",
53 | "import seaborn as sns"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 178,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "#Setting the current working directory\n",
65 | "os.chdir('D:\\Data Science\\Projects\\Book Crossing Dataset - Recommender System')"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 179,
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/html": [
76 | "\n",
88 | "To toggle on/off output_stderr, click here."
89 | ],
90 | "text/plain": [
91 | ""
92 | ]
93 | },
94 | "execution_count": 179,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "from IPython.display import HTML\n",
101 | "HTML('''\n",
113 | "To toggle on/off output_stderr, click here.''')"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 180,
119 | "metadata": {},
120 | "outputs": [
121 | {
122 | "name": "stderr",
123 | "output_type": "stream",
124 | "text": [
125 | "Skipping line 6452: expected 8 fields, saw 9\n",
126 | "Skipping line 43667: expected 8 fields, saw 10\n",
127 | "Skipping line 51751: expected 8 fields, saw 9\n",
128 | "\n",
129 | "Skipping line 92038: expected 8 fields, saw 9\n",
130 | "Skipping line 104319: expected 8 fields, saw 9\n",
131 | "Skipping line 121768: expected 8 fields, saw 9\n",
132 | "\n",
133 | "Skipping line 144058: expected 8 fields, saw 9\n",
134 | "Skipping line 150789: expected 8 fields, saw 9\n",
135 | "Skipping line 157128: expected 8 fields, saw 9\n",
136 | "Skipping line 180189: expected 8 fields, saw 9\n",
137 | "Skipping line 185738: expected 8 fields, saw 9\n",
138 | "\n",
139 | "Skipping line 209388: expected 8 fields, saw 9\n",
140 | "Skipping line 220626: expected 8 fields, saw 9\n",
141 | "Skipping line 227933: expected 8 fields, saw 11\n",
142 | "Skipping line 228957: expected 8 fields, saw 10\n",
143 | "Skipping line 245933: expected 8 fields, saw 9\n",
144 | "Skipping line 251296: expected 8 fields, saw 9\n",
145 | "Skipping line 259941: expected 8 fields, saw 9\n",
146 | "Skipping line 261529: expected 8 fields, saw 9\n",
147 | "\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "#Loading data\n",
153 | "books = pd.read_csv('books.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n",
154 | "books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']\n",
155 | "users = pd.read_csv('users.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n",
156 | "users.columns = ['userID', 'Location', 'Age']\n",
157 | "ratings = pd.read_csv('ratings.csv', sep=';', error_bad_lines=False, encoding=\"latin-1\")\n",
158 | "ratings.columns = ['userID', 'ISBN', 'bookRating']"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 181,
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "(271360, 8)\n",
171 | "(278858, 3)\n",
172 | "(1149780, 3)\n"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "#checking shapes of the datasets\n",
178 | "print books.shape\n",
179 | "print users.shape\n",
180 | "print ratings.shape"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 182,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "data": {
190 | "text/html": [
191 | "\n",
192 | "\n",
205 | "
\n",
206 | " \n",
207 | " \n",
208 | " | \n",
209 | " ISBN | \n",
210 | " bookTitle | \n",
211 | " bookAuthor | \n",
212 | " yearOfPublication | \n",
213 | " publisher | \n",
214 | " imageUrlS | \n",
215 | " imageUrlM | \n",
216 | " imageUrlL | \n",
217 | "
\n",
218 | " \n",
219 | " \n",
220 | " \n",
221 | " | 0 | \n",
222 | " 0195153448 | \n",
223 | " Classical Mythology | \n",
224 | " Mark P. O. Morford | \n",
225 | " 2002 | \n",
226 | " Oxford University Press | \n",
227 | " http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg | \n",
228 | " http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg | \n",
229 | " http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg | \n",
230 | "
\n",
231 | " \n",
232 | " | 1 | \n",
233 | " 0002005018 | \n",
234 | " Clara Callan | \n",
235 | " Richard Bruce Wright | \n",
236 | " 2001 | \n",
237 | " HarperFlamingo Canada | \n",
238 | " http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg | \n",
239 | " http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg | \n",
240 | " http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg | \n",
241 | "
\n",
242 | " \n",
243 | " | 2 | \n",
244 | " 0060973129 | \n",
245 | " Decision in Normandy | \n",
246 | " Carlo D'Este | \n",
247 | " 1991 | \n",
248 | " HarperPerennial | \n",
249 | " http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg | \n",
250 | " http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg | \n",
251 | " http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg | \n",
252 | "
\n",
253 | " \n",
254 | " | 3 | \n",
255 | " 0374157065 | \n",
256 | " Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It | \n",
257 | " Gina Bari Kolata | \n",
258 | " 1999 | \n",
259 | " Farrar Straus Giroux | \n",
260 | " http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg | \n",
261 | " http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg | \n",
262 | " http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg | \n",
263 | "
\n",
264 | " \n",
265 | " | 4 | \n",
266 | " 0393045218 | \n",
267 | " The Mummies of Urumchi | \n",
268 | " E. J. W. Barber | \n",
269 | " 1999 | \n",
270 | " W. W. Norton & Company | \n",
271 | " http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg | \n",
272 | " http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg | \n",
273 | " http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg | \n",
274 | "
\n",
275 | " \n",
276 | "
\n",
277 | "
"
278 | ],
279 | "text/plain": [
280 | " ISBN \\\n",
281 | "0 0195153448 \n",
282 | "1 0002005018 \n",
283 | "2 0060973129 \n",
284 | "3 0374157065 \n",
285 | "4 0393045218 \n",
286 | "\n",
287 | " bookTitle \\\n",
288 | "0 Classical Mythology \n",
289 | "1 Clara Callan \n",
290 | "2 Decision in Normandy \n",
291 | "3 Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It \n",
292 | "4 The Mummies of Urumchi \n",
293 | "\n",
294 | " bookAuthor yearOfPublication publisher \\\n",
295 | "0 Mark P. O. Morford 2002 Oxford University Press \n",
296 | "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
297 | "2 Carlo D'Este 1991 HarperPerennial \n",
298 | "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
299 | "4 E. J. W. Barber 1999 W. W. Norton & Company \n",
300 | "\n",
301 | " imageUrlS \\\n",
302 | "0 http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg \n",
303 | "1 http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg \n",
304 | "2 http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg \n",
305 | "3 http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg \n",
306 | "4 http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg \n",
307 | "\n",
308 | " imageUrlM \\\n",
309 | "0 http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg \n",
310 | "1 http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg \n",
311 | "2 http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg \n",
312 | "3 http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg \n",
313 | "4 http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg \n",
314 | "\n",
315 | " imageUrlL \n",
316 | "0 http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg \n",
317 | "1 http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg \n",
318 | "2 http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg \n",
319 | "3 http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg \n",
320 | "4 http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg "
321 | ]
322 | },
323 | "execution_count": 182,
324 | "metadata": {},
325 | "output_type": "execute_result"
326 | }
327 | ],
328 | "source": [
329 | "#Exploring books dataset\n",
330 | "books.head()"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 183,
336 | "metadata": {
337 | "collapsed": true
338 | },
339 | "outputs": [],
340 | "source": [
341 | "#dropping last three columns containing image URLs which will not be required for analysis\n",
342 | "books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 184,
348 | "metadata": {
349 | "scrolled": true
350 | },
351 | "outputs": [
352 | {
353 | "data": {
354 | "text/html": [
355 | "\n",
356 | "\n",
369 | "
\n",
370 | " \n",
371 | " \n",
372 | " | \n",
373 | " ISBN | \n",
374 | " bookTitle | \n",
375 | " bookAuthor | \n",
376 | " yearOfPublication | \n",
377 | " publisher | \n",
378 | "
\n",
379 | " \n",
380 | " \n",
381 | " \n",
382 | " | 0 | \n",
383 | " 0195153448 | \n",
384 | " Classical Mythology | \n",
385 | " Mark P. O. Morford | \n",
386 | " 2002 | \n",
387 | " Oxford University Press | \n",
388 | "
\n",
389 | " \n",
390 | " | 1 | \n",
391 | " 0002005018 | \n",
392 | " Clara Callan | \n",
393 | " Richard Bruce Wright | \n",
394 | " 2001 | \n",
395 | " HarperFlamingo Canada | \n",
396 | "
\n",
397 | " \n",
398 | " | 2 | \n",
399 | " 0060973129 | \n",
400 | " Decision in Normandy | \n",
401 | " Carlo D'Este | \n",
402 | " 1991 | \n",
403 | " HarperPerennial | \n",
404 | "
\n",
405 | " \n",
406 | " | 3 | \n",
407 | " 0374157065 | \n",
408 | " Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It | \n",
409 | " Gina Bari Kolata | \n",
410 | " 1999 | \n",
411 | " Farrar Straus Giroux | \n",
412 | "
\n",
413 | " \n",
414 | " | 4 | \n",
415 | " 0393045218 | \n",
416 | " The Mummies of Urumchi | \n",
417 | " E. J. W. Barber | \n",
418 | " 1999 | \n",
419 | " W. W. Norton & Company | \n",
420 | "
\n",
421 | " \n",
422 | "
\n",
423 | "
"
424 | ],
425 | "text/plain": [
426 | " ISBN \\\n",
427 | "0 0195153448 \n",
428 | "1 0002005018 \n",
429 | "2 0060973129 \n",
430 | "3 0374157065 \n",
431 | "4 0393045218 \n",
432 | "\n",
433 | " bookTitle \\\n",
434 | "0 Classical Mythology \n",
435 | "1 Clara Callan \n",
436 | "2 Decision in Normandy \n",
437 | "3 Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It \n",
438 | "4 The Mummies of Urumchi \n",
439 | "\n",
440 | " bookAuthor yearOfPublication publisher \n",
441 | "0 Mark P. O. Morford 2002 Oxford University Press \n",
442 | "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
443 | "2 Carlo D'Este 1991 HarperPerennial \n",
444 | "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
445 | "4 E. J. W. Barber 1999 W. W. Norton & Company "
446 | ]
447 | },
448 | "execution_count": 184,
449 | "metadata": {},
450 | "output_type": "execute_result"
451 | }
452 | ],
453 | "source": [
454 | "#Now the books datasets looks like....\n",
455 | "books.head()"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 135,
461 | "metadata": {
462 | "scrolled": false
463 | },
464 | "outputs": [
465 | {
466 | "data": {
467 | "text/plain": [
468 | "ISBN object\n",
469 | "bookTitle object\n",
470 | "bookAuthor object\n",
471 | "yearOfPublication object\n",
472 | "publisher object\n",
473 | "dtype: object"
474 | ]
475 | },
476 | "execution_count": 135,
477 | "metadata": {},
478 | "output_type": "execute_result"
479 | }
480 | ],
481 | "source": [
482 | "#checking data types of columns\n",
483 | "books.dtypes"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": 188,
489 | "metadata": {
490 | "collapsed": true
491 | },
492 | "outputs": [],
493 | "source": [
494 | "#making this setting to display full text in columns\n",
495 | "pd.set_option('display.max_colwidth', -1)"
496 | ]
497 | },
498 | {
499 | "cell_type": "markdown",
500 | "metadata": {},
501 | "source": [
502 | "**yearOfPublication**"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 189,
508 | "metadata": {},
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/plain": [
513 | "array([2002L, 2001L, 1991L, 1999L, 2000L, 1993L, 1996L, 1988L, 2004L,\n",
514 | " 1998L, 1994L, 2003L, 1997L, 1983L, 1979L, 1995L, 1982L, 1985L,\n",
515 | " 1992L, 1986L, 1978L, 1980L, 1952L, 1987L, 1990L, 1981L, 1989L,\n",
516 | " 1984L, 0L, 1968L, 1961L, 1958L, 1974L, 1976L, 1971L, 1977L, 1975L,\n",
517 | " 1965L, 1941L, 1970L, 1962L, 1973L, 1972L, 1960L, 1966L, 1920L,\n",
518 | " 1956L, 1959L, 1953L, 1951L, 1942L, 1963L, 1964L, 1969L, 1954L,\n",
519 | " 1950L, 1967L, 2005L, 1957L, 1940L, 1937L, 1955L, 1946L, 1936L,\n",
520 | " 1930L, 2011L, 1925L, 1948L, 1943L, 1947L, 1945L, 1923L, 2020L,\n",
521 | " 1939L, 1926L, 1938L, 2030L, 1911L, 1904L, 1949L, 1932L, 1928L,\n",
522 | " 1929L, 1927L, 1931L, 1914L, 2050L, 1934L, 1910L, 1933L, 1902L,\n",
523 | " 1924L, 1921L, 1900L, 2038L, 2026L, 1944L, 1917L, 1901L, 2010L,\n",
524 | " 1908L, 1906L, 1935L, 1806L, 2021L, u'2000', u'1995', u'1999',\n",
525 | " u'2004', u'2003', u'1990', u'1994', u'1986', u'1989', u'2002',\n",
526 | " u'1981', u'1993', u'1983', u'1982', u'1976', u'1991', u'1977',\n",
527 | " u'1998', u'1992', u'1996', u'0', u'1997', u'2001', u'1974', u'1968',\n",
528 | " u'1987', u'1984', u'1988', u'1963', u'1956', u'1970', u'1985',\n",
529 | " u'1978', u'1973', u'1980', u'1979', u'1975', u'1969', u'1961',\n",
530 | " u'1965', u'1939', u'1958', u'1950', u'1953', u'1966', u'1971',\n",
531 | " u'1959', u'1972', u'1955', u'1957', u'1945', u'1960', u'1967',\n",
532 | " u'1932', u'1924', u'1964', u'2012', u'1911', u'1927', u'1948',\n",
533 | " u'1962', u'2006', u'1952', u'1940', u'1951', u'1931', u'1954',\n",
534 | " u'2005', u'1930', u'1941', u'1944', u'DK Publishing Inc', u'1943',\n",
535 | " u'1938', u'1900', u'1942', u'1923', u'1920', u'1933', u'Gallimard',\n",
536 | " u'1909', u'1946', u'2008', u'1378', u'2030', u'1936', u'1947',\n",
537 | " u'2011', u'2020', u'1919', u'1949', u'1922', u'1897', u'2024',\n",
538 | " u'1376', u'1926', u'2037'], dtype=object)"
539 | ]
540 | },
541 | "execution_count": 189,
542 | "metadata": {},
543 | "output_type": "execute_result"
544 | }
545 | ],
546 | "source": [
547 | "#yearOfPublication should be set as having dtype as int\n",
548 | "#checking the unique values of yearOfPublication\n",
549 | "books.yearOfPublication.unique()\n",
550 | "\n",
551 | "#as can be seen in the output below, there are some incorrect entries in this field. It looks like the publisher names \n",
552 | "#'DK Publishing Inc' and 'Gallimard' have been incorrectly loaded as yearOfPublication due to some errors in the csv file\n",
553 | "#Also, some of the entries are strings, and the same years have been entered as numbers in other places"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 190,
559 | "metadata": {
560 | "scrolled": true
561 | },
562 | "outputs": [
563 | {
564 | "data": {
565 | "text/html": [
566 | "\n",
567 | "\n",
580 | "
\n",
581 | " \n",
582 | " \n",
583 | " | \n",
584 | " ISBN | \n",
585 | " bookTitle | \n",
586 | " bookAuthor | \n",
587 | " yearOfPublication | \n",
588 | " publisher | \n",
589 | "
\n",
590 | " \n",
591 | " \n",
592 | " \n",
593 | " | 209538 | \n",
594 | " 078946697X | \n",
595 | " DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\\\";Michael Teitelbaum\" | \n",
596 | " 2000 | \n",
597 | " DK Publishing Inc | \n",
598 | " http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg | \n",
599 | "
\n",
600 | " \n",
601 | " | 221678 | \n",
602 | " 0789466953 | \n",
603 | " DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\\\";James Buckley\" | \n",
604 | " 2000 | \n",
605 | " DK Publishing Inc | \n",
606 | " http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg | \n",
607 | "
\n",
608 | " \n",
609 | "
\n",
610 | "
"
611 | ],
612 | "text/plain": [
613 | " ISBN \\\n",
614 | "209538 078946697X \n",
615 | "221678 0789466953 \n",
616 | "\n",
617 | " bookTitle \\\n",
618 | "209538 DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\\\";Michael Teitelbaum\" \n",
619 | "221678 DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\\\";James Buckley\" \n",
620 | "\n",
621 | " bookAuthor yearOfPublication \\\n",
622 | "209538 2000 DK Publishing Inc \n",
623 | "221678 2000 DK Publishing Inc \n",
624 | "\n",
625 | " publisher \n",
626 | "209538 http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg \n",
627 | "221678 http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg "
628 | ]
629 | },
630 | "execution_count": 190,
631 | "metadata": {},
632 | "output_type": "execute_result"
633 | }
634 | ],
635 | "source": [
636 | "#investigating the rows having 'DK Publishing Inc' as yearOfPublication\n",
637 | "books.loc[books.yearOfPublication == 'DK Publishing Inc',:]"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 191,
643 | "metadata": {
644 | "collapsed": true
645 | },
646 | "outputs": [],
647 | "source": [
648 | "#From above, it is seen that bookAuthor is incorrectly loaded with bookTitle, hence making required corrections\n",
649 | "#ISBN '0789466953'\n",
650 | "books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000\n",
651 | "books.loc[books.ISBN == '0789466953','bookAuthor'] = \"James Buckley\"\n",
652 | "books.loc[books.ISBN == '0789466953','publisher'] = \"DK Publishing Inc\"\n",
653 | "books.loc[books.ISBN == '0789466953','bookTitle'] = \"DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\""
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 192,
659 | "metadata": {
660 | "collapsed": true
661 | },
662 | "outputs": [],
663 | "source": [
664 | "#ISBN '078946697X'\n",
665 | "books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000\n",
666 | "books.loc[books.ISBN == '078946697X','bookAuthor'] = \"Michael Teitelbaum\"\n",
667 | "books.loc[books.ISBN == '078946697X','publisher'] = \"DK Publishing Inc\"\n",
668 | "books.loc[books.ISBN == '078946697X','bookTitle'] = \"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\""
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 193,
674 | "metadata": {},
675 | "outputs": [
676 | {
677 | "data": {
678 | "text/html": [
679 | "\n",
680 | "\n",
693 | "
\n",
694 | " \n",
695 | " \n",
696 | " | \n",
697 | " ISBN | \n",
698 | " bookTitle | \n",
699 | " bookAuthor | \n",
700 | " yearOfPublication | \n",
701 | " publisher | \n",
702 | "
\n",
703 | " \n",
704 | " \n",
705 | " \n",
706 | " | 209538 | \n",
707 | " 078946697X | \n",
708 | " DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers) | \n",
709 | " Michael Teitelbaum | \n",
710 | " 2000 | \n",
711 | " DK Publishing Inc | \n",
712 | "
\n",
713 | " \n",
714 | " | 221678 | \n",
715 | " 0789466953 | \n",
716 | " DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers) | \n",
717 | " James Buckley | \n",
718 | " 2000 | \n",
719 | " DK Publishing Inc | \n",
720 | "
\n",
721 | " \n",
722 | "
\n",
723 | "
"
724 | ],
725 | "text/plain": [
726 | " ISBN \\\n",
727 | "209538 078946697X \n",
728 | "221678 0789466953 \n",
729 | "\n",
730 | " bookTitle \\\n",
731 | "209538 DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers) \n",
732 | "221678 DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers) \n",
733 | "\n",
734 | " bookAuthor yearOfPublication publisher \n",
735 | "209538 Michael Teitelbaum 2000 DK Publishing Inc \n",
736 | "221678 James Buckley 2000 DK Publishing Inc "
737 | ]
738 | },
739 | "execution_count": 193,
740 | "metadata": {},
741 | "output_type": "execute_result"
742 | }
743 | ],
744 | "source": [
745 | "#rechecking\n",
746 | "books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'),:]\n",
747 | "#corrections done"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 194,
753 | "metadata": {
754 | "scrolled": true
755 | },
756 | "outputs": [
757 | {
758 | "data": {
759 | "text/html": [
760 | "\n",
761 | "\n",
774 | "
\n",
775 | " \n",
776 | " \n",
777 | " | \n",
778 | " ISBN | \n",
779 | " bookTitle | \n",
780 | " bookAuthor | \n",
781 | " yearOfPublication | \n",
782 | " publisher | \n",
783 | "
\n",
784 | " \n",
785 | " \n",
786 | " \n",
787 | " | 220731 | \n",
788 | " 2070426769 | \n",
789 | " Peuple du ciel, suivi de 'Les Bergers\\\";Jean-Marie Gustave Le Cl�©zio\" | \n",
790 | " 2003 | \n",
791 | " Gallimard | \n",
792 | " http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg | \n",
793 | "
\n",
794 | " \n",
795 | "
\n",
796 | "
"
797 | ],
798 | "text/plain": [
799 | " ISBN \\\n",
800 | "220731 2070426769 \n",
801 | "\n",
802 | " bookTitle \\\n",
803 | "220731 Peuple du ciel, suivi de 'Les Bergers\\\";Jean-Marie Gustave Le Cl�©zio\" \n",
804 | "\n",
805 | " bookAuthor yearOfPublication \\\n",
806 | "220731 2003 Gallimard \n",
807 | "\n",
808 | " publisher \n",
809 | "220731 http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg "
810 | ]
811 | },
812 | "execution_count": 194,
813 | "metadata": {},
814 | "output_type": "execute_result"
815 | }
816 | ],
817 | "source": [
818 | "#investigating the rows having 'Gallimard' as yearOfPublication\n",
819 | "books.loc[books.yearOfPublication == 'Gallimard',:]"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": 195,
825 | "metadata": {
826 | "collapsed": true
827 | },
828 | "outputs": [],
829 | "source": [
830 | "#making required corrections as above, keeping other fields intact\n",
831 | "books.loc[books.ISBN == '2070426769','yearOfPublication'] = 2003\n",
832 | "books.loc[books.ISBN == '2070426769','bookAuthor'] = \"Jean-Marie Gustave Le Cl�©zio\"\n",
833 | "books.loc[books.ISBN == '2070426769','publisher'] = \"Gallimard\"\n",
834 | "books.loc[books.ISBN == '2070426769','bookTitle'] = \"Peuple du ciel, suivi de 'Les Bergers\""
835 | ]
836 | },
837 | {
838 | "cell_type": "code",
839 | "execution_count": 196,
840 | "metadata": {
841 | "scrolled": true
842 | },
843 | "outputs": [
844 | {
845 | "data": {
846 | "text/html": [
847 | "\n",
848 | "\n",
861 | "
\n",
862 | " \n",
863 | " \n",
864 | " | \n",
865 | " ISBN | \n",
866 | " bookTitle | \n",
867 | " bookAuthor | \n",
868 | " yearOfPublication | \n",
869 | " publisher | \n",
870 | "
\n",
871 | " \n",
872 | " \n",
873 | " \n",
874 | " | 220731 | \n",
875 | " 2070426769 | \n",
876 | " Peuple du ciel, suivi de 'Les Bergers | \n",
877 | " Jean-Marie Gustave Le Cl�©zio | \n",
878 | " 2003 | \n",
879 | " Gallimard | \n",
880 | "
\n",
881 | " \n",
882 | "
\n",
883 | "
"
884 | ],
885 | "text/plain": [
886 | " ISBN bookTitle \\\n",
887 | "220731 2070426769 Peuple du ciel, suivi de 'Les Bergers \n",
888 | "\n",
889 | " bookAuthor yearOfPublication publisher \n",
890 | "220731 Jean-Marie Gustave Le Cl�©zio 2003 Gallimard "
891 | ]
892 | },
893 | "execution_count": 196,
894 | "metadata": {},
895 | "output_type": "execute_result"
896 | }
897 | ],
898 | "source": [
899 | "#rechecking\n",
900 | "books.loc[books.ISBN == '2070426769',:]\n",
901 | "#corrections done"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 197,
907 | "metadata": {
908 | "collapsed": true
909 | },
910 | "outputs": [],
911 | "source": [
912 | "#Correcting the dtypes of yearOfPublication\n",
913 | "books.yearOfPublication=pd.to_numeric(books.yearOfPublication, errors='coerce')"
914 | ]
915 | },
916 | {
917 | "cell_type": "code",
918 | "execution_count": 198,
919 | "metadata": {},
920 | "outputs": [
921 | {
922 | "name": "stdout",
923 | "output_type": "stream",
924 | "text": [
925 | "[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]\n"
926 | ]
927 | }
928 | ],
929 | "source": [
930 | "print sorted(books['yearOfPublication'].unique())\n",
931 | "#Now it can be seen that yearOfPublication has all values as integers"
932 | ]
933 | },
934 | {
935 | "cell_type": "code",
936 | "execution_count": 199,
937 | "metadata": {
938 | "collapsed": true
939 | },
940 | "outputs": [],
941 | "source": [
942 | "#However, the value 0 is invalid and as this dataset was published in 2004, I have assumed the years after 2006 to be \n",
943 | "#invalid, keeping some margin in case the dataset was updated thereafter\n",
944 | "#setting invalid years as NaN (np.nan is the canonical NumPy spelling of the constant)\n",
945 | "books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.nan"
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": 200,
951 | "metadata": {
952 | "collapsed": true
953 | },
954 | "outputs": [],
955 | "source": [
956 | "#replacing NaNs with the rounded mean of yearOfPublication; the result is assigned back to the column instead of using inplace=True on the attribute-accessed Series\n",
957 | "books['yearOfPublication'] = books['yearOfPublication'].fillna(round(books['yearOfPublication'].mean()))"
958 | ]
959 | },
960 | {
961 | "cell_type": "code",
962 | "execution_count": 201,
963 | "metadata": {},
964 | "outputs": [
965 | {
966 | "data": {
967 | "text/plain": [
968 | "0"
969 | ]
970 | },
971 | "execution_count": 201,
972 | "metadata": {},
973 | "output_type": "execute_result"
974 | }
975 | ],
976 | "source": [
977 | "#rechecking\n",
978 | "books.yearOfPublication.isnull().sum()\n",
979 | "#No NaNs"
980 | ]
981 | },
982 | {
983 | "cell_type": "code",
984 | "execution_count": 202,
985 | "metadata": {
986 | "collapsed": true
987 | },
988 | "outputs": [],
989 | "source": [
990 | "#resetting the dtype as int32\n",
991 | "books.yearOfPublication = books.yearOfPublication.astype(np.int32)"
992 | ]
993 | },
994 | {
995 | "cell_type": "markdown",
996 | "metadata": {},
997 | "source": [
998 | "**publisher**"
999 | ]
1000 | },
1001 | {
1002 | "cell_type": "code",
1003 | "execution_count": 203,
1004 | "metadata": {},
1005 | "outputs": [
1006 | {
1007 | "data": {
1008 | "text/html": [
1009 | "\n",
1010 | "\n",
1023 | "
\n",
1024 | " \n",
1025 | " \n",
1026 | " | \n",
1027 | " ISBN | \n",
1028 | " bookTitle | \n",
1029 | " bookAuthor | \n",
1030 | " yearOfPublication | \n",
1031 | " publisher | \n",
1032 | "
\n",
1033 | " \n",
1034 | " \n",
1035 | " \n",
1036 | " | 128890 | \n",
1037 | " 193169656X | \n",
1038 | " Tyrant Moon | \n",
1039 | " Elaine Corvidae | \n",
1040 | " 2002 | \n",
1041 | " NaN | \n",
1042 | "
\n",
1043 | " \n",
1044 | " | 129037 | \n",
1045 | " 1931696993 | \n",
1046 | " Finders Keepers | \n",
1047 | " Linnea Sinclair | \n",
1048 | " 2001 | \n",
1049 | " NaN | \n",
1050 | "
\n",
1051 | " \n",
1052 | "
\n",
1053 | "
"
1054 | ],
1055 | "text/plain": [
1056 | " ISBN bookTitle bookAuthor yearOfPublication \\\n",
1057 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 \n",
1058 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n",
1059 | "\n",
1060 | " publisher \n",
1061 | "128890 NaN \n",
1062 | "129037 NaN "
1063 | ]
1064 | },
1065 | "execution_count": 203,
1066 | "metadata": {},
1067 | "output_type": "execute_result"
1068 | }
1069 | ],
1070 | "source": [
1071 | "#exploring 'publisher' column\n",
1072 | "books.loc[books.publisher.isnull(),:]\n",
1073 | "#two NaNs"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "code",
1078 | "execution_count": 204,
1079 | "metadata": {
1080 | "scrolled": true
1081 | },
1082 | "outputs": [
1083 | {
1084 | "data": {
1085 | "text/html": [
1086 | "\n",
1087 | "\n",
1100 | "
\n",
1101 | " \n",
1102 | " \n",
1103 | " | \n",
1104 | " ISBN | \n",
1105 | " bookTitle | \n",
1106 | " bookAuthor | \n",
1107 | " yearOfPublication | \n",
1108 | " publisher | \n",
1109 | "
\n",
1110 | " \n",
1111 | " \n",
1112 | " \n",
1113 | " | 128890 | \n",
1114 | " 193169656X | \n",
1115 | " Tyrant Moon | \n",
1116 | " Elaine Corvidae | \n",
1117 | " 2002 | \n",
1118 | " NaN | \n",
1119 | "
\n",
1120 | " \n",
1121 | "
\n",
1122 | "
"
1123 | ],
1124 | "text/plain": [
1125 | " ISBN bookTitle bookAuthor yearOfPublication publisher\n",
1126 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 NaN "
1127 | ]
1128 | },
1129 | "execution_count": 204,
1130 | "metadata": {},
1131 | "output_type": "execute_result"
1132 | }
1133 | ],
1134 | "source": [
1135 | "#investigating rows having NaNs\n",
1136 | "#Checking with rows having bookTitle as Tyrant Moon to see if we can get any clues\n",
1137 | "books.loc[(books.bookTitle == 'Tyrant Moon'),:]\n",
1138 | "#no clues"
1139 | ]
1140 | },
1141 | {
1142 | "cell_type": "code",
1143 | "execution_count": 205,
1144 | "metadata": {},
1145 | "outputs": [
1146 | {
1147 | "data": {
1148 | "text/html": [
1149 | "\n",
1150 | "\n",
1163 | "
\n",
1164 | " \n",
1165 | " \n",
1166 | " | \n",
1167 | " ISBN | \n",
1168 | " bookTitle | \n",
1169 | " bookAuthor | \n",
1170 | " yearOfPublication | \n",
1171 | " publisher | \n",
1172 | "
\n",
1173 | " \n",
1174 | " \n",
1175 | " \n",
1176 | " | 10799 | \n",
1177 | " 082177364X | \n",
1178 | " Finders Keepers | \n",
1179 | " Fern Michaels | \n",
1180 | " 2002 | \n",
1181 | " Zebra Books | \n",
1182 | "
\n",
1183 | " \n",
1184 | " | 42019 | \n",
1185 | " 0070465037 | \n",
1186 | " Finders Keepers | \n",
1187 | " Barbara Nickolae | \n",
1188 | " 1989 | \n",
1189 | " McGraw-Hill Companies | \n",
1190 | "
\n",
1191 | " \n",
1192 | " | 58264 | \n",
1193 | " 0688118461 | \n",
1194 | " Finders Keepers | \n",
1195 | " Emily Rodda | \n",
1196 | " 1993 | \n",
1197 | " Harpercollins Juvenile Books | \n",
1198 | "
\n",
1199 | " \n",
1200 | " | 66678 | \n",
1201 | " 1575663236 | \n",
1202 | " Finders Keepers | \n",
1203 | " Fern Michaels | \n",
1204 | " 1998 | \n",
1205 | " Kensington Publishing Corporation | \n",
1206 | "
\n",
1207 | " \n",
1208 | " | 129037 | \n",
1209 | " 1931696993 | \n",
1210 | " Finders Keepers | \n",
1211 | " Linnea Sinclair | \n",
1212 | " 2001 | \n",
1213 | " NaN | \n",
1214 | "
\n",
1215 | " \n",
1216 | " | 134309 | \n",
1217 | " 0156309505 | \n",
1218 | " Finders Keepers | \n",
1219 | " Will | \n",
1220 | " 1989 | \n",
1221 | " Voyager Books | \n",
1222 | "
\n",
1223 | " \n",
1224 | " | 173473 | \n",
1225 | " 0973146907 | \n",
1226 | " Finders Keepers | \n",
1227 | " Sean M. Costello | \n",
1228 | " 2002 | \n",
1229 | " Red Tower Publications | \n",
1230 | "
\n",
1231 | " \n",
1232 | " | 195885 | \n",
1233 | " 0061083909 | \n",
1234 | " Finders Keepers | \n",
1235 | " Sharon Sala | \n",
1236 | " 2003 | \n",
1237 | " HarperTorch | \n",
1238 | "
\n",
1239 | " \n",
1240 | " | 211874 | \n",
1241 | " 0373261160 | \n",
1242 | " Finders Keepers | \n",
1243 | " Elizabeth Travis | \n",
1244 | " 1993 | \n",
1245 | " Worldwide Library | \n",
1246 | "
\n",
1247 | " \n",
1248 | "
\n",
1249 | "
"
1250 | ],
1251 | "text/plain": [
1252 | " ISBN bookTitle bookAuthor yearOfPublication \\\n",
1253 | "10799 082177364X Finders Keepers Fern Michaels 2002 \n",
1254 | "42019 0070465037 Finders Keepers Barbara Nickolae 1989 \n",
1255 | "58264 0688118461 Finders Keepers Emily Rodda 1993 \n",
1256 | "66678 1575663236 Finders Keepers Fern Michaels 1998 \n",
1257 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n",
1258 | "134309 0156309505 Finders Keepers Will 1989 \n",
1259 | "173473 0973146907 Finders Keepers Sean M. Costello 2002 \n",
1260 | "195885 0061083909 Finders Keepers Sharon Sala 2003 \n",
1261 | "211874 0373261160 Finders Keepers Elizabeth Travis 1993 \n",
1262 | "\n",
1263 | " publisher \n",
1264 | "10799 Zebra Books \n",
1265 | "42019 McGraw-Hill Companies \n",
1266 | "58264 Harpercollins Juvenile Books \n",
1267 | "66678 Kensington Publishing Corporation \n",
1268 | "129037 NaN \n",
1269 | "134309 Voyager Books \n",
1270 | "173473 Red Tower Publications \n",
1271 | "195885 HarperTorch \n",
1272 | "211874 Worldwide Library "
1273 | ]
1274 | },
1275 | "execution_count": 205,
1276 | "metadata": {},
1277 | "output_type": "execute_result"
1278 | }
1279 | ],
1280 | "source": [
1281 | "#Checking with rows having bookTitle as Finders Keepers to see if we can get any clues\n",
1282 | "books.loc[(books.bookTitle == 'Finders Keepers'),:]\n",
1283 | "#all rows with different publisher and bookAuthor"
1284 | ]
1285 | },
1286 | {
1287 | "cell_type": "code",
1288 | "execution_count": 206,
1289 | "metadata": {},
1290 | "outputs": [
1291 | {
1292 | "data": {
1293 | "text/html": [
1294 | "\n",
1295 | "\n",
1308 | "
\n",
1309 | " \n",
1310 | " \n",
1311 | " | \n",
1312 | " ISBN | \n",
1313 | " bookTitle | \n",
1314 | " bookAuthor | \n",
1315 | " yearOfPublication | \n",
1316 | " publisher | \n",
1317 | "
\n",
1318 | " \n",
1319 | " \n",
1320 | " \n",
1321 | " | 126762 | \n",
1322 | " 1931696934 | \n",
1323 | " Winter's Orphans | \n",
1324 | " Elaine Corvidae | \n",
1325 | " 2001 | \n",
1326 | " Novelbooks | \n",
1327 | "
\n",
1328 | " \n",
1329 | " | 128890 | \n",
1330 | " 193169656X | \n",
1331 | " Tyrant Moon | \n",
1332 | " Elaine Corvidae | \n",
1333 | " 2002 | \n",
1334 | " NaN | \n",
1335 | "
\n",
1336 | " \n",
1337 | " | 129001 | \n",
1338 | " 0759901880 | \n",
1339 | " Wolfkin | \n",
1340 | " Elaine Corvidae | \n",
1341 | " 2001 | \n",
1342 | " Hard Shell Word Factory | \n",
1343 | "
\n",
1344 | " \n",
1345 | "
\n",
1346 | "
"
1347 | ],
1348 | "text/plain": [
1349 | " ISBN bookTitle bookAuthor yearOfPublication \\\n",
1350 | "126762 1931696934 Winter's Orphans Elaine Corvidae 2001 \n",
1351 | "128890 193169656X Tyrant Moon Elaine Corvidae 2002 \n",
1352 | "129001 0759901880 Wolfkin Elaine Corvidae 2001 \n",
1353 | "\n",
1354 | " publisher \n",
1355 | "126762 Novelbooks \n",
1356 | "128890 NaN \n",
1357 | "129001 Hard Shell Word Factory "
1358 | ]
1359 | },
1360 | "execution_count": 206,
1361 | "metadata": {},
1362 | "output_type": "execute_result"
1363 | }
1364 | ],
1365 | "source": [
1366 | "#checking by bookAuthor to find patterns\n",
1367 | "books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]\n",
1368 | "#all having different publisher...no clues here"
1369 | ]
1370 | },
1371 | {
1372 | "cell_type": "code",
1373 | "execution_count": 207,
1374 | "metadata": {},
1375 | "outputs": [
1376 | {
1377 | "data": {
1378 | "text/html": [
1379 | "\n",
1380 | "\n",
1393 | "
\n",
1394 | " \n",
1395 | " \n",
1396 | " | \n",
1397 | " ISBN | \n",
1398 | " bookTitle | \n",
1399 | " bookAuthor | \n",
1400 | " yearOfPublication | \n",
1401 | " publisher | \n",
1402 | "
\n",
1403 | " \n",
1404 | " \n",
1405 | " \n",
1406 | " | 129037 | \n",
1407 | " 1931696993 | \n",
1408 | " Finders Keepers | \n",
1409 | " Linnea Sinclair | \n",
1410 | " 2001 | \n",
1411 | " NaN | \n",
1412 | "
\n",
1413 | " \n",
1414 | "
\n",
1415 | "
"
1416 | ],
1417 | "text/plain": [
1418 | " ISBN bookTitle bookAuthor yearOfPublication \\\n",
1419 | "129037 1931696993 Finders Keepers Linnea Sinclair 2001 \n",
1420 | "\n",
1421 | " publisher \n",
1422 | "129037 NaN "
1423 | ]
1424 | },
1425 | "execution_count": 207,
1426 | "metadata": {},
1427 | "output_type": "execute_result"
1428 | }
1429 | ],
1430 | "source": [
1431 | "#checking by bookAuthor to find patterns\n",
1432 | "books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]"
1433 | ]
1434 | },
1435 | {
1436 | "cell_type": "code",
1437 | "execution_count": 208,
1438 | "metadata": {
1439 | "collapsed": true
1440 | },
1441 | "outputs": [],
1442 | "source": [
1443 | "#since there is nothing in common to infer publisher for NaNs, replacing these with 'other'\n",
1444 | "books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'\n",
1445 | "books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'"
1446 | ]
1447 | },
1448 | {
1449 | "cell_type": "markdown",
1450 | "metadata": {},
1451 | "source": [
1452 | "**Users**"
1453 | ]
1454 | },
1455 | {
1456 | "cell_type": "code",
1457 | "execution_count": 209,
1458 | "metadata": {
1459 | "scrolled": true
1460 | },
1461 | "outputs": [
1462 | {
1463 | "name": "stdout",
1464 | "output_type": "stream",
1465 | "text": [
1466 | "(278858, 3)\n"
1467 | ]
1468 | },
1469 | {
1470 | "data": {
1471 | "text/html": [
1472 | "\n",
1473 | "\n",
1486 | "
\n",
1487 | " \n",
1488 | " \n",
1489 | " | \n",
1490 | " userID | \n",
1491 | " Location | \n",
1492 | " Age | \n",
1493 | "
\n",
1494 | " \n",
1495 | " \n",
1496 | " \n",
1497 | " | 0 | \n",
1498 | " 1 | \n",
1499 | " nyc, new york, usa | \n",
1500 | " NaN | \n",
1501 | "
\n",
1502 | " \n",
1503 | " | 1 | \n",
1504 | " 2 | \n",
1505 | " stockton, california, usa | \n",
1506 | " 18.0 | \n",
1507 | "
\n",
1508 | " \n",
1509 | " | 2 | \n",
1510 | " 3 | \n",
1511 | " moscow, yukon territory, russia | \n",
1512 | " NaN | \n",
1513 | "
\n",
1514 | " \n",
1515 | " | 3 | \n",
1516 | " 4 | \n",
1517 | " porto, v.n.gaia, portugal | \n",
1518 | " 17.0 | \n",
1519 | "
\n",
1520 | " \n",
1521 | " | 4 | \n",
1522 | " 5 | \n",
1523 | " farnborough, hants, united kingdom | \n",
1524 | " NaN | \n",
1525 | "
\n",
1526 | " \n",
1527 | "
\n",
1528 | "
"
1529 | ],
1530 | "text/plain": [
1531 | " userID Location Age\n",
1532 | "0 1 nyc, new york, usa NaN \n",
1533 | "1 2 stockton, california, usa 18.0\n",
1534 | "2 3 moscow, yukon territory, russia NaN \n",
1535 | "3 4 porto, v.n.gaia, portugal 17.0\n",
1536 | "4 5 farnborough, hants, united kingdom NaN "
1537 | ]
1538 | },
1539 | "execution_count": 209,
1540 | "metadata": {},
1541 | "output_type": "execute_result"
1542 | }
1543 | ],
1544 | "source": [
1545 | "print users.shape\n",
1546 | "users.head()"
1547 | ]
1548 | },
1549 | {
1550 | "cell_type": "code",
1551 | "execution_count": 210,
1552 | "metadata": {
1553 | "scrolled": true
1554 | },
1555 | "outputs": [
1556 | {
1557 | "data": {
1558 | "text/plain": [
1559 | "userID int64 \n",
1560 | "Location object \n",
1561 | "Age float64\n",
1562 | "dtype: object"
1563 | ]
1564 | },
1565 | "execution_count": 210,
1566 | "metadata": {},
1567 | "output_type": "execute_result"
1568 | }
1569 | ],
1570 | "source": [
1571 | "users.dtypes"
1572 | ]
1573 | },
1574 | {
1575 | "cell_type": "markdown",
1576 | "metadata": {},
1577 | "source": [
1578 | "**userID**"
1579 | ]
1580 | },
1581 | {
1582 | "cell_type": "code",
1583 | "execution_count": 211,
1584 | "metadata": {},
1585 | "outputs": [
1586 | {
1587 | "data": {
1588 | "text/plain": [
1589 | "array([ 1, 2, 3, ..., 278856, 278857, 278858], dtype=int64)"
1590 | ]
1591 | },
1592 | "execution_count": 211,
1593 | "metadata": {},
1594 | "output_type": "execute_result"
1595 | }
1596 | ],
1597 | "source": [
1598 | "users.userID.values\n",
1599 | "#it can be seen that these are unique"
1600 | ]
1601 | },
1602 | {
1603 | "cell_type": "markdown",
1604 | "metadata": {},
1605 | "source": [
1606 | "**Age**"
1607 | ]
1608 | },
1609 | {
1610 | "cell_type": "code",
1611 | "execution_count": 212,
1612 | "metadata": {},
1613 | "outputs": [
1614 | {
1615 | "name": "stdout",
1616 | "output_type": "stream",
1617 | "text": [
1618 | "[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0, 228.0, 229.0, 230.0, 231.0, 237.0, 239.0, 244.0]\n"
1619 | ]
1620 | }
1621 | ],
1622 | "source": [
1623 | "print sorted(users.Age.unique())\n",
1624 | "#Age column has some invalid entries like nan, 0 and very high values like 100 and above"
1625 | ]
1626 | },
1627 | {
1628 | "cell_type": "code",
1629 | "execution_count": 213,
1630 | "metadata": {
1631 | "collapsed": true
1632 | },
1633 | "outputs": [],
1634 | "source": [
1635 | "#In my view values below 5 and above 90 do not make much sense for our book rating case...hence replacing these by NaNs\n",
1636 | "users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan"
1637 | ]
1638 | },
1639 | {
1640 | "cell_type": "code",
1641 | "execution_count": 214,
1642 | "metadata": {
1643 | "collapsed": true
1644 | },
1645 | "outputs": [],
1646 | "source": [
1647 | "#replacing NaNs with mean\n",
1648 | "users.Age = users.Age.fillna(users.Age.mean())"
1649 | ]
1650 | },
1651 | {
1652 | "cell_type": "code",
1653 | "execution_count": 215,
1654 | "metadata": {
1655 | "collapsed": true
1656 | },
1657 | "outputs": [],
1658 | "source": [
1659 | "#setting the data type as int\n",
1660 | "users.Age = users.Age.astype(np.int32)"
1661 | ]
1662 | },
1663 | {
1664 | "cell_type": "code",
1665 | "execution_count": 216,
1666 | "metadata": {
1667 | "scrolled": true
1668 | },
1669 | "outputs": [
1670 | {
1671 | "name": "stdout",
1672 | "output_type": "stream",
1673 | "text": [
1674 | "[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]\n"
1675 | ]
1676 | }
1677 | ],
1678 | "source": [
1679 | "#rechecking\n",
1680 | "print sorted(users.Age.unique())\n",
1681 | "#looks good now"
1682 | ]
1683 | },
1684 | {
1685 | "cell_type": "markdown",
1686 | "metadata": {},
1687 | "source": [
1688 | "**Ratings Dataset**"
1689 | ]
1690 | },
1691 | {
1692 | "cell_type": "code",
1693 | "execution_count": 217,
1694 | "metadata": {},
1695 | "outputs": [
1696 | {
1697 | "data": {
1698 | "text/plain": [
1699 | "(1149780, 3)"
1700 | ]
1701 | },
1702 | "execution_count": 217,
1703 | "metadata": {},
1704 | "output_type": "execute_result"
1705 | }
1706 | ],
1707 | "source": [
1708 | "#checking shape\n",
1709 | "ratings.shape"
1710 | ]
1711 | },
1712 | {
1713 | "cell_type": "code",
1714 | "execution_count": 218,
1715 | "metadata": {},
1716 | "outputs": [
1717 | {
1718 | "name": "stdout",
1719 | "output_type": "stream",
1720 | "text": [
1721 | "75670906880\n"
1722 | ]
1723 | }
1724 | ],
1725 | "source": [
1726 | "#ratings dataset will have n_users*n_books entries if every user rated every item, this shows that the dataset is very sparse\n",
1727 | "n_users = users.shape[0]\n",
1728 | "n_books = books.shape[0]\n",
1729 | "print n_users * n_books"
1730 | ]
1731 | },
1732 | {
1733 | "cell_type": "code",
1734 | "execution_count": 219,
1735 | "metadata": {
1736 | "scrolled": true
1737 | },
1738 | "outputs": [
1739 | {
1740 | "data": {
1741 | "text/html": [
1742 | "\n",
1743 | "\n",
1756 | "
\n",
1757 | " \n",
1758 | " \n",
1759 | " | \n",
1760 | " userID | \n",
1761 | " ISBN | \n",
1762 | " bookRating | \n",
1763 | "
\n",
1764 | " \n",
1765 | " \n",
1766 | " \n",
1767 | " | 0 | \n",
1768 | " 276725 | \n",
1769 | " 034545104X | \n",
1770 | " 0 | \n",
1771 | "
\n",
1772 | " \n",
1773 | " | 1 | \n",
1774 | " 276726 | \n",
1775 | " 0155061224 | \n",
1776 | " 5 | \n",
1777 | "
\n",
1778 | " \n",
1779 | " | 2 | \n",
1780 | " 276727 | \n",
1781 | " 0446520802 | \n",
1782 | " 0 | \n",
1783 | "
\n",
1784 | " \n",
1785 | " | 3 | \n",
1786 | " 276729 | \n",
1787 | " 052165615X | \n",
1788 | " 3 | \n",
1789 | "
\n",
1790 | " \n",
1791 | " | 4 | \n",
1792 | " 276729 | \n",
1793 | " 0521795028 | \n",
1794 | " 6 | \n",
1795 | "
\n",
1796 | " \n",
1797 | "
\n",
1798 | "
"
1799 | ],
1800 | "text/plain": [
1801 | " userID ISBN bookRating\n",
1802 | "0 276725 034545104X 0 \n",
1803 | "1 276726 0155061224 5 \n",
1804 | "2 276727 0446520802 0 \n",
1805 | "3 276729 052165615X 3 \n",
1806 | "4 276729 0521795028 6 "
1807 | ]
1808 | },
1809 | "execution_count": 219,
1810 | "metadata": {},
1811 | "output_type": "execute_result"
1812 | }
1813 | ],
1814 | "source": [
1815 | "#checking first few rows...\n",
1816 | "ratings.head(5)"
1817 | ]
1818 | },
1819 | {
1820 | "cell_type": "code",
1821 | "execution_count": 220,
1822 | "metadata": {},
1823 | "outputs": [
1824 | {
1825 | "data": {
1826 | "text/plain": [
1827 | "array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)"
1828 | ]
1829 | },
1830 | "execution_count": 220,
1831 | "metadata": {},
1832 | "output_type": "execute_result"
1833 | }
1834 | ],
1835 | "source": [
1836 | "ratings.bookRating.unique()"
1837 | ]
1838 | },
1839 | {
1840 | "cell_type": "code",
1841 | "execution_count": 221,
1842 | "metadata": {
1843 | "collapsed": true
1844 | },
1845 | "outputs": [],
1846 | "source": [
1847 | "#ratings dataset should have only books which exist in our books dataset, unless new books are added to books dataset\n",
1848 | "ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]"
1849 | ]
1850 | },
1851 | {
1852 | "cell_type": "code",
1853 | "execution_count": 222,
1854 | "metadata": {},
1855 | "outputs": [
1856 | {
1857 | "name": "stdout",
1858 | "output_type": "stream",
1859 | "text": [
1860 | "(1149780, 3)\n",
1861 | "(1031136, 3)\n"
1862 | ]
1863 | }
1864 | ],
1865 | "source": [
1866 | "print ratings.shape\n",
1867 | "print ratings_new.shape\n",
1868 | "#it can be seen that many rows having book ISBN not part of books dataset got dropped off"
1869 | ]
1870 | },
1871 | {
1872 | "cell_type": "code",
1873 | "execution_count": 223,
1874 | "metadata": {
1875 | "collapsed": true
1876 | },
1877 | "outputs": [],
1878 | "source": [
1879 | "#ratings dataset should have ratings from users which exist in users dataset, unless new users are added to users dataset\n",
1880 | "ratings = ratings[ratings.userID.isin(users.userID)]"
1881 | ]
1882 | },
1883 | {
1884 | "cell_type": "code",
1885 | "execution_count": 224,
1886 | "metadata": {},
1887 | "outputs": [
1888 | {
1889 | "name": "stdout",
1890 | "output_type": "stream",
1891 | "text": [
1892 | "(1149780, 3)\n",
1893 | "(1031136, 3)\n"
1894 | ]
1895 | }
1896 | ],
1897 | "source": [
1898 | "print ratings.shape\n",
1899 | "print ratings_new.shape\n",
1900 | "#no new users added, hence we will go with above dataset ratings_new (1031136, 3)"
1901 | ]
1902 | },
1903 | {
1904 | "cell_type": "code",
1905 | "execution_count": 225,
1906 | "metadata": {},
1907 | "outputs": [
1908 | {
1909 | "name": "stdout",
1910 | "output_type": "stream",
1911 | "text": [
1912 | "number of users: 278858\n",
1913 | "number of books: 271360\n"
1914 | ]
1915 | }
1916 | ],
1917 | "source": [
1918 | "print \"number of users: \" + str(n_users)\n",
1919 | "print \"number of books: \" + str(n_books)"
1920 | ]
1921 | },
1922 | {
1923 | "cell_type": "code",
1924 | "execution_count": 226,
1925 | "metadata": {},
1926 | "outputs": [
1927 | {
1928 | "name": "stdout",
1929 | "output_type": "stream",
1930 | "text": [
1931 | "The sparsity level of Book Crossing dataset is 99.9986373416 %\n"
1932 | ]
1933 | }
1934 | ],
1935 | "source": [
1936 | "#Sparsity of dataset in %\n",
1937 | "sparsity=1.0-len(ratings_new)/float(n_users*n_books)\n",
1938 | "print 'The sparsity level of Book Crossing dataset is ' + str(sparsity*100) + ' %'"
1939 | ]
1940 | },
1941 | {
1942 | "cell_type": "code",
1943 | "execution_count": 228,
1944 | "metadata": {},
1945 | "outputs": [
1946 | {
1947 | "data": {
1948 | "text/plain": [
1949 | "array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)"
1950 | ]
1951 | },
1952 | "execution_count": 228,
1953 | "metadata": {},
1954 | "output_type": "execute_result"
1955 | }
1956 | ],
1957 | "source": [
1958 | "#As quoted in the description of the dataset -\n",
1959 | "#BX-Book-Ratings contains the book rating information. Ratings are either explicit, expressed on a scale from 1-10 \n",
1960 | "#higher values denoting higher appreciation, or implicit, expressed by 0\n",
1961 | "ratings.bookRating.unique()"
1962 | ]
1963 | },
1964 | {
1965 | "cell_type": "code",
1966 | "execution_count": 229,
1967 | "metadata": {
1968 | "collapsed": true
1969 | },
1970 | "outputs": [],
1971 | "source": [
1972 | "#Hence segregating implicit and explicit ratings datasets\n",
1973 | "ratings_explicit = ratings_new[ratings_new.bookRating != 0]\n",
1974 | "ratings_implicit = ratings_new[ratings_new.bookRating == 0]"
1975 | ]
1976 | },
1977 | {
1978 | "cell_type": "code",
1979 | "execution_count": 230,
1980 | "metadata": {},
1981 | "outputs": [
1982 | {
1983 | "name": "stdout",
1984 | "output_type": "stream",
1985 | "text": [
1986 | "(1031136, 3)\n",
1987 | "(383842, 3)\n",
1988 | "(647294, 3)\n"
1989 | ]
1990 | }
1991 | ],
1992 | "source": [
1993 | "#checking shapes\n",
1994 | "print ratings_new.shape\n",
1995 | "print ratings_explicit.shape\n",
1996 | "print ratings_implicit.shape"
1997 | ]
1998 | },
1999 | {
2000 | "cell_type": "code",
2001 | "execution_count": 231,
2002 | "metadata": {
2003 | "scrolled": true
2004 | },
2005 | "outputs": [
2006 | {
2007 | "data": {
2008 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAf0AAAFXCAYAAACoS5cAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHsdJREFUeJzt3X9Y1fXh9/HX4Zc/OAeRpW3NUClZtoa/SGvXkXK51Gu3\n05la0K3utrViusLS8DcpkuMq0JVz2LK7LhSQ0pXXfTVbmoMMIy+uwNJs5SwVi9Do6hxEOMDn/uN7\n7VwyFU7Njwd4Px9/6YcP8DomPfkcjuc4LMuyBAAAur2QYA8AAABXBtEHAMAQRB8AAEMQfQAADEH0\nAQAwBNEHAMAQYcEeYLfaWk+wJwAAcEX16+e66HGu9AEAMATRBwDAEEQfAABDEH0AAAxB9AEAMATR\nBwDAEEQfAABDEH0AAAxB9AEAMATRBwDAEEQfAABDEH0AAAxB9AEAMES3f5U9ADBJ8b6zwZ7Qxkx3\n72BPwHm40gcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAA\nQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcA\nwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQB\nADAE0QcAwBBhdn1gn8+nxYsXq7q6WiEhIcrMzFRYWJgWL14sh8OhIUOGKCMjQyEhISouLlZRUZHC\nwsKUmpqqcePG6dy5c1q0aJHOnDmjyMhIZWdnKyYmRpWVlcrKylJoaKjcbrfmz59v100AAKBbse1K\nv6SkRM3NzSoqKtK8efO0fv16rV27VmlpaSooKJBlWdqzZ49qa2uVn5+voqIibd68Wbm5uWpqalJh\nYaHi4+NVUFCgqVOnauPGjZKkjIwM5eTkqLCwUFVVVTp8+LBdNwEAgG7FtugPHjxYLS0tam1tldfr\nVVhYmA4dOqTRo0dLkpKSklRWVqaDBw9qxIgRioiIkMvlUmxsrI4cOaKKigqNHTvWf+7+/fvl9XrV\n1NSk2NhYORwOud1ulZWV2XUTAADoVmy7e793796qrq7WpEmTVFdXp7y8PB04cEAOh0OSFBkZKY/H\nI6/XK5fL5X+/yMhIeb3eNsfPP9fpdLY598SJE+3u6Nu3t8LCQm24hQDQGZ0N9oA2+vVzdXwSrhjb\nov/CCy/I7Xbr0Ucf1eeff645c+bI5/P5315fX6+oqCg5nU7V19e3Oe5yudocb+/cqKiodnfU1XWu\nLwAAMEltrSfYE4x0qW+2bLt7Pyoqyn+l3qdPHzU3N+vGG29UeXm5JKm0tFSJiYlKSEhQRUWFGhsb\n5fF4dPToUcXHx2vkyJEqKSnxnztq1Cg5nU6Fh4fr+PHjsixL+/btU2Jiol03AQCAbsVhWZZlxweu\nr6/X0qVLVVtbK5/Pp9mzZ+umm27SihUr5PP5FBcXpzVr1ig0NFTFxcXatm2bLMvSAw88oAkTJqih\noUHp6emqra1VeHi4cnJy1K9fP1VWVuqJJ55QS0uL3G63FixY0O4OvssEYJLifZ3r3s2Z7t7BnmCk\nS13p2xb9zoLoAzAJ0YcUhLv3AQBA50L0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8A\nAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEH\nAMAQRB8AAEMQfQAADEH0AQAwRFiwBwAA0NW0bv1nsCf4hdwbH/i5Nu4AAACdCNEHAMAQRB8AAEMQ\nfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQ\nvLQuALTjt6VVwZ7g92zSsGBPQBfHl
T4AAIYg+gAAGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBg\nCKIPAIAhiD4AAIYg+gAAGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBgCKIPAIAhwuz84Js2bdKb\nb74pn8+n5ORkjR49WosXL5bD4dCQIUOUkZGhkJAQFRcXq6ioSGFhYUpNTdW4ceN07tw5LVq0SGfO\nnFFkZKSys7MVExOjyspKZWVlKTQ0VG63W/Pnz7fzJgAAbHby1YZgT2hjwJRewZ5gG9uu9MvLy/Xe\ne++psLBQ+fn5+uKLL7R27VqlpaWpoKBAlmVpz549qq2tVX5+voqKirR582bl5uaqqalJhYWFio+P\nV0FBgaZOnaqNGzdKkjIyMpSTk6PCwkJVVVXp8OHDdt0EAAC6Fduiv2/fPsXHx2vevHl68MEHdfvt\nt+vQoUMaPXq0JCkpKUllZWU6ePCgRowYoYiICLlcLsXGxurIkSOqqKjQ2LFj/efu379fXq9XTU1N\nio2NlcPhkNvtVllZmV03AQCAbsW2u/fr6up06tQp5eXl6eTJk0pNTZVlWXI4HJKkyMhIeTweeb1e\nuVwu//tFRkbK6/W2OX7+uU6ns825J06caHdH3769FRYWasMtBIArq18/V8cn6aztO76NQDafVOe6\nez+QzTVXYEegAvt78T9si350dLTi4uIUERGhuLg49ejRQ1988YX/7fX19YqKipLT6VR9fX2b4y6X\nq83x9s6Niopqd0ddXef6AgCA76q21hPsCd8am+13sb2X+kbAtrv3R40apbfeekuWZammpkYNDQ26\n9dZbVV5eLkkqLS1VYmKiEhISVFFRocbGRnk8Hh09elTx8fEaOXKkSkpK/OeOGjVKTqdT4eHhOn78\nuCzL0r59+5SYmGjXTQAAoFux7Up/3LhxOnDggKZPny7LsrRy5UoNGDBAK1asUG5uruLi4jRhwgSF\nhoZq1qxZSklJkWVZWrBggXr06KHk5GSlp6crOTlZ4eHhysnJkSStWrVKCxcuVEtLi9xut4YNG2bX\nTQAAoFtxWJZlBXuEnbra3TQAOpffllYFe4Lfs0kdX+QU7+tcP9Kc6e7d4Tld8Z/stW795xVYEpiQ\ne+MvOHbF794HAACdC9EHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEH\nAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADEH0AQAwBNEHAMAQRB8AAEMQfQAADBFQ\n9DMzMy84lp6eftnHAAAA+4S198Zly5bpxIkT+uCDD/Txxx/7jzc3N8vj8dg+DgAAXD7tRj81NVXV\n1dXKysrS/Pnz/cdDQ0N13XXX2T4OAABcPu1Gf8CAARowYIB27twpr9crj8cjy7IkSWfPnlV0dPQV\nGQkAAP577Ub/3zZt2qRNmza1ibzD4dCePXtsGwYAAC6vgKL/0ksvaffu3YqJibF7DwAAsElAj97/\nwQ9+oD59+ti9BQAA2CigK/1BgwYpJSVFY8aMUUREhP/4+Q/uAwAAnVtA0b/66qt19dVX270FAADY\nKKDoc0UPAEDXF1D0b7jhBjkcjjbH+vfvr5KSEltGAQCAyy+g6B85csT/a5/Pp927d6uystK2UQAA\n4PL71i+4Ex4erkmTJumdd96xYw8AALBJQFf6r7zyiv/XlmXp448/Vnh4uG2jAADA5RdQ9MvLy9v8\nvm/fvlq3bp0tgwAAgD0Civ7atWvl8/l07NgxtbS0aMiQIQoLC+hdAQBAJxFQuT/44AM99NBDio6O\nVmtrq06fPq0//elPGjZsmN37AADAZRJQ9NesWaN169b5I19ZWanMzEy9/PLLto4DAACXT0CP3j97\n9mybq/rhw4ersbHRtlEAAODyCyj6ffr00e7du/2/3717d5uX2QUAAJ1fQHfvZ2Zm6oEHHtCyZcv8\nx
4qKimwbBaBjc/b9MdgT2njR/XCwJwDoQEBX+qWlperVq5f27t2rF198UTExMXr33Xft3gYAAC6j\ngKJfXFyswsJC9e7dWzfccIN27NihLVu22L0NAABcRgFF3+fztXkGPp6NDwCAriegn+mPHz9ec+bM\n0aRJkyRJf//733XHHXfYOgwAAFxeAUV/0aJF2rVrlw4cOKCwsDDNnj1b48ePt3sbAAC4jAJ+Lt2J\nEydq4sSJdm4BAAA2+tYvrQsAALomog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhrA1+mfO\nnNFtt92mo0eP6rPPPlNycrJSUlKUkZGh1tZWSf/zvP7Tpk3TzJkztXfvXknSuXPn9Pvf/14pKSm6\n//779dVXX0mSKisrNWPGDN1zzz3asGGDndMBAOh2bIu+z+fTypUr1bNnT0nS2rVrlZaWpoKCAlmW\npT179qi2tlb5+fkqKirS5s2blZubq6amJhUWFio+Pl4FBQWaOnWqNm7cKEnKyMhQTk6OCgsLVVVV\npcOHD9s1HwCAbse26GdnZ+uee+5R//79JUmHDh3S6NGjJUlJSUkqKyvTwYMHNWLECEVERMjlcik2\nNlZHjhxRRUWFxo4d6z93//798nq9ampqUmxsrBwOh9xut8rKyuyaDwBAtxPw0/B+Gzt27FBMTIzG\njh2rZ599VpJkWZYcDockKTIyUh6PR16vVy6Xy/9+kZGR8nq9bY6ff67T6Wxz7okTJzrc0rdvb4WF\nhV7OmwfgIvr1c3V8Ev4rgf0Zn7V9x7cRyOaTargCSwIXyOaaK7AjUN/ma8+W6G/fvl0Oh0P79+/X\nhx9+qPT0dP/P5SWpvr5eUVFRcjqdqq+vb3Pc5XK1Od7euVFRUR1uqavrXF8AQHdVW+sJ9oRuryv+\nGbPZfhfbe6lvBGy5e3/r1q3asmWL8vPzNXToUGVnZyspKUnl5eWSpNLSUiUmJiohIUEVFRVqbGyU\nx+PR0aNHFR8fr5EjR6qkpMR/7qhRo+R0OhUeHq7jx4/Lsizt27dPiYmJdswHAKBbsuVK/2LS09O1\nYsUK5ebmKi4uThMmTFBoaKhmzZqllJQUWZalBQsWqEePHkpOTlZ6erqSk5MVHh6unJwcSdKqVau0\ncOFCtbS0yO12a9iwYVdqPgAAXZ7t0c/Pz/f/esuWLRe8febMmZo5c2abY7169dLTTz99wbnDhw9X\ncXHx5R8JAIABeHIeAAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAM\nQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAAQxB9AAAMQfQBADAE0QcAwBBEHwAA\nQ4QFewAAc/yfkp3BntDG/73tl8GeAFxRXOkDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGI\nPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAI\nog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAY\ngugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGCIMDs+qM/n09KlS1VdXa2mpialpqbq+uuv1+LFi+Vw\nODRkyBBlZGQoJCRExcXFKioqUlhYmFJTUzVu3DidO3dOixYt0pkzZxQZGans7GzFxMSosrJSWVlZ\nCg0Nldvt1vz58+2YDwBAt2TLlf7OnTsVHR2tgoICPffcc8rMzNTatWuVlpamgoICWZalPXv2qLa2\nVvn5+SoqKtLmzZuVm5urpqYmFRYWKj4+XgUFBZo6dao2btwoScrIyFBOTo4KCwtVVVWlw4cP2zEf\nAIBuyZboT5w4UQ8//LAkybIshYaG6tChQxo9erQkKSkpSWVlZTp
48KBGjBihiIgIuVwuxcbG6siR\nI6qoqNDYsWP95+7fv19er1dNTU2KjY2Vw+GQ2+1WWVmZHfMBAOiWbLl7PzIyUpLk9Xr10EMPKS0t\nTdnZ2XI4HP63ezweeb1euVyuNu/n9XrbHD//XKfT2ebcEydOdLilb9/eCgsLvZw3D8BF9Ovn6vik\nTqarbQ5s71nbd3wbgWw+qYYrsCRwgWyuuQI7AvVt/h7bEn1J+vzzzzVv3jylpKRo8uTJevLJJ/1v\nq6+vV1RUlJxOp+rr69scd7lcbY63d25UVFSHO+rqOtcXANBd1dZ6gj3hW+tqm7vaXonNV8LF9l7q\nGwFb7t4/ffq05s6dq0WLFmn69OmSpBtvvFHl5eWSpNLSUiUmJiohIUEVFRVqbGyUx+PR0aNHFR8f\nr5EjR6qkpMR/7qhRo+R0OhUeHq7jx4/Lsizt27dPiYmJdswHAKBbsuVKPy8vT9988402btzofxDe\nsmXLtGbNGuXm5iouLk4TJkxQaGioZs2apZSUFFmWpQULFqhHjx5KTk5Wenq6kpOTFR4erpycHEnS\nqlWrtHDhQrW0tMjtdmvYsGF2zAcAoFuyJfrLly/X8uXLLzi+ZcuWC47NnDlTM2fObHOsV69eevrp\npy84d/jw4SouLr58QwEAMAhPzgMAgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugD\nAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6\nAAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgugDAGAIog8AgCGI\nPgAAhiD6AAAYgugDAGAIog8AgCHCgj0A6Cz+tHd6sCf4zRv3crAnAOiGuNIHAMAQRB8AAEMQfQAA\nDEH0AQAwBNEHAMAQRB8AAEMQfQAADMG/04ctSv7fjGBPaOO2//VSsCcAQNBxpQ8AgCGIPgAAhiD6\nAAAYgugDAGAIog8AgCGIPgAAhiD6AAAYgn+n3wWcKf7fwZ7Qxvdmbgn2BADAd8CVPgAAhiD6AAAY\nwsy7919+NdgL2po+JdgLAAAG4EofAABDdLkr/dbWVj3++OP66KOPFBERoTVr1mjgwIHBngUAQKfX\n5a70d+/eraamJm3btk2PPvqo/vCHPwR7EgAAXUKXi35FRYXGjh0rSRo+fLg++OCDIC8CAKBrcFiW\nZQV7xLexbNky3XnnnbrtttskSbfffrt2796tsLAu95MKAACuqC53pe90OlVfX+//fWtrK8EHACAA\nXS76I0eOVGlpqSSpsrJS8fHxQV4EAEDX0OXu3v/3o/f/+c9/yrIsPfHEE7ruuuuCPQsAgE6vy0Uf\nAAB8N13u7n0AAPDdEH0AAAzBw96/o6qqKj311FPKz88P9pQO+Xw+LV26VNXV1WpqalJqaqruuOOO\nYM9qV0tLi5YvX65jx47J4XBo1apVXeJBm2fOnNG0adP0/PPPd4nHmvzqV7+S0+mUJA0YMEBr164N\n8qL2bdq0SW+++aZ8Pp+Sk5M1Y8aMYE9q144dO/TXv/5VktTY2KgPP/xQb7/9tqKiooK87NJ8Pp8W\nL16s6upqhYSEKDMzs1P/XW5qatKSJUt04sQJOZ1OrVy5UoMGDQr2rEs6vx2fffaZFi9eLIfDoSFD\nhigjI0MhIfZeixP97+Avf/mLdu7cqV69egV7SkB27typ6OhoPfnkk/r66681derUTh/9vXv3SpKK\niopUXl6udevW6c9//nOQV7XP5/Np5cqV6tmzZ7CnBKSxsVGWZXWJb1wlqby8XO+9954KCwvV0NCg\n559/PtiTOjRt2jRNmzZNkrRq1SrdddddnTr4klRSUqLm5mYVFRXp7bff1vr16/XMM88Ee9YlFRcX\nq3fv3iouLta//vUvZWZmavP
mzcGedVH/2Y61a9cqLS1NY8aM0cqVK7Vnzx79/Oc/t3UDd+9/B7Gx\nsZ36i+A/TZw4UQ8//LAkybIshYaGBnlRx8aPH6/MzExJ0qlTpzr9/yglKTs7W/fcc4/69+8f7CkB\nOXLkiBoaGjR37lzNnj1blZWVwZ7Urn379ik+Pl7z5s3Tgw8+qNtvvz3YkwL2/vvv65NPPtHdd98d\n7CkdGjx4sFpaWtTa2iqv19vpnwflk08+UVJSkiQpLi5OR48eDfKiS/vPdhw6dEijR4+WJCUlJams\nrMz2DZ37v2YnNWHCBJ08eTLYMwIWGRkpSfJ6vXrooYeUlpYW5EWBCQsLU3p6ut544w09/fTTwZ7T\nrh07digmJkZjx47Vs88+G+w5AenZs6fuu+8+zZgxQ59++qnuv/9+7dq1q9P+T76urk6nTp1SXl6e\nTp48qdTUVO3atUsOhyPY0zq0adMmzZs3L9gzAtK7d29VV1dr0qRJqqurU15eXrAntWvo0KHau3ev\nxo8fr6qqKtXU1KilpaVTXtz8Zzssy/L//Y2MjJTH47F9A1f6hvj88881e/ZsTZkyRZMnTw72nIBl\nZ2fr9ddf14oVK3T27Nlgz7mk7du3q6ysTLNmzdKHH36o9PR01dbWBntWuwYPHqxf/vKXcjgcGjx4\nsKKjozv15ujoaLndbkVERCguLk49evTQV199FexZHfrmm2907Ngx3XLLLcGeEpAXXnhBbrdbr7/+\nul599VUtXrxYjY2NwZ51SXfddZecTqdSUlL0xhtv6Mc//nGnDP7FnP/z+/r6+ityjybRN8Dp06c1\nd+5cLVq0SNOnTw/2nIC88sor2rRpkySpV69ecjgctj/A5b+xdetWbdmyRfn5+Ro6dKiys7PVr1+/\nYM9q18svv+x/lcqamhp5vd5OvXnUqFF66623ZFmWampq1NDQoOjo6GDP6tCBAwd06623BntGwKKi\nouRyuSRJffr0UXNzs1paWoK86tLef/993XrrrSosLNTEiRN17bXXBntSwG688UaVl5dLkkpLS5WY\nmGj75+yc9+PhssrLy9M333yjjRs3auPGjZL+5wElnfkBZ3feeaeWLFmie++9V83NzVq6dGmn3tsV\nTZ8+XUuWLFFycrIcDoeeeOKJTnvXviSNGzdOBw4c0PTp02VZllauXNklruiOHTumAQMGBHtGwH79\n619r6dKlSklJkc/n04IFC9S7d+9gz7qkgQMH6o9//KPy8vLkcrmUlZUV7EkBS09P14oVK5Sbm6u4\nuDhNmDDB9s/JM/IBAGCIznt/KQAAuKyIPgAAhiD6AAAYgugDAGAIog8AgCGIPmCQ8vJyzZo167/+\nOLNmzfL/++J/O3nypG666SZNmTLF/yRQP/vZzwJ6NsXzN02ZMuW/3gfg4jrvP8oF0OX0799fr776\nqv/3NTU1mjBhgn7xi1+0+0pt7777rv/X578/gMuL6AOGqaur03333acvv/xSCQkJysjI8L+aWmtr\nq6699lqtXr1aV111lSorK5WVlaXGxkb17dtXq1ev1sCBA/0f68yZM5ozZ47S0tJ0ww03XPC5amtr\nZVmWIiMj1dzcrMcff1wff/yxTp8+rcGDB2vDhg166qmnJEkzZszQSy+9pB/96Ef66KOP9Mwzz6im\npkafffaZqqurNWPGDKWmpsrn8ykjI0MVFRW6+uqr5XA49Lvf/U5jxoy5Yn+GQFdF9AHDnDx5Uhs2\nbNDAgQO1YMECPfvss9q2bZsKCws1YMAAPffcc1q9erWeeuopPfLII1q/fr0SEhL0t7/9TY888oi2\nb98uSfJ4PPrtb3+r+fPna/z48Tp58qS+/PJLTZkyRY2Njaqrq9NPfvITbdiwQd///vd14MABhYeH\na9u2bWptbdWcOXNUUlKi5cuXKz8/Xy+99NIFWz/66CNt3bpVHo9H48eP17333qtXX31VDQ0N2
rVr\nl06dOtWlXksCCDZ+pg8YJjExUYMGDZLD4dDkyZP14osvKiEhwf9UsXfffbfeeecdffrpp4qKilJC\nQoIkadKkSTp+/Lj/lcAyMjLU3NysO++80/+x/333/muvvaYpU6bI5/P5X2jm5ptvVkpKirZu3aqs\nrCx9+umnHb6I0pgxYxQREaHvfe97io6Olsfj0dtvv63JkyfL4XDohz/8YZd6Xnsg2Ig+YJjzn1//\n/Jf2PP9Yc3OzWltbL3hfy7L8L75y//33KyYmRoWFhRecFxISoscee0xnzpzR888/L0nas2ePFi5c\nqJ49e2ratGm6+eab1dGzgPfo0cP/a4fDIcuyFBoaetFtADpG9AHDVFRU6NSpU2ptbdUrr7yi3/zm\nN6qqqvK/zve2bds0ZswYxcXF6euvv9bBgwclSa+99pquueYa/yvbDR06VBkZGdqwYYNqamou+Dxh\nYWF67LHHlJeXp9raWu3fv1+TJk3SXXfdpauuukoHDhzwfwMRGhqq5ubmgPb/9Kc/1WuvveZ/tb13\n3333gm9cAFwcP9MHDHP99ddr6dKlqq2t1S233KL77rtP119/vebPny+fz6drrrlGWVlZioiI0Lp1\n65SZmamGhgb16dNH69ata/OxBg0apHvvvVerV6/WkiVLLvhcSUlJGj58uNavX6/Zs2dr4cKF2rVr\nlyIiIjR8+HD/Nxp33HGHpkyZoh07dnS4f+bMmTpy5IgmT56sfv366ZprruEVGIEA8Sp7ALqUf/zj\nH7IsS+PGjZPH49HUqVO1fft2/z0QAC6N6APoUk6cOKHHHnvM/yDAuXPn8oQ+QICIPgAAhuCBfAAA\nGILoAwBgCKIPAIAhiD4AAIYg+gAAGILoAwBgiP8PjIYQhxu3xXEAAAAASUVORK5CYII=\n",
2009 | "text/plain": [
2010 | ""
2011 | ]
2012 | },
2013 | "metadata": {},
2014 | "output_type": "display_data"
2015 | }
2016 | ],
2017 | "source": [
2018 | "#plotting count of bookRating\n",
2019 | "sns.countplot(data=ratings_explicit , x='bookRating')\n",
2020 | "plt.show()\n",
2021 | "#It can be seen that higher ratings are more common amongst users and that rating 8 has been given the highest number of times"
2022 | ]
2023 | },
2024 | {
2025 | "cell_type": "markdown",
2026 | "metadata": {},
2027 | "source": [
2028 | "**Simple Popularity Based Recommendation System**"
2029 | ]
2030 | },
2031 | {
2032 | "cell_type": "code",
2033 | "execution_count": 232,
2034 | "metadata": {
2035 | "scrolled": true
2036 | },
2037 | "outputs": [
2038 | {
2039 | "name": "stdout",
2040 | "output_type": "stream",
2041 | "text": [
2042 | "Following books are recommended\n"
2043 | ]
2044 | },
2045 | {
2046 | "data": {
2047 | "text/html": [
2048 | "\n",
2049 | "\n",
2062 | "
\n",
2063 | " \n",
2064 | " \n",
2065 | " | \n",
2066 | " bookRating | \n",
2067 | " ISBN | \n",
2068 | " bookTitle | \n",
2069 | " bookAuthor | \n",
2070 | " yearOfPublication | \n",
2071 | " publisher | \n",
2072 | "
\n",
2073 | " \n",
2074 | " \n",
2075 | " \n",
2076 | " | 408 | \n",
2077 | " 5787 | \n",
2078 | " 0316666343 | \n",
2079 | " The Lovely Bones: A Novel | \n",
2080 | " Alice Sebold | \n",
2081 | " 2002 | \n",
2082 | " Little, Brown | \n",
2083 | "
\n",
2084 | " \n",
2085 | " | 748 | \n",
2086 | " 4108 | \n",
2087 | " 0385504209 | \n",
2088 | " The Da Vinci Code | \n",
2089 | " Dan Brown | \n",
2090 | " 2003 | \n",
2091 | " Doubleday | \n",
2092 | "
\n",
2093 | " \n",
2094 | " | 522 | \n",
2095 | " 3134 | \n",
2096 | " 0312195516 | \n",
2097 | " The Red Tent (Bestselling Backlist) | \n",
2098 | " Anita Diamant | \n",
2099 | " 1998 | \n",
2100 | " Picador USA | \n",
2101 | "
\n",
2102 | " \n",
2103 | " | 2143 | \n",
2104 | " 2798 | \n",
2105 | " 059035342X | \n",
2106 | " Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) | \n",
2107 | " J. K. Rowling | \n",
2108 | " 1999 | \n",
2109 | " Arthur A. Levine Books | \n",
2110 | "
\n",
2111 | " \n",
2112 | " | 356 | \n",
2113 | " 2595 | \n",
2114 | " 0142001740 | \n",
2115 | " The Secret Life of Bees | \n",
2116 | " Sue Monk Kidd | \n",
2117 | " 2003 | \n",
2118 | " Penguin Books | \n",
2119 | "
\n",
2120 | " \n",
2121 | " | 26 | \n",
2122 | " 2551 | \n",
2123 | " 0971880107 | \n",
2124 | " Wild Animus | \n",
2125 | " Rich Shapero | \n",
2126 | " 2004 | \n",
2127 | " Too Far | \n",
2128 | "
\n",
2129 | " \n",
2130 | " | 1105 | \n",
2131 | " 2524 | \n",
2132 | " 0060928336 | \n",
2133 | " Divine Secrets of the Ya-Ya Sisterhood: A Novel | \n",
2134 | " Rebecca Wells | \n",
2135 | " 1997 | \n",
2136 | " Perennial | \n",
2137 | "
\n",
2138 | " \n",
2139 | " | 706 | \n",
2140 | " 2402 | \n",
2141 | " 0446672211 | \n",
2142 | " Where the Heart Is (Oprah's Book Club (Paperback)) | \n",
2143 | " Billie Letts | \n",
2144 | " 1998 | \n",
2145 | " Warner Books | \n",
2146 | "
\n",
2147 | " \n",
2148 | " | 231 | \n",
2149 | " 2219 | \n",
2150 | " 0452282152 | \n",
2151 | " Girl with a Pearl Earring | \n",
2152 | " Tracy Chevalier | \n",
2153 | " 2001 | \n",
2154 | " Plume Books | \n",
2155 | "
\n",
2156 | " \n",
2157 | " | 118 | \n",
2158 | " 2179 | \n",
2159 | " 0671027360 | \n",
2160 | " Angels & Demons | \n",
2161 | " Dan Brown | \n",
2162 | " 2001 | \n",
2163 | " Pocket Star | \n",
2164 | "
\n",
2165 | " \n",
2166 | "
\n",
2167 | "
"
2168 | ],
2169 | "text/plain": [
2170 | " bookRating ISBN \\\n",
2171 | "408 5787 0316666343 \n",
2172 | "748 4108 0385504209 \n",
2173 | "522 3134 0312195516 \n",
2174 | "2143 2798 059035342X \n",
2175 | "356 2595 0142001740 \n",
2176 | "26 2551 0971880107 \n",
2177 | "1105 2524 0060928336 \n",
2178 | "706 2402 0446672211 \n",
2179 | "231 2219 0452282152 \n",
2180 | "118 2179 0671027360 \n",
2181 | "\n",
2182 | " bookTitle \\\n",
2183 | "408 The Lovely Bones: A Novel \n",
2184 | "748 The Da Vinci Code \n",
2185 | "522 The Red Tent (Bestselling Backlist) \n",
2186 | "2143 Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) \n",
2187 | "356 The Secret Life of Bees \n",
2188 | "26 Wild Animus \n",
2189 | "1105 Divine Secrets of the Ya-Ya Sisterhood: A Novel \n",
2190 | "706 Where the Heart Is (Oprah's Book Club (Paperback)) \n",
2191 | "231 Girl with a Pearl Earring \n",
2192 | "118 Angels & Demons \n",
2193 | "\n",
2194 | " bookAuthor yearOfPublication publisher \n",
2195 | "408 Alice Sebold 2002 Little, Brown \n",
2196 | "748 Dan Brown 2003 Doubleday \n",
2197 | "522 Anita Diamant 1998 Picador USA \n",
2198 | "2143 J. K. Rowling 1999 Arthur A. Levine Books \n",
2199 | "356 Sue Monk Kidd 2003 Penguin Books \n",
2200 | "26 Rich Shapero 2004 Too Far \n",
2201 | "1105 Rebecca Wells 1997 Perennial \n",
2202 | "706 Billie Letts 1998 Warner Books \n",
2203 | "231 Tracy Chevalier 2001 Plume Books \n",
2204 | "118 Dan Brown 2001 Pocket Star "
2205 | ]
2206 | },
2207 | "execution_count": 232,
2208 | "metadata": {},
2209 | "output_type": "execute_result"
2210 | }
2211 | ],
2212 | "source": [
2213 | "#At this point, a simple popularity-based recommendation system can be built by ranking books on the sum of their explicit ratings (the groupby below uses .sum(), not a count)\n",
2214 | "ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['bookRating'].sum())\n",
2215 | "top10 = ratings_count.sort_values('bookRating', ascending = False).head(10)\n",
2216 | "print \"Following books are recommended\"\n",
2217 | "top10.merge(books, left_index = True, right_on = 'ISBN')\n",
2218 | "\n",
2219 | "#Given below are the top 10 recommendations based on popularity. In this output 'The Lovely Bones' by Alice Sebold ranks first, Dan Brown appears twice and J. K. Rowling appears once"
2220 | ]
2221 | },
2222 | {
2223 | "cell_type": "code",
2224 | "execution_count": 233,
2225 | "metadata": {
2226 | "collapsed": true
2227 | },
2228 | "outputs": [],
2229 | "source": [
2230 | "#Similarly segregating users who have given explicit ratings from 1-10 and those whose implicit behavior was tracked\n",
2231 | "users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]\n",
2232 | "users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]"
2233 | ]
2234 | },
2235 | {
2236 | "cell_type": "code",
2237 | "execution_count": 234,
2238 | "metadata": {},
2239 | "outputs": [
2240 | {
2241 | "name": "stdout",
2242 | "output_type": "stream",
2243 | "text": [
2244 | "(278858, 3)\n",
2245 | "(68091, 3)\n",
2246 | "(52451, 3)\n"
2247 | ]
2248 | }
2249 | ],
2250 | "source": [
2251 | "#checking shapes\n",
2252 | "print users.shape\n",
2253 | "print users_exp_ratings.shape\n",
2254 | "print users_imp_ratings.shape"
2255 | ]
2256 | },
2257 | {
2258 | "cell_type": "markdown",
2259 | "metadata": {},
2260 | "source": [
2261 | "**Collaborative Filtering Based Recommendation Systems**"
2262 | ]
2263 | },
2264 | {
2265 | "cell_type": "code",
2266 | "execution_count": 235,
2267 | "metadata": {
2268 | "collapsed": true
2269 | },
2270 | "outputs": [],
2271 | "source": [
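2272 | "#To cope with the computing power I have and to reduce the dataset size, I am considering users who have rated at least 100 books\n",
2273 | "#NOTE: the second filter below counts occurrences of each 'bookRating' value, not ratings per ISBN, so it keeps rating values seen at least 100 times rather than books which have at least 100 ratings\n",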
2274 | "counts1 = ratings_explicit['userID'].value_counts()\n",
2275 | "ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1 >= 100].index)]\n",
2276 | "counts = ratings_explicit['bookRating'].value_counts()\n",
2277 | "ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(counts[counts >= 100].index)]"
2278 | ]
2279 | },
2280 | {
2281 | "cell_type": "code",
2282 | "execution_count": 236,
2283 | "metadata": {},
2284 | "outputs": [
2285 | {
2286 | "name": "stdout",
2287 | "output_type": "stream",
2288 | "text": [
2289 | "(449, 66574)\n"
2290 | ]
2291 | },
2292 | {
2293 | "data": {
2294 | "text/html": [
2295 | "\n",
2296 | "\n",
2309 | "
\n",
2310 | " \n",
2311 | " \n",
2312 | " | ISBN | \n",
2313 | " 0000913154 | \n",
2314 | " 0001046438 | \n",
2315 | " 000104687X | \n",
2316 | " 0001047213 | \n",
2317 | " 0001047973 | \n",
2318 | " 000104799X | \n",
2319 | " 0001048082 | \n",
2320 | " 0001053736 | \n",
2321 | " 0001053744 | \n",
2322 | " 0001055607 | \n",
2323 | " ... | \n",
2324 | " B000092Q0A | \n",
2325 | " B00009EF82 | \n",
2326 | " B00009NDAN | \n",
2327 | " B0000DYXID | \n",
2328 | " B0000T6KHI | \n",
2329 | " B0000VZEJQ | \n",
2330 | " B0000X8HIE | \n",
2331 | " B00013AX9E | \n",
2332 | " B0001I1KOG | \n",
2333 | " B000234N3A | \n",
2334 | "
\n",
2335 | " \n",
2336 | " | userID | \n",
2337 | " | \n",
2338 | " | \n",
2339 | " | \n",
2340 | " | \n",
2341 | " | \n",
2342 | " | \n",
2343 | " | \n",
2344 | " | \n",
2345 | " | \n",
2346 | " | \n",
2347 | " | \n",
2348 | " | \n",
2349 | " | \n",
2350 | " | \n",
2351 | " | \n",
2352 | " | \n",
2353 | " | \n",
2354 | " | \n",
2355 | " | \n",
2356 | " | \n",
2357 | " | \n",
2358 | "
\n",
2359 | " \n",
2360 | " \n",
2361 | " \n",
2362 | " | 2033 | \n",
2363 | " NaN | \n",
2364 | " NaN | \n",
2365 | " NaN | \n",
2366 | " NaN | \n",
2367 | " NaN | \n",
2368 | " NaN | \n",
2369 | " NaN | \n",
2370 | " NaN | \n",
2371 | " NaN | \n",
2372 | " NaN | \n",
2373 | " ... | \n",
2374 | " NaN | \n",
2375 | " NaN | \n",
2376 | " NaN | \n",
2377 | " NaN | \n",
2378 | " NaN | \n",
2379 | " NaN | \n",
2380 | " NaN | \n",
2381 | " NaN | \n",
2382 | " NaN | \n",
2383 | " NaN | \n",
2384 | "
\n",
2385 | " \n",
2386 | " | 2110 | \n",
2387 | " NaN | \n",
2388 | " NaN | \n",
2389 | " NaN | \n",
2390 | " NaN | \n",
2391 | " NaN | \n",
2392 | " NaN | \n",
2393 | " NaN | \n",
2394 | " NaN | \n",
2395 | " NaN | \n",
2396 | " NaN | \n",
2397 | " ... | \n",
2398 | " NaN | \n",
2399 | " NaN | \n",
2400 | " NaN | \n",
2401 | " NaN | \n",
2402 | " NaN | \n",
2403 | " NaN | \n",
2404 | " NaN | \n",
2405 | " NaN | \n",
2406 | " NaN | \n",
2407 | " NaN | \n",
2408 | "
\n",
2409 | " \n",
2410 | " | 2276 | \n",
2411 | " NaN | \n",
2412 | " NaN | \n",
2413 | " NaN | \n",
2414 | " NaN | \n",
2415 | " NaN | \n",
2416 | " NaN | \n",
2417 | " NaN | \n",
2418 | " NaN | \n",
2419 | " NaN | \n",
2420 | " NaN | \n",
2421 | " ... | \n",
2422 | " NaN | \n",
2423 | " NaN | \n",
2424 | " NaN | \n",
2425 | " NaN | \n",
2426 | " NaN | \n",
2427 | " NaN | \n",
2428 | " NaN | \n",
2429 | " NaN | \n",
2430 | " NaN | \n",
2431 | " NaN | \n",
2432 | "
\n",
2433 | " \n",
2434 | " | 4017 | \n",
2435 | " NaN | \n",
2436 | " NaN | \n",
2437 | " NaN | \n",
2438 | " NaN | \n",
2439 | " NaN | \n",
2440 | " NaN | \n",
2441 | " NaN | \n",
2442 | " NaN | \n",
2443 | " NaN | \n",
2444 | " NaN | \n",
2445 | " ... | \n",
2446 | " NaN | \n",
2447 | " NaN | \n",
2448 | " NaN | \n",
2449 | " NaN | \n",
2450 | " NaN | \n",
2451 | " NaN | \n",
2452 | " NaN | \n",
2453 | " NaN | \n",
2454 | " NaN | \n",
2455 | " NaN | \n",
2456 | "
\n",
2457 | " \n",
2458 | " | 4385 | \n",
2459 | " NaN | \n",
2460 | " NaN | \n",
2461 | " NaN | \n",
2462 | " NaN | \n",
2463 | " NaN | \n",
2464 | " NaN | \n",
2465 | " NaN | \n",
2466 | " NaN | \n",
2467 | " NaN | \n",
2468 | " NaN | \n",
2469 | " ... | \n",
2470 | " NaN | \n",
2471 | " NaN | \n",
2472 | " NaN | \n",
2473 | " NaN | \n",
2474 | " NaN | \n",
2475 | " NaN | \n",
2476 | " NaN | \n",
2477 | " NaN | \n",
2478 | " NaN | \n",
2479 | " NaN | \n",
2480 | "
\n",
2481 | " \n",
2482 | "
\n",
2483 | "
5 rows × 66574 columns
\n",
2484 | "
"
2485 | ],
2486 | "text/plain": [
2487 | "ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 \\\n",
2488 | "userID \n",
2489 | "2033 NaN NaN NaN NaN NaN \n",
2490 | "2110 NaN NaN NaN NaN NaN \n",
2491 | "2276 NaN NaN NaN NaN NaN \n",
2492 | "4017 NaN NaN NaN NaN NaN \n",
2493 | "4385 NaN NaN NaN NaN NaN \n",
2494 | "\n",
2495 | "ISBN 000104799X 0001048082 0001053736 0001053744 0001055607 \\\n",
2496 | "userID \n",
2497 | "2033 NaN NaN NaN NaN NaN \n",
2498 | "2110 NaN NaN NaN NaN NaN \n",
2499 | "2276 NaN NaN NaN NaN NaN \n",
2500 | "4017 NaN NaN NaN NaN NaN \n",
2501 | "4385 NaN NaN NaN NaN NaN \n",
2502 | "\n",
2503 | "ISBN ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID \\\n",
2504 | "userID ... \n",
2505 | "2033 ... NaN NaN NaN NaN \n",
2506 | "2110 ... NaN NaN NaN NaN \n",
2507 | "2276 ... NaN NaN NaN NaN \n",
2508 | "4017 ... NaN NaN NaN NaN \n",
2509 | "4385 ... NaN NaN NaN NaN \n",
2510 | "\n",
2511 | "ISBN B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A \n",
2512 | "userID \n",
2513 | "2033 NaN NaN NaN NaN NaN NaN \n",
2514 | "2110 NaN NaN NaN NaN NaN NaN \n",
2515 | "2276 NaN NaN NaN NaN NaN NaN \n",
2516 | "4017 NaN NaN NaN NaN NaN NaN \n",
2517 | "4385 NaN NaN NaN NaN NaN NaN \n",
2518 | "\n",
2519 | "[5 rows x 66574 columns]"
2520 | ]
2521 | },
2522 | "execution_count": 236,
2523 | "metadata": {},
2524 | "output_type": "execute_result"
2525 | }
2526 | ],
2527 | "source": [
2528 | "#Generating ratings matrix from explicit ratings table\n",
2529 | "ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')\n",
2530 | "userID = ratings_matrix.index\n",
2531 | "ISBN = ratings_matrix.columns\n",
2532 | "print(ratings_matrix.shape)\n",
2533 | "ratings_matrix.head()\n",
2534 | "#Notice that most of the values are NaN (undefined) implying absence of ratings"
2535 | ]
2536 | },
2537 | {
2538 | "cell_type": "code",
2539 | "execution_count": 237,
2540 | "metadata": {},
2541 | "outputs": [
2542 | {
2543 | "name": "stdout",
2544 | "output_type": "stream",
2545 | "text": [
2546 | "449 66574\n"
2547 | ]
2548 | }
2549 | ],
2550 | "source": [
2551 | "n_users = ratings_matrix.shape[0] #considering only those users who gave explicit ratings\n",
2552 | "n_books = ratings_matrix.shape[1]\n",
2553 | "print n_users, n_books"
2554 | ]
2555 | },
2556 | {
2557 | "cell_type": "code",
2558 | "execution_count": 238,
2559 | "metadata": {
2560 | "collapsed": true
2561 | },
2562 | "outputs": [],
2563 | "source": [
2564 | "#since NaNs cannot be handled by training algorithms, replacing these by 0, which indicates absence of ratings\n",
2565 | "#setting data type\n",
2566 | "ratings_matrix.fillna(0, inplace = True)\n",
2567 | "ratings_matrix = ratings_matrix.astype(np.int32)"
2568 | ]
2569 | },
2570 | {
2571 | "cell_type": "code",
2572 | "execution_count": 239,
2573 | "metadata": {},
2574 | "outputs": [
2575 | {
2576 | "data": {
2577 | "text/html": [
2578 | "\n",
2579 | "\n",
2592 | "
\n",
2593 | " \n",
2594 | " \n",
2595 | " | ISBN | \n",
2596 | " 0000913154 | \n",
2597 | " 0001046438 | \n",
2598 | " 000104687X | \n",
2599 | " 0001047213 | \n",
2600 | " 0001047973 | \n",
2601 | " 000104799X | \n",
2602 | " 0001048082 | \n",
2603 | " 0001053736 | \n",
2604 | " 0001053744 | \n",
2605 | " 0001055607 | \n",
2606 | " ... | \n",
2607 | " B000092Q0A | \n",
2608 | " B00009EF82 | \n",
2609 | " B00009NDAN | \n",
2610 | " B0000DYXID | \n",
2611 | " B0000T6KHI | \n",
2612 | " B0000VZEJQ | \n",
2613 | " B0000X8HIE | \n",
2614 | " B00013AX9E | \n",
2615 | " B0001I1KOG | \n",
2616 | " B000234N3A | \n",
2617 | "
\n",
2618 | " \n",
2619 | " | userID | \n",
2620 | " | \n",
2621 | " | \n",
2622 | " | \n",
2623 | " | \n",
2624 | " | \n",
2625 | " | \n",
2626 | " | \n",
2627 | " | \n",
2628 | " | \n",
2629 | " | \n",
2630 | " | \n",
2631 | " | \n",
2632 | " | \n",
2633 | " | \n",
2634 | " | \n",
2635 | " | \n",
2636 | " | \n",
2637 | " | \n",
2638 | " | \n",
2639 | " | \n",
2640 | " | \n",
2641 | "
\n",
2642 | " \n",
2643 | " \n",
2644 | " \n",
2645 | " | 2033 | \n",
2646 | " 0 | \n",
2647 | " 0 | \n",
2648 | " 0 | \n",
2649 | " 0 | \n",
2650 | " 0 | \n",
2651 | " 0 | \n",
2652 | " 0 | \n",
2653 | " 0 | \n",
2654 | " 0 | \n",
2655 | " 0 | \n",
2656 | " ... | \n",
2657 | " 0 | \n",
2658 | " 0 | \n",
2659 | " 0 | \n",
2660 | " 0 | \n",
2661 | " 0 | \n",
2662 | " 0 | \n",
2663 | " 0 | \n",
2664 | " 0 | \n",
2665 | " 0 | \n",
2666 | " 0 | \n",
2667 | "
\n",
2668 | " \n",
2669 | " | 2110 | \n",
2670 | " 0 | \n",
2671 | " 0 | \n",
2672 | " 0 | \n",
2673 | " 0 | \n",
2674 | " 0 | \n",
2675 | " 0 | \n",
2676 | " 0 | \n",
2677 | " 0 | \n",
2678 | " 0 | \n",
2679 | " 0 | \n",
2680 | " ... | \n",
2681 | " 0 | \n",
2682 | " 0 | \n",
2683 | " 0 | \n",
2684 | " 0 | \n",
2685 | " 0 | \n",
2686 | " 0 | \n",
2687 | " 0 | \n",
2688 | " 0 | \n",
2689 | " 0 | \n",
2690 | " 0 | \n",
2691 | "
\n",
2692 | " \n",
2693 | " | 2276 | \n",
2694 | " 0 | \n",
2695 | " 0 | \n",
2696 | " 0 | \n",
2697 | " 0 | \n",
2698 | " 0 | \n",
2699 | " 0 | \n",
2700 | " 0 | \n",
2701 | " 0 | \n",
2702 | " 0 | \n",
2703 | " 0 | \n",
2704 | " ... | \n",
2705 | " 0 | \n",
2706 | " 0 | \n",
2707 | " 0 | \n",
2708 | " 0 | \n",
2709 | " 0 | \n",
2710 | " 0 | \n",
2711 | " 0 | \n",
2712 | " 0 | \n",
2713 | " 0 | \n",
2714 | " 0 | \n",
2715 | "
\n",
2716 | " \n",
2717 | " | 4017 | \n",
2718 | " 0 | \n",
2719 | " 0 | \n",
2720 | " 0 | \n",
2721 | " 0 | \n",
2722 | " 0 | \n",
2723 | " 0 | \n",
2724 | " 0 | \n",
2725 | " 0 | \n",
2726 | " 0 | \n",
2727 | " 0 | \n",
2728 | " ... | \n",
2729 | " 0 | \n",
2730 | " 0 | \n",
2731 | " 0 | \n",
2732 | " 0 | \n",
2733 | " 0 | \n",
2734 | " 0 | \n",
2735 | " 0 | \n",
2736 | " 0 | \n",
2737 | " 0 | \n",
2738 | " 0 | \n",
2739 | "
\n",
2740 | " \n",
2741 | " | 4385 | \n",
2742 | " 0 | \n",
2743 | " 0 | \n",
2744 | " 0 | \n",
2745 | " 0 | \n",
2746 | " 0 | \n",
2747 | " 0 | \n",
2748 | " 0 | \n",
2749 | " 0 | \n",
2750 | " 0 | \n",
2751 | " 0 | \n",
2752 | " ... | \n",
2753 | " 0 | \n",
2754 | " 0 | \n",
2755 | " 0 | \n",
2756 | " 0 | \n",
2757 | " 0 | \n",
2758 | " 0 | \n",
2759 | " 0 | \n",
2760 | " 0 | \n",
2761 | " 0 | \n",
2762 | " 0 | \n",
2763 | "
\n",
2764 | " \n",
2765 | "
\n",
2766 | "
5 rows × 66574 columns
\n",
2767 | "
"
2768 | ],
2769 | "text/plain": [
2770 | "ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 \\\n",
2771 | "userID \n",
2772 | "2033 0 0 0 0 0 \n",
2773 | "2110 0 0 0 0 0 \n",
2774 | "2276 0 0 0 0 0 \n",
2775 | "4017 0 0 0 0 0 \n",
2776 | "4385 0 0 0 0 0 \n",
2777 | "\n",
2778 | "ISBN 000104799X 0001048082 0001053736 0001053744 0001055607 \\\n",
2779 | "userID \n",
2780 | "2033 0 0 0 0 0 \n",
2781 | "2110 0 0 0 0 0 \n",
2782 | "2276 0 0 0 0 0 \n",
2783 | "4017 0 0 0 0 0 \n",
2784 | "4385 0 0 0 0 0 \n",
2785 | "\n",
2786 | "ISBN ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID \\\n",
2787 | "userID ... \n",
2788 | "2033 ... 0 0 0 0 \n",
2789 | "2110 ... 0 0 0 0 \n",
2790 | "2276 ... 0 0 0 0 \n",
2791 | "4017 ... 0 0 0 0 \n",
2792 | "4385 ... 0 0 0 0 \n",
2793 | "\n",
2794 | "ISBN B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A \n",
2795 | "userID \n",
2796 | "2033 0 0 0 0 0 0 \n",
2797 | "2110 0 0 0 0 0 0 \n",
2798 | "2276 0 0 0 0 0 0 \n",
2799 | "4017 0 0 0 0 0 0 \n",
2800 | "4385 0 0 0 0 0 0 \n",
2801 | "\n",
2802 | "[5 rows x 66574 columns]"
2803 | ]
2804 | },
2805 | "execution_count": 239,
2806 | "metadata": {},
2807 | "output_type": "execute_result"
2808 | }
2809 | ],
2810 | "source": [
2811 | "#checking first few rows\n",
2812 | "ratings_matrix.head(5)"
2813 | ]
2814 | },
2815 | {
2816 | "cell_type": "code",
2817 | "execution_count": 240,
2818 | "metadata": {},
2819 | "outputs": [
2820 | {
2821 | "name": "stdout",
2822 | "output_type": "stream",
2823 | "text": [
2824 | "The sparsity level of Book Crossing dataset is 99.9977218411 %\n"
2825 | ]
2826 | }
2827 | ],
2828 | "source": [
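2829 | "#rechecking the sparsity (NOTE: the denominator uses users_exp_ratings.shape[0], which was computed before the 100-rating filter, rather than n_users)\n",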
2830 | "sparsity=1.0-len(ratings_explicit)/float(users_exp_ratings.shape[0]*n_books)\n",
2831 | "print 'The sparsity level of Book Crossing dataset is ' + str(sparsity*100) + ' %'"
2832 | ]
2833 | },
2834 | {
2835 | "cell_type": "markdown",
2836 | "metadata": {},
2837 | "source": [
2838 | "**Training our recommendation system**"
2839 | ]
2840 | },
2841 | {
2842 | "cell_type": "code",
2843 | "execution_count": 241,
2844 | "metadata": {
2845 | "collapsed": true
2846 | },
2847 | "outputs": [],
2848 | "source": [
2849 | "#setting global variables\n",
2850 | "global metric,k\n",
2851 | "k=10\n",
2852 | "metric='cosine'"
2853 | ]
2854 | },
2855 | {
2856 | "cell_type": "markdown",
2857 | "metadata": {},
2858 | "source": [
2859 | "**User-based Recommendation System**"
2860 | ]
2861 | },
2862 | {
2863 | "cell_type": "code",
2864 | "execution_count": 242,
2865 | "metadata": {
2866 | "collapsed": true
2867 | },
2868 | "outputs": [],
2869 | "source": [
2870 | "#This function finds k similar users given the user_id and ratings matrix \n",
2871 | "#These similarities are the same as those obtained using pairwise_distances\n",
2872 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n",
2873 | " similarities=[]\n",
2874 | " indices=[]\n",
2875 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') \n",
2876 | " model_knn.fit(ratings)\n",
2877 | " loc = ratings.index.get_loc(user_id)\n",
2878 | " distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)\n",
2879 | " similarities = 1-distances.flatten()\n",
2880 | " \n",
2881 | " return similarities,indices"
2882 | ]
2883 | },
2884 | {
2885 | "cell_type": "code",
2886 | "execution_count": 243,
2887 | "metadata": {
2888 | "collapsed": true
2889 | },
2890 | "outputs": [],
2891 | "source": [
2892 | "#This function predicts rating for specified user-item combination based on user-based approach\n",
2893 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n",
2894 | " prediction=0\n",
2895 | " user_loc = ratings.index.get_loc(user_id)\n",
2896 | " item_loc = ratings.columns.get_loc(item_id)\n",
2897 | " similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users under the metric passed in\n",
2898 | " mean_rating = ratings.iloc[user_loc,:].mean() #mean over this user's entire row of the ratings matrix\n",
2899 | " sum_wt = np.sum(similarities)-1\n",
2900 | " product=1\n",
2901 | " wtd_sum = 0 \n",
2902 | " \n",
2903 | " for i in range(0, len(indices.flatten())):\n",
2904 | " if indices.flatten()[i] == user_loc:\n",
2905 | " continue;\n",
2906 | " else: \n",
2907 | " ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])\n",
2908 | " product = ratings_diff * (similarities[i])\n",
2909 | " wtd_sum = wtd_sum + product\n",
2910 | " \n",
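2911 | " #in case of very sparse datasets, using the correlation metric for the collaborative filtering approach may give negative ratings,\n",
2912 | " #which the check below is meant to clamp to 1-10. NOTE: it runs before prediction is computed further down, so the returned value is not actually bounded\n",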
2913 | " if prediction <= 0:\n",
2914 | " prediction = 1 \n",
2915 | " elif prediction >10:\n",
2916 | " prediction = 10\n",
2917 | " \n",
2918 | " prediction = int(round(mean_rating + (wtd_sum/sum_wt)))\n",
2919 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)\n",
2920 | "\n",
2921 | " return prediction"
2922 | ]
2923 | },
2924 | {
2925 | "cell_type": "code",
2926 | "execution_count": 244,
2927 | "metadata": {},
2928 | "outputs": [
2929 | {
2930 | "name": "stdout",
2931 | "output_type": "stream",
2932 | "text": [
2933 | "\n",
2934 | "Predicted rating for user 11676 -> item 0001056107: 2\n"
2935 | ]
2936 | }
2937 | ],
2938 | "source": [
2939 | "predict_userbased(11676,'0001056107',ratings_matrix);"
2940 | ]
2941 | },
2942 | {
2943 | "cell_type": "markdown",
2944 | "metadata": {},
2945 | "source": [
2946 | "**Item-based Recommendation Systems**"
2947 | ]
2948 | },
2949 | {
2950 | "cell_type": "code",
2951 | "execution_count": 245,
2952 | "metadata": {
2953 | "collapsed": true
2954 | },
2955 | "outputs": [],
2956 | "source": [
2957 | "#This function finds k similar items given the item_id and ratings matrix\n",
2958 | "\n",
2959 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n",
2960 | " similarities=[]\n",
2961 | " indices=[]\n",
2962 | " ratings=ratings.T\n",
2963 | " loc = ratings.index.get_loc(item_id)\n",
2964 | " model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n",
2965 | " model_knn.fit(ratings)\n",
2966 | " \n",
2967 | " distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)\n",
2968 | " similarities = 1-distances.flatten()\n",
2969 | "\n",
2970 | " return similarities,indices"
2971 | ]
2972 | },
2973 | {
2974 | "cell_type": "code",
2975 | "execution_count": 246,
2976 | "metadata": {
2977 | "collapsed": true
2978 | },
2979 | "outputs": [],
2980 | "source": [
2981 | "similarities,indices=findksimilaritems('0001056107',ratings_matrix)"
2982 | ]
2983 | },
2984 | {
2985 | "cell_type": "code",
2986 | "execution_count": 247,
2987 | "metadata": {
2988 | "collapsed": true
2989 | },
2990 | "outputs": [],
2991 | "source": [
2992 | "#This function predicts the rating for specified user-item combination based on item-based approach\n",
2993 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n",
2994 | " prediction= wtd_sum =0\n",
2995 | " user_loc = ratings.index.get_loc(user_id)\n",
2996 | " item_loc = ratings.columns.get_loc(item_id)\n",
2997 | " similarities, indices=findksimilaritems(item_id, ratings) #similar items; metric and k are not forwarded, so findksimilaritems falls back to its defaults\n",
2998 | " sum_wt = np.sum(similarities)-1\n",
2999 | " product=1\n",
3000 | " for i in range(0, len(indices.flatten())):\n",
3001 | " if indices.flatten()[i] == item_loc:\n",
3002 | " continue;\n",
3003 | " else:\n",
3004 | " product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])\n",
3005 | " wtd_sum = wtd_sum + product \n",
3006 | " prediction = int(round(wtd_sum/sum_wt))\n",
3007 | " \n",
3008 | " #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings\n",
3009 | " #which are handled here as below //code has been validated without the code snippet below, below snippet is to avoid negative\n",
3010 | " #predictions which might arise in case of very sparse datasets when using correlation metric\n",
3011 | " if prediction <= 0:\n",
3012 | " prediction = 1 \n",
3013 | " elif prediction >10:\n",
3014 | " prediction = 10\n",
3015 | "\n",
3016 | " print '\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction) \n",
3017 | " \n",
3018 | " return prediction"
3019 | ]
3020 | },
3021 | {
3022 | "cell_type": "code",
3023 | "execution_count": 248,
3024 | "metadata": {},
3025 | "outputs": [
3026 | {
3027 | "name": "stdout",
3028 | "output_type": "stream",
3029 | "text": [
3030 | "\n",
3031 | "Predicted rating for user 11676 -> item 0001056107: 1\n"
3032 | ]
3033 | }
3034 | ],
3035 | "source": [
3036 | "prediction = predict_itembased(11676,'0001056107',ratings_matrix)"
3037 | ]
3038 | },
3039 | {
3040 | "cell_type": "code",
3041 | "execution_count": 249,
3042 | "metadata": {
3043 | "collapsed": true
3044 | },
3045 | "outputs": [],
3046 | "source": [
3047 | "@contextmanager\n",
3048 | "def suppress_stdout():\n",
3049 | " with open(os.devnull, \"w\") as devnull:\n",
3050 | " old_stdout = sys.stdout\n",
3051 | " sys.stdout = devnull\n",
3052 | " try: \n",
3053 | " yield\n",
3054 | " finally:\n",
3055 | " sys.stdout = old_stdout"
3056 | ]
3057 | },
3058 | {
3059 | "cell_type": "code",
3060 | "execution_count": 252,
3061 | "metadata": {
3062 | "collapsed": true
3063 | },
3064 | "outputs": [],
3065 | "source": [
3066 | "#This function utilizes the above functions to recommend items using the item-based or user-based approach with cosine or correlation similarity.\n",
3067 | "#The 10 highest predicted ratings are listed; no >= 6 threshold is applied. NOTE: predictions are made only where the stored rating is != 0 (items already rated), while unrated items get -1\n",
3068 | "def recommendItem(user_id, ratings, metric=metric): \n",
3069 | " if (user_id not in ratings.index.values) or type(user_id) is not int:\n",
3070 | " print \"User id should be a valid integer from this list :\\n\\n {} \".format(re.sub('[\\[\\]]', '', np.array_str(ratings_matrix.index.values)))\n",
3071 | " else: \n",
3072 | " ids = ['Item-based (correlation)','Item-based (cosine)','User-based (correlation)','User-based (cosine)']\n",
3073 | " select = widgets.Dropdown(options=ids, value=ids[0],description='Select approach', width='1000px')\n",
3074 | " def on_change(change):\n",
3075 | " clear_output(wait=True)\n",
3076 | " prediction = [] \n",
3077 | " if change['type'] == 'change' and change['name'] == 'value': \n",
3078 | " if (select.value == 'Item-based (correlation)') | (select.value == 'User-based (correlation)') :\n",
3079 | " metric = 'correlation'\n",
3080 | " else: \n",
3081 | " metric = 'cosine' \n",
3082 | " with suppress_stdout():\n",
3083 | " if (select.value == 'Item-based (correlation)') | (select.value == 'Item-based (cosine)'):\n",
3084 | " for i in range(ratings.shape[1]):\n",
3085 | " if (ratings[str(ratings.columns[i])][user_id] !=0): #true when the user has already rated this item (stored rating is non-zero)\n",
3086 | " prediction.append(predict_itembased(user_id, str(ratings.columns[i]) ,ratings, metric))\n",
3087 | " else: \n",
3088 | " prediction.append(-1) #for items with a stored rating of 0, i.e. not yet rated\n",
3089 | " else:\n",
3090 | " for i in range(ratings.shape[1]):\n",
3091 | " if (ratings[str(ratings.columns[i])][user_id] !=0): #true when the user has already rated this item (stored rating is non-zero)\n",
3092 | " prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))\n",
3093 | " else: \n",
3094 | " prediction.append(-1) #for items with a stored rating of 0, i.e. not yet rated\n",
3095 | " prediction = pd.Series(prediction)\n",
3096 | " prediction = prediction.sort_values(ascending=False)\n",
3097 | " recommended = prediction[:10]\n",
3098 | " print \"As per {0} approach....Following books are recommended...\".format(select.value)\n",
3099 | " for i in range(len(recommended)):\n",
3100 | " print \"{0}. {1}\".format(i+1,books.bookTitle[recommended.index[i]].encode('utf-8')) \n",
3101 | " select.observe(on_change)\n",
3102 | " display(select)"
3103 | ]
3104 | },
3105 | {
3106 | "cell_type": "code",
3107 | "execution_count": 255,
3108 | "metadata": {},
3109 | "outputs": [
3110 | {
3111 | "name": "stdout",
3112 | "output_type": "stream",
3113 | "text": [
3114 | "User id should be a valid integer from this list :\n",
3115 | "\n",
3116 | " 2033 2110 2276 4017 4385 5582 6242 6251 6543 6575\n",
3117 | " 7286 7346 8067 8245 8681 8890 10560 11676 11993 12538\n",
3118 | " 12824 12982 13552 13850 14422 15408 15418 16634 16795 16966\n",
3119 | " 17950 19085 21014 23768 23872 23902 25409 25601 25981 26535\n",
3120 | " 26544 26583 28591 28634 29259 30276 30511 30711 30735 30810\n",
3121 | " 31315 31556 31826 32773 33145 35433 35836 35857 35859 36299\n",
3122 | " 36554 36606 36609 36836 36907 37644 37712 37950 38023 38273\n",
3123 | " 38281 39281 39467 40889 40943 43246 43910 46398 47316 48025\n",
3124 | " 48494 49144 49889 51883 52199 52350 52584 52614 52917 53220\n",
3125 | " 55187 55490 55492 56271 56399 56447 56554 56959 59172 60244\n",
3126 | " 60337 60707 63714 63956 65258 66942 67840 68555 69078 69389\n",
3127 | " 69697 70415 70594 70666 72352 73681 75591 75819 76151 76223\n",
3128 | " 76499 76626 78553 78783 78834 78973 79441 81492 81560 83287\n",
3129 | " 83637 83671 85526 85656 86189 86947 87141 87555 88283 88677\n",
3130 | " 88693 88733 89602 91113 92652 92810 93047 93363 93629 94242\n",
3131 | " 94347 94853 94951 95010 95359 95902 95932 96448 97754 97874\n",
3132 | " 98391 98758 100459 100906 101209 101606 101851 102359 102647 102702\n",
3133 | " 102967 104399 104636 105028 105517 105979 106007 107784 107951 109574\n",
3134 | " 109901 109955 110483 110912 110934 110973 112001 113270 113519 114368\n",
3135 | " 114868 114988 115002 115003 116599 117384 120565 122429 122793 123094\n",
3136 | " 123608 123883 123981 125519 125774 126492 126736 127200 127359 128835\n",
3137 | " 129074 129716 129851 130554 130571 132492 132836 133747 134434 135149\n",
3138 | " 135265 136010 136139 136348 136382 138578 138844 140000 140358 141902\n",
3139 | " 142524 143175 143253 143415 145449 146113 146348 147847 148199 148258\n",
3140 | " 148744 148966 149907 149908 150979 153662 156150 156269 156300 156467\n",
3141 | " 157247 157273 158226 158295 158433 159506 160295 162052 162639 162738\n",
3142 | " 163759 163761 163804 163973 164096 164323 164533 164828 164905 165308\n",
3143 | " 165319 165758 166123 166596 168047 168245 169682 170513 170634 171118\n",
3144 | " 172030 172742 172888 173291 173415 174304 174892 177072 177432 177458\n",
3145 | " 178522 179718 179978 180378 180651 181176 182085 182086 182993 183958\n",
3146 | " 183995 184299 184532 185233 185384 187145 187256 187517 189139 189334\n",
3147 | " 189835 189973 190708 190925 193458 193560 193898 194600 196077 196160\n",
3148 | " 196502 197659 199416 200226 201290 203240 204864 205735 205943 206534\n",
3149 | " 207782 208406 208671 209516 210485 211426 211919 212965 214786 216012\n",
3150 | " 216444 216683 217106 217318 217740 218552 218608 219546 219683 222204\n",
3151 | " 222296 223087 223501 224349 224525 224646 224764 225087 225199 225232\n",
3152 | " 225595 225763 226965 227250 227447 227520 227705 229011 229329 229551\n",
3153 | " 229741 230522 231210 232131 232945 233911 234359 234828 235105 235282\n",
3154 | " 235935 236058 236283 236340 236757 236948 239584 239594 240144 240403\n",
3155 | " 240543 240567 240568 241198 241666 241980 242006 242083 242409 242465\n",
3156 | " 244627 244685 245410 245827 246311 247429 247447 248718 249894 250405\n",
3157 | " 250709 251394 251843 251844 252695 252820 254206 254465 254899 255489\n",
3158 | " 257204 258152 258185 258534 261105 261829 262998 264031 264082 264321\n",
3159 | " 264525 265115 265313 265889 266056 266226 268110 268300 268932 269566\n",
3160 | " 270713 271448 271705 273113 274061 274301 275970 277427 278418 \n"
3161 | ]
3162 | }
3163 | ],
3164 | "source": [
3165 | "# Sanity check: 999999 is not a valid user id, so the call should print the list of valid ids instead of recommendations\n",
3166 | "recommendItem(999999,ratings_matrix)"
3167 | ]
3168 | },
3169 | {
3170 | "cell_type": "code",
3171 | "execution_count": 253,
3172 | "metadata": {
3173 | "scrolled": true
3174 | },
3175 | "outputs": [
3176 | {
3177 | "name": "stdout",
3178 | "output_type": "stream",
3179 | "text": [
3180 | "As per Item-based (cosine) approach....Following books are recommended...\n",
3181 | "1. My Wicked Wicked Ways\n",
3182 | "2. Fair Peril\n",
3183 | "3. Wolfpointe\n",
3184 | "4. A Nest of Ninnies\n",
3185 | "5. A Bitter Legacy\n",
3186 | "6. A Hymn Before Battle\n",
3187 | "7. Thomas the Rhymer\n",
3188 | "8. Gatherer of Clouds (Initiate Brother Duology)\n",
3189 | "9. Wege zum Ruhm: 13 Hilfestellungen für junge Künstler und 1 Warnung\n",
3190 | "10. Love In Bloom's\n"
3191 | ]
3192 | }
3193 | ],
3194 | "source": [
3195 | "recommendItem(4385, ratings_matrix)"
3196 | ]
3197 | },
3198 | {
3199 | "cell_type": "code",
3200 | "execution_count": 254,
3201 | "metadata": {},
3202 | "outputs": [
3203 | {
3204 | "name": "stdout",
3205 | "output_type": "stream",
3206 | "text": [
3207 | "As per User-based (correlation) approach....Following books are recommended...\n",
3208 | "1. The Gift\n",
3209 | "2. A Close Run Thing : A Novel of Wellington's Army of 1815\n",
3210 | "3. The Romantic: A Novel\n",
3211 | "4. Mazurka for Two Dead Men\n",
3212 | "5. The Titanic Conspiracy: Cover-Ups and Mysteries of the World's Most Famous Sea Disaster\n",
3213 | "6. And Never Let Her Go : Thomas Capano: The Deadly Seducer\n",
3214 | "7. Chop Wood, Carry Water: A Guide to Finding Spiritual Fulfillment in Everyday Life\n",
3215 | "8. WHO NEEDS GOD\n",
3216 | "9. Lords of the White Castle\n",
3217 | "10. Prince Charming Isn't Coming: How Women Get Smart About Money\n"
3218 | ]
3219 | }
3220 | ],
3221 | "source": [
3222 | "recommendItem(4385, ratings_matrix)"
3223 | ]
3224 | },
3225 | {
3226 | "cell_type": "markdown",
3227 | "metadata": {},
3228 | "source": [
3229 | "**Thanks for reading this notebook**"
3230 | ]
3231 | }
3232 | ],
3233 | "metadata": {
3234 | "kernelspec": {
3235 | "display_name": "Python 2",
3236 | "language": "python",
3237 | "name": "python2"
3238 | },
3239 | "language_info": {
3240 | "codemirror_mode": {
3241 | "name": "ipython",
3242 | "version": 2
3243 | },
3244 | "file_extension": ".py",
3245 | "mimetype": "text/x-python",
3246 | "name": "python",
3247 | "nbconvert_exporter": "python",
3248 | "pygments_lexer": "ipython2",
3249 | "version": "2.7.13"
3250 | }
3251 | },
3252 | "nbformat": 4,
3253 | "nbformat_minor": 2
3254 | }
3255 |
--------------------------------------------------------------------------------