├── README.md
└── CF Recommendation System - Examples.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # JNBforBlogs
2 | Jupyter notebooks used in Blogs
3 |
--------------------------------------------------------------------------------
/CF Recommendation System - Examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "**Examples of Collaborative Filtering based Recommendation Systems**"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "#make necessary imports\n",
17 | "import numpy as np\n",
18 | "import pandas as pd\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "import sklearn.metrics as metrics\n",
21 | "import numpy as np\n",
22 | "from sklearn.neighbors import NearestNeighbors\n",
23 | "from scipy.spatial.distance import correlation, cosine\n",
24 | "import ipywidgets as widgets\n",
25 | "from IPython.display import display, clear_output\n",
26 | "from sklearn.metrics import pairwise_distances\n",
27 | "from sklearn.metrics import mean_squared_error\n",
28 | "from math import sqrt\n",
29 | "import sys, os\n",
30 | "from contextlib import contextmanager"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "#M is user-item ratings matrix where ratings are integers from 1-10\n",
42 | "M = np.asarray([[3,7,4,9,9,7], \n",
43 | " [7,0,5,3,8,8],\n",
44 | " [7,5,5,0,8,4],\n",
45 | " [5,6,8,5,9,8],\n",
46 | " [5,8,8,8,10,9],\n",
47 | " [7,7,0,4,7,8]])\n",
48 | "M=pd.DataFrame(M)\n",
49 | "\n",
50 | "#declaring k,metric as global which can be changed by the user later\n",
51 | "global k,metric\n",
52 | "k=4\n",
53 | "metric='cosine' #can be changed to 'correlation' for Pearson correlation similaries"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
64 | "
\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " | \n",
82 | " 0 | \n",
83 | " 1 | \n",
84 | " 2 | \n",
85 | " 3 | \n",
86 | " 4 | \n",
87 | " 5 | \n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | " \n",
92 | " | 0 | \n",
93 | " 3 | \n",
94 | " 7 | \n",
95 | " 4 | \n",
96 | " 9 | \n",
97 | " 9 | \n",
98 | " 7 | \n",
99 | "
\n",
100 | " \n",
101 | " | 1 | \n",
102 | " 7 | \n",
103 | " 0 | \n",
104 | " 5 | \n",
105 | " 3 | \n",
106 | " 8 | \n",
107 | " 8 | \n",
108 | "
\n",
109 | " \n",
110 | " | 2 | \n",
111 | " 7 | \n",
112 | " 5 | \n",
113 | " 5 | \n",
114 | " 0 | \n",
115 | " 8 | \n",
116 | " 4 | \n",
117 | "
\n",
118 | " \n",
119 | " | 3 | \n",
120 | " 5 | \n",
121 | " 6 | \n",
122 | " 8 | \n",
123 | " 5 | \n",
124 | " 9 | \n",
125 | " 8 | \n",
126 | "
\n",
127 | " \n",
128 | " | 4 | \n",
129 | " 5 | \n",
130 | " 8 | \n",
131 | " 8 | \n",
132 | " 8 | \n",
133 | " 10 | \n",
134 | " 9 | \n",
135 | "
\n",
136 | " \n",
137 | " | 5 | \n",
138 | " 7 | \n",
139 | " 7 | \n",
140 | " 0 | \n",
141 | " 4 | \n",
142 | " 7 | \n",
143 | " 8 | \n",
144 | "
\n",
145 | " \n",
146 | "
\n",
147 | "
"
148 | ],
149 | "text/plain": [
150 | " 0 1 2 3 4 5\n",
151 | "0 3 7 4 9 9 7\n",
152 | "1 7 0 5 3 8 8\n",
153 | "2 7 5 5 0 8 4\n",
154 | "3 5 6 8 5 9 8\n",
155 | "4 5 8 8 8 10 9\n",
156 | "5 7 7 0 4 7 8"
157 | ]
158 | },
159 | "execution_count": 3,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "M"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "**User-based Recommendation Systems**"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 4,
178 | "metadata": {
179 | "collapsed": true
180 | },
181 | "outputs": [],
182 | "source": [
183 | "#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence\n",
184 | "#similarities are obtained by subtracting distances from 1\n",
185 | "cosine_sim = 1-pairwise_distances(M, metric=\"cosine\")"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 5,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "data": {
195 | "text/html": [
196 | "\n",
197 | "\n",
210 | "
\n",
211 | " \n",
212 | " \n",
213 | " | \n",
214 | " 0 | \n",
215 | " 1 | \n",
216 | " 2 | \n",
217 | " 3 | \n",
218 | " 4 | \n",
219 | " 5 | \n",
220 | "
\n",
221 | " \n",
222 | " \n",
223 | " \n",
224 | " | 0 | \n",
225 | " 1.000000 | \n",
226 | " 0.799268 | \n",
227 | " 0.779227 | \n",
228 | " 0.934622 | \n",
229 | " 0.973890 | \n",
230 | " 0.884600 | \n",
231 | "
\n",
232 | " \n",
233 | " | 1 | \n",
234 | " 0.799268 | \n",
235 | " 1.000000 | \n",
236 | " 0.874744 | \n",
237 | " 0.905850 | \n",
238 | " 0.866146 | \n",
239 | " 0.827036 | \n",
240 | "
\n",
241 | " \n",
242 | " | 2 | \n",
243 | " 0.779227 | \n",
244 | " 0.874744 | \n",
245 | " 1.000000 | \n",
246 | " 0.909513 | \n",
247 | " 0.865454 | \n",
248 | " 0.853275 | \n",
249 | "
\n",
250 | " \n",
251 | " | 3 | \n",
252 | " 0.934622 | \n",
253 | " 0.905850 | \n",
254 | " 0.909513 | \n",
255 | " 1.000000 | \n",
256 | " 0.989344 | \n",
257 | " 0.865614 | \n",
258 | "
\n",
259 | " \n",
260 | " | 4 | \n",
261 | " 0.973890 | \n",
262 | " 0.866146 | \n",
263 | " 0.865454 | \n",
264 | " 0.989344 | \n",
265 | " 1.000000 | \n",
266 | " 0.881640 | \n",
267 | "
\n",
268 | " \n",
269 | " | 5 | \n",
270 | " 0.884600 | \n",
271 | " 0.827036 | \n",
272 | " 0.853275 | \n",
273 | " 0.865614 | \n",
274 | " 0.881640 | \n",
275 | " 1.000000 | \n",
276 | "
\n",
277 | " \n",
278 | "
\n",
279 | "
"
280 | ],
281 | "text/plain": [
282 | " 0 1 2 3 4 5\n",
283 | "0 1.000000 0.799268 0.779227 0.934622 0.973890 0.884600\n",
284 | "1 0.799268 1.000000 0.874744 0.905850 0.866146 0.827036\n",
285 | "2 0.779227 0.874744 1.000000 0.909513 0.865454 0.853275\n",
286 | "3 0.934622 0.905850 0.909513 1.000000 0.989344 0.865614\n",
287 | "4 0.973890 0.866146 0.865454 0.989344 1.000000 0.881640\n",
288 | "5 0.884600 0.827036 0.853275 0.865614 0.881640 1.000000"
289 | ]
290 | },
291 | "execution_count": 5,
292 | "metadata": {},
293 | "output_type": "execute_result"
294 | }
295 | ],
296 | "source": [
297 | "#Cosine similarity matrix\n",
298 | "pd.DataFrame(cosine_sim)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 6,
304 | "metadata": {
305 | "collapsed": true
306 | },
307 | "outputs": [],
308 | "source": [
309 | "#get pearson similarities for ratings matrix M\n",
310 | "pearson_sim = 1-pairwise_distances(M, metric=\"correlation\")"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 7,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/html": [
321 | "\n",
322 | "\n",
335 | "
\n",
336 | " \n",
337 | " \n",
338 | " | \n",
339 | " 0 | \n",
340 | " 1 | \n",
341 | " 2 | \n",
342 | " 3 | \n",
343 | " 4 | \n",
344 | " 5 | \n",
345 | "
\n",
346 | " \n",
347 | " \n",
348 | " \n",
349 | " | 0 | \n",
350 | " 1.000000 | \n",
351 | " -0.137446 | \n",
352 | " -0.357398 | \n",
353 | " 0.208179 | \n",
354 | " 0.761905 | \n",
355 | " 0.277350 | \n",
356 | "
\n",
357 | " \n",
358 | " | 1 | \n",
359 | " -0.137446 | \n",
360 | " 1.000000 | \n",
361 | " 0.453897 | \n",
362 | " 0.515910 | \n",
363 | " 0.112456 | \n",
364 | " 0.218328 | \n",
365 | "
\n",
366 | " \n",
367 | " | 2 | \n",
368 | " -0.357398 | \n",
369 | " 0.453897 | \n",
370 | " 1.000000 | \n",
371 | " 0.451378 | \n",
372 | " -0.042888 | \n",
373 | " 0.297373 | \n",
374 | "
\n",
375 | " \n",
376 | " | 3 | \n",
377 | " 0.208179 | \n",
378 | " 0.515910 | \n",
379 | " 0.451378 | \n",
380 | " 1.000000 | \n",
381 | " 0.763325 | \n",
382 | " -0.057739 | \n",
383 | "
\n",
384 | " \n",
385 | " | 4 | \n",
386 | " 0.761905 | \n",
387 | " 0.112456 | \n",
388 | " -0.042888 | \n",
389 | " 0.763325 | \n",
390 | " 1.000000 | \n",
391 | " 0.039621 | \n",
392 | "
\n",
393 | " \n",
394 | " | 5 | \n",
395 | " 0.277350 | \n",
396 | " 0.218328 | \n",
397 | " 0.297373 | \n",
398 | " -0.057739 | \n",
399 | " 0.039621 | \n",
400 | " 1.000000 | \n",
401 | "
\n",
402 | " \n",
403 | "
\n",
404 | "
"
405 | ],
406 | "text/plain": [
407 | " 0 1 2 3 4 5\n",
408 | "0 1.000000 -0.137446 -0.357398 0.208179 0.761905 0.277350\n",
409 | "1 -0.137446 1.000000 0.453897 0.515910 0.112456 0.218328\n",
410 | "2 -0.357398 0.453897 1.000000 0.451378 -0.042888 0.297373\n",
411 | "3 0.208179 0.515910 0.451378 1.000000 0.763325 -0.057739\n",
412 | "4 0.761905 0.112456 -0.042888 0.763325 1.000000 0.039621\n",
413 | "5 0.277350 0.218328 0.297373 -0.057739 0.039621 1.000000"
414 | ]
415 | },
416 | "execution_count": 7,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "#Pearson correlation similarity matrix\n",
423 | "pd.DataFrame(pearson_sim)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 8,
429 | "metadata": {
430 | "collapsed": true
431 | },
432 | "outputs": [],
433 | "source": [
434 | "#Finds the k nearest neighbours of a user (rows of the ratings matrix) with a brute-force kNN search.\n",
435 | "#The similarities are the same as those obtained via pairwise_distances (similarity = 1 - distance).\n",
436 | "def findksimilarusers(user_id, ratings, metric = metric, k=k):\n",
437 | "    #user_id is 1-based; the query user is normally its own nearest neighbour, so k neighbours\n",
438 | "    #give k-1 other users\n",
439 | "    knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n",
440 | "    knn.fit(ratings)\n",
441 | "\n",
442 | "    query = ratings.iloc[user_id-1, :].values.reshape(1, -1)\n",
443 | "    distances, indices = knn.kneighbors(query, n_neighbors = k)\n",
444 | "    similarities = 1-distances.flatten()\n",
445 | "\n",
446 | "    print('{0} most similar users for User {1}:\\n'.format(k-1,user_id))\n",
447 | "    for rank, neighbour in enumerate(indices.flatten()):\n",
448 | "        if neighbour+1 != user_id:\n",
449 | "            #the query user itself is not listed\n",
450 | "            print('{0}: User {1}, with similarity of {2}'.format(rank, neighbour+1, similarities[rank]))\n",
451 | "\n",
452 | "    return similarities,indices"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 9,
458 | "metadata": {},
459 | "outputs": [
460 | {
461 | "name": "stdout",
462 | "output_type": "stream",
463 | "text": [
464 | "3 most similar users for User 1:\n",
465 | "\n",
466 | "1: User 5, with similarity of 0.973889935402\n",
467 | "2: User 4, with similarity of 0.934621684178\n",
468 | "3: User 6, with similarity of 0.88460045723\n"
469 | ]
470 | }
471 | ],
472 | "source": [
473 | "similarities,indices = findksimilarusers(1,M, metric='cosine')"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 10,
479 | "metadata": {
480 | "scrolled": true
481 | },
482 | "outputs": [
483 | {
484 | "name": "stdout",
485 | "output_type": "stream",
486 | "text": [
487 | "3 most similar users for User 1:\n",
488 | "\n",
489 | "1: User 5, with similarity of 0.761904761905\n",
490 | "2: User 6, with similarity of 0.277350098113\n",
491 | "3: User 4, with similarity of 0.208179450927\n"
492 | ]
493 | }
494 | ],
495 | "source": [
496 | "similarities,indices = findksimilarusers(1,M, metric='correlation')"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 11,
502 | "metadata": {
503 | "collapsed": true
504 | },
505 | "outputs": [],
506 | "source": [
507 | "#Predicts the rating of a user for an item with the user-based approach: the user's mean rating plus the\n",
508 | "#similarity-weighted average of the neighbours' deviations from their own mean ratings.\n",
509 | "def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):\n",
510 | "    #user_id and item_id are 1-based; neighbours are found with the requested metric and k\n",
511 | "    similarities, indices = findksimilarusers(user_id, ratings, metric, k)\n",
512 | "    neighbours = indices.flatten()\n",
513 | "    mean_rating = ratings.iloc[user_id-1,:].mean() #positional lookup, consistent with the neighbour lookups below\n",
514 | "    sum_wt = 0\n",
515 | "    wtd_sum = 0\n",
516 | "\n",
517 | "    for i in range(0, len(neighbours)):\n",
518 | "        if neighbours[i]+1 == user_id:\n",
519 | "            continue #the user is not its own neighbour\n",
520 | "        ratings_diff = ratings.iloc[neighbours[i],item_id-1]-np.mean(ratings.iloc[neighbours[i],:])\n",
521 | "        wtd_sum = wtd_sum + ratings_diff * similarities[i]\n",
522 | "        sum_wt = sum_wt + similarities[i] #only the weights actually used, instead of assuming the user's own similarity is exactly 1\n",
523 | "\n",
524 | "    #fall back to the user's mean rating when the weights cancel out, instead of dividing by zero\n",
525 | "    prediction = int(round(mean_rating + (wtd_sum/sum_wt))) if sum_wt != 0 else int(round(mean_rating))\n",
526 | "    print('\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))\n",
527 | "    return prediction"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 12,
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "name": "stdout",
537 | "output_type": "stream",
538 | "text": [
539 | "3 most similar users for User 3:\n",
540 | "\n",
541 | "1: User 4, with similarity of 0.90951268934\n",
542 | "2: User 2, with similarity of 0.874744414849\n",
543 | "3: User 5, with similarity of 0.86545387815\n",
544 | "\n",
545 | "Predicted rating for user 3 -> item 4: 3\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "predict_userbased(3,4,M);"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "**Item-based Recommendation Systems**"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 13,
563 | "metadata": {
564 | "collapsed": true
565 | },
566 | "outputs": [],
567 | "source": [
568 | "#Finds the k nearest neighbours of an item: the ratings matrix is transposed so that items become the\n",
569 | "#rows searched by a brute-force kNN model; similarity = 1 - distance.\n",
570 | "def findksimilaritems(item_id, ratings, metric=metric, k=k):\n",
571 | "    #item_id is 1-based; the query item is normally its own nearest neighbour, so k neighbours\n",
572 | "    #give k-1 other items\n",
573 | "    item_ratings = ratings.T\n",
574 | "    knn = NearestNeighbors(metric = metric, algorithm = 'brute')\n",
575 | "    knn.fit(item_ratings)\n",
576 | "\n",
577 | "    query = item_ratings.iloc[item_id-1, :].values.reshape(1, -1)\n",
578 | "    distances, indices = knn.kneighbors(query, n_neighbors = k)\n",
579 | "    similarities = 1-distances.flatten()\n",
580 | "\n",
581 | "    print('{0} most similar items for item {1}:\\n'.format(k-1,item_id))\n",
582 | "    for rank, neighbour in enumerate(indices.flatten()):\n",
583 | "        if neighbour+1 == item_id:\n",
584 | "            #the query item itself is not listed\n",
585 | "            continue\n",
586 | "        print('{0}: Item {1} :, with similarity of {2}'.format(rank, neighbour+1, similarities[rank]))\n",
587 | "\n",
588 | "    return similarities,indices"
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": 14,
594 | "metadata": {},
595 | "outputs": [
596 | {
597 | "name": "stdout",
598 | "output_type": "stream",
599 | "text": [
600 | "3 most similar items for item 3:\n",
601 | "\n",
602 | "1: Item 5 :, with similarity of 0.918336125535\n",
603 | "2: Item 6 :, with similarity of 0.874759773038\n",
604 | "3: Item 1 :, with similarity of 0.810364746222\n"
605 | ]
606 | }
607 | ],
608 | "source": [
609 | "similarities,indices=findksimilaritems(3,M)"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 15,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": [
620 | "#Predicts the rating of a user for an item with the item-based approach: the similarity-weighted average\n",
621 | "#of the user's ratings for the items most similar to the target item.\n",
622 | "def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):\n",
623 | "    #user_id and item_id are 1-based; metric and k are forwarded so that the caller's choice is honoured\n",
624 | "    similarities, indices = findksimilaritems(item_id, ratings, metric, k)\n",
625 | "    neighbours = indices.flatten()\n",
626 | "    sum_wt = 0\n",
627 | "    wtd_sum = 0\n",
628 | "\n",
629 | "    for i in range(0, len(neighbours)):\n",
630 | "        if neighbours[i]+1 == item_id:\n",
631 | "            continue #the target item is not its own neighbour\n",
632 | "        wtd_sum = wtd_sum + ratings.iloc[user_id-1,neighbours[i]] * similarities[i]\n",
633 | "        sum_wt = sum_wt + similarities[i] #only the weights actually used\n",
634 | "    prediction = int(round(wtd_sum/sum_wt))\n",
635 | "    print('\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))\n",
636 | "    return prediction"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 16,
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "name": "stdout",
646 | "output_type": "stream",
647 | "text": [
648 | "3 most similar items for item 3:\n",
649 | "\n",
650 | "1: Item 5 :, with similarity of 0.918336125535\n",
651 | "2: Item 6 :, with similarity of 0.874759773038\n",
652 | "3: Item 1 :, with similarity of 0.810364746222\n",
653 | "\n",
654 | "Predicted rating for user 1 -> item 3: 6\n"
655 | ]
656 | }
657 | ],
658 | "source": [
659 | "prediction = predict_itembased(1,3,M)"
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": 17,
665 | "metadata": {},
666 | "outputs": [],
667 | "source": [
668 | "#This function is used to compute adjusted cosine similarity matrix for items\n",
669 | "def computeAdjCosSim(M):\n",
670 | " sim_matrix = np.zeros((M.shape[1], M.shape[1]))\n",
671 | " M_u = M.mean(axis=1) #means\n",
672 | " \n",
673 | " for i in range(M.shape[1]):\n",
674 | " for j in range(M.shape[1]):\n",
675 | " if i == j:\n",
676 | " \n",
677 | " sim_matrix[i][j] = 1\n",
678 | " else: \n",
679 | " if i\n",
727 | "\n",
740 | "\n",
741 | " \n",
742 | " \n",
743 | " | \n",
744 | " 0 | \n",
745 | " 1 | \n",
746 | " 2 | \n",
747 | " 3 | \n",
748 | " 4 | \n",
749 | " 5 | \n",
750 | "
\n",
751 | " \n",
752 | " \n",
753 | " \n",
754 | " | 0 | \n",
755 | " 1.000000 | \n",
756 | " 0.236908 | \n",
757 | " 0.421263 | \n",
758 | " -0.519085 | \n",
759 | " -0.125892 | \n",
760 | " 0.010090 | \n",
761 | "
\n",
762 | " \n",
763 | " | 1 | \n",
764 | " 0.236908 | \n",
765 | " 1.000000 | \n",
766 | " -0.805243 | \n",
767 | " 0.085741 | \n",
768 | " 0.237273 | \n",
769 | " 0.520625 | \n",
770 | "
\n",
771 | " \n",
772 | " | 2 | \n",
773 | " 0.421263 | \n",
774 | " -0.805243 | \n",
775 | " 1.000000 | \n",
776 | " -0.767941 | \n",
777 | " -0.230521 | \n",
778 | " -0.053640 | \n",
779 | "
\n",
780 | " \n",
781 | " | 3 | \n",
782 | " -0.519085 | \n",
783 | " 0.085741 | \n",
784 | " -0.767941 | \n",
785 | " 1.000000 | \n",
786 | " -0.299059 | \n",
787 | " -0.644550 | \n",
788 | "
\n",
789 | " \n",
790 | " | 4 | \n",
791 | " -0.125892 | \n",
792 | " 0.237273 | \n",
793 | " -0.230521 | \n",
794 | " -0.299059 | \n",
795 | " 1.000000 | \n",
796 | " 0.599158 | \n",
797 | "
\n",
798 | " \n",
799 | " | 5 | \n",
800 | " 0.010090 | \n",
801 | " 0.520625 | \n",
802 | " -0.053640 | \n",
803 | " -0.644550 | \n",
804 | " 0.599158 | \n",
805 | " 1.000000 | \n",
806 | "
\n",
807 | " \n",
808 | "
\n",
809 | ""
810 | ],
811 | "text/plain": [
812 | " 0 1 2 3 4 5\n",
813 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n",
814 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n",
815 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n",
816 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n",
817 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n",
818 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000"
819 | ]
820 | },
821 | "execution_count": 19,
822 | "metadata": {},
823 | "output_type": "execute_result"
824 | }
825 | ],
826 | "source": [
827 | "adjcos_sim"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 20,
833 | "metadata": {
834 | "collapsed": true
835 | },
836 | "outputs": [],
837 | "source": [
838 | "#Finds the k most similar items for item_id from the adjusted cosine similarity matrix of the ratings\n",
839 | "#(item_id is 1-based; the item itself is expected to rank first with similarity 1).\n",
840 | "def findksimilaritems_adjcos(item_id, ratings, k=k):\n",
841 | "    sim_matrix = computeAdjCosSim(ratings)\n",
842 | "\n",
843 | "    #sort the item's similarities once and keep the k largest together with their item labels\n",
844 | "    top_k = sim_matrix[item_id-1].sort_values(ascending=False)[:k]\n",
845 | "    similarities = top_k.values\n",
846 | "    indices = top_k.index\n",
847 | "\n",
848 | "    print('{0} most similar items for item {1}:\\n'.format(k-1,item_id))\n",
849 | "    for rank, neighbour in enumerate(indices):\n",
850 | "        if neighbour+1 == item_id:\n",
851 | "            continue #the query item itself is not listed\n",
852 | "        print('{0}: Item {1} :, with similarity of {2}'.format(rank, neighbour+1, similarities[rank]))\n",
853 | "\n",
854 | "    return similarities ,indices"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": 21,
860 | "metadata": {},
861 | "outputs": [
862 | {
863 | "name": "stdout",
864 | "output_type": "stream",
865 | "text": [
866 | "3 most similar items for item 3:\n",
867 | "\n",
868 | "1: Item 1 :, with similarity of 0.421262731871\n",
869 | "2: Item 6 :, with similarity of -0.0536398904889\n",
870 | "3: Item 5 :, with similarity of -0.230521358269\n"
871 | ]
872 | }
873 | ],
874 | "source": [
875 | "similarities, indices = findksimilaritems_adjcos(3,M)"
876 | ]
877 | },
878 | {
879 | "cell_type": "code",
880 | "execution_count": 22,
881 | "metadata": {
882 | "collapsed": true
883 | },
884 | "outputs": [],
885 | "source": [
886 | "#Predicts the rating for the specified user-item combination with the adjusted cosine item-based approach.\n",
887 | "#As the adjusted cosine similarities range from -1 to +1, the predicted rating can fall outside the 1-10 scale.\n",
888 | "#Hack to deal with this: the prediction is clamped to the minimum (1) or maximum (10) rating.\n",
889 | "def predict_itembased_adjcos(user_id, item_id, ratings):\n",
890 | "    #user_id and item_id are 1-based\n",
891 | "    min_rating, max_rating = 1, 10\n",
892 | "\n",
893 | "    similarities, indices = findksimilaritems_adjcos(item_id, ratings) #similar items based on adjusted cosine\n",
894 | "    sum_wt = np.sum(similarities)-1 #drop the weight of the item itself (similarity 1)\n",
895 | "\n",
896 | "    wtd_sum = 0\n",
897 | "    for i in range(0, len(indices)):\n",
898 | "        if indices[i]+1 == item_id:\n",
899 | "            continue\n",
900 | "        wtd_sum = wtd_sum + ratings.iloc[user_id-1,indices[i]] * similarities[i]\n",
901 | "\n",
902 | "    prediction = int(round(wtd_sum/sum_wt))\n",
903 | "    #a prediction of 0 is also below the scale (0 marks an unrated item), so clamp everything under the minimum\n",
904 | "    if prediction < min_rating:\n",
905 | "        prediction = min_rating\n",
906 | "    elif prediction > max_rating:\n",
907 | "        prediction = max_rating\n",
908 | "    print('\\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))\n",
909 | "\n",
910 | "    return prediction"
911 | ]
912 | },
913 | {
914 | "cell_type": "code",
915 | "execution_count": 23,
916 | "metadata": {},
917 | "outputs": [
918 | {
919 | "name": "stdout",
920 | "output_type": "stream",
921 | "text": [
922 | "3 most similar items for item 4:\n",
923 | "\n",
924 | "1: Item 2 :, with similarity of 0.0857414341149\n",
925 | "2: Item 5 :, with similarity of -0.29905882779\n",
926 | "3: Item 1 :, with similarity of -0.519085268895\n",
927 | "\n",
928 | "Predicted rating for user 3 -> item 4: 8\n"
929 | ]
930 | }
931 | ],
932 | "source": [
933 | "prediction=predict_itembased_adjcos(3,4,M)"
934 | ]
935 | },
936 | {
937 | "cell_type": "code",
938 | "execution_count": 24,
939 | "metadata": {},
940 | "outputs": [
941 | {
942 | "data": {
943 | "text/html": [
944 | "\n",
945 | "\n",
958 | "
\n",
959 | " \n",
960 | " \n",
961 | " | \n",
962 | " 0 | \n",
963 | " 1 | \n",
964 | " 2 | \n",
965 | " 3 | \n",
966 | " 4 | \n",
967 | " 5 | \n",
968 | "
\n",
969 | " \n",
970 | " \n",
971 | " \n",
972 | " | 0 | \n",
973 | " 1.000000 | \n",
974 | " 0.236908 | \n",
975 | " 0.421263 | \n",
976 | " -0.519085 | \n",
977 | " -0.125892 | \n",
978 | " 0.010090 | \n",
979 | "
\n",
980 | " \n",
981 | " | 1 | \n",
982 | " 0.236908 | \n",
983 | " 1.000000 | \n",
984 | " -0.805243 | \n",
985 | " 0.085741 | \n",
986 | " 0.237273 | \n",
987 | " 0.520625 | \n",
988 | "
\n",
989 | " \n",
990 | " | 2 | \n",
991 | " 0.421263 | \n",
992 | " -0.805243 | \n",
993 | " 1.000000 | \n",
994 | " -0.767941 | \n",
995 | " -0.230521 | \n",
996 | " -0.053640 | \n",
997 | "
\n",
998 | " \n",
999 | " | 3 | \n",
1000 | " -0.519085 | \n",
1001 | " 0.085741 | \n",
1002 | " -0.767941 | \n",
1003 | " 1.000000 | \n",
1004 | " -0.299059 | \n",
1005 | " -0.644550 | \n",
1006 | "
\n",
1007 | " \n",
1008 | " | 4 | \n",
1009 | " -0.125892 | \n",
1010 | " 0.237273 | \n",
1011 | " -0.230521 | \n",
1012 | " -0.299059 | \n",
1013 | " 1.000000 | \n",
1014 | " 0.599158 | \n",
1015 | "
\n",
1016 | " \n",
1017 | " | 5 | \n",
1018 | " 0.010090 | \n",
1019 | " 0.520625 | \n",
1020 | " -0.053640 | \n",
1021 | " -0.644550 | \n",
1022 | " 0.599158 | \n",
1023 | " 1.000000 | \n",
1024 | "
\n",
1025 | " \n",
1026 | "
\n",
1027 | "
"
1028 | ],
1029 | "text/plain": [
1030 | " 0 1 2 3 4 5\n",
1031 | "0 1.000000 0.236908 0.421263 -0.519085 -0.125892 0.010090\n",
1032 | "1 0.236908 1.000000 -0.805243 0.085741 0.237273 0.520625\n",
1033 | "2 0.421263 -0.805243 1.000000 -0.767941 -0.230521 -0.053640\n",
1034 | "3 -0.519085 0.085741 -0.767941 1.000000 -0.299059 -0.644550\n",
1035 | "4 -0.125892 0.237273 -0.230521 -0.299059 1.000000 0.599158\n",
1036 | "5 0.010090 0.520625 -0.053640 -0.644550 0.599158 1.000000"
1037 | ]
1038 | },
1039 | "execution_count": 24,
1040 | "metadata": {},
1041 | "output_type": "execute_result"
1042 | }
1043 | ],
1044 | "source": [
1045 | "adjcos_sim"
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "code",
1050 | "execution_count": 25,
1051 | "metadata": {
1052 | "collapsed": true
1053 | },
1054 | "outputs": [],
1055 | "source": [
1056 | "#Recommends an item for a user with the approach selected in a dropdown. A recommendation is made if the predicted\n",
1057 | "#rating for the item is greater than or equal to 6, and the item has not been rated already (rating 0 = unrated)\n",
1058 | "def recommendItem(user_id, item_id, ratings):\n",
1059 | "    n_users = ratings.shape[0]\n",
1060 | "    #check the type first so that a non-integer id is rejected before it is compared with numbers\n",
1061 | "    if type(user_id) is not int or user_id<1 or user_id>n_users:\n",
1062 | "        print('Userid does not exist. Enter numbers from 1-{0}'.format(n_users))\n",
1063 | "        return\n",
1064 | "\n",
1065 | "    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)',\n",
1066 | "           'Item-based CF (adjusted cosine)']\n",
1067 | "    approach = widgets.Dropdown(options=ids, value=ids[0],\n",
1068 | "                                description='Select Approach', width='500px')\n",
1069 | "\n",
1070 | "    def on_change(change):\n",
1071 | "        #called only when the selected value changes (see the observe call below)\n",
1072 | "        clear_output(wait=True)\n",
1073 | "        if approach.value == 'User-based CF (cosine)':\n",
1074 | "            prediction = predict_userbased(user_id, item_id, ratings, 'cosine')\n",
1075 | "        elif approach.value == 'User-based CF (correlation)':\n",
1076 | "            prediction = predict_userbased(user_id, item_id, ratings, 'correlation')\n",
1077 | "        elif approach.value == 'Item-based CF (cosine)':\n",
1078 | "            prediction = predict_itembased(user_id, item_id, ratings)\n",
1079 | "        else:\n",
1080 | "            prediction = predict_itembased_adjcos(user_id,item_id,ratings)\n",
1081 | "\n",
1082 | "        #ratings[item_id-1] is the item's column, [user_id-1] the user's row\n",
1083 | "        if ratings[item_id-1][user_id-1] != 0:\n",
1084 | "            print('Item already rated')\n",
1085 | "        elif prediction>=6:\n",
1086 | "            print('\\nItem recommended')\n",
1087 | "        else:\n",
1088 | "            print('Item not recommended')\n",
1089 | "\n",
1090 | "    #names='value' restricts the callback to changes of the selection; without it the handler also runs for\n",
1091 | "    #the dropdown's other traits, clearing the output each time\n",
1092 | "    approach.observe(on_change, names='value')\n",
1093 | "    display(approach)"
1094 | ]
1095 | },
1096 | {
1097 | "cell_type": "code",
1098 | "execution_count": 26,
1099 | "metadata": {},
1100 | "outputs": [
1101 | {
1102 | "name": "stdout",
1103 | "output_type": "stream",
1104 | "text": [
1105 | "Userid does not exist. Enter numbers from 1-6\n"
1106 | ]
1107 | }
1108 | ],
1109 | "source": [
1110 | "#check for incorrect entries\n",
1111 | "recommendItem(-1,3,M)"
1112 | ]
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "execution_count": 27,
1117 | "metadata": {},
1118 | "outputs": [
1119 | {
1120 | "name": "stdout",
1121 | "output_type": "stream",
1122 | "text": [
1123 | "3 most similar users for User 3:\n",
1124 | "\n",
1125 | "1: User 4, with similarity of 0.90951268934\n",
1126 | "2: User 2, with similarity of 0.874744414849\n",
1127 | "3: User 5, with similarity of 0.86545387815\n",
1128 | "\n",
1129 | "Predicted rating for user 3 -> item 4: 3\n",
1130 | "Item not recommended\n"
1131 | ]
1132 | }
1133 | ],
1134 | "source": [
1135 | "recommendItem(3,4,M)"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "code",
1140 | "execution_count": 28,
1141 | "metadata": {},
1142 | "outputs": [
1143 | {
1144 | "name": "stdout",
1145 | "output_type": "stream",
1146 | "text": [
1147 | "3 most similar users for User 3:\n",
1148 | "\n",
1149 | "1: User 2, with similarity of 0.453897185842\n",
1150 | "2: User 4, with similarity of 0.451378005098\n",
1151 | "3: User 6, with similarity of 0.297373304825\n",
1152 | "\n",
1153 | "Predicted rating for user 3 -> item 4: 3\n",
1154 | "Item not recommended\n"
1155 | ]
1156 | }
1157 | ],
1158 | "source": [
1159 | "recommendItem(3,4,M)"
1160 | ]
1161 | },
1162 | {
1163 | "cell_type": "code",
1164 | "execution_count": 29,
1165 | "metadata": {},
1166 | "outputs": [
1167 | {
1168 | "name": "stdout",
1169 | "output_type": "stream",
1170 | "text": [
1171 | "3 most similar items for item 4:\n",
1172 | "\n",
1173 | "1: Item 6 :, with similarity of 0.89977997614\n",
1174 | "2: Item 2 :, with similarity of 0.887160079571\n",
1175 | "3: Item 5 :, with similarity of 0.88180009273\n",
1176 | "\n",
1177 | "Predicted rating for user 3 -> item 4: 6\n",
1178 | "\n",
1179 | "Item recommended\n"
1180 | ]
1181 | }
1182 | ],
1183 | "source": [
1184 | "recommendItem(3,4,M)"
1185 | ]
1186 | },
1187 | {
1188 | "cell_type": "code",
1189 | "execution_count": 30,
1190 | "metadata": {},
1191 | "outputs": [
1192 | {
1193 | "name": "stdout",
1194 | "output_type": "stream",
1195 | "text": [
1196 | "3 most similar items for item 4:\n",
1197 | "\n",
1198 | "1: Item 2 :, with similarity of 0.0857414341149\n",
1199 | "2: Item 5 :, with similarity of -0.29905882779\n",
1200 | "3: Item 1 :, with similarity of -0.519085268895\n",
1201 | "\n",
1202 | "Predicted rating for user 3 -> item 4: 8\n",
1203 | "\n",
1204 | "Item recommended\n"
1205 | ]
1206 | }
1207 | ],
1208 | "source": [
1209 | "recommendItem(3,4,M)"
1210 | ]
1211 | },
1212 | {
1213 | "cell_type": "code",
1214 | "execution_count": 31,
1215 | "metadata": {},
1216 | "outputs": [
1217 | {
1218 | "name": "stdout",
1219 | "output_type": "stream",
1220 | "text": [
1221 | "3 most similar users for User 2:\n",
1222 | "\n",
1223 | "1: User 4, with similarity of 0.515910067398\n",
1224 | "2: User 3, with similarity of 0.453897185842\n",
1225 | "3: User 6, with similarity of 0.218327934565\n",
1226 | "\n",
1227 | "Predicted rating for user 2 -> item 1: 5\n",
1228 | "Item already rated\n"
1229 | ]
1230 | }
1231 | ],
1232 | "source": [
1233 | "#if the item is already rated, it is not recommended\n",
1234 | "recommendItem(2,1,M)"
1235 | ]
1236 | },
1237 | {
1238 | "cell_type": "code",
1239 | "execution_count": 35,
1240 | "metadata": {},
1241 | "outputs": [],
1242 | "source": [
1243 | "#This is a quick way to temporarily suppress stdout in particular code section\n",
1244 | "@contextmanager\n",
1245 | "def suppress_stdout():\n",
1246 | " with open(os.devnull, \"w\") as devnull:\n",
1247 | " old_stdout = sys.stdout\n",
1248 | " sys.stdout = devnull\n",
1249 | " try: \n",
1250 | " yield\n",
1251 | " finally:\n",
1252 | " sys.stdout = old_stdout"
1253 | ]
1254 | },
1255 | {
1256 | "cell_type": "code",
1257 | "execution_count": 45,
1258 | "metadata": {},
1259 | "outputs": [],
1260 | "source": [
1261 | "#Evaluates the selected recommendation approach: every user-item rating is predicted and compared with the ratings\n",
1262 | "#matrix, and the RMSE is printed. suppress_stdout hides the print output of the prediction functions so that only\n",
1263 | "#the RMSE value is shown\n",
1264 | "def evaluateRS(ratings):\n",
1265 | "    ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']\n",
1266 | "    approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')\n",
1267 | "    n_users = ratings.shape[0]\n",
1268 | "    n_items = ratings.shape[1]\n",
1269 | "\n",
1270 | "    def predict(user_id, item_id):\n",
1271 | "        #dispatch to the predictor that matches the selected approach\n",
1272 | "        if approach.value == 'User-based CF (cosine)':\n",
1273 | "            return predict_userbased(user_id, item_id, ratings, 'cosine')\n",
1274 | "        elif approach.value == 'User-based CF (correlation)':\n",
1275 | "            return predict_userbased(user_id, item_id, ratings, 'correlation')\n",
1276 | "        elif approach.value == 'Item-based CF (cosine)':\n",
1277 | "            return predict_itembased(user_id, item_id, ratings)\n",
1278 | "        else:\n",
1279 | "            #remaining option: 'Item-based CF (adjusted cosine)'\n",
1280 | "            return predict_itembased_adjcos(user_id, item_id, ratings)\n",
1281 | "\n",
1282 | "    def on_change(change):\n",
1283 | "        clear_output(wait=True)\n",
1284 | "        #rows are users and columns are items, like the ratings matrix\n",
1285 | "        prediction = np.zeros((n_users, n_items))\n",
1286 | "        with suppress_stdout():\n",
1287 | "            for i in range(n_users):\n",
1288 | "                for j in range(n_items):\n",
1289 | "                    prediction[i, j] = predict(i+1, j+1)\n",
1290 | "\n",
1291 | "        #computed and printed outside the with block so that the result is actually shown\n",
1292 | "        MSE = mean_squared_error(ratings, prediction)\n",
1293 | "        RMSE = round(sqrt(MSE),3)\n",
1294 | "        print(\"RMSE using {0} approach is: {1}\".format(approach.value,RMSE))\n",
1295 | "\n",
1296 | "    #names='value' restricts the callback to changes of the selection, so the full evaluation is not repeated\n",
1297 | "    #for the dropdown's other traits\n",
1298 | "    approach.observe(on_change, names='value')\n",
1299 | "    display(approach)"
1300 | ]
1301 | },
1302 | {
1303 | "cell_type": "code",
1304 | "execution_count": 46,
1305 | "metadata": {},
1306 | "outputs": [
1307 | {
1308 | "name": "stdout",
1309 | "output_type": "stream",
1310 | "text": [
1311 | "RMSE using User-based CF (cosine) approach is: 2.667\n"
1312 | ]
1313 | }
1314 | ],
1315 | "source": [
1316 | "evaluateRS(M)"
1317 | ]
1318 | },
1319 | {
1320 | "cell_type": "code",
1321 | "execution_count": 47,
1322 | "metadata": {},
1323 | "outputs": [
1324 | {
1325 | "name": "stdout",
1326 | "output_type": "stream",
1327 | "text": [
1328 | "RMSE using User-based CF (correlation) approach is: 2.764\n"
1329 | ]
1330 | }
1331 | ],
1332 | "source": [
1333 | "evaluateRS(M)"
1334 | ]
1335 | },
1336 | {
1337 | "cell_type": "markdown",
1338 | "metadata": {},
1339 | "source": [
1340 | "**Thanks for reading this notebook**"
1341 | ]
1342 | }
1343 | ],
1344 | "metadata": {
1345 | "kernelspec": {
1346 | "display_name": "Python 2",
1347 | "language": "python",
1348 | "name": "python2"
1349 | },
1350 | "language_info": {
1351 | "codemirror_mode": {
1352 | "name": "ipython",
1353 | "version": 2
1354 | },
1355 | "file_extension": ".py",
1356 | "mimetype": "text/x-python",
1357 | "name": "python",
1358 | "nbconvert_exporter": "python",
1359 | "pygments_lexer": "ipython2",
1360 | "version": "2.7.13"
1361 | }
1362 | },
1363 | "nbformat": 4,
1364 | "nbformat_minor": 2
1365 | }
1366 |
--------------------------------------------------------------------------------