├── .gitignore
├── 11_1-3_to_11_1-5.mw
├── Chapter 11 - Dimensionality Reduction.ipynb
├── Chapter 7 - Clustering.ipynb
├── Chapter 9 - Recommendation Systems.ipynb
├── Exercises 6.1.1 and 6.1.3 and their related problems (from Ch.6 Frequent Itemsets).ipynb
└── hiearchical_clustering_and_heaps.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .ipynb_checkpoints
3 |
--------------------------------------------------------------------------------
/Chapter 11 - Dimensionality Reduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Contents\n",
8 | "- Exercise 11.1.1\n",
9 | "- Exercise 11.1.2\n",
10 | "- Exercise 11.1.6\n",
11 | "- Exercise 11.1.7\n",
12 | "- Exercise 11.2.1\n",
13 | "- Exercise 11.3.1\n",
14 | "- Exercise 11.3.2"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import numpy as np\n",
26 | "from __future__ import division"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "# 11.1.1"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 10,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [],
43 | "source": [
44 | "# unit vector in same direction as [1,2,3]\n",
45 | "a = np.arange(1,4)\n",
46 | "a_unit = a/np.sqrt(np.dot(a,a))"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 11,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "[1 2 3]\n",
61 | "[ 0.26726124 0.53452248 0.80178373]\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "print a\n",
67 | "print a_unit"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "\n",
75 | "# 11.1.2"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "Using power-iteration method to find the second eigenpair of M"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 325,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/plain": [
95 | "matrix([[3, 2],\n",
96 | " [2, 6]])"
97 | ]
98 | },
99 | "execution_count": 325,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "M = np.array([3,2,2,6]).reshape(2,2)\n",
106 | "M = np.asmatrix(M)\n",
107 | "M"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 326,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/plain": [
120 | "array([[ 2.601, -0.797],\n",
121 | " [-0.797, 0.413]])"
122 | ]
123 | },
124 | "execution_count": 326,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "Ms = np.array([2.601,-0.797,-0.797,0.413]).reshape(2,2)\n",
131 | "Ms"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 327,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "def Frobenius(x):\n",
143 | " assert type(x) == np.matrix\n",
144 | " foo = x.A # converts x to a np.array so we can use ufunc\n",
145 | " return np.sqrt(np.sum(foo**2))"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 328,
151 | "metadata": {
152 | "collapsed": true
153 | },
154 | "outputs": [],
155 | "source": [
156 | "def pow(M,x0=None, thres = 0.00001, pr = False):\n",
157 | " if x0 == None:\n",
158 | " x0 = np.ones(len(M))\n",
159 | " oldx = np.mat(x0).T\n",
160 | " newx = np.mat(np.zeros(len(M))).T\n",
161 | " converge = False\n",
162 | " while not converge:\n",
163 | " newx = np.dot(M,oldx)/Frobenius(np.dot(M,oldx))\n",
164 | " if pr == True:\n",
165 | " print newx\n",
166 | " if Frobenius(oldx-newx) < thres: # threshold of convergence\n",
167 | " converge = True\n",
168 | " oldx = newx\n",
169 | " \n",
170 | " # find eigenvector\n",
171 | " lam = ((newx.T*M*newx).A1)[0] # A1 attribute returns a flattened array\n",
172 | " return lam,newx"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 329,
178 | "metadata": {
179 | "collapsed": false
180 | },
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "6.99999999999 [[ 0.44721468]\n",
187 | " [ 0.89442665]]\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "lam1, x1 = pow(M)\n",
193 | "print lam1, x1"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 330,
199 | "metadata": {
200 | "collapsed": false
201 | },
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "[7, 0.44721359549995793, 0.89442719099991586]"
207 | ]
208 | },
209 | "execution_count": 330,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "# exact principle eigenpair\n",
216 | "[7,1/np.sqrt(5),2/np.sqrt(5)]"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 331,
222 | "metadata": {
223 | "collapsed": false
224 | },
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "matrix([[ 1.59999323, -0.80000508],\n",
230 | " [-0.80000508, 0.40000677]])"
231 | ]
232 | },
233 | "execution_count": 331,
234 | "metadata": {},
235 | "output_type": "execute_result"
236 | }
237 | ],
238 | "source": [
239 | "# second eigenvector, use poweriteration on Ms = M - lambda_1 xx.T\n",
240 | "Ms = M - lam1*(x1*x1.T)\n",
241 | "Ms"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 332,
247 | "metadata": {
248 | "collapsed": false
249 | },
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "2.00000000003 [[ 0.8944253 ]\n",
256 | " [-0.44721738]]\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "lam2, x2 = pow(Ms)\n",
262 | "print lam2, x2"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 333,
268 | "metadata": {
269 | "collapsed": false
270 | },
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "[2, 0.89442719099991586, -0.44721359549995793]"
276 | ]
277 | },
278 | "execution_count": 333,
279 | "metadata": {},
280 | "output_type": "execute_result"
281 | }
282 | ],
283 | "source": [
284 | "# exact second eigenpair\n",
285 | "[2,2/np.sqrt(5), -1/np.sqrt(5)]"
286 | ]
287 | },
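{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an extra sanity check (not part of the power-iteration recipe above), numpy's symmetric eigensolver returns both eigenpairs of M at once; eigenvalues come back in ascending order, and the eigenvectors may differ from ours by a sign."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hedged check with numpy's eigensolver for symmetric matrices\n",
"vals, vecs = np.linalg.eigh(M)\n",
"print vals   # expect [2, 7]\n",
"print vecs   # columns are the unit eigenvectors, up to sign"
]
},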
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "\n",
293 | "# 11.1.6"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "(a)-(b)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 334,
306 | "metadata": {
307 | "collapsed": false
308 | },
309 | "outputs": [
310 | {
311 | "data": {
312 | "text/plain": [
313 | "matrix([[1, 1, 1],\n",
314 | " [1, 2, 3],\n",
315 | " [1, 3, 5]])"
316 | ]
317 | },
318 | "execution_count": 334,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "M = np.matrix([[1,1,1],[1,2,3],[1,3,5]])\n",
325 | "M"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 335,
331 | "metadata": {
332 | "collapsed": false
333 | },
334 | "outputs": [
335 | {
336 | "name": "stdout",
337 | "output_type": "stream",
338 | "text": [
339 | "7.16227766016 [[ 0.21848282]\n",
340 | " [ 0.52160927]\n",
341 | " [ 0.82473573]]\n"
342 | ]
343 | }
344 | ],
345 | "source": [
346 | "# using pow function defined above\n",
347 | "lam1, x1 = pow(M)\n",
348 | "print lam1, x1"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 336,
354 | "metadata": {
355 | "collapsed": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "x1_exact = np.array([(1/5)*(5+np.sqrt(10))/(3+np.sqrt(10)),np.sqrt(10)/5,1])\n",
360 | "x1_exact = x1_exact/(np.sqrt(sum(x1_exact**2)))"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 337,
366 | "metadata": {
367 | "collapsed": false
368 | },
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/plain": [
373 | "[7.16227766016838, array([ 0.21848175, 0.52160897, 0.8247362 ])]"
374 | ]
375 | },
376 | "execution_count": 337,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "# exact principal eigenpair\n",
383 | "[4+np.sqrt(10), x1_exact]"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "(c) Removing the influence of the principal eigenvector from M"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": 338,
396 | "metadata": {
397 | "collapsed": false
398 | },
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/plain": [
403 | "matrix([[ 0.65811051, 0.18376774, -0.29057503],\n",
404 | " [ 0.18376774, 0.05131446, -0.08113883],\n",
405 | " [-0.29057503, -0.08113883, 0.12829737]])"
406 | ]
407 | },
408 | "execution_count": 338,
409 | "metadata": {},
410 | "output_type": "execute_result"
411 | }
412 | ],
413 | "source": [
414 | "# second eigenvector, use poweriteration on Ms = M - lambda_1 xx.T\n",
415 | "Ms = M - lam1*(x1*x1.T)\n",
416 | "Ms"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "(d) Finding the second eigenpair of M using Ms"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 339,
429 | "metadata": {
430 | "collapsed": false
431 | },
432 | "outputs": [
433 | {
434 | "name": "stdout",
435 | "output_type": "stream",
436 | "text": [
437 | "0.837722339911 [[ 0.88633799]\n",
438 | " [ 0.24749693]\n",
439 | " [-0.39134414]]\n"
440 | ]
441 | }
442 | ],
443 | "source": [
444 | "lam2, x2 = pow(Ms)\n",
445 | "print lam2, x2"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 340,
451 | "metadata": {
452 | "collapsed": true
453 | },
454 | "outputs": [],
455 | "source": [
456 | "x2_exact = np.array([(1/5)*(5-np.sqrt(10))/(3-np.sqrt(10)),-np.sqrt(10)/5,1])\n",
457 | "x2_exact = x2_exact/(np.sqrt(sum(x2_exact**2)))"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 341,
463 | "metadata": {
464 | "collapsed": false
465 | },
466 | "outputs": [
467 | {
468 | "data": {
469 | "text/plain": [
470 | "[0.83772233983162048, array([-0.88634026, -0.24750235, 0.39133557])]"
471 | ]
472 | },
473 | "execution_count": 341,
474 | "metadata": {},
475 | "output_type": "execute_result"
476 | }
477 | ],
478 | "source": [
479 | "# exact second eigenpair\n",
480 | "[4-np.sqrt(10),x2_exact]"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "(e) Repeating the above steps to find third eigenpair"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": 345,
493 | "metadata": {
494 | "collapsed": false
495 | },
496 | "outputs": [
497 | {
498 | "data": {
499 | "text/plain": [
500 | "matrix([[ -3.36408679e-12, -8.03135336e-12, -1.26988975e-11],\n",
501 | " [ -8.03135336e-12, -1.91732394e-11, -3.03155834e-11],\n",
502 | " [ -1.26988975e-11, -3.03155834e-11, -4.79333517e-11]])"
503 | ]
504 | },
505 | "execution_count": 345,
506 | "metadata": {},
507 | "output_type": "execute_result"
508 | }
509 | ],
510 | "source": [
511 | "Mss = Ms - lam2*(x2*x2.T)\n",
512 | "Mss"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 346,
518 | "metadata": {
519 | "collapsed": false
520 | },
521 | "outputs": [
522 | {
523 | "name": "stdout",
524 | "output_type": "stream",
525 | "text": [
526 | "[[-0.21849284]\n",
527 | " [-0.52160581]\n",
528 | " [-0.82473527]]\n",
529 | "[[ 0.21849413]\n",
530 | " [ 0.52160601]\n",
531 | " [ 0.8247348 ]]\n",
532 | "-7.04708085388e-11 [[ 0.21849413]\n",
533 | " [ 0.52160601]\n",
534 | " [ 0.8247348 ]]\n"
535 | ]
536 | }
537 | ],
538 | "source": [
539 | "lam3, x3 = pow(Mss, thres=2, pr = True)\n",
540 | "print lam3, x3"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | "The power iteration method is getting stuck at two vectors. Might need to increase amount of digits used for the calculation."
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 354,
553 | "metadata": {
554 | "collapsed": false
555 | },
556 | "outputs": [
557 | {
558 | "data": {
559 | "text/plain": [
560 | "matrix([[-0.21849413],\n",
561 | " [-0.52160601],\n",
562 | " [-0.8247348 ]])"
563 | ]
564 | },
565 | "execution_count": 354,
566 | "metadata": {},
567 | "output_type": "execute_result"
568 | }
569 | ],
570 | "source": [
571 | "# power iteration method is stuck, oscillating between these two vectors\n",
572 | "Mss*x3/Frobenius(Mss*x3)"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 349,
578 | "metadata": {
579 | "collapsed": true
580 | },
581 | "outputs": [],
582 | "source": [
583 | "x3_exact = np.array([1,-2,1])\n",
584 | "x3_exact = x3_exact/np.sqrt(sum(x3_exact**2))"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 351,
590 | "metadata": {
591 | "collapsed": false
592 | },
593 | "outputs": [
594 | {
595 | "data": {
596 | "text/plain": [
597 | "array([ 0.40824829, -0.81649658, 0.40824829])"
598 | ]
599 | },
600 | "execution_count": 351,
601 | "metadata": {},
602 | "output_type": "execute_result"
603 | }
604 | ],
605 | "source": [
606 | "x3_exact"
607 | ]
608 | },
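{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the power iteration oscillates on the (numerically zero) matrix Mss, a direct eigendecomposition of M is an easy cross-check (a sketch, not the method the book asks for): the smallest eigenvalue should be 0 with eigenvector proportional to [1,-2,1]."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hedged check: exact third eigenpair from numpy's symmetric eigensolver\n",
"vals, vecs = np.linalg.eigh(M)\n",
"print vals[0]      # expect approximately 0\n",
"print vecs[:, 0]   # expect +/- [1,-2,1]/sqrt(6)"
]
},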
609 | {
610 | "cell_type": "markdown",
611 | "metadata": {},
612 | "source": [
613 | "\n",
614 | "# 11.1.7"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": 355,
620 | "metadata": {
621 | "collapsed": false
622 | },
623 | "outputs": [
624 | {
625 | "data": {
626 | "text/plain": [
627 | "matrix([[1, 1, 1],\n",
628 | " [1, 2, 3],\n",
629 | " [1, 3, 6]])"
630 | ]
631 | },
632 | "execution_count": 355,
633 | "metadata": {},
634 | "output_type": "execute_result"
635 | }
636 | ],
637 | "source": [
638 | "M = np.matrix([[1,1,1],[1,2,3],[1,3,6]])\n",
639 | "M"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 357,
645 | "metadata": {
646 | "collapsed": false
647 | },
648 | "outputs": [
649 | {
650 | "name": "stdout",
651 | "output_type": "stream",
652 | "text": [
653 | "7.87298334621 [[ 0.19382289]\n",
654 | " [ 0.4722474 ]\n",
655 | " [ 0.85989248]]\n"
656 | ]
657 | }
658 | ],
659 | "source": [
660 | "# first eigenpair\n",
661 | "lam1, x1 = pow(M)\n",
662 | "print lam1, x1"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 368,
668 | "metadata": {
669 | "collapsed": false
670 | },
671 | "outputs": [
672 | {
673 | "data": {
674 | "text/plain": [
675 | "array([ 0.19382266, 0.47224729, 0.8598926 ])"
676 | ]
677 | },
678 | "execution_count": 368,
679 | "metadata": {},
680 | "output_type": "execute_result"
681 | }
682 | ],
683 | "source": [
684 | "x1_exact = np.array([(6/5)*(5+np.sqrt(15))/(3+np.sqrt(15))**2,\n",
685 | " (1/5)*(15+np.sqrt(15))/(3+np.sqrt(15)),1])\n",
686 | "x1_exact = x1_exact/np.sqrt(sum(x1_exact**2))\n",
687 | "x1_exact"
688 | ]
689 | },
690 | {
691 | "cell_type": "code",
692 | "execution_count": 360,
693 | "metadata": {
694 | "collapsed": false
695 | },
696 | "outputs": [
697 | {
698 | "data": {
699 | "text/plain": [
700 | "matrix([[ 0.70423318, 0.27936729, -0.31216529],\n",
701 | " [ 0.27936729, 0.24418608, -0.19707675],\n",
702 | " [-0.31216529, -0.19707675, 0.1785974 ]])"
703 | ]
704 | },
705 | "execution_count": 360,
706 | "metadata": {},
707 | "output_type": "execute_result"
708 | }
709 | ],
710 | "source": [
711 | "Ms = M - lam1*(x1*x1.T)\n",
712 | "Ms"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 362,
718 | "metadata": {
719 | "collapsed": false
720 | },
721 | "outputs": [
722 | {
723 | "name": "stdout",
724 | "output_type": "stream",
725 | "text": [
726 | "1.0 [[ 0.81649634]\n",
727 | " [ 0.40824695]\n",
728 | " [-0.40825011]]\n"
729 | ]
730 | }
731 | ],
732 | "source": [
733 | "# second eigenpair\n",
734 | "lam2, x2 = pow(Ms)\n",
735 | "print lam2, x2"
736 | ]
737 | },
738 | {
739 | "cell_type": "code",
740 | "execution_count": 369,
741 | "metadata": {
742 | "collapsed": false
743 | },
744 | "outputs": [
745 | {
746 | "data": {
747 | "text/plain": [
748 | "array([-0.81649658, -0.40824829, 0.40824829])"
749 | ]
750 | },
751 | "execution_count": 369,
752 | "metadata": {},
753 | "output_type": "execute_result"
754 | }
755 | ],
756 | "source": [
757 | "x2_exact = np.array([-2,-1,1])\n",
758 | "x2_exact = x2_exact/np.sqrt(sum(x2_exact**2))\n",
759 | "x2_exact"
760 | ]
761 | },
762 | {
763 | "cell_type": "code",
764 | "execution_count": 364,
765 | "metadata": {
766 | "collapsed": false
767 | },
768 | "outputs": [
769 | {
770 | "data": {
771 | "text/plain": [
772 | "matrix([[ 0.03756691, -0.05396485, 0.02116943],\n",
773 | " [-0.05396485, 0.0775205 , -0.03040989],\n",
774 | " [ 0.02116943, -0.03040989, 0.01192925]])"
775 | ]
776 | },
777 | "execution_count": 364,
778 | "metadata": {},
779 | "output_type": "execute_result"
780 | }
781 | ],
782 | "source": [
783 | "Mss = Ms - lam2*(x2*x2.T)\n",
784 | "Mss"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 365,
790 | "metadata": {
791 | "collapsed": false
792 | },
793 | "outputs": [
794 | {
795 | "name": "stdout",
796 | "output_type": "stream",
797 | "text": [
798 | "0.127016653793 [[ 0.54384155]\n",
799 | " [-0.78122828]\n",
800 | " [ 0.30646167]]\n"
801 | ]
802 | }
803 | ],
804 | "source": [
805 | "# third eigenpair\n",
806 | "lam3, x3 = pow(Mss)\n",
807 | "print lam3, x3"
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": 370,
813 | "metadata": {
814 | "collapsed": false
815 | },
816 | "outputs": [
817 | {
818 | "data": {
819 | "text/plain": [
820 | "array([ 0.54384383, -0.78122713, 0.30646053])"
821 | ]
822 | },
823 | "execution_count": 370,
824 | "metadata": {},
825 | "output_type": "execute_result"
826 | }
827 | ],
828 | "source": [
829 | "x3_exact = np.array([(6/5)*(5-np.sqrt(15))/(3-np.sqrt(15))**2,\n",
830 | " (1/5)*(15-np.sqrt(15))/(3-np.sqrt(15)),1])\n",
831 | "x3_exact = x3_exact/np.sqrt(sum(x3_exact**2))\n",
832 | "x3_exact"
833 | ]
834 | },
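{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick cross-check (not required by the exercise): np.linalg.eigvalsh on this M should return the three eigenvalues 4-sqrt(15), 1 and 4+sqrt(15) in ascending order."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hedged check of all three eigenvalues at once\n",
"print np.linalg.eigvalsh(M)   # expect approx [0.127, 1.0, 7.873]"
]
},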
835 | {
836 | "cell_type": "markdown",
837 | "metadata": {},
838 | "source": [
839 | "\n",
840 | "# 11.2.1"
841 | ]
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": 373,
846 | "metadata": {
847 | "collapsed": false
848 | },
849 | "outputs": [
850 | {
851 | "data": {
852 | "text/plain": [
853 | "matrix([[ 1, 1],\n",
854 | " [ 2, 4],\n",
855 | " [ 3, 9],\n",
856 | " [ 4, 16]])"
857 | ]
858 | },
859 | "execution_count": 373,
860 | "metadata": {},
861 | "output_type": "execute_result"
862 | }
863 | ],
864 | "source": [
865 | "M = np.array([1,1,2,4,3,9,4,16]).reshape(4,2)\n",
866 | "M = np.asmatrix(M)\n",
867 | "M"
868 | ]
869 | },
870 | {
871 | "cell_type": "markdown",
872 | "metadata": {},
873 | "source": [
874 | "(a)"
875 | ]
876 | },
877 | {
878 | "cell_type": "code",
879 | "execution_count": 375,
880 | "metadata": {
881 | "collapsed": false
882 | },
883 | "outputs": [
884 | {
885 | "data": {
886 | "text/plain": [
887 | "matrix([[ 30, 100],\n",
888 | " [100, 354]])"
889 | ]
890 | },
891 | "execution_count": 375,
892 | "metadata": {},
893 | "output_type": "execute_result"
894 | }
895 | ],
896 | "source": [
897 | "MtM = M.T*M\n",
898 | "MtM"
899 | ]
900 | },
901 | {
902 | "cell_type": "code",
903 | "execution_count": 377,
904 | "metadata": {
905 | "collapsed": false
906 | },
907 | "outputs": [
908 | {
909 | "data": {
910 | "text/plain": [
911 | "matrix([[ 2, 6, 12, 20],\n",
912 | " [ 6, 20, 42, 72],\n",
913 | " [ 12, 42, 90, 156],\n",
914 | " [ 20, 72, 156, 272]])"
915 | ]
916 | },
917 | "execution_count": 377,
918 | "metadata": {},
919 | "output_type": "execute_result"
920 | }
921 | ],
922 | "source": [
923 | "MMt = M*M.T\n",
924 | "MMt"
925 | ]
926 | },
927 | {
928 | "cell_type": "markdown",
929 | "metadata": {},
930 | "source": [
931 | "(b)"
932 | ]
933 | },
934 | {
935 | "cell_type": "code",
936 | "execution_count": 378,
937 | "metadata": {
938 | "collapsed": false
939 | },
940 | "outputs": [
941 | {
942 | "name": "stdout",
943 | "output_type": "stream",
944 | "text": [
945 | "382.378570223 [[ 0.27300543]\n",
946 | " [ 0.96201249]]\n"
947 | ]
948 | }
949 | ],
950 | "source": [
951 | "# Using power=iteration method\n",
952 | "lam1, x1 = pow(MtM)\n",
953 | "print lam1, x1"
954 | ]
955 | },
956 | {
957 | "cell_type": "code",
958 | "execution_count": 381,
959 | "metadata": {
960 | "collapsed": false
961 | },
962 | "outputs": [
963 | {
964 | "data": {
965 | "text/plain": [
966 | "array([ 0.27300539, 0.9620125 ])"
967 | ]
968 | },
969 | "execution_count": 381,
970 | "metadata": {},
971 | "output_type": "execute_result"
972 | }
973 | ],
974 | "source": [
975 | "x1_exact = np.array([100/(162+2*np.sqrt(9061)),1])\n",
976 | "x1_exact = x1_exact/np.sqrt(sum(x1_exact**2))\n",
977 | "x1_exact"
978 | ]
979 | },
980 | {
981 | "cell_type": "code",
982 | "execution_count": 379,
983 | "metadata": {
984 | "collapsed": false
985 | },
986 | "outputs": [
987 | {
988 | "data": {
989 | "text/plain": [
990 | "matrix([[ 1.50057292, -0.4258574 ],\n",
991 | " [-0.4258574 , 0.12085686]])"
992 | ]
993 | },
994 | "execution_count": 379,
995 | "metadata": {},
996 | "output_type": "execute_result"
997 | }
998 | ],
999 | "source": [
1000 | "Ms = MtM - lam1*(x1*x1.T)\n",
1001 | "Ms"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": 380,
1007 | "metadata": {
1008 | "collapsed": false
1009 | },
1010 | "outputs": [
1011 | {
1012 | "name": "stdout",
1013 | "output_type": "stream",
1014 | "text": [
1015 | "1.62142977757 [[ 0.96200976]\n",
1016 | " [-0.27301504]]\n"
1017 | ]
1018 | }
1019 | ],
1020 | "source": [
1021 | "lam2, x2 = pow(Ms)\n",
1022 | "print lam2, x2"
1023 | ]
1024 | },
1025 | {
1026 | "cell_type": "code",
1027 | "execution_count": 382,
1028 | "metadata": {
1029 | "collapsed": false
1030 | },
1031 | "outputs": [
1032 | {
1033 | "data": {
1034 | "text/plain": [
1035 | "array([-0.9620125 , 0.27300539])"
1036 | ]
1037 | },
1038 | "execution_count": 382,
1039 | "metadata": {},
1040 | "output_type": "execute_result"
1041 | }
1042 | ],
1043 | "source": [
1044 | "x2_exact = np.array([100/(162-2*np.sqrt(9061)),1])\n",
1045 | "x2_exact = x2_exact/np.sqrt(sum(x2_exact**2))\n",
1046 | "x2_exact"
1047 | ]
1048 | },
1049 | {
1050 | "cell_type": "markdown",
1051 | "metadata": {},
1052 | "source": [
1053 | "(c) Since M.T\\*M has dimension 2x2 and M\\*M.T has dimension 4x4, the eigenvalues of M\\*M.T are (382.38,1.62,0,0)"
1054 | ]
1055 | },
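{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick numerical check of the claim above (not part of the exercise), using numpy's symmetric eigenvalue routine:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# eigenvalues of MM.T in ascending order; expect approx [0, 0, 1.62, 382.38]\n",
"print np.linalg.eigvalsh(MMt)"
]
},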
1056 | {
1057 | "cell_type": "markdown",
1058 | "metadata": {},
1059 | "source": [
1060 | "(d)"
1061 | ]
1062 | },
1063 | {
1064 | "cell_type": "code",
1065 | "execution_count": 383,
1066 | "metadata": {
1067 | "collapsed": false
1068 | },
1069 | "outputs": [
1070 | {
1071 | "data": {
1072 | "text/plain": [
1073 | "matrix([[ 1.23501793],\n",
1074 | " [ 4.39406083],\n",
1075 | " [ 9.47712872],\n",
1076 | " [ 16.48422159]])"
1077 | ]
1078 | },
1079 | "execution_count": 383,
1080 | "metadata": {},
1081 | "output_type": "execute_result"
1082 | }
1083 | ],
1084 | "source": [
1085 | "# eigenvector of MM.T due to lam1\n",
1086 | "M*x1"
1087 | ]
1088 | },
1089 | {
1090 | "cell_type": "code",
1091 | "execution_count": 386,
1092 | "metadata": {
1093 | "collapsed": false
1094 | },
1095 | "outputs": [
1096 | {
1097 | "data": {
1098 | "text/plain": [
1099 | "19.554502556255315"
1100 | ]
1101 | },
1102 | "execution_count": 386,
1103 | "metadata": {},
1104 | "output_type": "execute_result"
1105 | }
1106 | ],
1107 | "source": [
1108 | "Frobenius(M*x1)"
1109 | ]
1110 | },
1111 | {
1112 | "cell_type": "code",
1113 | "execution_count": 389,
1114 | "metadata": {
1115 | "collapsed": false
1116 | },
1117 | "outputs": [
1118 | {
1119 | "name": "stdout",
1120 | "output_type": "stream",
1121 | "text": [
1122 | "[[ 472.24437737]\n",
1123 | " [ 1680.19468524]\n",
1124 | " [ 3623.85092359]\n",
1125 | " [ 6303.21309243]]\n",
1126 | "[[ 472.24438853]\n",
1127 | " [ 1680.19469871]\n",
1128 | " [ 3623.85093054]\n",
1129 | " [ 6303.21308401]]\n"
1130 | ]
1131 | }
1132 | ],
1133 | "source": [
1134 | "# verifying that we indeed have an eigenvector of MM.T\n",
1135 | "print MMt*(M*x1) \n",
1136 | "print lam1*(M*x1)"
1137 | ]
1138 | },
1139 | {
1140 | "cell_type": "code",
1141 | "execution_count": 384,
1142 | "metadata": {
1143 | "collapsed": false
1144 | },
1145 | "outputs": [
1146 | {
1147 | "data": {
1148 | "text/plain": [
1149 | "matrix([[ 0.68899472],\n",
1150 | " [ 0.83195935],\n",
1151 | " [ 0.4288939 ],\n",
1152 | " [-0.52020165]])"
1153 | ]
1154 | },
1155 | "execution_count": 384,
1156 | "metadata": {},
1157 | "output_type": "execute_result"
1158 | }
1159 | ],
1160 | "source": [
1161 | "# eigenvector of MM.T due to lam2\n",
1162 | "M*x2"
1163 | ]
1164 | },
1165 | {
1166 | "cell_type": "code",
1167 | "execution_count": 385,
1168 | "metadata": {
1169 | "collapsed": false
1170 | },
1171 | "outputs": [
1172 | {
1173 | "data": {
1174 | "text/plain": [
1175 | "1.2733537669157973"
1176 | ]
1177 | },
1178 | "execution_count": 385,
1179 | "metadata": {},
1180 | "output_type": "execute_result"
1181 | }
1182 | ],
1183 | "source": [
1184 | "Frobenius(M*x2)"
1185 | ]
1186 | },
1187 | {
1188 | "cell_type": "code",
1189 | "execution_count": 390,
1190 | "metadata": {
1191 | "collapsed": false
1192 | },
1193 | "outputs": [
1194 | {
1195 | "name": "stdout",
1196 | "output_type": "stream",
1197 | "text": [
1198 | "[[ 1.11243939]\n",
1199 | " [ 1.33218051]\n",
1200 | " [ 0.65922334]\n",
1201 | " [-0.9064321 ]]\n",
1202 | "[[ 1.11715656]\n",
1203 | " [ 1.34896367]\n",
1204 | " [ 0.69542134]\n",
1205 | " [-0.84347044]]\n"
1206 | ]
1207 | }
1208 | ],
1209 | "source": [
1210 | "print MMt*(M*x2) \n",
1211 | "print lam2*(M*x2)"
1212 | ]
1213 | },
1214 | {
1215 | "cell_type": "code",
1216 | "execution_count": 392,
1217 | "metadata": {
1218 | "collapsed": false
1219 | },
1220 | "outputs": [
1221 | {
1222 | "data": {
1223 | "text/plain": [
1224 | "matrix([[-0.00471716],\n",
1225 | " [-0.01678316],\n",
1226 | " [-0.03619799],\n",
1227 | " [-0.06296166]])"
1228 | ]
1229 | },
1230 | "execution_count": 392,
1231 | "metadata": {},
1232 | "output_type": "execute_result"
1233 | }
1234 | ],
1235 | "source": [
1236 | "# accuracy should improve if we allow for more digits\n",
1237 | "MMt*(M*x2) - lam2*(M*x2)"
1238 | ]
1239 | },
1240 | {
1241 | "cell_type": "markdown",
1242 | "metadata": {},
1243 | "source": [
1244 | "For the eigenvectors of lam = 0, we first perform Gaussian Elimination on MM.T"
1245 | ]
1246 | },
1247 | {
1248 | "cell_type": "code",
1249 | "execution_count": 412,
1250 | "metadata": {
1251 | "collapsed": false
1252 | },
1253 | "outputs": [
1254 | {
1255 | "data": {
1256 | "text/plain": [
1257 | "array([[ 2, 6, 12, 20],\n",
1258 | " [ 0, 2, 6, 12],\n",
1259 | " [ 0, 0, 0, 0],\n",
1260 | " [ 0, 0, 0, 0]])"
1261 | ]
1262 | },
1263 | "execution_count": 412,
1264 | "metadata": {},
1265 | "output_type": "execute_result"
1266 | }
1267 | ],
1268 | "source": [
1269 | "# Gaussian Elimination on MM.T results\n",
1270 | "MMt_Gauss = np.array([2,6,12,20,0,2,6,12]+[0]*8).reshape(4,4)\n",
1271 | "MMt_Gauss"
1272 | ]
1273 | },
1274 | {
1275 | "cell_type": "code",
1276 | "execution_count": 418,
1277 | "metadata": {
1278 | "collapsed": false
1279 | },
1280 | "outputs": [
1281 | {
1282 | "name": "stdout",
1283 | "output_type": "stream",
1284 | "text": [
1285 | "[[ 3]\n",
1286 | " [-3]\n",
1287 | " [ 1]\n",
1288 | " [ 0]]\n",
1289 | "[[ 8]\n",
1290 | " [-6]\n",
1291 | " [ 0]\n",
1292 | " [ 1]]\n"
1293 | ]
1294 | }
1295 | ],
1296 | "source": [
1297 | "# solving this system we get the basis for the eigenspace of lam = 0\n",
1298 | "x3 = np.matrix([3,-3,1,0]).T\n",
1299 | "x4 = np.matrix([8,-6,0,1]).T\n",
1300 | "print x3\n",
1301 | "print x4"
1302 | ]
1303 | },
1304 | {
1305 | "cell_type": "code",
1306 | "execution_count": 420,
1307 | "metadata": {
1308 | "collapsed": false
1309 | },
1310 | "outputs": [
1311 | {
1312 | "data": {
1313 | "text/plain": [
1314 | "matrix([[0],\n",
1315 | " [0],\n",
1316 | " [0],\n",
1317 | " [0]])"
1318 | ]
1319 | },
1320 | "execution_count": 420,
1321 | "metadata": {},
1322 | "output_type": "execute_result"
1323 | }
1324 | ],
1325 | "source": [
1326 | "MMt*x3"
1327 | ]
1328 | },
1329 | {
1330 | "cell_type": "code",
1331 | "execution_count": 421,
1332 | "metadata": {
1333 | "collapsed": false
1334 | },
1335 | "outputs": [
1336 | {
1337 | "data": {
1338 | "text/plain": [
1339 | "matrix([[0],\n",
1340 | " [0],\n",
1341 | " [0],\n",
1342 | " [0]])"
1343 | ]
1344 | },
1345 | "execution_count": 421,
1346 | "metadata": {},
1347 | "output_type": "execute_result"
1348 | }
1349 | ],
1350 | "source": [
1351 | "MMt*x4"
1352 | ]
1353 | },
1354 | {
1355 | "cell_type": "markdown",
1356 | "metadata": {},
1357 | "source": [
1358 | "\n",
1359 | "# 11.3.1"
1360 | ]
1361 | },
1362 | {
1363 | "cell_type": "code",
1364 | "execution_count": 540,
1365 | "metadata": {
1366 | "collapsed": false
1367 | },
1368 | "outputs": [
1369 | {
1370 | "data": {
1371 | "text/plain": [
1372 | "matrix([[1, 2, 3],\n",
1373 | " [3, 4, 5],\n",
1374 | " [5, 4, 3],\n",
1375 | " [1, 2, 4],\n",
1376 | " [1, 3, 5]])"
1377 | ]
1378 | },
1379 | "execution_count": 540,
1380 | "metadata": {},
1381 | "output_type": "execute_result"
1382 | }
1383 | ],
1384 | "source": [
1385 | "M = np.array([1,2,3,3,4,5,5,4,3,1,2,4,1,3,5]).reshape(5,3)\n",
1386 | "M = np.asmatrix(M)\n",
1387 | "M"
1388 | ]
1389 | },
1390 | {
1391 | "cell_type": "markdown",
1392 | "metadata": {},
1393 | "source": [
1394 | "(a)"
1395 | ]
1396 | },
1397 | {
1398 | "cell_type": "code",
1399 | "execution_count": 541,
1400 | "metadata": {
1401 | "collapsed": false
1402 | },
1403 | "outputs": [
1404 | {
1405 | "data": {
1406 | "text/plain": [
1407 | "matrix([[37, 39, 42],\n",
1408 | " [39, 49, 61],\n",
1409 | " [42, 61, 84]])"
1410 | ]
1411 | },
1412 | "execution_count": 541,
1413 | "metadata": {},
1414 | "output_type": "execute_result"
1415 | }
1416 | ],
1417 | "source": [
1418 | "MtM = M.T*M\n",
1419 | "MtM"
1420 | ]
1421 | },
1422 | {
1423 | "cell_type": "code",
1424 | "execution_count": 542,
1425 | "metadata": {
1426 | "collapsed": false
1427 | },
1428 | "outputs": [
1429 | {
1430 | "data": {
1431 | "text/plain": [
1432 | "matrix([[14, 26, 22, 17, 22],\n",
1433 | " [26, 50, 46, 31, 40],\n",
1434 | " [22, 46, 50, 25, 32],\n",
1435 | " [17, 31, 25, 21, 27],\n",
1436 | " [22, 40, 32, 27, 35]])"
1437 | ]
1438 | },
1439 | "execution_count": 542,
1440 | "metadata": {},
1441 | "output_type": "execute_result"
1442 | }
1443 | ],
1444 | "source": [
1445 | "MMt = M*M.T\n",
1446 | "MMt"
1447 | ]
1448 | },
1449 | {
1450 | "cell_type": "markdown",
1451 | "metadata": {},
1452 | "source": [
1453 | "(b), (c) Using power iteration"
1454 | ]
1455 | },
1456 | {
1457 | "cell_type": "markdown",
1458 | "metadata": {},
1459 | "source": [
1460 | "### MTM"
1461 | ]
1462 | },
1463 | {
1464 | "cell_type": "code",
1465 | "execution_count": 581,
1466 | "metadata": {
1467 | "collapsed": false
1468 | },
1469 | "outputs": [
1470 | {
1471 | "name": "stdout",
1472 | "output_type": "stream",
1473 | "text": [
1474 | "157.080496022 [[ 0.42949875]\n",
1475 | " [ 0.55642476]\n",
1476 | " [ 0.71128216]]\n"
1477 | ]
1478 | }
1479 | ],
1480 | "source": [
1481 | "lam1, x1_mtm = pow(MtM, thres = 0.000001)\n",
1482 | "print lam1, x1_mtm"
1483 | ]
1484 | },
1485 | {
1486 | "cell_type": "code",
1487 | "execution_count": 582,
1488 | "metadata": {
1489 | "collapsed": false
1490 | },
1491 | "outputs": [
1492 | {
1493 | "data": {
1494 | "text/plain": [
1495 | "matrix([[ 8.02349012, 1.46031556, -5.98727448],\n",
1496 | " [ 1.46031556, 0.36654091, -1.16853405],\n",
1497 | " [-5.98727448, -1.16853405, 4.52947295]])"
1498 | ]
1499 | },
1500 | "execution_count": 582,
1501 | "metadata": {},
1502 | "output_type": "execute_result"
1503 | }
1504 | ],
1505 | "source": [
1506 | "Ms = MtM - lam1*(x1_mtm*x1_mtm.T)\n",
1507 | "Ms"
1508 | ]
1509 | },
1510 | {
1511 | "cell_type": "code",
1512 | "execution_count": 583,
1513 | "metadata": {
1514 | "collapsed": false
1515 | },
1516 | "outputs": [
1517 | {
1518 | "name": "stdout",
1519 | "output_type": "stream",
1520 | "text": [
1521 | "12.7946149636 [[ 0.79072218]\n",
1522 | " [ 0.14874499]\n",
1523 | " [-0.5938294 ]]\n"
1524 | ]
1525 | }
1526 | ],
1527 | "source": [
1528 | "lam2, x2_mtm = pow(Ms, thres = 0.000001)\n",
1529 | "print lam2, x2_mtm"
1530 | ]
1531 | },
1532 | {
1533 | "cell_type": "code",
1534 | "execution_count": 584,
1535 | "metadata": {
1536 | "collapsed": false
1537 | },
1538 | "outputs": [
1539 | {
1540 | "data": {
1541 | "text/plain": [
1542 | "matrix([[ 0.02376498, -0.04453543, 0.02048918],\n",
1543 | " [-0.04453543, 0.08345912, -0.03839659],\n",
1544 | " [ 0.02048918, -0.03839659, 0.01766491]])"
1545 | ]
1546 | },
1547 | "execution_count": 584,
1548 | "metadata": {},
1549 | "output_type": "execute_result"
1550 | }
1551 | ],
1552 | "source": [
1553 | "Mss = Ms - lam2*(x2_mtm*x2_mtm.T)\n",
1554 | "Mss"
1555 | ]
1556 | },
1557 | {
1558 | "cell_type": "code",
1559 | "execution_count": 585,
1560 | "metadata": {
1561 | "collapsed": false
1562 | },
1563 | "outputs": [
1564 | {
1565 | "name": "stdout",
1566 | "output_type": "stream",
1567 | "text": [
1568 | "0.124889013913 [[-0.43622105]\n",
1569 | " [ 0.81747556]\n",
1570 | " [-0.3760916 ]]\n"
1571 | ]
1572 | }
1573 | ],
1574 | "source": [
1575 | "lam3, x3_mtm = pow(Mss, thres=0.000001)\n",
1576 | "print lam3, x3_mtm"
1577 | ]
1578 | },
1579 | {
1580 | "cell_type": "markdown",
1581 | "metadata": {},
1582 | "source": [
1583 | "### MMT"
1584 | ]
1585 | },
1586 | {
1587 | "cell_type": "code",
1588 | "execution_count": 586,
1589 | "metadata": {
1590 | "collapsed": false
1591 | },
1592 | "outputs": [
1593 | {
1594 | "name": "stdout",
1595 | "output_type": "stream",
1596 | "text": [
1597 | "157.080496022 [[ 0.29331718]\n",
1598 | " [ 0.56415119]\n",
1599 | " [ 0.51918484]\n",
1600 | " [ 0.35006921]\n",
1601 | " [ 0.45121737]]\n"
1602 | ]
1603 | }
1604 | ],
1605 | "source": [
1606 | "lam1, x1_mmt = pow(MMt)\n",
1607 | "print lam1, x1_mmt"
1608 | ]
1609 | },
1610 | {
1611 | "cell_type": "code",
1612 | "execution_count": 587,
1613 | "metadata": {
1614 | "collapsed": false
1615 | },
1616 | "outputs": [
1617 | {
1618 | "data": {
1619 | "text/plain": [
1620 | "matrix([[ 4.85584094e-01, 7.06725610e-03, -1.92113461e+00,\n",
1621 | " 8.70768115e-01, 1.21042661e+00],\n",
1622 | " [ 7.06725610e-03, 6.52952018e-03, -8.68046388e-03,\n",
1623 | " -2.21353639e-02, 1.44033802e-02],\n",
1624 | " [ -1.92113461e+00, -8.68046388e-03, 7.65849705e+00,\n",
1625 | " -3.54947855e+00, -4.79849629e+00],\n",
1626 | " [ 8.70768115e-01, -2.21353639e-02, -3.54947855e+00,\n",
1627 | " 1.75002849e+00, 2.18798797e+00],\n",
1628 | " [ 1.21042661e+00, 1.44033802e-02, -4.79849629e+00,\n",
1629 | " 2.18798797e+00, 3.01886483e+00]])"
1630 | ]
1631 | },
1632 | "execution_count": 587,
1633 | "metadata": {},
1634 | "output_type": "execute_result"
1635 | }
1636 | ],
1637 | "source": [
1638 | "Ms = MMt - lam1*(x1_mmt*x1_mmt.T)\n",
1639 | "Ms"
1640 | ]
1641 | },
1642 | {
1643 | "cell_type": "code",
1644 | "execution_count": 588,
1645 | "metadata": {
1646 | "collapsed": false
1647 | },
1648 | "outputs": [
1649 | {
1650 | "name": "stdout",
1651 | "output_type": "stream",
1652 | "text": [
1653 | "12.7946149639 [[ 1.93814995e-01]\n",
1654 | " [ 5.54806259e-04]\n",
1655 | " [ -7.73595563e-01]\n",
1656 | " [ 3.59829841e-01]\n",
1657 | " [ 4.84260085e-01]]\n"
1658 | ]
1659 | }
1660 | ],
1661 | "source": [
1662 | "lam2, x2_mmt = pow(Ms)\n",
1663 | "print lam2, x2_mmt"
1664 | ]
1665 | },
1666 | {
1667 | "cell_type": "code",
1668 | "execution_count": 589,
1669 | "metadata": {
1670 | "collapsed": false
1671 | },
1672 | "outputs": [
1673 | {
1674 | "data": {
1675 | "text/plain": [
1676 | "matrix([[ 0.00496395, 0.00569145, -0.00278143, -0.02153369, 0.00956414],\n",
1677 | " [ 0.00569145, 0.00652558, -0.00318907, -0.02468963, 0.01096584],\n",
1678 | " [-0.00278143, -0.00318907, 0.00155851, 0.01206589, -0.00535904],\n",
1679 | " [-0.02153369, -0.02468963, 0.01206589, 0.09341354, -0.04148942],\n",
1680 | " [ 0.00956414, 0.01096584, -0.00535904, -0.04148942, 0.01842744]])"
1681 | ]
1682 | },
1683 | "execution_count": 589,
1684 | "metadata": {},
1685 | "output_type": "execute_result"
1686 | }
1687 | ],
1688 | "source": [
1689 | "Mss = Ms - lam2*(x2_mmt*x2_mmt.T)\n",
1690 | "Mss"
1691 | ]
1692 | },
1693 | {
1694 | "cell_type": "code",
1695 | "execution_count": 590,
1696 | "metadata": {
1697 | "collapsed": false
1698 | },
1699 | "outputs": [
1700 | {
1701 | "name": "stdout",
1702 | "output_type": "stream",
1703 | "text": [
1704 | "0.124889013913 [[-0.19936618]\n",
1705 | " [-0.22858488]\n",
1706 | " [ 0.11171009]\n",
1707 | " [ 0.864854 ]\n",
1708 | " [-0.38412302]]\n"
1709 | ]
1710 | }
1711 | ],
1712 | "source": [
1713 | "lam3, x3_mmt = pow(Mss)\n",
1714 | "print lam3, x3_mmt"
1715 | ]
1716 | },
1717 | {
1718 | "cell_type": "markdown",
1719 | "metadata": {},
1720 | "source": [
1721 | "And we know that since MMt is bigger than MtM, that the other eignevalue of MMt is zero, which is three times repeated."
1722 | ]
1723 | },
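{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged sanity check of that claim with numpy's symmetric eigenvalue routine:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# eigenvalues of the 5x5 MM.T in ascending order; expect two (numerically) zero\n",
"# values followed by approx 0.125, 12.79 and 157.08\n",
"print np.linalg.eigvalsh(MMt)"
]
},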
1724 | {
1725 | "cell_type": "markdown",
1726 | "metadata": {},
1727 | "source": [
1728 | "### (d) \n",
1729 | "SVD:\n",
1730 | "- V is the eigenvectors matrix of MTM\n",
1731 | "- sigma is the diagonal matrix whose diagonal elements are the square roots of the eigenvalues of MTM\n",
1732 | "- U is the eigenvectors matrix of MMT\n",
1733 | "\n",
1734 | "*Note that we only keep the non-zero singular values because they are the only ones that influence the SVD decomposition (i.e., recall also that rank of M is 3).*"
1735 | ]
1736 | },
1737 | {
1738 | "cell_type": "code",
1739 | "execution_count": 591,
1740 | "metadata": {
1741 | "collapsed": false
1742 | },
1743 | "outputs": [
1744 | {
1745 | "data": {
1746 | "text/plain": [
1747 | "matrix([[ 0.42949875, 0.79072218, -0.43622105],\n",
1748 | " [ 0.55642476, 0.14874499, 0.81747556],\n",
1749 | " [ 0.71128216, -0.5938294 , -0.3760916 ]])"
1750 | ]
1751 | },
1752 | "execution_count": 591,
1753 | "metadata": {},
1754 | "output_type": "execute_result"
1755 | }
1756 | ],
1757 | "source": [
1758 | "V = np.concatenate((x1_mtm,x2_mtm,x3_mtm),axis=1)\n",
1759 | "V"
1760 | ]
1761 | },
1762 | {
1763 | "cell_type": "code",
1764 | "execution_count": 592,
1765 | "metadata": {
1766 | "collapsed": false
1767 | },
1768 | "outputs": [
1769 | {
1770 | "data": {
1771 | "text/plain": [
1772 | "matrix([[ 2.93317185e-01, 1.93814995e-01, -1.99366184e-01],\n",
1773 | " [ 5.64151193e-01, 5.54806259e-04, -2.28584883e-01],\n",
1774 | " [ 5.19184840e-01, -7.73595563e-01, 1.11710086e-01],\n",
1775 | " [ 3.50069209e-01, 3.59829841e-01, 8.64853999e-01],\n",
1776 | " [ 4.51217365e-01, 4.84260085e-01, -3.84123018e-01]])"
1777 | ]
1778 | },
1779 | "execution_count": 592,
1780 | "metadata": {},
1781 | "output_type": "execute_result"
1782 | }
1783 | ],
1784 | "source": [
1785 | "U = np.concatenate((x1_mmt,x2_mmt,x3_mmt),axis=1)\n",
1786 | "U"
1787 | ]
1788 | },
1789 | {
1790 | "cell_type": "code",
1791 | "execution_count": 593,
1792 | "metadata": {
1793 | "collapsed": false
1794 | },
1795 | "outputs": [
1796 | {
1797 | "data": {
1798 | "text/plain": [
1799 | "matrix([[ 12.53317582, 0. , 0. ],\n",
1800 | " [ 0. , 3.5769561 , 0. ],\n",
1801 | " [ 0. , 0. , 0.3533964 ]])"
1802 | ]
1803 | },
1804 | "execution_count": 593,
1805 | "metadata": {},
1806 | "output_type": "execute_result"
1807 | }
1808 | ],
1809 | "source": [
1810 | "sigma = np.matrix([[np.sqrt(lam1),0,0],[0,np.sqrt(lam2),0],\n",
1811 | " [0,0,np.sqrt(lam3)]])\n",
1812 | "sigma"
1813 | ]
1814 | },
1815 | {
1816 | "cell_type": "code",
1817 | "execution_count": 594,
1818 | "metadata": {
1819 | "collapsed": false
1820 | },
1821 | "outputs": [
1822 | {
1823 | "data": {
1824 | "text/plain": [
1825 | "matrix([[ 2.15783778, 2.09105102, 2.2296274 ],\n",
1826 | " [ 3.07362409, 3.86851894, 5.05839857],\n",
1827 | " [ 0.58952116, 3.24135273, 6.25668614],\n",
1828 | " [ 2.76882675, 2.88260114, 2.24147307],\n",
1829 | " [ 3.8577824 , 3.29336808, 3.0448692 ]])"
1830 | ]
1831 | },
1832 | "execution_count": 594,
1833 | "metadata": {},
1834 | "output_type": "execute_result"
1835 | }
1836 | ],
1837 | "source": [
1838 | "# Checking SVD correctness\n",
1839 | "U*sigma*(V.T)"
1840 | ]
1841 | },
1842 | {
1843 | "cell_type": "code",
1844 | "execution_count": 595,
1845 | "metadata": {
1846 | "collapsed": false
1847 | },
1848 | "outputs": [
1849 | {
1850 | "data": {
1851 | "text/plain": [
1852 | "matrix([[1, 2, 3],\n",
1853 | " [3, 4, 5],\n",
1854 | " [5, 4, 3],\n",
1855 | " [1, 2, 4],\n",
1856 | " [1, 3, 5]])"
1857 | ]
1858 | },
1859 | "execution_count": 595,
1860 | "metadata": {},
1861 | "output_type": "execute_result"
1862 | }
1863 | ],
1864 | "source": [
1865 | "M"
1866 | ]
1867 | },
1868 | {
1869 | "cell_type": "markdown",
1870 | "metadata": {},
1871 | "source": [
1872 | "Using np.linalg.svd"
1873 | ]
1874 | },
1875 | {
1876 | "cell_type": "code",
1877 | "execution_count": 597,
1878 | "metadata": {
1879 | "collapsed": false
1880 | },
1881 | "outputs": [],
1882 | "source": [
1883 | "U, s, V = np.linalg.svd(M, full_matrices=False)"
1884 | ]
1885 | },
1886 | {
1887 | "cell_type": "code",
1888 | "execution_count": 598,
1889 | "metadata": {
1890 | "collapsed": false
1891 | },
1892 | "outputs": [
1893 | {
1894 | "data": {
1895 | "text/plain": [
1896 | "matrix([[ -2.93317100e-01, 1.93816567e-01, 1.99366072e-01],\n",
1897 | " [ -5.64151193e-01, 5.57827581e-04, 2.28584883e-01],\n",
1898 | " [ -5.19185177e-01, -7.73592784e-01, -1.11709640e-01],\n",
1899 | " [ -3.50069052e-01, 3.59831710e-01, -8.64854207e-01],\n",
1900 | " [ -4.51217154e-01, 4.84262503e-01, 3.84122739e-01]])"
1901 | ]
1902 | },
1903 | "execution_count": 598,
1904 | "metadata": {},
1905 | "output_type": "execute_result"
1906 | }
1907 | ],
1908 | "source": [
1909 | "U"
1910 | ]
1911 | },
1912 | {
1913 | "cell_type": "code",
1914 | "execution_count": 599,
1915 | "metadata": {
1916 | "collapsed": false
1917 | },
1918 | "outputs": [
1919 | {
1920 | "data": {
1921 | "text/plain": [
1922 | "matrix([[ 12.53317582, 0. , 0. ],\n",
1923 | " [ 0. , 3.5769561 , 0. ],\n",
1924 | " [ 0. , 0. , 0.3533964 ]])"
1925 | ]
1926 | },
1927 | "execution_count": 599,
1928 | "metadata": {},
1929 | "output_type": "execute_result"
1930 | }
1931 | ],
1932 | "source": [
1933 | "s = np.mat(np.diag(s))\n",
1934 | "s"
1935 | ]
1936 | },
1937 | {
1938 | "cell_type": "code",
1939 | "execution_count": 600,
1940 | "metadata": {
1941 | "collapsed": false
1942 | },
1943 | "outputs": [
1944 | {
1945 | "data": {
1946 | "text/plain": [
1947 | "matrix([[-0.4294987 , -0.55642475, -0.71128219],\n",
1948 | " [-0.7907225 , -0.1487454 , 0.59382888],\n",
1949 | " [-0.43622104, 0.81747557, -0.37609161]])"
1950 | ]
1951 | },
1952 | "execution_count": 600,
1953 | "metadata": {},
1954 | "output_type": "execute_result"
1955 | }
1956 | ],
1957 | "source": [
1958 | "V"
1959 | ]
1960 | },
1961 | {
1962 | "cell_type": "code",
1963 | "execution_count": 603,
1964 | "metadata": {
1965 | "collapsed": false
1966 | },
1967 | "outputs": [
1968 | {
1969 | "data": {
1970 | "text/plain": [
1971 | "matrix([[ 1.14305288, 2.84556706, 2.14386992],\n",
1972 | " [ 2.97824776, 5.63856063, 3.0555972 ],\n",
1973 | " [ 4.36253178, 5.53341366, 0.59131193],\n",
1974 | " [ 1.38563384, 3.09633052, 3.08103168],\n",
1975 | " [ 1.36850832, 4.29463733, 3.83187619]])"
1976 | ]
1977 | },
1978 | "execution_count": 603,
1979 | "metadata": {},
1980 | "output_type": "execute_result"
1981 | }
1982 | ],
1983 | "source": [
1984 | "U*s*(V.T)"
1985 | ]
1986 | },
1987 | {
1988 | "cell_type": "code",
1989 | "execution_count": 604,
1990 | "metadata": {
1991 | "collapsed": false
1992 | },
1993 | "outputs": [
1994 | {
1995 | "data": {
1996 | "text/plain": [
1997 | "matrix([[1, 2, 3],\n",
1998 | " [3, 4, 5],\n",
1999 | " [5, 4, 3],\n",
2000 | " [1, 2, 4],\n",
2001 | " [1, 3, 5]])"
2002 | ]
2003 | },
2004 | "execution_count": 604,
2005 | "metadata": {},
2006 | "output_type": "execute_result"
2007 | }
2008 | ],
2009 | "source": [
2010 | "M"
2011 | ]
2012 | },
2013 | {
2014 | "cell_type": "code",
2015 | "execution_count": 605,
2016 | "metadata": {
2017 | "collapsed": true
2018 | },
2019 | "outputs": [],
2020 | "source": [
2021 | "# np.linalg.svd does a much better job than the eigendecomposition above"
2022 | ]
2023 | },
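{
"cell_type": "markdown",
"metadata": {},
"source": [
"The check mentioned above: np.linalg.svd returns V already transposed, so multiplying U, the diagonal matrix of singular values and the returned V without a further transpose should recover M."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# correct reconstruction: the V returned by np.linalg.svd is already V.T\n",
"print U*s*V\n",
"print np.allclose(U*s*V, M)   # should be True"
]
},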
2024 | {
2025 | "cell_type": "markdown",
2026 | "metadata": {},
2027 | "source": [
2028 | "\n",
2029 | "# 11.3.2 "
2030 | ]
2031 | },
2032 | {
2033 | "cell_type": "code",
2034 | "execution_count": 606,
2035 | "metadata": {
2036 | "collapsed": true
2037 | },
2038 | "outputs": [],
2039 | "source": [
2040 | "# mapping a user representation in the item space to the concept space\n",
2041 | "Leslie = np.matrix([0,3,0,0,4])"
2042 | ]
2043 | },
2044 | {
2045 | "cell_type": "code",
2046 | "execution_count": 608,
2047 | "metadata": {
2048 | "collapsed": false
2049 | },
2050 | "outputs": [
2051 | {
2052 | "data": {
2053 | "text/plain": [
2054 | "matrix([[ 0.58, 0. ],\n",
2055 | " [ 0.58, 0. ],\n",
2056 | " [ 0.58, 0. ],\n",
2057 | " [ 0. , 0.71],\n",
2058 | " [ 0. , 0.71]])"
2059 | ]
2060 | },
2061 | "execution_count": 608,
2062 | "metadata": {},
2063 | "output_type": "execute_result"
2064 | }
2065 | ],
2066 | "source": [
2067 | "# V from the SVD decomposition of Fig. 11.7\n",
2068 | "V = np.matrix([[0.58,0.58,0.58,0,0],[0,0,0,0.71,0.71]]).T\n",
2069 | "V"
2070 | ]
2071 | },
2072 | {
2073 | "cell_type": "code",
2074 | "execution_count": 609,
2075 | "metadata": {
2076 | "collapsed": false
2077 | },
2078 | "outputs": [
2079 | {
2080 | "data": {
2081 | "text/plain": [
2082 | "matrix([[ 1.74, 2.84]])"
2083 | ]
2084 | },
2085 | "execution_count": 609,
2086 | "metadata": {},
2087 | "output_type": "execute_result"
2088 | }
2089 | ],
2090 | "source": [
2091 | "Leslie*V"
2092 | ]
2093 | },
2094 | {
2095 | "cell_type": "markdown",
2096 | "metadata": {},
2097 | "source": [
2098 | "The above vector in the concept space suggests that Leslie likes both science-fiction and romance movies, but with more preference towards the latter genre."
2099 | ]
2100 | },
2101 | {
2102 | "cell_type": "markdown",
2103 | "metadata": {},
2104 | "source": [
2105 | "# 11.4.1 "
2106 | ]
2107 | },
2108 | {
2109 | "cell_type": "code",
2110 | "execution_count": 610,
2111 | "metadata": {
2112 | "collapsed": false
2113 | },
2114 | "outputs": [
2115 | {
2116 | "data": {
2117 | "text/plain": [
2118 | "matrix([[ 48, 14],\n",
2119 | " [ 14, -48]])"
2120 | ]
2121 | },
2122 | "execution_count": 610,
2123 | "metadata": {},
2124 | "output_type": "execute_result"
2125 | }
2126 | ],
2127 | "source": [
2128 | "M = np.matrix([[48,14],[14,-48]])\n",
2129 | "M"
2130 | ]
2131 | },
2132 | {
2133 | "cell_type": "code",
2134 | "execution_count": 611,
2135 | "metadata": {
2136 | "collapsed": false
2137 | },
2138 | "outputs": [
2139 | {
2140 | "data": {
2141 | "text/plain": [
2142 | "matrix([[ 0.6, 0.8],\n",
2143 | " [ 0.8, -0.6]])"
2144 | ]
2145 | },
2146 | "execution_count": 611,
2147 | "metadata": {},
2148 | "output_type": "execute_result"
2149 | }
2150 | ],
2151 | "source": [
2152 | "U = np.matrix([[3/5,4/5],[4/5,-3/5]])\n",
2153 | "U"
2154 | ]
2155 | },
2156 | {
2157 | "cell_type": "code",
2158 | "execution_count": 615,
2159 | "metadata": {
2160 | "collapsed": false
2161 | },
2162 | "outputs": [
2163 | {
2164 | "data": {
2165 | "text/plain": [
2166 | "matrix([[ 0.8, 0.6],\n",
2167 | " [-0.6, 0.8]])"
2168 | ]
2169 | },
2170 | "execution_count": 615,
2171 | "metadata": {},
2172 | "output_type": "execute_result"
2173 | }
2174 | ],
2175 | "source": [
2176 | "V = np.matrix([[4/5,-3/5],[3/5,4/5]]).T\n",
2177 | "V"
2178 | ]
2179 | },
2180 | {
2181 | "cell_type": "code",
2182 | "execution_count": 614,
2183 | "metadata": {
2184 | "collapsed": false
2185 | },
2186 | "outputs": [
2187 | {
2188 | "data": {
2189 | "text/plain": [
2190 | "matrix([[50, 0],\n",
2191 | " [ 0, 25]])"
2192 | ]
2193 | },
2194 | "execution_count": 614,
2195 | "metadata": {},
2196 | "output_type": "execute_result"
2197 | }
2198 | ],
2199 | "source": [
2200 | "s = np.mat(np.diag([50,25]))\n",
2201 | "s"
2202 | ]
2203 | },
2204 | {
2205 | "cell_type": "code",
2206 | "execution_count": 616,
2207 | "metadata": {
2208 | "collapsed": false
2209 | },
2210 | "outputs": [
2211 | {
2212 | "data": {
2213 | "text/plain": [
2214 | "matrix([[ 36., -2.],\n",
2215 | " [ 23., -36.]])"
2216 | ]
2217 | },
2218 | "execution_count": 616,
2219 | "metadata": {},
2220 | "output_type": "execute_result"
2221 | }
2222 | ],
2223 | "source": [
2224 | "U*s*V.T"
2225 | ]
2226 | },
2227 | {
2228 | "cell_type": "markdown",
2229 | "metadata": {},
2230 | "source": [
2231 | "Thus, this SVD is not correct!"
2232 | ]
2233 | },
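{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference (not asked for by the exercise), numpy's SVD of this M shows what a correct decomposition looks like; in particular both singular values are 50, not 50 and 25."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hedged check with numpy's SVD; the returned Vh is the transpose of the textbook V\n",
"U_np, s_np, Vh_np = np.linalg.svd(M)\n",
"print s_np   # expect [50., 50.]\n",
"print np.allclose(U_np*np.mat(np.diag(s_np))*Vh_np, M)   # should be True"
]
},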
2234 | {
2235 | "cell_type": "code",
2236 | "execution_count": null,
2237 | "metadata": {
2238 | "collapsed": true
2239 | },
2240 | "outputs": [],
2241 | "source": []
2242 | }
2243 | ],
2244 | "metadata": {
2245 | "kernelspec": {
2246 | "display_name": "Python 2",
2247 | "language": "python",
2248 | "name": "python2"
2249 | },
2250 | "language_info": {
2251 | "codemirror_mode": {
2252 | "name": "ipython",
2253 | "version": 2
2254 | },
2255 | "file_extension": ".py",
2256 | "mimetype": "text/x-python",
2257 | "name": "python",
2258 | "nbconvert_exporter": "python",
2259 | "pygments_lexer": "ipython2",
2260 | "version": "2.7.11"
2261 | }
2262 | },
2263 | "nbformat": 4,
2264 | "nbformat_minor": 0
2265 | }
2266 |
--------------------------------------------------------------------------------
/Chapter 9 - Recommendation Systems.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Contents\n",
8 | "- Exercise 9.2.1\n",
9 | "- Exercise 9.2.2\n",
10 | "- Exercise 9.2.3\n",
11 | "- Exercise 9.3.1\n",
12 | "- Exercise 9.4.1 and 9.4.2\n",
13 | "- Exercise 9.4.3\n",
14 | "- Exercise 9.4.5 (Normalizing the Utility Matrix)"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 5,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import numpy as np\n",
26 | "from __future__ import division"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "\n",
34 | "# 9.2.1"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 97,
40 | "metadata": {
41 | "collapsed": false
42 | },
43 | "outputs": [],
44 | "source": [
45 | "class Computer(object):\n",
46 | " def __init__(self, proc, disk, mem):\n",
47 | " self.processor_speed = proc\n",
48 | " self.disk = disk\n",
49 | " self.main_memory = mem\n",
50 | " self.summary = [proc, disk, mem]\n",
51 | " \n",
52 | " def dot_prod(self,X):\n",
53 | " if isinstance(X, Computer):\n",
54 | " bar = [X.processor_speed, X.disk, X.main_memory]\n",
55 | " return sum([x*y for x,y in zip(self.summary,bar)])\n",
56 | " else:\n",
57 | " assert len(X) == 3\n",
58 | " return sum([x*y for x,y in zip(self.summary,X)])\n",
59 | " \n",
60 | " def cosine(self,X,alpha=1,beta=1):\n",
61 | " if isinstance(X, Computer):\n",
62 | " foo = [self.processor_speed, alpha*self.disk, beta*self.main_memory]\n",
63 | " bar = [X.processor_speed, alpha*X.disk, alpha*X.main_memory]\n",
64 | " ati = np.dot(foo,bar)\n",
65 | " tun = np.sqrt(np.dot(foo,foo))*np.sqrt(np.dot(bar,bar))\n",
66 | " return ati/tun\n",
67 | " \n",
68 | " def normalize(self, mu):\n",
69 | " assert len(mu) == 3\n",
70 | " return [self.processor_speed - mu[0], self.disk - mu[1], \n",
71 | " self.main_memory - mu[2]]"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 98,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": [
82 | "A = Computer(3.06,500,6)\n",
83 | "B = Computer(2.68,320,4)\n",
84 | "C = Computer(2.92,640,6)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 99,
90 | "metadata": {
91 | "collapsed": false
92 | },
93 | "outputs": [],
94 | "source": [
95 | "computers = {'A':A, 'B':B, 'C':C}"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "(b) cosine similarities when alpha=beta=1"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 100,
108 | "metadata": {
109 | "collapsed": false
110 | },
111 | "outputs": [],
112 | "source": [
113 | "pairs = [['A','B'],['A','C'],['B','C']]"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 101,
119 | "metadata": {
120 | "collapsed": false
121 | },
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "['A', 'B'] 0.999997333284\n",
128 | "['A', 'C'] 0.999995343121\n",
129 | "['B', 'C'] 0.999987853375\n"
130 | ]
131 | }
132 | ],
133 | "source": [
134 | "# cosine similarities\n",
135 | "for pair in pairs:\n",
136 | " print pair, computers[pair[0]].cosine(computers[pair[1]])"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "(c) cosine similarities when alpha=0.01 and beta=0.5"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 102,
149 | "metadata": {
150 | "collapsed": false
151 | },
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "['A', 'B'] 0.884792148899\n",
158 | "['A', 'C'] 0.887525858762\n",
159 | "['B', 'C'] 0.873005241921\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "for pair in pairs:\n",
165 | " print pair, computers[pair[0]].cosine(computers[pair[1]],0.01,0.5)"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "(d) setting alpha = 1/avg(disk size) and beta = 1/avg(main_memory)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 103,
178 | "metadata": {
179 | "collapsed": true
180 | },
181 | "outputs": [],
182 | "source": [
183 | "alpha = 1/np.mean([A.disk,B.disk,C.disk])\n",
184 | "beta = 1/np.mean([A.main_memory,B.main_memory,C.main_memory])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 104,
190 | "metadata": {
191 | "collapsed": false
192 | },
193 | "outputs": [
194 | {
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "0.00205479452055 0.1875\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "print alpha,beta"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 105,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "['A', 'B'] 0.941990802633\n",
218 | "['A', 'C'] 0.940905717338\n",
219 | "['B', 'C'] 0.949959248828\n"
220 | ]
221 | }
222 | ],
223 | "source": [
224 | "for pair in pairs:\n",
225 | " print pair, computers[pair[0]].cosine(computers[pair[1]],alpha,beta)"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "\n",
233 | "# 9.2.2 "
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "(a) Normalizing the vectors of the three computers of 9.2.1"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 106,
246 | "metadata": {
247 | "collapsed": false
248 | },
249 | "outputs": [],
250 | "source": [
251 | "mean_proc = np.mean([comp.processor_speed for comp in computers.values()])\n",
252 | "mean_disk = np.mean([comp.disk for comp in computers.values()])\n",
253 | "mean_memory = np.mean([comp.main_memory for comp in computers.values()])"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 107,
259 | "metadata": {
260 | "collapsed": true
261 | },
262 | "outputs": [],
263 | "source": [
264 | "means = [mean_proc, mean_disk, mean_memory]"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 111,
270 | "metadata": {
271 | "collapsed": false
272 | },
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "A: [0.17333333333333334, 13.333333333333314, 0.66666666666666696]\n",
279 | "B: [-0.20666666666666655, -166.66666666666669, -1.333333333333333]\n",
280 | "C: [0.033333333333333215, 153.33333333333331, 0.66666666666666696]\n"
281 | ]
282 | }
283 | ],
284 | "source": [
285 | "print 'A:', A.normalize(means)\n",
286 | "print 'B:',B.normalize(means)\n",
287 | "print 'C:',C.normalize(means)"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "(b) A few options I can think of: median (of differences), length (or norm), max etc. In all cases, the interpretation of a small angle (note that cosine lies between -1 and 1) means that the two vectors are similarly directed. To be similarly directed in this context of normalized components implies that the items are similarly dispersed about the average."
295 | ]
296 | },
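     | {
     | "cell_type": "markdown",
     | "metadata": {},
     | "source": [
     | "A small added illustration (not part of the original answer): computing the plain cosine between the normalized vectors of part (a). Values close to 1 would indicate that two items are dispersed about the componentwise averages in a similar way. The helper name `cos_sim` is illustrative."
     | ]
     | },
     | {
     | "cell_type": "code",
     | "execution_count": null,
     | "metadata": {
     | "collapsed": false
     | },
     | "outputs": [],
     | "source": [
     | "# added sketch: cosine between the normalized (mean-subtracted) vectors of A, B, C\n",
     | "An = [A.processor_speed - mean_proc, A.disk - mean_disk, A.main_memory - mean_memory]\n",
     | "Bn = [B.processor_speed - mean_proc, B.disk - mean_disk, B.main_memory - mean_memory]\n",
     | "Cn = [C.processor_speed - mean_proc, C.disk - mean_disk, C.main_memory - mean_memory]\n",
     | "\n",
     | "def cos_sim(x, y):\n",
     | "    x, y = np.array(x, dtype=float), np.array(y, dtype=float)\n",
     | "    return np.dot(x, y)/np.sqrt(np.dot(x, x)*np.dot(y, y))\n",
     | "\n",
     | "print 'A,B:', cos_sim(An, Bn)\n",
     | "print 'A,C:', cos_sim(An, Cn)\n",
     | "print 'B,C:', cos_sim(Bn, Cn)"
     | ]
     | },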
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {},
300 | "source": [
301 | "\n",
302 | "# 9.2.3 "
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 123,
308 | "metadata": {
309 | "collapsed": true
310 | },
311 | "outputs": [],
312 | "source": [
313 | "# ordered [A,B,C]\n",
314 | "user_ratings = [4,2,5]"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "(a) normalizing the ratings for this user"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 116,
327 | "metadata": {
328 | "collapsed": false
329 | },
330 | "outputs": [],
331 | "source": [
332 | "avg_rating = np.mean(user_ratings)"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 118,
338 | "metadata": {
339 | "collapsed": false
340 | },
341 | "outputs": [
342 | {
343 | "data": {
344 | "text/plain": [
345 | "[0.33333333333333348, -1.6666666666666665, 1.3333333333333335]"
346 | ]
347 | },
348 | "execution_count": 118,
349 | "metadata": {},
350 | "output_type": "execute_result"
351 | }
352 | ],
353 | "source": [
354 | "# normalize the ratings for this user\n",
355 | "[rate - avg_rating for rate in user_ratings]"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {},
361 | "source": [
362 | "(b) constructing a user profile from the items profiles\n",
363 | "\n",
364 | "*I use the weights rating/5 *"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 119,
370 | "metadata": {
371 | "collapsed": true
372 | },
373 | "outputs": [],
374 | "source": [
375 | "weights = [rate/5 for rate in user_ratings]"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 126,
381 | "metadata": {
382 | "collapsed": false
383 | },
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/plain": [
388 | "[0.8, 0.4, 1.0]"
389 | ]
390 | },
391 | "execution_count": 126,
392 | "metadata": {},
393 | "output_type": "execute_result"
394 | }
395 | ],
396 | "source": [
397 | "weights"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 124,
403 | "metadata": {
404 | "collapsed": false
405 | },
406 | "outputs": [],
407 | "source": [
408 | "user_profile = {}\n",
409 | "user_profile['proc_speed'] = sum([wt*proc for wt,proc in zip(weights,[A.processor_speed,\n",
410 | " B.processor_speed,C.processor_speed])])\n",
411 | "user_profile['disk'] = sum([wt*disk for wt,disk in zip(weights,[A.disk,B.disk,C.disk])])\n",
412 | "user_profile['main_memory'] = sum([wt*mm for wt,mm in zip(weights,[A.main_memory,B.main_memory,\n",
413 | " C.main_memory])])"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 125,
419 | "metadata": {
420 | "collapsed": false
421 | },
422 | "outputs": [
423 | {
424 | "data": {
425 | "text/plain": [
426 | "{'disk': 1168.0, 'main_memory': 12.4, 'proc_speed': 6.44}"
427 | ]
428 | },
429 | "execution_count": 125,
430 | "metadata": {},
431 | "output_type": "execute_result"
432 | }
433 | ],
434 | "source": [
435 | "user_profile"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {},
441 | "source": [
442 | "Alternatively, can use the following weights."
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 128,
448 | "metadata": {
449 | "collapsed": false
450 | },
451 | "outputs": [
452 | {
453 | "data": {
454 | "text/plain": [
455 | "[0.36363636363636365, 0.18181818181818182, 0.45454545454545453]"
456 | ]
457 | },
458 | "execution_count": 128,
459 | "metadata": {},
460 | "output_type": "execute_result"
461 | }
462 | ],
463 | "source": [
464 | "weights = [rate/sum(user_ratings) for rate in user_ratings]\n",
465 | "weights"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 129,
471 | "metadata": {
472 | "collapsed": true
473 | },
474 | "outputs": [],
475 | "source": [
476 | "user_profile = {}\n",
477 | "user_profile['proc_speed'] = sum([wt*proc for wt,proc in zip(weights,[A.processor_speed,\n",
478 | " B.processor_speed,C.processor_speed])])\n",
479 | "user_profile['disk'] = sum([wt*disk for wt,disk in zip(weights,[A.disk,B.disk,C.disk])])\n",
480 | "user_profile['main_memory'] = sum([wt*mm for wt,mm in zip(weights,[A.main_memory,B.main_memory,\n",
481 | " C.main_memory])])"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 130,
487 | "metadata": {
488 | "collapsed": false
489 | },
490 | "outputs": [
491 | {
492 | "data": {
493 | "text/plain": [
494 | "{'disk': 530.9090909090909,\n",
495 | " 'main_memory': 5.636363636363637,\n",
496 | " 'proc_speed': 2.9272727272727272}"
497 | ]
498 | },
499 | "execution_count": 130,
500 | "metadata": {},
501 | "output_type": "execute_result"
502 | }
503 | ],
504 | "source": [
505 | "user_profile"
506 | ]
507 | },
508 | {
509 | "cell_type": "markdown",
510 | "metadata": {},
511 | "source": [
512 | "This user_profile supplies aggregates that are within the support of each component."
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "\n",
520 | "# 9.3.1 "
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "(a) Jaccard similarities: SIM(A,B) = 4/8; SIM(A,C) = 3/8; SIM(B,C) = 4/8"
528 | ]
529 | },
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "(b) Cosine distance:"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 132,
540 | "metadata": {
541 | "collapsed": false
542 | },
543 | "outputs": [],
544 | "source": [
545 | "U = np.array([4,5,0,5,1,0,3,2,0,3,4,3,1,2,1,0,2,0,1,3,0,4,5,3]).reshape(3,8)"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 147,
551 | "metadata": {
552 | "collapsed": false
553 | },
554 | "outputs": [],
555 | "source": [
556 | "# user ratings are rows of Utility matrix\n",
557 | "A = U[0]\n",
558 | "B = U[1]\n",
559 | "C = U[2]"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 148,
565 | "metadata": {
566 | "collapsed": true
567 | },
568 | "outputs": [],
569 | "source": [
570 | "def cosine(X,Y):\n",
571 | " return np.dot(X,Y)/(np.sqrt(np.dot(X,X)*np.dot(Y,Y)))"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 149,
577 | "metadata": {
578 | "collapsed": false
579 | },
580 | "outputs": [
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "A,B: 0.601040764009\n",
586 | "A,C: 0.614918693812\n",
587 | "B,C: 0.513870119777\n"
588 | ]
589 | }
590 | ],
591 | "source": [
592 | "print 'A,B:', cosine(A,B)\n",
593 | "print 'A,C:', cosine(A,C)\n",
594 | "print 'B,C:', cosine(B,C)"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 150,
600 | "metadata": {
601 | "collapsed": false
602 | },
603 | "outputs": [
604 | {
605 | "data": {
606 | "text/plain": [
607 | "array([[4, 5, 0, 5, 1, 0, 3, 2],\n",
608 | " [0, 3, 4, 3, 1, 2, 1, 0],\n",
609 | " [2, 0, 1, 3, 0, 4, 5, 3]])"
610 | ]
611 | },
612 | "execution_count": 150,
613 | "metadata": {},
614 | "output_type": "execute_result"
615 | }
616 | ],
617 | "source": [
618 | "U"
619 | ]
620 | },
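     | {
     | "cell_type": "markdown",
     | "metadata": {},
     | "source": [
     | "A quick added check of part (a) (not part of the original solution): treat the nonzero entries of each row of U as that user's rated items and compute the Jaccard similarity of each pair. The helper name `jacc` is illustrative."
     | ]
     | },
     | {
     | "cell_type": "code",
     | "execution_count": null,
     | "metadata": {
     | "collapsed": false
     | },
     | "outputs": [],
     | "source": [
     | "# added check: Jaccard similarity of users, based only on which items were rated\n",
     | "def jacc(x, y):\n",
     | "    sx, sy = set(np.nonzero(x)[0]), set(np.nonzero(y)[0])\n",
     | "    return len(sx & sy)/float(len(sx | sy))\n",
     | "\n",
     | "print 'A,B:', jacc(A, B)\n",
     | "print 'A,C:', jacc(A, C)\n",
     | "print 'B,C:', jacc(B, C)"
     | ]
     | },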
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | "(c) Rounding data in the utility matrix and computing Jaccard distance"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 151,
631 | "metadata": {
632 | "collapsed": true
633 | },
634 | "outputs": [],
635 | "source": [
636 | "ratings = [4,5,0,5,1,0,3,2,0,3,4,3,1,2,1,0,2,0,1,3,0,4,5,3]"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 154,
642 | "metadata": {
643 | "collapsed": false
644 | },
645 | "outputs": [],
646 | "source": [
647 | "# mapping 3,4,5 to 1 and 1,2 to 0\n",
648 | "U_rounded = np.array(map(lambda x: 1 if x>=3 else 0, ratings)).reshape(3,8)"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 155,
654 | "metadata": {
655 | "collapsed": false
656 | },
657 | "outputs": [
658 | {
659 | "data": {
660 | "text/plain": [
661 | "array([[1, 1, 0, 1, 0, 0, 1, 0],\n",
662 | " [0, 1, 1, 1, 0, 0, 0, 0],\n",
663 | " [0, 0, 0, 1, 0, 1, 1, 1]])"
664 | ]
665 | },
666 | "execution_count": 155,
667 | "metadata": {},
668 | "output_type": "execute_result"
669 | }
670 | ],
671 | "source": [
672 | "U_rounded"
673 | ]
674 | },
675 | {
676 | "cell_type": "markdown",
677 | "metadata": {},
678 | "source": [
679 | "Jaccard distances: sim(A,B) = 2/5; sim(A,C) = 2/6; sim(B,C) = 1/6"
680 | ]
681 | },
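     | {
     | "cell_type": "markdown",
     | "metadata": {},
     | "source": [
     | "An added check (not part of the original solution) of the Jaccard similarities of the rounded rows."
     | ]
     | },
     | {
     | "cell_type": "code",
     | "execution_count": null,
     | "metadata": {
     | "collapsed": false
     | },
     | "outputs": [],
     | "source": [
     | "# added check: Jaccard similarities computed from the rounded utility matrix\n",
     | "from itertools import combinations\n",
     | "rated = {'A': set(np.nonzero(U_rounded[0])[0]),\n",
     | "         'B': set(np.nonzero(U_rounded[1])[0]),\n",
     | "         'C': set(np.nonzero(U_rounded[2])[0])}\n",
     | "for u, v in combinations('ABC', 2):\n",
     | "    print u, v, len(rated[u] & rated[v])/float(len(rated[u] | rated[v]))"
     | ]
     | },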
682 | {
683 | "cell_type": "markdown",
684 | "metadata": {},
685 | "source": [
686 | "(d) computing cosine similarities for rounded data"
687 | ]
688 | },
689 | {
690 | "cell_type": "code",
691 | "execution_count": 156,
692 | "metadata": {
693 | "collapsed": false
694 | },
695 | "outputs": [
696 | {
697 | "name": "stdout",
698 | "output_type": "stream",
699 | "text": [
700 | "A,B: 0.57735026919\n",
701 | "A,C: 0.5\n",
702 | "B,C: 0.288675134595\n"
703 | ]
704 | }
705 | ],
706 | "source": [
707 | "A_rd = U_rounded[0]\n",
708 | "B_rd = U_rounded[1]\n",
709 | "C_rd = U_rounded[2]\n",
710 | "\n",
711 | "print 'A,B:', cosine(A_rd,B_rd)\n",
712 | "print 'A,C:', cosine(A_rd,C_rd)\n",
713 | "print 'B,C:', cosine(B_rd,C_rd)"
714 | ]
715 | },
716 | {
717 | "cell_type": "markdown",
718 | "metadata": {},
719 | "source": [
720 | "(e) Normalizing the matrix by user ratings: *subtract from each nonblank entry the average value for its user*"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 167,
726 | "metadata": {
727 | "collapsed": false
728 | },
729 | "outputs": [],
730 | "source": [
731 | "A_norm = map(lambda x: x-np.mean(A) if x>0 else 0, A)\n",
732 | "B_norm = map(lambda x: x-np.mean(B) if x>0 else 0, B)\n",
733 | "C_norm = map(lambda x: x-np.mean(C) if x>0 else 0, C)"
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": 165,
739 | "metadata": {
740 | "collapsed": false
741 | },
742 | "outputs": [],
743 | "source": [
744 | "U_norm = np.array([A_norm,B_norm,C_norm])"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 166,
750 | "metadata": {
751 | "collapsed": false
752 | },
753 | "outputs": [
754 | {
755 | "data": {
756 | "text/plain": [
757 | "array([[ 1.5 , 2.5 , 0. , 2.5 , -1.5 , 0. , 0.5 , -0.5 ],\n",
758 | " [ 0. , 1.25, 2.25, 1.25, -0.75, 0.25, -0.75, 0. ],\n",
759 | " [-0.25, 0. , -1.25, 0.75, 0. , 1.75, 2.75, 0.75]])"
760 | ]
761 | },
762 | "execution_count": 166,
763 | "metadata": {},
764 | "output_type": "execute_result"
765 | }
766 | ],
767 | "source": [
768 | "U_norm"
769 | ]
770 | },
771 | {
772 | "cell_type": "markdown",
773 | "metadata": {},
774 | "source": [
775 | "(e) Computing the cosine distance between each pair of users"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": 168,
781 | "metadata": {
782 | "collapsed": false
783 | },
784 | "outputs": [
785 | {
786 | "name": "stdout",
787 | "output_type": "stream",
788 | "text": [
789 | "A,B: 0.546504040851\n",
790 | "A,C: 0.163408291384\n",
791 | "B,C: -0.312561520424\n"
792 | ]
793 | }
794 | ],
795 | "source": [
796 | "print 'A,B:', cosine(A_norm,B_norm)\n",
797 | "print 'A,C:', cosine(A_norm,C_norm)\n",
798 | "print 'B,C:', cosine(B_norm,C_norm)"
799 | ]
800 | },
801 | {
802 | "cell_type": "markdown",
803 | "metadata": {},
804 | "source": [
805 | "\n",
806 | "# 9.4.1 and 9.4.2 (UV Decomposition) "
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": 280,
812 | "metadata": {
813 | "collapsed": false
814 | },
815 | "outputs": [],
816 | "source": [
817 | "U = np.array([1]*10).reshape(5,2)"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 281,
823 | "metadata": {
824 | "collapsed": false
825 | },
826 | "outputs": [],
827 | "source": [
828 | "V = np.array([1]*10).reshape(2,5)"
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": 282,
834 | "metadata": {
835 | "collapsed": false
836 | },
837 | "outputs": [],
838 | "source": [
839 | "M = np.array([5,2,4,4,3,3,1,2,4,1,2,99,3,1,4,2,5,4,3,5,4,4,5,4,99]).reshape(5,5)"
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": 283,
845 | "metadata": {
846 | "collapsed": false
847 | },
848 | "outputs": [
849 | {
850 | "data": {
851 | "text/plain": [
852 | "array([[ 5, 2, 4, 4, 3],\n",
853 | " [ 3, 1, 2, 4, 1],\n",
854 | " [ 2, 99, 3, 1, 4],\n",
855 | " [ 2, 5, 4, 3, 5],\n",
856 | " [ 4, 4, 5, 4, 99]])"
857 | ]
858 | },
859 | "execution_count": 283,
860 | "metadata": {},
861 | "output_type": "execute_result"
862 | }
863 | ],
864 | "source": [
865 | "M"
866 | ]
867 | },
868 | {
869 | "cell_type": "markdown",
870 | "metadata": {},
871 | "source": [
872 | "**Gradient descent**"
873 | ]
874 | },
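     | {
     | "cell_type": "markdown",
     | "metadata": {},
     | "source": [
     | "(Added note.) Each update below sets a single element to its exact least-squares optimum with every other entry of U and V held fixed. For an element $u_{rs}$ of $U$ the optimal value is\n",
     | "\n",
     | "$$x = \\frac{\\sum_j v_{sj}\\,\\big(m_{rj} - \\sum_{k \\neq s} u_{rk}\\, v_{kj}\\big)}{\\sum_j v_{sj}^2},$$\n",
     | "\n",
     | "where the sums over $j$ run only over the columns for which $m_{rj}$ is not blank. `opt_x` below computes exactly this by adding back the $k = s$ term that the full dot product includes; `opt_y`, which updates an element $v_{rs}$ of $V$, is symmetric with the roles of rows and columns of $M$ exchanged."
     | ]
     | },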
875 | {
876 | "cell_type": "code",
877 | "execution_count": 311,
878 | "metadata": {
879 | "collapsed": true
880 | },
881 | "outputs": [],
882 | "source": [
883 | "def opt_x(r,s):\n",
884 | " num = 0\n",
885 | " den = 0\n",
886 | " for j in range(1,6):\n",
887 | " if M[r-1,j-1] != 99:\n",
888 | " num += V[s-1,j-1]*(M[r-1,j-1]-np.dot(U[r-1,:],V[:,j-1])\n",
889 | " +U[r-1,s-1]*V[s-1,j-1]) # add back k=r\n",
890 | " den += V[s-1,j-1]**2\n",
891 | " return num/den"
892 | ]
893 | },
894 | {
895 | "cell_type": "code",
896 | "execution_count": 324,
897 | "metadata": {
898 | "collapsed": true
899 | },
900 | "outputs": [],
901 | "source": [
902 | "def opt_y(r,s):\n",
903 | " num = 0\n",
904 | " den = 0\n",
905 | " for i in range(1,6):\n",
906 | " if M[i-1,s-1] != 99:\n",
907 | " num += U[i-1,r-1]*(M[i-1,s-1]-np.dot(U[i-1,:],V[:,s-1])\n",
908 | " +U[i-1,r-1]*V[r-1,s-1]) # add back when k=s\n",
909 | " den += U[i-1,r-1]**2\n",
910 | " return num/den"
911 | ]
912 | },
913 | {
914 | "cell_type": "markdown",
915 | "metadata": {},
916 | "source": [
917 | "## 9.4.1 (a) and (b) "
918 | ]
919 | },
920 | {
921 | "cell_type": "code",
922 | "execution_count": 325,
923 | "metadata": {
924 | "collapsed": false
925 | },
926 | "outputs": [
927 | {
928 | "data": {
929 | "text/plain": [
930 | "1.5"
931 | ]
932 | },
933 | "execution_count": 325,
934 | "metadata": {},
935 | "output_type": "execute_result"
936 | }
937 | ],
938 | "source": [
939 | "opt_x(3,2)"
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": 326,
945 | "metadata": {
946 | "collapsed": false
947 | },
948 | "outputs": [
949 | {
950 | "data": {
951 | "text/plain": [
952 | "2.2000000000000002"
953 | ]
954 | },
955 | "execution_count": 326,
956 | "metadata": {},
957 | "output_type": "execute_result"
958 | }
959 | ],
960 | "source": [
961 | "opt_y(1,4)"
962 | ]
963 | },
964 | {
965 | "cell_type": "markdown",
966 | "metadata": {},
967 | "source": [
968 | "## Rest of solution to 9.4.2 "
969 | ]
970 | },
971 | {
972 | "cell_type": "code",
973 | "execution_count": 384,
974 | "metadata": {
975 | "collapsed": false
976 | },
977 | "outputs": [],
978 | "source": [
979 | "def RMSE(r,s,axis):\n",
980 | " \"\"\"\n",
981 | " if axis=0 then we are starting the decomposition with u_{r,s},\n",
982 | " if axis=1, then start decomposition with v_{r,s}\n",
983 | " \"\"\"\n",
984 | " assert axis == 0 or axis == 1\n",
985 | " if axis == 0:\n",
986 | " x = opt_x(r,s)\n",
987 | " # contribution of mse due to the r-th row of UV\n",
988 | " mse = sum(map(lambda u: (u-(x+1))**2 if u!=99 else 0, M[r-1,:]))\n",
989 | " # contribution of mse due to the other rows of UV\n",
990 | " mse += sum(sum(map(lambda u: (u-2)**2 if u!=99 else 0, M[i,:])) for i in range(0,5) if i != r-1)\n",
991 | " return np.sqrt(mse)\n",
992 | " else:\n",
993 | " y = opt_y(r,s)\n",
994 | " # contribution of mse due to the s-th row of UV\n",
995 | " mse = sum(map(lambda u: (u-(y+1))**2 if u!=99 else 0, M[:,s-1]))\n",
996 | " mse += sum(sum(map(lambda u: (u-2)**2 if u!=99 else 0, M[:,j])) for j in range(0,5) if j != s-1)\n",
997 | " return np.sqrt(mse)"
998 | ]
999 | },
1000 | {
1001 | "cell_type": "code",
1002 | "execution_count": 385,
1003 | "metadata": {
1004 | "collapsed": false
1005 | },
1006 | "outputs": [
1007 | {
1008 | "data": {
1009 | "text/plain": [
1010 | "7.8866976612521418"
1011 | ]
1012 | },
1013 | "execution_count": 385,
1014 | "metadata": {},
1015 | "output_type": "execute_result"
1016 | }
1017 | ],
1018 | "source": [
1019 | "RMSE(1,1,0)"
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "code",
1024 | "execution_count": 386,
1025 | "metadata": {
1026 | "collapsed": false
1027 | },
1028 | "outputs": [
1029 | {
1030 | "data": {
1031 | "text/plain": [
1032 | "7.8866976612521418"
1033 | ]
1034 | },
1035 | "execution_count": 386,
1036 | "metadata": {},
1037 | "output_type": "execute_result"
1038 | }
1039 | ],
1040 | "source": [
1041 | "# manual check for starting UV decomposition with u_{1,1}\n",
1042 | "np.sqrt(sum((M[0,:]-3.6)**2+(M[1,:]-2)**2+(M[3,:]-2)**2)\n",
1043 | "+sum([(m-2)**2 for m in M[2,:] if m!=99])+sum([(m-2)**2 for m in M[4,:] if m!=99]))"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "code",
1048 | "execution_count": 392,
1049 | "metadata": {
1050 | "collapsed": false
1051 | },
1052 | "outputs": [
1053 | {
1054 | "name": "stdout",
1055 | "output_type": "stream",
1056 | "text": [
1057 | "minimum RMSE from U: 7.3993 occurring from: [5, 1]\n"
1058 | ]
1059 | }
1060 | ],
1061 | "source": [
1062 | "# finding the pair (r,s) that achieves the min RMSE after starting decomposition with u_{r,s}\n",
1063 | "min_RMSE = 2**32\n",
1064 | "min_pair = []\n",
1065 | "for r in range(1,6):\n",
1066 | " for s in range(1,3):\n",
1067 | " step_RMSE = RMSE(r,s,0)\n",
1068 | " if step_RMSE < min_RMSE:\n",
1069 | " min_RMSE, min_pair = step_RMSE, [r,s]\n",
1070 | "\n",
1071 | "print 'minimum RMSE from U: %.4f occurring from: %s' %(min_RMSE, str(min_pair))\n"
1072 | ]
1073 | },
1074 | {
1075 | "cell_type": "code",
1076 | "execution_count": 393,
1077 | "metadata": {
1078 | "collapsed": false
1079 | },
1080 | "outputs": [
1081 | {
1082 | "name": "stdout",
1083 | "output_type": "stream",
1084 | "text": [
1085 | "minimum RMSE from V: 7.8867 occurring from: [1, 3]\n"
1086 | ]
1087 | }
1088 | ],
1089 | "source": [
1090 | "# finding the pair (r,s) that achieves the min RMSE after starting decomposition with v_{r,s}\n",
1091 | "min_RMSE = 2**32\n",
1092 | "min_pair = []\n",
1093 | "for r in range(1,3):\n",
1094 | " for s in range(1,6):\n",
1095 | " step_RMSE = RMSE(r,s,1)\n",
1096 | " if step_RMSE < min_RMSE:\n",
1097 | " min_RMSE, min_pair = step_RMSE, [r,s]\n",
1098 | "\n",
1099 | "print 'minimum RMSE from V: %.4f occurring from: %s' %(min_RMSE, str(min_pair))"
1100 | ]
1101 | },
1102 | {
1103 | "cell_type": "markdown",
1104 | "metadata": {},
1105 | "source": [
1106 | "The above shows that the minimum RMSE from all possible starting points is 7.3993 which we obtain by starting the decomposition at u_{5,1}. *Note that the above code only finds one such pair that results in the minimum*."
1107 | ]
1108 | },
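     | {
     | "cell_type": "markdown",
     | "metadata": {},
     | "source": [
     | "An added check (not part of the original solution): recompute the RMSE for every starting element $u_{r,s}$ and list all pairs that tie the minimum."
     | ]
     | },
     | {
     | "cell_type": "code",
     | "execution_count": null,
     | "metadata": {
     | "collapsed": false
     | },
     | "outputs": [],
     | "source": [
     | "# added check: every starting element u_{r,s} whose RMSE ties the minimum\n",
     | "u_rmses = [(RMSE(r,s,0), [r,s]) for r in range(1,6) for s in range(1,3)]\n",
     | "best = min(rmse for rmse, pair in u_rmses)\n",
     | "print [pair for rmse, pair in u_rmses if abs(rmse - best) < 1e-9]"
     | ]
     | },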
1109 | {
1110 | "cell_type": "code",
1111 | "execution_count": 406,
1112 | "metadata": {
1113 | "collapsed": false
1114 | },
1115 | "outputs": [
1116 | {
1117 | "data": {
1118 | "text/plain": [
1119 | "7.399324293474371"
1120 | ]
1121 | },
1122 | "execution_count": 406,
1123 | "metadata": {},
1124 | "output_type": "execute_result"
1125 | }
1126 | ],
1127 | "source": [
1128 | "RMSE(5,2,0)"
1129 | ]
1130 | },
1131 | {
1132 | "cell_type": "code",
1133 | "execution_count": 408,
1134 | "metadata": {
1135 | "collapsed": false
1136 | },
1137 | "outputs": [
1138 | {
1139 | "data": {
1140 | "text/plain": [
1141 | "7.6681158050723255"
1142 | ]
1143 | },
1144 | "execution_count": 408,
1145 | "metadata": {},
1146 | "output_type": "execute_result"
1147 | }
1148 | ],
1149 | "source": [
1150 | "RMSE(4,2,0)"
1151 | ]
1152 | },
1153 | {
1154 | "cell_type": "markdown",
1155 | "metadata": {},
1156 | "source": [
1157 | "For example, u_{5,2} also achieves the minimum RMSE, whereas starting from u_{4,2} does not."
1158 | ]
1159 | },
1160 | {
1161 | "cell_type": "markdown",
1162 | "metadata": {},
1163 | "source": [
1164 | "\n",
1165 | "# 9.4.3 "
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": 430,
1171 | "metadata": {
1172 | "collapsed": false
1173 | },
1174 | "outputs": [
1175 | {
1176 | "data": {
1177 | "text/plain": [
1178 | "array([[ 2.6 , 1. ],\n",
1179 | " [ 1. , 1. ],\n",
1180 | " [ 1.178, 1. ],\n",
1181 | " [ 1. , 1. ],\n",
1182 | " [ 1. , 1. ]])"
1183 | ]
1184 | },
1185 | "execution_count": 430,
1186 | "metadata": {},
1187 | "output_type": "execute_result"
1188 | }
1189 | ],
1190 | "source": [
1191 | "U = np.array([2.6,1,1,1,1.178,1,1,1,1,1]).reshape(5,2)\n",
1192 | "U"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": 431,
1198 | "metadata": {
1199 | "collapsed": false
1200 | },
1201 | "outputs": [
1202 | {
1203 | "data": {
1204 | "text/plain": [
1205 | "array([[ 1.617, 1. , 1. , 1. , 1. ],\n",
1206 | " [ 1. , 1. , 1. , 1. , 1. ]])"
1207 | ]
1208 | },
1209 | "execution_count": 431,
1210 | "metadata": {},
1211 | "output_type": "execute_result"
1212 | }
1213 | ],
1214 | "source": [
1215 | "V = np.array([1.617,1,1,1,1,1,1,1,1,1]).reshape(2,5)\n",
1216 | "V"
1217 | ]
1218 | },
1219 | {
1220 | "cell_type": "markdown",
1221 | "metadata": {},
1222 | "source": [
1223 | "Can do matrix multiplication using numpy's dot() method"
1224 | ]
1225 | },
1226 | {
1227 | "cell_type": "code",
1228 | "execution_count": 432,
1229 | "metadata": {
1230 | "collapsed": false
1231 | },
1232 | "outputs": [
1233 | {
1234 | "data": {
1235 | "text/plain": [
1236 | "array([[ 5.2042 , 3.6 , 3.6 , 3.6 , 3.6 ],\n",
1237 | " [ 2.617 , 2. , 2. , 2. , 2. ],\n",
1238 | " [ 2.904826, 2.178 , 2.178 , 2.178 , 2.178 ],\n",
1239 | " [ 2.617 , 2. , 2. , 2. , 2. ],\n",
1240 | " [ 2.617 , 2. , 2. , 2. , 2. ]])"
1241 | ]
1242 | },
1243 | "execution_count": 432,
1244 | "metadata": {},
1245 | "output_type": "execute_result"
1246 | }
1247 | ],
1248 | "source": [
1249 | "UV = np.dot(U,V)\n",
1250 | "UV"
1251 | ]
1252 | },
1253 | {
1254 | "cell_type": "markdown",
1255 | "metadata": {},
1256 | "source": [
1257 | "We can use functions similar to `opt_x` and `opt_y` from the previous question. In this function, opt_x and opt_y will alter the U and V matrices in addition to returing the optimal values."
1258 | ]
1259 | },
1260 | {
1261 | "cell_type": "code",
1262 | "execution_count": 433,
1263 | "metadata": {
1264 | "collapsed": true
1265 | },
1266 | "outputs": [],
1267 | "source": [
1268 | "def opt_x(r,s):\n",
1269 | " num = 0\n",
1270 | " den = 0\n",
1271 | " for j in range(1,6):\n",
1272 | " if M[r-1,j-1] != 99:\n",
1273 | " num += V[s-1,j-1]*(M[r-1,j-1]-np.dot(U[r-1,:],V[:,j-1])\n",
1274 | " +U[r-1,s-1]*V[s-1,j-1]) # add back k=r\n",
1275 | " den += V[s-1,j-1]**2\n",
1276 | " U[r-1,s-1] = num/den\n",
1277 | " return num/den\n",
1278 | "\n",
1279 | "def opt_y(r,s):\n",
1280 | " num = 0\n",
1281 | " den = 0\n",
1282 | " for i in range(1,6):\n",
1283 | " if M[i-1,s-1] != 99:\n",
1284 | " num += U[i-1,r-1]*(M[i-1,s-1]-np.dot(U[i-1,:],V[:,s-1])\n",
1285 | " +U[i-1,r-1]*V[r-1,s-1]) # add back when k=s\n",
1286 | " den += U[i-1,r-1]**2\n",
1287 | " V[r-1,s-1] = num/den\n",
1288 | " return num/den"
1289 | ]
1290 | },
1291 | {
1292 | "cell_type": "code",
1293 | "execution_count": 434,
1294 | "metadata": {
1295 | "collapsed": true
1296 | },
1297 | "outputs": [],
1298 | "source": [
1299 | "def RMSE_general():\n",
1300 | " UV = np.dot(U,V)\n",
1301 | " mse = 0\n",
1302 | " for i in range(5):\n",
1303 | " for j in range(5):\n",
1304 | " if M[i,j] != 99:\n",
1305 | " mse += (M[i,j]-UV[i,j])**2\n",
1306 | " return np.sqrt(mse)"
1307 | ]
1308 | },
1309 | {
1310 | "cell_type": "code",
1311 | "execution_count": 435,
1312 | "metadata": {
1313 | "collapsed": false
1314 | },
1315 | "outputs": [
1316 | {
1317 | "data": {
1318 | "text/plain": [
1319 | "7.6107507336842932"
1320 | ]
1321 | },
1322 | "execution_count": 435,
1323 | "metadata": {},
1324 | "output_type": "execute_result"
1325 | }
1326 | ],
1327 | "source": [
1328 | "# Current RMSE\n",
1329 | "RMSE_general()"
1330 | ]
1331 | },
1332 | {
1333 | "cell_type": "markdown",
1334 | "metadata": {},
1335 | "source": [
1336 | "(a) Considering u_{1,1} as the element to update "
1337 | ]
1338 | },
1339 | {
1340 | "cell_type": "code",
1341 | "execution_count": 436,
1342 | "metadata": {
1343 | "collapsed": false
1344 | },
1345 | "outputs": [
1346 | {
1347 | "data": {
1348 | "text/plain": [
1349 | "2.3384319353487366"
1350 | ]
1351 | },
1352 | "execution_count": 436,
1353 | "metadata": {},
1354 | "output_type": "execute_result"
1355 | }
1356 | ],
1357 | "source": [
1358 | "# update U with optimal x and print out optimal x\n",
1359 | "opt_x(1,1)"
1360 | ]
1361 | },
1362 | {
1363 | "cell_type": "code",
1364 | "execution_count": 437,
1365 | "metadata": {
1366 | "collapsed": false
1367 | },
1368 | "outputs": [
1369 | {
1370 | "data": {
1371 | "text/plain": [
1372 | "array([[ 2.33843194, 1. ],\n",
1373 | " [ 1. , 1. ],\n",
1374 | " [ 1.178 , 1. ],\n",
1375 | " [ 1. , 1. ],\n",
1376 | " [ 1. , 1. ]])"
1377 | ]
1378 | },
1379 | "execution_count": 437,
1380 | "metadata": {},
1381 | "output_type": "execute_result"
1382 | }
1383 | ],
1384 | "source": [
1385 | "U"
1386 | ]
1387 | },
1388 | {
1389 | "cell_type": "code",
1390 | "execution_count": 439,
1391 | "metadata": {
1392 | "collapsed": false
1393 | },
1394 | "outputs": [
1395 | {
1396 | "data": {
1397 | "text/plain": [
1398 | "7.5809606194928714"
1399 | ]
1400 | },
1401 | "execution_count": 439,
1402 | "metadata": {},
1403 | "output_type": "execute_result"
1404 | }
1405 | ],
1406 | "source": [
1407 | "# check new RMSE has decreased\n",
1408 | "RMSE_general()"
1409 | ]
1410 | },
1411 | {
1412 | "cell_type": "markdown",
1413 | "metadata": {},
1414 | "source": [
1415 | "(b) Then choose the best value for u_{5,2}"
1416 | ]
1417 | },
1418 | {
1419 | "cell_type": "code",
1420 | "execution_count": 440,
1421 | "metadata": {
1422 | "collapsed": false
1423 | },
1424 | "outputs": [
1425 | {
1426 | "data": {
1427 | "text/plain": [
1428 | "array([[ 2.33843194, 1. ],\n",
1429 | " [ 1. , 1. ],\n",
1430 | " [ 1.178 , 1. ],\n",
1431 | " [ 1. , 1. ],\n",
1432 | " [ 1. , 3.09575 ]])"
1433 | ]
1434 | },
1435 | "execution_count": 440,
1436 | "metadata": {},
1437 | "output_type": "execute_result"
1438 | }
1439 | ],
1440 | "source": [
1441 | "opt_x(5,2)\n",
1442 | "U"
1443 | ]
1444 | },
1445 | {
1446 | "cell_type": "code",
1447 | "execution_count": 444,
1448 | "metadata": {
1449 | "collapsed": false
1450 | },
1451 | "outputs": [
1452 | {
1453 | "data": {
1454 | "text/plain": [
1455 | "6.3168260751980299"
1456 | ]
1457 | },
1458 | "execution_count": 444,
1459 | "metadata": {},
1460 | "output_type": "execute_result"
1461 | }
1462 | ],
1463 | "source": [
1464 | "# checking if RMSE was decreased\n",
1465 | "RMSE_general()"
1466 | ]
1467 | },
1468 | {
1469 | "cell_type": "markdown",
1470 | "metadata": {},
1471 | "source": [
1472 | "(c) Next, choosing the best value for v_{2,2}"
1473 | ]
1474 | },
1475 | {
1476 | "cell_type": "code",
1477 | "execution_count": 445,
1478 | "metadata": {
1479 | "collapsed": false
1480 | },
1481 | "outputs": [
1482 | {
1483 | "data": {
1484 | "text/plain": [
1485 | "array([[ 1.617, 1. , 1. , 1. , 1. ],\n",
1486 | " [ 1. , 1. , 1. , 1. , 1. ]])"
1487 | ]
1488 | },
1489 | "execution_count": 445,
1490 | "metadata": {},
1491 | "output_type": "execute_result"
1492 | }
1493 | ],
1494 | "source": [
1495 | "V"
1496 | ]
1497 | },
1498 | {
1499 | "cell_type": "code",
1500 | "execution_count": 446,
1501 | "metadata": {
1502 | "collapsed": false
1503 | },
1504 | "outputs": [
1505 | {
1506 | "data": {
1507 | "text/plain": [
1508 | "array([[ 1.617 , 1. , 1. , 1. , 1. ],\n",
1509 | " [ 1. , 1.02901777, 1. , 1. , 1. ]])"
1510 | ]
1511 | },
1512 | "execution_count": 446,
1513 | "metadata": {},
1514 | "output_type": "execute_result"
1515 | }
1516 | ],
1517 | "source": [
1518 | "# updating V at v_{2,2}\n",
1519 | "opt_y(2,2)\n",
1520 | "V"
1521 | ]
1522 | },
1523 | {
1524 | "cell_type": "code",
1525 | "execution_count": 447,
1526 | "metadata": {
1527 | "collapsed": false
1528 | },
1529 | "outputs": [
1530 | {
1531 | "data": {
1532 | "text/plain": [
1533 | "6.3159873198925407"
1534 | ]
1535 | },
1536 | "execution_count": 447,
1537 | "metadata": {},
1538 | "output_type": "execute_result"
1539 | }
1540 | ],
1541 | "source": [
1542 | "# checking that RMSE indeed decreased\n",
1543 | "RMSE_general()"
1544 | ]
1545 | },
1546 | {
1547 | "cell_type": "markdown",
1548 | "metadata": {},
1549 | "source": [
1550 | "**BONUS STEP: choosing best value for v_{1,5}**"
1551 | ]
1552 | },
1553 | {
1554 | "cell_type": "code",
1555 | "execution_count": 449,
1556 | "metadata": {
1557 | "collapsed": false
1558 | },
1559 | "outputs": [
1560 | {
1561 | "data": {
1562 | "text/plain": [
1563 | "array([[ 1.617 , 1. , 1. , 1. , 1.37883194],\n",
1564 | " [ 1. , 1.02901777, 1. , 1. , 1. ]])"
1565 | ]
1566 | },
1567 | "execution_count": 449,
1568 | "metadata": {},
1569 | "output_type": "execute_result"
1570 | }
1571 | ],
1572 | "source": [
1573 | "opt_y(1,5)\n",
1574 | "V"
1575 | ]
1576 | },
1577 | {
1578 | "cell_type": "code",
1579 | "execution_count": 450,
1580 | "metadata": {
1581 | "collapsed": false
1582 | },
1583 | "outputs": [
1584 | {
1585 | "data": {
1586 | "text/plain": [
1587 | "6.2145592358668278"
1588 | ]
1589 | },
1590 | "execution_count": 450,
1591 | "metadata": {},
1592 | "output_type": "execute_result"
1593 | }
1594 | ],
1595 | "source": [
1596 | "RMSE_general()"
1597 | ]
1598 | },
1599 | {
1600 | "cell_type": "markdown",
1601 | "metadata": {},
1602 | "source": [
1603 | "\n",
1604 | "# 9.4.5 (Normalizing the Utility Matrix)"
1605 | ]
1606 | },
1607 | {
1608 | "cell_type": "code",
1609 | "execution_count": 519,
1610 | "metadata": {
1611 | "collapsed": false
1612 | },
1613 | "outputs": [
1614 | {
1615 | "data": {
1616 | "text/plain": [
1617 | "array([[ 5, 2, 4, 4, 3],\n",
1618 | " [ 3, 1, 2, 4, 1],\n",
1619 | " [ 2, 99, 3, 1, 4],\n",
1620 | " [ 2, 5, 4, 3, 5],\n",
1621 | " [ 4, 4, 5, 4, 99]])"
1622 | ]
1623 | },
1624 | "execution_count": 519,
1625 | "metadata": {},
1626 | "output_type": "execute_result"
1627 | }
1628 | ],
1629 | "source": [
1630 | "M"
1631 | ]
1632 | },
1633 | {
1634 | "cell_type": "markdown",
1635 | "metadata": {},
1636 | "source": [
1637 | "### (a) First subtract from each element the average of its row, and then subtract from each element the average of its (modified) column"
1638 | ]
1639 | },
1640 | {
1641 | "cell_type": "markdown",
1642 | "metadata": {},
1643 | "source": [
1644 | "### Step 1"
1645 | ]
1646 | },
1647 | {
1648 | "cell_type": "code",
1649 | "execution_count": 520,
1650 | "metadata": {
1651 | "collapsed": false
1652 | },
1653 | "outputs": [
1654 | {
1655 | "data": {
1656 | "text/plain": [
1657 | "array([[5, 2, 4, 4, 3],\n",
1658 | " [3, 1, 2, 4, 1],\n",
1659 | " [2, 0, 3, 1, 4],\n",
1660 | " [2, 5, 4, 3, 5],\n",
1661 | " [4, 4, 5, 4, 0]])"
1662 | ]
1663 | },
1664 | "execution_count": 520,
1665 | "metadata": {},
1666 | "output_type": "execute_result"
1667 | }
1668 | ],
1669 | "source": [
1670 | "# changing 99 values to 0 so that they don't affect the row sums\n",
1671 | "(M!=99)*M"
1672 | ]
1673 | },
1674 | {
1675 | "cell_type": "code",
1676 | "execution_count": 521,
1677 | "metadata": {
1678 | "collapsed": false
1679 | },
1680 | "outputs": [
1681 | {
1682 | "data": {
1683 | "text/plain": [
1684 | "array([18, 11, 10, 19, 17])"
1685 | ]
1686 | },
1687 | "execution_count": 521,
1688 | "metadata": {},
1689 | "output_type": "execute_result"
1690 | }
1691 | ],
1692 | "source": [
1693 | "# row sums of the above matrix\n",
1694 | "foo = np.sum((M!=99)*M,1)\n",
1695 | "foo"
1696 | ]
1697 | },
1698 | {
1699 | "cell_type": "code",
1700 | "execution_count": 522,
1701 | "metadata": {
1702 | "collapsed": false
1703 | },
1704 | "outputs": [
1705 | {
1706 | "data": {
1707 | "text/plain": [
1708 | "array([5, 5, 4, 5, 4])"
1709 | ]
1710 | },
1711 | "execution_count": 522,
1712 | "metadata": {},
1713 | "output_type": "execute_result"
1714 | }
1715 | ],
1716 | "source": [
1717 | "# number of ratings per user (i.e. per row)\n",
1718 | "bar = np.sum((M!=99),1)\n",
1719 | "bar"
1720 | ]
1721 | },
1722 | {
1723 | "cell_type": "code",
1724 | "execution_count": 523,
1725 | "metadata": {
1726 | "collapsed": false
1727 | },
1728 | "outputs": [
1729 | {
1730 | "data": {
1731 | "text/plain": [
1732 | "array([ 3.6 , 2.2 , 2.5 , 3.8 , 4.25])"
1733 | ]
1734 | },
1735 | "execution_count": 523,
1736 | "metadata": {},
1737 | "output_type": "execute_result"
1738 | }
1739 | ],
1740 | "source": [
1741 | "# row averages\n",
1742 | "row_averages = foo/bar # elementwise division\n",
1743 | "row_averages"
1744 | ]
1745 | },
1746 | {
1747 | "cell_type": "code",
1748 | "execution_count": 524,
1749 | "metadata": {
1750 | "collapsed": false
1751 | },
1752 | "outputs": [
1753 | {
1754 | "data": {
1755 | "text/plain": [
1756 | "array([[ 1.4 , -1.6 , 0.4 , 0.4 , -0.6 ],\n",
1757 | " [ 0.8 , -1.2 , -0.2 , 1.8 , -1.2 ],\n",
1758 | " [ -0.5 , 99. , 0.5 , -1.5 , 1.5 ],\n",
1759 | " [ -1.8 , 1.2 , 0.2 , -0.8 , 1.2 ],\n",
1760 | " [ -0.25, -0.25, 0.75, -0.25, 99. ]])"
1761 | ]
1762 | },
1763 | "execution_count": 524,
1764 | "metadata": {},
1765 | "output_type": "execute_result"
1766 | }
1767 | ],
1768 | "source": [
1769 | "M_step1 = []\n",
1770 | "for i,row in enumerate(M):\n",
1771 | " M_step1 += map(lambda x: x-row_averages[i] if x!=99 else 99, row)\n",
1772 | "\n",
1773 | "M_step1 = np.array(M_step1).reshape(5,5)\n",
1774 | "M_step1"
1775 | ]
1776 | },
1777 | {
1778 | "cell_type": "markdown",
1779 | "metadata": {},
1780 | "source": [
1781 | "### Step 2 "
1782 | ]
1783 | },
1784 | {
1785 | "cell_type": "code",
1786 | "execution_count": 525,
1787 | "metadata": {
1788 | "collapsed": false
1789 | },
1790 | "outputs": [
1791 | {
1792 | "data": {
1793 | "text/plain": [
1794 | "array([-0.35, -1.85, 1.65, -0.35, 0.9 ])"
1795 | ]
1796 | },
1797 | "execution_count": 525,
1798 | "metadata": {},
1799 | "output_type": "execute_result"
1800 | }
1801 | ],
1802 | "source": [
1803 | "# column sums of the above matrix\n",
1804 | "foo = np.sum((M_step1!=99)*M_step1,0)\n",
1805 | "foo"
1806 | ]
1807 | },
1808 | {
1809 | "cell_type": "code",
1810 | "execution_count": 526,
1811 | "metadata": {
1812 | "collapsed": false
1813 | },
1814 | "outputs": [
1815 | {
1816 | "data": {
1817 | "text/plain": [
1818 | "array([5, 4, 5, 5, 4])"
1819 | ]
1820 | },
1821 | "execution_count": 526,
1822 | "metadata": {},
1823 | "output_type": "execute_result"
1824 | }
1825 | ],
1826 | "source": [
1827 | "# number of ratings per item (i.e. per column)\n",
1828 | "bar = np.sum((M_step1!=99),0)\n",
1829 | "bar# number of ratings per item (i.e. per column)\n",
1830 | "bar = np.sum((M_step1!=99),0)\n",
1831 | "bar"
1832 | ]
1833 | },
1834 | {
1835 | "cell_type": "code",
1836 | "execution_count": 527,
1837 | "metadata": {
1838 | "collapsed": false
1839 | },
1840 | "outputs": [
1841 | {
1842 | "data": {
1843 | "text/plain": [
1844 | "array([-0.07 , -0.4625, 0.33 , -0.07 , 0.225 ])"
1845 | ]
1846 | },
1847 | "execution_count": 527,
1848 | "metadata": {},
1849 | "output_type": "execute_result"
1850 | }
1851 | ],
1852 | "source": [
1853 | "# column averages\n",
1854 | "col_averages = foo/bar # elementwise division\n",
1855 | "col_averages"
1856 | ]
1857 | },
1858 | {
1859 | "cell_type": "code",
1860 | "execution_count": 548,
1861 | "metadata": {
1862 | "collapsed": false
1863 | },
1864 | "outputs": [
1865 | {
1866 | "data": {
1867 | "text/plain": [
1868 | "array([[ 1.47000000e+00, -1.13750000e+00, 7.00000000e-02,\n",
1869 | " 4.70000000e-01, -8.25000000e-01],\n",
1870 | " [ 8.70000000e-01, -7.37500000e-01, -5.30000000e-01,\n",
1871 | " 1.87000000e+00, -1.42500000e+00],\n",
1872 | " [ -4.30000000e-01, 9.90000000e+01, 1.70000000e-01,\n",
1873 | " -1.43000000e+00, 1.27500000e+00],\n",
1874 | " [ -1.73000000e+00, 1.66250000e+00, -1.30000000e-01,\n",
1875 | " -7.30000000e-01, 9.75000000e-01],\n",
1876 | " [ -1.80000000e-01, 2.12500000e-01, 4.20000000e-01,\n",
1877 | " -1.80000000e-01, 9.90000000e+01]])"
1878 | ]
1879 | },
1880 | "execution_count": 548,
1881 | "metadata": {},
1882 | "output_type": "execute_result"
1883 | }
1884 | ],
1885 | "source": [
1886 | "M_step2 = []\n",
1887 | "for i,row in enumerate(M_step1.T): # take transpose of the M_step1 and consider the rows\n",
1888 | " M_step2 += map(lambda x: x-col_averages[i] if x!=99 else 99, row)\n",
1889 | "\n",
1890 | "M_step2 = (np.array(M_step2).reshape(5,5)).T # need to take the transpose again\n",
1891 | "M_step2"
1892 | ]
1893 | },
1894 | {
1895 | "cell_type": "markdown",
1896 | "metadata": {},
1897 | "source": [
1898 | "### (b) First subtract from each element the average of its column, and then subtract from each element the average of its modified row."
1899 | ]
1900 | },
1901 | {
1902 | "cell_type": "code",
1903 | "execution_count": 550,
1904 | "metadata": {
1905 | "collapsed": false
1906 | },
1907 | "outputs": [
1908 | {
1909 | "data": {
1910 | "text/plain": [
1911 | "array([[ 5, 2, 4, 4, 3],\n",
1912 | " [ 3, 1, 2, 4, 1],\n",
1913 | " [ 2, 99, 3, 1, 4],\n",
1914 | " [ 2, 5, 4, 3, 5],\n",
1915 | " [ 4, 4, 5, 4, 99]])"
1916 | ]
1917 | },
1918 | "execution_count": 550,
1919 | "metadata": {},
1920 | "output_type": "execute_result"
1921 | }
1922 | ],
1923 | "source": [
1924 | "M"
1925 | ]
1926 | },
1927 | {
1928 | "cell_type": "markdown",
1929 | "metadata": {},
1930 | "source": [
1931 | "### Step 1"
1932 | ]
1933 | },
1934 | {
1935 | "cell_type": "code",
1936 | "execution_count": 560,
1937 | "metadata": {
1938 | "collapsed": false
1939 | },
1940 | "outputs": [
1941 | {
1942 | "data": {
1943 | "text/plain": [
1944 | "array([16, 12, 18, 16, 13])"
1945 | ]
1946 | },
1947 | "execution_count": 560,
1948 | "metadata": {},
1949 | "output_type": "execute_result"
1950 | }
1951 | ],
1952 | "source": [
1953 | "# column sums of the above matrix (not including the missing values denoted by 99)\n",
1954 | "foo = np.sum((M!=99)*M,0)\n",
1955 | "foo"
1956 | ]
1957 | },
1958 | {
1959 | "cell_type": "code",
1960 | "execution_count": 561,
1961 | "metadata": {
1962 | "collapsed": false
1963 | },
1964 | "outputs": [
1965 | {
1966 | "data": {
1967 | "text/plain": [
1968 | "array([5, 4, 5, 5, 4])"
1969 | ]
1970 | },
1971 | "execution_count": 561,
1972 | "metadata": {},
1973 | "output_type": "execute_result"
1974 | }
1975 | ],
1976 | "source": [
1977 | "# number of ratings per item (i.e. per column)\n",
1978 | "bar = np.sum((M!=99),0)\n",
1979 | "bar"
1980 | ]
1981 | },
1982 | {
1983 | "cell_type": "code",
1984 | "execution_count": 562,
1985 | "metadata": {
1986 | "collapsed": false
1987 | },
1988 | "outputs": [
1989 | {
1990 | "data": {
1991 | "text/plain": [
1992 | "array([ 3.2 , 3. , 3.6 , 3.2 , 3.25])"
1993 | ]
1994 | },
1995 | "execution_count": 562,
1996 | "metadata": {},
1997 | "output_type": "execute_result"
1998 | }
1999 | ],
2000 | "source": [
2001 | "# column averages\n",
2002 | "col_averages = foo/bar # elementwise division\n",
2003 | "col_averages"
2004 | ]
2005 | },
2006 | {
2007 | "cell_type": "code",
2008 | "execution_count": 563,
2009 | "metadata": {
2010 | "collapsed": false
2011 | },
2012 | "outputs": [
2013 | {
2014 | "data": {
2015 | "text/plain": [
2016 | "array([[ 1.8 , -1. , 0.4 , 0.8 , -0.25],\n",
2017 | " [ -0.2 , -2. , -1.6 , 0.8 , -2.25],\n",
2018 | " [ -1.2 , 99. , -0.6 , -2.2 , 0.75],\n",
2019 | " [ -1.2 , 2. , 0.4 , -0.2 , 1.75],\n",
2020 | " [ 0.8 , 1. , 1.4 , 0.8 , 99. ]])"
2021 | ]
2022 | },
2023 | "execution_count": 563,
2024 | "metadata": {},
2025 | "output_type": "execute_result"
2026 | }
2027 | ],
2028 | "source": [
2029 | "M_step1 = []\n",
2030 | "for i,row in enumerate(M.T): # take transpose of the M_step1 and consider the rows\n",
2031 | " M_step1 += map(lambda x: x-col_averages[i] if x!=99 else 99, row)\n",
2032 | "\n",
2033 | "M_step1 = (np.array(M_step1).reshape(5,5)).T # need to take the transpose again\n",
2034 | "M_step1"
2035 | ]
2036 | },
2037 | {
2038 | "cell_type": "markdown",
2039 | "metadata": {},
2040 | "source": [
2041 | "### Step 2"
2042 | ]
2043 | },
2044 | {
2045 | "cell_type": "code",
2046 | "execution_count": 564,
2047 | "metadata": {
2048 | "collapsed": false
2049 | },
2050 | "outputs": [
2051 | {
2052 | "data": {
2053 | "text/plain": [
2054 | "array([ 1.75, -5.25, -3.25, 2.75, 4. ])"
2055 | ]
2056 | },
2057 | "execution_count": 564,
2058 | "metadata": {},
2059 | "output_type": "execute_result"
2060 | }
2061 | ],
2062 | "source": [
2063 | "# row sums of the above matrix\n",
2064 | "foo = np.sum((M!=99)*M_step1,1)\n",
2065 | "foo"
2066 | ]
2067 | },
2068 | {
2069 | "cell_type": "code",
2070 | "execution_count": 565,
2071 | "metadata": {
2072 | "collapsed": false
2073 | },
2074 | "outputs": [
2075 | {
2076 | "data": {
2077 | "text/plain": [
2078 | "array([5, 5, 4, 5, 4])"
2079 | ]
2080 | },
2081 | "execution_count": 565,
2082 | "metadata": {},
2083 | "output_type": "execute_result"
2084 | }
2085 | ],
2086 | "source": [
2087 | "# number of ratings per user (i.e. per row)\n",
2088 | "bar = np.sum((M!=99),1)\n",
2089 | "bar"
2090 | ]
2091 | },
2092 | {
2093 | "cell_type": "code",
2094 | "execution_count": 566,
2095 | "metadata": {
2096 | "collapsed": false
2097 | },
2098 | "outputs": [
2099 | {
2100 | "data": {
2101 | "text/plain": [
2102 | "array([ 0.35 , -1.05 , -0.8125, 0.55 , 1. ])"
2103 | ]
2104 | },
2105 | "execution_count": 566,
2106 | "metadata": {},
2107 | "output_type": "execute_result"
2108 | }
2109 | ],
2110 | "source": [
2111 | "# row averages\n",
2112 | "row_averages = foo/bar # elementwise division\n",
2113 | "row_averages"
2114 | ]
2115 | },
2116 | {
2117 | "cell_type": "code",
2118 | "execution_count": 568,
2119 | "metadata": {
2120 | "collapsed": false
2121 | },
2122 | "outputs": [
2123 | {
2124 | "data": {
2125 | "text/plain": [
2126 | "array([[ 1.45000000e+00, -1.35000000e+00, 5.00000000e-02,\n",
2127 | " 4.50000000e-01, -6.00000000e-01],\n",
2128 | " [ 8.50000000e-01, -9.50000000e-01, -5.50000000e-01,\n",
2129 | " 1.85000000e+00, -1.20000000e+00],\n",
2130 | " [ -3.87500000e-01, 9.90000000e+01, 2.12500000e-01,\n",
2131 | " -1.38750000e+00, 1.56250000e+00],\n",
2132 | " [ -1.75000000e+00, 1.45000000e+00, -1.50000000e-01,\n",
2133 | " -7.50000000e-01, 1.20000000e+00],\n",
2134 | " [ -2.00000000e-01, 1.11022302e-16, 4.00000000e-01,\n",
2135 | " -2.00000000e-01, 9.90000000e+01]])"
2136 | ]
2137 | },
2138 | "execution_count": 568,
2139 | "metadata": {},
2140 | "output_type": "execute_result"
2141 | }
2142 | ],
2143 | "source": [
2144 | "M_step2 = []\n",
2145 | "for i,row in enumerate(M_step1):\n",
2146 | " M_step2 += map(lambda x: x-row_averages[i] if x!=99 else 99, row)\n",
2147 | "\n",
2148 | "M_step2 = np.array(M_step2).reshape(5,5)\n",
2149 | "M_step2"
2150 | ]
2151 | },
2152 | {
2153 | "cell_type": "markdown",
2154 | "metadata": {},
2155 | "source": [
2156 | "Yes, the two methods produce different normalized matrices, but they are indeed close."
2157 | ]
2158 | }
2159 | ],
2160 | "metadata": {
2161 | "kernelspec": {
2162 | "display_name": "Python 2",
2163 | "language": "python",
2164 | "name": "python2"
2165 | },
2166 | "language_info": {
2167 | "codemirror_mode": {
2168 | "name": "ipython",
2169 | "version": 2
2170 | },
2171 | "file_extension": ".py",
2172 | "mimetype": "text/x-python",
2173 | "name": "python",
2174 | "nbconvert_exporter": "python",
2175 | "pygments_lexer": "ipython2",
2176 | "version": "2.7.11"
2177 | }
2178 | },
2179 | "nbformat": 4,
2180 | "nbformat_minor": 0
2181 | }
2182 |
--------------------------------------------------------------------------------
/Exercises 6.1.1 and 6.1.3 and their related problems (from Ch.6 Frequent Itemsets).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Contents:\n",
8 | "- Exercise 6.1.1\n",
9 | "- Exercise 6.1.2\n",
10 | "- Exercise 6.1.3\n",
11 | "- Exercise 6.1.5\n",
12 | "- Exercise 6.1.6\n",
13 | "- Exercise 6.2.5\n",
14 | "- Exercise 6.2.6 (A-Priori Algorithm)"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "source": [
23 | "\n",
24 | "# Exercise 6.1.1:\n",
25 | "Suppose there are 100 items, numbered 1 to 100, and also 100 baskets, also numbered 1 to 100. Item `i` is in basket `b` if and only if `i` divides `b` with no remainder. Thus, item 1 is in all the baskets, item 2 is in all fifty of the even-numbered baskets, and so on. Basket 12 consists of items {1,2,3,4,6,12}, since these are all the integers that divide 12. Answer the following questions:\n",
26 | "\n",
27 | "(a) If the support threshold is 5, which items are frequent?\n",
28 | "\n",
29 | "(b) If the support threshold is 5, which pairs of items are frequent?\n",
30 | "\n",
31 | "(c) What is the sum of the sizes of all the baskets?"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "#### Solution:\n",
39 | "\n",
40 | "(a) Item `i` is in basket `b` if `i` is a factor of `b`. In other words, `i` is in basket `b` if and only if there exists a constant integer `k`>=1 such that `b=k*i`. As a result, item `i` is found in 5 or more baskets if `100/i >=5`. Therefore items {1},{2},...,{20} represent the frequent singletons."
41 | ]
42 | },
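     | {
     | "cell_type": "markdown",
     | "metadata": {},
     | "source": [
     | "A quick added check (not part of the original solution) of the claim above."
     | ]
     | },
     | {
     | "cell_type": "code",
     | "execution_count": null,
     | "metadata": {
     | "collapsed": false
     | },
     | "outputs": [],
     | "source": [
     | "# added check: item i appears in floor(100/i) baskets,\n",
     | "# so the frequent items are exactly those with floor(100/i) >= 5\n",
     | "print [i for i in range(1, 101) if 100//i >= 5]"
     | ]
     | },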
43 | {
44 | "cell_type": "code",
45 | "execution_count": 1,
46 | "metadata": {
47 | "collapsed": true
48 | },
49 | "outputs": [],
50 | "source": [
51 | "import numpy as np"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "Can get the set of frequent pairs, by explicitly counting the support set of each pair and returning those whose counts are greater than 5. "
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 229,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [],
68 | "source": [
69 | "# baskets[i] gives the list of baskets in which item i is in contained\n",
70 | "baskets = {}\n",
71 | "for i in range(1,101):\n",
72 | " baskets[i] = []\n",
73 | " k = 1\n",
74 | " while (i*k) <= 100:\n",
75 | " baskets[i].append(k*i)\n",
76 | " k += 1"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 230,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "(1, 2) (1, 3) (1, 4) (1, 5) (1, 6) (1, 7) (1, 8) (1, 9) (1, 10) (1, 11) (1, 12) (1, 13) (1, 14) (1, 15) (1, 16) (1, 17) (1, 18) (1, 19) (1, 20) (2, 3) (2, 4) (2, 5) (2, 6) (2, 7) (2, 8) (2, 9) (2, 10) (2, 12) (2, 14) (2, 16) (2, 18) (2, 20) (3, 4) (3, 5) (3, 6) (3, 9) (3, 12) (3, 15) (3, 18) (4, 5) (4, 6) (4, 8) (4, 10) (4, 12) (4, 16) (4, 20) (5, 10) (5, 15) (5, 20) (6, 9) (6, 12) (6, 18) (7, 14) (8, 16) (9, 18) (10, 20)\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "# finding frequent pairs using a nested loop to select the pairs (i,j) which appear in 5 or more baskets\n",
96 | "for i in range(1,20): # these are the only singletons which are frequent\n",
97 | " for j in range(i+1,21):\n",
98 | " commonbask = [b for b in baskets[i] if b in baskets[j]]\n",
99 | " if len(commonbask) >= 5:\n",
100 | " print (i,j),"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "(c) Define, `num_factors(b)` as the number of factors that b has. Then sum of the sizes all baskets = `sum(num_factors(b), b=1,2,...,20)`.\n",
108 | "\n",
109 | "So, I didn't feel like thinking about how to grab prime factors of a number myself, so I stackoverflow'ed this. [Here](http://stackoverflow.com/questions/16996217/prime-factorization-list) is simple function to extract the prime factors of a list (it essentially follows how one would find prime factors by hand). "
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {
116 | "collapsed": true
117 | },
118 | "outputs": [],
119 | "source": [
120 | "def primes(n):\n",
121 | " primfac = []\n",
122 | " d = 2\n",
123 | " while d*d <= n:\n",
124 | " while (n % d) == 0:\n",
125 | " primfac.append(d) # supposing you want multiple factors repeated\n",
126 | " n /= d\n",
127 | " d += 1\n",
128 | " if n > 1:\n",
129 | " primfac.append(n)\n",
130 | " return primfac"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 6,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [],
140 | "source": [
141 | "# create dictionaries for prime factors of baskets 1,2,...,100\n",
142 | "primefactors = {}\n",
143 | "for b in range(1,101):\n",
144 | " # initializing the dictionary for each basket b\n",
145 | " primefactors[b] = {fac:0 for fac in primes(b)}\n",
146 | " for key in primes(b):\n",
147 | " primefactors[b][key] += 1"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 7,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "{2: 2, 3: 1}"
161 | ]
162 | },
163 | "execution_count": 7,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "# for example, the prime factorization of 12 = 2^2 * 3\n",
170 | "primefactors[12]"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "To get the number of factors of 12, we add 1 to each of the replications of its factors and apply the multiplication rule. So 12 has (2+1)*(1+1)=6 factors in total."
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "metadata": {
184 | "collapsed": true
185 | },
186 | "outputs": [],
187 | "source": [
188 | "# to get number of factors, we add 1 to each rep of factor and apply multiplication rule\n",
189 | "def num_factors(b):\n",
190 | " numfac = 1\n",
191 | " for fac,reps in primefactors[b].items():\n",
192 | " numfac *= reps + 1\n",
193 | " return numfac"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 9,
199 | "metadata": {
200 | "collapsed": false
201 | },
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "6"
207 | ]
208 | },
209 | "execution_count": 9,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "num_factors(12)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": false
223 | },
224 | "outputs": [],
225 | "source": [
226 | "# sum of the sizes of all baskets\n",
227 | "sizeofbaskets = [num_factors(b) for b in range(1,101)]\n",
228 | "totalsize = sum(sizeofbaskets)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 11,
234 | "metadata": {
235 | "collapsed": false
236 | },
237 | "outputs": [
238 | {
239 | "data": {
240 | "text/plain": [
241 | "482"
242 | ]
243 | },
244 | "execution_count": 11,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "totalsize"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "\n",
258 | "# Exercise 6.1.2\n",
259 | "For the item-basket data of Exercise 6.1.1, which basket is the largest?"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 12,
265 | "metadata": {
266 | "collapsed": false
267 | },
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/plain": [
272 | "12"
273 | ]
274 | },
275 | "execution_count": 12,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "# the largest baskets hav 12 items\n",
282 | "max(sizeofbaskets)"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 13,
288 | "metadata": {
289 | "collapsed": false
290 | },
291 | "outputs": [
292 | {
293 | "name": "stdout",
294 | "output_type": "stream",
295 | "text": [
296 | "60\n",
297 | "72\n",
298 | "84\n",
299 | "90\n",
300 | "96\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "# these baskets are\n",
306 | "for b in range(1,101):\n",
307 | " if num_factors(b) == 12:\n",
308 | " print b"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {},
314 | "source": [
315 | "\n",
316 | "# Exercise 6.1.3\n",
317 | "Suppose there are 100 items, numbered 1 to 100, and also 100 baskets, also numbered 1 to 100. Item `i` is in basket `b` if and only if `b` divides `i` with no remainder. For example, basket 12 consists of items {12,24,36,48,60,72,84,96}. Repeat Exercise 6.1.1 for this data."
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "#### Solution\n",
325 | "\n",
326 | "(a) Basket `b` consists of items which are multiples of `b`. Alternatively, item `i` is in basket `b` if `b` is a factor of `i`. Thus, item `i` is frequent if it has at least 5 factors <= 100."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 14,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [],
336 | "source": [
337 | "# List of frequent items\n",
338 | "L1 = [b for b in range(1,101) if num_factors(b)>=5]"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 15,
344 | "metadata": {
345 | "collapsed": false
346 | },
347 | "outputs": [
348 | {
349 | "name": "stdout",
350 | "output_type": "stream",
351 | "text": [
352 | "12 16 18 20 24 28 30 32 36 40 42 44 45 48 50 52 54 56 60 63 64 66 68 70 72 75 76 78 80 81 84 88 90 92 96 98 99 100\n"
353 | ]
354 | }
355 | ],
356 | "source": [
357 | "for b in L1:\n",
358 | " print b,"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "(b) Clearly, `(i,j)` represent a frequent pair if `i` and `j` share at least 5 common factors."
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 16,
371 | "metadata": {
372 | "collapsed": false
373 | },
374 | "outputs": [],
375 | "source": [
376 | "def lexorders_ofexp(b):\n",
377 | " \"\"\"\n",
378 | " This function returns the lexicographic ordering of the exponents\n",
379 | " of the prime factors of b. We can use this to get all the factors\n",
380 | " of b.\n",
381 | " \"\"\"\n",
382 | " n = len(primefactors[b])\n",
383 | " ati = primefactors[b].values()\n",
384 | " foo = []\n",
385 | " if n == 1:\n",
386 | " for j in range(ati[0]+1):\n",
387 | " foo.append([j])\n",
388 | " if n == 2:\n",
389 | " i = 0\n",
390 | " while i < ati[0]+1:\n",
391 | " j = 0\n",
392 | " while j < ati[1]+1:\n",
393 | " foo.append([i,j])\n",
394 | " j+=1\n",
395 | " i+=1\n",
396 | " if n == 3:\n",
397 | " i = 0\n",
398 | " while i < ati[0]+1:\n",
399 | " j = 0\n",
400 | " while j < ati[1]+1:\n",
401 | " k = 0\n",
402 | " while k < ati[2]+1:\n",
403 | " foo.append([i,j,k])\n",
404 | " k+=1\n",
405 | " j+=1\n",
406 | " i+=1\n",
407 | " return foo "
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 17,
413 | "metadata": {
414 | "collapsed": false
415 | },
416 | "outputs": [],
417 | "source": [
418 | "# getting all factors of b from its prime factors\n",
419 | "def factors(b):\n",
420 | " facs = []\n",
421 | " exps = lexorders_ofexp(b)\n",
422 | " for i in range(num_factors(b)):\n",
423 | " bar = 1 # this also takes care of base case factors(1)\n",
424 | " for el,key in enumerate(primefactors[b].keys()):\n",
425 | " bar *= int(key**exps[i][el])\n",
426 | " facs.append(bar)\n",
427 | " return facs"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 18,
433 | "metadata": {
434 | "collapsed": false
435 | },
436 | "outputs": [],
437 | "source": [
438 | "def num_commonfactors(b1,b2):\n",
439 | " foo = factors(b1)\n",
440 | " bar = factors(b2)\n",
441 | " return len([el for el in foo if el in bar])"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 19,
447 | "metadata": {
448 | "collapsed": false
449 | },
450 | "outputs": [
451 | {
452 | "data": {
453 | "text/plain": [
454 | "[1, 3, 9, 2, 6, 18, 4, 12, 36, 8, 24, 72]"
455 | ]
456 | },
457 | "execution_count": 19,
458 | "metadata": {},
459 | "output_type": "execute_result"
460 | }
461 | ],
462 | "source": [
463 | "factors(72)"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 20,
469 | "metadata": {
470 | "collapsed": false
471 | },
472 | "outputs": [
473 | {
474 | "data": {
475 | "text/plain": [
476 | "[1, 3, 2, 6, 4, 12]"
477 | ]
478 | },
479 | "execution_count": 20,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "factors(12)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 21,
491 | "metadata": {
492 | "collapsed": false
493 | },
494 | "outputs": [
495 | {
496 | "data": {
497 | "text/plain": [
498 | "[1, 3, 2, 6, 4, 12]"
499 | ]
500 | },
501 | "execution_count": 21,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "# common factors\n",
508 | "[el for el in factors(12) if el in factors(72)]"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 22,
514 | "metadata": {
515 | "collapsed": false
516 | },
517 | "outputs": [
518 | {
519 | "data": {
520 | "text/plain": [
521 | "6"
522 | ]
523 | },
524 | "execution_count": 22,
525 | "metadata": {},
526 | "output_type": "execute_result"
527 | }
528 | ],
529 | "source": [
530 | "num_commonfactors(12,72)"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "#### We are now ready to grab the frequent pairs."
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 23,
543 | "metadata": {
544 | "collapsed": false
545 | },
546 | "outputs": [
547 | {
548 | "name": "stdout",
549 | "output_type": "stream",
550 | "text": [
551 | "(12, 24) (12, 36) (12, 48) (12, 60) (12, 72) (12, 84) (12, 96) (16, 32) (16, 48) (16, 64) (16, 80) (16, 96) (18, 36) (18, 54) (18, 72) (18, 90) (20, 40) (20, 60) (20, 80) (20, 100) (24, 36) (24, 48) (24, 60) (24, 72) (24, 84) (24, 96) (28, 56) (28, 84) (30, 60) (30, 90) (32, 48) (32, 64) (32, 80) (32, 96) (36, 48) (36, 54) (36, 60) (36, 72) (36, 84) (36, 90) (36, 96) (40, 60) (40, 80) (40, 100) (42, 84) (44, 88) (45, 90) (48, 60) (48, 64) (48, 72) (48, 80) (48, 84) (48, 96) (50, 100) (54, 72) (54, 90) (56, 84) (60, 72) (60, 80) (60, 84) (60, 90) (60, 96) (60, 100) (64, 80) (64, 96) (72, 84) (72, 90) (72, 96) (80, 96) (80, 100) (84, 96)\n"
552 | ]
553 | }
554 | ],
555 | "source": [
556 | "# List of pairs (i,j) with at least 5 common factors\n",
557 | "for i in range(1,100):\n",
558 | " for j in range(i+1,101):\n",
559 | " if num_commonfactors(i,j) >= 5:\n",
560 | " print (i,j),"
561 | ]
562 | },
563 | {
564 | "cell_type": "markdown",
565 | "metadata": {},
566 | "source": [
567 | "(c) The size of basket `b` is `floor(100/b)`"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 24,
573 | "metadata": {
574 | "collapsed": false
575 | },
576 | "outputs": [
577 | {
578 | "data": {
579 | "text/plain": [
580 | "482"
581 | ]
582 | },
583 | "execution_count": 24,
584 | "metadata": {},
585 | "output_type": "execute_result"
586 | }
587 | ],
588 | "source": [
589 | "sizeofbaskets = [int(100/b) for b in range(1,101)]\n",
590 | "totalsize = sum(sizeofbaskets)\n",
591 | "totalsize"
592 | ]
593 | },
594 | {
595 | "cell_type": "markdown",
596 | "metadata": {},
597 | "source": [
598 | "\n",
599 | "# Exercise 6.1.5\n",
600 | "For the data of Exercise 6.1.1, what is the confidence of the following association rules?\n",
601 | "\n",
602 | "(a) {5,7} -> 2\n",
603 | "- The support of {5,7} is 2 since {5,7} can be found in baskets (35) and (70). On the other hand, the support of {5,7,2} is 1 since this triple can only be found in basket {70}. Therefore the confidence of this association rule is 1/2.\n",
604 | "\n",
605 | "(b) {2,3,4} -> 5\n",
606 | "- The confidence of this rule is 1/8. Note that basket `b` contains itemset `I` if all of its items are factors of `b`."
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 25,
612 | "metadata": {
613 | "collapsed": true
614 | },
615 | "outputs": [],
616 | "source": [
617 | "from __future__ import division"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": 26,
623 | "metadata": {
624 | "collapsed": false
625 | },
626 | "outputs": [],
627 | "source": [
628 | "def supportset_611(I):\n",
629 | " sup = []\n",
630 | " for b in range(1,101):\n",
631 | " # check if all items in I are factors of b\n",
632 | " if all(item in factors(b) for item in I):\n",
633 | " sup.append(b)\n",
634 | " return sup"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 27,
640 | "metadata": {
641 | "collapsed": false
642 | },
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/plain": [
647 | "[12, 24, 36, 48, 60, 72, 84, 96]"
648 | ]
649 | },
650 | "execution_count": 27,
651 | "metadata": {},
652 | "output_type": "execute_result"
653 | }
654 | ],
655 | "source": [
656 | "# support set of {2,3,4}\n",
657 | "supportset_611([2,3,4])"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": 28,
663 | "metadata": {
664 | "collapsed": false
665 | },
666 | "outputs": [
667 | {
668 | "data": {
669 | "text/plain": [
670 | "[60]"
671 | ]
672 | },
673 | "execution_count": 28,
674 | "metadata": {},
675 | "output_type": "execute_result"
676 | }
677 | ],
678 | "source": [
679 | "# support set of {2,3,4,5}\n",
680 | "supportset_611([2,3,4,5])"
681 | ]
682 | },
683 | {
684 | "cell_type": "code",
685 | "execution_count": 29,
686 | "metadata": {
687 | "collapsed": false
688 | },
689 | "outputs": [
690 | {
691 | "data": {
692 | "text/plain": [
693 | "0.125"
694 | ]
695 | },
696 | "execution_count": 29,
697 | "metadata": {},
698 | "output_type": "execute_result"
699 | }
700 | ],
701 | "source": [
702 | "# Confidence of {2,3,4}->5\n",
703 | "conf_b = len(supportset_611([2,3,4,5]))/len(supportset_611([2,3,4]))\n",
704 | "conf_b"
705 | ]
706 | },
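707 | {
708 | "cell_type": "markdown",
709 | "metadata": {},
710 | "source": [
711 | "Part (a) can be checked numerically with the same helper (a quick verification sketch): the baskets containing {5,7} are 35 and 70, and only basket 70 also contains 2, so the ratio below should come out to 0.5."
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": null,
717 | "metadata": {
718 | "collapsed": false
719 | },
720 | "outputs": [],
721 | "source": [
722 | "# Confidence of {5,7}->2, computed the same way as part (b)\n",
723 | "conf_a = len(supportset_611([5,7,2]))/len(supportset_611([5,7]))\n",
724 | "conf_a"
725 | ]
726 | },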
707 | {
708 | "cell_type": "markdown",
709 | "metadata": {},
710 | "source": [
711 | "\n",
712 | "# Exercise 6.1.6\n",
713 | "\n",
714 | "For the data of Exercise 6.1.3, what is the confidence of the following association rules?\n",
715 | "\n",
716 | "(a) {24,60} -> 8\n",
717 | "- The support of {24,60} is 6 since {24} and {60} share the common factors (1),(2),(3),(4),(6), and (12). The support of {8,24,60} is 3 since only factors (1), (2) and (4) are shared amongst them. Therefore, the confidence of this association rule is 3/6=1/2.\n",
718 | "\n",
719 | "(b) {2,3,4} -> 5\n",
720 | "- The confidence of this association rule is 1. See below"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 35,
726 | "metadata": {
727 | "collapsed": false
728 | },
729 | "outputs": [],
730 | "source": [
731 | "def supportset_613(I):\n",
732 | " \"\"\"\n",
733 | " The set of baskets containing itemset I is the set of common\n",
734 | " of factors amongst all items in I. This function takes I as\n",
735 | " input and outputs the common factors (i.e, the baskets) of \n",
736 | " all items contained in I.\n",
737 | " \"\"\"\n",
738 | " n = len(I)\n",
739 | " if n == 1:\n",
740 | " return factors(I[0])\n",
741 | " elif n == 2:\n",
742 | " foo = factors(I[0])\n",
743 | " bar = factors(I[1])\n",
744 | " return [el for el in foo if el in bar]\n",
745 | " else: # use divide and conquer here\n",
746 | " half = int(n/2)\n",
747 | " firstpart = supportset_613(I[:half])\n",
748 | " secondpart = supportset_613(I[half:])\n",
749 | " return [el for el in firstpart if el in secondpart]"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 36,
755 | "metadata": {
756 | "collapsed": false
757 | },
758 | "outputs": [
759 | {
760 | "data": {
761 | "text/plain": [
762 | "[1]"
763 | ]
764 | },
765 | "execution_count": 36,
766 | "metadata": {},
767 | "output_type": "execute_result"
768 | }
769 | ],
770 | "source": [
771 | "# support set of {2,3,4} is\n",
772 | "supportset_613([2,3,4])"
773 | ]
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": 37,
778 | "metadata": {
779 | "collapsed": false
780 | },
781 | "outputs": [
782 | {
783 | "data": {
784 | "text/plain": [
785 | "[1]"
786 | ]
787 | },
788 | "execution_count": 37,
789 | "metadata": {},
790 | "output_type": "execute_result"
791 | }
792 | ],
793 | "source": [
794 | "# support set of {2,3,4,5}\n",
795 | "supportset_613([2,3,4,5])"
796 | ]
797 | },
798 | {
799 | "cell_type": "markdown",
800 | "metadata": {},
801 | "source": [
802 | "The confidence of this association rule is 1."
803 | ]
804 | },
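805 | {
806 | "cell_type": "markdown",
807 | "metadata": {},
808 | "source": [
809 | "Part (a) can be verified the same way (a quick check): {24,60} is supported by the 6 common factors of 24 and 60, while {8,24,60} is supported by only 3 of them, so the ratio below should come out to 0.5."
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": null,
815 | "metadata": {
816 | "collapsed": false
817 | },
818 | "outputs": [],
819 | "source": [
820 | "# Confidence of {24,60}->8 via the common-factor support sets\n",
821 | "conf_a = len(supportset_613([8,24,60]))/len(supportset_613([24,60]))\n",
822 | "conf_a"
823 | ]
824 | },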
805 | {
806 | "cell_type": "markdown",
807 | "metadata": {},
808 | "source": [
809 | "\n",
810 | "# Exercise 6.2.5\n",
811 | "\n",
812 | "Suppose the support threshold is 5. Find the maximal frequent itemsets for the data of:\n",
813 | "\n",
814 | "(a) Exercise 6.1.1\n",
815 | "\n",
816 | "(b) Exercise 6.1.3"
817 | ]
818 | },
819 | {
820 | "cell_type": "markdown",
821 | "metadata": {},
822 | "source": [
823 | "# Solutions to 6.2.5(a):"
824 | ]
825 | },
826 | {
827 | "cell_type": "markdown",
828 | "metadata": {},
829 | "source": [
830 | "### L1 and L2 "
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 38,
836 | "metadata": {
837 | "collapsed": false
838 | },
839 | "outputs": [
840 | {
841 | "data": {
842 | "text/plain": [
843 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]"
844 | ]
845 | },
846 | "execution_count": 38,
847 | "metadata": {},
848 | "output_type": "execute_result"
849 | }
850 | ],
851 | "source": [
852 | "#(a)\n",
853 | "# frequent singletons\n",
854 | "L1 = range(1,21)\n",
855 | "L1"
856 | ]
857 | },
858 | {
859 | "cell_type": "code",
860 | "execution_count": 39,
861 | "metadata": {
862 | "collapsed": false
863 | },
864 | "outputs": [],
865 | "source": [
866 | "# frequent pairs\n",
867 | "L2 = []\n",
868 | "for i in range(1,20): # these are the only singletons which are frequent\n",
869 | " for j in range(i+1,21):\n",
870 | " commonbask = [b for b in baskets[i] if b in baskets[j]]\n",
871 | " if len(commonbask) >= 5:\n",
872 | " L2.append([i,j])"
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "execution_count": 40,
878 | "metadata": {
879 | "collapsed": false
880 | },
881 | "outputs": [
882 | {
883 | "name": "stdout",
884 | "output_type": "stream",
885 | "text": [
886 | "[1, 2] [1, 3] [1, 4] [1, 5] [1, 6] [1, 7] [1, 8] [1, 9] [1, 10] [1, 11] [1, 12] [1, 13] [1, 14] [1, 15] [1, 16] [1, 17] [1, 18] [1, 19] [1, 20] [2, 3] [2, 4] [2, 5] [2, 6] [2, 7] [2, 8] [2, 9] [2, 10] [2, 12] [2, 14] [2, 16] [2, 18] [2, 20] [3, 4] [3, 5] [3, 6] [3, 9] [3, 12] [3, 15] [3, 18] [4, 5] [4, 6] [4, 8] [4, 10] [4, 12] [4, 16] [4, 20] [5, 10] [5, 15] [5, 20] [6, 9] [6, 12] [6, 18] [7, 14] [8, 16] [9, 18] [10, 20]\n"
887 | ]
888 | }
889 | ],
890 | "source": [
891 | "for pair in L2:\n",
892 | " print pair,"
893 | ]
894 | },
895 | {
896 | "cell_type": "markdown",
897 | "metadata": {},
898 | "source": [
899 | "### Maximal singletons "
900 | ]
901 | },
902 | {
903 | "cell_type": "code",
904 | "execution_count": 41,
905 | "metadata": {
906 | "collapsed": false
907 | },
908 | "outputs": [
909 | {
910 | "data": {
911 | "text/plain": [
912 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]"
913 | ]
914 | },
915 | "execution_count": 41,
916 | "metadata": {},
917 | "output_type": "execute_result"
918 | }
919 | ],
920 | "source": [
921 | "L2_flatten = []\n",
922 | "for pair in L2:\n",
923 | " L2_flatten += pair\n",
924 | " \n",
925 | "L2_flatten = list(set(L2_flatten))\n",
926 | "L2_flatten"
927 | ]
928 | },
929 | {
930 | "cell_type": "markdown",
931 | "metadata": {},
932 | "source": [
933 | "## Therefore, there are no maximal singletons."
934 | ]
935 | },
936 | {
937 | "cell_type": "markdown",
938 | "metadata": {},
939 | "source": [
940 | "### L3 "
941 | ]
942 | },
943 | {
944 | "cell_type": "code",
945 | "execution_count": 42,
946 | "metadata": {
947 | "collapsed": false
948 | },
949 | "outputs": [],
950 | "source": [
951 | "# frequent triples\n",
952 | "L3 = []\n",
953 | "for i in range(1,21):\n",
954 | " for pair in L2:\n",
955 | " if i not in pair:\n",
956 | " foo = pair+[i]\n",
957 | " foo.sort() # works in place\n",
958 | " if len(supportset_611(foo)) >= 5 and foo not in L3:\n",
959 | " L3.append(foo)"
960 | ]
961 | },
962 | {
963 | "cell_type": "markdown",
964 | "metadata": {},
965 | "source": [
966 | "### Maximal doubletons "
967 | ]
968 | },
969 | {
970 | "cell_type": "code",
971 | "execution_count": 43,
972 | "metadata": {
973 | "collapsed": false
974 | },
975 | "outputs": [],
976 | "source": [
977 | "# check for maximal doubletons\n",
978 | "maximal_doub = []\n",
979 | "for pair in L2:\n",
980 | " pair_max = False\n",
981 | " for trip in L3:\n",
982 | " if all(item in trip for item in pair):\n",
983 | " pair_max = True\n",
984 | " break\n",
985 | " if not pair_max: \n",
986 | " maximal_doub.append(pair)"
987 | ]
988 | },
989 | {
990 | "cell_type": "code",
991 | "execution_count": 44,
992 | "metadata": {
993 | "collapsed": false
994 | },
995 | "outputs": [
996 | {
997 | "data": {
998 | "text/plain": [
999 | "[[1, 11], [1, 13], [1, 17], [1, 19]]"
1000 | ]
1001 | },
1002 | "execution_count": 44,
1003 | "metadata": {},
1004 | "output_type": "execute_result"
1005 | }
1006 | ],
1007 | "source": [
1008 | "maximal_doub"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "markdown",
1013 | "metadata": {},
1014 | "source": [
1015 | "### L4"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": 45,
1021 | "metadata": {
1022 | "collapsed": false
1023 | },
1024 | "outputs": [],
1025 | "source": [
1026 | "# frequent quads\n",
1027 | "L4 = []\n",
1028 | "for i in range(1,21):\n",
1029 | " for trip in L3:\n",
1030 | " if i not in trip:\n",
1031 | " foo = trip+[i]\n",
1032 | " foo.sort() # works in place\n",
1033 | " if len(supportset_611(foo)) >= 5 and foo not in L4:\n",
1034 | " L4.append(foo)"
1035 | ]
1036 | },
1037 | {
1038 | "cell_type": "markdown",
1039 | "metadata": {},
1040 | "source": [
1041 | "### maximal triples "
1042 | ]
1043 | },
1044 | {
1045 | "cell_type": "code",
1046 | "execution_count": 46,
1047 | "metadata": {
1048 | "collapsed": false
1049 | },
1050 | "outputs": [
1051 | {
1052 | "data": {
1053 | "text/plain": [
1054 | "[]"
1055 | ]
1056 | },
1057 | "execution_count": 46,
1058 | "metadata": {},
1059 | "output_type": "execute_result"
1060 | }
1061 | ],
1062 | "source": [
1063 | "# check for maximal triples\n",
1064 | "maximal_triples = []\n",
1065 | "for trip in L3:\n",
1066 | " trip_max = False\n",
1067 | " for quad in L4:\n",
1068 | " if all(item in quad for item in trip):\n",
1069 | " trip_max = True\n",
1070 | " break\n",
1071 | " if not trip_max: \n",
1072 | " maximal_triples.append(trip)\n",
1073 | "maximal_triples"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "markdown",
1078 | "metadata": {},
1079 | "source": [
1080 | "### L5"
1081 | ]
1082 | },
1083 | {
1084 | "cell_type": "code",
1085 | "execution_count": 47,
1086 | "metadata": {
1087 | "collapsed": false
1088 | },
1089 | "outputs": [
1090 | {
1091 | "data": {
1092 | "text/plain": [
1093 | "[[1, 2, 3, 4, 6],\n",
1094 | " [1, 2, 3, 4, 12],\n",
1095 | " [1, 2, 3, 6, 9],\n",
1096 | " [1, 2, 3, 6, 12],\n",
1097 | " [1, 2, 3, 6, 18],\n",
1098 | " [1, 2, 3, 9, 18],\n",
1099 | " [1, 2, 4, 5, 10],\n",
1100 | " [1, 2, 4, 5, 20],\n",
1101 | " [1, 2, 4, 6, 12],\n",
1102 | " [1, 2, 4, 8, 16],\n",
1103 | " [1, 2, 4, 10, 20],\n",
1104 | " [1, 2, 5, 10, 20],\n",
1105 | " [1, 2, 6, 9, 18],\n",
1106 | " [1, 3, 4, 6, 12],\n",
1107 | " [1, 3, 6, 9, 18],\n",
1108 | " [1, 4, 5, 10, 20],\n",
1109 | " [2, 3, 4, 6, 12],\n",
1110 | " [2, 3, 6, 9, 18],\n",
1111 | " [2, 4, 5, 10, 20]]"
1112 | ]
1113 | },
1114 | "execution_count": 47,
1115 | "metadata": {},
1116 | "output_type": "execute_result"
1117 | }
1118 | ],
1119 | "source": [
1120 | "# frequent quintiples\n",
1121 | "L5 = []\n",
1122 | "for i in range(1,21):\n",
1123 | " for quad in L4:\n",
1124 | " if i not in quad:\n",
1125 | " foo = quad+[i]\n",
1126 | " foo.sort() # works in place\n",
1127 | " if len(supportset_611(foo)) >= 5 and foo not in L5:\n",
1128 | " L5.append(foo)\n",
1129 | "L5"
1130 | ]
1131 | },
1132 | {
1133 | "cell_type": "markdown",
1134 | "metadata": {},
1135 | "source": [
1136 | "### Maximal quads "
1137 | ]
1138 | },
1139 | {
1140 | "cell_type": "code",
1141 | "execution_count": 48,
1142 | "metadata": {
1143 | "collapsed": false
1144 | },
1145 | "outputs": [],
1146 | "source": [
1147 | "# check for maximal quads\n",
1148 | "maximal_quads = []\n",
1149 | "for quad in L4:\n",
1150 | " quad_max = False\n",
1151 | " for quint in L5:\n",
1152 | " if all(item in quint for item in quad):\n",
1153 | " quad_max = True\n",
1154 | " break\n",
1155 | " if not quad_max: \n",
1156 | " maximal_quads.append(quad)"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "code",
1161 | "execution_count": 49,
1162 | "metadata": {
1163 | "collapsed": false
1164 | },
1165 | "outputs": [
1166 | {
1167 | "data": {
1168 | "text/plain": [
1169 | "[[1, 2, 7, 14], [1, 3, 5, 15]]"
1170 | ]
1171 | },
1172 | "execution_count": 49,
1173 | "metadata": {},
1174 | "output_type": "execute_result"
1175 | }
1176 | ],
1177 | "source": [
1178 | "maximal_quads"
1179 | ]
1180 | },
1181 | {
1182 | "cell_type": "code",
1183 | "execution_count": 50,
1184 | "metadata": {
1185 | "collapsed": false
1186 | },
1187 | "outputs": [
1188 | {
1189 | "data": {
1190 | "text/plain": [
1191 | "[15, 30, 45, 60, 75, 90]"
1192 | ]
1193 | },
1194 | "execution_count": 50,
1195 | "metadata": {},
1196 | "output_type": "execute_result"
1197 | }
1198 | ],
1199 | "source": [
1200 | "# looking at one of these\n",
1201 | "supportset_611([1, 3, 5, 15])"
1202 | ]
1203 | },
1204 | {
1205 | "cell_type": "markdown",
1206 | "metadata": {},
1207 | "source": [
1208 | "### L6"
1209 | ]
1210 | },
1211 | {
1212 | "cell_type": "code",
1213 | "execution_count": 51,
1214 | "metadata": {
1215 | "collapsed": false
1216 | },
1217 | "outputs": [
1218 | {
1219 | "data": {
1220 | "text/plain": [
1221 | "[[1, 2, 3, 4, 6, 12], [1, 2, 3, 6, 9, 18], [1, 2, 4, 5, 10, 20]]"
1222 | ]
1223 | },
1224 | "execution_count": 51,
1225 | "metadata": {},
1226 | "output_type": "execute_result"
1227 | }
1228 | ],
1229 | "source": [
1230 | "# frequent sixtuplets\n",
1231 | "L6 = []\n",
1232 | "for i in range(1,21):\n",
1233 | " for quint in L5:\n",
1234 | " if i not in quint:\n",
1235 | " foo = quint+[i]\n",
1236 | " foo.sort() # works in place\n",
1237 | " if len(supportset_611(foo)) >= 5 and foo not in L6:\n",
1238 | " L6.append(foo)\n",
1239 | "L6"
1240 | ]
1241 | },
1242 | {
1243 | "cell_type": "markdown",
1244 | "metadata": {},
1245 | "source": [
1246 | "### Maximal quintuplets "
1247 | ]
1248 | },
1249 | {
1250 | "cell_type": "code",
1251 | "execution_count": 52,
1252 | "metadata": {
1253 | "collapsed": false
1254 | },
1255 | "outputs": [],
1256 | "source": [
1257 | "# check for maximal quints\n",
1258 | "maximal_quints = []\n",
1259 | "for quint in L5:\n",
1260 | " quint_max = False\n",
1261 | " for sixt in L6:\n",
1262 | " if all(item in sixt for item in quint):\n",
1263 | " quint_max = True\n",
1264 | " break\n",
1265 | " if not quint_max: \n",
1266 | " maximal_quints.append(quint)"
1267 | ]
1268 | },
1269 | {
1270 | "cell_type": "code",
1271 | "execution_count": 53,
1272 | "metadata": {
1273 | "collapsed": false
1274 | },
1275 | "outputs": [
1276 | {
1277 | "data": {
1278 | "text/plain": [
1279 | "[[1, 2, 4, 8, 16]]"
1280 | ]
1281 | },
1282 | "execution_count": 53,
1283 | "metadata": {},
1284 | "output_type": "execute_result"
1285 | }
1286 | ],
1287 | "source": [
1288 | "maximal_quints"
1289 | ]
1290 | },
1291 | {
1292 | "cell_type": "markdown",
1293 | "metadata": {},
1294 | "source": [
1295 | "### L7"
1296 | ]
1297 | },
1298 | {
1299 | "cell_type": "code",
1300 | "execution_count": 54,
1301 | "metadata": {
1302 | "collapsed": false
1303 | },
1304 | "outputs": [
1305 | {
1306 | "data": {
1307 | "text/plain": [
1308 | "[]"
1309 | ]
1310 | },
1311 | "execution_count": 54,
1312 | "metadata": {},
1313 | "output_type": "execute_result"
1314 | }
1315 | ],
1316 | "source": [
1317 | "# frequent septuplets\n",
1318 | "L7 = []\n",
1319 | "for i in range(1,21):\n",
1320 | " for sixt in L6:\n",
1321 | " if i not in sixt:\n",
1322 | " foo = sixt+[i]\n",
1323 | " foo.sort() # works in place\n",
1324 | " if len(supportset_611(foo)) >= 5 and foo not in L7:\n",
1325 | " L7.append(foo)\n",
1326 | "L7"
1327 | ]
1328 | },
1329 | {
1330 | "cell_type": "markdown",
1331 | "metadata": {},
1332 | "source": [
1333 | "## No septuplets implies that all frequent sixtuplets are maximal!"
1334 | ]
1335 | },
1336 | {
1337 | "cell_type": "code",
1338 | "execution_count": 55,
1339 | "metadata": {
1340 | "collapsed": false
1341 | },
1342 | "outputs": [
1343 | {
1344 | "name": "stdout",
1345 | "output_type": "stream",
1346 | "text": [
1347 | "[12, 24, 36, 48, 60, 72, 84, 96]\n",
1348 | "[18, 36, 54, 72, 90]\n",
1349 | "[20, 40, 60, 80, 100]\n"
1350 | ]
1351 | }
1352 | ],
1353 | "source": [
1354 | "# here are the support sets of each of sixtuplets\n",
1355 | "for sixt in L6:\n",
1356 | " print supportset_611(sixt)"
1357 | ]
1358 | },
1359 | {
1360 | "cell_type": "markdown",
1361 | "metadata": {},
1362 | "source": [
1363 | "# Solutions to 6.2.5(b)"
1364 | ]
1365 | },
1366 | {
1367 | "cell_type": "markdown",
1368 | "metadata": {},
1369 | "source": [
1370 | "### L1 and L2"
1371 | ]
1372 | },
1373 | {
1374 | "cell_type": "code",
1375 | "execution_count": 56,
1376 | "metadata": {
1377 | "collapsed": false
1378 | },
1379 | "outputs": [
1380 | {
1381 | "name": "stdout",
1382 | "output_type": "stream",
1383 | "text": [
1384 | "12 16 18 20 24 28 30 32 36 40 42 44 45 48 50 52 54 56 60 63 64 66 68 70 72 75 76 78 80 81 84 88 90 92 96 98 99 100\n"
1385 | ]
1386 | }
1387 | ],
1388 | "source": [
1389 | "# List of frequent items\n",
1390 | "L1 = [b for b in range(1,101) if num_factors(b)>=5]\n",
1391 | "\n",
1392 | "for items in L1:\n",
1393 | " print items,"
1394 | ]
1395 | },
1396 | {
1397 | "cell_type": "code",
1398 | "execution_count": 57,
1399 | "metadata": {
1400 | "collapsed": false
1401 | },
1402 | "outputs": [
1403 | {
1404 | "name": "stdout",
1405 | "output_type": "stream",
1406 | "text": [
1407 | "[12, 24] [12, 36] [12, 48] [12, 60] [12, 72] [12, 84] [12, 96] [16, 32] [16, 48] [16, 64] [16, 80] [16, 96] [18, 36] [18, 54] [18, 72] [18, 90] [20, 40] [20, 60] [20, 80] [20, 100] [24, 36] [24, 48] [24, 60] [24, 72] [24, 84] [24, 96] [28, 56] [28, 84] [30, 60] [30, 90] [32, 48] [32, 64] [32, 80] [32, 96] [36, 48] [36, 54] [36, 60] [36, 72] [36, 84] [36, 90] [36, 96] [40, 60] [40, 80] [40, 100] [42, 84] [44, 88] [45, 90] [48, 60] [48, 64] [48, 72] [48, 80] [48, 84] [48, 96] [50, 100] [54, 72] [54, 90] [56, 84] [60, 72] [60, 80] [60, 84] [60, 90] [60, 96] [60, 100] [64, 80] [64, 96] [72, 84] [72, 90] [72, 96] [80, 96] [80, 100] [84, 96]\n"
1408 | ]
1409 | }
1410 | ],
1411 | "source": [
1412 | "# List of frequent pairs\n",
1413 | "L2 = []\n",
1414 | "for i in range(1,100):\n",
1415 | " for j in range(i+1,101):\n",
1416 | " if len(supportset_613([i,j])) >= 5:\n",
1417 | " L2.append([i,j])\n",
1418 | "\n",
1419 | "for pair in L2:\n",
1420 | " print pair,"
1421 | ]
1422 | },
1423 | {
1424 | "cell_type": "code",
1425 | "execution_count": 58,
1426 | "metadata": {
1427 | "collapsed": false
1428 | },
1429 | "outputs": [
1430 | {
1431 | "name": "stdout",
1432 | "output_type": "stream",
1433 | "text": [
1434 | "[12, 24] [12, 36] [12, 48] [12, 60] [12, 72] [12, 84] [12, 96] [16, 32] [16, 48] [16, 64] [16, 80] [16, 96] [18, 36] [18, 54] [18, 72] [18, 90] [20, 40] [20, 60] [20, 80] [20, 100] [24, 36] [24, 48] [24, 60] [24, 72] [24, 84] [24, 96] [28, 56] [28, 84] [30, 60] [30, 90] [32, 48] [32, 64] [32, 80] [32, 96] [36, 48] [36, 54] [36, 60] [36, 72] [36, 84] [36, 90] [36, 96] [40, 60] [40, 80] [40, 100] [42, 84] [44, 88] [45, 90] [48, 60] [48, 64] [48, 72] [48, 80] [48, 84] [48, 96] [50, 100] [54, 72] [54, 90] [56, 84] [60, 72] [60, 80] [60, 84] [60, 90] [60, 96] [60, 100] [64, 80] [64, 96] [72, 84] [72, 90] [72, 96] [80, 96] [80, 100] [84, 96]\n"
1435 | ]
1436 | }
1437 | ],
1438 | "source": [
1439 | "# List of frequent pairs\n",
1440 | "L2 = []\n",
1441 | "for i in range(1,100):\n",
1442 | " for j in range(i+1,101):\n",
1443 | " if num_commonfactors(i,j) >= 5:\n",
1444 | " L2.append([i,j])\n",
1445 | "\n",
1446 | "for pair in L2:\n",
1447 | " print pair,"
1448 | ]
1449 | },
1450 | {
1451 | "cell_type": "markdown",
1452 | "metadata": {},
1453 | "source": [
1454 | "### Maximal singletons"
1455 | ]
1456 | },
1457 | {
1458 | "cell_type": "code",
1459 | "execution_count": 59,
1460 | "metadata": {
1461 | "collapsed": false
1462 | },
1463 | "outputs": [
1464 | {
1465 | "data": {
1466 | "text/plain": [
1467 | "[52, 63, 66, 68, 70, 75, 76, 78, 81, 92, 98, 99]"
1468 | ]
1469 | },
1470 | "execution_count": 59,
1471 | "metadata": {},
1472 | "output_type": "execute_result"
1473 | }
1474 | ],
1475 | "source": [
1476 | "# check for maximal singletons\n",
1477 | "maximal_single = []\n",
1478 | "for single in L1:\n",
1479 | " single_max = False\n",
1480 | " for pair in L2:\n",
1481 | " if all(item in pair for item in [single]):\n",
1482 | " single_max = True\n",
1483 | " break\n",
1484 | " if not single_max: \n",
1485 | " maximal_single.append(single)\n",
1486 | "maximal_single"
1487 | ]
1488 | },
1489 | {
1490 | "cell_type": "markdown",
1491 | "metadata": {},
1492 | "source": [
1493 | "### L3 "
1494 | ]
1495 | },
1496 | {
1497 | "cell_type": "code",
1498 | "execution_count": 60,
1499 | "metadata": {
1500 | "collapsed": false
1501 | },
1502 | "outputs": [
1503 | {
1504 | "data": {
1505 | "text/plain": [
1506 | "[]"
1507 | ]
1508 | },
1509 | "execution_count": 60,
1510 | "metadata": {},
1511 | "output_type": "execute_result"
1512 | }
1513 | ],
1514 | "source": [
1515 | "# List of frequent triples\n",
1516 | "L3 = []\n",
1517 | "for single in L1:\n",
1518 | " for pair in L2:\n",
1519 | " if single not in pair:\n",
1520 | "            foo = pair+[single]\n",
1521 | " foo.sort() # works in place\n",
1522 | " if len(supportset_613(foo)) >= 5 and foo not in L3:\n",
1523 | " L3.append(foo)\n",
1524 | "L3"
1525 | ]
1526 | },
1527 | {
1528 | "cell_type": "markdown",
1529 | "metadata": {},
1530 | "source": [
1531 | "### A triple is frequent exactly when the gcd of its members has at least 5 divisors (for example {12, 24, 36}, whose common factors are the six divisors of 12), so not every frequent doubleton is maximal."
1532 | ]
1533 | },
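1534 | {
1535 | "cell_type": "markdown",
1536 | "metadata": {},
1537 | "source": [
1538 | "The maximality checks above all repeat the same pattern. As a small sketch (not used in the cells above), the pattern can be factored into a helper:"
1539 | ]
1540 | },
1541 | {
1542 | "cell_type": "code",
1543 | "execution_count": null,
1544 | "metadata": {
1545 | "collapsed": false
1546 | },
1547 | "outputs": [],
1548 | "source": [
1549 | "def maximal_itemsets(Lk, Lk1):\n",
1550 | "    \"\"\"Itemsets of Lk not contained in any itemset of the next level Lk1.\"\"\"\n",
1551 | "    return [small for small in Lk\n",
1552 | "            if not any(all(item in big for item in small) for big in Lk1)]"
1553 | ]
1554 | },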
1534 | {
1535 | "cell_type": "markdown",
1536 | "metadata": {},
1537 | "source": [
1538 | "\n",
1539 | "# Exercise 6.2.6 (A-Priori Algorithm) \n",
1540 | "\n",
1541 | "Apply the A-Priori Algorithm with support threshold 5 to the data of:\n",
1542 | "\n",
1543 | "(a) Exercise 6.1.1.\n",
1544 | "\n",
1545 | "(b) Exercise 6.1.3.\n",
1546 | "\n",
1547 | "To make this more interesting, let's map the integers `i` to strings using English stopwords."
1548 | ]
1549 | },
1550 | {
1551 | "cell_type": "code",
1552 | "execution_count": 61,
1553 | "metadata": {
1554 | "collapsed": false
1555 | },
1556 | "outputs": [],
1557 | "source": [
1558 | "# generate the baskets: basket b consists of all items i that are factors of b\n",
1559 | "baskets_611 = {}\n",
1560 | "for i in range(1,101):\n",
1561 | " baskets_611[i] = factors(i)"
1562 | ]
1563 | },
1564 | {
1565 | "cell_type": "markdown",
1566 | "metadata": {},
1567 | "source": [
1568 | "### Converting numeric baskets to corresponding words"
1569 | ]
1570 | },
1571 | {
1572 | "cell_type": "code",
1573 | "execution_count": 296,
1574 | "metadata": {
1575 | "collapsed": false
1576 | },
1577 | "outputs": [],
1578 | "source": [
1579 | "# convert each numeric basket to a comma-separated string of words\n",
1580 | "# (words is assumed to map an item number to an English stopword)\n",
1581 | "baskets_611_words = []\n",
1582 | "for i in range(1,101):\n",
1583 | "    baskets_611_words.append([','.join([words[item] for item in baskets_611[i]])])"
1583 | ]
1584 | },
1585 | {
1586 | "cell_type": "markdown",
1587 | "metadata": {},
1588 | "source": [
1589 | "# Solution to 6.2.6(a) "
1590 | ]
1591 | },
1592 | {
1593 | "cell_type": "markdown",
1594 | "metadata": {},
1595 | "source": [
1596 | "## First pass of A-Priori\n",
1597 | "\n",
1598 | "In the first pass, we count the occurrences of each singleton as we read the baskets and store these counts in an array. Note that we do not need to translate items to integers here, since the items are already the integers 1 to 100."
1599 | ]
1600 | },
1601 | {
1602 | "cell_type": "code",
1603 | "execution_count": 80,
1604 | "metadata": {
1605 | "collapsed": false
1606 | },
1607 | "outputs": [],
1608 | "source": [
1609 | "C1 = {i:0 for i in range(1,101)}\n",
1610 | "for b in baskets_611.values():\n",
1611 | " for i in b:\n",
1612 | " C1[i] += 1"
1613 | ]
1614 | },
1615 | {
1616 | "cell_type": "markdown",
1617 | "metadata": {},
1618 | "source": [
1619 | "## Between the Passes of A-Priori\n",
1620 | "\n",
1621 | "Generate the list of singletons that are frequent"
1622 | ]
1623 | },
1624 | {
1625 | "cell_type": "code",
1626 | "execution_count": 76,
1627 | "metadata": {
1628 | "collapsed": true
1629 | },
1630 | "outputs": [],
1631 | "source": [
1632 | "# support threshold\n",
1633 | "s = 5"
1634 | ]
1635 | },
1636 | {
1637 | "cell_type": "code",
1638 | "execution_count": 82,
1639 | "metadata": {
1640 | "collapsed": false
1641 | },
1642 | "outputs": [],
1643 | "source": [
1644 | "L1 = [item for item,count in C1.items() if count >= s]"
1645 | ]
1646 | },
1647 | {
1648 | "cell_type": "code",
1649 | "execution_count": 83,
1650 | "metadata": {
1651 | "collapsed": false
1652 | },
1653 | "outputs": [
1654 | {
1655 | "data": {
1656 | "text/plain": [
1657 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]"
1658 | ]
1659 | },
1660 | "execution_count": 83,
1661 | "metadata": {},
1662 | "output_type": "execute_result"
1663 | }
1664 | ],
1665 | "source": [
1666 | "L1"
1667 | ]
1668 | },
1669 | {
1670 | "cell_type": "markdown",
1671 | "metadata": {},
1672 | "source": [
1673 | "## Second Pass of A-Priori\n",
1674 | "\n",
1675 | "1. For each basket, look in the frequent-items table to see which of its items are frequent.\n",
1676 | "\n",
1677 | "2. In a double loop, generate all pairs of frequent items in that basket.\n",
1678 | "\n",
1679 | "3. For each such pair, add one to its count in the data structure used to store counts."
1680 | ]
1681 | },
1682 | {
1683 | "cell_type": "markdown",
1684 | "metadata": {},
1685 | "source": [
1686 | "Using the triangular-matrix method to store the counts of pairs:"
1687 | ]
1688 | },
1689 | {
1690 | "cell_type": "code",
1691 | "execution_count": 96,
1692 | "metadata": {
1693 | "collapsed": true
1694 | },
1695 | "outputs": [],
1696 | "source": [
1697 | "def triangularmatrix_method(i,j,n):\n",
1698 | " \"\"\"\n",
1699 | " returns the index for the triangular matrix method \n",
1700 | "    A[i,j] = a[k] in a flattened array, where 1 <= i < j <= n, using\n",
1701 | "    k = (i-1)*(n - i/2) + j - i.\n",
1702 | "    \"\"\"\n",
1703 | "    return int((i-1)*(n - i/2) + j - i)"
1704 |    ]
1705 |   },
1706 |   {
1707 |    "cell_type": "code",
1708 |    "execution_count": 217,
1709 |    "metadata": {
1710 |     "collapsed": false
1711 |    },
1712 |    "outputs": [],
1713 |    "source": [
1714 |     "# second pass: count the pairs of frequent items in each basket\n",
1715 |     "# (since L1 = [1,...,20], an item's value is also its index 1..m)\n",
1716 |     "m = len(L1)\n",
1717 |     "C2 = [0]*(m*(m-1)//2)\n",
1718 |     "for b in baskets_611.values():\n",
1719 |     "    freq = sorted([i for i in b if i in L1])\n",
1720 |     "    for x in range(len(freq)-1):\n",
1721 |     "        for y in range(x+1,len(freq)):\n",
1722 |     "            C2[triangularmatrix_method(freq[x],freq[y],m)-1] += 1\n",
1723 |     "\n",
1724 |     "# flattened-array indices k whose pair count meets the support threshold s\n",
1725 |     "pair_indices = [k for k,count in enumerate(C2) if count >= s]"
1739 | ]
1740 | },
1741 | {
1742 | "cell_type": "code",
1743 | "execution_count": 218,
1744 | "metadata": {
1745 | "collapsed": false
1746 | },
1747 | "outputs": [
1748 | {
1749 | "data": {
1750 | "text/plain": [
1751 | "56"
1752 | ]
1753 | },
1754 | "execution_count": 218,
1755 | "metadata": {},
1756 | "output_type": "execute_result"
1757 | }
1758 | ],
1759 | "source": [
1760 | "len(pair_indices)"
1761 | ]
1762 | },
1763 | {
1764 | "cell_type": "code",
1765 | "execution_count": 219,
1766 | "metadata": {
1767 | "collapsed": false
1768 | },
1769 | "outputs": [
1770 | {
1771 | "name": "stdout",
1772 | "output_type": "stream",
1773 | "text": [
1774 | "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 30 32 34 36 37 38 39 42 45 48 51 54 55 57 59 61 65 69 74 79 84 87 90 96 105 119 132 144\n"
1775 | ]
1776 | }
1777 | ],
1778 | "source": [
1779 | "# k values we need to inverse map to get pair (i,j)\n",
1780 | "for ix in pair_indices:\n",
1781 | " print ix,"
1782 | ]
1783 | },
1784 | {
1785 | "cell_type": "code",
1786 | "execution_count": 220,
1787 | "metadata": {
1788 | "collapsed": true
1789 | },
1790 | "outputs": [],
1791 | "source": [
1792 | "# grabbing pairs (i,j) from flattened array\n",
1793 | "def inv_triangle(ix,n):\n",
1794 | " init = n-1\n",
1795 | " i = 1\n",
1796 | " while ix > init:\n",
1797 | " i += 1\n",
1798 | " init += n-i\n",
1799 | " # decrement init back once\n",
1800 | " init -= n-i\n",
1801 | " return [i,i+ix-init]"
1802 | ]
1803 | },
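1804 | {
1805 | "cell_type": "markdown",
1806 | "metadata": {},
1807 | "source": [
1808 | "As a quick consistency check (a sketch, not part of the exercise), mapping every pair (i,j) to its 1-based flattened index and back with inv_triangle should reproduce the pair:"
1809 | ]
1810 | },
1811 | {
1812 | "cell_type": "code",
1813 | "execution_count": null,
1814 | "metadata": {
1815 | "collapsed": false
1816 | },
1817 | "outputs": [],
1818 | "source": [
1819 | "# round-trip check between the two mappings\n",
1820 | "all(inv_triangle(triangularmatrix_method(i,j,m), m) == [i,j]\n",
1821 | "    for i in range(1,m) for j in range(i+1,m+1))"
1822 | ]
1823 | },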
1804 | {
1805 | "cell_type": "code",
1806 | "execution_count": 221,
1807 | "metadata": {
1808 | "collapsed": false
1809 | },
1810 | "outputs": [],
1811 | "source": [
1812 | "# have to add one in argument because Python arrays start at index 0\n",
1813 | "L2 = [inv_triangle(ix+1,m) for ix in pair_indices]"
1814 | ]
1815 | },
1816 | {
1817 | "cell_type": "code",
1818 | "execution_count": 222,
1819 | "metadata": {
1820 | "collapsed": false
1821 | },
1822 | "outputs": [
1823 | {
1824 | "name": "stdout",
1825 | "output_type": "stream",
1826 | "text": [
1827 | "[1, 2] [1, 3] [1, 4] [1, 5] [1, 6] [1, 7] [1, 8] [1, 9] [1, 10] [1, 11] [1, 12] [1, 13] [1, 14] [1, 15] [1, 16] [1, 17] [1, 18] [1, 19] [1, 20] [2, 3] [2, 4] [2, 5] [2, 6] [2, 7] [2, 8] [2, 9] [2, 10] [2, 12] [2, 14] [2, 16] [2, 18] [2, 20] [3, 4] [3, 5] [3, 6] [3, 9] [3, 12] [3, 15] [3, 18] [4, 5] [4, 6] [4, 8] [4, 10] [4, 12] [4, 16] [4, 20] [5, 10] [5, 15] [5, 20] [6, 9] [6, 12] [6, 18] [7, 14] [8, 16] [9, 18] [10, 20]\n"
1828 | ]
1829 | }
1830 | ],
1831 | "source": [
1832 | "# frequent pairs is thus:\n",
1833 | "for pair in L2:\n",
1834 | " print pair,"
1835 | ]
1836 | }
1837 | ],
1838 | "metadata": {
1839 | "kernelspec": {
1840 | "display_name": "Python 2",
1841 | "language": "python",
1842 | "name": "python2"
1843 | },
1844 | "language_info": {
1845 | "codemirror_mode": {
1846 | "name": "ipython",
1847 | "version": 2
1848 | },
1849 | "file_extension": ".py",
1850 | "mimetype": "text/x-python",
1851 | "name": "python",
1852 | "nbconvert_exporter": "python",
1853 | "pygments_lexer": "ipython2",
1854 | "version": "2.7.11"
1855 | }
1856 | },
1857 | "nbformat": 4,
1858 | "nbformat_minor": 0
1859 | }
1860 |
--------------------------------------------------------------------------------
/hiearchical_clustering_and_heaps.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import division
4 | import numpy as np
5 | import heapq
6 |
7 | """
8 | This script implements a hierarchical (agglomerative) clustering
9 | procedure for a set of vectors in n-dimensional Euclidean space.
10 |
11 | The basic algorithm for the agglomerative clustering procedure is the
12 | agg_() function. This function requires the maintenance of a clusters
13 | list which tracks the clusters formed/remaining throughout the procedure.
14 | This function iterates until we are left with one cluster, printing the
15 | details of each merge step.
16 |
17 | The maintenance vector stores cluster x as follows: [(centroid of x),
18 | [[all points in x]]].
19 |
20 | *** This script also produces an improved version of the basic algorithm
21 | by exploiting the heap data structure. In addition to the clusters maintenance
22 | list, we require a heap (denoted by h) and a task/cluster dictionary,
23 | clusters_handle, used to keep track of removed (already merged) clusters.
24 |
25 | For this part, the main function is agg_heap(), which requires the following
26 | functions:
27 |
28 | 1. remove_clusters():
29 | performs lazy deletion of merged clusters
30 |
31 | 2. pop_cluster():
32 | heappops the merged cluster with smallest distance
33 |         only if that cluster was not marked as 'REMOVED'
34 |
35 | 3. add_cluster():
36 |         takes as input the output from pop_cluster and
37 | calculates the distances between this merged cluster and all of the
38 | clusters in the maintenance list clusters_remaining. These distances,
39 | along with the candidate merged clusters are added to heap using
40 | heappush. Note that we must also add the cluster that was actually
41 | merged to the clusters_remaining maintenance vector.
42 |
43 | Things to do:
44 | - Create a class for agg_heap.
45 | """
46 |
47 | def Euclidean(x,y):
48 | """
49 | This function returns the Euclidean distance between two vectors
50 | of a Euclidean space.
51 | """
52 | xc = np.array(x)
53 | yc = np.array(y)
54 | return np.sqrt(np.dot(xc-yc,xc-yc))
55 |
56 | def mean(x):
57 | """
58 |     This function takes as input a list of points (one cluster) and outputs
59 | the overall average of these points. This output is stored as
60 | a tuple so that it can be used to access the cluster index. In other
61 | words, the centroid of cluster x.
62 | """
63 | N = len(x)
64 | n = len(x[0])
65 | sum_vec = np.zeros(n)
66 | for point in x:
67 | sum_vec += np.array(point)
68 | mean_vec = sum_vec / N
69 | return tuple(mean_vec)
70 |
71 | def mins(x,y):
72 | """
73 | This function takes as input two clusters of points (i.e. vectors) each of which
74 |     is represented by its own list of points. The output of this function
75 |     is the minimum distance between any two points, one from each cluster.
76 | """
77 | nx = len(x)
78 | ny = len(y)
79 | running_min = 2**32 - 1
80 | for pt_x in x:
81 | for pt_y in y:
82 | if Euclidean(pt_x,pt_y) < running_min:
83 | running_min = Euclidean(pt_x,pt_y)
84 | return running_min
85 |
86 | def avg(x,y):
87 | """
88 | This function takes as input two clusters of points (i.e. vectors) each of which
89 |     is represented by its own list of points. The output of this function
90 |     is the average distance over all pairs of points, one from each of the two clusters.
91 | """
92 | nx = len(x)
93 | ny = len(y)
94 | running_sum = 0
95 | for pt_x in x:
96 | for pt_y in y:
97 | running_sum += Euclidean(pt_x,pt_y)
98 | return running_sum/(nx*ny) # total number of pairs is nx*ny (i.e., by multiplication rule)
99 |
100 | def radius(x,y=[]):
101 | """
102 | This function takes as input two clusters of points (i.e. vectors) each of which
103 |     is represented by its own list of points. The output of this function
104 |     is the radius of the cluster which results from merging x and y.
105 |
106 | If the input is simply one cluster, then the output is the radius of that
107 | cluster.
108 | """
109 | nx = len(x)
110 | ny = len(y)
111 | # merge two clusters x and y
112 | merged_clus = x + y
113 | # the centroid of the new merged cluster
114 | # in non-Euclidean setting, we should change centroid to a clustroid
115 | merged_cent = mean(merged_clus)
116 | # determine the radius of this merged cluster
117 | # radius is the maximum distance between all the points and the centroid
118 | radius = 0
119 | for pt in merged_clus:
120 | if Euclidean(pt,merged_cent) > radius:
121 | radius = Euclidean(pt,merged_cent)
122 | return radius
123 |
124 | def diameter(x,y=[]):
125 | """
126 | This function takes as input two clusters of points (i.e. vectors) each of which
127 |     is represented by its own list of points. The output of this function
128 |     is the diameter of the merged cluster of x and y.
129 |
130 | If the input is simply one cluster, then the output is the diameter of that
131 | cluster.
132 | """
133 | # merge two clusters x and y
134 | merged_clus = x + y
135 | n = len(merged_clus)
136 | # determine the diameter of this merged cluster
137 | # diameter is the maximum distance between any two points of the cluster
138 | diameter = 0
139 | for i in range(n-1):
140 | for j in range(i+1,n):
141 | distance_ij = Euclidean(merged_clus[i],merged_clus[j])
142 | if distance_ij > diameter:
143 | diameter = distance_ij
144 | return diameter
145 |
146 | def agg_(clusters, print_summary = True, dist = 'Euclidean'):
147 | """
148 |     This function takes as input a list of clusters in Euclidean space
149 |     and performs agglomerative clustering. Each entry of the list has
150 |     the form [(centroid of the cluster), [points of the cluster]].
151 | 
152 |     Note that the agglomerative clustering is done in place with
153 |     respect to the clusters list input.
154 | 
155 | """
156 |
157 | # specifying the distance function used
158 | # r_ = 0 implies we consider centroids of the two clusters in merge step
159 | # r_ = 1 means that we consider the points of the two clusters themselves in merge step
160 | if dist == 'Euclidean':
161 | f_dist = Euclidean
162 | r_ = 0
163 | if dist == 'mins':
164 | f_dist = mins
165 | r_ = 1
166 | if dist == 'avg':
167 | f_dist = avg
168 | r_ = 1
169 | if dist == 'radius':
170 | f_dist = radius
171 | r_ = 1
172 | if dist == 'diameter':
173 | f_dist = diameter
174 | r_ = 1
175 |
176 | # start main code to conduct clustering
177 | step = 1
178 | while len(clusters) > 1:
179 | # while step < 3:
180 | # clusters hash table (use centroids as hash keys)
181 | clusters_ix = {el[0]:i for i,el in enumerate(clusters)}
182 | # double loop to consider the minimal distance between all pairs of clusters
183 | n = len(clusters)
184 | min_dist = 2**32-1
185 | c1 = None
186 | c2 = None
187 | for i in range(n-1):
188 | for j in range(i+1,n):
189 | # the distance between centroids of cluster i and cluster j
190 | distance_ij = f_dist(clusters[i][r_], clusters[j][r_])
191 | if distance_ij < min_dist:
192 | min_dist = distance_ij
193 | c1 = clusters[i]
194 | c2 = clusters[j]
195 | # merge the two clusters that result in minimum Euclidean distance
196 | new_cluster = c1[1] + c2[1]
197 | new_centroid = mean(new_cluster)
198 | clusters.append([new_centroid, new_cluster])
199 | # remove the merged clusters from the list
200 | del clusters[max(clusters_ix[c1[0]],clusters_ix[c2[0]])]
201 | del clusters[min(clusters_ix[c1[0]],clusters_ix[c2[0]])]
202 | if print_summary:
203 | print 'Step %d:' % step
204 | print 'Merged clusters: %s and %s' %(str(c1[1]),str(c2[1]))
205 | print 'Minimum distance: %f' % min_dist
206 | print 'New clusters list:'
207 | print [el[1] for el in clusters]
208 | print 'New centroids:'
209 | print [el[0] for el in clusters]
210 | print ''
211 | print '--------------------------------------------------------'
212 | print ''
213 | step += 1
214 |
215 | # Alternatively, can use np.mean to create the new centroid
216 | # new_centroid = tuple(np.mean(np.array(new_cluster),axis=0))
217 |
218 | """ **************************************************************************
219 |
220 | This part of the script defines agg_heap() and all of its necessary components
221 |
222 | Sample input:
223 | clusters = [[(4,10),[[4,10]]], [(7,10),[[7,10]]], [(4,8),[[4,8]]],
224 | [(6,8),[[6,8]]],[(3,4),[[3,4]]],[(2,2),[[2,2]]],[(5,2),[[5,2]]],
225 | [(12,6),[[12,6]]],[(10,5),[[10,5]]],[(11,4),[[11,4]]],[(9,3),[[9,3]]],
226 | [(12,3),[[12,3]]]]
227 |
228 | # creating a dictionary tracking the remaining clusters
229 | clusters_remaining = {tuple(tuple(el) for el in clusters[i][1]):clusters[i][1]
230 | for i in range(len(clusters))}
231 | clusters_remaining
232 |
233 | # creating a heap h with item keys (dist, pair)
234 | h = []
235 | n = len(clusters)
236 | f_dist = Euclidean
237 | r_ = 0
238 | clusters_handle = {} # keys are centroids of the pairs of clusters
239 | for i in range(n-1):
240 | for j in range(i+1,n):
241 | distance_ij = f_dist(clusters[i][r_],clusters[j][r_])
242 | ati = tuple(tuple(el) for el in clusters[i][1])
243 | tun = tuple(tuple(el) for el in clusters[j][1])
244 | foo = [distance_ij, (tuple(ati),tuple(tun)),
245 | clusters[i][1]+clusters[j][1]]
246 | clusters_handle[(tuple(ati),tuple(tun))] = foo
247 | heapq.heappush(h,foo)
248 |
249 | ************************************************************************** """
250 |
251 | REMOVED = ''
252 | def remove_clusters(i):
253 | """
254 | This function lazily deletes any clusters that have been
255 | merged from the dictionary clusters_handle.
256 | """
257 | for key in clusters_handle.keys():
258 | if i in key:
259 | # mark task as removed
260 | clusters_handle[key][1] = REMOVED
261 |
262 | def pop_cluster():
263 | """
264 |     To maintain the heap property, we lazily delete merged clusters.
265 | In this function, we only pop (extract) minimum distance
266 | clusters if the merged cluster has not been removed.
267 | """
268 | # this pops until it returns something
269 | while h: # while there are entries in the heap
270 | distance, tup, merged_clus = heapq.heappop(h)
271 | if tup != REMOVED:
272 | del clusters_handle[tup]
273 | # remove newly merged clusters from heap task dict and clusters_remaining dictionary
274 | for cluster in tup:
275 | remove_clusters(cluster)
276 | del clusters_remaining[cluster]
277 | return distance, tup, merged_clus
278 | raise KeyError('pop from an empty heap')
279 |
280 |
281 | def add_cluster(entry):
282 | """
283 | This function takes the pop'ed entry and calculates the
284 | distances of the newly-merged cluster to all of the clusters
285 | in the clusters_remaining dictionary.
286 | """
287 | distance, tup, merged_clus = entry
288 | tup = tup[0] + tup[1]
289 | centroid = mean(merged_clus)
290 | # for every entry in the clusters_remaining compute new distances
291 | for tup_cmp, clus_cmp in clusters_remaining.items():
292 | centroid_cmp = mean(clus_cmp)
293 | new_distance = Euclidean(centroid, centroid_cmp)
294 | # generate new entry for the heap
295 | new_tup = (tup, tup_cmp)
296 | new_merged_clus = merged_clus + clus_cmp
297 | new_entry = [new_distance, new_tup, new_merged_clus]
298 | # add new entry to clusters_handle
299 | clusters_handle[new_tup] = new_entry
300 | # add new entry to the heap
301 | heapq.heappush(h,new_entry)
302 | # add the recently merged cluster to clusters_remaining dict
303 | clusters_remaining[tup] = merged_clus
304 |
305 | def agg_heap(clusters, print_summary = True):
306 | """
307 |     This function takes as input a list of clusters in Euclidean space
308 |     and performs agglomerative clustering. This is
309 | an improvement over agg_ since it exploits the heap data structure.
310 |
311 | Note that Python heapq and queue module do not support element deletion.
312 | In this code, we simply use lazy deletion.
313 | """
314 |     # build the heap and the bookkeeping structures that pop_cluster() and
315 |     # add_cluster() expect at module level (see the sample-input block above)
316 |     global clusters_remaining, clusters_handle, h
317 |     clusters_remaining = {tuple(tuple(el) for el in clusters[i][1]): clusters[i][1]
318 |                           for i in range(len(clusters))}
319 |     h = []
320 |     clusters_handle = {}   # keys are pairs of candidate clusters to merge
321 |     for i in range(len(clusters)-1):
322 |         for j in range(i+1, len(clusters)):
323 |             ati = tuple(tuple(el) for el in clusters[i][1])
324 |             tun = tuple(tuple(el) for el in clusters[j][1])
325 |             entry = [Euclidean(clusters[i][0], clusters[j][0]), (ati, tun),
326 |                      clusters[i][1] + clusters[j][1]]
327 |             clusters_handle[(ati, tun)] = entry
328 |             heapq.heappush(h, entry)
318 |
319 | # start main code to conduct clustering
320 | step = 1
321 | while len(clusters_remaining) > 1:
322 | entry = pop_cluster()
323 | add_cluster(entry)
324 | distance, tup, merged_clus = entry
325 | if print_summary:
326 | print 'Step %d:' % step
327 | print 'Merged clusters: %s and %s' %(str(tup[0]),str(tup[1]))
328 | print 'Minimum distance: %f' % distance
329 | print 'New clusters list:'
330 | print [key for key in clusters_remaining.keys()]
331 | print 'New centroids:'
332 | print [mean(el) for el in clusters_remaining.values()]
333 | print ''
334 | print '--------------------------------------------------------'
335 | print ''
336 | step += 1
337 |
338 |
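339 | 
340 | # ---------------------------------------------------------------------------
341 | # Example usage (a minimal sketch, not part of the original exercise code):
342 | # run the basic agglomerative procedure on the sample points listed in the
343 | # sample-input block above. Each cluster starts as a single point, stored
344 | # as [(centroid), [points]]; agg_heap() can be called the same way.
345 | # ---------------------------------------------------------------------------
346 | if __name__ == '__main__':
347 |     points = [(4,10),(7,10),(4,8),(6,8),(3,4),(2,2),(5,2),
348 |               (12,6),(10,5),(11,4),(9,3),(12,3)]
349 |     clusters = [[p, [list(p)]] for p in points]
350 |     # agg_() mutates the clusters list in place and prints each merge step
351 |     agg_(clusters, print_summary=True, dist='Euclidean')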
--------------------------------------------------------------------------------