├── .gitignore ├── 11_1-3_to_11_1-5.mw ├── Chapter 11 - Dimensionality Reduction.ipynb ├── Chapter 7 - Clustering.ipynb ├── Chapter 9 - Recommendation Systems.ipynb ├── Exercises 6.1.1 and 6.1.3 and their related problems (from Ch.6 Frequent Itemsets).ipynb └── hiearchical_clustering_and_heaps.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .ipynb_checkpoints 3 | -------------------------------------------------------------------------------- /Chapter 11 - Dimensionality Reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Contents\n", 8 | "- Exercise 11.1.1\n", 9 | "- Exercise 11.1.2\n", 10 | "- Exercise 11.1.6\n", 11 | "- Exercise 11.1.7\n", 12 | "- Exercise 11.2.1\n", 13 | "- Exercise 11.3.1\n", 14 | "- Exercise 11.3.2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from __future__ import division" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# 11.1.1" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 10, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# unit vector in same direction as [1,2,3]\n", 45 | "a = np.arange(1,4)\n", 46 | "a_unit = a/np.sqrt(np.dot(a,a))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 11, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "[1 2 3]\n", 61 | "[ 0.26726124 0.53452248 0.80178373]\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "print a\n", 67 | "print a_unit" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "\n", 75 | "# 11.1.2" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "Using power-iteration method to find the second eigenpair of M" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 325, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "matrix([[3, 2],\n", 96 | " [2, 6]])" 97 | ] 98 | }, 99 | "execution_count": 325, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "M = np.array([3,2,2,6]).reshape(2,2)\n", 106 | "M = np.asmatrix(M)\n", 107 | "M" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 326, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "array([[ 2.601, -0.797],\n", 121 | " [-0.797, 0.413]])" 122 | ] 123 | }, 124 | "execution_count": 326, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "Ms = np.array([2.601,-0.797,-0.797,0.413]).reshape(2,2)\n", 131 | "Ms" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 327, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "def Frobenius(x):\n", 143 | " assert type(x) == np.matrix\n", 144 | " foo = x.A # converts x to a np.array so we can use ufunc\n", 145 | " return np.sqrt(np.sum(foo**2))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | 
"execution_count": 328, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "def pow(M,x0=None, thres = 0.00001, pr = False):\n", 157 | " if x0 == None:\n", 158 | " x0 = np.ones(len(M))\n", 159 | " oldx = np.mat(x0).T\n", 160 | " newx = np.mat(np.zeros(len(M))).T\n", 161 | " converge = False\n", 162 | " while not converge:\n", 163 | " newx = np.dot(M,oldx)/Frobenius(np.dot(M,oldx))\n", 164 | " if pr == True:\n", 165 | " print newx\n", 166 | " if Frobenius(oldx-newx) < thres: # threshold of convergence\n", 167 | " converge = True\n", 168 | " oldx = newx\n", 169 | " \n", 170 | " # find eigenvector\n", 171 | " lam = ((newx.T*M*newx).A1)[0] # A1 attribute returns a flattened array\n", 172 | " return lam,newx" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 329, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "6.99999999999 [[ 0.44721468]\n", 187 | " [ 0.89442665]]\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "lam1, x1 = pow(M)\n", 193 | "print lam1, x1" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 330, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "[7, 0.44721359549995793, 0.89442719099991586]" 207 | ] 208 | }, 209 | "execution_count": 330, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "# exact principle eigenpair\n", 216 | "[7,1/np.sqrt(5),2/np.sqrt(5)]" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 331, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "matrix([[ 1.59999323, -0.80000508],\n", 230 | " [-0.80000508, 0.40000677]])" 231 | ] 232 | }, 233 | "execution_count": 331, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "# second eigenvector, use poweriteration on Ms = M - lambda_1 xx.T\n", 240 | "Ms = M - lam1*(x1*x1.T)\n", 241 | "Ms" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 332, 247 | "metadata": { 248 | "collapsed": false 249 | }, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "2.00000000003 [[ 0.8944253 ]\n", 256 | " [-0.44721738]]\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "lam2, x2 = pow(Ms)\n", 262 | "print lam2, x2" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 333, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "[2, 0.89442719099991586, -0.44721359549995793]" 276 | ] 277 | }, 278 | "execution_count": 333, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "# exact second eigenpair\n", 285 | "[2,2/np.sqrt(5), -1/np.sqrt(5)]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "\n", 293 | "# 11.1.6" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "(a)-(b)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 334, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "matrix([[1, 1, 1],\n", 314 | " [1, 2, 3],\n", 315 | " [1, 
3, 5]])" 316 | ] 317 | }, 318 | "execution_count": 334, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "M = np.matrix([[1,1,1],[1,2,3],[1,3,5]])\n", 325 | "M" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 335, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "7.16227766016 [[ 0.21848282]\n", 340 | " [ 0.52160927]\n", 341 | " [ 0.82473573]]\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "# using pow function defined above\n", 347 | "lam1, x1 = pow(M)\n", 348 | "print lam1, x1" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 336, 354 | "metadata": { 355 | "collapsed": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "x1_exact = np.array([(1/5)*(5+np.sqrt(10))/(3+np.sqrt(10)),np.sqrt(10)/5,1])\n", 360 | "x1_exact = x1_exact/(np.sqrt(sum(x1_exact**2)))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 337, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "[7.16227766016838, array([ 0.21848175, 0.52160897, 0.8247362 ])]" 374 | ] 375 | }, 376 | "execution_count": 337, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# exact principal eigenpair\n", 383 | "[4+np.sqrt(10), x1_exact]" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "(c) Removing the influence of the principal eigenvector from M" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 338, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "matrix([[ 0.65811051, 0.18376774, -0.29057503],\n", 404 | " [ 0.18376774, 0.05131446, -0.08113883],\n", 405 | " [-0.29057503, -0.08113883, 0.12829737]])" 406 | ] 407 | }, 408 | "execution_count": 338, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "# second eigenvector, use poweriteration on Ms = M - lambda_1 xx.T\n", 415 | "Ms = M - lam1*(x1*x1.T)\n", 416 | "Ms" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "(d) Finding the second eigenpair of M using Ms" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 339, 429 | "metadata": { 430 | "collapsed": false 431 | }, 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 436 | "text": [ 437 | "0.837722339911 [[ 0.88633799]\n", 438 | " [ 0.24749693]\n", 439 | " [-0.39134414]]\n" 440 | ] 441 | } 442 | ], 443 | "source": [ 444 | "lam2, x2 = pow(Ms)\n", 445 | "print lam2, x2" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 340, 451 | "metadata": { 452 | "collapsed": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "x2_exact = np.array([(1/5)*(5-np.sqrt(10))/(3-np.sqrt(10)),-np.sqrt(10)/5,1])\n", 457 | "x2_exact = x2_exact/(np.sqrt(sum(x2_exact**2)))" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 341, 463 | "metadata": { 464 | "collapsed": false 465 | }, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/plain": [ 470 | "[0.83772233983162048, array([-0.88634026, -0.24750235, 0.39133557])]" 471 | ] 472 | }, 473 | "execution_count": 341, 474 | "metadata": {}, 475 | "output_type": 
"execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "# exact second eigenpair\n", 480 | "[4-np.sqrt(10),x2_exact]" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "(e) Repeating the above steps to find third eigenpair" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 345, 493 | "metadata": { 494 | "collapsed": false 495 | }, 496 | "outputs": [ 497 | { 498 | "data": { 499 | "text/plain": [ 500 | "matrix([[ -3.36408679e-12, -8.03135336e-12, -1.26988975e-11],\n", 501 | " [ -8.03135336e-12, -1.91732394e-11, -3.03155834e-11],\n", 502 | " [ -1.26988975e-11, -3.03155834e-11, -4.79333517e-11]])" 503 | ] 504 | }, 505 | "execution_count": 345, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "Mss = Ms - lam2*(x2*x2.T)\n", 512 | "Mss" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 346, 518 | "metadata": { 519 | "collapsed": false 520 | }, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "[[-0.21849284]\n", 527 | " [-0.52160581]\n", 528 | " [-0.82473527]]\n", 529 | "[[ 0.21849413]\n", 530 | " [ 0.52160601]\n", 531 | " [ 0.8247348 ]]\n", 532 | "-7.04708085388e-11 [[ 0.21849413]\n", 533 | " [ 0.52160601]\n", 534 | " [ 0.8247348 ]]\n" 535 | ] 536 | } 537 | ], 538 | "source": [ 539 | "lam3, x3 = pow(Mss, thres=2, pr = True)\n", 540 | "print lam3, x3" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "The power iteration method is getting stuck at two vectors. Might need to increase amount of digits used for the calculation." 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 354, 553 | "metadata": { 554 | "collapsed": false 555 | }, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "matrix([[-0.21849413],\n", 561 | " [-0.52160601],\n", 562 | " [-0.8247348 ]])" 563 | ] 564 | }, 565 | "execution_count": 354, 566 | "metadata": {}, 567 | "output_type": "execute_result" 568 | } 569 | ], 570 | "source": [ 571 | "# power iteration method is stuck, oscillating between these two vectors\n", 572 | "Mss*x3/Frobenius(Mss*x3)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 349, 578 | "metadata": { 579 | "collapsed": true 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "x3_exact = np.array([1,-2,1])\n", 584 | "x3_exact = x3_exact/np.sqrt(sum(x3_exact**2))" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 351, 590 | "metadata": { 591 | "collapsed": false 592 | }, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/plain": [ 597 | "array([ 0.40824829, -0.81649658, 0.40824829])" 598 | ] 599 | }, 600 | "execution_count": 351, 601 | "metadata": {}, 602 | "output_type": "execute_result" 603 | } 604 | ], 605 | "source": [ 606 | "x3_exact" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "\n", 614 | "# 11.1.7" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 355, 620 | "metadata": { 621 | "collapsed": false 622 | }, 623 | "outputs": [ 624 | { 625 | "data": { 626 | "text/plain": [ 627 | "matrix([[1, 1, 1],\n", 628 | " [1, 2, 3],\n", 629 | " [1, 3, 6]])" 630 | ] 631 | }, 632 | "execution_count": 355, 633 | "metadata": {}, 634 | "output_type": "execute_result" 635 | } 636 | ], 637 | "source": [ 638 | "M = np.matrix([[1,1,1],[1,2,3],[1,3,6]])\n", 639 | "M" 640 | ] 641 
| }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 357, 645 | "metadata": { 646 | "collapsed": false 647 | }, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "7.87298334621 [[ 0.19382289]\n", 654 | " [ 0.4722474 ]\n", 655 | " [ 0.85989248]]\n" 656 | ] 657 | } 658 | ], 659 | "source": [ 660 | "# first eigenpair\n", 661 | "lam1, x1 = pow(M)\n", 662 | "print lam1, x1" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 368, 668 | "metadata": { 669 | "collapsed": false 670 | }, 671 | "outputs": [ 672 | { 673 | "data": { 674 | "text/plain": [ 675 | "array([ 0.19382266, 0.47224729, 0.8598926 ])" 676 | ] 677 | }, 678 | "execution_count": 368, 679 | "metadata": {}, 680 | "output_type": "execute_result" 681 | } 682 | ], 683 | "source": [ 684 | "x1_exact = np.array([(6/5)*(5+np.sqrt(15))/(3+np.sqrt(15))**2,\n", 685 | " (1/5)*(15+np.sqrt(15))/(3+np.sqrt(15)),1])\n", 686 | "x1_exact = x1_exact/np.sqrt(sum(x1_exact**2))\n", 687 | "x1_exact" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 360, 693 | "metadata": { 694 | "collapsed": false 695 | }, 696 | "outputs": [ 697 | { 698 | "data": { 699 | "text/plain": [ 700 | "matrix([[ 0.70423318, 0.27936729, -0.31216529],\n", 701 | " [ 0.27936729, 0.24418608, -0.19707675],\n", 702 | " [-0.31216529, -0.19707675, 0.1785974 ]])" 703 | ] 704 | }, 705 | "execution_count": 360, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "Ms = M - lam1*(x1*x1.T)\n", 712 | "Ms" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 362, 718 | "metadata": { 719 | "collapsed": false 720 | }, 721 | "outputs": [ 722 | { 723 | "name": "stdout", 724 | "output_type": "stream", 725 | "text": [ 726 | "1.0 [[ 0.81649634]\n", 727 | " [ 0.40824695]\n", 728 | " [-0.40825011]]\n" 729 | ] 730 | } 731 | ], 732 | "source": [ 733 | "# second eigenpair\n", 734 | "lam2, x2 = pow(Ms)\n", 735 | "print lam2, x2" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 369, 741 | "metadata": { 742 | "collapsed": false 743 | }, 744 | "outputs": [ 745 | { 746 | "data": { 747 | "text/plain": [ 748 | "array([-0.81649658, -0.40824829, 0.40824829])" 749 | ] 750 | }, 751 | "execution_count": 369, 752 | "metadata": {}, 753 | "output_type": "execute_result" 754 | } 755 | ], 756 | "source": [ 757 | "x2_exact = np.array([-2,-1,1])\n", 758 | "x2_exact = x2_exact/np.sqrt(sum(x2_exact**2))\n", 759 | "x2_exact" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 364, 765 | "metadata": { 766 | "collapsed": false 767 | }, 768 | "outputs": [ 769 | { 770 | "data": { 771 | "text/plain": [ 772 | "matrix([[ 0.03756691, -0.05396485, 0.02116943],\n", 773 | " [-0.05396485, 0.0775205 , -0.03040989],\n", 774 | " [ 0.02116943, -0.03040989, 0.01192925]])" 775 | ] 776 | }, 777 | "execution_count": 364, 778 | "metadata": {}, 779 | "output_type": "execute_result" 780 | } 781 | ], 782 | "source": [ 783 | "Mss = Ms - lam2*(x2*x2.T)\n", 784 | "Mss" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 365, 790 | "metadata": { 791 | "collapsed": false 792 | }, 793 | "outputs": [ 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "0.127016653793 [[ 0.54384155]\n", 799 | " [-0.78122828]\n", 800 | " [ 0.30646167]]\n" 801 | ] 802 | } 803 | ], 804 | "source": [ 805 | "# third eigenpair\n", 806 | "lam3, x3 = pow(Mss)\n", 807 | "print lam3, x3" 808 | 
] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 370, 813 | "metadata": { 814 | "collapsed": false 815 | }, 816 | "outputs": [ 817 | { 818 | "data": { 819 | "text/plain": [ 820 | "array([ 0.54384383, -0.78122713, 0.30646053])" 821 | ] 822 | }, 823 | "execution_count": 370, 824 | "metadata": {}, 825 | "output_type": "execute_result" 826 | } 827 | ], 828 | "source": [ 829 | "x3_exact = np.array([(6/5)*(5-np.sqrt(15))/(3-np.sqrt(15))**2,\n", 830 | " (1/5)*(15-np.sqrt(15))/(3-np.sqrt(15)),1])\n", 831 | "x3_exact = x3_exact/np.sqrt(sum(x3_exact**2))\n", 832 | "x3_exact" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "\n", 840 | "# 11.2.1" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 373, 846 | "metadata": { 847 | "collapsed": false 848 | }, 849 | "outputs": [ 850 | { 851 | "data": { 852 | "text/plain": [ 853 | "matrix([[ 1, 1],\n", 854 | " [ 2, 4],\n", 855 | " [ 3, 9],\n", 856 | " [ 4, 16]])" 857 | ] 858 | }, 859 | "execution_count": 373, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "M = np.array([1,1,2,4,3,9,4,16]).reshape(4,2)\n", 866 | "M = np.asmatrix(M)\n", 867 | "M" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": {}, 873 | "source": [ 874 | "(a)" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": 375, 880 | "metadata": { 881 | "collapsed": false 882 | }, 883 | "outputs": [ 884 | { 885 | "data": { 886 | "text/plain": [ 887 | "matrix([[ 30, 100],\n", 888 | " [100, 354]])" 889 | ] 890 | }, 891 | "execution_count": 375, 892 | "metadata": {}, 893 | "output_type": "execute_result" 894 | } 895 | ], 896 | "source": [ 897 | "MtM = M.T*M\n", 898 | "MtM" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 377, 904 | "metadata": { 905 | "collapsed": false 906 | }, 907 | "outputs": [ 908 | { 909 | "data": { 910 | "text/plain": [ 911 | "matrix([[ 2, 6, 12, 20],\n", 912 | " [ 6, 20, 42, 72],\n", 913 | " [ 12, 42, 90, 156],\n", 914 | " [ 20, 72, 156, 272]])" 915 | ] 916 | }, 917 | "execution_count": 377, 918 | "metadata": {}, 919 | "output_type": "execute_result" 920 | } 921 | ], 922 | "source": [ 923 | "MMt = M*M.T\n", 924 | "MMt" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "(b)" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 378, 937 | "metadata": { 938 | "collapsed": false 939 | }, 940 | "outputs": [ 941 | { 942 | "name": "stdout", 943 | "output_type": "stream", 944 | "text": [ 945 | "382.378570223 [[ 0.27300543]\n", 946 | " [ 0.96201249]]\n" 947 | ] 948 | } 949 | ], 950 | "source": [ 951 | "# Using power=iteration method\n", 952 | "lam1, x1 = pow(MtM)\n", 953 | "print lam1, x1" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": 381, 959 | "metadata": { 960 | "collapsed": false 961 | }, 962 | "outputs": [ 963 | { 964 | "data": { 965 | "text/plain": [ 966 | "array([ 0.27300539, 0.9620125 ])" 967 | ] 968 | }, 969 | "execution_count": 381, 970 | "metadata": {}, 971 | "output_type": "execute_result" 972 | } 973 | ], 974 | "source": [ 975 | "x1_exact = np.array([100/(162+2*np.sqrt(9061)),1])\n", 976 | "x1_exact = x1_exact/np.sqrt(sum(x1_exact**2))\n", 977 | "x1_exact" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 379, 983 | "metadata": { 984 | "collapsed": false 985 | }, 986 | "outputs": [ 987 | { 988 | "data": { 989 | 
"text/plain": [ 990 | "matrix([[ 1.50057292, -0.4258574 ],\n", 991 | " [-0.4258574 , 0.12085686]])" 992 | ] 993 | }, 994 | "execution_count": 379, 995 | "metadata": {}, 996 | "output_type": "execute_result" 997 | } 998 | ], 999 | "source": [ 1000 | "Ms = MtM - lam1*(x1*x1.T)\n", 1001 | "Ms" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": 380, 1007 | "metadata": { 1008 | "collapsed": false 1009 | }, 1010 | "outputs": [ 1011 | { 1012 | "name": "stdout", 1013 | "output_type": "stream", 1014 | "text": [ 1015 | "1.62142977757 [[ 0.96200976]\n", 1016 | " [-0.27301504]]\n" 1017 | ] 1018 | } 1019 | ], 1020 | "source": [ 1021 | "lam2, x2 = pow(Ms)\n", 1022 | "print lam2, x2" 1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "code", 1027 | "execution_count": 382, 1028 | "metadata": { 1029 | "collapsed": false 1030 | }, 1031 | "outputs": [ 1032 | { 1033 | "data": { 1034 | "text/plain": [ 1035 | "array([-0.9620125 , 0.27300539])" 1036 | ] 1037 | }, 1038 | "execution_count": 382, 1039 | "metadata": {}, 1040 | "output_type": "execute_result" 1041 | } 1042 | ], 1043 | "source": [ 1044 | "x2_exact = np.array([100/(162-2*np.sqrt(9061)),1])\n", 1045 | "x2_exact = x2_exact/np.sqrt(sum(x2_exact**2))\n", 1046 | "x2_exact" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "markdown", 1051 | "metadata": {}, 1052 | "source": [ 1053 | "(c) Since M.T\\*M has dimension 2x2 and M\\*M.T has dimension 4x4, the eigenvalues of M\\*M.T are (382.38,1.62,0,0)" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "markdown", 1058 | "metadata": {}, 1059 | "source": [ 1060 | "(d)" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": 383, 1066 | "metadata": { 1067 | "collapsed": false 1068 | }, 1069 | "outputs": [ 1070 | { 1071 | "data": { 1072 | "text/plain": [ 1073 | "matrix([[ 1.23501793],\n", 1074 | " [ 4.39406083],\n", 1075 | " [ 9.47712872],\n", 1076 | " [ 16.48422159]])" 1077 | ] 1078 | }, 1079 | "execution_count": 383, 1080 | "metadata": {}, 1081 | "output_type": "execute_result" 1082 | } 1083 | ], 1084 | "source": [ 1085 | "# eigenvector of MM.T due to lam1\n", 1086 | "M*x1" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": 386, 1092 | "metadata": { 1093 | "collapsed": false 1094 | }, 1095 | "outputs": [ 1096 | { 1097 | "data": { 1098 | "text/plain": [ 1099 | "19.554502556255315" 1100 | ] 1101 | }, 1102 | "execution_count": 386, 1103 | "metadata": {}, 1104 | "output_type": "execute_result" 1105 | } 1106 | ], 1107 | "source": [ 1108 | "Frobenius(M*x1)" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": 389, 1114 | "metadata": { 1115 | "collapsed": false 1116 | }, 1117 | "outputs": [ 1118 | { 1119 | "name": "stdout", 1120 | "output_type": "stream", 1121 | "text": [ 1122 | "[[ 472.24437737]\n", 1123 | " [ 1680.19468524]\n", 1124 | " [ 3623.85092359]\n", 1125 | " [ 6303.21309243]]\n", 1126 | "[[ 472.24438853]\n", 1127 | " [ 1680.19469871]\n", 1128 | " [ 3623.85093054]\n", 1129 | " [ 6303.21308401]]\n" 1130 | ] 1131 | } 1132 | ], 1133 | "source": [ 1134 | "# verifying that we indeed have an eigenvector of MM.T\n", 1135 | "print MMt*(M*x1) \n", 1136 | "print lam1*(M*x1)" 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": 384, 1142 | "metadata": { 1143 | "collapsed": false 1144 | }, 1145 | "outputs": [ 1146 | { 1147 | "data": { 1148 | "text/plain": [ 1149 | "matrix([[ 0.68899472],\n", 1150 | " [ 0.83195935],\n", 1151 | " [ 0.4288939 ],\n", 1152 | " [-0.52020165]])" 1153 | ] 1154 | 
}, 1155 | "execution_count": 384, 1156 | "metadata": {}, 1157 | "output_type": "execute_result" 1158 | } 1159 | ], 1160 | "source": [ 1161 | "# eigenvector of MM.T due to lam2\n", 1162 | "M*x2" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": 385, 1168 | "metadata": { 1169 | "collapsed": false 1170 | }, 1171 | "outputs": [ 1172 | { 1173 | "data": { 1174 | "text/plain": [ 1175 | "1.2733537669157973" 1176 | ] 1177 | }, 1178 | "execution_count": 385, 1179 | "metadata": {}, 1180 | "output_type": "execute_result" 1181 | } 1182 | ], 1183 | "source": [ 1184 | "Frobenius(M*x2)" 1185 | ] 1186 | }, 1187 | { 1188 | "cell_type": "code", 1189 | "execution_count": 390, 1190 | "metadata": { 1191 | "collapsed": false 1192 | }, 1193 | "outputs": [ 1194 | { 1195 | "name": "stdout", 1196 | "output_type": "stream", 1197 | "text": [ 1198 | "[[ 1.11243939]\n", 1199 | " [ 1.33218051]\n", 1200 | " [ 0.65922334]\n", 1201 | " [-0.9064321 ]]\n", 1202 | "[[ 1.11715656]\n", 1203 | " [ 1.34896367]\n", 1204 | " [ 0.69542134]\n", 1205 | " [-0.84347044]]\n" 1206 | ] 1207 | } 1208 | ], 1209 | "source": [ 1210 | "print MMt*(M*x2) \n", 1211 | "print lam2*(M*x2)" 1212 | ] 1213 | }, 1214 | { 1215 | "cell_type": "code", 1216 | "execution_count": 392, 1217 | "metadata": { 1218 | "collapsed": false 1219 | }, 1220 | "outputs": [ 1221 | { 1222 | "data": { 1223 | "text/plain": [ 1224 | "matrix([[-0.00471716],\n", 1225 | " [-0.01678316],\n", 1226 | " [-0.03619799],\n", 1227 | " [-0.06296166]])" 1228 | ] 1229 | }, 1230 | "execution_count": 392, 1231 | "metadata": {}, 1232 | "output_type": "execute_result" 1233 | } 1234 | ], 1235 | "source": [ 1236 | "# accuracy should improve if we allow for more digits\n", 1237 | "MMt*(M*x2) - lam2*(M*x2)" 1238 | ] 1239 | }, 1240 | { 1241 | "cell_type": "markdown", 1242 | "metadata": {}, 1243 | "source": [ 1244 | "For the eigenvectors of lam = 0, we first perform Gaussian Elimination on MM.T" 1245 | ] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": 412, 1250 | "metadata": { 1251 | "collapsed": false 1252 | }, 1253 | "outputs": [ 1254 | { 1255 | "data": { 1256 | "text/plain": [ 1257 | "array([[ 2, 6, 12, 20],\n", 1258 | " [ 0, 2, 6, 12],\n", 1259 | " [ 0, 0, 0, 0],\n", 1260 | " [ 0, 0, 0, 0]])" 1261 | ] 1262 | }, 1263 | "execution_count": 412, 1264 | "metadata": {}, 1265 | "output_type": "execute_result" 1266 | } 1267 | ], 1268 | "source": [ 1269 | "# Gaussian Elimination on MM.T results\n", 1270 | "MMt_Gauss = np.array([2,6,12,20,0,2,6,12]+[0]*8).reshape(4,4)\n", 1271 | "MMt_Gauss" 1272 | ] 1273 | }, 1274 | { 1275 | "cell_type": "code", 1276 | "execution_count": 418, 1277 | "metadata": { 1278 | "collapsed": false 1279 | }, 1280 | "outputs": [ 1281 | { 1282 | "name": "stdout", 1283 | "output_type": "stream", 1284 | "text": [ 1285 | "[[ 3]\n", 1286 | " [-3]\n", 1287 | " [ 1]\n", 1288 | " [ 0]]\n", 1289 | "[[ 8]\n", 1290 | " [-6]\n", 1291 | " [ 0]\n", 1292 | " [ 1]]\n" 1293 | ] 1294 | } 1295 | ], 1296 | "source": [ 1297 | "# solving this system we get the basis for the eigenspace of lam = 0\n", 1298 | "x3 = np.matrix([3,-3,1,0]).T\n", 1299 | "x4 = np.matrix([8,-6,0,1]).T\n", 1300 | "print x3\n", 1301 | "print x4" 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": 420, 1307 | "metadata": { 1308 | "collapsed": false 1309 | }, 1310 | "outputs": [ 1311 | { 1312 | "data": { 1313 | "text/plain": [ 1314 | "matrix([[0],\n", 1315 | " [0],\n", 1316 | " [0],\n", 1317 | " [0]])" 1318 | ] 1319 | }, 1320 | "execution_count": 420, 
1321 | "metadata": {}, 1322 | "output_type": "execute_result" 1323 | } 1324 | ], 1325 | "source": [ 1326 | "MMt*x3" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": 421, 1332 | "metadata": { 1333 | "collapsed": false 1334 | }, 1335 | "outputs": [ 1336 | { 1337 | "data": { 1338 | "text/plain": [ 1339 | "matrix([[0],\n", 1340 | " [0],\n", 1341 | " [0],\n", 1342 | " [0]])" 1343 | ] 1344 | }, 1345 | "execution_count": 421, 1346 | "metadata": {}, 1347 | "output_type": "execute_result" 1348 | } 1349 | ], 1350 | "source": [ 1351 | "MMt*x4" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "metadata": {}, 1357 | "source": [ 1358 | "\n", 1359 | "# 11.3.1" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "code", 1364 | "execution_count": 540, 1365 | "metadata": { 1366 | "collapsed": false 1367 | }, 1368 | "outputs": [ 1369 | { 1370 | "data": { 1371 | "text/plain": [ 1372 | "matrix([[1, 2, 3],\n", 1373 | " [3, 4, 5],\n", 1374 | " [5, 4, 3],\n", 1375 | " [1, 2, 4],\n", 1376 | " [1, 3, 5]])" 1377 | ] 1378 | }, 1379 | "execution_count": 540, 1380 | "metadata": {}, 1381 | "output_type": "execute_result" 1382 | } 1383 | ], 1384 | "source": [ 1385 | "M = np.array([1,2,3,3,4,5,5,4,3,1,2,4,1,3,5]).reshape(5,3)\n", 1386 | "M = np.asmatrix(M)\n", 1387 | "M" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "markdown", 1392 | "metadata": {}, 1393 | "source": [ 1394 | "(a)" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "code", 1399 | "execution_count": 541, 1400 | "metadata": { 1401 | "collapsed": false 1402 | }, 1403 | "outputs": [ 1404 | { 1405 | "data": { 1406 | "text/plain": [ 1407 | "matrix([[37, 39, 42],\n", 1408 | " [39, 49, 61],\n", 1409 | " [42, 61, 84]])" 1410 | ] 1411 | }, 1412 | "execution_count": 541, 1413 | "metadata": {}, 1414 | "output_type": "execute_result" 1415 | } 1416 | ], 1417 | "source": [ 1418 | "MtM = M.T*M\n", 1419 | "MtM" 1420 | ] 1421 | }, 1422 | { 1423 | "cell_type": "code", 1424 | "execution_count": 542, 1425 | "metadata": { 1426 | "collapsed": false 1427 | }, 1428 | "outputs": [ 1429 | { 1430 | "data": { 1431 | "text/plain": [ 1432 | "matrix([[14, 26, 22, 17, 22],\n", 1433 | " [26, 50, 46, 31, 40],\n", 1434 | " [22, 46, 50, 25, 32],\n", 1435 | " [17, 31, 25, 21, 27],\n", 1436 | " [22, 40, 32, 27, 35]])" 1437 | ] 1438 | }, 1439 | "execution_count": 542, 1440 | "metadata": {}, 1441 | "output_type": "execute_result" 1442 | } 1443 | ], 1444 | "source": [ 1445 | "MMt = M*M.T\n", 1446 | "MMt" 1447 | ] 1448 | }, 1449 | { 1450 | "cell_type": "markdown", 1451 | "metadata": {}, 1452 | "source": [ 1453 | "(b), (c) Using power iteration" 1454 | ] 1455 | }, 1456 | { 1457 | "cell_type": "markdown", 1458 | "metadata": {}, 1459 | "source": [ 1460 | "### MTM" 1461 | ] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "execution_count": 581, 1466 | "metadata": { 1467 | "collapsed": false 1468 | }, 1469 | "outputs": [ 1470 | { 1471 | "name": "stdout", 1472 | "output_type": "stream", 1473 | "text": [ 1474 | "157.080496022 [[ 0.42949875]\n", 1475 | " [ 0.55642476]\n", 1476 | " [ 0.71128216]]\n" 1477 | ] 1478 | } 1479 | ], 1480 | "source": [ 1481 | "lam1, x1_mtm = pow(MtM, thres = 0.000001)\n", 1482 | "print lam1, x1_mtm" 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "code", 1487 | "execution_count": 582, 1488 | "metadata": { 1489 | "collapsed": false 1490 | }, 1491 | "outputs": [ 1492 | { 1493 | "data": { 1494 | "text/plain": [ 1495 | "matrix([[ 8.02349012, 1.46031556, -5.98727448],\n", 1496 | " [ 1.46031556, 0.36654091, -1.16853405],\n", 1497 | " 
[-5.98727448, -1.16853405, 4.52947295]])" 1498 | ] 1499 | }, 1500 | "execution_count": 582, 1501 | "metadata": {}, 1502 | "output_type": "execute_result" 1503 | } 1504 | ], 1505 | "source": [ 1506 | "Ms = MtM - lam1*(x1_mtm*x1_mtm.T)\n", 1507 | "Ms" 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "code", 1512 | "execution_count": 583, 1513 | "metadata": { 1514 | "collapsed": false 1515 | }, 1516 | "outputs": [ 1517 | { 1518 | "name": "stdout", 1519 | "output_type": "stream", 1520 | "text": [ 1521 | "12.7946149636 [[ 0.79072218]\n", 1522 | " [ 0.14874499]\n", 1523 | " [-0.5938294 ]]\n" 1524 | ] 1525 | } 1526 | ], 1527 | "source": [ 1528 | "lam2, x2_mtm = pow(Ms, thres = 0.000001)\n", 1529 | "print lam2, x2_mtm" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": 584, 1535 | "metadata": { 1536 | "collapsed": false 1537 | }, 1538 | "outputs": [ 1539 | { 1540 | "data": { 1541 | "text/plain": [ 1542 | "matrix([[ 0.02376498, -0.04453543, 0.02048918],\n", 1543 | " [-0.04453543, 0.08345912, -0.03839659],\n", 1544 | " [ 0.02048918, -0.03839659, 0.01766491]])" 1545 | ] 1546 | }, 1547 | "execution_count": 584, 1548 | "metadata": {}, 1549 | "output_type": "execute_result" 1550 | } 1551 | ], 1552 | "source": [ 1553 | "Mss = Ms - lam2*(x2_mtm*x2_mtm.T)\n", 1554 | "Mss" 1555 | ] 1556 | }, 1557 | { 1558 | "cell_type": "code", 1559 | "execution_count": 585, 1560 | "metadata": { 1561 | "collapsed": false 1562 | }, 1563 | "outputs": [ 1564 | { 1565 | "name": "stdout", 1566 | "output_type": "stream", 1567 | "text": [ 1568 | "0.124889013913 [[-0.43622105]\n", 1569 | " [ 0.81747556]\n", 1570 | " [-0.3760916 ]]\n" 1571 | ] 1572 | } 1573 | ], 1574 | "source": [ 1575 | "lam3, x3_mtm = pow(Mss, thres=0.000001)\n", 1576 | "print lam3, x3_mtm" 1577 | ] 1578 | }, 1579 | { 1580 | "cell_type": "markdown", 1581 | "metadata": {}, 1582 | "source": [ 1583 | "### MMT" 1584 | ] 1585 | }, 1586 | { 1587 | "cell_type": "code", 1588 | "execution_count": 586, 1589 | "metadata": { 1590 | "collapsed": false 1591 | }, 1592 | "outputs": [ 1593 | { 1594 | "name": "stdout", 1595 | "output_type": "stream", 1596 | "text": [ 1597 | "157.080496022 [[ 0.29331718]\n", 1598 | " [ 0.56415119]\n", 1599 | " [ 0.51918484]\n", 1600 | " [ 0.35006921]\n", 1601 | " [ 0.45121737]]\n" 1602 | ] 1603 | } 1604 | ], 1605 | "source": [ 1606 | "lam1, x1_mmt = pow(MMt)\n", 1607 | "print lam1, x1_mmt" 1608 | ] 1609 | }, 1610 | { 1611 | "cell_type": "code", 1612 | "execution_count": 587, 1613 | "metadata": { 1614 | "collapsed": false 1615 | }, 1616 | "outputs": [ 1617 | { 1618 | "data": { 1619 | "text/plain": [ 1620 | "matrix([[ 4.85584094e-01, 7.06725610e-03, -1.92113461e+00,\n", 1621 | " 8.70768115e-01, 1.21042661e+00],\n", 1622 | " [ 7.06725610e-03, 6.52952018e-03, -8.68046388e-03,\n", 1623 | " -2.21353639e-02, 1.44033802e-02],\n", 1624 | " [ -1.92113461e+00, -8.68046388e-03, 7.65849705e+00,\n", 1625 | " -3.54947855e+00, -4.79849629e+00],\n", 1626 | " [ 8.70768115e-01, -2.21353639e-02, -3.54947855e+00,\n", 1627 | " 1.75002849e+00, 2.18798797e+00],\n", 1628 | " [ 1.21042661e+00, 1.44033802e-02, -4.79849629e+00,\n", 1629 | " 2.18798797e+00, 3.01886483e+00]])" 1630 | ] 1631 | }, 1632 | "execution_count": 587, 1633 | "metadata": {}, 1634 | "output_type": "execute_result" 1635 | } 1636 | ], 1637 | "source": [ 1638 | "Ms = MMt - lam1*(x1_mmt*x1_mmt.T)\n", 1639 | "Ms" 1640 | ] 1641 | }, 1642 | { 1643 | "cell_type": "code", 1644 | "execution_count": 588, 1645 | "metadata": { 1646 | "collapsed": false 1647 | }, 1648 | "outputs": [ 1649 | 
{ 1650 | "name": "stdout", 1651 | "output_type": "stream", 1652 | "text": [ 1653 | "12.7946149639 [[ 1.93814995e-01]\n", 1654 | " [ 5.54806259e-04]\n", 1655 | " [ -7.73595563e-01]\n", 1656 | " [ 3.59829841e-01]\n", 1657 | " [ 4.84260085e-01]]\n" 1658 | ] 1659 | } 1660 | ], 1661 | "source": [ 1662 | "lam2, x2_mmt = pow(Ms)\n", 1663 | "print lam2, x2_mmt" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "code", 1668 | "execution_count": 589, 1669 | "metadata": { 1670 | "collapsed": false 1671 | }, 1672 | "outputs": [ 1673 | { 1674 | "data": { 1675 | "text/plain": [ 1676 | "matrix([[ 0.00496395, 0.00569145, -0.00278143, -0.02153369, 0.00956414],\n", 1677 | " [ 0.00569145, 0.00652558, -0.00318907, -0.02468963, 0.01096584],\n", 1678 | " [-0.00278143, -0.00318907, 0.00155851, 0.01206589, -0.00535904],\n", 1679 | " [-0.02153369, -0.02468963, 0.01206589, 0.09341354, -0.04148942],\n", 1680 | " [ 0.00956414, 0.01096584, -0.00535904, -0.04148942, 0.01842744]])" 1681 | ] 1682 | }, 1683 | "execution_count": 589, 1684 | "metadata": {}, 1685 | "output_type": "execute_result" 1686 | } 1687 | ], 1688 | "source": [ 1689 | "Mss = Ms - lam2*(x2_mmt*x2_mmt.T)\n", 1690 | "Mss" 1691 | ] 1692 | }, 1693 | { 1694 | "cell_type": "code", 1695 | "execution_count": 590, 1696 | "metadata": { 1697 | "collapsed": false 1698 | }, 1699 | "outputs": [ 1700 | { 1701 | "name": "stdout", 1702 | "output_type": "stream", 1703 | "text": [ 1704 | "0.124889013913 [[-0.19936618]\n", 1705 | " [-0.22858488]\n", 1706 | " [ 0.11171009]\n", 1707 | " [ 0.864854 ]\n", 1708 | " [-0.38412302]]\n" 1709 | ] 1710 | } 1711 | ], 1712 | "source": [ 1713 | "lam3, x3_mmt = pow(Mss)\n", 1714 | "print lam3, x3_mmt" 1715 | ] 1716 | }, 1717 | { 1718 | "cell_type": "markdown", 1719 | "metadata": {}, 1720 | "source": [ 1721 | "And since MMt is 5x5 while M has rank 3, the remaining eigenvalues of MMt are zero, i.e. the eigenvalue 0 is repeated twice."
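A quick numerical cross-check (a sketch, not part of the original notebook; it only assumes numpy and re-creates the M of Exercise 11.3.1): the nonzero eigenvalues of MtM and MMt coincide, and the two extra eigenvalues of the 5x5 MMt are 0.

# sketch: list the eigenvalues of both Gram matrices directly
# (np.linalg.eigvalsh returns the eigenvalues of a symmetric matrix in ascending order)
import numpy as np
M = np.matrix([[1,2,3],[3,4,5],[5,4,3],[1,2,4],[1,3,5]])
print(np.linalg.eigvalsh(M.T*M))   # approx [0.125, 12.795, 157.080]
print(np.linalg.eigvalsh(M*M.T))   # approx [0, 0, 0.125, 12.795, 157.080]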
1722 | ] 1723 | }, 1724 | { 1725 | "cell_type": "markdown", 1726 | "metadata": {}, 1727 | "source": [ 1728 | "### (d) \n", 1729 | "SVD:\n", 1730 | "- V is the eigenvectors matrix of MTM\n", 1731 | "- sigma is the diagonal matrix whose diagonal elements are the square roots of the eigenvalues of MTM\n", 1732 | "- U is the eigenvectors matrix of MMT\n", 1733 | "\n", 1734 | "*Note that we only keep the non-zero singular values because they are the only ones that influence the SVD decomposition (i.e., recall also that rank of M is 3).*" 1735 | ] 1736 | }, 1737 | { 1738 | "cell_type": "code", 1739 | "execution_count": 591, 1740 | "metadata": { 1741 | "collapsed": false 1742 | }, 1743 | "outputs": [ 1744 | { 1745 | "data": { 1746 | "text/plain": [ 1747 | "matrix([[ 0.42949875, 0.79072218, -0.43622105],\n", 1748 | " [ 0.55642476, 0.14874499, 0.81747556],\n", 1749 | " [ 0.71128216, -0.5938294 , -0.3760916 ]])" 1750 | ] 1751 | }, 1752 | "execution_count": 591, 1753 | "metadata": {}, 1754 | "output_type": "execute_result" 1755 | } 1756 | ], 1757 | "source": [ 1758 | "V = np.concatenate((x1_mtm,x2_mtm,x3_mtm),axis=1)\n", 1759 | "V" 1760 | ] 1761 | }, 1762 | { 1763 | "cell_type": "code", 1764 | "execution_count": 592, 1765 | "metadata": { 1766 | "collapsed": false 1767 | }, 1768 | "outputs": [ 1769 | { 1770 | "data": { 1771 | "text/plain": [ 1772 | "matrix([[ 2.93317185e-01, 1.93814995e-01, -1.99366184e-01],\n", 1773 | " [ 5.64151193e-01, 5.54806259e-04, -2.28584883e-01],\n", 1774 | " [ 5.19184840e-01, -7.73595563e-01, 1.11710086e-01],\n", 1775 | " [ 3.50069209e-01, 3.59829841e-01, 8.64853999e-01],\n", 1776 | " [ 4.51217365e-01, 4.84260085e-01, -3.84123018e-01]])" 1777 | ] 1778 | }, 1779 | "execution_count": 592, 1780 | "metadata": {}, 1781 | "output_type": "execute_result" 1782 | } 1783 | ], 1784 | "source": [ 1785 | "U = np.concatenate((x1_mmt,x2_mmt,x3_mmt),axis=1)\n", 1786 | "U" 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "code", 1791 | "execution_count": 593, 1792 | "metadata": { 1793 | "collapsed": false 1794 | }, 1795 | "outputs": [ 1796 | { 1797 | "data": { 1798 | "text/plain": [ 1799 | "matrix([[ 12.53317582, 0. , 0. ],\n", 1800 | " [ 0. , 3.5769561 , 0. ],\n", 1801 | " [ 0. , 0. 
, 0.3533964 ]])" 1802 | ] 1803 | }, 1804 | "execution_count": 593, 1805 | "metadata": {}, 1806 | "output_type": "execute_result" 1807 | } 1808 | ], 1809 | "source": [ 1810 | "sigma = np.matrix([[np.sqrt(lam1),0,0],[0,np.sqrt(lam2),0],\n", 1811 | " [0,0,np.sqrt(lam3)]])\n", 1812 | "sigma" 1813 | ] 1814 | }, 1815 | { 1816 | "cell_type": "code", 1817 | "execution_count": 594, 1818 | "metadata": { 1819 | "collapsed": false 1820 | }, 1821 | "outputs": [ 1822 | { 1823 | "data": { 1824 | "text/plain": [ 1825 | "matrix([[ 2.15783778, 2.09105102, 2.2296274 ],\n", 1826 | " [ 3.07362409, 3.86851894, 5.05839857],\n", 1827 | " [ 0.58952116, 3.24135273, 6.25668614],\n", 1828 | " [ 2.76882675, 2.88260114, 2.24147307],\n", 1829 | " [ 3.8577824 , 3.29336808, 3.0448692 ]])" 1830 | ] 1831 | }, 1832 | "execution_count": 594, 1833 | "metadata": {}, 1834 | "output_type": "execute_result" 1835 | } 1836 | ], 1837 | "source": [ 1838 | "# Checking SVD correctness\n", 1839 | "U*sigma*(V.T)" 1840 | ] 1841 | }, 1842 | { 1843 | "cell_type": "code", 1844 | "execution_count": 595, 1845 | "metadata": { 1846 | "collapsed": false 1847 | }, 1848 | "outputs": [ 1849 | { 1850 | "data": { 1851 | "text/plain": [ 1852 | "matrix([[1, 2, 3],\n", 1853 | " [3, 4, 5],\n", 1854 | " [5, 4, 3],\n", 1855 | " [1, 2, 4],\n", 1856 | " [1, 3, 5]])" 1857 | ] 1858 | }, 1859 | "execution_count": 595, 1860 | "metadata": {}, 1861 | "output_type": "execute_result" 1862 | } 1863 | ], 1864 | "source": [ 1865 | "M" 1866 | ] 1867 | }, 1868 | { 1869 | "cell_type": "markdown", 1870 | "metadata": {}, 1871 | "source": [ 1872 | "Using np.linalg.svd" 1873 | ] 1874 | }, 1875 | { 1876 | "cell_type": "code", 1877 | "execution_count": 597, 1878 | "metadata": { 1879 | "collapsed": false 1880 | }, 1881 | "outputs": [], 1882 | "source": [ 1883 | "U, s, V = np.linalg.svd(M, full_matrices=False)" 1884 | ] 1885 | }, 1886 | { 1887 | "cell_type": "code", 1888 | "execution_count": 598, 1889 | "metadata": { 1890 | "collapsed": false 1891 | }, 1892 | "outputs": [ 1893 | { 1894 | "data": { 1895 | "text/plain": [ 1896 | "matrix([[ -2.93317100e-01, 1.93816567e-01, 1.99366072e-01],\n", 1897 | " [ -5.64151193e-01, 5.57827581e-04, 2.28584883e-01],\n", 1898 | " [ -5.19185177e-01, -7.73592784e-01, -1.11709640e-01],\n", 1899 | " [ -3.50069052e-01, 3.59831710e-01, -8.64854207e-01],\n", 1900 | " [ -4.51217154e-01, 4.84262503e-01, 3.84122739e-01]])" 1901 | ] 1902 | }, 1903 | "execution_count": 598, 1904 | "metadata": {}, 1905 | "output_type": "execute_result" 1906 | } 1907 | ], 1908 | "source": [ 1909 | "U" 1910 | ] 1911 | }, 1912 | { 1913 | "cell_type": "code", 1914 | "execution_count": 599, 1915 | "metadata": { 1916 | "collapsed": false 1917 | }, 1918 | "outputs": [ 1919 | { 1920 | "data": { 1921 | "text/plain": [ 1922 | "matrix([[ 12.53317582, 0. , 0. ],\n", 1923 | " [ 0. , 3.5769561 , 0. ],\n", 1924 | " [ 0. , 0. 
, 0.3533964 ]])" 1925 | ] 1926 | }, 1927 | "execution_count": 599, 1928 | "metadata": {}, 1929 | "output_type": "execute_result" 1930 | } 1931 | ], 1932 | "source": [ 1933 | "s = np.mat(np.diag(s))\n", 1934 | "s" 1935 | ] 1936 | }, 1937 | { 1938 | "cell_type": "code", 1939 | "execution_count": 600, 1940 | "metadata": { 1941 | "collapsed": false 1942 | }, 1943 | "outputs": [ 1944 | { 1945 | "data": { 1946 | "text/plain": [ 1947 | "matrix([[-0.4294987 , -0.55642475, -0.71128219],\n", 1948 | " [-0.7907225 , -0.1487454 , 0.59382888],\n", 1949 | " [-0.43622104, 0.81747557, -0.37609161]])" 1950 | ] 1951 | }, 1952 | "execution_count": 600, 1953 | "metadata": {}, 1954 | "output_type": "execute_result" 1955 | } 1956 | ], 1957 | "source": [ 1958 | "V" 1959 | ] 1960 | }, 1961 | { 1962 | "cell_type": "code", 1963 | "execution_count": 603, 1964 | "metadata": { 1965 | "collapsed": false 1966 | }, 1967 | "outputs": [ 1968 | { 1969 | "data": { 1970 | "text/plain": [ 1971 | "matrix([[ 1.14305288, 2.84556706, 2.14386992],\n", 1972 | " [ 2.97824776, 5.63856063, 3.0555972 ],\n", 1973 | " [ 4.36253178, 5.53341366, 0.59131193],\n", 1974 | " [ 1.38563384, 3.09633052, 3.08103168],\n", 1975 | " [ 1.36850832, 4.29463733, 3.83187619]])" 1976 | ] 1977 | }, 1978 | "execution_count": 603, 1979 | "metadata": {}, 1980 | "output_type": "execute_result" 1981 | } 1982 | ], 1983 | "source": [ 1984 | "U*s*(V.T)" 1985 | ] 1986 | }, 1987 | { 1988 | "cell_type": "code", 1989 | "execution_count": 604, 1990 | "metadata": { 1991 | "collapsed": false 1992 | }, 1993 | "outputs": [ 1994 | { 1995 | "data": { 1996 | "text/plain": [ 1997 | "matrix([[1, 2, 3],\n", 1998 | " [3, 4, 5],\n", 1999 | " [5, 4, 3],\n", 2000 | " [1, 2, 4],\n", 2001 | " [1, 3, 5]])" 2002 | ] 2003 | }, 2004 | "execution_count": 604, 2005 | "metadata": {}, 2006 | "output_type": "execute_result" 2007 | } 2008 | ], 2009 | "source": [ 2010 | "M" 2011 | ] 2012 | }, 2013 | { 2014 | "cell_type": "code", 2015 | "execution_count": 605, 2016 | "metadata": { 2017 | "collapsed": true 2018 | }, 2019 | "outputs": [], 2020 | "source": [ 2021 | "# np.linalg.svd does a much better job than the eigendecomposition above" 2022 | ] 2023 | }, 2024 | { 2025 | "cell_type": "markdown", 2026 | "metadata": {}, 2027 | "source": [ 2028 | "\n", 2029 | "# 11.3.2 " 2030 | ] 2031 | }, 2032 | { 2033 | "cell_type": "code", 2034 | "execution_count": 606, 2035 | "metadata": { 2036 | "collapsed": true 2037 | }, 2038 | "outputs": [], 2039 | "source": [ 2040 | "# mapping a user representation in the item space to the concept space\n", 2041 | "Leslie = np.matrix([0,3,0,0,4])" 2042 | ] 2043 | }, 2044 | { 2045 | "cell_type": "code", 2046 | "execution_count": 608, 2047 | "metadata": { 2048 | "collapsed": false 2049 | }, 2050 | "outputs": [ 2051 | { 2052 | "data": { 2053 | "text/plain": [ 2054 | "matrix([[ 0.58, 0. ],\n", 2055 | " [ 0.58, 0. ],\n", 2056 | " [ 0.58, 0. ],\n", 2057 | " [ 0. , 0.71],\n", 2058 | " [ 0. , 0.71]])" 2059 | ] 2060 | }, 2061 | "execution_count": 608, 2062 | "metadata": {}, 2063 | "output_type": "execute_result" 2064 | } 2065 | ], 2066 | "source": [ 2067 | "# V from the SVD decomposition of Fig. 
11.7\n", 2068 | "V = np.matrix([[0.58,0.58,0.58,0,0],[0,0,0,0.71,0.71]]).T\n", 2069 | "V" 2070 | ] 2071 | }, 2072 | { 2073 | "cell_type": "code", 2074 | "execution_count": 609, 2075 | "metadata": { 2076 | "collapsed": false 2077 | }, 2078 | "outputs": [ 2079 | { 2080 | "data": { 2081 | "text/plain": [ 2082 | "matrix([[ 1.74, 2.84]])" 2083 | ] 2084 | }, 2085 | "execution_count": 609, 2086 | "metadata": {}, 2087 | "output_type": "execute_result" 2088 | } 2089 | ], 2090 | "source": [ 2091 | "Leslie*V" 2092 | ] 2093 | }, 2094 | { 2095 | "cell_type": "markdown", 2096 | "metadata": {}, 2097 | "source": [ 2098 | "The above vector in the concept space suggests that Leslie likes both science-fiction and romance movies, but with more preference towards the latter genre." 2099 | ] 2100 | }, 2101 | { 2102 | "cell_type": "markdown", 2103 | "metadata": {}, 2104 | "source": [ 2105 | "# 11.4.1 " 2106 | ] 2107 | }, 2108 | { 2109 | "cell_type": "code", 2110 | "execution_count": 610, 2111 | "metadata": { 2112 | "collapsed": false 2113 | }, 2114 | "outputs": [ 2115 | { 2116 | "data": { 2117 | "text/plain": [ 2118 | "matrix([[ 48, 14],\n", 2119 | " [ 14, -48]])" 2120 | ] 2121 | }, 2122 | "execution_count": 610, 2123 | "metadata": {}, 2124 | "output_type": "execute_result" 2125 | } 2126 | ], 2127 | "source": [ 2128 | "M = np.matrix([[48,14],[14,-48]])\n", 2129 | "M" 2130 | ] 2131 | }, 2132 | { 2133 | "cell_type": "code", 2134 | "execution_count": 611, 2135 | "metadata": { 2136 | "collapsed": false 2137 | }, 2138 | "outputs": [ 2139 | { 2140 | "data": { 2141 | "text/plain": [ 2142 | "matrix([[ 0.6, 0.8],\n", 2143 | " [ 0.8, -0.6]])" 2144 | ] 2145 | }, 2146 | "execution_count": 611, 2147 | "metadata": {}, 2148 | "output_type": "execute_result" 2149 | } 2150 | ], 2151 | "source": [ 2152 | "U = np.matrix([[3/5,4/5],[4/5,-3/5]])\n", 2153 | "U" 2154 | ] 2155 | }, 2156 | { 2157 | "cell_type": "code", 2158 | "execution_count": 615, 2159 | "metadata": { 2160 | "collapsed": false 2161 | }, 2162 | "outputs": [ 2163 | { 2164 | "data": { 2165 | "text/plain": [ 2166 | "matrix([[ 0.8, 0.6],\n", 2167 | " [-0.6, 0.8]])" 2168 | ] 2169 | }, 2170 | "execution_count": 615, 2171 | "metadata": {}, 2172 | "output_type": "execute_result" 2173 | } 2174 | ], 2175 | "source": [ 2176 | "V = np.matrix([[4/5,-3/5],[3/5,4/5]]).T\n", 2177 | "V" 2178 | ] 2179 | }, 2180 | { 2181 | "cell_type": "code", 2182 | "execution_count": 614, 2183 | "metadata": { 2184 | "collapsed": false 2185 | }, 2186 | "outputs": [ 2187 | { 2188 | "data": { 2189 | "text/plain": [ 2190 | "matrix([[50, 0],\n", 2191 | " [ 0, 25]])" 2192 | ] 2193 | }, 2194 | "execution_count": 614, 2195 | "metadata": {}, 2196 | "output_type": "execute_result" 2197 | } 2198 | ], 2199 | "source": [ 2200 | "s = np.mat(np.diag([50,25]))\n", 2201 | "s" 2202 | ] 2203 | }, 2204 | { 2205 | "cell_type": "code", 2206 | "execution_count": 616, 2207 | "metadata": { 2208 | "collapsed": false 2209 | }, 2210 | "outputs": [ 2211 | { 2212 | "data": { 2213 | "text/plain": [ 2214 | "matrix([[ 36., -2.],\n", 2215 | " [ 23., -36.]])" 2216 | ] 2217 | }, 2218 | "execution_count": 616, 2219 | "metadata": {}, 2220 | "output_type": "execute_result" 2221 | } 2222 | ], 2223 | "source": [ 2224 | "U*s*V.T" 2225 | ] 2226 | }, 2227 | { 2228 | "cell_type": "markdown", 2229 | "metadata": {}, 2230 | "source": [ 2231 | "Thus, this SVD is not correct!" 
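This conclusion can be backed up numerically; the sketch below only assumes numpy and the M, U, s, V defined above. A valid SVD must reproduce M exactly, and numpy's own SVD of this symmetric matrix gives two equal singular values of 50.

# sketch: the proposed factorization does not reproduce M,
# and the true singular values of M are both 50
print(np.allclose(U*s*V.T, M))             # False, so U*s*V.T != M
print(np.linalg.svd(M, compute_uv=False))  # approx [50., 50.]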
2232 | ] 2233 | }, 2234 | { 2235 | "cell_type": "code", 2236 | "execution_count": null, 2237 | "metadata": { 2238 | "collapsed": true 2239 | }, 2240 | "outputs": [], 2241 | "source": [] 2242 | } 2243 | ], 2244 | "metadata": { 2245 | "kernelspec": { 2246 | "display_name": "Python 2", 2247 | "language": "python", 2248 | "name": "python2" 2249 | }, 2250 | "language_info": { 2251 | "codemirror_mode": { 2252 | "name": "ipython", 2253 | "version": 2 2254 | }, 2255 | "file_extension": ".py", 2256 | "mimetype": "text/x-python", 2257 | "name": "python", 2258 | "nbconvert_exporter": "python", 2259 | "pygments_lexer": "ipython2", 2260 | "version": "2.7.11" 2261 | } 2262 | }, 2263 | "nbformat": 4, 2264 | "nbformat_minor": 0 2265 | } 2266 | -------------------------------------------------------------------------------- /Chapter 9 - Recommendation Systems.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Contents\n", 8 | "- Exercise 9.2.1\n", 9 | "- Exercise 9.2.2\n", 10 | "- Exercise 9.2.3\n", 11 | "- Exercise 9.3.1\n", 12 | "- Exercise 9.4.1 and 9.4.2\n", 13 | "- Exercise 9.4.3\n", 14 | "- Exercise 9.4.5 (Normalizing the Utility Matrix)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from __future__ import division" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "\n", 34 | "# 9.2.1" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 97, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "class Computer(object):\n", 46 | "    def __init__(self, proc, disk, mem):\n", 47 | "        self.processor_speed = proc\n", 48 | "        self.disk = disk\n", 49 | "        self.main_memory = mem\n", 50 | "        self.summary = [proc, disk, mem]\n", 51 | "    \n", 52 | "    def dot_prod(self,X):\n", 53 | "        if isinstance(X, Computer):\n", 54 | "            bar = [X.processor_speed, X.disk, X.main_memory]\n", 55 | "            return sum([x*y for x,y in zip(self.summary,bar)])\n", 56 | "        else:\n", 57 | "            assert len(X) == 3\n", 58 | "            return sum([x*y for x,y in zip(self.summary,X)])\n", 59 | "    \n", 60 | "    def cosine(self,X,alpha=1,beta=1):\n", 61 | "        if isinstance(X, Computer):\n", 62 | "            foo = [self.processor_speed, alpha*self.disk, beta*self.main_memory]  # disk scaled by alpha, memory by beta\n", 63 | "            bar = [X.processor_speed, alpha*X.disk, beta*X.main_memory]\n", 64 | "            ati = np.dot(foo,bar)\n", 65 | "            tun = np.sqrt(np.dot(foo,foo))*np.sqrt(np.dot(bar,bar))\n", 66 | "            return ati/tun\n", 67 | "    \n", 68 | "    def normalize(self, mu):\n", 69 | "        assert len(mu) == 3\n", 70 | "        return [self.processor_speed - mu[0], self.disk - mu[1], \n", 71 | "                self.main_memory - mu[2]]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 98, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "A = Computer(3.06,500,6)\n", 83 | "B = Computer(2.68,320,4)\n", 84 | "C = Computer(2.92,640,6)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 99, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "computers = {'A':A, 'B':B, 'C':C}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "(b) cosine similarities when alpha=beta=1" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code",
107 | "execution_count": 100, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "pairs = [['A','B'],['A','C'],['B','C']]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 101, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "['A', 'B'] 0.999997333284\n", 128 | "['A', 'C'] 0.999995343121\n", 129 | "['B', 'C'] 0.999987853375\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "# cosine similarities\n", 135 | "for pair in pairs:\n", 136 | " print pair, computers[pair[0]].cosine(computers[pair[1]])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "(c) cosine similarities when alpha=0.01 and beta=0.5" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 102, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "['A', 'B'] 0.884792148899\n", 158 | "['A', 'C'] 0.887525858762\n", 159 | "['B', 'C'] 0.873005241921\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "for pair in pairs:\n", 165 | " print pair, computers[pair[0]].cosine(computers[pair[1]],0.01,0.5)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "(d) setting alpha = 1/avg(disk size) and beta = 1/avg(main_memory)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 103, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "alpha = 1/np.mean([A.disk,B.disk,C.disk])\n", 184 | "beta = 1/np.mean([A.main_memory,B.main_memory,C.main_memory])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 104, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "0.00205479452055 0.1875\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "print alpha,beta" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 105, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "['A', 'B'] 0.941990802633\n", 218 | "['A', 'C'] 0.940905717338\n", 219 | "['B', 'C'] 0.949959248828\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "for pair in pairs:\n", 225 | " print pair, computers[pair[0]].cosine(computers[pair[1]],alpha,beta)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "\n", 233 | "# 9.2.2 " 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "(a) Normalizing the vectors of the three computers of 9.2.1" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 106, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "mean_proc = np.mean([comp.processor_speed for comp in computers.values()])\n", 252 | "mean_disk = np.mean([comp.disk for comp in computers.values()])\n", 253 | "mean_memory = np.mean([comp.main_memory for comp in computers.values()])" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 107, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "means = [mean_proc, mean_disk, 
mean_memory]" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 111, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "A: [0.17333333333333334, 13.333333333333314, 0.66666666666666696]\n", 279 | "B: [-0.20666666666666655, -166.66666666666669, -1.333333333333333]\n", 280 | "C: [0.033333333333333215, 153.33333333333331, 0.66666666666666696]\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "print 'A:', A.normalize(means)\n", 286 | "print 'B:',B.normalize(means)\n", 287 | "print 'C:',C.normalize(means)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "(b) A few options I can think of: median (of differences), length (or norm), max etc. In all cases, the interpretation of a small angle (note that cosine lies between -1 and 1) means that the two vectors are similarly directed. To be similarly directed in this context of normalized components implies that the items are similarly dispersed about the average." 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "\n", 302 | "# 9.2.3 " 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 123, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "# ordered [A,B,C]\n", 314 | "user_ratings = [4,2,5]" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "(a) normalizing the ratings for this user" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 116, 327 | "metadata": { 328 | "collapsed": false 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "avg_rating = np.mean(user_ratings)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 118, 338 | "metadata": { 339 | "collapsed": false 340 | }, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "[0.33333333333333348, -1.6666666666666665, 1.3333333333333335]" 346 | ] 347 | }, 348 | "execution_count": 118, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "# normalize the ratings for this user\n", 355 | "[rate - avg_rating for rate in user_ratings]" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "(b) constructing a user profile from the items profiles\n", 363 | "\n", 364 | "*I use the weights rating/5 *" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 119, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "weights = [rate/5 for rate in user_ratings]" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 126, 381 | "metadata": { 382 | "collapsed": false 383 | }, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "text/plain": [ 388 | "[0.8, 0.4, 1.0]" 389 | ] 390 | }, 391 | "execution_count": 126, 392 | "metadata": {}, 393 | "output_type": "execute_result" 394 | } 395 | ], 396 | "source": [ 397 | "weights" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 124, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "user_profile = {}\n", 409 | "user_profile['proc_speed'] = sum([wt*proc for wt,proc in zip(weights,[A.processor_speed,\n", 410 | " B.processor_speed,C.processor_speed])])\n", 411 | "user_profile['disk'] = 
sum([wt*disk for wt,disk in zip(weights,[A.disk,B.disk,C.disk])])\n", 412 | "user_profile['main_memory'] = sum([wt*mm for wt,mm in zip(weights,[A.main_memory,B.main_memory,\n", 413 | " C.main_memory])])" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 125, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/plain": [ 426 | "{'disk': 1168.0, 'main_memory': 12.4, 'proc_speed': 6.44}" 427 | ] 428 | }, 429 | "execution_count": 125, 430 | "metadata": {}, 431 | "output_type": "execute_result" 432 | } 433 | ], 434 | "source": [ 435 | "user_profile" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "Alternatively, can use the following weights." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 128, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [ 452 | { 453 | "data": { 454 | "text/plain": [ 455 | "[0.36363636363636365, 0.18181818181818182, 0.45454545454545453]" 456 | ] 457 | }, 458 | "execution_count": 128, 459 | "metadata": {}, 460 | "output_type": "execute_result" 461 | } 462 | ], 463 | "source": [ 464 | "weights = [rate/sum(user_ratings) for rate in user_ratings]\n", 465 | "weights" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 129, 471 | "metadata": { 472 | "collapsed": true 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "user_profile = {}\n", 477 | "user_profile['proc_speed'] = sum([wt*proc for wt,proc in zip(weights,[A.processor_speed,\n", 478 | " B.processor_speed,C.processor_speed])])\n", 479 | "user_profile['disk'] = sum([wt*disk for wt,disk in zip(weights,[A.disk,B.disk,C.disk])])\n", 480 | "user_profile['main_memory'] = sum([wt*mm for wt,mm in zip(weights,[A.main_memory,B.main_memory,\n", 481 | " C.main_memory])])" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 130, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "{'disk': 530.9090909090909,\n", 495 | " 'main_memory': 5.636363636363637,\n", 496 | " 'proc_speed': 2.9272727272727272}" 497 | ] 498 | }, 499 | "execution_count": 130, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "user_profile" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "This user_profile supplies aggregates that are within the support of each component." 
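The same weighted aggregation can also be written as a single `np.average` call. This is a minimal sketch, assuming the item objects `A`, `B`, `C` and the `user_ratings` list from the cells above are still in scope; the stacked `features` array and the `profile` name are illustrative only:

```python
import numpy as np

# stack the three item profiles: rows = items, columns = features
features = np.array([[A.processor_speed, A.disk, A.main_memory],
                     [B.processor_speed, B.disk, B.main_memory],
                     [C.processor_speed, C.disk, C.main_memory]])

# weights proportional to the ratings, normalized to sum to 1
weights = [rate / float(sum(user_ratings)) for rate in user_ratings]

# with weights summing to 1 this is a convex combination, so each aggregate
# stays inside the range spanned by the corresponding item feature;
# it should reproduce the user_profile dictionary computed above
profile = np.average(features, axis=0, weights=weights)
print dict(zip(['proc_speed', 'disk', 'main_memory'], profile))
```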
513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "\n", 520 | "# 9.3.1 " 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "(a) Jaccard similarities: SIM(A,B) = 4/8; SIM(A,C) = 3/8; SIM(B,C) = 4/8" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "(b) Cosine distance:" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 132, 540 | "metadata": { 541 | "collapsed": false 542 | }, 543 | "outputs": [], 544 | "source": [ 545 | "U = np.array([4,5,0,5,1,0,3,2,0,3,4,3,1,2,1,0,2,0,1,3,0,4,5,3]).reshape(3,8)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 147, 551 | "metadata": { 552 | "collapsed": false 553 | }, 554 | "outputs": [], 555 | "source": [ 556 | "# user ratings are rows of Utility matrix\n", 557 | "A = U[0]\n", 558 | "B = U[1]\n", 559 | "C = U[2]" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 148, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "def cosine(X,Y):\n", 571 | " return np.dot(X,Y)/(np.sqrt(np.dot(X,X)*np.dot(Y,Y)))" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 149, 577 | "metadata": { 578 | "collapsed": false 579 | }, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "A,B: 0.601040764009\n", 586 | "A,C: 0.614918693812\n", 587 | "B,C: 0.513870119777\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "print 'A,B:', cosine(A,B)\n", 593 | "print 'A,C:', cosine(A,C)\n", 594 | "print 'B,C:', cosine(B,C)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 150, 600 | "metadata": { 601 | "collapsed": false 602 | }, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/plain": [ 607 | "array([[4, 5, 0, 5, 1, 0, 3, 2],\n", 608 | " [0, 3, 4, 3, 1, 2, 1, 0],\n", 609 | " [2, 0, 1, 3, 0, 4, 5, 3]])" 610 | ] 611 | }, 612 | "execution_count": 150, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "U" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "(c) Rounding data in the utility matrix and computing Jaccard distance" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 151, 631 | "metadata": { 632 | "collapsed": true 633 | }, 634 | "outputs": [], 635 | "source": [ 636 | "ratings = [4,5,0,5,1,0,3,2,0,3,4,3,1,2,1,0,2,0,1,3,0,4,5,3]" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 154, 642 | "metadata": { 643 | "collapsed": false 644 | }, 645 | "outputs": [], 646 | "source": [ 647 | "# mapping 3,4,5 to 1 and 1,2 to 0\n", 648 | "U_rounded = np.array(map(lambda x: 1 if x>=3 else 0, ratings)).reshape(3,8)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 155, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [ 658 | { 659 | "data": { 660 | "text/plain": [ 661 | "array([[1, 1, 0, 1, 0, 0, 1, 0],\n", 662 | " [0, 1, 1, 1, 0, 0, 0, 0],\n", 663 | " [0, 0, 0, 1, 0, 1, 1, 1]])" 664 | ] 665 | }, 666 | "execution_count": 155, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "U_rounded" 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "Jaccard distances: sim(A,B) = 2/5; sim(A,C) = 2/6; sim(B,C) = 1/6" 
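These values (they are Jaccard similarities, i.e. one minus the Jaccard distance) are easy to verify in code. A small sketch, assuming `U_rounded` from the cell above is in scope; the helper name `jaccard_sim` is illustrative only:

```python
def jaccard_sim(x, y):
    # treat nonzero entries as set membership
    inter = sum(1 for a, b in zip(x, y) if a and b)
    union = sum(1 for a, b in zip(x, y) if a or b)
    return float(inter) / union

print 'A,B:', jaccard_sim(U_rounded[0], U_rounded[1])   # 2/5 = 0.4
print 'A,C:', jaccard_sim(U_rounded[0], U_rounded[2])   # 2/6 ~ 0.333
print 'B,C:', jaccard_sim(U_rounded[1], U_rounded[2])   # 1/6 ~ 0.167
```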
680 | ] 681 | }, 682 | { 683 | "cell_type": "markdown", 684 | "metadata": {}, 685 | "source": [ 686 | "(d) computing cosine similarities for rounded data" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": 156, 692 | "metadata": { 693 | "collapsed": false 694 | }, 695 | "outputs": [ 696 | { 697 | "name": "stdout", 698 | "output_type": "stream", 699 | "text": [ 700 | "A,B: 0.57735026919\n", 701 | "A,C: 0.5\n", 702 | "B,C: 0.288675134595\n" 703 | ] 704 | } 705 | ], 706 | "source": [ 707 | "A_rd = U_rounded[0]\n", 708 | "B_rd = U_rounded[1]\n", 709 | "C_rd = U_rounded[2]\n", 710 | "\n", 711 | "print 'A,B:', cosine(A_rd,B_rd)\n", 712 | "print 'A,C:', cosine(A_rd,C_rd)\n", 713 | "print 'B,C:', cosine(B_rd,C_rd)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [ 720 | "(e) Normalizing the matrix by user ratings: *subtract from each nonblank entry the average value for its user*" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 167, 726 | "metadata": { 727 | "collapsed": false 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "A_norm = map(lambda x: x-np.mean(A) if x>0 else 0, A)\n", 732 | "B_norm = map(lambda x: x-np.mean(B) if x>0 else 0, B)\n", 733 | "C_norm = map(lambda x: x-np.mean(C) if x>0 else 0, C)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 165, 739 | "metadata": { 740 | "collapsed": false 741 | }, 742 | "outputs": [], 743 | "source": [ 744 | "U_norm = np.array([A_norm,B_norm,C_norm])" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 166, 750 | "metadata": { 751 | "collapsed": false 752 | }, 753 | "outputs": [ 754 | { 755 | "data": { 756 | "text/plain": [ 757 | "array([[ 1.5 , 2.5 , 0. , 2.5 , -1.5 , 0. , 0.5 , -0.5 ],\n", 758 | " [ 0. , 1.25, 2.25, 1.25, -0.75, 0.25, -0.75, 0. ],\n", 759 | " [-0.25, 0. , -1.25, 0.75, 0. 
, 1.75, 2.75, 0.75]])" 760 | ] 761 | }, 762 | "execution_count": 166, 763 | "metadata": {}, 764 | "output_type": "execute_result" 765 | } 766 | ], 767 | "source": [ 768 | "U_norm" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "(e) Computing the cosine distance between each pair of users" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 168, 781 | "metadata": { 782 | "collapsed": false 783 | }, 784 | "outputs": [ 785 | { 786 | "name": "stdout", 787 | "output_type": "stream", 788 | "text": [ 789 | "A,B: 0.546504040851\n", 790 | "A,C: 0.163408291384\n", 791 | "B,C: -0.312561520424\n" 792 | ] 793 | } 794 | ], 795 | "source": [ 796 | "print 'A,B:', cosine(A_norm,B_norm)\n", 797 | "print 'A,C:', cosine(A_norm,C_norm)\n", 798 | "print 'B,C:', cosine(B_norm,C_norm)" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": {}, 804 | "source": [ 805 | "\n", 806 | "# 9.4.1 and 9.4.2 (UV Decomposition) " 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": 280, 812 | "metadata": { 813 | "collapsed": false 814 | }, 815 | "outputs": [], 816 | "source": [ 817 | "U = np.array([1]*10).reshape(5,2)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 281, 823 | "metadata": { 824 | "collapsed": false 825 | }, 826 | "outputs": [], 827 | "source": [ 828 | "V = np.array([1]*10).reshape(2,5)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 282, 834 | "metadata": { 835 | "collapsed": false 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "M = np.array([5,2,4,4,3,3,1,2,4,1,2,99,3,1,4,2,5,4,3,5,4,4,5,4,99]).reshape(5,5)" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 283, 845 | "metadata": { 846 | "collapsed": false 847 | }, 848 | "outputs": [ 849 | { 850 | "data": { 851 | "text/plain": [ 852 | "array([[ 5, 2, 4, 4, 3],\n", 853 | " [ 3, 1, 2, 4, 1],\n", 854 | " [ 2, 99, 3, 1, 4],\n", 855 | " [ 2, 5, 4, 3, 5],\n", 856 | " [ 4, 4, 5, 4, 99]])" 857 | ] 858 | }, 859 | "execution_count": 283, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "M" 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": {}, 871 | "source": [ 872 | "**Gradient descent**" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 311, 878 | "metadata": { 879 | "collapsed": true 880 | }, 881 | "outputs": [], 882 | "source": [ 883 | "def opt_x(r,s):\n", 884 | " num = 0\n", 885 | " den = 0\n", 886 | " for j in range(1,6):\n", 887 | " if M[r-1,j-1] != 99:\n", 888 | " num += V[s-1,j-1]*(M[r-1,j-1]-np.dot(U[r-1,:],V[:,j-1])\n", 889 | " +U[r-1,s-1]*V[s-1,j-1]) # add back k=r\n", 890 | " den += V[s-1,j-1]**2\n", 891 | " return num/den" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": 324, 897 | "metadata": { 898 | "collapsed": true 899 | }, 900 | "outputs": [], 901 | "source": [ 902 | "def opt_y(r,s):\n", 903 | " num = 0\n", 904 | " den = 0\n", 905 | " for i in range(1,6):\n", 906 | " if M[i-1,s-1] != 99:\n", 907 | " num += U[i-1,r-1]*(M[i-1,s-1]-np.dot(U[i-1,:],V[:,s-1])\n", 908 | " +U[i-1,r-1]*V[r-1,s-1]) # add back when k=s\n", 909 | " den += U[i-1,r-1]**2\n", 910 | " return num/den" 911 | ] 912 | }, 913 | { 914 | "cell_type": "markdown", 915 | "metadata": {}, 916 | "source": [ 917 | "## 9.4.1 (a) and (b) " 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": 325, 923 | "metadata": { 924 | 
"collapsed": false 925 | }, 926 | "outputs": [ 927 | { 928 | "data": { 929 | "text/plain": [ 930 | "1.5" 931 | ] 932 | }, 933 | "execution_count": 325, 934 | "metadata": {}, 935 | "output_type": "execute_result" 936 | } 937 | ], 938 | "source": [ 939 | "opt_x(3,2)" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": 326, 945 | "metadata": { 946 | "collapsed": false 947 | }, 948 | "outputs": [ 949 | { 950 | "data": { 951 | "text/plain": [ 952 | "2.2000000000000002" 953 | ] 954 | }, 955 | "execution_count": 326, 956 | "metadata": {}, 957 | "output_type": "execute_result" 958 | } 959 | ], 960 | "source": [ 961 | "opt_y(1,4)" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "## Rest of solution to 9.4.2 " 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 384, 974 | "metadata": { 975 | "collapsed": false 976 | }, 977 | "outputs": [], 978 | "source": [ 979 | "def RMSE(r,s,axis):\n", 980 | " \"\"\"\n", 981 | " if axis=0 then we are starting the decomposition with u_{r,s},\n", 982 | " if axis=1, then start decomposition with v_{r,s}\n", 983 | " \"\"\"\n", 984 | " assert axis == 0 or axis == 1\n", 985 | " if axis == 0:\n", 986 | " x = opt_x(r,s)\n", 987 | " # contribution of mse due to the r-th row of UV\n", 988 | " mse = sum(map(lambda u: (u-(x+1))**2 if u!=99 else 0, M[r-1,:]))\n", 989 | " # contribution of mse due to the other rows of UV\n", 990 | " mse += sum(sum(map(lambda u: (u-2)**2 if u!=99 else 0, M[i,:])) for i in range(0,5) if i != r-1)\n", 991 | " return np.sqrt(mse)\n", 992 | " else:\n", 993 | " y = opt_y(r,s)\n", 994 | " # contribution of mse due to the s-th row of UV\n", 995 | " mse = sum(map(lambda u: (u-(y+1))**2 if u!=99 else 0, M[:,s-1]))\n", 996 | " mse += sum(sum(map(lambda u: (u-2)**2 if u!=99 else 0, M[:,j])) for j in range(0,5) if j != s-1)\n", 997 | " return np.sqrt(mse)" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": 385, 1003 | "metadata": { 1004 | "collapsed": false 1005 | }, 1006 | "outputs": [ 1007 | { 1008 | "data": { 1009 | "text/plain": [ 1010 | "7.8866976612521418" 1011 | ] 1012 | }, 1013 | "execution_count": 385, 1014 | "metadata": {}, 1015 | "output_type": "execute_result" 1016 | } 1017 | ], 1018 | "source": [ 1019 | "RMSE(1,1,0)" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": 386, 1025 | "metadata": { 1026 | "collapsed": false 1027 | }, 1028 | "outputs": [ 1029 | { 1030 | "data": { 1031 | "text/plain": [ 1032 | "7.8866976612521418" 1033 | ] 1034 | }, 1035 | "execution_count": 386, 1036 | "metadata": {}, 1037 | "output_type": "execute_result" 1038 | } 1039 | ], 1040 | "source": [ 1041 | "# manual check for starting UV decomposition with u_{1,1}\n", 1042 | "np.sqrt(sum((M[0,:]-3.6)**2+(M[1,:]-2)**2+(M[3,:]-2)**2)\n", 1043 | "+sum([(m-2)**2 for m in M[2,:] if m!=99])+sum([(m-2)**2 for m in M[4,:] if m!=99]))" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": 392, 1049 | "metadata": { 1050 | "collapsed": false 1051 | }, 1052 | "outputs": [ 1053 | { 1054 | "name": "stdout", 1055 | "output_type": "stream", 1056 | "text": [ 1057 | "minimum RMSE from U: 7.3993 occurring from: [5, 1]\n" 1058 | ] 1059 | } 1060 | ], 1061 | "source": [ 1062 | "# finding the pair (r,s) that achieves the min RMSE after starting decomposition with u_{r,s}\n", 1063 | "min_RMSE = 2**32\n", 1064 | "min_pair = []\n", 1065 | "for r in range(1,6):\n", 1066 | " for s in range(1,3):\n", 1067 | " 
step_RMSE = RMSE(r,s,0)\n", 1068 | " if step_RMSE < min_RMSE:\n", 1069 | " min_RMSE, min_pair = step_RMSE, [r,s]\n", 1070 | "\n", 1071 | "print 'minimum RMSE from U: %.4f occurring from: %s' %(min_RMSE, str(min_pair))\n" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 393, 1077 | "metadata": { 1078 | "collapsed": false 1079 | }, 1080 | "outputs": [ 1081 | { 1082 | "name": "stdout", 1083 | "output_type": "stream", 1084 | "text": [ 1085 | "minimum RMSE from V: 7.8867 occurring from: [1, 3]\n" 1086 | ] 1087 | } 1088 | ], 1089 | "source": [ 1090 | "# finding the pair (r,s) that achieves the min RMSE after starting decomposition with v_{r,s}\n", 1091 | "min_RMSE = 2**32\n", 1092 | "min_pair = []\n", 1093 | "for r in range(1,3):\n", 1094 | " for s in range(1,6):\n", 1095 | " step_RMSE = RMSE(r,s,1)\n", 1096 | " if step_RMSE < min_RMSE:\n", 1097 | " min_RMSE, min_pair = step_RMSE, [r,s]\n", 1098 | "\n", 1099 | "print 'minimum RMSE from V: %.4f occurring from: %s' %(min_RMSE, str(min_pair))" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "markdown", 1104 | "metadata": {}, 1105 | "source": [ 1106 | "The above shows that the minimum RMSE from all possible starting points is 7.3993 which we obtain by starting the decomposition at u_{5,1}. *Note that the above code only finds one such pair that results in the minimum*." 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "execution_count": 406, 1112 | "metadata": { 1113 | "collapsed": false 1114 | }, 1115 | "outputs": [ 1116 | { 1117 | "data": { 1118 | "text/plain": [ 1119 | "7.399324293474371" 1120 | ] 1121 | }, 1122 | "execution_count": 406, 1123 | "metadata": {}, 1124 | "output_type": "execute_result" 1125 | } 1126 | ], 1127 | "source": [ 1128 | "RMSE(5,2,0)" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": 408, 1134 | "metadata": { 1135 | "collapsed": false 1136 | }, 1137 | "outputs": [ 1138 | { 1139 | "data": { 1140 | "text/plain": [ 1141 | "7.6681158050723255" 1142 | ] 1143 | }, 1144 | "execution_count": 408, 1145 | "metadata": {}, 1146 | "output_type": "execute_result" 1147 | } 1148 | ], 1149 | "source": [ 1150 | "RMSE(4,2,0)" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "markdown", 1155 | "metadata": {}, 1156 | "source": [ 1157 | "For example, u_{5,2} also achieves the minimum RMSE, whereas starting from u_{4,2} does not." 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "markdown", 1162 | "metadata": {}, 1163 | "source": [ 1164 | "\n", 1165 | "# 9.4.3 " 1166 | ] 1167 | }, 1168 | { 1169 | "cell_type": "code", 1170 | "execution_count": 430, 1171 | "metadata": { 1172 | "collapsed": false 1173 | }, 1174 | "outputs": [ 1175 | { 1176 | "data": { 1177 | "text/plain": [ 1178 | "array([[ 2.6 , 1. ],\n", 1179 | " [ 1. , 1. ],\n", 1180 | " [ 1.178, 1. ],\n", 1181 | " [ 1. , 1. ],\n", 1182 | " [ 1. , 1. ]])" 1183 | ] 1184 | }, 1185 | "execution_count": 430, 1186 | "metadata": {}, 1187 | "output_type": "execute_result" 1188 | } 1189 | ], 1190 | "source": [ 1191 | "U = np.array([2.6,1,1,1,1.178,1,1,1,1,1]).reshape(5,2)\n", 1192 | "U" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": 431, 1198 | "metadata": { 1199 | "collapsed": false 1200 | }, 1201 | "outputs": [ 1202 | { 1203 | "data": { 1204 | "text/plain": [ 1205 | "array([[ 1.617, 1. , 1. , 1. , 1. ],\n", 1206 | " [ 1. , 1. , 1. , 1. , 1. 
]])" 1207 | ] 1208 | }, 1209 | "execution_count": 431, 1210 | "metadata": {}, 1211 | "output_type": "execute_result" 1212 | } 1213 | ], 1214 | "source": [ 1215 | "V = np.array([1.617,1,1,1,1,1,1,1,1,1]).reshape(2,5)\n", 1216 | "V" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "markdown", 1221 | "metadata": {}, 1222 | "source": [ 1223 | "Can do matrix multiplication using numpy's dot() method" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": 432, 1229 | "metadata": { 1230 | "collapsed": false 1231 | }, 1232 | "outputs": [ 1233 | { 1234 | "data": { 1235 | "text/plain": [ 1236 | "array([[ 5.2042 , 3.6 , 3.6 , 3.6 , 3.6 ],\n", 1237 | " [ 2.617 , 2. , 2. , 2. , 2. ],\n", 1238 | " [ 2.904826, 2.178 , 2.178 , 2.178 , 2.178 ],\n", 1239 | " [ 2.617 , 2. , 2. , 2. , 2. ],\n", 1240 | " [ 2.617 , 2. , 2. , 2. , 2. ]])" 1241 | ] 1242 | }, 1243 | "execution_count": 432, 1244 | "metadata": {}, 1245 | "output_type": "execute_result" 1246 | } 1247 | ], 1248 | "source": [ 1249 | "UV = np.dot(U,V)\n", 1250 | "UV" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "markdown", 1255 | "metadata": {}, 1256 | "source": [ 1257 | "We can use functions similar to `opt_x` and `opt_y` from the previous question. In this function, opt_x and opt_y will alter the U and V matrices in addition to returing the optimal values." 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": 433, 1263 | "metadata": { 1264 | "collapsed": true 1265 | }, 1266 | "outputs": [], 1267 | "source": [ 1268 | "def opt_x(r,s):\n", 1269 | " num = 0\n", 1270 | " den = 0\n", 1271 | " for j in range(1,6):\n", 1272 | " if M[r-1,j-1] != 99:\n", 1273 | " num += V[s-1,j-1]*(M[r-1,j-1]-np.dot(U[r-1,:],V[:,j-1])\n", 1274 | " +U[r-1,s-1]*V[s-1,j-1]) # add back k=r\n", 1275 | " den += V[s-1,j-1]**2\n", 1276 | " U[r-1,s-1] = num/den\n", 1277 | " return num/den\n", 1278 | "\n", 1279 | "def opt_y(r,s):\n", 1280 | " num = 0\n", 1281 | " den = 0\n", 1282 | " for i in range(1,6):\n", 1283 | " if M[i-1,s-1] != 99:\n", 1284 | " num += U[i-1,r-1]*(M[i-1,s-1]-np.dot(U[i-1,:],V[:,s-1])\n", 1285 | " +U[i-1,r-1]*V[r-1,s-1]) # add back when k=s\n", 1286 | " den += U[i-1,r-1]**2\n", 1287 | " V[r-1,s-1] = num/den\n", 1288 | " return num/den" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "execution_count": 434, 1294 | "metadata": { 1295 | "collapsed": true 1296 | }, 1297 | "outputs": [], 1298 | "source": [ 1299 | "def RMSE_general():\n", 1300 | " UV = np.dot(U,V)\n", 1301 | " mse = 0\n", 1302 | " for i in range(5):\n", 1303 | " for j in range(5):\n", 1304 | " if M[i,j] != 99:\n", 1305 | " mse += (M[i,j]-UV[i,j])**2\n", 1306 | " return np.sqrt(mse)" 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "code", 1311 | "execution_count": 435, 1312 | "metadata": { 1313 | "collapsed": false 1314 | }, 1315 | "outputs": [ 1316 | { 1317 | "data": { 1318 | "text/plain": [ 1319 | "7.6107507336842932" 1320 | ] 1321 | }, 1322 | "execution_count": 435, 1323 | "metadata": {}, 1324 | "output_type": "execute_result" 1325 | } 1326 | ], 1327 | "source": [ 1328 | "# Current RMSE\n", 1329 | "RMSE_general()" 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "markdown", 1334 | "metadata": {}, 1335 | "source": [ 1336 | "(a) Considering u_{1,1} as the element to update " 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": 436, 1342 | "metadata": { 1343 | "collapsed": false 1344 | }, 1345 | "outputs": [ 1346 | { 1347 | "data": { 1348 | "text/plain": [ 1349 | "2.3384319353487366" 1350 | ] 1351 | }, 1352 
| "execution_count": 436, 1353 | "metadata": {}, 1354 | "output_type": "execute_result" 1355 | } 1356 | ], 1357 | "source": [ 1358 | "# update U with optimal x and print out optimal x\n", 1359 | "opt_x(1,1)" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "code", 1364 | "execution_count": 437, 1365 | "metadata": { 1366 | "collapsed": false 1367 | }, 1368 | "outputs": [ 1369 | { 1370 | "data": { 1371 | "text/plain": [ 1372 | "array([[ 2.33843194, 1. ],\n", 1373 | " [ 1. , 1. ],\n", 1374 | " [ 1.178 , 1. ],\n", 1375 | " [ 1. , 1. ],\n", 1376 | " [ 1. , 1. ]])" 1377 | ] 1378 | }, 1379 | "execution_count": 437, 1380 | "metadata": {}, 1381 | "output_type": "execute_result" 1382 | } 1383 | ], 1384 | "source": [ 1385 | "U" 1386 | ] 1387 | }, 1388 | { 1389 | "cell_type": "code", 1390 | "execution_count": 439, 1391 | "metadata": { 1392 | "collapsed": false 1393 | }, 1394 | "outputs": [ 1395 | { 1396 | "data": { 1397 | "text/plain": [ 1398 | "7.5809606194928714" 1399 | ] 1400 | }, 1401 | "execution_count": 439, 1402 | "metadata": {}, 1403 | "output_type": "execute_result" 1404 | } 1405 | ], 1406 | "source": [ 1407 | "# check new RMSE has decreased\n", 1408 | "RMSE_general()" 1409 | ] 1410 | }, 1411 | { 1412 | "cell_type": "markdown", 1413 | "metadata": {}, 1414 | "source": [ 1415 | "(b) Then choose the best value for u_{5,2}" 1416 | ] 1417 | }, 1418 | { 1419 | "cell_type": "code", 1420 | "execution_count": 440, 1421 | "metadata": { 1422 | "collapsed": false 1423 | }, 1424 | "outputs": [ 1425 | { 1426 | "data": { 1427 | "text/plain": [ 1428 | "array([[ 2.33843194, 1. ],\n", 1429 | " [ 1. , 1. ],\n", 1430 | " [ 1.178 , 1. ],\n", 1431 | " [ 1. , 1. ],\n", 1432 | " [ 1. , 3.09575 ]])" 1433 | ] 1434 | }, 1435 | "execution_count": 440, 1436 | "metadata": {}, 1437 | "output_type": "execute_result" 1438 | } 1439 | ], 1440 | "source": [ 1441 | "opt_x(5,2)\n", 1442 | "U" 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "code", 1447 | "execution_count": 444, 1448 | "metadata": { 1449 | "collapsed": false 1450 | }, 1451 | "outputs": [ 1452 | { 1453 | "data": { 1454 | "text/plain": [ 1455 | "6.3168260751980299" 1456 | ] 1457 | }, 1458 | "execution_count": 444, 1459 | "metadata": {}, 1460 | "output_type": "execute_result" 1461 | } 1462 | ], 1463 | "source": [ 1464 | "# checking if RMSE was decreased\n", 1465 | "RMSE_general()" 1466 | ] 1467 | }, 1468 | { 1469 | "cell_type": "markdown", 1470 | "metadata": {}, 1471 | "source": [ 1472 | "(c) Next, choosing the best value for v_{2,2}" 1473 | ] 1474 | }, 1475 | { 1476 | "cell_type": "code", 1477 | "execution_count": 445, 1478 | "metadata": { 1479 | "collapsed": false 1480 | }, 1481 | "outputs": [ 1482 | { 1483 | "data": { 1484 | "text/plain": [ 1485 | "array([[ 1.617, 1. , 1. , 1. , 1. ],\n", 1486 | " [ 1. , 1. , 1. , 1. , 1. ]])" 1487 | ] 1488 | }, 1489 | "execution_count": 445, 1490 | "metadata": {}, 1491 | "output_type": "execute_result" 1492 | } 1493 | ], 1494 | "source": [ 1495 | "V" 1496 | ] 1497 | }, 1498 | { 1499 | "cell_type": "code", 1500 | "execution_count": 446, 1501 | "metadata": { 1502 | "collapsed": false 1503 | }, 1504 | "outputs": [ 1505 | { 1506 | "data": { 1507 | "text/plain": [ 1508 | "array([[ 1.617 , 1. , 1. , 1. , 1. ],\n", 1509 | " [ 1. , 1.02901777, 1. , 1. , 1. 
]])" 1510 | ] 1511 | }, 1512 | "execution_count": 446, 1513 | "metadata": {}, 1514 | "output_type": "execute_result" 1515 | } 1516 | ], 1517 | "source": [ 1518 | "# updating V at v_{2,2}\n", 1519 | "opt_y(2,2)\n", 1520 | "V" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 447, 1526 | "metadata": { 1527 | "collapsed": false 1528 | }, 1529 | "outputs": [ 1530 | { 1531 | "data": { 1532 | "text/plain": [ 1533 | "6.3159873198925407" 1534 | ] 1535 | }, 1536 | "execution_count": 447, 1537 | "metadata": {}, 1538 | "output_type": "execute_result" 1539 | } 1540 | ], 1541 | "source": [ 1542 | "# checking that RMSE indeed decreased\n", 1543 | "RMSE_general()" 1544 | ] 1545 | }, 1546 | { 1547 | "cell_type": "markdown", 1548 | "metadata": {}, 1549 | "source": [ 1550 | "**BONUS STEP: choosing best value for v_{1,5}**" 1551 | ] 1552 | }, 1553 | { 1554 | "cell_type": "code", 1555 | "execution_count": 449, 1556 | "metadata": { 1557 | "collapsed": false 1558 | }, 1559 | "outputs": [ 1560 | { 1561 | "data": { 1562 | "text/plain": [ 1563 | "array([[ 1.617 , 1. , 1. , 1. , 1.37883194],\n", 1564 | " [ 1. , 1.02901777, 1. , 1. , 1. ]])" 1565 | ] 1566 | }, 1567 | "execution_count": 449, 1568 | "metadata": {}, 1569 | "output_type": "execute_result" 1570 | } 1571 | ], 1572 | "source": [ 1573 | "opt_y(1,5)\n", 1574 | "V" 1575 | ] 1576 | }, 1577 | { 1578 | "cell_type": "code", 1579 | "execution_count": 450, 1580 | "metadata": { 1581 | "collapsed": false 1582 | }, 1583 | "outputs": [ 1584 | { 1585 | "data": { 1586 | "text/plain": [ 1587 | "6.2145592358668278" 1588 | ] 1589 | }, 1590 | "execution_count": 450, 1591 | "metadata": {}, 1592 | "output_type": "execute_result" 1593 | } 1594 | ], 1595 | "source": [ 1596 | "RMSE_general()" 1597 | ] 1598 | }, 1599 | { 1600 | "cell_type": "markdown", 1601 | "metadata": {}, 1602 | "source": [ 1603 | "\n", 1604 | "# 9.4.5 (Normalizing the Utility Matrix)" 1605 | ] 1606 | }, 1607 | { 1608 | "cell_type": "code", 1609 | "execution_count": 519, 1610 | "metadata": { 1611 | "collapsed": false 1612 | }, 1613 | "outputs": [ 1614 | { 1615 | "data": { 1616 | "text/plain": [ 1617 | "array([[ 5, 2, 4, 4, 3],\n", 1618 | " [ 3, 1, 2, 4, 1],\n", 1619 | " [ 2, 99, 3, 1, 4],\n", 1620 | " [ 2, 5, 4, 3, 5],\n", 1621 | " [ 4, 4, 5, 4, 99]])" 1622 | ] 1623 | }, 1624 | "execution_count": 519, 1625 | "metadata": {}, 1626 | "output_type": "execute_result" 1627 | } 1628 | ], 1629 | "source": [ 1630 | "M" 1631 | ] 1632 | }, 1633 | { 1634 | "cell_type": "markdown", 1635 | "metadata": {}, 1636 | "source": [ 1637 | "### (a) First subtract from each element the average of its row, and then subtract from each element the average of its (modified) column" 1638 | ] 1639 | }, 1640 | { 1641 | "cell_type": "markdown", 1642 | "metadata": {}, 1643 | "source": [ 1644 | "### Step 1" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | "execution_count": 520, 1650 | "metadata": { 1651 | "collapsed": false 1652 | }, 1653 | "outputs": [ 1654 | { 1655 | "data": { 1656 | "text/plain": [ 1657 | "array([[5, 2, 4, 4, 3],\n", 1658 | " [3, 1, 2, 4, 1],\n", 1659 | " [2, 0, 3, 1, 4],\n", 1660 | " [2, 5, 4, 3, 5],\n", 1661 | " [4, 4, 5, 4, 0]])" 1662 | ] 1663 | }, 1664 | "execution_count": 520, 1665 | "metadata": {}, 1666 | "output_type": "execute_result" 1667 | } 1668 | ], 1669 | "source": [ 1670 | "# changing 99 values to 0 so that they don't affect the row sums\n", 1671 | "(M!=99)*M" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": 521, 1677 | "metadata": 
{ 1678 | "collapsed": false 1679 | }, 1680 | "outputs": [ 1681 | { 1682 | "data": { 1683 | "text/plain": [ 1684 | "array([18, 11, 10, 19, 17])" 1685 | ] 1686 | }, 1687 | "execution_count": 521, 1688 | "metadata": {}, 1689 | "output_type": "execute_result" 1690 | } 1691 | ], 1692 | "source": [ 1693 | "# row sums of the above matrix\n", 1694 | "foo = np.sum((M!=99)*M,1)\n", 1695 | "foo" 1696 | ] 1697 | }, 1698 | { 1699 | "cell_type": "code", 1700 | "execution_count": 522, 1701 | "metadata": { 1702 | "collapsed": false 1703 | }, 1704 | "outputs": [ 1705 | { 1706 | "data": { 1707 | "text/plain": [ 1708 | "array([5, 5, 4, 5, 4])" 1709 | ] 1710 | }, 1711 | "execution_count": 522, 1712 | "metadata": {}, 1713 | "output_type": "execute_result" 1714 | } 1715 | ], 1716 | "source": [ 1717 | "# number of ratings per user (i.e. per row)\n", 1718 | "bar = np.sum((M!=99),1)\n", 1719 | "bar" 1720 | ] 1721 | }, 1722 | { 1723 | "cell_type": "code", 1724 | "execution_count": 523, 1725 | "metadata": { 1726 | "collapsed": false 1727 | }, 1728 | "outputs": [ 1729 | { 1730 | "data": { 1731 | "text/plain": [ 1732 | "array([ 3.6 , 2.2 , 2.5 , 3.8 , 4.25])" 1733 | ] 1734 | }, 1735 | "execution_count": 523, 1736 | "metadata": {}, 1737 | "output_type": "execute_result" 1738 | } 1739 | ], 1740 | "source": [ 1741 | "# row averages\n", 1742 | "row_averages = foo/bar # elementwise division\n", 1743 | "row_averages" 1744 | ] 1745 | }, 1746 | { 1747 | "cell_type": "code", 1748 | "execution_count": 524, 1749 | "metadata": { 1750 | "collapsed": false 1751 | }, 1752 | "outputs": [ 1753 | { 1754 | "data": { 1755 | "text/plain": [ 1756 | "array([[ 1.4 , -1.6 , 0.4 , 0.4 , -0.6 ],\n", 1757 | " [ 0.8 , -1.2 , -0.2 , 1.8 , -1.2 ],\n", 1758 | " [ -0.5 , 99. , 0.5 , -1.5 , 1.5 ],\n", 1759 | " [ -1.8 , 1.2 , 0.2 , -0.8 , 1.2 ],\n", 1760 | " [ -0.25, -0.25, 0.75, -0.25, 99. ]])" 1761 | ] 1762 | }, 1763 | "execution_count": 524, 1764 | "metadata": {}, 1765 | "output_type": "execute_result" 1766 | } 1767 | ], 1768 | "source": [ 1769 | "M_step1 = []\n", 1770 | "for i,row in enumerate(M):\n", 1771 | " M_step1 += map(lambda x: x-row_averages[i] if x!=99 else 99, row)\n", 1772 | "\n", 1773 | "M_step1 = np.array(M_step1).reshape(5,5)\n", 1774 | "M_step1" 1775 | ] 1776 | }, 1777 | { 1778 | "cell_type": "markdown", 1779 | "metadata": {}, 1780 | "source": [ 1781 | "### Step 2 " 1782 | ] 1783 | }, 1784 | { 1785 | "cell_type": "code", 1786 | "execution_count": 525, 1787 | "metadata": { 1788 | "collapsed": false 1789 | }, 1790 | "outputs": [ 1791 | { 1792 | "data": { 1793 | "text/plain": [ 1794 | "array([-0.35, -1.85, 1.65, -0.35, 0.9 ])" 1795 | ] 1796 | }, 1797 | "execution_count": 525, 1798 | "metadata": {}, 1799 | "output_type": "execute_result" 1800 | } 1801 | ], 1802 | "source": [ 1803 | "# column sums of the above matrix\n", 1804 | "foo = np.sum((M_step1!=99)*M_step1,0)\n", 1805 | "foo" 1806 | ] 1807 | }, 1808 | { 1809 | "cell_type": "code", 1810 | "execution_count": 526, 1811 | "metadata": { 1812 | "collapsed": false 1813 | }, 1814 | "outputs": [ 1815 | { 1816 | "data": { 1817 | "text/plain": [ 1818 | "array([5, 4, 5, 5, 4])" 1819 | ] 1820 | }, 1821 | "execution_count": 526, 1822 | "metadata": {}, 1823 | "output_type": "execute_result" 1824 | } 1825 | ], 1826 | "source": [ 1827 | "# number of ratings per item (i.e. per column)\n", 1828 | "bar = np.sum((M_step1!=99),0)\n", 1829 | "bar# number of ratings per item (i.e. 
per column)\n", 1830 | "bar = np.sum((M_step1!=99),0)\n", 1831 | "bar" 1832 | ] 1833 | }, 1834 | { 1835 | "cell_type": "code", 1836 | "execution_count": 527, 1837 | "metadata": { 1838 | "collapsed": false 1839 | }, 1840 | "outputs": [ 1841 | { 1842 | "data": { 1843 | "text/plain": [ 1844 | "array([-0.07 , -0.4625, 0.33 , -0.07 , 0.225 ])" 1845 | ] 1846 | }, 1847 | "execution_count": 527, 1848 | "metadata": {}, 1849 | "output_type": "execute_result" 1850 | } 1851 | ], 1852 | "source": [ 1853 | "# column averages\n", 1854 | "col_averages = foo/bar # elementwise division\n", 1855 | "col_averages" 1856 | ] 1857 | }, 1858 | { 1859 | "cell_type": "code", 1860 | "execution_count": 548, 1861 | "metadata": { 1862 | "collapsed": false 1863 | }, 1864 | "outputs": [ 1865 | { 1866 | "data": { 1867 | "text/plain": [ 1868 | "array([[ 1.47000000e+00, -1.13750000e+00, 7.00000000e-02,\n", 1869 | " 4.70000000e-01, -8.25000000e-01],\n", 1870 | " [ 8.70000000e-01, -7.37500000e-01, -5.30000000e-01,\n", 1871 | " 1.87000000e+00, -1.42500000e+00],\n", 1872 | " [ -4.30000000e-01, 9.90000000e+01, 1.70000000e-01,\n", 1873 | " -1.43000000e+00, 1.27500000e+00],\n", 1874 | " [ -1.73000000e+00, 1.66250000e+00, -1.30000000e-01,\n", 1875 | " -7.30000000e-01, 9.75000000e-01],\n", 1876 | " [ -1.80000000e-01, 2.12500000e-01, 4.20000000e-01,\n", 1877 | " -1.80000000e-01, 9.90000000e+01]])" 1878 | ] 1879 | }, 1880 | "execution_count": 548, 1881 | "metadata": {}, 1882 | "output_type": "execute_result" 1883 | } 1884 | ], 1885 | "source": [ 1886 | "M_step2 = []\n", 1887 | "for i,row in enumerate(M_step1.T): # take transpose of the M_step1 and consider the rows\n", 1888 | " M_step2 += map(lambda x: x-col_averages[i] if x!=99 else 99, row)\n", 1889 | "\n", 1890 | "M_step2 = (np.array(M_step2).reshape(5,5)).T # need to take the transpose again\n", 1891 | "M_step2" 1892 | ] 1893 | }, 1894 | { 1895 | "cell_type": "markdown", 1896 | "metadata": {}, 1897 | "source": [ 1898 | "### (b) First subtract from each element the average of its column, and then subtract from each element the average of its modified row." 
1899 | ] 1900 | }, 1901 | { 1902 | "cell_type": "code", 1903 | "execution_count": 550, 1904 | "metadata": { 1905 | "collapsed": false 1906 | }, 1907 | "outputs": [ 1908 | { 1909 | "data": { 1910 | "text/plain": [ 1911 | "array([[ 5, 2, 4, 4, 3],\n", 1912 | " [ 3, 1, 2, 4, 1],\n", 1913 | " [ 2, 99, 3, 1, 4],\n", 1914 | " [ 2, 5, 4, 3, 5],\n", 1915 | " [ 4, 4, 5, 4, 99]])" 1916 | ] 1917 | }, 1918 | "execution_count": 550, 1919 | "metadata": {}, 1920 | "output_type": "execute_result" 1921 | } 1922 | ], 1923 | "source": [ 1924 | "M" 1925 | ] 1926 | }, 1927 | { 1928 | "cell_type": "markdown", 1929 | "metadata": {}, 1930 | "source": [ 1931 | "### Step 1" 1932 | ] 1933 | }, 1934 | { 1935 | "cell_type": "code", 1936 | "execution_count": 560, 1937 | "metadata": { 1938 | "collapsed": false 1939 | }, 1940 | "outputs": [ 1941 | { 1942 | "data": { 1943 | "text/plain": [ 1944 | "array([16, 12, 18, 16, 13])" 1945 | ] 1946 | }, 1947 | "execution_count": 560, 1948 | "metadata": {}, 1949 | "output_type": "execute_result" 1950 | } 1951 | ], 1952 | "source": [ 1953 | "# column sums of the above matrix (not including the missing values denoted by 99)\n", 1954 | "foo = np.sum((M!=99)*M,0)\n", 1955 | "foo" 1956 | ] 1957 | }, 1958 | { 1959 | "cell_type": "code", 1960 | "execution_count": 561, 1961 | "metadata": { 1962 | "collapsed": false 1963 | }, 1964 | "outputs": [ 1965 | { 1966 | "data": { 1967 | "text/plain": [ 1968 | "array([5, 4, 5, 5, 4])" 1969 | ] 1970 | }, 1971 | "execution_count": 561, 1972 | "metadata": {}, 1973 | "output_type": "execute_result" 1974 | } 1975 | ], 1976 | "source": [ 1977 | "# number of ratings per item (i.e. per column)\n", 1978 | "bar = np.sum((M!=99),0)\n", 1979 | "bar" 1980 | ] 1981 | }, 1982 | { 1983 | "cell_type": "code", 1984 | "execution_count": 562, 1985 | "metadata": { 1986 | "collapsed": false 1987 | }, 1988 | "outputs": [ 1989 | { 1990 | "data": { 1991 | "text/plain": [ 1992 | "array([ 3.2 , 3. , 3.6 , 3.2 , 3.25])" 1993 | ] 1994 | }, 1995 | "execution_count": 562, 1996 | "metadata": {}, 1997 | "output_type": "execute_result" 1998 | } 1999 | ], 2000 | "source": [ 2001 | "# column averages\n", 2002 | "col_averages = foo/bar # elementwise division\n", 2003 | "col_averages" 2004 | ] 2005 | }, 2006 | { 2007 | "cell_type": "code", 2008 | "execution_count": 563, 2009 | "metadata": { 2010 | "collapsed": false 2011 | }, 2012 | "outputs": [ 2013 | { 2014 | "data": { 2015 | "text/plain": [ 2016 | "array([[ 1.8 , -1. , 0.4 , 0.8 , -0.25],\n", 2017 | " [ -0.2 , -2. , -1.6 , 0.8 , -2.25],\n", 2018 | " [ -1.2 , 99. , -0.6 , -2.2 , 0.75],\n", 2019 | " [ -1.2 , 2. , 0.4 , -0.2 , 1.75],\n", 2020 | " [ 0.8 , 1. , 1.4 , 0.8 , 99. ]])" 2021 | ] 2022 | }, 2023 | "execution_count": 563, 2024 | "metadata": {}, 2025 | "output_type": "execute_result" 2026 | } 2027 | ], 2028 | "source": [ 2029 | "M_step1 = []\n", 2030 | "for i,row in enumerate(M.T): # take transpose of the M_step1 and consider the rows\n", 2031 | " M_step1 += map(lambda x: x-col_averages[i] if x!=99 else 99, row)\n", 2032 | "\n", 2033 | "M_step1 = (np.array(M_step1).reshape(5,5)).T # need to take the transpose again\n", 2034 | "M_step1" 2035 | ] 2036 | }, 2037 | { 2038 | "cell_type": "markdown", 2039 | "metadata": {}, 2040 | "source": [ 2041 | "### Step 2" 2042 | ] 2043 | }, 2044 | { 2045 | "cell_type": "code", 2046 | "execution_count": 564, 2047 | "metadata": { 2048 | "collapsed": false 2049 | }, 2050 | "outputs": [ 2051 | { 2052 | "data": { 2053 | "text/plain": [ 2054 | "array([ 1.75, -5.25, -3.25, 2.75, 4. 
])" 2055 | ] 2056 | }, 2057 | "execution_count": 564, 2058 | "metadata": {}, 2059 | "output_type": "execute_result" 2060 | } 2061 | ], 2062 | "source": [ 2063 | "# row sums of the above matrix\n", 2064 | "foo = np.sum((M!=99)*M_step1,1)\n", 2065 | "foo" 2066 | ] 2067 | }, 2068 | { 2069 | "cell_type": "code", 2070 | "execution_count": 565, 2071 | "metadata": { 2072 | "collapsed": false 2073 | }, 2074 | "outputs": [ 2075 | { 2076 | "data": { 2077 | "text/plain": [ 2078 | "array([5, 5, 4, 5, 4])" 2079 | ] 2080 | }, 2081 | "execution_count": 565, 2082 | "metadata": {}, 2083 | "output_type": "execute_result" 2084 | } 2085 | ], 2086 | "source": [ 2087 | "# number of ratings per user (i.e. per row)\n", 2088 | "bar = np.sum((M!=99),1)\n", 2089 | "bar" 2090 | ] 2091 | }, 2092 | { 2093 | "cell_type": "code", 2094 | "execution_count": 566, 2095 | "metadata": { 2096 | "collapsed": false 2097 | }, 2098 | "outputs": [ 2099 | { 2100 | "data": { 2101 | "text/plain": [ 2102 | "array([ 0.35 , -1.05 , -0.8125, 0.55 , 1. ])" 2103 | ] 2104 | }, 2105 | "execution_count": 566, 2106 | "metadata": {}, 2107 | "output_type": "execute_result" 2108 | } 2109 | ], 2110 | "source": [ 2111 | "# row averages\n", 2112 | "row_averages = foo/bar # elementwise division\n", 2113 | "row_averages" 2114 | ] 2115 | }, 2116 | { 2117 | "cell_type": "code", 2118 | "execution_count": 568, 2119 | "metadata": { 2120 | "collapsed": false 2121 | }, 2122 | "outputs": [ 2123 | { 2124 | "data": { 2125 | "text/plain": [ 2126 | "array([[ 1.45000000e+00, -1.35000000e+00, 5.00000000e-02,\n", 2127 | " 4.50000000e-01, -6.00000000e-01],\n", 2128 | " [ 8.50000000e-01, -9.50000000e-01, -5.50000000e-01,\n", 2129 | " 1.85000000e+00, -1.20000000e+00],\n", 2130 | " [ -3.87500000e-01, 9.90000000e+01, 2.12500000e-01,\n", 2131 | " -1.38750000e+00, 1.56250000e+00],\n", 2132 | " [ -1.75000000e+00, 1.45000000e+00, -1.50000000e-01,\n", 2133 | " -7.50000000e-01, 1.20000000e+00],\n", 2134 | " [ -2.00000000e-01, 1.11022302e-16, 4.00000000e-01,\n", 2135 | " -2.00000000e-01, 9.90000000e+01]])" 2136 | ] 2137 | }, 2138 | "execution_count": 568, 2139 | "metadata": {}, 2140 | "output_type": "execute_result" 2141 | } 2142 | ], 2143 | "source": [ 2144 | "M_step2 = []\n", 2145 | "for i,row in enumerate(M_step1):\n", 2146 | " M_step2 += map(lambda x: x-row_averages[i] if x!=99 else 99, row)\n", 2147 | "\n", 2148 | "M_step2 = np.array(M_step2).reshape(5,5)\n", 2149 | "M_step2" 2150 | ] 2151 | }, 2152 | { 2153 | "cell_type": "markdown", 2154 | "metadata": {}, 2155 | "source": [ 2156 | "Yes, the two methods produce different normalized matrices, but they are indeed close." 
2157 | ] 2158 | } 2159 | ], 2160 | "metadata": { 2161 | "kernelspec": { 2162 | "display_name": "Python 2", 2163 | "language": "python", 2164 | "name": "python2" 2165 | }, 2166 | "language_info": { 2167 | "codemirror_mode": { 2168 | "name": "ipython", 2169 | "version": 2 2170 | }, 2171 | "file_extension": ".py", 2172 | "mimetype": "text/x-python", 2173 | "name": "python", 2174 | "nbconvert_exporter": "python", 2175 | "pygments_lexer": "ipython2", 2176 | "version": "2.7.11" 2177 | } 2178 | }, 2179 | "nbformat": 4, 2180 | "nbformat_minor": 0 2181 | } 2182 | -------------------------------------------------------------------------------- /Exercises 6.1.1 and 6.1.3 and their related problems (from Ch.6 Frequent Itemsets).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Contents:\n", 8 | "- Exercise 6.1.1\n", 9 | "- Exercise 6.1.2\n", 10 | "- Exercise 6.1.3\n", 11 | "- Exercise 6.1.5\n", 12 | "- Exercise 6.1.6\n", 13 | "- Exercise 6.2.5\n", 14 | "- Exercise 6.2.6 (A-Priori Algorithm)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "source": [ 23 | "\n", 24 | "# Exercise 6.1.1:\n", 25 | "Suppose there are 100 items, numbered 1 to 100, and also 100 baskets, also numbered 1 to 100. Item `i` is in basket `b` if and only if `i` divides `b` with no remainder. Thus, item 1 is in all the baskets, item 2 is in all fifty of the even-numbered baskets, and so on. Basket 12 consists of items {1,2,3,4,6,12}, since these are all the integers that divide 12. Answer the following questions:\n", 26 | "\n", 27 | "(a) If the support threshold is 5, which items are frequent?\n", 28 | "\n", 29 | "(b) If the support threshold is 5, which pairs of items are frequent?\n", 30 | "\n", 31 | "(c) What is the sum of the sizes of all the baskets?" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "#### Solution:\n", 39 | "\n", 40 | "(a) Item `i` is in basket `b` if `i` is a factor of `b`. In other words, `i` is in basket `b` if and only if there exists a constant integer `k`>=1 such that `b=k*i`. As a result, item `i` is found in 5 or more baskets if `100/i >=5`. Therefore items {1},{2},...,{20} represent the frequent singletons." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "import numpy as np" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Can get the set of frequent pairs, by explicitly counting the support set of each pair and returning those whose counts are greater than 5. 
" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 229, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# baskets[i] gives the list of baskets in which item i is in contained\n", 70 | "baskets = {}\n", 71 | "for i in range(1,101):\n", 72 | " baskets[i] = []\n", 73 | " k = 1\n", 74 | " while (i*k) <= 100:\n", 75 | " baskets[i].append(k*i)\n", 76 | " k += 1" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 230, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "(1, 2) (1, 3) (1, 4) (1, 5) (1, 6) (1, 7) (1, 8) (1, 9) (1, 10) (1, 11) (1, 12) (1, 13) (1, 14) (1, 15) (1, 16) (1, 17) (1, 18) (1, 19) (1, 20) (2, 3) (2, 4) (2, 5) (2, 6) (2, 7) (2, 8) (2, 9) (2, 10) (2, 12) (2, 14) (2, 16) (2, 18) (2, 20) (3, 4) (3, 5) (3, 6) (3, 9) (3, 12) (3, 15) (3, 18) (4, 5) (4, 6) (4, 8) (4, 10) (4, 12) (4, 16) (4, 20) (5, 10) (5, 15) (5, 20) (6, 9) (6, 12) (6, 18) (7, 14) (8, 16) (9, 18) (10, 20)\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# finding frequent pairs using a nested loop to select the pairs (i,j) which appear in 5 or more baskets\n", 96 | "for i in range(1,20): # these are the only singletons which are frequent\n", 97 | " for j in range(i+1,21):\n", 98 | " commonbask = [b for b in baskets[i] if b in baskets[j]]\n", 99 | " if len(commonbask) >= 5:\n", 100 | " print (i,j)," 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "(c) Define, `num_factors(b)` as the number of factors that b has. Then sum of the sizes all baskets = `sum(num_factors(b), b=1,2,...,20)`.\n", 108 | "\n", 109 | "So, I didn't feel like thinking about how to grab prime factors of a number myself, so I stackoverflow'ed this. [Here](http://stackoverflow.com/questions/16996217/prime-factorization-list) is simple function to extract the prime factors of a list (it essentially follows how one would find prime factors by hand). 
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "def primes(n):\n", 121 | " primfac = []\n", 122 | " d = 2\n", 123 | " while d*d <= n:\n", 124 | " while (n % d) == 0:\n", 125 | " primfac.append(d) # supposing you want multiple factors repeated\n", 126 | " n /= d\n", 127 | " d += 1\n", 128 | " if n > 1:\n", 129 | " primfac.append(n)\n", 130 | " return primfac" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# create dictionaries for prime factors of baskets 1,2,...,100\n", 142 | "primefactors = {}\n", 143 | "for b in range(1,101):\n", 144 | " # initializing the dictionary for each basket b\n", 145 | " primefactors[b] = {fac:0 for fac in primes(b)}\n", 146 | " for key in primes(b):\n", 147 | " primefactors[b][key] += 1" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "{2: 2, 3: 1}" 161 | ] 162 | }, 163 | "execution_count": 7, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# for example, the prime factorization of 12 = 2^2 * 3\n", 170 | "primefactors[12]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "To get the number of factors of 12, we add 1 to each of the replications of its factors and apply the multiplication rule. So 12 has (2+1)*(1+1)=6 factors in total." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "# to get number of factors, we add 1 to each rep of factor and apply multiplication rule\n", 189 | "def num_factors(b):\n", 190 | " numfac = 1\n", 191 | " for fac,reps in primefactors[b].items():\n", 192 | " numfac *= reps + 1\n", 193 | " return numfac" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 9, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "6" 207 | ] 208 | }, 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "num_factors(12)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "# sum of the sizes of all baskets\n", 227 | "sizeofbaskets = [num_factors(b) for b in range(1,101)]\n", 228 | "totalsize = sum(sizeofbaskets)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 11, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "482" 242 | ] 243 | }, 244 | "execution_count": 11, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "totalsize" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "\n", 258 | "# Exercise 6.1.2\n", 259 | "For the item-basket data of Exercise 6.1.1, which basket is the largest?" 
260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 12, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "12" 273 | ] 274 | }, 275 | "execution_count": 12, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "# the largest baskets hav 12 items\n", 282 | "max(sizeofbaskets)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "60\n", 297 | "72\n", 298 | "84\n", 299 | "90\n", 300 | "96\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "# these baskets are\n", 306 | "for b in range(1,101):\n", 307 | " if num_factors(b) == 12:\n", 308 | " print b" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "\n", 316 | "# Exercise 6.1.3\n", 317 | "Suppose there are 100 items, numbered 1 to 100, and also 100 baskets, also numbered 1 to 100. Item `i` is in basket `b` if and only if `b` divides `i` with no remainder. For example, basket 12 consists of items {12,24,36,48,60,72,84,96}. Repeat Exercise 6.1.1 for this data." 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "#### Solution\n", 325 | "\n", 326 | "(a) Basket `b` consists of items which are multiples of `b`. Alternatively, item `i` is in basket `b` if `b` is a factor of `i`. Thus, item `i` is frequent if it has at least 5 factors <= 100." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 14, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "# List of frequent items\n", 338 | "L1 = [b for b in range(1,101) if num_factors(b)>=5]" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 15, 344 | "metadata": { 345 | "collapsed": false 346 | }, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "12 16 18 20 24 28 30 32 36 40 42 44 45 48 50 52 54 56 60 63 64 66 68 70 72 75 76 78 80 81 84 88 90 92 96 98 99 100\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "for b in L1:\n", 358 | " print b," 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "(b) Clearly, `(i,j)` represent a frequent pair if `i` and `j` share at least 5 common factors." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 16, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "def lexorders_ofexp(b):\n", 377 | " \"\"\"\n", 378 | " This function returns the lexicographic ordering of the exponents\n", 379 | " of the prime factors of b. 
We can use this to get all the factors\n", 380 | " of b.\n", 381 | " \"\"\"\n", 382 | " n = len(primefactors[b])\n", 383 | " ati = primefactors[b].values()\n", 384 | " foo = []\n", 385 | " if n == 1:\n", 386 | " for j in range(ati[0]+1):\n", 387 | " foo.append([j])\n", 388 | " if n == 2:\n", 389 | " i = 0\n", 390 | " while i < ati[0]+1:\n", 391 | " j = 0\n", 392 | " while j < ati[1]+1:\n", 393 | " foo.append([i,j])\n", 394 | " j+=1\n", 395 | " i+=1\n", 396 | " if n == 3:\n", 397 | " i = 0\n", 398 | " while i < ati[0]+1:\n", 399 | " j = 0\n", 400 | " while j < ati[1]+1:\n", 401 | " k = 0\n", 402 | " while k < ati[2]+1:\n", 403 | " foo.append([i,j,k])\n", 404 | " k+=1\n", 405 | " j+=1\n", 406 | " i+=1\n", 407 | " return foo " 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 17, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "# getting all factors of b from its prime factors\n", 419 | "def factors(b):\n", 420 | " facs = []\n", 421 | " exps = lexorders_ofexp(b)\n", 422 | " for i in range(num_factors(b)):\n", 423 | " bar = 1 # this also takes care of base case factors(1)\n", 424 | " for el,key in enumerate(primefactors[b].keys()):\n", 425 | " bar *= int(key**exps[i][el])\n", 426 | " facs.append(bar)\n", 427 | " return facs" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 18, 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "def num_commonfactors(b1,b2):\n", 439 | " foo = factors(b1)\n", 440 | " bar = factors(b2)\n", 441 | " return len([el for el in foo if el in bar])" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 19, 447 | "metadata": { 448 | "collapsed": false 449 | }, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "[1, 3, 9, 2, 6, 18, 4, 12, 36, 8, 24, 72]" 455 | ] 456 | }, 457 | "execution_count": 19, 458 | "metadata": {}, 459 | "output_type": "execute_result" 460 | } 461 | ], 462 | "source": [ 463 | "factors(72)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 20, 469 | "metadata": { 470 | "collapsed": false 471 | }, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/plain": [ 476 | "[1, 3, 2, 6, 4, 12]" 477 | ] 478 | }, 479 | "execution_count": 20, 480 | "metadata": {}, 481 | "output_type": "execute_result" 482 | } 483 | ], 484 | "source": [ 485 | "factors(12)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 21, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "text/plain": [ 498 | "[1, 3, 2, 6, 4, 12]" 499 | ] 500 | }, 501 | "execution_count": 21, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "# common factors\n", 508 | "[el for el in factors(12) if el in factors(72)]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 22, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "6" 522 | ] 523 | }, 524 | "execution_count": 22, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "num_commonfactors(12,72)" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "#### We are now ready to grab the frequent pairs." 
538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 23, 543 | "metadata": { 544 | "collapsed": false 545 | }, 546 | "outputs": [ 547 | { 548 | "name": "stdout", 549 | "output_type": "stream", 550 | "text": [ 551 | "(12, 24) (12, 36) (12, 48) (12, 60) (12, 72) (12, 84) (12, 96) (16, 32) (16, 48) (16, 64) (16, 80) (16, 96) (18, 36) (18, 54) (18, 72) (18, 90) (20, 40) (20, 60) (20, 80) (20, 100) (24, 36) (24, 48) (24, 60) (24, 72) (24, 84) (24, 96) (28, 56) (28, 84) (30, 60) (30, 90) (32, 48) (32, 64) (32, 80) (32, 96) (36, 48) (36, 54) (36, 60) (36, 72) (36, 84) (36, 90) (36, 96) (40, 60) (40, 80) (40, 100) (42, 84) (44, 88) (45, 90) (48, 60) (48, 64) (48, 72) (48, 80) (48, 84) (48, 96) (50, 100) (54, 72) (54, 90) (56, 84) (60, 72) (60, 80) (60, 84) (60, 90) (60, 96) (60, 100) (64, 80) (64, 96) (72, 84) (72, 90) (72, 96) (80, 96) (80, 100) (84, 96)\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "# List of pairs (i,j) with at least 5 common factors\n", 557 | "for i in range(1,100):\n", 558 | " for j in range(i+1,101):\n", 559 | " if num_commonfactors(i,j) >= 5:\n", 560 | " print (i,j)," 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "(c) The size of basket `b` is `floor(100/b)`" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 24, 573 | "metadata": { 574 | "collapsed": false 575 | }, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/plain": [ 580 | "482" 581 | ] 582 | }, 583 | "execution_count": 24, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "sizeofbaskets = [int(100/b) for b in range(1,101)]\n", 590 | "totalsize = sum(sizeofbaskets)\n", 591 | "totalsize" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "\n", 599 | "# Exercise 6.1.5\n", 600 | "For the data of Exercise 6.1.1, what is the confidence of the following association rules?\n", 601 | "\n", 602 | "(a) {5,7} -> 2\n", 603 | "- The support of {5,7} is 2 since {5,7} can be found in baskets (35) and (70). On the other hand, the support of {5,7,2} is 1 since this triple can only be found in basket {70}. Therefore the confidence of this association rule is 1/2.\n", 604 | "\n", 605 | "(b) {2,3,4} -> 5\n", 606 | "- The confidence of this rule is 1/8. Note that basket `b` contains itemset `I` if all of its items are factors of `b`." 
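As a quick numerical check of (a): in the Exercise 6.1.1 data a basket contains an itemset exactly when every item in the set divides the basket number, so the supports can be enumerated directly. A short sketch (variable names are illustrative; the more general `supportset_611` helper defined below gives the same counts):

```python
# baskets holding {5,7} are the multiples of both 5 and 7, i.e. of 35
support_57 = [b for b in range(1, 101) if b % 5 == 0 and b % 7 == 0]                   # [35, 70]
# adding item 2 restricts this to multiples of 70
support_257 = [b for b in range(1, 101) if b % 2 == 0 and b % 5 == 0 and b % 7 == 0]   # [70]

print 'confidence of {5,7} -> 2:', float(len(support_257)) / len(support_57)           # 0.5
```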
607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 25, 612 | "metadata": { 613 | "collapsed": true 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "from __future__ import division" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 26, 623 | "metadata": { 624 | "collapsed": false 625 | }, 626 | "outputs": [], 627 | "source": [ 628 | "def supportset_611(I):\n", 629 | " sup = []\n", 630 | " for b in range(1,101):\n", 631 | " # check if all items in I are factors of b\n", 632 | " if all(item in factors(b) for item in I):\n", 633 | " sup.append(b)\n", 634 | " return sup" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 27, 640 | "metadata": { 641 | "collapsed": false 642 | }, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "text/plain": [ 647 | "[12, 24, 36, 48, 60, 72, 84, 96]" 648 | ] 649 | }, 650 | "execution_count": 27, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "# support set of {2,3,4}\n", 657 | "supportset_611([2,3,4])" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 28, 663 | "metadata": { 664 | "collapsed": false 665 | }, 666 | "outputs": [ 667 | { 668 | "data": { 669 | "text/plain": [ 670 | "[60]" 671 | ] 672 | }, 673 | "execution_count": 28, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "# support set of {2,3,4,5}\n", 680 | "supportset_611([2,3,4,5])" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 29, 686 | "metadata": { 687 | "collapsed": false 688 | }, 689 | "outputs": [ 690 | { 691 | "data": { 692 | "text/plain": [ 693 | "0.125" 694 | ] 695 | }, 696 | "execution_count": 29, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "# Confidence of {2,3,4}->5\n", 703 | "conf_b = len(supportset_611([2,3,4,5]))/len(supportset_611([2,3,4]))\n", 704 | "conf_b" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": {}, 710 | "source": [ 711 | "\n", 712 | "# Exercise 6.1.6\n", 713 | "\n", 714 | "For the data of Exercise 6.1.3, what is the confidence of the following association rules?\n", 715 | "\n", 716 | "(a) {24,60} -> 8\n", 717 | "- The support of {24,60} is 6 since {24} and {60} share the common factors (1),(2),(3),(4),(6), and (12). The support of {8,24,60} is 3 since only factors (1), (2) and (4) are shared amongst them. Therefore, the confidence of this association rule is 3/6=1/2.\n", 718 | "\n", 719 | "(b) {2,3,4} -> 5\n", 720 | "- The confidence of this association rule is 1. See below" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 35, 726 | "metadata": { 727 | "collapsed": false 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "def supportset_613(I):\n", 732 | " \"\"\"\n", 733 | " The set of baskets containing itemset I is the set of common\n", 734 | " of factors amongst all items in I. 
This function takes I as\n", 735 | " input and outputs the common factors (i.e, the baskets) of \n", 736 | " all items contained in I.\n", 737 | " \"\"\"\n", 738 | " n = len(I)\n", 739 | " if n == 1:\n", 740 | " return factors(I[0])\n", 741 | " elif n == 2:\n", 742 | " foo = factors(I[0])\n", 743 | " bar = factors(I[1])\n", 744 | " return [el for el in foo if el in bar]\n", 745 | " else: # use divide and conquer here\n", 746 | " half = int(n/2)\n", 747 | " firstpart = supportset_613(I[:half])\n", 748 | " secondpart = supportset_613(I[half:])\n", 749 | " return [el for el in firstpart if el in secondpart]" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 36, 755 | "metadata": { 756 | "collapsed": false 757 | }, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/plain": [ 762 | "[1]" 763 | ] 764 | }, 765 | "execution_count": 36, 766 | "metadata": {}, 767 | "output_type": "execute_result" 768 | } 769 | ], 770 | "source": [ 771 | "# support set of {2,3,4} is\n", 772 | "supportset_613([2,3,4])" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 37, 778 | "metadata": { 779 | "collapsed": false 780 | }, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/plain": [ 785 | "[1]" 786 | ] 787 | }, 788 | "execution_count": 37, 789 | "metadata": {}, 790 | "output_type": "execute_result" 791 | } 792 | ], 793 | "source": [ 794 | "# support set of {2,3,4,5}\n", 795 | "supportset_613([2,3,4,5])" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "The confidence of this association rule is 1." 803 | ] 804 | }, 805 | { 806 | "cell_type": "markdown", 807 | "metadata": {}, 808 | "source": [ 809 | "\n", 810 | "# Exercise 6.2.5\n", 811 | "\n", 812 | "Suppose the support threshold is 5. 
Find the maximal frequent itemsets for the data of:\n", 813 | "\n", 814 | "(a) Exercise 6.1.1\n", 815 | "\n", 816 | "(b) Exercise 6.1.3" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "metadata": {}, 822 | "source": [ 823 | "# Solutions to 6.2.5(a):" 824 | ] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "metadata": {}, 829 | "source": [ 830 | "### L1 and L2 " 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 38, 836 | "metadata": { 837 | "collapsed": false 838 | }, 839 | "outputs": [ 840 | { 841 | "data": { 842 | "text/plain": [ 843 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]" 844 | ] 845 | }, 846 | "execution_count": 38, 847 | "metadata": {}, 848 | "output_type": "execute_result" 849 | } 850 | ], 851 | "source": [ 852 | "#(a)\n", 853 | "# frequent singletons\n", 854 | "L1 = range(1,21)\n", 855 | "L1" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 39, 861 | "metadata": { 862 | "collapsed": false 863 | }, 864 | "outputs": [], 865 | "source": [ 866 | "# frequent pairs\n", 867 | "L2 = []\n", 868 | "for i in range(1,20): # these are the only singletons which are frequent\n", 869 | " for j in range(i+1,21):\n", 870 | " commonbask = [b for b in baskets[i] if b in baskets[j]]\n", 871 | " if len(commonbask) >= 5:\n", 872 | " L2.append([i,j])" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 40, 878 | "metadata": { 879 | "collapsed": false 880 | }, 881 | "outputs": [ 882 | { 883 | "name": "stdout", 884 | "output_type": "stream", 885 | "text": [ 886 | "[1, 2] [1, 3] [1, 4] [1, 5] [1, 6] [1, 7] [1, 8] [1, 9] [1, 10] [1, 11] [1, 12] [1, 13] [1, 14] [1, 15] [1, 16] [1, 17] [1, 18] [1, 19] [1, 20] [2, 3] [2, 4] [2, 5] [2, 6] [2, 7] [2, 8] [2, 9] [2, 10] [2, 12] [2, 14] [2, 16] [2, 18] [2, 20] [3, 4] [3, 5] [3, 6] [3, 9] [3, 12] [3, 15] [3, 18] [4, 5] [4, 6] [4, 8] [4, 10] [4, 12] [4, 16] [4, 20] [5, 10] [5, 15] [5, 20] [6, 9] [6, 12] [6, 18] [7, 14] [8, 16] [9, 18] [10, 20]\n" 887 | ] 888 | } 889 | ], 890 | "source": [ 891 | "for pair in L2:\n", 892 | " print pair," 893 | ] 894 | }, 895 | { 896 | "cell_type": "markdown", 897 | "metadata": {}, 898 | "source": [ 899 | "### Maximal singletons " 900 | ] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": 41, 905 | "metadata": { 906 | "collapsed": false 907 | }, 908 | "outputs": [ 909 | { 910 | "data": { 911 | "text/plain": [ 912 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]" 913 | ] 914 | }, 915 | "execution_count": 41, 916 | "metadata": {}, 917 | "output_type": "execute_result" 918 | } 919 | ], 920 | "source": [ 921 | "L2_flatten = []\n", 922 | "for pair in L2:\n", 923 | " L2_flatten += pair\n", 924 | " \n", 925 | "L2_flatten = list(set(L2_flatten))\n", 926 | "L2_flatten" 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "metadata": {}, 932 | "source": [ 933 | "## Therefore, there are no maximal singletons." 
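As a cross-check that does not need the baskets dictionary (a sketch; pair_support_611 is an illustrative name): under the 6.1.1 model the baskets containing both i and j are exactly the multiples of lcm(i, j) up to 100, so pair supports can be computed arithmetically.

from fractions import gcd   # Python 2.7; use math.gcd in Python 3

def pair_support_611(i, j, n=100):
    lcm = i * j // gcd(i, j)
    return n // lcm            # number of multiples of lcm(i, j) up to n

print(pair_support_611(3, 4))    # 8, so [3, 4] is frequent at threshold 5
print(pair_support_611(1, 11))   # 9; [1, 11] is frequent but, as found below, cannot be extended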
934 | ] 935 | }, 936 | { 937 | "cell_type": "markdown", 938 | "metadata": {}, 939 | "source": [ 940 | "### L3 " 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 42, 946 | "metadata": { 947 | "collapsed": false 948 | }, 949 | "outputs": [], 950 | "source": [ 951 | "# frequent triples\n", 952 | "L3 = []\n", 953 | "for i in range(1,21):\n", 954 | " for pair in L2:\n", 955 | " if i not in pair:\n", 956 | " foo = pair+[i]\n", 957 | " foo.sort() # works in place\n", 958 | " if len(supportset_611(foo)) >= 5 and foo not in L3:\n", 959 | " L3.append(foo)" 960 | ] 961 | }, 962 | { 963 | "cell_type": "markdown", 964 | "metadata": {}, 965 | "source": [ 966 | "### Maximal doubletons " 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": 43, 972 | "metadata": { 973 | "collapsed": false 974 | }, 975 | "outputs": [], 976 | "source": [ 977 | "# check for maximal doubletons\n", 978 | "maximal_doub = []\n", 979 | "for pair in L2:\n", 980 | " pair_max = False\n", 981 | " for trip in L3:\n", 982 | " if all(item in trip for item in pair):\n", 983 | " pair_max = True\n", 984 | " break\n", 985 | " if not pair_max: \n", 986 | " maximal_doub.append(pair)" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": 44, 992 | "metadata": { 993 | "collapsed": false 994 | }, 995 | "outputs": [ 996 | { 997 | "data": { 998 | "text/plain": [ 999 | "[[1, 11], [1, 13], [1, 17], [1, 19]]" 1000 | ] 1001 | }, 1002 | "execution_count": 44, 1003 | "metadata": {}, 1004 | "output_type": "execute_result" 1005 | } 1006 | ], 1007 | "source": [ 1008 | "maximal_doub" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "markdown", 1013 | "metadata": {}, 1014 | "source": [ 1015 | "### L4" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 45, 1021 | "metadata": { 1022 | "collapsed": false 1023 | }, 1024 | "outputs": [], 1025 | "source": [ 1026 | "# frequent quads\n", 1027 | "L4 = []\n", 1028 | "for i in range(1,21):\n", 1029 | " for trip in L3:\n", 1030 | " if i not in trip:\n", 1031 | " foo = trip+[i]\n", 1032 | " foo.sort() # works in place\n", 1033 | " if len(supportset_611(foo)) >= 5 and foo not in L4:\n", 1034 | " L4.append(foo)" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "markdown", 1039 | "metadata": {}, 1040 | "source": [ 1041 | "### maximal triples " 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": 46, 1047 | "metadata": { 1048 | "collapsed": false 1049 | }, 1050 | "outputs": [ 1051 | { 1052 | "data": { 1053 | "text/plain": [ 1054 | "[]" 1055 | ] 1056 | }, 1057 | "execution_count": 46, 1058 | "metadata": {}, 1059 | "output_type": "execute_result" 1060 | } 1061 | ], 1062 | "source": [ 1063 | "# check for maximal triples\n", 1064 | "maximal_triples = []\n", 1065 | "for trip in L3:\n", 1066 | " trip_max = False\n", 1067 | " for quad in L4:\n", 1068 | " if all(item in quad for item in trip):\n", 1069 | " trip_max = True\n", 1070 | " break\n", 1071 | " if not trip_max: \n", 1072 | " maximal_triples.append(trip)\n", 1073 | "maximal_triples" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "markdown", 1078 | "metadata": {}, 1079 | "source": [ 1080 | "### L5" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": 47, 1086 | "metadata": { 1087 | "collapsed": false 1088 | }, 1089 | "outputs": [ 1090 | { 1091 | "data": { 1092 | "text/plain": [ 1093 | "[[1, 2, 3, 4, 6],\n", 1094 | " [1, 2, 3, 4, 12],\n", 1095 | " [1, 2, 3, 6, 9],\n", 1096 | " [1, 2, 3, 6, 12],\n", 1097 | " [1, 2, 3, 
6, 18],\n", 1098 | " [1, 2, 3, 9, 18],\n", 1099 | " [1, 2, 4, 5, 10],\n", 1100 | " [1, 2, 4, 5, 20],\n", 1101 | " [1, 2, 4, 6, 12],\n", 1102 | " [1, 2, 4, 8, 16],\n", 1103 | " [1, 2, 4, 10, 20],\n", 1104 | " [1, 2, 5, 10, 20],\n", 1105 | " [1, 2, 6, 9, 18],\n", 1106 | " [1, 3, 4, 6, 12],\n", 1107 | " [1, 3, 6, 9, 18],\n", 1108 | " [1, 4, 5, 10, 20],\n", 1109 | " [2, 3, 4, 6, 12],\n", 1110 | " [2, 3, 6, 9, 18],\n", 1111 | " [2, 4, 5, 10, 20]]" 1112 | ] 1113 | }, 1114 | "execution_count": 47, 1115 | "metadata": {}, 1116 | "output_type": "execute_result" 1117 | } 1118 | ], 1119 | "source": [ 1120 | "# frequent quintiples\n", 1121 | "L5 = []\n", 1122 | "for i in range(1,21):\n", 1123 | " for quad in L4:\n", 1124 | " if i not in quad:\n", 1125 | " foo = quad+[i]\n", 1126 | " foo.sort() # works in place\n", 1127 | " if len(supportset_611(foo)) >= 5 and foo not in L5:\n", 1128 | " L5.append(foo)\n", 1129 | "L5" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "markdown", 1134 | "metadata": {}, 1135 | "source": [ 1136 | "### Maximal quads " 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": 48, 1142 | "metadata": { 1143 | "collapsed": false 1144 | }, 1145 | "outputs": [], 1146 | "source": [ 1147 | "# check for maximal quads\n", 1148 | "maximal_quads = []\n", 1149 | "for quad in L4:\n", 1150 | " quad_max = False\n", 1151 | " for quint in L5:\n", 1152 | " if all(item in quint for item in quad):\n", 1153 | " quad_max = True\n", 1154 | " break\n", 1155 | " if not quad_max: \n", 1156 | " maximal_quads.append(quad)" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "code", 1161 | "execution_count": 49, 1162 | "metadata": { 1163 | "collapsed": false 1164 | }, 1165 | "outputs": [ 1166 | { 1167 | "data": { 1168 | "text/plain": [ 1169 | "[[1, 2, 7, 14], [1, 3, 5, 15]]" 1170 | ] 1171 | }, 1172 | "execution_count": 49, 1173 | "metadata": {}, 1174 | "output_type": "execute_result" 1175 | } 1176 | ], 1177 | "source": [ 1178 | "maximal_quads" 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "code", 1183 | "execution_count": 50, 1184 | "metadata": { 1185 | "collapsed": false 1186 | }, 1187 | "outputs": [ 1188 | { 1189 | "data": { 1190 | "text/plain": [ 1191 | "[15, 30, 45, 60, 75, 90]" 1192 | ] 1193 | }, 1194 | "execution_count": 50, 1195 | "metadata": {}, 1196 | "output_type": "execute_result" 1197 | } 1198 | ], 1199 | "source": [ 1200 | "# looking at one of these\n", 1201 | "supportset_611([1, 3, 5, 15])" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "markdown", 1206 | "metadata": {}, 1207 | "source": [ 1208 | "### L6" 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "code", 1213 | "execution_count": 51, 1214 | "metadata": { 1215 | "collapsed": false 1216 | }, 1217 | "outputs": [ 1218 | { 1219 | "data": { 1220 | "text/plain": [ 1221 | "[[1, 2, 3, 4, 6, 12], [1, 2, 3, 6, 9, 18], [1, 2, 4, 5, 10, 20]]" 1222 | ] 1223 | }, 1224 | "execution_count": 51, 1225 | "metadata": {}, 1226 | "output_type": "execute_result" 1227 | } 1228 | ], 1229 | "source": [ 1230 | "# frequent sixtuplets\n", 1231 | "L6 = []\n", 1232 | "for i in range(1,21):\n", 1233 | " for quint in L5:\n", 1234 | " if i not in quint:\n", 1235 | " foo = quint+[i]\n", 1236 | " foo.sort() # works in place\n", 1237 | " if len(supportset_611(foo)) >= 5 and foo not in L6:\n", 1238 | " L6.append(foo)\n", 1239 | "L6" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "markdown", 1244 | "metadata": {}, 1245 | "source": [ 1246 | "### Maximal quintuplets " 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": 52, 
1252 | "metadata": { 1253 | "collapsed": false 1254 | }, 1255 | "outputs": [], 1256 | "source": [ 1257 | "# check for maximal quints\n", 1258 | "maximal_quints = []\n", 1259 | "for quint in L5:\n", 1260 | " quint_max = False\n", 1261 | " for sixt in L6:\n", 1262 | " if all(item in sixt for item in quint):\n", 1263 | " quint_max = True\n", 1264 | " break\n", 1265 | " if not quint_max: \n", 1266 | " maximal_quints.append(quint)" 1267 | ] 1268 | }, 1269 | { 1270 | "cell_type": "code", 1271 | "execution_count": 53, 1272 | "metadata": { 1273 | "collapsed": false 1274 | }, 1275 | "outputs": [ 1276 | { 1277 | "data": { 1278 | "text/plain": [ 1279 | "[[1, 2, 4, 8, 16]]" 1280 | ] 1281 | }, 1282 | "execution_count": 53, 1283 | "metadata": {}, 1284 | "output_type": "execute_result" 1285 | } 1286 | ], 1287 | "source": [ 1288 | "maximal_quints" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "markdown", 1293 | "metadata": {}, 1294 | "source": [ 1295 | "### L7" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "code", 1300 | "execution_count": 54, 1301 | "metadata": { 1302 | "collapsed": false 1303 | }, 1304 | "outputs": [ 1305 | { 1306 | "data": { 1307 | "text/plain": [ 1308 | "[]" 1309 | ] 1310 | }, 1311 | "execution_count": 54, 1312 | "metadata": {}, 1313 | "output_type": "execute_result" 1314 | } 1315 | ], 1316 | "source": [ 1317 | "# frequent septuplets\n", 1318 | "L7 = []\n", 1319 | "for i in range(1,21):\n", 1320 | " for sixt in L6:\n", 1321 | " if i not in sixt:\n", 1322 | " foo = sixt+[i]\n", 1323 | " foo.sort() # works in place\n", 1324 | " if len(supportset_611(foo)) >= 5 and foo not in L7:\n", 1325 | " L7.append(foo)\n", 1326 | "L7" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "markdown", 1331 | "metadata": {}, 1332 | "source": [ 1333 | "## No septuplets implies that all frequent sixtuplets are maximal!" 
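All of the level-building cells above repeat the same candidate-generation step; a small helper (a sketch that reuses the supportset_611 function defined earlier; next_level is an illustrative name) captures the pattern once and could replace the per-level copies.

def next_level(Lk, support_fn=supportset_611, items=range(1, 21), s=5):
    # extend each frequent itemset in Lk by one new item and keep the frequent, de-duplicated results
    out = []
    for itemset in Lk:
        for i in items:
            if i not in itemset:
                cand = sorted(itemset + [i])
                if cand not in out and len(support_fn(cand)) >= s:
                    out.append(cand)
    return out

# e.g. next_level(L3) reproduces L4 (up to ordering), next_level(L4) reproduces L5, and so on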
1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": 55, 1339 | "metadata": { 1340 | "collapsed": false 1341 | }, 1342 | "outputs": [ 1343 | { 1344 | "name": "stdout", 1345 | "output_type": "stream", 1346 | "text": [ 1347 | "[12, 24, 36, 48, 60, 72, 84, 96]\n", 1348 | "[18, 36, 54, 72, 90]\n", 1349 | "[20, 40, 60, 80, 100]\n" 1350 | ] 1351 | } 1352 | ], 1353 | "source": [ 1354 | "# here are the support sets of each of sixtuplets\n", 1355 | "for sixt in L6:\n", 1356 | " print supportset_611(sixt)" 1357 | ] 1358 | }, 1359 | { 1360 | "cell_type": "markdown", 1361 | "metadata": {}, 1362 | "source": [ 1363 | "# Solutions to 6.2.5(b)" 1364 | ] 1365 | }, 1366 | { 1367 | "cell_type": "markdown", 1368 | "metadata": {}, 1369 | "source": [ 1370 | "### L1 and L2" 1371 | ] 1372 | }, 1373 | { 1374 | "cell_type": "code", 1375 | "execution_count": 56, 1376 | "metadata": { 1377 | "collapsed": false 1378 | }, 1379 | "outputs": [ 1380 | { 1381 | "name": "stdout", 1382 | "output_type": "stream", 1383 | "text": [ 1384 | "12 16 18 20 24 28 30 32 36 40 42 44 45 48 50 52 54 56 60 63 64 66 68 70 72 75 76 78 80 81 84 88 90 92 96 98 99 100\n" 1385 | ] 1386 | } 1387 | ], 1388 | "source": [ 1389 | "# List of frequent items\n", 1390 | "L1 = [b for b in range(1,101) if num_factors(b)>=5]\n", 1391 | "\n", 1392 | "for items in L1:\n", 1393 | " print items," 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": 57, 1399 | "metadata": { 1400 | "collapsed": false 1401 | }, 1402 | "outputs": [ 1403 | { 1404 | "name": "stdout", 1405 | "output_type": "stream", 1406 | "text": [ 1407 | "[12, 24] [12, 36] [12, 48] [12, 60] [12, 72] [12, 84] [12, 96] [16, 32] [16, 48] [16, 64] [16, 80] [16, 96] [18, 36] [18, 54] [18, 72] [18, 90] [20, 40] [20, 60] [20, 80] [20, 100] [24, 36] [24, 48] [24, 60] [24, 72] [24, 84] [24, 96] [28, 56] [28, 84] [30, 60] [30, 90] [32, 48] [32, 64] [32, 80] [32, 96] [36, 48] [36, 54] [36, 60] [36, 72] [36, 84] [36, 90] [36, 96] [40, 60] [40, 80] [40, 100] [42, 84] [44, 88] [45, 90] [48, 60] [48, 64] [48, 72] [48, 80] [48, 84] [48, 96] [50, 100] [54, 72] [54, 90] [56, 84] [60, 72] [60, 80] [60, 84] [60, 90] [60, 96] [60, 100] [64, 80] [64, 96] [72, 84] [72, 90] [72, 96] [80, 96] [80, 100] [84, 96]\n" 1408 | ] 1409 | } 1410 | ], 1411 | "source": [ 1412 | "# List of frequent pairs\n", 1413 | "L2 = []\n", 1414 | "for i in range(1,100):\n", 1415 | " for j in range(i+1,101):\n", 1416 | " if len(supportset_613([i,j])) >= 5:\n", 1417 | " L2.append([i,j])\n", 1418 | "\n", 1419 | "for pair in L2:\n", 1420 | " print pair," 1421 | ] 1422 | }, 1423 | { 1424 | "cell_type": "code", 1425 | "execution_count": 58, 1426 | "metadata": { 1427 | "collapsed": false 1428 | }, 1429 | "outputs": [ 1430 | { 1431 | "name": "stdout", 1432 | "output_type": "stream", 1433 | "text": [ 1434 | "[12, 24] [12, 36] [12, 48] [12, 60] [12, 72] [12, 84] [12, 96] [16, 32] [16, 48] [16, 64] [16, 80] [16, 96] [18, 36] [18, 54] [18, 72] [18, 90] [20, 40] [20, 60] [20, 80] [20, 100] [24, 36] [24, 48] [24, 60] [24, 72] [24, 84] [24, 96] [28, 56] [28, 84] [30, 60] [30, 90] [32, 48] [32, 64] [32, 80] [32, 96] [36, 48] [36, 54] [36, 60] [36, 72] [36, 84] [36, 90] [36, 96] [40, 60] [40, 80] [40, 100] [42, 84] [44, 88] [45, 90] [48, 60] [48, 64] [48, 72] [48, 80] [48, 84] [48, 96] [50, 100] [54, 72] [54, 90] [56, 84] [60, 72] [60, 80] [60, 84] [60, 90] [60, 96] [60, 100] [64, 80] [64, 96] [72, 84] [72, 90] [72, 96] [80, 96] [80, 100] [84, 96]\n" 1435 | ] 1436 | } 1437 | ], 1438 | "source": [ 1439 | "# 
List of frequent pairs\n", 1440 | "L2 = []\n", 1441 | "for i in range(1,100):\n", 1442 | "    for j in range(i+1,101):\n", 1443 | "        if num_commonfactors(i,j) >= 5:\n", 1444 | "            L2.append([i,j])\n", 1445 | "\n", 1446 | "for pair in L2:\n", 1447 | "    print pair," 1448 | ] 1449 | }, 1450 | { 1451 | "cell_type": "markdown", 1452 | "metadata": {}, 1453 | "source": [ 1454 | "### Maximal singletons" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": 59, 1460 | "metadata": { 1461 | "collapsed": false 1462 | }, 1463 | "outputs": [ 1464 | { 1465 | "data": { 1466 | "text/plain": [ 1467 | "[52, 63, 66, 68, 70, 75, 76, 78, 81, 92, 98, 99]" 1468 | ] 1469 | }, 1470 | "execution_count": 59, 1471 | "metadata": {}, 1472 | "output_type": "execute_result" 1473 | } 1474 | ], 1475 | "source": [ 1476 | "# check for maximal singletons\n", 1477 | "maximal_single = []\n", 1478 | "for single in L1:\n", 1479 | "    single_max = False\n", 1480 | "    for pair in L2:\n", 1481 | "        if all(item in pair for item in [single]):\n", 1482 | "            single_max = True\n", 1483 | "            break\n", 1484 | "    if not single_max: \n", 1485 | "        maximal_single.append(single)\n", 1486 | "maximal_single" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "markdown", 1491 | "metadata": {}, 1492 | "source": [ 1493 | "### L3 " 1494 | ] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": 60, 1499 | "metadata": { 1500 | "collapsed": false 1501 | }, 1502 | "outputs": [], 1514 | "source": [ 1515 | "# List of frequent triples\n", 1516 | "L3 = []\n", 1517 | "for single in L1:\n", 1518 | "    for pair in L2:\n", 1519 | "        if single not in pair:\n", 1520 | "            foo = pair+[single]\n", 1521 | "            foo.sort() # works in place\n", 1522 | "            if len(supportset_613(foo)) >= 5 and foo not in L3:\n", 1523 | "                L3.append(foo)\n", 1524 | "L3" 1525 | ] 1526 | }, 1527 | { 1528 | "cell_type": "markdown", 1529 | "metadata": {}, 1530 | "source": [ 1531 | "### Frequent triples do exist here: the common factors of an itemset are exactly the divisors of the gcd of its members, so, for example, [12, 24, 36] is supported by the six divisors of 12. Hence not every frequent pair is maximal; the maximal frequent itemsets are the sets of multiples (up to 100) of 12, 16, 18, 20, 28 and 30, together with the pairs [42, 84], [44, 88], [45, 90], [50, 100] and the maximal singletons found above." 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "markdown", 1536 | "metadata": {}, 1537 | "source": [ 1538 | "\n", 1539 | "# Exercise 6.2.6 (A-Priori Algorithm) \n", 1540 | "\n", 1541 | "Apply the A-Priori Algorithm with support threshold 5 to the data of:\n", 1542 | "\n", 1543 | "(a) Exercise 6.1.1.\n", 1544 | "\n", 1545 | "(b) Exercise 6.1.3.\n", 1546 | "\n", 1547 | "To make this more interesting, let's map integers `i` to strings using English stopwords."
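Before moving on, the claim just made for 6.2.5(b) is easy to verify arithmetically (a sketch; under the 6.1.3 model the support set of an itemset is the set of divisors of the gcd of its members, and reduce is a Python 2 built-in):

from fractions import gcd   # Python 2.7; use math.gcd and functools.reduce in Python 3

def supportset_613_gcd(itemset):
    g = reduce(gcd, itemset)
    return [d for d in range(1, g + 1) if g % d == 0]

print(supportset_613_gcd([24, 60]))                   # [1, 2, 3, 4, 6, 12], matching 6.1.6(a)
print(supportset_613_gcd([12, 24, 36]))               # the six divisors of 12: a frequent triple at s = 5
print(supportset_613_gcd([16, 32, 48, 64, 80, 96]))   # the five divisors of 16: a frequent sextuple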
1548 | ] 1549 | }, 1550 | { 1551 | "cell_type": "code", 1552 | "execution_count": 61, 1553 | "metadata": { 1554 | "collapsed": false 1555 | }, 1556 | "outputs": [], 1557 | "source": [ 1558 | "# generate the baskets: basket b consists of all items i that are factors of b\n", 1559 | "baskets_611 = {}\n", 1560 | "for i in range(1,101):\n", 1561 | " baskets_611[i] = factors(i)" 1562 | ] 1563 | }, 1564 | { 1565 | "cell_type": "markdown", 1566 | "metadata": {}, 1567 | "source": [ 1568 | "### Converting numeric baskets to corresponding words" 1569 | ] 1570 | }, 1571 | { 1572 | "cell_type": "code", 1573 | "execution_count": 296, 1574 | "metadata": { 1575 | "collapsed": false 1576 | }, 1577 | "outputs": [], 1578 | "source": [ 1579 | "# print a file containing baskets enclosed by curly braces {}\n", 1580 | "baskets_611_words = []\n", 1581 | "for i in range(1,101):\n", 1582 | " baskets_611_words.append([','.join([words[int] for int in baskets_611[i]])])" 1583 | ] 1584 | }, 1585 | { 1586 | "cell_type": "markdown", 1587 | "metadata": {}, 1588 | "source": [ 1589 | "# Solution to 6.2.6(a) " 1590 | ] 1591 | }, 1592 | { 1593 | "cell_type": "markdown", 1594 | "metadata": {}, 1595 | "source": [ 1596 | "## First pass of A-Priori\n", 1597 | "\n", 1598 | "In the first pass, we count the frequency of singletons as we read the baskets and store these into an array. Clearly do not need to map items into integers." 1599 | ] 1600 | }, 1601 | { 1602 | "cell_type": "code", 1603 | "execution_count": 80, 1604 | "metadata": { 1605 | "collapsed": false 1606 | }, 1607 | "outputs": [], 1608 | "source": [ 1609 | "C1 = {i:0 for i in range(1,101)}\n", 1610 | "for b in baskets_611.values():\n", 1611 | " for i in b:\n", 1612 | " C1[i] += 1" 1613 | ] 1614 | }, 1615 | { 1616 | "cell_type": "markdown", 1617 | "metadata": {}, 1618 | "source": [ 1619 | "## Between the Passes of A-Priori\n", 1620 | "\n", 1621 | "Generate the list of singletons that are frequent" 1622 | ] 1623 | }, 1624 | { 1625 | "cell_type": "code", 1626 | "execution_count": 76, 1627 | "metadata": { 1628 | "collapsed": true 1629 | }, 1630 | "outputs": [], 1631 | "source": [ 1632 | "# support threshold\n", 1633 | "s = 5" 1634 | ] 1635 | }, 1636 | { 1637 | "cell_type": "code", 1638 | "execution_count": 82, 1639 | "metadata": { 1640 | "collapsed": false 1641 | }, 1642 | "outputs": [], 1643 | "source": [ 1644 | "L1 = [item for item,count in C1.items() if count >= s]" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | "execution_count": 83, 1650 | "metadata": { 1651 | "collapsed": false 1652 | }, 1653 | "outputs": [ 1654 | { 1655 | "data": { 1656 | "text/plain": [ 1657 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]" 1658 | ] 1659 | }, 1660 | "execution_count": 83, 1661 | "metadata": {}, 1662 | "output_type": "execute_result" 1663 | } 1664 | ], 1665 | "source": [ 1666 | "L1" 1667 | ] 1668 | }, 1669 | { 1670 | "cell_type": "markdown", 1671 | "metadata": {}, 1672 | "source": [ 1673 | "## Second Pass of A-Priori\n", 1674 | "\n", 1675 | "1. For each basket, look in the frequent-items table to see which of its items are frequent.\n", 1676 | "\n", 1677 | "2. In a double loop, generate all pairs of frequent items in that basket.\n", 1678 | "\n", 1679 | "3. For each such pair, add one to its count in the data structure used to store counts." 
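Concretely, this second pass can be sketched as follows. The indexing is the standard triangular-matrix formula k = (i-1)(n - i/2) + j - i for 1 <= i < j <= n; tri_index and pair_hits are illustrative names rather than the notebook's, the sketch reuses L1, s and baskets_611 from the cells above, and it leans on the fact that the frequent items here happen to be exactly 1..20, so an item's value can serve as its own index.

def tri_index(i, j, n):
    # 1-based position of the pair (i, j), i < j, in the flattened upper triangle
    return (i - 1) * (2 * n - i) // 2 + (j - i)

m = len(L1)                          # 20 frequent singletons
counts = [0] * (m * (m - 1) // 2)    # one counter per unordered pair of frequent items
for b in baskets_611.values():
    freq = sorted(item for item in b if item <= m)   # the frequent items in this basket
    for x in range(len(freq) - 1):
        for y in range(x + 1, len(freq)):
            counts[tri_index(freq[x], freq[y], m) - 1] += 1
pair_hits = [k for k, c in enumerate(counts) if c >= s]   # 0-based indices of the frequent pairs
print(len(pair_hits))   # should agree with the 56 pair indices found below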
1680 | ] 1681 | }, 1682 | { 1683 | "cell_type": "markdown", 1684 | "metadata": {}, 1685 | "source": [ 1686 | "Using triangular-matrix method to store counts of pairs:" 1687 | ] 1688 | }, 1689 | { 1690 | "cell_type": "code", 1691 | "execution_count": 96, 1692 | "metadata": { 1693 | "collapsed": true 1694 | }, 1695 | "outputs": [], 1696 | "source": [ 1697 | "def triangularmatrix_method(i,j,n):\n", 1698 | " \"\"\"\n", 1699 | " returns the index for the triangular matrix method \n", 1700 | " A[i,j] = a[k] in a flattened array... 1<=i= s]" 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "code", 1743 | "execution_count": 218, 1744 | "metadata": { 1745 | "collapsed": false 1746 | }, 1747 | "outputs": [ 1748 | { 1749 | "data": { 1750 | "text/plain": [ 1751 | "56" 1752 | ] 1753 | }, 1754 | "execution_count": 218, 1755 | "metadata": {}, 1756 | "output_type": "execute_result" 1757 | } 1758 | ], 1759 | "source": [ 1760 | "len(pair_indices)" 1761 | ] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "execution_count": 219, 1766 | "metadata": { 1767 | "collapsed": false 1768 | }, 1769 | "outputs": [ 1770 | { 1771 | "name": "stdout", 1772 | "output_type": "stream", 1773 | "text": [ 1774 | "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 30 32 34 36 37 38 39 42 45 48 51 54 55 57 59 61 65 69 74 79 84 87 90 96 105 119 132 144\n" 1775 | ] 1776 | } 1777 | ], 1778 | "source": [ 1779 | "# k values we need to inverse map to get pair (i,j)\n", 1780 | "for ix in pair_indices:\n", 1781 | " print ix," 1782 | ] 1783 | }, 1784 | { 1785 | "cell_type": "code", 1786 | "execution_count": 220, 1787 | "metadata": { 1788 | "collapsed": true 1789 | }, 1790 | "outputs": [], 1791 | "source": [ 1792 | "# grabbing pairs (i,j) from flattened array\n", 1793 | "def inv_triangle(ix,n):\n", 1794 | " init = n-1\n", 1795 | " i = 1\n", 1796 | " while ix > init:\n", 1797 | " i += 1\n", 1798 | " init += n-i\n", 1799 | " # decrement init back once\n", 1800 | " init -= n-i\n", 1801 | " return [i,i+ix-init]" 1802 | ] 1803 | }, 1804 | { 1805 | "cell_type": "code", 1806 | "execution_count": 221, 1807 | "metadata": { 1808 | "collapsed": false 1809 | }, 1810 | "outputs": [], 1811 | "source": [ 1812 | "# have to add one in argument because Python arrays start at index 0\n", 1813 | "L2 = [inv_triangle(ix+1,m) for ix in pair_indices]" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": 222, 1819 | "metadata": { 1820 | "collapsed": false 1821 | }, 1822 | "outputs": [ 1823 | { 1824 | "name": "stdout", 1825 | "output_type": "stream", 1826 | "text": [ 1827 | "[1, 2] [1, 3] [1, 4] [1, 5] [1, 6] [1, 7] [1, 8] [1, 9] [1, 10] [1, 11] [1, 12] [1, 13] [1, 14] [1, 15] [1, 16] [1, 17] [1, 18] [1, 19] [1, 20] [2, 3] [2, 4] [2, 5] [2, 6] [2, 7] [2, 8] [2, 9] [2, 10] [2, 12] [2, 14] [2, 16] [2, 18] [2, 20] [3, 4] [3, 5] [3, 6] [3, 9] [3, 12] [3, 15] [3, 18] [4, 5] [4, 6] [4, 8] [4, 10] [4, 12] [4, 16] [4, 20] [5, 10] [5, 15] [5, 20] [6, 9] [6, 12] [6, 18] [7, 14] [8, 16] [9, 18] [10, 20]\n" 1828 | ] 1829 | } 1830 | ], 1831 | "source": [ 1832 | "# frequent pairs is thus:\n", 1833 | "for pair in L2:\n", 1834 | " print pair," 1835 | ] 1836 | } 1837 | ], 1838 | "metadata": { 1839 | "kernelspec": { 1840 | "display_name": "Python 2", 1841 | "language": "python", 1842 | "name": "python2" 1843 | }, 1844 | "language_info": { 1845 | "codemirror_mode": { 1846 | "name": "ipython", 1847 | "version": 2 1848 | }, 1849 | "file_extension": ".py", 1850 | "mimetype": "text/x-python", 1851 | "name": "python", 1852 | 
"nbconvert_exporter": "python", 1853 | "pygments_lexer": "ipython2", 1854 | "version": "2.7.11" 1855 | } 1856 | }, 1857 | "nbformat": 4, 1858 | "nbformat_minor": 0 1859 | } 1860 | -------------------------------------------------------------------------------- /hiearchical_clustering_and_heaps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import division 4 | import numpy as np 5 | import heapq 6 | 7 | """ 8 | This script produces a program to conduct a Hierarchical clustering 9 | procedure on set of vectors in n-dimensional Euclidean space. 10 | 11 | The basic algorithm for the agglomerative clustering procedure is the 12 | acc_() function. This function requires the maintenance of a clusters 13 | list which tracks the clusters formed/remaining throughout the procedure. 14 | This function iterates until we are left with one cluster, printing the 15 | details of each merge step. 16 | 17 | The maintenance vector stores cluster x as follows: [(centroid of x), 18 | [[all points in x]]]. 19 | 20 | *** This script also produces an improved version of the basic algorithm 21 | by exploiting the heap data structure. In addition to the clusters maintenance 22 | list, we require a heap (denoted by h), a task/cluster dictionary in the heap 23 | (to keep track of removed clusters). 24 | 25 | For this part, the main function is agg_heap(), which requires the following 26 | functions: 27 | 28 | 1. remove_clusters(): 29 | performs lazy deletion of merged clusters 30 | 31 | 2. pop_cluster(): 32 | heappops the merged cluster with smallest distance 33 | only if the that cluster was not marked as 'REMOVED' 34 | 35 | 3. add_cluster(): 36 | takes as input, the output from pop_cluster and 37 | calculates the distances between this merged cluster and all of the 38 | clusters in the maintenance list clusters_remaining. These distances, 39 | along with the candidate merged clusters are added to heap using 40 | heappush. Note that we must also add the cluster that was actually 41 | merged to the clusters_remaining maintenance vector. 42 | 43 | Things to do: 44 | - Create a class for agg_heap. 45 | """ 46 | 47 | def Euclidean(x,y): 48 | """ 49 | This function returns the Euclidean distance between two vectors 50 | of a Euclidean space. 51 | """ 52 | xc = np.array(x) 53 | yc = np.array(y) 54 | return np.sqrt(np.dot(xc-yc,xc-yc)) 55 | 56 | def mean(x): 57 | """ 58 | This function takes as input a lists of the points and outputs 59 | the overall average of these points. This output is stored as 60 | a tuple so that it can be used to access the cluster index. In other 61 | words, the centroid of cluster x. 62 | """ 63 | N = len(x) 64 | n = len(x[0]) 65 | sum_vec = np.zeros(n) 66 | for point in x: 67 | sum_vec += np.array(point) 68 | mean_vec = sum_vec / N 69 | return tuple(mean_vec) 70 | 71 | def mins(x,y): 72 | """ 73 | This function takes as input two clusters of points (i.e. vectors) each of which 74 | are represented by of their own individual lists. The output of this function 75 | is the minimum distance between any two points one from each cluster 76 | """ 77 | nx = len(x) 78 | ny = len(y) 79 | running_min = 2**32 - 1 80 | for pt_x in x: 81 | for pt_y in y: 82 | if Euclidean(pt_x,pt_y) < running_min: 83 | running_min = Euclidean(pt_x,pt_y) 84 | return running_min 85 | 86 | def avg(x,y): 87 | """ 88 | This function takes as input two clusters of points (i.e. 
vectors), each of which 89 |     is represented by its own list. The output of this function 90 |     is the average distance between any two points, one from each of the two clusters. 91 |     """ 92 |     nx = len(x) 93 |     ny = len(y) 94 |     running_sum = 0 95 |     for pt_x in x: 96 |         for pt_y in y: 97 |             running_sum += Euclidean(pt_x,pt_y) 98 |     return running_sum/(nx*ny) # total number of pairs is nx*ny (i.e., by multiplication rule) 99 | 100 | def radius(x,y=[]): 101 |     """ 102 |     This function takes as input two clusters of points (i.e. vectors), each of which 103 |     is represented by its own list. The output of this function 104 |     is the radius of the cluster which results from merging x and y. 105 | 106 |     If the input is simply one cluster, then the output is the radius of that 107 |     cluster. 108 |     """ 109 |     nx = len(x) 110 |     ny = len(y) 111 |     # merge two clusters x and y 112 |     merged_clus = x + y 113 |     # the centroid of the new merged cluster 114 |     # in a non-Euclidean setting, we should replace the centroid with a clustroid 115 |     merged_cent = mean(merged_clus) 116 |     # determine the radius of this merged cluster 117 |     # the radius is the maximum distance between the points and the centroid 118 |     radius = 0 119 |     for pt in merged_clus: 120 |         if Euclidean(pt,merged_cent) > radius: 121 |             radius = Euclidean(pt,merged_cent) 122 |     return radius 123 | 124 | def diameter(x,y=[]): 125 |     """ 126 |     This function takes as input two clusters of points (i.e. vectors), each of which 127 |     is represented by its own list. The output of this function 128 |     is the diameter of the merged cluster of x and y. 129 | 130 |     If the input is simply one cluster, then the output is the diameter of that 131 |     cluster. 132 |     """ 133 |     # merge two clusters x and y 134 |     merged_clus = x + y 135 |     n = len(merged_clus) 136 |     # determine the diameter of this merged cluster 137 |     # the diameter is the maximum distance between any two points of the cluster 138 |     diameter = 0 139 |     for i in range(n-1): 140 |         for j in range(i+1,n): 141 |             distance_ij = Euclidean(merged_clus[i],merged_clus[j]) 142 |             if distance_ij > diameter: 143 |                 diameter = distance_ij 144 |     return diameter 145 | 146 | def agg_(clusters, print_summary = True, dist = 'Euclidean'): 147 |     """ 148 |     This function takes as input a list of clusters in 149 |     Euclidean space and performs agglomerative clustering on them. 150 |     Each entry of the list has the form [centroid of the cluster, 151 |     [points of the cluster]]. 152 | 153 |     Note that the agglomerative clustering is done in 154 |     place with respect to the clusters list input. 
155 | """ 156 | 157 | # specifying the distance function used 158 | # r_ = 0 implies we consider centroids of the two clusters in merge step 159 | # r_ = 1 means that we consider the points of the two clusters themselves in merge step 160 | if dist == 'Euclidean': 161 | f_dist = Euclidean 162 | r_ = 0 163 | if dist == 'mins': 164 | f_dist = mins 165 | r_ = 1 166 | if dist == 'avg': 167 | f_dist = avg 168 | r_ = 1 169 | if dist == 'radius': 170 | f_dist = radius 171 | r_ = 1 172 | if dist == 'diameter': 173 | f_dist = diameter 174 | r_ = 1 175 | 176 | # start main code to conduct clustering 177 | step = 1 178 | while len(clusters) > 1: 179 | # while step < 3: 180 | # clusters hash table (use centroids as hash keys) 181 | clusters_ix = {el[0]:i for i,el in enumerate(clusters)} 182 | # double loop to consider the minimal distance between all pairs of clusters 183 | n = len(clusters) 184 | min_dist = 2**32-1 185 | c1 = None 186 | c2 = None 187 | for i in range(n-1): 188 | for j in range(i+1,n): 189 | # the distance between centroids of cluster i and cluster j 190 | distance_ij = f_dist(clusters[i][r_], clusters[j][r_]) 191 | if distance_ij < min_dist: 192 | min_dist = distance_ij 193 | c1 = clusters[i] 194 | c2 = clusters[j] 195 | # merge the two clusters that result in minimum Euclidean distance 196 | new_cluster = c1[1] + c2[1] 197 | new_centroid = mean(new_cluster) 198 | clusters.append([new_centroid, new_cluster]) 199 | # remove the merged clusters from the list 200 | del clusters[max(clusters_ix[c1[0]],clusters_ix[c2[0]])] 201 | del clusters[min(clusters_ix[c1[0]],clusters_ix[c2[0]])] 202 | if print_summary: 203 | print 'Step %d:' % step 204 | print 'Merged clusters: %s and %s' %(str(c1[1]),str(c2[1])) 205 | print 'Minimum distance: %f' % min_dist 206 | print 'New clusters list:' 207 | print [el[1] for el in clusters] 208 | print 'New centroids:' 209 | print [el[0] for el in clusters] 210 | print '' 211 | print '--------------------------------------------------------' 212 | print '' 213 | step += 1 214 | 215 | # Alternatively, can use np.mean to create the new centroid 216 | # new_centroid = tuple(np.mean(np.array(new_cluster),axis=0)) 217 | 218 | """ ************************************************************************** 219 | 220 | This part of the script defines agg_heap() and all of its necessary components 221 | 222 | Sample input: 223 | clusters = [[(4,10),[[4,10]]], [(7,10),[[7,10]]], [(4,8),[[4,8]]], 224 | [(6,8),[[6,8]]],[(3,4),[[3,4]]],[(2,2),[[2,2]]],[(5,2),[[5,2]]], 225 | [(12,6),[[12,6]]],[(10,5),[[10,5]]],[(11,4),[[11,4]]],[(9,3),[[9,3]]], 226 | [(12,3),[[12,3]]]] 227 | 228 | # creating a dictionary tracking the remaining clusters 229 | clusters_remaining = {tuple(tuple(el) for el in clusters[i][1]):clusters[i][1] 230 | for i in range(len(clusters))} 231 | clusters_remaining 232 | 233 | # creating a heap h with item keys (dist, pair) 234 | h = [] 235 | n = len(clusters) 236 | f_dist = Euclidean 237 | r_ = 0 238 | clusters_handle = {} # keys are centroids of the pairs of clusters 239 | for i in range(n-1): 240 | for j in range(i+1,n): 241 | distance_ij = f_dist(clusters[i][r_],clusters[j][r_]) 242 | ati = tuple(tuple(el) for el in clusters[i][1]) 243 | tun = tuple(tuple(el) for el in clusters[j][1]) 244 | foo = [distance_ij, (tuple(ati),tuple(tun)), 245 | clusters[i][1]+clusters[j][1]] 246 | clusters_handle[(tuple(ati),tuple(tun))] = foo 247 | heapq.heappush(h,foo) 248 | 249 | ************************************************************************** """ 250 | 251 | 
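# The inter-cluster measures defined above are easy to sanity-check on two tiny
# clusters before running the full procedure (a commented sketch, in the spirit of
# the sample-input block above; the values noted are straightforward to verify by hand):
#
#     c1 = [[0, 0], [0, 2]]
#     c2 = [[3, 0], [3, 2]]
#     print(Euclidean(mean(c1), mean(c2)))   # centroid distance: (0, 1) to (3, 1) -> 3.0
#     print(mins(c1, c2))                    # closest cross-pair, e.g. (0, 0)-(3, 0) -> 3.0
#     print(avg(c1, c2))                     # average of the four cross-pair distances, about 3.30
#     print(radius(c1, c2))                  # max distance to the merged centroid (1.5, 1), about 1.80
#     print(diameter(c1, c2))                # largest pairwise distance in the merge, sqrt(13), about 3.61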
REMOVED = '' 252 | def remove_clusters(i): 253 | """ 254 | This function lazily deletes any clusters that have been 255 | merged from the dictionary clusters_handle. 256 | """ 257 | for key in clusters_handle.keys(): 258 | if i in key: 259 | # mark task as removed 260 | clusters_handle[key][1] = REMOVED 261 | 262 | def pop_cluster(): 263 | """ 264 | To maintain the heap property, we lazily deleted merged clusters. 265 | In this function, we only pop (extract) minimum distance 266 | clusters if the merged cluster has not been removed. 267 | """ 268 | # this pops until it returns something 269 | while h: # while there are entries in the heap 270 | distance, tup, merged_clus = heapq.heappop(h) 271 | if tup != REMOVED: 272 | del clusters_handle[tup] 273 | # remove newly merged clusters from heap task dict and clusters_remaining dictionary 274 | for cluster in tup: 275 | remove_clusters(cluster) 276 | del clusters_remaining[cluster] 277 | return distance, tup, merged_clus 278 | raise KeyError('pop from an empty heap') 279 | 280 | 281 | def add_cluster(entry): 282 | """ 283 | This function takes the pop'ed entry and calculates the 284 | distances of the newly-merged cluster to all of the clusters 285 | in the clusters_remaining dictionary. 286 | """ 287 | distance, tup, merged_clus = entry 288 | tup = tup[0] + tup[1] 289 | centroid = mean(merged_clus) 290 | # for every entry in the clusters_remaining compute new distances 291 | for tup_cmp, clus_cmp in clusters_remaining.items(): 292 | centroid_cmp = mean(clus_cmp) 293 | new_distance = Euclidean(centroid, centroid_cmp) 294 | # generate new entry for the heap 295 | new_tup = (tup, tup_cmp) 296 | new_merged_clus = merged_clus + clus_cmp 297 | new_entry = [new_distance, new_tup, new_merged_clus] 298 | # add new entry to clusters_handle 299 | clusters_handle[new_tup] = new_entry 300 | # add new entry to the heap 301 | heapq.heappush(h,new_entry) 302 | # add the recently merged cluster to clusters_remaining dict 303 | clusters_remaining[tup] = merged_clus 304 | 305 | def agg_heap(clusters, print_summary = True): 306 | """ 307 | This function takes as input a dictionary of clusters in 308 | Euclidean space and returns the Agglomerative clustering. This is 309 | an improvement over agg_ since it exploits the heap data structure. 310 | 311 | Note that Python heapq and queue module do not support element deletion. 312 | In this code, we simply use lazy deletion. 313 | """ 314 | # add methods here to create heap from clusters and 315 | # other necessary components 316 | clusters_remaining = {tuple(tuple(el) for el in clusters[i][1]):clusters[i][1] 317 | for i in range(len(clusters))} 318 | 319 | # start main code to conduct clustering 320 | step = 1 321 | while len(clusters_remaining) > 1: 322 | entry = pop_cluster() 323 | add_cluster(entry) 324 | distance, tup, merged_clus = entry 325 | if print_summary: 326 | print 'Step %d:' % step 327 | print 'Merged clusters: %s and %s' %(str(tup[0]),str(tup[1])) 328 | print 'Minimum distance: %f' % distance 329 | print 'New clusters list:' 330 | print [key for key in clusters_remaining.keys()] 331 | print 'New centroids:' 332 | print [mean(el) for el in clusters_remaining.values()] 333 | print '' 334 | print '--------------------------------------------------------' 335 | print '' 336 | step += 1 337 | 338 | --------------------------------------------------------------------------------
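For completeness, a minimal driver for the basic version (a sketch; it reuses the example point set from the sample-input comment block in hiearchical_clustering_and_heaps.py, and agg_heap() would additionally need the module-level heap h, clusters_handle and clusters_remaining set up exactly as that comment describes):

if __name__ == '__main__':
    points = [(4, 10), (7, 10), (4, 8), (6, 8), (3, 4), (2, 2), (5, 2),
              (12, 6), (10, 5), (11, 4), (9, 3), (12, 3)]
    # each cluster starts out as [centroid, [list of its points]]
    clusters = [[p, [list(p)]] for p in points]
    agg_(clusters, print_summary=True, dist='Euclidean')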